stringtriebuilder.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. /*
  2. *******************************************************************************
  3. * Copyright (C) 2010-2012,2014, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. *******************************************************************************
  6. * file name: stringtriebuilder.h
  7. * encoding: US-ASCII
  8. * tab size: 8 (not used)
  9. * indentation:4
  10. *
  11. * created on: 2010dec24
  12. * created by: Markus W. Scherer
  13. */
  14. #ifndef __STRINGTRIEBUILDER_H__
  15. #define __STRINGTRIEBUILDER_H__
  16. #include "unicode/utypes.h"
  17. #include "unicode/uobject.h"
  18. /**
  19. * \file
  20. * \brief C++ API: Builder API for trie builders
  21. */
  22. // Forward declaration.
  23. struct UHashtable;
  24. typedef struct UHashtable UHashtable;
  25. /**
  26. * Build options for BytesTrieBuilder and CharsTrieBuilder.
  27. * @stable ICU 4.8
  28. */
  29. enum UStringTrieBuildOption {
  30. /**
  31. * Builds a trie quickly.
  32. * @stable ICU 4.8
  33. */
  34. USTRINGTRIE_BUILD_FAST,
  35. /**
  36. * Builds a trie more slowly, attempting to generate
  37. * a shorter but equivalent serialization.
  38. * This build option also uses more memory.
  39. *
  40. * This option can be effective when many integer values are the same
  41. * and string/byte sequence suffixes can be shared.
  42. * Runtime speed is not expected to improve.
  43. * @stable ICU 4.8
  44. */
  45. USTRINGTRIE_BUILD_SMALL
  46. };
  47. U_NAMESPACE_BEGIN
  48. /**
  49. * Base class for string trie builder classes.
  50. *
  51. * This class is not intended for public subclassing.
  52. * @stable ICU 4.8
  53. */
  54. class U_COMMON_API StringTrieBuilder : public UObject {
  55. public:
  56. #ifndef U_HIDE_INTERNAL_API
  57. /** @internal */
  58. static UBool hashNode(const void *node);
  59. /** @internal */
  60. static UBool equalNodes(const void *left, const void *right);
  61. #endif /* U_HIDE_INTERNAL_API */
  62. protected:
  63. // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
  64. // or else the compiler will create a public default constructor.
  65. /** @internal */
  66. StringTrieBuilder();
  67. /** @internal */
  68. virtual ~StringTrieBuilder();
  69. #ifndef U_HIDE_INTERNAL_API
  70. /** @internal */
  71. void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
  72. /** @internal */
  73. void deleteCompactBuilder();
  74. /** @internal */
  75. void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
  76. /** @internal */
  77. int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
  78. /** @internal */
  79. int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
  80. #endif /* U_HIDE_INTERNAL_API */
  81. class Node;
  82. #ifndef U_HIDE_INTERNAL_API
  83. /** @internal */
  84. Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
  85. /** @internal */
  86. Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
  87. int32_t length, UErrorCode &errorCode);
  88. #endif /* U_HIDE_INTERNAL_API */
  89. /** @internal */
  90. virtual int32_t getElementStringLength(int32_t i) const = 0;
  91. /** @internal */
  92. virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
  93. /** @internal */
  94. virtual int32_t getElementValue(int32_t i) const = 0;
  95. // Finds the first unit index after this one where
  96. // the first and last element have different units again.
  97. /** @internal */
  98. virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
  99. // Number of different units at unitIndex.
  100. /** @internal */
  101. virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
  102. /** @internal */
  103. virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
  104. /** @internal */
  105. virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
  106. /** @internal */
  107. virtual UBool matchNodesCanHaveValues() const = 0;
  108. /** @internal */
  109. virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
  110. /** @internal */
  111. virtual int32_t getMinLinearMatch() const = 0;
  112. /** @internal */
  113. virtual int32_t getMaxLinearMatchLength() const = 0;
  114. #ifndef U_HIDE_INTERNAL_API
  115. // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
  116. /** @internal */
  117. static const int32_t kMaxBranchLinearSubNodeLength=5;
  118. // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
  119. // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
  120. /** @internal */
  121. static const int32_t kMaxSplitBranchLevels=14;
  122. /**
  123. * Makes sure that there is only one unique node registered that is
  124. * equivalent to newNode.
  125. * @param newNode Input node. The builder takes ownership.
  126. * @param errorCode ICU in/out UErrorCode.
  127. Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
  128. * @return newNode if it is the first of its kind, or
  129. * an equivalent node if newNode is a duplicate.
  130. * @internal
  131. */
  132. Node *registerNode(Node *newNode, UErrorCode &errorCode);
  133. /**
  134. * Makes sure that there is only one unique FinalValueNode registered
  135. * with this value.
  136. * Avoids creating a node if the value is a duplicate.
  137. * @param value A final value.
  138. * @param errorCode ICU in/out UErrorCode.
  139. Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
  140. * @return A FinalValueNode with the given value.
  141. * @internal
  142. */
  143. Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
  144. #endif /* U_HIDE_INTERNAL_API */
  145. /*
  146. * C++ note:
  147. * registerNode() and registerFinalValue() take ownership of their input nodes,
  148. * and only return owned nodes.
  149. * If they see a failure UErrorCode, they will delete the input node.
  150. * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
  151. * If there is a failure, they return NULL.
  152. *
  153. * NULL Node pointers can be safely passed into other Nodes because
  154. * they call the static Node::hashCode() which checks for a NULL pointer first.
  155. *
  156. * Therefore, as long as builder functions register a new node,
  157. * they need to check for failures only before explicitly dereferencing
  158. * a Node pointer, or before setting a new UErrorCode.
  159. */
  160. // Hash set of nodes, maps from nodes to integer 1.
  161. /** @internal */
  162. UHashtable *nodes;
  163. #ifndef U_HIDE_INTERNAL_API
  164. /** @internal */
  165. class Node : public UObject {
  166. public:
  167. Node(int32_t initialHash) : hash(initialHash), offset(0) {}
  168. inline int32_t hashCode() const { return hash; }
  169. // Handles node==NULL.
  170. static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
  171. // Base class operator==() compares the actual class types.
  172. virtual UBool operator==(const Node &other) const;
  173. inline UBool operator!=(const Node &other) const { return !operator==(other); }
  174. /**
  175. * Traverses the Node graph and numbers branch edges, with rightmost edges first.
  176. * This is to avoid writing a duplicate node twice.
  177. *
  178. * Branch nodes in this trie data structure are not symmetric.
  179. * Most branch edges "jump" to other nodes but the rightmost branch edges
  180. * just continue without a jump.
  181. * Therefore, write() must write the rightmost branch edge last
  182. * (trie units are written backwards), and must write it at that point even if
  183. * it is a duplicate of a node previously written elsewhere.
  184. *
  185. * This function visits and marks right branch edges first.
  186. * Edges are numbered with increasingly negative values because we share the
  187. * offset field which gets positive values when nodes are written.
  188. * A branch edge also remembers the first number for any of its edges.
  189. *
  190. * When a further-left branch edge has a number in the range of the rightmost
  191. * edge's numbers, then it will be written as part of the required right edge
  192. * and we can avoid writing it first.
  193. *
  194. * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
  195. * edge numbers.
  196. *
  197. * @param edgeNumber The first edge number for this node and its sub-nodes.
  198. * @return An edge number that is at least the maximum-negative
  199. * of the input edge number and the numbers of this node and all of its sub-nodes.
  200. */
  201. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  202. // write() must set the offset to a positive value.
  203. virtual void write(StringTrieBuilder &builder) = 0;
  204. // See markRightEdgesFirst.
  205. inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
  206. StringTrieBuilder &builder) {
  207. // Note: Edge numbers are negative, lastRight<=firstRight.
  208. // If offset>0 then this node and its sub-nodes have been written already
  209. // and we need not write them again.
  210. // If this node is part of the unwritten right branch edge,
  211. // then we wait until that is written.
  212. if(offset<0 && (offset<lastRight || firstRight<offset)) {
  213. write(builder);
  214. }
  215. }
  216. inline int32_t getOffset() const { return offset; }
  217. protected:
  218. int32_t hash;
  219. int32_t offset;
  220. };
  221. // This class should not be overridden because
  222. // registerFinalValue() compares a stack-allocated FinalValueNode
  223. // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
  224. // with the input node, and the
  225. // !Node::operator==(other) used inside FinalValueNode::operator==(other)
  226. // will be false if the typeid's are different.
  227. /** @internal */
  228. class FinalValueNode : public Node {
  229. public:
  230. FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
  231. virtual UBool operator==(const Node &other) const;
  232. virtual void write(StringTrieBuilder &builder);
  233. protected:
  234. int32_t value;
  235. };
  236. /**
  237. * @internal
  238. */
  239. class ValueNode : public Node {
  240. public:
  241. ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
  242. virtual UBool operator==(const Node &other) const;
  243. void setValue(int32_t v) {
  244. hasValue=TRUE;
  245. value=v;
  246. hash=hash*37+v;
  247. }
  248. protected:
  249. UBool hasValue;
  250. int32_t value;
  251. };
  252. /**
  253. * @internal
  254. */
  255. class IntermediateValueNode : public ValueNode {
  256. public:
  257. IntermediateValueNode(int32_t v, Node *nextNode)
  258. : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
  259. virtual UBool operator==(const Node &other) const;
  260. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  261. virtual void write(StringTrieBuilder &builder);
  262. protected:
  263. Node *next;
  264. };
  265. /**
  266. * @internal
  267. */
  268. class LinearMatchNode : public ValueNode {
  269. public:
  270. LinearMatchNode(int32_t len, Node *nextNode)
  271. : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
  272. length(len), next(nextNode) {}
  273. virtual UBool operator==(const Node &other) const;
  274. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  275. protected:
  276. int32_t length;
  277. Node *next;
  278. };
  279. /**
  280. * @internal
  281. */
  282. class BranchNode : public Node {
  283. public:
  284. BranchNode(int32_t initialHash) : Node(initialHash) {}
  285. protected:
  286. int32_t firstEdgeNumber;
  287. };
  288. /**
  289. * @internal
  290. */
  291. class ListBranchNode : public BranchNode {
  292. public:
  293. ListBranchNode() : BranchNode(0x444444), length(0) {}
  294. virtual UBool operator==(const Node &other) const;
  295. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  296. virtual void write(StringTrieBuilder &builder);
  297. // Adds a unit with a final value.
  298. void add(int32_t c, int32_t value) {
  299. units[length]=(UChar)c;
  300. equal[length]=NULL;
  301. values[length]=value;
  302. ++length;
  303. hash=(hash*37+c)*37+value;
  304. }
  305. // Adds a unit which leads to another match node.
  306. void add(int32_t c, Node *node) {
  307. units[length]=(UChar)c;
  308. equal[length]=node;
  309. values[length]=0;
  310. ++length;
  311. hash=(hash*37+c)*37+hashCode(node);
  312. }
  313. protected:
  314. Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
  315. int32_t length;
  316. int32_t values[kMaxBranchLinearSubNodeLength];
  317. UChar units[kMaxBranchLinearSubNodeLength];
  318. };
  319. /**
  320. * @internal
  321. */
  322. class SplitBranchNode : public BranchNode {
  323. public:
  324. SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
  325. : BranchNode(((0x555555*37+middleUnit)*37+
  326. hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
  327. unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
  328. virtual UBool operator==(const Node &other) const;
  329. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  330. virtual void write(StringTrieBuilder &builder);
  331. protected:
  332. UChar unit;
  333. Node *lessThan;
  334. Node *greaterOrEqual;
  335. };
  336. // Branch head node, for writing the actual node lead unit.
  337. /** @internal */
  338. class BranchHeadNode : public ValueNode {
  339. public:
  340. BranchHeadNode(int32_t len, Node *subNode)
  341. : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
  342. length(len), next(subNode) {}
  343. virtual UBool operator==(const Node &other) const;
  344. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  345. virtual void write(StringTrieBuilder &builder);
  346. protected:
  347. int32_t length;
  348. Node *next; // A branch sub-node.
  349. };
  350. #endif /* U_HIDE_INTERNAL_API */
  351. /** @internal */
  352. virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
  353. Node *nextNode) const = 0;
  354. /** @internal */
  355. virtual int32_t write(int32_t unit) = 0;
  356. /** @internal */
  357. virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
  358. /** @internal */
  359. virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
  360. /** @internal */
  361. virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
  362. /** @internal */
  363. virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
  364. };
  365. U_NAMESPACE_END
  366. #endif // __STRINGTRIEBUILDER_H__