coleitr.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. /*
  2. ******************************************************************************
  3. * Copyright (C) 1997-2014, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. ******************************************************************************
  6. */
  7. /**
  8. * \file
  9. * \brief C++ API: Collation Element Iterator.
  10. */
  11. /**
  12. * File coleitr.h
  13. *
  14. * Created by: Helena Shih
  15. *
  16. * Modification History:
  17. *
  18. * Date Name Description
  19. *
  20. * 8/18/97 helena Added internal API documentation.
  21. * 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
  22. * 12/10/99 aliu Ported Thai collation support from Java.
  23. * 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
  24. * 02/19/01 swquek Removed CollationElementsIterator() since it is
  25. * private constructor and no calls are made to it
  26. * 2012-2014 markus Rewritten in C++ again.
  27. */
  28. #ifndef COLEITR_H
  29. #define COLEITR_H
  30. #include "unicode/utypes.h"
  31. #if !UCONFIG_NO_COLLATION
  32. #include "unicode/unistr.h"
  33. #include "unicode/uobject.h"
  34. struct UCollationElements;
  35. struct UHashtable;
  36. U_NAMESPACE_BEGIN
  37. struct CollationData;
  38. class CollationIterator;
  39. class RuleBasedCollator;
  40. class UCollationPCE;
  41. class UVector32;
  42. /**
  43. * The CollationElementIterator class is used as an iterator to walk through
  44. * each character of an international string. Use the iterator to return the
  45. * ordering priority of the positioned character. The ordering priority of a
  46. * character, which we refer to as a key, defines how a character is collated in
  47. * the given collation object.
  48. * For example, consider the following in Slovak and in traditional Spanish collation:
  49. * <pre>
  50. * "ca" -> the first key is key('c') and second key is key('a').
  51. * "cha" -> the first key is key('ch') and second key is key('a').</pre>
  52. * And in German phonebook collation,
  53. * <pre> \htmlonly "&#x00E6;b"-> the first key is key('a'), the second key is key('e'), and
  54. * the third key is key('b'). \endhtmlonly </pre>
  55. * The key of a character is an integer composed of primary order(short),
  56. * secondary order(char), and tertiary order(char). Java strictly defines the
  57. * size and signedness of its primitive data types. Therefore, the static
  58. * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
  59. * int32_t to ensure the correctness of the key value.
  60. * <p>Example of the iterator usage: (without error checking)
  61. * <pre>
  62. * \code
  63. * void CollationElementIterator_Example()
  64. * {
  65. * UnicodeString str = "This is a test";
  66. * UErrorCode success = U_ZERO_ERROR;
  67. * RuleBasedCollator* rbc =
  68. * (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
  69. * CollationElementIterator* c =
  70. * rbc->createCollationElementIterator( str );
  71. * int32_t order = c->next(success);
  72. * c->reset();
  73. * order = c->previous(success);
  74. * delete c;
  75. * delete rbc;
  76. * }
  77. * \endcode
  78. * </pre>
  79. * <p>
  80. * The method next() returns the collation order of the next character based on
  81. * the comparison level of the collator. The method previous() returns the
  82. * collation order of the previous character based on the comparison level of
  83. * the collator. The Collation Element Iterator moves only in one direction
  84. * between calls to reset(), setOffset(), or setText(). That is, next()
  85. * and previous() cannot be interleaved. Whenever previous() is to be called after
  86. * next() or vice versa, reset(), setOffset() or setText() has to be called first
  87. * to reset the status, shifting pointers to either the end or the start of
  88. * the string (reset() or setText()), or the specified position (setOffset()).
  89. * Hence at the next call of next() or previous(), the first or last collation order,
  90. * or collation order at the specified position will be returned. If a change of
  91. * direction is done without one of these calls, the result is undefined.
  92. * <p>
  93. * The result of a forward iterate (next()) and reversed result of the backward
  94. * iterate (previous()) on the same string are equivalent, if collation orders
  95. * with the value 0 are ignored.
  96. * Each collation order is based on the comparison level of the collator. A collation order
  97. * consists of primary order, secondary order and tertiary order. The data
  98. * type of the collation order is <strong>int32_t</strong>.
  99. *
  100. * Note, CollationElementIterator should not be subclassed.
  101. * @see Collator
  102. * @see RuleBasedCollator
  103. * @version 1.8 Jan 16 2001
  104. */
  105. class U_I18N_API CollationElementIterator U_FINAL : public UObject {
  106. public:
  107. // CollationElementIterator public data member ------------------------------
  108. enum {
  109. /**
  110. * NULLORDER indicates that an error has occured while processing
  111. * @stable ICU 2.0
  112. */
  113. NULLORDER = (int32_t)0xffffffff
  114. };
  115. // CollationElementIterator public constructor/destructor -------------------
  116. /**
  117. * Copy constructor.
  118. *
  119. * @param other the object to be copied from
  120. * @stable ICU 2.0
  121. */
  122. CollationElementIterator(const CollationElementIterator& other);
  123. /**
  124. * Destructor
  125. * @stable ICU 2.0
  126. */
  127. virtual ~CollationElementIterator();
  128. // CollationElementIterator public methods ----------------------------------
  129. /**
  130. * Returns true if "other" is the same as "this"
  131. *
  132. * @param other the object to be compared
  133. * @return true if "other" is the same as "this"
  134. * @stable ICU 2.0
  135. */
  136. UBool operator==(const CollationElementIterator& other) const;
  137. /**
  138. * Returns true if "other" is not the same as "this".
  139. *
  140. * @param other the object to be compared
  141. * @return true if "other" is not the same as "this"
  142. * @stable ICU 2.0
  143. */
  144. UBool operator!=(const CollationElementIterator& other) const;
  145. /**
  146. * Resets the cursor to the beginning of the string.
  147. * @stable ICU 2.0
  148. */
  149. void reset(void);
  150. /**
  151. * Gets the ordering priority of the next character in the string.
  152. * @param status the error code status.
  153. * @return the next character's ordering. otherwise returns NULLORDER if an
  154. * error has occured or if the end of string has been reached
  155. * @stable ICU 2.0
  156. */
  157. int32_t next(UErrorCode& status);
  158. /**
  159. * Get the ordering priority of the previous collation element in the string.
  160. * @param status the error code status.
  161. * @return the previous element's ordering. otherwise returns NULLORDER if an
  162. * error has occured or if the start of string has been reached
  163. * @stable ICU 2.0
  164. */
  165. int32_t previous(UErrorCode& status);
  166. /**
  167. * Gets the primary order of a collation order.
  168. * @param order the collation order
  169. * @return the primary order of a collation order.
  170. * @stable ICU 2.0
  171. */
  172. static inline int32_t primaryOrder(int32_t order);
  173. /**
  174. * Gets the secondary order of a collation order.
  175. * @param order the collation order
  176. * @return the secondary order of a collation order.
  177. * @stable ICU 2.0
  178. */
  179. static inline int32_t secondaryOrder(int32_t order);
  180. /**
  181. * Gets the tertiary order of a collation order.
  182. * @param order the collation order
  183. * @return the tertiary order of a collation order.
  184. * @stable ICU 2.0
  185. */
  186. static inline int32_t tertiaryOrder(int32_t order);
  187. /**
  188. * Return the maximum length of any expansion sequences that end with the
  189. * specified comparison order.
  190. * @param order a collation order returned by previous or next.
  191. * @return maximum size of the expansion sequences ending with the collation
  192. * element or 1 if collation element does not occur at the end of any
  193. * expansion sequence
  194. * @stable ICU 2.0
  195. */
  196. int32_t getMaxExpansion(int32_t order) const;
  197. /**
  198. * Gets the comparison order in the desired strength. Ignore the other
  199. * differences.
  200. * @param order The order value
  201. * @stable ICU 2.0
  202. */
  203. int32_t strengthOrder(int32_t order) const;
  204. /**
  205. * Sets the source string.
  206. * @param str the source string.
  207. * @param status the error code status.
  208. * @stable ICU 2.0
  209. */
  210. void setText(const UnicodeString& str, UErrorCode& status);
  211. /**
  212. * Sets the source string.
  213. * @param str the source character iterator.
  214. * @param status the error code status.
  215. * @stable ICU 2.0
  216. */
  217. void setText(CharacterIterator& str, UErrorCode& status);
  218. /**
  219. * Checks if a comparison order is ignorable.
  220. * @param order the collation order.
  221. * @return TRUE if a character is ignorable, FALSE otherwise.
  222. * @stable ICU 2.0
  223. */
  224. static inline UBool isIgnorable(int32_t order);
  225. /**
  226. * Gets the offset of the currently processed character in the source string.
  227. * @return the offset of the character.
  228. * @stable ICU 2.0
  229. */
  230. int32_t getOffset(void) const;
  231. /**
  232. * Sets the offset of the currently processed character in the source string.
  233. * @param newOffset the new offset.
  234. * @param status the error code status.
  235. * @return the offset of the character.
  236. * @stable ICU 2.0
  237. */
  238. void setOffset(int32_t newOffset, UErrorCode& status);
  239. /**
  240. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  241. *
  242. * @stable ICU 2.2
  243. */
  244. virtual UClassID getDynamicClassID() const;
  245. /**
  246. * ICU "poor man's RTTI", returns a UClassID for this class.
  247. *
  248. * @stable ICU 2.2
  249. */
  250. static UClassID U_EXPORT2 getStaticClassID();
  251. #ifndef U_HIDE_INTERNAL_API
  252. /** @internal */
  253. static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
  254. return reinterpret_cast<CollationElementIterator *>(uc);
  255. }
  256. /** @internal */
  257. static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
  258. return reinterpret_cast<const CollationElementIterator *>(uc);
  259. }
  260. /** @internal */
  261. inline UCollationElements *toUCollationElements() {
  262. return reinterpret_cast<UCollationElements *>(this);
  263. }
  264. /** @internal */
  265. inline const UCollationElements *toUCollationElements() const {
  266. return reinterpret_cast<const UCollationElements *>(this);
  267. }
  268. #endif // U_HIDE_INTERNAL_API
  269. private:
  270. friend class RuleBasedCollator;
  271. friend class UCollationPCE;
  272. /**
  273. * CollationElementIterator constructor. This takes the source string and the
  274. * collation object. The cursor will walk thru the source string based on the
  275. * predefined collation rules. If the source string is empty, NULLORDER will
  276. * be returned on the calls to next().
  277. * @param sourceText the source string.
  278. * @param order the collation object.
  279. * @param status the error code status.
  280. */
  281. CollationElementIterator(const UnicodeString& sourceText,
  282. const RuleBasedCollator* order, UErrorCode& status);
  283. // Note: The constructors should take settings & tailoring, not a collator,
  284. // to avoid circular dependencies.
  285. // However, for operator==() we would need to be able to compare tailoring data for equality
  286. // without making CollationData or CollationTailoring depend on TailoredSet.
  287. // (See the implementation of RuleBasedCollator::operator==().)
  288. // That might require creating an intermediate class that would be used
  289. // by both CollationElementIterator and RuleBasedCollator
  290. // but only contain the part of RBC== related to data and rules.
  291. /**
  292. * CollationElementIterator constructor. This takes the source string and the
  293. * collation object. The cursor will walk thru the source string based on the
  294. * predefined collation rules. If the source string is empty, NULLORDER will
  295. * be returned on the calls to next().
  296. * @param sourceText the source string.
  297. * @param order the collation object.
  298. * @param status the error code status.
  299. */
  300. CollationElementIterator(const CharacterIterator& sourceText,
  301. const RuleBasedCollator* order, UErrorCode& status);
  302. /**
  303. * Assignment operator
  304. *
  305. * @param other the object to be copied
  306. */
  307. const CollationElementIterator&
  308. operator=(const CollationElementIterator& other);
  309. CollationElementIterator(); // default constructor not implemented
  310. /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
  311. inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
  312. static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
  313. static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
  314. // CollationElementIterator private data members ----------------------------
  315. CollationIterator *iter_; // owned
  316. const RuleBasedCollator *rbc_; // aliased
  317. uint32_t otherHalf_;
  318. /**
  319. * <0: backwards; 0: just after reset() (previous() begins from end);
  320. * 1: just after setOffset(); >1: forward
  321. */
  322. int8_t dir_;
  323. /**
  324. * Stores offsets from expansions and from unsafe-backwards iteration,
  325. * so that getOffset() returns intermediate offsets for the CEs
  326. * that are consistent with forward iteration.
  327. */
  328. UVector32 *offsets_;
  329. UnicodeString string_;
  330. };
  331. // CollationElementIterator inline method definitions --------------------------
  332. inline int32_t CollationElementIterator::primaryOrder(int32_t order)
  333. {
  334. return (order >> 16) & 0xffff;
  335. }
  336. inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
  337. {
  338. return (order >> 8) & 0xff;
  339. }
  340. inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
  341. {
  342. return order & 0xff;
  343. }
  344. inline UBool CollationElementIterator::isIgnorable(int32_t order)
  345. {
  346. return (order & 0xffff0000) == 0;
  347. }
  348. U_NAMESPACE_END
  349. #endif /* #if !UCONFIG_NO_COLLATION */
  350. #endif