tblcoll.h 36 KB


  1. /*
  2. ******************************************************************************
  3. * Copyright (C) 1996-2016, International Business Machines Corporation and
  4. * others. All Rights Reserved.
  5. ******************************************************************************
  6. */
  7. /**
  8. * \file
  9. * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
  10. */
  11. /**
  12. * File tblcoll.h
  13. *
  14. * Created by: Helena Shih
  15. *
  16. * Modification History:
  17. *
  18. * Date Name Description
  19. * 2/5/97 aliu Added streamIn and streamOut methods. Added
  20. * constructor which reads RuleBasedCollator object from
  21. * a binary file. Added writeToFile method which streams
  22. * RuleBasedCollator out to a binary file. The streamIn
  23. * and streamOut methods use istream and ostream objects
  24. * in binary mode.
  25. * 2/12/97 aliu Modified to use TableCollationData sub-object to
  26. * hold invariant data.
  27. * 2/13/97 aliu Moved several methods into this class from Collation.
  28. * Added a private RuleBasedCollator(Locale&) constructor,
  29. * to be used by Collator::createDefault(). General
  30. * clean up.
  31. * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
  32. * constructor and getDynamicClassID.
  33. * 3/5/97 aliu Modified constructFromFile() to add parameter
  34. * specifying whether or not binary loading is to be
  35. * attempted. This is required for dynamic rule loading.
  36. * 05/07/97 helena Added memory allocation error detection.
  37. * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to
  38. * use MergeCollation::getPattern.
  39. * 6/20/97 helena Java class name change.
  40. * 8/18/97 helena Added internal API documentation.
  41. * 09/03/97 helena Added createCollationKeyValues().
  42. * 02/10/98 damiba Added compare with "length" parameter
  43. * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java
  44. * 04/23/99 stephen Removed EDecompositionMode, merged with
  45. * Normalizer::EMode
  46. * 06/14/99 stephen Removed kResourceBundleSuffix
  47. * 11/02/99 helena Collator performance enhancements. Eliminates the
  48. * UnicodeString construction and special case for NO_OP.
  49. * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator
  50. * internal state management.
  51. * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
  52. * to implementation file.
  53. * 01/29/01 synwee Modified into a C++ wrapper which calls C API
  54. * (ucol.h)
  55. * 2012-2014 markus Rewritten in C++ again.
  56. */
  57. #ifndef TBLCOLL_H
  58. #define TBLCOLL_H
  59. #include "unicode/utypes.h"
  60. #if !UCONFIG_NO_COLLATION
  61. #include "unicode/coll.h"
  62. #include "unicode/locid.h"
  63. #include "unicode/uiter.h"
  64. #include "unicode/ucol.h"
  65. U_NAMESPACE_BEGIN
  66. struct CollationCacheEntry;
  67. struct CollationData;
  68. struct CollationSettings;
  69. struct CollationTailoring;
  70. /**
  71. * @stable ICU 2.0
  72. */
  73. class StringSearch;
  74. /**
  75. * @stable ICU 2.0
  76. */
  77. class CollationElementIterator;
  78. class CollationKey;
  79. class SortKeyByteSink;
  80. class UnicodeSet;
  81. class UnicodeString;
  82. class UVector64;
  83. /**
  84. * The RuleBasedCollator class provides the implementation of
  85. * Collator, using data-driven tables. The user can create a customized
  86. * table-based collation.
  87. * <p>
  88. * For more information about the collation service see
  89. * <a href="http://userguide.icu-project.org/collation">the User Guide</a>.
  90. * <p>
  91. * Collation service provides correct sorting orders for most locales supported in ICU.
  92. * If specific data for a locale is not available, the order eventually falls back
  93. * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
  94. * <p>
  95. * Sort ordering may be customized by providing your own set of rules. For more on
  96. * this subject see the <a href="http://userguide.icu-project.org/collation/customization">
  97. * Collation Customization</a> section of the User Guide.
  98. * <p>
  99. * Note, RuleBasedCollator is not to be subclassed.
  100. * @see Collator
  101. */
  102. class U_I18N_API RuleBasedCollator : public Collator {
  103. public:
  104. /**
  105. * RuleBasedCollator constructor. This takes the table rules and builds a
  106. * collation table out of them. Please see RuleBasedCollator class
  107. * description for more details on the collation rule syntax.
  108. * @param rules the collation rules to build the collation table from.
  109. * @param status reporting a success or an error.
  110. * @stable ICU 2.0
  111. */
  112. RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
  113. /**
  114. * RuleBasedCollator constructor. This takes the table rules and builds a
  115. * collation table out of them. Please see RuleBasedCollator class
  116. * description for more details on the collation rule syntax.
  117. * @param rules the collation rules to build the collation table from.
  118. * @param collationStrength strength for comparison
  119. * @param status reporting a success or an error.
  120. * @stable ICU 2.0
  121. */
  122. RuleBasedCollator(const UnicodeString& rules,
  123. ECollationStrength collationStrength,
  124. UErrorCode& status);
  125. /**
  126. * RuleBasedCollator constructor. This takes the table rules and builds a
  127. * collation table out of them. Please see RuleBasedCollator class
  128. * description for more details on the collation rule syntax.
  129. * @param rules the collation rules to build the collation table from.
  130. * @param decompositionMode the normalisation mode
  131. * @param status reporting a success or an error.
  132. * @stable ICU 2.0
  133. */
  134. RuleBasedCollator(const UnicodeString& rules,
  135. UColAttributeValue decompositionMode,
  136. UErrorCode& status);
  137. /**
  138. * RuleBasedCollator constructor. This takes the table rules and builds a
  139. * collation table out of them. Please see RuleBasedCollator class
  140. * description for more details on the collation rule syntax.
  141. * @param rules the collation rules to build the collation table from.
  142. * @param collationStrength strength for comparison
  143. * @param decompositionMode the normalisation mode
  144. * @param status reporting a success or an error.
  145. * @stable ICU 2.0
  146. */
  147. RuleBasedCollator(const UnicodeString& rules,
  148. ECollationStrength collationStrength,
  149. UColAttributeValue decompositionMode,
  150. UErrorCode& status);
  151. #ifndef U_HIDE_INTERNAL_API
  152. /**
  153. * TODO: document & propose as public API
  154. * @internal
  155. */
  156. RuleBasedCollator(const UnicodeString &rules,
  157. UParseError &parseError, UnicodeString &reason,
  158. UErrorCode &errorCode);
  159. #endif /* U_HIDE_INTERNAL_API */
  160. /**
  161. * Copy constructor.
  162. * @param other the RuleBasedCollator object to be copied
  163. * @stable ICU 2.0
  164. */
  165. RuleBasedCollator(const RuleBasedCollator& other);
  166. /** Opens a collator from a collator binary image created using
  167. * cloneBinary. Binary image used in instantiation of the
  168. * collator remains owned by the user and should stay around for
  169. * the lifetime of the collator. The API also takes a base collator
  170. * which must be the root collator.
  171. * @param bin binary image owned by the user and required through the
  172. * lifetime of the collator
  173. * @param length size of the image. If negative, the API will try to
  174. * figure out the length of the image
  175. * @param base Base collator, for lookup of untailored characters.
  176. * Must be the root collator, must not be NULL.
  177. * The base is required to be present through the lifetime of the collator.
  178. * @param status for catching errors
  179. * @return newly created collator
  180. * @see cloneBinary
  181. * @stable ICU 3.4
  182. */
  183. RuleBasedCollator(const uint8_t *bin, int32_t length,
  184. const RuleBasedCollator *base,
  185. UErrorCode &status);
  186. /**
  187. * Destructor.
  188. * @stable ICU 2.0
  189. */
  190. virtual ~RuleBasedCollator();
  191. /**
  192. * Assignment operator.
  193. * @param other other RuleBasedCollator object to copy from.
  194. * @stable ICU 2.0
  195. */
  196. RuleBasedCollator& operator=(const RuleBasedCollator& other);
  197. /**
  198. * Returns true if argument is the same as this object.
  199. * @param other Collator object to be compared.
  200. * @return true if the argument is the same as this object.
  201. * @stable ICU 2.0
  202. */
  203. virtual UBool operator==(const Collator& other) const;
  204. /**
  205. * Makes a copy of this object.
  206. * @return a copy of this object, owned by the caller
  207. * @stable ICU 2.0
  208. */
  209. virtual Collator* clone(void) const;
  210. /**
  211. * Creates a collation element iterator for the source string. The caller of
  212. * this method is responsible for the memory management of the return
  213. * pointer.
  214. * @param source the string over which the CollationElementIterator will
  215. * iterate.
  216. * @return the collation element iterator of the source string using this as
  217. * the base Collator.
  218. * @stable ICU 2.2
  219. */
  220. virtual CollationElementIterator* createCollationElementIterator(
  221. const UnicodeString& source) const;
  222. /**
  223. * Creates a collation element iterator for the source. The caller of this
  224. * method is responsible for the memory management of the returned pointer.
  225. * @param source the CharacterIterator which produces the characters over
  226. * which the CollationElementIterator will iterate.
  227. * @return the collation element iterator of the source using this as the
  228. * base Collator.
  229. * @stable ICU 2.2
  230. */
  231. virtual CollationElementIterator* createCollationElementIterator(
  232. const CharacterIterator& source) const;
  233. // Make deprecated versions of Collator::compare() visible.
  234. using Collator::compare;
  235. /**
  236. * The comparison function compares the character data stored in two
  237. * different strings. Returns information about whether a string is less
  238. * than, greater than or equal to another string.
  239. * @param source the source string to be compared with.
  240. * @param target the string that is to be compared with the source string.
  241. * @param status possible error code
  242. * @return Returns an enum value. UCOL_GREATER if source is greater
  243. * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
  244. * than target
  245. * @stable ICU 2.6
  246. **/
  247. virtual UCollationResult compare(const UnicodeString& source,
  248. const UnicodeString& target,
  249. UErrorCode &status) const;
  250. /**
  251. * Does the same thing as compare but limits the comparison to a specified
  252. * length
  253. * @param source the source string to be compared with.
  254. * @param target the string that is to be compared with the source string.
  255. * @param length the length the comparison is limited to
  256. * @param status possible error code
  257. * @return Returns an enum value. UCOL_GREATER if source (up to the specified
  258. * length) is greater than target; UCOL_EQUAL if source (up to specified
  259. * length) is equal to target; UCOL_LESS if source (up to the specified
  260. * length) is less than target.
  261. * @stable ICU 2.6
  262. */
  263. virtual UCollationResult compare(const UnicodeString& source,
  264. const UnicodeString& target,
  265. int32_t length,
  266. UErrorCode &status) const;
  267. /**
  268. * The comparison function compares the character data stored in two
  269. * different string arrays. Returns information about whether a string array
  270. * is less than, greater than or equal to another string array.
  271. * @param source the source string array to be compared with.
  272. * @param sourceLength the length of the source string array. If this value
  273. * is equal to -1, the string array is null-terminated.
  274. * @param target the string that is to be compared with the source string.
  275. * @param targetLength the length of the target string array. If this value
  276. * is equal to -1, the string array is null-terminated.
  277. * @param status possible error code
  278. * @return Returns an enum value. UCOL_GREATER if source is greater
  279. * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
  280. * than target
  281. * @stable ICU 2.6
  282. */
  283. virtual UCollationResult compare(const UChar* source, int32_t sourceLength,
  284. const UChar* target, int32_t targetLength,
  285. UErrorCode &status) const;
  286. /**
  287. * Compares two strings using the Collator.
  288. * Returns whether the first one compares less than/equal to/greater than
  289. * the second one.
  290. * This version takes UCharIterator input.
  291. * @param sIter the first ("source") string iterator
  292. * @param tIter the second ("target") string iterator
  293. * @param status ICU status
  294. * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
  295. * @stable ICU 4.2
  296. */
  297. virtual UCollationResult compare(UCharIterator &sIter,
  298. UCharIterator &tIter,
  299. UErrorCode &status) const;
  300. /**
  301. * Compares two UTF-8 strings using the Collator.
  302. * Returns whether the first one compares less than/equal to/greater than
  303. * the second one.
  304. * This version takes UTF-8 input.
  305. * Note that a StringPiece can be implicitly constructed
  306. * from a std::string or a NUL-terminated const char * string.
  307. * @param source the first UTF-8 string
  308. * @param target the second UTF-8 string
  309. * @param status ICU status
  310. * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
  311. * @stable ICU 51
  312. */
  313. virtual UCollationResult compareUTF8(const StringPiece &source,
  314. const StringPiece &target,
  315. UErrorCode &status) const;
  316. /**
  317. * Transforms the string into a series of characters
  318. * that can be compared with CollationKey.compare().
  319. *
  320. * Note that sort keys are often less efficient than simply doing comparison.
  321. * For more details, see the ICU User Guide.
  322. *
  323. * @param source the source string.
  324. * @param key the transformed key of the source string.
  325. * @param status the error code status.
  326. * @return the transformed key.
  327. * @see CollationKey
  328. * @stable ICU 2.0
  329. */
  330. virtual CollationKey& getCollationKey(const UnicodeString& source,
  331. CollationKey& key,
  332. UErrorCode& status) const;
  333. /**
  334. * Transforms a specified region of the string into a series of characters
  335. * that can be compared with CollationKey.compare.
  336. *
  337. * Note that sort keys are often less efficient than simply doing comparison.
  338. * For more details, see the ICU User Guide.
  339. *
  340. * @param source the source string.
  341. * @param sourceLength the length of the source string.
  342. * @param key the transformed key of the source string.
  343. * @param status the error code status.
  344. * @return the transformed key.
  345. * @see CollationKey
  346. * @stable ICU 2.0
  347. */
  348. virtual CollationKey& getCollationKey(const UChar *source,
  349. int32_t sourceLength,
  350. CollationKey& key,
  351. UErrorCode& status) const;
  352. /**
  353. * Generates the hash code for the rule-based collation object.
  354. * @return the hash code.
  355. * @stable ICU 2.0
  356. */
  357. virtual int32_t hashCode() const;
  358. /**
  359. * Gets the locale of the Collator
  360. * @param type can be either requested, valid or actual locale. For more
  361. * information see the definition of ULocDataLocaleType in
  362. * uloc.h
  363. * @param status the error code status.
  364. * @return locale where the collation data lives. If the collator
  365. * was instantiated from rules, locale is empty.
  366. * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
  367. */
  368. virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
  369. /**
  370. * Gets the tailoring rules for this collator.
  371. * @return the collation tailoring from which this collator was created
  372. * @stable ICU 2.0
  373. */
  374. const UnicodeString& getRules() const;
  375. /**
  376. * Gets the version information for a Collator.
  377. * @param info the version # information, the result will be filled in
  378. * @stable ICU 2.0
  379. */
  380. virtual void getVersion(UVersionInfo info) const;
  381. #ifndef U_HIDE_DEPRECATED_API
  382. /**
  383. * Returns the maximum length of any expansion sequences that end with the
  384. * specified comparison order.
  385. *
  386. * This is specific to the kind of collation element values and sequences
  387. * returned by the CollationElementIterator.
  388. * Call CollationElementIterator::getMaxExpansion() instead.
  389. *
  390. * @param order a collation order returned by CollationElementIterator::previous
  391. * or CollationElementIterator::next.
  392. * @return maximum size of the expansion sequences ending with the collation
  393. * element, or 1 if the collation element does not occur at the end of
  394. * any expansion sequence
  395. * @see CollationElementIterator#getMaxExpansion
  396. * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
  397. */
  398. int32_t getMaxExpansion(int32_t order) const;
  399. #endif /* U_HIDE_DEPRECATED_API */
  400. /**
  401. * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
  402. * method is to implement a simple version of RTTI, since not all C++
  403. * compilers support genuine RTTI. Polymorphic operator==() and clone()
  404. * methods call this method.
  405. * @return The class ID for this object. All objects of a given class have
  406. * the same class ID. Objects of other classes have different class
  407. * IDs.
  408. * @stable ICU 2.0
  409. */
  410. virtual UClassID getDynamicClassID(void) const;
  411. /**
  412. * Returns the class ID for this class. This is useful only for comparing to
  413. * a return value from getDynamicClassID(). For example:
  414. * <pre>
  415. * Base* polymorphic_pointer = createPolymorphicObject();
  416. * if (polymorphic_pointer->getDynamicClassID() ==
  417. * Derived::getStaticClassID()) ...
  418. * </pre>
  419. * @return The class ID for all objects of this class.
  420. * @stable ICU 2.0
  421. */
  422. static UClassID U_EXPORT2 getStaticClassID(void);
  423. #ifndef U_HIDE_DEPRECATED_API
  424. /**
  425. * Do not use this method: The caller and the ICU library might use different heaps.
  426. * Use cloneBinary() instead which writes to caller-provided memory.
  427. *
  428. * Returns a binary format of this collator.
  429. * @param length Returns the length of the data, in bytes
  430. * @param status the error code status.
  431. * @return memory, owned by the caller, of size 'length' bytes.
  432. * @deprecated ICU 52. Use cloneBinary() instead.
  433. */
  434. uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
  435. #endif /* U_HIDE_DEPRECATED_API */
  436. /** Creates a binary image of a collator. This binary image can be stored and
  437. * later used to instantiate a collator using ucol_openBinary.
  438. * This API supports preflighting.
  439. * @param buffer a fill-in buffer to receive the binary image
  440. * @param capacity capacity of the destination buffer
  441. * @param status for catching errors
  442. * @return size of the image
  443. * @see ucol_openBinary
  444. * @stable ICU 3.4
  445. */
  446. int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
  447. /**
  448. * Returns current rules. Delta defines whether full rules are returned or
  449. * just the tailoring.
  450. *
  451. * getRules(void) should normally be used instead.
  452. * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales
  453. * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
  454. * @param buffer UnicodeString to store the result rules
  455. * @stable ICU 2.2
  456. * @see UCOL_FULL_RULES
  457. */
  458. void getRules(UColRuleOption delta, UnicodeString &buffer) const;
  459. /**
  460. * Universal attribute setter
  461. * @param attr attribute type
  462. * @param value attribute value
  463. * @param status to indicate whether the operation went on smoothly or there were errors
  464. * @stable ICU 2.2
  465. */
  466. virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
  467. UErrorCode &status);
  468. /**
  469. * Universal attribute getter.
  470. * @param attr attribute type
  471. * @param status to indicate whether the operation went on smoothly or there were errors
  472. * @return attribute value
  473. * @stable ICU 2.2
  474. */
  475. virtual UColAttributeValue getAttribute(UColAttribute attr,
  476. UErrorCode &status) const;
  477. /**
  478. * Sets the variable top to the top of the specified reordering group.
  479. * The variable top determines the highest-sorting character
  480. * which is affected by UCOL_ALTERNATE_HANDLING.
  481. * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
  482. * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
  483. * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
  484. * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
  485. * @param errorCode Standard ICU error code. Its input value must
  486. * pass the U_SUCCESS() test, or else the function returns
  487. * immediately. Check for U_FAILURE() on output or use with
  488. * function chaining. (See User Guide for details.)
  489. * @return *this
  490. * @see getMaxVariable
  491. * @stable ICU 53
  492. */
  493. virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode);
  494. /**
  495. * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
  496. * @return the maximum variable reordering group.
  497. * @see setMaxVariable
  498. * @stable ICU 53
  499. */
  500. virtual UColReorderCode getMaxVariable() const;
  501. /**
  502. * Sets the variable top to the primary weight of the specified string.
  503. *
  504. * Beginning with ICU 53, the variable top is pinned to
  505. * the top of one of the supported reordering groups,
  506. * and it must not be beyond the last of those groups.
  507. * See setMaxVariable().
  508. * @param varTop one or more (if contraction) UChars to which the variable top should be set
  509. * @param len length of variable top string. If -1 it is considered to be zero terminated.
  510. * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
  511. * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
  512. * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
  513. * the last reordering group supported by setMaxVariable()
  514. * @return variable top primary weight
  515. * @deprecated ICU 53 Call setMaxVariable() instead.
  516. */
  517. virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status);
  518. /**
  519. * Sets the variable top to the primary weight of the specified string.
  520. *
  521. * Beginning with ICU 53, the variable top is pinned to
  522. * the top of one of the supported reordering groups,
  523. * and it must not be beyond the last of those groups.
  524. * See setMaxVariable().
  525. * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set
  526. * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
  527. * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
  528. * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
  529. * the last reordering group supported by setMaxVariable()
  530. * @return variable top primary weight
  531. * @deprecated ICU 53 Call setMaxVariable() instead.
  532. */
  533. virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status);
  534. /**
  535. * Sets the variable top to the specified primary weight.
  536. *
  537. * Beginning with ICU 53, the variable top is pinned to
  538. * the top of one of the supported reordering groups,
  539. * and it must not be beyond the last of those groups.
  540. * See setMaxVariable().
  541. * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
  542. * @param status error code
  543. * @deprecated ICU 53 Call setMaxVariable() instead.
  544. */
  545. virtual void setVariableTop(uint32_t varTop, UErrorCode &status);
  546. /**
  547. * Gets the variable top value of a Collator.
  548. * @param status error code (not changed by function). If error code is set, the return value is undefined.
  549. * @return the variable top primary weight
  550. * @see getMaxVariable
  551. * @stable ICU 2.0
  552. */
  553. virtual uint32_t getVariableTop(UErrorCode &status) const;
  554. /**
  555. * Get a UnicodeSet that contains all the characters and sequences tailored in
  556. * this collator.
  557. * @param status error code of the operation
  558. * @return a pointer to a UnicodeSet object containing all the
  559. * code points and sequences that may sort differently than
  560. * in the root collator. The object must be disposed of by using delete
  561. * @stable ICU 2.4
  562. */
  563. virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
  564. /**
  565. * Get the sort key as an array of bytes from a UnicodeString.
  566. *
  567. * Note that sort keys are often less efficient than simply doing comparison.
  568. * For more details, see the ICU User Guide.
  569. *
  570. * @param source string to be processed.
  571. * @param result buffer to store result in. If NULL, number of bytes needed
  572. * will be returned.
  573. * @param resultLength length of the result buffer. If it is not large enough, the
  574. * buffer will be filled to capacity.
  575. * @return Number of bytes needed for storing the sort key
  576. * @stable ICU 2.0
  577. */
  578. virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
  579. int32_t resultLength) const;
  580. /**
  581. * Get the sort key as an array of bytes from a UChar buffer.
  582. *
  583. * Note that sort keys are often less efficient than simply doing comparison.
  584. * For more details, see the ICU User Guide.
  585. *
  586. * @param source string to be processed.
  587. * @param sourceLength length of string to be processed. If -1, the string
  588. * is 0 terminated and length will be decided by the function.
  589. * @param result buffer to store result in. If NULL, number of bytes needed
  590. * will be returned.
  591. * @param resultLength length of the result buffer. If it is not large enough, the
  592. * buffer will be filled to capacity.
  593. * @return Number of bytes needed for storing the sort key
  594. * @stable ICU 2.2
  595. */
  596. virtual int32_t getSortKey(const UChar *source, int32_t sourceLength,
  597. uint8_t *result, int32_t resultLength) const;
  598. /**
  599. * Retrieves the reordering codes for this collator.
  600. * @param dest The array to fill with the script ordering.
  601. * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
  602. * will only return the length of the result without writing any codes (pre-flighting).
  603. * @param status A reference to an error code value, which must not indicate
  604. * a failure before the function call.
  605. * @return The length of the script ordering array.
  606. * @see ucol_setReorderCodes
  607. * @see Collator#getEquivalentReorderCodes
  608. * @see Collator#setReorderCodes
  609. * @stable ICU 4.8
  610. */
  611. virtual int32_t getReorderCodes(int32_t *dest,
  612. int32_t destCapacity,
  613. UErrorCode& status) const;
  614. /**
  615. * Sets the ordering of scripts for this collator.
  616. * @param reorderCodes An array of script codes in the new order. This can be NULL if the
  617. * length is also set to 0. An empty array will clear any reordering codes on the collator.
  618. * @param reorderCodesLength The length of reorderCodes.
  619. * @param status error code
  620. * @see ucol_setReorderCodes
  621. * @see Collator#getReorderCodes
  622. * @see Collator#getEquivalentReorderCodes
  623. * @stable ICU 4.8
  624. */
  625. virtual void setReorderCodes(const int32_t* reorderCodes,
  626. int32_t reorderCodesLength,
  627. UErrorCode& status) ;
  628. /**
  629. * Implements ucol_strcollUTF8().
  630. * @internal
  631. */
  632. virtual UCollationResult internalCompareUTF8(
  633. const char *left, int32_t leftLength,
  634. const char *right, int32_t rightLength,
  635. UErrorCode &errorCode) const;
  636. /** Get the short definition string for a collator. This internal API harvests the collator's
  637. * locale and the attribute set and produces a string that can be used for opening
  638. * a collator with the same attributes using the ucol_openFromShortString API.
  639. * This string will be normalized.
  640. * The structure and the syntax of the string is defined in the "Naming collators"
  641. * section of the User Guide:
  642. * http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme
  643. * This function supports preflighting.
  644. *
  645. * This is internal, and intended to be used with delegate converters.
  646. *
  647. * @param locale a locale that will appear as a collator's locale in the resulting
  648. * short string definition. If NULL, the locale will be harvested
  649. * from the collator.
  650. * @param buffer space to hold the resulting string
  651. * @param capacity capacity of the buffer
  652. * @param status for returning errors. All the preflighting errors are featured
  653. * @return length of the resulting string
  654. * @see ucol_openFromShortString
  655. * @see ucol_normalizeShortDefinitionString
  656. * @see ucol_getShortDefinitionString
  657. * @internal
  658. */
  659. virtual int32_t internalGetShortDefinitionString(const char *locale,
  660. char *buffer,
  661. int32_t capacity,
  662. UErrorCode &status) const;
  663. /**
  664. * Implements ucol_nextSortKeyPart().
  665. * @internal
  666. */
  667. virtual int32_t internalNextSortKeyPart(
  668. UCharIterator *iter, uint32_t state[2],
  669. uint8_t *dest, int32_t count, UErrorCode &errorCode) const;
  670. // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API
  671. /**
  672. * Only for use in ucol_openRules().
  673. * @internal
  674. */
  675. RuleBasedCollator();
  676. #ifndef U_HIDE_INTERNAL_API
  677. /**
  678. * Implements ucol_getLocaleByType().
  679. * Needed because the lifetime of the locale ID string must match that of the collator.
  680. * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
  681. * @internal
  682. */
  683. const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
  684. /**
  685. * Implements ucol_getContractionsAndExpansions().
  686. * Collects this collator's contraction strings and/or its expansions,
  687. * that is, the characters and strings that map to multiple collation elements.
  688. * When addPrefixes is TRUE, contractions that are expressed as
  689. * prefix/pre-context rules are included as well.
  690. * @param contractions the set to hold the contractions; used only if not NULL
  691. * @param expansions the set to hold the expansions; used only if not NULL
  692. * @param addPrefixes TRUE to include prefix contextual mappings
  693. * @param errorCode in/out ICU error code
  694. * @internal
  695. */
  696. void internalGetContractionsAndExpansions(
  697. UnicodeSet *contractions, UnicodeSet *expansions,
  698. UBool addPrefixes, UErrorCode &errorCode) const;
  699. /**
  700. * Adds to set all the contractions that start with the character c.
  701. * Ignores prefixes. Used by AlphabeticIndex. NOTE(review): errorCode is presumably the usual in/out ICU error code - confirm.
  702. * @internal
  703. */
  704. void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
  705. /**
  706. * Shared implementation behind the from-rule constructors and ucol_openRules().
  707. * @internal
  708. */
  709. void internalBuildTailoring(
  710. const UnicodeString &rules, // the collation rules to build from
  711. int32_t strength, // NOTE(review): presumably a collation strength value - confirm which values are accepted
  712. UColAttributeValue decompositionMode, // NOTE(review): presumably the decomposition/normalization setting - confirm
  713. UParseError *outParseError, UnicodeString *outReason, // output pointers by their names; NOTE(review): confirm whether NULL is accepted
  714. UErrorCode &errorCode); // ICU error code
  715. /** Converts a C API UCollator pointer into a RuleBasedCollator pointer; the dynamic_cast yields NULL when the object is not a RuleBasedCollator. @internal */
  716. static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
  717. return dynamic_cast<RuleBasedCollator *>(Collator::fromUCollator(uc));
  718. }
  719. /** Const overload: converts a const UCollator pointer into a const RuleBasedCollator pointer; the dynamic_cast yields NULL when the object is not a RuleBasedCollator. @internal */
  720. static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
  721. return dynamic_cast<const RuleBasedCollator *>(Collator::fromUCollator(uc));
  722. }
  723. /**
  724. * Appends the CEs (collation elements) for the string str to the vector ces.
  725. * @internal for tests & tools
  726. */
  727. void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
  728. #endif // U_HIDE_INTERNAL_API
  729. protected:
  730. /**
  731. * Used internally by registration to define the requested and valid locales; the actual locale is passed in as well.
  732. * @param requestedLocale the locale that was requested
  733. * @param validLocale the valid locale
  734. * @param actualLocale the actual locale
  735. * @internal
  736. */
  737. virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale);
  738. private:
  739. friend class CollationElementIterator;
  740. friend class Collator;
  741. RuleBasedCollator(const CollationCacheEntry *entry); // private constructor, reachable by the friend classes above; NOTE(review): presumably wraps an existing cache entry - confirm the reference counting in the .cpp
  742. /**
  743. * Attribute identifiers that are relevant for short definition strings
  744. * (e.g., ucol_getShortDefinitionString()) in addition to the UColAttribute values.
  745. * The numbering starts at UCOL_ATTRIBUTE_COUNT, which effectively extends UColAttribute.
  746. */
  747. enum Attributes {
  748. ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT, // first identifier of the extension
  749. ATTR_LIMIT = ATTR_VARIABLE_TOP + 1 // one past the last identifier; same value the implicit increment would give
  750. };
  751. void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode); // NOTE(review): "adopt" presumably means this collator takes over ownership of t - confirm in the .cpp
  752. // Compares two UChar strings. Both lengths must be <0 or else both must be >=0; mixing the two is not allowed. NOTE(review): <0 presumably means NUL-terminated - confirm.
  753. UCollationResult doCompare(const UChar *left, int32_t leftLength,
  754. const UChar *right, int32_t rightLength,
  755. UErrorCode &errorCode) const;
  756. UCollationResult doCompare(const uint8_t *left, int32_t leftLength, // byte-string overload; NOTE(review): presumably UTF-8 input with the same length precondition as the UChar overload - confirm
  757. const uint8_t *right, int32_t rightLength,
  758. UErrorCode &errorCode) const;
  759. void writeSortKey(const UChar *s, int32_t length, // NOTE(review): presumably writes the sort key for s into sink - confirm how length<0 is treated
  760. SortKeyByteSink &sink, UErrorCode &errorCode) const;
  761. void writeIdenticalLevel(const UChar *s, const UChar *limit, // NOTE(review): presumably writes the identical-level bytes for the range from s up to limit into sink - confirm
  762. SortKeyByteSink &sink, UErrorCode &errorCode) const;
  763. const CollationSettings &getDefaultSettings() const; // NOTE(review): presumably the default settings as opposed to the current `settings` member - confirm where they come from
  764. void setAttributeDefault(int32_t attribute) { // back to default: clears this attribute's bit in explicitlySetAttributes
  765. explicitlySetAttributes = explicitlySetAttributes & ~(static_cast<uint32_t>(1) << attribute);
  766. }
  767. void setAttributeExplicitly(int32_t attribute) { // records an explicit setting: sets this attribute's bit in explicitlySetAttributes
  768. explicitlySetAttributes = explicitlySetAttributes | (static_cast<uint32_t>(1) << attribute);
  769. }
  770. UBool attributeHasBeenSetExplicitly(int32_t attribute) const { // TRUE if this attribute's bit is set in explicitlySetAttributes
  771. // Unchecked precondition: 0 <= attribute && attribute < ATTR_LIMIT.
  772. return static_cast<UBool>(((explicitlySetAttributes >> attribute) & static_cast<uint32_t>(1)) != 0);
  773. }
  774. /**
  775. * Tests whether the character c is "unsafe" for use as a collation starting point.
  776. *
  777. * @param c code point or code unit
  778. * @return TRUE if c is unsafe
  779. * @see CollationElementIterator::setOffset()
  780. */
  781. UBool isUnsafe(UChar32 c) const;
  782. static void computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode); // static: operates on the tailoring t that is passed in, not on a collator instance
  783. UBool initMaxExpansions(UErrorCode &errorCode) const; // NOTE(review): presumably triggers computeMaxExpansions() on demand and returns whether that succeeded - confirm
  784. void setFastLatinOptions(CollationSettings &ownedSettings) const; // takes ownedSettings by non-const reference while the collator itself stays const; NOTE(review): which options get set is not visible here
  785. const CollationData *data; // NOTE(review): ownership is not visible here - presumably an alias into the tailoring; confirm
  786. const CollationSettings *settings; // reference-counted
  787. const CollationTailoring *tailoring; // alias of cacheEntry->tailoring
  788. const CollationCacheEntry *cacheEntry; // reference-counted
  789. Locale validLocale; // stored by value
  790. uint32_t explicitlySetAttributes; // bit set: bit N is 1 while attribute N is marked as explicitly set (see setAttributeExplicitly() and setAttributeDefault())
  791. UBool actualLocaleIsSameAsValid; // NOTE(review): by its name, records whether the actual locale equals validLocale - confirm where it is set
  792. };
  793. U_NAMESPACE_END
  794. #endif // !UCONFIG_NO_COLLATION
  795. #endif // TBLCOLL_H