normalizer2.h 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658
  1. /*
  2. *******************************************************************************
  3. *
  4. * Copyright (C) 2009-2013, International Business Machines
  5. * Corporation and others. All Rights Reserved.
  6. *
  7. *******************************************************************************
  8. * file name: normalizer2.h
  9. * encoding: US-ASCII
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2009nov22
  14. * created by: Markus W. Scherer
  15. */
  16. #ifndef __NORMALIZER2_H__
  17. #define __NORMALIZER2_H__
  18. /**
  19. * \file
  20. * \brief C++ API: New API for Unicode Normalization.
  21. */
  22. #include "unicode/utypes.h"
  23. #if !UCONFIG_NO_NORMALIZATION
  24. #include "unicode/uniset.h"
  25. #include "unicode/unistr.h"
  26. #include "unicode/unorm2.h"
  27. U_NAMESPACE_BEGIN
  28. /**
  29. * Unicode normalization functionality for standard Unicode normalization or
  30. * for using custom mapping tables.
  31. * All instances of this class are unmodifiable/immutable.
  32. * Instances returned by getInstance() are singletons that must not be deleted by the caller.
  33. * The Normalizer2 class is not intended for public subclassing.
  34. *
  35. * The primary functions are to produce a normalized string and to detect whether
  36. * a string is already normalized.
  37. * The most commonly used normalization forms are those defined in
  38. * http://www.unicode.org/unicode/reports/tr15/
  39. * However, this API supports additional normalization forms for specialized purposes.
  40. * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
  41. * and can be used in implementations of UTS #46.
  42. *
  43. * Not only are the standard compose and decompose modes supplied,
  44. * but additional modes are provided as documented in the UNormalization2Mode enum.
  45. *
  46. * Some of the functions in this class identify normalization boundaries.
  47. * At a normalization boundary, the portions of the string
  48. * before it and starting from it do not interact and can be handled independently.
  49. *
  50. * The spanQuickCheckYes() stops at a normalization boundary.
  51. * When the goal is a normalized string, then the text before the boundary
  52. * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  53. *
  54. * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  55. * a character is guaranteed to be at a normalization boundary,
  56. * regardless of context.
  57. * This is used for moving from one normalization boundary to the next
  58. * or preceding boundary, and for performing iterative normalization.
  59. *
  60. * Iterative normalization is useful when only a small portion of a
  61. * longer string needs to be processed.
  62. * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  63. * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  64. * (to process only the substring for which sort key bytes are computed).
  65. *
  66. * The set of normalization boundaries returned by these functions may not be
  67. * complete: There may be more boundaries that could be returned.
  68. * Different functions may return different boundaries.
  69. * @stable ICU 4.4
  70. */
  71. class U_COMMON_API Normalizer2 : public UObject {
  72. public:
  73. /**
  74. * Destructor.
  75. * @stable ICU 4.4
  76. */
  77. ~Normalizer2();
  78. /**
  79. * Returns a Normalizer2 instance for Unicode NFC normalization.
  80. * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
  81. * Returns an unmodifiable singleton instance. Do not delete it.
  82. * @param errorCode Standard ICU error code. Its input value must
  83. * pass the U_SUCCESS() test, or else the function returns
  84. * immediately. Check for U_FAILURE() on output or use with
  85. * function chaining. (See User Guide for details.)
  86. * @return the requested Normalizer2, if successful
  87. * @stable ICU 49
  88. */
  89. static const Normalizer2 *
  90. getNFCInstance(UErrorCode &errorCode);
  91. /**
  92. * Returns a Normalizer2 instance for Unicode NFD normalization.
  93. * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
  94. * Returns an unmodifiable singleton instance. Do not delete it.
  95. * @param errorCode Standard ICU error code. Its input value must
  96. * pass the U_SUCCESS() test, or else the function returns
  97. * immediately. Check for U_FAILURE() on output or use with
  98. * function chaining. (See User Guide for details.)
  99. * @return the requested Normalizer2, if successful
  100. * @stable ICU 49
  101. */
  102. static const Normalizer2 *
  103. getNFDInstance(UErrorCode &errorCode);
  104. /**
  105. * Returns a Normalizer2 instance for Unicode NFKC normalization.
  106. * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
  107. * Returns an unmodifiable singleton instance. Do not delete it.
  108. * @param errorCode Standard ICU error code. Its input value must
  109. * pass the U_SUCCESS() test, or else the function returns
  110. * immediately. Check for U_FAILURE() on output or use with
  111. * function chaining. (See User Guide for details.)
  112. * @return the requested Normalizer2, if successful
  113. * @stable ICU 49
  114. */
  115. static const Normalizer2 *
  116. getNFKCInstance(UErrorCode &errorCode);
  117. /**
  118. * Returns a Normalizer2 instance for Unicode NFKD normalization.
  119. * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
  120. * Returns an unmodifiable singleton instance. Do not delete it.
  121. * @param errorCode Standard ICU error code. Its input value must
  122. * pass the U_SUCCESS() test, or else the function returns
  123. * immediately. Check for U_FAILURE() on output or use with
  124. * function chaining. (See User Guide for details.)
  125. * @return the requested Normalizer2, if successful
  126. * @stable ICU 49
  127. */
  128. static const Normalizer2 *
  129. getNFKDInstance(UErrorCode &errorCode);
  130. /**
  131. * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
  132. * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
  133. * Returns an unmodifiable singleton instance. Do not delete it.
  134. * @param errorCode Standard ICU error code. Its input value must
  135. * pass the U_SUCCESS() test, or else the function returns
  136. * immediately. Check for U_FAILURE() on output or use with
  137. * function chaining. (See User Guide for details.)
  138. * @return the requested Normalizer2, if successful
  139. * @stable ICU 49
  140. */
  141. static const Normalizer2 *
  142. getNFKCCasefoldInstance(UErrorCode &errorCode);
  143. /**
  144. * Returns a Normalizer2 instance which uses the specified data file
  145. * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
  146. * and which composes or decomposes text according to the specified mode.
  147. * Returns an unmodifiable singleton instance. Do not delete it.
  148. *
  149. * Use packageName=NULL for data files that are part of ICU's own data.
  150. * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
  151. * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
  152. * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
  153. *
  154. * @param packageName NULL for ICU built-in data, otherwise application data package name
  155. * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
  156. * @param mode normalization mode (compose or decompose etc.)
  157. * @param errorCode Standard ICU error code. Its input value must
  158. * pass the U_SUCCESS() test, or else the function returns
  159. * immediately. Check for U_FAILURE() on output or use with
  160. * function chaining. (See User Guide for details.)
  161. * @return the requested Normalizer2, if successful
  162. * @stable ICU 4.4
  163. */
  164. static const Normalizer2 *
  165. getInstance(const char *packageName,
  166. const char *name,
  167. UNormalization2Mode mode,
  168. UErrorCode &errorCode);
  169. /**
  170. * Returns the normalized form of the source string as a new UnicodeString.
  171. * @param src source string
  172. * @param errorCode Standard ICU error code. Its input value must
  173. * pass the U_SUCCESS() test, or else the function returns
  174. * immediately. Check for U_FAILURE() on output or use with
  175. * function chaining. (See User Guide for details.)
  176. * @return normalized src
  177. * @stable ICU 4.4
  178. */
  179. UnicodeString
  180. normalize(const UnicodeString &src, UErrorCode &errorCode) const {
  181. UnicodeString result;
  182. normalize(src, result, errorCode);  // delegates to the virtual (src, dest, errorCode) overload
  183. return result;  // returned by value; the caller owns the copy
  184. }
  185. /**
  186. * Writes the normalized form of the source string to the destination string
  187. * (replacing its contents) and returns the destination string.
  188. * The source and destination strings must be different objects.
  189. * @param src source string
  190. * @param dest destination string; its contents are replaced with normalized src
  191. * @param errorCode Standard ICU error code. Its input value must
  192. * pass the U_SUCCESS() test, or else the function returns
  193. * immediately. Check for U_FAILURE() on output or use with
  194. * function chaining. (See User Guide for details.)
  195. * @return dest
  196. * @stable ICU 4.4
  197. */
  198. virtual UnicodeString &
  199. normalize(const UnicodeString &src,
  200. UnicodeString &dest,
  201. UErrorCode &errorCode) const = 0;
  202. /**
  203. * Appends the normalized form of the second string to the first string
  204. * (merging them at the boundary) and returns the first string.
  205. * The result is normalized if the first string was normalized.
  206. * The first and second strings must be different objects.
  207. * @param first string, should be normalized
  208. * @param second string, will be normalized
  209. * @param errorCode Standard ICU error code. Its input value must
  210. * pass the U_SUCCESS() test, or else the function returns
  211. * immediately. Check for U_FAILURE() on output or use with
  212. * function chaining. (See User Guide for details.)
  213. * @return first
  214. * @stable ICU 4.4
  215. */
  216. virtual UnicodeString &
  217. normalizeSecondAndAppend(UnicodeString &first,
  218. const UnicodeString &second,
  219. UErrorCode &errorCode) const = 0;
  220. /**
  221. * Appends the second string to the first string
  222. * (merging them at the boundary) and returns the first string.
  223. * The result is normalized if both the strings were normalized.
  224. * The first and second strings must be different objects.
  225. * @param first string, should be normalized
  226. * @param second string, should be normalized
  227. * @param errorCode Standard ICU error code. Its input value must
  228. * pass the U_SUCCESS() test, or else the function returns
  229. * immediately. Check for U_FAILURE() on output or use with
  230. * function chaining. (See User Guide for details.)
  231. * @return first
  232. * @stable ICU 4.4
  233. */
  234. virtual UnicodeString &
  235. append(UnicodeString &first,
  236. const UnicodeString &second,
  237. UErrorCode &errorCode) const = 0;
  238. /**
  239. * Gets the decomposition mapping of c.
  240. * Roughly equivalent to normalizing the string form of c
  241. * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, except that this function
  242. * returns FALSE and does not write a string
  243. * if c does not have a decomposition mapping in this instance's data.
  244. * This function is independent of the mode of the Normalizer2.
  245. * @param c code point
  246. * @param decomposition String object which will be set to c's
  247. * decomposition mapping, if there is one.
  248. * @return TRUE if c has a decomposition, otherwise FALSE
  249. * @stable ICU 4.6
  250. */
  251. virtual UBool
  252. getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
  253. /**
  254. * Gets the raw decomposition mapping of c.
  255. *
  256. * This is similar to the getDecomposition() method but returns the
  257. * raw decomposition mapping as specified in UnicodeData.txt or
  258. * (for custom data) in the mapping files processed by the gennorm2 tool.
  259. * By contrast, getDecomposition() returns the processed,
  260. * recursively-decomposed version of this mapping.
  261. *
  262. * When used on a standard NFKC Normalizer2 instance,
  263. * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
  264. *
  265. * When used on a standard NFC Normalizer2 instance,
  266. * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
  267. * in this case, the result contains either one or two code points (=1..4 UChars).
  268. *
  269. * This function is independent of the mode of the Normalizer2.
  270. * The default implementation returns FALSE.
  271. * @param c code point
  272. * @param decomposition String object which will be set to c's
  273. * raw decomposition mapping, if there is one.
  274. * @return TRUE if c has a decomposition, otherwise FALSE
  275. * @stable ICU 49
  276. */
  277. virtual UBool
  278. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
  279. /**
  280. * Performs pairwise composition of a & b and returns the composite if there is one.
  281. *
  282. * Returns a composite code point c only if c has a two-way mapping to a+b.
  283. * In standard Unicode normalization, this means that
  284. * c has a canonical decomposition to a+b
  285. * and c does not have the Full_Composition_Exclusion property.
  286. *
  287. * This function is independent of the mode of the Normalizer2.
  288. * The default implementation returns a negative value.
  289. * @param a A (normalization starter) code point.
  290. * @param b Another code point.
  291. * @return The non-negative composite code point if there is one; otherwise a negative value.
  292. * @stable ICU 49
  293. */
  294. virtual UChar32
  295. composePair(UChar32 a, UChar32 b) const;
  296. /**
  297. * Gets the combining class of c.
  298. * The default implementation returns 0
  299. * but all standard implementations return the Unicode Canonical_Combining_Class value.
  300. * @param c code point
  301. * @return c's combining class
  302. * @stable ICU 49
  303. */
  304. virtual uint8_t
  305. getCombiningClass(UChar32 c) const;
  306. /**
  307. * Tests if the string is normalized.
  308. * Internally, in cases where the quickCheck() method would return "maybe"
  309. * (which is only possible for the two COMPOSE modes) this method
  310. * resolves to "yes" or "no" to provide a definitive result,
  311. * at the cost of doing more work in those cases.
  312. * @param s input string
  313. * @param errorCode Standard ICU error code. Its input value must
  314. * pass the U_SUCCESS() test, or else the function returns
  315. * immediately. Check for U_FAILURE() on output or use with
  316. * function chaining. (See User Guide for details.)
  317. * @return TRUE if s is normalized
  318. * @stable ICU 4.4
  319. */
  320. virtual UBool
  321. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  322. /**
  323. * Performs a quick check of whether the string is normalized.
  324. * For the two COMPOSE modes, the result could be "maybe" in cases that
  325. * would take a little more work to resolve definitively.
  326. * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
  327. * combination of quick check + normalization, to avoid
  328. * re-checking the "yes" prefix.
  329. * @param s input string
  330. * @param errorCode Standard ICU error code. Its input value must
  331. * pass the U_SUCCESS() test, or else the function returns
  332. * immediately. Check for U_FAILURE() on output or use with
  333. * function chaining. (See User Guide for details.)
  334. * @return UNormalizationCheckResult ("yes", "no", or, for the COMPOSE modes, "maybe")
  335. * @stable ICU 4.4
  336. */
  337. virtual UNormalizationCheckResult
  338. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  339. /**
  340. * Returns the end of the normalized substring of the input string.
  341. * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
  342. * the substring <code>UnicodeString(s, 0, end)</code>
  343. * will pass the quick check with a "yes" result.
  344. *
  345. * The returned end index is usually one or more characters before the
  346. * "no" or "maybe" character: The end index is at a normalization boundary.
  347. * (See the class documentation for more about normalization boundaries.)
  348. *
  349. * When the goal is a normalized string and most input strings are expected
  350. * to be normalized already, then call this method,
  351. * and if it returns a prefix shorter than the input string,
  352. * copy that prefix and use normalizeSecondAndAppend() for the remainder.
  353. * @param s input string
  354. * @param errorCode Standard ICU error code. Its input value must
  355. * pass the U_SUCCESS() test, or else the function returns
  356. * immediately. Check for U_FAILURE() on output or use with
  357. * function chaining. (See User Guide for details.)
  358. * @return "yes" span end index
  359. * @stable ICU 4.4
  360. */
  361. virtual int32_t
  362. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  363. /**
  364. * Tests if the character always has a normalization boundary before it,
  365. * regardless of context.
  366. * If true, then the character does not normalization-interact with
  367. * preceding characters.
  368. * In other words, a string containing this character can be normalized
  369. * by processing portions before this character and starting from this
  370. * character independently.
  371. * This is used for iterative normalization. See the class documentation for details.
  372. * @param c character to test
  373. * @return TRUE if c has a normalization boundary before it
  374. * @stable ICU 4.4
  375. */
  376. virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
  377. /**
  378. * Tests if the character always has a normalization boundary after it,
  379. * regardless of context.
  380. * If true, then the character does not normalization-interact with
  381. * following characters.
  382. * In other words, a string containing this character can be normalized
  383. * by processing portions up to this character and after this
  384. * character independently.
  385. * This is used for iterative normalization. See the class documentation for details.
  386. * Note that this operation may be significantly slower than hasBoundaryBefore().
  387. * @param c character to test
  388. * @return TRUE if c has a normalization boundary after it
  389. * @stable ICU 4.4
  390. */
  391. virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
  392. /**
  393. * Tests if the character is normalization-inert.
  394. * If true, then the character does not change, nor normalization-interact with
  395. * preceding or following characters.
  396. * In other words, a string containing this character can be normalized
  397. * by processing portions before this character and after this
  398. * character independently.
  399. * This is used for iterative normalization. See the class documentation for details.
  400. * Note that this operation may be significantly slower than hasBoundaryBefore().
  401. * @param c character to test
  402. * @return TRUE if c is normalization-inert
  403. * @stable ICU 4.4
  404. */
  405. virtual UBool isInert(UChar32 c) const = 0;
  406. };
  407. /**
  408. * Normalization filtered by a UnicodeSet.
  409. * Normalizes portions of the text contained in the filter set and leaves
  410. * portions not contained in the filter set unchanged.
  411. * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
  412. * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
  413. * This class implements all of (and only) the Normalizer2 API.
  414. * An instance of this class is unmodifiable/immutable but is constructed and
  415. * must be destructed by the owner.
  416. * @stable ICU 4.4
  417. */
  418. class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
  419. public:
  420. /**
  421. * Constructs a filtered normalizer wrapping any Normalizer2 instance
  422. * and a filter set.
  423. * Both are aliased and must not be modified or deleted while this object
  424. * is used.
  425. * The filter set should be frozen; otherwise the performance will suffer greatly.
  426. * @param n2 wrapped Normalizer2 instance
  427. * @param filterSet UnicodeSet which determines the characters to be normalized
  428. * @stable ICU 4.4
  429. */
  430. FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
  431. norm2(n2), set(filterSet) {}  // binds the reference members only; nothing is copied
  432. /**
  433. * Destructor.
  434. * @stable ICU 4.4
  435. */
  436. ~FilteredNormalizer2();
  437. /**
  438. * Writes the normalized form of the source string to the destination string
  439. * (replacing its contents) and returns the destination string.
  440. * The source and destination strings must be different objects.
  441. * @param src source string
  442. * @param dest destination string; its contents are replaced with normalized src
  443. * @param errorCode Standard ICU error code. Its input value must
  444. * pass the U_SUCCESS() test, or else the function returns
  445. * immediately. Check for U_FAILURE() on output or use with
  446. * function chaining. (See User Guide for details.)
  447. * @return dest
  448. * @stable ICU 4.4
  449. */
  450. virtual UnicodeString &
  451. normalize(const UnicodeString &src,
  452. UnicodeString &dest,
  453. UErrorCode &errorCode) const;
  454. /**
  455. * Appends the normalized form of the second string to the first string
  456. * (merging them at the boundary) and returns the first string.
  457. * The result is normalized if the first string was normalized.
  458. * The first and second strings must be different objects.
  459. * @param first string, should be normalized
  460. * @param second string, will be normalized
  461. * @param errorCode Standard ICU error code. Its input value must
  462. * pass the U_SUCCESS() test, or else the function returns
  463. * immediately. Check for U_FAILURE() on output or use with
  464. * function chaining. (See User Guide for details.)
  465. * @return first
  466. * @stable ICU 4.4
  467. */
  468. virtual UnicodeString &
  469. normalizeSecondAndAppend(UnicodeString &first,
  470. const UnicodeString &second,
  471. UErrorCode &errorCode) const;
  472. /**
  473. * Appends the second string to the first string
  474. * (merging them at the boundary) and returns the first string.
  475. * The result is normalized if both the strings were normalized.
  476. * The first and second strings must be different objects.
  477. * @param first string, should be normalized
  478. * @param second string, should be normalized
  479. * @param errorCode Standard ICU error code. Its input value must
  480. * pass the U_SUCCESS() test, or else the function returns
  481. * immediately. Check for U_FAILURE() on output or use with
  482. * function chaining. (See User Guide for details.)
  483. * @return first
  484. * @stable ICU 4.4
  485. */
  486. virtual UnicodeString &
  487. append(UnicodeString &first,
  488. const UnicodeString &second,
  489. UErrorCode &errorCode) const;
  490. /**
  491. * Gets the decomposition mapping of c.
  492. * For details see the base class documentation.
  493. *
  494. * This function is independent of the mode of the Normalizer2.
  495. * @param c code point
  496. * @param decomposition String object which will be set to c's
  497. * decomposition mapping, if there is one.
  498. * @return TRUE if c has a decomposition, otherwise FALSE
  499. * @stable ICU 4.6
  500. */
  501. virtual UBool
  502. getDecomposition(UChar32 c, UnicodeString &decomposition) const;
  503. /**
  504. * Gets the raw decomposition mapping of c.
  505. * For details see the base class documentation.
  506. *
  507. * This function is independent of the mode of the Normalizer2.
  508. * @param c code point
  509. * @param decomposition String object which will be set to c's
  510. * raw decomposition mapping, if there is one.
  511. * @return TRUE if c has a decomposition, otherwise FALSE
  512. * @stable ICU 49
  513. */
  514. virtual UBool
  515. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
  516. /**
  517. * Performs pairwise composition of a & b and returns the composite if there is one.
  518. * For details see the base class documentation.
  519. *
  520. * This function is independent of the mode of the Normalizer2.
  521. * @param a A (normalization starter) code point.
  522. * @param b Another code point.
  523. * @return The non-negative composite code point if there is one; otherwise a negative value.
  524. * @stable ICU 49
  525. */
  526. virtual UChar32
  527. composePair(UChar32 a, UChar32 b) const;
  528. /**
  529. * Gets the combining class of c.
  530. * For details see the Normalizer2 base class documentation.
  531. *
  532. * @param c code point
  533. * @return c's combining class
  534. * @stable ICU 49
  535. */
  536. virtual uint8_t
  537. getCombiningClass(UChar32 c) const;
  538. /**
  539. * Tests if the string is normalized.
  540. * For details see the Normalizer2 base class documentation.
  541. * @param s input string
  542. * @param errorCode Standard ICU error code. Its input value must
  543. * pass the U_SUCCESS() test, or else the function returns
  544. * immediately. Check for U_FAILURE() on output or use with
  545. * function chaining. (See User Guide for details.)
  546. * @return TRUE if s is normalized
  547. * @stable ICU 4.4
  548. */
  549. virtual UBool
  550. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
  551. /**
  552. * Performs a quick check of whether the string is normalized.
  553. * For details see the Normalizer2 base class documentation.
  554. * @param s input string
  555. * @param errorCode Standard ICU error code. Its input value must
  556. * pass the U_SUCCESS() test, or else the function returns
  557. * immediately. Check for U_FAILURE() on output or use with
  558. * function chaining. (See User Guide for details.)
  559. * @return UNormalizationCheckResult
  560. * @stable ICU 4.4
  561. */
  562. virtual UNormalizationCheckResult
  563. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
  564. /**
  565. * Returns the end of the normalized substring of the input string.
  566. * For details see the Normalizer2 base class documentation.
  567. * @param s input string
  568. * @param errorCode Standard ICU error code. Its input value must
  569. * pass the U_SUCCESS() test, or else the function returns
  570. * immediately. Check for U_FAILURE() on output or use with
  571. * function chaining. (See User Guide for details.)
  572. * @return "yes" span end index
  573. * @stable ICU 4.4
  574. */
  575. virtual int32_t
  576. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
  577. /**
  578. * Tests if the character always has a normalization boundary before it,
  579. * regardless of context.
  580. * For details see the Normalizer2 base class documentation.
  581. * @param c character to test
  582. * @return TRUE if c has a normalization boundary before it
  583. * @stable ICU 4.4
  584. */
  585. virtual UBool hasBoundaryBefore(UChar32 c) const;
  586. /**
  587. * Tests if the character always has a normalization boundary after it,
  588. * regardless of context.
  589. * For details see the Normalizer2 base class documentation.
  590. * @param c character to test
  591. * @return TRUE if c has a normalization boundary after it
  592. * @stable ICU 4.4
  593. */
  594. virtual UBool hasBoundaryAfter(UChar32 c) const;
  595. /**
  596. * Tests if the character is normalization-inert.
  597. * For details see the Normalizer2 base class documentation.
  598. * @param c character to test
  599. * @return TRUE if c is normalization-inert
  600. * @stable ICU 4.4
  601. */
  602. virtual UBool isInert(UChar32 c) const;
  603. private:
  604. UnicodeString &
  605. normalize(const UnicodeString &src,
  606. UnicodeString &dest,
  607. USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition used for the first filter-set span; confirm in the .cpp
  608. UErrorCode &errorCode) const;
  609. UnicodeString &
  610. normalizeSecondAndAppend(UnicodeString &first,
  611. const UnicodeString &second,
  612. UBool doNormalize,  // NOTE(review): presumably selects normalizeSecondAndAppend() vs. append() behavior; confirm in the .cpp
  613. UErrorCode &errorCode) const;
  614. const Normalizer2 &norm2;  // wrapped normalizer; aliased, not owned (see constructor)
  615. const UnicodeSet &set;  // filter set; aliased, not owned (see constructor)
  616. };
  617. U_NAMESPACE_END
  618. #endif // !UCONFIG_NO_NORMALIZATION
  619. #endif // __NORMALIZER2_H__