ustring.h 73 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700
  1. /*
  2. **********************************************************************
  3. * Copyright (C) 1998-2014, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. **********************************************************************
  6. *
  7. * File ustring.h
  8. *
  9. * Modification History:
  10. *
  11. * Date Name Description
  12. * 12/07/98 bertrand Creation.
  13. ******************************************************************************
  14. */
  15. #ifndef USTRING_H
  16. #define USTRING_H
  17. #include "unicode/utypes.h"
  18. #include "unicode/putil.h"
  19. #include "unicode/uiter.h"
  20. /**
  21. * \def UBRK_TYPEDEF_UBREAK_ITERATOR
  22. * @internal
  23. */
  24. #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  25. # define UBRK_TYPEDEF_UBREAK_ITERATOR
  26. /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
  27. typedef struct UBreakIterator UBreakIterator;
  28. #endif
  29. /**
  30. * \file
  31. * \brief C API: Unicode string handling functions
  32. *
  33. * These C API functions provide general Unicode string handling.
  34. *
  35. * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
  36. * functions. (For example, they do not check for bad arguments like NULL string pointers.)
  37. * In some cases, only the thread-safe variant of such a function is implemented here
  38. * (see u_strtok_r()).
  39. *
  40. * Other functions provide more Unicode-specific functionality like locale-specific
  41. * upper/lower-casing and string comparison in code point order.
  42. *
  43. * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
  44. * UTF-16 encodes each Unicode code point with either one or two UChar code units.
  45. * (This is the default form of Unicode, and a forward-compatible extension of the original,
  46. * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
  47. * in 1996.)
  48. *
  49. * Some APIs accept a 32-bit UChar32 value for a single code point.
  50. *
  51. * ICU also handles 16-bit Unicode text with unpaired surrogates.
  52. * Such text is not well-formed UTF-16.
  53. * Code-point-related functions treat unpaired surrogates as surrogate code points,
  54. * i.e., as separate units.
  55. *
  56. * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
  57. * it is much more efficient even for random access because the code unit values
  58. * for single-unit characters vs. lead units vs. trail units are completely disjoint.
  59. * This means that it is easy to determine character (code point) boundaries from
  60. * random offsets in the string.
  61. *
  62. * Unicode (UTF-16) string processing is optimized for the single-unit case.
  63. * Although it is important to support supplementary characters
  64. * (which use pairs of lead/trail code units called "surrogates"),
  65. * their occurrence is rare. Almost all characters in modern use require only
  66. * a single UChar code unit (i.e., their code point values are <=0xffff).
  67. *
  68. * For more details see the User Guide Strings chapter (http://icu-project.org/userguide/strings.html).
  69. * For a discussion of the handling of unpaired surrogates see also
  70. * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
  71. */
  72. /**
  73. * \defgroup ustring_ustrlen String Length
  74. * \ingroup ustring_strlen
  75. */
  76. /*@{*/
  77. /**
  78. * Determine the length of an array of UChar.
  79. *
  80. * @param s The array of UChars, NUL (U+0000) terminated.
  81. * @return The number of UChars in <code>s</code>, not counting the terminator.
  82. * @stable ICU 2.0
  83. */
  84. U_STABLE int32_t U_EXPORT2
  85. u_strlen(const UChar *s);
  86. /*@}*/
  87. /**
  88. * Count Unicode code points in the length UChar code units of the string.
  89. * A code point may occupy either one or two UChar code units.
  90. * Counting code points involves reading all code units.
  91. *
  92. * This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
  93. *
  94. * @param s The input string.
  95. * @param length The number of UChar code units to be checked, or -1 to count all
  96. * code points before the first NUL (U+0000).
  97. * @return The number of code points in the specified code units.
  98. * @stable ICU 2.0
  99. */
  100. U_STABLE int32_t U_EXPORT2
  101. u_countChar32(const UChar *s, int32_t length);
  102. /**
  103. * Check if the string contains more Unicode code points than a certain number.
  104. * This is more efficient than counting all code points in the entire string
  105. * and comparing that number with a threshold.
  106. * This function may not need to scan the string at all if the length is known
  107. * (not -1 for NUL-termination) and falls within a certain range, and
  108. * never needs to count more than 'number+1' code points.
  109. * Logically equivalent to (u_countChar32(s, length)>number).
  110. * A Unicode code point may occupy either one or two UChar code units.
  111. *
  112. * @param s The input string.
  113. * @param length The length of the string, or -1 if it is NUL-terminated.
  114. * @param number The number of code points in the string is compared against
  115. * the 'number' parameter.
  116. * @return Boolean value for whether the string contains more Unicode code points
  117. * than 'number'. Same as (u_countChar32(s, length)>number).
  118. * @stable ICU 2.4
  119. */
  120. U_STABLE UBool U_EXPORT2
  121. u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
  122. /**
  123. * Concatenate two ustrings. Appends a copy of <code>src</code>,
  124. * including the null terminator, to <code>dst</code>. The initial copied
  125. * character from <code>src</code> overwrites the null terminator in <code>dst</code>.
  126. *
  127. * @param dst The destination string.
  128. * @param src The source string.
  129. * @return A pointer to <code>dst</code>.
  130. * @stable ICU 2.0
  131. */
  132. U_STABLE UChar* U_EXPORT2
  133. u_strcat(UChar *dst,
  134. const UChar *src);
  135. /**
  136. * Concatenate two ustrings.
  137. * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
  138. * Adds a terminating NUL.
  139. * If src is too long, then only <code>n-1</code> characters will be copied
  140. * before the terminating NUL.
  141. * If <code>n&lt;=0</code> then dst is not modified.
  142. *
  143. * @param dst The destination string.
  144. * @param src The source string (can be NULL/invalid if n<=0).
  145. * @param n The maximum number of characters to append; no-op if <=0.
  146. * @return A pointer to <code>dst</code>.
  147. * @stable ICU 2.0
  148. */
  149. U_STABLE UChar* U_EXPORT2
  150. u_strncat(UChar *dst,
  151. const UChar *src,
  152. int32_t n);
  153. /**
  154. * Find the first occurrence of a substring in a string.
  155. * The substring is found at code point boundaries.
  156. * That means that if the substring begins with
  157. * a trail surrogate or ends with a lead surrogate,
  158. * then it is found only if these surrogates stand alone in the text.
  159. * Otherwise, the substring edge units would be matched against
  160. * halves of surrogate pairs.
  161. *
  162. * @param s The string to search (NUL-terminated).
  163. * @param substring The substring to find (NUL-terminated).
  164. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  165. * or <code>s</code> itself if the <code>substring</code> is empty,
  166. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  167. * @stable ICU 2.0
  168. *
  169. * @see u_strrstr
  170. * @see u_strFindFirst
  171. * @see u_strFindLast
  172. */
  173. U_STABLE UChar * U_EXPORT2
  174. u_strstr(const UChar *s, const UChar *substring);
  175. /**
  176. * Find the first occurrence of a substring in a string.
  177. * The substring is found at code point boundaries.
  178. * That means that if the substring begins with
  179. * a trail surrogate or ends with a lead surrogate,
  180. * then it is found only if these surrogates stand alone in the text.
  181. * Otherwise, the substring edge units would be matched against
  182. * halves of surrogate pairs.
  183. *
  184. * @param s The string to search.
  185. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  186. * @param substring The substring to find (must be NUL-terminated if subLength is -1).
  187. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  188. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  189. * or <code>s</code> itself if the <code>substring</code> is empty,
  190. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  191. * @stable ICU 2.4
  192. *
  193. * @see u_strstr
  194. * @see u_strFindLast
  195. */
  196. U_STABLE UChar * U_EXPORT2
  197. u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  198. /**
  199. * Find the first occurrence of a BMP code point in a string.
  200. * A surrogate code point is found only if its match in the text is not
  201. * part of a surrogate pair.
  202. * A NUL character is found at the string terminator.
  203. *
  204. * @param s The string to search (NUL-terminated).
  205. * @param c The BMP code point to find.
  206. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  207. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  208. * @stable ICU 2.0
  209. *
  210. * @see u_strchr32
  211. * @see u_memchr
  212. * @see u_strstr
  213. * @see u_strFindFirst
  214. */
  215. U_STABLE UChar * U_EXPORT2
  216. u_strchr(const UChar *s, UChar c);
  217. /**
  218. * Find the first occurrence of a code point in a string.
  219. * A surrogate code point is found only if its match in the text is not
  220. * part of a surrogate pair.
  221. * A NUL character is found at the string terminator.
  222. *
  223. * @param s The string to search (NUL-terminated).
  224. * @param c The code point to find.
  225. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  226. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  227. * @stable ICU 2.0
  228. *
  229. * @see u_strchr
  230. * @see u_memchr32
  231. * @see u_strstr
  232. * @see u_strFindFirst
  233. */
  234. U_STABLE UChar * U_EXPORT2
  235. u_strchr32(const UChar *s, UChar32 c);
  236. /**
  237. * Find the last occurrence of a substring in a string.
  238. * The substring is found at code point boundaries.
  239. * That means that if the substring begins with
  240. * a trail surrogate or ends with a lead surrogate,
  241. * then it is found only if these surrogates stand alone in the text.
  242. * Otherwise, the substring edge units would be matched against
  243. * halves of surrogate pairs.
  244. *
  245. * @param s The string to search (NUL-terminated).
  246. * @param substring The substring to find (NUL-terminated).
  247. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  248. * or <code>s</code> itself if the <code>substring</code> is empty,
  249. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  250. * @stable ICU 2.4
  251. *
  252. * @see u_strstr
  253. * @see u_strFindFirst
  254. * @see u_strFindLast
  255. */
  256. U_STABLE UChar * U_EXPORT2
  257. u_strrstr(const UChar *s, const UChar *substring);
  258. /**
  259. * Find the last occurrence of a substring in a string.
  260. * The substring is found at code point boundaries.
  261. * That means that if the substring begins with
  262. * a trail surrogate or ends with a lead surrogate,
  263. * then it is found only if these surrogates stand alone in the text.
  264. * Otherwise, the substring edge units would be matched against
  265. * halves of surrogate pairs.
  266. *
  267. * @param s The string to search.
  268. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  269. * @param substring The substring to find (must be NUL-terminated if subLength is -1).
  270. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  271. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  272. * or <code>s</code> itself if the <code>substring</code> is empty,
  273. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  274. * @stable ICU 2.4
  275. *
  276. * @see u_strstr
  277. * @see u_strFindFirst
  278. */
  279. U_STABLE UChar * U_EXPORT2
  280. u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  281. /**
  282. * Find the last occurrence of a BMP code point in a string.
  283. * A surrogate code point is found only if its match in the text is not
  284. * part of a surrogate pair.
  285. * A NUL character is found at the string terminator.
  286. *
  287. * @param s The string to search (NUL-terminated).
  288. * @param c The BMP code point to find.
  289. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  290. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  291. * @stable ICU 2.4
  292. *
  293. * @see u_strrchr32
  294. * @see u_memrchr
  295. * @see u_strrstr
  296. * @see u_strFindLast
  297. */
  298. U_STABLE UChar * U_EXPORT2
  299. u_strrchr(const UChar *s, UChar c);
  300. /**
  301. * Find the last occurrence of a code point in a string.
  302. * A surrogate code point is found only if its match in the text is not
  303. * part of a surrogate pair.
  304. * A NUL character is found at the string terminator.
  305. *
  306. * @param s The string to search (NUL-terminated).
  307. * @param c The code point to find.
  308. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  309. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  310. * @stable ICU 2.4
  311. *
  312. * @see u_strrchr
  313. * @see u_memrchr32
  314. * @see u_strrstr
  315. * @see u_strFindLast
  316. */
  317. U_STABLE UChar * U_EXPORT2
  318. u_strrchr32(const UChar *s, UChar32 c);
  319. /**
  320. * Locates the first occurrence in the string <code>string</code> of any of the characters
  321. * in the string <code>matchSet</code>.
  322. * Works just like C's strpbrk but with Unicode.
  323. *
  324. * @param string The string in which to search, NUL-terminated.
  325. * @param matchSet A NUL-terminated string defining a set of code points
  326. * for which to search in the text string.
  327. * @return A pointer to the character in <code>string</code> that matches one of the
  328. * characters in <code>matchSet</code>, or NULL if no such character is found.
  329. * @stable ICU 2.0
  330. */
  331. U_STABLE UChar * U_EXPORT2
  332. u_strpbrk(const UChar *string, const UChar *matchSet);
  333. /**
  334. * Returns the number of consecutive characters in <code>string</code>,
  335. * beginning with the first, that do not occur somewhere in <code>matchSet</code>.
  336. * Works just like C's strcspn but with Unicode.
  337. *
  338. * @param string The string in which to search, NUL-terminated.
  339. * @param matchSet A NUL-terminated string defining a set of code points
  340. * for which to search in the text string.
  341. * @return The number of initial characters in <code>string</code> that do not
  342. * occur in <code>matchSet</code>.
  343. * @see u_strspn
  344. * @stable ICU 2.0
  345. */
  346. U_STABLE int32_t U_EXPORT2
  347. u_strcspn(const UChar *string, const UChar *matchSet);
  348. /**
  349. * Returns the number of consecutive characters in <code>string</code>,
  350. * beginning with the first, that occur somewhere in <code>matchSet</code>.
  351. * Works just like C's strspn but with Unicode.
  352. *
  353. * @param string The string in which to search, NUL-terminated.
  354. * @param matchSet A NUL-terminated string defining a set of code points
  355. * for which to search in the text string.
  356. * @return The number of initial characters in <code>string</code> that do
  357. * occur in <code>matchSet</code>.
  358. * @see u_strcspn
  359. * @stable ICU 2.0
  360. */
  361. U_STABLE int32_t U_EXPORT2
  362. u_strspn(const UChar *string, const UChar *matchSet);
  363. /**
  364. * The string tokenizer API allows an application to break a string into
  365. * tokens. Unlike strtok(), the tokenizer state (the current pointer within the
  366. * original string) is maintained in the caller's saveState. In the first call, the
  367. * argument src is a pointer to the string. In subsequent calls to
  368. * return successive tokens of that string, src must be specified as
  369. * NULL. The value saveState is set by this function to maintain the
  370. * function's position within the string, and on each subsequent call
  371. * you must give this argument the same variable. This function does
  372. * handle surrogate pairs. This function is similar to strtok_r()
  373. * from the POSIX Threads Extension (1003.1c-1995).
  374. *
  375. * @param src String containing token(s). This string will be modified.
  376. * After the first call to u_strtok_r(), this argument must
  377. * be NULL to get to the next token.
  378. * @param delim Set of delimiter characters (Unicode code points).
  379. * @param saveState The current pointer within the original string,
  380. * which is set by this function. The saveState
  381. * parameter should be the address of a local variable of type
  382. * UChar *. (i.e. define "UChar *myLocalSaveState" and use
  383. * &myLocalSaveState for this parameter).
  384. * @return A pointer to the next token found in src, or NULL
  385. * when there are no more tokens.
  386. * @stable ICU 2.0
  387. */
  388. U_STABLE UChar * U_EXPORT2
  389. u_strtok_r(UChar *src,
  390. const UChar *delim,
  391. UChar **saveState);
  392. /**
  393. * Compare two Unicode strings for bitwise equality (code unit order).
  394. *
  395. * @param s1 A string to compare.
  396. * @param s2 A string to compare.
  397. * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
  398. * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
  399. * value if <code>s1</code> is bitwise greater than <code>s2</code>.
  400. * @stable ICU 2.0
  401. */
  402. U_STABLE int32_t U_EXPORT2
  403. u_strcmp(const UChar *s1,
  404. const UChar *s2);
  405. /**
  406. * Compare two Unicode strings in code point order.
  407. * See u_strCompare for details.
  408. *
  409. * @param s1 A string to compare.
  410. * @param s2 A string to compare.
  411. * @return a negative/zero/positive integer corresponding to whether
  412. * the first string is less than/equal to/greater than the second one
  413. * in code point order
  414. * @stable ICU 2.0
  415. */
  416. U_STABLE int32_t U_EXPORT2
  417. u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
  418. /**
  419. * Compare two Unicode strings (binary order).
  420. *
  421. * The comparison can be done in code unit order or in code point order.
  422. * They differ only in UTF-16 when
  423. * comparing supplementary code points (U+10000..U+10ffff)
  424. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  425. * In code unit order, high BMP code points sort after supplementary code points
  426. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  427. *
  428. * This function works with strings of different explicitly specified lengths
  429. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  430. * NUL-terminated strings are possible with length arguments of -1.
  431. *
  432. * @param s1 First source string.
  433. * @param length1 Length of first source string, or -1 if NUL-terminated.
  434. *
  435. * @param s2 Second source string.
  436. * @param length2 Length of second source string, or -1 if NUL-terminated.
  437. *
  438. * @param codePointOrder Choose between code unit order (FALSE)
  439. * and code point order (TRUE).
  440. *
  441. * @return <0 or 0 or >0 as usual for string comparisons
  442. *
  443. * @stable ICU 2.2
  444. */
  445. U_STABLE int32_t U_EXPORT2
  446. u_strCompare(const UChar *s1, int32_t length1,
  447. const UChar *s2, int32_t length2,
  448. UBool codePointOrder);
  449. /**
  450. * Compare two Unicode strings (binary order)
  451. * as presented by UCharIterator objects.
  452. * Works otherwise just like u_strCompare().
  453. *
  454. * Both iterators are reset to their start positions.
  455. * When the function returns, it is undefined where the iterators
  456. * have stopped.
  457. *
  458. * @param iter1 First source string iterator.
  459. * @param iter2 Second source string iterator.
  460. * @param codePointOrder Choose between code unit order (FALSE)
  461. * and code point order (TRUE).
  462. *
  463. * @return <0 or 0 or >0 as usual for string comparisons
  464. *
  465. * @see u_strCompare
  466. *
  467. * @stable ICU 2.6
  468. */
  469. U_STABLE int32_t U_EXPORT2
  470. u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder);
  471. #ifndef U_COMPARE_CODE_POINT_ORDER
  472. /* see also unistr.h and unorm.h */
  473. /**
  474. * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
  475. * Compare strings in code point order instead of code unit order.
  476. * @stable ICU 2.2
  477. */
  478. #define U_COMPARE_CODE_POINT_ORDER 0x8000
  479. #endif
  480. /**
  481. * Compare two strings case-insensitively using full case folding.
  482. * This is equivalent to
  483. * u_strCompare(u_strFoldCase(s1, options),
  484. * u_strFoldCase(s2, options),
  485. * (options&U_COMPARE_CODE_POINT_ORDER)!=0).
  486. *
  487. * The comparison can be done in UTF-16 code unit order or in code point order.
  488. * They differ only when comparing supplementary code points (U+10000..U+10ffff)
  489. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  490. * In code unit order, high BMP code points sort after supplementary code points
  491. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  492. *
  493. * This function works with strings of different explicitly specified lengths
  494. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  495. * NUL-terminated strings are possible with length arguments of -1.
  496. *
  497. * @param s1 First source string.
  498. * @param length1 Length of first source string, or -1 if NUL-terminated.
  499. *
  500. * @param s2 Second source string.
  501. * @param length2 Length of second source string, or -1 if NUL-terminated.
  502. *
  503. * @param options A bit set of options:
  504. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  505. * Comparison in code unit order with default case folding.
  506. *
  507. * - U_COMPARE_CODE_POINT_ORDER
  508. * Set to choose code point order instead of code unit order
  509. * (see u_strCompare for details).
  510. *
  511. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  512. *
  513. * @param pErrorCode Must be a valid pointer to an error code value,
  514. * which must not indicate a failure before the function call.
  515. *
  516. * @return <0 or 0 or >0 as usual for string comparisons
  517. *
  518. * @stable ICU 2.2
  519. */
  520. U_STABLE int32_t U_EXPORT2
  521. u_strCaseCompare(const UChar *s1, int32_t length1,
  522. const UChar *s2, int32_t length2,
  523. uint32_t options,
  524. UErrorCode *pErrorCode);
  525. /**
  526. * Compare two ustrings for bitwise equality.
  527. * Compares at most <code>n</code> characters.
  528. *
  529. * @param ucs1 A string to compare (can be NULL/invalid if n<=0).
  530. * @param ucs2 A string to compare (can be NULL/invalid if n<=0).
  531. * @param n The maximum number of characters to compare; always returns 0 if n<=0.
  532. * @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
  533. * value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
  534. * value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
  535. * @stable ICU 2.0
  536. */
  537. U_STABLE int32_t U_EXPORT2
  538. u_strncmp(const UChar *ucs1,
  539. const UChar *ucs2,
  540. int32_t n);
  541. /**
  542. * Compare two Unicode strings in code point order.
  543. * This is different in UTF-16 from u_strncmp() if supplementary characters are present.
  544. * For details, see u_strCompare().
  545. *
  546. * @param s1 A string to compare.
  547. * @param s2 A string to compare.
  548. * @param n The maximum number of characters to compare.
  549. * @return a negative/zero/positive integer corresponding to whether
  550. * the first string is less than/equal to/greater than the second one
  551. * in code point order
  552. * @stable ICU 2.0
  553. */
  554. U_STABLE int32_t U_EXPORT2
  555. u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
  556. /**
  557. * Compare two strings case-insensitively using full case folding.
  558. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
  559. *
  560. * @param s1 A string to compare.
  561. * @param s2 A string to compare.
  562. * @param options A bit set of options:
  563. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  564. * Comparison in code unit order with default case folding.
  565. *
  566. * - U_COMPARE_CODE_POINT_ORDER
  567. * Set to choose code point order instead of code unit order
  568. * (see u_strCompare for details).
  569. *
  570. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  571. *
  572. * @return A negative, zero, or positive integer indicating the comparison result.
  573. * @stable ICU 2.0
  574. */
  575. U_STABLE int32_t U_EXPORT2
  576. u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
  577. /**
  578. * Compare two strings case-insensitively using full case folding.
  579. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
  580. * u_strFoldCase(s2, at most n, options)).
  581. *
  582. * @param s1 A string to compare.
  583. * @param s2 A string to compare.
  584. * @param n The maximum number of characters in each string to case-fold and then compare.
  585. * @param options A bit set of options:
  586. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  587. * Comparison in code unit order with default case folding.
  588. *
  589. * - U_COMPARE_CODE_POINT_ORDER
  590. * Set to choose code point order instead of code unit order
  591. * (see u_strCompare for details).
  592. *
  593. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  594. *
  595. * @return A negative, zero, or positive integer indicating the comparison result.
  596. * @stable ICU 2.0
  597. */
  598. U_STABLE int32_t U_EXPORT2
  599. u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
  600. /**
  601. * Compare two strings case-insensitively using full case folding.
  602. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options),
  603. * u_strFoldCase(s2, length, options)).
  604. *
  605. * @param s1 A string to compare.
  606. * @param s2 A string to compare.
  607. * @param length The number of characters in each string to case-fold and then compare.
  608. * @param options A bit set of options:
  609. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  610. * Comparison in code unit order with default case folding.
  611. *
  612. * - U_COMPARE_CODE_POINT_ORDER
  613. * Set to choose code point order instead of code unit order
  614. * (see u_strCompare for details).
  615. *
  616. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  617. *
  618. * @return A negative, zero, or positive integer indicating the comparison result.
  619. * @stable ICU 2.0
  620. */
  621. U_STABLE int32_t U_EXPORT2
  622. u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
  623. /**
  624. * Copy a ustring. Adds a null terminator.
  625. *
  626. * @param dst The destination string.
  627. * @param src The source string.
  628. * @return A pointer to <code>dst</code>.
  629. * @stable ICU 2.0
  630. */
  631. U_STABLE UChar* U_EXPORT2
  632. u_strcpy(UChar *dst,
  633. const UChar *src);
  634. /**
  635. * Copy a ustring.
  636. * Copies at most <code>n</code> characters. The result will be null terminated
  637. * if the length of <code>src</code> is less than <code>n</code>.
  638. *
  639. * @param dst The destination string.
  640. * @param src The source string (can be NULL/invalid if n<=0).
  641. * @param n The maximum number of characters to copy; no-op if <=0.
  642. * @return A pointer to <code>dst</code>.
  643. * @stable ICU 2.0
  644. */
  645. U_STABLE UChar* U_EXPORT2
  646. u_strncpy(UChar *dst,
  647. const UChar *src,
  648. int32_t n);
  649. #if !UCONFIG_NO_CONVERSION
  650. /**
  651. * Copy a byte string encoded in the default codepage to a ustring.
  652. * Adds a null terminator.
  653. * Performs a host byte to UChar conversion
  654. *
  655. * @param dst The destination string.
  656. * @param src The source string.
  657. * @return A pointer to <code>dst</code>.
  658. * @stable ICU 2.0
  659. */
  660. U_STABLE UChar* U_EXPORT2 u_uastrcpy(UChar *dst,
  661. const char *src );
  662. /**
  663. * Copy a byte string encoded in the default codepage to a ustring.
  664. * Copies at most <code>n</code> characters. The result will be null terminated
  665. * if the length of <code>src</code> is less than <code>n</code>.
  666. * Performs a host byte to UChar conversion
  667. *
  668. * @param dst The destination string.
  669. * @param src The source string.
  670. * @param n The maximum number of characters to copy.
  671. * @return A pointer to <code>dst</code>.
  672. * @stable ICU 2.0
  673. */
  674. U_STABLE UChar* U_EXPORT2 u_uastrncpy(UChar *dst,
  675. const char *src,
  676. int32_t n);
  677. /**
  678. * Copy ustring to a byte string encoded in the default codepage.
  679. * Adds a null terminator.
  680. * Performs a UChar to host byte conversion
  681. *
  682. * @param dst The destination string.
  683. * @param src The source string.
  684. * @return A pointer to <code>dst</code>.
  685. * @stable ICU 2.0
  686. */
  687. U_STABLE char* U_EXPORT2 u_austrcpy(char *dst,
  688. const UChar *src );
  689. /**
  690. * Copy ustring to a byte string encoded in the default codepage.
  691. * Copies at most <code>n</code> characters. The result will be null terminated
  692. * if the length of <code>src</code> is less than <code>n</code>.
  693. * Performs a UChar to host byte conversion
  694. *
  695. * @param dst The destination string.
  696. * @param src The source string.
  697. * @param n The maximum number of characters to copy.
  698. * @return A pointer to <code>dst</code>.
  699. * @stable ICU 2.0
  700. */
  701. U_STABLE char* U_EXPORT2 u_austrncpy(char *dst,
  702. const UChar *src,
  703. int32_t n );
  704. #endif
  705. /**
  706. * Synonym for memcpy(), but with UChars only.
  707. * @param dest The destination string
  708. * @param src The source string (can be NULL/invalid if count<=0)
  709. * @param count The number of characters to copy; no-op if <=0
  710. * @return A pointer to <code>dest</code>
  711. * @stable ICU 2.0
  712. */
  713. U_STABLE UChar* U_EXPORT2
  714. u_memcpy(UChar *dest, const UChar *src, int32_t count);
  715. /**
  716. * Synonym for memmove(), but with UChars only.
  717. * @param dest The destination string
  718. * @param src The source string (can be NULL/invalid if count<=0)
  719. * @param count The number of characters to move; no-op if <=0
  720. * @return A pointer to <code>dest</code>
  721. * @stable ICU 2.0
  722. */
  723. U_STABLE UChar* U_EXPORT2
  724. u_memmove(UChar *dest, const UChar *src, int32_t count);
  725. /**
  726. * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
  727. *
  728. * @param dest The destination string.
  729. * @param c The character to initialize the string.
  730. * @param count The number of characters to set.
  731. * @return A pointer to <code>dest</code>.
  732. * @stable ICU 2.0
  733. */
  734. U_STABLE UChar* U_EXPORT2
  735. u_memset(UChar *dest, UChar c, int32_t count);
  736. /**
  737. * Compare the first <code>count</code> UChars of each buffer.
  738. *
  739. * @param buf1 The first string to compare.
  740. * @param buf2 The second string to compare.
  741. * @param count The maximum number of UChars to compare.
  742. * @return When buf1 < buf2, a negative number is returned.
  743. * When buf1 == buf2, 0 is returned.
  744. * When buf1 > buf2, a positive number is returned.
  745. * @stable ICU 2.0
  746. */
  747. U_STABLE int32_t U_EXPORT2
  748. u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
  749. /**
  750. * Compare two Unicode strings in code point order.
  751. * This is different in UTF-16 from u_memcmp() if supplementary characters are present.
  752. * For details, see u_strCompare().
  753. *
  754. * @param s1 A string to compare.
  755. * @param s2 A string to compare.
  756. * @param count The maximum number of characters to compare.
  757. * @return a negative/zero/positive integer corresponding to whether
  758. * the first string is less than/equal to/greater than the second one
  759. * in code point order
  760. * @stable ICU 2.0
  761. */
  762. U_STABLE int32_t U_EXPORT2
  763. u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
  764. /**
  765. * Find the first occurrence of a BMP code point in a string.
  766. * A surrogate code point is found only if its match in the text is not
  767. * part of a surrogate pair.
  768. * A NUL character is found at the string terminator.
  769. *
  770. * @param s The string to search (contains <code>count</code> UChars).
  771. * @param c The BMP code point to find.
  772. * @param count The length of the string.
  773. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  774. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  775. * @stable ICU 2.0
  776. *
  777. * @see u_strchr
  778. * @see u_memchr32
  779. * @see u_strFindFirst
  780. */
  781. U_STABLE UChar* U_EXPORT2
  782. u_memchr(const UChar *s, UChar c, int32_t count);
  783. /**
  784. * Find the first occurrence of a code point in a string.
  785. * A surrogate code point is found only if its match in the text is not
  786. * part of a surrogate pair.
  787. * A NUL character is found at the string terminator.
  788. *
  789. * @param s The string to search (contains <code>count</code> UChars).
  790. * @param c The code point to find.
  791. * @param count The length of the string.
  792. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  793. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  794. * @stable ICU 2.0
  795. *
  796. * @see u_strchr32
  797. * @see u_memchr
  798. * @see u_strFindFirst
  799. */
  800. U_STABLE UChar* U_EXPORT2
  801. u_memchr32(const UChar *s, UChar32 c, int32_t count);
  802. /**
  803. * Find the last occurrence of a BMP code point in a string.
  804. * A surrogate code point is found only if its match in the text is not
  805. * part of a surrogate pair.
  806. * A NUL character is found at the string terminator.
  807. *
  808. * @param s The string to search (contains <code>count</code> UChars).
  809. * @param c The BMP code point to find.
  810. * @param count The length of the string.
  811. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  812. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  813. * @stable ICU 2.4
  814. *
  815. * @see u_strrchr
  816. * @see u_memrchr32
  817. * @see u_strFindLast
  818. */
  819. U_STABLE UChar* U_EXPORT2
  820. u_memrchr(const UChar *s, UChar c, int32_t count);
  821. /**
  822. * Find the last occurrence of a code point in a string.
  823. * A surrogate code point is found only if its match in the text is not
  824. * part of a surrogate pair.
  825. * A NUL character is found at the string terminator.
  826. *
  827. * @param s The string to search (contains <code>count</code> UChars).
  828. * @param c The code point to find.
  829. * @param count The length of the string.
  830. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  831. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  832. * @stable ICU 2.4
  833. *
  834. * @see u_strrchr32
  835. * @see u_memrchr
  836. * @see u_strFindLast
  837. */
  838. U_STABLE UChar* U_EXPORT2
  839. u_memrchr32(const UChar *s, UChar32 c, int32_t count);
  840. /**
  841. * Unicode String literals in C.
  842. * We need one macro to declare a variable for the string
  843. * and to statically preinitialize it if possible,
  844. * and a second macro to dynamically initialize such a string variable if necessary.
  845. *
  846. * The macros are defined for maximum performance.
  847. * They work only for strings that contain "invariant characters", i.e.,
  848. * only latin letters, digits, and some punctuation.
  849. * See utypes.h for details.
  850. *
  851. * A pair of macros for a single string must be used with the same
  852. * parameters.
  853. * The string parameter must be a C string literal.
  854. * The length of the string, not including the terminating
  855. * <code>NUL</code>, must be specified as a constant.
  856. * The U_STRING_DECL macro should be invoked exactly once for one
  857. * such string variable before it is used.
  858. *
  859. * Usage:
  860. * <pre>
  861. * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
  862. * U_STRING_DECL(ustringVar2, "jumps 5%", 8);
  863. * static UBool didInit=FALSE;
  864. *
  865. * int32_t function() {
  866. * if(!didInit) {
  867. * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
  868. * U_STRING_INIT(ustringVar2, "jumps 5%", 8);
  869. * didInit=TRUE;
  870. * }
  871. * return u_strcmp(ustringVar1, ustringVar2);
  872. * }
  873. * </pre>
  874. *
  875. * Note that the macros will NOT consistently work if their argument is another <code>#define</code>.
  876. * The following will not work on all platforms, don't use it.
  877. *
  878. * <pre>
  879. * #define GLUCK "Mr. Gluck"
  880. * U_STRING_DECL(var, GLUCK, 9)
  881. * U_STRING_INIT(var, GLUCK, 9)
  882. * </pre>
  883. *
  884. * Instead, use the string literal "Mr. Gluck" as the argument to both macro
  885. * calls.
  886. *
  887. *
  888. * @stable ICU 2.0
  889. */
  890. #if defined(U_DECLARE_UTF16) /* literal is produced by U_DECLARE_UTF16(); U_STRING_INIT expands to nothing */
  891. # define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs)
  892. /**@stable ICU 2.0 */
  893. # define U_STRING_INIT(var, cs, length)
  894. #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) /* wide literal L"..." is the static initializer; U_STRING_INIT expands to nothing */
  895. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
  896. /**@stable ICU 2.0 */
  897. # define U_STRING_INIT(var, cs, length)
  898. #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY /* the narrow literal itself is the static initializer; U_STRING_INIT expands to nothing */
  899. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs
  900. /**@stable ICU 2.0 */
  901. # define U_STRING_INIT(var, cs, length)
  902. #else /* no static initializer: U_STRING_DECL only reserves (length)+1 UChars, U_STRING_INIT fills them at run time */
  903. # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
  904. /**@stable ICU 2.0 */
  905. # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, (length)+1)
  906. #endif
  907. /**
  908. * Unescape a string of characters and write the resulting
  909. * Unicode characters to the destination buffer. The following escape
  910. * sequences are recognized:
  911. *
  912. * \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
  913. * \\Uhhhhhhhh 8 hex digits
  914. * \\xhh 1-2 hex digits
  915. * \\x{h...} 1-8 hex digits
  916. * \\ooo 1-3 octal digits; o in [0-7]
  917. * \\cX control-X; X is masked with 0x1F
  918. *
  919. * as well as the standard ANSI C escapes:
  920. *
  921. * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
  922. * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
  923. * \\&quot; => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
  924. *
  925. * Anything else following a backslash is generically escaped. For
  926. * example, "[a\\-z]" returns "[a-z]".
  927. *
  928. * If an escape sequence is ill-formed, this method returns an empty
  929. * string. An example of an ill-formed sequence is "\\u" followed by
  930. * fewer than 4 hex digits.
  931. *
  932. * The above characters are recognized in the compiler's codepage,
  933. * that is, they are coded as 'u', '\\', etc. Characters that are
  934. * not parts of escape sequences are converted using u_charsToUChars().
  935. *
  936. * This function is similar to UnicodeString::unescape() but not
  937. * identical to it. The latter takes a source UnicodeString, so it
  938. * does escape recognition but no conversion.
  939. *
  940. * @param src a zero-terminated string of invariant characters
  941. * @param dest pointer to buffer to receive converted and unescaped
  942. * text and, if there is room, a zero terminator. May be NULL for
  943. * preflighting, in which case no UChars will be written, but the
  944. * return value will still be valid. On error, an empty string is
  945. * stored here (if possible).
  946. * @param destCapacity the number of UChars that may be written at
  947. * dest. Ignored if dest == NULL.
  948. * @return the length of unescaped string.
  949. * @see u_unescapeAt
  950. * @see UnicodeString#unescape()
  951. * @see UnicodeString#unescapeAt()
  952. * @stable ICU 2.0
  953. */
  954. U_STABLE int32_t U_EXPORT2
  955. u_unescape(const char *src,
  956. UChar *dest, int32_t destCapacity);
  957. U_CDECL_BEGIN
  958. /**
  959. * Callback function for u_unescapeAt() that returns a character of
  960. * the source text given an offset and a context pointer. The context
  961. * pointer will be whatever is passed into u_unescapeAt().
  962. *
  963. * @param offset the offset into the source text of the character to return.
  964. * @param context an opaque pointer passed directly into u_unescapeAt()
  965. * @return the character of the source text at the given
  966. * offset
  967. * @see u_unescapeAt
  968. * @stable ICU 2.0
  969. */
  970. typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
  971. U_CDECL_END
  972. /**
  973. * Unescape a single sequence. The character at offset-1 is assumed
  974. * (without checking) to be a backslash. This method takes a callback
  975. * pointer to a function that returns the UChar at a given offset. By
  976. * varying this callback, ICU functions are able to unescape char*
  977. * strings, UnicodeString objects, and UFILE pointers.
  978. *
  979. * If offset is out of range, or if the escape sequence is ill-formed,
  980. * (UChar32)0xFFFFFFFF is returned. See documentation of u_unescape()
  981. * for a list of recognized sequences.
  982. *
  983. * @param charAt callback function that returns a UChar of the source
  984. * text given an offset and a context pointer.
  985. * @param offset pointer to the offset that will be passed to charAt.
  986. * The offset value will be updated upon return to point after the
  987. * last parsed character of the escape sequence. On error the offset
  988. * is unchanged.
  989. * @param length the number of characters in the source text. The
  990. * last character of the source text is considered to be at offset
  991. * length-1.
  992. * @param context an opaque pointer passed directly into charAt.
  993. * @return the character represented by the escape sequence at
  994. * offset, or (UChar32)0xFFFFFFFF on error.
  995. * @see u_unescape()
  996. * @see UnicodeString#unescape()
  997. * @see UnicodeString#unescapeAt()
  998. * @stable ICU 2.0
  999. */
  1000. U_STABLE UChar32 U_EXPORT2
  1001. u_unescapeAt(UNESCAPE_CHAR_AT charAt,
  1002. int32_t *offset,
  1003. int32_t length,
  1004. void *context);
  1005. /**
  1006. * Uppercase the characters in a string.
  1007. * Casing is locale-dependent and context-sensitive.
  1008. * The result may be longer or shorter than the original.
  1009. * The source string and the destination buffer are allowed to overlap.
  1010. *
  1011. * @param dest A buffer for the result string. The result will be zero-terminated if
  1012. * the buffer is large enough.
  1013. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1014. * dest may be NULL and the function will only return the length of the result
  1015. * without writing any of the result string.
  1016. * @param src The original string
  1017. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1018. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1019. * @param pErrorCode Must be a valid pointer to an error code value,
  1020. * which must not indicate a failure before the function call.
  1021. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1022. * only some of the result was written to the destination buffer.
  1023. * @stable ICU 2.0
  1024. */
  1025. U_STABLE int32_t U_EXPORT2
  1026. u_strToUpper(UChar *dest, int32_t destCapacity,
  1027. const UChar *src, int32_t srcLength,
  1028. const char *locale,
  1029. UErrorCode *pErrorCode);
  1030. /**
  1031. * Lowercase the characters in a string.
  1032. * Casing is locale-dependent and context-sensitive.
  1033. * The result may be longer or shorter than the original.
  1034. * The source string and the destination buffer are allowed to overlap.
  1035. *
  1036. * @param dest A buffer for the result string. The result will be zero-terminated if
  1037. * the buffer is large enough.
  1038. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1039. * dest may be NULL and the function will only return the length of the result
  1040. * without writing any of the result string.
  1041. * @param src The original string
  1042. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1043. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1044. * @param pErrorCode Must be a valid pointer to an error code value,
  1045. * which must not indicate a failure before the function call.
  1046. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1047. * only some of the result was written to the destination buffer.
  1048. * @stable ICU 2.0
  1049. */
  1050. U_STABLE int32_t U_EXPORT2
  1051. u_strToLower(UChar *dest, int32_t destCapacity,
  1052. const UChar *src, int32_t srcLength,
  1053. const char *locale,
  1054. UErrorCode *pErrorCode);
  1055. #if !UCONFIG_NO_BREAK_ITERATION
  1056. /**
  1057. * Titlecase a string.
  1058. * Casing is locale-dependent and context-sensitive.
  1059. * Titlecasing uses a break iterator to find the first characters of words
  1060. * that are to be titlecased. It titlecases those characters and lowercases
  1061. * all others.
  1062. *
  1063. * The titlecase break iterator can be provided to customize for arbitrary
  1064. * styles, using rules and dictionaries beyond the standard iterators.
  1065. * It may be more efficient to always provide an iterator to avoid
  1066. * opening and closing one for each string.
  1067. * The standard titlecase iterator for the root locale implements the
  1068. * algorithm of Unicode TR 21.
  1069. *
  1070. * This function uses only the setText(), first() and next() methods of the
  1071. * provided break iterator.
  1072. *
  1073. * The result may be longer or shorter than the original.
  1074. * The source string and the destination buffer are allowed to overlap.
  1075. *
  1076. * @param dest A buffer for the result string. The result will be zero-terminated if
  1077. * the buffer is large enough.
  1078. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1079. * dest may be NULL and the function will only return the length of the result
  1080. * without writing any of the result string.
  1081. * @param src The original string
  1082. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1083. * @param titleIter A break iterator to find the first characters of words
  1084. * that are to be titlecased.
  1085. * If none is provided (NULL), then a standard titlecase
  1086. * break iterator is opened.
  1087. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1088. * @param pErrorCode Must be a valid pointer to an error code value,
  1089. * which must not indicate a failure before the function call.
  1090. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1091. * only some of the result was written to the destination buffer.
  1092. * @stable ICU 2.1
  1093. */
  1094. U_STABLE int32_t U_EXPORT2
  1095. u_strToTitle(UChar *dest, int32_t destCapacity,
  1096. const UChar *src, int32_t srcLength,
  1097. UBreakIterator *titleIter,
  1098. const char *locale,
  1099. UErrorCode *pErrorCode);
  1100. #endif
  1101. /**
  1102. * Case-folds the characters in a string.
  1103. *
  1104. * Case-folding is locale-independent and not context-sensitive,
  1105. * but there is an option for whether to include or exclude mappings for dotted I
  1106. * and dotless i that are marked with 'T' in CaseFolding.txt.
  1107. *
  1108. * The result may be longer or shorter than the original.
  1109. * The source string and the destination buffer are allowed to overlap.
  1110. *
  1111. * @param dest A buffer for the result string. The result will be zero-terminated if
  1112. * the buffer is large enough.
  1113. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1114. * dest may be NULL and the function will only return the length of the result
  1115. * without writing any of the result string.
  1116. * @param src The original string
  1117. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1118. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
  1119. * @param pErrorCode Must be a valid pointer to an error code value,
  1120. * which must not indicate a failure before the function call.
  1121. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1122. * only some of the result was written to the destination buffer.
  1123. * @stable ICU 2.0
  1124. */
  1125. U_STABLE int32_t U_EXPORT2
  1126. u_strFoldCase(UChar *dest, int32_t destCapacity,
  1127. const UChar *src, int32_t srcLength,
  1128. uint32_t options,
  1129. UErrorCode *pErrorCode);
  1130. #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
  1131. /**
  1132. * Convert a UTF-16 string to a wchar_t string.
  1133. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1134. * this function simply calls the fast, dedicated function for that.
  1135. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
  1136. *
  1137. * @param dest A buffer for the result string. The result will be zero-terminated if
  1138. * the buffer is large enough.
  1139. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then
  1140. * dest may be NULL and the function will only return the length of the
  1141. * result without writing any of the result string (pre-flighting).
  1142. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1143. * pDestLength!=NULL then *pDestLength is always set to the
  1144. * number of output units corresponding to the transformation of
  1145. * all the input units, even in case of a buffer overflow.
  1146. * @param src The original source string
  1147. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1148. * @param pErrorCode Must be a valid pointer to an error code value,
  1149. * which must not indicate a failure before the function call.
  1150. * @return The pointer to destination buffer.
  1151. * @stable ICU 2.0
  1152. */
  1153. U_STABLE wchar_t* U_EXPORT2
  1154. u_strToWCS(wchar_t *dest,
  1155. int32_t destCapacity,
  1156. int32_t *pDestLength,
  1157. const UChar *src,
  1158. int32_t srcLength,
  1159. UErrorCode *pErrorCode);
  1160. /**
  1161. * Convert a wchar_t string to UTF-16.
  1162. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1163. * this function simply calls the fast, dedicated function for that.
  1164. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
  1165. *
  1166. * @param dest A buffer for the result string. The result will be zero-terminated if
  1167. * the buffer is large enough.
  1168. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1169. * dest may be NULL and the function will only return the length of the
  1170. * result without writing any of the result string (pre-flighting).
  1171. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1172. * pDestLength!=NULL then *pDestLength is always set to the
  1173. * number of output units corresponding to the transformation of
  1174. * all the input units, even in case of a buffer overflow.
  1175. * @param src The original source string
  1176. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1177. * @param pErrorCode Must be a valid pointer to an error code value,
  1178. * which must not indicate a failure before the function call.
  1179. * @return The pointer to destination buffer.
  1180. * @stable ICU 2.0
  1181. */
  1182. U_STABLE UChar* U_EXPORT2
  1183. u_strFromWCS(UChar *dest,
  1184. int32_t destCapacity,
  1185. int32_t *pDestLength,
  1186. const wchar_t *src,
  1187. int32_t srcLength,
  1188. UErrorCode *pErrorCode);
  1189. #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
  1190. /**
  1191. * Convert a UTF-16 string to UTF-8.
  1192. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1193. *
  1194. * @param dest A buffer for the result string. The result will be zero-terminated if
  1195. * the buffer is large enough.
  1196. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1197. * dest may be NULL and the function will only return the length of the
  1198. * result without writing any of the result string (pre-flighting).
  1199. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1200. * pDestLength!=NULL then *pDestLength is always set to the
  1201. * number of output units corresponding to the transformation of
  1202. * all the input units, even in case of a buffer overflow.
  1203. * @param src The original source string
  1204. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1205. * @param pErrorCode Must be a valid pointer to an error code value,
  1206. * which must not indicate a failure before the function call.
  1207. * @return The pointer to destination buffer.
  1208. * @stable ICU 2.0
  1209. * @see u_strToUTF8WithSub
  1210. * @see u_strFromUTF8
  1211. */
  1212. U_STABLE char* U_EXPORT2
  1213. u_strToUTF8(char *dest,
  1214. int32_t destCapacity,
  1215. int32_t *pDestLength,
  1216. const UChar *src,
  1217. int32_t srcLength,
  1218. UErrorCode *pErrorCode);
  1219. /**
  1220. * Convert a UTF-8 string to UTF-16.
  1221. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1222. *
  1223. * @param dest A buffer for the result string. The result will be zero-terminated if
  1224. * the buffer is large enough.
  1225. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1226. * dest may be NULL and the function will only return the length of the
  1227. * result without writing any of the result string (pre-flighting).
  1228. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1229. * pDestLength!=NULL then *pDestLength is always set to the
  1230. * number of output units corresponding to the transformation of
  1231. * all the input units, even in case of a buffer overflow.
  1232. * @param src The original source string
  1233. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1234. * @param pErrorCode Must be a valid pointer to an error code value,
  1235. * which must not indicate a failure before the function call.
  1236. * @return The pointer to destination buffer.
  1237. * @stable ICU 2.0
  1238. * @see u_strFromUTF8WithSub
  1239. * @see u_strFromUTF8Lenient
  1240. */
  1241. U_STABLE UChar* U_EXPORT2
  1242. u_strFromUTF8(UChar *dest,
  1243. int32_t destCapacity,
  1244. int32_t *pDestLength,
  1245. const char *src,
  1246. int32_t srcLength,
  1247. UErrorCode *pErrorCode);
  1248. /**
  1249. * Convert a UTF-16 string to UTF-8.
  1250. *
  1251. * Same as u_strToUTF8(), except that the additional subchar is output in place of
  1252. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1253. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
  1254. *
  1255. * @param dest A buffer for the result string. The result will be zero-terminated if
  1256. * the buffer is large enough.
  1257. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1258. * dest may be NULL and the function will only report the length of the
  1259. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1260. * @param pDestLength A pointer to receive the length of the result string. If
  1261. * pDestLength!=NULL then *pDestLength is always set to the
  1262. * number of output units corresponding to the transformation of
  1263. * all the input units, even in case of a buffer overflow.
  1264. * @param src The original source string (UTF-16).
  1265. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1266. * @param subchar The substitution character to use in place of an illegal input sequence,
  1267. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1268. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1269. * except for surrogate code points (U+D800..U+DFFF).
  1270. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1271. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1272. * Set to 0 if no substitutions occur or subchar<0.
  1273. * pNumSubstitutions can be NULL.
  1274. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1275. * pass the U_SUCCESS() test, or else the function returns
  1276. * immediately. Check for U_FAILURE() on output or use with
  1277. * function chaining. (See User Guide for details.)
  1278. * @return The pointer to the destination buffer.
  1279. * @see u_strToUTF8
  1280. * @see u_strFromUTF8WithSub
  1281. * @stable ICU 3.6
  1282. */
  1283. U_STABLE char* U_EXPORT2
  1284. u_strToUTF8WithSub(char *dest,
  1285. int32_t destCapacity,
  1286. int32_t *pDestLength,
  1287. const UChar *src,
  1288. int32_t srcLength,
  1289. UChar32 subchar, int32_t *pNumSubstitutions,
  1290. UErrorCode *pErrorCode);
  1291. /**
  1292. * Convert a UTF-8 string to UTF-16.
  1293. *
  1294. * Same as u_strFromUTF8(), except that the additional subchar is output in place of
  1295. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1296. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
  1297. *
  1298. * @param dest A buffer for the result string. The result will be zero-terminated if
  1299. * the buffer is large enough.
  1300. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1301. * dest may be NULL and the function will only report the length of the
  1302. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1303. * @param pDestLength A pointer to receive the length of the result string. If
  1304. * pDestLength!=NULL then *pDestLength is always set to the
  1305. * number of output units corresponding to the transformation of
  1306. * all the input units, even in case of a buffer overflow.
  1307. * @param src The original source string (UTF-8).
  1308. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1309. * @param subchar The substitution character to use in place of an illegal input sequence,
  1310. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1311. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1312. * except for surrogate code points (U+D800..U+DFFF).
  1313. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1314. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1315. * Set to 0 if no substitutions occur or subchar<0.
  1316. * pNumSubstitutions can be NULL.
  1317. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1318. * pass the U_SUCCESS() test, or else the function returns
  1319. * immediately. Check for U_FAILURE() on output or use with
  1320. * function chaining. (See User Guide for details.)
  1321. * @return The pointer to the destination buffer.
  1322. * @see u_strFromUTF8
  1323. * @see u_strFromUTF8Lenient
  1324. * @see u_strToUTF8WithSub
  1325. * @stable ICU 3.6
  1326. */
  1327. U_STABLE UChar* U_EXPORT2
  1328. u_strFromUTF8WithSub(UChar *dest,
  1329. int32_t destCapacity,
  1330. int32_t *pDestLength,
  1331. const char *src,
  1332. int32_t srcLength,
  1333. UChar32 subchar, int32_t *pNumSubstitutions,
  1334. UErrorCode *pErrorCode);
  1335. /**
  1336. * Convert a UTF-8 string to UTF-16.
  1337. *
  1338. * Same as u_strFromUTF8() except that this function is designed to be very fast,
  1339. * which it achieves by being lenient about malformed UTF-8 sequences.
  1340. * This function is intended for use in environments where UTF-8 text is
  1341. * expected to be well-formed.
  1342. *
  1343. * Its semantics are:
  1344. * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
  1345. * - The function will not read beyond the input string, nor write beyond
  1346. * the destCapacity.
  1347. * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
  1348. * be well-formed UTF-16.
  1349. * The function will resynchronize to valid code point boundaries
  1350. * within a small number of code points after an illegal sequence.
  1351. * - Non-shortest forms are not detected and will result in "spoofing" output.
  1352. *
  1353. * For further performance improvement, if srcLength is given (>=0),
  1354. * then destCapacity must be at least srcLength (destCapacity>=srcLength).
  1355. *
  1356. * There is no inverse u_strToUTF8Lenient() function because there is practically
  1357. * no performance gain from not checking that a UTF-16 string is well-formed.
  1358. *
  1359. * @param dest A buffer for the result string. The result will be zero-terminated if
  1360. * the buffer is large enough.
  1361. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1362. * dest may be NULL and the function will only report the length of the
  1363. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1364. * Unlike for other ICU functions, if srcLength>=0 then
  1365. * destCapacity must be at least srcLength (destCapacity>=srcLength).
  1366. * @param pDestLength A pointer to receive the length of the result string. If
  1367. * pDestLength!=NULL then *pDestLength is always set to the
  1368. * number of output units corresponding to the transformation of
  1369. * all the input units, even in case of a buffer overflow.
  1370. * Unlike for other ICU functions, if srcLength>=0 but
  1371. * destCapacity<srcLength, then *pDestLength will be set to srcLength
  1372. * (and U_BUFFER_OVERFLOW_ERROR will be set)
  1373. * regardless of the actual result length.
  1374. * @param src The original source string (UTF-8, expected to be well-formed).
  1375. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1376. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1377. * pass the U_SUCCESS() test, or else the function returns
  1378. * immediately. Check for U_FAILURE() on output or use with
  1379. * function chaining. (See User Guide for details.)
  1380. * @return The pointer to the destination buffer.
  1381. * @see u_strFromUTF8
  1382. * @see u_strFromUTF8WithSub
  1383. * @see u_strToUTF8WithSub
  1384. * @stable ICU 3.6
  1385. */
  1386. U_STABLE UChar * U_EXPORT2
  1387. u_strFromUTF8Lenient(UChar *dest,
  1388. int32_t destCapacity,
  1389. int32_t *pDestLength,
  1390. const char *src,
  1391. int32_t srcLength,
  1392. UErrorCode *pErrorCode);
  1393. /**
  1394. * Convert a UTF-16 string to UTF-32.
  1395. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1396. *
  1397. * @param dest A buffer for the result string. The result will be zero-terminated if
  1398. * the buffer is large enough.
  1399. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1400. * dest may be NULL and the function will only report the length of the
  1401. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1402. * @param pDestLength A pointer to receive the length of the result string. If
  1403. * pDestLength!=NULL then *pDestLength is always set to the
  1404. * number of output units corresponding to the transformation of
  1405. * all the input units, even in case of a buffer overflow.
  1406. * @param src The original source string (UTF-16).
  1407. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1408. * @param pErrorCode Must be a valid pointer to an error code value,
  1409. * which must not indicate a failure before the function call.
  1410. * @return The pointer to the destination buffer.
  1411. * @see u_strToUTF32WithSub
  1412. * @see u_strFromUTF32
  1413. * @stable ICU 2.0
  1414. */
  1415. U_STABLE UChar32* U_EXPORT2
  1416. u_strToUTF32(UChar32 *dest,
  1417. int32_t destCapacity,
  1418. int32_t *pDestLength,
  1419. const UChar *src,
  1420. int32_t srcLength,
  1421. UErrorCode *pErrorCode);
  1422. /**
  1423. * Convert a UTF-32 string to UTF-16.
  1424. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1425. *
  1426. * @param dest A buffer for the result string. The result will be zero-terminated if
  1427. * the buffer is large enough.
  1428. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1429. * dest may be NULL and the function will only report the length of the
  1430. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1431. * @param pDestLength A pointer to receive the length of the result string. If
  1432. * pDestLength!=NULL then *pDestLength is always set to the
  1433. * number of output units corresponding to the transformation of
  1434. * all the input units, even in case of a buffer overflow.
  1435. * @param src The original source string (UTF-32).
  1436. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1437. * @param pErrorCode Must be a valid pointer to an error code value,
  1438. * which must not indicate a failure before the function call.
  1439. * @return The pointer to the destination buffer.
  1440. * @see u_strFromUTF32WithSub
  1441. * @see u_strToUTF32
  1442. * @stable ICU 2.0
  1443. */
  1444. U_STABLE UChar* U_EXPORT2
  1445. u_strFromUTF32(UChar *dest,
  1446. int32_t destCapacity,
  1447. int32_t *pDestLength,
  1448. const UChar32 *src,
  1449. int32_t srcLength,
  1450. UErrorCode *pErrorCode);
  1451. /**
  1452. * Convert a UTF-16 string to UTF-32.
  1453. *
  1454. * Same as u_strToUTF32(), except that the additional subchar is output in place of
  1455. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1456. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
  1457. *
  1458. * @param dest A buffer for the result string. The result will be zero-terminated if
  1459. * the buffer is large enough.
  1460. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1461. * dest may be NULL and the function will only report the length of the
  1462. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1463. * @param pDestLength A pointer to receive the length of the result string. If
  1464. * pDestLength!=NULL then *pDestLength is always set to the
  1465. * number of output units corresponding to the transformation of
  1466. * all the input units, even in case of a buffer overflow.
  1467. * @param src The original source string (UTF-16).
  1468. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1469. * @param subchar The substitution character to use in place of an illegal input sequence,
  1470. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1471. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1472. * except for surrogate code points (U+D800..U+DFFF).
  1473. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1474. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1475. * Set to 0 if no substitutions occur or subchar<0.
  1476. * pNumSubstitutions can be NULL.
  1477. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1478. * pass the U_SUCCESS() test, or else the function returns
  1479. * immediately. Check for U_FAILURE() on output or use with
  1480. * function chaining. (See User Guide for details.)
  1481. * @return The pointer to the destination buffer.
  1482. * @see u_strToUTF32
  1483. * @see u_strFromUTF32WithSub
  1484. * @stable ICU 4.2
  1485. */
  1486. U_STABLE UChar32* U_EXPORT2
  1487. u_strToUTF32WithSub(UChar32 *dest,
  1488. int32_t destCapacity,
  1489. int32_t *pDestLength,
  1490. const UChar *src,
  1491. int32_t srcLength,
  1492. UChar32 subchar, int32_t *pNumSubstitutions,
  1493. UErrorCode *pErrorCode);
  1494. /**
  1495. * Convert a UTF-32 string to UTF-16.
  1496. *
  1497. * Same as u_strFromUTF32(), except that the additional subchar is output in place of
  1498. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1499. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
  1500. *
  1501. * @param dest A buffer for the result string. The result will be zero-terminated if
  1502. * the buffer is large enough.
  1503. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1504. * dest may be NULL and the function will only report the length of the
  1505. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1506. * @param pDestLength A pointer to receive the length of the result string. If
  1507. * pDestLength!=NULL then *pDestLength is always set to the
  1508. * number of output units corresponding to the transformation of
  1509. * all the input units, even in case of a buffer overflow.
  1510. * @param src The original source string (UTF-32).
  1511. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1512. * @param subchar The substitution character to use in place of an illegal input sequence,
  1513. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1514. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1515. * except for surrogate code points (U+D800..U+DFFF).
  1516. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1517. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1518. * Set to 0 if no substitutions occur or subchar<0.
  1519. * pNumSubstitutions can be NULL.
  1520. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1521. * pass the U_SUCCESS() test, or else the function returns
  1522. * immediately. Check for U_FAILURE() on output or use with
  1523. * function chaining. (See User Guide for details.)
  1524. * @return The pointer to the destination buffer.
  1525. * @see u_strFromUTF32
  1526. * @see u_strToUTF32WithSub
  1527. * @stable ICU 4.2
  1528. */
  1529. U_STABLE UChar* U_EXPORT2
  1530. u_strFromUTF32WithSub(UChar *dest,
  1531. int32_t destCapacity,
  1532. int32_t *pDestLength,
  1533. const UChar32 *src,
  1534. int32_t srcLength,
  1535. UChar32 subchar, int32_t *pNumSubstitutions,
  1536. UErrorCode *pErrorCode);
  1537. /**
  1538. * Convert a 16-bit Unicode string to Java Modified UTF-8.
  1539. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
  1540. *
  1541. * This function behaves according to the documentation for Java DataOutput.writeUTF()
  1542. * except that it does not encode the output length in the destination buffer
  1543. * and does not have an output length restriction.
  1544. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
  1545. *
  1546. * The input string need not be well-formed UTF-16.
  1547. * (Therefore there is no subchar parameter.)
  1548. *
  1549. * @param dest A buffer for the result string. The result will be zero-terminated if
  1550. * the buffer is large enough.
  1551. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1552. * dest may be NULL and the function will only report the length of the
  1553. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1554. * @param pDestLength A pointer to receive the length of the result string. If
  1555. * pDestLength!=NULL then *pDestLength is always set to the
  1556. * number of output units corresponding to the transformation of
  1557. * all the input units, even in case of a buffer overflow.
  1558. * @param src The original source string (16-bit Unicode; need not be well-formed UTF-16).
  1559. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1560. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1561. * pass the U_SUCCESS() test, or else the function returns
  1562. * immediately. Check for U_FAILURE() on output or use with
  1563. * function chaining. (See User Guide for details.)
  1564. * @return The pointer to the destination buffer.
  1565. * @stable ICU 4.4
  1566. * @see u_strToUTF8WithSub
  1567. * @see u_strFromJavaModifiedUTF8WithSub
  1568. */
  1569. U_STABLE char* U_EXPORT2
  1570. u_strToJavaModifiedUTF8(
  1571. char *dest,
  1572. int32_t destCapacity,
  1573. int32_t *pDestLength,
  1574. const UChar *src,
  1575. int32_t srcLength,
  1576. UErrorCode *pErrorCode);
  1577. /**
  1578. * Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
  1579. * If the input string is not well-formed and no substitution char is specified,
  1580. * then the U_INVALID_CHAR_FOUND error code is set.
  1581. *
  1582. * This function behaves according to the documentation for Java DataInput.readUTF()
  1583. * except that it takes a length parameter rather than
  1584. * interpreting the first two input bytes as the length.
  1585. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
  1586. *
  1587. * The output string may not be well-formed UTF-16.
  1588. *
  1589. * @param dest A buffer for the result string. The result will be zero-terminated if
  1590. * the buffer is large enough.
  1591. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1592. * dest may be NULL and the function will only report the length of the
  1593. * result (via pDestLength) without writing any of the result string (pre-flighting).
  1594. * @param pDestLength A pointer to receive the length of the result string. If
  1595. * pDestLength!=NULL then *pDestLength is always set to the
  1596. * number of output units corresponding to the transformation of
  1597. * all the input units, even in case of a buffer overflow.
  1598. * @param src The original source string (Java Modified UTF-8).
  1599. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1600. * @param subchar The substitution character to use in place of an illegal input sequence,
  1601. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1602. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1603. * except for surrogate code points (U+D800..U+DFFF).
  1604. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1605. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1606. * Set to 0 if no substitutions occur or subchar<0.
  1607. * pNumSubstitutions can be NULL.
  1608. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1609. * pass the U_SUCCESS() test, or else the function returns
  1610. * immediately. Check for U_FAILURE() on output or use with
  1611. * function chaining. (See User Guide for details.)
  1612. * @return The pointer to the destination buffer.
  1613. * @see u_strFromUTF8WithSub
  1614. * @see u_strFromUTF8Lenient
  1615. * @see u_strToJavaModifiedUTF8
  1616. * @stable ICU 4.4
  1617. */
  1618. U_STABLE UChar* U_EXPORT2
  1619. u_strFromJavaModifiedUTF8WithSub(
  1620. UChar *dest,
  1621. int32_t destCapacity,
  1622. int32_t *pDestLength,
  1623. const char *src,
  1624. int32_t srcLength,
  1625. UChar32 subchar, int32_t *pNumSubstitutions,
  1626. UErrorCode *pErrorCode);
  1627. #endif