grapheme_util.c 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. /*
  2. +----------------------------------------------------------------------+
  3. | This source file is subject to version 3.01 of the PHP license, |
  4. | that is bundled with this package in the file LICENSE, and is |
  5. | available through the world-wide-web at the following url: |
  6. | https://www.php.net/license/3_01.txt |
  7. | If you did not receive a copy of the PHP license and are unable to |
  8. | obtain it through the world-wide-web, please send a note to |
  9. | license@php.net so we can mail you a copy immediately. |
  10. +----------------------------------------------------------------------+
  11. | Author: Ed Batutis <ed@batutis.com> |
  12. +----------------------------------------------------------------------+
  13. */
  14. /* {{{ includes */
  15. #ifdef HAVE_CONFIG_H
  16. #include "config.h"
  17. #endif
  18. #include <php.h>
  19. #include "grapheme.h"
  20. #include "grapheme_util.h"
  21. #include "intl_common.h"
  22. #include <unicode/utypes.h>
  23. #include <unicode/ucol.h>
  24. #include <unicode/ustring.h>
  25. #include <unicode/ubrk.h>
  26. #include <unicode/usearch.h>
  27. #include "ext/standard/php_string.h"
  28. ZEND_EXTERN_MODULE_GLOBALS( intl )
  29. /* }}} */
  30. /* {{{ grapheme_close_global_iterator - clean up */
  31. void
  32. grapheme_close_global_iterator( void )
  33. {
  34. UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
  35. if ( NULL != global_break_iterator ) {
  36. ubrk_close(global_break_iterator);
  37. }
  38. }
  39. /* }}} */
  40. /* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */
  /*
   * Compute a substring window over an ASCII string without copying.
   *
   * str, str_len  - input bytes and their count
   * f             - start position; a negative value counts back from the end
   *                 of the string (clamped to 0), a value past the end is
   *                 clamped to str_len
   * l             - length; a negative value means "stop that many bytes
   *                 before the end" (clamped to 0), a value that would run
   *                 past the end is clamped to what remains after f
   * sub_str       - out: pointer into str at the start of the window, or NULL
   *                 when str_len does not fit in int32_t
   * sub_str_len   - out: window length; 0 when *sub_str is NULL
   */
  void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
  {
      int32_t str_len2;

      /* Give both outputs a defined value before any early return. */
      *sub_str = NULL;
      *sub_str_len = 0;

      if (str_len > INT32_MAX) {
          /* ICU functions cannot return strings this long, so neither do we. */
          return;
      }

      /* Safe to narrow now; a signed copy avoids signed/unsigned mixups below. */
      str_len2 = (int32_t)str_len;

      if (f < 0) {
          /* Negative start: count from the end of the string. */
          f = str_len2 + f;
          if (f < 0) {
              f = 0;
          }
      } else if (f > str_len2) {
          f = str_len2;
      }

      if (l < 0) {
          /* Negative length: stop that many bytes before the end. */
          l = (str_len2 - f) + l;
          if (l < 0) {
              l = 0;
          }
      } else if (l > str_len2 - f) {
          l = str_len2 - f;
      }

      *sub_str = str + f;
      *sub_str_len = l;
  }
  74. /* }}} */
  /*
   * If `status` is an ICU failure code, record it together with `error` as the
   * global intl error, set the enclosing function's `ret_pos` to -1 and jump to
   * its `finish` label. The enclosing function must provide both `ret_pos` and
   * `finish`.
   *
   * Wrapped in do { } while (0) so the macro behaves as a single statement
   * (safe inside an unbraced if/else); invoke it with a trailing semicolon.
   */
  #define STRPOS_CHECK_STATUS(status, error) \
      do { \
          if ( U_FAILURE( (status) ) ) { \
              intl_error_set_code( NULL, (status) ); \
              intl_error_set_custom_msg( NULL, (error), 0 ); \
              ret_pos = -1; \
              goto finish; \
          } \
      } while (0)
  /* {{{ grapheme_strpos_utf16 - strpos/strrpos on the UTF-16 form of the strings */
  83. int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
  84. {
  85. UChar *uhaystack = NULL, *uneedle = NULL;
  86. int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
  87. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  88. UBreakIterator* bi = NULL;
  89. UErrorCode status;
  90. UStringSearch* src = NULL;
  91. if(puchar_pos) {
  92. *puchar_pos = -1;
  93. }
  94. /* convert the strings to UTF-16. */
  95. status = U_ZERO_ERROR;
  96. intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
  97. STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
  98. status = U_ZERO_ERROR;
  99. intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
  100. STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");
  101. /* get a pointer to the haystack taking into account the offset */
  102. status = U_ZERO_ERROR;
  103. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
  104. STRPOS_CHECK_STATUS(status, "Failed to get iterator");
  105. status = U_ZERO_ERROR;
  106. ubrk_setText(bi, uhaystack, uhaystack_len, &status);
  107. STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
  108. if (uneedle_len == 0) {
  109. offset_pos = grapheme_get_haystack_offset(bi, offset);
  110. if (offset_pos == -1) {
  111. zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
  112. ret_pos = -1;
  113. goto finish;
  114. }
  115. ret_pos = last && offset >= 0 ? uhaystack_len : offset_pos;
  116. goto finish;
  117. }
  118. status = U_ZERO_ERROR;
  119. src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
  120. STRPOS_CHECK_STATUS(status, "Error creating search object");
  121. if(f_ignore_case) {
  122. UCollator *coll = usearch_getCollator(src);
  123. status = U_ZERO_ERROR;
  124. ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
  125. STRPOS_CHECK_STATUS(status, "Error setting collation strength");
  126. usearch_reset(src);
  127. }
  128. if(offset != 0) {
  129. offset_pos = grapheme_get_haystack_offset(bi, offset);
  130. if (offset_pos == -1) {
  131. zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
  132. ret_pos = -1;
  133. goto finish;
  134. }
  135. status = U_ZERO_ERROR;
  136. usearch_setOffset(src, last ? 0 : offset_pos, &status);
  137. STRPOS_CHECK_STATUS(status, "Invalid search offset");
  138. }
  139. if(last) {
  140. if (offset >= 0) {
  141. char_pos = usearch_last(src, &status);
  142. if(char_pos < offset_pos) {
  143. /* last one is beyond our start offset */
  144. char_pos = USEARCH_DONE;
  145. }
  146. } else {
  147. /* searching backwards is broken, so we search forwards, albeit it's less efficient */
  148. int32_t prev_pos = USEARCH_DONE;
  149. do {
  150. char_pos = usearch_next(src, &status);
  151. if (char_pos == USEARCH_DONE || char_pos > offset_pos) {
  152. char_pos = prev_pos;
  153. break;
  154. }
  155. prev_pos = char_pos;
  156. } while(1);
  157. }
  158. } else {
  159. char_pos = usearch_next(src, &status);
  160. }
  161. STRPOS_CHECK_STATUS(status, "Error looking up string");
  162. if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
  163. ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
  164. if(puchar_pos) {
  165. *puchar_pos = char_pos;
  166. }
  167. } else {
  168. ret_pos = -1;
  169. }
  170. finish:
  171. if (uhaystack) {
  172. efree( uhaystack );
  173. }
  174. if (uneedle) {
  175. efree( uneedle );
  176. }
  177. if (bi) {
  178. ubrk_close (bi);
  179. }
  180. if (src) {
  181. usearch_close (src);
  182. }
  183. return ret_pos;
  184. }
  185. /* }}} */
  186. /* {{{ grapheme_ascii_check: ASCII check */
  187. zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
  188. {
  189. int ret_len = len;
  190. while ( len-- ) {
  191. if ( *day++ > 0x7f || (*day == '\n' && *(day - 1) == '\r') )
  192. return -1;
  193. }
  194. return ret_len;
  195. }
  196. /* }}} */
  197. /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
  198. int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len )
  199. {
  200. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  201. UErrorCode status = U_ZERO_ERROR;
  202. int ret_len, pos;
  203. UBreakIterator* bi;
  204. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
  205. if( U_FAILURE(status) ) {
  206. return -1;
  207. }
  208. ubrk_setText(bi, text, text_length, &status);
  209. pos = 0;
  210. for ( ret_len = 0; pos != UBRK_DONE; ) {
  211. pos = ubrk_next(bi);
  212. if ( pos != UBRK_DONE ) {
  213. if ( NULL != boundary_array && ret_len < boundary_array_len ) {
  214. boundary_array[ret_len] = pos;
  215. }
  216. ret_len++;
  217. }
  218. }
  219. ubrk_close(bi);
  220. return ret_len;
  221. }
  222. /* }}} */
  223. /* {{{ grapheme_count_graphemes */
  224. int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
  225. {
  226. int ret_len = 0;
  227. int pos = 0;
  228. UErrorCode status = U_ZERO_ERROR;
  229. ubrk_setText(bi, string, string_len, &status);
  230. do {
  231. pos = ubrk_next(bi);
  232. if ( UBRK_DONE != pos ) {
  233. ret_len++;
  234. }
  235. } while ( UBRK_DONE != pos );
  236. return ret_len;
  237. }
  238. /* }}} */
  239. /* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
  240. int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
  241. {
  242. int32_t pos;
  243. int32_t (*iter_op)(UBreakIterator* bi);
  244. int iter_incr;
  245. if ( 0 == offset ) {
  246. return 0;
  247. }
  248. if ( offset < 0 ) {
  249. iter_op = ubrk_previous;
  250. ubrk_last(bi); /* one past the end */
  251. iter_incr = 1;
  252. }
  253. else {
  254. iter_op = ubrk_next;
  255. iter_incr = -1;
  256. }
  257. pos = 0;
  258. while ( pos != UBRK_DONE && offset != 0 ) {
  259. pos = iter_op(bi);
  260. if ( UBRK_DONE != pos ) {
  261. offset += iter_incr;
  262. }
  263. }
  264. if ( offset != 0 ) {
  265. return -1;
  266. }
  267. return pos;
  268. }
  269. /* }}} */
  270. /* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
  271. zend_long grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
  272. {
  273. char *p, *e;
  274. if (offset >= 0) {
  275. p = haystack + offset;
  276. e = haystack + haystack_len - needle_len;
  277. } else {
  278. p = haystack;
  279. if (needle_len > (size_t)-offset) {
  280. e = haystack + haystack_len - needle_len;
  281. } else {
  282. e = haystack + haystack_len + offset;
  283. }
  284. }
  285. if (needle_len == 1) {
  286. /* Single character search can shortcut memcmps */
  287. while (e >= p) {
  288. if (*e == *needle) {
  289. return (e - p + (offset > 0 ? offset : 0));
  290. }
  291. e--;
  292. }
  293. return -1;
  294. }
  295. while (e >= p) {
  296. if (memcmp(e, needle, needle_len) == 0) {
  297. return (e - p + (offset > 0 ? offset : 0));
  298. }
  299. e--;
  300. }
  301. return -1;
  302. }
  303. /* }}} */
  304. /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
  305. UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
  306. {
  307. UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
  308. if ( NULL == global_break_iterator ) {
  309. global_break_iterator = ubrk_open(UBRK_CHARACTER,
  310. NULL, /* icu default locale - locale has no effect on this iterator */
  311. NULL, /* text not set in global iterator */
  312. 0, /* text length = 0 */
  313. status);
  314. INTL_G(grapheme_iterator) = global_break_iterator;
  315. }
  316. #if U_ICU_VERSION_MAJOR_NUM >= 69
  317. return ubrk_clone(global_break_iterator, status);
  318. #else
  319. int32_t buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
  320. return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
  321. #endif
  322. }
  323. /* }}} */