grapheme_string.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837
  1. /*
  2. +----------------------------------------------------------------------+
  3. | This source file is subject to version 3.01 of the PHP license, |
  4. | that is bundled with this package in the file LICENSE, and is |
  5. | available through the world-wide-web at the following url: |
  6. | https://www.php.net/license/3_01.txt |
  7. | If you did not receive a copy of the PHP license and are unable to |
  8. | obtain it through the world-wide-web, please send a note to |
  9. | license@php.net so we can mail you a copy immediately. |
  10. +----------------------------------------------------------------------+
  11. | Author: Ed Batutis <ed@batutis.com> |
  12. +----------------------------------------------------------------------+
  13. */
  14. /* {{{ includes */
  15. #ifdef HAVE_CONFIG_H
  16. #include "config.h"
  17. #endif
  18. #include <php.h>
  19. #include "grapheme.h"
  20. #include "grapheme_util.h"
  21. #include <unicode/utypes.h>
  22. #include <unicode/utf8.h>
  23. #include <unicode/ucol.h>
  24. #include <unicode/ustring.h>
  25. #include <unicode/ubrk.h>
  26. /* }}} */
  /* Extraction modes for grapheme_extract(); registered for userland as GRAPHEME_EXTR_*.
   * The numeric values double as indices into the iterator dispatch table. */
  #define GRAPHEME_EXTRACT_TYPE_COUNT    0 /* size is a number of grapheme clusters */
  #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1 /* size is a maximum number of bytes */
  #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2 /* size is a maximum number of characters */

  /* Inclusive bounds used to validate a caller-supplied extract type. */
  #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
  #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
  32. /* {{{ grapheme_register_constants
  33. * Register API constants
  34. */
  35. void grapheme_register_constants( INIT_FUNC_ARGS )
  36. {
  37. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  38. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  39. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  40. }
  41. /* }}} */
  42. /* {{{ Get number of graphemes in a string */
  43. PHP_FUNCTION(grapheme_strlen)
  44. {
  45. char* string;
  46. size_t string_len;
  47. UChar* ustring = NULL;
  48. int ustring_len = 0;
  49. zend_long ret_len;
  50. UErrorCode status;
  51. if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
  52. RETURN_THROWS();
  53. }
  54. ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
  55. if ( ret_len >= 0 )
  56. RETURN_LONG(string_len);
  57. /* convert the string to UTF-16. */
  58. status = U_ZERO_ERROR;
  59. intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
  60. if ( U_FAILURE( status ) ) {
  61. /* Set global error code. */
  62. intl_error_set_code( NULL, status );
  63. /* Set error messages. */
  64. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
  65. if (ustring) {
  66. efree( ustring );
  67. }
  68. RETURN_NULL();
  69. }
  70. ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
  71. if (ustring) {
  72. efree( ustring );
  73. }
  74. if (ret_len >= 0) {
  75. RETVAL_LONG(ret_len);
  76. } else {
  77. RETVAL_FALSE;
  78. }
  79. }
  80. /* }}} */
  81. /* {{{ Find position of first occurrence of a string within another */
  82. PHP_FUNCTION(grapheme_strpos)
  83. {
  84. char *haystack, *needle;
  85. size_t haystack_len, needle_len;
  86. const char *found;
  87. zend_long loffset = 0;
  88. int32_t offset = 0;
  89. size_t noffset = 0;
  90. zend_long ret_pos;
  91. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  92. RETURN_THROWS();
  93. }
  94. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  95. zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
  96. RETURN_THROWS();
  97. }
  98. /* we checked that it will fit: */
  99. offset = (int32_t) loffset;
  100. noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
  101. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  102. if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
  103. /* quick check to see if the string might be there
  104. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  105. */
  106. found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
  107. /* if it isn't there the we are done */
  108. if (found) {
  109. RETURN_LONG(found - haystack);
  110. }
  111. RETURN_FALSE;
  112. }
  113. /* do utf16 part of the strpos */
  114. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
  115. if ( ret_pos >= 0 ) {
  116. RETURN_LONG(ret_pos);
  117. } else {
  118. RETURN_FALSE;
  119. }
  120. }
  121. /* }}} */
  122. /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
  123. PHP_FUNCTION(grapheme_stripos)
  124. {
  125. char *haystack, *needle;
  126. size_t haystack_len, needle_len;
  127. const char *found;
  128. zend_long loffset = 0;
  129. int32_t offset = 0;
  130. zend_long ret_pos;
  131. int is_ascii;
  132. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  133. RETURN_THROWS();
  134. }
  135. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  136. zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
  137. RETURN_THROWS();
  138. }
  139. /* we checked that it will fit: */
  140. offset = (int32_t) loffset;
  141. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  142. is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
  143. if ( is_ascii ) {
  144. char *haystack_dup, *needle_dup;
  145. int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
  146. needle_dup = estrndup(needle, needle_len);
  147. zend_str_tolower(needle_dup, needle_len);
  148. haystack_dup = estrndup(haystack, haystack_len);
  149. zend_str_tolower(haystack_dup, haystack_len);
  150. found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
  151. efree(haystack_dup);
  152. efree(needle_dup);
  153. if (found) {
  154. RETURN_LONG(found - haystack_dup);
  155. }
  156. /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
  157. if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
  158. RETURN_FALSE;
  159. }
  160. }
  161. /* do utf16 part of the strpos */
  162. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
  163. if ( ret_pos >= 0 ) {
  164. RETURN_LONG(ret_pos);
  165. } else {
  166. RETURN_FALSE;
  167. }
  168. }
  169. /* }}} */
  170. /* {{{ Find position of last occurrence of a string within another */
  171. PHP_FUNCTION(grapheme_strrpos)
  172. {
  173. char *haystack, *needle;
  174. size_t haystack_len, needle_len;
  175. zend_long loffset = 0;
  176. int32_t offset = 0;
  177. zend_long ret_pos;
  178. int is_ascii;
  179. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  180. RETURN_THROWS();
  181. }
  182. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  183. zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
  184. RETURN_THROWS();
  185. }
  186. /* we checked that it will fit: */
  187. offset = (int32_t) loffset;
  188. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  189. is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
  190. if ( is_ascii ) {
  191. ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
  192. if ( ret_pos >= 0 ) {
  193. RETURN_LONG(ret_pos);
  194. }
  195. /* if the needle was ascii too, we are done */
  196. if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
  197. RETURN_FALSE;
  198. }
  199. /* else we need to continue via utf16 */
  200. }
  201. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
  202. if ( ret_pos >= 0 ) {
  203. RETURN_LONG(ret_pos);
  204. } else {
  205. RETURN_FALSE;
  206. }
  207. }
  208. /* }}} */
  209. /* {{{ Find position of last occurrence of a string within another, ignoring case */
  210. PHP_FUNCTION(grapheme_strripos)
  211. {
  212. char *haystack, *needle;
  213. size_t haystack_len, needle_len;
  214. zend_long loffset = 0;
  215. int32_t offset = 0;
  216. zend_long ret_pos;
  217. int is_ascii;
  218. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  219. RETURN_THROWS();
  220. }
  221. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  222. zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
  223. RETURN_THROWS();
  224. }
  225. /* we checked that it will fit: */
  226. offset = (int32_t) loffset;
  227. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  228. is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
  229. if ( is_ascii ) {
  230. char *needle_dup, *haystack_dup;
  231. needle_dup = estrndup(needle, needle_len);
  232. zend_str_tolower(needle_dup, needle_len);
  233. haystack_dup = estrndup(haystack, haystack_len);
  234. zend_str_tolower(haystack_dup, haystack_len);
  235. ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
  236. efree(haystack_dup);
  237. efree(needle_dup);
  238. if ( ret_pos >= 0 ) {
  239. RETURN_LONG(ret_pos);
  240. }
  241. /* if the needle was ascii too, we are done */
  242. if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
  243. RETURN_FALSE;
  244. }
  245. /* else we need to continue via utf16 */
  246. }
  247. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
  248. if ( ret_pos >= 0 ) {
  249. RETURN_LONG(ret_pos);
  250. } else {
  251. RETURN_FALSE;
  252. }
  253. }
  254. /* }}} */
  255. /* {{{ Returns part of a string */
  256. PHP_FUNCTION(grapheme_substr)
  257. {
  258. char *str;
  259. zend_string *u8_sub_str;
  260. UChar *ustr;
  261. size_t str_len;
  262. int32_t ustr_len;
  263. zend_long lstart = 0, length = 0;
  264. int32_t start = 0;
  265. int iter_val;
  266. UErrorCode status;
  267. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  268. UBreakIterator* bi = NULL;
  269. int sub_str_start_pos, sub_str_end_pos;
  270. int32_t (*iter_func)(UBreakIterator *);
  271. bool no_length = 1;
  272. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
  273. RETURN_THROWS();
  274. }
  275. if (lstart < INT32_MIN || lstart > INT32_MAX) {
  276. zend_argument_value_error(2, "is too large");
  277. RETURN_THROWS();
  278. }
  279. start = (int32_t) lstart;
  280. if (no_length) {
  281. length = str_len;
  282. }
  283. if (length < INT32_MIN || length > INT32_MAX) {
  284. zend_argument_value_error(3, "is too large");
  285. RETURN_THROWS();
  286. }
  287. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  288. if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
  289. int32_t asub_str_len;
  290. char *sub_str;
  291. grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
  292. if ( NULL == sub_str ) {
  293. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
  294. RETURN_FALSE;
  295. }
  296. RETURN_STRINGL(sub_str, asub_str_len);
  297. }
  298. ustr = NULL;
  299. ustr_len = 0;
  300. status = U_ZERO_ERROR;
  301. intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
  302. if ( U_FAILURE( status ) ) {
  303. /* Set global error code. */
  304. intl_error_set_code( NULL, status );
  305. /* Set error messages. */
  306. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
  307. if (ustr) {
  308. efree( ustr );
  309. }
  310. RETURN_FALSE;
  311. }
  312. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
  313. if( U_FAILURE(status) ) {
  314. RETURN_FALSE;
  315. }
  316. ubrk_setText(bi, ustr, ustr_len, &status);
  317. if ( start < 0 ) {
  318. iter_func = ubrk_previous;
  319. ubrk_last(bi);
  320. iter_val = 1;
  321. }
  322. else {
  323. iter_func = ubrk_next;
  324. iter_val = -1;
  325. }
  326. sub_str_start_pos = 0;
  327. while ( start ) {
  328. sub_str_start_pos = iter_func(bi);
  329. if ( UBRK_DONE == sub_str_start_pos ) {
  330. break;
  331. }
  332. start += iter_val;
  333. }
  334. if (0 != start) {
  335. if (start > 0) {
  336. if (ustr) {
  337. efree(ustr);
  338. }
  339. ubrk_close(bi);
  340. RETURN_EMPTY_STRING();
  341. }
  342. sub_str_start_pos = 0;
  343. ubrk_first(bi);
  344. }
  345. /* OK to convert here since if str_len were big, convert above would fail */
  346. if (length >= (int32_t)str_len) {
  347. /* no length supplied or length is too big, return the rest of the string */
  348. status = U_ZERO_ERROR;
  349. u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
  350. if (ustr) {
  351. efree( ustr );
  352. }
  353. ubrk_close( bi );
  354. if ( !u8_sub_str ) {
  355. /* Set global error code. */
  356. intl_error_set_code( NULL, status );
  357. /* Set error messages. */
  358. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
  359. RETURN_FALSE;
  360. }
  361. /* return the allocated string, not a duplicate */
  362. RETVAL_NEW_STR(u8_sub_str);
  363. return;
  364. }
  365. if(length == 0) {
  366. /* empty length - we've validated start, we can return "" now */
  367. if (ustr) {
  368. efree(ustr);
  369. }
  370. ubrk_close(bi);
  371. RETURN_EMPTY_STRING();
  372. }
  373. /* find the end point of the string to return */
  374. if ( length < 0 ) {
  375. iter_func = ubrk_previous;
  376. ubrk_last(bi);
  377. iter_val = 1;
  378. }
  379. else {
  380. iter_func = ubrk_next;
  381. iter_val = -1;
  382. }
  383. sub_str_end_pos = 0;
  384. while ( length ) {
  385. sub_str_end_pos = iter_func(bi);
  386. if ( UBRK_DONE == sub_str_end_pos ) {
  387. break;
  388. }
  389. length += iter_val;
  390. }
  391. ubrk_close(bi);
  392. if ( UBRK_DONE == sub_str_end_pos) {
  393. if (length < 0) {
  394. efree(ustr);
  395. RETURN_EMPTY_STRING();
  396. } else {
  397. sub_str_end_pos = ustr_len;
  398. }
  399. }
  400. if (sub_str_start_pos > sub_str_end_pos) {
  401. efree(ustr);
  402. RETURN_EMPTY_STRING();
  403. }
  404. status = U_ZERO_ERROR;
  405. u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
  406. efree( ustr );
  407. if ( !u8_sub_str ) {
  408. /* Set global error code. */
  409. intl_error_set_code( NULL, status );
  410. /* Set error messages. */
  411. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
  412. RETURN_FALSE;
  413. }
  414. /* return the allocated string, not a duplicate */
  415. RETVAL_NEW_STR(u8_sub_str);
  416. }
  417. /* }}} */
  418. /* {{{ strstr_common_handler */
  419. static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
  420. {
  421. char *haystack, *needle;
  422. const char *found;
  423. size_t haystack_len, needle_len;
  424. int32_t ret_pos, uchar_pos;
  425. bool part = 0;
  426. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
  427. RETURN_THROWS();
  428. }
  429. if ( !f_ignore_case ) {
  430. /* ASCII optimization: quick check to see if the string might be there */
  431. found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
  432. /* if it isn't there the we are done */
  433. if ( !found ) {
  434. RETURN_FALSE;
  435. }
  436. /* if it is there, and if the haystack is ascii, we are all done */
  437. if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
  438. size_t found_offset = found - haystack;
  439. if (part) {
  440. RETURN_STRINGL(haystack, found_offset);
  441. } else {
  442. RETURN_STRINGL(found, haystack_len - found_offset);
  443. }
  444. }
  445. }
  446. /* need to work in utf16 */
  447. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
  448. if ( ret_pos < 0 ) {
  449. RETURN_FALSE;
  450. }
  451. /* uchar_pos is the 'nth' Unicode character position of the needle */
  452. ret_pos = 0;
  453. U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
  454. if (part) {
  455. RETURN_STRINGL(haystack, ret_pos);
  456. } else {
  457. RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
  458. }
  459. }
  460. /* }}} */
  461. /* {{{ Finds first occurrence of a string within another */
  462. PHP_FUNCTION(grapheme_strstr)
  463. {
  464. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
  465. }
  466. /* }}} */
  467. /* {{{ Finds first occurrence of a string within another */
  468. PHP_FUNCTION(grapheme_stristr)
  469. {
  470. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
  471. }
  472. /* }}} */
  473. /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
  474. static inline int32_t
  475. grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
  476. {
  477. int pos = 0;
  478. int ret_pos = 0;
  479. int break_pos, prev_break_pos;
  480. int count = 0;
  481. while ( 1 ) {
  482. pos = ubrk_next(bi);
  483. if ( UBRK_DONE == pos ) {
  484. break;
  485. }
  486. for ( break_pos = ret_pos; break_pos < pos; ) {
  487. count++;
  488. prev_break_pos = break_pos;
  489. U8_FWD_1(pstr, break_pos, str_len);
  490. if ( prev_break_pos == break_pos ) {
  491. /* something wrong - malformed utf8? */
  492. csize = 0;
  493. break;
  494. }
  495. }
  496. /* if we are beyond our limit, then the loop is done */
  497. if ( count > csize ) {
  498. break;
  499. }
  500. ret_pos = break_pos;
  501. }
  502. return ret_pos;
  503. }
  504. /* }}} */
  505. /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
  506. static inline int32_t
  507. grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
  508. {
  509. int pos = 0;
  510. int ret_pos = 0;
  511. while ( 1 ) {
  512. pos = ubrk_next(bi);
  513. if ( UBRK_DONE == pos ) {
  514. break;
  515. }
  516. if ( pos > bsize ) {
  517. break;
  518. }
  519. ret_pos = pos;
  520. }
  521. return ret_pos;
  522. }
  523. /* }}} */
  524. /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
  525. static inline int32_t
  526. grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
  527. {
  528. int next_pos = 0;
  529. int ret_pos = 0;
  530. while ( size ) {
  531. next_pos = ubrk_next(bi);
  532. if ( UBRK_DONE == next_pos ) {
  533. break;
  534. }
  535. ret_pos = next_pos;
  536. size--;
  537. }
  538. return ret_pos;
  539. }
  540. /* }}} */
  541. /* {{{ grapheme extract iter function pointer array */
  542. typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
  543. static grapheme_extract_iter grapheme_extract_iters[] = {
  544. &grapheme_extract_count_iter,
  545. &grapheme_extract_bytecount_iter,
  546. &grapheme_extract_charcount_iter,
  547. };
  548. /* }}} */
  549. /* {{{ Function to extract a sequence of default grapheme clusters */
  550. PHP_FUNCTION(grapheme_extract)
  551. {
  552. char *str, *pstr;
  553. UText ut = UTEXT_INITIALIZER;
  554. size_t str_len;
  555. zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
  556. zend_long lstart = 0; /* starting position in str in bytes */
  557. int32_t start = 0;
  558. zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
  559. UErrorCode status;
  560. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  561. UBreakIterator* bi = NULL;
  562. int ret_pos;
  563. zval *next = NULL; /* return offset of next part of the string */
  564. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
  565. RETURN_THROWS();
  566. }
  567. if (lstart < 0) {
  568. lstart += str_len;
  569. }
  570. if ( NULL != next ) {
  571. if ( !Z_ISREF_P(next) ) {
  572. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  573. "grapheme_extract: 'next' was not passed by reference", 0 );
  574. RETURN_FALSE;
  575. } else {
  576. ZVAL_DEREF(next);
  577. /* initialize next */
  578. zval_ptr_dtor(next);
  579. ZVAL_LONG(next, lstart);
  580. }
  581. }
  582. if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
  583. zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
  584. RETURN_THROWS();
  585. }
  586. if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
  587. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
  588. RETURN_FALSE;
  589. }
  590. if (size < 0) {
  591. zend_argument_value_error(2, "must be greater than or equal to 0");
  592. RETURN_THROWS();
  593. }
  594. if (size > INT32_MAX) {
  595. zend_argument_value_error(2, "is too large");
  596. RETURN_THROWS();
  597. }
  598. if (size == 0) {
  599. RETURN_EMPTY_STRING();
  600. }
  601. /* we checked that it will fit: */
  602. start = (int32_t) lstart;
  603. pstr = str + start;
  604. /* just in case pstr points in the middle of a character, move forward to the start of the next char */
  605. if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  606. char *str_end = str + str_len;
  607. while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  608. pstr++;
  609. if ( pstr >= str_end ) {
  610. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  611. "grapheme_extract: invalid input string", 0 );
  612. RETURN_FALSE;
  613. }
  614. }
  615. }
  616. str_len -= (pstr - str);
  617. /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
  618. (size + 1 because the size-th character might be the beginning of a grapheme cluster)
  619. */
  620. if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
  621. size_t nsize = MIN(size, str_len);
  622. if ( NULL != next ) {
  623. ZVAL_LONG(next, start+nsize);
  624. }
  625. RETURN_STRINGL(pstr, nsize);
  626. }
  627. status = U_ZERO_ERROR;
  628. utext_openUTF8(&ut, pstr, str_len, &status);
  629. if ( U_FAILURE( status ) ) {
  630. /* Set global error code. */
  631. intl_error_set_code( NULL, status );
  632. /* Set error messages. */
  633. intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
  634. RETURN_FALSE;
  635. }
  636. bi = NULL;
  637. status = U_ZERO_ERROR;
  638. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
  639. ubrk_setUText(bi, &ut, &status);
  640. /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
  641. can't back up. So, we will not do anything. */
  642. /* now we need to find the end of the chunk the user wants us to return */
  643. /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
  644. ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
  645. utext_close(&ut);
  646. ubrk_close(bi);
  647. if ( NULL != next ) {
  648. ZVAL_LONG(next, start+ret_pos);
  649. }
  650. RETURN_STRINGL(((char *)pstr), ret_pos);
  651. }
  652. /* }}} */