grapheme_string.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913
  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 7 |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | http://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Ed Batutis <ed@batutis.com> |
  14. +----------------------------------------------------------------------+
  15. */
  16. /* {{{ includes */
  17. #ifdef HAVE_CONFIG_H
  18. #include "config.h"
  19. #endif
  20. #include <php.h>
  21. #include "grapheme.h"
  22. #include "grapheme_util.h"
  23. #include <unicode/utypes.h>
  24. #if U_ICU_VERSION_MAJOR_NUM >= 49
  25. #include <unicode/utf8.h>
  26. #endif
  27. #include <unicode/ucol.h>
  28. #include <unicode/ustring.h>
  29. #include <unicode/ubrk.h>
  30. #include "ext/standard/php_string.h"
  31. /* }}} */
  /*
   * Extraction modes understood by grapheme_extract(). The numeric value is
   * also the index into grapheme_extract_iters[], so the two must stay in sync.
   */
  #define GRAPHEME_EXTRACT_TYPE_COUNT    0  /* size limits grapheme clusters */
  #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1  /* size limits bytes */
  #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2  /* size limits characters */

  /* Inclusive bounds used to validate a caller-supplied extract type. */
  #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
  #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
  37. /* {{{ grapheme_register_constants
  38. * Register API constants
  39. */
  40. void grapheme_register_constants( INIT_FUNC_ARGS )
  41. {
  42. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  43. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  44. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  45. }
  46. /* }}} */
  47. /* {{{ proto size_t grapheme_strlen(string str)
  48. Get number of graphemes in a string */
  49. PHP_FUNCTION(grapheme_strlen)
  50. {
  51. char* string;
  52. size_t string_len;
  53. UChar* ustring = NULL;
  54. int ustring_len = 0;
  55. zend_long ret_len;
  56. UErrorCode status;
  57. if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
  58. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  59. "grapheme_strlen: unable to parse input param", 0 );
  60. RETURN_FALSE;
  61. }
  62. ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
  63. if ( ret_len >= 0 )
  64. RETURN_LONG(string_len);
  65. /* convert the string to UTF-16. */
  66. status = U_ZERO_ERROR;
  67. intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
  68. if ( U_FAILURE( status ) ) {
  69. /* Set global error code. */
  70. intl_error_set_code( NULL, status );
  71. /* Set error messages. */
  72. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
  73. if (ustring) {
  74. efree( ustring );
  75. }
  76. RETURN_NULL();
  77. }
  78. ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
  79. if (ustring) {
  80. efree( ustring );
  81. }
  82. if (ret_len >= 0) {
  83. RETVAL_LONG(ret_len);
  84. } else {
  85. RETVAL_FALSE;
  86. }
  87. }
  88. /* }}} */
  89. /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
  90. Find position of first occurrence of a string within another */
  91. PHP_FUNCTION(grapheme_strpos)
  92. {
  93. char *haystack, *needle;
  94. size_t haystack_len, needle_len;
  95. const char *found;
  96. zend_long loffset = 0;
  97. int32_t offset = 0;
  98. size_t noffset = 0;
  99. zend_long ret_pos;
  100. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  101. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  102. "grapheme_strpos: unable to parse input param", 0 );
  103. RETURN_FALSE;
  104. }
  105. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  106. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
  107. RETURN_FALSE;
  108. }
  109. /* we checked that it will fit: */
  110. offset = (int32_t) loffset;
  111. noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
  112. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  113. if (needle_len == 0) {
  114. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
  115. RETURN_FALSE;
  116. }
  117. if (offset >= 0) {
  118. /* quick check to see if the string might be there
  119. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  120. */
  121. found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
  122. /* if it isn't there the we are done */
  123. if (!found) {
  124. RETURN_FALSE;
  125. }
  126. /* if it is there, and if the haystack is ascii, we are all done */
  127. if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
  128. RETURN_LONG(found - haystack);
  129. }
  130. }
  131. /* do utf16 part of the strpos */
  132. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
  133. if ( ret_pos >= 0 ) {
  134. RETURN_LONG(ret_pos);
  135. } else {
  136. RETURN_FALSE;
  137. }
  138. }
  139. /* }}} */
  140. /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
  141. Find position of first occurrence of a string within another, ignoring case differences */
  142. PHP_FUNCTION(grapheme_stripos)
  143. {
  144. char *haystack, *needle;
  145. size_t haystack_len, needle_len;
  146. const char *found;
  147. zend_long loffset = 0;
  148. int32_t offset = 0;
  149. zend_long ret_pos;
  150. int is_ascii;
  151. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  152. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  153. "grapheme_stripos: unable to parse input param", 0 );
  154. RETURN_FALSE;
  155. }
  156. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  157. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
  158. RETURN_FALSE;
  159. }
  160. /* we checked that it will fit: */
  161. offset = (int32_t) loffset;
  162. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  163. if (needle_len == 0) {
  164. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
  165. RETURN_FALSE;
  166. }
  167. is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
  168. if ( is_ascii ) {
  169. char *haystack_dup, *needle_dup;
  170. int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
  171. needle_dup = estrndup(needle, needle_len);
  172. php_strtolower(needle_dup, needle_len);
  173. haystack_dup = estrndup(haystack, haystack_len);
  174. php_strtolower(haystack_dup, haystack_len);
  175. found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
  176. efree(haystack_dup);
  177. efree(needle_dup);
  178. if (found) {
  179. RETURN_LONG(found - haystack_dup);
  180. }
  181. /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
  182. if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
  183. RETURN_FALSE;
  184. }
  185. }
  186. /* do utf16 part of the strpos */
  187. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
  188. if ( ret_pos >= 0 ) {
  189. RETURN_LONG(ret_pos);
  190. } else {
  191. RETURN_FALSE;
  192. }
  193. }
  194. /* }}} */
  195. /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
  196. Find position of last occurrence of a string within another */
  197. PHP_FUNCTION(grapheme_strrpos)
  198. {
  199. char *haystack, *needle;
  200. size_t haystack_len, needle_len;
  201. zend_long loffset = 0;
  202. int32_t offset = 0;
  203. zend_long ret_pos;
  204. int is_ascii;
  205. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  206. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  207. "grapheme_strrpos: unable to parse input param", 0 );
  208. RETURN_FALSE;
  209. }
  210. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  211. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
  212. RETURN_FALSE;
  213. }
  214. /* we checked that it will fit: */
  215. offset = (int32_t) loffset;
  216. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  217. if (needle_len == 0) {
  218. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
  219. RETURN_FALSE;
  220. }
  221. is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
  222. if ( is_ascii ) {
  223. ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
  224. if ( ret_pos >= 0 ) {
  225. RETURN_LONG(ret_pos);
  226. }
  227. /* if the needle was ascii too, we are done */
  228. if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
  229. RETURN_FALSE;
  230. }
  231. /* else we need to continue via utf16 */
  232. }
  233. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
  234. if ( ret_pos >= 0 ) {
  235. RETURN_LONG(ret_pos);
  236. } else {
  237. RETURN_FALSE;
  238. }
  239. }
  240. /* }}} */
  241. /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
  242. Find position of last occurrence of a string within another, ignoring case */
  243. PHP_FUNCTION(grapheme_strripos)
  244. {
  245. char *haystack, *needle;
  246. size_t haystack_len, needle_len;
  247. zend_long loffset = 0;
  248. int32_t offset = 0;
  249. zend_long ret_pos;
  250. int is_ascii;
  251. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
  252. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  253. "grapheme_strrpos: unable to parse input param", 0 );
  254. RETURN_FALSE;
  255. }
  256. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  257. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
  258. RETURN_FALSE;
  259. }
  260. /* we checked that it will fit: */
  261. offset = (int32_t) loffset;
  262. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  263. if (needle_len == 0) {
  264. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
  265. RETURN_FALSE;
  266. }
  267. is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
  268. if ( is_ascii ) {
  269. char *needle_dup, *haystack_dup;
  270. needle_dup = estrndup(needle, needle_len);
  271. php_strtolower(needle_dup, needle_len);
  272. haystack_dup = estrndup(haystack, haystack_len);
  273. php_strtolower(haystack_dup, haystack_len);
  274. ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
  275. efree(haystack_dup);
  276. efree(needle_dup);
  277. if ( ret_pos >= 0 ) {
  278. RETURN_LONG(ret_pos);
  279. }
  280. /* if the needle was ascii too, we are done */
  281. if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
  282. RETURN_FALSE;
  283. }
  284. /* else we need to continue via utf16 */
  285. }
  286. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
  287. if ( ret_pos >= 0 ) {
  288. RETURN_LONG(ret_pos);
  289. } else {
  290. RETURN_FALSE;
  291. }
  292. }
  293. /* }}} */
  294. /* {{{ proto string grapheme_substr(string str, int start [, int length])
  295. Returns part of a string */
  296. PHP_FUNCTION(grapheme_substr)
  297. {
  298. char *str;
  299. zend_string *u8_sub_str;
  300. UChar *ustr;
  301. size_t str_len;
  302. int32_t ustr_len;
  303. zend_long lstart = 0, length = 0;
  304. int32_t start = 0;
  305. int iter_val;
  306. UErrorCode status;
  307. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  308. UBreakIterator* bi = NULL;
  309. int sub_str_start_pos, sub_str_end_pos;
  310. int32_t (*iter_func)(UBreakIterator *);
  311. zend_bool no_length = 1;
  312. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
  313. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  314. "grapheme_substr: unable to parse input param", 0 );
  315. RETURN_FALSE;
  316. }
  317. if ( OUTSIDE_STRING(lstart, str_len)) {
  318. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
  319. RETURN_FALSE;
  320. }
  321. /* we checked that it will fit: */
  322. start = (int32_t) lstart;
  323. if(no_length) {
  324. length = str_len;
  325. }
  326. if(length < INT32_MIN) {
  327. length = INT32_MIN;
  328. } else if(length > INT32_MAX) {
  329. length = INT32_MAX;
  330. }
  331. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  332. if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
  333. int32_t asub_str_len;
  334. char *sub_str;
  335. grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
  336. if ( NULL == sub_str ) {
  337. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
  338. RETURN_FALSE;
  339. }
  340. RETURN_STRINGL(sub_str, asub_str_len);
  341. }
  342. ustr = NULL;
  343. ustr_len = 0;
  344. status = U_ZERO_ERROR;
  345. intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
  346. if ( U_FAILURE( status ) ) {
  347. /* Set global error code. */
  348. intl_error_set_code( NULL, status );
  349. /* Set error messages. */
  350. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
  351. if (ustr) {
  352. efree( ustr );
  353. }
  354. RETURN_FALSE;
  355. }
  356. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
  357. if( U_FAILURE(status) ) {
  358. RETURN_FALSE;
  359. }
  360. ubrk_setText(bi, ustr, ustr_len, &status);
  361. if ( start < 0 ) {
  362. iter_func = ubrk_previous;
  363. ubrk_last(bi);
  364. iter_val = 1;
  365. }
  366. else {
  367. iter_func = ubrk_next;
  368. iter_val = -1;
  369. }
  370. sub_str_start_pos = 0;
  371. while ( start ) {
  372. sub_str_start_pos = iter_func(bi);
  373. if ( UBRK_DONE == sub_str_start_pos ) {
  374. break;
  375. }
  376. start += iter_val;
  377. }
  378. if ( 0 != start || sub_str_start_pos >= ustr_len ) {
  379. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
  380. if (ustr) {
  381. efree(ustr);
  382. }
  383. ubrk_close(bi);
  384. RETURN_FALSE;
  385. }
  386. /* OK to convert here since if str_len were big, convert above would fail */
  387. if (length >= (int32_t)str_len) {
  388. /* no length supplied or length is too big, return the rest of the string */
  389. status = U_ZERO_ERROR;
  390. u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
  391. if (ustr) {
  392. efree( ustr );
  393. }
  394. ubrk_close( bi );
  395. if ( !u8_sub_str ) {
  396. /* Set global error code. */
  397. intl_error_set_code( NULL, status );
  398. /* Set error messages. */
  399. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
  400. RETURN_FALSE;
  401. }
  402. /* return the allocated string, not a duplicate */
  403. RETVAL_NEW_STR(u8_sub_str);
  404. return;
  405. }
  406. if(length == 0) {
  407. /* empty length - we've validated start, we can return "" now */
  408. if (ustr) {
  409. efree(ustr);
  410. }
  411. ubrk_close(bi);
  412. RETURN_EMPTY_STRING();
  413. }
  414. /* find the end point of the string to return */
  415. if ( length < 0 ) {
  416. iter_func = ubrk_previous;
  417. ubrk_last(bi);
  418. iter_val = 1;
  419. }
  420. else {
  421. iter_func = ubrk_next;
  422. iter_val = -1;
  423. }
  424. sub_str_end_pos = 0;
  425. while ( length ) {
  426. sub_str_end_pos = iter_func(bi);
  427. if ( UBRK_DONE == sub_str_end_pos ) {
  428. break;
  429. }
  430. length += iter_val;
  431. }
  432. ubrk_close(bi);
  433. if ( UBRK_DONE == sub_str_end_pos) {
  434. if(length < 0) {
  435. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
  436. efree(ustr);
  437. RETURN_FALSE;
  438. } else {
  439. sub_str_end_pos = ustr_len;
  440. }
  441. }
  442. if(sub_str_start_pos > sub_str_end_pos) {
  443. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
  444. efree(ustr);
  445. RETURN_FALSE;
  446. }
  447. status = U_ZERO_ERROR;
  448. u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
  449. efree( ustr );
  450. if ( !u8_sub_str ) {
  451. /* Set global error code. */
  452. intl_error_set_code( NULL, status );
  453. /* Set error messages. */
  454. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
  455. RETURN_FALSE;
  456. }
  457. /* return the allocated string, not a duplicate */
  458. RETVAL_NEW_STR(u8_sub_str);
  459. }
  460. /* }}} */
  461. /* {{{ strstr_common_handler */
  462. static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
  463. {
  464. char *haystack, *needle;
  465. const char *found;
  466. size_t haystack_len, needle_len;
  467. int32_t ret_pos, uchar_pos;
  468. zend_bool part = 0;
  469. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
  470. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  471. "grapheme_strstr: unable to parse input param", 0 );
  472. RETURN_FALSE;
  473. }
  474. if (needle_len == 0) {
  475. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
  476. RETURN_FALSE;
  477. }
  478. if ( !f_ignore_case ) {
  479. /* ASCII optimization: quick check to see if the string might be there
  480. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  481. */
  482. found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
  483. /* if it isn't there the we are done */
  484. if ( !found ) {
  485. RETURN_FALSE;
  486. }
  487. /* if it is there, and if the haystack is ascii, we are all done */
  488. if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
  489. size_t found_offset = found - haystack;
  490. if (part) {
  491. RETURN_STRINGL(haystack, found_offset);
  492. } else {
  493. RETURN_STRINGL(found, haystack_len - found_offset);
  494. }
  495. }
  496. }
  497. /* need to work in utf16 */
  498. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
  499. if ( ret_pos < 0 ) {
  500. RETURN_FALSE;
  501. }
  502. /* uchar_pos is the 'nth' Unicode character position of the needle */
  503. ret_pos = 0;
  504. U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
  505. if (part) {
  506. RETURN_STRINGL(haystack, ret_pos);
  507. } else {
  508. RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
  509. }
  510. }
  511. /* }}} */
  512. /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
  513. Finds first occurrence of a string within another */
  514. PHP_FUNCTION(grapheme_strstr)
  515. {
  516. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
  517. }
  518. /* }}} */
  519. /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
  520. Finds first occurrence of a string within another */
  521. PHP_FUNCTION(grapheme_stristr)
  522. {
  523. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
  524. }
  525. /* }}} */
  526. /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
  527. static inline int32_t
  528. grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
  529. {
  530. int pos = 0;
  531. int ret_pos = 0;
  532. int break_pos, prev_break_pos;
  533. int count = 0;
  534. while ( 1 ) {
  535. pos = ubrk_next(bi);
  536. if ( UBRK_DONE == pos ) {
  537. break;
  538. }
  539. for ( break_pos = ret_pos; break_pos < pos; ) {
  540. count++;
  541. prev_break_pos = break_pos;
  542. U8_FWD_1(pstr, break_pos, str_len);
  543. if ( prev_break_pos == break_pos ) {
  544. /* something wrong - malformed utf8? */
  545. csize = 0;
  546. break;
  547. }
  548. }
  549. /* if we are beyond our limit, then the loop is done */
  550. if ( count > csize ) {
  551. break;
  552. }
  553. ret_pos = break_pos;
  554. }
  555. return ret_pos;
  556. }
  557. /* }}} */
  558. /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
  559. static inline int32_t
  560. grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
  561. {
  562. int pos = 0;
  563. int ret_pos = 0;
  564. while ( 1 ) {
  565. pos = ubrk_next(bi);
  566. if ( UBRK_DONE == pos ) {
  567. break;
  568. }
  569. if ( pos > bsize ) {
  570. break;
  571. }
  572. ret_pos = pos;
  573. }
  574. return ret_pos;
  575. }
  576. /* }}} */
  577. /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
  578. static inline int32_t
  579. grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
  580. {
  581. int next_pos = 0;
  582. int ret_pos = 0;
  583. while ( size ) {
  584. next_pos = ubrk_next(bi);
  585. if ( UBRK_DONE == next_pos ) {
  586. break;
  587. }
  588. ret_pos = next_pos;
  589. size--;
  590. }
  591. return ret_pos;
  592. }
  593. /* }}} */
  594. /* {{{ grapheme extract iter function pointer array */
  595. typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
  596. static grapheme_extract_iter grapheme_extract_iters[] = {
  597. &grapheme_extract_count_iter,
  598. &grapheme_extract_bytecount_iter,
  599. &grapheme_extract_charcount_iter,
  600. };
  601. /* }}} */
  602. /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
  603. Function to extract a sequence of default grapheme clusters */
  604. PHP_FUNCTION(grapheme_extract)
  605. {
  606. char *str, *pstr;
  607. UText ut = UTEXT_INITIALIZER;
  608. size_t str_len;
  609. zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
  610. zend_long lstart = 0; /* starting position in str in bytes */
  611. int32_t start = 0;
  612. zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
  613. UErrorCode status;
  614. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  615. UBreakIterator* bi = NULL;
  616. int ret_pos;
  617. zval *next = NULL; /* return offset of next part of the string */
  618. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
  619. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  620. "grapheme_extract: unable to parse input param", 0 );
  621. RETURN_FALSE;
  622. }
  623. if (lstart < 0) {
  624. lstart += str_len;
  625. }
  626. if ( NULL != next ) {
  627. if ( !Z_ISREF_P(next) ) {
  628. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  629. "grapheme_extract: 'next' was not passed by reference", 0 );
  630. RETURN_FALSE;
  631. } else {
  632. ZVAL_DEREF(next);
  633. /* initialize next */
  634. zval_ptr_dtor(next);
  635. ZVAL_LONG(next, lstart);
  636. }
  637. }
  638. if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
  639. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  640. "grapheme_extract: unknown extract type param", 0 );
  641. RETURN_FALSE;
  642. }
  643. if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
  644. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
  645. RETURN_FALSE;
  646. }
  647. if ( size > INT32_MAX || size < 0) {
  648. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
  649. RETURN_FALSE;
  650. }
  651. if (size == 0) {
  652. RETURN_EMPTY_STRING();
  653. }
  654. /* we checked that it will fit: */
  655. start = (int32_t) lstart;
  656. pstr = str + start;
  657. /* just in case pstr points in the middle of a character, move forward to the start of the next char */
  658. if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  659. char *str_end = str + str_len;
  660. while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  661. pstr++;
  662. if ( pstr >= str_end ) {
  663. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  664. "grapheme_extract: invalid input string", 0 );
  665. RETURN_FALSE;
  666. }
  667. }
  668. }
  669. str_len -= (pstr - str);
  670. /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
  671. (size + 1 because the size-th character might be the beginning of a grapheme cluster)
  672. */
  673. if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
  674. size_t nsize = MIN(size, str_len);
  675. if ( NULL != next ) {
  676. ZVAL_LONG(next, start+nsize);
  677. }
  678. RETURN_STRINGL(pstr, nsize);
  679. }
  680. status = U_ZERO_ERROR;
  681. utext_openUTF8(&ut, pstr, str_len, &status);
  682. if ( U_FAILURE( status ) ) {
  683. /* Set global error code. */
  684. intl_error_set_code( NULL, status );
  685. /* Set error messages. */
  686. intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
  687. RETURN_FALSE;
  688. }
  689. bi = NULL;
  690. status = U_ZERO_ERROR;
  691. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
  692. ubrk_setUText(bi, &ut, &status);
  693. /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
  694. can't back up. So, we will not do anything. */
  695. /* now we need to find the end of the chunk the user wants us to return */
  696. /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
  697. ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
  698. utext_close(&ut);
  699. ubrk_close(bi);
  700. if ( NULL != next ) {
  701. ZVAL_LONG(next, start+ret_pos);
  702. }
  703. RETURN_STRINGL(((char *)pstr), ret_pos);
  704. }
  705. /* }}} */
  706. /*
  707. * Local variables:
  708. * tab-width: 4
  709. * c-basic-offset: 4
  710. * End:
  711. * vim600: fdm=marker
  712. * vim: noet sw=4 ts=4
  713. */