grapheme_string.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936
  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 5 |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | http://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Ed Batutis <ed@batutis.com> |
  14. +----------------------------------------------------------------------+
  15. */
  16. /* {{{ includes */
  17. #ifdef HAVE_CONFIG_H
  18. #include "config.h"
  19. #endif
  20. #include <php.h>
  21. #include "grapheme.h"
  22. #include "grapheme_util.h"
  23. #include <unicode/utypes.h>
  24. #include <unicode/ucol.h>
  25. #include <unicode/ustring.h>
  26. #include <unicode/ubrk.h>
  27. #include "ext/standard/php_string.h"
  28. /* }}} */
  /* Extraction modes accepted by grapheme_extract(). The numeric values are
   * also used as indexes into grapheme_extract_iters[], so they must stay
   * dense and start at zero. */
  #define GRAPHEME_EXTRACT_TYPE_COUNT 0
  #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
  #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
  /* Inclusive bounds used to validate a caller-supplied extract type. */
  #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
  #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
  34. /* {{{ grapheme_register_constants
  35. * Register API constants
  36. */
  37. void grapheme_register_constants( INIT_FUNC_ARGS )
  38. {
  39. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  40. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  41. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  42. }
  43. /* }}} */
  44. /* {{{ proto int grapheme_strlen(string str)
  45. Get number of graphemes in a string */
  46. PHP_FUNCTION(grapheme_strlen)
  47. {
  48. unsigned char* string;
  49. int string_len;
  50. UChar* ustring = NULL;
  51. int ustring_len = 0;
  52. int ret_len;
  53. UErrorCode status;
  54. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
  55. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  56. "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
  57. RETURN_FALSE;
  58. }
  59. ret_len = grapheme_ascii_check(string, string_len);
  60. if ( ret_len >= 0 )
  61. RETURN_LONG(ret_len);
  62. /* convert the string to UTF-16. */
  63. status = U_ZERO_ERROR;
  64. intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
  65. if ( U_FAILURE( status ) ) {
  66. /* Set global error code. */
  67. intl_error_set_code( NULL, status TSRMLS_CC );
  68. /* Set error messages. */
  69. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  70. if (ustring) {
  71. efree( ustring );
  72. }
  73. RETURN_NULL();
  74. }
  75. ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
  76. if (ustring) {
  77. efree( ustring );
  78. }
  79. if (ret_len >= 0) {
  80. RETVAL_LONG(ret_len);
  81. } else {
  82. RETVAL_FALSE;
  83. }
  84. }
  85. /* }}} */
  86. /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
  87. Find position of first occurrence of a string within another */
  88. PHP_FUNCTION(grapheme_strpos)
  89. {
  90. unsigned char *haystack, *needle;
  91. int haystack_len, needle_len;
  92. unsigned char *found;
  93. long loffset = 0;
  94. int32_t offset = 0, noffset = 0;
  95. int ret_pos;
  96. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  97. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  98. "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
  99. RETURN_FALSE;
  100. }
  101. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  102. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  103. RETURN_FALSE;
  104. }
  105. /* we checked that it will fit: */
  106. offset = (int32_t) loffset;
  107. noffset = offset >= 0 ? offset : haystack_len + offset;
  108. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  109. if (needle_len == 0) {
  110. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  111. RETURN_FALSE;
  112. }
  113. /* quick check to see if the string might be there
  114. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  115. */
  116. found = (unsigned char *)php_memnstr((char *)haystack + noffset, (char *)needle, needle_len, (char *)haystack + haystack_len);
  117. /* if it isn't there the we are done */
  118. if (!found) {
  119. RETURN_FALSE;
  120. }
  121. /* if it is there, and if the haystack is ascii, we are all done */
  122. if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
  123. RETURN_LONG(found - haystack);
  124. }
  125. /* do utf16 part of the strpos */
  126. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
  127. if ( ret_pos >= 0 ) {
  128. RETURN_LONG(ret_pos);
  129. } else {
  130. RETURN_FALSE;
  131. }
  132. }
  133. /* }}} */
  134. /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
  135. Find position of first occurrence of a string within another, ignoring case differences */
  136. PHP_FUNCTION(grapheme_stripos)
  137. {
  138. unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
  139. int haystack_len, needle_len;
  140. unsigned char *found;
  141. long loffset = 0;
  142. int32_t offset = 0;
  143. int ret_pos;
  144. int is_ascii;
  145. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  146. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  147. "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
  148. RETURN_FALSE;
  149. }
  150. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  151. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
  152. RETURN_FALSE;
  153. }
  154. /* we checked that it will fit: */
  155. offset = (int32_t) loffset;
  156. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  157. if (needle_len == 0) {
  158. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
  159. RETURN_FALSE;
  160. }
  161. is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
  162. if ( is_ascii ) {
  163. int32_t noffset = offset >= 0 ? offset : haystack_len + offset;
  164. needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
  165. php_strtolower((char *)needle_dup, needle_len);
  166. haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
  167. php_strtolower((char *)haystack_dup, haystack_len);
  168. found = (unsigned char*) php_memnstr((char *)haystack_dup + noffset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
  169. efree(haystack_dup);
  170. efree(needle_dup);
  171. if (found) {
  172. RETURN_LONG(found - haystack_dup);
  173. }
  174. /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
  175. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  176. RETURN_FALSE;
  177. }
  178. }
  179. /* do utf16 part of the strpos */
  180. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
  181. if ( ret_pos >= 0 ) {
  182. RETURN_LONG(ret_pos);
  183. } else {
  184. RETURN_FALSE;
  185. }
  186. }
  187. /* }}} */
  188. /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
  189. Find position of last occurrence of a string within another */
  190. PHP_FUNCTION(grapheme_strrpos)
  191. {
  192. unsigned char *haystack, *needle;
  193. int haystack_len, needle_len;
  194. long loffset = 0;
  195. int32_t offset = 0;
  196. int32_t ret_pos;
  197. int is_ascii;
  198. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  199. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  200. "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
  201. RETURN_FALSE;
  202. }
  203. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  204. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  205. RETURN_FALSE;
  206. }
  207. /* we checked that it will fit: */
  208. offset = (int32_t) loffset;
  209. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  210. if (needle_len == 0) {
  211. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  212. RETURN_FALSE;
  213. }
  214. is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
  215. if ( is_ascii ) {
  216. ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
  217. if ( ret_pos >= 0 ) {
  218. RETURN_LONG(ret_pos);
  219. }
  220. /* if the needle was ascii too, we are done */
  221. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  222. RETURN_FALSE;
  223. }
  224. /* else we need to continue via utf16 */
  225. }
  226. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
  227. if ( ret_pos >= 0 ) {
  228. RETURN_LONG(ret_pos);
  229. } else {
  230. RETURN_FALSE;
  231. }
  232. }
  233. /* }}} */
  234. /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
  235. Find position of last occurrence of a string within another, ignoring case */
  236. PHP_FUNCTION(grapheme_strripos)
  237. {
  238. unsigned char *haystack, *needle;
  239. int haystack_len, needle_len;
  240. long loffset = 0;
  241. int32_t offset = 0;
  242. int32_t ret_pos;
  243. int is_ascii;
  244. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  245. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  246. "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
  247. RETURN_FALSE;
  248. }
  249. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  250. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  251. RETURN_FALSE;
  252. }
  253. /* we checked that it will fit: */
  254. offset = (int32_t) loffset;
  255. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  256. if (needle_len == 0) {
  257. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  258. RETURN_FALSE;
  259. }
  260. is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
  261. if ( is_ascii ) {
  262. unsigned char *needle_dup, *haystack_dup;
  263. needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
  264. php_strtolower((char *)needle_dup, needle_len);
  265. haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
  266. php_strtolower((char *)haystack_dup, haystack_len);
  267. ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
  268. efree(haystack_dup);
  269. efree(needle_dup);
  270. if ( ret_pos >= 0 ) {
  271. RETURN_LONG(ret_pos);
  272. }
  273. /* if the needle was ascii too, we are done */
  274. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  275. RETURN_FALSE;
  276. }
  277. /* else we need to continue via utf16 */
  278. }
  279. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
  280. if ( ret_pos >= 0 ) {
  281. RETURN_LONG(ret_pos);
  282. } else {
  283. RETURN_FALSE;
  284. }
  285. }
  286. /* }}} */
  287. /* {{{ proto string grapheme_substr(string str, int start [, int length])
  288. Returns part of a string */
  289. PHP_FUNCTION(grapheme_substr)
  290. {
  291. unsigned char *str, *sub_str;
  292. UChar *ustr;
  293. int str_len, sub_str_len, ustr_len;
  294. long lstart = 0, length = 0;
  295. int32_t start = 0;
  296. int iter_val;
  297. UErrorCode status;
  298. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  299. UBreakIterator* bi = NULL;
  300. int sub_str_start_pos, sub_str_end_pos;
  301. int32_t (*iter_func)(UBreakIterator *);
  302. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
  303. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  304. "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
  305. RETURN_FALSE;
  306. }
  307. if ( OUTSIDE_STRING(lstart, str_len) ) {
  308. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
  309. RETURN_FALSE;
  310. }
  311. /* we checked that it will fit: */
  312. start = (int32_t) lstart;
  313. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  314. if ( grapheme_ascii_check(str, str_len) >= 0 ) {
  315. grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
  316. if ( NULL == sub_str ) {
  317. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
  318. RETURN_FALSE;
  319. }
  320. RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
  321. }
  322. ustr = NULL;
  323. ustr_len = 0;
  324. status = U_ZERO_ERROR;
  325. intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
  326. if ( U_FAILURE( status ) ) {
  327. /* Set global error code. */
  328. intl_error_set_code( NULL, status TSRMLS_CC );
  329. /* Set error messages. */
  330. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  331. if (ustr) {
  332. efree( ustr );
  333. }
  334. RETURN_FALSE;
  335. }
  336. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
  337. if( U_FAILURE(status) ) {
  338. RETURN_FALSE;
  339. }
  340. ubrk_setText(bi, ustr, ustr_len, &status);
  341. if ( start < 0 ) {
  342. iter_func = ubrk_previous;
  343. ubrk_last(bi);
  344. iter_val = 1;
  345. }
  346. else {
  347. iter_func = ubrk_next;
  348. iter_val = -1;
  349. }
  350. sub_str_start_pos = 0;
  351. while ( start ) {
  352. sub_str_start_pos = iter_func(bi);
  353. if ( UBRK_DONE == sub_str_start_pos ) {
  354. break;
  355. }
  356. start += iter_val;
  357. }
  358. if ( 0 != start || sub_str_start_pos >= ustr_len ) {
  359. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
  360. if (ustr) {
  361. efree(ustr);
  362. }
  363. ubrk_close(bi);
  364. RETURN_FALSE;
  365. }
  366. if (ZEND_NUM_ARGS() <= 2) {
  367. /* no length supplied, return the rest of the string */
  368. sub_str = NULL;
  369. sub_str_len = 0;
  370. status = U_ZERO_ERROR;
  371. intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
  372. if (ustr) {
  373. efree( ustr );
  374. }
  375. ubrk_close( bi );
  376. if ( U_FAILURE( status ) ) {
  377. /* Set global error code. */
  378. intl_error_set_code( NULL, status TSRMLS_CC );
  379. /* Set error messages. */
  380. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
  381. if (sub_str) {
  382. efree( sub_str );
  383. }
  384. RETURN_FALSE;
  385. }
  386. /* return the allocated string, not a duplicate */
  387. RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
  388. }
  389. if(length == 0) {
  390. /* empty length - we've validated start, we can return "" now */
  391. if (ustr) {
  392. efree(ustr);
  393. }
  394. ubrk_close(bi);
  395. RETURN_EMPTY_STRING();
  396. }
  397. /* find the end point of the string to return */
  398. if ( length < 0 ) {
  399. iter_func = ubrk_previous;
  400. ubrk_last(bi);
  401. iter_val = 1;
  402. }
  403. else {
  404. iter_func = ubrk_next;
  405. iter_val = -1;
  406. }
  407. sub_str_end_pos = 0;
  408. while ( length ) {
  409. sub_str_end_pos = iter_func(bi);
  410. if ( UBRK_DONE == sub_str_end_pos ) {
  411. break;
  412. }
  413. length += iter_val;
  414. }
  415. ubrk_close(bi);
  416. if ( UBRK_DONE == sub_str_end_pos) {
  417. if(length < 0) {
  418. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
  419. efree(ustr);
  420. RETURN_FALSE;
  421. } else {
  422. sub_str_end_pos = ustr_len;
  423. }
  424. }
  425. if(sub_str_start_pos > sub_str_end_pos) {
  426. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
  427. efree(ustr);
  428. RETURN_FALSE;
  429. }
  430. sub_str = NULL;
  431. status = U_ZERO_ERROR;
  432. intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
  433. efree( ustr );
  434. if ( U_FAILURE( status ) ) {
  435. /* Set global error code. */
  436. intl_error_set_code( NULL, status TSRMLS_CC );
  437. /* Set error messages. */
  438. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
  439. if ( NULL != sub_str )
  440. efree( sub_str );
  441. RETURN_FALSE;
  442. }
  443. /* return the allocated string, not a duplicate */
  444. RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
  445. }
  446. /* }}} */
  447. /* {{{ strstr_common_handler */
  448. static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
  449. {
  450. unsigned char *haystack, *needle, *found;
  451. int haystack_len, needle_len;
  452. int ret_pos, uchar_pos;
  453. zend_bool part = 0;
  454. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
  455. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  456. "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
  457. RETURN_FALSE;
  458. }
  459. if (needle_len == 0) {
  460. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  461. RETURN_FALSE;
  462. }
  463. if ( !f_ignore_case ) {
  464. /* ASCII optimization: quick check to see if the string might be there
  465. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  466. */
  467. found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
  468. /* if it isn't there the we are done */
  469. if ( !found ) {
  470. RETURN_FALSE;
  471. }
  472. /* if it is there, and if the haystack is ascii, we are all done */
  473. if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
  474. size_t found_offset = found - haystack;
  475. if (part) {
  476. RETURN_STRINGL(((char *)haystack) , found_offset, 1);
  477. } else {
  478. RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
  479. }
  480. }
  481. }
  482. /* need to work in utf16 */
  483. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
  484. if ( ret_pos < 0 ) {
  485. RETURN_FALSE;
  486. }
  487. /* uchar_pos is the 'nth' Unicode character position of the needle */
  488. ret_pos = 0;
  489. U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
  490. if (part) {
  491. RETURN_STRINGL(((char *)haystack), ret_pos, 1);
  492. }
  493. else {
  494. RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
  495. }
  496. }
  497. /* }}} */
  498. /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
  499. Finds first occurrence of a string within another */
  500. PHP_FUNCTION(grapheme_strstr)
  501. {
  502. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
  503. }
  504. /* }}} */
  505. /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
  506. Finds first occurrence of a string within another */
  507. PHP_FUNCTION(grapheme_stristr)
  508. {
  509. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
  510. }
  511. /* }}} */
  512. /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
  513. static inline int32_t
  514. grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
  515. {
  516. int pos = 0;
  517. int ret_pos = 0;
  518. int break_pos, prev_break_pos;
  519. int count = 0;
  520. while ( 1 ) {
  521. pos = ubrk_next(bi);
  522. if ( UBRK_DONE == pos ) {
  523. break;
  524. }
  525. for ( break_pos = ret_pos; break_pos < pos; ) {
  526. count++;
  527. prev_break_pos = break_pos;
  528. U8_FWD_1(pstr, break_pos, str_len);
  529. if ( prev_break_pos == break_pos ) {
  530. /* something wrong - malformed utf8? */
  531. csize = 0;
  532. break;
  533. }
  534. }
  535. /* if we are beyond our limit, then the loop is done */
  536. if ( count > csize ) {
  537. break;
  538. }
  539. ret_pos = break_pos;
  540. }
  541. return ret_pos;
  542. }
  543. /* }}} */
  544. /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
  545. static inline int32_t
  546. grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
  547. {
  548. int pos = 0;
  549. int ret_pos = 0;
  550. while ( 1 ) {
  551. pos = ubrk_next(bi);
  552. if ( UBRK_DONE == pos ) {
  553. break;
  554. }
  555. if ( pos > bsize ) {
  556. break;
  557. }
  558. ret_pos = pos;
  559. }
  560. return ret_pos;
  561. }
  562. /* }}} */
  563. /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
  564. static inline int32_t
  565. grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
  566. {
  567. int next_pos = 0;
  568. int ret_pos = 0;
  569. while ( size ) {
  570. next_pos = ubrk_next(bi);
  571. if ( UBRK_DONE == next_pos ) {
  572. break;
  573. }
  574. ret_pos = next_pos;
  575. size--;
  576. }
  577. return ret_pos;
  578. }
  579. /* }}} */
  580. /* {{{ grapheme extract iter function pointer array */
  581. typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
  582. static grapheme_extract_iter grapheme_extract_iters[] = {
  583. &grapheme_extract_count_iter,
  584. &grapheme_extract_bytecount_iter,
  585. &grapheme_extract_charcount_iter,
  586. };
  587. /* }}} */
  588. /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
  589. Function to extract a sequence of default grapheme clusters */
  590. PHP_FUNCTION(grapheme_extract)
  591. {
  592. char *str, *pstr;
  593. UText ut = UTEXT_INITIALIZER;
  594. int str_len;
  595. long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
  596. long lstart = 0; /* starting position in str in bytes */
  597. int32_t start = 0;
  598. long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
  599. UErrorCode status;
  600. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  601. UBreakIterator* bi = NULL;
  602. int ret_pos;
  603. zval *next = NULL; /* return offset of next part of the string */
  604. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
  605. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  606. "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
  607. RETURN_FALSE;
  608. }
  609. if ( NULL != next ) {
  610. if ( !PZVAL_IS_REF(next) ) {
  611. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  612. "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
  613. RETURN_FALSE;
  614. }
  615. else {
  616. /* initialize next */
  617. zval_dtor(next);
  618. ZVAL_LONG(next, lstart);
  619. }
  620. }
  621. if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
  622. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  623. "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
  624. RETURN_FALSE;
  625. }
  626. if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
  627. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
  628. RETURN_FALSE;
  629. }
  630. if ( size > INT32_MAX || size < 0) {
  631. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
  632. RETURN_FALSE;
  633. }
  634. if (size == 0) {
  635. RETURN_EMPTY_STRING();
  636. }
  637. /* we checked that it will fit: */
  638. start = (int32_t) lstart;
  639. pstr = str + start;
  640. /* just in case pstr points in the middle of a character, move forward to the start of the next char */
  641. if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  642. unsigned char *str_end = str + str_len;
  643. while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  644. pstr++;
  645. if ( pstr >= str_end ) {
  646. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  647. "grapheme_extract: invalid input string", 0 TSRMLS_CC );
  648. RETURN_FALSE;
  649. }
  650. }
  651. }
  652. str_len -= (pstr - str);
  653. /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
  654. (size + 1 because the size-th character might be the beginning of a grapheme cluster)
  655. */
  656. if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
  657. long nsize = ( size < str_len ? size : str_len );
  658. if ( NULL != next ) {
  659. ZVAL_LONG(next, start+nsize);
  660. }
  661. RETURN_STRINGL(((char *)pstr), nsize, 1);
  662. }
  663. status = U_ZERO_ERROR;
  664. utext_openUTF8(&ut, pstr, str_len, &status);
  665. if ( U_FAILURE( status ) ) {
  666. /* Set global error code. */
  667. intl_error_set_code( NULL, status TSRMLS_CC );
  668. /* Set error messages. */
  669. intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
  670. RETURN_FALSE;
  671. }
  672. bi = NULL;
  673. status = U_ZERO_ERROR;
  674. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
  675. ubrk_setUText(bi, &ut, &status);
  676. /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
  677. can't back up. So, we will not do anything. */
  678. /* now we need to find the end of the chunk the user wants us to return */
  679. ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
  680. utext_close(&ut);
  681. ubrk_close(bi);
  682. if ( NULL != next ) {
  683. ZVAL_LONG(next, start+ret_pos);
  684. }
  685. RETURN_STRINGL(((char *)pstr), ret_pos, 1);
  686. }
  687. /* }}} */
  688. /*
  689. * Local variables:
  690. * tab-width: 4
  691. * c-basic-offset: 4
  692. * End:
  693. * vim600: fdm=marker
  694. * vim: noet sw=4 ts=4
  695. */