php_mbregex.c 43 KB


  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 7 |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 1997-2018 The PHP Group |
  6. +----------------------------------------------------------------------+
  7. | This source file is subject to version 3.01 of the PHP license, |
  8. | that is bundled with this package in the file LICENSE, and is |
  9. | available through the world-wide-web at the following url: |
  10. | http://www.php.net/license/3_01.txt |
  11. | If you did not receive a copy of the PHP license and are unable to |
  12. | obtain it through the world-wide-web, please send a note to |
  13. | license@php.net so we can mail you a copy immediately. |
  14. +----------------------------------------------------------------------+
  15. | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
  16. +----------------------------------------------------------------------+
  17. */
  18. #ifdef HAVE_CONFIG_H
  19. #include "config.h"
  20. #endif
  21. #include "php.h"
  22. #include "php_ini.h"
  23. #if HAVE_MBREGEX
  24. #include "zend_smart_str.h"
  25. #include "ext/standard/info.h"
  26. #include "php_mbregex.h"
  27. #include "mbstring.h"
  28. #include "php_onig_compat.h" /* must come prior to the oniguruma header */
  29. #include <oniguruma.h>
  30. #undef UChar
  /*
   * Compatibility shims for Oniguruma builds whose ONIGURUMA_VERSION_INT is
   * below 60800 (presumably releases older than 6.8.0), which do not provide
   * the OnigMatchParam API:
   *  - OnigMatchParam becomes an opaque void and the "constructor" yields NULL;
   *  - initialise / set-stack-limit / free become no-ops;
   *  - the *_with_param() entry points drop the trailing match-param argument
   *    and forward to the plain onig_search() / onig_match().
   */
  #if ONIGURUMA_VERSION_INT < 60800
  typedef void OnigMatchParam;
  #define onig_new_match_param() (NULL)
  #define onig_initialize_match_param(x) (void)(x)
  #define onig_set_match_stack_limit_size_of_match_param(x, y)
  #define onig_free_match_param(x)
  #define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
      onig_search(reg, str, end, start, range, region, option)
  #define onig_match_with_param(re, str, end, at, region, option, mp) \
      onig_match(re, str, end, at, region, option)
  #endif
  42. ZEND_EXTERN_MODULE_GLOBALS(mbstring)
  43. struct _zend_mb_regex_globals {
  44. OnigEncoding default_mbctype;
  45. OnigEncoding current_mbctype;
  46. HashTable ht_rc;
  47. zval search_str;
  48. zval *search_str_val;
  49. size_t search_pos;
  50. php_mb_regex_t *search_re;
  51. OnigRegion *search_regs;
  52. OnigOptionType regex_default_options;
  53. OnigSyntaxType *regex_default_syntax;
  54. };
  55. #define MBREX(g) (MBSTRG(mb_regex_globals)->g)
  56. /* {{{ static void php_mb_regex_free_cache() */
  57. static void php_mb_regex_free_cache(zval *el) {
  58. onig_free((php_mb_regex_t *)Z_PTR_P(el));
  59. }
  60. /* }}} */
  61. /* {{{ _php_mb_regex_globals_ctor */
  62. static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
  63. {
  64. pglobals->default_mbctype = ONIG_ENCODING_UTF8;
  65. pglobals->current_mbctype = ONIG_ENCODING_UTF8;
  66. ZVAL_UNDEF(&pglobals->search_str);
  67. pglobals->search_re = (php_mb_regex_t*)NULL;
  68. pglobals->search_pos = 0;
  69. pglobals->search_regs = (OnigRegion*)NULL;
  70. pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
  71. pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
  72. return SUCCESS;
  73. }
  74. /* }}} */
  75. /* {{{ _php_mb_regex_globals_dtor */
  76. static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals)
  77. {
  78. }
  79. /* }}} */
  80. /* {{{ php_mb_regex_globals_alloc */
  81. zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
  82. {
  83. zend_mb_regex_globals *pglobals = pemalloc(
  84. sizeof(zend_mb_regex_globals), 1);
  85. if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
  86. pefree(pglobals, 1);
  87. return NULL;
  88. }
  89. return pglobals;
  90. }
  91. /* }}} */
  92. /* {{{ php_mb_regex_globals_free */
  93. void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
  94. {
  95. if (!pglobals) {
  96. return;
  97. }
  98. _php_mb_regex_globals_dtor(pglobals);
  99. pefree(pglobals, 1);
  100. }
  101. /* }}} */
  102. /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
  103. PHP_MINIT_FUNCTION(mb_regex)
  104. {
  105. onig_init();
  106. return SUCCESS;
  107. }
  108. /* }}} */
  109. /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
  110. PHP_MSHUTDOWN_FUNCTION(mb_regex)
  111. {
  112. onig_end();
  113. return SUCCESS;
  114. }
  115. /* }}} */
  116. /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
  117. PHP_RINIT_FUNCTION(mb_regex)
  118. {
  119. if (!MBSTRG(mb_regex_globals)) return FAILURE;
  120. zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
  121. return SUCCESS;
  122. }
  123. /* }}} */
  124. /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
  125. PHP_RSHUTDOWN_FUNCTION(mb_regex)
  126. {
  127. MBREX(current_mbctype) = MBREX(default_mbctype);
  128. if (!Z_ISUNDEF(MBREX(search_str))) {
  129. zval_ptr_dtor(&MBREX(search_str));
  130. ZVAL_UNDEF(&MBREX(search_str));
  131. }
  132. MBREX(search_pos) = 0;
  133. MBREX(search_re) = NULL;
  134. if (MBREX(search_regs) != NULL) {
  135. onig_region_free(MBREX(search_regs), 1);
  136. MBREX(search_regs) = (OnigRegion *)NULL;
  137. }
  138. zend_hash_destroy(&MBREX(ht_rc));
  139. return SUCCESS;
  140. }
  141. /* }}} */
  142. /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
  143. PHP_MINFO_FUNCTION(mb_regex)
  144. {
  145. char buf[32];
  146. php_info_print_table_start();
  147. php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
  148. snprintf(buf, sizeof(buf), "%d.%d.%d",
  149. ONIGURUMA_VERSION_MAJOR,
  150. ONIGURUMA_VERSION_MINOR,
  151. ONIGURUMA_VERSION_TEENY);
  152. #ifdef PHP_ONIG_BUNDLED
  153. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  154. php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "On");
  155. #else /* USE_COMBINATION_EXPLOSION_CHECK */
  156. php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "Off");
  157. #endif /* USE_COMBINATION_EXPLOSION_CHECK */
  158. #endif /* PHP_BUNDLED_ONIG */
  159. php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
  160. php_info_print_table_end();
  161. }
  162. /* }}} */
  163. /*
  164. * encoding name resolver
  165. */
  166. /* {{{ encoding name map */
  167. typedef struct _php_mb_regex_enc_name_map_t {
  168. const char *names;
  169. OnigEncoding code;
  170. } php_mb_regex_enc_name_map_t;
  171. static const php_mb_regex_enc_name_map_t enc_name_map[] = {
  172. #ifdef ONIG_ENCODING_EUC_JP
  173. {
  174. "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
  175. ONIG_ENCODING_EUC_JP
  176. },
  177. #endif
  178. #ifdef ONIG_ENCODING_UTF8
  179. {
  180. "UTF-8\0UTF8\0",
  181. ONIG_ENCODING_UTF8
  182. },
  183. #endif
  184. #ifdef ONIG_ENCODING_UTF16_BE
  185. {
  186. "UTF-16\0UTF-16BE\0",
  187. ONIG_ENCODING_UTF16_BE
  188. },
  189. #endif
  190. #ifdef ONIG_ENCODING_UTF16_LE
  191. {
  192. "UTF-16LE\0",
  193. ONIG_ENCODING_UTF16_LE
  194. },
  195. #endif
  196. #ifdef ONIG_ENCODING_UTF32_BE
  197. {
  198. "UCS-4\0UTF-32\0UTF-32BE\0",
  199. ONIG_ENCODING_UTF32_BE
  200. },
  201. #endif
  202. #ifdef ONIG_ENCODING_UTF32_LE
  203. {
  204. "UCS-4LE\0UTF-32LE\0",
  205. ONIG_ENCODING_UTF32_LE
  206. },
  207. #endif
  208. #ifdef ONIG_ENCODING_SJIS
  209. {
  210. "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
  211. ONIG_ENCODING_SJIS
  212. },
  213. #endif
  214. #ifdef ONIG_ENCODING_BIG5
  215. {
  216. "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
  217. ONIG_ENCODING_BIG5
  218. },
  219. #endif
  220. #ifdef ONIG_ENCODING_EUC_CN
  221. {
  222. "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
  223. ONIG_ENCODING_EUC_CN
  224. },
  225. #endif
  226. #ifdef ONIG_ENCODING_EUC_TW
  227. {
  228. "EUC-TW\0EUCTW\0EUC_TW\0",
  229. ONIG_ENCODING_EUC_TW
  230. },
  231. #endif
  232. #ifdef ONIG_ENCODING_EUC_KR
  233. {
  234. "EUC-KR\0EUCKR\0EUC_KR\0",
  235. ONIG_ENCODING_EUC_KR
  236. },
  237. #endif
  238. #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
  239. {
  240. "KOI8\0KOI-8\0",
  241. ONIG_ENCODING_KOI8
  242. },
  243. #endif
  244. #ifdef ONIG_ENCODING_KOI8_R
  245. {
  246. "KOI8R\0KOI8-R\0KOI-8R\0",
  247. ONIG_ENCODING_KOI8_R
  248. },
  249. #endif
  250. #ifdef ONIG_ENCODING_ISO_8859_1
  251. {
  252. "ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0",
  253. ONIG_ENCODING_ISO_8859_1
  254. },
  255. #endif
  256. #ifdef ONIG_ENCODING_ISO_8859_2
  257. {
  258. "ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0",
  259. ONIG_ENCODING_ISO_8859_2
  260. },
  261. #endif
  262. #ifdef ONIG_ENCODING_ISO_8859_3
  263. {
  264. "ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0",
  265. ONIG_ENCODING_ISO_8859_3
  266. },
  267. #endif
  268. #ifdef ONIG_ENCODING_ISO_8859_4
  269. {
  270. "ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0",
  271. ONIG_ENCODING_ISO_8859_4
  272. },
  273. #endif
  274. #ifdef ONIG_ENCODING_ISO_8859_5
  275. {
  276. "ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0",
  277. ONIG_ENCODING_ISO_8859_5
  278. },
  279. #endif
  280. #ifdef ONIG_ENCODING_ISO_8859_6
  281. {
  282. "ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0",
  283. ONIG_ENCODING_ISO_8859_6
  284. },
  285. #endif
  286. #ifdef ONIG_ENCODING_ISO_8859_7
  287. {
  288. "ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0",
  289. ONIG_ENCODING_ISO_8859_7
  290. },
  291. #endif
  292. #ifdef ONIG_ENCODING_ISO_8859_8
  293. {
  294. "ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0",
  295. ONIG_ENCODING_ISO_8859_8
  296. },
  297. #endif
  298. #ifdef ONIG_ENCODING_ISO_8859_9
  299. {
  300. "ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0",
  301. ONIG_ENCODING_ISO_8859_9
  302. },
  303. #endif
  304. #ifdef ONIG_ENCODING_ISO_8859_10
  305. {
  306. "ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0",
  307. ONIG_ENCODING_ISO_8859_10
  308. },
  309. #endif
  310. #ifdef ONIG_ENCODING_ISO_8859_11
  311. {
  312. "ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0",
  313. ONIG_ENCODING_ISO_8859_11
  314. },
  315. #endif
  316. #ifdef ONIG_ENCODING_ISO_8859_13
  317. {
  318. "ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0",
  319. ONIG_ENCODING_ISO_8859_13
  320. },
  321. #endif
  322. #ifdef ONIG_ENCODING_ISO_8859_14
  323. {
  324. "ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0",
  325. ONIG_ENCODING_ISO_8859_14
  326. },
  327. #endif
  328. #ifdef ONIG_ENCODING_ISO_8859_15
  329. {
  330. "ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0",
  331. ONIG_ENCODING_ISO_8859_15
  332. },
  333. #endif
  334. #ifdef ONIG_ENCODING_ISO_8859_16
  335. {
  336. "ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0",
  337. ONIG_ENCODING_ISO_8859_16
  338. },
  339. #endif
  340. #ifdef ONIG_ENCODING_ASCII
  341. {
  342. "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
  343. ONIG_ENCODING_ASCII
  344. },
  345. #endif
  346. { NULL, ONIG_ENCODING_UNDEF }
  347. };
  348. /* }}} */
  349. /* {{{ php_mb_regex_name2mbctype */
  350. static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
  351. {
  352. const char *p;
  353. const php_mb_regex_enc_name_map_t *mapping;
  354. if (pname == NULL || !*pname) {
  355. return ONIG_ENCODING_UNDEF;
  356. }
  357. for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
  358. for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
  359. if (strcasecmp(p, pname) == 0) {
  360. return mapping->code;
  361. }
  362. }
  363. }
  364. return ONIG_ENCODING_UNDEF;
  365. }
  366. /* }}} */
  367. /* {{{ php_mb_regex_mbctype2name */
  368. static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
  369. {
  370. const php_mb_regex_enc_name_map_t *mapping;
  371. for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
  372. if (mapping->code == mbctype) {
  373. return mapping->names;
  374. }
  375. }
  376. return NULL;
  377. }
  378. /* }}} */
  379. /* {{{ php_mb_regex_set_mbctype */
  380. int php_mb_regex_set_mbctype(const char *encname)
  381. {
  382. OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
  383. if (mbctype == ONIG_ENCODING_UNDEF) {
  384. return FAILURE;
  385. }
  386. MBREX(current_mbctype) = mbctype;
  387. return SUCCESS;
  388. }
  389. /* }}} */
  390. /* {{{ php_mb_regex_set_default_mbctype */
  391. int php_mb_regex_set_default_mbctype(const char *encname)
  392. {
  393. OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
  394. if (mbctype == ONIG_ENCODING_UNDEF) {
  395. return FAILURE;
  396. }
  397. MBREX(default_mbctype) = mbctype;
  398. return SUCCESS;
  399. }
  400. /* }}} */
  401. /* {{{ php_mb_regex_get_mbctype */
  402. const char *php_mb_regex_get_mbctype(void)
  403. {
  404. return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
  405. }
  406. /* }}} */
  407. /* {{{ php_mb_regex_get_default_mbctype */
  408. const char *php_mb_regex_get_default_mbctype(void)
  409. {
  410. return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
  411. }
  412. /* }}} */
  413. /*
  414. * regex cache
  415. */
  416. /* {{{ php_mbregex_compile_pattern */
  417. static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax)
  418. {
  419. int err_code = 0;
  420. php_mb_regex_t *retval = NULL, *rc = NULL;
  421. OnigErrorInfo err_info;
  422. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  423. if (!php_mb_check_encoding(pattern, patlen, _php_mb_regex_mbctype2name(enc))) {
  424. php_error_docref(NULL, E_WARNING,
  425. "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
  426. return NULL;
  427. }
  428. rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
  429. if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
  430. if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
  431. onig_error_code_to_str(err_str, err_code, &err_info);
  432. php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
  433. return NULL;
  434. }
  435. if (rc == MBREX(search_re)) {
  436. /* reuse the new rc? see bug #72399 */
  437. MBREX(search_re) = NULL;
  438. }
  439. zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
  440. } else {
  441. retval = rc;
  442. }
  443. return retval;
  444. }
  445. /* }}} */
  446. /* {{{ _php_mb_regex_get_option_string */
  447. static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
  448. {
  449. size_t len_left = len;
  450. size_t len_req = 0;
  451. char *p = str;
  452. char c;
  453. if ((option & ONIG_OPTION_IGNORECASE) != 0) {
  454. if (len_left > 0) {
  455. --len_left;
  456. *(p++) = 'i';
  457. }
  458. ++len_req;
  459. }
  460. if ((option & ONIG_OPTION_EXTEND) != 0) {
  461. if (len_left > 0) {
  462. --len_left;
  463. *(p++) = 'x';
  464. }
  465. ++len_req;
  466. }
  467. if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) ==
  468. (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) {
  469. if (len_left > 0) {
  470. --len_left;
  471. *(p++) = 'p';
  472. }
  473. ++len_req;
  474. } else {
  475. if ((option & ONIG_OPTION_MULTILINE) != 0) {
  476. if (len_left > 0) {
  477. --len_left;
  478. *(p++) = 'm';
  479. }
  480. ++len_req;
  481. }
  482. if ((option & ONIG_OPTION_SINGLELINE) != 0) {
  483. if (len_left > 0) {
  484. --len_left;
  485. *(p++) = 's';
  486. }
  487. ++len_req;
  488. }
  489. }
  490. if ((option & ONIG_OPTION_FIND_LONGEST) != 0) {
  491. if (len_left > 0) {
  492. --len_left;
  493. *(p++) = 'l';
  494. }
  495. ++len_req;
  496. }
  497. if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) {
  498. if (len_left > 0) {
  499. --len_left;
  500. *(p++) = 'n';
  501. }
  502. ++len_req;
  503. }
  504. c = 0;
  505. if (syntax == ONIG_SYNTAX_JAVA) {
  506. c = 'j';
  507. } else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
  508. c = 'u';
  509. } else if (syntax == ONIG_SYNTAX_GREP) {
  510. c = 'g';
  511. } else if (syntax == ONIG_SYNTAX_EMACS) {
  512. c = 'c';
  513. } else if (syntax == ONIG_SYNTAX_RUBY) {
  514. c = 'r';
  515. } else if (syntax == ONIG_SYNTAX_PERL) {
  516. c = 'z';
  517. } else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
  518. c = 'b';
  519. } else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
  520. c = 'd';
  521. }
  522. if (c != 0) {
  523. if (len_left > 0) {
  524. --len_left;
  525. *(p++) = c;
  526. }
  527. ++len_req;
  528. }
  529. if (len_left > 0) {
  530. --len_left;
  531. *(p++) = '\0';
  532. }
  533. ++len_req;
  534. if (len < len_req) {
  535. return len_req;
  536. }
  537. return 0;
  538. }
  539. /* }}} */
  540. /* {{{ _php_mb_regex_init_options */
  541. static void
  542. _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
  543. {
  544. size_t n;
  545. char c;
  546. OnigOptionType optm = 0;
  547. *syntax = ONIG_SYNTAX_RUBY;
  548. if (parg != NULL) {
  549. n = 0;
  550. while(n < narg) {
  551. c = parg[n++];
  552. switch (c) {
  553. case 'i':
  554. optm |= ONIG_OPTION_IGNORECASE;
  555. break;
  556. case 'x':
  557. optm |= ONIG_OPTION_EXTEND;
  558. break;
  559. case 'm':
  560. optm |= ONIG_OPTION_MULTILINE;
  561. break;
  562. case 's':
  563. optm |= ONIG_OPTION_SINGLELINE;
  564. break;
  565. case 'p':
  566. optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
  567. break;
  568. case 'l':
  569. optm |= ONIG_OPTION_FIND_LONGEST;
  570. break;
  571. case 'n':
  572. optm |= ONIG_OPTION_FIND_NOT_EMPTY;
  573. break;
  574. case 'j':
  575. *syntax = ONIG_SYNTAX_JAVA;
  576. break;
  577. case 'u':
  578. *syntax = ONIG_SYNTAX_GNU_REGEX;
  579. break;
  580. case 'g':
  581. *syntax = ONIG_SYNTAX_GREP;
  582. break;
  583. case 'c':
  584. *syntax = ONIG_SYNTAX_EMACS;
  585. break;
  586. case 'r':
  587. *syntax = ONIG_SYNTAX_RUBY;
  588. break;
  589. case 'z':
  590. *syntax = ONIG_SYNTAX_PERL;
  591. break;
  592. case 'b':
  593. *syntax = ONIG_SYNTAX_POSIX_BASIC;
  594. break;
  595. case 'd':
  596. *syntax = ONIG_SYNTAX_POSIX_EXTENDED;
  597. break;
  598. case 'e':
  599. if (eval != NULL) *eval = 1;
  600. break;
  601. default:
  602. break;
  603. }
  604. }
  605. if (option != NULL) *option|=optm;
  606. }
  607. }
  608. /* }}} */
  609. /*
  610. * Callbacks for named subpatterns
  611. */
  612. /* {{{ struct mb_ereg_groups_iter_arg */
  613. typedef struct mb_regex_groups_iter_args {
  614. zval *groups;
  615. char *search_str;
  616. size_t search_len;
  617. OnigRegion *region;
  618. } mb_regex_groups_iter_args;
  619. /* }}} */
  620. /* {{{ mb_ereg_groups_iter */
  621. static int
  622. mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
  623. {
  624. mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
  625. int gn, beg, end;
  626. /*
  627. * In case of duplicate groups, keep only the last succeeding one
  628. * to be consistent with preg_match with the PCRE_DUPNAMES option.
  629. */
  630. gn = onig_name_to_backref_number(reg, name, name_end, args->region);
  631. beg = args->region->beg[gn];
  632. end = args->region->end[gn];
  633. if (beg >= 0 && beg < end && end <= args->search_len) {
  634. add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
  635. } else {
  636. add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
  637. }
  638. return 0;
  639. }
  640. /* }}} */
  641. /*
  642. * Helper for _php_mb_regex_ereg_replace_exec
  643. */
  644. /* {{{ mb_regex_substitute */
  645. static inline void mb_regex_substitute(
  646. smart_str *pbuf,
  647. const char *subject,
  648. size_t subject_len,
  649. char *replace,
  650. size_t replace_len,
  651. php_mb_regex_t *regexp,
  652. OnigRegion *regs,
  653. const mbfl_encoding *enc
  654. ) {
  655. char *p, *sp, *eos;
  656. int no; /* bakreference group number */
  657. int clen; /* byte-length of the current character */
  658. p = replace;
  659. eos = replace + replace_len;
  660. while (p < eos) {
  661. clen = (int) php_mb_mbchar_bytes_ex(p, enc);
  662. if (clen != 1 || p == eos || p[0] != '\\') {
  663. /* skip anything that's not an ascii backslash */
  664. smart_str_appendl(pbuf, p, clen);
  665. p += clen;
  666. continue;
  667. }
  668. sp = p; /* save position */
  669. clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
  670. if (clen != 1 || p == eos) {
  671. /* skip backslash followed by multibyte char */
  672. smart_str_appendl(pbuf, sp, p - sp);
  673. continue;
  674. }
  675. no = -1;
  676. switch (p[0]) {
  677. case '0':
  678. no = 0;
  679. p++;
  680. break;
  681. case '1': case '2': case '3': case '4':
  682. case '5': case '6': case '7': case '8': case '9':
  683. if (!onig_noname_group_capture_is_active(regexp)) {
  684. /*
  685. * FIXME:
  686. * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
  687. * For now we just ignore them, but in the future we might want to raise a warning
  688. * and abort the whole replace operation.
  689. */
  690. p++;
  691. smart_str_appendl(pbuf, sp, p - sp);
  692. continue;
  693. }
  694. no = p[0] - '0';
  695. p++;
  696. break;
  697. case 'k':
  698. clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
  699. if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
  700. /* not a backref delimiter */
  701. p += clen;
  702. smart_str_appendl(pbuf, sp, p - sp);
  703. continue;
  704. }
  705. /* try to consume everything until next delimiter */
  706. char delim = p[0] == '<' ? '>' : '\'';
  707. char *name, *name_end;
  708. char maybe_num = 1;
  709. name_end = name = p + 1;
  710. while (name_end < eos) {
  711. clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
  712. if (clen != 1) {
  713. name_end += clen;
  714. maybe_num = 0;
  715. continue;
  716. }
  717. if (name_end[0] == delim) break;
  718. if (maybe_num && !isdigit(name_end[0])) maybe_num = 0;
  719. name_end++;
  720. }
  721. p = name_end + 1;
  722. if (name_end - name < 1 || name_end >= eos) {
  723. /* the backref was empty or we failed to find the end delimiter */
  724. smart_str_appendl(pbuf, sp, p - sp);
  725. continue;
  726. }
  727. /* we have either a name or a number */
  728. if (maybe_num) {
  729. if (!onig_noname_group_capture_is_active(regexp)) {
  730. /* see above note on mixing numbered & named backrefs */
  731. smart_str_appendl(pbuf, sp, p - sp);
  732. continue;
  733. }
  734. if (name_end - name == 1) {
  735. no = name[0] - '0';
  736. break;
  737. }
  738. if (name[0] == '0') {
  739. /* 01 is not a valid number */
  740. break;
  741. }
  742. no = (int) strtoul(name, NULL, 10);
  743. break;
  744. }
  745. no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
  746. break;
  747. default:
  748. /* We're not treating \ as an escape character and will interpret something like
  749. * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
  750. * function has not supported escaping of backslashes historically. */
  751. smart_str_appendl(pbuf, sp, p - sp);
  752. continue;
  753. }
  754. if (no < 0 || no >= regs->num_regs) {
  755. /* invalid group number reference, keep the escape sequence in the output */
  756. smart_str_appendl(pbuf, sp, p - sp);
  757. continue;
  758. }
  759. if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
  760. smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
  761. }
  762. }
  763. if (p < eos) {
  764. smart_str_appendl(pbuf, p, eos - p);
  765. }
  766. }
  767. /* }}} */
  768. /*
  769. * php functions
  770. */
  771. /* {{{ proto string mb_regex_encoding([string encoding])
  772. Returns the current encoding for regex as a string. */
  773. PHP_FUNCTION(mb_regex_encoding)
  774. {
  775. char *encoding = NULL;
  776. size_t encoding_len;
  777. OnigEncoding mbctype;
  778. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s", &encoding, &encoding_len) == FAILURE) {
  779. return;
  780. }
  781. if (!encoding) {
  782. const char *retval = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
  783. if (retval == NULL) {
  784. RETURN_FALSE;
  785. }
  786. RETURN_STRING((char *)retval);
  787. } else {
  788. mbctype = _php_mb_regex_name2mbctype(encoding);
  789. if (mbctype == ONIG_ENCODING_UNDEF) {
  790. php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", encoding);
  791. RETURN_FALSE;
  792. }
  793. MBREX(current_mbctype) = mbctype;
  794. RETURN_TRUE;
  795. }
  796. }
  797. /* }}} */
  798. /* {{{ _php_mb_onig_search */
  799. static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
  800. const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
  801. OnigMatchParam *mp = onig_new_match_param();
  802. int err;
  803. onig_initialize_match_param(mp);
  804. if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
  805. onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
  806. }
  807. /* search */
  808. err = onig_search_with_param(reg, str, end, start, range, region, option, mp);
  809. onig_free_match_param(mp);
  810. return err;
  811. }
  812. /* }}} */
  813. /* {{{ _php_mb_regex_ereg_exec */
  814. static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
  815. {
  816. zval *arg_pattern, *array = NULL;
  817. char *string;
  818. size_t string_len;
  819. php_mb_regex_t *re;
  820. OnigRegion *regs = NULL;
  821. int i, match_len, beg, end;
  822. OnigOptionType options;
  823. char *str;
  824. if (zend_parse_parameters(ZEND_NUM_ARGS(), "zs|z/", &arg_pattern, &string, &string_len, &array) == FAILURE) {
  825. RETURN_FALSE;
  826. }
  827. if (array != NULL) {
  828. zval_ptr_dtor(array);
  829. array_init(array);
  830. }
  831. if (!php_mb_check_encoding(
  832. string,
  833. string_len,
  834. _php_mb_regex_mbctype2name(MBREX(current_mbctype))
  835. )) {
  836. RETURN_FALSE;
  837. }
  838. options = MBREX(regex_default_options);
  839. if (icase) {
  840. options |= ONIG_OPTION_IGNORECASE;
  841. }
  842. /* compile the regular expression from the supplied regex */
  843. if (Z_TYPE_P(arg_pattern) != IS_STRING) {
  844. /* we convert numbers to integers and treat them as a string */
  845. if (Z_TYPE_P(arg_pattern) == IS_DOUBLE) {
  846. convert_to_long_ex(arg_pattern); /* get rid of decimal places */
  847. }
  848. convert_to_string_ex(arg_pattern);
  849. /* don't bother doing an extended regex with just a number */
  850. }
  851. if (Z_STRLEN_P(arg_pattern) == 0) {
  852. php_error_docref(NULL, E_WARNING, "empty pattern");
  853. RETVAL_FALSE;
  854. goto out;
  855. }
  856. re = php_mbregex_compile_pattern(Z_STRVAL_P(arg_pattern), Z_STRLEN_P(arg_pattern), options, MBREX(current_mbctype), MBREX(regex_default_syntax));
  857. if (re == NULL) {
  858. RETVAL_FALSE;
  859. goto out;
  860. }
  861. regs = onig_region_new();
  862. /* actually execute the regular expression */
  863. if (_php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) {
  864. RETVAL_FALSE;
  865. goto out;
  866. }
  867. match_len = 1;
  868. str = string;
  869. if (array != NULL) {
  870. match_len = regs->end[0] - regs->beg[0];
  871. for (i = 0; i < regs->num_regs; i++) {
  872. beg = regs->beg[i];
  873. end = regs->end[i];
  874. if (beg >= 0 && beg < end && (size_t)end <= string_len) {
  875. add_index_stringl(array, i, (char *)&str[beg], end - beg);
  876. } else {
  877. add_index_bool(array, i, 0);
  878. }
  879. }
  880. if (onig_number_of_names(re) > 0) {
  881. mb_regex_groups_iter_args args = {array, string, string_len, regs};
  882. onig_foreach_name(re, mb_regex_groups_iter, &args);
  883. }
  884. }
  885. if (match_len == 0) {
  886. match_len = 1;
  887. }
  888. RETVAL_LONG(match_len);
  889. out:
  890. if (regs != NULL) {
  891. onig_region_free(regs, 1);
  892. }
  893. }
  894. /* }}} */
  895. /* {{{ proto int mb_ereg(string pattern, string string [, array registers])
  896. Regular expression match for multibyte string */
  897. PHP_FUNCTION(mb_ereg)
  898. {
  899. _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  900. }
  901. /* }}} */
  902. /* {{{ proto int mb_eregi(string pattern, string string [, array registers])
  903. Case-insensitive regular expression match for multibyte string */
  904. PHP_FUNCTION(mb_eregi)
  905. {
  906. _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  907. }
  908. /* }}} */
  909. /* {{{ _php_mb_regex_ereg_replace_exec */
  910. static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
  911. {
  912. zval *arg_pattern_zval;
  913. char *arg_pattern;
  914. size_t arg_pattern_len;
  915. char *replace;
  916. size_t replace_len;
  917. zend_fcall_info arg_replace_fci;
  918. zend_fcall_info_cache arg_replace_fci_cache;
  919. char *string;
  920. size_t string_len;
  921. php_mb_regex_t *re;
  922. OnigSyntaxType *syntax;
  923. OnigRegion *regs = NULL;
  924. smart_str out_buf = {0};
  925. smart_str eval_buf = {0};
  926. smart_str *pbuf;
  927. int err, eval, n;
  928. OnigUChar *pos;
  929. OnigUChar *string_lim;
  930. char *description = NULL;
  931. char pat_buf[6];
  932. const mbfl_encoding *enc;
  933. {
  934. const char *current_enc_name;
  935. current_enc_name = _php_mb_regex_mbctype2name(MBREX(current_mbctype));
  936. if (current_enc_name == NULL ||
  937. (enc = mbfl_name2encoding(current_enc_name)) == NULL) {
  938. php_error_docref(NULL, E_WARNING, "Unknown error");
  939. RETURN_FALSE;
  940. }
  941. }
  942. eval = 0;
  943. {
  944. char *option_str = NULL;
  945. size_t option_str_len = 0;
  946. if (!is_callable) {
  947. if (zend_parse_parameters(ZEND_NUM_ARGS(), "zss|s",
  948. &arg_pattern_zval,
  949. &replace, &replace_len,
  950. &string, &string_len,
  951. &option_str, &option_str_len) == FAILURE) {
  952. RETURN_FALSE;
  953. }
  954. } else {
  955. if (zend_parse_parameters(ZEND_NUM_ARGS(), "zfs|s",
  956. &arg_pattern_zval,
  957. &arg_replace_fci, &arg_replace_fci_cache,
  958. &string, &string_len,
  959. &option_str, &option_str_len) == FAILURE) {
  960. RETURN_FALSE;
  961. }
  962. }
  963. if (!php_mb_check_encoding(
  964. string,
  965. string_len,
  966. _php_mb_regex_mbctype2name(MBREX(current_mbctype))
  967. )) {
  968. RETURN_NULL();
  969. }
  970. if (option_str != NULL) {
  971. _php_mb_regex_init_options(option_str, option_str_len, &options, &syntax, &eval);
  972. } else {
  973. options |= MBREX(regex_default_options);
  974. syntax = MBREX(regex_default_syntax);
  975. }
  976. }
  977. if (eval && !is_callable) {
  978. php_error_docref(NULL, E_DEPRECATED, "The 'e' option is deprecated, use mb_ereg_replace_callback instead");
  979. }
  980. if (Z_TYPE_P(arg_pattern_zval) == IS_STRING) {
  981. arg_pattern = Z_STRVAL_P(arg_pattern_zval);
  982. arg_pattern_len = Z_STRLEN_P(arg_pattern_zval);
  983. } else {
  984. /* FIXME: this code is not multibyte aware! */
  985. convert_to_long_ex(arg_pattern_zval);
  986. pat_buf[0] = (char)Z_LVAL_P(arg_pattern_zval);
  987. pat_buf[1] = '\0';
  988. pat_buf[2] = '\0';
  989. pat_buf[3] = '\0';
  990. pat_buf[4] = '\0';
  991. pat_buf[5] = '\0';
  992. arg_pattern = pat_buf;
  993. arg_pattern_len = 1;
  994. }
  995. /* create regex pattern buffer */
  996. re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(current_mbctype), syntax);
  997. if (re == NULL) {
  998. RETURN_FALSE;
  999. }
  1000. if (eval || is_callable) {
  1001. pbuf = &eval_buf;
  1002. description = zend_make_compiled_string_description("mbregex replace");
  1003. } else {
  1004. pbuf = &out_buf;
  1005. description = NULL;
  1006. }
  1007. if (is_callable) {
  1008. if (eval) {
  1009. php_error_docref(NULL, E_WARNING, "Option 'e' cannot be used with replacement callback");
  1010. RETURN_FALSE;
  1011. }
  1012. }
  1013. /* do the actual work */
  1014. err = 0;
  1015. pos = (OnigUChar *)string;
  1016. string_lim = (OnigUChar*)(string + string_len);
  1017. regs = onig_region_new();
  1018. while (err >= 0) {
  1019. err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
  1020. if (err <= -2) {
  1021. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1022. onig_error_code_to_str(err_str, err);
  1023. php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
  1024. break;
  1025. }
  1026. if (err >= 0) {
  1027. /* copy the part of the string before the match */
  1028. smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
  1029. if (!is_callable) {
  1030. mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
  1031. }
  1032. if (eval) {
  1033. zval v;
  1034. zend_string *eval_str;
  1035. /* null terminate buffer */
  1036. smart_str_0(&eval_buf);
  1037. if (eval_buf.s) {
  1038. eval_str = eval_buf.s;
  1039. } else {
  1040. eval_str = ZSTR_EMPTY_ALLOC();
  1041. }
  1042. /* do eval */
  1043. if (zend_eval_stringl(ZSTR_VAL(eval_str), ZSTR_LEN(eval_str), &v, description) == FAILURE) {
  1044. efree(description);
  1045. zend_throw_error(NULL, "Failed evaluating code: %s%s", PHP_EOL, ZSTR_VAL(eval_str));
  1046. onig_region_free(regs, 0);
  1047. smart_str_free(&out_buf);
  1048. smart_str_free(&eval_buf);
  1049. RETURN_FALSE;
  1050. }
  1051. /* result of eval */
  1052. convert_to_string(&v);
  1053. smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v));
  1054. /* Clean up */
  1055. smart_str_free(&eval_buf);
  1056. zval_ptr_dtor_str(&v);
  1057. } else if (is_callable) {
  1058. zval args[1];
  1059. zval subpats, retval;
  1060. int i;
  1061. array_init(&subpats);
  1062. for (i = 0; i < regs->num_regs; i++) {
  1063. add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
  1064. }
  1065. if (onig_number_of_names(re) > 0) {
  1066. mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
  1067. onig_foreach_name(re, mb_regex_groups_iter, &args);
  1068. }
  1069. ZVAL_COPY_VALUE(&args[0], &subpats);
  1070. /* null terminate buffer */
  1071. smart_str_0(&eval_buf);
  1072. arg_replace_fci.param_count = 1;
  1073. arg_replace_fci.params = args;
  1074. arg_replace_fci.retval = &retval;
  1075. if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
  1076. !Z_ISUNDEF(retval)) {
  1077. convert_to_string_ex(&retval);
  1078. smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
  1079. smart_str_free(&eval_buf);
  1080. zval_ptr_dtor(&retval);
  1081. } else {
  1082. if (!EG(exception)) {
  1083. php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
  1084. }
  1085. }
  1086. zval_ptr_dtor(&subpats);
  1087. }
  1088. n = regs->end[0];
  1089. if ((pos - (OnigUChar *)string) < n) {
  1090. pos = (OnigUChar *)string + n;
  1091. } else {
  1092. if (pos < string_lim) {
  1093. smart_str_appendl(&out_buf, (char *)pos, 1);
  1094. }
  1095. pos++;
  1096. }
  1097. } else { /* nomatch */
  1098. /* stick that last bit of string on our output */
  1099. if (string_lim - pos > 0) {
  1100. smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
  1101. }
  1102. }
  1103. onig_region_free(regs, 0);
  1104. }
  1105. if (description) {
  1106. efree(description);
  1107. }
  1108. if (regs != NULL) {
  1109. onig_region_free(regs, 1);
  1110. }
  1111. smart_str_free(&eval_buf);
  1112. if (err <= -2) {
  1113. smart_str_free(&out_buf);
  1114. RETVAL_FALSE;
  1115. } else if (out_buf.s) {
  1116. smart_str_0(&out_buf);
  1117. RETVAL_STR(out_buf.s);
  1118. } else {
  1119. RETVAL_EMPTY_STRING();
  1120. }
  1121. }
  1122. /* }}} */
  1123. /* {{{ proto string mb_ereg_replace(string pattern, string replacement, string string [, string option])
  1124. Replace regular expression for multibyte string */
  1125. PHP_FUNCTION(mb_ereg_replace)
  1126. {
  1127. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
  1128. }
  1129. /* }}} */
  1130. /* {{{ proto string mb_eregi_replace(string pattern, string replacement, string string)
  1131. Case insensitive replace regular expression for multibyte string */
  1132. PHP_FUNCTION(mb_eregi_replace)
  1133. {
  1134. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
  1135. }
  1136. /* }}} */
  1137. /* {{{ proto string mb_ereg_replace_callback(string pattern, string callback, string string [, string option])
  1138. regular expression for multibyte string using replacement callback */
  1139. PHP_FUNCTION(mb_ereg_replace_callback)
  1140. {
  1141. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
  1142. }
  1143. /* }}} */
  1144. /* {{{ proto array mb_split(string pattern, string string [, int limit])
  1145. split multibyte string into array by regular expression */
  1146. PHP_FUNCTION(mb_split)
  1147. {
  1148. char *arg_pattern;
  1149. size_t arg_pattern_len;
  1150. php_mb_regex_t *re;
  1151. OnigRegion *regs = NULL;
  1152. char *string;
  1153. OnigUChar *pos, *chunk_pos;
  1154. size_t string_len;
  1155. int err;
  1156. zend_long count = -1;
  1157. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
  1158. RETURN_FALSE;
  1159. }
  1160. if (count > 0) {
  1161. count--;
  1162. }
  1163. if (!php_mb_check_encoding(string, string_len,
  1164. _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
  1165. RETURN_FALSE;
  1166. }
  1167. /* create regex pattern buffer */
  1168. if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
  1169. RETURN_FALSE;
  1170. }
  1171. array_init(return_value);
  1172. chunk_pos = pos = (OnigUChar *)string;
  1173. err = 0;
  1174. regs = onig_region_new();
  1175. /* churn through str, generating array entries as we go */
  1176. while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
  1177. size_t beg, end;
  1178. err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
  1179. if (err < 0) {
  1180. break;
  1181. }
  1182. beg = regs->beg[0], end = regs->end[0];
  1183. /* add it to the array */
  1184. if ((size_t)(pos - (OnigUChar *)string) < end) {
  1185. if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
  1186. add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
  1187. --count;
  1188. } else {
  1189. err = -2;
  1190. break;
  1191. }
  1192. /* point at our new starting point */
  1193. chunk_pos = pos = (OnigUChar *)string + end;
  1194. } else {
  1195. pos++;
  1196. }
  1197. onig_region_free(regs, 0);
  1198. }
  1199. onig_region_free(regs, 1);
  1200. /* see if we encountered an error */
  1201. if (err <= -2) {
  1202. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1203. onig_error_code_to_str(err_str, err);
  1204. php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
  1205. zend_array_destroy(Z_ARR_P(return_value));
  1206. RETURN_FALSE;
  1207. }
  1208. /* otherwise we just have one last element to add to the array */
  1209. if ((OnigUChar *)(string + string_len) > chunk_pos) {
  1210. size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
  1211. add_next_index_stringl(return_value, (char *)chunk_pos, n);
  1212. } else {
  1213. add_next_index_stringl(return_value, "", 0);
  1214. }
  1215. }
  1216. /* }}} */
  1217. /* {{{ proto bool mb_ereg_match(string pattern, string string [,string option])
  1218. Regular expression match for multibyte string */
  1219. PHP_FUNCTION(mb_ereg_match)
  1220. {
  1221. char *arg_pattern;
  1222. size_t arg_pattern_len;
  1223. char *string;
  1224. size_t string_len;
  1225. php_mb_regex_t *re;
  1226. OnigSyntaxType *syntax;
  1227. OnigOptionType option = 0;
  1228. int err;
  1229. OnigMatchParam *mp;
  1230. {
  1231. char *option_str = NULL;
  1232. size_t option_str_len = 0;
  1233. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s",
  1234. &arg_pattern, &arg_pattern_len, &string, &string_len,
  1235. &option_str, &option_str_len)==FAILURE) {
  1236. RETURN_FALSE;
  1237. }
  1238. if (option_str != NULL) {
  1239. _php_mb_regex_init_options(option_str, option_str_len, &option, &syntax, NULL);
  1240. } else {
  1241. option |= MBREX(regex_default_options);
  1242. syntax = MBREX(regex_default_syntax);
  1243. }
  1244. }
  1245. if (!php_mb_check_encoding(string, string_len,
  1246. _php_mb_regex_mbctype2name(MBREX(current_mbctype)))) {
  1247. RETURN_FALSE;
  1248. }
  1249. if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
  1250. RETURN_FALSE;
  1251. }
  1252. mp = onig_new_match_param();
  1253. onig_initialize_match_param(mp);
  1254. if(MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
  1255. onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
  1256. }
  1257. /* match */
  1258. err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
  1259. onig_free_match_param(mp);
  1260. if (err >= 0) {
  1261. RETVAL_TRUE;
  1262. } else {
  1263. RETVAL_FALSE;
  1264. }
  1265. }
  1266. /* }}} */
  1267. /* regex search */
  1268. /* {{{ _php_mb_regex_ereg_search_exec */
  1269. static void
  1270. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
  1271. {
  1272. char *arg_pattern = NULL, *arg_options = NULL;
  1273. size_t arg_pattern_len, arg_options_len;
  1274. int err;
  1275. size_t n, i, pos, len, beg, end;
  1276. OnigOptionType option;
  1277. OnigUChar *str;
  1278. OnigSyntaxType *syntax;
  1279. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|ss", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
  1280. return;
  1281. }
  1282. option = MBREX(regex_default_options);
  1283. if (arg_options) {
  1284. option = 0;
  1285. _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
  1286. }
  1287. if (MBREX(search_regs)) {
  1288. onig_region_free(MBREX(search_regs), 1);
  1289. MBREX(search_regs) = NULL;
  1290. }
  1291. if (arg_pattern) {
  1292. /* create regex pattern buffer */
  1293. if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), MBREX(regex_default_syntax))) == NULL) {
  1294. RETURN_FALSE;
  1295. }
  1296. }
  1297. pos = MBREX(search_pos);
  1298. str = NULL;
  1299. len = 0;
  1300. if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
  1301. str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
  1302. len = Z_STRLEN(MBREX(search_str));
  1303. }
  1304. if (MBREX(search_re) == NULL) {
  1305. php_error_docref(NULL, E_WARNING, "No regex given");
  1306. RETURN_FALSE;
  1307. }
  1308. if (str == NULL) {
  1309. php_error_docref(NULL, E_WARNING, "No string given");
  1310. RETURN_FALSE;
  1311. }
  1312. MBREX(search_regs) = onig_region_new();
  1313. err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
  1314. if (err == ONIG_MISMATCH) {
  1315. MBREX(search_pos) = len;
  1316. RETVAL_FALSE;
  1317. } else if (err <= -2) {
  1318. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1319. onig_error_code_to_str(err_str, err);
  1320. php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
  1321. RETVAL_FALSE;
  1322. } else {
  1323. switch (mode) {
  1324. case 1:
  1325. array_init(return_value);
  1326. beg = MBREX(search_regs)->beg[0];
  1327. end = MBREX(search_regs)->end[0];
  1328. add_next_index_long(return_value, beg);
  1329. add_next_index_long(return_value, end - beg);
  1330. break;
  1331. case 2:
  1332. array_init(return_value);
  1333. n = MBREX(search_regs)->num_regs;
  1334. for (i = 0; i < n; i++) {
  1335. beg = MBREX(search_regs)->beg[i];
  1336. end = MBREX(search_regs)->end[i];
  1337. if (beg >= 0 && beg <= end && end <= len) {
  1338. add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
  1339. } else {
  1340. add_index_bool(return_value, i, 0);
  1341. }
  1342. }
  1343. if (onig_number_of_names(MBREX(search_re)) > 0) {
  1344. mb_regex_groups_iter_args args = {
  1345. return_value,
  1346. Z_STRVAL(MBREX(search_str)),
  1347. Z_STRLEN(MBREX(search_str)),
  1348. MBREX(search_regs)
  1349. };
  1350. onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
  1351. }
  1352. break;
  1353. default:
  1354. RETVAL_TRUE;
  1355. break;
  1356. }
  1357. end = MBREX(search_regs)->end[0];
  1358. if (pos <= end) {
  1359. MBREX(search_pos) = end;
  1360. } else {
  1361. MBREX(search_pos) = pos + 1;
  1362. }
  1363. }
  1364. if (err < 0) {
  1365. onig_region_free(MBREX(search_regs), 1);
  1366. MBREX(search_regs) = (OnigRegion *)NULL;
  1367. }
  1368. }
  1369. /* }}} */
  1370. /* {{{ proto bool mb_ereg_search([string pattern[, string option]])
  1371. Regular expression search for multibyte string */
  1372. PHP_FUNCTION(mb_ereg_search)
  1373. {
  1374. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  1375. }
  1376. /* }}} */
  1377. /* {{{ proto array mb_ereg_search_pos([string pattern[, string option]])
  1378. Regular expression search for multibyte string */
  1379. PHP_FUNCTION(mb_ereg_search_pos)
  1380. {
  1381. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  1382. }
  1383. /* }}} */
  1384. /* {{{ proto array mb_ereg_search_regs([string pattern[, string option]])
  1385. Regular expression search for multibyte string */
  1386. PHP_FUNCTION(mb_ereg_search_regs)
  1387. {
  1388. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
  1389. }
  1390. /* }}} */
  1391. /* {{{ proto bool mb_ereg_search_init(string string [, string pattern[, string option]])
  1392. Initialize string and regular expression for search. */
  1393. PHP_FUNCTION(mb_ereg_search_init)
  1394. {
  1395. int argc = ZEND_NUM_ARGS();
  1396. zend_string *arg_str;
  1397. char *arg_pattern = NULL, *arg_options = NULL;
  1398. size_t arg_pattern_len = 0, arg_options_len = 0;
  1399. OnigSyntaxType *syntax = NULL;
  1400. OnigOptionType option;
  1401. if (zend_parse_parameters(argc, "S|ss", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
  1402. return;
  1403. }
  1404. if (argc > 1 && arg_pattern_len == 0) {
  1405. php_error_docref(NULL, E_WARNING, "Empty pattern");
  1406. RETURN_FALSE;
  1407. }
  1408. option = MBREX(regex_default_options);
  1409. syntax = MBREX(regex_default_syntax);
  1410. if (argc == 3) {
  1411. option = 0;
  1412. _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
  1413. }
  1414. if (argc > 1) {
  1415. /* create regex pattern buffer */
  1416. if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax)) == NULL) {
  1417. RETURN_FALSE;
  1418. }
  1419. }
  1420. if (!Z_ISNULL(MBREX(search_str))) {
  1421. zval_ptr_dtor(&MBREX(search_str));
  1422. }
  1423. ZVAL_STR_COPY(&MBREX(search_str), arg_str);
  1424. if (php_mb_check_encoding(
  1425. ZSTR_VAL(arg_str),
  1426. ZSTR_LEN(arg_str),
  1427. _php_mb_regex_mbctype2name(MBREX(current_mbctype))
  1428. )) {
  1429. MBREX(search_pos) = 0;
  1430. RETVAL_TRUE;
  1431. } else {
  1432. MBREX(search_pos) = ZSTR_LEN(arg_str);
  1433. RETVAL_FALSE;
  1434. }
  1435. if (MBREX(search_regs) != NULL) {
  1436. onig_region_free(MBREX(search_regs), 1);
  1437. MBREX(search_regs) = NULL;
  1438. }
  1439. }
  1440. /* }}} */
  1441. /* {{{ proto array mb_ereg_search_getregs(void)
  1442. Get matched substring of the last time */
  1443. PHP_FUNCTION(mb_ereg_search_getregs)
  1444. {
  1445. size_t n, i, len, beg, end;
  1446. OnigUChar *str;
  1447. if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
  1448. array_init(return_value);
  1449. str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
  1450. len = Z_STRLEN(MBREX(search_str));
  1451. n = MBREX(search_regs)->num_regs;
  1452. for (i = 0; i < n; i++) {
  1453. beg = MBREX(search_regs)->beg[i];
  1454. end = MBREX(search_regs)->end[i];
  1455. if (beg >= 0 && beg <= end && end <= len) {
  1456. add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
  1457. } else {
  1458. add_index_bool(return_value, i, 0);
  1459. }
  1460. }
  1461. if (onig_number_of_names(MBREX(search_re)) > 0) {
  1462. mb_regex_groups_iter_args args = {
  1463. return_value,
  1464. Z_STRVAL(MBREX(search_str)),
  1465. len,
  1466. MBREX(search_regs)
  1467. };
  1468. onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
  1469. }
  1470. } else {
  1471. RETVAL_FALSE;
  1472. }
  1473. }
  1474. /* }}} */
  1475. /* {{{ proto int mb_ereg_search_getpos(void)
  1476. Get search start position */
  1477. PHP_FUNCTION(mb_ereg_search_getpos)
  1478. {
  1479. RETVAL_LONG(MBREX(search_pos));
  1480. }
  1481. /* }}} */
  1482. /* {{{ proto bool mb_ereg_search_setpos(int position)
  1483. Set search start position */
  1484. PHP_FUNCTION(mb_ereg_search_setpos)
  1485. {
  1486. zend_long position;
  1487. if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
  1488. return;
  1489. }
  1490. /* Accept negative position if length of search string can be determined */
  1491. if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
  1492. position += Z_STRLEN(MBREX(search_str));
  1493. }
  1494. if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
  1495. php_error_docref(NULL, E_WARNING, "Position is out of range");
  1496. MBREX(search_pos) = 0;
  1497. RETURN_FALSE;
  1498. }
  1499. MBREX(search_pos) = position;
  1500. RETURN_TRUE;
  1501. }
  1502. /* }}} */
  1503. /* {{{ php_mb_regex_set_options */
  1504. static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
  1505. {
  1506. if (prev_options != NULL) {
  1507. *prev_options = MBREX(regex_default_options);
  1508. }
  1509. if (prev_syntax != NULL) {
  1510. *prev_syntax = MBREX(regex_default_syntax);
  1511. }
  1512. MBREX(regex_default_options) = options;
  1513. MBREX(regex_default_syntax) = syntax;
  1514. }
  1515. /* }}} */
  1516. /* {{{ proto string mb_regex_set_options([string options])
  1517. Set or get the default options for mbregex functions */
  1518. PHP_FUNCTION(mb_regex_set_options)
  1519. {
  1520. OnigOptionType opt;
  1521. OnigSyntaxType *syntax;
  1522. char *string = NULL;
  1523. size_t string_len;
  1524. char buf[16];
  1525. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s",
  1526. &string, &string_len) == FAILURE) {
  1527. RETURN_FALSE;
  1528. }
  1529. if (string != NULL) {
  1530. opt = 0;
  1531. syntax = NULL;
  1532. _php_mb_regex_init_options(string, string_len, &opt, &syntax, NULL);
  1533. _php_mb_regex_set_options(opt, syntax, NULL, NULL);
  1534. } else {
  1535. opt = MBREX(regex_default_options);
  1536. syntax = MBREX(regex_default_syntax);
  1537. }
  1538. _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
  1539. RETVAL_STRING(buf);
  1540. }
  1541. /* }}} */
  1542. #endif /* HAVE_MBREGEX */
  1543. /*
  1544. * Local variables:
  1545. * tab-width: 4
  1546. * c-basic-offset: 4
  1547. * End:
  1548. * vim600: fdm=marker
  1549. * vim: noet sw=4 ts=4
  1550. */