php_mbregex.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639
  1. /*
  2. +----------------------------------------------------------------------+
  3. | Copyright (c) The PHP Group |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | https://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
  14. +----------------------------------------------------------------------+
  15. */
  16. #include "libmbfl/config.h"
  17. #include "php.h"
  18. #include "php_ini.h"
  19. #ifdef HAVE_MBREGEX
  20. #include "zend_smart_str.h"
  21. #include "ext/standard/info.h"
  22. #include "php_mbregex.h"
  23. #include "mbstring.h"
  24. #include "libmbfl/filters/mbfilter_utf8.h"
  25. #include "php_onig_compat.h" /* must come prior to the oniguruma header */
  26. #include <oniguruma.h>
  27. #undef UChar
/*
 * Compatibility shims for Oniguruma builds that either do not define
 * ONIGURUMA_VERSION_INT or report a value below 60800 (presumably 6.8.0).
 *
 * In that case OnigMatchParam becomes an opaque void type, creating one yields
 * NULL, the initialise/limit/free calls expand to nothing, and the
 * *_with_param entry points fall back to onig_search()/onig_match() with the
 * trailing match-param argument dropped. As a consequence the stack and retry
 * limits configured in _php_mb_onig_search() have no effect on such builds.
 */
#if !defined(ONIGURUMA_VERSION_INT) || ONIGURUMA_VERSION_INT < 60800
typedef void OnigMatchParam;
#define onig_new_match_param() (NULL)
#define onig_initialize_match_param(x) (void)(x)
#define onig_set_match_stack_limit_size_of_match_param(x, y)
#define onig_set_retry_limit_in_match_of_match_param(x, y)
#define onig_free_match_param(x)
#define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
	onig_search(reg, str, end, start, range, region, option)
#define onig_match_with_param(re, str, end, at, region, option, mp) \
	onig_match(re, str, end, at, region, option)
#endif
  40. ZEND_EXTERN_MODULE_GLOBALS(mbstring)
/* State of the multibyte regex layer, reached through MBSTRG(mb_regex_globals). */
struct _zend_mb_regex_globals {
	/* encoding that current_mbctype is reset to at request shutdown */
	OnigEncoding default_mbctype;
	/* encoding handed to Oniguruma when a pattern is compiled */
	OnigEncoding current_mbctype;
	/* libmbfl counterpart of current_mbctype, used to validate patterns and subjects */
	const mbfl_encoding *current_mbctype_mbfl_encoding;
	/* cache of compiled regexes keyed by pattern text; set up in RINIT, destroyed in RSHUTDOWN */
	HashTable ht_rc;
	/*
	 * State of an ongoing search; released/reset at request shutdown.
	 * NOTE(review): the consumers of these fields are outside the portion of
	 * the file reviewed here -- presumably the mb_ereg_search_*() family.
	 */
	zval search_str;
	/* NOTE(review): not referenced in the reviewed portion of the file -- confirm it is still used */
	zval *search_str_val;
	size_t search_pos;
	php_mb_regex_t *search_re;
	OnigRegion *search_regs;
	/* options and syntax applied when the caller supplies none (see mb_ereg()/mb_eregi()) */
	OnigOptionType regex_default_options;
	OnigSyntaxType *regex_default_syntax;
};
/* Shorthand for field `g` of the globals structure above. */
#define MBREX(g) (MBSTRG(mb_regex_globals)->g)
  55. /* {{{ static void php_mb_regex_free_cache() */
  56. static void php_mb_regex_free_cache(zval *el) {
  57. onig_free((php_mb_regex_t *)Z_PTR_P(el));
  58. }
  59. /* }}} */
  60. /* {{{ _php_mb_regex_globals_ctor */
  61. static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
  62. {
  63. pglobals->default_mbctype = ONIG_ENCODING_UTF8;
  64. pglobals->current_mbctype = ONIG_ENCODING_UTF8;
  65. pglobals->current_mbctype_mbfl_encoding = &mbfl_encoding_utf8;
  66. ZVAL_UNDEF(&pglobals->search_str);
  67. pglobals->search_re = (php_mb_regex_t*)NULL;
  68. pglobals->search_pos = 0;
  69. pglobals->search_regs = (OnigRegion*)NULL;
  70. pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
  71. pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
  72. return SUCCESS;
  73. }
  74. /* }}} */
  75. /* {{{ php_mb_regex_globals_alloc */
  76. zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
  77. {
  78. zend_mb_regex_globals *pglobals = pemalloc(
  79. sizeof(zend_mb_regex_globals), 1);
  80. if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
  81. pefree(pglobals, 1);
  82. return NULL;
  83. }
  84. return pglobals;
  85. }
  86. /* }}} */
  87. /* {{{ php_mb_regex_globals_free */
  88. void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
  89. {
  90. if (!pglobals) {
  91. return;
  92. }
  93. pefree(pglobals, 1);
  94. }
  95. /* }}} */
  96. /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
  97. PHP_MINIT_FUNCTION(mb_regex)
  98. {
  99. char version[256];
  100. onig_init();
  101. snprintf(version, sizeof(version), "%d.%d.%d",
  102. ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY);
  103. REGISTER_STRING_CONSTANT("MB_ONIGURUMA_VERSION", version, CONST_CS | CONST_PERSISTENT);
  104. return SUCCESS;
  105. }
  106. /* }}} */
  107. /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
  108. PHP_MSHUTDOWN_FUNCTION(mb_regex)
  109. {
  110. onig_end();
  111. return SUCCESS;
  112. }
  113. /* }}} */
  114. /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
  115. PHP_RINIT_FUNCTION(mb_regex)
  116. {
  117. if (!MBSTRG(mb_regex_globals)) return FAILURE;
  118. zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
  119. return SUCCESS;
  120. }
  121. /* }}} */
  122. /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
  123. PHP_RSHUTDOWN_FUNCTION(mb_regex)
  124. {
  125. MBREX(current_mbctype) = MBREX(default_mbctype);
  126. MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(php_mb_regex_get_default_mbctype());
  127. if (!Z_ISUNDEF(MBREX(search_str))) {
  128. zval_ptr_dtor(&MBREX(search_str));
  129. ZVAL_UNDEF(&MBREX(search_str));
  130. }
  131. MBREX(search_pos) = 0;
  132. MBREX(search_re) = NULL;
  133. if (MBREX(search_regs) != NULL) {
  134. onig_region_free(MBREX(search_regs), 1);
  135. MBREX(search_regs) = (OnigRegion *)NULL;
  136. }
  137. zend_hash_destroy(&MBREX(ht_rc));
  138. return SUCCESS;
  139. }
  140. /* }}} */
  141. /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
  142. PHP_MINFO_FUNCTION(mb_regex)
  143. {
  144. char buf[32];
  145. php_info_print_table_start();
  146. php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
  147. snprintf(buf, sizeof(buf), "%d.%d.%d",
  148. ONIGURUMA_VERSION_MAJOR,
  149. ONIGURUMA_VERSION_MINOR,
  150. ONIGURUMA_VERSION_TEENY);
  151. php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
  152. php_info_print_table_end();
  153. }
  154. /* }}} */
  155. /*
  156. * encoding name resolver
  157. */
  158. /* {{{ encoding name map */
  159. typedef struct _php_mb_regex_enc_name_map_t {
  160. const char *names;
  161. OnigEncoding code;
  162. } php_mb_regex_enc_name_map_t;
  163. static const php_mb_regex_enc_name_map_t enc_name_map[] = {
  164. #ifdef ONIG_ENCODING_EUC_JP
  165. {
  166. "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
  167. ONIG_ENCODING_EUC_JP
  168. },
  169. #endif
  170. #ifdef ONIG_ENCODING_UTF8
  171. {
  172. "UTF-8\0UTF8\0",
  173. ONIG_ENCODING_UTF8
  174. },
  175. #endif
  176. #ifdef ONIG_ENCODING_UTF16_BE
  177. {
  178. "UTF-16\0UTF-16BE\0",
  179. ONIG_ENCODING_UTF16_BE
  180. },
  181. #endif
  182. #ifdef ONIG_ENCODING_UTF16_LE
  183. {
  184. "UTF-16LE\0",
  185. ONIG_ENCODING_UTF16_LE
  186. },
  187. #endif
  188. #ifdef ONIG_ENCODING_UTF32_BE
  189. {
  190. "UCS-4\0UTF-32\0UTF-32BE\0",
  191. ONIG_ENCODING_UTF32_BE
  192. },
  193. #endif
  194. #ifdef ONIG_ENCODING_UTF32_LE
  195. {
  196. "UCS-4LE\0UTF-32LE\0",
  197. ONIG_ENCODING_UTF32_LE
  198. },
  199. #endif
  200. #ifdef ONIG_ENCODING_SJIS
  201. {
  202. "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
  203. ONIG_ENCODING_SJIS
  204. },
  205. #endif
  206. #ifdef ONIG_ENCODING_BIG5
  207. {
  208. "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
  209. ONIG_ENCODING_BIG5
  210. },
  211. #endif
  212. #ifdef ONIG_ENCODING_EUC_CN
  213. {
  214. "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
  215. ONIG_ENCODING_EUC_CN
  216. },
  217. #endif
  218. #ifdef ONIG_ENCODING_EUC_TW
  219. {
  220. "EUC-TW\0EUCTW\0EUC_TW\0",
  221. ONIG_ENCODING_EUC_TW
  222. },
  223. #endif
  224. #ifdef ONIG_ENCODING_EUC_KR
  225. {
  226. "EUC-KR\0EUCKR\0EUC_KR\0",
  227. ONIG_ENCODING_EUC_KR
  228. },
  229. #endif
  230. #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
  231. {
  232. "KOI8\0KOI-8\0",
  233. ONIG_ENCODING_KOI8
  234. },
  235. #endif
  236. #ifdef ONIG_ENCODING_KOI8_R
  237. {
  238. "KOI8R\0KOI8-R\0KOI-8R\0",
  239. ONIG_ENCODING_KOI8_R
  240. },
  241. #endif
  242. #ifdef ONIG_ENCODING_ISO_8859_1
  243. {
  244. "ISO-8859-1\0ISO8859-1\0",
  245. ONIG_ENCODING_ISO_8859_1
  246. },
  247. #endif
  248. #ifdef ONIG_ENCODING_ISO_8859_2
  249. {
  250. "ISO-8859-2\0ISO8859-2\0",
  251. ONIG_ENCODING_ISO_8859_2
  252. },
  253. #endif
  254. #ifdef ONIG_ENCODING_ISO_8859_3
  255. {
  256. "ISO-8859-3\0ISO8859-3\0",
  257. ONIG_ENCODING_ISO_8859_3
  258. },
  259. #endif
  260. #ifdef ONIG_ENCODING_ISO_8859_4
  261. {
  262. "ISO-8859-4\0ISO8859-4\0",
  263. ONIG_ENCODING_ISO_8859_4
  264. },
  265. #endif
  266. #ifdef ONIG_ENCODING_ISO_8859_5
  267. {
  268. "ISO-8859-5\0ISO8859-5\0",
  269. ONIG_ENCODING_ISO_8859_5
  270. },
  271. #endif
  272. #ifdef ONIG_ENCODING_ISO_8859_6
  273. {
  274. "ISO-8859-6\0ISO8859-6\0",
  275. ONIG_ENCODING_ISO_8859_6
  276. },
  277. #endif
  278. #ifdef ONIG_ENCODING_ISO_8859_7
  279. {
  280. "ISO-8859-7\0ISO8859-7\0",
  281. ONIG_ENCODING_ISO_8859_7
  282. },
  283. #endif
  284. #ifdef ONIG_ENCODING_ISO_8859_8
  285. {
  286. "ISO-8859-8\0ISO8859-8\0",
  287. ONIG_ENCODING_ISO_8859_8
  288. },
  289. #endif
  290. #ifdef ONIG_ENCODING_ISO_8859_9
  291. {
  292. "ISO-8859-9\0ISO8859-9\0",
  293. ONIG_ENCODING_ISO_8859_9
  294. },
  295. #endif
  296. #ifdef ONIG_ENCODING_ISO_8859_10
  297. {
  298. "ISO-8859-10\0ISO8859-10\0",
  299. ONIG_ENCODING_ISO_8859_10
  300. },
  301. #endif
  302. #ifdef ONIG_ENCODING_ISO_8859_11
  303. {
  304. "ISO-8859-11\0ISO8859-11\0",
  305. ONIG_ENCODING_ISO_8859_11
  306. },
  307. #endif
  308. #ifdef ONIG_ENCODING_ISO_8859_13
  309. {
  310. "ISO-8859-13\0ISO8859-13\0",
  311. ONIG_ENCODING_ISO_8859_13
  312. },
  313. #endif
  314. #ifdef ONIG_ENCODING_ISO_8859_14
  315. {
  316. "ISO-8859-14\0ISO8859-14\0",
  317. ONIG_ENCODING_ISO_8859_14
  318. },
  319. #endif
  320. #ifdef ONIG_ENCODING_ISO_8859_15
  321. {
  322. "ISO-8859-15\0ISO8859-15\0",
  323. ONIG_ENCODING_ISO_8859_15
  324. },
  325. #endif
  326. #ifdef ONIG_ENCODING_ISO_8859_16
  327. {
  328. "ISO-8859-16\0ISO8859-16\0",
  329. ONIG_ENCODING_ISO_8859_16
  330. },
  331. #endif
  332. #ifdef ONIG_ENCODING_ASCII
  333. {
  334. "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
  335. ONIG_ENCODING_ASCII
  336. },
  337. #endif
  338. { NULL, ONIG_ENCODING_UNDEF }
  339. };
  340. /* }}} */
/* {{{ _php_mb_regex_name2mbctype */
  342. static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
  343. {
  344. const char *p;
  345. const php_mb_regex_enc_name_map_t *mapping;
  346. if (pname == NULL || !*pname) {
  347. return ONIG_ENCODING_UNDEF;
  348. }
  349. for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
  350. for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
  351. if (strcasecmp(p, pname) == 0) {
  352. return mapping->code;
  353. }
  354. }
  355. }
  356. return ONIG_ENCODING_UNDEF;
  357. }
  358. /* }}} */
/* {{{ _php_mb_regex_mbctype2name */
  360. static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
  361. {
  362. const php_mb_regex_enc_name_map_t *mapping;
  363. for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
  364. if (mapping->code == mbctype) {
  365. return mapping->names;
  366. }
  367. }
  368. return NULL;
  369. }
  370. /* }}} */
  371. /* {{{ php_mb_regex_set_mbctype */
  372. int php_mb_regex_set_mbctype(const char *encname)
  373. {
  374. OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
  375. if (mbctype == ONIG_ENCODING_UNDEF) {
  376. return FAILURE;
  377. }
  378. MBREX(current_mbctype) = mbctype;
  379. MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(encname);
  380. return SUCCESS;
  381. }
  382. /* }}} */
  383. /* {{{ php_mb_regex_set_default_mbctype */
  384. int php_mb_regex_set_default_mbctype(const char *encname)
  385. {
  386. OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
  387. if (mbctype == ONIG_ENCODING_UNDEF) {
  388. return FAILURE;
  389. }
  390. MBREX(default_mbctype) = mbctype;
  391. return SUCCESS;
  392. }
  393. /* }}} */
  394. /* {{{ php_mb_regex_get_mbctype */
  395. const char *php_mb_regex_get_mbctype(void)
  396. {
  397. return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
  398. }
  399. /* }}} */
  400. /* {{{ php_mb_regex_get_mbctype_encoding */
  401. const mbfl_encoding *php_mb_regex_get_mbctype_encoding(void)
  402. {
  403. return MBREX(current_mbctype_mbfl_encoding);
  404. }
  405. /* }}} */
  406. /* {{{ php_mb_regex_get_default_mbctype */
  407. const char *php_mb_regex_get_default_mbctype(void)
  408. {
  409. return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
  410. }
  411. /* }}} */
  412. /*
  413. * regex cache
  414. */
  415. /* {{{ php_mbregex_compile_pattern */
  416. static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigSyntaxType *syntax)
  417. {
  418. int err_code = 0;
  419. php_mb_regex_t *retval = NULL, *rc = NULL;
  420. OnigErrorInfo err_info;
  421. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  422. OnigEncoding enc = MBREX(current_mbctype);
  423. if (!php_mb_check_encoding(pattern, patlen, php_mb_regex_get_mbctype_encoding())) {
  424. php_error_docref(NULL, E_WARNING,
  425. "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
  426. return NULL;
  427. }
  428. rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
  429. if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
  430. if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
  431. onig_error_code_to_str(err_str, err_code, &err_info);
  432. php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
  433. return NULL;
  434. }
  435. if (rc == MBREX(search_re)) {
  436. /* reuse the new rc? see bug #72399 */
  437. MBREX(search_re) = NULL;
  438. }
  439. zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
  440. } else {
  441. retval = rc;
  442. }
  443. return retval;
  444. }
  445. /* }}} */
  446. /* {{{ _php_mb_regex_get_option_string */
  447. static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
  448. {
  449. size_t len_left = len;
  450. size_t len_req = 0;
  451. char *p = str;
  452. char c;
  453. if ((option & ONIG_OPTION_IGNORECASE) != 0) {
  454. if (len_left > 0) {
  455. --len_left;
  456. *(p++) = 'i';
  457. }
  458. ++len_req;
  459. }
  460. if ((option & ONIG_OPTION_EXTEND) != 0) {
  461. if (len_left > 0) {
  462. --len_left;
  463. *(p++) = 'x';
  464. }
  465. ++len_req;
  466. }
  467. if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) ==
  468. (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) {
  469. if (len_left > 0) {
  470. --len_left;
  471. *(p++) = 'p';
  472. }
  473. ++len_req;
  474. } else {
  475. if ((option & ONIG_OPTION_MULTILINE) != 0) {
  476. if (len_left > 0) {
  477. --len_left;
  478. *(p++) = 'm';
  479. }
  480. ++len_req;
  481. }
  482. if ((option & ONIG_OPTION_SINGLELINE) != 0) {
  483. if (len_left > 0) {
  484. --len_left;
  485. *(p++) = 's';
  486. }
  487. ++len_req;
  488. }
  489. }
  490. if ((option & ONIG_OPTION_FIND_LONGEST) != 0) {
  491. if (len_left > 0) {
  492. --len_left;
  493. *(p++) = 'l';
  494. }
  495. ++len_req;
  496. }
  497. if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) {
  498. if (len_left > 0) {
  499. --len_left;
  500. *(p++) = 'n';
  501. }
  502. ++len_req;
  503. }
  504. c = 0;
  505. if (syntax == ONIG_SYNTAX_JAVA) {
  506. c = 'j';
  507. } else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
  508. c = 'u';
  509. } else if (syntax == ONIG_SYNTAX_GREP) {
  510. c = 'g';
  511. } else if (syntax == ONIG_SYNTAX_EMACS) {
  512. c = 'c';
  513. } else if (syntax == ONIG_SYNTAX_RUBY) {
  514. c = 'r';
  515. } else if (syntax == ONIG_SYNTAX_PERL) {
  516. c = 'z';
  517. } else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
  518. c = 'b';
  519. } else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
  520. c = 'd';
  521. }
  522. if (c != 0) {
  523. if (len_left > 0) {
  524. --len_left;
  525. *(p++) = c;
  526. }
  527. ++len_req;
  528. }
  529. if (len_left > 0) {
  530. --len_left;
  531. *(p++) = '\0';
  532. }
  533. ++len_req;
  534. if (len < len_req) {
  535. return len_req;
  536. }
  537. return 0;
  538. }
  539. /* }}} */
  540. /* {{{ _php_mb_regex_init_options */
  541. static bool _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option,
  542. OnigSyntaxType **syntax)
  543. {
  544. size_t n;
  545. char c;
  546. OnigOptionType optm = 0;
  547. *syntax = ONIG_SYNTAX_RUBY;
  548. if (parg != NULL) {
  549. n = 0;
  550. while(n < narg) {
  551. c = parg[n++];
  552. switch (c) {
  553. case 'i':
  554. optm |= ONIG_OPTION_IGNORECASE;
  555. break;
  556. case 'x':
  557. optm |= ONIG_OPTION_EXTEND;
  558. break;
  559. case 'm':
  560. optm |= ONIG_OPTION_MULTILINE;
  561. break;
  562. case 's':
  563. optm |= ONIG_OPTION_SINGLELINE;
  564. break;
  565. case 'p':
  566. optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
  567. break;
  568. case 'l':
  569. optm |= ONIG_OPTION_FIND_LONGEST;
  570. break;
  571. case 'n':
  572. optm |= ONIG_OPTION_FIND_NOT_EMPTY;
  573. break;
  574. case 'j':
  575. *syntax = ONIG_SYNTAX_JAVA;
  576. break;
  577. case 'u':
  578. *syntax = ONIG_SYNTAX_GNU_REGEX;
  579. break;
  580. case 'g':
  581. *syntax = ONIG_SYNTAX_GREP;
  582. break;
  583. case 'c':
  584. *syntax = ONIG_SYNTAX_EMACS;
  585. break;
  586. case 'r':
  587. *syntax = ONIG_SYNTAX_RUBY;
  588. break;
  589. case 'z':
  590. *syntax = ONIG_SYNTAX_PERL;
  591. break;
  592. case 'b':
  593. *syntax = ONIG_SYNTAX_POSIX_BASIC;
  594. break;
  595. case 'd':
  596. *syntax = ONIG_SYNTAX_POSIX_EXTENDED;
  597. break;
  598. default:
  599. zend_value_error("Option \"%c\" is not supported", c);
  600. return false;
  601. }
  602. }
  603. if (option != NULL) *option|=optm;
  604. }
  605. return true;
  606. }
  607. /* }}} */
  608. /*
  609. * Callbacks for named subpatterns
  610. */
/* {{{ struct mb_regex_groups_iter_args */
/* Context handed to mb_regex_groups_iter() through onig_foreach_name(). */
typedef struct mb_regex_groups_iter_args {
	zval *groups; /* array that receives one entry per group name */
	char *search_str; /* subject string that the region offsets index into */
	size_t search_len; /* length of search_str in bytes */
	OnigRegion *region; /* capture offsets of the match */
} mb_regex_groups_iter_args;
  618. /* }}} */
/* {{{ mb_regex_groups_iter */
  620. static int
  621. mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
  622. {
  623. mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
  624. int gn, beg, end;
  625. /*
  626. * In case of duplicate groups, keep only the last succeeding one
  627. * to be consistent with preg_match with the PCRE_DUPNAMES option.
  628. */
  629. gn = onig_name_to_backref_number(reg, name, name_end, args->region);
  630. beg = args->region->beg[gn];
  631. end = args->region->end[gn];
  632. if (beg >= 0 && beg < end && end <= args->search_len) {
  633. add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
  634. } else {
  635. add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
  636. }
  637. return 0;
  638. }
  639. /* }}} */
  640. /*
  641. * Helper for _php_mb_regex_ereg_replace_exec
  642. */
  643. /* {{{ mb_regex_substitute */
  644. static inline void mb_regex_substitute(
  645. smart_str *pbuf,
  646. const char *subject,
  647. size_t subject_len,
  648. char *replace,
  649. size_t replace_len,
  650. php_mb_regex_t *regexp,
  651. OnigRegion *regs,
  652. const mbfl_encoding *enc
  653. ) {
  654. char *p, *sp, *eos;
  655. int no; /* bakreference group number */
  656. int clen; /* byte-length of the current character */
  657. p = replace;
  658. eos = replace + replace_len;
  659. while (p < eos) {
  660. clen = (int) php_mb_mbchar_bytes_ex(p, enc);
  661. if (clen != 1 || p == eos || p[0] != '\\') {
  662. /* skip anything that's not an ascii backslash */
  663. smart_str_appendl(pbuf, p, clen);
  664. p += clen;
  665. continue;
  666. }
  667. sp = p; /* save position */
  668. clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
  669. if (clen != 1 || p == eos) {
  670. /* skip backslash followed by multibyte char */
  671. smart_str_appendl(pbuf, sp, p - sp);
  672. continue;
  673. }
  674. no = -1;
  675. switch (p[0]) {
  676. case '0':
  677. no = 0;
  678. p++;
  679. break;
  680. case '1': case '2': case '3': case '4':
  681. case '5': case '6': case '7': case '8': case '9':
  682. if (!onig_noname_group_capture_is_active(regexp)) {
  683. /*
  684. * FIXME:
  685. * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
  686. * For now we just ignore them, but in the future we might want to raise a warning
  687. * and abort the whole replace operation.
  688. */
  689. p++;
  690. smart_str_appendl(pbuf, sp, p - sp);
  691. continue;
  692. }
  693. no = p[0] - '0';
  694. p++;
  695. break;
  696. case 'k':
  697. {
  698. clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
  699. if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
  700. /* not a backref delimiter */
  701. p += clen;
  702. smart_str_appendl(pbuf, sp, p - sp);
  703. continue;
  704. }
  705. /* try to consume everything until next delimiter */
  706. char delim = p[0] == '<' ? '>' : '\'';
  707. char *name, *name_end;
  708. char maybe_num = 1;
  709. name_end = name = p + 1;
  710. while (name_end < eos) {
  711. clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
  712. if (clen != 1) {
  713. name_end += clen;
  714. maybe_num = 0;
  715. continue;
  716. }
  717. if (name_end[0] == delim) break;
  718. if (maybe_num && !isdigit(name_end[0])) maybe_num = 0;
  719. name_end++;
  720. }
  721. p = name_end + 1;
  722. if (name_end - name < 1 || name_end >= eos) {
  723. /* the backref was empty or we failed to find the end delimiter */
  724. smart_str_appendl(pbuf, sp, p - sp);
  725. continue;
  726. }
  727. /* we have either a name or a number */
  728. if (maybe_num) {
  729. if (!onig_noname_group_capture_is_active(regexp)) {
  730. /* see above note on mixing numbered & named backrefs */
  731. smart_str_appendl(pbuf, sp, p - sp);
  732. continue;
  733. }
  734. if (name_end - name == 1) {
  735. no = name[0] - '0';
  736. break;
  737. }
  738. if (name[0] == '0') {
  739. /* 01 is not a valid number */
  740. break;
  741. }
  742. no = (int) strtoul(name, NULL, 10);
  743. break;
  744. }
  745. no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
  746. break;
  747. }
  748. default:
  749. /* We're not treating \ as an escape character and will interpret something like
  750. * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
  751. * function has not supported escaping of backslashes historically. */
  752. smart_str_appendl(pbuf, sp, p - sp);
  753. continue;
  754. }
  755. if (no < 0 || no >= regs->num_regs) {
  756. /* invalid group number reference, keep the escape sequence in the output */
  757. smart_str_appendl(pbuf, sp, p - sp);
  758. continue;
  759. }
  760. if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
  761. smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
  762. }
  763. }
  764. if (p < eos) {
  765. smart_str_appendl(pbuf, p, eos - p);
  766. }
  767. }
  768. /* }}} */
  769. /*
  770. * php functions
  771. */
  772. /* {{{ Returns the current encoding for regex as a string. */
  773. PHP_FUNCTION(mb_regex_encoding)
  774. {
  775. char *encoding = NULL;
  776. size_t encoding_len;
  777. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!", &encoding, &encoding_len) == FAILURE) {
  778. RETURN_THROWS();
  779. }
  780. if (!encoding) {
  781. const char *retval = php_mb_regex_get_mbctype();
  782. ZEND_ASSERT(retval != NULL);
  783. RETURN_STRING(retval);
  784. } else {
  785. if (php_mb_regex_set_mbctype(encoding) == FAILURE) {
  786. zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", encoding);
  787. RETURN_THROWS();
  788. }
  789. /* TODO Make function return previous encoding? */
  790. RETURN_TRUE;
  791. }
  792. }
  793. /* }}} */
  794. /* {{{ _php_mb_onig_search */
  795. static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
  796. const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
  797. OnigMatchParam *mp = onig_new_match_param();
  798. int err;
  799. onig_initialize_match_param(mp);
  800. if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
  801. onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
  802. }
  803. if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_retry_limit))) {
  804. onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
  805. }
  806. /* search */
  807. err = onig_search_with_param(reg, str, end, start, range, region, option, mp);
  808. onig_free_match_param(mp);
  809. return err;
  810. }
  811. /* }}} */
  812. /* {{{ _php_mb_regex_ereg_exec */
  813. static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
  814. {
  815. zval *array = NULL;
  816. char *arg_pattern, *string;
  817. size_t arg_pattern_len, string_len;
  818. php_mb_regex_t *re;
  819. OnigRegion *regs = NULL;
  820. int i, beg, end;
  821. OnigOptionType options;
  822. char *str;
  823. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|z", &arg_pattern, &arg_pattern_len, &string, &string_len, &array) == FAILURE) {
  824. RETURN_THROWS();
  825. }
  826. if (arg_pattern_len == 0) {
  827. zend_argument_value_error(1, "must not be empty");
  828. RETURN_THROWS();
  829. }
  830. if (array != NULL) {
  831. array = zend_try_array_init(array);
  832. if (!array) {
  833. RETURN_THROWS();
  834. }
  835. }
  836. if (!php_mb_check_encoding(
  837. string,
  838. string_len,
  839. php_mb_regex_get_mbctype_encoding()
  840. )) {
  841. RETURN_FALSE;
  842. }
  843. options = MBREX(regex_default_options);
  844. if (icase) {
  845. options |= ONIG_OPTION_IGNORECASE;
  846. }
  847. re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(regex_default_syntax));
  848. if (re == NULL) {
  849. RETVAL_FALSE;
  850. goto out;
  851. }
  852. regs = onig_region_new();
  853. /* actually execute the regular expression */
  854. if (_php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) {
  855. RETVAL_FALSE;
  856. goto out;
  857. }
  858. str = string;
  859. if (array != NULL) {
  860. for (i = 0; i < regs->num_regs; i++) {
  861. beg = regs->beg[i];
  862. end = regs->end[i];
  863. if (beg >= 0 && beg < end && (size_t)end <= string_len) {
  864. add_index_stringl(array, i, (char *)&str[beg], end - beg);
  865. } else {
  866. add_index_bool(array, i, 0);
  867. }
  868. }
  869. if (onig_number_of_names(re) > 0) {
  870. mb_regex_groups_iter_args args = {array, string, string_len, regs};
  871. onig_foreach_name(re, mb_regex_groups_iter, &args);
  872. }
  873. }
  874. RETVAL_TRUE;
  875. out:
  876. if (regs != NULL) {
  877. onig_region_free(regs, 1);
  878. }
  879. }
  880. /* }}} */
  881. /* {{{ Regular expression match for multibyte string */
  882. PHP_FUNCTION(mb_ereg)
  883. {
  884. _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  885. }
  886. /* }}} */
  887. /* {{{ Case-insensitive regular expression match for multibyte string */
  888. PHP_FUNCTION(mb_eregi)
  889. {
  890. _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  891. }
  892. /* }}} */
  893. /* {{{ _php_mb_regex_ereg_replace_exec */
  894. static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
  895. {
  896. char *arg_pattern;
  897. size_t arg_pattern_len;
  898. char *replace;
  899. size_t replace_len;
  900. zend_fcall_info arg_replace_fci;
  901. zend_fcall_info_cache arg_replace_fci_cache;
  902. char *string;
  903. size_t string_len;
  904. php_mb_regex_t *re;
  905. OnigSyntaxType *syntax;
  906. OnigRegion *regs = NULL;
  907. smart_str out_buf = {0};
  908. smart_str eval_buf = {0};
  909. smart_str *pbuf;
  910. int err, n;
  911. OnigUChar *pos;
  912. OnigUChar *string_lim;
  913. char *description = NULL;
  914. const mbfl_encoding *enc = php_mb_regex_get_mbctype_encoding();
  915. ZEND_ASSERT(enc != NULL);
  916. {
  917. char *option_str = NULL;
  918. size_t option_str_len = 0;
  919. if (!is_callable) {
  920. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sss|s!",
  921. &arg_pattern, &arg_pattern_len,
  922. &replace, &replace_len,
  923. &string, &string_len,
  924. &option_str, &option_str_len) == FAILURE) {
  925. RETURN_THROWS();
  926. }
  927. } else {
  928. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sfs|s!",
  929. &arg_pattern, &arg_pattern_len,
  930. &arg_replace_fci, &arg_replace_fci_cache,
  931. &string, &string_len,
  932. &option_str, &option_str_len) == FAILURE) {
  933. RETURN_THROWS();
  934. }
  935. }
  936. if (!php_mb_check_encoding(string, string_len, enc)) {
  937. RETURN_NULL();
  938. }
  939. if (option_str != NULL) {
  940. /* Initialize option and in case of failure it means there is a value error */
  941. if (!_php_mb_regex_init_options(option_str, option_str_len, &options, &syntax)) {
  942. RETURN_THROWS();
  943. }
  944. } else {
  945. options |= MBREX(regex_default_options);
  946. syntax = MBREX(regex_default_syntax);
  947. }
  948. }
  949. /* create regex pattern buffer */
  950. re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, syntax);
  951. if (re == NULL) {
  952. RETURN_FALSE;
  953. }
  954. if (is_callable) {
  955. pbuf = &eval_buf;
  956. description = zend_make_compiled_string_description("mbregex replace");
  957. } else {
  958. pbuf = &out_buf;
  959. description = NULL;
  960. }
  961. /* do the actual work */
  962. err = 0;
  963. pos = (OnigUChar *)string;
  964. string_lim = (OnigUChar*)(string + string_len);
  965. regs = onig_region_new();
  966. while (err >= 0) {
  967. err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
  968. if (err <= -2) {
  969. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  970. onig_error_code_to_str(err_str, err);
  971. php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
  972. break;
  973. }
  974. if (err >= 0) {
  975. /* copy the part of the string before the match */
  976. smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
  977. if (!is_callable) {
  978. mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
  979. }
  980. if (is_callable) {
  981. zval args[1];
  982. zval subpats, retval;
  983. int i;
  984. array_init(&subpats);
  985. for (i = 0; i < regs->num_regs; i++) {
  986. add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
  987. }
  988. if (onig_number_of_names(re) > 0) {
  989. mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
  990. onig_foreach_name(re, mb_regex_groups_iter, &args);
  991. }
  992. ZVAL_COPY_VALUE(&args[0], &subpats);
  993. /* null terminate buffer */
  994. smart_str_0(&eval_buf);
  995. arg_replace_fci.param_count = 1;
  996. arg_replace_fci.params = args;
  997. arg_replace_fci.retval = &retval;
  998. if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
  999. !Z_ISUNDEF(retval)) {
  1000. convert_to_string(&retval);
  1001. smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
  1002. smart_str_free(&eval_buf);
  1003. zval_ptr_dtor(&retval);
  1004. } else {
  1005. if (!EG(exception)) {
  1006. zend_throw_error(NULL, "Unable to call custom replacement function");
  1007. zval_ptr_dtor(&subpats);
  1008. RETURN_THROWS();
  1009. }
  1010. }
  1011. zval_ptr_dtor(&subpats);
  1012. }
  1013. n = regs->end[0];
  1014. if ((pos - (OnigUChar *)string) < n) {
  1015. pos = (OnigUChar *)string + n;
  1016. } else {
  1017. if (pos < string_lim) {
  1018. smart_str_appendl(&out_buf, (char *)pos, 1);
  1019. }
  1020. pos++;
  1021. }
  1022. } else { /* nomatch */
  1023. /* stick that last bit of string on our output */
  1024. if (string_lim - pos > 0) {
  1025. smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
  1026. }
  1027. }
  1028. onig_region_free(regs, 0);
  1029. }
  1030. if (description) {
  1031. efree(description);
  1032. }
  1033. if (regs != NULL) {
  1034. onig_region_free(regs, 1);
  1035. }
  1036. smart_str_free(&eval_buf);
  1037. if (err <= -2) {
  1038. smart_str_free(&out_buf);
  1039. RETVAL_FALSE;
  1040. } else if (out_buf.s) {
  1041. smart_str_0(&out_buf);
  1042. RETVAL_STR(out_buf.s);
  1043. } else {
  1044. RETVAL_EMPTY_STRING();
  1045. }
  1046. }
  1047. /* }}} */
  1048. /* {{{ Replace regular expression for multibyte string */
  1049. PHP_FUNCTION(mb_ereg_replace)
  1050. {
  1051. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
  1052. }
  1053. /* }}} */
  1054. /* {{{ Case insensitive replace regular expression for multibyte string */
  1055. PHP_FUNCTION(mb_eregi_replace)
  1056. {
  1057. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
  1058. }
  1059. /* }}} */
  1060. /* {{{ regular expression for multibyte string using replacement callback */
  1061. PHP_FUNCTION(mb_ereg_replace_callback)
  1062. {
  1063. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
  1064. }
  1065. /* }}} */
  1066. /* {{{ split multibyte string into array by regular expression */
  1067. PHP_FUNCTION(mb_split)
  1068. {
  1069. char *arg_pattern;
  1070. size_t arg_pattern_len;
  1071. php_mb_regex_t *re;
  1072. OnigRegion *regs = NULL;
  1073. char *string;
  1074. OnigUChar *pos, *chunk_pos;
  1075. size_t string_len;
  1076. int err;
  1077. zend_long count = -1;
  1078. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
  1079. RETURN_THROWS();
  1080. }
  1081. if (count > 0) {
  1082. count--;
  1083. }
  1084. if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
  1085. RETURN_FALSE;
  1086. }
  1087. /* create regex pattern buffer */
  1088. if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(regex_default_syntax))) == NULL) {
  1089. RETURN_FALSE;
  1090. }
  1091. array_init(return_value);
  1092. chunk_pos = pos = (OnigUChar *)string;
  1093. err = 0;
  1094. regs = onig_region_new();
  1095. /* churn through str, generating array entries as we go */
  1096. while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
  1097. size_t beg, end;
  1098. err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
  1099. if (err < 0) {
  1100. break;
  1101. }
  1102. beg = regs->beg[0], end = regs->end[0];
  1103. /* add it to the array */
  1104. if ((size_t)(pos - (OnigUChar *)string) < end) {
  1105. if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
  1106. add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
  1107. --count;
  1108. } else {
  1109. err = -2;
  1110. break;
  1111. }
  1112. /* point at our new starting point */
  1113. chunk_pos = pos = (OnigUChar *)string + end;
  1114. } else {
  1115. pos++;
  1116. }
  1117. onig_region_free(regs, 0);
  1118. }
  1119. onig_region_free(regs, 1);
  1120. /* see if we encountered an error */
  1121. // ToDo investigate if this can actually/should happen ...
  1122. if (err <= -2) {
  1123. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1124. onig_error_code_to_str(err_str, err);
  1125. php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
  1126. zend_array_destroy(Z_ARR_P(return_value));
  1127. RETURN_FALSE;
  1128. }
  1129. /* otherwise we just have one last element to add to the array */
  1130. if ((OnigUChar *)(string + string_len) > chunk_pos) {
  1131. size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
  1132. add_next_index_stringl(return_value, (char *)chunk_pos, n);
  1133. } else {
  1134. add_next_index_stringl(return_value, "", 0);
  1135. }
  1136. }
  1137. /* }}} */
  1138. /* {{{ Regular expression match for multibyte string */
  1139. PHP_FUNCTION(mb_ereg_match)
  1140. {
  1141. char *arg_pattern;
  1142. size_t arg_pattern_len;
  1143. char *string;
  1144. size_t string_len;
  1145. php_mb_regex_t *re;
  1146. OnigSyntaxType *syntax;
  1147. OnigOptionType option = 0;
  1148. int err;
  1149. OnigMatchParam *mp;
  1150. {
  1151. char *option_str = NULL;
  1152. size_t option_str_len = 0;
  1153. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s!",
  1154. &arg_pattern, &arg_pattern_len, &string, &string_len,
  1155. &option_str, &option_str_len)==FAILURE) {
  1156. RETURN_THROWS();
  1157. }
  1158. if (option_str != NULL) {
  1159. if(!_php_mb_regex_init_options(option_str, option_str_len, &option, &syntax)) {
  1160. RETURN_THROWS();
  1161. }
  1162. } else {
  1163. option |= MBREX(regex_default_options);
  1164. syntax = MBREX(regex_default_syntax);
  1165. }
  1166. }
  1167. if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
  1168. RETURN_FALSE;
  1169. }
  1170. if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
  1171. RETURN_FALSE;
  1172. }
  1173. mp = onig_new_match_param();
  1174. onig_initialize_match_param(mp);
  1175. if (MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
  1176. onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
  1177. }
  1178. if (MBSTRG(regex_retry_limit) > 0 && MBSTRG(regex_retry_limit) < UINT_MAX) {
  1179. onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
  1180. }
  1181. /* match */
  1182. err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
  1183. onig_free_match_param(mp);
  1184. if (err >= 0) {
  1185. RETVAL_TRUE;
  1186. } else {
  1187. RETVAL_FALSE;
  1188. }
  1189. }
  1190. /* }}} */
  1191. /* regex search */
  1192. /* {{{ _php_mb_regex_ereg_search_exec */
  1193. static void _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
  1194. {
  1195. char *arg_pattern = NULL, *arg_options = NULL;
  1196. size_t arg_pattern_len, arg_options_len;
  1197. int err;
  1198. size_t n, i, pos, len;
  1199. /* Stored as int* in the OnigRegion struct */
  1200. int beg, end;
  1201. OnigOptionType option = 0;
  1202. OnigUChar *str;
  1203. OnigSyntaxType *syntax;
  1204. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!s!", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
  1205. RETURN_THROWS();
  1206. }
  1207. if (arg_options) {
  1208. _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax);
  1209. } else {
  1210. option |= MBREX(regex_default_options);
  1211. syntax = MBREX(regex_default_syntax);
  1212. }
  1213. if (MBREX(search_regs)) {
  1214. onig_region_free(MBREX(search_regs), 1);
  1215. MBREX(search_regs) = NULL;
  1216. }
  1217. if (arg_pattern) {
  1218. /* create regex pattern buffer */
  1219. if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
  1220. RETURN_FALSE;
  1221. }
  1222. }
  1223. pos = MBREX(search_pos);
  1224. str = NULL;
  1225. len = 0;
  1226. if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
  1227. str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
  1228. len = Z_STRLEN(MBREX(search_str));
  1229. }
  1230. if (MBREX(search_re) == NULL) {
  1231. zend_throw_error(NULL, "No pattern was provided");
  1232. RETURN_THROWS();
  1233. }
  1234. if (str == NULL) {
  1235. zend_throw_error(NULL, "No string was provided");
  1236. RETURN_THROWS();
  1237. }
  1238. MBREX(search_regs) = onig_region_new();
  1239. err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
  1240. if (err == ONIG_MISMATCH) {
  1241. MBREX(search_pos) = len;
  1242. RETVAL_FALSE;
  1243. } else if (err <= -2) {
  1244. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1245. onig_error_code_to_str(err_str, err);
  1246. php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
  1247. RETVAL_FALSE;
  1248. } else {
  1249. switch (mode) {
  1250. case 1:
  1251. array_init(return_value);
  1252. beg = MBREX(search_regs)->beg[0];
  1253. end = MBREX(search_regs)->end[0];
  1254. add_next_index_long(return_value, beg);
  1255. add_next_index_long(return_value, end - beg);
  1256. break;
  1257. case 2:
  1258. array_init(return_value);
  1259. n = MBREX(search_regs)->num_regs;
  1260. for (i = 0; i < n; i++) {
  1261. beg = MBREX(search_regs)->beg[i];
  1262. end = MBREX(search_regs)->end[i];
  1263. if (beg >= 0 && beg <= end && end <= len) {
  1264. add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
  1265. } else {
  1266. add_index_bool(return_value, i, 0);
  1267. }
  1268. }
  1269. if (onig_number_of_names(MBREX(search_re)) > 0) {
  1270. mb_regex_groups_iter_args args = {
  1271. return_value,
  1272. Z_STRVAL(MBREX(search_str)),
  1273. Z_STRLEN(MBREX(search_str)),
  1274. MBREX(search_regs)
  1275. };
  1276. onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
  1277. }
  1278. break;
  1279. default:
  1280. RETVAL_TRUE;
  1281. break;
  1282. }
  1283. end = MBREX(search_regs)->end[0];
  1284. if (pos <= end) {
  1285. MBREX(search_pos) = end;
  1286. } else {
  1287. MBREX(search_pos) = pos + 1;
  1288. }
  1289. }
  1290. if (err < 0) {
  1291. onig_region_free(MBREX(search_regs), 1);
  1292. MBREX(search_regs) = (OnigRegion *)NULL;
  1293. }
  1294. }
  1295. /* }}} */
  1296. /* {{{ Regular expression search for multibyte string */
  1297. PHP_FUNCTION(mb_ereg_search)
  1298. {
  1299. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  1300. }
  1301. /* }}} */
  1302. /* {{{ Regular expression search for multibyte string */
  1303. PHP_FUNCTION(mb_ereg_search_pos)
  1304. {
  1305. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  1306. }
  1307. /* }}} */
  1308. /* {{{ Regular expression search for multibyte string */
  1309. PHP_FUNCTION(mb_ereg_search_regs)
  1310. {
  1311. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
  1312. }
  1313. /* }}} */
  1314. /* {{{ Initialize string and regular expression for search. */
  1315. PHP_FUNCTION(mb_ereg_search_init)
  1316. {
  1317. zend_string *arg_str;
  1318. char *arg_pattern = NULL, *arg_options = NULL;
  1319. size_t arg_pattern_len = 0, arg_options_len = 0;
  1320. OnigSyntaxType *syntax = NULL;
  1321. OnigOptionType option;
  1322. if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|s!s!", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
  1323. RETURN_THROWS();
  1324. }
  1325. if (arg_pattern && arg_pattern_len == 0) {
  1326. zend_argument_value_error(2, "must not be empty");
  1327. RETURN_THROWS();
  1328. }
  1329. if (arg_options) {
  1330. option = 0;
  1331. _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax);
  1332. } else {
  1333. option = MBREX(regex_default_options);
  1334. syntax = MBREX(regex_default_syntax);
  1335. }
  1336. if (arg_pattern) {
  1337. /* create regex pattern buffer */
  1338. if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
  1339. RETURN_FALSE;
  1340. }
  1341. }
  1342. if (!Z_ISNULL(MBREX(search_str))) {
  1343. zval_ptr_dtor(&MBREX(search_str));
  1344. }
  1345. ZVAL_STR_COPY(&MBREX(search_str), arg_str);
  1346. if (php_mb_check_encoding(ZSTR_VAL(arg_str), ZSTR_LEN(arg_str), php_mb_regex_get_mbctype_encoding())) {
  1347. MBREX(search_pos) = 0;
  1348. RETVAL_TRUE;
  1349. } else {
  1350. MBREX(search_pos) = ZSTR_LEN(arg_str);
  1351. RETVAL_FALSE;
  1352. }
  1353. if (MBREX(search_regs) != NULL) {
  1354. onig_region_free(MBREX(search_regs), 1);
  1355. MBREX(search_regs) = NULL;
  1356. }
  1357. }
  1358. /* }}} */
  1359. /* {{{ Get matched substring of the last time */
  1360. PHP_FUNCTION(mb_ereg_search_getregs)
  1361. {
  1362. size_t n, i, len;
  1363. /* Stored as int* in the OnigRegion struct */
  1364. int beg, end;
  1365. OnigUChar *str;
  1366. if (zend_parse_parameters_none() == FAILURE) {
  1367. RETURN_THROWS();
  1368. }
  1369. if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
  1370. array_init(return_value);
  1371. str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
  1372. len = Z_STRLEN(MBREX(search_str));
  1373. n = MBREX(search_regs)->num_regs;
  1374. for (i = 0; i < n; i++) {
  1375. beg = MBREX(search_regs)->beg[i];
  1376. end = MBREX(search_regs)->end[i];
  1377. if (beg >= 0 && beg <= end && end <= len) {
  1378. add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
  1379. } else {
  1380. add_index_bool(return_value, i, 0);
  1381. }
  1382. }
  1383. if (onig_number_of_names(MBREX(search_re)) > 0) {
  1384. mb_regex_groups_iter_args args = {
  1385. return_value,
  1386. Z_STRVAL(MBREX(search_str)),
  1387. len,
  1388. MBREX(search_regs)
  1389. };
  1390. onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
  1391. }
  1392. } else {
  1393. // TODO This seems to be some logical error, promote to Error
  1394. RETVAL_FALSE;
  1395. }
  1396. }
  1397. /* }}} */
  1398. /* {{{ Get search start position */
  1399. PHP_FUNCTION(mb_ereg_search_getpos)
  1400. {
  1401. if (zend_parse_parameters_none() == FAILURE) {
  1402. RETURN_THROWS();
  1403. }
  1404. RETVAL_LONG(MBREX(search_pos));
  1405. }
  1406. /* }}} */
  1407. /* {{{ Set search start position */
  1408. PHP_FUNCTION(mb_ereg_search_setpos)
  1409. {
  1410. zend_long position;
  1411. if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
  1412. RETURN_THROWS();
  1413. }
  1414. /* Accept negative position if length of search string can be determined */
  1415. if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
  1416. position += Z_STRLEN(MBREX(search_str));
  1417. }
  1418. if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
  1419. zend_argument_value_error(1, "is out of range");
  1420. RETURN_THROWS();
  1421. }
  1422. MBREX(search_pos) = position;
  1423. // TODO Return void
  1424. RETURN_TRUE;
  1425. }
  1426. /* }}} */
  1427. /* {{{ php_mb_regex_set_options */
  1428. static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
  1429. {
  1430. if (prev_options != NULL) {
  1431. *prev_options = MBREX(regex_default_options);
  1432. }
  1433. if (prev_syntax != NULL) {
  1434. *prev_syntax = MBREX(regex_default_syntax);
  1435. }
  1436. MBREX(regex_default_options) = options;
  1437. MBREX(regex_default_syntax) = syntax;
  1438. }
  1439. /* }}} */
  1440. /* {{{ Set or get the default options for mbregex functions */
  1441. PHP_FUNCTION(mb_regex_set_options)
  1442. {
  1443. OnigOptionType opt, prev_opt;
  1444. OnigSyntaxType *syntax, *prev_syntax;
  1445. char *string = NULL;
  1446. size_t string_len;
  1447. char buf[16];
  1448. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!",
  1449. &string, &string_len) == FAILURE) {
  1450. RETURN_THROWS();
  1451. }
  1452. if (string != NULL) {
  1453. opt = 0;
  1454. syntax = NULL;
  1455. if(!_php_mb_regex_init_options(string, string_len, &opt, &syntax)) {
  1456. RETURN_THROWS();
  1457. }
  1458. _php_mb_regex_set_options(opt, syntax, &prev_opt, &prev_syntax);
  1459. opt = prev_opt;
  1460. syntax = prev_syntax;
  1461. } else {
  1462. opt = MBREX(regex_default_options);
  1463. syntax = MBREX(regex_default_syntax);
  1464. }
  1465. _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
  1466. RETVAL_STRING(buf);
  1467. }
  1468. /* }}} */
  1469. #endif /* HAVE_MBREGEX */