json_scanner.re 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. /*
  2. +----------------------------------------------------------------------+
  3. | Copyright (c) The PHP Group |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | https://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Jakub Zelenka <bukka@php.net> |
  14. +----------------------------------------------------------------------+
  15. */
  16. #include "php.h"
  17. #include "php_json_scanner.h"
  18. #include "php_json_scanner_defs.h"
  19. #include "php_json_parser.h"
  20. #include "json_parser.tab.h"
  /*
   * re2c scanner interface.
   *
   * The code generated from the re2c block in php_json_scan() reaches its
   * input and its condition state only through the YY* macros below. Each of
   * them expands to a member of a scanner pointer named `s`, so they are
   * usable only where such a variable is in scope.
   */
  #define YYCTYPE php_json_ctype
  #define YYCURSOR s->cursor
  #define YYLIMIT s->limit
  #define YYMARKER s->marker
  #define YYCTXMARKER s->ctxmarker
  #define YYGETCONDITION() s->state
  #define YYSETCONDITION(yystate) s->state = yystate
  /* Intentionally empty: the re2c block sets re2c:yyfill:enable = 0. */
  #define YYFILL(n)

  /* Store condition `condition` (re2c constant yyc<condition>) in s->state. */
  #define PHP_JSON_CONDITION_SET(condition) YYSETCONDITION(yyc##condition)

  /* Jump to the generated label of `condition` without touching s->state. */
  #define PHP_JSON_CONDITION_GOTO(condition) goto yyc_##condition

  /*
   * Store `condition` in s->state and jump to its label.
   *
   * Wrapped in do { } while (0) so that the two statements stay together when
   * the macro is used as the body of an unbraced if/else; without the wrapper
   * only the assignment would be conditional and the goto would always run.
   */
  #define PHP_JSON_CONDITION_SET_AND_GOTO(condition) \
      do { \
          PHP_JSON_CONDITION_SET(condition); \
          PHP_JSON_CONDITION_GOTO(condition); \
      } while (0)

  /*
   * Continue the second string pass in the condition that matches the first
   * pass result: STR_P2_BIN when s->utf8_invalid is set, STR_P2_UTF otherwise.
   * s->state is not modified.
   */
  #define PHP_JSON_CONDITION_GOTO_STR_P2() \
      do { \
          if (s->utf8_invalid) { \
              PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \
          } else { \
              PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \
          } \
      } while (0)

  /*
   * Flush pending literal bytes to the output string. The argument passed to
   * php_json_scanner_copy_string() is the number of trailing bytes, in addition
   * to the last one consumed, that must not be copied:
   *   0  - only the last consumed byte is excluded;
   *   5  - a six byte \uXXXX escape is excluded;
   *   11 - a twelve byte \uXXXX\uXXXX pair is excluded.
   */
  #define PHP_JSON_SCANNER_COPY_ESC() php_json_scanner_copy_string(s, 0)
  #define PHP_JSON_SCANNER_COPY_UTF() php_json_scanner_copy_string(s, 5)
  #define PHP_JSON_SCANNER_COPY_UTF_SP() php_json_scanner_copy_string(s, 11)

  /*
   * Digit count from which the INT rule in php_json_scan() stops trusting the
   * literal to fit in a zend_long and compares it against LONG_MIN_DIGITS.
   */
  #define PHP_JSON_INT_MAX_LENGTH (MAX_LENGTH_OF_LONG - 1)
  46. static void php_json_scanner_copy_string(php_json_scanner *s, int esc_size)
  47. {
  48. size_t len = s->cursor - s->str_start - esc_size - 1;
  49. if (len) {
  50. memcpy(s->pstr, s->str_start, len);
  51. s->pstr += len;
  52. }
  53. }
  /*
   * Convert one ASCII hexadecimal digit to its numeric value.
   *
   * Accepts '0'-'9', 'A'-'F' and 'a'-'f' and returns 0..15. Any other
   * character yields -1.
   */
  static int php_json_hex_to_int(char code)
  {
      if ('0' <= code && code <= '9') {
          return code - '0';
      }
      if ('A' <= code && code <= 'F') {
          return code - 'A' + 10;
      }
      if ('a' <= code && code <= 'f') {
          return code - 'a' + 10;
      }

      /* Not a hex digit; the scanner rules only hand over HEX characters. */
      return -1;
  }
  67. static int php_json_ucs2_to_int_ex(php_json_scanner *s, int size, int start)
  68. {
  69. int i, code = 0;
  70. php_json_ctype *pc = s->cursor - start;
  71. for (i = 0; i < size; i++) {
  72. code |= php_json_hex_to_int(*(pc--)) << (i * 4);
  73. }
  74. return code;
  75. }
  76. static int php_json_ucs2_to_int(php_json_scanner *s, int size)
  77. {
  78. return php_json_ucs2_to_int_ex(s, size, 1);
  79. }
  80. void php_json_scanner_init(php_json_scanner *s, const char *str, size_t str_len, int options)
  81. {
  82. s->cursor = (php_json_ctype *) str;
  83. s->limit = (php_json_ctype *) str + str_len;
  84. s->options = options;
  85. PHP_JSON_CONDITION_SET(JS);
  86. }
  /*
   * Scan the next token from the scanner input and return its token code.
   *
   * s->value is reset to NULL on entry and overwritten by the rules that
   * produce a value. s->token is set to the start of the token being matched
   * and is set again every time whitespace has been skipped.
   *
   * Return values produced by the rules below:
   *   - the characters '{', '}', '[', ']', ':' and ',' for punctuation;
   *   - PHP_JSON_T_NUL, PHP_JSON_T_TRUE, PHP_JSON_T_FALSE, PHP_JSON_T_INT,
   *     PHP_JSON_T_DOUBLE, PHP_JSON_T_STRING, PHP_JSON_T_ESTRING (empty
   *     string) and PHP_JSON_T_EOI;
   *   - PHP_JSON_T_ERROR, with s->errcode set to PHP_JSON_ERROR_SYNTAX,
   *     PHP_JSON_ERROR_CTRL_CHAR, PHP_JSON_ERROR_UTF8 or PHP_JSON_ERROR_UTF16.
   *
   * The matcher uses four re2c conditions:
   *   JS          everything outside of string literals;
   *   STR_P1      first pass over a string: validates it and measures the
   *               decoded length;
   *   STR_P2_UTF  second pass when the first pass saw no invalid UTF-8;
   *   STR_P2_BIN  second pass when invalid UTF-8 bytes have to be dropped or
   *               substituted.
   *
   * Strings are decoded in two passes so that the result can be allocated
   * with its exact final size: pass one accumulates in s->str_esc how many
   * bytes the escapes save and in s->utf8_invalid_count the size change caused
   * by invalid bytes; pass two runs only when one of them is non-zero and
   * writes the decoded bytes through s->pstr.
   */
  87. int php_json_scan(php_json_scanner *s)
  88. {
  /* Default value; rules that carry a value replace it. */
  89. ZVAL_NULL(&s->value);
  /* Restart point: the whitespace rule jumps back here to scan the next token. */
  90. std:
  91. s->token = s->cursor;
  92. /*!re2c
  // Buffer refill is disabled; YYFILL is defined as an empty macro.
  93. re2c:indent:top = 1;
  94. re2c:yyfill:enable = 0;
  // Number building blocks. UINT forbids leading zeros.
  95. DIGIT = [0-9] ;
  96. DIGITNZ = [1-9] ;
  97. UINT = "0" | ( DIGITNZ DIGIT* ) ;
  98. INT = "-"? UINT ;
  // Hex digit classes. HEX7 and HEXC restrict single digits of a \u escape so
  // that the UTF16_* definitions can split the code point ranges.
  // HEXNZ is not referenced by any rule in this block.
  99. HEX = DIGIT | [a-fA-F] ;
  100. HEXNZ = DIGITNZ | [a-fA-F] ;
  101. HEX7 = [0-7] ;
  102. HEXC = DIGIT | [a-cA-C] ;
  103. FLOAT = INT "." DIGIT+ ;
  104. EXP = ( INT | FLOAT ) [eE] [+-]? DIGIT+ ;
  // Whitespace, the NUL end marker and control characters.
  105. NL = "\r"? "\n" ;
  106. WS = [ \t\r]+ ;
  107. EOI = "\000";
  108. CTRL = [\x00-\x1F] ;
  // Well-formed UTF-8 sequences, one definition per lead byte range. The second
  // byte ranges leave out overlong forms (E0 80-9F, F0 80-8F), the surrogate
  // range (ED A0-BF) and values above F4 8F.
  109. UTF8T = [\x80-\xBF] ;
  110. UTF8_1 = [\x00-\x7F] ;
  111. UTF8_2 = [\xC2-\xDF] UTF8T ;
  112. UTF8_3A = "\xE0" [\xA0-\xBF] UTF8T ;
  113. UTF8_3B = [\xE1-\xEC] UTF8T{2} ;
  114. UTF8_3C = "\xED" [\x80-\x9F] UTF8T ;
  115. UTF8_3D = [\xEE-\xEF] UTF8T{2} ;
  116. UTF8_3 = UTF8_3A | UTF8_3B | UTF8_3C | UTF8_3D ;
  117. UTF8_4A = "\xF0"[\x90-\xBF] UTF8T{2} ;
  118. UTF8_4B = [\xF1-\xF3] UTF8T{3} ;
  119. UTF8_4C = "\xF4" [\x80-\x8F] UTF8T{2} ;
  120. UTF8_4 = UTF8_4A | UTF8_4B | UTF8_4C ;
  121. UTF8 = UTF8_1 | UTF8_2 | UTF8_3 | UTF8_4 ;
  // ANY matches every single byte.
  122. ANY = [^] ;
  // Backslash escapes.
  123. ESCPREF = "\\" ;
  124. ESCSYM = ( "\"" | "\\" | "/" | [bfnrt] ) ;
  125. ESC = ESCPREF ESCSYM ;
  126. UTFSYM = "u" ;
  127. UTFPREF = ESCPREF UTFSYM ;
  // Unicode escapes, named after the number of UTF-8 bytes they decode to:
  //   UTF16_1  0000-007F, one byte
  //   UTF16_2  0000-07FF as written; the UTF16_1 rule is listed first, so in
  //            effect 0080-07FF, two bytes
  //   UTF16_3  rest of the 16 bit range except D800-DFFF, three bytes
  //   UTF16_4  D800-DBFF followed by DC00-DFFF (surrogate pair), four bytes
  // UCS2 is any four digit escape; its rule comes after the UTF16_* rules and
  // so is left with the escapes none of them accepts.
  // NOTE(review): this relies on re2c preferring the earlier rule when several
  // rules match the same input length - confirm against the re2c manual.
  128. UCS2 = UTFPREF HEX{4} ;
  129. UTF16_1 = UTFPREF "00" HEX7 HEX ;
  130. UTF16_2 = UTFPREF "0" HEX7 HEX{2} ;
  131. UTF16_3 = UTFPREF ( ( ( HEXC | [efEF] ) HEX ) | ( [dD] HEX7 ) ) HEX{2} ;
  132. UTF16_4 = UTFPREF [dD] [89abAB] HEX{2} UTFPREF [dD] [c-fC-F] HEX{2} ;
  // ---- Condition JS: structural characters and literals ----
  133. <JS>"{" { return '{'; }
  134. <JS>"}" { return '}'; }
  135. <JS>"[" { return '['; }
  136. <JS>"]" { return ']'; }
  137. <JS>":" { return ':'; }
  138. <JS>"," { return ','; }
  139. <JS>"null" {
  140. ZVAL_NULL(&s->value);
  141. return PHP_JSON_T_NUL;
  142. }
  143. <JS>"true" {
  144. ZVAL_TRUE(&s->value);
  145. return PHP_JSON_T_TRUE;
  146. }
  147. <JS>"false" {
  148. ZVAL_FALSE(&s->value);
  149. return PHP_JSON_T_FALSE;
  150. }
  // Integer literal. digits counts the digits without the sign. A literal with
  // more than PHP_JSON_INT_MAX_LENGTH digits is always a big integer. With
  // exactly that many digits it is compared as text against LONG_MIN_DIGITS and
  // is a big integer unless it compares lower, or equal while negative.
  // Big integers are returned as a string when PHP_JSON_BIGINT_AS_STRING is
  // set and as a double otherwise.
  // NOTE(review): LONG_MIN_DIGITS is defined outside this file; presumably the
  // decimal digits of the smallest zend_long - confirm.
  151. <JS>INT {
  152. bool bigint = 0, negative = s->token[0] == '-';
  153. size_t digits = (size_t) (s->cursor - s->token - negative);
  154. if (digits >= PHP_JSON_INT_MAX_LENGTH) {
  155. if (digits == PHP_JSON_INT_MAX_LENGTH) {
  156. int cmp = strncmp((char *) (s->token + negative), LONG_MIN_DIGITS, PHP_JSON_INT_MAX_LENGTH);
  157. if (!(cmp < 0 || (cmp == 0 && negative))) {
  158. bigint = 1;
  159. }
  160. } else {
  161. bigint = 1;
  162. }
  163. }
  164. if (!bigint) {
  165. ZVAL_LONG(&s->value, ZEND_STRTOL((char *) s->token, NULL, 10));
  166. return PHP_JSON_T_INT;
  167. } else if (s->options & PHP_JSON_BIGINT_AS_STRING) {
  168. ZVAL_STRINGL(&s->value, (char *) s->token, s->cursor - s->token);
  169. return PHP_JSON_T_STRING;
  170. } else {
  171. ZVAL_DOUBLE(&s->value, zend_strtod((char *) s->token, NULL));
  172. return PHP_JSON_T_DOUBLE;
  173. }
  174. }
  175. <JS>FLOAT|EXP {
  176. ZVAL_DOUBLE(&s->value, zend_strtod((char *) s->token, NULL));
  177. return PHP_JSON_T_DOUBLE;
  178. }
  // Whitespace produces no token; restart at std so s->token is set again.
  179. <JS>NL|WS { goto std; }
  // A NUL byte. After it is consumed the cursor is past s->limit only when the
  // NUL sat at the limit itself; that is reported as end of input. A NUL before
  // the limit is a control character inside the data.
  // NOTE(review): assumes the input buffer holds a NUL at str + str_len -
  // confirm against the callers of php_json_scanner_init.
  180. <JS>EOI {
  181. if (s->limit < s->cursor) {
  182. return PHP_JSON_T_EOI;
  183. } else {
  184. s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
  185. return PHP_JSON_T_ERROR;
  186. }
  187. }
  // Opening quote: remember where the string content starts, clear the pass one
  // counters and continue in STR_P1.
  188. <JS>["] {
  189. s->str_start = s->cursor;
  190. s->str_esc = 0;
  191. s->utf8_invalid = 0;
  192. s->utf8_invalid_count = 0;
  193. PHP_JSON_CONDITION_SET_AND_GOTO(STR_P1);
  194. }
  // Fallbacks for input that starts no token: a control character, then any
  // other well-formed UTF-8 character (syntax error), then any remaining byte,
  // which is not well-formed UTF-8.
  195. <JS>CTRL {
  196. s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
  197. return PHP_JSON_T_ERROR;
  198. }
  199. <JS>UTF8 {
  200. s->errcode = PHP_JSON_ERROR_SYNTAX;
  201. return PHP_JSON_T_ERROR;
  202. }
  203. <JS>ANY {
  204. s->errcode = PHP_JSON_ERROR_UTF8;
  205. return PHP_JSON_T_ERROR;
  206. }
  // ---- Condition STR_P1: validate the string and measure its decoded size ----
  // Nothing is written in this pass. s->str_esc grows by the number of bytes
  // each escape saves: source length minus decoded UTF-8 length.
  207. <STR_P1>CTRL {
  208. s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
  209. return PHP_JSON_T_ERROR;
  210. }
  // Six source bytes become one byte.
  211. <STR_P1>UTF16_1 {
  212. s->str_esc += 5;
  213. PHP_JSON_CONDITION_GOTO(STR_P1);
  214. }
  // Six source bytes become two bytes.
  215. <STR_P1>UTF16_2 {
  216. s->str_esc += 4;
  217. PHP_JSON_CONDITION_GOTO(STR_P1);
  218. }
  // Six source bytes become three bytes.
  219. <STR_P1>UTF16_3 {
  220. s->str_esc += 3;
  221. PHP_JSON_CONDITION_GOTO(STR_P1);
  222. }
  // Twelve source bytes (surrogate pair) become four bytes.
  223. <STR_P1>UTF16_4 {
  224. s->str_esc += 8;
  225. PHP_JSON_CONDITION_GOTO(STR_P1);
  226. }
  // A four digit escape that none of the UTF16_* rules accepted.
  227. <STR_P1>UCS2 {
  228. s->errcode = PHP_JSON_ERROR_UTF16;
  229. return PHP_JSON_T_ERROR;
  230. }
  // Two source bytes become one byte.
  231. <STR_P1>ESC {
  232. s->str_esc++;
  233. PHP_JSON_CONDITION_GOTO(STR_P1);
  234. }
  // A backslash that starts no valid escape.
  235. <STR_P1>ESCPREF {
  236. s->errcode = PHP_JSON_ERROR_SYNTAX;
  237. return PHP_JSON_T_ERROR;
  238. }
  // Closing quote. The decoded length is the raw length (minus one for the
  // quote just consumed) minus the bytes saved by escapes plus the adjustment
  // for invalid UTF-8 bytes. An empty result is returned as PHP_JSON_T_ESTRING.
  // Otherwise the result string is allocated and either filled with one memcpy
  // (nothing to decode) or by the second pass, which restarts at s->str_start.
  // The jump into the second pass does not change s->state; the closing quote
  // rule of the second pass sets it back to JS.
  239. <STR_P1>["] {
  240. zend_string *str;
  241. size_t len = s->cursor - s->str_start - s->str_esc - 1 + s->utf8_invalid_count;
  242. if (len == 0) {
  243. PHP_JSON_CONDITION_SET(JS);
  244. ZVAL_EMPTY_STRING(&s->value);
  245. return PHP_JSON_T_ESTRING;
  246. }
  247. str = zend_string_alloc(len, 0);
  248. ZSTR_VAL(str)[len] = '\0';
  249. ZVAL_STR(&s->value, str);
  250. if (s->str_esc || s->utf8_invalid) {
  251. s->pstr = (php_json_ctype *) Z_STRVAL(s->value);
  252. s->cursor = s->str_start;
  253. PHP_JSON_CONDITION_GOTO_STR_P2();
  254. } else {
  255. memcpy(Z_STRVAL(s->value), s->str_start, len);
  256. PHP_JSON_CONDITION_SET(JS);
  257. return PHP_JSON_T_STRING;
  258. }
  259. }
  // Well-formed UTF-8 is copied unchanged later, so it needs no accounting.
  260. <STR_P1>UTF8 { PHP_JSON_CONDITION_GOTO(STR_P1); }
  // A byte that is not part of well-formed UTF-8. Without one of the two
  // INVALID_UTF8 options this is an error. With SUBSTITUTE the byte will be
  // replaced by a three byte sequence, so the result grows by two; the check
  // against INT_MAX guards that addition. With IGNORE the byte is dropped, so
  // the result shrinks by one.
  261. <STR_P1>ANY {
  262. if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) {
  263. if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
  264. if (s->utf8_invalid_count > INT_MAX - 2) {
  265. s->errcode = PHP_JSON_ERROR_UTF8;
  266. return PHP_JSON_T_ERROR;
  267. }
  268. s->utf8_invalid_count += 2;
  269. } else {
  270. s->utf8_invalid_count--;
  271. }
  272. s->utf8_invalid = 1;
  273. PHP_JSON_CONDITION_GOTO(STR_P1);
  274. }
  275. s->errcode = PHP_JSON_ERROR_UTF8;
  276. return PHP_JSON_T_ERROR;
  277. }
  // ---- Conditions STR_P2_UTF and STR_P2_BIN: write the decoded string ----
  // Literal bytes are not copied one by one. s->str_start marks the start of
  // the pending literal run; each rule below first flushes that run with a
  // PHP_JSON_SCANNER_COPY_* macro, writes its own decoded bytes through
  // s->pstr and then moves s->str_start to the cursor.
  278. <STR_P2_UTF,STR_P2_BIN>UTF16_1 {
  279. int utf16 = php_json_ucs2_to_int(s, 2);
  280. PHP_JSON_SCANNER_COPY_UTF();
  281. *(s->pstr++) = (char) utf16;
  282. s->str_start = s->cursor;
  283. PHP_JSON_CONDITION_GOTO_STR_P2();
  284. }
  // Two byte UTF-8 form: 110xxxxx 10xxxxxx.
  285. <STR_P2_UTF,STR_P2_BIN>UTF16_2 {
  286. int utf16 = php_json_ucs2_to_int(s, 3);
  287. PHP_JSON_SCANNER_COPY_UTF();
  288. *(s->pstr++) = (char) (0xc0 | (utf16 >> 6));
  289. *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f));
  290. s->str_start = s->cursor;
  291. PHP_JSON_CONDITION_GOTO_STR_P2();
  292. }
  // Three byte UTF-8 form: 1110xxxx 10xxxxxx 10xxxxxx.
  293. <STR_P2_UTF,STR_P2_BIN>UTF16_3 {
  294. int utf16 = php_json_ucs2_to_int(s, 4);
  295. PHP_JSON_SCANNER_COPY_UTF();
  296. *(s->pstr++) = (char) (0xe0 | (utf16 >> 12));
  297. *(s->pstr++) = (char) (0x80 | ((utf16 >> 6) & 0x3f));
  298. *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f));
  299. s->str_start = s->cursor;
  300. PHP_JSON_CONDITION_GOTO_STR_P2();
  301. }
  // Surrogate pair, written as four UTF-8 bytes. Mind the variable names:
  // utf16_hi is read from the digits that end at cursor - 1, which is the
  // second escape of the pair, and utf16_lo from the digits that end at
  // cursor - 7, which is the first escape. The first escape supplies the upper
  // ten bits of the code point, the second the lower ten bits.
  302. <STR_P2_UTF,STR_P2_BIN>UTF16_4 {
  303. int utf32, utf16_hi, utf16_lo;
  304. utf16_hi = php_json_ucs2_to_int(s, 4);
  305. utf16_lo = php_json_ucs2_to_int_ex(s, 4, 7);
  306. utf32 = ((utf16_lo & 0x3FF) << 10) + (utf16_hi & 0x3FF) + 0x10000;
  307. PHP_JSON_SCANNER_COPY_UTF_SP();
  308. *(s->pstr++) = (char) (0xf0 | (utf32 >> 18));
  309. *(s->pstr++) = (char) (0x80 | ((utf32 >> 12) & 0x3f));
  310. *(s->pstr++) = (char) (0x80 | ((utf32 >> 6) & 0x3f));
  311. *(s->pstr++) = (char) (0x80 | (utf32 & 0x3f));
  312. s->str_start = s->cursor;
  313. PHP_JSON_CONDITION_GOTO_STR_P2();
  314. }
  // Simple escape. Only the backslash has been matched, so the cursor points at
  // the escape letter: it is translated here and consumed with ++YYCURSOR.
  // NOTE(review): the default branch looks unreachable because pass one
  // rejects a backslash that starts no valid escape - confirm.
  315. <STR_P2_UTF,STR_P2_BIN>ESCPREF {
  316. char esc;
  317. PHP_JSON_SCANNER_COPY_ESC();
  318. switch (*s->cursor) {
  319. case 'b':
  320. esc = '\b';
  321. break;
  322. case 'f':
  323. esc = '\f'; break;
  324. case 'n':
  325. esc = '\n';
  326. break;
  327. case 'r':
  328. esc = '\r';
  329. break;
  330. case 't':
  331. esc = '\t';
  332. break;
  333. case '\\':
  334. case '/':
  335. case '"':
  336. esc = *s->cursor;
  337. break;
  338. default:
  339. s->errcode = PHP_JSON_ERROR_SYNTAX;
  340. return PHP_JSON_T_ERROR;
  341. }
  342. *(s->pstr++) = esc;
  343. ++YYCURSOR;
  344. s->str_start = s->cursor;
  345. PHP_JSON_CONDITION_GOTO_STR_P2();
  346. }
  // Closing quote: flush the last literal run, switch the condition back to JS
  // and return. s->value already refers to the string allocated in pass one.
  347. <STR_P2_UTF,STR_P2_BIN>["] => JS {
  348. PHP_JSON_SCANNER_COPY_ESC();
  349. return PHP_JSON_T_STRING;
  350. }
  // STR_P2_BIN only: well-formed UTF-8 stays in the pending literal run.
  351. <STR_P2_BIN>UTF8 { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); }
  // STR_P2_BIN only: a byte that is not well-formed UTF-8. The literal run
  // before it is flushed and the byte itself is skipped; with the SUBSTITUTE
  // option the three bytes EF BF BD (code point FFFD) are written in its place.
  352. <STR_P2_BIN>ANY {
  353. if (s->utf8_invalid) {
  354. PHP_JSON_SCANNER_COPY_ESC();
  355. if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
  356. *(s->pstr++) = (char) (0xe0 | (0xfffd >> 12));
  357. *(s->pstr++) = (char) (0x80 | ((0xfffd >> 6) & 0x3f));
  358. *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f));
  359. }
  360. s->str_start = s->cursor;
  361. }
  362. PHP_JSON_CONDITION_GOTO(STR_P2_BIN);
  363. }
  // STR_P2_UTF only: every other byte stays in the pending literal run.
  364. <STR_P2_UTF>ANY { PHP_JSON_CONDITION_GOTO(STR_P2_UTF); }
  // Catch-all for every condition.
  // NOTE(review): each condition above already has its own ANY rule, so this
  // looks like a safety net only - confirm.
  365. <*>ANY {
  366. s->errcode = PHP_JSON_ERROR_SYNTAX;
  367. return PHP_JSON_T_ERROR;
  368. }
  369. */
  370. }