gb18030.c 12 KB


/**********************************************************************
  gb18030.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2005-2007  KUBO Takehiro <kubo AT jiubao DOT org>
 *                          K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
  30. #include "regenc.h"
  31. #if 1
  32. #define DEBUG_GB18030(arg)
  33. #else
  34. #define DEBUG_GB18030(arg) printf arg
  35. #endif
  36. enum {
  37. C1, /* one-byte char */
  38. C2, /* one-byte or second of two-byte char */
  39. C4, /* one-byte or second or fourth of four-byte char */
  40. CM /* first of two- or four-byte char or second of two-byte char */
  41. };
  42. static const char GB18030_MAP[] = {
  43. C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
  44. C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
  45. C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
  46. C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
  47. C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
  48. C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
  49. C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
  50. C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
  51. C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  52. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  53. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  54. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  55. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  56. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  57. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  58. CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
  59. };
  60. static int
  61. gb18030_mbc_enc_len(const UChar* p)
  62. {
  63. if (GB18030_MAP[*p] != CM)
  64. return 1;
  65. p++;
  66. if (GB18030_MAP[*p] == C4)
  67. return 4;
  68. if (GB18030_MAP[*p] == C1)
  69. return 1; /* illegal sequence */
  70. return 2;
  71. }
  72. static OnigCodePoint
  73. gb18030_mbc_to_code(const UChar* p, const UChar* end)
  74. {
  75. return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
  76. }
  77. static int
  78. gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
  79. {
  80. return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
  81. }
  82. static int
  83. gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
  84. UChar* lower)
  85. {
  86. return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
  87. pp, end, lower);
  88. }
  89. #if 0
  90. static int
  91. gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
  92. const UChar** pp, const UChar* end)
  93. {
  94. return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
  95. }
  96. #endif
  97. static int
  98. gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
  99. {
  100. return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
  101. }
  102. enum state {
  103. S_START,
  104. S_one_C2,
  105. S_one_C4,
  106. S_one_CM,
  107. S_odd_CM_one_CX,
  108. S_even_CM_one_CX,
  109. /* CMC4 : pair of "CM C4" */
  110. S_one_CMC4,
  111. S_odd_CMC4,
  112. S_one_C4_odd_CMC4,
  113. S_even_CMC4,
  114. S_one_C4_even_CMC4,
  115. S_odd_CM_odd_CMC4,
  116. S_even_CM_odd_CMC4,
  117. S_odd_CM_even_CMC4,
  118. S_even_CM_even_CMC4,
  119. /* C4CM : pair of "C4 CM" */
  120. S_odd_C4CM,
  121. S_one_CM_odd_C4CM,
  122. S_even_C4CM,
  123. S_one_CM_even_C4CM,
  124. S_even_CM_odd_C4CM,
  125. S_odd_CM_odd_C4CM,
  126. S_even_CM_even_C4CM,
  127. S_odd_CM_even_C4CM,
  128. };
  129. static UChar*
  130. gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
  131. {
  132. const UChar *p;
  133. enum state state = S_START;
  134. DEBUG_GB18030(("----------------\n"));
  135. for (p = s; p >= start; p--) {
  136. DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
  137. switch (state) {
  138. case S_START:
  139. switch (GB18030_MAP[*p]) {
  140. case C1:
  141. return (UChar *)s;
  142. case C2:
  143. state = S_one_C2; /* C2 */
  144. break;
  145. case C4:
  146. state = S_one_C4; /* C4 */
  147. break;
  148. case CM:
  149. state = S_one_CM; /* CM */
  150. break;
  151. }
  152. break;
  153. case S_one_C2: /* C2 */
  154. switch (GB18030_MAP[*p]) {
  155. case C1:
  156. case C2:
  157. case C4:
  158. return (UChar *)s;
  159. case CM:
  160. state = S_odd_CM_one_CX; /* CM C2 */
  161. break;
  162. }
  163. break;
  164. case S_one_C4: /* C4 */
  165. switch (GB18030_MAP[*p]) {
  166. case C1:
  167. case C2:
  168. case C4:
  169. return (UChar *)s;
  170. case CM:
  171. state = S_one_CMC4;
  172. break;
  173. }
  174. break;
  175. case S_one_CM: /* CM */
  176. switch (GB18030_MAP[*p]) {
  177. case C1:
  178. case C2:
  179. return (UChar *)s;
  180. case C4:
  181. state = S_odd_C4CM;
  182. break;
  183. case CM:
  184. state = S_odd_CM_one_CX; /* CM CM */
  185. break;
  186. }
  187. break;
  188. case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
  189. switch (GB18030_MAP[*p]) {
  190. case C1:
  191. case C2:
  192. case C4:
  193. return (UChar *)(s - 1);
  194. case CM:
  195. state = S_even_CM_one_CX;
  196. break;
  197. }
  198. break;
  199. case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
  200. switch (GB18030_MAP[*p]) {
  201. case C1:
  202. case C2:
  203. case C4:
  204. return (UChar *)s;
  205. case CM:
  206. state = S_odd_CM_one_CX;
  207. break;
  208. }
  209. break;
  210. case S_one_CMC4: /* CM C4 */
  211. switch (GB18030_MAP[*p]) {
  212. case C1:
  213. case C2:
  214. return (UChar *)(s - 1);
  215. case C4:
  216. state = S_one_C4_odd_CMC4; /* C4 CM C4 */
  217. break;
  218. case CM:
  219. state = S_even_CM_one_CX; /* CM CM C4 */
  220. break;
  221. }
  222. break;
  223. case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
  224. switch (GB18030_MAP[*p]) {
  225. case C1:
  226. case C2:
  227. return (UChar *)(s - 1);
  228. case C4:
  229. state = S_one_C4_odd_CMC4;
  230. break;
  231. case CM:
  232. state = S_odd_CM_odd_CMC4;
  233. break;
  234. }
  235. break;
  236. case S_one_C4_odd_CMC4: /* C4 CM C4 */
  237. switch (GB18030_MAP[*p]) {
  238. case C1:
  239. case C2:
  240. case C4:
  241. return (UChar *)(s - 1);
  242. case CM:
  243. state = S_even_CMC4; /* CM C4 CM C4 */
  244. break;
  245. }
  246. break;
  247. case S_even_CMC4: /* CM C4 CM C4 */
  248. switch (GB18030_MAP[*p]) {
  249. case C1:
  250. case C2:
  251. return (UChar *)(s - 3);
  252. case C4:
  253. state = S_one_C4_even_CMC4;
  254. break;
  255. case CM:
  256. state = S_odd_CM_even_CMC4;
  257. break;
  258. }
  259. break;
  260. case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
  261. switch (GB18030_MAP[*p]) {
  262. case C1:
  263. case C2:
  264. case C4:
  265. return (UChar *)(s - 3);
  266. case CM:
  267. state = S_odd_CMC4;
  268. break;
  269. }
  270. break;
  271. case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
  272. switch (GB18030_MAP[*p]) {
  273. case C1:
  274. case C2:
  275. case C4:
  276. return (UChar *)(s - 3);
  277. case CM:
  278. state = S_even_CM_odd_CMC4;
  279. break;
  280. }
  281. break;
  282. case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
  283. switch (GB18030_MAP[*p]) {
  284. case C1:
  285. case C2:
  286. case C4:
  287. return (UChar *)(s - 1);
  288. case CM:
  289. state = S_odd_CM_odd_CMC4;
  290. break;
  291. }
  292. break;
  293. case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
  294. switch (GB18030_MAP[*p]) {
  295. case C1:
  296. case C2:
  297. case C4:
  298. return (UChar *)(s - 1);
  299. case CM:
  300. state = S_even_CM_even_CMC4;
  301. break;
  302. }
  303. break;
  304. case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
  305. switch (GB18030_MAP[*p]) {
  306. case C1:
  307. case C2:
  308. case C4:
  309. return (UChar *)(s - 3);
  310. case CM:
  311. state = S_odd_CM_even_CMC4;
  312. break;
  313. }
  314. break;
  315. case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
  316. switch (GB18030_MAP[*p]) {
  317. case C1:
  318. case C2:
  319. case C4:
  320. return (UChar *)s;
  321. case CM:
  322. state = S_one_CM_odd_C4CM; /* CM C4 CM */
  323. break;
  324. }
  325. break;
  326. case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
  327. switch (GB18030_MAP[*p]) {
  328. case C1:
  329. case C2:
  330. return (UChar *)(s - 2); /* |CM C4 CM */
  331. case C4:
  332. state = S_even_C4CM;
  333. break;
  334. case CM:
  335. state = S_even_CM_odd_C4CM;
  336. break;
  337. }
  338. break;
  339. case S_even_C4CM: /* C4 CM C4 CM */
  340. switch (GB18030_MAP[*p]) {
  341. case C1:
  342. case C2:
  343. case C4:
  344. return (UChar *)(s - 2); /* C4|CM C4 CM */
  345. case CM:
  346. state = S_one_CM_even_C4CM;
  347. break;
  348. }
  349. break;
  350. case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
  351. switch (GB18030_MAP[*p]) {
  352. case C1:
  353. case C2:
  354. return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
  355. case C4:
  356. state = S_odd_C4CM;
  357. break;
  358. case CM:
  359. state = S_even_CM_even_C4CM;
  360. break;
  361. }
  362. break;
  363. case S_even_CM_odd_C4CM: /* CM CM C4 CM */
  364. switch (GB18030_MAP[*p]) {
  365. case C1:
  366. case C2:
  367. case C4:
  368. return (UChar *)(s - 0); /* |CM CM|C4|CM */
  369. case CM:
  370. state = S_odd_CM_odd_C4CM;
  371. break;
  372. }
  373. break;
  374. case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
  375. switch (GB18030_MAP[*p]) {
  376. case C1:
  377. case C2:
  378. case C4:
  379. return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
  380. case CM:
  381. state = S_even_CM_odd_C4CM;
  382. break;
  383. }
  384. break;
  385. case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
  386. switch (GB18030_MAP[*p]) {
  387. case C1:
  388. case C2:
  389. case C4:
  390. return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
  391. case CM:
  392. state = S_odd_CM_even_C4CM;
  393. break;
  394. }
  395. break;
  396. case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
  397. switch (GB18030_MAP[*p]) {
  398. case C1:
  399. case C2:
  400. case C4:
  401. return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
  402. case CM:
  403. state = S_even_CM_even_C4CM;
  404. break;
  405. }
  406. break;
  407. }
  408. }
  409. DEBUG_GB18030(("state %d\n", state));
  410. switch (state) {
  411. case S_START: return (UChar *)(s - 0);
  412. case S_one_C2: return (UChar *)(s - 0);
  413. case S_one_C4: return (UChar *)(s - 0);
  414. case S_one_CM: return (UChar *)(s - 0);
  415. case S_odd_CM_one_CX: return (UChar *)(s - 1);
  416. case S_even_CM_one_CX: return (UChar *)(s - 0);
  417. case S_one_CMC4: return (UChar *)(s - 1);
  418. case S_odd_CMC4: return (UChar *)(s - 1);
  419. case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
  420. case S_even_CMC4: return (UChar *)(s - 3);
  421. case S_one_C4_even_CMC4: return (UChar *)(s - 3);
  422. case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
  423. case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
  424. case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
  425. case S_even_CM_even_CMC4: return (UChar *)(s - 3);
  426. case S_odd_C4CM: return (UChar *)(s - 0);
  427. case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
  428. case S_even_C4CM: return (UChar *)(s - 2);
  429. case S_one_CM_even_C4CM: return (UChar *)(s - 0);
  430. case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
  431. case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
  432. case S_even_CM_even_C4CM: return (UChar *)(s - 2);
  433. case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
  434. }
  435. return (UChar* )s; /* never come here. (escape warning) */
  436. }
  437. static int
  438. gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
  439. {
  440. return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
  441. }
  442. OnigEncodingType OnigEncodingGB18030 = {
  443. gb18030_mbc_enc_len,
  444. "GB18030", /* name */
  445. 4, /* max enc length */
  446. 1, /* min enc length */
  447. onigenc_is_mbc_newline_0x0a,
  448. gb18030_mbc_to_code,
  449. onigenc_mb4_code_to_mbclen,
  450. gb18030_code_to_mbc,
  451. gb18030_mbc_case_fold,
  452. onigenc_ascii_apply_all_case_fold,
  453. onigenc_ascii_get_case_fold_codes_by_str,
  454. onigenc_minimum_property_name_to_ctype,
  455. gb18030_is_code_ctype,
  456. onigenc_not_support_get_ctype_code_range,
  457. gb18030_left_adjust_char_head,
  458. gb18030_is_allowed_reverse_match
  459. };