utf8.c 8.1 KB


  1. /**********************************************************************
  2. utf8.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include "regenc.h"
  /* Switch for the "invalid code" scheme: when defined, the bytes 0xfe and
     0xff are mapped to the dedicated code point values below by mbc_to_code,
     code_to_mbclen and code_to_mbc in this file. */
  30. #define USE_INVALID_CODE_SCHEME
  31. #ifdef USE_INVALID_CODE_SCHEME
  32. /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
  33. #define INVALID_CODE_FE 0xfffffffe
  34. #define INVALID_CODE_FF 0xffffffff
  /* NOTE(review): not referenced in the visible part of this file; presumably
     the largest code point the 6-byte form can carry -- confirm. */
  35. #define VALID_CODE_LIMIT 0x7fffffff
  36. #endif
  /* True when the top two bits of byte c are not "10", i.e. c is not of the
     form 10xxxxxx. left_adjust_char_head uses it to stop on a byte that is not
     a trailing byte. */
  37. #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80)
  /* Byte length of a character, indexed by the value of its first byte
     (256 entries). mbc_enc_len returns an entry of this table directly.
     Bytes of the form 10xxxxxx (0x80-0xbf) and the bytes 0xfe/0xff get
     length 1. */
  38. static const int EncLen_UTF8[] = {
  39. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0f */
  40. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1f */
  41. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2f */
  42. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3f */
  43. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4f */
  44. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5f */
  45. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6f */
  46. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7f */
  47. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
  48. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9f */
  49. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xa0-0xaf */
  50. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xb0-0xbf */
  51. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0-0xcf */
  52. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0-0xdf */
  53. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xe0-0xef */
  54. 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 /* 0xf0-0xf7: 4, 0xf8-0xfb: 5, 0xfc-0xfd: 6, 0xfe-0xff: 1 */
  55. };
  56. static int
  57. mbc_enc_len(const UChar* p)
  58. {
  59. return EncLen_UTF8[*p];
  60. }
  61. static int
  62. is_mbc_newline(const UChar* p, const UChar* end)
  63. {
  64. if (p < end) {
  65. if (*p == 0x0a) return 1;
  66. #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  67. #ifndef USE_CRNL_AS_LINE_TERMINATOR
  68. if (*p == 0x0d) return 1;
  69. #endif
  70. if (p + 1 < end) {
  71. if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
  72. return 1;
  73. if (p + 2 < end) {
  74. if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
  75. && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
  76. return 1;
  77. }
  78. }
  79. #endif
  80. }
  81. return 0;
  82. }
  83. static OnigCodePoint
  84. mbc_to_code(const UChar* p, const UChar* end)
  85. {
  86. int c, len;
  87. OnigCodePoint n;
  88. len = mbc_enc_len(p);
  89. if (len > end - p) len = end - p;
  90. c = *p++;
  91. if (len > 1) {
  92. len--;
  93. n = c & ((1 << (6 - len)) - 1);
  94. while (len--) {
  95. c = *p++;
  96. n = (n << 6) | (c & ((1 << 6) - 1));
  97. }
  98. return n;
  99. }
  100. else {
  101. #ifdef USE_INVALID_CODE_SCHEME
  102. if (c > 0xfd) {
  103. return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
  104. }
  105. #endif
  106. return (OnigCodePoint )c;
  107. }
  108. }
  109. static int
  110. code_to_mbclen(OnigCodePoint code)
  111. {
  112. if ((code & 0xffffff80) == 0) return 1;
  113. else if ((code & 0xfffff800) == 0) return 2;
  114. else if ((code & 0xffff0000) == 0) return 3;
  115. else if ((code & 0xffe00000) == 0) return 4;
  116. else if ((code & 0xfc000000) == 0) return 5;
  117. else if ((code & 0x80000000) == 0) return 6;
  118. #ifdef USE_INVALID_CODE_SCHEME
  119. else if (code == INVALID_CODE_FE) return 1;
  120. else if (code == INVALID_CODE_FF) return 1;
  121. #endif
  122. else
  123. return ONIGERR_INVALID_CODE_POINT_VALUE;
  124. }
  125. static int
  126. code_to_mbc(OnigCodePoint code, UChar *buf)
  127. {
  128. #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
  129. #define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80)
  130. if ((code & 0xffffff80) == 0) {
  131. *buf = (UChar )code;
  132. return 1;
  133. }
  134. else {
  135. UChar *p = buf;
  136. if ((code & 0xfffff800) == 0) {
  137. *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
  138. }
  139. else if ((code & 0xffff0000) == 0) {
  140. *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
  141. *p++ = UTF8_TRAILS(code, 6);
  142. }
  143. else if ((code & 0xffe00000) == 0) {
  144. *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
  145. *p++ = UTF8_TRAILS(code, 12);
  146. *p++ = UTF8_TRAILS(code, 6);
  147. }
  148. else if ((code & 0xfc000000) == 0) {
  149. *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
  150. *p++ = UTF8_TRAILS(code, 18);
  151. *p++ = UTF8_TRAILS(code, 12);
  152. *p++ = UTF8_TRAILS(code, 6);
  153. }
  154. else if ((code & 0x80000000) == 0) {
  155. *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
  156. *p++ = UTF8_TRAILS(code, 24);
  157. *p++ = UTF8_TRAILS(code, 18);
  158. *p++ = UTF8_TRAILS(code, 12);
  159. *p++ = UTF8_TRAILS(code, 6);
  160. }
  161. #ifdef USE_INVALID_CODE_SCHEME
  162. else if (code == INVALID_CODE_FE) {
  163. *p = 0xfe;
  164. return 1;
  165. }
  166. else if (code == INVALID_CODE_FF) {
  167. *p = 0xff;
  168. return 1;
  169. }
  170. #endif
  171. else {
  172. return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
  173. }
  174. *p++ = UTF8_TRAIL0(code);
  175. return p - buf;
  176. }
  177. }
  178. static int
  179. mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
  180. const UChar* end, UChar* fold)
  181. {
  182. const UChar* p = *pp;
  183. if (ONIGENC_IS_MBC_ASCII(p)) {
  184. #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  185. if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
  186. if (*p == 0x49) {
  187. *fold++ = 0xc4;
  188. *fold = 0xb1;
  189. (*pp)++;
  190. return 2;
  191. }
  192. }
  193. #endif
  194. *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
  195. (*pp)++;
  196. return 1; /* return byte length of converted char to lower */
  197. }
  198. else {
  199. return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
  200. pp, end, fold);
  201. }
  202. }
  /* Everything up to the matching #endif is excluded from compilation. */
  203. #if 0
  /* Reports whether the character at *pp is case-ambiguous and advances *pp
     past it: by one byte for an ASCII character, otherwise by
     enclen(ONIG_ENCODING_UTF8, p). For non-ASCII input only two-byte
     sequences with lead byte 0xc3 can yield TRUE. */
  204. static int
  205. is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
  206. {
  207. const UChar* p = *pp;
  208. if (ONIGENC_IS_MBC_ASCII(p)) {
  /* p still points at the character; only the caller's pointer moves. */
  209. (*pp)++;
  210. return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
  211. }
  212. else {
  213. (*pp) += enclen(ONIG_ENCODING_UTF8, p);
  214. if (*p == 0xc3) {
  /* c is the second byte of the sequence. */
  215. int c = *(p + 1);
  216. if (c >= 0x80) {
  217. if (c <= (UChar )0x9e) { /* upper */
  /* c3 97 is excluded from the upper-case range. */
  218. if (c == (UChar )0x97) return FALSE;
  219. return TRUE;
  220. }
  221. else if (c >= (UChar )0xa0 && c <= (UChar )0xbe) { /* lower */
  /* '\267' is octal for 0xb7; c3 b7 is excluded from the lower-case range. */
  222. if (c == (UChar )'\267') return FALSE;
  223. return TRUE;
  224. }
  /* c3 9f counts only when multi-character folding is requested. */
  225. else if (c == (UChar )0x9f &&
  226. (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
  227. return TRUE;
  228. }
  229. }
  230. }
  231. }
  232. return FALSE;
  233. }
  234. #endif
  235. static int
  236. get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
  237. const OnigCodePoint* ranges[])
  238. {
  239. *sb_out = 0x80;
  240. return onigenc_unicode_ctype_code_range(ctype, ranges);
  241. }
  242. static UChar*
  243. left_adjust_char_head(const UChar* start, const UChar* s)
  244. {
  245. const UChar *p;
  246. if (s <= start) return (UChar* )s;
  247. p = s;
  248. while (!utf8_islead(*p) && p > start) p--;
  249. return (UChar* )p;
  250. }
  251. static int
  252. get_case_fold_codes_by_str(OnigCaseFoldType flag,
  253. const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
  254. {
  255. return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
  256. flag, p, end, items);
  257. }
  /* Encoding descriptor for UTF-8. It is the only definition in this file
     without static linkage. The initializer is positional: it combines the
     static functions defined above with shared onigenc_unicode_* and
     onigenc_* helpers.
     NOTE(review): the entry order has to match the member order of
     OnigEncodingType, which is declared outside this file -- confirm when
     adding or moving entries. */
  258. OnigEncodingType OnigEncodingUTF8 = {
  259. mbc_enc_len,
  260. "UTF-8", /* name */
  261. 6, /* max byte length */
  262. 1, /* min byte length */
  263. is_mbc_newline,
  264. mbc_to_code,
  265. code_to_mbclen,
  266. code_to_mbc,
  267. mbc_case_fold,
  268. onigenc_unicode_apply_all_case_fold,
  269. get_case_fold_codes_by_str,
  270. onigenc_unicode_property_name_to_ctype,
  271. onigenc_unicode_is_code_ctype,
  272. get_ctype_code_range,
  273. left_adjust_char_head,
  274. onigenc_always_true_is_allowed_reverse_match
  275. };