iso8859_1.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. /**********************************************************************
  2. iso8859_1.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include "regenc.h"
  /*
   * Evaluates to 1 when the EncISO_8859_1_CtypeTable entry for `code` has the
   * bit selected by CTYPE_TO_BIT(ctype) set, and to 0 otherwise.
   * `code` indexes the 256-entry table directly, so the caller is responsible
   * for keeping it below 256 (is_code_ctype() in this file does that check).
   */
  #define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
    (0 != (CTYPE_TO_BIT(ctype) & EncISO_8859_1_CtypeTable[(code)]))
  /*
   * Character-type bit masks, one entry per byte value 0x00..0xff.
   * Entries are tested through ENC_IS_ISO_8859_1_CTYPE(), which masks an entry
   * with CTYPE_TO_BIT(ctype); the meaning of each bit is defined by that macro
   * (declared outside this file).
   * The comment at the start of each row is the index of the row's first entry.
   */
  static const unsigned short EncISO_8859_1_CtypeTable[256] = {
    /* 0x00 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
    /* 0x08 */ 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
    /* 0x10 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
    /* 0x18 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
    /* 0x20 */ 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
    /* 0x28 */ 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
    /* 0x30 */ 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
    /* 0x38 */ 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
    /* 0x40 */ 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
    /* 0x48 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
    /* 0x50 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
    /* 0x58 */ 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
    /* 0x60 */ 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
    /* 0x68 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
    /* 0x70 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
    /* 0x78 */ 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
    /* 0x80 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
    /* 0x88 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
    /* 0x90 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
    /* 0x98 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
    /* 0xa0 */ 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
    /* 0xa8 */ 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0,
    /* 0xb0 */ 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
    /* 0xb8 */ 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
    /* 0xc0 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
    /* 0xc8 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
    /* 0xd0 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
    /* 0xd8 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
    /* 0xe0 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
    /* 0xe8 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
    /* 0xf0 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
    /* 0xf8 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
  };
  66. static const OnigPairCaseFoldCodes CaseFoldMap[] = {
  67. { 0xc0, 0xe0 },
  68. { 0xc1, 0xe1 },
  69. { 0xc2, 0xe2 },
  70. { 0xc3, 0xe3 },
  71. { 0xc4, 0xe4 },
  72. { 0xc5, 0xe5 },
  73. { 0xc6, 0xe6 },
  74. { 0xc7, 0xe7 },
  75. { 0xc8, 0xe8 },
  76. { 0xc9, 0xe9 },
  77. { 0xca, 0xea },
  78. { 0xcb, 0xeb },
  79. { 0xcc, 0xec },
  80. { 0xcd, 0xed },
  81. { 0xce, 0xee },
  82. { 0xcf, 0xef },
  83. { 0xd0, 0xf0 },
  84. { 0xd1, 0xf1 },
  85. { 0xd2, 0xf2 },
  86. { 0xd3, 0xf3 },
  87. { 0xd4, 0xf4 },
  88. { 0xd5, 0xf5 },
  89. { 0xd6, 0xf6 },
  90. { 0xd8, 0xf8 },
  91. { 0xd9, 0xf9 },
  92. { 0xda, 0xfa },
  93. { 0xdb, 0xfb },
  94. { 0xdc, 0xfc },
  95. { 0xdd, 0xfd },
  96. { 0xde, 0xfe }
  97. };
  98. static int
  99. apply_all_case_fold(OnigCaseFoldType flag,
  100. OnigApplyAllCaseFoldFunc f, void* arg)
  101. {
  102. return onigenc_apply_all_case_fold_with_map(
  103. sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 1,
  104. flag, f, arg);
  105. }
  106. static int
  107. get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
  108. const OnigUChar* p, const OnigUChar* end,
  109. OnigCaseFoldCodeItem items[])
  110. {
  111. if (0x41 <= *p && *p <= 0x5a) {
  112. items[0].byte_len = 1;
  113. items[0].code_len = 1;
  114. items[0].code[0] = (OnigCodePoint )(*p + 0x20);
  115. if (*p == 0x53 && end > p + 1
  116. && (*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */
  117. items[1].byte_len = 2;
  118. items[1].code_len = 1;
  119. items[1].code[0] = (OnigCodePoint )0xdf;
  120. return 2;
  121. }
  122. else
  123. return 1;
  124. }
  125. else if (0x61 <= *p && *p <= 0x7a) {
  126. items[0].byte_len = 1;
  127. items[0].code_len = 1;
  128. items[0].code[0] = (OnigCodePoint )(*p - 0x20);
  129. if (*p == 0x73 && end > p + 1
  130. && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */
  131. items[1].byte_len = 2;
  132. items[1].code_len = 1;
  133. items[1].code[0] = (OnigCodePoint )0xdf;
  134. return 2;
  135. }
  136. else
  137. return 1;
  138. }
  139. else if (0xc0 <= *p && *p <= 0xcf) {
  140. items[0].byte_len = 1;
  141. items[0].code_len = 1;
  142. items[0].code[0] = (OnigCodePoint )(*p + 0x20);
  143. return 1;
  144. }
  145. else if (0xd0 <= *p && *p <= 0xdf) {
  146. if (*p == 0xdf) {
  147. items[0].byte_len = 1;
  148. items[0].code_len = 2;
  149. items[0].code[0] = (OnigCodePoint )'s';
  150. items[0].code[1] = (OnigCodePoint )'s';
  151. items[1].byte_len = 1;
  152. items[1].code_len = 2;
  153. items[1].code[0] = (OnigCodePoint )'S';
  154. items[1].code[1] = (OnigCodePoint )'S';
  155. items[2].byte_len = 1;
  156. items[2].code_len = 2;
  157. items[2].code[0] = (OnigCodePoint )'s';
  158. items[2].code[1] = (OnigCodePoint )'S';
  159. items[3].byte_len = 1;
  160. items[3].code_len = 2;
  161. items[3].code[0] = (OnigCodePoint )'S';
  162. items[3].code[1] = (OnigCodePoint )'s';
  163. return 4;
  164. }
  165. else if (*p != 0xd7) {
  166. items[0].byte_len = 1;
  167. items[0].code_len = 1;
  168. items[0].code[0] = (OnigCodePoint )(*p + 0x20);
  169. return 1;
  170. }
  171. }
  172. else if (0xe0 <= *p && *p <= 0xef) {
  173. items[0].byte_len = 1;
  174. items[0].code_len = 1;
  175. items[0].code[0] = (OnigCodePoint )(*p - 0x20);
  176. return 1;
  177. }
  178. else if (0xf0 <= *p && *p <= 0xfe) {
  179. if (*p != 0xf7) {
  180. items[0].byte_len = 1;
  181. items[0].code_len = 1;
  182. items[0].code[0] = (OnigCodePoint )(*p - 0x20);
  183. return 1;
  184. }
  185. }
  186. return 0;
  187. }
  188. static int
  189. mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
  190. const UChar* end ARG_UNUSED, UChar* lower)
  191. {
  192. const UChar* p = *pp;
  193. if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
  194. *lower++ = 's';
  195. *lower = 's';
  196. (*pp)++;
  197. return 2;
  198. }
  199. *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
  200. (*pp)++;
  201. return 1;
  202. }
  #if 0
  /*
   * (Disabled code.)  Reports whether the byte at *pp has a case counterpart,
   * and advances *pp by one byte in every case.
   *
   * flag : when INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is set, 0xdf is reported
   *        as ambiguous.
   * pp   : in/out cursor; always moved forward by one byte.
   * end  : not used by this function.
   *
   * Returns TRUE or FALSE.
   *
   * Fix: the lower-case test used `(v | BIT_CTYPE_LOWER) != 0`, which is true
   * for every byte, so non-letters were reported as ambiguous and the final
   * return was unreachable.  The test now masks with `&`.
   */
  static int
  is_mbc_ambiguous(OnigCaseFoldType flag,
                   const UChar** pp, const UChar* end)
  {
    int v;
    const UChar* p = *pp;

    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      (*pp)++;
      return TRUE;
    }

    (*pp)++;
    v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
    if ((v & BIT_CTYPE_LOWER) != 0) {
      /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      /* NOTE(review): the range below does not cover 0xdf, so without the
         multi-char flag 0xdf still yields TRUE -- confirm before re-enabling. */
      if (*p >= 0xaa && *p <= 0xba)
        return FALSE;
      else
        return TRUE;
    }

    /* Not lower case: ambiguous only if the upper-case bit is set. */
    return (v != 0 ? TRUE : FALSE);
  }
  #endif
  226. static int
  227. is_code_ctype(OnigCodePoint code, unsigned int ctype)
  228. {
  229. if (code < 256)
  230. return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
  231. else
  232. return FALSE;
  233. }
  234. OnigEncodingType OnigEncodingISO_8859_1 = {
  235. onigenc_single_byte_mbc_enc_len,
  236. "ISO-8859-1", /* name */
  237. 1, /* max enc length */
  238. 1, /* min enc length */
  239. onigenc_is_mbc_newline_0x0a,
  240. onigenc_single_byte_mbc_to_code,
  241. onigenc_single_byte_code_to_mbclen,
  242. onigenc_single_byte_code_to_mbc,
  243. mbc_case_fold,
  244. apply_all_case_fold,
  245. get_case_fold_codes_by_str,
  246. onigenc_minimum_property_name_to_ctype,
  247. is_code_ctype,
  248. onigenc_not_support_get_ctype_code_range,
  249. onigenc_single_byte_left_adjust_char_head,
  250. onigenc_always_true_is_allowed_reverse_match
  251. };