utf16_le.c 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. /**********************************************************************
  2. utf16_le.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include "regenc.h"
  /*
   * Encoded byte length of a UTF-16 character, looked up by one byte of its
   * first 16-bit code unit (utf16le_mbc_enc_len indexes it with the byte at
   * offset 1).  Entries 0xd8..0xdb are 4; every other entry is 2.
   */
  static const int EncLen_UTF16[] = {
    /* 0x00 - 0x0f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x10 - 0x1f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x20 - 0x2f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x30 - 0x3f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x40 - 0x4f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x50 - 0x5f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x60 - 0x6f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x70 - 0x7f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x80 - 0x8f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x90 - 0x9f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xa0 - 0xaf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 - 0xbf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xf0 - 0xff */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  };
  48. static int
  49. utf16le_code_to_mbclen(OnigCodePoint code)
  50. {
  51. return (code > 0xffff ? 4 : 2);
  52. }
  53. static int
  54. utf16le_mbc_enc_len(const UChar* p)
  55. {
  56. return EncLen_UTF16[*(p+1)];
  57. }
  58. static int
  59. utf16le_is_mbc_newline(const UChar* p, const UChar* end)
  60. {
  61. if (p + 1 < end) {
  62. if (*p == 0x0a && *(p+1) == 0x00)
  63. return 1;
  64. #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  65. if ((
  66. #ifndef USE_CRNL_AS_LINE_TERMINATOR
  67. *p == 0x0d ||
  68. #endif
  69. *p == 0x85) && *(p+1) == 0x00)
  70. return 1;
  71. if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
  72. return 1;
  73. #endif
  74. }
  75. return 0;
  76. }
  77. static OnigCodePoint
  78. utf16le_mbc_to_code(const UChar* p, const UChar* end)
  79. {
  80. OnigCodePoint code;
  81. UChar c0 = *p;
  82. UChar c1 = *(p+1);
  83. if (UTF16_IS_SURROGATE_FIRST(c1)) {
  84. if (end - p < 4) return 0;
  85. code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16)
  86. + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8)
  87. + p[2];
  88. }
  89. else {
  90. code = c1 * 256 + p[0];
  91. }
  92. return code;
  93. }
  94. static int
  95. utf16le_code_to_mbc(OnigCodePoint code, UChar *buf)
  96. {
  97. UChar* p = buf;
  98. if (code > 0xffff) {
  99. unsigned int plane, high;
  100. plane = (code >> 16) - 1;
  101. high = (code & 0xff00) >> 8;
  102. *p++ = ((plane & 0x03) << 6) + (high >> 2);
  103. *p++ = (plane >> 2) + 0xd8;
  104. *p++ = (UChar )(code & 0xff);
  105. *p = (high & 0x03) + 0xdc;
  106. return 4;
  107. }
  108. else {
  109. *p++ = (UChar )(code & 0xff);
  110. *p++ = (UChar )((code & 0xff00) >> 8);
  111. return 2;
  112. }
  113. }
  114. static int
  115. utf16le_mbc_case_fold(OnigCaseFoldType flag,
  116. const UChar** pp, const UChar* end, UChar* fold)
  117. {
  118. const UChar* p = *pp;
  119. if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
  120. #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  121. if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
  122. if (*p == 0x49) {
  123. *fold++ = 0x31;
  124. *fold = 0x01;
  125. (*pp) += 2;
  126. return 2;
  127. }
  128. }
  129. #endif
  130. *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
  131. *fold = 0;
  132. *pp += 2;
  133. return 2;
  134. }
  135. else
  136. return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_LE, flag, pp, end,
  137. fold);
  138. }
  #if 0
  /*
   * Disabled code (excluded by "#if 0").
   *
   * Advances *pp past the character it points at and reports whether that
   * character is treated as case-ambiguous.  Only characters whose second
   * byte is 0 are examined; all others yield FALSE.
   */
  static int
  utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                           const UChar* end)
  {
    const UChar* head = *pp;
    int c, v;

    (*pp) += EncLen_UTF16[*(head+1)];

    if (*(head+1) != 0) {
      return FALSE;
    }

    if (*head == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *head;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                                  (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

    /* NOTE(review): "v | BIT_CTYPE_LOWER" is non-zero whenever
       BIT_CTYPE_LOWER is, which would make the final return unreachable;
       "&" may have been intended -- confirm before enabling this code. */
    if ((v | BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      return (c >= 0xaa && c <= 0xba) ? FALSE : TRUE;
    }

    return (v != 0 ? TRUE : FALSE);
  }
  #endif
  166. static UChar*
  167. utf16le_left_adjust_char_head(const UChar* start, const UChar* s)
  168. {
  169. if (s <= start) return (UChar* )s;
  170. if ((s - start) % 2 == 1) {
  171. s--;
  172. }
  173. if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
  174. s -= 2;
  175. return (UChar* )s;
  176. }
  177. static int
  178. utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
  179. const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
  180. {
  181. return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_LE,
  182. flag, p, end, items);
  183. }
  184. OnigEncodingType OnigEncodingUTF16_LE = {
  185. utf16le_mbc_enc_len,
  186. "UTF-16LE", /* name */
  187. 4, /* max byte length */
  188. 2, /* min byte length */
  189. utf16le_is_mbc_newline,
  190. utf16le_mbc_to_code,
  191. utf16le_code_to_mbclen,
  192. utf16le_code_to_mbc,
  193. utf16le_mbc_case_fold,
  194. onigenc_unicode_apply_all_case_fold,
  195. utf16le_get_case_fold_codes_by_str,
  196. onigenc_unicode_property_name_to_ctype,
  197. onigenc_unicode_is_code_ctype,
  198. onigenc_utf16_32_get_ctype_code_range,
  199. utf16le_left_adjust_char_head,
  200. onigenc_always_false_is_allowed_reverse_match
  201. };