utf16_be.c 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. /**********************************************************************
  2. utf16_be.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include "regenc.h"
  /*
   * Byte length of a UTF-16 character, indexed by a single byte value
   * (utf16be_mbc_enc_len indexes it with the first byte at the character
   * position).  The table has 16 rows of 16 entries; each row below is
   * labelled with the index of its first entry.  Entries 0xd8-0xdb are 4,
   * every other entry is 2.
   * NOTE(review): 0xd8-0xdb is presumably the lead byte range of a high
   * surrogate, i.e. the start of a 4-byte surrogate pair -- confirm against
   * the UTF16_IS_SURROGATE_FIRST definition.
   */
  static const int EncLen_UTF16[] = {
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x00 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x10 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x20 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x30 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x40 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x50 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x60 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x70 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x80 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xa0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xb0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0 */
    2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, /* 0xd0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  /* 0xf0 */
  };
  48. static int
  49. utf16be_mbc_enc_len(const UChar* p)
  50. {
  51. return EncLen_UTF16[*p];
  52. }
  53. static int
  54. utf16be_is_mbc_newline(const UChar* p, const UChar* end)
  55. {
  56. if (p + 1 < end) {
  57. if (*(p+1) == 0x0a && *p == 0x00)
  58. return 1;
  59. #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  60. if ((
  61. #ifndef USE_CRNL_AS_LINE_TERMINATOR
  62. *(p+1) == 0x0d ||
  63. #endif
  64. *(p+1) == 0x85) && *p == 0x00)
  65. return 1;
  66. if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
  67. return 1;
  68. #endif
  69. }
  70. return 0;
  71. }
  72. static OnigCodePoint
  73. utf16be_mbc_to_code(const UChar* p, const UChar* end)
  74. {
  75. OnigCodePoint code;
  76. if (UTF16_IS_SURROGATE_FIRST(*p)) {
  77. if (end - p < 4) return 0;
  78. code = ((((p[0] - 0xd8) << 2) + ((p[1] & 0xc0) >> 6) + 1) << 16)
  79. + ((((p[1] & 0x3f) << 2) + (p[2] - 0xdc)) << 8)
  80. + p[3];
  81. }
  82. else {
  83. if (end - p < 2) return 0;
  84. code = p[0] * 256 + p[1];
  85. }
  86. return code;
  87. }
  88. static int
  89. utf16be_code_to_mbclen(OnigCodePoint code)
  90. {
  91. return (code > 0xffff ? 4 : 2);
  92. }
  93. static int
  94. utf16be_code_to_mbc(OnigCodePoint code, UChar *buf)
  95. {
  96. UChar* p = buf;
  97. if (code > 0xffff) {
  98. unsigned int plane, high;
  99. plane = (code >> 16) - 1;
  100. *p++ = (plane >> 2) + 0xd8;
  101. high = (code & 0xff00) >> 8;
  102. *p++ = ((plane & 0x03) << 6) + (high >> 2);
  103. *p++ = (high & 0x03) + 0xdc;
  104. *p = (UChar )(code & 0xff);
  105. return 4;
  106. }
  107. else {
  108. *p++ = (UChar )((code & 0xff00) >> 8);
  109. *p++ = (UChar )(code & 0xff);
  110. return 2;
  111. }
  112. }
  113. static int
  114. utf16be_mbc_case_fold(OnigCaseFoldType flag,
  115. const UChar** pp, const UChar* end, UChar* fold)
  116. {
  117. const UChar* p = *pp;
  118. if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) {
  119. p++;
  120. #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  121. if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
  122. if (*p == 0x49) {
  123. *fold++ = 0x01;
  124. *fold = 0x31;
  125. (*pp) += 2;
  126. return 2;
  127. }
  128. }
  129. #endif
  130. *fold++ = 0;
  131. *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
  132. *pp += 2;
  133. return 2;
  134. }
  135. else
  136. return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_BE, flag,
  137. pp, end, fold);
  138. }
  #if 0
  /*
   * Disabled code (excluded from compilation by the surrounding "#if 0").
   *
   * Advances *pp by the length of the character it points to (taken from
   * EncLen_UTF16) and returns TRUE or FALSE.  Only characters whose first
   * byte is 0 are examined; for all others the result is FALSE.
   */
  static int
  utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
  {
    const UChar* p = *pp;

    (*pp) += EncLen_UTF16[*p];

    if (*p == 0) {
      int c, v;

      p++;
      /* 0x00df is reported as ambiguous only when multi-char folding is on. */
      if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
        return TRUE;
      }

      c = *p;
      v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                         (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
      /* NOTE(review): "v | BIT_CTYPE_LOWER" is non-zero whenever
         BIT_CTYPE_LOWER is non-zero, which would make this branch always
         taken and the final return below unreachable.  "&" was presumably
         intended -- confirm against the macro definitions before enabling
         this code. */
      if ((v | BIT_CTYPE_LOWER) != 0) {
        /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
        if (c >= 0xaa && c <= 0xba)
          return FALSE;
        else
          return TRUE;
      }
      return (v != 0 ? TRUE : FALSE);
    }

    return FALSE;
  }
  #endif
  166. static UChar*
  167. utf16be_left_adjust_char_head(const UChar* start, const UChar* s)
  168. {
  169. if (s <= start) return (UChar* )s;
  170. if ((s - start) % 2 == 1) {
  171. s--;
  172. }
  173. if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
  174. s -= 2;
  175. return (UChar* )s;
  176. }
  177. static int
  178. utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag,
  179. const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
  180. {
  181. return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_BE,
  182. flag, p, end, items);
  183. }
  /*
   * Encoding descriptor for UTF-16BE.
   *
   * Combines the utf16be_* functions defined in this file with shared
   * onigenc_* helpers.  The initializer is positional, so the entry order
   * must match the member order of OnigEncodingType.
   * NOTE(review): the OnigEncodingType declaration is not visible in this
   * file (presumably reached through regenc.h) -- confirm the member order
   * there before reordering or adding entries.
   */
  OnigEncodingType OnigEncodingUTF16_BE = {
    utf16be_mbc_enc_len,                  /* byte length from first byte */
    "UTF-16BE",   /* name */
    4,            /* max byte length */
    2,            /* min byte length */
    utf16be_is_mbc_newline,
    utf16be_mbc_to_code,
    utf16be_code_to_mbclen,
    utf16be_code_to_mbc,
    utf16be_mbc_case_fold,
    onigenc_unicode_apply_all_case_fold,
    utf16be_get_case_fold_codes_by_str,
    onigenc_unicode_property_name_to_ctype,
    onigenc_unicode_is_code_ctype,
    onigenc_utf16_32_get_ctype_code_range,
    utf16be_left_adjust_char_head,
    onigenc_always_false_is_allowed_reverse_match
  };