regposix.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. /**********************************************************************
  2. regposix.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #define regex_t onig_regex_t
  30. #include "regint.h"
  31. #undef regex_t
  32. #include "onigposix.h"
  /*
   * Accessors for the Oniguruma regex stored in the `onig` member of a POSIX
   * regex_t wrapper.  ONIG_C yields the stored handle; PONIG_C yields the
   * address of the slot holding it (the form handed to onig_new() by
   * regcomp() so the new handle can be written into the wrapper).
   */
  #define ONIG_C(reg)   ((onig_regex_t *)((reg)->onig))
  #define PONIG_C(reg)  ((onig_regex_t **)(&(reg)->onig))
  /*
   * ENC_STRING_LEN(enc, s, len)
   *
   * Store in `len` the length in bytes of the terminated string `s`, not
   * counting the terminator.
   *
   * When the encoding's minimum character length is one byte, the string is
   * scanned for the first zero byte (the same result strlen() would give).
   * Otherwise the measurement is delegated to onigenc_str_bytelen_null();
   * presumably a single zero byte is not a terminator in such encodings --
   * confirm against the encoding layer.  This is why a plain strlen(s) is
   * not used here.
   *
   * Every use of a macro parameter is parenthesized so that an expression
   * argument (e.g. `base + 1`) is cast and assigned as a whole instead of
   * having the cast bind to its first operand only.
   */
  #define ENC_STRING_LEN(enc,s,len) do { \
    if (ONIGENC_MBC_MINLEN(enc) == 1) { \
      UChar* tmps = (UChar* )(s); \
      while (*tmps != 0) tmps++; \
      (len) = tmps - (UChar* )(s); \
    } \
    else { \
      (len) = onigenc_str_bytelen_null((enc), (UChar* )(s)); \
    } \
  } while(0)
  /*
   * One row of the error translation table used by onig2posix_error_code():
   * an Oniguruma result code paired with the POSIX code reported for it.
   */
  typedef struct {
    int onig_err;    /* Oniguruma result code to look up */
    int posix_err;   /* POSIX REG_* code returned in its place */
  } O2PERR;
  50. static int
  51. onig2posix_error_code(int code)
  52. {
  53. static const O2PERR o2p[] = {
  54. { ONIG_MISMATCH, REG_NOMATCH },
  55. { ONIG_NO_SUPPORT_CONFIG, REG_EONIG_INTERNAL },
  56. { ONIGERR_MEMORY, REG_ESPACE },
  57. { ONIGERR_MATCH_STACK_LIMIT_OVER, REG_EONIG_INTERNAL },
  58. { ONIGERR_TYPE_BUG, REG_EONIG_INTERNAL },
  59. { ONIGERR_PARSER_BUG, REG_EONIG_INTERNAL },
  60. { ONIGERR_STACK_BUG, REG_EONIG_INTERNAL },
  61. { ONIGERR_UNDEFINED_BYTECODE, REG_EONIG_INTERNAL },
  62. { ONIGERR_UNEXPECTED_BYTECODE, REG_EONIG_INTERNAL },
  63. { ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED, REG_EONIG_BADARG },
  64. { ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR, REG_EONIG_BADARG },
  65. { ONIGERR_INVALID_ARGUMENT, REG_EONIG_BADARG },
  66. { ONIGERR_END_PATTERN_AT_LEFT_BRACE, REG_EBRACE },
  67. { ONIGERR_END_PATTERN_AT_LEFT_BRACKET, REG_EBRACK },
  68. { ONIGERR_EMPTY_CHAR_CLASS, REG_ECTYPE },
  69. { ONIGERR_PREMATURE_END_OF_CHAR_CLASS, REG_ECTYPE },
  70. { ONIGERR_END_PATTERN_AT_ESCAPE, REG_EESCAPE },
  71. { ONIGERR_END_PATTERN_AT_META, REG_EESCAPE },
  72. { ONIGERR_END_PATTERN_AT_CONTROL, REG_EESCAPE },
  73. { ONIGERR_META_CODE_SYNTAX, REG_BADPAT },
  74. { ONIGERR_CONTROL_CODE_SYNTAX, REG_BADPAT },
  75. { ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE, REG_ECTYPE },
  76. { ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE, REG_ECTYPE },
  77. { ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS, REG_ECTYPE },
  78. { ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED, REG_BADRPT },
  79. { ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID, REG_BADRPT },
  80. { ONIGERR_NESTED_REPEAT_OPERATOR, REG_BADRPT },
  81. { ONIGERR_UNMATCHED_CLOSE_PARENTHESIS, REG_EPAREN },
  82. { ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS, REG_EPAREN },
  83. { ONIGERR_END_PATTERN_IN_GROUP, REG_BADPAT },
  84. { ONIGERR_UNDEFINED_GROUP_OPTION, REG_BADPAT },
  85. { ONIGERR_INVALID_POSIX_BRACKET_TYPE, REG_BADPAT },
  86. { ONIGERR_INVALID_LOOK_BEHIND_PATTERN, REG_BADPAT },
  87. { ONIGERR_INVALID_REPEAT_RANGE_PATTERN, REG_BADPAT },
  88. { ONIGERR_TOO_BIG_NUMBER, REG_BADPAT },
  89. { ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE, REG_BADBR },
  90. { ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE, REG_BADBR },
  91. { ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS, REG_ECTYPE },
  92. { ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE, REG_ECTYPE },
  93. { ONIGERR_TOO_MANY_MULTI_BYTE_RANGES, REG_ECTYPE },
  94. { ONIGERR_TOO_SHORT_MULTI_BYTE_STRING, REG_BADPAT },
  95. { ONIGERR_TOO_BIG_BACKREF_NUMBER, REG_ESUBREG },
  96. { ONIGERR_INVALID_BACKREF, REG_ESUBREG },
  97. { ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT },
  98. { ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
  99. { ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
  100. { ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC },
  101. { ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT },
  102. { ONIGERR_INVALID_GROUP_NAME, REG_BADPAT },
  103. { ONIGERR_INVALID_CHAR_IN_GROUP_NAME, REG_BADPAT },
  104. { ONIGERR_UNDEFINED_NAME_REFERENCE, REG_BADPAT },
  105. { ONIGERR_UNDEFINED_GROUP_REFERENCE, REG_BADPAT },
  106. { ONIGERR_MULTIPLEX_DEFINED_NAME, REG_BADPAT },
  107. { ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, REG_BADPAT },
  108. { ONIGERR_NEVER_ENDING_RECURSION, REG_BADPAT },
  109. { ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY, REG_BADPAT },
  110. { ONIGERR_INVALID_CHAR_PROPERTY_NAME, REG_BADPAT },
  111. { ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG },
  112. { ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT, REG_EONIG_THREAD }
  113. };
  114. int i;
  115. if (code >= 0) return 0;
  116. for (i = 0; i < (int )(sizeof(o2p) / sizeof(o2p[0])); i++) {
  117. if (code == o2p[i].onig_err)
  118. return o2p[i].posix_err;
  119. }
  120. return REG_EONIG_INTERNAL; /* but, unknown error code */
  121. }
  122. extern int
  123. regcomp(regex_t* reg, const char* pattern, int posix_options)
  124. {
  125. int r, len;
  126. OnigSyntaxType* syntax = OnigDefaultSyntax;
  127. OnigOptionType options;
  128. if ((posix_options & REG_EXTENDED) == 0)
  129. syntax = ONIG_SYNTAX_POSIX_BASIC;
  130. options = syntax->options;
  131. if ((posix_options & REG_ICASE) != 0)
  132. ONIG_OPTION_ON(options, ONIG_OPTION_IGNORECASE);
  133. if ((posix_options & REG_NEWLINE) != 0) {
  134. ONIG_OPTION_ON( options, ONIG_OPTION_NEGATE_SINGLELINE);
  135. ONIG_OPTION_OFF(options, ONIG_OPTION_SINGLELINE);
  136. }
  137. reg->comp_options = posix_options;
  138. ENC_STRING_LEN(OnigEncDefaultCharEncoding, pattern, len);
  139. r = onig_new(PONIG_C(reg), (UChar* )pattern, (UChar* )(pattern + len),
  140. options, OnigEncDefaultCharEncoding, syntax,
  141. (OnigErrorInfo* )NULL);
  142. if (r != ONIG_NORMAL) {
  143. return onig2posix_error_code(r);
  144. }
  145. reg->re_nsub = ONIG_C(reg)->num_mem;
  146. return 0;
  147. }
  148. extern int
  149. regexec(regex_t* reg, const char* str, size_t nmatch,
  150. regmatch_t pmatch[], int posix_options)
  151. {
  152. int r, i, len;
  153. UChar* end;
  154. regmatch_t* pm;
  155. OnigOptionType options;
  156. options = ONIG_OPTION_POSIX_REGION;
  157. if ((posix_options & REG_NOTBOL) != 0) options |= ONIG_OPTION_NOTBOL;
  158. if ((posix_options & REG_NOTEOL) != 0) options |= ONIG_OPTION_NOTEOL;
  159. if (nmatch == 0 || (reg->comp_options & REG_NOSUB) != 0) {
  160. pm = (regmatch_t* )NULL;
  161. nmatch = 0;
  162. }
  163. else if ((int )nmatch < ONIG_C(reg)->num_mem + 1) {
  164. pm = (regmatch_t* )xmalloc(sizeof(regmatch_t)
  165. * (ONIG_C(reg)->num_mem + 1));
  166. if (pm == NULL)
  167. return REG_ESPACE;
  168. }
  169. else {
  170. pm = pmatch;
  171. }
  172. ENC_STRING_LEN(ONIG_C(reg)->enc, str, len);
  173. end = (UChar* )(str + len);
  174. r = onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end,
  175. (OnigRegion* )pm, options);
  176. if (r >= 0) {
  177. r = 0; /* Match */
  178. if (pm != pmatch && pm != NULL) {
  179. xmemcpy(pmatch, pm, sizeof(regmatch_t) * nmatch);
  180. }
  181. }
  182. else if (r == ONIG_MISMATCH) {
  183. r = REG_NOMATCH;
  184. for (i = 0; i < (int )nmatch; i++)
  185. pmatch[i].rm_so = pmatch[i].rm_eo = ONIG_REGION_NOTPOS;
  186. }
  187. else {
  188. r = onig2posix_error_code(r);
  189. }
  190. if (pm != pmatch && pm != NULL)
  191. xfree(pm);
  192. #if 0
  193. if (reg->re_nsub > nmatch - 1)
  194. reg->re_nsub = (nmatch <= 1 ? 0 : nmatch - 1);
  195. #endif
  196. return r;
  197. }
  198. extern void
  199. regfree(regex_t* reg)
  200. {
  201. onig_free(ONIG_C(reg));
  202. }
  203. extern void
  204. reg_set_encoding(int mb_code)
  205. {
  206. OnigEncoding enc;
  207. switch (mb_code) {
  208. case REG_POSIX_ENCODING_ASCII:
  209. enc = ONIG_ENCODING_ASCII;
  210. break;
  211. case REG_POSIX_ENCODING_EUC_JP:
  212. enc = ONIG_ENCODING_EUC_JP;
  213. break;
  214. case REG_POSIX_ENCODING_SJIS:
  215. enc = ONIG_ENCODING_SJIS;
  216. break;
  217. case REG_POSIX_ENCODING_UTF8:
  218. enc = ONIG_ENCODING_UTF8;
  219. break;
  220. case REG_POSIX_ENCODING_UTF16_BE:
  221. enc = ONIG_ENCODING_UTF16_BE;
  222. break;
  223. case REG_POSIX_ENCODING_UTF16_LE:
  224. enc = ONIG_ENCODING_UTF16_LE;
  225. break;
  226. default:
  227. return ;
  228. break;
  229. }
  230. onigenc_set_default_encoding(enc);
  231. }
  232. extern int
  233. reg_name_to_group_numbers(regex_t* reg,
  234. const unsigned char* name, const unsigned char* name_end, int** nums)
  235. {
  236. return onig_name_to_group_numbers(ONIG_C(reg), name, name_end, nums);
  237. }
  238. typedef struct {
  239. int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*);
  240. regex_t* reg;
  241. void* arg;
  242. } i_wrap;
  243. static int
  244. i_wrapper(const UChar* name, const UChar* name_end, int ng, int* gs,
  245. onig_regex_t* reg ARG_UNUSED, void* arg)
  246. {
  247. i_wrap* warg = (i_wrap* )arg;
  248. return (*warg->func)(name, name_end, ng, gs, warg->reg, warg->arg);
  249. }
  250. extern int
  251. reg_foreach_name(regex_t* reg,
  252. int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*),
  253. void* arg)
  254. {
  255. i_wrap warg;
  256. warg.func = func;
  257. warg.reg = reg;
  258. warg.arg = arg;
  259. return onig_foreach_name(ONIG_C(reg), i_wrapper, &warg);
  260. }
  261. extern int
  262. reg_number_of_names(regex_t* reg)
  263. {
  264. return onig_number_of_names(ONIG_C(reg));
  265. }