pcre2_intmodedep.h 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923
  1. /*************************************************
  2. * Perl-Compatible Regular Expressions *
  3. *************************************************/
  4. /* PCRE is a library of functions to support regular expressions whose syntax
  5. and semantics are as close as possible to those of the Perl 5 language.
  6. Written by Philip Hazel
  7. Original API code Copyright (c) 1997-2012 University of Cambridge
  8. New API code Copyright (c) 2016-2018 University of Cambridge
  9. -----------------------------------------------------------------------------
  10. Redistribution and use in source and binary forms, with or without
  11. modification, are permitted provided that the following conditions are met:
  12. * Redistributions of source code must retain the above copyright notice,
  13. this list of conditions and the following disclaimer.
  14. * Redistributions in binary form must reproduce the above copyright
  15. notice, this list of conditions and the following disclaimer in the
  16. documentation and/or other materials provided with the distribution.
  17. * Neither the name of the University of Cambridge nor the names of its
  18. contributors may be used to endorse or promote products derived from
  19. this software without specific prior written permission.
  20. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  24. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30. POSSIBILITY OF SUCH DAMAGE.
  31. -----------------------------------------------------------------------------
  32. */
  33. /* This module contains mode-dependent macro and structure definitions. The
  34. file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
  35. These mode-dependent items are kept in a separate file so that they can also be
  36. #included multiple times for different code unit widths by pcre2test in order
  37. to have access to the hidden structures at all supported widths.
  38. Some of the mode-dependent macros are required at different widths for
  39. different parts of the pcre2test code (in particular, the included
  40. pcre2_printint.c file). We undefine them here so that they can be re-defined for
  41. multiple inclusions. Not all of these are used in pcre2test, but it's easier
  42. just to undefine them all. */
  /* Drop any earlier definition of each mode-dependent macro so that this file
  can be included again for a different code unit width. */

  /* Link and 16-bit immediate accessors, and their size limits */
  #undef GET
  #undef GET2
  #undef PUT
  #undef PUT2
  #undef PUTINC
  #undef PUT2INC
  #undef IMM2_SIZE
  #undef MAX_PATTERN_SIZE

  /* Byte <-> code unit conversions */
  #undef BYTES2CU
  #undef CU2BYTES

  /* Range checks, MARK limit and table lookup */
  #undef CHMAX_255
  #undef MAX_255
  #undef MAX_MARK
  #undef TABLE_GET

  /* Properties of the UTF encoding for the current width */
  #undef MAX_UTF_SINGLE_CU
  #undef HAS_EXTRALEN
  #undef GET_EXTRALEN
  #undef NOT_FIRSTCU

  /* Character load/store */
  #undef GETCHAR
  #undef GETCHARTEST
  #undef GETCHARINC
  #undef GETCHARINCTEST
  #undef GETCHARLEN
  #undef GETCHARLENTEST
  #undef PUTCHAR

  /* Pointer movement across multi-unit characters */
  #undef BACKCHAR
  #undef FORWARDCHAR
  #undef FORWARDCHARTEST
  #undef ACROSSCHAR
  72. /* -------------------------- MACROS ----------------------------- */
  73. /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
  74. (always stored in big-endian order in 8-bit mode) by default. These are used,
  75. for example, to link from the start of a subpattern to its alternatives and its
  76. end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
  77. to around 64K, which is big enough for almost everybody. However, I received a
  78. request for an even bigger limit. For this reason, and also to make the code
  79. easier to maintain, the storing and loading of offsets from the compiled code
  80. unit string is now handled by the macros that are defined here.
  81. The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
  82. values of 3 or 4 are also supported. */
  83. /* ------------------- 8-bit support ------------------ */
  84. #if PCRE2_CODE_UNIT_WIDTH == 8
  85. #if LINK_SIZE == 2
  /* LINK_SIZE 2, 8-bit code units: a link occupies two code units, most
  significant first. PUT stores the link d at a[n]; GET reads it back as an
  unsigned int. Each macro is one fully parenthesized expression, and a may be
  any pointer expression. Note that PUT evaluates a, n and d more than once. */
  #define PUT(a,n,d) \
    (((a)[n] = (PCRE2_UCHAR)((d) >> 8)), \
     ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 255)))
  #define GET(a,n) \
    ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
  #define MAX_PATTERN_SIZE (1 << 16)
  92. #elif LINK_SIZE == 3
  /* LINK_SIZE 3, 8-bit code units: a link occupies three code units, most
  significant first. PUT stores the link d at a[n]; GET reads it back as an
  unsigned int. Each macro is one fully parenthesized expression, and a may be
  any pointer expression. Note that PUT evaluates a, n and d more than once. */
  #define PUT(a,n,d) \
    (((a)[n] = (PCRE2_UCHAR)((d) >> 16)), \
     ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
     ((a)[(n)+2] = (PCRE2_UCHAR)((d) & 255)))
  #define GET(a,n) \
    ((unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]))
  #define MAX_PATTERN_SIZE (1 << 24)
  100. #elif LINK_SIZE == 4
  /* LINK_SIZE 4, 8-bit code units: a link occupies four code units, most
  significant first. PUT stores the link d at a[n]; GET reads it back as an
  unsigned int. In GET each code unit is converted to unsigned int before it is
  shifted, so a top byte of 0x80 or more is never shifted into the sign bit of
  a promoted int. Each macro is one fully parenthesized expression, and a may
  be any pointer expression. Note that PUT evaluates a, n and d more than
  once. */
  #define PUT(a,n,d) \
    (((a)[n] = (PCRE2_UCHAR)((d) >> 24)), \
     ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
     ((a)[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \
     ((a)[(n)+3] = (PCRE2_UCHAR)((d) & 255)))
  #define GET(a,n) \
    (((unsigned int)(a)[n] << 24) | ((unsigned int)(a)[(n)+1] << 16) | \
     ((unsigned int)(a)[(n)+2] << 8) | (unsigned int)(a)[(n)+3])
  #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
  109. #else
  110. #error LINK_SIZE must be 2, 3, or 4
  111. #endif
  112. /* ------------------- 16-bit support ------------------ */
  113. #elif PCRE2_CODE_UNIT_WIDTH == 16
  114. #if LINK_SIZE == 2
  /* A configured LINK_SIZE of 2 is replaced by 1 here: the link is held in a
  single 16-bit code unit. PUT stores d (converted to PCRE2_UCHAR) at a[n] and
  GET reads that code unit back. a may be any pointer expression. */
  #undef LINK_SIZE
  #define LINK_SIZE 1
  #define PUT(a,n,d) \
    ((a)[n] = (PCRE2_UCHAR)(d))
  #define GET(a,n) \
    ((a)[n])
  #define MAX_PATTERN_SIZE (1 << 16)
  122. #elif LINK_SIZE == 3 || LINK_SIZE == 4
  /* A configured LINK_SIZE of 3 or 4 is replaced by 2 here: the link is held
  in two 16-bit code units, most significant first. In GET the first code unit
  is converted to unsigned int before it is shifted, so a value of 0x8000 or
  more is never shifted into the sign bit of a promoted int. Each macro is one
  fully parenthesized expression, and a may be any pointer expression. Note
  that PUT evaluates a, n and d more than once. */
  #undef LINK_SIZE
  #define LINK_SIZE 2
  #define PUT(a,n,d) \
    (((a)[n] = (PCRE2_UCHAR)((d) >> 16)), \
     ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 65535)))
  #define GET(a,n) \
    (((unsigned int)(a)[n] << 16) | (unsigned int)(a)[(n)+1])
  #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
  131. #else
  132. #error LINK_SIZE must be 2, 3, or 4
  133. #endif
  134. /* ------------------- 32-bit support ------------------ */
  135. #elif PCRE2_CODE_UNIT_WIDTH == 32
  /* With 32-bit code units LINK_SIZE is always replaced by 1: a link is held
  in a single code unit. PUT stores d at a[n] and GET reads it back. a may be
  any pointer expression. */
  #undef LINK_SIZE
  #define LINK_SIZE 1
  #define PUT(a,n,d) \
    ((a)[n] = (d))
  #define GET(a,n) \
    ((a)[n])
  #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
  143. #else
  144. #error Unsupported compiling mode
  145. #endif
  146. /* --------------- Other mode-specific macros ----------------- */
  147. /* PCRE uses some other (at least) 16-bit quantities that do not change when
  148. the size of offsets changes. These are used for repeat counts and for other
  149. things such as capturing parenthesis numbers in back references.
  150. Define the number of code units required to hold a 16-bit count/offset, and
  151. macros to load and store such a value. In the 8-bit GET2 macro the code units
  152. are subject to C's integer promotions, so the shift and OR are carried out in
  153. (signed) int and the expression is signed even when a is declared as unsigned.
  154. Hence the cast to unsigned int. */
  155. #if PCRE2_CODE_UNIT_WIDTH == 8
  /* 8-bit code units: a 16-bit value occupies two code units, most significant
  first. GET2 yields the value as an unsigned int; PUT2 stores d at a[n] and
  a[n+1]. Each macro is one fully parenthesized expression, and a may be any
  pointer expression. Note that PUT2 evaluates a, n and d twice. */
  #define IMM2_SIZE 2
  #define GET2(a,n) ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
  #define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)
  159. #else /* Code units are 16 or 32 bits */
  /* 16-bit or 32-bit code units: a 16-bit value fits in one code unit, so GET2
  and PUT2 are a plain load and store of a[n]. Arguments are parenthesized so
  that a may be any pointer expression and d any expression. */
  #define IMM2_SIZE 1
  #define GET2(a,n) ((a)[n])
  #define PUT2(a,n,d) ((a)[n] = (d))
  163. #endif
  164. /* Other macros that are different for 8-bit mode. The MAX_255 macro checks
  165. whether its argument, which is assumed to be one code unit, is less than 256.
  166. The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
  167. name must fit in one code unit; currently it is set to 255 or 65535. The
  168. TABLE_GET macro is used to access elements of tables containing exactly 256
  169. items. Its argument is a code unit. When code points can be greater than 255, a
  170. check is needed before accessing these tables. */
  171. #if PCRE2_CODE_UNIT_WIDTH == 8
  /* 8-bit code units: a code unit always indexes a 256-entry table directly,
  so TABLE_GET ignores its default and MAX_255 is unconditionally TRUE. */
  #define TABLE_GET(c, table, default) ((table)[c])
  #define MAX_MARK (0xffu)
  #define MAX_255(c) TRUE
  /* CHMAX_255 needs a real test only when Unicode support is compiled in. */
  #ifndef SUPPORT_UNICODE
  #define CHMAX_255(c) TRUE
  #else
  #define SUPPORT_WIDE_CHARS
  #define CHMAX_255(c) ((c) < 256u)
  #endif /* SUPPORT_UNICODE */
  181. #else /* Code units are 16 or 32 bits */
  /* 16-bit or 32-bit code units: values can exceed 255, so both range macros
  perform a real comparison and TABLE_GET returns the supplied default for any
  code unit that is outside a 256-entry table. */
  #define SUPPORT_WIDE_CHARS
  #define MAX_MARK (0xffffu)
  #define MAX_255(c) ((c) < 256u)
  #define CHMAX_255(c) ((c) < 256u)
  #define TABLE_GET(c, table, default) (MAX_255(c) ? (table)[c] : (default))
  187. #endif
  188. /* ----------------- Character-handling macros ----------------- */
  189. /* There is a proposed future special "UTF-21" mode, in which only the lowest
  190. 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
  191. high-order bits available to the application for other uses. In preparation for
  192. the future implementation of this mode, there are macros that load a data item
  193. and, if in this special mode, mask it to 21 bits. These macros all have names
  194. starting with UCHAR21. In all other modes, including the normal 32-bit
  195. library, the macros all have the same simple definitions. When the new mode is
  196. implemented, it is expected that these definitions will be varied appropriately
  197. using #ifdef when compiling the library that supports the special mode. */
  /* Plain load, and load with post-increment of the pointer. The TEST variants
  currently expand to exactly the same code as the base macros. */
  #define UCHAR21(eptr)        ((eptr)[0])
  #define UCHAR21INC(eptr)     (*(eptr)++)
  #define UCHAR21TEST(eptr)    UCHAR21(eptr)
  #define UCHAR21INCTEST(eptr) UCHAR21INC(eptr)
  202. /* When UTF encoding is being used, a character is no longer just a single
  203. byte in 8-bit mode or a single short in 16-bit mode. The macros for character
  204. handling generate simple sequences when used in the basic mode, and more
  205. complicated ones for UTF characters. GETCHARLENTEST and other macros are not
  206. used when UTF is not supported. To make sure they can never even appear when
  207. UTF support is omitted, we don't even define them. */
  208. #ifndef SUPPORT_UNICODE
  /* Placeholders: these UTF-only macros are intentionally left undefined when
  Unicode support is not compiled in. */
  /* #define MAX_UTF_SINGLE_CU */
  /* #define HAS_EXTRALEN(c) */
  /* #define GET_EXTRALEN(c) */
  /* #define NOT_FIRSTCU(c) */

  /* Without UTF support every character is a single code unit, so each GETCHAR
  variant is a plain load (the INC forms also advance the pointer), and
  GETCHARLEN never touches len. The trailing semicolons are part of the
  expansions. Pointer arguments are parenthesized so that an expression such
  as p + 1 is dereferenced as a whole. PUTCHAR stores c at p and yields 1, the
  number of code units written. */
  #define GETCHAR(c, eptr) c = *(eptr);
  #define GETCHARTEST(c, eptr) c = *(eptr);
  #define GETCHARINC(c, eptr) c = *(eptr)++;
  #define GETCHARINCTEST(c, eptr) c = *(eptr)++;
  #define GETCHARLEN(c, eptr, len) c = *(eptr);
  #define PUTCHAR(c, p) (*(p) = (c), 1)
  /* #define GETCHARLENTEST(c, eptr, len) */
  /* #define BACKCHAR(eptr) */
  /* #define FORWARDCHAR(eptr) */
  /* #define FORWARDCHARTEST(eptr,end) */
  /* #define ACROSSCHAR(condition, eptr, action) */
  224. #else /* SUPPORT_UNICODE */
  225. /* ------------------- 8-bit support ------------------ */
  226. #if PCRE2_CODE_UNIT_WIDTH == 8
  #define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

  /* The largest UTF code point that can be encoded as a single code unit. */

  #define MAX_UTF_SINGLE_CU 127

  /* Tests whether the code point needs extra characters to decode. */

  #define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

  /* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
  Otherwise the result is not meaningful. */

  #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])

  /* Returns TRUE if the given value is not the first code unit of a UTF
  sequence. */

  #define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)

  /* In all the macros below, pointer arguments are parenthesized wherever they
  are dereferenced or stepped, so that an lvalue expression such as *pp can be
  passed. The GETCHAR family expands to two statements (an assignment followed
  by an if), so a call must not be used as the unbraced body of an if or a
  loop. */

  /* Get the next UTF-8 character, not advancing the pointer. This is called
  when we know we are in UTF-8 mode. */

  #define GETCHAR(c, eptr) \
    c = *(eptr); \
    if (c >= 0xc0u) GETUTF8(c, eptr);

  /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing
  the pointer. */

  #define GETCHARTEST(c, eptr) \
    c = *(eptr); \
    if (utf && c >= 0xc0u) GETUTF8(c, eptr);

  /* Get the next UTF-8 character, advancing the pointer. This is called when
  we know we are in UTF-8 mode. */

  #define GETCHARINC(c, eptr) \
    c = *(eptr)++; \
    if (c >= 0xc0u) GETUTF8INC(c, eptr);

  /* Get the next character, testing for UTF-8 mode, and advancing the pointer.
  This is called when we don't know if we are in UTF-8 mode. */

  #define GETCHARINCTEST(c, eptr) \
    c = *(eptr)++; \
    if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);

  /* Get the next UTF-8 character, not advancing the pointer, incrementing
  length if there are extra bytes. This is called when we know we are in UTF-8
  mode. */

  #define GETCHARLEN(c, eptr, len) \
    c = *(eptr); \
    if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);

  /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
  pointer, incrementing length if there are extra bytes. This is called when we
  do not know if we are in UTF-8 mode. */

  #define GETCHARLENTEST(c, eptr, len) \
    c = *(eptr); \
    if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);

  /* If the pointer is not at the start of a character, move it back until
  it is. This is called only in UTF-8 mode - we don't put a test within the
  macro because almost all calls are already within a block of UTF-8 only
  code. */

  #define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

  /* Same as above, just in the other direction. FORWARDCHARTEST additionally
  stops when the pointer reaches end. */

  #define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
  #define FORWARDCHARTEST(eptr,end) \
    while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++

  /* Same as above, but it allows a fully customizable form: action is repeated
  while condition holds and eptr points at a continuation code unit. */

  #define ACROSSCHAR(condition, eptr, action) \
    while((condition) && ((*(eptr)) & 0xc0u) == 0x80u) action

  /* Deposit a character into memory, returning the number of code units. */

  #define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
    PRIV(ord2utf)(c,p) : (*(p) = (c), 1))
  282. /* ------------------- 16-bit support ------------------ */
  283. #elif PCRE2_CODE_UNIT_WIDTH == 16
  #define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

  /* The largest UTF code point that can be encoded as a single code unit. */

  #define MAX_UTF_SINGLE_CU 65535

  /* Tests whether the code point needs extra characters to decode. */

  #define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

  /* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
  Otherwise the result is not meaningful. */

  #define GET_EXTRALEN(c) 1

  /* Returns TRUE if the given value is not the first code unit of a UTF
  sequence. */

  #define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)

  /* In all the macros below, pointer arguments are parenthesized wherever they
  are dereferenced, indexed or stepped, so that an lvalue expression such as
  *pp can be passed. The GETCHAR family expands to two statements (an
  assignment followed by an if), so a call must not be used as the unbraced
  body of an if or a loop. */

  /* Base macro to pick up the low surrogate of a UTF-16 character, not
  advancing the pointer. */

  #define GETUTF16(c, eptr) \
     { c = (((c & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }

  /* Get the next UTF-16 character, not advancing the pointer. This is called
  when we know we are in UTF-16 mode. */

  #define GETCHAR(c, eptr) \
    c = *(eptr); \
    if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

  /* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing
  the pointer. */

  #define GETCHARTEST(c, eptr) \
    c = *(eptr); \
    if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

  /* Base macro to pick up the low surrogate of a UTF-16 character, advancing
  the pointer. */

  #define GETUTF16INC(c, eptr) \
     { c = (((c & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }

  /* Get the next UTF-16 character, advancing the pointer. This is called when
  we know we are in UTF-16 mode. */

  #define GETCHARINC(c, eptr) \
    c = *(eptr)++; \
    if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

  /* Get the next character, testing for UTF-16 mode, and advancing the
  pointer. This is called when we don't know if we are in UTF-16 mode. */

  #define GETCHARINCTEST(c, eptr) \
    c = *(eptr)++; \
    if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

  /* Base macro to pick up the low surrogate of a UTF-16 character, not
  advancing the pointer, incrementing the length. */

  #define GETUTF16LEN(c, eptr, len) \
     { c = (((c & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; (len)++; }

  /* Get the next UTF-16 character, not advancing the pointer, incrementing
  length if there is a low surrogate. This is called when we know we are in
  UTF-16 mode. */

  #define GETCHARLEN(c, eptr, len) \
    c = *(eptr); \
    if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

  /* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
  pointer, incrementing length if there is a low surrogate. This is called when
  we do not know if we are in UTF-16 mode. */

  #define GETCHARLENTEST(c, eptr, len) \
    c = *(eptr); \
    if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

  /* If the pointer is not at the start of a character, move it back until
  it is. This is called only in UTF-16 mode - we don't put a test within the
  macro because almost all calls are already within a block of UTF-16 only
  code. */

  #define BACKCHAR(eptr) if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--

  /* Same as above, just in the other direction. FORWARDCHARTEST additionally
  requires the pointer to be below end. */

  #define FORWARDCHAR(eptr) if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++
  #define FORWARDCHARTEST(eptr,end) \
    if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++

  /* Same as above, but it allows a fully customizable form: action is executed
  once if condition holds and eptr points at a low surrogate. */

  #define ACROSSCHAR(condition, eptr, action) \
    if ((condition) && ((*(eptr)) & 0xfc00u) == 0xdc00u) action

  /* Deposit a character into memory, returning the number of code units. */

  #define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
    PRIV(ord2utf)(c,p) : (*(p) = (c), 1))
  353. /* ------------------- 32-bit support ------------------ */
  354. #else
  /* These are trivial for the 32-bit library, since all UTF-32 characters fit
  into one PCRE2_UCHAR unit. */

  #define MAX_UTF_SINGLE_CU (0x10ffffu)
  #define HAS_EXTRALEN(c) (0)
  #define GET_EXTRALEN(c) (0)
  #define NOT_FIRSTCU(c) (0)

  /* Get the next UTF-32 character, not advancing the pointer. This is called
  when we know we are in UTF-32 mode. The trailing semicolon is part of the
  expansion, here and in the macros below. */

  #define GETCHAR(c, eptr) \
    c = *(eptr);

  /* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing
  the pointer. */

  #define GETCHARTEST(c, eptr) \
    c = *(eptr);

  /* Get the next UTF-32 character, advancing the pointer. This is called when
  we know we are in UTF-32 mode. */

  #define GETCHARINC(c, eptr) \
    c = *((eptr)++);

  /* Get the next character, testing for UTF-32 mode, and advancing the
  pointer. This is called when we don't know if we are in UTF-32 mode. */

  #define GETCHARINCTEST(c, eptr) \
    c = *((eptr)++);

  /* Get the next UTF-32 character, not advancing the pointer, not incrementing
  length (since all UTF-32 is of length 1). This is called when we know we are
  in UTF-32 mode. */

  #define GETCHARLEN(c, eptr, len) \
    GETCHAR(c, eptr)

  /* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
  pointer, not incrementing the length (since all UTF-32 is of length 1).
  This is called when we do not know if we are in UTF-32 mode. */

  #define GETCHARLENTEST(c, eptr, len) \
    GETCHARTEST(c, eptr)

  /* If the pointer is not at the start of a character, move it back until
  it is. This is called only in UTF-32 mode - we don't put a test within the
  macro because almost all calls are already within a block of UTF-32 only
  code.

  These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

  #define BACKCHAR(eptr) do { } while (0)

  /* Same as above, just in the other direction. */

  #define FORWARDCHAR(eptr) do { } while (0)
  #define FORWARDCHARTEST(eptr,end) do { } while (0)

  /* Same as above, but it allows a fully customizable form. */

  #define ACROSSCHAR(condition, eptr, action) do { } while (0)

  /* Deposit a character into memory, returning the number of code units. The
  arguments are parenthesized so that p may be any pointer expression. */

  #define PUTCHAR(c, p) (*(p) = (c), 1)
  400. #endif /* UTF-32 character handling */
  401. #endif /* SUPPORT_UNICODE */
  402. /* Mode-dependent macros that have the same definition in all modes. */
  /* Convert a count of code units to bytes, and a count of bytes to code
  units (integer division), for the current code unit width. */
  #define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
  #define BYTES2CU(x) ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))

  /* Store a link (PUT) or a 16-bit value (PUT2) at a[n] and then advance a by
  the size of the stored item. Each expansion is a single parenthesized
  expression whose value is the updated a; a must be a modifiable lvalue. */
  #define PUTINC(a,n,d) (PUT(a,n,d), (a) += LINK_SIZE)
  #define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += IMM2_SIZE)
  407. /* ----------------------- HIDDEN STRUCTURES ----------------------------- */
  408. /* NOTE: All these structures *must* start with a pcre2_memctl structure. The
  409. code that uses them is simpler because it assumes this. */
  410. /* The real general context structure. At present it holds only data for custom
  411. memory control. */
  412. typedef struct pcre2_real_general_context {
  413. pcre2_memctl memctl;
  414. } pcre2_real_general_context;
  415. /* The real compile context structure */
  416. typedef struct pcre2_real_compile_context {
  417. pcre2_memctl memctl;
  418. int (*stack_guard)(uint32_t, void *);
  419. void *stack_guard_data;
  420. const uint8_t *tables;
  421. PCRE2_SIZE max_pattern_length;
  422. uint16_t bsr_convention;
  423. uint16_t newline_convention;
  424. uint32_t parens_nest_limit;
  425. uint32_t extra_options;
  426. } pcre2_real_compile_context;
  427. /* The real match context structure. */
  428. typedef struct pcre2_real_match_context {
  429. pcre2_memctl memctl;
  430. #ifdef SUPPORT_JIT
  431. pcre2_jit_callback jit_callback;
  432. void *jit_callback_data;
  433. #endif
  434. int (*callout)(pcre2_callout_block *, void *);
  435. void *callout_data;
  436. int (*substitute_callout)(pcre2_substitute_callout_block *, void *);
  437. void *substitute_callout_data;
  438. PCRE2_SIZE offset_limit;
  439. uint32_t heap_limit;
  440. uint32_t match_limit;
  441. uint32_t depth_limit;
  442. } pcre2_real_match_context;
  443. /* The real convert context structure. */
  444. typedef struct pcre2_real_convert_context {
  445. pcre2_memctl memctl;
  446. uint32_t glob_separator;
  447. uint32_t glob_escape;
  448. } pcre2_real_convert_context;
  449. /* The real compiled code structure. The type for the blocksize field is
  450. defined specially because it is required in pcre2_serialize_decode() when
  451. copying the size from possibly unaligned memory into a variable of the same
  452. type. Use a macro rather than a typedef to avoid compiler warnings when this
  453. file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
  454. largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
  455. argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
  456. here.) */
  457. #undef CODE_BLOCKSIZE_TYPE
  458. #define CODE_BLOCKSIZE_TYPE size_t
  459. #undef LOOKBEHIND_MAX
  460. #define LOOKBEHIND_MAX UINT16_MAX
  461. typedef struct pcre2_real_code {
  462. pcre2_memctl memctl; /* Memory control fields */
  463. const uint8_t *tables; /* The character tables */
  464. void *executable_jit; /* Pointer to JIT code */
  465. uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
  466. CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */
  467. uint32_t magic_number; /* Paranoid and endianness check */
  468. uint32_t compile_options; /* Options passed to pcre2_compile() */
  469. uint32_t overall_options; /* Options after processing the pattern */
  470. uint32_t extra_options; /* Taken from compile_context */
  471. uint32_t flags; /* Various state flags */
  472. uint32_t limit_heap; /* Limit set in the pattern */
  473. uint32_t limit_match; /* Limit set in the pattern */
  474. uint32_t limit_depth; /* Limit set in the pattern */
  475. uint32_t first_codeunit; /* Starting code unit */
  476. uint32_t last_codeunit; /* This codeunit must be seen */
  477. uint16_t bsr_convention; /* What \R matches */
  478. uint16_t newline_convention; /* What is a newline? */
  479. uint16_t max_lookbehind; /* Longest lookbehind (characters) */
  480. uint16_t minlength; /* Minimum length of match */
  481. uint16_t top_bracket; /* Highest numbered group */
  482. uint16_t top_backref; /* Highest numbered back reference */
  483. uint16_t name_entry_size; /* Size (code units) of table entries */
  484. uint16_t name_count; /* Number of name entries in the table */
  485. } pcre2_real_code;
  486. /* The real match data structure. Define ovector as large as it can ever
  487. actually be so that array bound checkers don't grumble. Memory for this
  488. structure is obtained by calling pcre2_match_data_create(), which sets the size
  489. as the offset of ovector plus a pair of elements for each capturable string, so
  490. the size varies from call to call. As the maximum number of capturing
  491. subpatterns is 65535 we must allow for 65536 strings to include the overall
  492. match. (See also the heapframe structure below.) */
  493. typedef struct pcre2_real_match_data {
  494. pcre2_memctl memctl;
  495. const pcre2_real_code *code; /* The pattern used for the match */
  496. PCRE2_SPTR subject; /* The subject that was matched */
  497. PCRE2_SPTR mark; /* Pointer to last mark */
  498. PCRE2_SIZE leftchar; /* Offset to leftmost code unit */
  499. PCRE2_SIZE rightchar; /* Offset to rightmost code unit */
  500. PCRE2_SIZE startchar; /* Offset to starting code unit */
  501. uint8_t matchedby; /* Type of match (normal, JIT, DFA) */
  502. uint8_t flags; /* Various flags */
  503. uint16_t oveccount; /* Number of pairs */
  504. int rc; /* The return code from the match */
  505. PCRE2_SIZE ovector[131072]; /* Must be last in the structure */
  506. } pcre2_real_match_data;
  507. /* ----------------------- PRIVATE STRUCTURES ----------------------------- */
  508. /* These structures are not needed for pcre2test. */
  509. #ifndef PCRE2_PCRE2TEST
  510. /* Structures for checking for mutual recursion when scanning compiled or
  511. parsed code. */
  512. typedef struct recurse_check {
  513. struct recurse_check *prev;
  514. PCRE2_SPTR group;
  515. } recurse_check;
  516. typedef struct parsed_recurse_check {
  517. struct parsed_recurse_check *prev;
  518. uint32_t *groupptr;
  519. } parsed_recurse_check;
  520. /* Structure for building a cache when filling in recursion offsets. */
  521. typedef struct recurse_cache {
  522. PCRE2_SPTR group;
  523. int groupnumber;
  524. } recurse_cache;
  525. /* Structure for maintaining a chain of pointers to the currently incomplete
  526. branches, for testing for left recursion while compiling. */
  527. typedef struct branch_chain {
  528. struct branch_chain *outer;
  529. PCRE2_UCHAR *current_branch;
  530. } branch_chain;
  531. /* Structure for building a list of named groups during the first pass of
  532. compiling. */
  533. typedef struct named_group {
  534. PCRE2_SPTR name; /* Points to the name in the pattern */
  535. uint32_t number; /* Group number */
  536. uint16_t length; /* Length of the name */
  537. uint16_t isdup; /* TRUE if a duplicate */
  538. } named_group;
  539. /* Structure for passing "static" information around between the functions
  540. doing the compiling, so that they are thread-safe. */
  541. typedef struct compile_block {
  542. pcre2_real_compile_context *cx; /* Points to the compile context */
  543. const uint8_t *lcc; /* Points to lower casing table */
  544. const uint8_t *fcc; /* Points to case-flipping table */
  545. const uint8_t *cbits; /* Points to character type table */
  546. const uint8_t *ctypes; /* Points to table of type maps */
  547. PCRE2_SPTR start_workspace; /* The start of working space */
  548. PCRE2_SPTR start_code; /* The start of the compiled code */
  549. PCRE2_SPTR start_pattern; /* The start of the pattern */
  550. PCRE2_SPTR end_pattern; /* The end of the pattern */
  551. PCRE2_UCHAR *name_table; /* The name/number table */
  552. PCRE2_SIZE workspace_size; /* Size of workspace */
  553. PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 (10 slots; NOTE(review): slot 0 presumably unused - confirm) */
  554. PCRE2_SIZE erroroffset; /* Offset of error in pattern */
  555. uint16_t names_found; /* Number of entries so far */
  556. uint16_t name_entry_size; /* Size of each entry */
  557. uint16_t parens_depth; /* Depth of nested parentheses */
  558. uint16_t assert_depth; /* Depth of nested assertions */
  559. open_capitem *open_caps; /* Chain of open capture items */
  560. named_group *named_groups; /* Points to vector in pre-compile */
  561. uint32_t named_group_list_size; /* Number of entries in the list */
  562. uint32_t external_options; /* External (initial) options */
  563. uint32_t external_flags; /* External flag bits to be set */
  564. uint32_t bracount; /* Count of capturing parentheses */
  565. uint32_t lastcapture; /* Last capture encountered */
  566. uint32_t *parsed_pattern; /* Parsed pattern buffer */
  567. uint32_t *parsed_pattern_end; /* End marker: the parsed pattern should not get here */
  568. uint32_t *groupinfo; /* Group info vector */
  569. uint32_t top_backref; /* Maximum back reference */
  570. uint32_t backref_map; /* Bitmap of low back refs */
  571. uint32_t nltype; /* Newline type */
  572. uint32_t nllen; /* Newline string length */
  573. uint32_t class_range_start; /* Overall class range start */
  574. uint32_t class_range_end; /* Overall class range end */
  575. PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
  576. int max_lookbehind; /* Maximum lookbehind (characters) */
  577. int req_varyopt; /* "After variable item" flag for reqbyte */
  578. BOOL had_accept; /* (*ACCEPT) encountered */
  579. BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
  580. BOOL had_recurse; /* Had a recursion or subroutine call */
  581. BOOL dupnames; /* Duplicate names exist */
  582. } compile_block;
  583. /* Structure for keeping the properties of the in-memory stack used
  584. by the JIT matcher. */
  585. typedef struct pcre2_real_jit_stack {
  586. pcre2_memctl memctl; /* Memory control data (pcre2_memctl is declared elsewhere) */
  587. void* stack; /* The stack itself, held as an untyped pointer */
  588. } pcre2_real_jit_stack;
  589. /* Structure for items in a linked list that represents an explicit recursive
  590. call within the pattern when running pcre2_dfa_match(). */
  591. typedef struct dfa_recursion_info {
  592. struct dfa_recursion_info *prevrec; /* Link to the previous item in the list */
  593. PCRE2_SPTR subject_position; /* Subject position recorded for this call */
  594. uint32_t group_num; /* Group number recorded for this call */
  595. } dfa_recursion_info;
  596. /* Structure for "stack" frames that are used for remembering backtracking
  597. positions during matching. As these are used in a vector, with the ovector item
  598. being extended, the size of the structure must be a multiple of sizeof(PCRE2_SIZE). The
  599. only way to check this at compile time is to force an error by generating an
  600. array with a negative size. By putting this in a typedef (which is never used),
  601. we don't generate any code when all is well. */
  602. typedef struct heapframe {
  603. /* The first set of fields are variables that have to be preserved over calls
  604. to RRMATCH(), but which do not need to be copied to new frames. */
  605. PCRE2_SPTR ecode; /* The current position in the pattern */
  606. PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE2_SPTR values */
  607. PCRE2_SIZE length; /* Used for character, string, or code lengths */
  608. PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */
  609. PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */
  610. uint32_t rdepth; /* "Recursion" depth */
  611. uint32_t group_frame_type; /* Type information for group frames */
  612. uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */
  613. uint8_t return_id; /* Where to go on in internal "return" */
  614. uint8_t op; /* Processing opcode */
  615. /* At this point, the structure is 16-bit aligned. On most architectures
  616. the alignment requirement for a pointer will ensure that the eptr field below
  617. is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer
  618. that is 16-bit aligned. We must therefore ensure that what comes between here
  619. and eptr is an odd multiple of 16 bits so as to get back into 32-bit
  620. alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs
  621. fudges in the other cases. In the 32-bit case the padding comes first so that
  622. the occu field itself is 32-bit aligned. Without the padding, this structure
  623. is no longer a multiple of sizeof(PCRE2_SIZE) on m68k, and the check below fails. */
  624. #if PCRE2_CODE_UNIT_WIDTH == 8
  625. PCRE2_UCHAR occu[6]; /* Used for other case code units */
  626. #elif PCRE2_CODE_UNIT_WIDTH == 16
  627. PCRE2_UCHAR occu[2]; /* Used for other case code units */
  628. uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */
  629. #else
  630. uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */
  631. PCRE2_UCHAR occu[1]; /* Used for other case code units */
  632. #endif
  633. /* The rest have to be copied from the previous frame whenever a new frame
  634. becomes current. The final field is specified as a large vector so that
  635. runtime array bound checks don't catch references to it. However, for any
  636. specific call to pcre2_match() the memory allocated for each frame structure
  637. allows for exactly the right size ovector for the number of capturing
  638. parentheses. (See also the comment for pcre2_real_match_data above.) */
  639. PCRE2_SPTR eptr; /* MUST BE FIRST of the fields copied to new frames */
  640. PCRE2_SPTR start_match; /* Can be adjusted by \K */
  641. PCRE2_SPTR mark; /* Most recent mark on the success path */
  642. uint32_t current_recurse; /* Current (deepest) recursion number */
  643. uint32_t capture_last; /* Most recent capture */
  644. PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
  645. PCRE2_SIZE offset_top; /* Offset after highest capture */
  646. PCRE2_SIZE ovector[131072]; /* Must be last in the structure; nominal size only (see above) */
  647. } heapframe;
  648. /* This typedef is a check that the size of the heapframe structure is a
  649. multiple of sizeof(PCRE2_SIZE): if it is not, the array size below is -1 and compilation fails. See various comments above. */
  650. typedef char check_heapframe_size[
  651. ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];
  652. /* Structure for passing "static" information around between the functions
  653. doing traditional NFA matching (pcre2_match() and friends). */
  654. typedef struct match_block {
  655. pcre2_memctl memctl; /* For general use */
  656. PCRE2_SIZE frame_vector_size; /* Size of a backtracking frame */
  657. heapframe *match_frames; /* Points to vector of frames */
  658. heapframe *match_frames_top; /* Points after the end of the vector */
  659. heapframe *stack_frames; /* The original vector on the stack */
  660. PCRE2_SIZE heap_limit; /* The heap limit */
  661. uint32_t match_limit; /* The match limit */
  662. uint32_t match_limit_depth; /* The match depth limit */
  663. uint32_t match_call_count; /* Number of times a new frame is created */
  664. BOOL hitend; /* Hit the end of the subject at some point */
  665. BOOL hasthen; /* Pattern contains (*THEN) */
  666. BOOL allowemptypartial; /* Allow empty hard partial */
  667. const uint8_t *lcc; /* Points to lower casing table */
  668. const uint8_t *fcc; /* Points to case-flipping table */
  669. const uint8_t *ctypes; /* Points to table of type maps */
  670. PCRE2_SIZE start_offset; /* The start offset value */
  671. PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */
  672. uint16_t partial; /* PARTIAL options */
  673. uint16_t bsr_convention; /* \R interpretation */
  674. uint16_t name_count; /* Number of names in name table */
  675. uint16_t name_entry_size; /* Size of entry in names table */
  676. PCRE2_SPTR name_table; /* Table of group names */
  677. PCRE2_SPTR start_code; /* For use when recursing */
  678. PCRE2_SPTR start_subject; /* Start of the subject string */
  679. PCRE2_SPTR check_subject; /* Where UTF-checked from */
  680. PCRE2_SPTR end_subject; /* End of the subject string */
  681. PCRE2_SPTR end_match_ptr; /* Subject position at end match */
  682. PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
  683. PCRE2_SPTR last_used_ptr; /* Latest consulted character */
  684. PCRE2_SPTR mark; /* Mark pointer to pass back on success */
  685. PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
  686. PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
  687. PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
  688. uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
  689. uint32_t moptions; /* Match options */
  690. uint32_t poptions; /* Pattern options */
  691. uint32_t skip_arg_count; /* For counting SKIP_ARGs */
  692. uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */
  693. uint32_t nltype; /* Newline type */
  694. uint32_t nllen; /* Newline string length */
  695. PCRE2_UCHAR nl[4]; /* Newline string when fixed */
  696. pcre2_callout_block *cb; /* Points to a callout block */
  697. void *callout_data; /* To pass back to callouts */
  698. int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
  699. } match_block;
  700. /* Structure for passing "static" information around between the DFA matching
  701. functions; it serves the same purpose as match_block does for pcre2_match(). */
  702. typedef struct dfa_match_block {
  703. pcre2_memctl memctl; /* For general use */
  704. PCRE2_SPTR start_code; /* Start of the compiled pattern */
  705. PCRE2_SPTR start_subject ; /* Start of the subject string */
  706. PCRE2_SPTR end_subject; /* End of subject string */
  707. PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
  708. PCRE2_SPTR last_used_ptr; /* Latest consulted character */
  709. const uint8_t *tables; /* Character tables */
  710. PCRE2_SIZE start_offset; /* The start offset value */
  711. PCRE2_SIZE heap_limit; /* The heap limit */
  712. PCRE2_SIZE heap_used; /* The amount of heap used */
  713. uint32_t match_limit; /* The match limit */
  714. uint32_t match_limit_depth; /* The match depth limit */
  715. uint32_t match_call_count; /* Number of calls of internal function */
  716. uint32_t moptions; /* Match options */
  717. uint32_t poptions; /* Pattern options */
  718. uint32_t nltype; /* Newline type */
  719. uint32_t nllen; /* Newline string length */
  720. BOOL allowemptypartial; /* Allow empty hard partial */
  721. PCRE2_UCHAR nl[4]; /* Newline string when fixed */
  722. uint16_t bsr_convention; /* \R interpretation */
  723. pcre2_callout_block *cb; /* Points to a callout block */
  724. void *callout_data; /* To pass back to callouts */
  725. int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
  726. dfa_recursion_info *recursive; /* Linked list of recursion data */
  727. } dfa_match_block;
  728. #endif /* PCRE2_PCRE2TEST */
  729. /* End of pcre2_intmodedep.h */