12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987 |
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /* PCRE is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language.
- Written by Philip Hazel
- Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2020 University of Cambridge
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- #ifdef HAVE_CONFIG_H
- #include "config.h"
- #endif
- #include "pcre2_internal.h"
- #define PTR_STACK_SIZE 20
- #define SUBSTITUTE_OPTIONS \
- (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
- PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
- PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
- PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
- /*************************************************
- * Find end of substitute text *
- *************************************************/
- /* In extended mode, we recognize ${name:+set text:unset text} and similar
- constructions. This requires the identification of unescaped : and }
- characters. This function scans for such. It must deal with nested ${
- constructions. The pointer to the text is updated, either to the required end
- character, or to where an error was detected.
- Arguments:
- code points to the compiled expression (for options)
- ptrptr points to the pointer to the start of the text (updated)
- ptrend end of the whole string
- last TRUE if the last expected string (only } recognized)
- Returns: 0 on success
- negative error code on failure
- */
- static int
- find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
- BOOL last)
- {
- int rc = 0;
- uint32_t nestlevel = 0;
- BOOL literal = FALSE;
- PCRE2_SPTR ptr = *ptrptr;
- for (; ptr < ptrend; ptr++)
- {
- if (literal)
- {
- if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
- {
- literal = FALSE;
- ptr += 1;
- }
- }
- else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
- {
- if (nestlevel == 0) goto EXIT;
- nestlevel--;
- }
- else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
- else if (*ptr == CHAR_DOLLAR_SIGN)
- {
- if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
- {
- nestlevel++;
- ptr += 1;
- }
- }
- else if (*ptr == CHAR_BACKSLASH)
- {
- int erc;
- int errorcode;
- uint32_t ch;
- if (ptr < ptrend - 1) switch (ptr[1])
- {
- case CHAR_L:
- case CHAR_l:
- case CHAR_U:
- case CHAR_u:
- ptr += 1;
- continue;
- }
- ptr += 1; /* Must point after \ */
- erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
- code->overall_options, code->extra_options, FALSE, NULL);
- ptr -= 1; /* Back to last code unit of escape */
- if (errorcode != 0)
- {
- rc = errorcode;
- goto EXIT;
- }
- switch(erc)
- {
- case 0: /* Data character */
- case ESC_E: /* Isolated \E is ignored */
- break;
- case ESC_Q:
- literal = TRUE;
- break;
- default:
- rc = PCRE2_ERROR_BADREPESCAPE;
- goto EXIT;
- }
- }
- }
- rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
- EXIT:
- *ptrptr = ptr;
- return rc;
- }
- /*************************************************
- * Match and substitute *
- *************************************************/
- /* This function applies a compiled re to a subject string and creates a new
- string with substitutions. The first 7 arguments are the same as for
- pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
- Arguments:
- code points to the compiled expression
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- match_data points to a match_data block, or is NULL
- context points to a PCRE2 context
- replacement points to the replacement string
- rlength length of replacement string
- buffer where to put the substituted string
- blength points to length of buffer; updated to length of string
- Returns: >= 0 number of substitutions made
- < 0 an error code
- PCRE2_ERROR_BADREPLACEMENT means invalid use of $
- */
/* Append "length" code units, starting at "from", to the output buffer,
checking for space first. This macro is only usable inside pcre2_substitute():
it reads and updates the locals buffer, buff_offset, lengthleft, overflowed,
extra_needed and suboptions, and it jumps to the label NOROOM.

If there is not enough room and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is not set,
control goes to NOROOM at once. If the option is set, nothing more is copied
from then on; instead the number of code units that did not fit is accumulated
in extra_needed so that the required buffer size can be reported.

The body is wrapped in do { } while (0) so that an invocation followed by a
semicolon is exactly one statement (safe in an unbraced if/else), and "length"
is evaluated exactly once, so any expression may be passed for it. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    PCRE2_SIZE checkmemcpy_len_ = (length); \
    if (overflowed) \
      { \
      extra_needed += checkmemcpy_len_; \
      } \
    else if (lengthleft < checkmemcpy_len_) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = checkmemcpy_len_ - lengthleft; \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(checkmemcpy_len_)); \
      buff_offset += checkmemcpy_len_; \
      lengthleft -= checkmemcpy_len_; \
      } \
    } \
  while (0)
- /* Here's the function */
- PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
- pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
- PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
- pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
- PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
- {
- int rc;
- int subs;
- int forcecase = 0;
- int forcecasereset = 0;
- uint32_t ovector_count;
- uint32_t goptions = 0;
- uint32_t suboptions;
- pcre2_match_data *internal_match_data = NULL;
- BOOL escaped_literal = FALSE;
- BOOL overflowed = FALSE;
- BOOL use_existing_match;
- BOOL replacement_only;
- #ifdef SUPPORT_UNICODE
- BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
- BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
- #endif
- PCRE2_UCHAR temp[6];
- PCRE2_SPTR ptr;
- PCRE2_SPTR repend;
- PCRE2_SIZE extra_needed = 0;
- PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
- PCRE2_SIZE *ovector;
- PCRE2_SIZE ovecsave[3];
- pcre2_substitute_callout_block scb;
- /* General initialization */
- buff_offset = 0;
- lengthleft = buff_length = *blength;
- *blength = PCRE2_UNSET;
- ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
- /* Partial matching is not valid. This must come after setting *blength to
- PCRE2_UNSET, so as not to imply an offset in the replacement. */
- if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
- return PCRE2_ERROR_BADOPTION;
- /* Check for using a match that has already happened. Note that the subject
- pointer in the match data may be NULL after a no-match. */
- use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
- replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
- /* If starting from an existing match, there must be an externally provided
- match data block. We create an internal match_data block in two cases: (a) an
- external one is not supplied (and we are not starting from an existing match);
- (b) an existing match is to be used for the first substitution. In the latter
- case, we copy the existing match into the internal block. This ensures that no
- changes are made to the existing match data block. */
- if (match_data == NULL)
- {
- pcre2_general_context *gcontext;
- if (use_existing_match) return PCRE2_ERROR_NULL;
- gcontext = (mcontext == NULL)?
- (pcre2_general_context *)code :
- (pcre2_general_context *)mcontext;
- match_data = internal_match_data =
- pcre2_match_data_create_from_pattern(code, gcontext);
- if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
- }
- else if (use_existing_match)
- {
- pcre2_general_context *gcontext = (mcontext == NULL)?
- (pcre2_general_context *)code :
- (pcre2_general_context *)mcontext;
- int pairs = (code->top_bracket + 1 < match_data->oveccount)?
- code->top_bracket + 1 : match_data->oveccount;
- internal_match_data = pcre2_match_data_create(match_data->oveccount,
- gcontext);
- if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
- memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
- + 2*pairs*sizeof(PCRE2_SIZE));
- match_data = internal_match_data;
- }
- /* Remember ovector details */
- ovector = pcre2_get_ovector_pointer(match_data);
- ovector_count = pcre2_get_ovector_count(match_data);
- /* Fixed things in the callout block */
- scb.version = 0;
- scb.input = subject;
- scb.output = (PCRE2_SPTR)buffer;
- scb.ovector = ovector;
- /* Find lengths of zero-terminated strings and the end of the replacement. */
- if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
- if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
- repend = replacement + rlength;
- /* Check UTF replacement string if necessary. */
- #ifdef SUPPORT_UNICODE
- if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
- {
- rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
- if (rc != 0)
- {
- match_data->leftchar = 0;
- goto EXIT;
- }
- }
- #endif /* SUPPORT_UNICODE */
- /* Save the substitute options and remove them from the match options. */
- suboptions = options & SUBSTITUTE_OPTIONS;
- options &= ~SUBSTITUTE_OPTIONS;
- /* Error if the start match offset is greater than the length of the subject. */
- if (start_offset > length)
- {
- match_data->leftchar = 0;
- rc = PCRE2_ERROR_BADOFFSET;
- goto EXIT;
- }
- /* Copy up to the start offset, unless only the replacement is required. */
- if (!replacement_only) CHECKMEMCPY(subject, start_offset);
- /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
- match is taken from the match_data that was passed in. */
- subs = 0;
- do
- {
- PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
- uint32_t ptrstackptr = 0;
- if (use_existing_match)
- {
- rc = match_data->rc;
- use_existing_match = FALSE;
- }
- else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
- match_data, mcontext);
- #ifdef SUPPORT_UNICODE
- if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
- #endif
- /* Any error other than no match returns the error code. No match when not
- doing the special after-empty-match global rematch, or when at the end of the
- subject, breaks the global loop. Otherwise, advance the starting point by one
- character, copying it to the output, and try again. */
- if (rc < 0)
- {
- PCRE2_SIZE save_start;
- if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
- if (goptions == 0 || start_offset >= length) break;
- /* Advance by one code point. Then, if CRLF is a valid newline sequence and
- we have advanced into the middle of it, advance one more code point. In
- other words, do not start in the middle of CRLF, even if CR and LF on their
- own are valid newlines. */
- save_start = start_offset++;
- if (subject[start_offset-1] == CHAR_CR &&
- code->newline_convention != PCRE2_NEWLINE_CR &&
- code->newline_convention != PCRE2_NEWLINE_LF &&
- start_offset < length &&
- subject[start_offset] == CHAR_LF)
- start_offset++;
- /* Otherwise, in UTF mode, advance past any secondary code points. */
- else if ((code->overall_options & PCRE2_UTF) != 0)
- {
- #if PCRE2_CODE_UNIT_WIDTH == 8
- while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
- start_offset++;
- #elif PCRE2_CODE_UNIT_WIDTH == 16
- while (start_offset < length &&
- (subject[start_offset] & 0xfc00) == 0xdc00)
- start_offset++;
- #endif
- }
- /* Copy what we have advanced past (unless not required), reset the special
- global options, and continue to the next match. */
- fraglength = start_offset - save_start;
- if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
- goptions = 0;
- continue;
- }
- /* Handle a successful match. Matches that use \K to end before they start
- or start before the current point in the subject are not supported. */
- if (ovector[1] < ovector[0] || ovector[0] < start_offset)
- {
- rc = PCRE2_ERROR_BADSUBSPATTERN;
- goto EXIT;
- }
- /* Check for the same match as previous. This is legitimate after matching an
- empty string that starts after the initial match offset. We have tried again
- at the match point in case the pattern is one like /(?<=\G.)/ which can never
- match at its starting point, so running the match achieves the bumpalong. If
- we do get the same (null) match at the original match point, it isn't such a
- pattern, so we now do the empty string magic. In all other cases, a repeat
- match should never occur. */
- if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
- {
- if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
- {
- goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
- ovecsave[2] = start_offset;
- continue; /* Back to the top of the loop */
- }
- rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
- goto EXIT;
- }
- /* Count substitutions with a paranoid check for integer overflow; surely no
- real call to this function would ever hit this! */
- if (subs == INT_MAX)
- {
- rc = PCRE2_ERROR_TOOMANYREPLACE;
- goto EXIT;
- }
- subs++;
- /* Copy the text leading up to the match (unless not required), and remember
- where the insert begins and how many ovector pairs are set. */
- if (rc == 0) rc = ovector_count;
- fraglength = ovector[0] - start_offset;
- if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
- scb.output_offsets[0] = buff_offset;
- scb.oveccount = rc;
- /* Process the replacement string. If the entire replacement is literal, just
- copy it with length check. */
- ptr = replacement;
- if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
- {
- CHECKMEMCPY(ptr, rlength);
- }
- /* Within a non-literal replacement, which must be scanned character by
- character, local literal mode can be set by \Q, but only in extended mode
- when backslashes are being interpreted. In extended mode we must handle
- nested substrings that are to be reprocessed. */
- else for (;;)
- {
- uint32_t ch;
- unsigned int chlen;
- /* If at the end of a nested substring, pop the stack. */
- if (ptr >= repend)
- {
- if (ptrstackptr == 0) break; /* End of replacement string */
- repend = ptrstack[--ptrstackptr];
- ptr = ptrstack[--ptrstackptr];
- continue;
- }
- /* Handle the next character */
- if (escaped_literal)
- {
- if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
- {
- escaped_literal = FALSE;
- ptr += 2;
- continue;
- }
- goto LOADLITERAL;
- }
- /* Not in literal mode. */
- if (*ptr == CHAR_DOLLAR_SIGN)
- {
- int group, n;
- uint32_t special = 0;
- BOOL inparens;
- BOOL star;
- PCRE2_SIZE sublength;
- PCRE2_SPTR text1_start = NULL;
- PCRE2_SPTR text1_end = NULL;
- PCRE2_SPTR text2_start = NULL;
- PCRE2_SPTR text2_end = NULL;
- PCRE2_UCHAR next;
- PCRE2_UCHAR name[33];
- if (++ptr >= repend) goto BAD;
- if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
- group = -1;
- n = 0;
- inparens = FALSE;
- star = FALSE;
- if (next == CHAR_LEFT_CURLY_BRACKET)
- {
- if (++ptr >= repend) goto BAD;
- next = *ptr;
- inparens = TRUE;
- }
- if (next == CHAR_ASTERISK)
- {
- if (++ptr >= repend) goto BAD;
- next = *ptr;
- star = TRUE;
- }
- if (!star && next >= CHAR_0 && next <= CHAR_9)
- {
- group = next - CHAR_0;
- while (++ptr < repend)
- {
- next = *ptr;
- if (next < CHAR_0 || next > CHAR_9) break;
- group = group * 10 + next - CHAR_0;
- /* A check for a number greater than the hightest captured group
- is sufficient here; no need for a separate overflow check. If unknown
- groups are to be treated as unset, just skip over any remaining
- digits and carry on. */
- if (group > code->top_bracket)
- {
- if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
- {
- while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
- break;
- }
- else
- {
- rc = PCRE2_ERROR_NOSUBSTRING;
- goto PTREXIT;
- }
- }
- }
- }
- else
- {
- const uint8_t *ctypes = code->tables + ctypes_offset;
- while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
- {
- name[n++] = next;
- if (n > 32) goto BAD;
- if (++ptr >= repend) break;
- next = *ptr;
- }
- if (n == 0) goto BAD;
- name[n] = 0;
- }
- /* In extended mode we recognize ${name:+set text:unset text} and
- ${name:-default text}. */
- if (inparens)
- {
- if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
- !star && ptr < repend - 2 && next == CHAR_COLON)
- {
- special = *(++ptr);
- if (special != CHAR_PLUS && special != CHAR_MINUS)
- {
- rc = PCRE2_ERROR_BADSUBSTITUTION;
- goto PTREXIT;
- }
- text1_start = ++ptr;
- rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
- if (rc != 0) goto PTREXIT;
- text1_end = ptr;
- if (special == CHAR_PLUS && *ptr == CHAR_COLON)
- {
- text2_start = ++ptr;
- rc = find_text_end(code, &ptr, repend, TRUE);
- if (rc != 0) goto PTREXIT;
- text2_end = ptr;
- }
- }
- else
- {
- if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
- {
- rc = PCRE2_ERROR_REPMISSINGBRACE;
- goto PTREXIT;
- }
- }
- ptr++;
- }
- /* Have found a syntactically correct group number or name, or *name.
- Only *MARK is currently recognized. */
- if (star)
- {
- if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
- {
- PCRE2_SPTR mark = pcre2_get_mark(match_data);
- if (mark != NULL)
- {
- PCRE2_SPTR mark_start = mark;
- while (*mark != 0) mark++;
- fraglength = mark - mark_start;
- CHECKMEMCPY(mark_start, fraglength);
- }
- }
- else goto BAD;
- }
- /* Substitute the contents of a group. We don't use substring_copy
- functions any more, in order to support case forcing. */
- else
- {
- PCRE2_SPTR subptr, subptrend;
- /* Find a number for a named group. In case there are duplicate names,
- search for the first one that is set. If the name is not found when
- PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
- non-existent group. */
- if (group < 0)
- {
- PCRE2_SPTR first, last, entry;
- rc = pcre2_substring_nametable_scan(code, name, &first, &last);
- if (rc == PCRE2_ERROR_NOSUBSTRING &&
- (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
- {
- group = code->top_bracket + 1;
- }
- else
- {
- if (rc < 0) goto PTREXIT;
- for (entry = first; entry <= last; entry += rc)
- {
- uint32_t ng = GET2(entry, 0);
- if (ng < ovector_count)
- {
- if (group < 0) group = ng; /* First in ovector */
- if (ovector[ng*2] != PCRE2_UNSET)
- {
- group = ng; /* First that is set */
- break;
- }
- }
- }
- /* If group is still negative, it means we did not find a group
- that is in the ovector. Just set the first group. */
- if (group < 0) group = GET2(first, 0);
- }
- }
- /* We now have a group that is identified by number. Find the length of
- the captured string. If a group in a non-special substitution is unset
- when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
- rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
- if (rc < 0)
- {
- if (rc == PCRE2_ERROR_NOSUBSTRING &&
- (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
- {
- rc = PCRE2_ERROR_UNSET;
- }
- if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
- if (special == 0) /* Plain substitution */
- {
- if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
- goto PTREXIT; /* Else error */
- }
- }
- /* If special is '+' we have a 'set' and possibly an 'unset' text,
- both of which are reprocessed when used. If special is '-' we have a
- default text for when the group is unset; it must be reprocessed. */
- if (special != 0)
- {
- if (special == CHAR_MINUS)
- {
- if (rc == 0) goto LITERAL_SUBSTITUTE;
- text2_start = text1_start;
- text2_end = text1_end;
- }
- if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
- ptrstack[ptrstackptr++] = ptr;
- ptrstack[ptrstackptr++] = repend;
- if (rc == 0)
- {
- ptr = text1_start;
- repend = text1_end;
- }
- else
- {
- ptr = text2_start;
- repend = text2_end;
- }
- continue;
- }
- /* Otherwise we have a literal substitution of a group's contents. */
- LITERAL_SUBSTITUTE:
- subptr = subject + ovector[group*2];
- subptrend = subject + ovector[group*2 + 1];
- /* Substitute a literal string, possibly forcing alphabetic case. */
- while (subptr < subptrend)
- {
- GETCHARINCTEST(ch, subptr);
- if (forcecase != 0)
- {
- #ifdef SUPPORT_UNICODE
- if (utf || ucp)
- {
- uint32_t type = UCD_CHARTYPE(ch);
- if (PRIV(ucp_gentype)[type] == ucp_L &&
- type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
- ch = UCD_OTHERCASE(ch);
- }
- else
- #endif
- {
- if (((code->tables + cbits_offset +
- ((forcecase > 0)? cbit_upper:cbit_lower)
- )[ch/8] & (1u << (ch%8))) == 0)
- ch = (code->tables + fcc_offset)[ch];
- }
- forcecase = forcecasereset;
- }
- #ifdef SUPPORT_UNICODE
- if (utf) chlen = PRIV(ord2utf)(ch, temp); else
- #endif
- {
- temp[0] = ch;
- chlen = 1;
- }
- CHECKMEMCPY(temp, chlen);
- }
- }
- }
- /* Handle an escape sequence in extended mode. We can use check_escape()
- to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
- the case-forcing escapes are not supported in pcre2_compile() so must be
- recognized here. */
- else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
- *ptr == CHAR_BACKSLASH)
- {
- int errorcode;
- if (ptr < repend - 1) switch (ptr[1])
- {
- case CHAR_L:
- forcecase = forcecasereset = -1;
- ptr += 2;
- continue;
- case CHAR_l:
- forcecase = -1;
- forcecasereset = 0;
- ptr += 2;
- continue;
- case CHAR_U:
- forcecase = forcecasereset = 1;
- ptr += 2;
- continue;
- case CHAR_u:
- forcecase = 1;
- forcecasereset = 0;
- ptr += 2;
- continue;
- default:
- break;
- }
- ptr++; /* Point after \ */
- rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
- code->overall_options, code->extra_options, FALSE, NULL);
- if (errorcode != 0) goto BADESCAPE;
- switch(rc)
- {
- case ESC_E:
- forcecase = forcecasereset = 0;
- continue;
- case ESC_Q:
- escaped_literal = TRUE;
- continue;
- case 0: /* Data character */
- goto LITERAL;
- default:
- goto BADESCAPE;
- }
- }
- /* Handle a literal code unit */
- else
- {
- LOADLITERAL:
- GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
- LITERAL:
- if (forcecase != 0)
- {
- #ifdef SUPPORT_UNICODE
- if (utf || ucp)
- {
- uint32_t type = UCD_CHARTYPE(ch);
- if (PRIV(ucp_gentype)[type] == ucp_L &&
- type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
- ch = UCD_OTHERCASE(ch);
- }
- else
- #endif
- {
- if (((code->tables + cbits_offset +
- ((forcecase > 0)? cbit_upper:cbit_lower)
- )[ch/8] & (1u << (ch%8))) == 0)
- ch = (code->tables + fcc_offset)[ch];
- }
- forcecase = forcecasereset;
- }
- #ifdef SUPPORT_UNICODE
- if (utf) chlen = PRIV(ord2utf)(ch, temp); else
- #endif
- {
- temp[0] = ch;
- chlen = 1;
- }
- CHECKMEMCPY(temp, chlen);
- } /* End handling a literal code unit */
- } /* End of loop for scanning the replacement. */
- /* The replacement has been copied to the output, or its size has been
- remembered. Do the callout if there is one and we have done an actual
- replacement. */
- if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
- {
- scb.subscount = subs;
- scb.output_offsets[1] = buff_offset;
- rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
- /* A non-zero return means cancel this substitution. Instead, copy the
- matched string fragment. */
- if (rc != 0)
- {
- PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
- PCRE2_SIZE oldlength = ovector[1] - ovector[0];
- buff_offset -= newlength;
- lengthleft += newlength;
- if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
- /* A negative return means do not do any more. */
- if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
- }
- }
- /* Save the details of this match. See above for how this data is used. If we
- matched an empty string, do the magic for global matches. Update the start
- offset to point to the rest of the subject string. If we re-used an existing
- match for the first match, switch to the internal match data block. */
- ovecsave[0] = ovector[0];
- ovecsave[1] = ovector[1];
- ovecsave[2] = start_offset;
- goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
- PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
- start_offset = ovector[1];
- } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
- /* Copy the rest of the subject unless not required, and terminate the output
- with a binary zero. */
- if (!replacement_only)
- {
- fraglength = length - start_offset;
- CHECKMEMCPY(subject + start_offset, fraglength);
- }
- temp[0] = 0;
- CHECKMEMCPY(temp, 1);
- /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
- and matching has carried on after a full buffer, in order to compute the length
- needed. Otherwise, an overflow generates an immediate error return. */
- if (overflowed)
- {
- rc = PCRE2_ERROR_NOMEMORY;
- *blength = buff_length + extra_needed;
- }
- /* After a successful execution, return the number of substitutions and set the
- length of buffer used, excluding the trailing zero. */
- else
- {
- rc = subs;
- *blength = buff_offset - 1;
- }
- EXIT:
- if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
- else match_data->rc = rc;
- return rc;
- NOROOM:
- rc = PCRE2_ERROR_NOMEMORY;
- goto EXIT;
- BAD:
- rc = PCRE2_ERROR_BADREPLACEMENT;
- goto PTREXIT;
- BADESCAPE:
- rc = PCRE2_ERROR_BADREPESCAPE;
- PTREXIT:
- *blength = (PCRE2_SIZE)(ptr - replacement);
- goto EXIT;
- }
- /* End of pcre2_substitute.c */
|