return c;
}
+static int calculateCompiledPatternLengthAndFlags(const pcre_char* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase, compile_data& compile_block, ErrorCode errorcode)
+{
+ /* Make a pass over the pattern to compute the
+ amount of store required to hold the compiled code. This does not have to be
+ perfect as long as errors are overestimates. At the same time we can detect any
+ flag settings right at the start, and extract them. Make an attempt to correct
+ for any counted white space if an "extended" flag setting appears late in the
+ pattern. We can't be so clever for #-comments. */
+
+ int length = 1 + LINK_SIZE; /* For initial BRA plus length */
+ int branch_extra = 0;
+ int branch_newextra;
+ int lastitemlength = 0;
+ BOOL class_utf8;
+ BOOL capturing;
+ unsigned int brastackptr = 0;
+ int brastack[BRASTACK_SIZE];
+ uschar bralenstack[BRASTACK_SIZE];
+ int item_count = -1;
+ int bracount = 0;
+
+ const pcre_uchar* ptr = (const pcre_uchar*)(pattern - 1);
+ const pcre_uchar* patternEnd = (const pcre_uchar*)(pattern + patternLength);
+
+ while (++ptr < patternEnd)
+ {
+ int min = 0, max = 0;
+ int class_optcount;
+ int bracket_length;
+ int duplength;
+
+ int c = *ptr;
+
+ item_count++; /* Is zero for the first non-comment item */
+
+ switch(c)
+ {
+ /* A backslashed item may be an escaped data character or it may be a
+ character type. */
+
+ case '\\':
+ c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
+ if (errorcode != 0)
+ return -1;;
+
+ lastitemlength = 1; /* Default length of last item for repeats */
+
+ if (c >= 0) /* Data character */
+ {
+ length += 2; /* For a one-byte character */
+
+ if (c > 127)
+ {
+ int i;
+ for (i = 0; i < _pcre_utf8_table1_size; i++)
+ if (c <= _pcre_utf8_table1[i]) break;
+ length += i;
+ lastitemlength += i;
+ }
+
+ continue;
+ }
+
+ /* Other escapes need one byte */
+
+ length++;
+
+ /* A back reference needs an additional 2 bytes, plus either one or 5
+ bytes for a repeat. We also need to keep the value of the highest
+ back reference. */
+
+ if (c <= -ESC_REF)
+ {
+ int refnum = -c - ESC_REF;
+ compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
+ if (refnum > compile_block.top_backref)
+ compile_block.top_backref = refnum;
+ length += 2; /* For single back reference */
+ if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
+ {
+ ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
+ if (errorcode != 0) return -1;;
+ if ((min == 0 && (max == 1 || max == -1)) ||
+ (min == 1 && max == -1))
+ length++;
+ else length += 5;
+ if (ptr[1] == '?') ptr++;
+ ptr++;
+ }
+ }
+ continue;
+
+ case '^': /* Single-byte metacharacters */
+ case '.':
+ case '$':
+ length++;
+ lastitemlength = 1;
+ continue;
+
+ case '*': /* These repeats won't be after brackets; */
+ case '+': /* those are handled separately */
+ case '?':
+ length++;
+ goto POSESSIVE; /* A few lines below */
+
+ /* This covers the cases of braced repeats after a single char, metachar,
+ class, or back reference. */
+
+ case '{':
+ if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR;
+ ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
+ if (errorcode != 0) return -1;;
+
+ /* These special cases just insert one extra opcode */
+
+ if ((min == 0 && (max == 1 || max == -1)) ||
+ (min == 1 && max == -1))
+ length++;
+
+ /* These cases might insert additional copies of a preceding character. */
+
+ else
+ {
+ if (min != 1)
+ {
+ length -= lastitemlength; /* Uncount the original char or metachar */
+ if (min > 0) length += 3 + lastitemlength;
+ }
+ length += lastitemlength + ((max > 0)? 3 : 1);
+ }
+
+ if (ptr[1] == '?') ptr++; /* Needs no extra length */
+
+ POSESSIVE: /* Test for possessive quantifier */
+ if (ptr[1] == '+')
+ {
+ ptr++;
+ length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
+ }
+ continue;
+
+ /* An alternation contains an offset to the next branch or ket. If any ims
+ options changed in the previous branch(es), and/or if we are in a
+ lookbehind assertion, extra space will be needed at the start of the
+ branch. This is handled by branch_extra. */
+
+ case '|':
+ length += 1 + LINK_SIZE + branch_extra;
+ continue;
+
+ /* A character class uses 33 characters provided that all the character
+ values are less than 256. Otherwise, it uses a bit map for low valued
+ characters, and individual items for others. Don't worry about character
+ types that aren't allowed in classes - they'll get picked up during the
+ compile. A character class that contains only one single-byte character
+ uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
+ where we can. (In UTF-8 mode we can do this only for chars < 128.) */
+
+ case '[':
+ if (*(++ptr) == '^')
+ {
+ class_optcount = 10; /* Greater than one */
+ ptr++;
+ }
+ else class_optcount = 0;
+
+ class_utf8 = false;
+
+ for (; ptr < patternEnd && *ptr != ']'; ++ptr)
+ {
+ /* Check for escapes */
+
+ if (*ptr == '\\')
+ {
+ c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
+ if (errorcode != 0) return -1;;
+
+ /* \b is backspace inside a class; \X is literal */
+
+ if (-c == ESC_b) c = '\b';
+
+ /* Handle escapes that turn into characters */
+
+ if (c >= 0) goto NON_SPECIAL_CHARACTER;
+
+ /* Escapes that are meta-things. The normal ones just affect the
+ bit map, but Unicode properties require an XCLASS extended item. */
+
+ else
+ {
+ class_optcount = 10; /* \d, \s etc; make sure > 1 */
+ }
+ }
+
+ /* Anything else increments the possible optimization count. We have to
+ detect ranges here so that we can compute the number of extra ranges for
+ caseless wide characters when UCP support is available. If there are wide
+ characters, we are going to have to use an XCLASS, even for single
+ characters. */
+
+ else
+ {
+ int d;
+
+ {
+ int extra = 0;
+ GETCHARLENEND(c, ptr, patternEnd, extra);
+ ptr += extra;
+ }
+
+ /* Come here from handling \ above when it escapes to a char value */
+
+ NON_SPECIAL_CHARACTER:
+ class_optcount++;
+
+ d = -1;
+ if (ptr + 1 < patternEnd && ptr[1] == '-')
+ {
+ pcre_uchar const *hyptr = ptr++;
+ if (ptr + 1 < patternEnd && ptr[1] == '\\')
+ {
+ ptr++;
+ d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
+ if (errorcode != 0) return -1;;
+ if (-d == ESC_b) d = '\b'; /* backspace */
+ }
+ else if (ptr + 1 < patternEnd && ptr[1] != ']')
+ {
+ ptr++;
+ {
+ int extra = 0;
+ GETCHARLENEND(d, ptr, patternEnd, extra);
+ ptr += extra;
+ }
+ }
+ if (d < 0) ptr = hyptr; /* go back to hyphen as data */
+ }
+
+ /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
+ 127 for caseless matching, we will need to use an XCLASS. */
+
+ if (d >= 0)
+ {
+ class_optcount = 10; /* Ensure > 1 */
+ if (d < c)
+ {
+ errorcode = ERR8;
+ return -1;;
+ }
+
+ if ((d > 255 || (ignoreCase && d > 127)))
+ {
+ uschar buffer[6];
+ if (!class_utf8) /* Allow for XCLASS overhead */
+ {
+ class_utf8 = true;
+ length += LINK_SIZE + 2;
+ }
+
+ /* If we have UCP support, find out how many extra ranges are
+ needed to map the other case of characters within this range. We
+ have to mimic the range optimization here, because extending the
+ range upwards might push d over a boundary that makes is use
+ another byte in the UTF-8 representation. */
+
+ if (ignoreCase)
+ {
+ int occ, ocd;
+ int cc = c;
+ int origd = d;
+ while (get_othercase_range(&cc, origd, &occ, &ocd))
+ {
+ if (occ >= c && ocd <= d) continue; /* Skip embedded */
+
+ if (occ < c && ocd >= c - 1) /* Extend the basic range */
+ { /* if there is overlap, */
+ c = occ; /* noting that if occ < c */
+ continue; /* we can't have ocd > d */
+ } /* because a subrange is */
+ if (ocd > d && occ <= d + 1) /* always shorter than */
+ { /* the basic range. */
+ d = ocd;
+ continue;
+ }
+
+ /* An extra item is needed */
+
+ length += 1 + _pcre_ord2utf8(occ, buffer) +
+ ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
+ }
+ }
+
+ /* The length of the (possibly extended) range */
+
+ length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
+ }
+
+ }
+
+ /* We have a single character. There is nothing to be done unless we
+ are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
+ allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
+ support. */
+
+ else
+ {
+ if ((c > 255 || (ignoreCase && c > 127)))
+ {
+ uschar buffer[6];
+ class_optcount = 10; /* Ensure > 1 */
+ if (!class_utf8) /* Allow for XCLASS overhead */
+ {
+ class_utf8 = true;
+ length += LINK_SIZE + 2;
+ }
+ length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
+ }
+ }
+ }
+ }
+
+ if (ptr >= patternEnd) /* Missing terminating ']' */
+ {
+ errorcode = ERR6;
+ return -1;;
+ }
+
+ /* We can optimize when there was only one optimizable character. Repeats
+ for positive and negated single one-byte chars are handled by the general
+ code. Here, we handle repeats for the class opcodes. */
+
+ if (class_optcount == 1) length += 3; else
+ {
+ length += 33;
+
+ /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
+ we also need extra for wrapping the whole thing in a sub-pattern. */
+
+ if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
+ {
+ ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
+ if (errorcode != 0) return -1;;
+ if ((min == 0 && (max == 1 || max == -1)) ||
+ (min == 1 && max == -1))
+ length++;
+ else length += 5;
+ if (ptr + 1 < patternEnd && ptr[1] == '+')
+ {
+ ptr++;
+ length += 2 + 2*LINK_SIZE;
+ }
+ else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++;
+ }
+ }
+ continue;
+
+ /* Brackets may be genuine groups or special things */
+
+ case '(':
+ branch_newextra = 0;
+ bracket_length = 1 + LINK_SIZE;
+ capturing = false;
+
+ /* Handle special forms of bracket, which all start (? */
+
+ if (ptr + 1 < patternEnd && ptr[1] == '?')
+ {
+ switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0))
+ {
+ /* Non-referencing groups and lookaheads just move the pointer on, and
+ then behave like a non-special bracket, except that they don't increment
+ the count of extracting brackets. Ditto for the "once only" bracket,
+ which is in Perl from version 5.005. */
+
+ case ':':
+ case '=':
+ case '!':
+ ptr += 2;
+ break;
+
+ /* Else loop checking valid options until ) is met. Anything else is an
+ error. If we are without any brackets, i.e. at top level, the settings
+ act as if specified in the options, so massage the options immediately.
+ This is for backward compatibility with Perl 5.004. */
+
+ default:
+ errorcode = ERR12;
+ return -1;;
+ }
+ }
+
+ else capturing = 1;
+
+ /* Capturing brackets must be counted so we can process escapes in a
+ Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
+ an additional 3 bytes of memory per capturing bracket. */
+
+ if (capturing)
+ {
+ bracount++;
+ if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
+ }
+
+ /* Save length for computing whole length at end if there's a repeat that
+ requires duplication of the group. Also save the current value of
+ branch_extra, and start the new group with the new value. If non-zero, this
+ will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
+
+ if (brastackptr >= sizeof(brastack)/sizeof(int))
+ {
+ errorcode = ERR17;
+ return -1;;
+ }
+
+ bralenstack[brastackptr] = branch_extra;
+ branch_extra = branch_newextra;
+
+ brastack[brastackptr++] = length;
+ length += bracket_length;
+ continue;
+
+ /* Handle ket. Look for subsequent max/min; for certain sets of values we
+ have to replicate this bracket up to that many times. If brastackptr is
+ 0 this is an unmatched bracket which will generate an error, but take care
+ not to try to access brastack[-1] when computing the length and restoring
+ the branch_extra value. */
+
+ case ')':
+ length += 1 + LINK_SIZE;
+ if (brastackptr > 0)
+ {
+ duplength = length - brastack[--brastackptr];
+ branch_extra = bralenstack[brastackptr];
+ }
+ else duplength = 0;
+
+ /* Leave ptr at the final char; for read_repeat_counts this happens
+ automatically; for the others we need an increment. */
+
+ if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd))
+ {
+ ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
+ if (errorcode != 0) return -1;;
+ }
+ else if (c == '*') { min = 0; max = -1; ptr++; }
+ else if (c == '+') { min = 1; max = -1; ptr++; }
+ else if (c == '?') { min = 0; max = 1; ptr++; }
+ else { min = 1; max = 1; }
+
+ /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
+ group, and if the maximum is greater than zero, we have to replicate
+ maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
+ bracket set. */
+
+ if (min == 0)
+ {
+ length++;
+ if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
+ }
+
+ /* When the minimum is greater than zero, we have to replicate up to
+ minval-1 times, with no additions required in the copies. Then, if there
+ is a limited maximum we have to replicate up to maxval-1 times allowing
+ for a BRAZERO item before each optional copy and nesting brackets for all
+ but one of the optional copies. */
+
+ else
+ {
+ length += (min - 1) * duplength;
+ if (max > min) /* Need this test as max=-1 means no limit */
+ length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
+ - (2 + 2*LINK_SIZE);
+ }
+
+ /* Allow space for once brackets for "possessive quantifier" */
+
+ if (ptr + 1 < patternEnd && ptr[1] == '+')
+ {
+ ptr++;
+ length += 2 + 2*LINK_SIZE;
+ }
+ continue;
+
+ /* Non-special character. It won't be space or # in extended mode, so it is
+ always a genuine character. If we are in a \Q...\E sequence, check for the
+ end; if not, we have a literal. */
+
+ default:
+ NORMAL_CHAR:
+
+ length += 2; /* For a one-byte character */
+ lastitemlength = 1; /* Default length of last item for repeats */
+
+ /* In UTF-8 mode, check for additional bytes. */
+
+ if (c > 127)
+ {
+ if (IS_LEADING_SURROGATE(c))
+ {
+ c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
+ ++ptr;
+ }
+
+ {
+ int i;
+ for (i = 0; i < _pcre_utf8_table1_size; i++)
+ if (c <= _pcre_utf8_table1[i]) break;
+ length += i;
+ lastitemlength += i;
+ }
+ }
+
+ continue;
+ }
+ }
+
+ length += 2 + LINK_SIZE; /* For final KET and END */
+ return length;
+}
+#ifdef DEBUG
+static void printCompiledRegExp(real_pcre* re, int length)
+{
+ printf("Length = %d top_bracket = %d top_backref = %d\n",
+ length, re->top_bracket, re->top_backref);
+
+ if (re->options) {
+ printf("%s%s%s\n",
+ ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
+ ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
+ ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "");
+ }
+
+ if (re->options & PCRE_FIRSTSET) {
+ char ch = re->first_byte & 255;
+ const char* caseless = ((re->first_byte & REQ_CASELESS) == 0) ? "" : " (caseless)";
+ if (isASCIIAlphanumeric(ch))
+ printf("First char = %c%s\n", ch, caseless);
+ else
+ printf("First char = \\x%02x%s\n", ch, caseless);
+ }
+
+ if (re->options & PCRE_REQCHSET) {
+ char ch = re->req_byte & 255;
+ const char* caseless = ((re->req_byte & REQ_CASELESS) == 0) ? "" : " (caseless)";
+ if (isASCIIAlphanumeric(ch))
+ printf("Req char = %c%s\n", ch, caseless);
+ else
+ printf("Req char = \\x%02x%s\n", ch, caseless);
+ }
+
+ // This debugging function has been removed from JavaScriptCore's PCRE
+ //pcre_printint(re, stdout);
+}
+#endif
/*************************************************
* Compile a Regular Expression *
with errorptr and erroroffset set
*/
+static pcre* returnError(ErrorCode errorcode, const char** errorptr)
+{
+ *errorptr = error_text(errorcode);
+ return 0;
+}
+
pcre *
jsRegExpCompile(const pcre_char* pattern, int patternLength,
- JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
- unsigned* numSubpatterns, const char** errorptr)
+ JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
+ unsigned* numSubpatterns, const char** errorptr)
{
-real_pcre *re;
-int length = 1 + LINK_SIZE; /* For initial BRA plus length */
-int c, firstbyte, reqbyte;
-int bracount = 0;
-int branch_extra = 0;
-int branch_newextra;
-int item_count = -1;
-int name_count = 0;
-int max_name_size = 0;
-int lastitemlength = 0;
-ErrorCode errorcode = ERR0;
-BOOL class_utf8;
-BOOL capturing;
-unsigned int brastackptr = 0;
-size_t size;
-uschar *code;
-const uschar *codestart;
-const pcre_uchar *ptr;
-const pcre_uchar *patternEnd;
-compile_data compile_block;
-int brastack[BRASTACK_SIZE];
-uschar bralenstack[BRASTACK_SIZE];
-
-/* We can't pass back an error message if errorptr is NULL; I guess the best we
-can do is just return NULL, but we can set a code value if there is a code
-pointer. */
-
-if (errorptr == NULL)
- {
- return NULL;
- }
-
-*errorptr = NULL;
-
-/* Set up pointers to the individual character tables */
-
-compile_block.lcc = _pcre_default_tables + lcc_offset;
-compile_block.fcc = _pcre_default_tables + fcc_offset;
-compile_block.cbits = _pcre_default_tables + cbits_offset;
-compile_block.ctypes = _pcre_default_tables + ctypes_offset;
-
-/* Maximum back reference and backref bitmap. This is updated for numeric
-references during the first pass, but for named references during the actual
-compile pass. The bitmap records up to 31 back references to help in deciding
-whether (.*) can be treated as anchored or not. */
-
-compile_block.top_backref = 0;
-compile_block.backref_map = 0;
-
-/* Reflect pattern for debugging output */
-
-DPRINTF(("------------------------------------------------------------------\n"));
-
-/* The first thing to do is to make a pass over the pattern to compute the
-amount of store required to hold the compiled code. This does not have to be
-perfect as long as errors are overestimates. At the same time we can detect any
-flag settings right at the start, and extract them. Make an attempt to correct
-for any counted white space if an "extended" flag setting appears late in the
-pattern. We can't be so clever for #-comments. */
-
-ptr = (const pcre_uchar *)(pattern - 1);
-patternEnd = (const pcre_uchar *)(pattern + patternLength);
-
-while (++ptr < patternEnd)
- {
- int min = 0, max = 0;
- int class_optcount;
- int bracket_length;
- int duplength;
-
- c = *ptr;
-
- item_count++; /* Is zero for the first non-comment item */
-
- switch(c)
- {
- /* A backslashed item may be an escaped data character or it may be a
- character type. */
-
- case '\\':
- c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
-
- lastitemlength = 1; /* Default length of last item for repeats */
-
- if (c >= 0) /* Data character */
- {
- length += 2; /* For a one-byte character */
-
- if (c > 127)
- {
- int i;
- for (i = 0; i < _pcre_utf8_table1_size; i++)
- if (c <= _pcre_utf8_table1[i]) break;
- length += i;
- lastitemlength += i;
- }
-
- continue;
- }
-
- /* Other escapes need one byte */
-
- length++;
-
- /* A back reference needs an additional 2 bytes, plus either one or 5
- bytes for a repeat. We also need to keep the value of the highest
- back reference. */
-
- if (c <= -ESC_REF)
- {
- int refnum = -c - ESC_REF;
- compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
- if (refnum > compile_block.top_backref)
- compile_block.top_backref = refnum;
- length += 2; /* For single back reference */
- if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else length += 5;
- if (ptr[1] == '?') ptr++;
- ptr++;
- }
- }
- continue;
-
- case '^': /* Single-byte metacharacters */
- case '.':
- case '$':
- length++;
- lastitemlength = 1;
- continue;
-
- case '*': /* These repeats won't be after brackets; */
- case '+': /* those are handled separately */
- case '?':
- length++;
- goto POSESSIVE; /* A few lines below */
-
- /* This covers the cases of braced repeats after a single char, metachar,
- class, or back reference. */
-
- case '{':
- if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR;
- ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
-
- /* These special cases just insert one extra opcode */
-
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
-
- /* These cases might insert additional copies of a preceding character. */
-
- else
- {
- if (min != 1)
- {
- length -= lastitemlength; /* Uncount the original char or metachar */
- if (min > 0) length += 3 + lastitemlength;
- }
- length += lastitemlength + ((max > 0)? 3 : 1);
- }
-
- if (ptr[1] == '?') ptr++; /* Needs no extra length */
-
- POSESSIVE: /* Test for possessive quantifier */
- if (ptr[1] == '+')
- {
- ptr++;
- length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
- }
- continue;
-
- /* An alternation contains an offset to the next branch or ket. If any ims
- options changed in the previous branch(es), and/or if we are in a
- lookbehind assertion, extra space will be needed at the start of the
- branch. This is handled by branch_extra. */
-
- case '|':
- length += 1 + LINK_SIZE + branch_extra;
- continue;
-
- /* A character class uses 33 characters provided that all the character
- values are less than 256. Otherwise, it uses a bit map for low valued
- characters, and individual items for others. Don't worry about character
- types that aren't allowed in classes - they'll get picked up during the
- compile. A character class that contains only one single-byte character
- uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
- where we can. (In UTF-8 mode we can do this only for chars < 128.) */
-
- case '[':
- if (*(++ptr) == '^')
- {
- class_optcount = 10; /* Greater than one */
- ptr++;
- }
- else class_optcount = 0;
-
- class_utf8 = false;
-
- for (; ptr < patternEnd && *ptr != ']'; ++ptr)
- {
- /* Check for escapes */
-
- if (*ptr == '\\')
- {
- c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
-
- /* \b is backspace inside a class; \X is literal */
-
- if (-c == ESC_b) c = '\b';
-
- /* Handle escapes that turn into characters */
-
- if (c >= 0) goto NON_SPECIAL_CHARACTER;
-
- /* Escapes that are meta-things. The normal ones just affect the
- bit map, but Unicode properties require an XCLASS extended item. */
-
- else
- {
- class_optcount = 10; /* \d, \s etc; make sure > 1 */
- }
- }
-
- /* Anything else increments the possible optimization count. We have to
- detect ranges here so that we can compute the number of extra ranges for
- caseless wide characters when UCP support is available. If there are wide
- characters, we are going to have to use an XCLASS, even for single
- characters. */
-
- else
+ /* We can't pass back an error message if errorptr is NULL; I guess the best we
+ can do is just return NULL, but we can set a code value if there is a code
+ pointer. */
+ if (!errorptr)
+ return 0;
+
+ *errorptr = NULL;
+
+ /* Set up pointers to the individual character tables */
+
+ compile_data compile_block;
+
+ ErrorCode errorcode = ERR0;
+ int length = calculateCompiledPatternLengthAndFlags(pattern, patternLength, ignoreCase, compile_block, errorcode);
+ if (errorcode)
+ return returnError(errorcode, errorptr);
+
+ if (length > MAX_PATTERN_SIZE)
+ return returnError(ERR16, errorptr);
+
+ /* Compute the size of data block needed and get it. */
+
+ size_t size = length + sizeof(real_pcre);
+ real_pcre* re = reinterpret_cast<real_pcre*>(new char[size]);
+
+ if (!re)
+ return returnError(ERR13, errorptr);
+
+ /* Put in the magic number, and save the sizes, options, and character table
+ pointer. NULL is used for the default character tables. The nullpad field is at
+ the end; it's there to help in the case when a regex compiled on a system with
+ 4-byte pointers is run on another with 8-byte pointers. */
+
+ re->size = (pcre_uint32)size;
+ re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0);
+
+ /* The starting points of the name/number translation table and of the code are
+ passed around in the compile data block. */
+
+ const uschar* codestart = (const uschar*)(re + 1);
+ compile_block.start_code = codestart;
+ compile_block.start_pattern = (const pcre_uchar*)pattern;
+
+ /* Set up a starting, non-extracting bracket, then compile the expression. On
+ error, errorcode will be set non-zero, so we don't need to look at the result
+ of the function here. */
+
+ const pcre_uchar* ptr = (const pcre_uchar*)pattern;
+ const pcre_uchar* patternEnd = pattern + patternLength;
+ uschar* code = (uschar*)codestart;
+ *code = OP_BRA;
+ int firstbyte, reqbyte;
+ int bracketCount = 0;
+ (void)compile_regex(re->options, &bracketCount, &code, &ptr,
+ patternEnd,
+ &errorcode, 0, &firstbyte, &reqbyte, &compile_block);
+ re->top_bracket = bracketCount;
+ re->top_backref = compile_block.top_backref;
+
+ /* If not reached end of pattern on success, there's an excess bracket. */
+
+ if (errorcode == 0 && ptr < patternEnd)
+ errorcode = ERR10;
+
+ /* Fill in the terminating state and check for disastrous overflow, but
+ if debugging, leave the test till after things are printed out. */
+
+ *code++ = OP_END;
+
+#ifndef DEBUG
+ if (code - codestart > length)
+ errorcode = ERR7;
+#endif
+
+ /* Give an error if there's back reference to a non-existent capturing
+ subpattern. */
+
+ if (re->top_backref > re->top_bracket)
+ errorcode = ERR15;
+
+ /* Failed to compile, or error while post-processing */
+
+ if (errorcode != ERR0) {
+ delete [] reinterpret_cast<char*>(re);
+ return returnError(errorcode, errorptr);
+ }
+
+ /* If the anchored option was not passed, set the flag if we can determine that
+ the pattern is anchored by virtue of ^ characters or \A or anything else (such
+ as starting with .* when DOTALL is set).
+
+ Otherwise, if we know what the first character has to be, save it, because that
+ speeds up unanchored matches no end. If not, see if we can set the
+ PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
+ start with ^. and also when all branches start with .* for non-DOTALL matches.
+ */
+
+ if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
+ re->options |= PCRE_ANCHORED;
+ else {
+ if (firstbyte < 0)
+ firstbyte = find_firstassertedchar(codestart, re->options, false);
+ if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
{
- int d;
-
- {
- int extra = 0;
- GETCHARLENEND(c, ptr, patternEnd, extra);
- ptr += extra;
- }
-
- /* Come here from handling \ above when it escapes to a char value */
-
- NON_SPECIAL_CHARACTER:
- class_optcount++;
-
- d = -1;
- if (ptr + 1 < patternEnd && ptr[1] == '-')
- {
- pcre_uchar const *hyptr = ptr++;
- if (ptr + 1 < patternEnd && ptr[1] == '\\')
- {
- ptr++;
- d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
- if (-d == ESC_b) d = '\b'; /* backspace */
- }
- else if (ptr + 1 < patternEnd && ptr[1] != ']')
- {
- ptr++;
- {
- int extra = 0;
- GETCHARLENEND(d, ptr, patternEnd, extra);
- ptr += extra;
- }
+ int ch = firstbyte & 255;
+ if (ch < 127) {
+ re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
+ compile_block.fcc[ch] == ch)? ch : firstbyte;
+ re->options |= PCRE_FIRSTSET;
}
- if (d < 0) ptr = hyptr; /* go back to hyphen as data */
- }
-
- /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
- 127 for caseless matching, we will need to use an XCLASS. */
-
- if (d >= 0)
- {
- class_optcount = 10; /* Ensure > 1 */
- if (d < c)
- {
- errorcode = ERR8;
- goto PCRE_ERROR_RETURN;
- }
-
- if ((d > 255 || (ignoreCase && d > 127)))
- {
- uschar buffer[6];
- if (!class_utf8) /* Allow for XCLASS overhead */
- {
- class_utf8 = true;
- length += LINK_SIZE + 2;
- }
-
- /* If we have UCP support, find out how many extra ranges are
- needed to map the other case of characters within this range. We
- have to mimic the range optimization here, because extending the
- range upwards might push d over a boundary that makes is use
- another byte in the UTF-8 representation. */
-
- if (ignoreCase)
- {
- int occ, ocd;
- int cc = c;
- int origd = d;
- while (get_othercase_range(&cc, origd, &occ, &ocd))
- {
- if (occ >= c && ocd <= d) continue; /* Skip embedded */
-
- if (occ < c && ocd >= c - 1) /* Extend the basic range */
- { /* if there is overlap, */
- c = occ; /* noting that if occ < c */
- continue; /* we can't have ocd > d */
- } /* because a subrange is */
- if (ocd > d && occ <= d + 1) /* always shorter than */
- { /* the basic range. */
- d = ocd;
- continue;
- }
-
- /* An extra item is needed */
-
- length += 1 + _pcre_ord2utf8(occ, buffer) +
- ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
- }
- }
-
- /* The length of the (possibly extended) range */
-
- length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
- }
-
- }
-
- /* We have a single character. There is nothing to be done unless we
- are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
- allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
- support. */
-
- else
- {
- if ((c > 255 || (ignoreCase && c > 127)))
- {
- uschar buffer[6];
- class_optcount = 10; /* Ensure > 1 */
- if (!class_utf8) /* Allow for XCLASS overhead */
- {
- class_utf8 = true;
- length += LINK_SIZE + 2;
- }
- length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
- }
- }
- }
- }
-
- if (ptr >= patternEnd) /* Missing terminating ']' */
- {
- errorcode = ERR6;
- goto PCRE_ERROR_RETURN;
- }
-
- /* We can optimize when there was only one optimizable character. Repeats
- for positive and negated single one-byte chars are handled by the general
- code. Here, we handle repeats for the class opcodes. */
-
- if (class_optcount == 1) length += 3; else
- {
- length += 33;
-
- /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
- we also need extra for wrapping the whole thing in a sub-pattern. */
-
- if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else length += 5;
- if (ptr + 1 < patternEnd && ptr[1] == '+')
- {
- ptr++;
- length += 2 + 2*LINK_SIZE;
- }
- else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++;
- }
- }
- continue;
-
- /* Brackets may be genuine groups or special things */
-
- case '(':
- branch_newextra = 0;
- bracket_length = 1 + LINK_SIZE;
- capturing = false;
-
- /* Handle special forms of bracket, which all start (? */
-
- if (ptr + 1 < patternEnd && ptr[1] == '?')
- {
- switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0))
- {
- /* Non-referencing groups and lookaheads just move the pointer on, and
- then behave like a non-special bracket, except that they don't increment
- the count of extracting brackets. Ditto for the "once only" bracket,
- which is in Perl from version 5.005. */
-
- case ':':
- case '=':
- case '!':
- ptr += 2;
- break;
-
- /* Else loop checking valid options until ) is met. Anything else is an
- error. If we are without any brackets, i.e. at top level, the settings
- act as if specified in the options, so massage the options immediately.
- This is for backward compatibility with Perl 5.004. */
-
- default:
- errorcode = ERR12;
- goto PCRE_ERROR_RETURN;
- }
- }
-
- else capturing = 1;
-
- /* Capturing brackets must be counted so we can process escapes in a
- Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
- an additional 3 bytes of memory per capturing bracket. */
-
- if (capturing)
- {
- bracount++;
- if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
- }
-
- /* Save length for computing whole length at end if there's a repeat that
- requires duplication of the group. Also save the current value of
- branch_extra, and start the new group with the new value. If non-zero, this
- will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
-
- if (brastackptr >= sizeof(brastack)/sizeof(int))
- {
- errorcode = ERR17;
- goto PCRE_ERROR_RETURN;
- }
-
- bralenstack[brastackptr] = branch_extra;
- branch_extra = branch_newextra;
-
- brastack[brastackptr++] = length;
- length += bracket_length;
- continue;
-
- /* Handle ket. Look for subsequent max/min; for certain sets of values we
- have to replicate this bracket up to that many times. If brastackptr is
- 0 this is an unmatched bracket which will generate an error, but take care
- not to try to access brastack[-1] when computing the length and restoring
- the branch_extra value. */
-
- case ')':
- length += 1 + LINK_SIZE;
- if (brastackptr > 0)
- {
- duplength = length - brastack[--brastackptr];
- branch_extra = bralenstack[brastackptr];
- }
- else duplength = 0;
-
- /* Leave ptr at the final char; for read_repeat_counts this happens
- automatically; for the others we need an increment. */
-
- if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
- }
- else if (c == '*') { min = 0; max = -1; ptr++; }
- else if (c == '+') { min = 1; max = -1; ptr++; }
- else if (c == '?') { min = 0; max = 1; ptr++; }
- else { min = 1; max = 1; }
-
- /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
- group, and if the maximum is greater than zero, we have to replicate
- maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
- bracket set. */
-
- if (min == 0)
- {
- length++;
- if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
- }
-
- /* When the minimum is greater than zero, we have to replicate up to
- minval-1 times, with no additions required in the copies. Then, if there
- is a limited maximum we have to replicate up to maxval-1 times allowing
- for a BRAZERO item before each optional copy and nesting brackets for all
- but one of the optional copies. */
-
- else
- {
- length += (min - 1) * duplength;
- if (max > min) /* Need this test as max=-1 means no limit */
- length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
- - (2 + 2*LINK_SIZE);
- }
-
- /* Allow space for once brackets for "possessive quantifier" */
-
- if (ptr + 1 < patternEnd && ptr[1] == '+')
- {
- ptr++;
- length += 2 + 2*LINK_SIZE;
- }
- continue;
-
- /* Non-special character. It won't be space or # in extended mode, so it is
- always a genuine character. If we are in a \Q...\E sequence, check for the
- end; if not, we have a literal. */
-
- default:
- NORMAL_CHAR:
-
- length += 2; /* For a one-byte character */
- lastitemlength = 1; /* Default length of last item for repeats */
-
- /* In UTF-8 mode, check for additional bytes. */
-
- if (c > 127)
- {
- if (IS_LEADING_SURROGATE(c))
- {
- c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
- ++ptr;
- }
-
- {
- int i;
- for (i = 0; i < _pcre_utf8_table1_size; i++)
- if (c <= _pcre_utf8_table1[i]) break;
- length += i;
- lastitemlength += i;
}
- }
-
- continue;
+ else if (is_startline(codestart, 0, compile_block.backref_map))
+ re->options |= PCRE_STARTLINE;
}
- }
-
-length += 2 + LINK_SIZE; /* For final KET and END */
-
-if (length > MAX_PATTERN_SIZE)
- {
- errorcode = ERR16;
- goto PCRE_ERROR_RETURN;
- }
-
-/* Compute the size of data block needed and get it. */
-
-size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
-re = reinterpret_cast<real_pcre*>(new char[size]);
-
-if (re == NULL)
- {
- errorcode = ERR13;
- goto PCRE_ERROR_RETURN;
- }
-
-/* Put in the magic number, and save the sizes, options, and character table
-pointer. NULL is used for the default character tables. The nullpad field is at
-the end; it's there to help in the case when a regex compiled on a system with
-4-byte pointers is run on another with 8-byte pointers. */
-
-re->size = (pcre_uint32)size;
-re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0);
-
-/* The starting points of the name/number translation table and of the code are
-passed around in the compile data block. */
-
-codestart = (const uschar *)(re + 1);
-compile_block.start_code = codestart;
-compile_block.start_pattern = (const pcre_uchar *)pattern;
-compile_block.req_varyopt = 0;
-
-/* Set up a starting, non-extracting bracket, then compile the expression. On
-error, errorcode will be set non-zero, so we don't need to look at the result
-of the function here. */
-
-ptr = (const pcre_uchar *)pattern;
-code = (uschar *)codestart;
-*code = OP_BRA;
-bracount = 0;
-(void)compile_regex(re->options, &bracount, &code, &ptr,
- patternEnd,
- &errorcode, 0, &firstbyte, &reqbyte, &compile_block);
-re->top_bracket = bracount;
-re->top_backref = compile_block.top_backref;
-
-/* If not reached end of pattern on success, there's an excess bracket. */
-
-if (errorcode == 0 && ptr < patternEnd) errorcode = ERR10;
-
-/* Fill in the terminating state and check for disastrous overflow, but
-if debugging, leave the test till after things are printed out. */
-
-*code++ = OP_END;
-
-#ifndef DEBUG
-if (code - codestart > length) errorcode = ERR7;
-#endif
-
-/* Give an error if there's back reference to a non-existent capturing
-subpattern. */
-
-if (re->top_backref > re->top_bracket) errorcode = ERR15;
-
-/* Failed to compile, or error while post-processing */
-
-if (errorcode != ERR0)
- {
- delete [] reinterpret_cast<char*>(re);
- PCRE_ERROR_RETURN:
- *errorptr = error_text(errorcode);
- return NULL;
- }
-
-/* If the anchored option was not passed, set the flag if we can determine that
-the pattern is anchored by virtue of ^ characters or \A or anything else (such
-as starting with .* when DOTALL is set).
-
-Otherwise, if we know what the first character has to be, save it, because that
-speeds up unanchored matches no end. If not, see if we can set the
-PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
-start with ^. and also when all branches start with .* for non-DOTALL matches.
-*/
-
- {
- if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
- re->options |= PCRE_ANCHORED;
- else
- {
- if (firstbyte < 0)
- firstbyte = find_firstassertedchar(codestart, re->options, false);
- if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
- {
- int ch = firstbyte & 255;
- if (ch < 127)
- { /* Strange indentation to aid in merging. */
- re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
- compile_block.fcc[ch] == ch)? ch : firstbyte;
- re->options |= PCRE_FIRSTSET;
- }
- }
- else if (is_startline(codestart, 0, compile_block.backref_map))
- re->options |= PCRE_STARTLINE;
+
+ /* For an anchored pattern, we use the "required byte" only if it follows a
+ variable length item in the regex. Remove the caseless flag for non-caseable
+ bytes. */
+
+ if (reqbyte >= 0 && (!(re->options & PCRE_ANCHORED) || (reqbyte & REQ_VARY))) {
+ int ch = reqbyte & 255;
+ if (ch < 127) {
+ re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
+ compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
+ re->options |= PCRE_REQCHSET;
+ }
}
- }
-
-/* For an anchored pattern, we use the "required byte" only if it follows a
-variable length item in the regex. Remove the caseless flag for non-caseable
-bytes. */
-
-if (reqbyte >= 0 &&
- ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
- {
- int ch = reqbyte & 255;
- if (ch < 127)
- { /* Strange indentation to aid in merging. */
- re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
- compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
- re->options |= PCRE_REQCHSET;
- }
- }
-
-/* Print out the compiled data if debugging is enabled. This is never the
-case when building a production library. */
-
+
#ifdef DEBUG
-
-printf("Length = %d top_bracket = %d top_backref = %d\n",
- length, re->top_bracket, re->top_backref);
-
-if (re->options != 0)
- {
- printf("%s%s%s\n",
- ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
- ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
- ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "");
- }
-
-if ((re->options & PCRE_FIRSTSET) != 0)
- {
- int ch = re->first_byte & 255;
- const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
- "" : " (caseless)";
- if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
- else printf("First char = \\x%02x%s\n", ch, caseless);
- }
-
-if ((re->options & PCRE_REQCHSET) != 0)
- {
- int ch = re->req_byte & 255;
- const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
- "" : " (caseless)";
- if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
- else printf("Req char = \\x%02x%s\n", ch, caseless);
- }
-
-pcre_printint(re, stdout);
-
-/* This check is done here in the debugging case so that the code that
-was compiled can be seen. */
-
-if (code - codestart > length)
- {
- (pcre_free)(re);
- *errorptr = error_text(ERR7);
- return NULL;
- }
-
+ printCompiledRegExp(re);
+
+ /* This check is done here in the debugging case so that the code that
+ was compiled can be seen. */
+ if (code - codestart > length) {
+ (pcre_free)(re);
+ *errorptr = error_text(ERR7);
+ return NULL;
+ }
+
#endif
-
-if (numSubpatterns)
- *numSubpatterns = re->top_bracket;
-return (pcre *)re;
+
+ if (numSubpatterns)
+ *numSubpatterns = re->top_bracket;
+ return (pcre *)re;
}
void jsRegExpFree(JSRegExp* re)