1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains the external function jsRegExpExecute(), along with
41 supporting internal functions that are not used by other modules. */
45 #include "pcre_internal.h"
47 #include <wtf/ASCIICType.h>
48 #include <wtf/FastMalloc.h>
52 /*************************************************
53 * Code parameters and static tables *
54 *************************************************/
56 /* Maximum number of items on the nested bracket stacks at compile time. This
57 applies to the nesting of all kinds of parentheses. It does not limit
58 un-nested, non-capturing parentheses. This number can be made bigger if
59 necessary - it is used to dimension one int and one unsigned char vector at
62 #define BRASTACK_SIZE 200
64 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
65 are simple data values; negative values are for special things like \d and so
66 on. Zero means further processing is needed (for things like \x), or the escape
69 static const short escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
78 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
82 /* Error code numbers. They are given names so that they can more easily be
86 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
87 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
90 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
91 the definition is next to the definition of the opcodes in pcre_internal.h. */
93 static const uschar OP_lengths[] = { OP_LENGTHS };
95 /* The texts of compile-time error messages. These are "char *" because they
96 are passed to the outside world. */
98 static const char* error_text(ErrorCode code)
100 static const char error_texts[] =
102 "\\ at end of pattern\0"
103 "\\c at end of pattern\0"
104 "character value in \\x{...} sequence is too large\0"
105 "numbers out of order in {} quantifier\0"
107 "number too big in {} quantifier\0"
108 "missing terminating ] for character class\0"
109 "internal error: code overflow\0"
110 "range out of order in character class\0"
111 "nothing to repeat\0"
113 "unmatched parentheses\0"
114 "internal error: unexpected repeat\0"
115 "unrecognized character after (?\0"
116 "failed to get memory\0"
119 "reference to non-existent subpattern\0"
120 "regular expression too large\0"
121 "parentheses nested too deeply"
125 const char* text = error_texts;
131 /* Definition to allow mutual recursion */
134 compile_regex(int, int *, uschar **, const pcre_uchar **, const pcre_uchar *, ErrorCode*, int,
135 int *, int *, compile_data *);
137 /*************************************************
139 *************************************************/
141 /* This function is called when a \ has been encountered. It either returns a
142 positive value for a simple escape such as \n, or a negative value which
143 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
144 a positive value greater than 255 may be returned. On entry, ptr is pointing at
145 the \. On exit, it is on the final character of the escape sequence.
148 ptrptr points to the pattern position pointer
149 errorcodeptr points to the errorcode variable
150 bracount number of previous extracting brackets
151 options the options bits
152 isclass true if inside a character class
154 Returns: zero or positive => a data character
155 negative => a special escape sequence
156 on error, errorptr is set
160 check_escape(const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int bracount,
163 const pcre_uchar *ptr = *ptrptr + 1;
166 /* If backslash is at the end of the pattern, it's an error. */
167 if (ptr == patternEnd) {
168 *errorcodeptr = ERR1;
175 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
176 a table. A non-zero result is something that can be returned immediately.
177 Otherwise further processing may be required. */
179 if (c < '0' || c > 'z') {} /* Not alphameric */
180 else if ((i = escapes[c - '0']) != 0) c = i;
182 /* Escapes that need further processing, or are illegal. */
188 /* Escape sequences starting with a non-zero digit are backreferences,
189 unless there are insufficient brackets, in which case they are octal
190 escape sequences. Those sequences end on the first non-octal character
191 or when we overflow 0-255, whichever comes first. */
193 case '1': case '2': case '3': case '4': case '5':
194 case '6': case '7': case '8': case '9':
198 const pcre_uchar *oldptr = ptr;
200 while (ptr + 1 < patternEnd && isASCIIDigit(ptr[1]) && c <= bracount)
201 c = c * 10 + *(++ptr) - '0';
207 ptr = oldptr; /* Put the pointer back and fall through */
210 /* Handle an octal number following \. If the first digit is 8 or 9,
211 this is not octal. */
213 if ((c = *ptr) >= '8')
216 /* \0 always starts an octal number, but we may drop through to here with a
217 larger first octal digit. */
221 for (i = 1; i <= 2; ++i)
223 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
225 int cc = c * 8 + ptr[i] - '0';
235 for (i = 1; i <= 2; ++i)
237 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i]))
244 if (cc >= 'a') cc -= 32; /* Convert to upper case */
245 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
252 for (i = 1; i <= 4; ++i)
254 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i]))
261 if (cc >= 'a') cc -= 32; /* Convert to upper case */
262 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
267 /* Other special escapes not starting with a digit are straightforward */
270 if (++ptr == patternEnd)
272 *errorcodeptr = ERR2;
277 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
278 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
280 if (c >= 'a' && c <= 'z') c -= 32;
292 /*************************************************
293 * Check for counted repeat *
294 *************************************************/
296 /* This function is called when a '{' is encountered in a place where it might
297 start a quantifier. It looks ahead to see if it really is a quantifier or not.
298 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
299 where the ddds are digits.
302 p pointer to the first char after '{'
304 Returns: true or false
308 is_counted_repeat(const pcre_uchar *p, const pcre_uchar *patternEnd)
310 if (p >= patternEnd || !isASCIIDigit(*p))
313 while (p < patternEnd && isASCIIDigit(*p))
315 if (p < patternEnd && *p == '}')
318 if (p >= patternEnd || *p++ != ',')
320 if (p < patternEnd && *p == '}')
323 if (p >= patternEnd || !isASCIIDigit(*p))
326 while (p < patternEnd && isASCIIDigit(*p))
329 return (p < patternEnd && *p == '}');
334 /*************************************************
335 * Read repeat counts *
336 *************************************************/
338 /* Read an item of the form {n,m} and return the values. This is called only
339 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
340 so the syntax is guaranteed to be correct, but we need to check the values.
343 p pointer to first char after '{'
344 minp pointer to int for min
345 maxp pointer to int for max
346 returned as -1 if no max
347 errorcodeptr points to error code variable
349 Returns: pointer to '}' on success;
350 current ptr on error, with errorcodeptr set non-zero
353 static const pcre_uchar *
354 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, ErrorCode* errorcodeptr)
359 /* Read the minimum value and do a paranoid check: a negative value indicates
360 an integer overflow. */
362 while (isASCIIDigit(*p)) min = min * 10 + *p++ - '0';
363 if (min < 0 || min > 65535)
365 *errorcodeptr = ERR5;
369 /* Read the maximum value if there is one, and again do a paranoid on its size.
370 Also, max must not be less than min. */
372 if (*p == '}') max = min; else
377 while (isASCIIDigit(*p)) max = max * 10 + *p++ - '0';
378 if (max < 0 || max > 65535)
380 *errorcodeptr = ERR5;
385 *errorcodeptr = ERR4;
391 /* Fill in the required variables, and pass back the pointer to the terminating
401 /*************************************************
402 * Find first significant op code *
403 *************************************************/
405 /* This is called by several functions that scan a compiled expression looking
406 for a fixed first character, or an anchoring op code etc. It skips over things
407 that do not influence this. For some calls, a change of option is important.
408 For some calls, it makes sense to skip negative forward and all backward
409 assertions, and also the \b assertion; for others it does not.
412 code pointer to the start of the group
413 skipassert true if certain assertions are to be skipped
415 Returns: pointer to the first significant opcode
419 first_significant_code(const uschar *code, BOOL skipassert)
426 if (!skipassert) return code;
427 do code += GET(code, 1); while (*code == OP_ALT);
428 code += OP_lengths[*code];
431 case OP_WORD_BOUNDARY:
432 case OP_NOT_WORD_BOUNDARY:
433 if (!skipassert) return code;
437 code += OP_lengths[*code];
444 /* Control never reaches here */
450 /*************************************************
451 * Find the fixed length of a pattern *
452 *************************************************/
454 /* Scan a pattern and compute the fixed length of subject that will match it,
455 if the length is fixed. This is needed for dealing with backward assertions.
456 In UTF8 mode, the result is in characters rather than bytes.
459 code points to the start of the pattern (the bracket)
460 options the compiling options
462 Returns: the fixed length, or -1 if there is no fixed length,
463 or -2 if \C was encountered
467 find_fixedlength(uschar *code, int options)
471 register int branchlength = 0;
472 register uschar *cc = code + 1 + LINK_SIZE;
474 /* Scan along the opcodes for this branch. If we get to the end of the
475 branch, check the length against that of the other branches. */
480 register int op = *cc;
481 if (op >= OP_BRA) op = OP_BRA;
487 d = find_fixedlength(cc, options);
490 do cc += GET(cc, 1); while (*cc == OP_ALT);
494 /* Reached end of a branch; if it's a ket it is the end of a nested
495 call. If it's ALT it is an alternation in a nested call. If it is
496 END it's the end of the outer call. All can be handled by the same code. */
503 if (length < 0) length = branchlength;
504 else if (length != branchlength) return -1;
505 if (*cc != OP_ALT) return length;
510 /* Skip over assertive subpatterns */
514 do cc += GET(cc, 1); while (*cc == OP_ALT);
517 /* Skip over things that don't match chars */
522 case OP_NOT_WORD_BOUNDARY:
523 case OP_WORD_BOUNDARY:
524 cc += OP_lengths[*cc];
527 /* Handle literal characters */
534 while ((*cc & 0xc0) == 0x80) cc++;
538 case OP_ASCII_LETTER_NC:
543 /* Handle exact repetitions. The count is already in characters, but we
544 need to skip over a multibyte character in UTF8 mode. */
547 branchlength += GET2(cc,1);
549 while((*cc & 0x80) == 0x80) cc++;
553 branchlength += GET2(cc,1);
557 /* Handle single-char matchers */
561 case OP_NOT_WHITESPACE:
563 case OP_NOT_WORDCHAR:
570 /* Check a class for variable quantification */
573 cc += GET(cc, 1) - 33;
590 if (GET2(cc,1) != GET2(cc,3)) return -1;
591 branchlength += GET2(cc,1);
600 /* Anything else is variable length */
606 /* Control never gets here */
612 /*************************************************
613 * Scan compiled branch for non-emptiness *
614 *************************************************/
616 /* This function scans through a branch of a compiled pattern to see whether it
617 can match the empty string or not. It is called only from could_be_empty()
618 below. Note that first_significant_code() skips over assertions. If we hit an
619 unclosed bracket, we return "empty" - this means we've struck an inner bracket
620 whose current branch will already have been scanned.
623 code points to start of search
624 endcode points to where to stop
626 Returns: true if what is matched could be empty
630 could_be_empty_branch(const uschar *code, const uschar *endcode)
633 for (code = first_significant_code(code + 1 + LINK_SIZE, true);
635 code = first_significant_code(code + OP_lengths[c], true))
644 if (GET(code, 1) == 0) return true; /* Hit unclosed bracket */
646 /* Scan a closed bracket */
648 empty_branch = false;
651 if (!empty_branch && could_be_empty_branch(code, endcode))
653 code += GET(code, 1);
655 while (*code == OP_ALT);
656 if (!empty_branch) return false; /* All branches are non-empty */
657 code += 1 + LINK_SIZE;
663 /* Check for quantifiers after a class */
666 ccode = code + GET(code, 1);
667 goto CHECK_CLASS_REPEAT;
677 case OP_CRSTAR: /* These could be empty; continue */
683 default: /* Non-repeat => class must match */
684 case OP_CRPLUS: /* These repeats aren't empty */
690 if (GET2(ccode, 1) > 0) return false; /* Minimum > 0 */
695 /* Opcodes that must match a character */
699 case OP_NOT_WHITESPACE:
701 case OP_NOT_WORDCHAR:
707 case OP_ASCII_LETTER_NC:
728 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
729 followed by a multibyte character */
737 while ((code[2] & 0xc0) == 0x80) code++;
747 /*************************************************
748 * Complete a callout item *
749 *************************************************/
751 /* A callout item contains the length of the next item in the pattern, which
752 we can't fill in till after we have reached the relevant point. This is used
753 for both automatic and manual callouts.
756 previous_callout points to previous callout item
757 ptr current pattern pointer
758 cd pointers to tables etc
764 complete_callout(uschar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
766 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
767 PUT(previous_callout, 2 + LINK_SIZE, length);
772 /*************************************************
773 * Get othercase range *
774 *************************************************/
776 /* This function is passed the start and end of a class range, in UTF-8 mode
777 with UCP support. It searches up the characters, looking for internal ranges of
778 characters in the "other" case. Each call returns the next one, updating the
782 cptr points to starting character value; updated
784 ocptr where to put start of othercase range
785 odptr where to put end of othercase range
787 Yield: true when range returned; false when no more
791 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
793 int c, othercase = 0, next;
795 for (c = *cptr; c <= d; c++)
796 { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
798 if (c > d) return false;
801 next = othercase + 1;
803 for (++c; c <= d; c++)
805 if (_pcre_ucp_othercase(c) != next) break;
816 /*************************************************
817 * Compile one branch *
818 *************************************************/
820 /* Scan the pattern, compiling it into the code vector. If the options are
821 changed during the branch, the pointer is used to change the external options
825 options the option bits
826 brackets points to number of extracting brackets used
827 codeptr points to the pointer to the current code point
828 ptrptr points to the current pattern pointer
829 errorcodeptr points to error code variable
830 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
831 reqbyteptr set to the last literal character required, else < 0
832 cd contains pointers to tables etc.
834 Returns: true on success
835 false, with *errorcodeptr set non-zero on error
839 compile_branch(int options, int *brackets, uschar **codeptr,
840 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
841 int *reqbyteptr, compile_data *cd)
843 int repeat_type, op_type;
844 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
846 int firstbyte, reqbyte;
847 int zeroreqbyte, zerofirstbyte;
848 int req_caseopt, reqvary, tempreqvary;
849 int after_manual_callout = 0;
851 register uschar *code = *codeptr;
853 BOOL groupsetfirstbyte = false;
854 const pcre_uchar *ptr = *ptrptr;
855 const pcre_uchar *tempptr;
856 uschar *previous = NULL;
857 uschar *previous_callout = NULL;
858 uschar classbits[32];
861 uschar *class_utf8data;
864 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
865 matching encountered yet". It gets changed to REQ_NONE if we hit something that
866 matches a non-fixed char first char; reqbyte just remains unset if we never
869 When we hit a repeat whose minimum is zero, we may have to adjust these values
870 to take the zero repeat into account. This is implemented by setting them to
871 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
872 item types that can be repeated set these backoff variables appropriately. */
874 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
876 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
877 according to the current setting of the caseless flag. REQ_CASELESS is a bit
878 value > 255. It is added into the firstbyte or reqbyte variables to record the
879 case status of the value. This is used only for ASCII characters. */
881 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
883 /* Switch on next character until the end of the branch */
888 BOOL should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
898 /* Next byte in the pattern */
900 c = ptr < patternEnd ? *ptr : 0;
902 /* Fill in length of a previous callout, except when the next thing is
905 is_quantifier = c == '*' || c == '+' || c == '?' ||
906 (c == '{' && is_counted_repeat(ptr+1, patternEnd));
908 if (!is_quantifier && previous_callout != NULL &&
909 after_manual_callout-- <= 0)
911 complete_callout(previous_callout, ptr, cd);
912 previous_callout = NULL;
917 /* The branch terminates at end of string, |, or ). */
920 if (ptr < patternEnd)
922 // End of string; fall through
925 *firstbyteptr = firstbyte;
926 *reqbyteptr = reqbyte;
931 /* Handle single-character metacharacters. In multiline mode, ^ disables
932 the setting of any following char as a first character. */
935 if ((options & PCRE_MULTILINE) != 0)
937 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
948 /* There can never be a first char if '.' is first, whatever happens about
949 repeats. The value of reqbyte doesn't change either. */
952 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
953 zerofirstbyte = firstbyte;
954 zeroreqbyte = reqbyte;
959 /* Character classes. If the included characters are all < 256, we build a
960 32-byte bitmap of the permitted characters, except in the special case
961 where there is only one such character. For negated classes, we build the
962 map as usual, then invert it at the end. However, we use a different opcode
963 so that data characters > 255 can be handled correctly.
965 If the class contains characters outside the 0-255 range, a different
966 opcode is compiled. It may optionally have a bit map for characters < 256,
967 but those above are are explicitly listed afterwards. A flag byte tells
968 whether the bitmap is present, and whether this is a negated class or not.
973 should_flip_negation = false;
975 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
976 they are encountered at the top level, so we'll do that too. */
978 /* If the first character is '^', set the negation flag and skip it. */
987 negate_class = false;
990 /* Keep a count of chars with values < 256 so that we can optimize the case
991 of just a single character (as long as it's < 256). For higher valued UTF-8
992 characters, we don't yet do any optimization. */
997 class_utf8 = false; /* No chars >= 256 */
998 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1000 /* Initialize the 32-char bit map to all zeros. We have to build the
1001 map in a temporary bit of store, in case the class contains only 1
1002 character (< 256), because in that case the compiled code doesn't use the
1005 memset(classbits, 0, 32 * sizeof(uschar));
1007 /* Process characters until ] is reached. The first pass
1008 through the regex checked the overall syntax, so we don't need to be very
1009 strict here. At the start of the loop, c contains the first byte of the
1012 while ((c = *(++ptr)) != ']')
1015 { /* Braces are required because the */
1016 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1019 /* Backslash may introduce a single character, or it may introduce one
1020 of the specials, which just set a flag. Escaped items are checked for
1021 validity in the pre-compiling pass. The sequence \b is a special case.
1022 Inside a class (and only there) it is treated as backspace. Elsewhere
1023 it marks a word boundary. Other escapes have preset maps ready to
1024 or into the one we are building. We assume they have more than one
1025 character in them, so set class_charcount bigger than one. */
1029 c = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, true);
1031 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1035 register const uschar *cbits = cd->cbits;
1036 class_charcount += 2; /* Greater than 1 is what matters */
1040 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1044 should_flip_negation = true;
1045 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1049 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1053 should_flip_negation = true;
1054 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1058 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1062 should_flip_negation = true;
1063 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1066 /* Unrecognized escapes are faulted if PCRE is running in its
1067 strict mode. By default, for compatibility with Perl, they are
1068 treated as literals. */
1071 c = *ptr; /* The final character */
1072 class_charcount -= 2; /* Undo the default count from above */
1076 /* Fall through if we have a single character (c >= 0). This may be
1077 > 256 in UTF-8 mode. */
1079 } /* End of backslash handling */
1081 /* A single character may be followed by '-' to form a range. However,
1082 Perl does not permit ']' to be the end of the range. A '-' character
1083 here is treated as a literal. */
1085 if (ptr[1] == '-' && ptr[2] != ']')
1090 GETCHARLEN(d, ptr, ptr);
1092 /* The second part of a range can be a single-character escape, but
1093 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1094 in such circumstances. */
1098 const pcre_uchar *oldptr = ptr;
1099 d = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, true);
1101 /* \b is backslash; \X is literal X; any other special means the '-'
1106 if (d == -ESC_b) d = '\b';
1110 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1115 /* The check that the two values are in the correct order happens in
1116 the pre-pass. Optimize one-character ranges */
1118 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
1120 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
1121 matching, we have to use an XCLASS with extra data items. Caseless
1122 matching for characters > 127 is available only if UCP support is
1125 if ((d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
1129 /* With UCP support, we can find the other case equivalents of
1130 the relevant characters. There may be several ranges. Optimize how
1131 they fit with the basic range. */
1133 if ((options & PCRE_CASELESS) != 0)
1138 while (get_othercase_range(&cc, origd, &occ, &ocd))
1140 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
1142 if (occ < c && ocd >= c - 1) /* Extend the basic range */
1143 { /* if there is overlap, */
1144 c = occ; /* noting that if occ < c */
1145 continue; /* we can't have ocd > d */
1146 } /* because a subrange is */
1147 if (ocd > d && occ <= d + 1) /* always shorter than */
1148 { /* the basic range. */
1155 *class_utf8data++ = XCL_SINGLE;
1159 *class_utf8data++ = XCL_RANGE;
1160 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
1162 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
1166 /* Now record the original range, possibly modified for UCP caseless
1167 overlapping ranges. */
1169 *class_utf8data++ = XCL_RANGE;
1170 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
1171 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
1173 /* With UCP support, we are done. Without UCP support, there is no
1174 caseless matching for UTF-8 characters > 127; we can use the bit map
1175 for the smaller ones. */
1177 continue; /* With next character in the class */
1180 /* We use the bit map for all cases when not in UTF-8 mode; else
1181 ranges that lie entirely within 0-127 when there is UCP support; else
1182 for partial ranges without UCP support. */
1186 classbits[c/8] |= (1 << (c&7));
1187 if ((options & PCRE_CASELESS) != 0)
1189 int uc = cd->fcc[c]; /* flip case */
1190 classbits[uc/8] |= (1 << (uc&7));
1192 class_charcount++; /* in case a one-char range */
1196 continue; /* Go get the next char in the class */
1199 /* Handle a lone single character - we can get here for a normal
1200 non-escape char, or after \ that introduces a single character or for an
1201 apparent range that isn't. */
1203 LONE_SINGLE_CHARACTER:
1205 /* Handle a character that cannot go in the bit map */
1207 if ((c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
1210 *class_utf8data++ = XCL_SINGLE;
1211 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
1213 if ((options & PCRE_CASELESS) != 0)
1216 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
1218 *class_utf8data++ = XCL_SINGLE;
1219 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
1226 /* Handle a single-byte character */
1228 classbits[c/8] |= (1 << (c&7));
1229 if ((options & PCRE_CASELESS) != 0)
1231 c = cd->fcc[c]; /* flip case */
1232 classbits[c/8] |= (1 << (c&7));
1239 /* If class_charcount is 1, we saw precisely one character whose value is
1240 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
1241 can optimize the negative case only if there were no characters >= 128
1242 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
1243 single-bytes only. This is an historical hangover. Maybe one day we can
1244 tidy these opcodes to handle multi-byte characters.
1246 The optimization throws away the bit map. We turn the item into a
1247 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
1248 that OP_NOT does not support multibyte characters. In the positive case, it
1249 can cause firstbyte to be set. Otherwise, there can be no first char if
1250 this item is first, whatever repeat count may follow. In the case of
1251 reqbyte, save the previous value for reinstating. */
1253 if (class_charcount == 1 &&
1254 (!class_utf8 && (!negate_class || class_lastchar < 128)))
1256 zeroreqbyte = reqbyte;
1258 /* The OP_NOT opcode works on one-byte characters only. */
1262 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1263 zerofirstbyte = firstbyte;
1265 *code++ = class_lastchar;
1269 /* For a single, positive character, get the value into c, and
1270 then we can handle this with the normal one-character code. */
1274 } /* End of 1-char optimization */
1276 /* The general case - not the one-char optimization. If this is the first
1277 thing in the branch, there can be no first char setting, whatever the
1278 repeat count. Any reqbyte setting must remain unchanged after any kind of
1281 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1282 zerofirstbyte = firstbyte;
1283 zeroreqbyte = reqbyte;
1285 /* If there are characters with values > 255, we have to compile an
1286 extended class, with its own opcode. If there are no characters < 256,
1287 we can omit the bitmap. */
1289 if (class_utf8 && !should_flip_negation)
1291 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
1292 *code++ = OP_XCLASS;
1294 *code = negate_class? XCL_NOT : 0;
1296 /* If the map is required, install it, and move on to the end of
1299 if (class_charcount > 0)
1302 memcpy(code, classbits, 32);
1303 code = class_utf8data;
1306 /* If the map is not required, slide down the extra data. */
1310 int len = class_utf8data - (code + 33);
1311 memmove(code + 1, code + 33, len);
1315 /* Now fill in the complete length of the item */
1317 PUT(previous, 1, code - previous);
1318 break; /* End of class handling */
1321 /* If there are no characters > 255, negate the 32-byte map if necessary,
1322 and copy it into the code vector. If this is the first thing in the branch,
1323 there can be no first char setting, whatever the repeat count. Any reqbyte
1324 setting must remain unchanged after any kind of repeat. */
1326 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1329 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
1333 memcpy(code, classbits, 32);
1338 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1339 has been tested above. */
1342 if (!is_quantifier) goto NORMAL_CHAR;
1343 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
1344 if (*errorcodeptr != 0) goto FAILED;
1362 if (previous == NULL)
1364 *errorcodeptr = ERR9;
1368 if (repeat_min == 0)
1370 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
1371 reqbyte = zeroreqbyte; /* Ditto */
1374 /* Remember whether this is a variable length repeat */
1376 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
1378 op_type = 0; /* Default single-char op codes */
1380 /* Save start of previous item, in case we have to move it up to make space
1381 for an inserted OP_ONCE for the additional '+' extension. */
1383 tempcode = previous;
1385 /* If the next character is '+', we have a possessive quantifier. This
1386 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1387 If the next character is '?' this is a minimizing repeat, by default,
1388 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1389 repeat type to the non-default. */
1391 if (ptr + 1 < patternEnd && ptr[1] == '?')
1396 else repeat_type = 0;
1398 /* If previous was a character match, abolish the item and generate a
1399 repeat item instead. If a char item has a minumum of more than one, ensure
1400 that it is set in reqbyte - it might not be if a sequence such as x{3} is
1401 the first thing in a branch because the x will have gone into firstbyte
1404 if (*previous == OP_CHAR || *previous == OP_CHARNC)
1406 /* Deal with UTF-8 characters that take up more than one byte. It's
1407 easier to write this out separately than try to macrify it. Use c to
1408 hold the length of the character in bytes, plus 0x80 to flag that it's a
1409 length rather than a small character. */
1411 if ((code[-1] & 0x80) != 0)
1413 uschar *lastchar = code - 1;
1414 while((*lastchar & 0xc0) == 0x80) lastchar--;
1415 c = code - lastchar; /* Length of UTF-8 character */
1416 memcpy(utf8_char, lastchar, c); /* Save the char */
1417 c |= 0x80; /* Flag c as a length */
1422 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
1425 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1428 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_NC)
1431 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
1432 goto OUTPUT_SINGLE_REPEAT;
1435 /* If previous was a single negated character ([^a] or similar), we use
1436 one of the special opcodes, replacing it. The code is shared with single-
1437 character repeats by setting opt_type to add a suitable offset into
1438 repeat_type. OP_NOT is currently used only for single-byte chars. */
1440 else if (*previous == OP_NOT)
1442 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1444 goto OUTPUT_SINGLE_REPEAT;
1447 /* If previous was a character type match (\d or similar), abolish it and
1448 create a suitable repeat item. The code is shared with single-character
1449 repeats by setting op_type to add a suitable offset into repeat_type. */
1451 else if (*previous <= OP_ANY)
1454 int prop_type, prop_value;
1455 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1458 OUTPUT_SINGLE_REPEAT:
1459 prop_type = prop_value = -1;
1462 code = previous; /* Usually overwrite previous item */
1464 /* If the maximum is zero then the minimum must also be zero; Perl allows
1465 this case, so we do too - by simply omitting the item altogether. */
1467 if (repeat_max == 0) goto END_REPEAT;
1469 /* Combine the op_type with the repeat_type */
1471 repeat_type += op_type;
1473 /* A minimum of zero is handled either as the special case * or ?, or as
1474 an UPTO, with the maximum given. */
1476 if (repeat_min == 0)
1478 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1479 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1482 *code++ = OP_UPTO + repeat_type;
1483 PUT2INC(code, 0, repeat_max);
1487 /* A repeat minimum of 1 is optimized into some special cases. If the
1488 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1489 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1490 one less than the maximum. */
1492 else if (repeat_min == 1)
1494 if (repeat_max == -1)
1495 *code++ = OP_PLUS + repeat_type;
1498 code = oldcode; /* leave previous item in place */
1499 if (repeat_max == 1) goto END_REPEAT;
1500 *code++ = OP_UPTO + repeat_type;
1501 PUT2INC(code, 0, repeat_max - 1);
1505 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1506 handled as an EXACT followed by an UPTO. */
1510 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1511 PUT2INC(code, 0, repeat_min);
1513 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1514 we have to insert the character for the previous code. For a repeated
1515 Unicode property match, there are two extra bytes that define the
1516 required property. In UTF-8 mode, long characters have their length in
1517 c, with the 0x80 bit as a flag. */
1523 memcpy(code, utf8_char, c & 7);
1531 *code++ = prop_type;
1532 *code++ = prop_value;
1535 *code++ = OP_STAR + repeat_type;
1538 /* Else insert an UPTO if the max is greater than the min, again
1539 preceded by the character, for the previously inserted code. */
1541 else if (repeat_max != repeat_min)
1545 memcpy(code, utf8_char, c & 7);
1552 *code++ = prop_type;
1553 *code++ = prop_value;
1555 repeat_max -= repeat_min;
1556 *code++ = OP_UPTO + repeat_type;
1557 PUT2INC(code, 0, repeat_max);
1561 /* The character or character type itself comes last in all cases. */
1565 memcpy(code, utf8_char, c & 7);
1571 /* For a repeated Unicode property match, there are two extra bytes that
1572 define the required property. */
1576 *code++ = prop_type;
1577 *code++ = prop_value;
1581 /* If previous was a character class or a back reference, we put the repeat
1582 stuff after it, but just skip the item if the repeat was {0,0}. */
1584 else if (*previous == OP_CLASS ||
1585 *previous == OP_NCLASS ||
1586 *previous == OP_XCLASS ||
1587 *previous == OP_REF)
1589 if (repeat_max == 0)
1595 if (repeat_min == 0 && repeat_max == -1)
1596 *code++ = OP_CRSTAR + repeat_type;
1597 else if (repeat_min == 1 && repeat_max == -1)
1598 *code++ = OP_CRPLUS + repeat_type;
1599 else if (repeat_min == 0 && repeat_max == 1)
1600 *code++ = OP_CRQUERY + repeat_type;
1603 *code++ = OP_CRRANGE + repeat_type;
1604 PUT2INC(code, 0, repeat_min);
1605 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1606 PUT2INC(code, 0, repeat_max);
1610 /* If previous was a bracket group, we may have to replicate it in certain
1613 else if (*previous >= OP_BRA || *previous == OP_ONCE)
1617 int len = code - previous;
1618 uschar *bralink = NULL;
1620 /* If the maximum repeat count is unlimited, find the end of the bracket
1621 by scanning through from the start, and compute the offset back to it
1622 from the current code pointer. There may be an OP_OPT setting following
1623 the final KET, so we can't find the end just by going back from the code
1626 if (repeat_max == -1)
1628 register uschar *ket = previous;
1629 do ket += GET(ket, 1); while (*ket != OP_KET);
1630 ketoffset = code - ket;
1633 /* The case of a zero minimum is special because of the need to stick
1634 OP_BRAZERO in front of it, and because the group appears once in the
1635 data, whereas in other cases it appears the minimum number of times. For
1636 this reason, it is simplest to treat this case separately, as otherwise
1637 the code gets far too messy. There are several special subcases when the
1640 if (repeat_min == 0)
1642 /* If the maximum is also zero, we just omit the group from the output
1645 if (repeat_max == 0)
1651 /* If the maximum is 1 or unlimited, we just have to stick in the
1652 BRAZERO and do no more at this point. However, we do need to adjust
1653 any OP_RECURSE calls inside the group that refer to the group itself or
1654 any internal group, because the offset is from the start of the whole
1655 regex. Temporarily terminate the pattern while doing this. */
1657 if (repeat_max <= 1)
1660 memmove(previous+1, previous, len);
1662 *previous++ = OP_BRAZERO + repeat_type;
1665 /* If the maximum is greater than 1 and limited, we have to replicate
1666 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1667 The first one has to be handled carefully because it's the original
1668 copy, which has to be moved up. The remainder can be handled by code
1669 that is common with the non-zero minimum case below. We have to
1670 adjust the value or repeat_max, since one less copy is required. Once
1671 again, we may have to adjust any OP_RECURSE calls inside the group. */
1677 memmove(previous + 2 + LINK_SIZE, previous, len);
1678 code += 2 + LINK_SIZE;
1679 *previous++ = OP_BRAZERO + repeat_type;
1680 *previous++ = OP_BRA;
1682 /* We chain together the bracket offset fields that have to be
1683 filled in later when the ends of the brackets are reached. */
1685 offset = (bralink == NULL)? 0 : previous - bralink;
1687 PUTINC(previous, 0, offset);
1693 /* If the minimum is greater than zero, replicate the group as many
1694 times as necessary, and adjust the maximum to the number of subsequent
1695 copies that we need. If we set a first char from the group, and didn't
1696 set a required char, copy the latter from the former. */
1702 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
1703 for (i = 1; i < repeat_min; i++)
1705 memcpy(code, previous, len);
1709 if (repeat_max > 0) repeat_max -= repeat_min;
1712 /* This code is common to both the zero and non-zero minimum cases. If
1713 the maximum is limited, it replicates the group in a nested fashion,
1714 remembering the bracket starts on a stack. In the case of a zero minimum,
1715 the first one was set up above. In all cases the repeat_max now specifies
1716 the number of additional copies needed. */
1718 if (repeat_max >= 0)
1720 for (i = repeat_max - 1; i >= 0; i--)
1722 *code++ = OP_BRAZERO + repeat_type;
1724 /* All but the final copy start a new nesting, maintaining the
1725 chain of brackets outstanding. */
1731 offset = (bralink == NULL)? 0 : code - bralink;
1733 PUTINC(code, 0, offset);
1736 memcpy(code, previous, len);
1740 /* Now chain through the pending brackets, and fill in their length
1741 fields (which are holding the chain links pro tem). */
1743 while (bralink != NULL)
1746 int offset = code - bralink + 1;
1747 uschar *bra = code - offset;
1748 oldlinkoffset = GET(bra, 1);
1749 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1751 PUTINC(code, 0, offset);
1752 PUT(bra, 1, offset);
1756 /* If the maximum is unlimited, set a repeater in the final copy. We
1757 can't just offset backwards from the current code point, because we
1758 don't know if there's been an options resetting after the ket. The
1759 correct offset was computed above. */
1761 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1764 /* Else there's some kind of shambles */
1768 *errorcodeptr = ERR11;
1772 /* In all case we no longer have a previous item. We also set the
1773 "follows varying string" flag for subsequently encountered reqbytes if
1774 it isn't already set and we have just passed a varying length item. */
1778 cd->req_varyopt |= reqvary;
1782 /* Start of nested bracket sub-expression, or comment or lookahead or
1783 lookbehind or option setting or condition. First deal with special things
1784 that can come after a bracket; all are introduced by ?, and the appearance
1785 of any of them means that this is not a referencing group. They were
1786 checked for validity in the first pass over the string, so we don't have to
1787 check for syntax errors here. */
1792 if (*(++ptr) == '?')
1796 case ':': /* Non-extracting bracket */
1801 case '=': /* Positive lookahead */
1802 bravalue = OP_ASSERT;
1806 case '!': /* Negative lookahead */
1807 bravalue = OP_ASSERT_NOT;
1811 /* Character after (? not specially recognized */
1813 default: /* Option setting */
1814 *errorcodeptr = ERR12;
1819 /* Else we have a referencing group; adjust the opcode. If the bracket
1820 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1821 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1825 if (++(*brackets) > EXTRACT_BASIC_MAX)
1827 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1828 code[1+LINK_SIZE] = OP_BRANUMBER;
1829 PUT2(code, 2+LINK_SIZE, *brackets);
1832 else bravalue = OP_BRA + *brackets;
1835 /* Process nested bracketed re. Assertions may not be repeated, but other
1836 kinds can be. We copy code into a non-register variable in order to be able
1837 to pass its address because some compilers complain otherwise. Pass in a
1838 new setting for the ims options if they have changed. */
1840 previous = (bravalue >= OP_ONCE)? code : NULL;
1843 tempreqvary = cd->req_varyopt; /* Save value before bracket */
1847 brackets, /* Extracting bracket count */
1848 &tempcode, /* Where to put code (updated) */
1849 &ptr, /* Input pointer (updated) */
1851 errorcodeptr, /* Where to put an error message */
1852 skipbytes, /* Skip over OP_BRANUMBER */
1853 &subfirstbyte, /* For possible first char */
1854 &subreqbyte, /* For possible last char */
1855 cd)) /* Tables block */
1858 /* At the end of compiling, code is still pointing to the start of the
1859 group, while tempcode has been updated to point past the end of the group
1860 and any option resetting that may follow it. The pattern pointer (ptr)
1861 is on the bracket. */
1863 /* Handle updating of the required and first characters. Update for normal
1864 brackets of all kinds, and conditions with two branches (see code above).
1865 If the bracket is followed by a quantifier with zero repeat, we have to
1866 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
1867 main loop so that they can be accessed for the back off. */
1869 zeroreqbyte = reqbyte;
1870 zerofirstbyte = firstbyte;
1871 groupsetfirstbyte = false;
1873 if (bravalue >= OP_BRA || bravalue == OP_ONCE)
1875 /* If we have not yet set a firstbyte in this branch, take it from the
1876 subpattern, remembering that it was set here so that a repeat of more
1877 than one can replicate it as reqbyte if necessary. If the subpattern has
1878 no firstbyte, set "none" for the whole branch. In both cases, a zero
1879 repeat forces firstbyte to "none". */
1881 if (firstbyte == REQ_UNSET)
1883 if (subfirstbyte >= 0)
1885 firstbyte = subfirstbyte;
1886 groupsetfirstbyte = true;
1888 else firstbyte = REQ_NONE;
1889 zerofirstbyte = REQ_NONE;
1892 /* If firstbyte was previously set, convert the subpattern's firstbyte
1893 into reqbyte if there wasn't one, using the vary flag that was in
1894 existence beforehand. */
1896 else if (subfirstbyte >= 0 && subreqbyte < 0)
1897 subreqbyte = subfirstbyte | tempreqvary;
1899 /* If the subpattern set a required byte (or set a first byte that isn't
1900 really the first byte - see above), set it. */
1902 if (subreqbyte >= 0) reqbyte = subreqbyte;
1905 /* For a forward assertion, we take the reqbyte, if set. This can be
1906 helpful if the pattern that follows the assertion doesn't set a different
1907 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
1908 for an assertion, however because it leads to incorrect effect for patterns
1909 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
1910 of a firstbyte. This is overcome by a scan at the end if there's no
1911 firstbyte, looking for an asserted first char. */
1913 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
1915 /* Now update the main code pointer to the end of the group. */
1919 /* Error if hit end of pattern */
1921 if (ptr >= patternEnd || *ptr != ')')
1923 *errorcodeptr = ERR14;
1928 /* Check \ for being a real metacharacter; if not, fall through and handle
1929 it as a data character at the start of a string. Escape items are checked
1930 for validity in the pre-compiling pass. */
1934 c = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, false);
1936 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1937 are arranged to be the negation of the corresponding OP_values. For the
1938 back references, the values are ESC_REF plus the reference number. Only
1939 back references and those types that consume a character may be repeated.
1940 We can test for values between ESC_b and ESC_w for the latter; this may
1941 have to change if any new ones are ever created. */
1945 /* For metasequences that actually match a character, we disable the
1946 setting of a first character if it hasn't already been set. */
1948 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1949 firstbyte = REQ_NONE;
1951 /* Set values to reset to if this is followed by a zero repeat. */
1953 zerofirstbyte = firstbyte;
1954 zeroreqbyte = reqbyte;
1956 /* Back references are handled specially */
1960 int number = -c - ESC_REF;
1963 PUT2INC(code, 0, number);
1966 /* For the rest, we can obtain the OP value by negating the escape
1971 previous = (-c > ESC_b && -c <= ESC_w)? code : NULL;
1979 /* Handle a literal character. It is guaranteed not to be whitespace or #
1980 when the extended flag is set. If we are in UTF-8 mode, it may be a
1981 multi-byte literal character. */
1993 if (options & PCRE_CASELESS && (c | 0x20) >= 'a' && (c | 0x20) <= 'z')
1995 *code++ = OP_ASCII_LETTER_NC;
2000 *code++ = OP_ASCII_CHAR;
2006 mclength = _pcre_ord2utf8(c, mcbuffer);
2008 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
2009 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
2012 /* Set the first and required bytes appropriately. If no previous first
2013 byte, set it from this character, but revert to none on a zero repeat.
2014 Otherwise, leave the firstbyte value alone, and don't change it on a zero
2017 if (firstbyte == REQ_UNSET)
2019 zerofirstbyte = REQ_NONE;
2020 zeroreqbyte = reqbyte;
2022 /* If the character is more than one byte long, we can set firstbyte
2023 only if it is not to be matched caselessly. */
2025 if (mclength == 1 || req_caseopt == 0)
2027 firstbyte = mcbuffer[0] | req_caseopt;
2028 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
2030 else firstbyte = reqbyte = REQ_NONE;
2033 /* firstbyte was previously set; we can set reqbyte only the length is
2034 1 or the matching is caseful. */
2038 zerofirstbyte = firstbyte;
2039 zeroreqbyte = reqbyte;
2040 if (mclength == 1 || req_caseopt == 0)
2041 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
2044 break; /* End of literal character handling */
2046 } /* end of big loop */
2048 /* Control never reaches here by falling through, only by a goto for all the
2049 error states. Pass back the position in the pattern so that it can be displayed
2050 to the user for diagnosing the error. */
2060 /*************************************************
2061 * Compile sequence of alternatives *
2062 *************************************************/
2064 /* On entry, ptr is pointing past the bracket character, but on return
2065 it points to the closing bracket, or vertical bar, or end of string.
2066 The code variable is pointing at the byte into which the BRA operator has been
2067 stored. If the ims options are changed at the start (for a (?ims: group) or
2068 during any branch, we need to insert an OP_OPT item at the start of every
2069 following branch to ensure they get set correctly at run time, and also pass
2070 the new options into every subsequent branch compile.
2073 options option bits, including any changes for this subpattern
2074 brackets -> int containing the number of extracting brackets used
2075 codeptr -> the address of the current code pointer
2076 ptrptr -> the address of the current pattern pointer
2077 errorcodeptr -> pointer to error code variable
2078 skipbytes skip this many bytes at start (for OP_BRANUMBER)
2079 firstbyteptr place to put the first required character, or a negative number
2080 reqbyteptr place to put the last required character, or a negative number
2081 cd points to the data block with tables pointers etc.
2083 Returns: true on success
2087 compile_regex(int options, int *brackets, uschar **codeptr,
2088 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int skipbytes,
2089 int *firstbyteptr, int *reqbyteptr, compile_data *cd)
2091 const pcre_uchar *ptr = *ptrptr;
2092 uschar *code = *codeptr;
2093 uschar *last_branch = code;
2094 uschar *start_bracket = code;
2095 int firstbyte, reqbyte;
2096 int branchfirstbyte, branchreqbyte;
2098 firstbyte = reqbyte = REQ_UNSET;
2100 /* Offset is set zero to mark that this bracket is still open */
2103 code += 1 + LINK_SIZE + skipbytes;
2105 /* Loop for each alternative branch */
2109 /* Now compile the branch */
2111 if (!compile_branch(options, brackets, &code, &ptr, patternEnd, errorcodeptr,
2112 &branchfirstbyte, &branchreqbyte, cd))
2118 /* If this is the first branch, the firstbyte and reqbyte values for the
2119 branch become the values for the regex. */
2121 if (*last_branch != OP_ALT)
2123 firstbyte = branchfirstbyte;
2124 reqbyte = branchreqbyte;
2127 /* If this is not the first branch, the first char and reqbyte have to
2128 match the values from all the previous branches, except that if the previous
2129 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
2130 REQ_VARY for the regex. */
2134 /* If we previously had a firstbyte, but it doesn't match the new branch,
2135 we have to abandon the firstbyte for the regex, but if there was previously
2136 no reqbyte, it takes on the value of the old firstbyte. */
2138 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
2140 if (reqbyte < 0) reqbyte = firstbyte;
2141 firstbyte = REQ_NONE;
2144 /* If we (now or from before) have no firstbyte, a firstbyte from the
2145 branch becomes a reqbyte if there isn't a branch reqbyte. */
2147 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
2148 branchreqbyte = branchfirstbyte;
2150 /* Now ensure that the reqbytes match */
2152 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
2154 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
2157 /* Reached end of expression, either ')' or end of pattern. Go back through
2158 the alternative branches and reverse the chain of offsets, with the field in
2159 the BRA item now becoming an offset to the first alternative. If there are
2160 no alternatives, it points to the end of the group. The length in the
2161 terminating ket is always the length of the whole bracketed item. If any of
2162 the ims options were changed inside the group, compile a resetting op-code
2163 following, except at the very end of the pattern. Return leaving the pointer
2164 at the terminating char. */
2166 if (ptr >= patternEnd || *ptr != '|')
2168 int length = code - last_branch;
2171 int prev_length = GET(last_branch, 1);
2172 PUT(last_branch, 1, length);
2173 length = prev_length;
2174 last_branch -= length;
2178 /* Fill in the ket */
2181 PUT(code, 1, code - start_bracket);
2182 code += 1 + LINK_SIZE;
2184 /* Set values to pass back */
2188 *firstbyteptr = firstbyte;
2189 *reqbyteptr = reqbyte;
2193 /* Another branch follows; insert an "or" node. Its length field points back
2194 to the previous branch while the bracket remains open. At the end the chain
2195 is reversed. It's done like this so that the start of the bracket has a
2196 zero offset until it is closed, making it possible to detect recursion. */
2199 PUT(code, 1, code - last_branch);
2201 code += 1 + LINK_SIZE;
2204 /* Control never reaches here */
2210 /*************************************************
2211 * Check for anchored expression *
2212 *************************************************/
2214 /* Try to find out if this is an anchored regular expression. Consider each
2215 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2216 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2217 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2218 counts, since OP_CIRC can match in the middle.
2220 We can also consider a regex to be anchored if OP_SOM starts all its branches.
2221 This is the code for \G, which means "match at start of match position, taking
2222 into account the match offset".
2224 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2225 because that will try the rest of the pattern at all possible matching points,
2226 so there is no point trying again.... er ....
2228 .... except when the .* appears inside capturing parentheses, and there is a
2229 subsequent back reference to those parentheses. We haven't enough information
2230 to catch that case precisely.
2232 At first, the best we could do was to detect when .* was in capturing brackets
2233 and the highest back reference was greater than or equal to that level.
2234 However, by keeping a bitmap of the first 31 back references, we can catch some
2235 of the more common cases more precisely.
2238 code points to start of expression (the bracket)
2239 options points to the options setting
2240 bracket_map a bitmap of which brackets we are inside while testing; this
2241 handles up to substring 31; after that we just have to take
2242 the less precise approach
2243 backref_map the back reference bitmap
2245 Returns: true or false
2249 is_anchored(register const uschar *code, int options, unsigned int bracket_map,
2250 unsigned int backref_map)
2253 const uschar *scode =
2254 first_significant_code(code + 1+LINK_SIZE, false);
2255 register int op = *scode;
2257 /* Capturing brackets */
2263 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
2264 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
2265 if (!is_anchored(scode, options, new_map, backref_map)) return false;
2268 /* Other brackets */
2270 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2272 if (!is_anchored(scode, options, bracket_map, backref_map)) return false;
2275 /* Check for explicit anchoring */
2277 else if (((options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2279 code += GET(code, 1);
2281 while (*code == OP_ALT); /* Loop for each alternative */
2287 /*************************************************
2288 * Check for starting with ^ or .* *
2289 *************************************************/
2291 /* This is called to find out if every branch starts with ^ or .* so that
2292 "first char" processing can be done to speed things up in multiline
2293 matching and for non-DOTALL patterns that start with .* (which must start at
2294 the beginning or after \n). As in the case of is_anchored() (see above), we
2295 have to take account of back references to capturing brackets that contain .*
2296 because in that case we can't make the assumption.
2299 code points to start of expression (the bracket)
2300 bracket_map a bitmap of which brackets we are inside while testing; this
2301 handles up to substring 31; after that we just have to take
2302 the less precise approach
2303 backref_map the back reference bitmap
2305 Returns: true or false
2309 is_startline(const uschar *code, unsigned int bracket_map,
2310 unsigned int backref_map)
2313 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, false);
2314 register int op = *scode;
2316 /* Capturing brackets */
2322 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
2323 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
2324 if (!is_startline(scode, new_map, backref_map)) return false;
2327 /* Other brackets */
2329 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2330 { if (!is_startline(scode, bracket_map, backref_map)) return false; }
2332 /* .* means "start at start or after \n" if it isn't in brackets that
2333 may be referenced. */
2335 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2337 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false;
2340 /* Check for explicit circumflex */
2342 else if (op != OP_CIRC) return false;
2344 /* Move on to the next alternative */
2346 code += GET(code, 1);
2348 while (*code == OP_ALT); /* Loop for each alternative */
2354 /*************************************************
2355 * Check for asserted fixed first char *
2356 *************************************************/
2358 /* During compilation, the "first char" settings from forward assertions are
2359 discarded, because they can cause conflicts with actual literals that follow.
2360 However, if we end up without a first char setting for an unanchored pattern,
2361 it is worth scanning the regex to see if there is an initial asserted first
2362 char. If all branches start with the same asserted char, or with a bracket all
2363 of whose alternatives start with the same asserted char (recurse ad lib), then
2364 we return that char, otherwise -1.
2367 code points to start of expression (the bracket)
2368 options pointer to the options (used to check casing changes)
2369 inassert true if in an assertion
2371 Returns: -1 or the fixed first char
2375 find_firstassertedchar(const uschar *code, int options, BOOL inassert)
2377 register int c = -1;
2380 const uschar *scode =
2381 first_significant_code(code + 1+LINK_SIZE, true);
2382 register int op = *scode;
2384 if (op >= OP_BRA) op = OP_BRA;
2394 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
2396 if (c < 0) c = d; else if (c != d) return -1;
2399 case OP_EXACT: /* Fall through */
2405 case OP_ASCII_LETTER_NC:
2408 if (!inassert) return -1;
2412 if ((options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
2414 else if (c != scode[1]) return -1;
2418 code += GET(code, 1);
2420 while (*code == OP_ALT);
2424 static int calculateCompiledPatternLengthAndFlags(const pcre_char* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase, compile_data& compile_block, ErrorCode errorcode)
2426 /* Make a pass over the pattern to compute the
2427 amount of store required to hold the compiled code. This does not have to be
2428 perfect as long as errors are overestimates. At the same time we can detect any
2429 flag settings right at the start, and extract them. Make an attempt to correct
2430 for any counted white space if an "extended" flag setting appears late in the
2431 pattern. We can't be so clever for #-comments. */
2433 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2434 int branch_extra = 0;
2435 int branch_newextra;
2436 int lastitemlength = 0;
2439 unsigned int brastackptr = 0;
2440 int brastack[BRASTACK_SIZE];
2441 uschar bralenstack[BRASTACK_SIZE];
2442 int item_count = -1;
2445 const pcre_uchar* ptr = (const pcre_uchar*)(pattern - 1);
2446 const pcre_uchar* patternEnd = (const pcre_uchar*)(pattern + patternLength);
2448 while (++ptr < patternEnd)
2450 int min = 0, max = 0;
2457 item_count++; /* Is zero for the first non-comment item */
2461 /* A backslashed item may be an escaped data character or it may be a
2465 c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
2469 lastitemlength = 1; /* Default length of last item for repeats */
2471 if (c >= 0) /* Data character */
2473 length += 2; /* For a one-byte character */
2478 for (i = 0; i < _pcre_utf8_table1_size; i++)
2479 if (c <= _pcre_utf8_table1[i]) break;
2481 lastitemlength += i;
2487 /* Other escapes need one byte */
2491 /* A back reference needs an additional 2 bytes, plus either one or 5
2492 bytes for a repeat. We also need to keep the value of the highest
2497 int refnum = -c - ESC_REF;
2498 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
2499 if (refnum > compile_block.top_backref)
2500 compile_block.top_backref = refnum;
2501 length += 2; /* For single back reference */
2502 if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
2504 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
2505 if (errorcode != 0) return -1;;
2506 if ((min == 0 && (max == 1 || max == -1)) ||
2507 (min == 1 && max == -1))
2510 if (ptr[1] == '?') ptr++;
2516 case '^': /* Single-byte metacharacters */
2523 case '*': /* These repeats won't be after brackets; */
2524 case '+': /* those are handled separately */
2527 goto POSESSIVE; /* A few lines below */
2529 /* This covers the cases of braced repeats after a single char, metachar,
2530 class, or back reference. */
2533 if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR;
2534 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
2535 if (errorcode != 0) return -1;;
2537 /* These special cases just insert one extra opcode */
2539 if ((min == 0 && (max == 1 || max == -1)) ||
2540 (min == 1 && max == -1))
2543 /* These cases might insert additional copies of a preceding character. */
2549 length -= lastitemlength; /* Uncount the original char or metachar */
2550 if (min > 0) length += 3 + lastitemlength;
2552 length += lastitemlength + ((max > 0)? 3 : 1);
2555 if (ptr[1] == '?') ptr++; /* Needs no extra length */
2557 POSESSIVE: /* Test for possessive quantifier */
2561 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
2565 /* An alternation contains an offset to the next branch or ket. If any ims
2566 options changed in the previous branch(es), and/or if we are in a
2567 lookbehind assertion, extra space will be needed at the start of the
2568 branch. This is handled by branch_extra. */
2571 length += 1 + LINK_SIZE + branch_extra;
2574 /* A character class uses 33 characters provided that all the character
2575 values are less than 256. Otherwise, it uses a bit map for low valued
2576 characters, and individual items for others. Don't worry about character
2577 types that aren't allowed in classes - they'll get picked up during the
2578 compile. A character class that contains only one single-byte character
2579 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2580 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2583 if (*(++ptr) == '^')
2585 class_optcount = 10; /* Greater than one */
2588 else class_optcount = 0;
2592 for (; ptr < patternEnd && *ptr != ']'; ++ptr)
2594 /* Check for escapes */
2598 c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
2599 if (errorcode != 0) return -1;;
2601 /* \b is backspace inside a class; \X is literal */
2603 if (-c == ESC_b) c = '\b';
2605 /* Handle escapes that turn into characters */
2607 if (c >= 0) goto NON_SPECIAL_CHARACTER;
2609 /* Escapes that are meta-things. The normal ones just affect the
2610 bit map, but Unicode properties require an XCLASS extended item. */
2614 class_optcount = 10; /* \d, \s etc; make sure > 1 */
2618 /* Anything else increments the possible optimization count. We have to
2619 detect ranges here so that we can compute the number of extra ranges for
2620 caseless wide characters when UCP support is available. If there are wide
2621 characters, we are going to have to use an XCLASS, even for single
2630 GETCHARLENEND(c, ptr, patternEnd, extra);
2634 /* Come here from handling \ above when it escapes to a char value */
2636 NON_SPECIAL_CHARACTER:
2640 if (ptr + 1 < patternEnd && ptr[1] == '-')
2642 pcre_uchar const *hyptr = ptr++;
2643 if (ptr + 1 < patternEnd && ptr[1] == '\\')
2646 d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
2647 if (errorcode != 0) return -1;;
2648 if (-d == ESC_b) d = '\b'; /* backspace */
2650 else if (ptr + 1 < patternEnd && ptr[1] != ']')
2655 GETCHARLENEND(d, ptr, patternEnd, extra);
2659 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
2662 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2663 127 for caseless matching, we will need to use an XCLASS. */
2667 class_optcount = 10; /* Ensure > 1 */
2674 if ((d > 255 || (ignoreCase && d > 127)))
2677 if (!class_utf8) /* Allow for XCLASS overhead */
2680 length += LINK_SIZE + 2;
2683 /* If we have UCP support, find out how many extra ranges are
2684 needed to map the other case of characters within this range. We
2685 have to mimic the range optimization here, because extending the
2686 range upwards might push d over a boundary that makes is use
2687 another byte in the UTF-8 representation. */
2694 while (get_othercase_range(&cc, origd, &occ, &ocd))
2696 if (occ >= c && ocd <= d) continue; /* Skip embedded */
2698 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2699 { /* if there is overlap, */
2700 c = occ; /* noting that if occ < c */
2701 continue; /* we can't have ocd > d */
2702 } /* because a subrange is */
2703 if (ocd > d && occ <= d + 1) /* always shorter than */
2704 { /* the basic range. */
2709 /* An extra item is needed */
2711 length += 1 + _pcre_ord2utf8(occ, buffer) +
2712 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
2716 /* The length of the (possibly extended) range */
2718 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
2723 /* We have a single character. There is nothing to be done unless we
2724 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2725 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2730 if ((c > 255 || (ignoreCase && c > 127)))
2733 class_optcount = 10; /* Ensure > 1 */
2734 if (!class_utf8) /* Allow for XCLASS overhead */
2737 length += LINK_SIZE + 2;
2739 length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
2745 if (ptr >= patternEnd) /* Missing terminating ']' */
2751 /* We can optimize when there was only one optimizable character. Repeats
2752 for positive and negated single one-byte chars are handled by the general
2753 code. Here, we handle repeats for the class opcodes. */
2755 if (class_optcount == 1) length += 3; else
2759 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2760 we also need extra for wrapping the whole thing in a sub-pattern. */
2762 if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
2764 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
2765 if (errorcode != 0) return -1;;
2766 if ((min == 0 && (max == 1 || max == -1)) ||
2767 (min == 1 && max == -1))
2770 if (ptr + 1 < patternEnd && ptr[1] == '+')
2773 length += 2 + 2*LINK_SIZE;
2775 else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++;
2780 /* Brackets may be genuine groups or special things */
2783 branch_newextra = 0;
2784 bracket_length = 1 + LINK_SIZE;
2787 /* Handle special forms of bracket, which all start (? */
2789 if (ptr + 1 < patternEnd && ptr[1] == '?')
2791 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0))
2793 /* Non-referencing groups and lookaheads just move the pointer on, and
2794 then behave like a non-special bracket, except that they don't increment
2795 the count of extracting brackets. Ditto for the "once only" bracket,
2796 which is in Perl from version 5.005. */
2804 /* Else loop checking valid options until ) is met. Anything else is an
2805 error. If we are without any brackets, i.e. at top level, the settings
2806 act as if specified in the options, so massage the options immediately.
2807 This is for backward compatibility with Perl 5.004. */
2817 /* Capturing brackets must be counted so we can process escapes in a
2818 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2819 an additional 3 bytes of memory per capturing bracket. */
2824 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2827 /* Save length for computing whole length at end if there's a repeat that
2828 requires duplication of the group. Also save the current value of
2829 branch_extra, and start the new group with the new value. If non-zero, this
2830 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2832 if (brastackptr >= sizeof(brastack)/sizeof(int))
2838 bralenstack[brastackptr] = branch_extra;
2839 branch_extra = branch_newextra;
2841 brastack[brastackptr++] = length;
2842 length += bracket_length;
2845 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2846 have to replicate this bracket up to that many times. If brastackptr is
2847 0 this is an unmatched bracket which will generate an error, but take care
2848 not to try to access brastack[-1] when computing the length and restoring
2849 the branch_extra value. */
2852 length += 1 + LINK_SIZE;
2853 if (brastackptr > 0)
2855 duplength = length - brastack[--brastackptr];
2856 branch_extra = bralenstack[brastackptr];
2860 /* Leave ptr at the final char; for read_repeat_counts this happens
2861 automatically; for the others we need an increment. */
2863 if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd))
2865 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
2866 if (errorcode != 0) return -1;;
2868 else if (c == '*') { min = 0; max = -1; ptr++; }
2869 else if (c == '+') { min = 1; max = -1; ptr++; }
2870 else if (c == '?') { min = 0; max = 1; ptr++; }
2871 else { min = 1; max = 1; }
2873 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2874 group, and if the maximum is greater than zero, we have to replicate
2875 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2881 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
2884 /* When the minimum is greater than zero, we have to replicate up to
2885 minval-1 times, with no additions required in the copies. Then, if there
2886 is a limited maximum we have to replicate up to maxval-1 times allowing
2887 for a BRAZERO item before each optional copy and nesting brackets for all
2888 but one of the optional copies. */
2892 length += (min - 1) * duplength;
2893 if (max > min) /* Need this test as max=-1 means no limit */
2894 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
2895 - (2 + 2*LINK_SIZE);
2898 /* Allow space for once brackets for "possessive quantifier" */
2900 if (ptr + 1 < patternEnd && ptr[1] == '+')
2903 length += 2 + 2*LINK_SIZE;
2907 /* Non-special character. It won't be space or # in extended mode, so it is
2908 always a genuine character. If we are in a \Q...\E sequence, check for the
2909 end; if not, we have a literal. */
2914 length += 2; /* For a one-byte character */
2915 lastitemlength = 1; /* Default length of last item for repeats */
2917 /* In UTF-8 mode, check for additional bytes. */
2921 if (IS_LEADING_SURROGATE(c))
2923 c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
2929 for (i = 0; i < _pcre_utf8_table1_size; i++)
2930 if (c <= _pcre_utf8_table1[i]) break;
2932 lastitemlength += i;
2940 length += 2 + LINK_SIZE; /* For final KET and END */
2945 static void printCompiledRegExp(real_pcre* re, int length)
2947 printf("Length = %d top_bracket = %d top_backref = %d\n",
2948 length, re->top_bracket, re->top_backref);
2952 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2953 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2954 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "");
2957 if (re->options & PCRE_FIRSTSET) {
2958 char ch = re->first_byte & 255;
2959 const char* caseless = ((re->first_byte & REQ_CASELESS) == 0) ? "" : " (caseless)";
2960 if (isASCIIAlphanumeric(ch))
2961 printf("First char = %c%s\n", ch, caseless);
2963 printf("First char = \\x%02x%s\n", ch, caseless);
2966 if (re->options & PCRE_REQCHSET) {
2967 char ch = re->req_byte & 255;
2968 const char* caseless = ((re->req_byte & REQ_CASELESS) == 0) ? "" : " (caseless)";
2969 if (isASCIIAlphanumeric(ch))
2970 printf("Req char = %c%s\n", ch, caseless);
2972 printf("Req char = \\x%02x%s\n", ch, caseless);
2975 // This debugging function has been removed from JavaScriptCore's PCRE
2976 //pcre_printint(re, stdout);
2980 /*************************************************
2981 * Compile a Regular Expression *
2982 *************************************************/
2984 /* This function takes a string and returns a pointer to a block of store
2985 holding a compiled version of the expression. The original API for this
2986 function had no error code return variable; it is retained for backwards
2987 compatibility. The new function is given a new name.
2990 pattern the regular expression
2991 options various option bits
2992 errorcodeptr pointer to error code variable (pcre_compile2() only)
2993 can be NULL if you don't want a code value
2994 errorptr pointer to pointer to error text
2995 erroroffset ptr offset in pattern where error was detected
2996 tables pointer to character tables or NULL
2998 Returns: pointer to compiled data block, or NULL on error,
2999 with errorptr and erroroffset set
3002 static pcre* returnError(ErrorCode errorcode, const char** errorptr)
3004 *errorptr = error_text(errorcode);
3009 jsRegExpCompile(const pcre_char* pattern, int patternLength,
3010 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
3011 unsigned* numSubpatterns, const char** errorptr)
3013 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3014 can do is just return NULL, but we can set a code value if there is a code
3021 /* Set up pointers to the individual character tables */
3023 compile_data compile_block;
3025 ErrorCode errorcode = ERR0;
3026 int length = calculateCompiledPatternLengthAndFlags(pattern, patternLength, ignoreCase, compile_block, errorcode);
3028 return returnError(errorcode, errorptr);
3030 if (length > MAX_PATTERN_SIZE)
3031 return returnError(ERR16, errorptr);
3033 /* Compute the size of data block needed and get it. */
3035 size_t size = length + sizeof(real_pcre);
3036 real_pcre* re = reinterpret_cast<real_pcre*>(new char[size]);
3039 return returnError(ERR13, errorptr);
3041 /* Put in the magic number, and save the sizes, options, and character table
3042 pointer. NULL is used for the default character tables. The nullpad field is at
3043 the end; it's there to help in the case when a regex compiled on a system with
3044 4-byte pointers is run on another with 8-byte pointers. */
3046 re->size = (pcre_uint32)size;
3047 re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0);
3049 /* The starting points of the name/number translation table and of the code are
3050 passed around in the compile data block. */
3052 const uschar* codestart = (const uschar*)(re + 1);
3053 compile_block.start_code = codestart;
3054 compile_block.start_pattern = (const pcre_uchar*)pattern;
3056 /* Set up a starting, non-extracting bracket, then compile the expression. On
3057 error, errorcode will be set non-zero, so we don't need to look at the result
3058 of the function here. */
3060 const pcre_uchar* ptr = (const pcre_uchar*)pattern;
3061 const pcre_uchar* patternEnd = pattern + patternLength;
3062 uschar* code = (uschar*)codestart;
3064 int firstbyte, reqbyte;
3065 int bracketCount = 0;
3066 (void)compile_regex(re->options, &bracketCount, &code, &ptr,
3068 &errorcode, 0, &firstbyte, &reqbyte, &compile_block);
3069 re->top_bracket = bracketCount;
3070 re->top_backref = compile_block.top_backref;
3072 /* If not reached end of pattern on success, there's an excess bracket. */
3074 if (errorcode == 0 && ptr < patternEnd)
3077 /* Fill in the terminating state and check for disastrous overflow, but
3078 if debugging, leave the test till after things are printed out. */
3083 if (code - codestart > length)
3087 /* Give an error if there's back reference to a non-existent capturing
3090 if (re->top_backref > re->top_bracket)
3093 /* Failed to compile, or error while post-processing */
3095 if (errorcode != ERR0) {
3096 delete [] reinterpret_cast<char*>(re);
3097 return returnError(errorcode, errorptr);
3100 /* If the anchored option was not passed, set the flag if we can determine that
3101 the pattern is anchored by virtue of ^ characters or \A or anything else (such
3102 as starting with .* when DOTALL is set).
3104 Otherwise, if we know what the first character has to be, save it, because that
3105 speeds up unanchored matches no end. If not, see if we can set the
3106 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3107 start with ^. and also when all branches start with .* for non-DOTALL matches.
3110 if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
3111 re->options |= PCRE_ANCHORED;
3114 firstbyte = find_firstassertedchar(codestart, re->options, false);
3115 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
3117 int ch = firstbyte & 255;
3119 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
3120 compile_block.fcc[ch] == ch)? ch : firstbyte;
3121 re->options |= PCRE_FIRSTSET;
3124 else if (is_startline(codestart, 0, compile_block.backref_map))
3125 re->options |= PCRE_STARTLINE;
3128 /* For an anchored pattern, we use the "required byte" only if it follows a
3129 variable length item in the regex. Remove the caseless flag for non-caseable
3132 if (reqbyte >= 0 && (!(re->options & PCRE_ANCHORED) || (reqbyte & REQ_VARY))) {
3133 int ch = reqbyte & 255;
3135 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
3136 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
3137 re->options |= PCRE_REQCHSET;
3142 printCompiledRegExp(re);
3144 /* This check is done here in the debugging case so that the code that
3145 was compiled can be seen. */
3146 if (code - codestart > length) {
3148 *errorptr = error_text(ERR7);
3155 *numSubpatterns = re->top_bracket;
3159 void jsRegExpFree(JSRegExp* re)
3161 delete [] reinterpret_cast<char*>(re);