1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains the external function jsRegExpExecute(), along with
41 supporting internal functions that are not used by other modules. */
45 #include "pcre_internal.h"
47 #include <wtf/ASCIICType.h>
48 #include <wtf/FastMalloc.h>
52 /*************************************************
53 * Code parameters and static tables *
54 *************************************************/
56 /* Maximum number of items on the nested bracket stacks at compile time. This
57 applies to the nesting of all kinds of parentheses. It does not limit
58 un-nested, non-capturing parentheses. This number can be made bigger if
59 necessary - it is used to dimension one int and one unsigned char vector at
62 #define BRASTACK_SIZE 200
64 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
65 are simple data values; negative values are for special things like \d and so
66 on. Zero means further processing is needed (for things like \x), or the escape
69 static const short escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
78 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
82 /* Error code numbers. They are given names so that they can more easily be
86 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
87 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
90 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
91 the definition is next to the definition of the opcodes in pcre_internal.h. */
93 static const uschar OP_lengths[] = { OP_LENGTHS };
95 /* The texts of compile-time error messages. These are "char *" because they
96 are passed to the outside world. */
98 static const char* error_text(ErrorCode code)
100 static const char error_texts[] =
102 "\\ at end of pattern\0"
103 "\\c at end of pattern\0"
104 "character value in \\x{...} sequence is too large\0"
105 "numbers out of order in {} quantifier\0"
107 "number too big in {} quantifier\0"
108 "missing terminating ] for character class\0"
109 "internal error: code overflow\0"
110 "range out of order in character class\0"
111 "nothing to repeat\0"
113 "unmatched parentheses\0"
114 "internal error: unexpected repeat\0"
115 "unrecognized character after (?\0"
116 "failed to get memory\0"
119 "reference to non-existent subpattern\0"
120 "regular expression too large\0"
121 "parentheses nested too deeply"
125 const char* text = error_texts;
131 /* Definition to allow mutual recursion */
134 compile_regex(int, int *, uschar **, const pcre_uchar **, const pcre_uchar *, ErrorCode*, int,
135 int *, int *, compile_data *);
137 /*************************************************
139 *************************************************/
141 /* This function is called when a \ has been encountered. It either returns a
142 positive value for a simple escape such as \n, or a negative value which
143 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
144 a positive value greater than 255 may be returned. On entry, ptr is pointing at
145 the \. On exit, it is on the final character of the escape sequence.
148 ptrptr points to the pattern position pointer
149 errorcodeptr points to the errorcode variable
150 bracount number of previous extracting brackets
151 options the options bits
152 isclass true if inside a character class
154 Returns: zero or positive => a data character
155 negative => a special escape sequence
156 on error, errorptr is set
160 check_escape(const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int bracount,
163 const pcre_uchar *ptr = *ptrptr + 1;
166 /* If backslash is at the end of the pattern, it's an error. */
167 if (ptr == patternEnd) {
168 *errorcodeptr = ERR1;
175 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
176 a table. A non-zero result is something that can be returned immediately.
177 Otherwise further processing may be required. */
179 if (c < '0' || c > 'z') {} /* Not alphameric */
180 else if ((i = escapes[c - '0']) != 0) c = i;
182 /* Escapes that need further processing, or are illegal. */
186 const pcre_uchar *oldptr;
189 /* A number of Perl escapes are not handled by PCRE. We give an explicit
192 /* The handling of escape sequences consisting of a string of digits
193 starting with one that is not zero is not straightforward. By experiment,
194 the way Perl works seems to be as follows:
196 Outside a character class, the digits are read as a decimal number. If the
197 number is less than 10, or if there are that many previous extracting
198 left brackets, then it is a back reference. Otherwise, up to three octal
199 digits are read to form an escaped byte. Thus \123 is likely to be octal
200 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
201 value is greater than 377, the least significant 8 bits are taken. Inside a
202 character class, \ followed by a digit is always an octal number. */
204 case '1': case '2': case '3': case '4': case '5':
205 case '6': case '7': case '8': case '9':
211 while (ptr + 1 < patternEnd && isASCIIDigit(ptr[1]))
212 c = c * 10 + *(++ptr) - '0';
213 if (c < 10 || c <= bracount)
218 ptr = oldptr; /* Put the pointer back and fall through */
221 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
222 generates a binary zero byte and treats the digit as a following literal.
223 Thus we have to pull back the pointer by one. */
225 if ((c = *ptr) >= '8')
232 /* \0 always starts an octal number, but we may drop through to here with a
233 larger first octal digit. */
237 while (i++ < 2 && ptr + 1 < patternEnd && ptr[1] >= '0' && ptr[1] <= '7')
238 c = c * 8 + *(++ptr) - '0';
239 c &= 255; /* Take least significant 8 bits */
242 /* \x is complicated. \x{ddd} is a character number which can be greater
243 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
244 treated as a data character. */
247 if (ptr + 1 < patternEnd && ptr[1] == '{')
249 const pcre_uchar *pt = ptr + 2;
253 while (pt < patternEnd && isASCIIHexDigit(*pt))
255 register int cc = *pt++;
256 if (c == 0 && cc == '0') continue; /* Leading zeroes */
259 if (cc >= 'a') cc -= 32; /* Convert to upper case */
260 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
263 if (pt < patternEnd && *pt == '}')
265 if (c < 0 || count > 8) *errorcodeptr = ERR3;
266 else if (c >= 0xD800 && c <= 0xDFFF) *errorcodeptr = ERR3; // half of surrogate pair
267 else if (c >= 0xFDD0 && c <= 0xFDEF) *errorcodeptr = ERR3; // ?
268 else if (c == 0xFFFE) *errorcodeptr = ERR3; // not a character
269 else if (c == 0xFFFF) *errorcodeptr = ERR3; // not a character
270 else if (c > 0x10FFFF) *errorcodeptr = ERR3; // out of Unicode character range
275 /* If the sequence of hex digits does not end with '}', then we don't
276 recognize this construct; fall through to the normal \x handling. */
279 /* Read just a single-byte hex-defined char */
282 while (i++ < 2 && ptr + 1 < patternEnd && isASCIIHexDigit(ptr[1]))
284 int cc; /* Some compilers don't like ++ */
285 cc = *(++ptr); /* in initializers */
286 if (cc >= 'a') cc -= 32; /* Convert to upper case */
287 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
292 const pcre_uchar *pt = ptr;
296 if (pt + 1 >= patternEnd || !isASCIIHexDigit(pt[1]))
304 int cc; /* Some compilers don't like ++ */
305 cc = *(++pt); /* in initializers */
306 if (cc >= 'a') cc -= 32; /* Convert to upper case */
307 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
314 /* Other special escapes not starting with a digit are straightforward */
317 if (++ptr == patternEnd)
319 *errorcodeptr = ERR2;
324 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
325 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
327 if (c >= 'a' && c <= 'z') c -= 32;
339 /*************************************************
340 * Check for counted repeat *
341 *************************************************/
343 /* This function is called when a '{' is encountered in a place where it might
344 start a quantifier. It looks ahead to see if it really is a quantifier or not.
345 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
346 where the ddds are digits.
349 p pointer to the first char after '{'
351 Returns: true or false
355 is_counted_repeat(const pcre_uchar *p, const pcre_uchar *patternEnd)
357 if (p >= patternEnd || !isASCIIDigit(*p))
360 while (p < patternEnd && isASCIIDigit(*p))
362 if (p < patternEnd && *p == '}')
365 if (p >= patternEnd || *p++ != ',')
367 if (p < patternEnd && *p == '}')
370 if (p >= patternEnd || !isASCIIDigit(*p))
373 while (p < patternEnd && isASCIIDigit(*p))
376 return (p < patternEnd && *p == '}');
381 /*************************************************
382 * Read repeat counts *
383 *************************************************/
385 /* Read an item of the form {n,m} and return the values. This is called only
386 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
387 so the syntax is guaranteed to be correct, but we need to check the values.
390 p pointer to first char after '{'
391 minp pointer to int for min
392 maxp pointer to int for max
393 returned as -1 if no max
394 errorcodeptr points to error code variable
396 Returns: pointer to '}' on success;
397 current ptr on error, with errorcodeptr set non-zero
400 static const pcre_uchar *
401 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, ErrorCode* errorcodeptr)
406 /* Read the minimum value and do a paranoid check: a negative value indicates
407 an integer overflow. */
409 while (isASCIIDigit(*p)) min = min * 10 + *p++ - '0';
410 if (min < 0 || min > 65535)
412 *errorcodeptr = ERR5;
416 /* Read the maximum value if there is one, and again do a paranoid on its size.
417 Also, max must not be less than min. */
419 if (*p == '}') max = min; else
424 while (isASCIIDigit(*p)) max = max * 10 + *p++ - '0';
425 if (max < 0 || max > 65535)
427 *errorcodeptr = ERR5;
432 *errorcodeptr = ERR4;
438 /* Fill in the required variables, and pass back the pointer to the terminating
448 /*************************************************
449 * Find first significant op code *
450 *************************************************/
452 /* This is called by several functions that scan a compiled expression looking
453 for a fixed first character, or an anchoring op code etc. It skips over things
454 that do not influence this. For some calls, a change of option is important.
455 For some calls, it makes sense to skip negative forward and all backward
456 assertions, and also the \b assertion; for others it does not.
459 code pointer to the start of the group
460 skipassert true if certain assertions are to be skipped
462 Returns: pointer to the first significant opcode
466 first_significant_code(const uschar *code, BOOL skipassert)
473 if (!skipassert) return code;
474 do code += GET(code, 1); while (*code == OP_ALT);
475 code += OP_lengths[*code];
478 case OP_WORD_BOUNDARY:
479 case OP_NOT_WORD_BOUNDARY:
480 if (!skipassert) return code;
484 code += OP_lengths[*code];
491 /* Control never reaches here */
497 /*************************************************
498 * Find the fixed length of a pattern *
499 *************************************************/
501 /* Scan a pattern and compute the fixed length of subject that will match it,
502 if the length is fixed. This is needed for dealing with backward assertions.
503 In UTF8 mode, the result is in characters rather than bytes.
506 code points to the start of the pattern (the bracket)
507 options the compiling options
509 Returns: the fixed length, or -1 if there is no fixed length,
510 or -2 if \C was encountered
514 find_fixedlength(uschar *code, int options)
518 register int branchlength = 0;
519 register uschar *cc = code + 1 + LINK_SIZE;
521 /* Scan along the opcodes for this branch. If we get to the end of the
522 branch, check the length against that of the other branches. */
527 register int op = *cc;
528 if (op >= OP_BRA) op = OP_BRA;
534 d = find_fixedlength(cc, options);
537 do cc += GET(cc, 1); while (*cc == OP_ALT);
541 /* Reached end of a branch; if it's a ket it is the end of a nested
542 call. If it's ALT it is an alternation in a nested call. If it is
543 END it's the end of the outer call. All can be handled by the same code. */
550 if (length < 0) length = branchlength;
551 else if (length != branchlength) return -1;
552 if (*cc != OP_ALT) return length;
557 /* Skip over assertive subpatterns */
561 do cc += GET(cc, 1); while (*cc == OP_ALT);
564 /* Skip over things that don't match chars */
569 case OP_NOT_WORD_BOUNDARY:
570 case OP_WORD_BOUNDARY:
571 cc += OP_lengths[*cc];
574 /* Handle literal characters */
581 while ((*cc & 0xc0) == 0x80) cc++;
585 case OP_ASCII_LETTER_NC:
590 /* Handle exact repetitions. The count is already in characters, but we
591 need to skip over a multibyte character in UTF8 mode. */
594 branchlength += GET2(cc,1);
596 while((*cc & 0x80) == 0x80) cc++;
600 branchlength += GET2(cc,1);
604 /* Handle single-char matchers */
608 case OP_NOT_WHITESPACE:
610 case OP_NOT_WORDCHAR:
617 /* Check a class for variable quantification */
620 cc += GET(cc, 1) - 33;
637 if (GET2(cc,1) != GET2(cc,3)) return -1;
638 branchlength += GET2(cc,1);
647 /* Anything else is variable length */
653 /* Control never gets here */
659 /*************************************************
660 * Scan compiled branch for non-emptiness *
661 *************************************************/
663 /* This function scans through a branch of a compiled pattern to see whether it
664 can match the empty string or not. It is called only from could_be_empty()
665 below. Note that first_significant_code() skips over assertions. If we hit an
666 unclosed bracket, we return "empty" - this means we've struck an inner bracket
667 whose current branch will already have been scanned.
670 code points to start of search
671 endcode points to where to stop
673 Returns: true if what is matched could be empty
677 could_be_empty_branch(const uschar *code, const uschar *endcode)
680 for (code = first_significant_code(code + 1 + LINK_SIZE, true);
682 code = first_significant_code(code + OP_lengths[c], true))
691 if (GET(code, 1) == 0) return true; /* Hit unclosed bracket */
693 /* Scan a closed bracket */
695 empty_branch = false;
698 if (!empty_branch && could_be_empty_branch(code, endcode))
700 code += GET(code, 1);
702 while (*code == OP_ALT);
703 if (!empty_branch) return false; /* All branches are non-empty */
704 code += 1 + LINK_SIZE;
710 /* Check for quantifiers after a class */
713 ccode = code + GET(code, 1);
714 goto CHECK_CLASS_REPEAT;
724 case OP_CRSTAR: /* These could be empty; continue */
730 default: /* Non-repeat => class must match */
731 case OP_CRPLUS: /* These repeats aren't empty */
737 if (GET2(ccode, 1) > 0) return false; /* Minimum > 0 */
742 /* Opcodes that must match a character */
746 case OP_NOT_WHITESPACE:
748 case OP_NOT_WORDCHAR:
754 case OP_ASCII_LETTER_NC:
775 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
776 followed by a multibyte character */
784 while ((code[2] & 0xc0) == 0x80) code++;
794 /*************************************************
795 * Complete a callout item *
796 *************************************************/
798 /* A callout item contains the length of the next item in the pattern, which
799 we can't fill in till after we have reached the relevant point. This is used
800 for both automatic and manual callouts.
803 previous_callout points to previous callout item
804 ptr current pattern pointer
805 cd pointers to tables etc
811 complete_callout(uschar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
813 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
814 PUT(previous_callout, 2 + LINK_SIZE, length);
819 /*************************************************
820 * Get othercase range *
821 *************************************************/
823 /* This function is passed the start and end of a class range, in UTF-8 mode
824 with UCP support. It searches up the characters, looking for internal ranges of
825 characters in the "other" case. Each call returns the next one, updating the
829 cptr points to starting character value; updated
831 ocptr where to put start of othercase range
832 odptr where to put end of othercase range
834 Yield: true when range returned; false when no more
838 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
840 int c, othercase = 0, next;
842 for (c = *cptr; c <= d; c++)
843 { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
845 if (c > d) return false;
848 next = othercase + 1;
850 for (++c; c <= d; c++)
852 if (_pcre_ucp_othercase(c) != next) break;
863 /*************************************************
864 * Compile one branch *
865 *************************************************/
867 /* Scan the pattern, compiling it into the code vector. If the options are
868 changed during the branch, the pointer is used to change the external options
872 options the option bits
873 brackets points to number of extracting brackets used
874 codeptr points to the pointer to the current code point
875 ptrptr points to the current pattern pointer
876 errorcodeptr points to error code variable
877 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
878 reqbyteptr set to the last literal character required, else < 0
879 cd contains pointers to tables etc.
881 Returns: true on success
882 false, with *errorcodeptr set non-zero on error
886 compile_branch(int options, int *brackets, uschar **codeptr,
887 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
888 int *reqbyteptr, compile_data *cd)
890 int repeat_type, op_type;
891 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
893 int firstbyte, reqbyte;
894 int zeroreqbyte, zerofirstbyte;
895 int req_caseopt, reqvary, tempreqvary;
896 int after_manual_callout = 0;
898 register uschar *code = *codeptr;
900 BOOL groupsetfirstbyte = false;
901 const pcre_uchar *ptr = *ptrptr;
902 const pcre_uchar *tempptr;
903 uschar *previous = NULL;
904 uschar *previous_callout = NULL;
905 uschar classbits[32];
908 uschar *class_utf8data;
911 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
912 matching encountered yet". It gets changed to REQ_NONE if we hit something that
913 matches a non-fixed char first char; reqbyte just remains unset if we never
916 When we hit a repeat whose minimum is zero, we may have to adjust these values
917 to take the zero repeat into account. This is implemented by setting them to
918 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
919 item types that can be repeated set these backoff variables appropriately. */
921 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
923 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
924 according to the current setting of the caseless flag. REQ_CASELESS is a bit
925 value > 255. It is added into the firstbyte or reqbyte variables to record the
926 case status of the value. This is used only for ASCII characters. */
928 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
930 /* Switch on next character until the end of the branch */
935 BOOL should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
936 BOOL possessive_quantifier;
946 /* Next byte in the pattern */
948 c = ptr < patternEnd ? *ptr : 0;
950 /* Fill in length of a previous callout, except when the next thing is
953 is_quantifier = c == '*' || c == '+' || c == '?' ||
954 (c == '{' && is_counted_repeat(ptr+1, patternEnd));
956 if (!is_quantifier && previous_callout != NULL &&
957 after_manual_callout-- <= 0)
959 complete_callout(previous_callout, ptr, cd);
960 previous_callout = NULL;
965 /* The branch terminates at end of string, |, or ). */
968 if (ptr < patternEnd)
970 // End of string; fall through
973 *firstbyteptr = firstbyte;
974 *reqbyteptr = reqbyte;
979 /* Handle single-character metacharacters. In multiline mode, ^ disables
980 the setting of any following char as a first character. */
983 if ((options & PCRE_MULTILINE) != 0)
985 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
996 /* There can never be a first char if '.' is first, whatever happens about
997 repeats. The value of reqbyte doesn't change either. */
1000 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1001 zerofirstbyte = firstbyte;
1002 zeroreqbyte = reqbyte;
1007 /* Character classes. If the included characters are all < 256, we build a
1008 32-byte bitmap of the permitted characters, except in the special case
1009 where there is only one such character. For negated classes, we build the
1010 map as usual, then invert it at the end. However, we use a different opcode
1011 so that data characters > 255 can be handled correctly.
1013 If the class contains characters outside the 0-255 range, a different
1014 opcode is compiled. It may optionally have a bit map for characters < 256,
1015 but those above are are explicitly listed afterwards. A flag byte tells
1016 whether the bitmap is present, and whether this is a negated class or not.
1021 should_flip_negation = false;
1023 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1024 they are encountered at the top level, so we'll do that too. */
1026 /* If the first character is '^', set the negation flag and skip it. */
1028 if ((c = *(++ptr)) == '^')
1030 negate_class = true;
1035 negate_class = false;
1038 /* Keep a count of chars with values < 256 so that we can optimize the case
1039 of just a single character (as long as it's < 256). For higher valued UTF-8
1040 characters, we don't yet do any optimization. */
1042 class_charcount = 0;
1043 class_lastchar = -1;
1045 class_utf8 = false; /* No chars >= 256 */
1046 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1048 /* Initialize the 32-char bit map to all zeros. We have to build the
1049 map in a temporary bit of store, in case the class contains only 1
1050 character (< 256), because in that case the compiled code doesn't use the
1053 memset(classbits, 0, 32 * sizeof(uschar));
1055 /* Process characters until ] is reached. By writing this as a "do" it
1056 means that an initial ] is taken as a data character. The first pass
1057 through the regex checked the overall syntax, so we don't need to be very
1058 strict here. At the start of the loop, c contains the first byte of the
1064 { /* Braces are required because the */
1065 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1068 /* Backslash may introduce a single character, or it may introduce one
1069 of the specials, which just set a flag. Escaped items are checked for
1070 validity in the pre-compiling pass. The sequence \b is a special case.
1071 Inside a class (and only there) it is treated as backspace. Elsewhere
1072 it marks a word boundary. Other escapes have preset maps ready to
1073 or into the one we are building. We assume they have more than one
1074 character in them, so set class_charcount bigger than one. */
1078 c = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, true);
1080 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1084 register const uschar *cbits = cd->cbits;
1085 class_charcount += 2; /* Greater than 1 is what matters */
1089 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1093 should_flip_negation = true;
1094 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1098 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1102 should_flip_negation = true;
1103 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1107 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1111 should_flip_negation = true;
1112 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1115 /* Unrecognized escapes are faulted if PCRE is running in its
1116 strict mode. By default, for compatibility with Perl, they are
1117 treated as literals. */
1120 c = *ptr; /* The final character */
1121 class_charcount -= 2; /* Undo the default count from above */
1125 /* Fall through if we have a single character (c >= 0). This may be
1126 > 256 in UTF-8 mode. */
1128 } /* End of backslash handling */
1130 /* A single character may be followed by '-' to form a range. However,
1131 Perl does not permit ']' to be the end of the range. A '-' character
1132 here is treated as a literal. */
1134 if (ptr[1] == '-' && ptr[2] != ']')
1139 GETCHARLEN(d, ptr, ptr);
1141 /* The second part of a range can be a single-character escape, but
1142 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1143 in such circumstances. */
1147 const pcre_uchar *oldptr = ptr;
1148 d = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, true);
1150 /* \b is backslash; \X is literal X; any other special means the '-'
1155 if (d == -ESC_b) d = '\b';
1159 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1164 /* The check that the two values are in the correct order happens in
1165 the pre-pass. Optimize one-character ranges */
1167 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
1169 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
1170 matching, we have to use an XCLASS with extra data items. Caseless
1171 matching for characters > 127 is available only if UCP support is
1174 if ((d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
1178 /* With UCP support, we can find the other case equivalents of
1179 the relevant characters. There may be several ranges. Optimize how
1180 they fit with the basic range. */
1182 if ((options & PCRE_CASELESS) != 0)
1187 while (get_othercase_range(&cc, origd, &occ, &ocd))
1189 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
1191 if (occ < c && ocd >= c - 1) /* Extend the basic range */
1192 { /* if there is overlap, */
1193 c = occ; /* noting that if occ < c */
1194 continue; /* we can't have ocd > d */
1195 } /* because a subrange is */
1196 if (ocd > d && occ <= d + 1) /* always shorter than */
1197 { /* the basic range. */
1204 *class_utf8data++ = XCL_SINGLE;
1208 *class_utf8data++ = XCL_RANGE;
1209 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
1211 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
1215 /* Now record the original range, possibly modified for UCP caseless
1216 overlapping ranges. */
1218 *class_utf8data++ = XCL_RANGE;
1219 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
1220 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
1222 /* With UCP support, we are done. Without UCP support, there is no
1223 caseless matching for UTF-8 characters > 127; we can use the bit map
1224 for the smaller ones. */
1226 continue; /* With next character in the class */
1229 /* We use the bit map for all cases when not in UTF-8 mode; else
1230 ranges that lie entirely within 0-127 when there is UCP support; else
1231 for partial ranges without UCP support. */
1235 classbits[c/8] |= (1 << (c&7));
1236 if ((options & PCRE_CASELESS) != 0)
1238 int uc = cd->fcc[c]; /* flip case */
1239 classbits[uc/8] |= (1 << (uc&7));
1241 class_charcount++; /* in case a one-char range */
1245 continue; /* Go get the next char in the class */
1248 /* Handle a lone single character - we can get here for a normal
1249 non-escape char, or after \ that introduces a single character or for an
1250 apparent range that isn't. */
1252 LONE_SINGLE_CHARACTER:
1254 /* Handle a character that cannot go in the bit map */
1256 if ((c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
1259 *class_utf8data++ = XCL_SINGLE;
1260 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
1262 if ((options & PCRE_CASELESS) != 0)
1265 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
1267 *class_utf8data++ = XCL_SINGLE;
1268 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
1275 /* Handle a single-byte character */
1277 classbits[c/8] |= (1 << (c&7));
1278 if ((options & PCRE_CASELESS) != 0)
1280 c = cd->fcc[c]; /* flip case */
1281 classbits[c/8] |= (1 << (c&7));
1288 /* Loop until ']' reached; the check for end of string happens inside the
1289 loop. This "while" is the end of the "do" above. */
1291 while ((c = *(++ptr)) != ']');
1293 /* If class_charcount is 1, we saw precisely one character whose value is
1294 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
1295 can optimize the negative case only if there were no characters >= 128
1296 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
1297 single-bytes only. This is an historical hangover. Maybe one day we can
1298 tidy these opcodes to handle multi-byte characters.
1300 The optimization throws away the bit map. We turn the item into a
1301 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
1302 that OP_NOT does not support multibyte characters. In the positive case, it
1303 can cause firstbyte to be set. Otherwise, there can be no first char if
1304 this item is first, whatever repeat count may follow. In the case of
1305 reqbyte, save the previous value for reinstating. */
1307 if (class_charcount == 1 &&
1308 (!class_utf8 && (!negate_class || class_lastchar < 128)))
1310 zeroreqbyte = reqbyte;
1312 /* The OP_NOT opcode works on one-byte characters only. */
1316 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1317 zerofirstbyte = firstbyte;
1319 *code++ = class_lastchar;
1323 /* For a single, positive character, get the value into c, and
1324 then we can handle this with the normal one-character code. */
1328 } /* End of 1-char optimization */
1330 /* The general case - not the one-char optimization. If this is the first
1331 thing in the branch, there can be no first char setting, whatever the
1332 repeat count. Any reqbyte setting must remain unchanged after any kind of
1335 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1336 zerofirstbyte = firstbyte;
1337 zeroreqbyte = reqbyte;
1339 /* If there are characters with values > 255, we have to compile an
1340 extended class, with its own opcode. If there are no characters < 256,
1341 we can omit the bitmap. */
1343 if (class_utf8 && !should_flip_negation)
1345 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
1346 *code++ = OP_XCLASS;
1348 *code = negate_class? XCL_NOT : 0;
1350 /* If the map is required, install it, and move on to the end of
1353 if (class_charcount > 0)
1356 memcpy(code, classbits, 32);
1357 code = class_utf8data;
1360 /* If the map is not required, slide down the extra data. */
1364 int len = class_utf8data - (code + 33);
1365 memmove(code + 1, code + 33, len);
1369 /* Now fill in the complete length of the item */
1371 PUT(previous, 1, code - previous);
1372 break; /* End of class handling */
1375 /* If there are no characters > 255, negate the 32-byte map if necessary,
1376 and copy it into the code vector. If this is the first thing in the branch,
1377 there can be no first char setting, whatever the repeat count. Any reqbyte
1378 setting must remain unchanged after any kind of repeat. */
1380 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1383 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
1387 memcpy(code, classbits, 32);
1392 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1393 has been tested above. */
1396 if (!is_quantifier) goto NORMAL_CHAR;
1397 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
1398 if (*errorcodeptr != 0) goto FAILED;
1416 if (previous == NULL)
1418 *errorcodeptr = ERR9;
1422 if (repeat_min == 0)
1424 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
1425 reqbyte = zeroreqbyte; /* Ditto */
1428 /* Remember whether this is a variable length repeat */
1430 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
1432 op_type = 0; /* Default single-char op codes */
1433 possessive_quantifier = false; /* Default not possessive quantifier */
1435 /* Save start of previous item, in case we have to move it up to make space
1436 for an inserted OP_ONCE for the additional '+' extension. */
1438 tempcode = previous;
1440 /* If the next character is '+', we have a possessive quantifier. This
1441 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1442 If the next character is '?' this is a minimizing repeat, by default,
1443 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1444 repeat type to the non-default. */
1446 if (ptr + 1 < patternEnd && ptr[1] == '+')
1448 repeat_type = 0; /* Force greedy */
1449 possessive_quantifier = true;
1452 else if (ptr + 1 < patternEnd && ptr[1] == '?')
1457 else repeat_type = 0;
1459 /* If previous was a character match, abolish the item and generate a
1460 repeat item instead. If a char item has a minumum of more than one, ensure
1461 that it is set in reqbyte - it might not be if a sequence such as x{3} is
1462 the first thing in a branch because the x will have gone into firstbyte
1465 if (*previous == OP_CHAR || *previous == OP_CHARNC)
1467 /* Deal with UTF-8 characters that take up more than one byte. It's
1468 easier to write this out separately than try to macrify it. Use c to
1469 hold the length of the character in bytes, plus 0x80 to flag that it's a
1470 length rather than a small character. */
1472 if ((code[-1] & 0x80) != 0)
1474 uschar *lastchar = code - 1;
1475 while((*lastchar & 0xc0) == 0x80) lastchar--;
1476 c = code - lastchar; /* Length of UTF-8 character */
1477 memcpy(utf8_char, lastchar, c); /* Save the char */
1478 c |= 0x80; /* Flag c as a length */
1483 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
1486 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1489 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_NC)
1492 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
1493 goto OUTPUT_SINGLE_REPEAT;
1496 /* If previous was a single negated character ([^a] or similar), we use
1497 one of the special opcodes, replacing it. The code is shared with single-
1498 character repeats by setting opt_type to add a suitable offset into
1499 repeat_type. OP_NOT is currently used only for single-byte chars. */
1501 else if (*previous == OP_NOT)
1503 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1505 goto OUTPUT_SINGLE_REPEAT;
1508 /* If previous was a character type match (\d or similar), abolish it and
1509 create a suitable repeat item. The code is shared with single-character
1510 repeats by setting op_type to add a suitable offset into repeat_type. */
1512 else if (*previous <= OP_ANY)
1515 int prop_type, prop_value;
1516 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1519 OUTPUT_SINGLE_REPEAT:
1520 prop_type = prop_value = -1;
1523 code = previous; /* Usually overwrite previous item */
1525 /* If the maximum is zero then the minimum must also be zero; Perl allows
1526 this case, so we do too - by simply omitting the item altogether. */
1528 if (repeat_max == 0) goto END_REPEAT;
1530 /* Combine the op_type with the repeat_type */
1532 repeat_type += op_type;
1534 /* A minimum of zero is handled either as the special case * or ?, or as
1535 an UPTO, with the maximum given. */
1537 if (repeat_min == 0)
1539 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1540 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1543 *code++ = OP_UPTO + repeat_type;
1544 PUT2INC(code, 0, repeat_max);
1548 /* A repeat minimum of 1 is optimized into some special cases. If the
1549 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1550 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1551 one less than the maximum. */
1553 else if (repeat_min == 1)
1555 if (repeat_max == -1)
1556 *code++ = OP_PLUS + repeat_type;
1559 code = oldcode; /* leave previous item in place */
1560 if (repeat_max == 1) goto END_REPEAT;
1561 *code++ = OP_UPTO + repeat_type;
1562 PUT2INC(code, 0, repeat_max - 1);
1566 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1567 handled as an EXACT followed by an UPTO. */
1571 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1572 PUT2INC(code, 0, repeat_min);
1574 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1575 we have to insert the character for the previous code. For a repeated
1576 Unicode property match, there are two extra bytes that define the
1577 required property. In UTF-8 mode, long characters have their length in
1578 c, with the 0x80 bit as a flag. */
1584 memcpy(code, utf8_char, c & 7);
1592 *code++ = prop_type;
1593 *code++ = prop_value;
1596 *code++ = OP_STAR + repeat_type;
1599 /* Else insert an UPTO if the max is greater than the min, again
1600 preceded by the character, for the previously inserted code. */
1602 else if (repeat_max != repeat_min)
1606 memcpy(code, utf8_char, c & 7);
1613 *code++ = prop_type;
1614 *code++ = prop_value;
1616 repeat_max -= repeat_min;
1617 *code++ = OP_UPTO + repeat_type;
1618 PUT2INC(code, 0, repeat_max);
1622 /* The character or character type itself comes last in all cases. */
1626 memcpy(code, utf8_char, c & 7);
1632 /* For a repeated Unicode property match, there are two extra bytes that
1633 define the required property. */
1637 *code++ = prop_type;
1638 *code++ = prop_value;
1642 /* If previous was a character class or a back reference, we put the repeat
1643 stuff after it, but just skip the item if the repeat was {0,0}. */
1645 else if (*previous == OP_CLASS ||
1646 *previous == OP_NCLASS ||
1647 *previous == OP_XCLASS ||
1648 *previous == OP_REF)
1650 if (repeat_max == 0)
1656 if (repeat_min == 0 && repeat_max == -1)
1657 *code++ = OP_CRSTAR + repeat_type;
1658 else if (repeat_min == 1 && repeat_max == -1)
1659 *code++ = OP_CRPLUS + repeat_type;
1660 else if (repeat_min == 0 && repeat_max == 1)
1661 *code++ = OP_CRQUERY + repeat_type;
1664 *code++ = OP_CRRANGE + repeat_type;
1665 PUT2INC(code, 0, repeat_min);
1666 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1667 PUT2INC(code, 0, repeat_max);
1671 /* If previous was a bracket group, we may have to replicate it in certain
1674 else if (*previous >= OP_BRA || *previous == OP_ONCE)
1678 int len = code - previous;
1679 uschar *bralink = NULL;
1681 /* If the maximum repeat count is unlimited, find the end of the bracket
1682 by scanning through from the start, and compute the offset back to it
1683 from the current code pointer. There may be an OP_OPT setting following
1684 the final KET, so we can't find the end just by going back from the code
1687 if (repeat_max == -1)
1689 register uschar *ket = previous;
1690 do ket += GET(ket, 1); while (*ket != OP_KET);
1691 ketoffset = code - ket;
1694 /* The case of a zero minimum is special because of the need to stick
1695 OP_BRAZERO in front of it, and because the group appears once in the
1696 data, whereas in other cases it appears the minimum number of times. For
1697 this reason, it is simplest to treat this case separately, as otherwise
1698 the code gets far too messy. There are several special subcases when the
1701 if (repeat_min == 0)
1703 /* If the maximum is also zero, we just omit the group from the output
1706 if (repeat_max == 0)
1712 /* If the maximum is 1 or unlimited, we just have to stick in the
1713 BRAZERO and do no more at this point. However, we do need to adjust
1714 any OP_RECURSE calls inside the group that refer to the group itself or
1715 any internal group, because the offset is from the start of the whole
1716 regex. Temporarily terminate the pattern while doing this. */
1718 if (repeat_max <= 1)
1721 memmove(previous+1, previous, len);
1723 *previous++ = OP_BRAZERO + repeat_type;
1726 /* If the maximum is greater than 1 and limited, we have to replicate
1727 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1728 The first one has to be handled carefully because it's the original
1729 copy, which has to be moved up. The remainder can be handled by code
1730 that is common with the non-zero minimum case below. We have to
1731 adjust the value or repeat_max, since one less copy is required. Once
1732 again, we may have to adjust any OP_RECURSE calls inside the group. */
1738 memmove(previous + 2 + LINK_SIZE, previous, len);
1739 code += 2 + LINK_SIZE;
1740 *previous++ = OP_BRAZERO + repeat_type;
1741 *previous++ = OP_BRA;
1743 /* We chain together the bracket offset fields that have to be
1744 filled in later when the ends of the brackets are reached. */
1746 offset = (bralink == NULL)? 0 : previous - bralink;
1748 PUTINC(previous, 0, offset);
1754 /* If the minimum is greater than zero, replicate the group as many
1755 times as necessary, and adjust the maximum to the number of subsequent
1756 copies that we need. If we set a first char from the group, and didn't
1757 set a required char, copy the latter from the former. */
1763 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
1764 for (i = 1; i < repeat_min; i++)
1766 memcpy(code, previous, len);
1770 if (repeat_max > 0) repeat_max -= repeat_min;
1773 /* This code is common to both the zero and non-zero minimum cases. If
1774 the maximum is limited, it replicates the group in a nested fashion,
1775 remembering the bracket starts on a stack. In the case of a zero minimum,
1776 the first one was set up above. In all cases the repeat_max now specifies
1777 the number of additional copies needed. */
1779 if (repeat_max >= 0)
1781 for (i = repeat_max - 1; i >= 0; i--)
1783 *code++ = OP_BRAZERO + repeat_type;
1785 /* All but the final copy start a new nesting, maintaining the
1786 chain of brackets outstanding. */
1792 offset = (bralink == NULL)? 0 : code - bralink;
1794 PUTINC(code, 0, offset);
1797 memcpy(code, previous, len);
1801 /* Now chain through the pending brackets, and fill in their length
1802 fields (which are holding the chain links pro tem). */
1804 while (bralink != NULL)
1807 int offset = code - bralink + 1;
1808 uschar *bra = code - offset;
1809 oldlinkoffset = GET(bra, 1);
1810 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1812 PUTINC(code, 0, offset);
1813 PUT(bra, 1, offset);
1817 /* If the maximum is unlimited, set a repeater in the final copy. We
1818 can't just offset backwards from the current code point, because we
1819 don't know if there's been an options resetting after the ket. The
1820 correct offset was computed above. */
1822 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1825 /* Else there's some kind of shambles */
1829 *errorcodeptr = ERR11;
1833 /* If the character following a repeat is '+', we wrap the entire repeated
1834 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
1835 Sun's Java package. The repeated item starts at tempcode, not at previous,
1836 which might be the first part of a string whose (former) last char we
1837 repeated. However, we don't support '+' after a greediness '?'. */
1839 if (possessive_quantifier)
1841 int len = code - tempcode;
1842 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
1843 code += 1 + LINK_SIZE;
1844 len += 1 + LINK_SIZE;
1845 tempcode[0] = OP_ONCE;
1847 PUTINC(code, 0, len);
1848 PUT(tempcode, 1, len);
1851 /* In all case we no longer have a previous item. We also set the
1852 "follows varying string" flag for subsequently encountered reqbytes if
1853 it isn't already set and we have just passed a varying length item. */
1857 cd->req_varyopt |= reqvary;
1861 /* Start of nested bracket sub-expression, or comment or lookahead or
1862 lookbehind or option setting or condition. First deal with special things
1863 that can come after a bracket; all are introduced by ?, and the appearance
1864 of any of them means that this is not a referencing group. They were
1865 checked for validity in the first pass over the string, so we don't have to
1866 check for syntax errors here. */
1871 if (*(++ptr) == '?')
1875 case ':': /* Non-extracting bracket */
1880 case '=': /* Positive lookahead */
1881 bravalue = OP_ASSERT;
1885 case '!': /* Negative lookahead */
1886 bravalue = OP_ASSERT_NOT;
1890 /* Character after (? not specially recognized */
1892 default: /* Option setting */
1893 *errorcodeptr = ERR12;
1898 /* Else we have a referencing group; adjust the opcode. If the bracket
1899 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1900 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1904 if (++(*brackets) > EXTRACT_BASIC_MAX)
1906 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1907 code[1+LINK_SIZE] = OP_BRANUMBER;
1908 PUT2(code, 2+LINK_SIZE, *brackets);
1911 else bravalue = OP_BRA + *brackets;
1914 /* Process nested bracketed re. Assertions may not be repeated, but other
1915 kinds can be. We copy code into a non-register variable in order to be able
1916 to pass its address because some compilers complain otherwise. Pass in a
1917 new setting for the ims options if they have changed. */
1919 previous = (bravalue >= OP_ONCE)? code : NULL;
1922 tempreqvary = cd->req_varyopt; /* Save value before bracket */
1926 brackets, /* Extracting bracket count */
1927 &tempcode, /* Where to put code (updated) */
1928 &ptr, /* Input pointer (updated) */
1930 errorcodeptr, /* Where to put an error message */
1931 skipbytes, /* Skip over OP_BRANUMBER */
1932 &subfirstbyte, /* For possible first char */
1933 &subreqbyte, /* For possible last char */
1934 cd)) /* Tables block */
1937 /* At the end of compiling, code is still pointing to the start of the
1938 group, while tempcode has been updated to point past the end of the group
1939 and any option resetting that may follow it. The pattern pointer (ptr)
1940 is on the bracket. */
1942 /* Handle updating of the required and first characters. Update for normal
1943 brackets of all kinds, and conditions with two branches (see code above).
1944 If the bracket is followed by a quantifier with zero repeat, we have to
1945 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
1946 main loop so that they can be accessed for the back off. */
1948 zeroreqbyte = reqbyte;
1949 zerofirstbyte = firstbyte;
1950 groupsetfirstbyte = false;
1952 if (bravalue >= OP_BRA || bravalue == OP_ONCE)
1954 /* If we have not yet set a firstbyte in this branch, take it from the
1955 subpattern, remembering that it was set here so that a repeat of more
1956 than one can replicate it as reqbyte if necessary. If the subpattern has
1957 no firstbyte, set "none" for the whole branch. In both cases, a zero
1958 repeat forces firstbyte to "none". */
1960 if (firstbyte == REQ_UNSET)
1962 if (subfirstbyte >= 0)
1964 firstbyte = subfirstbyte;
1965 groupsetfirstbyte = true;
1967 else firstbyte = REQ_NONE;
1968 zerofirstbyte = REQ_NONE;
1971 /* If firstbyte was previously set, convert the subpattern's firstbyte
1972 into reqbyte if there wasn't one, using the vary flag that was in
1973 existence beforehand. */
1975 else if (subfirstbyte >= 0 && subreqbyte < 0)
1976 subreqbyte = subfirstbyte | tempreqvary;
1978 /* If the subpattern set a required byte (or set a first byte that isn't
1979 really the first byte - see above), set it. */
1981 if (subreqbyte >= 0) reqbyte = subreqbyte;
1984 /* For a forward assertion, we take the reqbyte, if set. This can be
1985 helpful if the pattern that follows the assertion doesn't set a different
1986 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
1987 for an assertion, however because it leads to incorrect effect for patterns
1988 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
1989 of a firstbyte. This is overcome by a scan at the end if there's no
1990 firstbyte, looking for an asserted first char. */
1992 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
1994 /* Now update the main code pointer to the end of the group. */
1998 /* Error if hit end of pattern */
2000 if (ptr >= patternEnd || *ptr != ')')
2002 *errorcodeptr = ERR14;
2007 /* Check \ for being a real metacharacter; if not, fall through and handle
2008 it as a data character at the start of a string. Escape items are checked
2009 for validity in the pre-compiling pass. */
2013 c = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, false);
2015 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
2016 are arranged to be the negation of the corresponding OP_values. For the
2017 back references, the values are ESC_REF plus the reference number. Only
2018 back references and those types that consume a character may be repeated.
2019 We can test for values between ESC_b and ESC_w for the latter; this may
2020 have to change if any new ones are ever created. */
2024 /* For metasequences that actually match a character, we disable the
2025 setting of a first character if it hasn't already been set. */
2027 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
2028 firstbyte = REQ_NONE;
2030 /* Set values to reset to if this is followed by a zero repeat. */
2032 zerofirstbyte = firstbyte;
2033 zeroreqbyte = reqbyte;
2035 /* Back references are handled specially */
2039 int number = -c - ESC_REF;
2042 PUT2INC(code, 0, number);
2045 /* For the rest, we can obtain the OP value by negating the escape
2050 previous = (-c > ESC_b && -c <= ESC_w)? code : NULL;
2058 /* Handle a literal character. It is guaranteed not to be whitespace or #
2059 when the extended flag is set. If we are in UTF-8 mode, it may be a
2060 multi-byte literal character. */
2072 if (options & PCRE_CASELESS && (c | 0x20) >= 'a' && (c | 0x20) <= 'z')
2074 *code++ = OP_ASCII_LETTER_NC;
2079 *code++ = OP_ASCII_CHAR;
2085 mclength = _pcre_ord2utf8(c, mcbuffer);
2087 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
2088 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
2091 /* Set the first and required bytes appropriately. If no previous first
2092 byte, set it from this character, but revert to none on a zero repeat.
2093 Otherwise, leave the firstbyte value alone, and don't change it on a zero
2096 if (firstbyte == REQ_UNSET)
2098 zerofirstbyte = REQ_NONE;
2099 zeroreqbyte = reqbyte;
2101 /* If the character is more than one byte long, we can set firstbyte
2102 only if it is not to be matched caselessly. */
2104 if (mclength == 1 || req_caseopt == 0)
2106 firstbyte = mcbuffer[0] | req_caseopt;
2107 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
2109 else firstbyte = reqbyte = REQ_NONE;
2112 /* firstbyte was previously set; we can set reqbyte only the length is
2113 1 or the matching is caseful. */
2117 zerofirstbyte = firstbyte;
2118 zeroreqbyte = reqbyte;
2119 if (mclength == 1 || req_caseopt == 0)
2120 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
2123 break; /* End of literal character handling */
2125 } /* end of big loop */
2127 /* Control never reaches here by falling through, only by a goto for all the
2128 error states. Pass back the position in the pattern so that it can be displayed
2129 to the user for diagnosing the error. */
2139 /*************************************************
2140 * Compile sequence of alternatives *
2141 *************************************************/
2143 /* On entry, ptr is pointing past the bracket character, but on return
2144 it points to the closing bracket, or vertical bar, or end of string.
2145 The code variable is pointing at the byte into which the BRA operator has been
2146 stored. If the ims options are changed at the start (for a (?ims: group) or
2147 during any branch, we need to insert an OP_OPT item at the start of every
2148 following branch to ensure they get set correctly at run time, and also pass
2149 the new options into every subsequent branch compile.
2152 options option bits, including any changes for this subpattern
2153 brackets -> int containing the number of extracting brackets used
2154 codeptr -> the address of the current code pointer
2155 ptrptr -> the address of the current pattern pointer
2156 errorcodeptr -> pointer to error code variable
2157 skipbytes skip this many bytes at start (for OP_BRANUMBER)
2158 firstbyteptr place to put the first required character, or a negative number
2159 reqbyteptr place to put the last required character, or a negative number
2160 cd points to the data block with tables pointers etc.
2162 Returns: true on success
2166 compile_regex(int options, int *brackets, uschar **codeptr,
2167 const pcre_uchar **ptrptr, const pcre_uchar *patternEnd, ErrorCode* errorcodeptr, int skipbytes,
2168 int *firstbyteptr, int *reqbyteptr, compile_data *cd)
2170 const pcre_uchar *ptr = *ptrptr;
2171 uschar *code = *codeptr;
2172 uschar *last_branch = code;
2173 uschar *start_bracket = code;
2174 int firstbyte, reqbyte;
2175 int branchfirstbyte, branchreqbyte;
2177 firstbyte = reqbyte = REQ_UNSET;
2179 /* Offset is set zero to mark that this bracket is still open */
2182 code += 1 + LINK_SIZE + skipbytes;
2184 /* Loop for each alternative branch */
2188 /* Now compile the branch */
2190 if (!compile_branch(options, brackets, &code, &ptr, patternEnd, errorcodeptr,
2191 &branchfirstbyte, &branchreqbyte, cd))
2197 /* If this is the first branch, the firstbyte and reqbyte values for the
2198 branch become the values for the regex. */
2200 if (*last_branch != OP_ALT)
2202 firstbyte = branchfirstbyte;
2203 reqbyte = branchreqbyte;
2206 /* If this is not the first branch, the first char and reqbyte have to
2207 match the values from all the previous branches, except that if the previous
2208 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
2209 REQ_VARY for the regex. */
2213 /* If we previously had a firstbyte, but it doesn't match the new branch,
2214 we have to abandon the firstbyte for the regex, but if there was previously
2215 no reqbyte, it takes on the value of the old firstbyte. */
2217 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
2219 if (reqbyte < 0) reqbyte = firstbyte;
2220 firstbyte = REQ_NONE;
2223 /* If we (now or from before) have no firstbyte, a firstbyte from the
2224 branch becomes a reqbyte if there isn't a branch reqbyte. */
2226 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
2227 branchreqbyte = branchfirstbyte;
2229 /* Now ensure that the reqbytes match */
2231 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
2233 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
2236 /* Reached end of expression, either ')' or end of pattern. Go back through
2237 the alternative branches and reverse the chain of offsets, with the field in
2238 the BRA item now becoming an offset to the first alternative. If there are
2239 no alternatives, it points to the end of the group. The length in the
2240 terminating ket is always the length of the whole bracketed item. If any of
2241 the ims options were changed inside the group, compile a resetting op-code
2242 following, except at the very end of the pattern. Return leaving the pointer
2243 at the terminating char. */
2245 if (ptr >= patternEnd || *ptr != '|')
2247 int length = code - last_branch;
2250 int prev_length = GET(last_branch, 1);
2251 PUT(last_branch, 1, length);
2252 length = prev_length;
2253 last_branch -= length;
2257 /* Fill in the ket */
2260 PUT(code, 1, code - start_bracket);
2261 code += 1 + LINK_SIZE;
2263 /* Set values to pass back */
2267 *firstbyteptr = firstbyte;
2268 *reqbyteptr = reqbyte;
2272 /* Another branch follows; insert an "or" node. Its length field points back
2273 to the previous branch while the bracket remains open. At the end the chain
2274 is reversed. It's done like this so that the start of the bracket has a
2275 zero offset until it is closed, making it possible to detect recursion. */
2278 PUT(code, 1, code - last_branch);
2280 code += 1 + LINK_SIZE;
2283 /* Control never reaches here */
2289 /*************************************************
2290 * Check for anchored expression *
2291 *************************************************/
2293 /* Try to find out if this is an anchored regular expression. Consider each
2294 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2295 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2296 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2297 counts, since OP_CIRC can match in the middle.
2299 We can also consider a regex to be anchored if OP_SOM starts all its branches.
2300 This is the code for \G, which means "match at start of match position, taking
2301 into account the match offset".
2303 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2304 because that will try the rest of the pattern at all possible matching points,
2305 so there is no point trying again.... er ....
2307 .... except when the .* appears inside capturing parentheses, and there is a
2308 subsequent back reference to those parentheses. We haven't enough information
2309 to catch that case precisely.
2311 At first, the best we could do was to detect when .* was in capturing brackets
2312 and the highest back reference was greater than or equal to that level.
2313 However, by keeping a bitmap of the first 31 back references, we can catch some
2314 of the more common cases more precisely.
2317 code points to start of expression (the bracket)
2318 options points to the options setting
2319 bracket_map a bitmap of which brackets we are inside while testing; this
2320 handles up to substring 31; after that we just have to take
2321 the less precise approach
2322 backref_map the back reference bitmap
2324 Returns: true or false
2328 is_anchored(register const uschar *code, int options, unsigned int bracket_map,
2329 unsigned int backref_map)
2332 const uschar *scode =
2333 first_significant_code(code + 1+LINK_SIZE, false);
2334 register int op = *scode;
2336 /* Capturing brackets */
2342 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
2343 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
2344 if (!is_anchored(scode, options, new_map, backref_map)) return false;
2347 /* Other brackets */
2349 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2351 if (!is_anchored(scode, options, bracket_map, backref_map)) return false;
2354 /* Check for explicit anchoring */
2356 else if (((options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2358 code += GET(code, 1);
2360 while (*code == OP_ALT); /* Loop for each alternative */
2366 /*************************************************
2367 * Check for starting with ^ or .* *
2368 *************************************************/
2370 /* This is called to find out if every branch starts with ^ or .* so that
2371 "first char" processing can be done to speed things up in multiline
2372 matching and for non-DOTALL patterns that start with .* (which must start at
2373 the beginning or after \n). As in the case of is_anchored() (see above), we
2374 have to take account of back references to capturing brackets that contain .*
2375 because in that case we can't make the assumption.
2378 code points to start of expression (the bracket)
2379 bracket_map a bitmap of which brackets we are inside while testing; this
2380 handles up to substring 31; after that we just have to take
2381 the less precise approach
2382 backref_map the back reference bitmap
2384 Returns: true or false
2388 is_startline(const uschar *code, unsigned int bracket_map,
2389 unsigned int backref_map)
2392 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, false);
2393 register int op = *scode;
2395 /* Capturing brackets */
2401 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
2402 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
2403 if (!is_startline(scode, new_map, backref_map)) return false;
2406 /* Other brackets */
2408 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE)
2409 { if (!is_startline(scode, bracket_map, backref_map)) return false; }
2411 /* .* means "start at start or after \n" if it isn't in brackets that
2412 may be referenced. */
2414 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2416 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false;
2419 /* Check for explicit circumflex */
2421 else if (op != OP_CIRC) return false;
2423 /* Move on to the next alternative */
2425 code += GET(code, 1);
2427 while (*code == OP_ALT); /* Loop for each alternative */
2433 /*************************************************
2434 * Check for asserted fixed first char *
2435 *************************************************/
2437 /* During compilation, the "first char" settings from forward assertions are
2438 discarded, because they can cause conflicts with actual literals that follow.
2439 However, if we end up without a first char setting for an unanchored pattern,
2440 it is worth scanning the regex to see if there is an initial asserted first
2441 char. If all branches start with the same asserted char, or with a bracket all
2442 of whose alternatives start with the same asserted char (recurse ad lib), then
2443 we return that char, otherwise -1.
2446 code points to start of expression (the bracket)
2447 options pointer to the options (used to check casing changes)
2448 inassert true if in an assertion
2450 Returns: -1 or the fixed first char
2454 find_firstassertedchar(const uschar *code, int options, BOOL inassert)
2456 register int c = -1;
2459 const uschar *scode =
2460 first_significant_code(code + 1+LINK_SIZE, true);
2461 register int op = *scode;
2463 if (op >= OP_BRA) op = OP_BRA;
2473 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
2475 if (c < 0) c = d; else if (c != d) return -1;
2478 case OP_EXACT: /* Fall through */
2484 case OP_ASCII_LETTER_NC:
2487 if (!inassert) return -1;
2491 if ((options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
2493 else if (c != scode[1]) return -1;
2497 code += GET(code, 1);
2499 while (*code == OP_ALT);
2505 /*************************************************
2506 * Compile a Regular Expression *
2507 *************************************************/
2509 /* This function takes a string and returns a pointer to a block of store
2510 holding a compiled version of the expression. The original API for this
2511 function had no error code return variable; it is retained for backwards
2512 compatibility. The new function is given a new name.
2515 pattern the regular expression
2516 options various option bits
2517 errorcodeptr pointer to error code variable (pcre_compile2() only)
2518 can be NULL if you don't want a code value
2519 errorptr pointer to pointer to error text
2520 erroroffset ptr offset in pattern where error was detected
2521 tables pointer to character tables or NULL
2523 Returns: pointer to compiled data block, or NULL on error,
2524 with errorptr and erroroffset set
2528 jsRegExpCompile(const pcre_char* pattern, int patternLength,
2529 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2530 unsigned* numSubpatterns, const char** errorptr)
2533 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2534 int c, firstbyte, reqbyte;
2536 int branch_extra = 0;
2537 int branch_newextra;
2538 int item_count = -1;
2540 int max_name_size = 0;
2541 int lastitemlength = 0;
2542 ErrorCode errorcode = ERR0;
2545 unsigned int brastackptr = 0;
2548 const uschar *codestart;
2549 const pcre_uchar *ptr;
2550 const pcre_uchar *patternEnd;
2551 compile_data compile_block;
2552 int brastack[BRASTACK_SIZE];
2553 uschar bralenstack[BRASTACK_SIZE];
2555 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2556 can do is just return NULL, but we can set a code value if there is a code
2559 if (errorptr == NULL)
2566 /* Set up pointers to the individual character tables */
2568 compile_block.lcc = _pcre_default_tables + lcc_offset;
2569 compile_block.fcc = _pcre_default_tables + fcc_offset;
2570 compile_block.cbits = _pcre_default_tables + cbits_offset;
2571 compile_block.ctypes = _pcre_default_tables + ctypes_offset;
2573 /* Maximum back reference and backref bitmap. This is updated for numeric
2574 references during the first pass, but for named references during the actual
2575 compile pass. The bitmap records up to 31 back references to help in deciding
2576 whether (.*) can be treated as anchored or not. */
2578 compile_block.top_backref = 0;
2579 compile_block.backref_map = 0;
2581 /* Reflect pattern for debugging output */
2583 DPRINTF(("------------------------------------------------------------------\n"));
2585 /* The first thing to do is to make a pass over the pattern to compute the
2586 amount of store required to hold the compiled code. This does not have to be
2587 perfect as long as errors are overestimates. At the same time we can detect any
2588 flag settings right at the start, and extract them. Make an attempt to correct
2589 for any counted white space if an "extended" flag setting appears late in the
2590 pattern. We can't be so clever for #-comments. */
2592 ptr = (const pcre_uchar *)(pattern - 1);
2593 patternEnd = (const pcre_uchar *)(pattern + patternLength);
2595 while (++ptr < patternEnd)
2597 int min = 0, max = 0;
2604 item_count++; /* Is zero for the first non-comment item */
2608 /* A backslashed item may be an escaped data character or it may be a
2612 c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
2613 if (errorcode != 0) goto PCRE_ERROR_RETURN;
2615 lastitemlength = 1; /* Default length of last item for repeats */
2617 if (c >= 0) /* Data character */
2619 length += 2; /* For a one-byte character */
2624 for (i = 0; i < _pcre_utf8_table1_size; i++)
2625 if (c <= _pcre_utf8_table1[i]) break;
2627 lastitemlength += i;
2633 /* Other escapes need one byte */
2637 /* A back reference needs an additional 2 bytes, plus either one or 5
2638 bytes for a repeat. We also need to keep the value of the highest
2643 int refnum = -c - ESC_REF;
2644 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
2645 if (refnum > compile_block.top_backref)
2646 compile_block.top_backref = refnum;
2647 length += 2; /* For single back reference */
2648 if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
2650 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
2651 if (errorcode != 0) goto PCRE_ERROR_RETURN;
2652 if ((min == 0 && (max == 1 || max == -1)) ||
2653 (min == 1 && max == -1))
2656 if (ptr[1] == '?') ptr++;
2662 case '^': /* Single-byte metacharacters */
2669 case '*': /* These repeats won't be after brackets; */
2670 case '+': /* those are handled separately */
2673 goto POSESSIVE; /* A few lines below */
2675 /* This covers the cases of braced repeats after a single char, metachar,
2676 class, or back reference. */
2679 if (!is_counted_repeat(ptr+1, patternEnd)) goto NORMAL_CHAR;
2680 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
2681 if (errorcode != 0) goto PCRE_ERROR_RETURN;
2683 /* These special cases just insert one extra opcode */
2685 if ((min == 0 && (max == 1 || max == -1)) ||
2686 (min == 1 && max == -1))
2689 /* These cases might insert additional copies of a preceding character. */
2695 length -= lastitemlength; /* Uncount the original char or metachar */
2696 if (min > 0) length += 3 + lastitemlength;
2698 length += lastitemlength + ((max > 0)? 3 : 1);
2701 if (ptr[1] == '?') ptr++; /* Needs no extra length */
2703 POSESSIVE: /* Test for possessive quantifier */
2707 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
2711 /* An alternation contains an offset to the next branch or ket. If any ims
2712 options changed in the previous branch(es), and/or if we are in a
2713 lookbehind assertion, extra space will be needed at the start of the
2714 branch. This is handled by branch_extra. */
2717 length += 1 + LINK_SIZE + branch_extra;
2720 /* A character class uses 33 characters provided that all the character
2721 values are less than 256. Otherwise, it uses a bit map for low valued
2722 characters, and individual items for others. Don't worry about character
2723 types that aren't allowed in classes - they'll get picked up during the
2724 compile. A character class that contains only one single-byte character
2725 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2726 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2729 if (*(++ptr) == '^')
2731 class_optcount = 10; /* Greater than one */
2734 else class_optcount = 0;
2738 /* Written as a "do" so that an initial ']' is taken as data */
2742 /* Outside \Q...\E, check for escapes */
2746 c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
2747 if (errorcode != 0) goto PCRE_ERROR_RETURN;
2749 /* \b is backspace inside a class; \X is literal */
2751 if (-c == ESC_b) c = '\b';
2753 /* Handle escapes that turn into characters */
2755 if (c >= 0) goto NON_SPECIAL_CHARACTER;
2757 /* Escapes that are meta-things. The normal ones just affect the
2758 bit map, but Unicode properties require an XCLASS extended item. */
2762 class_optcount = 10; /* \d, \s etc; make sure > 1 */
2766 /* Anything else increments the possible optimization count. We have to
2767 detect ranges here so that we can compute the number of extra ranges for
2768 caseless wide characters when UCP support is available. If there are wide
2769 characters, we are going to have to use an XCLASS, even for single
2778 GETCHARLENEND(c, ptr, patternEnd, extra);
2782 /* Come here from handling \ above when it escapes to a char value */
2784 NON_SPECIAL_CHARACTER:
2788 if (ptr + 1 < patternEnd && ptr[1] == '-')
2790 pcre_uchar const *hyptr = ptr++;
2791 if (ptr + 1 < patternEnd && ptr[1] == '\\')
2794 d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
2795 if (errorcode != 0) goto PCRE_ERROR_RETURN;
2796 if (-d == ESC_b) d = '\b'; /* backspace */
2798 else if (ptr + 1 < patternEnd && ptr[1] != ']')
2803 GETCHARLENEND(d, ptr, patternEnd, extra);
2807 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
2810 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2811 127 for caseless matching, we will need to use an XCLASS. */
2815 class_optcount = 10; /* Ensure > 1 */
2819 goto PCRE_ERROR_RETURN;
2822 if ((d > 255 || (ignoreCase && d > 127)))
2825 if (!class_utf8) /* Allow for XCLASS overhead */
2828 length += LINK_SIZE + 2;
2831 /* If we have UCP support, find out how many extra ranges are
2832 needed to map the other case of characters within this range. We
2833 have to mimic the range optimization here, because extending the
2834 range upwards might push d over a boundary that makes is use
2835 another byte in the UTF-8 representation. */
2842 while (get_othercase_range(&cc, origd, &occ, &ocd))
2844 if (occ >= c && ocd <= d) continue; /* Skip embedded */
2846 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2847 { /* if there is overlap, */
2848 c = occ; /* noting that if occ < c */
2849 continue; /* we can't have ocd > d */
2850 } /* because a subrange is */
2851 if (ocd > d && occ <= d + 1) /* always shorter than */
2852 { /* the basic range. */
2857 /* An extra item is needed */
2859 length += 1 + _pcre_ord2utf8(occ, buffer) +
2860 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
2864 /* The length of the (possibly extended) range */
2866 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
2871 /* We have a single character. There is nothing to be done unless we
2872 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2873 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2878 if ((c > 255 || (ignoreCase && c > 127)))
2881 class_optcount = 10; /* Ensure > 1 */
2882 if (!class_utf8) /* Allow for XCLASS overhead */
2885 length += LINK_SIZE + 2;
2887 length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
2892 while (++ptr < patternEnd && *ptr != ']'); /* Concludes "do" above */
2894 if (ptr >= patternEnd) /* Missing terminating ']' */
2897 goto PCRE_ERROR_RETURN;
2900 /* We can optimize when there was only one optimizable character. Repeats
2901 for positive and negated single one-byte chars are handled by the general
2902 code. Here, we handle repeats for the class opcodes. */
2904 if (class_optcount == 1) length += 3; else
2908 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2909 we also need extra for wrapping the whole thing in a sub-pattern. */
2911 if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd))
2913 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
2914 if (errorcode != 0) goto PCRE_ERROR_RETURN;
2915 if ((min == 0 && (max == 1 || max == -1)) ||
2916 (min == 1 && max == -1))
2919 if (ptr + 1 < patternEnd && ptr[1] == '+')
2922 length += 2 + 2*LINK_SIZE;
2924 else if (ptr + 1 < patternEnd && ptr[1] == '?') ptr++;
2929 /* Brackets may be genuine groups or special things */
2932 branch_newextra = 0;
2933 bracket_length = 1 + LINK_SIZE;
2936 /* Handle special forms of bracket, which all start (? */
2938 if (ptr + 1 < patternEnd && ptr[1] == '?')
2940 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0))
2942 /* Non-referencing groups and lookaheads just move the pointer on, and
2943 then behave like a non-special bracket, except that they don't increment
2944 the count of extracting brackets. Ditto for the "once only" bracket,
2945 which is in Perl from version 5.005. */
2953 /* Else loop checking valid options until ) is met. Anything else is an
2954 error. If we are without any brackets, i.e. at top level, the settings
2955 act as if specified in the options, so massage the options immediately.
2956 This is for backward compatibility with Perl 5.004. */
2960 goto PCRE_ERROR_RETURN;
2966 /* Capturing brackets must be counted so we can process escapes in a
2967 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2968 an additional 3 bytes of memory per capturing bracket. */
2973 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2976 /* Save length for computing whole length at end if there's a repeat that
2977 requires duplication of the group. Also save the current value of
2978 branch_extra, and start the new group with the new value. If non-zero, this
2979 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2981 if (brastackptr >= sizeof(brastack)/sizeof(int))
2984 goto PCRE_ERROR_RETURN;
2987 bralenstack[brastackptr] = branch_extra;
2988 branch_extra = branch_newextra;
2990 brastack[brastackptr++] = length;
2991 length += bracket_length;
2994 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2995 have to replicate this bracket up to that many times. If brastackptr is
2996 0 this is an unmatched bracket which will generate an error, but take care
2997 not to try to access brastack[-1] when computing the length and restoring
2998 the branch_extra value. */
3001 length += 1 + LINK_SIZE;
3002 if (brastackptr > 0)
3004 duplength = length - brastack[--brastackptr];
3005 branch_extra = bralenstack[brastackptr];
3009 /* Leave ptr at the final char; for read_repeat_counts this happens
3010 automatically; for the others we need an increment. */
3012 if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd))
3014 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
3015 if (errorcode != 0) goto PCRE_ERROR_RETURN;
3017 else if (c == '*') { min = 0; max = -1; ptr++; }
3018 else if (c == '+') { min = 1; max = -1; ptr++; }
3019 else if (c == '?') { min = 0; max = 1; ptr++; }
3020 else { min = 1; max = 1; }
3022 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
3023 group, and if the maximum is greater than zero, we have to replicate
3024 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
3030 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
3033 /* When the minimum is greater than zero, we have to replicate up to
3034 minval-1 times, with no additions required in the copies. Then, if there
3035 is a limited maximum we have to replicate up to maxval-1 times allowing
3036 for a BRAZERO item before each optional copy and nesting brackets for all
3037 but one of the optional copies. */
3041 length += (min - 1) * duplength;
3042 if (max > min) /* Need this test as max=-1 means no limit */
3043 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
3044 - (2 + 2*LINK_SIZE);
3047 /* Allow space for once brackets for "possessive quantifier" */
3049 if (ptr + 1 < patternEnd && ptr[1] == '+')
3052 length += 2 + 2*LINK_SIZE;
3056 /* Non-special character. It won't be space or # in extended mode, so it is
3057 always a genuine character. If we are in a \Q...\E sequence, check for the
3058 end; if not, we have a literal. */
3063 length += 2; /* For a one-byte character */
3064 lastitemlength = 1; /* Default length of last item for repeats */
3066 /* In UTF-8 mode, check for additional bytes. */
3070 if (IS_LEADING_SURROGATE(c))
3072 c = DECODE_SURROGATE_PAIR(c, ptr < patternEnd ? *ptr : 0);
3078 for (i = 0; i < _pcre_utf8_table1_size; i++)
3079 if (c <= _pcre_utf8_table1[i]) break;
3081 lastitemlength += i;
3089 length += 2 + LINK_SIZE; /* For final KET and END */
3091 if (length > MAX_PATTERN_SIZE)
3094 goto PCRE_ERROR_RETURN;
3097 /* Compute the size of data block needed and get it. */
3099 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
3100 re = reinterpret_cast<real_pcre*>(new char[size]);
3105 goto PCRE_ERROR_RETURN;
3108 /* Put in the magic number, and save the sizes, options, and character table
3109 pointer. NULL is used for the default character tables. The nullpad field is at
3110 the end; it's there to help in the case when a regex compiled on a system with
3111 4-byte pointers is run on another with 8-byte pointers. */
3113 re->size = (pcre_uint32)size;
3114 re->options = (ignoreCase ? PCRE_CASELESS : 0) | (multiline ? PCRE_MULTILINE : 0);
3116 /* The starting points of the name/number translation table and of the code are
3117 passed around in the compile data block. */
3119 codestart = (const uschar *)(re + 1);
3120 compile_block.start_code = codestart;
3121 compile_block.start_pattern = (const pcre_uchar *)pattern;
3122 compile_block.req_varyopt = 0;
3124 /* Set up a starting, non-extracting bracket, then compile the expression. On
3125 error, errorcode will be set non-zero, so we don't need to look at the result
3126 of the function here. */
3128 ptr = (const pcre_uchar *)pattern;
3129 code = (uschar *)codestart;
3132 (void)compile_regex(re->options, &bracount, &code, &ptr,
3134 &errorcode, 0, &firstbyte, &reqbyte, &compile_block);
3135 re->top_bracket = bracount;
3136 re->top_backref = compile_block.top_backref;
3138 /* If not reached end of pattern on success, there's an excess bracket. */
3140 if (errorcode == 0 && ptr < patternEnd) errorcode = ERR10;
3142 /* Fill in the terminating state and check for disastrous overflow, but
3143 if debugging, leave the test till after things are printed out. */
3148 if (code - codestart > length) errorcode = ERR7;
3151 /* Give an error if there's back reference to a non-existent capturing
3154 if (re->top_backref > re->top_bracket) errorcode = ERR15;
3156 /* Failed to compile, or error while post-processing */
3158 if (errorcode != ERR0)
3160 delete [] reinterpret_cast<char*>(re);
3162 *errorptr = error_text(errorcode);
3166 /* If the anchored option was not passed, set the flag if we can determine that
3167 the pattern is anchored by virtue of ^ characters or \A or anything else (such
3168 as starting with .* when DOTALL is set).
3170 Otherwise, if we know what the first character has to be, save it, because that
3171 speeds up unanchored matches no end. If not, see if we can set the
3172 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3173 start with ^. and also when all branches start with .* for non-DOTALL matches.
3177 if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
3178 re->options |= PCRE_ANCHORED;
3182 firstbyte = find_firstassertedchar(codestart, re->options, false);
3183 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
3185 int ch = firstbyte & 255;
3187 { /* Strange indentation to aid in merging. */
3188 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
3189 compile_block.fcc[ch] == ch)? ch : firstbyte;
3190 re->options |= PCRE_FIRSTSET;
3193 else if (is_startline(codestart, 0, compile_block.backref_map))
3194 re->options |= PCRE_STARTLINE;
3198 /* For an anchored pattern, we use the "required byte" only if it follows a
3199 variable length item in the regex. Remove the caseless flag for non-caseable
3203 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
3205 int ch = reqbyte & 255;
3207 { /* Strange indentation to aid in merging. */
3208 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
3209 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
3210 re->options |= PCRE_REQCHSET;
3214 /* Print out the compiled data if debugging is enabled. This is never the
3215 case when building a production library. */
3219 printf("Length = %d top_bracket = %d top_backref = %d\n",
3220 length, re->top_bracket, re->top_backref);
3222 if (re->options != 0)
3225 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3226 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3227 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "");
3230 if ((re->options & PCRE_FIRSTSET) != 0)
3232 int ch = re->first_byte & 255;
3233 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
3235 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
3236 else printf("First char = \\x%02x%s\n", ch, caseless);
3239 if ((re->options & PCRE_REQCHSET) != 0)
3241 int ch = re->req_byte & 255;
3242 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
3244 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
3245 else printf("Req char = \\x%02x%s\n", ch, caseless);
3248 pcre_printint(re, stdout);
3250 /* This check is done here in the debugging case so that the code that
3251 was compiled can be seen. */
3253 if (code - codestart > length)
3256 *errorptr = error_text(ERR7);
3263 *numSubpatterns = re->top_bracket;
3267 void jsRegExpFree(JSRegExp* re)
3269 delete [] reinterpret_cast<char*>(re);