1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains the external function jsRegExpExecute(), along with
42 supporting internal functions that are not used by other modules. */
46 #include "pcre_internal.h"
48 #include <wtf/ASCIICType.h>
49 #include <wtf/FastMalloc.h>
53 /*************************************************
54 * Code parameters and static tables *
55 *************************************************/
57 /* Maximum number of items on the nested bracket stacks at compile time. This
58 applies to the nesting of all kinds of parentheses. It does not limit
59 un-nested, non-capturing parentheses. This number can be made bigger if
60 necessary - it is used to dimension one int and one unsigned char vector at
63 #define BRASTACK_SIZE 200
65 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
66 are simple data values; negative values are for special things like \d and so
67 on. Zero means further processing is needed (for things like \x), or the escape
70 static const short escapes[] = {
71 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
72 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
73 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
74 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
75 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
76 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
77 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
78 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
79 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
83 /* Error code numbers. They are given names so that they can more easily be
87 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
88 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
91 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
92 the definition is next to the definition of the opcodes in pcre_internal.h. */
94 static const uschar OP_lengths[] = { OP_LENGTHS };
96 /* The texts of compile-time error messages. These are "char *" because they
97 are passed to the outside world. */
99 static const char* error_text(ErrorCode code)
101 static const char error_texts[] =
103 "\\ at end of pattern\0"
104 "\\c at end of pattern\0"
105 "character value in \\x{...} sequence is too large\0"
106 "numbers out of order in {} quantifier\0"
108 "number too big in {} quantifier\0"
109 "missing terminating ] for character class\0"
110 "internal error: code overflow\0"
111 "range out of order in character class\0"
112 "nothing to repeat\0"
114 "unmatched parentheses\0"
115 "internal error: unexpected repeat\0"
116 "unrecognized character after (?\0"
117 "failed to get memory\0"
120 "reference to non-existent subpattern\0"
121 "regular expression too large\0"
122 "parentheses nested too deeply"
126 const char* text = error_texts;
132 /* Definition to allow mutual recursion */
134 static bool compile_regex(int, int*, uschar**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
136 /*************************************************
138 *************************************************/
140 /* This function is called when a \ has been encountered. It either returns a
141 positive value for a simple escape such as \n, or a negative value which
142 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
143 a positive value greater than 255 may be returned. On entry, ptr is pointing at
144 the \. On exit, it is on the final character of the escape sequence.
147 ptrptr points to the pattern position pointer
148 errorcodeptr points to the errorcode variable
149 bracount number of previous extracting brackets
150 options the options bits
151 isclass true if inside a character class
153 Returns: zero or positive => a data character
154 negative => a special escape sequence
155 on error, errorptr is set
158 static int check_escape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)
160 const UChar* ptr = *ptrptr + 1;
162 /* If backslash is at the end of the pattern, it's an error. */
163 if (ptr == patternEnd) {
164 *errorcodeptr = ERR1;
171 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
172 a table. A non-zero result is something that can be returned immediately.
173 Otherwise further processing may be required. */
175 if (c < '0' || c > 'z') { /* Not alphameric */
176 } else if (int escapeValue = escapes[c - '0'])
179 /* Escapes that need further processing, or are illegal. */
192 /* Escape sequences starting with a non-zero digit are backreferences,
193 unless there are insufficient brackets, in which case they are octal
194 escape sequences. Those sequences end on the first non-octal character
195 or when we overflow 0-255, whichever comes first. */
198 const UChar* oldptr = ptr;
200 while (ptr + 1 < patternEnd && isASCIIDigit(ptr[1]) && c <= bracount)
201 c = c * 10 + *(++ptr) - '0';
206 ptr = oldptr; /* Put the pointer back and fall through */
209 /* Handle an octal number following \. If the first digit is 8 or 9,
210 this is not octal. */
212 if ((c = *ptr) >= '8')
215 /* \0 always starts an octal number, but we may drop through to here with a
216 larger first octal digit. */
221 for (i = 1; i <= 2; ++i) {
222 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
224 int cc = c * 8 + ptr[i] - '0';
235 for (i = 1; i <= 2; ++i) {
236 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
243 cc -= 32; /* Convert to upper case */
244 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
252 for (i = 1; i <= 4; ++i) {
253 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
260 cc -= 32; /* Convert to upper case */
261 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
266 /* Other special escapes not starting with a digit are straightforward */
269 if (++ptr == patternEnd) {
270 *errorcodeptr = ERR2;
275 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
276 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
278 if (c >= 'a' && c <= 'z')
291 /*************************************************
292 * Check for counted repeat *
293 *************************************************/
295 /* This function is called when a '{' is encountered in a place where it might
296 start a quantifier. It looks ahead to see if it really is a quantifier or not.
297 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
298 where the ddds are digits.
301 p pointer to the first char after '{'
303 Returns: true or false
306 static bool is_counted_repeat(const UChar* p, const UChar* patternEnd)
308 if (p >= patternEnd || !isASCIIDigit(*p))
311 while (p < patternEnd && isASCIIDigit(*p))
313 if (p < patternEnd && *p == '}')
316 if (p >= patternEnd || *p++ != ',')
318 if (p < patternEnd && *p == '}')
321 if (p >= patternEnd || !isASCIIDigit(*p))
324 while (p < patternEnd && isASCIIDigit(*p))
327 return (p < patternEnd && *p == '}');
331 /*************************************************
332 * Read repeat counts *
333 *************************************************/
335 /* Read an item of the form {n,m} and return the values. This is called only
336 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
337 so the syntax is guaranteed to be correct, but we need to check the values.
340 p pointer to first char after '{'
341 minp pointer to int for min
342 maxp pointer to int for max
343 returned as -1 if no max
344 errorcodeptr points to error code variable
346 Returns: pointer to '}' on success;
347 current ptr on error, with errorcodeptr set non-zero
350 static const UChar* read_repeat_counts(const UChar* p, int* minp, int* maxp, ErrorCode* errorcodeptr)
355 /* Read the minimum value and do a paranoid check: a negative value indicates
356 an integer overflow. */
358 while (isASCIIDigit(*p))
359 min = min * 10 + *p++ - '0';
360 if (min < 0 || min > 65535) {
361 *errorcodeptr = ERR5;
365 /* Read the maximum value if there is one, and again do a paranoid on its size.
366 Also, max must not be less than min. */
373 while (isASCIIDigit(*p))
374 max = max * 10 + *p++ - '0';
375 if (max < 0 || max > 65535) {
376 *errorcodeptr = ERR5;
380 *errorcodeptr = ERR4;
386 /* Fill in the required variables, and pass back the pointer to the terminating
395 /*************************************************
396 * Find first significant op code *
397 *************************************************/
399 /* This is called by several functions that scan a compiled expression looking
400 for a fixed first character, or an anchoring op code etc. It skips over things
401 that do not influence this. For some calls, a change of option is important.
402 For some calls, it makes sense to skip negative forward and all backward
403 assertions, and also the \b assertion; for others it does not.
406 code pointer to the start of the group
407 skipassert true if certain assertions are to be skipped
409 Returns: pointer to the first significant opcode
412 static const uschar* firstSignificantOpCode(const uschar* code)
414 while (*code == OP_BRANUMBER)
415 code += OP_lengths[*code];
419 static const uschar* firstSignificantOpCodeSkippingAssertions(const uschar* code)
425 code += getOpcodeValueAtOffset(code, 1);
426 } while (*code == OP_ALT);
427 code += OP_lengths[*code];
429 case OP_WORD_BOUNDARY:
430 case OP_NOT_WORD_BOUNDARY:
432 code += OP_lengths[*code];
438 ASSERT_NOT_REACHED();
442 /*************************************************
443 * Find the fixed length of a pattern *
444 *************************************************/
446 /* Scan a pattern and compute the fixed length of subject that will match it,
447 if the length is fixed. This is needed for dealing with backward assertions.
448 In UTF8 mode, the result is in characters rather than bytes.
451 code points to the start of the pattern (the bracket)
452 options the compiling options
454 Returns: the fixed length, or -1 if there is no fixed length,
455 or -2 if \C was encountered
458 static int find_fixedlength(uschar* code, int options)
462 int branchlength = 0;
463 uschar* cc = code + 1 + LINK_SIZE;
465 /* Scan along the opcodes for this branch. If we get to the end of the
466 branch, check the length against that of the other branches. */
477 d = find_fixedlength(cc, options);
482 cc += getOpcodeValueAtOffset(cc, 1);
483 } while (*cc == OP_ALT);
487 /* Reached end of a branch; if it's a ket it is the end of a nested
488 call. If it's ALT it is an alternation in a nested call. If it is
489 END it's the end of the outer call. All can be handled by the same code. */
497 length = branchlength;
498 else if (length != branchlength)
506 /* Skip over assertive subpatterns */
511 cc += getOpcodeValueAtOffset(cc, 1);
512 } while (*cc == OP_ALT);
515 /* Skip over things that don't match chars */
520 case OP_NOT_WORD_BOUNDARY:
521 case OP_WORD_BOUNDARY:
522 cc += OP_lengths[*cc];
525 /* Handle literal characters */
528 case OP_CHAR_IGNORING_CASE:
532 while ((*cc & 0xc0) == 0x80)
537 case OP_ASCII_LETTER_IGNORING_CASE:
542 /* Handle exact repetitions. The count is already in characters, but we
543 need to skip over a multibyte character in UTF8 mode. */
546 branchlength += get2ByteOpcodeValueAtOffset(cc,1);
548 while((*cc & 0x80) == 0x80)
553 branchlength += get2ByteOpcodeValueAtOffset(cc,1);
557 /* Handle single-char matchers */
561 case OP_NOT_WHITESPACE:
563 case OP_NOT_WORDCHAR:
570 /* Check a class for variable quantification */
573 cc += getOpcodeValueAtOffset(cc, 1) - 33;
589 if (get2ByteOpcodeValueAtOffset(cc, 1) != get2ByteOpcodeValueAtOffset(cc, 3))
591 branchlength += get2ByteOpcodeValueAtOffset(cc, 1);
600 /* Anything else is variable length */
606 ASSERT_NOT_REACHED();
610 /*************************************************
611 * Complete a callout item *
612 *************************************************/
614 /* A callout item contains the length of the next item in the pattern, which
615 we can't fill in till after we have reached the relevant point. This is used
616 for both automatic and manual callouts.
619 previous_callout points to previous callout item
620 ptr current pattern pointer
621 cd pointers to tables etc
624 static void complete_callout(uschar* previous_callout, const UChar* ptr, const CompileData& cd)
626 int length = ptr - cd.start_pattern - getOpcodeValueAtOffset(previous_callout, 2);
627 putOpcodeValueAtOffset(previous_callout, 2 + LINK_SIZE, length);
632 /*************************************************
633 * Get othercase range *
634 *************************************************/
636 /* This function is passed the start and end of a class range, in UTF-8 mode
637 with UCP support. It searches up the characters, looking for internal ranges of
638 characters in the "other" case. Each call returns the next one, updating the
642 cptr points to starting character value; updated
644 ocptr where to put start of othercase range
645 odptr where to put end of othercase range
647 Yield: true when range returned; false when no more
650 static bool get_othercase_range(int* cptr, int d, int* ocptr, int* odptr)
652 int c, othercase = 0;
654 for (c = *cptr; c <= d; c++) {
655 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
663 int next = othercase + 1;
665 for (++c; c <= d; c++) {
666 if (_pcre_ucp_othercase(c) != next)
677 /*************************************************
678 * Convert character value to UTF-8 *
679 *************************************************/
681 /* This function takes an integer value in the range 0 - 0x7fffffff
682 and encodes it as a UTF-8 character in 0 to 6 bytes.
685 cvalue the character value
686 buffer pointer to buffer for result - at least 6 bytes long
688 Returns: number of characters placed in the buffer
691 // FIXME: This should be removed as soon as all UTF8 uses are removed from PCRE
692 int _pcre_ord2utf8(int cvalue, uschar *buffer)
695 for (i = 0; i < _pcre_utf8_table1_size; i++)
696 if (cvalue <= _pcre_utf8_table1[i])
699 for (int j = i; j > 0; j--) {
700 *buffer-- = 0x80 | (cvalue & 0x3f);
703 *buffer = _pcre_utf8_table2[i] | cvalue;
707 /*************************************************
708 * Compile one branch *
709 *************************************************/
711 /* Scan the pattern, compiling it into the code vector. If the options are
712 changed during the branch, the pointer is used to change the external options
716 options the option bits
717 brackets points to number of extracting brackets used
718 codeptr points to the pointer to the current code point
719 ptrptr points to the current pattern pointer
720 errorcodeptr points to error code variable
721 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
722 reqbyteptr set to the last literal character required, else < 0
723 cd contains pointers to tables etc.
725 Returns: true on success
726 false, with *errorcodeptr set non-zero on error
730 compile_branch(int options, int* brackets, uschar** codeptr,
731 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
732 int* reqbyteptr, CompileData& cd)
734 int repeat_type, op_type;
735 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
737 int reqvary, tempreqvary;
738 int after_manual_callout = 0;
740 uschar* code = *codeptr;
742 bool groupsetfirstbyte = false;
743 const UChar* ptr = *ptrptr;
744 const UChar* tempptr;
745 uschar* previous = NULL;
746 uschar* previous_callout = NULL;
747 uschar classbits[32];
750 uschar* class_utf8data;
753 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
754 matching encountered yet". It gets changed to REQ_NONE if we hit something that
755 matches a non-fixed char first char; reqbyte just remains unset if we never
758 When we hit a repeat whose minimum is zero, we may have to adjust these values
759 to take the zero repeat into account. This is implemented by setting them to
760 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
761 item types that can be repeated set these backoff variables appropriately. */
763 int firstbyte = REQ_UNSET;
764 int reqbyte = REQ_UNSET;
765 int zeroreqbyte = REQ_UNSET;
766 int zerofirstbyte = REQ_UNSET;
768 /* The variable req_caseopt contains either the REQ_IGNORE_CASE value or zero,
769 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
770 value > 255. It is added into the firstbyte or reqbyte variables to record the
771 case status of the value. This is used only for ASCII characters. */
773 int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
775 /* Switch on next character until the end of the branch */
780 bool should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
789 /* Next byte in the pattern */
791 c = ptr < patternEnd ? *ptr : 0;
793 /* Fill in length of a previous callout, except when the next thing is
796 bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && is_counted_repeat(ptr + 1, patternEnd));
798 if (!is_quantifier && previous_callout && after_manual_callout-- <= 0) {
799 complete_callout(previous_callout, ptr, cd);
800 previous_callout = NULL;
804 /* The branch terminates at end of string, |, or ). */
807 if (ptr < patternEnd)
809 // End of string; fall through
812 *firstbyteptr = firstbyte;
813 *reqbyteptr = reqbyte;
818 /* Handle single-character metacharacters. In multiline mode, ^ disables
819 the setting of any following char as a first character. */
822 if (options & MatchAcrossMultipleLinesOption) {
823 if (firstbyte == REQ_UNSET)
824 firstbyte = REQ_NONE;
835 /* There can never be a first char if '.' is first, whatever happens about
836 repeats. The value of reqbyte doesn't change either. */
839 if (firstbyte == REQ_UNSET)
840 firstbyte = REQ_NONE;
841 zerofirstbyte = firstbyte;
842 zeroreqbyte = reqbyte;
844 *code++ = OP_ANY_CHAR;
847 /* Character classes. If the included characters are all < 256, we build a
848 32-byte bitmap of the permitted characters, except in the special case
849 where there is only one such character. For negated classes, we build the
850 map as usual, then invert it at the end. However, we use a different opcode
851 so that data characters > 255 can be handled correctly.
853 If the class contains characters outside the 0-255 range, a different
854 opcode is compiled. It may optionally have a bit map for characters < 256,
855 but those above are are explicitly listed afterwards. A flag byte tells
856 whether the bitmap is present, and whether this is a negated class or not.
862 should_flip_negation = false;
864 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
865 they are encountered at the top level, so we'll do that too. */
867 /* If the first character is '^', set the negation flag and skip it. */
873 negate_class = false;
875 /* Keep a count of chars with values < 256 so that we can optimize the case
876 of just a single character (as long as it's < 256). For higher valued UTF-8
877 characters, we don't yet do any optimization. */
882 class_utf8 = false; /* No chars >= 256 */
883 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
885 /* Initialize the 32-char bit map to all zeros. We have to build the
886 map in a temporary bit of store, in case the class contains only 1
887 character (< 256), because in that case the compiled code doesn't use the
890 memset(classbits, 0, 32 * sizeof(uschar));
892 /* Process characters until ] is reached. The first pass
893 through the regex checked the overall syntax, so we don't need to be very
894 strict here. At the start of the loop, c contains the first byte of the
896 while ((c = *(++ptr)) != ']') {
898 c = getCharAndAdvanceIfSurrogate(ptr);
900 /* Backslash may introduce a single character, or it may introduce one
901 of the specials, which just set a flag. Escaped items are checked for
902 validity in the pre-compiling pass. The sequence \b is a special case.
903 Inside a class (and only there) it is treated as backspace. Elsewhere
904 it marks a word boundary. Other escapes have preset maps ready to
905 or into the one we are building. We assume they have more than one
906 character in them, so set class_charcount bigger than one. */
909 c = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, true);
912 c = '\b'; /* \b is backslash in a class */
915 class_charcount += 2; /* Greater than 1 is what matters */
918 for (c = 0; c < 32; c++)
919 classbits[c] |= classBitmapForChar(c + cbit_digit);
923 should_flip_negation = true;
924 for (c = 0; c < 32; c++)
925 classbits[c] |= ~classBitmapForChar(c + cbit_digit);
929 for (c = 0; c < 32; c++)
930 classbits[c] |= classBitmapForChar(c + cbit_word);
934 should_flip_negation = true;
935 for (c = 0; c < 32; c++)
936 classbits[c] |= ~classBitmapForChar(c + cbit_word);
940 for (c = 0; c < 32; c++)
941 classbits[c] |= classBitmapForChar(c + cbit_space);
945 should_flip_negation = true;
946 for (c = 0; c < 32; c++)
947 classbits[c] |= ~classBitmapForChar(c + cbit_space);
950 /* Unrecognized escapes are faulted if PCRE is running in its
951 strict mode. By default, for compatibility with Perl, they are
952 treated as literals. */
955 c = *ptr; /* The final character */
956 class_charcount -= 2; /* Undo the default count from above */
960 /* Fall through if we have a single character (c >= 0). This may be
961 > 256 in UTF-8 mode. */
963 } /* End of backslash handling */
965 /* A single character may be followed by '-' to form a range. However,
966 Perl does not permit ']' to be the end of the range. A '-' character
967 here is treated as a literal. */
969 if (ptr[1] == '-' && ptr[2] != ']') {
972 int d = getCharAndAdvanceIfSurrogate(ptr);
974 /* The second part of a range can be a single-character escape, but
975 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
976 in such circumstances. */
979 const UChar* oldptr = ptr;
980 d = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, true);
982 /* \b is backslash; \X is literal X; any other special means the '-'
990 goto LONE_SINGLE_CHARACTER; /* A few lines below */
995 /* The check that the two values are in the correct order happens in
996 the pre-pass. Optimize one-character ranges */
999 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1001 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
1002 matching, we have to use an XCLASS with extra data items. Caseless
1003 matching for characters > 127 is available only if UCP support is
1006 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
1009 /* With UCP support, we can find the other case equivalents of
1010 the relevant characters. There may be several ranges. Optimize how
1011 they fit with the basic range. */
1013 if (options & IgnoreCaseOption) {
1017 while (get_othercase_range(&cc, origd, &occ, &ocd)) {
1018 if (occ >= c && ocd <= d)
1019 continue; /* Skip embedded ranges */
1021 if (occ < c && ocd >= c - 1) /* Extend the basic range */
1022 { /* if there is overlap, */
1023 c = occ; /* noting that if occ < c */
1024 continue; /* we can't have ocd > d */
1025 } /* because a subrange is */
1026 if (ocd > d && occ <= d + 1) /* always shorter than */
1027 { /* the basic range. */
1033 *class_utf8data++ = XCL_SINGLE;
1035 *class_utf8data++ = XCL_RANGE;
1036 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
1038 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
1042 /* Now record the original range, possibly modified for UCP caseless
1043 overlapping ranges. */
1045 *class_utf8data++ = XCL_RANGE;
1046 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
1047 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
1049 /* With UCP support, we are done. Without UCP support, there is no
1050 caseless matching for UTF-8 characters > 127; we can use the bit map
1051 for the smaller ones. */
1053 continue; /* With next character in the class */
1056 /* We use the bit map for all cases when not in UTF-8 mode; else
1057 ranges that lie entirely within 0-127 when there is UCP support; else
1058 for partial ranges without UCP support. */
1060 for (; c <= d; c++) {
1061 classbits[c/8] |= (1 << (c&7));
1062 if (options & IgnoreCaseOption) {
1063 int uc = flipCase(c);
1064 classbits[uc/8] |= (1 << (uc&7));
1066 class_charcount++; /* in case a one-char range */
1070 continue; /* Go get the next char in the class */
1073 /* Handle a lone single character - we can get here for a normal
1074 non-escape char, or after \ that introduces a single character or for an
1075 apparent range that isn't. */
1077 LONE_SINGLE_CHARACTER:
1079 /* Handle a character that cannot go in the bit map */
1081 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
1083 *class_utf8data++ = XCL_SINGLE;
1084 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
1086 if (options & IgnoreCaseOption) {
1088 if ((othercase = _pcre_ucp_othercase(c)) >= 0) {
1089 *class_utf8data++ = XCL_SINGLE;
1090 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
1096 /* Handle a single-byte character */
1098 classbits[c/8] |= (1 << (c&7));
1099 if (options & IgnoreCaseOption) {
1101 classbits[c/8] |= (1 << (c&7));
1108 /* If class_charcount is 1, we saw precisely one character whose value is
1109 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
1110 can optimize the negative case only if there were no characters >= 128
1111 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
1112 single-bytes only. This is an historical hangover. Maybe one day we can
1113 tidy these opcodes to handle multi-byte characters.
1115 The optimization throws away the bit map. We turn the item into a
1116 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
1117 that OP_NOT does not support multibyte characters. In the positive case, it
1118 can cause firstbyte to be set. Otherwise, there can be no first char if
1119 this item is first, whatever repeat count may follow. In the case of
1120 reqbyte, save the previous value for reinstating. */
1122 if (class_charcount == 1 && (!class_utf8 && (!negate_class || class_lastchar < 128))) {
1123 zeroreqbyte = reqbyte;
1125 /* The OP_NOT opcode works on one-byte characters only. */
1128 if (firstbyte == REQ_UNSET)
1129 firstbyte = REQ_NONE;
1130 zerofirstbyte = firstbyte;
1132 *code++ = class_lastchar;
1136 /* For a single, positive character, get the value into c, and
1137 then we can handle this with the normal one-character code. */
1141 } /* End of 1-char optimization */
1143 /* The general case - not the one-char optimization. If this is the first
1144 thing in the branch, there can be no first char setting, whatever the
1145 repeat count. Any reqbyte setting must remain unchanged after any kind of
1148 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1149 zerofirstbyte = firstbyte;
1150 zeroreqbyte = reqbyte;
1152 /* If there are characters with values > 255, we have to compile an
1153 extended class, with its own opcode. If there are no characters < 256,
1154 we can omit the bitmap. */
1156 if (class_utf8 && !should_flip_negation) {
1157 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
1158 *code++ = OP_XCLASS;
1160 *code = negate_class? XCL_NOT : 0;
1162 /* If the map is required, install it, and move on to the end of
1165 if (class_charcount > 0) {
1167 memcpy(code, classbits, 32);
1168 code = class_utf8data;
1171 /* If the map is not required, slide down the extra data. */
1174 int len = class_utf8data - (code + 33);
1175 memmove(code + 1, code + 33, len);
1179 /* Now fill in the complete length of the item */
1181 putOpcodeValueAtOffset(previous, 1, code - previous);
1182 break; /* End of class handling */
1185 /* If there are no characters > 255, negate the 32-byte map if necessary,
1186 and copy it into the code vector. If this is the first thing in the branch,
1187 there can be no first char setting, whatever the repeat count. Any reqbyte
1188 setting must remain unchanged after any kind of repeat. */
1190 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1192 for (c = 0; c < 32; c++)
1193 code[c] = ~classbits[c];
1195 memcpy(code, classbits, 32);
1199 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1200 has been tested above. */
1205 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
1226 *errorcodeptr = ERR9;
1230 if (repeat_min == 0) {
1231 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
1232 reqbyte = zeroreqbyte; /* Ditto */
1235 /* Remember whether this is a variable length repeat */
1237 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
1239 op_type = 0; /* Default single-char op codes */
1241 /* Save start of previous item, in case we have to move it up to make space
1242 for an inserted OP_ONCE for the additional '+' extension. */
1244 tempcode = previous;
1246 /* If the next character is '+', we have a possessive quantifier. This
1247 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1248 If the next character is '?' this is a minimizing repeat, by default,
1249 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1250 repeat type to the non-default. */
1252 if (ptr + 1 < patternEnd && ptr[1] == '?') {
1258 /* If previous was a character match, abolish the item and generate a
1259 repeat item instead. If a char item has a minumum of more than one, ensure
1260 that it is set in reqbyte - it might not be if a sequence such as x{3} is
1261 the first thing in a branch because the x will have gone into firstbyte
1264 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1265 /* Deal with UTF-8 characters that take up more than one byte. It's
1266 easier to write this out separately than try to macrify it. Use c to
1267 hold the length of the character in bytes, plus 0x80 to flag that it's a
1268 length rather than a small character. */
1270 if (code[-1] & 0x80) {
1271 uschar *lastchar = code - 1;
1272 while((*lastchar & 0xc0) == 0x80)
1274 c = code - lastchar; /* Length of UTF-8 character */
1275 memcpy(utf8_char, lastchar, c); /* Save the char */
1276 c |= 0x80; /* Flag c as a length */
1281 reqbyte = c | req_caseopt | cd.req_varyopt;
1284 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1287 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1290 reqbyte = c | req_caseopt | cd.req_varyopt;
1291 goto OUTPUT_SINGLE_REPEAT;
1294 /* If previous was a single negated character ([^a] or similar), we use
1295 one of the special opcodes, replacing it. The code is shared with single-
1296 character repeats by setting opt_type to add a suitable offset into
1297 repeat_type. OP_NOT is currently used only for single-byte chars. */
1299 else if (*previous == OP_NOT) {
1300 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1302 goto OUTPUT_SINGLE_REPEAT;
1305 /* If previous was a character type match (\d or similar), abolish it and
1306 create a suitable repeat item. The code is shared with single-character
1307 repeats by setting op_type to add a suitable offset into repeat_type. */
1309 else if (*previous <= OP_ANY_CHAR) {
1310 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1313 OUTPUT_SINGLE_REPEAT:
1315 int prop_value = -1;
1317 uschar* oldcode = code;
1318 code = previous; /* Usually overwrite previous item */
1320 /* If the maximum is zero then the minimum must also be zero; Perl allows
1321 this case, so we do too - by simply omitting the item altogether. */
1323 if (repeat_max == 0)
1326 /* Combine the op_type with the repeat_type */
1328 repeat_type += op_type;
1330 /* A minimum of zero is handled either as the special case * or ?, or as
1331 an UPTO, with the maximum given. */
1333 if (repeat_min == 0) {
1334 if (repeat_max == -1)
1335 *code++ = OP_STAR + repeat_type;
1336 else if (repeat_max == 1)
1337 *code++ = OP_QUERY + repeat_type;
1339 *code++ = OP_UPTO + repeat_type;
1340 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, repeat_max);
1344 /* A repeat minimum of 1 is optimized into some special cases. If the
1345 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1346 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1347 one less than the maximum. */
1349 else if (repeat_min == 1) {
1350 if (repeat_max == -1)
1351 *code++ = OP_PLUS + repeat_type;
1353 code = oldcode; /* leave previous item in place */
1354 if (repeat_max == 1)
1356 *code++ = OP_UPTO + repeat_type;
1357 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, repeat_max - 1);
1361 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1362 handled as an EXACT followed by an UPTO. */
1365 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1366 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, repeat_min);
1368 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1369 we have to insert the character for the previous code. For a repeated
1370 Unicode property match, there are two extra bytes that define the
1371 required property. In UTF-8 mode, long characters have their length in
1372 c, with the 0x80 bit as a flag. */
1374 if (repeat_max < 0) {
1376 memcpy(code, utf8_char, c & 7);
1380 if (prop_type >= 0) {
1381 *code++ = prop_type;
1382 *code++ = prop_value;
1385 *code++ = OP_STAR + repeat_type;
1388 /* Else insert an UPTO if the max is greater than the min, again
1389 preceded by the character, for the previously inserted code. */
1391 else if (repeat_max != repeat_min) {
1393 memcpy(code, utf8_char, c & 7);
1397 if (prop_type >= 0) {
1398 *code++ = prop_type;
1399 *code++ = prop_value;
1401 repeat_max -= repeat_min;
1402 *code++ = OP_UPTO + repeat_type;
1403 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, repeat_max);
1407 /* The character or character type itself comes last in all cases. */
1410 memcpy(code, utf8_char, c & 7);
1415 /* For a repeated Unicode property match, there are two extra bytes that
1416 define the required property. */
1418 if (prop_type >= 0) {
1419 *code++ = prop_type;
1420 *code++ = prop_value;
1424 /* If previous was a character class or a back reference, we put the repeat
1425 stuff after it, but just skip the item if the repeat was {0,0}. */
1427 else if (*previous == OP_CLASS ||
1428 *previous == OP_NCLASS ||
1429 *previous == OP_XCLASS ||
1430 *previous == OP_REF)
1432 if (repeat_max == 0) {
1437 if (repeat_min == 0 && repeat_max == -1)
1438 *code++ = OP_CRSTAR + repeat_type;
1439 else if (repeat_min == 1 && repeat_max == -1)
1440 *code++ = OP_CRPLUS + repeat_type;
1441 else if (repeat_min == 0 && repeat_max == 1)
1442 *code++ = OP_CRQUERY + repeat_type;
1444 *code++ = OP_CRRANGE + repeat_type;
1445 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, repeat_min);
1446 if (repeat_max == -1)
1447 repeat_max = 0; /* 2-byte encoding for max */
1448 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, repeat_max);
1452 /* If previous was a bracket group, we may have to replicate it in certain
1455 else if (*previous >= OP_BRA || *previous == OP_ONCE) {
1457 int len = code - previous;
1458 uschar* bralink = NULL;
1460 /* If the maximum repeat count is unlimited, find the end of the bracket
1461 by scanning through from the start, and compute the offset back to it
1462 from the current code pointer. There may be an OP_OPT setting following
1463 the final KET, so we can't find the end just by going back from the code
1466 if (repeat_max == -1) {
1467 uschar* ket = previous;
1469 ket += getOpcodeValueAtOffset(ket, 1);
1470 } while (*ket != OP_KET);
1471 ketoffset = code - ket;
1474 /* The case of a zero minimum is special because of the need to stick
1475 OP_BRAZERO in front of it, and because the group appears once in the
1476 data, whereas in other cases it appears the minimum number of times. For
1477 this reason, it is simplest to treat this case separately, as otherwise
1478 the code gets far too messy. There are several special subcases when the
1481 if (repeat_min == 0) {
1482 /* If the maximum is also zero, we just omit the group from the output
1485 if (repeat_max == 0) {
1490 /* If the maximum is 1 or unlimited, we just have to stick in the
1491 BRAZERO and do no more at this point. However, we do need to adjust
1492 any OP_RECURSE calls inside the group that refer to the group itself or
1493 any internal group, because the offset is from the start of the whole
1494 regex. Temporarily terminate the pattern while doing this. */
1496 if (repeat_max <= 1) {
1498 memmove(previous+1, previous, len);
1500 *previous++ = OP_BRAZERO + repeat_type;
1503 /* If the maximum is greater than 1 and limited, we have to replicate
1504 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1505 The first one has to be handled carefully because it's the original
1506 copy, which has to be moved up. The remainder can be handled by code
1507 that is common with the non-zero minimum case below. We have to
1508 adjust the value or repeat_max, since one less copy is required. Once
1509 again, we may have to adjust any OP_RECURSE calls inside the group. */
1513 memmove(previous + 2 + LINK_SIZE, previous, len);
1514 code += 2 + LINK_SIZE;
1515 *previous++ = OP_BRAZERO + repeat_type;
1516 *previous++ = OP_BRA;
1518 /* We chain together the bracket offset fields that have to be
1519 filled in later when the ends of the brackets are reached. */
1521 int offset = (!bralink) ? 0 : previous - bralink;
1523 putOpcodeValueAtOffsetAndAdvance(previous, 0, offset);
1529 /* If the minimum is greater than zero, replicate the group as many
1530 times as necessary, and adjust the maximum to the number of subsequent
1531 copies that we need. If we set a first char from the group, and didn't
1532 set a required char, copy the latter from the former. */
1535 if (repeat_min > 1) {
1536 if (groupsetfirstbyte && reqbyte < 0)
1537 reqbyte = firstbyte;
1538 for (int i = 1; i < repeat_min; i++) {
1539 memcpy(code, previous, len);
1544 repeat_max -= repeat_min;
1547 /* This code is common to both the zero and non-zero minimum cases. If
1548 the maximum is limited, it replicates the group in a nested fashion,
1549 remembering the bracket starts on a stack. In the case of a zero minimum,
1550 the first one was set up above. In all cases the repeat_max now specifies
1551 the number of additional copies needed. */
1553 if (repeat_max >= 0) {
1554 for (int i = repeat_max - 1; i >= 0; i--) {
1555 *code++ = OP_BRAZERO + repeat_type;
1557 /* All but the final copy start a new nesting, maintaining the
1558 chain of brackets outstanding. */
1562 int offset = (!bralink) ? 0 : code - bralink;
1564 putOpcodeValueAtOffsetAndAdvance(code, 0, offset);
1567 memcpy(code, previous, len);
1571 /* Now chain through the pending brackets, and fill in their length
1572 fields (which are holding the chain links pro tem). */
1575 int offset = code - bralink + 1;
1576 uschar* bra = code - offset;
1577 int oldlinkoffset = getOpcodeValueAtOffset(bra, 1);
1578 bralink = oldlinkoffset ? 0 : bralink - oldlinkoffset;
1580 putOpcodeValueAtOffsetAndAdvance(code, 0, offset);
1581 putOpcodeValueAtOffset(bra, 1, offset);
1585 /* If the maximum is unlimited, set a repeater in the final copy. We
1586 can't just offset backwards from the current code point, because we
1587 don't know if there's been an options resetting after the ket. The
1588 correct offset was computed above. */
1591 code[-ketoffset] = OP_KETRMAX + repeat_type;
1594 /* Else there's some kind of shambles */
1597 *errorcodeptr = ERR11;
1601 /* In all case we no longer have a previous item. We also set the
1602 "follows varying string" flag for subsequently encountered reqbytes if
1603 it isn't already set and we have just passed a varying length item. */
1607 cd.req_varyopt |= reqvary;
1611 /* Start of nested bracket sub-expression, or comment or lookahead or
1612 lookbehind or option setting or condition. First deal with special things
1613 that can come after a bracket; all are introduced by ?, and the appearance
1614 of any of them means that this is not a referencing group. They were
1615 checked for validity in the first pass over the string, so we don't have to
1616 check for syntax errors here. */
1621 if (*(++ptr) == '?') {
1623 case ':': /* Non-extracting bracket */
1628 case '=': /* Positive lookahead */
1629 bravalue = OP_ASSERT;
1633 case '!': /* Negative lookahead */
1634 bravalue = OP_ASSERT_NOT;
1638 /* Character after (? not specially recognized */
1640 default: /* Option setting */
1641 *errorcodeptr = ERR12;
1646 /* Else we have a referencing group; adjust the opcode. If the bracket
1647 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1648 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1651 if (++(*brackets) > EXTRACT_BASIC_MAX) {
1652 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1653 code[1 + LINK_SIZE] = OP_BRANUMBER;
1654 put2ByteOpcodeValueAtOffset(code, 2+LINK_SIZE, *brackets);
1658 bravalue = OP_BRA + *brackets;
1661 /* Process nested bracketed re. Assertions may not be repeated, but other
1662 kinds can be. We copy code into a non-variable in order to be able
1663 to pass its address because some compilers complain otherwise. Pass in a
1664 new setting for the ims options if they have changed. */
1666 previous = (bravalue >= OP_ONCE) ? code : 0;
1669 tempreqvary = cd.req_varyopt; /* Save value before bracket */
1673 brackets, /* Extracting bracket count */
1674 &tempcode, /* Where to put code (updated) */
1675 &ptr, /* Input pointer (updated) */
1677 errorcodeptr, /* Where to put an error message */
1678 skipbytes, /* Skip over OP_BRANUMBER */
1679 &subfirstbyte, /* For possible first char */
1680 &subreqbyte, /* For possible last char */
1681 cd)) /* Tables block */
1684 /* At the end of compiling, code is still pointing to the start of the
1685 group, while tempcode has been updated to point past the end of the group
1686 and any option resetting that may follow it. The pattern pointer (ptr)
1687 is on the bracket. */
1689 /* Handle updating of the required and first characters. Update for normal
1690 brackets of all kinds, and conditions with two branches (see code above).
1691 If the bracket is followed by a quantifier with zero repeat, we have to
1692 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
1693 main loop so that they can be accessed for the back off. */
1695 zeroreqbyte = reqbyte;
1696 zerofirstbyte = firstbyte;
1697 groupsetfirstbyte = false;
1699 if (bravalue >= OP_BRA || bravalue == OP_ONCE) {
1700 /* If we have not yet set a firstbyte in this branch, take it from the
1701 subpattern, remembering that it was set here so that a repeat of more
1702 than one can replicate it as reqbyte if necessary. If the subpattern has
1703 no firstbyte, set "none" for the whole branch. In both cases, a zero
1704 repeat forces firstbyte to "none". */
1706 if (firstbyte == REQ_UNSET) {
1707 if (subfirstbyte >= 0) {
1708 firstbyte = subfirstbyte;
1709 groupsetfirstbyte = true;
1712 firstbyte = REQ_NONE;
1713 zerofirstbyte = REQ_NONE;
1716 /* If firstbyte was previously set, convert the subpattern's firstbyte
1717 into reqbyte if there wasn't one, using the vary flag that was in
1718 existence beforehand. */
1720 else if (subfirstbyte >= 0 && subreqbyte < 0)
1721 subreqbyte = subfirstbyte | tempreqvary;
1723 /* If the subpattern set a required byte (or set a first byte that isn't
1724 really the first byte - see above), set it. */
1726 if (subreqbyte >= 0)
1727 reqbyte = subreqbyte;
1730 /* For a forward assertion, we take the reqbyte, if set. This can be
1731 helpful if the pattern that follows the assertion doesn't set a different
1732 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
1733 for an assertion, however because it leads to incorrect effect for patterns
1734 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
1735 of a firstbyte. This is overcome by a scan at the end if there's no
1736 firstbyte, looking for an asserted first char. */
1738 else if (bravalue == OP_ASSERT && subreqbyte >= 0)
1739 reqbyte = subreqbyte;
1741 /* Now update the main code pointer to the end of the group. */
1745 /* Error if hit end of pattern */
1747 if (ptr >= patternEnd || *ptr != ')') {
1748 *errorcodeptr = ERR14;
1753 /* Check \ for being a real metacharacter; if not, fall through and handle
1754 it as a data character at the start of a string. Escape items are checked
1755 for validity in the pre-compiling pass. */
1759 c = check_escape(&ptr, patternEnd, errorcodeptr, *brackets, false);
1761 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1762 are arranged to be the negation of the corresponding OP_values. For the
1763 back references, the values are ESC_REF plus the reference number. Only
1764 back references and those types that consume a character may be repeated.
1765 We can test for values between ESC_b and ESC_w for the latter; this may
1766 have to change if any new ones are ever created. */
1769 /* For metasequences that actually match a character, we disable the
1770 setting of a first character if it hasn't already been set. */
1772 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1773 firstbyte = REQ_NONE;
1775 /* Set values to reset to if this is followed by a zero repeat. */
1777 zerofirstbyte = firstbyte;
1778 zeroreqbyte = reqbyte;
1780 /* Back references are handled specially */
1782 if (-c >= ESC_REF) {
1783 int number = -c - ESC_REF;
1786 put2ByteOpcodeValueAtOffsetAndAdvance(code, 0, number);
1789 /* For the rest, we can obtain the OP value by negating the escape
1793 previous = (-c > ESC_b && -c <= ESC_w)? code : NULL;
1801 /* Handle a literal character. It is guaranteed not to be whitespace or #
1802 when the extended flag is set. If we are in UTF-8 mode, it may be a
1803 multi-byte literal character. */
1814 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1815 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1818 *code++ = OP_ASCII_CHAR;
1822 mclength = _pcre_ord2utf8(c, mcbuffer);
1824 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1825 for (c = 0; c < mclength; c++)
1826 *code++ = mcbuffer[c];
1829 /* Set the first and required bytes appropriately. If no previous first
1830 byte, set it from this character, but revert to none on a zero repeat.
1831 Otherwise, leave the firstbyte value alone, and don't change it on a zero
1834 if (firstbyte == REQ_UNSET) {
1835 zerofirstbyte = REQ_NONE;
1836 zeroreqbyte = reqbyte;
1838 /* If the character is more than one byte long, we can set firstbyte
1839 only if it is not to be matched caselessly. */
1841 if (mclength == 1 || req_caseopt == 0) {
1842 firstbyte = mcbuffer[0] | req_caseopt;
1844 reqbyte = code[-1] | cd.req_varyopt;
1847 firstbyte = reqbyte = REQ_NONE;
1850 /* firstbyte was previously set; we can set reqbyte only the length is
1851 1 or the matching is caseful. */
1854 zerofirstbyte = firstbyte;
1855 zeroreqbyte = reqbyte;
1856 if (mclength == 1 || req_caseopt == 0)
1857 reqbyte = code[-1] | req_caseopt | cd.req_varyopt;
1860 break; /* End of literal character handling */
1862 } /* end of big loop */
1864 /* Control never reaches here by falling through, only by a goto for all the
1865 error states. Pass back the position in the pattern so that it can be displayed
1866 to the user for diagnosing the error. */
1876 /*************************************************
1877 * Compile sequence of alternatives *
1878 *************************************************/
1880 /* On entry, ptr is pointing past the bracket character, but on return
1881 it points to the closing bracket, or vertical bar, or end of string.
1882 The code variable is pointing at the byte into which the BRA operator has been
1883 stored. If the ims options are changed at the start (for a (?ims: group) or
1884 during any branch, we need to insert an OP_OPT item at the start of every
1885 following branch to ensure they get set correctly at run time, and also pass
1886 the new options into every subsequent branch compile.
1889 options option bits, including any changes for this subpattern
1890 brackets -> int containing the number of extracting brackets used
1891 codeptr -> the address of the current code pointer
1892 ptrptr -> the address of the current pattern pointer
1893 errorcodeptr -> pointer to error code variable
1894 skipbytes skip this many bytes at start (for OP_BRANUMBER)
1895 firstbyteptr place to put the first required character, or a negative number
1896 reqbyteptr place to put the last required character, or a negative number
1897 cd points to the data block with tables pointers etc.
1899 Returns: true on success
1903 compile_regex(int options, int* brackets, uschar** codeptr,
1904 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,
1905 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1907 const UChar* ptr = *ptrptr;
1908 uschar* code = *codeptr;
1909 uschar* last_branch = code;
1910 uschar* start_bracket = code;
1911 int firstbyte, reqbyte;
1912 int branchfirstbyte, branchreqbyte;
1914 firstbyte = reqbyte = REQ_UNSET;
1916 /* Offset is set zero to mark that this bracket is still open */
1918 putOpcodeValueAtOffset(code, 1, 0);
1919 code += 1 + LINK_SIZE + skipbytes;
1921 /* Loop for each alternative branch */
1924 /* Now compile the branch */
1926 if (!compile_branch(options, brackets, &code, &ptr, patternEnd, errorcodeptr,
1927 &branchfirstbyte, &branchreqbyte, cd)) {
1932 /* If this is the first branch, the firstbyte and reqbyte values for the
1933 branch become the values for the regex. */
1935 if (*last_branch != OP_ALT) {
1936 firstbyte = branchfirstbyte;
1937 reqbyte = branchreqbyte;
1940 /* If this is not the first branch, the first char and reqbyte have to
1941 match the values from all the previous branches, except that if the previous
1942 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
1943 REQ_VARY for the regex. */
1946 /* If we previously had a firstbyte, but it doesn't match the new branch,
1947 we have to abandon the firstbyte for the regex, but if there was previously
1948 no reqbyte, it takes on the value of the old firstbyte. */
1950 if (firstbyte >= 0 && firstbyte != branchfirstbyte) {
1952 reqbyte = firstbyte;
1953 firstbyte = REQ_NONE;
1956 /* If we (now or from before) have no firstbyte, a firstbyte from the
1957 branch becomes a reqbyte if there isn't a branch reqbyte. */
1959 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
1960 branchreqbyte = branchfirstbyte;
1962 /* Now ensure that the reqbytes match */
1964 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
1967 reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
1970 /* Reached end of expression, either ')' or end of pattern. Go back through
1971 the alternative branches and reverse the chain of offsets, with the field in
1972 the BRA item now becoming an offset to the first alternative. If there are
1973 no alternatives, it points to the end of the group. The length in the
1974 terminating ket is always the length of the whole bracketed item. If any of
1975 the ims options were changed inside the group, compile a resetting op-code
1976 following, except at the very end of the pattern. Return leaving the pointer
1977 at the terminating char. */
1979 if (ptr >= patternEnd || *ptr != '|') {
1980 int length = code - last_branch;
1982 int prev_length = getOpcodeValueAtOffset(last_branch, 1);
1983 putOpcodeValueAtOffset(last_branch, 1, length);
1984 length = prev_length;
1985 last_branch -= length;
1986 } while (length > 0);
1988 /* Fill in the ket */
1991 putOpcodeValueAtOffset(code, 1, code - start_bracket);
1992 code += 1 + LINK_SIZE;
1994 /* Set values to pass back */
1998 *firstbyteptr = firstbyte;
1999 *reqbyteptr = reqbyte;
2003 /* Another branch follows; insert an "or" node. Its length field points back
2004 to the previous branch while the bracket remains open. At the end the chain
2005 is reversed. It's done like this so that the start of the bracket has a
2006 zero offset until it is closed, making it possible to detect recursion. */
2009 putOpcodeValueAtOffset(code, 1, code - last_branch);
2011 code += 1 + LINK_SIZE;
2014 ASSERT_NOT_REACHED();
2018 /*************************************************
2019 * Check for anchored expression *
2020 *************************************************/
2022 /* Try to find out if this is an anchored regular expression. Consider each
2023 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2024 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2025 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2026 counts, since OP_CIRC can match in the middle.
2028 We can also consider a regex to be anchored if OP_SOM starts all its branches.
2029 This is the code for \G, which means "match at start of match position, taking
2030 into account the match offset".
2032 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2033 because that will try the rest of the pattern at all possible matching points,
2034 so there is no point trying again.... er ....
2036 .... except when the .* appears inside capturing parentheses, and there is a
2037 subsequent back reference to those parentheses. We haven't enough information
2038 to catch that case precisely.
2040 At first, the best we could do was to detect when .* was in capturing brackets
2041 and the highest back reference was greater than or equal to that level.
2042 However, by keeping a bitmap of the first 31 back references, we can catch some
2043 of the more common cases more precisely.
2046 code points to start of expression (the bracket)
2047 options points to the options setting
2048 bracket_map a bitmap of which brackets we are inside while testing; this
2049 handles up to substring 31; after that we just have to take
2050 the less precise approach
2051 backref_map the back reference bitmap
2053 Returns: true or false
2056 static bool is_anchored(const uschar* code, int options, unsigned bracket_map, unsigned backref_map)
2059 const uschar* scode = firstSignificantOpCode(code + 1 + LINK_SIZE);
2062 /* Capturing brackets */
2065 if (op > EXTRACT_BASIC_MAX)
2066 op = get2ByteOpcodeValueAtOffset(scode, 2 + LINK_SIZE);
2067 int new_map = bracket_map | ((op < 32)? (1 << op) : 1);
2068 if (!is_anchored(scode, options, new_map, backref_map))
2072 /* Other brackets */
2073 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE) {
2074 if (!is_anchored(scode, options, bracket_map, backref_map))
2077 /* Check for explicit anchoring */
2079 } else if ((options & MatchAcrossMultipleLinesOption) || op != OP_CIRC)
2081 code += getOpcodeValueAtOffset(code, 1);
2082 } while (*code == OP_ALT); /* Loop for each alternative */
2088 /*************************************************
2089 * Check for starting with ^ or .* *
2090 *************************************************/
2092 /* This is called to find out if every branch starts with ^ or .* so that
2093 "first char" processing can be done to speed things up in multiline
2094 matching and for non-DOTALL patterns that start with .* (which must start at
2095 the beginning or after \n). As in the case of is_anchored() (see above), we
2096 have to take account of back references to capturing brackets that contain .*
2097 because in that case we can't make the assumption.
2100 code points to start of expression (the bracket)
2101 bracket_map a bitmap of which brackets we are inside while testing; this
2102 handles up to substring 31; after that we just have to take
2103 the less precise approach
2104 backref_map the back reference bitmap
2107 static bool canApplyFirstCharOptimization(const uschar* code, unsigned bracket_map, unsigned backref_map)
2110 const uschar* scode = firstSignificantOpCode(code + 1 + LINK_SIZE);
2113 /* Capturing brackets */
2116 if (op > EXTRACT_BASIC_MAX)
2117 op = get2ByteOpcodeValueAtOffset(scode, 2+LINK_SIZE);
2118 int new_map = bracket_map | ((op < 32)? (1 << op) : 1);
2119 if (!canApplyFirstCharOptimization(scode, new_map, backref_map))
2123 /* Other brackets */
2124 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE) {
2125 if (!canApplyFirstCharOptimization(scode, bracket_map, backref_map))
2128 /* .* means "start at start or after \n" if it isn't in brackets that
2129 may be referenced. */
2131 } else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) {
2132 if (scode[1] != OP_ANY_CHAR || (bracket_map & backref_map))
2134 } else if (op != OP_CIRC) /* Check for explicit circumflex */
2137 /* Move on to the next alternative */
2139 code += getOpcodeValueAtOffset(code, 1);
2140 } while (*code == OP_ALT); /* Loop for each alternative */
2145 /*************************************************
2146 * Check for asserted fixed first char *
2147 *************************************************/
2149 /* During compilation, the "first char" settings from forward assertions are
2150 discarded, because they can cause conflicts with actual literals that follow.
2151 However, if we end up without a first char setting for an unanchored pattern,
2152 it is worth scanning the regex to see if there is an initial asserted first
2153 char. If all branches start with the same asserted char, or with a bracket all
2154 of whose alternatives start with the same asserted char (recurse ad lib), then
2155 we return that char, otherwise -1.
2158 code points to start of expression (the bracket)
2159 options pointer to the options (used to check casing changes)
2160 inassert true if in an assertion
2162 Returns: -1 or the fixed first char
2165 static int find_firstassertedchar(const uschar* code, int options, bool inassert)
2169 const uschar* scode = firstSignificantOpCodeSkippingAssertions(code + 1 + LINK_SIZE);
2183 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
2191 case OP_EXACT: /* Fall through */
2195 case OP_CHAR_IGNORING_CASE:
2197 case OP_ASCII_LETTER_IGNORING_CASE:
2204 if (options & IgnoreCaseOption)
2205 c |= REQ_IGNORE_CASE;
2207 else if (c != scode[1])
2212 code += getOpcodeValueAtOffset(code, 1);
2213 } while (*code == OP_ALT);
2217 static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase, CompileData& compile_block, ErrorCode& errorcode)
2219 /* Make a pass over the pattern to compute the
2220 amount of store required to hold the compiled code. This does not have to be
2221 perfect as long as errors are overestimates. At the same time we can detect any
2222 flag settings right at the start, and extract them. Make an attempt to correct
2223 for any counted white space if an "extended" flag setting appears late in the
2224 pattern. We can't be so clever for #-comments. */
2226 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2227 int branch_extra = 0;
2228 int lastitemlength = 0;
2229 unsigned brastackptr = 0;
2230 int brastack[BRASTACK_SIZE];
2231 uschar bralenstack[BRASTACK_SIZE];
2232 int item_count = -1;
2235 const UChar* ptr = (const UChar*)(pattern - 1);
2236 const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2238 while (++ptr < patternEnd) {
2239 int minRepeats = 0, maxRepeats = 0;
2242 item_count++; /* Is zero for the first non-comment item */
2245 /* A backslashed item may be an escaped data character or it may be a
2249 c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
2253 lastitemlength = 1; /* Default length of last item for repeats */
2255 if (c >= 0) { /* Data character */
2256 length += 2; /* For a one-byte character */
2260 for (i = 0; i < _pcre_utf8_table1_size; i++)
2261 if (c <= _pcre_utf8_table1[i]) break;
2263 lastitemlength += i;
2269 /* Other escapes need one byte */
2273 /* A back reference needs an additional 2 bytes, plus either one or 5
2274 bytes for a repeat. We also need to keep the value of the highest
2277 if (c <= -ESC_REF) {
2278 int refnum = -c - ESC_REF;
2279 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
2280 if (refnum > compile_block.top_backref)
2281 compile_block.top_backref = refnum;
2282 length += 2; /* For single back reference */
2283 if (ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd)) {
2284 ptr = read_repeat_counts(ptr+2, &minRepeats, &maxRepeats, &errorcode);
2287 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2288 (minRepeats == 1 && maxRepeats == -1))
2298 case '^': /* Single-byte metacharacters */
2305 case '*': /* These repeats won't be after brackets; */
2306 case '+': /* those are handled separately */
2309 goto POSESSIVE; /* A few lines below */
2311 /* This covers the cases of braced repeats after a single char, metachar,
2312 class, or back reference. */
2315 if (!is_counted_repeat(ptr+1, patternEnd))
2317 ptr = read_repeat_counts(ptr+1, &minRepeats, &maxRepeats, &errorcode);
2321 /* These special cases just insert one extra opcode */
2323 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2324 (minRepeats == 1 && maxRepeats == -1))
2327 /* These cases might insert additional copies of a preceding character. */
2330 if (minRepeats != 1) {
2331 length -= lastitemlength; /* Uncount the original char or metachar */
2333 length += 3 + lastitemlength;
2335 length += lastitemlength + ((maxRepeats > 0)? 3 : 1);
2339 ptr++; /* Needs no extra length */
2341 POSESSIVE: /* Test for possessive quantifier */
2342 if (ptr[1] == '+') {
2344 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
2348 /* An alternation contains an offset to the next branch or ket. If any ims
2349 options changed in the previous branch(es), and/or if we are in a
2350 lookbehind assertion, extra space will be needed at the start of the
2351 branch. This is handled by branch_extra. */
2354 length += 1 + LINK_SIZE + branch_extra;
2357 /* A character class uses 33 characters provided that all the character
2358 values are less than 256. Otherwise, it uses a bit map for low valued
2359 characters, and individual items for others. Don't worry about character
2360 types that aren't allowed in classes - they'll get picked up during the
2361 compile. A character class that contains only one single-byte character
2362 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2363 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2368 if (*(++ptr) == '^') {
2369 class_optcount = 10; /* Greater than one */
2375 bool class_utf8 = false;
2377 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2378 /* Check for escapes */
2381 c = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
2385 /* \b is backspace inside a class; \X is literal */
2390 /* Handle escapes that turn into characters */
2393 goto NON_SPECIAL_CHARACTER;
2395 /* Escapes that are meta-things. The normal ones just affect the
2396 bit map, but Unicode properties require an XCLASS extended item. */
2399 class_optcount = 10; /* \d, \s etc; make sure > 1 */
2402 /* Anything else increments the possible optimization count. We have to
2403 detect ranges here so that we can compute the number of extra ranges for
2404 caseless wide characters when UCP support is available. If there are wide
2405 characters, we are going to have to use an XCLASS, even for single
2409 c = getCharAndAdvanceIfSurrogate(ptr, patternEnd);
2411 /* Come here from handling \ above when it escapes to a char value */
2413 NON_SPECIAL_CHARACTER:
2417 if (ptr + 1 < patternEnd && ptr[1] == '-') {
2418 UChar const *hyptr = ptr++;
2419 if (ptr + 1 < patternEnd && ptr[1] == '\\') {
2421 d = check_escape(&ptr, patternEnd, &errorcode, bracount, true);
2425 d = '\b'; /* backspace */
2427 else if (ptr + 1 < patternEnd && ptr[1] != ']') {
2429 d = getCharAndAdvanceIfSurrogate(ptr, patternEnd);
2432 ptr = hyptr; /* go back to hyphen as data */
2435 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2436 127 for caseless matching, we will need to use an XCLASS. */
2439 class_optcount = 10; /* Ensure > 1 */
2445 if ((d > 255 || (ignoreCase && d > 127))) {
2447 if (!class_utf8) /* Allow for XCLASS overhead */
2450 length += LINK_SIZE + 2;
2453 /* If we have UCP support, find out how many extra ranges are
2454 needed to map the other case of characters within this range. We
2455 have to mimic the range optimization here, because extending the
2456 range upwards might push d over a boundary that makes it use
2457 another byte in the UTF-8 representation. */
2463 while (get_othercase_range(&cc, origd, &occ, &ocd)) {
2464 if (occ >= c && ocd <= d)
2465 continue; /* Skip embedded */
2467 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2468 { /* if there is overlap, */
2469 c = occ; /* noting that if occ < c */
2470 continue; /* we can't have ocd > d */
2471 } /* because a subrange is */
2472 if (ocd > d && occ <= d + 1) /* always shorter than */
2473 { /* the basic range. */
2478 /* An extra item is needed */
2480 length += 1 + _pcre_ord2utf8(occ, buffer) +
2481 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
2485 /* The length of the (possibly extended) range */
2487 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
2492 /* We have a single character. There is nothing to be done unless we
2493 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2494 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2498 if ((c > 255 || (ignoreCase && c > 127))) {
2500 class_optcount = 10; /* Ensure > 1 */
2501 if (!class_utf8) /* Allow for XCLASS overhead */
2504 length += LINK_SIZE + 2;
2506 length += (ignoreCase ? 2 : 1) * (1 + _pcre_ord2utf8(c, buffer));
2512 if (ptr >= patternEnd) { /* Missing terminating ']' */
2517 /* We can optimize when there was only one optimizable character. Repeats
2518 for positive and negated single one-byte chars are handled by the general
2519 code. Here, we handle repeats for the class opcodes. */
2521 if (class_optcount == 1)
2526 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2527 we also need extra for wrapping the whole thing in a sub-pattern. */
2529 if (ptr + 1 < patternEnd && ptr[1] == '{' && is_counted_repeat(ptr+2, patternEnd)) {
2530 ptr = read_repeat_counts(ptr+2, &minRepeats, &maxRepeats, &errorcode);
2533 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2534 (minRepeats == 1 && maxRepeats == -1))
2538 if (ptr + 1 < patternEnd && ptr[1] == '+') {
2540 length += 2 + 2 * LINK_SIZE;
2541 } else if (ptr + 1 < patternEnd && ptr[1] == '?')
2547 /* Brackets may be genuine groups or special things */
2551 int branch_newextra = 0;
2552 int bracket_length = 1 + LINK_SIZE;
2553 bool capturing = false;
2555 /* Handle special forms of bracket, which all start (? */
2557 if (ptr + 1 < patternEnd && ptr[1] == '?') {
2558 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2559 /* Non-referencing groups and lookaheads just move the pointer on, and
2560 then behave like a non-special bracket, except that they don't increment
2561 the count of extracting brackets. Ditto for the "once only" bracket,
2562 which is in Perl from version 5.005. */
2570 /* Else loop checking valid options until ) is met. Anything else is an
2571 error. If we are without any brackets, i.e. at top level, the settings
2572 act as if specified in the options, so massage the options immediately.
2573 This is for backward compatibility with Perl 5.004. */
2582 /* Capturing brackets must be counted so we can process escapes in a
2583 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2584 an additional 3 bytes of memory per capturing bracket. */
2588 if (bracount > EXTRACT_BASIC_MAX)
2589 bracket_length += 3;
2592 /* Save length for computing whole length at end if there's a repeat that
2593 requires duplication of the group. Also save the current value of
2594 branch_extra, and start the new group with the new value. If non-zero, this
2595 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2597 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2602 bralenstack[brastackptr] = branch_extra;
2603 branch_extra = branch_newextra;
2605 brastack[brastackptr++] = length;
2606 length += bracket_length;
2609 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2610 have to replicate this bracket up to that many times. If brastackptr is
2611 0 this is an unmatched bracket which will generate an error, but take care
2612 not to try to access brastack[-1] when computing the length and restoring
2613 the branch_extra value. */
2617 length += 1 + LINK_SIZE;
2618 if (brastackptr > 0) {
2619 duplength = length - brastack[--brastackptr];
2620 branch_extra = bralenstack[brastackptr];
2625 /* Leave ptr at the final char; for read_repeat_counts this happens
2626 automatically; for the others we need an increment. */
2628 if (ptr + 1 < patternEnd && (c = ptr[1]) == '{' && is_counted_repeat(ptr+2, patternEnd)) {
2629 ptr = read_repeat_counts(ptr+2, &minRepeats, &maxRepeats, &errorcode);
2632 } else if (c == '*') {
2636 } else if (c == '+') {
2640 } else if (c == '?') {
2649 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2650 group, and if the maximum is greater than zero, we have to replicate
2651 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2654 if (minRepeats == 0) {
2656 if (maxRepeats > 0) length += (maxRepeats - 1) * (duplength + 3 + 2 * LINK_SIZE);
2659 /* When the minimum is greater than zero, we have to replicate up to
2660 minval-1 times, with no additions required in the copies. Then, if there
2661 is a limited maximum we have to replicate up to maxval-1 times allowing
2662 for a BRAZERO item before each optional copy and nesting brackets for all
2663 but one of the optional copies. */
2666 length += (minRepeats - 1) * duplength;
2667 if (maxRepeats > minRepeats) /* Need this test as maxRepeats=-1 means no limit */
2668 length += (maxRepeats - minRepeats) * (duplength + 3 + 2 * LINK_SIZE)
2669 - (2 + 2 * LINK_SIZE);
2672 /* Allow space for once brackets for "possessive quantifier" */
2674 if (ptr + 1 < patternEnd && ptr[1] == '+') {
2676 length += 2 + 2 * LINK_SIZE;
2680 /* Non-special character. It won't be space or # in extended mode, so it is
2681 always a genuine character. If we are in a \Q...\E sequence, check for the
2682 end; if not, we have a literal. */
2687 length += 2; /* For a one-byte character */
2688 lastitemlength = 1; /* Default length of last item for repeats */
2690 /* In UTF-8 mode, check for additional bytes. */
2693 c = getCharAndAdvanceIfSurrogate(ptr, patternEnd);
2696 for (i = 0; i < _pcre_utf8_table1_size; i++)
2697 if (c <= _pcre_utf8_table1[i])
2700 lastitemlength += i;
2708 length += 2 + LINK_SIZE; /* For final KET and END */
2713 static void printCompiledRegExp(JSRegExp* re, int length)
2715 printf("Length = %d top_bracket = %d top_backref = %d\n",
2716 length, re->top_bracket, re->top_backref);
2720 ((re->options & IsAnchoredOption) != 0)? "anchored " : "",
2721 ((re->options & IgnoreCaseOption) != 0)? "ignores case " : "",
2722 ((re->options & MatchAcrossMultipleLinesOption) != 0)? "multiline " : "");
2725 if (re->options & UseFirstByteOptimizationOption) {
2726 char ch = re->first_byte & 255;
2727 const char* caseless = (re->first_byte & REQ_IGNORE_CASE) ? " (ignores case)" : "";
2728 if (isASCIIAlphanumeric(ch))
2729 printf("First char = %c%s\n", ch, caseless);
2731 printf("First char = \\x%02x%s\n", ch, caseless);
2734 if (re->options & UseRequiredByteOptimizationOption) {
2735 char ch = re->req_byte & 255;
2736 const char* caseless = (re->req_byte & REQ_IGNORE_CASE) ? " (ignores case)" : "";
2737 if (isASCIIAlphanumeric(ch))
2738 printf("Req char = %c%s\n", ch, caseless);
2740 printf("Req char = \\x%02x%s\n", ch, caseless);
2743 // This debugging function has been removed from JavaScriptCore's PCRE
2744 //pcre_printint(re, stdout);
2748 /*************************************************
2749 * Compile a Regular Expression *
2750 *************************************************/
2752 /* This function takes a string and returns a pointer to a block of store
2753 holding a compiled version of the expression. The original API for this
2754 function had no error code return variable; it is retained for backwards
2755 compatibility. The new function is given a new name.
2758 pattern the regular expression
2759 options various option bits
2760 errorcodeptr pointer to error code variable (pcre_compile2() only)
2761 can be NULL if you don't want a code value
2762 errorptr pointer to pointer to error text
2763 erroroffset ptr offset in pattern where error was detected
2764 tables pointer to character tables or NULL
2766 Returns: pointer to compiled data block, or NULL on error,
2767 with errorptr and erroroffset set
2770 static JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)
2772 *errorptr = error_text(errorcode);
2776 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2777 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2778 unsigned* numSubpatterns, const char** errorptr)
2780 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2781 can do is just return NULL, but we can set a code value if there is a code pointer. */
2786 CompileData compile_block;
2788 ErrorCode errorcode = ERR0;
2789 int length = calculateCompiledPatternLengthAndFlags(pattern, patternLength, ignoreCase, compile_block, errorcode);
2791 return returnError(errorcode, errorptr);
2793 if (length > MAX_PATTERN_SIZE)
2794 return returnError(ERR16, errorptr);
2796 size_t size = length + sizeof(JSRegExp);
2797 JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
2800 return returnError(ERR13, errorptr);
2802 /* Put in the magic number, and save the sizes, options, and character table
2803 pointer. NULL is used for the default character tables. The nullpad field is at
2804 the end; it's there to help in the case when a regex compiled on a system with
2805 4-byte pointers is run on another with 8-byte pointers. */
2807 re->size = (pcre_uint32)size;
2808 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2810 /* The starting points of the name/number translation table and of the code are
2811 passed around in the compile data block. */
2813 const uschar* codestart = (const uschar*)(re + 1);
2814 compile_block.start_code = codestart;
2815 compile_block.start_pattern = (const UChar*)pattern;
2817 /* Set up a starting, non-extracting bracket, then compile the expression. On
2818 error, errorcode will be set non-zero, so we don't need to look at the result
2819 of the function here. */
2821 const UChar* ptr = (const UChar*)pattern;
2822 const UChar* patternEnd = pattern + patternLength;
2823 uschar* code = (uschar*)codestart;
2825 int firstbyte, reqbyte;
2826 int bracketCount = 0;
2827 (void)compile_regex(re->options, &bracketCount, &code, &ptr,
2829 &errorcode, 0, &firstbyte, &reqbyte, compile_block);
2830 re->top_bracket = bracketCount;
2831 re->top_backref = compile_block.top_backref;
2833 /* If not reached end of pattern on success, there's an excess bracket. */
2835 if (errorcode == 0 && ptr < patternEnd)
2838 /* Fill in the terminating state and check for disastrous overflow, but
2839 if debugging, leave the test till after things are printed out. */
2844 if (code - codestart > length)
2848 /* Give an error if there's back reference to a non-existent capturing
2851 if (re->top_backref > re->top_bracket)
2854 /* Failed to compile, or error while post-processing */
2856 if (errorcode != ERR0) {
2857 delete [] reinterpret_cast<char*>(re);
2858 return returnError(errorcode, errorptr);
2861 /* If the anchored option was not passed, set the flag if we can determine that
2862 the pattern is anchored by virtue of ^ characters or \A or anything else (such
2863 as starting with .* when DOTALL is set).
2865 Otherwise, if we know what the first character has to be, save it, because that
2866 speeds up unanchored matches no end. If not, see if we can set the
2867 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2868 start with ^. and also when all branches start with .* for non-DOTALL matches.
2871 if (is_anchored(codestart, re->options, 0, compile_block.backref_map))
2872 re->options |= IsAnchoredOption;
2875 firstbyte = find_firstassertedchar(codestart, re->options, false);
2876 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
2878 int ch = firstbyte & 255;
2880 re->first_byte = ((firstbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstbyte;
2881 re->options |= UseFirstByteOptimizationOption;
2884 else if (canApplyFirstCharOptimization(codestart, 0, compile_block.backref_map))
2885 re->options |= UseMultiLineFirstByteOptimizationOption;
2888 /* For an anchored pattern, we use the "required byte" only if it follows a
2889 variable length item in the regex. Remove the caseless flag for non-caseable
2892 if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) || (reqbyte & REQ_VARY))) {
2893 int ch = reqbyte & 255;
2895 re->req_byte = ((reqbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqbyte & ~REQ_IGNORE_CASE) : reqbyte;
2896 re->options |= UseRequiredByteOptimizationOption;
2901 printCompiledRegExp(re);
2903 /* This check is done here in the debugging case so that the code that
2904 was compiled can be seen. */
2905 if (code - codestart > length) {
2907 *errorptr = error_text(ERR7);
2914 *numSubpatterns = re->top_bracket;
2918 void jsRegExpFree(JSRegExp* re)
2920 delete [] reinterpret_cast<char*>(re);