JavaScriptCore:
author: darin@apple.com <darin@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 30 Nov 2007 18:54:34 +0000 (18:54 +0000)
committer: darin@apple.com <darin@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 30 Nov 2007 18:54:34 +0000 (18:54 +0000)
        Reviewed by Adam Roben.

        - fix http://bugs.webkit.org/show_bug.cgi?id=16207
          JavaScript regular expressions should match UTF-16 code units rather than characters

        SunSpider says this is 5.5% faster on the regexp test, 0.4% faster overall.

        Test: fast/js/regexp-non-bmp.html

        Renamed ANY_CHAR to NOT_NEWLINE to more-accurately reflect its meaning.

        * pcre/pcre_compile.cpp:
        (compile_branch): Removed calls to the UTF-16 character accessor functions, replacing
        them with simple pointer dereferences in some cases, and no code at all in others.
        (calculateCompiledPatternLengthAndFlags): Ditto.

        * pcre/pcre_exec.cpp:
        (match): Fixed indentation of some case labels (including all the BEGIN_OPCODE).
        Removed calls to the UTF-16 character accessor functions, replacing them with simple
        pointer dereferences in some cases, and no code at all in others. Also removed some
        explicit UTF-16 support code in a few cases. Removed the unneeded "UTF-8" code path
        in the ANY_CHAR repeat code, and in another case, eliminated the code to check against
        end_subject because it is already done outside the loop.
        (jsRegExpExecute):

        * pcre/pcre_internal.h: Removed all the UTF-16 helper functions.

LayoutTests:

        Reviewed by Adam Roben.

        - test for http://bugs.webkit.org/show_bug.cgi?id=16207
          JavaScript regular expressions should match UTF-16 code units rather than characters

        * fast/js/regexp-non-bmp-expected.txt: Added.
        * fast/js/regexp-non-bmp.html: Added.
        * fast/js/resources/regexp-non-bmp.js: Added.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@28243 268f45cc-cd09-0410-ab3c-d52691b4dbfc

JavaScriptCore/ChangeLog
JavaScriptCore/pcre/pcre_compile.cpp
JavaScriptCore/pcre/pcre_exec.cpp
JavaScriptCore/pcre/pcre_internal.h
LayoutTests/ChangeLog
LayoutTests/fast/js/regexp-non-bmp-expected.txt [new file with mode: 0644]
LayoutTests/fast/js/regexp-non-bmp.html [new file with mode: 0644]
LayoutTests/fast/js/resources/regexp-non-bmp.js [new file with mode: 0644]

index 97e5d7113256ac401fd2b33bdac321e9a52ace0a..ac0ad54a430cad82284a3a383ac00ea29eadfcb1 100644 (file)
@@ -1,3 +1,32 @@
+2007-11-30  Darin Adler  <darin@apple.com>
+
+        Reviewed by Adam Roben.
+
+        - fix http://bugs.webkit.org/show_bug.cgi?id=16207
+          JavaScript regular expressions should match UTF-16 code units rather than characters
+
+        SunSpider says this is 5.5% faster on the regexp test, 0.4% faster overall.
+
+        Test: fast/js/regexp-non-bmp.html
+
+        Renamed ANY_CHAR to NOT_NEWLINE to more-accurately reflect its meaning.
+
+        * pcre/pcre_compile.cpp:
+        (compile_branch): Removed calls to the UTF-16 character accessor functions, replacing
+        them with simple pointer dereferences in some cases, and no code at all in others.
+        (calculateCompiledPatternLengthAndFlags): Ditto.
+
+        * pcre/pcre_exec.cpp:
+        (match): Fixed indentation of some case labels (including all the BEGIN_OPCODE).
+        Removed calls to the UTF-16 character accessor functions, replacing them with simple
+        pointer dereferences in some cases, and no code at all in others. Also removed some
+        explicit UTF-16 support code in a few cases. Removed the unneeded "UTF-8" code path
+        in the ANY_CHAR repeat code, and in another case, eliminated the code to check against
+        end_subject because it is already done outside the loop.
+        (jsRegExpExecute):
+
+        * pcre/pcre_internal.h: Removed all the UTF-16 helper functions.
+
 2007-11-30  Eric Seidel  <eric@webkit.org>
 
         Reviewed by Maciej.
index be8fd211b74b8e4f55102551c5d3e45166aa7476..99bdbf4a52e424a83d98dbfc4a7117c51b5b9293 100644 (file)
@@ -562,7 +562,7 @@ static int find_fixedlength(uschar* code, int options)
             case OP_WHITESPACE:
             case OP_NOT_WORDCHAR:
             case OP_WORDCHAR:
-            case OP_ANY_CHAR:
+            case OP_NOT_NEWLINE:
                 branchlength++;
                 cc++;
                 break;
@@ -800,8 +800,8 @@ compile_branch(int options, int* brackets, uschar** codeptr,
             previous_callout = NULL;
         }
         
-        switch(c) {
-                /* The branch terminates at end of string, |, or ). */
+        switch (c) {
+            /* The branch terminates at end of string, |, or ). */
                 
             case 0:
                 if (ptr < patternEnd)
@@ -815,8 +815,8 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                 *ptrptr = ptr;
                 return true;
                 
-                /* Handle single-character metacharacters. In multiline mode, ^ disables
-                 the setting of any following char as a first character. */
+            /* Handle single-character metacharacters. In multiline mode, ^ disables
+             the setting of any following char as a first character. */
                 
             case '^':
                 if (options & MatchAcrossMultipleLinesOption) {
@@ -832,8 +832,8 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                 *code++ = OP_DOLL;
                 break;
                 
-                /* There can never be a first char if '.' is first, whatever happens about
-                 repeats. The value of reqbyte doesn't change either. */
+            /* There can never be a first char if '.' is first, whatever happens about
+             repeats. The value of reqbyte doesn't change either. */
                 
             case '.':
                 if (firstbyte == REQ_UNSET)
@@ -841,23 +841,22 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                 zerofirstbyte = firstbyte;
                 zeroreqbyte = reqbyte;
                 previous = code;
-                *code++ = OP_ANY_CHAR;
+                *code++ = OP_NOT_NEWLINE;
                 break;
                 
-                /* Character classes. If the included characters are all < 256, we build a
-                 32-byte bitmap of the permitted characters, except in the special case
-                 where there is only one such character. For negated classes, we build the
-                 map as usual, then invert it at the end. However, we use a different opcode
-                 so that data characters > 255 can be handled correctly.
-                 
-                 If the class contains characters outside the 0-255 range, a different
-                 opcode is compiled. It may optionally have a bit map for characters < 256,
-                 but those above are are explicitly listed afterwards. A flag byte tells
-                 whether the bitmap is present, and whether this is a negated class or not.
-                 */
-                
-            case '[':
-            {
+            /* Character classes. If the included characters are all < 256, we build a
+             32-byte bitmap of the permitted characters, except in the special case
+             where there is only one such character. For negated classes, we build the
+             map as usual, then invert it at the end. However, we use a different opcode
+             so that data characters > 255 can be handled correctly.
+             
+             If the class contains characters outside the 0-255 range, a different
+             opcode is compiled. It may optionally have a bit map for characters < 256,
+             but those above are are explicitly listed afterwards. A flag byte tells
+             whether the bitmap is present, and whether this is a negated class or not.
+             */
+                
+            case '[': {
                 previous = code;
                 should_flip_negation = false;
                 
@@ -894,9 +893,6 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                  strict here. At the start of the loop, c contains the first byte of the
                  character. */
                 while ((c = *(++ptr)) != ']') {
-                    if (c > 127)
-                        c = getCharAndAdvanceIfSurrogate(ptr);
-                    
                     /* Backslash may introduce a single character, or it may introduce one
                      of the specials, which just set a flag. Escaped items are checked for
                      validity in the pre-compiling pass. The sequence \b is a special case.
@@ -969,7 +965,7 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                     if (ptr[1] == '-' && ptr[2] != ']') {
                         ptr += 2;
                         
-                        int d = getCharAndAdvanceIfSurrogate(ptr);
+                        int d = *ptr;
                         
                         /* The second part of a range can be a single-character escape, but
                          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
@@ -1195,10 +1191,11 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                     memcpy(code, classbits, 32);
                 code += 32;
                 break;
-                
-                /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
-                 has been tested above. */
             }
+                
+            /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
+             has been tested above. */
+
             case '{':
                 if (!is_quantifier)
                     goto NORMAL_CHAR;
@@ -1306,7 +1303,7 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                  create a suitable repeat item. The code is shared with single-character
                  repeats by setting op_type to add a suitable offset into repeat_type. */
                 
-                else if (*previous <= OP_ANY_CHAR) {
+                else if (*previous <= OP_NOT_NEWLINE) {
                     op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
                     c = *previous;
                     
@@ -1607,13 +1604,12 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                 cd.req_varyopt |= reqvary;
                 break;
                 
-                
-                /* Start of nested bracket sub-expression, or comment or lookahead or
-                 lookbehind or option setting or condition. First deal with special things
-                 that can come after a bracket; all are introduced by ?, and the appearance
-                 of any of them means that this is not a referencing group. They were
-                 checked for validity in the first pass over the string, so we don't have to
-                 check for syntax errors here.  */
+            /* Start of nested bracket sub-expression, or comment or lookahead or
+             lookbehind or option setting or condition. First deal with special things
+             that can come after a bracket; all are introduced by ?, and the appearance
+             of any of them means that this is not a referencing group. They were
+             checked for validity in the first pass over the string, so we don't have to
+             check for syntax errors here.  */
                 
             case '(':
                 skipbytes = 0;
@@ -1750,9 +1746,9 @@ compile_branch(int options, int* brackets, uschar** codeptr,
                 }
                 break;
                 
-                /* Check \ for being a real metacharacter; if not, fall through and handle
-                 it as a data character at the start of a string. Escape items are checked
-                 for validity in the pre-compiling pass. */
+            /* Check \ for being a real metacharacter; if not, fall through and handle
+             it as a data character at the start of a string. Escape items are checked
+             for validity in the pre-compiling pass. */
                 
             case '\\':
                 tempptr = ptr;
@@ -2129,7 +2125,7 @@ static bool canApplyFirstCharOptimization(const uschar* code, unsigned bracket_m
          may be referenced. */
         
         } else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) {
-            if (scode[1] != OP_ANY_CHAR || (bracket_map & backref_map))
+            if (scode[1] != OP_NOT_NEWLINE || (bracket_map & backref_map))
                 return false;
         } else if (op != OP_CIRC) /* Check for explicit circumflex */
             return false;
@@ -2172,43 +2168,45 @@ static int find_firstassertedchar(const uschar* code, int options, bool inassert
         if (op >= OP_BRA)
             op = OP_BRA;
         
-        switch(op) {
-        default:
-            return -1;
-            
-        case OP_BRA:
-        case OP_ASSERT:
-        case OP_ONCE: {
-            int d;
-            if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
-                return -1;
-            if (c < 0)
-                c = d;
-            else if (c != d)
-                return -1;
-            break;
-        }
-        case OP_EXACT:       /* Fall through */
-            scode += 2;
-            
-        case OP_CHAR:
-        case OP_CHAR_IGNORING_CASE:
-        case OP_ASCII_CHAR:
-        case OP_ASCII_LETTER_IGNORING_CASE:
-        case OP_PLUS:
-        case OP_MINPLUS:
-            if (!inassert)
+        switch (op) {
+            default:
                 return -1;
-            if (c < 0) {
-                c = scode[1];
-                if (options & IgnoreCaseOption)
-                    c |= REQ_IGNORE_CASE;
+                
+            case OP_BRA:
+            case OP_ASSERT:
+            case OP_ONCE: {
+                int d;
+                if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
+                    return -1;
+                if (c < 0)
+                    c = d;
+                else if (c != d)
+                    return -1;
+                break;
             }
-            else if (c != scode[1])
-                return -1;
-            break;
+
+            case OP_EXACT:
+                scode += 2;
+                /* Fall through */
+
+            case OP_CHAR:
+            case OP_CHAR_IGNORING_CASE:
+            case OP_ASCII_CHAR:
+            case OP_ASCII_LETTER_IGNORING_CASE:
+            case OP_PLUS:
+            case OP_MINPLUS:
+                if (!inassert)
+                    return -1;
+                if (c < 0) {
+                    c = scode[1];
+                    if (options & IgnoreCaseOption)
+                        c |= REQ_IGNORE_CASE;
+                }
+                else if (c != scode[1])
+                    return -1;
+                break;
         }
-        
+
         code += getOpcodeValueAtOffset(code, 1);
     } while (*code == OP_ALT);
     return c;
@@ -2241,9 +2239,9 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
         
         item_count++;    /* Is zero for the first non-comment item */
         
-        switch(c) {
-                /* A backslashed item may be an escaped data character or it may be a
-                 character type. */
+        switch (c) {
+            /* A backslashed item may be an escaped data character or it may be a
+             character type. */
                 
             case '\\':
                 c = check_escape(&ptr, patternEnd, &errorcode, bracount, false);
@@ -2295,23 +2293,23 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                 }
                 continue;
                 
-                case '^':     /* Single-byte metacharacters */
-                case '.':
-                case '$':
+            case '^':     /* Single-byte metacharacters */
+            case '.':
+            case '$':
                 length++;
                 lastitemlength = 1;
                 continue;
                 
-                case '*':            /* These repeats won't be after brackets; */
-                case '+':            /* those are handled separately */
-                case '?':
+            case '*':            /* These repeats won't be after brackets; */
+            case '+':            /* those are handled separately */
+            case '?':
                 length++;
-                goto POSESSIVE;      /* A few lines below */
+                goto POSSESSIVE;
                 
-                /* This covers the cases of braced repeats after a single char, metachar,
-                 class, or back reference. */
-                
-                case '{':
+            /* This covers the cases of braced repeats after a single char, metachar,
+             class, or back reference. */
+            
+            case '{':
                 if (!is_counted_repeat(ptr+1, patternEnd))
                     goto NORMAL_CHAR;
                 ptr = read_repeat_counts(ptr+1, &minRepeats, &maxRepeats, &errorcode);
@@ -2338,32 +2336,31 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                 if (ptr[1] == '?')
                     ptr++;      /* Needs no extra length */
                 
-            POSESSIVE:                     /* Test for possessive quantifier */
+            POSSESSIVE:                     /* Test for possessive quantifier */
                 if (ptr[1] == '+') {
                     ptr++;
                     length += 2 + 2 * LINK_SIZE;   /* Allow for atomic brackets */
                 }
                 continue;
                 
-                /* An alternation contains an offset to the next branch or ket. If any ims
-                 options changed in the previous branch(es), and/or if we are in a
-                 lookbehind assertion, extra space will be needed at the start of the
-                 branch. This is handled by branch_extra. */
+            /* An alternation contains an offset to the next branch or ket. If any ims
+             options changed in the previous branch(es), and/or if we are in a
+             lookbehind assertion, extra space will be needed at the start of the
+             branch. This is handled by branch_extra. */
                 
             case '|':
                 length += 1 + LINK_SIZE + branch_extra;
                 continue;
                 
-                /* A character class uses 33 characters provided that all the character
-                 values are less than 256. Otherwise, it uses a bit map for low valued
-                 characters, and individual items for others. Don't worry about character
-                 types that aren't allowed in classes - they'll get picked up during the
-                 compile. A character class that contains only one single-byte character
-                 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
-                 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
+            /* A character class uses 33 characters provided that all the character
+             values are less than 256. Otherwise, it uses a bit map for low valued
+             characters, and individual items for others. Don't worry about character
+             types that aren't allowed in classes - they'll get picked up during the
+             compile. A character class that contains only one single-byte character
+             uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
+             where we can. (In UTF-8 mode we can do this only for chars < 128.) */
                 
-            case '[':
-            {
+            case '[': {
                 int class_optcount;
                 if (*(++ptr) == '^') {
                     class_optcount = 10;  /* Greater than one */
@@ -2406,7 +2403,7 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                      characters. */
                     
                     else {
-                        c = getCharAndAdvanceIfSurrogate(ptr, patternEnd);
+                        c = *ptr;
                         
                         /* Come here from handling \ above when it escapes to a char value */
                         
@@ -2424,10 +2421,8 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                                 if (-d == ESC_b)
                                     d = '\b';        /* backspace */
                             }
-                            else if (ptr + 1 < patternEnd && ptr[1] != ']') {
-                                ptr++;
-                                d = getCharAndAdvanceIfSurrogate(ptr, patternEnd);
-                            }
+                            else if (ptr + 1 < patternEnd && ptr[1] != ']')
+                                d = *++ptr;
                             if (d < 0)
                                 ptr = hyptr;      /* go back to hyphen as data */
                         }
@@ -2544,10 +2539,10 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                 }
                 continue;
             }
-                /* Brackets may be genuine groups or special things */
+
+            /* Brackets may be genuine groups or special things */
                 
-            case '(':
-            {
+            case '(': {
                 int branch_newextra = 0;
                 int bracket_length = 1 + LINK_SIZE;
                 bool capturing = false;
@@ -2606,13 +2601,14 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                 length += bracket_length;
                 continue;
             }
-                /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
-                 have to replicate this bracket up to that many times. If brastackptr is
-                 0 this is an unmatched bracket which will generate an error, but take care
-                 not to try to access brastack[-1] when computing the length and restoring
-                 the branch_extra value. */
-            case ')':
-            {
+
+            /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
+             have to replicate this bracket up to that many times. If brastackptr is
+             0 this is an unmatched bracket which will generate an error, but take care
+             not to try to access brastack[-1] when computing the length and restoring
+             the branch_extra value. */
+
+            case ')': {
                 int duplength;
                 length += 1 + LINK_SIZE;
                 if (brastackptr > 0) {
@@ -2677,28 +2673,23 @@ static int calculateCompiledPatternLengthAndFlags(const UChar* pattern, int patt
                 }
                 continue;
             }
-                /* Non-special character. It won't be space or # in extended mode, so it is
-                 always a genuine character. If we are in a \Q...\E sequence, check for the
-                 end; if not, we have a literal. */
+
+            /* Non-special character. It won't be space or # in extended mode, so it is
+             always a genuine character. If we are in a \Q...\E sequence, check for the
+             end; if not, we have a literal. */
                 
             default:
             NORMAL_CHAR:
-                
                 length += 2;          /* For a one-byte character */
                 lastitemlength = 1;   /* Default length of last item for repeats */
-                
-                /* In UTF-8 mode, check for additional bytes. */
-                
+
                 if (c > 127) {
-                    c = getCharAndAdvanceIfSurrogate(ptr, patternEnd);
-                    {
-                        int i;
-                        for (i = 0; i < _pcre_utf8_table1_size; i++)
-                            if (c <= _pcre_utf8_table1[i])
-                                break;
-                        length += i;
-                        lastitemlength += i;
-                    }
+                    int i;
+                    for (i = 0; i < _pcre_utf8_table1_size; i++)
+                        if (c <= _pcre_utf8_table1[i])
+                            break;
+                    length += i;
+                    lastitemlength += i;
                 }
                 
                 continue;
index fa9a2dd5099b0b2a08e4f7593c7dd1c4df94111b..9c60be10a693e1c5ebd0ff72a0f7a3f0a619d1d1 100644 (file)
@@ -460,9 +460,9 @@ RECURSE:
         switch (*stack.currentFrame->args.instructionPtr)
 #endif
         {
-                /* Non-capturing bracket: optimized */
+            /* Non-capturing bracket: optimized */
                 
-                BEGIN_OPCODE(BRA):
+            BEGIN_OPCODE(BRA):
             NON_CAPTURING_BRACKET:
                 DPRINTF(("start bracket 0\n"));
                 do {
@@ -474,27 +474,27 @@ RECURSE:
                 DPRINTF(("bracket 0 failed\n"));
                 RRETURN;
                 
-                /* Skip over large extraction number data if encountered. */
+            /* Skip over large extraction number data if encountered. */
                 
-                BEGIN_OPCODE(BRANUMBER):
+            BEGIN_OPCODE(BRANUMBER):
                 stack.currentFrame->args.instructionPtr += 3;
                 NEXT_OPCODE;
                 
-                /* End of the pattern. */
+            /* End of the pattern. */
                 
-                BEGIN_OPCODE(END):
+            BEGIN_OPCODE(END):
                 md.end_match_ptr = stack.currentFrame->args.subjectPtr;          /* Record where we ended */
                 md.end_offset_top = stack.currentFrame->args.offset_top;   /* and how many extracts were taken */
                 is_match = true;
                 RRETURN;
                 
-                /* Assertion brackets. Check the alternative branches in turn - the
-                 matching won't pass the KET for an assertion. If any one branch matches,
-                 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
-                 start of each branch to move the current point backwards, so the code at
-                 this level is identical to the lookahead case. */
+            /* Assertion brackets. Check the alternative branches in turn - the
+             matching won't pass the KET for an assertion. If any one branch matches,
+             the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
+             start of each branch to move the current point backwards, so the code at
+             this level is identical to the lookahead case. */
                 
-                BEGIN_OPCODE(ASSERT):
+            BEGIN_OPCODE(ASSERT):
                 do {
                     RECURSIVE_MATCH_STARTNG_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
                     if (is_match)
@@ -512,9 +512,9 @@ RECURSE:
                 stack.currentFrame->args.offset_top = md.end_offset_top;
                 NEXT_OPCODE;
                 
-                /* Negative assertion: all branches must fail to match */
+            /* Negative assertion: all branches must fail to match */
                 
-                BEGIN_OPCODE(ASSERT_NOT):
+            BEGIN_OPCODE(ASSERT_NOT):
                 do {
                     RECURSIVE_MATCH_STARTNG_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
                     if (is_match)
@@ -525,14 +525,14 @@ RECURSE:
                 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
                 NEXT_OPCODE;
                 
-                /* "Once" brackets are like assertion brackets except that after a match,
-                 the point in the subject string is not moved back. Thus there can never be
-                 a move back into the brackets. Friedl calls these "atomic" subpatterns.
-                 Check the alternative branches in turn - the matching won't pass the KET
-                 for this kind of subpattern. If any one branch matches, we carry on as at
-                 the end of a normal bracket, leaving the subject pointer. */
+            /* "Once" brackets are like assertion brackets except that after a match,
+             the point in the subject string is not moved back. Thus there can never be
+             a move back into the brackets. Friedl calls these "atomic" subpatterns.
+             Check the alternative branches in turn - the matching won't pass the KET
+             for this kind of subpattern. If any one branch matches, we carry on as at
+             the end of a normal bracket, leaving the subject pointer. */
                 
-                BEGIN_OPCODE(ONCE):
+            BEGIN_OPCODE(ONCE):
                 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr;
                 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
                 
@@ -589,49 +589,47 @@ RECURSE:
                 }
                 RRETURN;
                 
-                /* An alternation is the end of a branch; scan along to find the end of the
-                 bracketed group and go to there. */
+            /* An alternation is the end of a branch; scan along to find the end of the
+             bracketed group and go to there. */
                 
-                BEGIN_OPCODE(ALT):
+            BEGIN_OPCODE(ALT):
                 moveOpcodePtrPastAnyAlternateBranches(stack.currentFrame->args.instructionPtr);
                 NEXT_OPCODE;
                 
-                /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
-                 that it may occur zero times. It may repeat infinitely, or not at all -
-                 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
-                 repeat limits are compiled as a number of copies, with the optional ones
-                 preceded by BRAZERO or BRAMINZERO. */
+            /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
+             that it may occur zero times. It may repeat infinitely, or not at all -
+             i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
+             repeat limits are compiled as a number of copies, with the optional ones
+             preceded by BRAZERO or BRAMINZERO. */
                 
-                BEGIN_OPCODE(BRAZERO):
-                {
-                    stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
-                    RECURSIVE_MATCH_STARTNG_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.subpatternStart);
-                    if (is_match)
-                        RRETURN;
-                    moveOpcodePtrPastAnyAlternateBranches(stack.currentFrame->locals.startOfRepeatingBracket);
-                    stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
-                }
+            BEGIN_OPCODE(BRAZERO): {
+                stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
+                RECURSIVE_MATCH_STARTNG_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.subpatternStart);
+                if (is_match)
+                    RRETURN;
+                moveOpcodePtrPastAnyAlternateBranches(stack.currentFrame->locals.startOfRepeatingBracket);
+                stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
                 NEXT_OPCODE;
+            }
                 
-                BEGIN_OPCODE(BRAMINZERO):
-                {
-                    stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
-                    moveOpcodePtrPastAnyAlternateBranches(stack.currentFrame->locals.startOfRepeatingBracket);
-                    RECURSIVE_MATCH_STARTNG_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.subpatternStart);
-                    if (is_match)
-                        RRETURN;
-                    stack.currentFrame->args.instructionPtr++;
-                }
+            BEGIN_OPCODE(BRAMINZERO): {
+                stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
+                moveOpcodePtrPastAnyAlternateBranches(stack.currentFrame->locals.startOfRepeatingBracket);
+                RECURSIVE_MATCH_STARTNG_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.subpatternStart);
+                if (is_match)
+                    RRETURN;
+                stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
+            }
                 
-                /* End of a group, repeated or non-repeating. If we are at the end of
-                 an assertion "group", stop matching and return MATCH_MATCH, but record the
-                 current high water mark for use by positive assertions. Do this also
-                 for the "once" (not-backup up) groups. */
+            /* End of a group, repeated or non-repeating. If we are at the end of
+             an assertion "group", stop matching and return MATCH_MATCH, but record the
+             current high water mark for use by positive assertions. Do this also
+             for the "once" (non-backtracking) groups. */
                 
-                BEGIN_OPCODE(KET):
-                BEGIN_OPCODE(KETRMIN):
-                BEGIN_OPCODE(KETRMAX):
+            BEGIN_OPCODE(KET):
+            BEGIN_OPCODE(KETRMIN):
+            BEGIN_OPCODE(KETRMAX):
                 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 stack.currentFrame->args.subpatternStart = stack.currentFrame->locals.subpatternStart;
                 stack.currentFrame->locals.subpatternStart = stack.currentFrame->previousFrame->args.subpatternStart;
@@ -709,34 +707,33 @@ RECURSE:
                 }
                 RRETURN;
                 
-                /* Start of subject, or after internal newline if multiline. */
+            /* Start of subject, or after internal newline if multiline. */
                 
-                BEGIN_OPCODE(CIRC):
+            BEGIN_OPCODE(CIRC):
                 if (stack.currentFrame->args.subjectPtr != md.start_subject && (!md.multiline || !isNewline(stack.currentFrame->args.subjectPtr[-1])))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
                 
-                /* End of subject, or before internal newline if multiline. */
+            /* End of subject, or before internal newline if multiline. */
                 
-                BEGIN_OPCODE(DOLL):
+            BEGIN_OPCODE(DOLL):
                 if (stack.currentFrame->args.subjectPtr < md.end_subject && (!md.multiline || !isNewline(*stack.currentFrame->args.subjectPtr)))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
                 
-                /* Word boundary assertions */
+            /* Word boundary assertions */
                 
-                BEGIN_OPCODE(NOT_WORD_BOUNDARY):
-                BEGIN_OPCODE(WORD_BOUNDARY):
-            {
+            BEGIN_OPCODE(NOT_WORD_BOUNDARY):
+            BEGIN_OPCODE(WORD_BOUNDARY): {
                 bool currentCharIsWordChar = false;
                 bool previousCharIsWordChar = false;
                 
                 if (stack.currentFrame->args.subjectPtr > md.start_subject)
-                    previousCharIsWordChar = isWordChar(getPreviousChar(stack.currentFrame->args.subjectPtr));
+                    previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
                 if (stack.currentFrame->args.subjectPtr < md.end_subject)
-                    currentCharIsWordChar = isWordChar(getChar(stack.currentFrame->args.subjectPtr));
+                    currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
                 
                 /* Now see if the situation is what we want */
                 bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
@@ -745,90 +742,73 @@ RECURSE:
                 NEXT_OPCODE;
             }
                 
-                /* Match a single character type; inline for speed */
+            /* Match a single character type; inline for speed */
                 
-                BEGIN_OPCODE(ANY_CHAR):
-                if (stack.currentFrame->args.subjectPtr < md.end_subject && isNewline(*stack.currentFrame->args.subjectPtr))
+            BEGIN_OPCODE(NOT_NEWLINE):
+                if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                if (!movePtrToNextChar(stack.currentFrame->args.subjectPtr, md.end_subject))
+                if (isNewline(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-                
-                BEGIN_OPCODE(NOT_DIGIT):
-            {
+
+            BEGIN_OPCODE(NOT_DIGIT):
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                if (isASCIIDigit(c))
+                if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-            }    
-                BEGIN_OPCODE(DIGIT):
-            {
+
+            BEGIN_OPCODE(DIGIT):
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                if (!isASCIIDigit(c))
+                if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-            }
-                
-                BEGIN_OPCODE(NOT_WHITESPACE):
-            {
+
+            BEGIN_OPCODE(NOT_WHITESPACE):
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                if (isSpaceChar(c))
+                if (isSpaceChar(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-            }
-                
-                BEGIN_OPCODE(WHITESPACE):
-            {
+
+            BEGIN_OPCODE(WHITESPACE):
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                if (!isSpaceChar(c))
+                if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-            }
                 
-                BEGIN_OPCODE(NOT_WORDCHAR):
-            {
+            BEGIN_OPCODE(NOT_WORDCHAR):
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                if (isWordChar(c))
+                if (isWordChar(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-            }
                 
-                BEGIN_OPCODE(WORDCHAR):
-            {
+            BEGIN_OPCODE(WORDCHAR):
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                if (!isWordChar(c))
+                if (!isWordChar(*stack.currentFrame->args.subjectPtr++))
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
                 NEXT_OPCODE;
-            }
                 
-                /* Match a back reference, possibly repeatedly. Look past the end of the
-                 item to see if there is repeat information following. The code is similar
-                 to that for character classes, but repeated for efficiency. Then obey
-                 similar code to character type repeats - written out again for speed.
-                 However, if the referenced string is the empty string, always treat
-                 it as matched, any number of times (otherwise there could be infinite
-                 loops). */
+            /* Match a back reference, possibly repeatedly. Look past the end of the
+             item to see if there is repeat information following. The code is similar
+             to that for character classes, but repeated for efficiency. Then obey
+             similar code to character type repeats - written out again for speed.
+             However, if the referenced string is the empty string, always treat
+             it as matched, any number of times (otherwise there could be infinite
+             loops). */
                 
-                BEGIN_OPCODE(REF):
+            BEGIN_OPCODE(REF):
                 stack.currentFrame->locals.offset = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1) << 1;               /* Doubled ref number */
                 stack.currentFrame->args.instructionPtr += 3;                                 /* Advance past item */
                 
@@ -845,30 +825,30 @@ RECURSE:
                 /* Set up for repetition, or handle the non-repeated case */
                 
                 switch (*stack.currentFrame->args.instructionPtr) {
-                case OP_CRSTAR:
-                case OP_CRMINSTAR:
-                case OP_CRPLUS:
-                case OP_CRMINPLUS:
-                case OP_CRQUERY:
-                case OP_CRMINQUERY:
-                    repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
-                    break;
+                    case OP_CRSTAR:
+                    case OP_CRMINSTAR:
+                    case OP_CRPLUS:
+                    case OP_CRMINPLUS:
+                    case OP_CRQUERY:
+                    case OP_CRMINQUERY:
+                        repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
+                        break;
+                        
+                    case OP_CRRANGE:
+                    case OP_CRMINRANGE:
+                        minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
+                        min = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
+                        stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 3);
+                        if (stack.currentFrame->locals.max == 0)
+                            stack.currentFrame->locals.max = INT_MAX;
+                        stack.currentFrame->args.instructionPtr += 5;
+                        break;
                     
-                case OP_CRRANGE:
-                case OP_CRMINRANGE:
-                    minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
-                    min = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
-                    stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 3);
-                    if (stack.currentFrame->locals.max == 0)
-                        stack.currentFrame->locals.max = INT_MAX;
-                    stack.currentFrame->args.instructionPtr += 5;
-                    break;
-                
-                default:               /* No repeat follows */
-                    if (!match_ref(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
-                        RRETURN_NO_MATCH;
-                    stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
-                    NEXT_OPCODE;
+                    default:               /* No repeat follows */
+                        if (!match_ref(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
+                            RRETURN_NO_MATCH;
+                        stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
+                        NEXT_OPCODE;
                 }
                 
                 /* If the length of the reference is zero, just continue with the
@@ -924,45 +904,45 @@ RECURSE:
                 }
                 /* Control never reaches here */
                 
-                /* Match a bit-mapped character class, possibly repeatedly. This op code is
-                 used when all the characters in the class have values in the range 0-255,
-                 and either the matching is caseful, or the characters are in the range
-                 0-127 when UTF-8 processing is enabled. The only difference between
-                 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
-                 encountered.
-                 
-                 First, look past the end of the item to see if there is repeat information
-                 following. Then obey similar code to character type repeats - written out
-                 again for speed. */
-                
-                BEGIN_OPCODE(NCLASS):
-                BEGIN_OPCODE(CLASS):
+            /* Match a bit-mapped character class, possibly repeatedly. This op code is
+             used when all the characters in the class have values in the range 0-255,
+             and either the matching is caseful, or the characters are in the range
+             0-127 when UTF-8 processing is enabled. The only difference between
+             OP_CLASS and OP_NCLASS occurs when a data character outside the range is
+             encountered.
+             
+             First, look past the end of the item to see if there is repeat information
+             following. Then obey similar code to character type repeats - written out
+             again for speed. */
+                
+            BEGIN_OPCODE(NCLASS):
+            BEGIN_OPCODE(CLASS):
                 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1;                /* Save for matching */
                 stack.currentFrame->args.instructionPtr += 33;                     /* Advance past the item */
                 
                 switch (*stack.currentFrame->args.instructionPtr) {
-                case OP_CRSTAR:
-                case OP_CRMINSTAR:
-                case OP_CRPLUS:
-                case OP_CRMINPLUS:
-                case OP_CRQUERY:
-                case OP_CRMINQUERY:
-                    repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
-                    break;
-                    
-                case OP_CRRANGE:
-                case OP_CRMINRANGE:
-                    minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
-                    min = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
-                    stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 3);
-                    if (stack.currentFrame->locals.max == 0)
-                        stack.currentFrame->locals.max = INT_MAX;
-                    stack.currentFrame->args.instructionPtr += 5;
-                    break;
-                    
-                default:               /* No repeat follows */
-                    min = stack.currentFrame->locals.max = 1;
-                    break;
+                    case OP_CRSTAR:
+                    case OP_CRMINSTAR:
+                    case OP_CRPLUS:
+                    case OP_CRMINPLUS:
+                    case OP_CRQUERY:
+                    case OP_CRMINQUERY:
+                        repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
+                        break;
+                        
+                    case OP_CRRANGE:
+                    case OP_CRMINRANGE:
+                        minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
+                        min = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
+                        stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 3);
+                        if (stack.currentFrame->locals.max == 0)
+                            stack.currentFrame->locals.max = INT_MAX;
+                        stack.currentFrame->args.instructionPtr += 5;
+                        break;
+                        
+                    default:               /* No repeat follows */
+                        min = stack.currentFrame->locals.max = 1;
+                        break;
                 }
                 
                 /* First, ensure the minimum number of matches are present. */
@@ -970,7 +950,7 @@ RECURSE:
                 for (int i = 1; i <= min; i++) {
                     if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                         RRETURN_NO_MATCH;
-                    int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                    int c = *stack.currentFrame->args.subjectPtr++;
                     if (c > 255) {
                         if (stack.currentFrame->locals.data[-1] == OP_CLASS)
                             RRETURN_NO_MATCH;
@@ -995,7 +975,7 @@ RECURSE:
                             RRETURN;
                         if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.end_subject)
                             RRETURN;
-                        int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                        int c = *stack.currentFrame->args.subjectPtr++;
                         if (c > 255) {
                             if (stack.currentFrame->locals.data[-1] == OP_CLASS)
                                 RRETURN;
@@ -1013,8 +993,7 @@ RECURSE:
                     for (int i = min; i < stack.currentFrame->locals.max; i++) {
                         if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                             break;
-                        int length;
-                        int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                        int c = *stack.currentFrame->args.subjectPtr;
                         if (c > 255) {
                             if (stack.currentFrame->locals.data[-1] == OP_CLASS)
                                 break;
@@ -1022,7 +1001,7 @@ RECURSE:
                             if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
                                 break;
                         }
-                        stack.currentFrame->args.subjectPtr += length;
+                        ++stack.currentFrame->args.subjectPtr;
                     }
                     for (;;) {
                         RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
@@ -1030,42 +1009,40 @@ RECURSE:
                             RRETURN;
                         if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
                             break;        /* Stop if tried at original pos */
-                        movePtrToStartOfCurrentChar(stack.currentFrame->args.subjectPtr);
                     }
                     
                     RRETURN;
                 }
                 /* Control never reaches here */
                 
-                /* Match an extended character class. This opcode is encountered only
-                 in UTF-8 mode, because that's the only time it is compiled. */
+            /* Match an extended character class. */
                 
-                BEGIN_OPCODE(XCLASS):
+            BEGIN_OPCODE(XCLASS):
                 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE;                /* Save for matching */
                 stack.currentFrame->args.instructionPtr += getOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);                      /* Advance past the item */
                 
                 switch (*stack.currentFrame->args.instructionPtr) {
-                case OP_CRSTAR:
-                case OP_CRMINSTAR:
-                case OP_CRPLUS:
-                case OP_CRMINPLUS:
-                case OP_CRQUERY:
-                case OP_CRMINQUERY:
-                    repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
-                    break;
-                    
-                case OP_CRRANGE:
-                case OP_CRMINRANGE:
-                    minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
-                    min = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
-                    stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 3);
-                    if (stack.currentFrame->locals.max == 0)
-                        stack.currentFrame->locals.max = INT_MAX;
-                    stack.currentFrame->args.instructionPtr += 5;
-                    break;
-                    
-                default:               /* No repeat follows */
-                    min = stack.currentFrame->locals.max = 1;
+                    case OP_CRSTAR:
+                    case OP_CRMINSTAR:
+                    case OP_CRPLUS:
+                    case OP_CRMINPLUS:
+                    case OP_CRQUERY:
+                    case OP_CRMINQUERY:
+                        repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
+                        break;
+                        
+                    case OP_CRRANGE:
+                    case OP_CRMINRANGE:
+                        minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
+                        min = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
+                        stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 3);
+                        if (stack.currentFrame->locals.max == 0)
+                            stack.currentFrame->locals.max = INT_MAX;
+                        stack.currentFrame->args.instructionPtr += 5;
+                        break;
+                        
+                    default:               /* No repeat follows */
+                        min = stack.currentFrame->locals.max = 1;
             }
                 
                 /* First, ensure the minimum number of matches are present. */
@@ -1073,7 +1050,7 @@ RECURSE:
                 for (int i = 1; i <= min; i++) {
                     if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                         RRETURN_NO_MATCH;
-                    int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                    int c = *stack.currentFrame->args.subjectPtr++;
                     if (!_pcre_xclass(c, stack.currentFrame->locals.data))
                         RRETURN_NO_MATCH;
                 }
@@ -1094,7 +1071,7 @@ RECURSE:
                             RRETURN;
                         if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.end_subject)
                             RRETURN;
-                        int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                        int c = *stack.currentFrame->args.subjectPtr++;
                         if (!_pcre_xclass(c, stack.currentFrame->locals.data))
                             RRETURN;
                     }
@@ -1108,11 +1085,10 @@ RECURSE:
                     for (int i = min; i < stack.currentFrame->locals.max; i++) {
                         if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                             break;
-                        int length;
-                        int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                        int c = *stack.currentFrame->args.subjectPtr;
                         if (!_pcre_xclass(c, stack.currentFrame->locals.data))
                             break;
-                        stack.currentFrame->args.subjectPtr += length;
+                        ++stack.currentFrame->args.subjectPtr;
                     }
                     for(;;) {
                         RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
@@ -1120,71 +1096,43 @@ RECURSE:
                             RRETURN;
                         if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
                             break;        /* Stop if tried at original pos */
-                        movePtrToStartOfCurrentChar(stack.currentFrame->args.subjectPtr);
                     }
                     RRETURN;
                 }
                 
                 /* Control never reaches here */
                 
-                /* Match a single character, casefully */
+            /* Match a single character, casefully */
                 
-                BEGIN_OPCODE(CHAR):
+            BEGIN_OPCODE(CHAR):
                 stack.currentFrame->locals.length = 1;
                 stack.currentFrame->args.instructionPtr++;
                 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
-            {
-                int dc;
                 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
-                switch (md.end_subject - stack.currentFrame->args.subjectPtr) {
-                case 0:
+                if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
-                case 1:
-                    dc = *stack.currentFrame->args.subjectPtr++;
-                    if (isLeadingSurrogate(dc))
-                        RRETURN_NO_MATCH;
-                    break;
-                default:
-                    dc = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                }
-                if (stack.currentFrame->locals.fc != dc)
+                if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++)
                     RRETURN_NO_MATCH;
-            }
                 NEXT_OPCODE;
                 
-                /* Match a single character, caselessly */
+            /* Match a single character, caselessly */
                 
-                BEGIN_OPCODE(CHAR_IGNORING_CASE):
+            BEGIN_OPCODE(CHAR_IGNORING_CASE): {
                 stack.currentFrame->locals.length = 1;
                 stack.currentFrame->args.instructionPtr++;
                 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
-                
-                if (md.end_subject - stack.currentFrame->args.subjectPtr == 0)
-                    RRETURN_NO_MATCH;
-                
-            {
-                int dc;
-                if (md.end_subject - stack.currentFrame->args.subjectPtr == 1) {
-                    dc = *stack.currentFrame->args.subjectPtr++;
-                    if (isLeadingSurrogate(dc))
-                        RRETURN_NO_MATCH;
-                } else
-                    dc = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
                 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
-                
-                /* If we have Unicode property support, we can use it to test the other
-                 case of the character, if there is one. */
-                
-                if (stack.currentFrame->locals.fc != dc) {
-                    if (dc != _pcre_ucp_othercase(stack.currentFrame->locals.fc))
-                        RRETURN_NO_MATCH;
-                }
-            }
+                if (stack.currentFrame->args.subjectPtr >= md.end_subject)
+                    RRETURN_NO_MATCH;
+                int dc = *stack.currentFrame->args.subjectPtr++;
+                if (stack.currentFrame->locals.fc != dc && _pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc)
+                    RRETURN_NO_MATCH;
                 NEXT_OPCODE;
+            }
                 
-                /* Match a single ASCII character. */
+            /* Match a single ASCII character. */
                 
-                BEGIN_OPCODE(ASCII_CHAR):
+            BEGIN_OPCODE(ASCII_CHAR):
                 if (md.end_subject == stack.currentFrame->args.subjectPtr)
                     RRETURN_NO_MATCH;
                 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1])
@@ -1193,9 +1141,9 @@ RECURSE:
                 stack.currentFrame->args.instructionPtr += 2;
                 NEXT_OPCODE;
                 
-                /* Match one of two cases of an ASCII character. */
+            /* Match one of two cases of an ASCII letter. */
                 
-                BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
+            BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
                 if (md.end_subject == stack.currentFrame->args.subjectPtr)
                     RRETURN_NO_MATCH;
                 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1])
@@ -1204,28 +1152,28 @@ RECURSE:
                 stack.currentFrame->args.instructionPtr += 2;
                 NEXT_OPCODE;
                 
-                /* Match a single character repeatedly; different opcodes share code. */
+            /* Match a single character repeatedly; different opcodes share code. */
                 
-                BEGIN_OPCODE(EXACT):
+            BEGIN_OPCODE(EXACT):
                 min = stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 minimize = false;
                 stack.currentFrame->args.instructionPtr += 3;
                 goto REPEATCHAR;
                 
-                BEGIN_OPCODE(UPTO):
-                BEGIN_OPCODE(MINUPTO):
+            BEGIN_OPCODE(UPTO):
+            BEGIN_OPCODE(MINUPTO):
                 min = 0;
                 stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO;
                 stack.currentFrame->args.instructionPtr += 3;
                 goto REPEATCHAR;
                 
-                BEGIN_OPCODE(STAR):
-                BEGIN_OPCODE(MINSTAR):
-                BEGIN_OPCODE(PLUS):
-                BEGIN_OPCODE(MINPLUS):
-                BEGIN_OPCODE(QUERY):
-                BEGIN_OPCODE(MINQUERY):
+            BEGIN_OPCODE(STAR):
+            BEGIN_OPCODE(MINSTAR):
+            BEGIN_OPCODE(PLUS):
+            BEGIN_OPCODE(MINPLUS):
+            BEGIN_OPCODE(QUERY):
+            BEGIN_OPCODE(MINQUERY):
                 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max);
                 
                 /* Common code for all repeated single-character matches. We can give
@@ -1328,15 +1276,13 @@ RECURSE:
                 }
                 /* Control never reaches here */
                 
-                /* Match a negated single one-byte character. The character we are
-                 checking can be multibyte. */
+            /* Match a negated single one-byte character. */
                 
-                BEGIN_OPCODE(NOT):
-            {
+            BEGIN_OPCODE(NOT): {
                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                     RRETURN_NO_MATCH;
                 stack.currentFrame->args.instructionPtr++;
-                int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                int c = *stack.currentFrame->args.subjectPtr++;
                 if (md.ignoreCase) {
                     if (c < 128)
                         c = toLowerCase(c);
@@ -1349,38 +1295,38 @@ RECURSE:
                 NEXT_OPCODE;
             }
                 
-                /* Match a negated single one-byte character repeatedly. This is almost a
-                 repeat of the code for a repeated single character, but I haven't found a
-                 nice way of commoning these up that doesn't require a test of the
-                 positive/negative option for each character match. Maybe that wouldn't add
-                 very much to the time taken, but character matching *is* what this is all
-                 about... */
+            /* Match a negated single one-byte character repeatedly. This is almost a
+             repeat of the code for a repeated single character, but I haven't found a
+             nice way of commoning these up that doesn't require a test of the
+             positive/negative option for each character match. Maybe that wouldn't add
+             very much to the time taken, but character matching *is* what this is all
+             about... */
                 
-                BEGIN_OPCODE(NOTEXACT):
+            BEGIN_OPCODE(NOTEXACT):
                 min = stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 minimize = false;
                 stack.currentFrame->args.instructionPtr += 3;
                 goto REPEATNOTCHAR;
                 
-                BEGIN_OPCODE(NOTUPTO):
-                BEGIN_OPCODE(NOTMINUPTO):
+            BEGIN_OPCODE(NOTUPTO):
+            BEGIN_OPCODE(NOTMINUPTO):
                 min = 0;
                 stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO;
                 stack.currentFrame->args.instructionPtr += 3;
                 goto REPEATNOTCHAR;
                 
-                BEGIN_OPCODE(NOTSTAR):
-                BEGIN_OPCODE(NOTMINSTAR):
-                BEGIN_OPCODE(NOTPLUS):
-                BEGIN_OPCODE(NOTMINPLUS):
-                BEGIN_OPCODE(NOTQUERY):
-                BEGIN_OPCODE(NOTMINQUERY):
+            BEGIN_OPCODE(NOTSTAR):
+            BEGIN_OPCODE(NOTMINSTAR):
+            BEGIN_OPCODE(NOTPLUS):
+            BEGIN_OPCODE(NOTMINPLUS):
+            BEGIN_OPCODE(NOTQUERY):
+            BEGIN_OPCODE(NOTMINQUERY):
                 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max);
                 
-                /* Common code for all repeated single-byte matches. We can give up quickly
-                 if there are fewer than the minimum number of bytes left in the
-                 subject. */
+            /* Common code for all repeated single-byte matches. We can give up quickly
+             if there are fewer than the minimum number of bytes left in the
+             subject. */
                 
             REPEATNOTCHAR:
                 if (min > md.end_subject - stack.currentFrame->args.subjectPtr)
@@ -1401,14 +1347,12 @@ RECURSE:
                     if (stack.currentFrame->locals.fc < 128)
                         stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc);
                     
-                    {
-                        for (int i = 1; i <= min; i++) {
-                            int d = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                            if (d < 128)
-                                d = toLowerCase(d);
-                            if (stack.currentFrame->locals.fc == d)
-                                RRETURN_NO_MATCH;
-                        }
+                    for (int i = 1; i <= min; i++) {
+                        int d = *stack.currentFrame->args.subjectPtr++;
+                        if (d < 128)
+                            d = toLowerCase(d);
+                        if (stack.currentFrame->locals.fc == d)
+                            RRETURN_NO_MATCH;
                     }
                     
                     if (min == stack.currentFrame->locals.max)
@@ -1419,7 +1363,7 @@ RECURSE:
                             RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
                             if (is_match)
                                 RRETURN;
-                            int d = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                            int d = *stack.currentFrame->args.subjectPtr++;
                             if (d < 128)
                                 d = toLowerCase(d);
                             if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.end_subject || stack.currentFrame->locals.fc == d)
@@ -1436,13 +1380,12 @@ RECURSE:
                         for (int i = min; i < stack.currentFrame->locals.max; i++) {
                             if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                 break;
-                            int length;
-                            int d = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                            int d = *stack.currentFrame->args.subjectPtr;
                             if (d < 128)
                                 d = toLowerCase(d);
                             if (stack.currentFrame->locals.fc == d)
                                 break;
-                            stack.currentFrame->args.subjectPtr += length;
+                            ++stack.currentFrame->args.subjectPtr;
                         }
                         for (;;) {
                             RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
@@ -1450,7 +1393,6 @@ RECURSE:
                                 RRETURN;
                             if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
                                 break;        /* Stop if tried at original pos */
-                            movePtrToStartOfCurrentChar(stack.currentFrame->args.subjectPtr);
                         }
                         
                         RRETURN;
@@ -1462,7 +1404,7 @@ RECURSE:
                 
                 else {
                     for (int i = 1; i <= min; i++) {
-                        int d = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                        int d = *stack.currentFrame->args.subjectPtr++;
                         if (stack.currentFrame->locals.fc == d)
                             RRETURN_NO_MATCH;
                     }
@@ -1475,7 +1417,7 @@ RECURSE:
                             RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
                             if (is_match)
                                 RRETURN;
-                            int d = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
+                            int d = *stack.currentFrame->args.subjectPtr++;
                             if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.end_subject || stack.currentFrame->locals.fc == d)
                                 RRETURN;
                         }
@@ -1487,55 +1429,51 @@ RECURSE:
                     else {
                         stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
                         
-                        {
-                            for (int i = min; i < stack.currentFrame->locals.max; i++) {
-                                if (stack.currentFrame->args.subjectPtr >= md.end_subject)
-                                    break;
-                                int length;
-                                int d = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
-                                if (stack.currentFrame->locals.fc == d)
-                                    break;
-                                stack.currentFrame->args.subjectPtr += length;
-                            }
-                            for (;;) {
-                                RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
-                                if (is_match)
-                                    RRETURN;
-                                if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
-                                    break;        /* Stop if tried at original pos */
-                                movePtrToStartOfCurrentChar(stack.currentFrame->args.subjectPtr);
-                            }
+                        for (int i = min; i < stack.currentFrame->locals.max; i++) {
+                            if (stack.currentFrame->args.subjectPtr >= md.end_subject)
+                                break;
+                            int d = *stack.currentFrame->args.subjectPtr;
+                            if (stack.currentFrame->locals.fc == d)
+                                break;
+                            ++stack.currentFrame->args.subjectPtr;
                         }
-                        
+                        for (;;) {
+                            RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.subpatternStart);
+                            if (is_match)
+                                RRETURN;
+                            if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
+                                break;        /* Stop if tried at original pos */
+                        }
+
                         RRETURN;
                     }
                 }
                 /* Control never reaches here */
                 
-                /* Match a single character type repeatedly; several different opcodes
-                 share code. This is very similar to the code for single characters, but we
-                 repeat it in the interests of efficiency. */
+            /* Match a single character type repeatedly; several different opcodes
+             share code. This is very similar to the code for single characters, but we
+             repeat it in the interests of efficiency. */
                 
-                BEGIN_OPCODE(TYPEEXACT):
+            BEGIN_OPCODE(TYPEEXACT):
                 min = stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 minimize = true;
                 stack.currentFrame->args.instructionPtr += 3;
                 goto REPEATTYPE;
                 
-                BEGIN_OPCODE(TYPEUPTO):
-                BEGIN_OPCODE(TYPEMINUPTO):
+            BEGIN_OPCODE(TYPEUPTO):
+            BEGIN_OPCODE(TYPEMINUPTO):
                 min = 0;
                 stack.currentFrame->locals.max = get2ByteOpcodeValueAtOffset(stack.currentFrame->args.instructionPtr, 1);
                 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO;
                 stack.currentFrame->args.instructionPtr += 3;
                 goto REPEATTYPE;
                 
-                BEGIN_OPCODE(TYPESTAR):
-                BEGIN_OPCODE(TYPEMINSTAR):
-                BEGIN_OPCODE(TYPEPLUS):
-                BEGIN_OPCODE(TYPEMINPLUS):
-                BEGIN_OPCODE(TYPEQUERY):
-                BEGIN_OPCODE(TYPEMINQUERY):
+            BEGIN_OPCODE(TYPESTAR):
+            BEGIN_OPCODE(TYPEMINSTAR):
+            BEGIN_OPCODE(TYPEPLUS):
+            BEGIN_OPCODE(TYPEMINPLUS):
+            BEGIN_OPCODE(TYPEQUERY):
+            BEGIN_OPCODE(TYPEMINQUERY):
                 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max);
                 
                 /* Common code for all repeated single character type matches. Note that
@@ -1548,77 +1486,69 @@ RECURSE:
                 /* First, ensure the minimum number of matches are present. Use inline
                  code for maximizing the speed, and do the type test once at the start
                  (i.e. keep it out of the loop). Also we can test that there are at least
-                 the minimum number of bytes before we start. This isn't as effective in
-                 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
-                 is tidier. Also separate the UCP code, which can be the same for both UTF-8
-                 and single-bytes. */
+                 the minimum number of characters before we start. */
                 
                 if (min > md.end_subject - stack.currentFrame->args.subjectPtr)
                     RRETURN_NO_MATCH;
                 if (min > 0) {
-                    switch(stack.currentFrame->locals.ctype) {
-                        case OP_ANY_CHAR:
+                    switch (stack.currentFrame->locals.ctype) {
+                        case OP_NOT_NEWLINE:
                             for (int i = 1; i <= min; i++) {
                                 if (isNewline(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                if (!movePtrToNextChar(stack.currentFrame->args.subjectPtr, md.end_subject))
-                                    RRETURN_NO_MATCH;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_NOT_DIGIT:
+                        case OP_NOT_DIGIT:
                             for (int i = 1; i <= min; i++) {
                                 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                if (!movePtrToNextChar(stack.currentFrame->args.subjectPtr, md.end_subject))
-                                    RRETURN_NO_MATCH;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_DIGIT:
+                        case OP_DIGIT:
                             for (int i = 1; i <= min; i++) {
-                                // FIXME: Why do we advance the subjectPtr here but not in OP_WHITESPACE or OP_WORDCHAR ?
-                                if (stack.currentFrame->args.subjectPtr >= md.end_subject || !isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
+                                if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                /* No need to skip more bytes - we know it's a 1-byte character */
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_NOT_WHITESPACE:
+                        case OP_NOT_WHITESPACE:
                             for (int i = 1; i <= min; i++) {
                                 if (isSpaceChar(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                if (!movePtrToNextChar(stack.currentFrame->args.subjectPtr, md.end_subject))
-                                    RRETURN_NO_MATCH;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_WHITESPACE:
+                        case OP_WHITESPACE:
                             for (int i = 1; i <= min; i++) {
-                                if (stack.currentFrame->args.subjectPtr >= md.end_subject || !isSpaceChar(*stack.currentFrame->args.subjectPtr++))
+                                if (!isSpaceChar(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                /* No need to skip more bytes - we know it's a 1-byte character */
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_NOT_WORDCHAR:
+                        case OP_NOT_WORDCHAR:
                             for (int i = 1; i <= min; i++) {
                                 if (isWordChar(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                if (!movePtrToNextChar(stack.currentFrame->args.subjectPtr, md.end_subject))
-                                    RRETURN_NO_MATCH;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_WORDCHAR:
+                        case OP_WORDCHAR:
                             for (int i = 1; i <= min; i++) {
-                                if (stack.currentFrame->args.subjectPtr >= md.end_subject || !isWordChar(*stack.currentFrame->args.subjectPtr++))
+                                if (!isWordChar(*stack.currentFrame->args.subjectPtr))
                                     RRETURN_NO_MATCH;
-                                /* No need to skip more bytes - we know it's a 1-byte character */
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            default:
+                        default:
                             ASSERT_NOT_REACHED();
                             return matchError(JSRegExpErrorInternal, stack);
                     }  /* End switch(stack.currentFrame->locals.ctype) */
@@ -1640,46 +1570,46 @@ RECURSE:
                         if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.end_subject)
                             RRETURN;
                         
-                        int c = getCharAndAdvance(stack.currentFrame->args.subjectPtr);
-                        switch(stack.currentFrame->locals.ctype) {
-                        case OP_ANY_CHAR:
-                            if (isNewline(c))
-                                RRETURN;
-                            break;
-                            
-                        case OP_NOT_DIGIT:
-                            if (isASCIIDigit(c))
-                                RRETURN;
-                            break;
-                            
-                        case OP_DIGIT:
-                            if (!isASCIIDigit(c))
-                                RRETURN;
-                            break;
-                            
-                        case OP_NOT_WHITESPACE:
-                            if (isSpaceChar(c))
-                                RRETURN;
-                            break;
-                            
-                        case OP_WHITESPACE:
-                            if  (!isSpaceChar(c))
-                                RRETURN;
-                            break;
-                            
-                        case OP_NOT_WORDCHAR:
-                            if (isWordChar(c))
-                                RRETURN;
-                            break;
-                            
-                        case OP_WORDCHAR:
-                            if (!isWordChar(c))
-                                RRETURN;
-                            break;
-                            
-                        default:
-                            ASSERT_NOT_REACHED();
-                            return matchError(JSRegExpErrorInternal, stack);
+                        int c = *stack.currentFrame->args.subjectPtr++;
+                        switch (stack.currentFrame->locals.ctype) {
+                            case OP_NOT_NEWLINE:
+                                if (isNewline(c))
+                                    RRETURN;
+                                break;
+                                
+                            case OP_NOT_DIGIT:
+                                if (isASCIIDigit(c))
+                                    RRETURN;
+                                break;
+                                
+                            case OP_DIGIT:
+                                if (!isASCIIDigit(c))
+                                    RRETURN;
+                                break;
+                                
+                            case OP_NOT_WHITESPACE:
+                                if (isSpaceChar(c))
+                                    RRETURN;
+                                break;
+                                
+                            case OP_WHITESPACE:
+                                if (!isSpaceChar(c))
+                                    RRETURN;
+                                break;
+                                
+                            case OP_NOT_WORDCHAR:
+                                if (isWordChar(c))
+                                    RRETURN;
+                                break;
+                                
+                            case OP_WORDCHAR:
+                                if (!isWordChar(c))
+                                    RRETURN;
+                                break;
+                                
+                            default:
+                                ASSERT_NOT_REACHED();
+                                return matchError(JSRegExpErrorInternal, stack);
                         }
                     }
                     /* Control never reaches here */
@@ -1691,108 +1621,82 @@ RECURSE:
                 else {
                     stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;  /* Remember where we started */
                     
-                    switch(stack.currentFrame->locals.ctype) {
-                        case OP_ANY_CHAR:
-                            
-                            /* Special code is required for UTF8, but when the maximum is unlimited
-                             we don't need it, so we repeat the non-UTF8 code. This is probably
-                             worth it, because .* is quite a common idiom. */
-                            
-                            if (stack.currentFrame->locals.max < INT_MAX) {
-                                for (int i = min; i < stack.currentFrame->locals.max; i++) {
-                                    if (stack.currentFrame->args.subjectPtr >= md.end_subject || isNewline(*stack.currentFrame->args.subjectPtr))
-                                        break;
-                                    stack.currentFrame->args.subjectPtr++;
-                                    while (stack.currentFrame->args.subjectPtr < md.end_subject && (*stack.currentFrame->args.subjectPtr & 0xc0) == 0x80)
-                                        stack.currentFrame->args.subjectPtr++;
-                                }
-                            }
-                            
-                            /* Handle unlimited UTF-8 repeat */
-                            
-                            else {
-                                for (int i = min; i < stack.currentFrame->locals.max; i++) {
-                                    if (stack.currentFrame->args.subjectPtr >= md.end_subject || isNewline(*stack.currentFrame->args.subjectPtr))
-                                        break;
-                                    stack.currentFrame->args.subjectPtr++;
-                                }
-                                break;
+                    switch (stack.currentFrame->locals.ctype) {
+                        case OP_NOT_NEWLINE:
+                            for (int i = min; i < stack.currentFrame->locals.max; i++) {
+                                if (stack.currentFrame->args.subjectPtr >= md.end_subject || isNewline(*stack.currentFrame->args.subjectPtr))
+                                    break;
+                                stack.currentFrame->args.subjectPtr++;
                             }
                             break;
                             
-                            case OP_NOT_DIGIT:
+                        case OP_NOT_DIGIT:
                             for (int i = min; i < stack.currentFrame->locals.max; i++) {
                                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                     break;
-                                int length;
-                                int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                                int c = *stack.currentFrame->args.subjectPtr;
                                 if (isASCIIDigit(c))
                                     break;
-                                stack.currentFrame->args.subjectPtr += length;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_DIGIT:
+                        case OP_DIGIT:
                             for (int i = min; i < stack.currentFrame->locals.max; i++) {
                                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                     break;
-                                int length;
-                                int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                                int c = *stack.currentFrame->args.subjectPtr;
                                 if (!isASCIIDigit(c))
                                     break;
-                                stack.currentFrame->args.subjectPtr += length;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_NOT_WHITESPACE:
+                        case OP_NOT_WHITESPACE:
                             for (int i = min; i < stack.currentFrame->locals.max; i++) {
                                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                     break;
-                                int length;
-                                int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                                int c = *stack.currentFrame->args.subjectPtr;
                                 if (isSpaceChar(c))
                                     break;
-                                stack.currentFrame->args.subjectPtr += length;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_WHITESPACE:
+                        case OP_WHITESPACE:
                             for (int i = min; i < stack.currentFrame->locals.max; i++) {
                                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                     break;
-                                int length;
-                                int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                                int c = *stack.currentFrame->args.subjectPtr;
                                 if (!isSpaceChar(c))
                                     break;
-                                stack.currentFrame->args.subjectPtr += length;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_NOT_WORDCHAR:
+                        case OP_NOT_WORDCHAR:
                             for (int i = min; i < stack.currentFrame->locals.max; i++) {
                                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                     break;
-                                int length;
-                                int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                                int c = *stack.currentFrame->args.subjectPtr;
                                 if (isWordChar(c))
                                     break;
-                                stack.currentFrame->args.subjectPtr += length;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            case OP_WORDCHAR:
+                        case OP_WORDCHAR:
                             for (int i = min; i < stack.currentFrame->locals.max; i++) {
                                 if (stack.currentFrame->args.subjectPtr >= md.end_subject)
                                     break;
-                                int length;
-                                int c = getCharAndLength(stack.currentFrame->args.subjectPtr, length);
+                                int c = *stack.currentFrame->args.subjectPtr;
                                 if (!isWordChar(c))
                                     break;
-                                stack.currentFrame->args.subjectPtr += length;
+                                ++stack.currentFrame->args.subjectPtr;
                             }
                             break;
                             
-                            default:
+                        default:
                             ASSERT_NOT_REACHED();
                             return matchError(JSRegExpErrorInternal, stack);
                     }
@@ -1805,7 +1709,6 @@ RECURSE:
                             RRETURN;
                         if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
                             break;        /* Stop if tried at original pos */
-                        movePtrToStartOfCurrentChar(stack.currentFrame->args.subjectPtr);
                     }
                     
                     /* Get here if we can't make it match with any permitted repetitions */
@@ -1814,21 +1717,21 @@ RECURSE:
                 }
                 /* Control never reaches here */
                 
-                BEGIN_OPCODE(CRMINPLUS):
-                BEGIN_OPCODE(CRMINQUERY):
-                BEGIN_OPCODE(CRMINRANGE):
-                BEGIN_OPCODE(CRMINSTAR):
-                BEGIN_OPCODE(CRPLUS):
-                BEGIN_OPCODE(CRQUERY):
-                BEGIN_OPCODE(CRRANGE):
-                BEGIN_OPCODE(CRSTAR):
+            BEGIN_OPCODE(CRMINPLUS):
+            BEGIN_OPCODE(CRMINQUERY):
+            BEGIN_OPCODE(CRMINRANGE):
+            BEGIN_OPCODE(CRMINSTAR):
+            BEGIN_OPCODE(CRPLUS):
+            BEGIN_OPCODE(CRQUERY):
+            BEGIN_OPCODE(CRRANGE):
+            BEGIN_OPCODE(CRSTAR):
                 ASSERT_NOT_REACHED();
                 return matchError(JSRegExpErrorInternal, stack);
                 
 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
             CAPTURING_BRACKET:
 #else
-                default:
+            default:
 #endif
                 /* Opening capturing bracket. If there is space in the offset vector, save
                  the current subject position in the working slot at the top of the vector. We
@@ -1901,8 +1804,7 @@ RECURSE:
 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
     
 RRETURN_SWITCH:
-    switch (stack.currentFrame->returnLocation)
-    {
+    switch (stack.currentFrame->returnLocation) {
         case 0: goto RETURN;
         case 1: goto RRETURN_1;
         case 2: goto RRETURN_2;
@@ -2184,8 +2086,6 @@ int jsRegExpExecute(const JSRegExp* re,
         
         if (returnCode == MATCH_NOMATCH) {
             start_match++;
-            if (start_match < end_subject && isTrailingSurrogate(*start_match))
-                start_match++;
             continue;
         }
         
index a54777d33f2aacf73e431995e0061a42807694c5..d2b1073f549c0d933404e19dbf17070f4c3827a9 100644 (file)
@@ -235,112 +235,6 @@ static inline void put2ByteOpcodeValueAtOffsetAndAdvance(uschar*& opcodePtr, siz
     opcodePtr += 2;
 }
 
-#define LEAD_OFFSET (0xd800 - (0x10000 >> 10))
-#define SURROGATE_OFFSET (0x10000 - (0xd800 << 10) - 0xdc00)
-
-static inline bool isLeadingSurrogate(int c)
-{
-    return ((c & ~0x3ff) == 0xd800);
-}
-
-static inline bool isTrailingSurrogate(int c)
-{
-    return ((c & ~0x3ff) == 0xdc00);
-}
-
-static inline int decodeSurrogatePair(int leadingSurrogate, int trailingSurrogate)
-{
-    return ((leadingSurrogate << 10) + trailingSurrogate + SURROGATE_OFFSET);
-}
-
-static inline int getChar(const UChar* subjectPtr)
-{
-    int c = subjectPtr[0];
-    if (isLeadingSurrogate(c))
-        c = decodeSurrogatePair(c, subjectPtr[1]);
-    return c;
-}
-
-static inline int getCharAndAdvance(const UChar*& subjectPtr)
-{
-    int c = *subjectPtr++;
-    if (isLeadingSurrogate(c))
-        c = decodeSurrogatePair(c, *subjectPtr++);
-    return c;
-}
-
-static inline int getCharAndLength(const UChar*& subjectPtr, int& length)
-{
-    int c = subjectPtr[0];
-    if (isLeadingSurrogate(c)) {
-        c = decodeSurrogatePair(c, subjectPtr[1]);
-        length = 2;
-    } else
-        length = 1;
-    return c;
-}
-
-// FIXME: All (2) calls to this funtion should be removed and replaced with
-// calls to getCharAndAdvance
-static inline int getCharAndAdvanceIfSurrogate(const UChar*& subjectPtr)
-{
-    int c = subjectPtr[0];
-    if (isLeadingSurrogate(c)) {
-        c = decodeSurrogatePair(c, subjectPtr[1]);
-        subjectPtr++;
-    }
-    return c;
-}
-
-// This flavor checks to make sure we don't walk off the end
-// FIXME: This could also be removed and an end-aware getCharAndAdvance added instead.
-static inline int getCharAndAdvanceIfSurrogate(const UChar*& subjectPtr, const UChar* end)
-{
-    int c = subjectPtr[0];
-    if (isLeadingSurrogate(c)) {
-        if (subjectPtr + 1 < end)
-            c = decodeSurrogatePair(c, subjectPtr[1]);
-        else
-            c = decodeSurrogatePair(c, 0);
-        subjectPtr++;
-    }
-    return c;
-}
-
-static inline int getPreviousChar(const UChar* subjectPtr)
-{
-    int valueAtSubjectMinusOne = subjectPtr[-1];
-    if (isTrailingSurrogate(valueAtSubjectMinusOne))
-        return decodeSurrogatePair(subjectPtr[-2], valueAtSubjectMinusOne);
-    return valueAtSubjectMinusOne;
-}
-
-static inline void movePtrToPreviousChar(const UChar*& subjectPtr)
-{
-    subjectPtr--;
-    if (isTrailingSurrogate(*subjectPtr))
-        subjectPtr--;
-}
-
-static inline bool movePtrToNextChar(const UChar*& subjectPtr, const UChar* endSubject)
-{
-    if (subjectPtr < endSubject) {
-        subjectPtr++;
-        if (subjectPtr < endSubject && isTrailingSurrogate(*subjectPtr)) {
-            subjectPtr++;
-            return subjectPtr < endSubject;
-        }
-        return true;
-    }
-    return false;
-}
-
-static inline void movePtrToStartOfCurrentChar(const UChar*& subjectPtr)
-{
-    if (isTrailingSurrogate(*subjectPtr))
-        subjectPtr--;
-}
-
 // FIXME: These are really more of a "compiled regexp state" than "regexp options"
 enum RegExpOptions {
     UseFirstByteOptimizationOption = 0x40000000,  /* first_byte is set */
@@ -382,10 +276,9 @@ contain UTF-8 characters with values greater than 255. */
 /* These are escaped items that aren't just an encoding of a particular data
 value such as \n. They must have non-zero values, as check_escape() returns
 their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY_CHAR because it
-corresponds to "." rather than an escape sequence. The final one must be
+definitions below, up to ESC_w. The final one must be
 ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
-tests in the code for an escape greater than ESC_b and less than ESC_Z to
+tests in the code for an escape > ESC_b and <= ESC_w to
 detect the types that may be repeated. These are the types that consume
 characters. If any new escapes are put in between that don't consume a
 character, that code will have to change. */
@@ -410,7 +303,7 @@ must also be updated to match. */
     macro(NOT_WORDCHAR) \
     macro(WORDCHAR) \
     \
-    macro(ANY_CHAR) \
+    macro(NOT_NEWLINE) \
     \
     macro(CIRC) \
     macro(DOLL) \
index 9ffc8efb7e183d97eb4b6b5b02d0493f353c1459..2c35080216255cef7131ac435deb3f78fa14d511 100644 (file)
@@ -1,3 +1,14 @@
+2007-11-30  Darin Adler  <darin@apple.com>
+
+        Reviewed by Adam Roben.
+
+        - test for http://bugs.webkit.org/show_bug.cgi?id=16207
+          JavaScript regular expressions should match UTF-16 code units rather than characters
+
+        * fast/js/regexp-non-bmp-expected.txt: Added.
+        * fast/js/regexp-non-bmp.html: Added.
+        * fast/js/resources/regexp-non-bmp.js: Added.
+
 2007-11-30  Adele Peterson  <adele@apple.com>
 
         Reviewed by Darin.
diff --git a/LayoutTests/fast/js/regexp-non-bmp-expected.txt b/LayoutTests/fast/js/regexp-non-bmp-expected.txt
new file mode 100644 (file)
index 0000000..2e43854
--- /dev/null
@@ -0,0 +1,18 @@
+Tests that regular expressions treat non-BMP characters as two separate characters. From a Unicode correctness point of view this is wrong, but it is what other browsers do. And given that we store strings as UTF-16, it is also more efficient to implement. Also test some other cases related to UTF-8 and UTF-16.
+
+On success, you will see a series of "PASS" messages, followed by "TEST COMPLETE".
+
+
+PASS /./.exec(surrogatePair).toString().length is 1
+PASS /\D/.exec(surrogatePair).toString().length is 1
+PASS /\S/.exec(surrogatePair).toString().length is 1
+PASS /\W/.exec(surrogatePair).toString().length is 1
+PASS /[^x]/.exec(surrogatePair).toString().length is 1
+
+PASS /.{1,2}/.exec("!!" + String.fromCharCode(0xA1)).toString().length is 2
+PASS /./.exec("") is null
+
+PASS successfullyParsed is true
+
+TEST COMPLETE
+
diff --git a/LayoutTests/fast/js/regexp-non-bmp.html b/LayoutTests/fast/js/regexp-non-bmp.html
new file mode 100644 (file)
index 0000000..e6c5f2f
--- /dev/null
@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
+<html>
+<head>
+<link rel="stylesheet" href="resources/js-test-style.css">
+<script src="resources/js-test-pre.js"></script>
+</head>
+<body>
+<p id="description"></p>
+<div id="console"></div>
+<script src="resources/regexp-non-bmp.js"></script>
+<script src="resources/js-test-post.js"></script>
+</body>
+</html>
diff --git a/LayoutTests/fast/js/resources/regexp-non-bmp.js b/LayoutTests/fast/js/resources/regexp-non-bmp.js
new file mode 100644 (file)
index 0000000..7b96593
--- /dev/null
@@ -0,0 +1,23 @@
+description(
+'Tests that regular expressions treat non-BMP characters as two separate characters. '
++ 'From a Unicode correctness point of view this is wrong, but it is what other browsers do. '
++ 'And given that we store strings as UTF-16, it is also more efficient to implement. '
++ 'Also test some other cases related to UTF-8 and UTF-16.'
+);
+
+var surrogatePair = String.fromCharCode(0xD800) + String.fromCharCode(0xDC00);
+
+shouldBe('/./.exec(surrogatePair).toString().length', '1');
+shouldBe('/\\D/.exec(surrogatePair).toString().length', '1');
+shouldBe('/\\S/.exec(surrogatePair).toString().length', '1');
+shouldBe('/\\W/.exec(surrogatePair).toString().length', '1');
+shouldBe('/[^x]/.exec(surrogatePair).toString().length', '1');
+
+debug('');
+
+shouldBe('/.{1,2}/.exec("!!" + String.fromCharCode(0xA1)).toString().length', '2');
+shouldBe('/./.exec("")', 'null');
+
+debug('');
+
+var successfullyParsed = true;