f31e553775e6b976302eb246dac71fa3514cba26
[WebKit-https.git] / JavaScriptCore / yarr / RegexInterpreter.cpp
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24  */
25
26 #include "config.h"
27 #include "RegexInterpreter.h"
28
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31
32 #ifndef NDEBUG
33 #include <stdio.h>
34 #endif
35
36 #if ENABLE(YARR)
37
38 using namespace WTF;
39
40 namespace JSC { namespace Yarr {
41
42 class Interpreter {
43 public:
44     struct ParenthesesDisjunctionContext;
45
46     struct BackTrackInfoPatternCharacter {
47         uintptr_t matchAmount;
48     };
49     struct BackTrackInfoCharacterClass {
50         uintptr_t matchAmount;
51     };
52     struct BackTrackInfoBackReference {
53         uintptr_t begin; // Not really needed for greedy quantifiers.
54         uintptr_t matchAmount; // Not really needed for fixed quantifiers.
55     };
56     struct BackTrackInfoAlternative {
57         uintptr_t offset;
58     };
59     struct BackTrackInfoParentheticalAssertion {
60         uintptr_t begin;
61     };
62     struct BackTrackInfoParenthesesOnce {
63         uintptr_t inParentheses;
64     };
65     struct BackTrackInfoParentheses {
66         uintptr_t matchAmount;
67         ParenthesesDisjunctionContext* lastContext;
68         uintptr_t prevBegin;
69         uintptr_t prevEnd;
70     };
71
72     static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
73     {
74         context->next = backTrack->lastContext;
75         backTrack->lastContext = context;
76         ++backTrack->matchAmount;
77     }
78
79     static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
80     {
81         ASSERT(backTrack->matchAmount);
82         ASSERT(backTrack->lastContext);
83         backTrack->lastContext = backTrack->lastContext->next;
84         --backTrack->matchAmount;
85     }
86
87     struct DisjunctionContext
88     {
89         DisjunctionContext()
90             : term(0)
91         {
92         }
93         
94         void* operator new(size_t, void* where)
95         {
96             return where;
97         }
98
99         int term;
100         unsigned matchBegin;
101         unsigned matchEnd;
102         uintptr_t frame[1];
103     };
104
105     DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
106     {
107         return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
108     }
109
110     void freeDisjunctionContext(DisjunctionContext* context)
111     {
112         free(context);
113     }
114
115     struct ParenthesesDisjunctionContext
116     {
117         ParenthesesDisjunctionContext(int* output, ByteTerm& term)
118             : next(0)
119         {
120             unsigned firstSubpatternId = term.atom.subpatternId;
121             unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
122
123             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
124                 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
125                 output[(firstSubpatternId << 1) + i] = -1;
126             }
127             
128             new(getDisjunctionContext(term)) DisjunctionContext();
129         }
130
131         void* operator new(size_t, void* where)
132         {
133             return where;
134         }
135
136         void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
137         {
138             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
139                 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
140         }
141         
142         DisjunctionContext* getDisjunctionContext(ByteTerm& term)
143         {
144             return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
145         }
146
147         ParenthesesDisjunctionContext* next;
148         int subpatternBackup[1];
149     };
150
151     ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
152     {
153         return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
154     }
155
156     void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
157     {
158         free(context);
159     }
160
161     class InputStream {
162     public:
163         InputStream(const UChar* input, unsigned start, unsigned length)
164             : input(input)
165             , pos(start)
166             , length(length)
167         {
168         }
169
170         void next()
171         {
172             ++pos;
173         }
174
175         void rewind(unsigned amount)
176         {
177             ASSERT(pos >= amount);
178             pos -= amount;
179         }
180
181         int read()
182         {
183             ASSERT(pos < length);
184             if (pos < length)
185                 return input[pos];
186             return -1;
187         }
188
189         int readChecked(int position)
190         {
191             ASSERT(position < 0);
192             ASSERT((unsigned)-position <= pos);
193             unsigned p = pos + position;
194             ASSERT(p < length);
195             return input[p];
196         }
197
198         int reread(unsigned from)
199         {
200             ASSERT(from < length);
201             return input[from];
202         }
203
204         int prev()
205         {
206             ASSERT(!(pos > length));
207             if (pos && length)
208                 return input[pos - 1];
209             return -1;
210         }
211         
212         unsigned getPos()
213         {
214             return pos;
215         }
216
217         void setPos(unsigned p)
218         {
219             pos = p;
220         }
221         
222         bool atStart()
223         {
224             return pos == 0;
225         }
226
227         bool atEnd()
228         {
229             return pos == length;
230         }
231
232         bool checkInput(int count)
233         {
234             if ((pos + count) <= length) {
235                 pos += count;
236                 return true;
237             } else
238                 return false;
239         }
240
241         void uncheckInput(int count)
242         {
243             pos -= count;
244         }
245
246         bool atStart(int position)
247         {
248             return (pos + position) == 0;
249         }
250
251         bool atEnd(int position)
252         {
253             return (pos + position) == length;
254         }
255
256     private:
257         const UChar* input;
258         unsigned pos;
259         unsigned length;
260     };
261
262     bool testCharacterClass(CharacterClass* characterClass, int ch)
263     {
264         if (ch & 0xFF80) {
265             for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
266                 if (ch == characterClass->m_matchesUnicode[i])
267                     return true;
268             for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
269                 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
270                     return true;
271         } else {
272             for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
273                 if (ch == characterClass->m_matches[i])
274                     return true;
275             for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
276                 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
277                     return true;
278         }
279
280         return false;
281     }
282
283     bool tryConsumeCharacter(int testChar)
284     {
285         if (input.atEnd())
286             return false;
287         
288         int ch = input.read();
289
290         if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) {
291             input.next();
292             return true;
293         }
294         return false;
295     }
296
297     bool checkCharacter(int testChar, int inputPosition)
298     {
299         return testChar == input.readChecked(inputPosition);
300     }
301
302     bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
303     {
304         int ch = input.readChecked(inputPosition);
305         return (loChar == ch) || (hiChar == ch);
306     }
307
308     bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert)
309     {
310         if (input.atEnd())
311             return false;
312
313         bool match = testCharacterClass(characterClass, input.read());
314
315         if (invert)
316             match = !match;
317
318         if (match) {
319             input.next();
320             return true;
321         }
322         return false;
323     }
324
325     bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
326     {
327         bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
328         return invert ? !match : match;
329     }
330
331     bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
332     {
333         int matchSize = matchEnd - matchBegin;
334
335         if (!input.checkInput(matchSize))
336             return false;
337
338         for (int i = 0; i < matchSize; ++i) {
339             if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
340                 input.uncheckInput(matchSize);
341                 return false;
342             }
343         }
344         
345         return true;
346     }
347
348     bool matchAssertionBOL(ByteTerm& term)
349     {
350         return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
351     }
352
353     bool matchAssertionEOL(ByteTerm& term)
354     {
355         if (term.inputPosition)
356             return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
357         else
358             return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
359     }
360
361     bool matchAssertionWordBoundary(ByteTerm& term)
362     {
363         bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
364         bool readIsWordchar;
365         if (term.inputPosition)
366             readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
367         else
368             readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
369
370         bool wordBoundary = prevIsWordchar != readIsWordchar;
371         return term.invert() ? !wordBoundary : wordBoundary;
372     }
373
374     bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
375     {
376         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
377
378         switch (term.atom.quantityType) {
379         case QuantifierFixedCount:
380             break;
381
382         case QuantifierGreedy:
383             if (backTrack->matchAmount) {
384                 --backTrack->matchAmount;
385                 input.uncheckInput(1);
386                 return true;
387             }
388             break;
389
390         case QuantifierNonGreedy:
391             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
392                 ++backTrack->matchAmount;
393                 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
394                     return true;
395             }
396             input.uncheckInput(backTrack->matchAmount);
397             break;
398         }
399
400         return false;
401     }
402
403     bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
404     {
405         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
406
407         switch (term.atom.quantityType) {
408         case QuantifierFixedCount:
409             break;
410
411         case QuantifierGreedy:
412             if (backTrack->matchAmount) {
413                 --backTrack->matchAmount;
414                 input.uncheckInput(1);
415                 return true;
416             }
417             break;
418
419         case QuantifierNonGreedy:
420             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
421                 ++backTrack->matchAmount;
422                 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
423                     return true;
424             }
425             input.uncheckInput(backTrack->matchAmount);
426             break;
427         }
428
429         return false;
430     }
431
432     bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
433     {
434         ASSERT(term.type == ByteTerm::TypeCharacterClass);
435         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
436
437         switch (term.atom.quantityType) {
438         case QuantifierFixedCount: {
439             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
440                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
441                     return false;
442             }
443             return true;
444         }
445
446         case QuantifierGreedy: {
447             unsigned matchAmount = 0;
448             while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
449                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
450                     input.uncheckInput(1);
451                     break;
452                 }
453                 ++matchAmount;
454             }
455             backTrack->matchAmount = matchAmount;
456
457             return true;
458         }
459
460         case QuantifierNonGreedy:
461             backTrack->matchAmount = 0;
462             return true;
463         }
464
465         ASSERT_NOT_REACHED();
466         return false;
467     }
468
469     bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
470     {
471         ASSERT(term.type == ByteTerm::TypeCharacterClass);
472         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
473
474         switch (term.atom.quantityType) {
475         case QuantifierFixedCount:
476             break;
477
478         case QuantifierGreedy:
479             if (backTrack->matchAmount) {
480                 --backTrack->matchAmount;
481                 input.uncheckInput(1);
482                 return true;
483             }
484             break;
485
486         case QuantifierNonGreedy:
487             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
488                 ++backTrack->matchAmount;
489                 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
490                     return true;
491             }
492             input.uncheckInput(backTrack->matchAmount);
493             break;
494         }
495
496         return false;
497     }
498
499     bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
500     {
501         ASSERT(term.type == ByteTerm::TypeBackReference);
502         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
503
504         int matchBegin = output[(term.atom.subpatternId << 1)];
505         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
506         ASSERT((matchBegin == -1) == (matchEnd == -1));
507         ASSERT(matchBegin <= matchEnd);
508
509         if (matchBegin == matchEnd)
510             return true;
511
512         switch (term.atom.quantityType) {
513         case QuantifierFixedCount: {
514             backTrack->begin = input.getPos();
515             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
516                 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
517                     input.setPos(backTrack->begin);
518                     return false;
519                 }
520             }
521             return true;
522         }
523
524         case QuantifierGreedy: {
525             unsigned matchAmount = 0;
526             while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
527                 ++matchAmount;
528             backTrack->matchAmount = matchAmount;
529             return true;
530         }
531
532         case QuantifierNonGreedy:
533             backTrack->begin = input.getPos();
534             backTrack->matchAmount = 0;
535             return true;
536         }
537
538         ASSERT_NOT_REACHED();
539         return false;
540     }
541
542     bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
543     {
544         ASSERT(term.type == ByteTerm::TypeBackReference);
545         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
546
547         int matchBegin = output[(term.atom.subpatternId << 1)];
548         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
549         ASSERT((matchBegin == -1) == (matchEnd == -1));
550         ASSERT(matchBegin <= matchEnd);
551
552         if (matchBegin == matchEnd)
553             return false;
554
555         switch (term.atom.quantityType) {
556         case QuantifierFixedCount:
557             // for quantityCount == 1, could rewind.
558             input.setPos(backTrack->begin);
559             break;
560
561         case QuantifierGreedy:
562             if (backTrack->matchAmount) {
563                 --backTrack->matchAmount;
564                 input.rewind(matchEnd - matchBegin);
565                 return true;
566             }
567             break;
568
569         case QuantifierNonGreedy:
570             if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
571                 ++backTrack->matchAmount;
572                 return true;
573             } else
574                 input.setPos(backTrack->begin);
575             break;
576         }
577
578         return false;
579     }
580
581     void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
582     {
583         if (term.capture()) {
584             unsigned subpatternId = term.atom.subpatternId;
585             output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
586             output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
587         }
588     }
589     void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
590     {
591         unsigned firstSubpatternId = term.atom.subpatternId;
592         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
593         context->restoreOutput(output, firstSubpatternId, count);
594     }
595     void resetAssertionMatches(ByteTerm& term)
596     {
597         unsigned firstSubpatternId = term.atom.subpatternId;
598         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
599         for (unsigned i = 0; i < (count << 1); ++i)
600             output[(firstSubpatternId << 1) + i] = -1;
601     }
602     bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
603     {
604         while (backTrack->matchAmount) {
605             ParenthesesDisjunctionContext* context = backTrack->lastContext;
606
607             if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
608                 return true;
609             
610             resetMatches(term, context);
611             freeParenthesesDisjunctionContext(context);
612             popParenthesesDisjunctionContext(backTrack);
613         }
614
615         return false;
616     }
617
618     bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
619     {
620         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
621         ASSERT(term.atom.quantityCount == 1);
622
623         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
624
625         switch (term.atom.quantityType) {
626         case QuantifierGreedy: {
627             // set this speculatively; if we get to the parens end this will be true.
628             backTrack->inParentheses = 1;
629             break;
630         }
631         case QuantifierNonGreedy: {
632             backTrack->inParentheses = 0;
633             context->term += term.atom.parenthesesWidth;
634             return true;
635         }
636         case QuantifierFixedCount:
637             break;
638         }
639
640         if (term.capture()) {
641             unsigned subpatternId = term.atom.subpatternId;
642             output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
643         }
644
645         return true;
646     }
647
648     bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
649     {
650         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
651         ASSERT(term.atom.quantityCount == 1);
652
653         if (term.capture()) {
654             unsigned subpatternId = term.atom.subpatternId;
655             output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
656         }
657         return true;
658     }
659
660     bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
661     {
662         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
663         ASSERT(term.atom.quantityCount == 1);
664
665         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
666
667         if (term.capture()) {
668             unsigned subpatternId = term.atom.subpatternId;
669             output[(subpatternId << 1)] = -1;
670             output[(subpatternId << 1) + 1] = -1;
671         }
672
673         switch (term.atom.quantityType) {
674         case QuantifierGreedy:
675             // if we backtrack to this point, there is another chance - try matching nothing.
676             ASSERT(backTrack->inParentheses);
677             backTrack->inParentheses = 0;
678             context->term += term.atom.parenthesesWidth;
679             return true;
680         case QuantifierNonGreedy:
681             ASSERT(backTrack->inParentheses);
682         case QuantifierFixedCount:
683             break;
684         }
685
686         return false;
687     }
688
689     bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
690     {
691         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
692         ASSERT(term.atom.quantityCount == 1);
693
694         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
695
696         switch (term.atom.quantityType) {
697         case QuantifierGreedy:
698             if (!backTrack->inParentheses) {
699                 context->term -= term.atom.parenthesesWidth;
700                 return false;
701             }
702         case QuantifierNonGreedy:
703             if (!backTrack->inParentheses) {
704                 // now try to match the parens; set this speculatively.
705                 backTrack->inParentheses = 1;
706                 if (term.capture()) {
707                     unsigned subpatternId = term.atom.subpatternId;
708                     output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
709                 }
710                 context->term -= term.atom.parenthesesWidth;
711                 return true;
712             }
713         case QuantifierFixedCount:
714             break;
715         }
716
717         return false;
718     }
719
720     bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
721     {
722         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
723         ASSERT(term.atom.quantityCount == 1);
724
725         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
726
727         backTrack->begin = input.getPos();
728         return true;
729     }
730
731     bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
732     {
733         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
734         ASSERT(term.atom.quantityCount == 1);
735
736         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
737
738         input.setPos(backTrack->begin);
739
740         // We've reached the end of the parens; if they are inverted, this is failure.
741         if (term.invert()) {
742             context->term -= term.atom.parenthesesWidth;
743             return false;
744         }
745
746         return true;
747     }
748
749     bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
750     {
751         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
752         ASSERT(term.atom.quantityCount == 1);
753
754         // We've failed to match parens; if they are inverted, this is win!
755         if (term.invert()) {
756             context->term += term.atom.parenthesesWidth;
757             return true;
758         }
759
760         return false;
761     }
762
763     bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
764     {
765         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
766         ASSERT(term.atom.quantityCount == 1);
767
768         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
769
770         input.setPos(backTrack->begin);
771
772         context->term -= term.atom.parenthesesWidth;
773         return false;
774     }
775
776     bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
777     {
778         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
779
780         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
781
782         unsigned subpatternId = term.atom.subpatternId;
783         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
784
785         backTrack->prevBegin = output[(subpatternId << 1)];
786         backTrack->prevEnd = output[(subpatternId << 1) + 1];
787
788         backTrack->matchAmount = 0;
789         backTrack->lastContext = 0;
790
791         switch (term.atom.quantityType) {
792         case QuantifierFixedCount: {
793             // While we haven't yet reached our fixed limit,
794             while (backTrack->matchAmount < term.atom.quantityCount) {
795                 // Try to do a match, and it it succeeds, add it to the list.
796                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
797                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
798                     appendParenthesesDisjunctionContext(backTrack, context);
799                 else {
800                     // The match failed; try to find an alternate point to carry on from.
801                     resetMatches(term, context);
802                     freeParenthesesDisjunctionContext(context);
803                     if (!parenthesesDoBacktrack(term, backTrack))
804                         return false;
805                 }
806             }
807
808             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
809             ParenthesesDisjunctionContext* context = backTrack->lastContext;
810             recordParenthesesMatch(term, context);
811             return true;
812         }
813
814         case QuantifierGreedy: {
815             while (backTrack->matchAmount < term.atom.quantityCount) {
816                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
817                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
818                     appendParenthesesDisjunctionContext(backTrack, context);
819                 else {
820                     resetMatches(term, context);
821                     freeParenthesesDisjunctionContext(context);
822                     break;
823                 }
824             }
825
826             if (backTrack->matchAmount) {
827                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
828                 recordParenthesesMatch(term, context);
829             }
830             return true;
831         }
832
833         case QuantifierNonGreedy:
834             return true;
835         }
836
837         ASSERT_NOT_REACHED();
838         return false;
839     }
840
841     // Rules for backtracking differ depending on whether this is greedy or non-greedy.
842     //
843     // Greedy matches never should try just adding more - you should already have done
844     // the 'more' cases.  Always backtrack, at least a leetle bit.  However cases where
845     // you backtrack an item off the list needs checking, since we'll never have matched
846     // the one less case.  Tracking forwards, still add as much as possible.
847     //
848     // Non-greedy, we've already done the one less case, so don't match on popping.
849     // We haven't done the one more case, so always try to add that.
850     //
851     bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
852     {
853         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
854
855         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
856
857         if (term.capture()) {
858             unsigned subpatternId = term.atom.subpatternId;
859             output[(subpatternId << 1)] = backTrack->prevBegin;
860             output[(subpatternId << 1) + 1] = backTrack->prevEnd;
861         }
862
863         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
864
865         switch (term.atom.quantityType) {
866         case QuantifierFixedCount: {
867             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
868
869             ParenthesesDisjunctionContext* context = 0;
870
871             if (!parenthesesDoBacktrack(term, backTrack))
872                 return false;
873
874             // While we haven't yet reached our fixed limit,
875             while (backTrack->matchAmount < term.atom.quantityCount) {
876                 // Try to do a match, and it it succeeds, add it to the list.
877                 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
878                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
879                     appendParenthesesDisjunctionContext(backTrack, context);
880                 else {
881                     // The match failed; try to find an alternate point to carry on from.
882                     resetMatches(term, context);
883                     freeParenthesesDisjunctionContext(context);
884                     if (!parenthesesDoBacktrack(term, backTrack))
885                         return false;
886                 }
887             }
888
889             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
890             context = backTrack->lastContext;
891             recordParenthesesMatch(term, context);
892             return true;
893         }
894
895         case QuantifierGreedy: {
896             if (!backTrack->matchAmount)
897                 return false;
898
899             ParenthesesDisjunctionContext* context = backTrack->lastContext;
900             if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
901                 while (backTrack->matchAmount < term.atom.quantityCount) {
902                     ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
903                     if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
904                         appendParenthesesDisjunctionContext(backTrack, context);
905                     else {
906                         resetMatches(term, context);
907                         freeParenthesesDisjunctionContext(context);
908                         break;
909                     }
910                 }
911             } else {
912                 resetMatches(term, context);
913                 freeParenthesesDisjunctionContext(context);
914                 popParenthesesDisjunctionContext(backTrack);
915             }
916
917             if (backTrack->matchAmount) {
918                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
919                 recordParenthesesMatch(term, context);
920             }
921             return true;
922         }
923
924         case QuantifierNonGreedy: {
925             // If we've not reached the limit, try to add one more match.
926             if (backTrack->matchAmount < term.atom.quantityCount) {
927                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
928                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
929                     appendParenthesesDisjunctionContext(backTrack, context);
930                     recordParenthesesMatch(term, context);
931                     return true;
932                 } else {
933                     resetMatches(term, context);
934                     freeParenthesesDisjunctionContext(context);
935                 }
936             }
937
938             // Nope - okay backtrack looking for an alternative.
939             while (backTrack->matchAmount) {
940                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
941                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
942                     // successful backtrack! we're back in the game!
943                     if (backTrack->matchAmount) {
944                         context = backTrack->lastContext;
945                         recordParenthesesMatch(term, context);
946                     }
947                     return true;
948                 }
949                 
950                 // pop a match off the stack
951                 resetMatches(term, context);
952                 freeParenthesesDisjunctionContext(context);
953                 popParenthesesDisjunctionContext(backTrack);
954             }
955
956             return false;
957         }
958         }
959
960         ASSERT_NOT_REACHED();
961         return false;
962     }
963
964 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
965 #define BACKTRACK() { --context->term; goto backtrack; }
966 #define currentTerm() (disjunction->terms[context->term])
967     bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
968     {
969         if (btrack)
970             BACKTRACK();
971
972         context->matchBegin = input.getPos();
973         context->term = 0;
974
975     matchAgain:
976         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
977
978         switch (currentTerm().type) {
979         case ByteTerm::TypeSubpatternBegin:
980             MATCH_NEXT();
981         case ByteTerm::TypeSubpatternEnd:
982             context->matchEnd = input.getPos();
983             return true;
984
985         case ByteTerm::TypeBodyAlternativeBegin:
986             MATCH_NEXT();
987         case ByteTerm::TypeBodyAlternativeDisjunction:
988         case ByteTerm::TypeBodyAlternativeEnd:
989             context->matchEnd = input.getPos();
990             return true;
991
992         case ByteTerm::TypeAlternativeBegin:
993             MATCH_NEXT();
994         case ByteTerm::TypeAlternativeDisjunction:
995         case ByteTerm::TypeAlternativeEnd: {
996             int offset = currentTerm().alternative.end;
997             BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
998             backTrack->offset = offset;
999             context->term += offset;
1000             MATCH_NEXT();
1001         }
1002
1003         case ByteTerm::TypeAssertionBOL:
1004             if (matchAssertionBOL(currentTerm()))
1005                 MATCH_NEXT();
1006             BACKTRACK();
1007         case ByteTerm::TypeAssertionEOL:
1008             if (matchAssertionEOL(currentTerm()))
1009                 MATCH_NEXT();
1010             BACKTRACK();
1011         case ByteTerm::TypeAssertionWordBoundary:
1012             if (matchAssertionWordBoundary(currentTerm()))
1013                 MATCH_NEXT();
1014             BACKTRACK();
1015
1016         case ByteTerm::TypePatternCharacterOnce:
1017         case ByteTerm::TypePatternCharacterFixed: {
1018             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1019                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1020                     BACKTRACK();
1021             }
1022             MATCH_NEXT();
1023         }
1024         case ByteTerm::TypePatternCharacterGreedy: {
1025             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1026             unsigned matchAmount = 0;
1027             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1028                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1029                     input.uncheckInput(1);
1030                     break;
1031                 }
1032                 ++matchAmount;
1033             }
1034             backTrack->matchAmount = matchAmount;
1035
1036             MATCH_NEXT();
1037         }
1038         case ByteTerm::TypePatternCharacterNonGreedy: {
1039             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1040             backTrack->matchAmount = 0;
1041             MATCH_NEXT();
1042         }
1043
1044         case ByteTerm::TypePatternCasedCharacterOnce:
1045         case ByteTerm::TypePatternCasedCharacterFixed: {
1046             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1047                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1048                     BACKTRACK();
1049             }
1050             MATCH_NEXT();
1051         }
1052         case ByteTerm::TypePatternCasedCharacterGreedy: {
1053             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1054             unsigned matchAmount = 0;
1055             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1056                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1057                     input.uncheckInput(1);
1058                     break;
1059                 }
1060                 ++matchAmount;
1061             }
1062             backTrack->matchAmount = matchAmount;
1063
1064             MATCH_NEXT();
1065         }
1066         case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1067             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1068             backTrack->matchAmount = 0;
1069             MATCH_NEXT();
1070         }
1071
1072         case ByteTerm::TypeCharacterClass:
1073             if (matchCharacterClass(currentTerm(), context))
1074                 MATCH_NEXT();
1075             BACKTRACK();
1076         case ByteTerm::TypeBackReference:
1077             if (matchBackReference(currentTerm(), context))
1078                 MATCH_NEXT();
1079             BACKTRACK();
1080         case ByteTerm::TypeParenthesesSubpattern:
1081             if (matchParentheses(currentTerm(), context))
1082                 MATCH_NEXT();
1083             BACKTRACK();
1084         case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1085             if (matchParenthesesOnceBegin(currentTerm(), context))
1086                 MATCH_NEXT();
1087             BACKTRACK();
1088         case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1089             if (matchParenthesesOnceEnd(currentTerm(), context))
1090                 MATCH_NEXT();
1091             BACKTRACK();
1092         case ByteTerm::TypeParentheticalAssertionBegin:
1093             if (matchParentheticalAssertionBegin(currentTerm(), context))
1094                 MATCH_NEXT();
1095             BACKTRACK();
1096         case ByteTerm::TypeParentheticalAssertionEnd:
1097             if (matchParentheticalAssertionEnd(currentTerm(), context))
1098                 MATCH_NEXT();
1099             BACKTRACK();
1100
1101         case ByteTerm::TypeCheckInput:
1102             if (input.checkInput(currentTerm().checkInputCount))
1103                 MATCH_NEXT();
1104             BACKTRACK();
1105         }
1106
1107         // We should never fall-through to here.
1108         ASSERT_NOT_REACHED();
1109
1110     backtrack:
1111         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1112
1113         switch (currentTerm().type) {
1114         case ByteTerm::TypeSubpatternBegin:
1115             return false;
1116         case ByteTerm::TypeSubpatternEnd:
1117             ASSERT_NOT_REACHED();
1118
1119         case ByteTerm::TypeBodyAlternativeBegin:
1120         case ByteTerm::TypeBodyAlternativeDisjunction: {
1121             int offset = currentTerm().alternative.next;
1122             context->term += offset;
1123             if (offset > 0)
1124                 MATCH_NEXT();
1125
1126             if (input.atEnd())
1127                 return false;
1128
1129             input.next();
1130             context->matchBegin = input.getPos();
1131             MATCH_NEXT();
1132         }
1133         case ByteTerm::TypeBodyAlternativeEnd:
1134             ASSERT_NOT_REACHED();
1135
1136             case ByteTerm::TypeAlternativeBegin:
1137             case ByteTerm::TypeAlternativeDisjunction: {
1138                 int offset = currentTerm().alternative.next;
1139                 context->term += offset;
1140                 if (offset > 0)
1141                     MATCH_NEXT();
1142                 BACKTRACK();
1143             }
1144             case ByteTerm::TypeAlternativeEnd: {
1145                 // We should never backtrack back into an alternative of the main body of the regex.
1146                 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1147                 unsigned offset = backTrack->offset;
1148                 context->term -= offset;
1149                 BACKTRACK();
1150             }
1151
1152             case ByteTerm::TypeAssertionBOL:
1153             case ByteTerm::TypeAssertionEOL:
1154             case ByteTerm::TypeAssertionWordBoundary:
1155                 BACKTRACK();
1156
1157             case ByteTerm::TypePatternCharacterOnce:
1158             case ByteTerm::TypePatternCharacterFixed:
1159             case ByteTerm::TypePatternCharacterGreedy:
1160             case ByteTerm::TypePatternCharacterNonGreedy:
1161                 if (backtrackPatternCharacter(currentTerm(), context))
1162                     MATCH_NEXT();
1163                 BACKTRACK();
1164             case ByteTerm::TypePatternCasedCharacterOnce:
1165             case ByteTerm::TypePatternCasedCharacterFixed:
1166             case ByteTerm::TypePatternCasedCharacterGreedy:
1167             case ByteTerm::TypePatternCasedCharacterNonGreedy:
1168                 if (backtrackPatternCasedCharacter(currentTerm(), context))
1169                     MATCH_NEXT();
1170                 BACKTRACK();
1171             case ByteTerm::TypeCharacterClass:
1172                 if (backtrackCharacterClass(currentTerm(), context))
1173                     MATCH_NEXT();
1174                 BACKTRACK();
1175             case ByteTerm::TypeBackReference:
1176                 if (backtrackBackReference(currentTerm(), context))
1177                     MATCH_NEXT();
1178                 BACKTRACK();
1179             case ByteTerm::TypeParenthesesSubpattern:
1180                 if (backtrackParentheses(currentTerm(), context))
1181                     MATCH_NEXT();
1182                 BACKTRACK();
1183             case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1184                 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1185                     MATCH_NEXT();
1186                 BACKTRACK();
1187             case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1188                 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1189                     MATCH_NEXT();
1190                 BACKTRACK();
1191             case ByteTerm::TypeParentheticalAssertionBegin:
1192                 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1193                     MATCH_NEXT();
1194                 BACKTRACK();
1195             case ByteTerm::TypeParentheticalAssertionEnd:
1196                 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1197                     MATCH_NEXT();
1198                 BACKTRACK();
1199
1200             case ByteTerm::TypeCheckInput:
1201                 input.uncheckInput(currentTerm().checkInputCount);
1202                 BACKTRACK();
1203         }
1204
1205         ASSERT_NOT_REACHED();
1206         return false;
1207     }
1208
1209     bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1210     {
1211         if (matchDisjunction(disjunction, context, btrack)) {
1212             while (context->matchBegin == context->matchEnd) {
1213                 if (!matchDisjunction(disjunction, context, true))
1214                     return false;
1215             }
1216             return true;
1217         }
1218
1219         return false;
1220     }
1221
1222     int interpret()
1223     {
1224         for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1225             output[i] = -1;
1226
1227         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1228
1229         if (matchDisjunction(pattern->m_body.get(), context)) {
1230             output[0] = context->matchBegin;
1231             output[1] = context->matchEnd;
1232         }
1233
1234         freeDisjunctionContext(context);
1235
1236         return output[0];
1237     }
1238
1239     Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1240         : pattern(pattern)
1241         , output(output)
1242         , input(inputChar, start, length)
1243     {
1244     }
1245
1246 private:
1247     BytecodePattern *pattern;
1248     int* output;
1249     InputStream input;
1250 };
1251
1252
1253
1254 class ByteCompiler {
1255     struct ParenthesesStackEntry {
1256         unsigned beginTerm;
1257         unsigned savedAlternativeIndex;
1258         ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1259             : beginTerm(beginTerm)
1260             , savedAlternativeIndex(savedAlternativeIndex)
1261         {
1262         }
1263     };
1264
1265 public:
1266     ByteCompiler(RegexPattern& pattern)
1267         : m_pattern(pattern)
1268     {
1269         bodyDisjunction = 0;
1270         currentAlternativeIndex = 0;
1271     }
1272     
1273     BytecodePattern* compile()
1274     {
1275         regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
1276         emitDisjunction(m_pattern.m_body);
1277         regexEnd();
1278
1279         return new BytecodePattern(bodyDisjunction, m_allParenthesesInfo, m_pattern);
1280     }
1281     
1282     void checkInput(unsigned count)
1283     {
1284         bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1285     }
1286
1287     void assertionBOL(int inputPosition)
1288     {
1289         bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1290     }
1291
1292     void assertionEOL(int inputPosition)
1293     {
1294         bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1295     }
1296
1297     void assertionWordBoundary(bool invert, int inputPosition)
1298     {
1299         bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1300     }
1301
1302     void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1303     {
1304         if (m_pattern.m_ignoreCase) {
1305             UChar lo = Unicode::toLower(ch);
1306             UChar hi = Unicode::toUpper(ch);
1307             
1308             if (lo != hi) {
1309                 bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1310                 return;
1311             }
1312         }
1313
1314         bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1315     }
1316     
1317     void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1318     {
1319         bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1320
1321         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1322         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1323         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1324     }
1325
1326     void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1327     {
1328         ASSERT(subpatternId);
1329
1330         bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1331
1332         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1333         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1334         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1335     }
1336
1337     void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1338     {
1339         int beginTerm = bodyDisjunction->terms.size();
1340
1341         bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1342         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1343         bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1344         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1345
1346         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, currentAlternativeIndex));
1347         currentAlternativeIndex = beginTerm + 1;
1348     }
1349
1350     void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1351     {
1352         int beginTerm = bodyDisjunction->terms.size();
1353
1354         bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1355         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1356         bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1357         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1358
1359         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, currentAlternativeIndex));
1360         currentAlternativeIndex = beginTerm + 1;
1361     }
1362
1363     unsigned popParenthesesStack()
1364     {
1365         ASSERT(m_parenthesesStack.size());
1366         int stackEnd = m_parenthesesStack.size() - 1;
1367         unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1368         currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1369         m_parenthesesStack.shrink(stackEnd);
1370
1371         ASSERT(beginTerm < bodyDisjunction->terms.size());
1372         ASSERT(currentAlternativeIndex < bodyDisjunction->terms.size());
1373         
1374         return beginTerm;
1375     }
1376
1377 #ifndef NDEBUG
1378     void dumpDisjunction(ByteDisjunction* disjunction)
1379     {
1380         printf("ByteDisjunction(%p):\n\t", disjunction);
1381         for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1382             printf("{ %d } ", disjunction->terms[i].type);
1383         printf("\n");
1384     }
1385 #endif
1386
1387     void closeAlternative(int beginTerm)
1388     {
1389         int origBeginTerm = beginTerm;
1390         ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1391         int endIndex = bodyDisjunction->terms.size();
1392
1393         unsigned frameLocation = bodyDisjunction->terms[beginTerm].frameLocation;
1394
1395         if (!bodyDisjunction->terms[beginTerm].alternative.next)
1396             bodyDisjunction->terms.remove(beginTerm);
1397         else {
1398             while (bodyDisjunction->terms[beginTerm].alternative.next) {
1399                 beginTerm += bodyDisjunction->terms[beginTerm].alternative.next;
1400                 ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1401                 bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1402                 bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1403             }
1404             
1405             bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1406
1407             bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1408             bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1409         }
1410     }
1411
1412     void closeBodyAlternative()
1413     {
1414         int beginTerm = 0;
1415         int origBeginTerm = 0;
1416         ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1417         int endIndex = bodyDisjunction->terms.size();
1418
1419         unsigned frameLocation = bodyDisjunction->terms[beginTerm].frameLocation;
1420
1421         while (bodyDisjunction->terms[beginTerm].alternative.next) {
1422             beginTerm += bodyDisjunction->terms[beginTerm].alternative.next;
1423             ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1424             bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1425             bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1426         }
1427         
1428         bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1429
1430         bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1431         bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1432     }
1433
1434     void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1435     {
1436         unsigned beginTerm = popParenthesesStack();
1437         closeAlternative(beginTerm + 1);
1438         unsigned endTerm = bodyDisjunction->terms.size();
1439
1440         bool isAssertion = bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1441         bool invertOrCapture = bodyDisjunction->terms[beginTerm].invertOrCapture;
1442         unsigned subpatternId = bodyDisjunction->terms[beginTerm].atom.subpatternId;
1443
1444         bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1445         bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1446         bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1447         bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1448
1449         if (doInline) {
1450             bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1451             bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1452             bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1453             bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1454         } else {
1455             ByteTerm& parenthesesBegin = bodyDisjunction->terms[beginTerm];
1456             ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1457
1458             bool invertOrCapture = parenthesesBegin.invertOrCapture;
1459             unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1460
1461             unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1462             ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1463
1464             parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1465             for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1466                 parenthesesDisjunction->terms.append(bodyDisjunction->terms[termInParentheses]);
1467             parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1468
1469             bodyDisjunction->terms.shrink(beginTerm);
1470
1471             m_allParenthesesInfo.append(parenthesesDisjunction);
1472             bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1473
1474             bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1475             bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1476             bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1477         }
1478     }
1479
1480     void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
1481     {
1482         bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1483         bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
1484         bodyDisjunction->terms[0].frameLocation = 0;
1485         currentAlternativeIndex = 0;
1486     }
1487
1488     void regexEnd()
1489     {
1490         closeBodyAlternative();
1491     }
1492
1493     void alterantiveBodyDisjunction()
1494     {
1495         int newAlternativeIndex = bodyDisjunction->terms.size();
1496         bodyDisjunction->terms[currentAlternativeIndex].alternative.next = newAlternativeIndex - currentAlternativeIndex;
1497         bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
1498
1499         currentAlternativeIndex = newAlternativeIndex;
1500     }
1501
1502     void alterantiveDisjunction()
1503     {
1504         int newAlternativeIndex = bodyDisjunction->terms.size();
1505         bodyDisjunction->terms[currentAlternativeIndex].alternative.next = newAlternativeIndex - currentAlternativeIndex;
1506         bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1507
1508         currentAlternativeIndex = newAlternativeIndex;
1509     }
1510
1511     void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1512     {
1513         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1514             unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1515         
1516             if (alt) {
1517                 if (disjunction == m_pattern.m_body)
1518                     alterantiveBodyDisjunction();
1519                 else
1520                     alterantiveDisjunction();
1521             }
1522
1523             PatternAlternative* alternative = disjunction->m_alternatives[alt];
1524             unsigned minimumSize = alternative->m_minimumSize;
1525
1526             ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1527             unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1528             if (countToCheck)
1529                 checkInput(countToCheck);
1530             currentCountAlreadyChecked += countToCheck;
1531
1532             for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1533                 PatternTerm& term = alternative->m_terms[i];
1534
1535                 switch (term.type) {
1536                 case PatternTerm::TypeAssertionBOL:
1537                     assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1538                     break;
1539
1540                 case PatternTerm::TypeAssertionEOL:
1541                     assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1542                     break;
1543
1544                 case PatternTerm::TypeAssertionWordBoundary:
1545                     assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1546                     break;
1547
1548                 case PatternTerm::TypePatternCharacter:
1549                     atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1550                     break;
1551
1552                 case PatternTerm::TypeCharacterClass:
1553                     atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1554                     break;
1555
1556                 case PatternTerm::TypeBackReference:
1557                     atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1558                     break;
1559
1560                 case PatternTerm::TypeParenthesesSubpattern: {
1561                     unsigned disjunctionAlreadyCheckedCount = 0;
1562                     if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1563                         if (term.quantityType == QuantifierFixedCount) {
1564                             disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1565                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1566                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1567                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1568                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1569                         } else {
1570                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1571                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1572                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1573                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1574                         }
1575                     } else {
1576                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1577                         atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1578                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1579                         atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1580                     }
1581                     break;
1582                 }
1583
1584                 case PatternTerm::TypeParentheticalAssertion: {
1585                     unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1586                     
1587                     atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1588                     emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1589                     atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1590                     break;
1591                 }
1592                 }
1593             }
1594         }
1595     }
1596
1597 private:
1598     RegexPattern& m_pattern;
1599     ByteDisjunction* bodyDisjunction;
1600     unsigned currentAlternativeIndex;
1601     Vector<ParenthesesStackEntry> m_parenthesesStack;
1602     Vector<ByteDisjunction*> m_allParenthesesInfo;
1603 };
1604
1605
1606 BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
1607 {
1608     RegexPattern pattern(ignoreCase, multiline);
1609
1610     if ((error = compileRegex(patternString, pattern)))
1611         return 0;
1612
1613     numSubpatterns = pattern.m_numSubpatterns;
1614
1615     return ByteCompiler(pattern).compile();
1616 }
1617
1618 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1619 {
1620     return Interpreter(regex, output, input, start, length).interpret();
1621 }
1622
1623
1624 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1625 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1626 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1627 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1628 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1629 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1630 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1631
1632
1633 } }
1634
1635 #endif