00eb4b15d3bce5a9dc2e074cf75068af4eb2372b
[WebKit-https.git] / JavaScriptCore / yarr / RegexInterpreter.cpp
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24  */
25
26 #include "config.h"
27 #include "RegexInterpreter.h"
28
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31
32 #if ENABLE(YARR)
33
34 using namespace WTF;
35
36 namespace JSC { namespace Yarr {
37
38 class Interpreter {
39 public:
40     struct ParenthesesDisjunctionContext;
41
42     struct BackTrackInfoPatternCharacter {
43         uintptr_t matchAmount;
44     };
45     struct BackTrackInfoCharacterClass {
46         uintptr_t matchAmount;
47     };
48     struct BackTrackInfoBackReference {
49         uintptr_t begin; // Not really needed for greedy quantifiers.
50         uintptr_t matchAmount; // Not really needed for fixed quantifiers.
51     };
52     struct BackTrackInfoAlternative {
53         uintptr_t offset;
54     };
55     struct BackTrackInfoParentheticalAssertion {
56         uintptr_t begin;
57     };
58     struct BackTrackInfoParenthesesOnce {
59         uintptr_t inParentheses;
60     };
61     struct BackTrackInfoParentheses {
62         uintptr_t matchAmount;
63         ParenthesesDisjunctionContext* lastContext;
64         uintptr_t prevBegin;
65         uintptr_t prevEnd;
66     };
67
68     static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
69     {
70         context->next = backTrack->lastContext;
71         backTrack->lastContext = context;
72         ++backTrack->matchAmount;
73     }
74
75     static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
76     {
77         ASSERT(backTrack->matchAmount);
78         ASSERT(backTrack->lastContext);
79         backTrack->lastContext = backTrack->lastContext->next;
80         --backTrack->matchAmount;
81     }
82
83     struct DisjunctionContext
84     {
85         DisjunctionContext()
86             : term(0)
87         {
88         }
89         
90         void* operator new(size_t, void* where)
91         {
92             return where;
93         }
94
95         int term;
96         unsigned matchBegin;
97         unsigned matchEnd;
98         uintptr_t frame[1];
99     };
100
101     DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
102     {
103         return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
104     }
105
106     void freeDisjunctionContext(DisjunctionContext* context)
107     {
108         free(context);
109     }
110
111     struct ParenthesesDisjunctionContext
112     {
113         ParenthesesDisjunctionContext(int* output, ByteTerm& term)
114             : next(0)
115         {
116             unsigned firstSubpatternId = term.atom.subpatternId;
117             unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
118
119             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
120                 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
121                 output[(firstSubpatternId << 1) + i] = -1;
122             }
123             
124             new(getDisjunctionContext(term)) DisjunctionContext();
125         }
126
127         void* operator new(size_t, void* where)
128         {
129             return where;
130         }
131
132         void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
133         {
134             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
135                 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
136         }
137         
138         DisjunctionContext* getDisjunctionContext(ByteTerm& term)
139         {
140             return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
141         }
142
143         ParenthesesDisjunctionContext* next;
144         int subpatternBackup[1];
145     };
146
147     ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
148     {
149         return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
150     }
151
152     void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
153     {
154         free(context);
155     }
156
157     class InputStream {
158     public:
159         InputStream(const UChar* input, unsigned start, unsigned length)
160             : input(input)
161             , pos(start)
162             , length(length)
163         {
164         }
165
166         void next()
167         {
168             ++pos;
169         }
170
171         void rewind(unsigned amount)
172         {
173             ASSERT(pos >= amount);
174             pos -= amount;
175         }
176
177         int read()
178         {
179             ASSERT(pos < length);
180             if (pos < length)
181                 return input[pos];
182             return -1;
183         }
184
185         int readChecked(int position)
186         {
187             ASSERT(position < 0);
188             ASSERT((unsigned)-position <= pos);
189             unsigned p = pos + position;
190             ASSERT(p < length);
191             return input[p];
192         }
193
194         int reread(unsigned from)
195         {
196             ASSERT(from < length);
197             return input[from];
198         }
199
200         int prev()
201         {
202             ASSERT(!(pos > length));
203             if (pos && length)
204                 return input[pos - 1];
205             return -1;
206         }
207         
208         unsigned getPos()
209         {
210             return pos;
211         }
212
213         void setPos(unsigned p)
214         {
215             pos = p;
216         }
217         
218         bool atStart()
219         {
220             return pos == 0;
221         }
222
223         bool atEnd()
224         {
225             return pos == length;
226         }
227
228         bool checkInput(int count)
229         {
230             if ((pos + count) <= length) {
231                 pos += count;
232                 return true;
233             } else
234                 return false;
235         }
236
237         void uncheckInput(int count)
238         {
239             pos -= count;
240         }
241
242         bool atStart(int position)
243         {
244             return (pos + position) == 0;
245         }
246
247         bool atEnd(int position)
248         {
249             return (pos + position) == length;
250         }
251
252     private:
253         const UChar* input;
254         unsigned pos;
255         unsigned length;
256     };
257
258     bool testCharacterClass(CharacterClass* characterClass, int ch)
259     {
260         if (ch & 0xFF80) {
261             for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
262                 if (ch == characterClass->m_matchesUnicode[i])
263                     return true;
264             for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
265                 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
266                     return true;
267         } else {
268             for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
269                 if (ch == characterClass->m_matches[i])
270                     return true;
271             for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
272                 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
273                     return true;
274         }
275
276         return false;
277     }
278
279     bool tryConsumeCharacter(int testChar)
280     {
281         if (input.atEnd())
282             return false;
283         
284         int ch = input.read();
285
286         if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) {
287             input.next();
288             return true;
289         }
290         return false;
291     }
292
293     bool checkCharacter(int testChar, int inputPosition)
294     {
295         return testChar == input.readChecked(inputPosition);
296     }
297
298     bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
299     {
300         int ch = input.readChecked(inputPosition);
301         return (loChar == ch) || (hiChar == ch);
302     }
303
304     bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert)
305     {
306         if (input.atEnd())
307             return false;
308
309         bool match = testCharacterClass(characterClass, input.read());
310
311         if (invert)
312             match = !match;
313
314         if (match) {
315             input.next();
316             return true;
317         }
318         return false;
319     }
320
321     bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
322     {
323         bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
324         return invert ? !match : match;
325     }
326
327     bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
328     {
329         int matchSize = matchEnd - matchBegin;
330
331         if (!input.checkInput(matchSize))
332             return false;
333
334         for (int i = 0; i < matchSize; ++i) {
335             if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
336                 input.uncheckInput(matchSize);
337                 return false;
338             }
339         }
340         
341         return true;
342     }
343
344     bool matchAssertionBOL(ByteTerm& term)
345     {
346         return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
347     }
348
349     bool matchAssertionEOL(ByteTerm& term)
350     {
351         if (term.inputPosition)
352             return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
353         else
354             return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
355     }
356
357     bool matchAssertionWordBoundary(ByteTerm& term)
358     {
359         bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
360         bool readIsWordchar;
361         if (term.inputPosition)
362             readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
363         else
364             readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
365
366         bool wordBoundary = prevIsWordchar != readIsWordchar;
367         return term.invert() ? !wordBoundary : wordBoundary;
368     }
369
370     bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
371     {
372         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
373
374         switch (term.atom.quantityType) {
375         case QuantifierFixedCount:
376             break;
377
378         case QuantifierGreedy:
379             if (backTrack->matchAmount) {
380                 --backTrack->matchAmount;
381                 input.uncheckInput(1);
382                 return true;
383             }
384             break;
385
386         case QuantifierNonGreedy:
387             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
388                 ++backTrack->matchAmount;
389                 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
390                     return true;
391             }
392             input.uncheckInput(backTrack->matchAmount);
393             break;
394         }
395
396         return false;
397     }
398
399     bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
400     {
401         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
402
403         switch (term.atom.quantityType) {
404         case QuantifierFixedCount:
405             break;
406
407         case QuantifierGreedy:
408             if (backTrack->matchAmount) {
409                 --backTrack->matchAmount;
410                 input.uncheckInput(1);
411                 return true;
412             }
413             break;
414
415         case QuantifierNonGreedy:
416             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
417                 ++backTrack->matchAmount;
418                 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
419                     return true;
420             }
421             input.uncheckInput(backTrack->matchAmount);
422             break;
423         }
424
425         return false;
426     }
427
428     bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
429     {
430         ASSERT(term.type == ByteTerm::TypeCharacterClass);
431         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
432
433         switch (term.atom.quantityType) {
434         case QuantifierFixedCount: {
435             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
436                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
437                     return false;
438             }
439             return true;
440         }
441
442         case QuantifierGreedy: {
443             unsigned matchAmount = 0;
444             while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
445                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
446                     input.uncheckInput(1);
447                     break;
448                 }
449                 ++matchAmount;
450             }
451             backTrack->matchAmount = matchAmount;
452
453             return true;
454         }
455
456         case QuantifierNonGreedy:
457             backTrack->matchAmount = 0;
458             return true;
459         }
460
461         ASSERT_NOT_REACHED();
462         return false;
463     }
464
465     bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
466     {
467         ASSERT(term.type == ByteTerm::TypeCharacterClass);
468         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
469
470         switch (term.atom.quantityType) {
471         case QuantifierFixedCount:
472             break;
473
474         case QuantifierGreedy:
475             if (backTrack->matchAmount) {
476                 --backTrack->matchAmount;
477                 input.uncheckInput(1);
478                 return true;
479             }
480             break;
481
482         case QuantifierNonGreedy:
483             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
484                 ++backTrack->matchAmount;
485                 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
486                     return true;
487             }
488             input.uncheckInput(backTrack->matchAmount);
489             break;
490         }
491
492         return false;
493     }
494
495     bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
496     {
497         ASSERT(term.type == ByteTerm::TypeBackReference);
498         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
499
500         int matchBegin = output[(term.atom.subpatternId << 1)];
501         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
502         ASSERT((matchBegin == -1) == (matchEnd == -1));
503         ASSERT(matchBegin <= matchEnd);
504
505         if (matchBegin == matchEnd)
506             return true;
507
508         switch (term.atom.quantityType) {
509         case QuantifierFixedCount: {
510             backTrack->begin = input.getPos();
511             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
512                 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
513                     input.setPos(backTrack->begin);
514                     return false;
515                 }
516             }
517             return true;
518         }
519
520         case QuantifierGreedy: {
521             unsigned matchAmount = 0;
522             while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
523                 ++matchAmount;
524             backTrack->matchAmount = matchAmount;
525             return true;
526         }
527
528         case QuantifierNonGreedy:
529             backTrack->begin = input.getPos();
530             backTrack->matchAmount = 0;
531             return true;
532         }
533
534         ASSERT_NOT_REACHED();
535         return false;
536     }
537
538     bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
539     {
540         ASSERT(term.type == ByteTerm::TypeBackReference);
541         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
542
543         int matchBegin = output[(term.atom.subpatternId << 1)];
544         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
545         ASSERT((matchBegin == -1) == (matchEnd == -1));
546         ASSERT(matchBegin <= matchEnd);
547
548         if (matchBegin == matchEnd)
549             return false;
550
551         switch (term.atom.quantityType) {
552         case QuantifierFixedCount:
553             // for quantityCount == 1, could rewind.
554             input.setPos(backTrack->begin);
555             break;
556
557         case QuantifierGreedy:
558             if (backTrack->matchAmount) {
559                 --backTrack->matchAmount;
560                 input.rewind(matchEnd - matchBegin);
561                 return true;
562             }
563             break;
564
565         case QuantifierNonGreedy:
566             if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
567                 ++backTrack->matchAmount;
568                 return true;
569             } else
570                 input.setPos(backTrack->begin);
571             break;
572         }
573
574         return false;
575     }
576
577     void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
578     {
579         if (term.capture()) {
580             unsigned subpatternId = term.atom.subpatternId;
581             output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
582             output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
583         }
584     }
585     void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
586     {
587         unsigned firstSubpatternId = term.atom.subpatternId;
588         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
589         context->restoreOutput(output, firstSubpatternId, count);
590     }
591     void resetAssertionMatches(ByteTerm& term)
592     {
593         unsigned firstSubpatternId = term.atom.subpatternId;
594         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
595         for (unsigned i = 0; i < (count << 1); ++i)
596             output[(firstSubpatternId << 1) + i] = -1;
597     }
598     bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
599     {
600         while (backTrack->matchAmount) {
601             ParenthesesDisjunctionContext* context = backTrack->lastContext;
602
603             if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
604                 return true;
605             
606             resetMatches(term, context);
607             freeParenthesesDisjunctionContext(context);
608             popParenthesesDisjunctionContext(backTrack);
609         }
610
611         return false;
612     }
613
614     bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
615     {
616         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
617         ASSERT(term.atom.quantityCount == 1);
618
619         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
620
621         switch (term.atom.quantityType) {
622         case QuantifierGreedy: {
623             // set this speculatively; if we get to the parens end this will be true.
624             backTrack->inParentheses = 1;
625             break;
626         }
627         case QuantifierNonGreedy: {
628             backTrack->inParentheses = 0;
629             context->term += term.atom.parenthesesWidth;
630             return true;
631         }
632         case QuantifierFixedCount:
633             break;
634         }
635
636         if (term.capture()) {
637             unsigned subpatternId = term.atom.subpatternId;
638             output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
639         }
640
641         return true;
642     }
643
644     bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
645     {
646         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
647         ASSERT(term.atom.quantityCount == 1);
648
649         if (term.capture()) {
650             unsigned subpatternId = term.atom.subpatternId;
651             output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
652         }
653         return true;
654     }
655
656     bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
657     {
658         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
659         ASSERT(term.atom.quantityCount == 1);
660
661         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
662
663         if (term.capture()) {
664             unsigned subpatternId = term.atom.subpatternId;
665             output[(subpatternId << 1)] = -1;
666             output[(subpatternId << 1) + 1] = -1;
667         }
668
669         switch (term.atom.quantityType) {
670         case QuantifierGreedy:
671             // if we backtrack to this point, there is another chance - try matching nothing.
672             ASSERT(backTrack->inParentheses);
673             backTrack->inParentheses = 0;
674             context->term += term.atom.parenthesesWidth;
675             return true;
676         case QuantifierNonGreedy:
677             ASSERT(backTrack->inParentheses);
678         case QuantifierFixedCount:
679             break;
680         }
681
682         return false;
683     }
684
685     bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
686     {
687         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
688         ASSERT(term.atom.quantityCount == 1);
689
690         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
691
692         switch (term.atom.quantityType) {
693         case QuantifierGreedy:
694             if (!backTrack->inParentheses) {
695                 context->term -= term.atom.parenthesesWidth;
696                 return false;
697             }
698         case QuantifierNonGreedy:
699             if (!backTrack->inParentheses) {
700                 // now try to match the parens; set this speculatively.
701                 backTrack->inParentheses = 1;
702                 if (term.capture()) {
703                     unsigned subpatternId = term.atom.subpatternId;
704                     output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
705                 }
706                 context->term -= term.atom.parenthesesWidth;
707                 return true;
708             }
709         case QuantifierFixedCount:
710             break;
711         }
712
713         return false;
714     }
715
716     bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
717     {
718         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
719         ASSERT(term.atom.quantityCount == 1);
720
721         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
722
723         backTrack->begin = input.getPos();
724         return true;
725     }
726
727     bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
728     {
729         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
730         ASSERT(term.atom.quantityCount == 1);
731
732         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
733
734         input.setPos(backTrack->begin);
735
736         // We've reached the end of the parens; if they are inverted, this is failure.
737         if (term.invert()) {
738             context->term -= term.atom.parenthesesWidth;
739             return false;
740         }
741
742         return true;
743     }
744
745     bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
746     {
747         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
748         ASSERT(term.atom.quantityCount == 1);
749
750         // We've failed to match parens; if they are inverted, this is win!
751         if (term.invert()) {
752             context->term += term.atom.parenthesesWidth;
753             return true;
754         }
755
756         return false;
757     }
758
759     bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
760     {
761         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
762         ASSERT(term.atom.quantityCount == 1);
763
764         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
765
766         input.setPos(backTrack->begin);
767
768         context->term -= term.atom.parenthesesWidth;
769         return false;
770     }
771
772     bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
773     {
774         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
775
776         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
777
778         unsigned subpatternId = term.atom.subpatternId;
779         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
780
781         backTrack->prevBegin = output[(subpatternId << 1)];
782         backTrack->prevEnd = output[(subpatternId << 1) + 1];
783
784         backTrack->matchAmount = 0;
785         backTrack->lastContext = 0;
786
787         switch (term.atom.quantityType) {
788         case QuantifierFixedCount: {
789             // While we haven't yet reached our fixed limit,
790             while (backTrack->matchAmount < term.atom.quantityCount) {
791                 // Try to do a match, and it it succeeds, add it to the list.
792                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
793                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
794                     appendParenthesesDisjunctionContext(backTrack, context);
795                 else {
796                     // The match failed; try to find an alternate point to carry on from.
797                     resetMatches(term, context);
798                     freeParenthesesDisjunctionContext(context);
799                     if (!parenthesesDoBacktrack(term, backTrack))
800                         return false;
801                 }
802             }
803
804             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
805             ParenthesesDisjunctionContext* context = backTrack->lastContext;
806             recordParenthesesMatch(term, context);
807             return true;
808         }
809
810         case QuantifierGreedy: {
811             while (backTrack->matchAmount < term.atom.quantityCount) {
812                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
813                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
814                     appendParenthesesDisjunctionContext(backTrack, context);
815                 else {
816                     resetMatches(term, context);
817                     freeParenthesesDisjunctionContext(context);
818                     break;
819                 }
820             }
821
822             if (backTrack->matchAmount) {
823                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
824                 recordParenthesesMatch(term, context);
825             }
826             return true;
827         }
828
829         case QuantifierNonGreedy:
830             return true;
831         }
832
833         ASSERT_NOT_REACHED();
834         return false;
835     }
836
837     // Rules for backtracking differ depending on whether this is greedy or non-greedy.
838     //
839     // Greedy matches never should try just adding more - you should already have done
840     // the 'more' cases.  Always backtrack, at least a leetle bit.  However cases where
841     // you backtrack an item off the list needs checking, since we'll never have matched
842     // the one less case.  Tracking forwards, still add as much as possible.
843     //
844     // Non-greedy, we've already done the one less case, so don't match on popping.
845     // We haven't done the one more case, so always try to add that.
846     //
847     bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
848     {
849         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
850
851         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
852
853         if (term.capture()) {
854             unsigned subpatternId = term.atom.subpatternId;
855             output[(subpatternId << 1)] = backTrack->prevBegin;
856             output[(subpatternId << 1) + 1] = backTrack->prevEnd;
857         }
858
859         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
860
861         switch (term.atom.quantityType) {
862         case QuantifierFixedCount: {
863             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
864
865             ParenthesesDisjunctionContext* context = 0;
866
867             if (!parenthesesDoBacktrack(term, backTrack))
868                 return false;
869
870             // While we haven't yet reached our fixed limit,
871             while (backTrack->matchAmount < term.atom.quantityCount) {
872                 // Try to do a match, and it it succeeds, add it to the list.
873                 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
874                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
875                     appendParenthesesDisjunctionContext(backTrack, context);
876                 else {
877                     // The match failed; try to find an alternate point to carry on from.
878                     resetMatches(term, context);
879                     freeParenthesesDisjunctionContext(context);
880                     if (!parenthesesDoBacktrack(term, backTrack))
881                         return false;
882                 }
883             }
884
885             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
886             context = backTrack->lastContext;
887             recordParenthesesMatch(term, context);
888             return true;
889         }
890
891         case QuantifierGreedy: {
892             if (!backTrack->matchAmount)
893                 return false;
894
895             ParenthesesDisjunctionContext* context = backTrack->lastContext;
896             if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
897                 while (backTrack->matchAmount < term.atom.quantityCount) {
898                     ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
899                     if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
900                         appendParenthesesDisjunctionContext(backTrack, context);
901                     else {
902                         resetMatches(term, context);
903                         freeParenthesesDisjunctionContext(context);
904                         break;
905                     }
906                 }
907             } else {
908                 resetMatches(term, context);
909                 freeParenthesesDisjunctionContext(context);
910                 popParenthesesDisjunctionContext(backTrack);
911             }
912
913             if (backTrack->matchAmount) {
914                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
915                 recordParenthesesMatch(term, context);
916             }
917             return true;
918         }
919
920         case QuantifierNonGreedy: {
921             // If we've not reached the limit, try to add one more match.
922             if (backTrack->matchAmount < term.atom.quantityCount) {
923                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
924                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
925                     appendParenthesesDisjunctionContext(backTrack, context);
926                     recordParenthesesMatch(term, context);
927                     return true;
928                 } else {
929                     resetMatches(term, context);
930                     freeParenthesesDisjunctionContext(context);
931                 }
932             }
933
934             // Nope - okay backtrack looking for an alternative.
935             while (backTrack->matchAmount) {
936                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
937                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
938                     // successful backtrack! we're back in the game!
939                     if (backTrack->matchAmount) {
940                         context = backTrack->lastContext;
941                         recordParenthesesMatch(term, context);
942                     }
943                     return true;
944                 }
945                 
946                 // pop a match off the stack
947                 resetMatches(term, context);
948                 freeParenthesesDisjunctionContext(context);
949                 popParenthesesDisjunctionContext(backTrack);
950             }
951
952             return false;
953         }
954         }
955
956         ASSERT_NOT_REACHED();
957         return false;
958     }
959
960 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
961 #define BACKTRACK() { --context->term; goto backtrack; }
962 #define currentTerm() (disjunction->terms[context->term])
963     bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
964     {
965         if (btrack)
966             BACKTRACK();
967
968         context->matchBegin = input.getPos();
969         context->term = 0;
970
971     matchAgain:
972         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
973
974         switch (currentTerm().type) {
975         case ByteTerm::TypeSubpatternBegin:
976             MATCH_NEXT();
977         case ByteTerm::TypeSubpatternEnd:
978             context->matchEnd = input.getPos();
979             return true;
980
981         case ByteTerm::TypeBodyAlternativeBegin:
982             MATCH_NEXT();
983         case ByteTerm::TypeBodyAlternativeDisjunction:
984         case ByteTerm::TypeBodyAlternativeEnd:
985             context->matchEnd = input.getPos();
986             return true;
987
988         case ByteTerm::TypeAlternativeBegin:
989             MATCH_NEXT();
990         case ByteTerm::TypeAlternativeDisjunction:
991         case ByteTerm::TypeAlternativeEnd: {
992             int offset = currentTerm().alternative.end;
993             BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
994             backTrack->offset = offset;
995             context->term += offset;
996             MATCH_NEXT();
997         }
998
999         case ByteTerm::TypeAssertionBOL:
1000             if (matchAssertionBOL(currentTerm()))
1001                 MATCH_NEXT();
1002             BACKTRACK();
1003         case ByteTerm::TypeAssertionEOL:
1004             if (matchAssertionEOL(currentTerm()))
1005                 MATCH_NEXT();
1006             BACKTRACK();
1007         case ByteTerm::TypeAssertionWordBoundary:
1008             if (matchAssertionWordBoundary(currentTerm()))
1009                 MATCH_NEXT();
1010             BACKTRACK();
1011
1012         case ByteTerm::TypePatternCharacterOnce:
1013         case ByteTerm::TypePatternCharacterFixed: {
1014             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1015                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1016                     BACKTRACK();
1017             }
1018             MATCH_NEXT();
1019         }
1020         case ByteTerm::TypePatternCharacterGreedy: {
1021             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1022             unsigned matchAmount = 0;
1023             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1024                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1025                     input.uncheckInput(1);
1026                     break;
1027                 }
1028                 ++matchAmount;
1029             }
1030             backTrack->matchAmount = matchAmount;
1031
1032             MATCH_NEXT();
1033         }
1034         case ByteTerm::TypePatternCharacterNonGreedy: {
1035             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1036             backTrack->matchAmount = 0;
1037             MATCH_NEXT();
1038         }
1039
1040         case ByteTerm::TypePatternCasedCharacterOnce:
1041         case ByteTerm::TypePatternCasedCharacterFixed: {
1042             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1043                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1044                     BACKTRACK();
1045             }
1046             MATCH_NEXT();
1047         }
1048         case ByteTerm::TypePatternCasedCharacterGreedy: {
1049             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1050             unsigned matchAmount = 0;
1051             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1052                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1053                     input.uncheckInput(1);
1054                     break;
1055                 }
1056                 ++matchAmount;
1057             }
1058             backTrack->matchAmount = matchAmount;
1059
1060             MATCH_NEXT();
1061         }
1062         case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1063             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1064             backTrack->matchAmount = 0;
1065             MATCH_NEXT();
1066         }
1067
1068         case ByteTerm::TypeCharacterClass:
1069             if (matchCharacterClass(currentTerm(), context))
1070                 MATCH_NEXT();
1071             BACKTRACK();
1072         case ByteTerm::TypeBackReference:
1073             if (matchBackReference(currentTerm(), context))
1074                 MATCH_NEXT();
1075             BACKTRACK();
1076         case ByteTerm::TypeParenthesesSubpattern:
1077             if (matchParentheses(currentTerm(), context))
1078                 MATCH_NEXT();
1079             BACKTRACK();
1080         case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1081             if (matchParenthesesOnceBegin(currentTerm(), context))
1082                 MATCH_NEXT();
1083             BACKTRACK();
1084         case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1085             if (matchParenthesesOnceEnd(currentTerm(), context))
1086                 MATCH_NEXT();
1087             BACKTRACK();
1088         case ByteTerm::TypeParentheticalAssertionBegin:
1089             if (matchParentheticalAssertionBegin(currentTerm(), context))
1090                 MATCH_NEXT();
1091             BACKTRACK();
1092         case ByteTerm::TypeParentheticalAssertionEnd:
1093             if (matchParentheticalAssertionEnd(currentTerm(), context))
1094                 MATCH_NEXT();
1095             BACKTRACK();
1096
1097         case ByteTerm::TypeCheckInput:
1098             if (input.checkInput(currentTerm().checkInputCount))
1099                 MATCH_NEXT();
1100             BACKTRACK();
1101         }
1102
1103         // We should never fall-through to here.
1104         ASSERT_NOT_REACHED();
1105
1106     backtrack:
1107         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1108
1109         switch (currentTerm().type) {
1110         case ByteTerm::TypeSubpatternBegin:
1111             return false;
1112         case ByteTerm::TypeSubpatternEnd:
1113             ASSERT_NOT_REACHED();
1114
1115         case ByteTerm::TypeBodyAlternativeBegin:
1116         case ByteTerm::TypeBodyAlternativeDisjunction: {
1117             int offset = currentTerm().alternative.next;
1118             context->term += offset;
1119             if (offset > 0)
1120                 MATCH_NEXT();
1121
1122             if (input.atEnd())
1123                 return false;
1124
1125             input.next();
1126             context->matchBegin = input.getPos();
1127             MATCH_NEXT();
1128         }
1129         case ByteTerm::TypeBodyAlternativeEnd:
1130             ASSERT_NOT_REACHED();
1131
1132             case ByteTerm::TypeAlternativeBegin:
1133             case ByteTerm::TypeAlternativeDisjunction: {
1134                 int offset = currentTerm().alternative.next;
1135                 context->term += offset;
1136                 if (offset > 0)
1137                     MATCH_NEXT();
1138                 BACKTRACK();
1139             }
1140             case ByteTerm::TypeAlternativeEnd: {
1141                 // We should never backtrack back into an alternative of the main body of the regex.
1142                 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1143                 unsigned offset = backTrack->offset;
1144                 context->term -= offset;
1145                 BACKTRACK();
1146             }
1147
1148             case ByteTerm::TypeAssertionBOL:
1149             case ByteTerm::TypeAssertionEOL:
1150             case ByteTerm::TypeAssertionWordBoundary:
1151                 BACKTRACK();
1152
1153             case ByteTerm::TypePatternCharacterOnce:
1154             case ByteTerm::TypePatternCharacterFixed:
1155             case ByteTerm::TypePatternCharacterGreedy:
1156             case ByteTerm::TypePatternCharacterNonGreedy:
1157                 if (backtrackPatternCharacter(currentTerm(), context))
1158                     MATCH_NEXT();
1159                 BACKTRACK();
1160             case ByteTerm::TypePatternCasedCharacterOnce:
1161             case ByteTerm::TypePatternCasedCharacterFixed:
1162             case ByteTerm::TypePatternCasedCharacterGreedy:
1163             case ByteTerm::TypePatternCasedCharacterNonGreedy:
1164                 if (backtrackPatternCasedCharacter(currentTerm(), context))
1165                     MATCH_NEXT();
1166                 BACKTRACK();
1167             case ByteTerm::TypeCharacterClass:
1168                 if (backtrackCharacterClass(currentTerm(), context))
1169                     MATCH_NEXT();
1170                 BACKTRACK();
1171             case ByteTerm::TypeBackReference:
1172                 if (backtrackBackReference(currentTerm(), context))
1173                     MATCH_NEXT();
1174                 BACKTRACK();
1175             case ByteTerm::TypeParenthesesSubpattern:
1176                 if (backtrackParentheses(currentTerm(), context))
1177                     MATCH_NEXT();
1178                 BACKTRACK();
1179             case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1180                 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1181                     MATCH_NEXT();
1182                 BACKTRACK();
1183             case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1184                 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1185                     MATCH_NEXT();
1186                 BACKTRACK();
1187             case ByteTerm::TypeParentheticalAssertionBegin:
1188                 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1189                     MATCH_NEXT();
1190                 BACKTRACK();
1191             case ByteTerm::TypeParentheticalAssertionEnd:
1192                 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1193                     MATCH_NEXT();
1194                 BACKTRACK();
1195
1196             case ByteTerm::TypeCheckInput:
1197                 input.uncheckInput(currentTerm().checkInputCount);
1198                 BACKTRACK();
1199         }
1200
1201         ASSERT_NOT_REACHED();
1202         return false;
1203     }
1204
1205     bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1206     {
1207         if (matchDisjunction(disjunction, context, btrack)) {
1208             while (context->matchBegin == context->matchEnd) {
1209                 if (!matchDisjunction(disjunction, context, true))
1210                     return false;
1211             }
1212             return true;
1213         }
1214
1215         return false;
1216     }
1217
1218     int interpret()
1219     {
1220         for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1221             output[i] = -1;
1222
1223         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1224
1225         if (matchDisjunction(pattern->m_body.get(), context)) {
1226             output[0] = context->matchBegin;
1227             output[1] = context->matchEnd;
1228         }
1229
1230         freeDisjunctionContext(context);
1231
1232         return output[0];
1233     }
1234
1235     Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1236         : pattern(pattern)
1237         , output(output)
1238         , input(inputChar, start, length)
1239     {
1240     }
1241
1242 private:
1243     BytecodePattern *pattern;
1244     int* output;
1245     InputStream input;
1246 };
1247
1248
1249
1250 class ByteCompiler {
1251     struct ParenthesesStackEntry {
1252         unsigned beginTerm;
1253         unsigned savedAlternativeIndex;
1254         ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1255             : beginTerm(beginTerm)
1256             , savedAlternativeIndex(savedAlternativeIndex)
1257         {
1258         }
1259     };
1260
1261 public:
1262     ByteCompiler(RegexPattern& pattern)
1263         : m_pattern(pattern)
1264     {
1265         bodyDisjunction = 0;
1266         currentAlternativeIndex = 0;
1267     }
1268     
1269     BytecodePattern* compile()
1270     {
1271         regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
1272         emitDisjunction(m_pattern.m_body);
1273         regexEnd();
1274
1275         return new BytecodePattern(bodyDisjunction, m_allParenthesesInfo, m_pattern);
1276     }
1277     
1278     void checkInput(unsigned count)
1279     {
1280         bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1281     }
1282
1283     void assertionBOL(int inputPosition)
1284     {
1285         bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1286     }
1287
1288     void assertionEOL(int inputPosition)
1289     {
1290         bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1291     }
1292
1293     void assertionWordBoundary(bool invert, int inputPosition)
1294     {
1295         bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1296     }
1297
1298     void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1299     {
1300         if (m_pattern.m_ignoreCase) {
1301             UChar lo = Unicode::toLower(ch);
1302             UChar hi = Unicode::toUpper(ch);
1303             
1304             if (lo != hi) {
1305                 bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1306                 return;
1307             }
1308         }
1309
1310         bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1311     }
1312     
1313     void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1314     {
1315         bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1316
1317         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1318         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1319         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1320     }
1321
1322     void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1323     {
1324         ASSERT(subpatternId);
1325
1326         bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1327
1328         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1329         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1330         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1331     }
1332
1333     void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1334     {
1335         int beginTerm = bodyDisjunction->terms.size();
1336
1337         bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1338         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1339         bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1340         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1341
1342         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, currentAlternativeIndex));
1343         currentAlternativeIndex = beginTerm + 1;
1344     }
1345
1346     void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1347     {
1348         int beginTerm = bodyDisjunction->terms.size();
1349
1350         bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1351         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1352         bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1353         bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1354
1355         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, currentAlternativeIndex));
1356         currentAlternativeIndex = beginTerm + 1;
1357     }
1358
1359     unsigned popParenthesesStack()
1360     {
1361         ASSERT(m_parenthesesStack.size());
1362         int stackEnd = m_parenthesesStack.size() - 1;
1363         unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1364         currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1365         m_parenthesesStack.shrink(stackEnd);
1366
1367         ASSERT(beginTerm < bodyDisjunction->terms.size());
1368         ASSERT(currentAlternativeIndex < bodyDisjunction->terms.size());
1369         
1370         return beginTerm;
1371     }
1372
1373 #ifndef NDEBUG
1374     void dumpDisjunction(ByteDisjunction* disjunction)
1375     {
1376         printf("ByteDisjunction(%p):\n\t", disjunction);
1377         for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1378             printf("{ %d } ", disjunction->terms[i].type);
1379         printf("\n");
1380     }
1381 #endif
1382
1383     void closeAlternative(int beginTerm)
1384     {
1385         int origBeginTerm = beginTerm;
1386         ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1387         int endIndex = bodyDisjunction->terms.size();
1388
1389         unsigned frameLocation = bodyDisjunction->terms[beginTerm].frameLocation;
1390
1391         if (!bodyDisjunction->terms[beginTerm].alternative.next)
1392             bodyDisjunction->terms.remove(beginTerm);
1393         else {
1394             while (bodyDisjunction->terms[beginTerm].alternative.next) {
1395                 beginTerm += bodyDisjunction->terms[beginTerm].alternative.next;
1396                 ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1397                 bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1398                 bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1399             }
1400             
1401             bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1402
1403             bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1404             bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1405         }
1406     }
1407
1408     void closeBodyAlternative()
1409     {
1410         int beginTerm = 0;
1411         int origBeginTerm = 0;
1412         ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1413         int endIndex = bodyDisjunction->terms.size();
1414
1415         unsigned frameLocation = bodyDisjunction->terms[beginTerm].frameLocation;
1416
1417         while (bodyDisjunction->terms[beginTerm].alternative.next) {
1418             beginTerm += bodyDisjunction->terms[beginTerm].alternative.next;
1419             ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1420             bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1421             bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1422         }
1423         
1424         bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1425
1426         bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1427         bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1428     }
1429
1430     void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1431     {
1432         unsigned beginTerm = popParenthesesStack();
1433         closeAlternative(beginTerm + 1);
1434         unsigned endTerm = bodyDisjunction->terms.size();
1435
1436         bool isAssertion = bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1437         bool invertOrCapture = bodyDisjunction->terms[beginTerm].invertOrCapture;
1438         unsigned subpatternId = bodyDisjunction->terms[beginTerm].atom.subpatternId;
1439
1440         bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1441         bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1442         bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1443         bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1444
1445         if (doInline) {
1446             bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1447             bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1448             bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1449             bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1450         } else {
1451             ByteTerm& parenthesesBegin = bodyDisjunction->terms[beginTerm];
1452             ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1453
1454             bool invertOrCapture = parenthesesBegin.invertOrCapture;
1455             unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1456
1457             unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1458             ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1459
1460             parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1461             for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1462                 parenthesesDisjunction->terms.append(bodyDisjunction->terms[termInParentheses]);
1463             parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1464
1465             bodyDisjunction->terms.shrink(beginTerm);
1466
1467             m_allParenthesesInfo.append(parenthesesDisjunction);
1468             bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1469
1470             bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1471             bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1472             bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1473         }
1474     }
1475
1476     void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
1477     {
1478         bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1479         bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
1480         bodyDisjunction->terms[0].frameLocation = 0;
1481         currentAlternativeIndex = 0;
1482     }
1483
1484     void regexEnd()
1485     {
1486         closeBodyAlternative();
1487     }
1488
1489     void alterantiveBodyDisjunction()
1490     {
1491         int newAlternativeIndex = bodyDisjunction->terms.size();
1492         bodyDisjunction->terms[currentAlternativeIndex].alternative.next = newAlternativeIndex - currentAlternativeIndex;
1493         bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
1494
1495         currentAlternativeIndex = newAlternativeIndex;
1496     }
1497
1498     void alterantiveDisjunction()
1499     {
1500         int newAlternativeIndex = bodyDisjunction->terms.size();
1501         bodyDisjunction->terms[currentAlternativeIndex].alternative.next = newAlternativeIndex - currentAlternativeIndex;
1502         bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1503
1504         currentAlternativeIndex = newAlternativeIndex;
1505     }
1506
1507     void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1508     {
1509         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1510             unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1511         
1512             if (alt) {
1513                 if (disjunction == m_pattern.m_body)
1514                     alterantiveBodyDisjunction();
1515                 else
1516                     alterantiveDisjunction();
1517             }
1518
1519             PatternAlternative* alternative = disjunction->m_alternatives[alt];
1520             unsigned minimumSize = alternative->m_minimumSize;
1521
1522             ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1523             unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1524             if (countToCheck)
1525                 checkInput(countToCheck);
1526             currentCountAlreadyChecked += countToCheck;
1527
1528             for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1529                 PatternTerm& term = alternative->m_terms[i];
1530
1531                 switch (term.type) {
1532                 case PatternTerm::TypeAssertionBOL:
1533                     assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1534                     break;
1535
1536                 case PatternTerm::TypeAssertionEOL:
1537                     assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1538                     break;
1539
1540                 case PatternTerm::TypeAssertionWordBoundary:
1541                     assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1542                     break;
1543
1544                 case PatternTerm::TypePatternCharacter:
1545                     atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1546                     break;
1547
1548                 case PatternTerm::TypeCharacterClass:
1549                     atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1550                     break;
1551
1552                 case PatternTerm::TypeBackReference:
1553                     atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1554                     break;
1555
1556                 case PatternTerm::TypeParenthesesSubpattern: {
1557                     unsigned disjunctionAlreadyCheckedCount = 0;
1558                     if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1559                         if (term.quantityType == QuantifierFixedCount) {
1560                             disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1561                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1562                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1563                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1564                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1565                         } else {
1566                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1567                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1568                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1569                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1570                         }
1571                     } else {
1572                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1573                         atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1574                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1575                         atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1576                     }
1577                     break;
1578                 }
1579
1580                 case PatternTerm::TypeParentheticalAssertion: {
1581                     unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1582                     
1583                     atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1584                     emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1585                     atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1586                     break;
1587                 }
1588                 }
1589             }
1590         }
1591     }
1592
1593 private:
1594     RegexPattern& m_pattern;
1595     ByteDisjunction* bodyDisjunction;
1596     unsigned currentAlternativeIndex;
1597     Vector<ParenthesesStackEntry> m_parenthesesStack;
1598     Vector<ByteDisjunction*> m_allParenthesesInfo;
1599 };
1600
1601
1602 BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
1603 {
1604     RegexPattern pattern(ignoreCase, multiline);
1605
1606     if ((error = compileRegex(patternString, pattern)))
1607         return 0;
1608
1609     numSubpatterns = pattern.m_numSubpatterns;
1610
1611     return ByteCompiler(pattern).compile();
1612 }
1613
1614 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1615 {
1616     return Interpreter(regex, output, input, start, length).interpret();
1617 }
1618
1619
1620 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1621 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1622 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1623 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1624 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1625 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1626 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1627
1628
1629 } }
1630
1631 #endif