2010-10-28 Peter Varga <pvarga@inf.u-szeged.hu>
[WebKit-https.git] / JavaScriptCore / yarr / RegexInterpreter.cpp
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "RegexInterpreter.h"
28
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31 #include <wtf/BumpPointerAllocator.h>
32
33 #ifndef NDEBUG
34 #include <stdio.h>
35 #endif
36
37 #if ENABLE(YARR)
38
39 using namespace WTF;
40
41 namespace JSC { namespace Yarr {
42
43 class Interpreter {
44 public:
45     struct ParenthesesDisjunctionContext;
46
47     struct BackTrackInfoPatternCharacter {
48         uintptr_t matchAmount;
49     };
50     struct BackTrackInfoCharacterClass {
51         uintptr_t matchAmount;
52     };
53     struct BackTrackInfoBackReference {
54         uintptr_t begin; // Not really needed for greedy quantifiers.
55         uintptr_t matchAmount; // Not really needed for fixed quantifiers.
56     };
57     struct BackTrackInfoAlternative {
58         uintptr_t offset;
59     };
60     struct BackTrackInfoParentheticalAssertion {
61         uintptr_t begin;
62     };
63     struct BackTrackInfoParenthesesOnce {
64         uintptr_t inParentheses;
65     };
66     struct BackTrackInfoParentheses {
67         uintptr_t matchAmount;
68         ParenthesesDisjunctionContext* lastContext;
69         uintptr_t prevBegin;
70         uintptr_t prevEnd;
71     };
72
73     static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
74     {
75         context->next = backTrack->lastContext;
76         backTrack->lastContext = context;
77         ++backTrack->matchAmount;
78     }
79
80     static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
81     {
82         ASSERT(backTrack->matchAmount);
83         ASSERT(backTrack->lastContext);
84         backTrack->lastContext = backTrack->lastContext->next;
85         --backTrack->matchAmount;
86     }
87
88     struct DisjunctionContext
89     {
90         DisjunctionContext()
91             : term(0)
92         {
93         }
94
95         void* operator new(size_t, void* where)
96         {
97             return where;
98         }
99
100         int term;
101         unsigned matchBegin;
102         unsigned matchEnd;
103         uintptr_t frame[1];
104     };
105
106     DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
107     {
108         size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
109         allocatorPool = allocatorPool->ensureCapacity(size);
110         if (!allocatorPool)
111             CRASH();
112         return new(allocatorPool->alloc(size)) DisjunctionContext();
113     }
114
115     void freeDisjunctionContext(DisjunctionContext* context)
116     {
117         allocatorPool = allocatorPool->dealloc(context);
118     }
119
120     struct ParenthesesDisjunctionContext
121     {
122         ParenthesesDisjunctionContext(int* output, ByteTerm& term)
123             : next(0)
124         {
125             unsigned firstSubpatternId = term.atom.subpatternId;
126             unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
127
128             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
129                 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
130                 output[(firstSubpatternId << 1) + i] = -1;
131             }
132
133             new(getDisjunctionContext(term)) DisjunctionContext();
134         }
135
136         void* operator new(size_t, void* where)
137         {
138             return where;
139         }
140
141         void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
142         {
143             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
144                 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
145         }
146
147         DisjunctionContext* getDisjunctionContext(ByteTerm& term)
148         {
149             return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
150         }
151
152         ParenthesesDisjunctionContext* next;
153         int subpatternBackup[1];
154     };
155
156     ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
157     {
158         size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
159         allocatorPool = allocatorPool->ensureCapacity(size);
160         if (!allocatorPool)
161             CRASH();
162         return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
163     }
164
165     void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
166     {
167         allocatorPool = allocatorPool->dealloc(context);
168     }
169
170     class InputStream {
171     public:
172         InputStream(const UChar* input, unsigned start, unsigned length)
173             : input(input)
174             , pos(start)
175             , length(length)
176         {
177         }
178
179         void next()
180         {
181             ++pos;
182         }
183
184         void rewind(unsigned amount)
185         {
186             ASSERT(pos >= amount);
187             pos -= amount;
188         }
189
190         int read()
191         {
192             ASSERT(pos < length);
193             if (pos < length)
194                 return input[pos];
195             return -1;
196         }
197
198         int readChecked(int position)
199         {
200             ASSERT(position < 0);
201             ASSERT((unsigned)-position <= pos);
202             unsigned p = pos + position;
203             ASSERT(p < length);
204             return input[p];
205         }
206
207         int reread(unsigned from)
208         {
209             ASSERT(from < length);
210             return input[from];
211         }
212
213         int prev()
214         {
215             ASSERT(!(pos > length));
216             if (pos && length)
217                 return input[pos - 1];
218             return -1;
219         }
220
221         unsigned getPos()
222         {
223             return pos;
224         }
225
226         void setPos(unsigned p)
227         {
228             pos = p;
229         }
230
231         bool atStart()
232         {
233             return pos == 0;
234         }
235
236         bool atEnd()
237         {
238             return pos == length;
239         }
240
241         bool checkInput(int count)
242         {
243             if ((pos + count) <= length) {
244                 pos += count;
245                 return true;
246             } else
247                 return false;
248         }
249
250         void uncheckInput(int count)
251         {
252             pos -= count;
253         }
254
255         bool atStart(int position)
256         {
257             return (pos + position) == 0;
258         }
259
260         bool atEnd(int position)
261         {
262             return (pos + position) == length;
263         }
264
265     private:
266         const UChar* input;
267         unsigned pos;
268         unsigned length;
269     };
270
271     bool testCharacterClass(CharacterClass* characterClass, int ch)
272     {
273         if (ch & 0xFF80) {
274             for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
275                 if (ch == characterClass->m_matchesUnicode[i])
276                     return true;
277             for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
278                 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
279                     return true;
280         } else {
281             for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
282                 if (ch == characterClass->m_matches[i])
283                     return true;
284             for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
285                 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
286                     return true;
287         }
288
289         return false;
290     }
291
292     bool checkCharacter(int testChar, int inputPosition)
293     {
294         return testChar == input.readChecked(inputPosition);
295     }
296
297     bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
298     {
299         int ch = input.readChecked(inputPosition);
300         return (loChar == ch) || (hiChar == ch);
301     }
302
303     bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
304     {
305         bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
306         return invert ? !match : match;
307     }
308
309     bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
310     {
311         int matchSize = matchEnd - matchBegin;
312
313         if (!input.checkInput(matchSize))
314             return false;
315
316         if (pattern->m_ignoreCase) {
317             for (int i = 0; i < matchSize; ++i) {
318                 int ch = input.reread(matchBegin + i);
319
320                 int lo = Unicode::toLower(ch);
321                 int hi = Unicode::toUpper(ch);
322
323                 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
324                     input.uncheckInput(matchSize);
325                     return false;
326                 }
327             }
328         } else {
329             for (int i = 0; i < matchSize; ++i) {
330                 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
331                     input.uncheckInput(matchSize);
332                     return false;
333                 }
334             }
335         }
336
337         return true;
338     }
339
340     bool matchAssertionBOL(ByteTerm& term)
341     {
342         return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
343     }
344
345     bool matchAssertionEOL(ByteTerm& term)
346     {
347         if (term.inputPosition)
348             return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
349         else
350             return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
351     }
352
353     bool matchAssertionWordBoundary(ByteTerm& term)
354     {
355         bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
356         bool readIsWordchar;
357         if (term.inputPosition)
358             readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
359         else
360             readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
361
362         bool wordBoundary = prevIsWordchar != readIsWordchar;
363         return term.invert() ? !wordBoundary : wordBoundary;
364     }
365
366     bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
367     {
368         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
369
370         switch (term.atom.quantityType) {
371         case QuantifierFixedCount:
372             break;
373
374         case QuantifierGreedy:
375             if (backTrack->matchAmount) {
376                 --backTrack->matchAmount;
377                 input.uncheckInput(1);
378                 return true;
379             }
380             break;
381
382         case QuantifierNonGreedy:
383             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
384                 ++backTrack->matchAmount;
385                 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
386                     return true;
387             }
388             input.uncheckInput(backTrack->matchAmount);
389             break;
390         }
391
392         return false;
393     }
394
395     bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
396     {
397         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
398
399         switch (term.atom.quantityType) {
400         case QuantifierFixedCount:
401             break;
402
403         case QuantifierGreedy:
404             if (backTrack->matchAmount) {
405                 --backTrack->matchAmount;
406                 input.uncheckInput(1);
407                 return true;
408             }
409             break;
410
411         case QuantifierNonGreedy:
412             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
413                 ++backTrack->matchAmount;
414                 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
415                     return true;
416             }
417             input.uncheckInput(backTrack->matchAmount);
418             break;
419         }
420
421         return false;
422     }
423
424     bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
425     {
426         ASSERT(term.type == ByteTerm::TypeCharacterClass);
427         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
428
429         switch (term.atom.quantityType) {
430         case QuantifierFixedCount: {
431             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
432                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
433                     return false;
434             }
435             return true;
436         }
437
438         case QuantifierGreedy: {
439             unsigned matchAmount = 0;
440             while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
441                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
442                     input.uncheckInput(1);
443                     break;
444                 }
445                 ++matchAmount;
446             }
447             backTrack->matchAmount = matchAmount;
448
449             return true;
450         }
451
452         case QuantifierNonGreedy:
453             backTrack->matchAmount = 0;
454             return true;
455         }
456
457         ASSERT_NOT_REACHED();
458         return false;
459     }
460
461     bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
462     {
463         ASSERT(term.type == ByteTerm::TypeCharacterClass);
464         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
465
466         switch (term.atom.quantityType) {
467         case QuantifierFixedCount:
468             break;
469
470         case QuantifierGreedy:
471             if (backTrack->matchAmount) {
472                 --backTrack->matchAmount;
473                 input.uncheckInput(1);
474                 return true;
475             }
476             break;
477
478         case QuantifierNonGreedy:
479             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
480                 ++backTrack->matchAmount;
481                 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
482                     return true;
483             }
484             input.uncheckInput(backTrack->matchAmount);
485             break;
486         }
487
488         return false;
489     }
490
491     bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
492     {
493         ASSERT(term.type == ByteTerm::TypeBackReference);
494         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
495
496         int matchBegin = output[(term.atom.subpatternId << 1)];
497         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
498
499         // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
500         // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
501         // Eg.: /(a\1)/
502         if (matchEnd == -1)
503             return true;
504
505         ASSERT((matchBegin == -1) == (matchEnd == -1));
506         ASSERT(matchBegin <= matchEnd);
507
508         if (matchBegin == matchEnd)
509             return true;
510
511         switch (term.atom.quantityType) {
512         case QuantifierFixedCount: {
513             backTrack->begin = input.getPos();
514             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
515                 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
516                     input.setPos(backTrack->begin);
517                     return false;
518                 }
519             }
520             return true;
521         }
522
523         case QuantifierGreedy: {
524             unsigned matchAmount = 0;
525             while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
526                 ++matchAmount;
527             backTrack->matchAmount = matchAmount;
528             return true;
529         }
530
531         case QuantifierNonGreedy:
532             backTrack->begin = input.getPos();
533             backTrack->matchAmount = 0;
534             return true;
535         }
536
537         ASSERT_NOT_REACHED();
538         return false;
539     }
540
541     bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
542     {
543         ASSERT(term.type == ByteTerm::TypeBackReference);
544         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
545
546         int matchBegin = output[(term.atom.subpatternId << 1)];
547         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
548         ASSERT((matchBegin == -1) == (matchEnd == -1));
549         ASSERT(matchBegin <= matchEnd);
550
551         if (matchBegin == matchEnd)
552             return false;
553
554         switch (term.atom.quantityType) {
555         case QuantifierFixedCount:
556             // for quantityCount == 1, could rewind.
557             input.setPos(backTrack->begin);
558             break;
559
560         case QuantifierGreedy:
561             if (backTrack->matchAmount) {
562                 --backTrack->matchAmount;
563                 input.rewind(matchEnd - matchBegin);
564                 return true;
565             }
566             break;
567
568         case QuantifierNonGreedy:
569             if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
570                 ++backTrack->matchAmount;
571                 return true;
572             } else
573                 input.setPos(backTrack->begin);
574             break;
575         }
576
577         return false;
578     }
579
580     void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
581     {
582         if (term.capture()) {
583             unsigned subpatternId = term.atom.subpatternId;
584             output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
585             output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
586         }
587     }
588     void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
589     {
590         unsigned firstSubpatternId = term.atom.subpatternId;
591         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
592         context->restoreOutput(output, firstSubpatternId, count);
593     }
594     bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
595     {
596         while (backTrack->matchAmount) {
597             ParenthesesDisjunctionContext* context = backTrack->lastContext;
598
599             if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
600                 return true;
601
602             resetMatches(term, context);
603             popParenthesesDisjunctionContext(backTrack);
604             freeParenthesesDisjunctionContext(context);
605         }
606
607         return false;
608     }
609
610     bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
611     {
612         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
613         ASSERT(term.atom.quantityCount == 1);
614
615         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
616
617         switch (term.atom.quantityType) {
618         case QuantifierGreedy: {
619             // set this speculatively; if we get to the parens end this will be true.
620             backTrack->inParentheses = 1;
621             break;
622         }
623         case QuantifierNonGreedy: {
624             backTrack->inParentheses = 0;
625             context->term += term.atom.parenthesesWidth;
626             return true;
627         }
628         case QuantifierFixedCount:
629             break;
630         }
631
632         if (term.capture()) {
633             unsigned subpatternId = term.atom.subpatternId;
634             output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
635         }
636
637         return true;
638     }
639
640     bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
641     {
642         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
643         ASSERT(term.atom.quantityCount == 1);
644
645         if (term.capture()) {
646             unsigned subpatternId = term.atom.subpatternId;
647             output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
648         }
649         return true;
650     }
651
652     bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
653     {
654         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
655         ASSERT(term.atom.quantityCount == 1);
656
657         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
658
659         if (term.capture()) {
660             unsigned subpatternId = term.atom.subpatternId;
661             output[(subpatternId << 1)] = -1;
662             output[(subpatternId << 1) + 1] = -1;
663         }
664
665         switch (term.atom.quantityType) {
666         case QuantifierGreedy:
667             // if we backtrack to this point, there is another chance - try matching nothing.
668             ASSERT(backTrack->inParentheses);
669             backTrack->inParentheses = 0;
670             context->term += term.atom.parenthesesWidth;
671             return true;
672         case QuantifierNonGreedy:
673             ASSERT(backTrack->inParentheses);
674         case QuantifierFixedCount:
675             break;
676         }
677
678         return false;
679     }
680
681     bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
682     {
683         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
684         ASSERT(term.atom.quantityCount == 1);
685
686         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
687
688         switch (term.atom.quantityType) {
689         case QuantifierGreedy:
690             if (!backTrack->inParentheses) {
691                 context->term -= term.atom.parenthesesWidth;
692                 return false;
693             }
694         case QuantifierNonGreedy:
695             if (!backTrack->inParentheses) {
696                 // now try to match the parens; set this speculatively.
697                 backTrack->inParentheses = 1;
698                 if (term.capture()) {
699                     unsigned subpatternId = term.atom.subpatternId;
700                     output[subpatternId << 1] = input.getPos() + term.inputPosition;
701                 }
702                 context->term -= term.atom.parenthesesWidth;
703                 return true;
704             }
705         case QuantifierFixedCount:
706             break;
707         }
708
709         return false;
710     }
711
712     bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
713     {
714         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
715         ASSERT(term.atom.quantityCount == 1);
716
717         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
718
719         backTrack->begin = input.getPos();
720         return true;
721     }
722
723     bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
724     {
725         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
726         ASSERT(term.atom.quantityCount == 1);
727
728         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
729
730         input.setPos(backTrack->begin);
731
732         // We've reached the end of the parens; if they are inverted, this is failure.
733         if (term.invert()) {
734             context->term -= term.atom.parenthesesWidth;
735             return false;
736         }
737
738         return true;
739     }
740
741     bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
742     {
743         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
744         ASSERT(term.atom.quantityCount == 1);
745
746         // We've failed to match parens; if they are inverted, this is win!
747         if (term.invert()) {
748             context->term += term.atom.parenthesesWidth;
749             return true;
750         }
751
752         return false;
753     }
754
755     bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
756     {
757         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
758         ASSERT(term.atom.quantityCount == 1);
759
760         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
761
762         input.setPos(backTrack->begin);
763
764         context->term -= term.atom.parenthesesWidth;
765         return false;
766     }
767
768     bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
769     {
770         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
771
772         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
773
774         unsigned subpatternId = term.atom.subpatternId;
775         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
776
777         backTrack->prevBegin = output[(subpatternId << 1)];
778         backTrack->prevEnd = output[(subpatternId << 1) + 1];
779
780         backTrack->matchAmount = 0;
781         backTrack->lastContext = 0;
782
783         switch (term.atom.quantityType) {
784         case QuantifierFixedCount: {
785             // While we haven't yet reached our fixed limit,
786             while (backTrack->matchAmount < term.atom.quantityCount) {
787                 // Try to do a match, and it it succeeds, add it to the list.
788                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
789                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
790                     appendParenthesesDisjunctionContext(backTrack, context);
791                 else {
792                     // The match failed; try to find an alternate point to carry on from.
793                     resetMatches(term, context);
794                     freeParenthesesDisjunctionContext(context);
795                     if (!parenthesesDoBacktrack(term, backTrack))
796                         return false;
797                 }
798             }
799
800             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
801             ParenthesesDisjunctionContext* context = backTrack->lastContext;
802             recordParenthesesMatch(term, context);
803             return true;
804         }
805
806         case QuantifierGreedy: {
807             while (backTrack->matchAmount < term.atom.quantityCount) {
808                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
809                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
810                     appendParenthesesDisjunctionContext(backTrack, context);
811                 else {
812                     resetMatches(term, context);
813                     freeParenthesesDisjunctionContext(context);
814                     break;
815                 }
816             }
817
818             if (backTrack->matchAmount) {
819                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
820                 recordParenthesesMatch(term, context);
821             }
822             return true;
823         }
824
825         case QuantifierNonGreedy:
826             return true;
827         }
828
829         ASSERT_NOT_REACHED();
830         return false;
831     }
832
833     // Rules for backtracking differ depending on whether this is greedy or non-greedy.
834     //
835     // Greedy matches never should try just adding more - you should already have done
836     // the 'more' cases.  Always backtrack, at least a leetle bit.  However cases where
837     // you backtrack an item off the list needs checking, since we'll never have matched
838     // the one less case.  Tracking forwards, still add as much as possible.
839     //
840     // Non-greedy, we've already done the one less case, so don't match on popping.
841     // We haven't done the one more case, so always try to add that.
842     //
843     bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
844     {
845         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
846
847         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
848
849         if (term.capture()) {
850             unsigned subpatternId = term.atom.subpatternId;
851             output[(subpatternId << 1)] = backTrack->prevBegin;
852             output[(subpatternId << 1) + 1] = backTrack->prevEnd;
853         }
854
855         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
856
857         switch (term.atom.quantityType) {
858         case QuantifierFixedCount: {
859             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
860
861             ParenthesesDisjunctionContext* context = 0;
862
863             if (!parenthesesDoBacktrack(term, backTrack))
864                 return false;
865
866             // While we haven't yet reached our fixed limit,
867             while (backTrack->matchAmount < term.atom.quantityCount) {
868                 // Try to do a match, and it it succeeds, add it to the list.
869                 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
870                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
871                     appendParenthesesDisjunctionContext(backTrack, context);
872                 else {
873                     // The match failed; try to find an alternate point to carry on from.
874                     resetMatches(term, context);
875                     freeParenthesesDisjunctionContext(context);
876                     if (!parenthesesDoBacktrack(term, backTrack))
877                         return false;
878                 }
879             }
880
881             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
882             context = backTrack->lastContext;
883             recordParenthesesMatch(term, context);
884             return true;
885         }
886
887         case QuantifierGreedy: {
888             if (!backTrack->matchAmount)
889                 return false;
890
891             ParenthesesDisjunctionContext* context = backTrack->lastContext;
892             if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
893                 while (backTrack->matchAmount < term.atom.quantityCount) {
894                     ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
895                     if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
896                         appendParenthesesDisjunctionContext(backTrack, context);
897                     else {
898                         resetMatches(term, context);
899                         freeParenthesesDisjunctionContext(context);
900                         break;
901                     }
902                 }
903             } else {
904                 resetMatches(term, context);
905                 popParenthesesDisjunctionContext(backTrack);
906                 freeParenthesesDisjunctionContext(context);
907             }
908
909             if (backTrack->matchAmount) {
910                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
911                 recordParenthesesMatch(term, context);
912             }
913             return true;
914         }
915
916         case QuantifierNonGreedy: {
917             // If we've not reached the limit, try to add one more match.
918             if (backTrack->matchAmount < term.atom.quantityCount) {
919                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
920                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
921                     appendParenthesesDisjunctionContext(backTrack, context);
922                     recordParenthesesMatch(term, context);
923                     return true;
924                 } else {
925                     resetMatches(term, context);
926                     freeParenthesesDisjunctionContext(context);
927                 }
928             }
929
930             // Nope - okay backtrack looking for an alternative.
931             while (backTrack->matchAmount) {
932                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
933                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
934                     // successful backtrack! we're back in the game!
935                     if (backTrack->matchAmount) {
936                         context = backTrack->lastContext;
937                         recordParenthesesMatch(term, context);
938                     }
939                     return true;
940                 }
941
942                 // pop a match off the stack
943                 resetMatches(term, context);
944                 popParenthesesDisjunctionContext(backTrack);
945                 freeParenthesesDisjunctionContext(context);
946             }
947
948             return false;
949         }
950         }
951
952         ASSERT_NOT_REACHED();
953         return false;
954     }
955
956 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
957 #define BACKTRACK() { --context->term; goto backtrack; }
958 #define currentTerm() (disjunction->terms[context->term])
959     bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
960     {
961         if (btrack)
962             BACKTRACK();
963
964         context->matchBegin = input.getPos();
965         context->term = 0;
966
967     matchAgain:
968         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
969
970         switch (currentTerm().type) {
971         case ByteTerm::TypeSubpatternBegin:
972             MATCH_NEXT();
973         case ByteTerm::TypeSubpatternEnd:
974             context->matchEnd = input.getPos();
975             return true;
976
977         case ByteTerm::TypeBodyAlternativeBegin:
978             MATCH_NEXT();
979         case ByteTerm::TypeBodyAlternativeDisjunction:
980         case ByteTerm::TypeBodyAlternativeEnd:
981             context->matchEnd = input.getPos();
982             return true;
983
984         case ByteTerm::TypeAlternativeBegin:
985             MATCH_NEXT();
986         case ByteTerm::TypeAlternativeDisjunction:
987         case ByteTerm::TypeAlternativeEnd: {
988             int offset = currentTerm().alternative.end;
989             BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
990             backTrack->offset = offset;
991             context->term += offset;
992             MATCH_NEXT();
993         }
994
995         case ByteTerm::TypeAssertionBOL:
996             if (matchAssertionBOL(currentTerm()))
997                 MATCH_NEXT();
998             BACKTRACK();
999         case ByteTerm::TypeAssertionEOL:
1000             if (matchAssertionEOL(currentTerm()))
1001                 MATCH_NEXT();
1002             BACKTRACK();
1003         case ByteTerm::TypeAssertionWordBoundary:
1004             if (matchAssertionWordBoundary(currentTerm()))
1005                 MATCH_NEXT();
1006             BACKTRACK();
1007
1008         case ByteTerm::TypePatternCharacterOnce:
1009         case ByteTerm::TypePatternCharacterFixed: {
1010             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1011                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1012                     BACKTRACK();
1013             }
1014             MATCH_NEXT();
1015         }
1016         case ByteTerm::TypePatternCharacterGreedy: {
1017             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1018             unsigned matchAmount = 0;
1019             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1020                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1021                     input.uncheckInput(1);
1022                     break;
1023                 }
1024                 ++matchAmount;
1025             }
1026             backTrack->matchAmount = matchAmount;
1027
1028             MATCH_NEXT();
1029         }
1030         case ByteTerm::TypePatternCharacterNonGreedy: {
1031             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1032             backTrack->matchAmount = 0;
1033             MATCH_NEXT();
1034         }
1035
1036         case ByteTerm::TypePatternCasedCharacterOnce:
1037         case ByteTerm::TypePatternCasedCharacterFixed: {
1038             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1039                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1040                     BACKTRACK();
1041             }
1042             MATCH_NEXT();
1043         }
1044         case ByteTerm::TypePatternCasedCharacterGreedy: {
1045             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1046             unsigned matchAmount = 0;
1047             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1048                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1049                     input.uncheckInput(1);
1050                     break;
1051                 }
1052                 ++matchAmount;
1053             }
1054             backTrack->matchAmount = matchAmount;
1055
1056             MATCH_NEXT();
1057         }
1058         case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1059             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1060             backTrack->matchAmount = 0;
1061             MATCH_NEXT();
1062         }
1063
1064         case ByteTerm::TypeCharacterClass:
1065             if (matchCharacterClass(currentTerm(), context))
1066                 MATCH_NEXT();
1067             BACKTRACK();
1068         case ByteTerm::TypeBackReference:
1069             if (matchBackReference(currentTerm(), context))
1070                 MATCH_NEXT();
1071             BACKTRACK();
1072         case ByteTerm::TypeParenthesesSubpattern:
1073             if (matchParentheses(currentTerm(), context))
1074                 MATCH_NEXT();
1075             BACKTRACK();
1076         case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1077             if (matchParenthesesOnceBegin(currentTerm(), context))
1078                 MATCH_NEXT();
1079             BACKTRACK();
1080         case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1081             if (matchParenthesesOnceEnd(currentTerm(), context))
1082                 MATCH_NEXT();
1083             BACKTRACK();
1084         case ByteTerm::TypeParentheticalAssertionBegin:
1085             if (matchParentheticalAssertionBegin(currentTerm(), context))
1086                 MATCH_NEXT();
1087             BACKTRACK();
1088         case ByteTerm::TypeParentheticalAssertionEnd:
1089             if (matchParentheticalAssertionEnd(currentTerm(), context))
1090                 MATCH_NEXT();
1091             BACKTRACK();
1092
1093         case ByteTerm::TypeCheckInput:
1094             if (input.checkInput(currentTerm().checkInputCount))
1095                 MATCH_NEXT();
1096             BACKTRACK();
1097         }
1098
1099         // We should never fall-through to here.
1100         ASSERT_NOT_REACHED();
1101
1102     backtrack:
1103         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1104
1105         switch (currentTerm().type) {
1106         case ByteTerm::TypeSubpatternBegin:
1107             return false;
1108         case ByteTerm::TypeSubpatternEnd:
1109             ASSERT_NOT_REACHED();
1110
1111         case ByteTerm::TypeBodyAlternativeBegin:
1112         case ByteTerm::TypeBodyAlternativeDisjunction: {
1113             int offset = currentTerm().alternative.next;
1114             context->term += offset;
1115             if (offset > 0)
1116                 MATCH_NEXT();
1117
1118             if (input.atEnd())
1119                 return false;
1120
1121             input.next();
1122             context->matchBegin = input.getPos();
1123
1124             if (currentTerm().alternative.onceThrough)
1125                 context->term += currentTerm().alternative.next;
1126
1127             MATCH_NEXT();
1128         }
1129         case ByteTerm::TypeBodyAlternativeEnd:
1130             ASSERT_NOT_REACHED();
1131
1132             case ByteTerm::TypeAlternativeBegin:
1133             case ByteTerm::TypeAlternativeDisjunction: {
1134                 int offset = currentTerm().alternative.next;
1135                 context->term += offset;
1136                 if (offset > 0)
1137                     MATCH_NEXT();
1138                 BACKTRACK();
1139             }
1140             case ByteTerm::TypeAlternativeEnd: {
1141                 // We should never backtrack back into an alternative of the main body of the regex.
1142                 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1143                 unsigned offset = backTrack->offset;
1144                 context->term -= offset;
1145                 BACKTRACK();
1146             }
1147
1148             case ByteTerm::TypeAssertionBOL:
1149             case ByteTerm::TypeAssertionEOL:
1150             case ByteTerm::TypeAssertionWordBoundary:
1151                 BACKTRACK();
1152
1153             case ByteTerm::TypePatternCharacterOnce:
1154             case ByteTerm::TypePatternCharacterFixed:
1155             case ByteTerm::TypePatternCharacterGreedy:
1156             case ByteTerm::TypePatternCharacterNonGreedy:
1157                 if (backtrackPatternCharacter(currentTerm(), context))
1158                     MATCH_NEXT();
1159                 BACKTRACK();
1160             case ByteTerm::TypePatternCasedCharacterOnce:
1161             case ByteTerm::TypePatternCasedCharacterFixed:
1162             case ByteTerm::TypePatternCasedCharacterGreedy:
1163             case ByteTerm::TypePatternCasedCharacterNonGreedy:
1164                 if (backtrackPatternCasedCharacter(currentTerm(), context))
1165                     MATCH_NEXT();
1166                 BACKTRACK();
1167             case ByteTerm::TypeCharacterClass:
1168                 if (backtrackCharacterClass(currentTerm(), context))
1169                     MATCH_NEXT();
1170                 BACKTRACK();
1171             case ByteTerm::TypeBackReference:
1172                 if (backtrackBackReference(currentTerm(), context))
1173                     MATCH_NEXT();
1174                 BACKTRACK();
1175             case ByteTerm::TypeParenthesesSubpattern:
1176                 if (backtrackParentheses(currentTerm(), context))
1177                     MATCH_NEXT();
1178                 BACKTRACK();
1179             case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1180                 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1181                     MATCH_NEXT();
1182                 BACKTRACK();
1183             case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1184                 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1185                     MATCH_NEXT();
1186                 BACKTRACK();
1187             case ByteTerm::TypeParentheticalAssertionBegin:
1188                 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1189                     MATCH_NEXT();
1190                 BACKTRACK();
1191             case ByteTerm::TypeParentheticalAssertionEnd:
1192                 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1193                     MATCH_NEXT();
1194                 BACKTRACK();
1195
1196             case ByteTerm::TypeCheckInput:
1197                 input.uncheckInput(currentTerm().checkInputCount);
1198                 BACKTRACK();
1199         }
1200
1201         ASSERT_NOT_REACHED();
1202         return false;
1203     }
1204
1205     bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1206     {
1207         if (matchDisjunction(disjunction, context, btrack)) {
1208             while (context->matchBegin == context->matchEnd) {
1209                 if (!matchDisjunction(disjunction, context, true))
1210                     return false;
1211             }
1212             return true;
1213         }
1214
1215         return false;
1216     }
1217
1218     int interpret()
1219     {
1220         allocatorPool = pattern->m_allocator->startAllocator();
1221         if (!allocatorPool)
1222             CRASH();
1223
1224         for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1225             output[i] = -1;
1226
1227         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1228
1229         if (matchDisjunction(pattern->m_body.get(), context)) {
1230             output[0] = context->matchBegin;
1231             output[1] = context->matchEnd;
1232         }
1233
1234         freeDisjunctionContext(context);
1235
1236         pattern->m_allocator->stopAllocator();
1237
1238         return output[0];
1239     }
1240
1241     Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1242         : pattern(pattern)
1243         , output(output)
1244         , input(inputChar, start, length)
1245         , allocatorPool(0)
1246     {
1247     }
1248
1249 private:
1250     BytecodePattern *pattern;
1251     int* output;
1252     InputStream input;
1253     BumpPointerPool* allocatorPool;
1254 };
1255
1256
1257
1258 class ByteCompiler {
1259     struct ParenthesesStackEntry {
1260         unsigned beginTerm;
1261         unsigned savedAlternativeIndex;
1262         ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1263             : beginTerm(beginTerm)
1264             , savedAlternativeIndex(savedAlternativeIndex)
1265         {
1266         }
1267     };
1268
1269 public:
1270     ByteCompiler(RegexPattern& pattern)
1271         : m_pattern(pattern)
1272     {
1273         m_currentAlternativeIndex = 0;
1274     }
1275
1276     PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1277     {
1278         regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1279         emitDisjunction(m_pattern.m_body);
1280         regexEnd();
1281
1282         return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1283     }
1284
1285     void checkInput(unsigned count)
1286     {
1287         m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1288     }
1289
1290     void assertionBOL(int inputPosition)
1291     {
1292         m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1293     }
1294
1295     void assertionEOL(int inputPosition)
1296     {
1297         m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1298     }
1299
1300     void assertionWordBoundary(bool invert, int inputPosition)
1301     {
1302         m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1303     }
1304
1305     void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1306     {
1307         if (m_pattern.m_ignoreCase) {
1308             UChar lo = Unicode::toLower(ch);
1309             UChar hi = Unicode::toUpper(ch);
1310
1311             if (lo != hi) {
1312                 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1313                 return;
1314             }
1315         }
1316
1317         m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1318     }
1319
1320     void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1321     {
1322         m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1323
1324         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1325         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1326         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1327     }
1328
1329     void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1330     {
1331         ASSERT(subpatternId);
1332
1333         m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1334
1335         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1336         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1337         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1338     }
1339
1340     void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1341     {
1342         int beginTerm = m_bodyDisjunction->terms.size();
1343
1344         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1345         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1346         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1347         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1348
1349         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1350         m_currentAlternativeIndex = beginTerm + 1;
1351     }
1352
1353     void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1354     {
1355         int beginTerm = m_bodyDisjunction->terms.size();
1356
1357         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1358         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1359         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1360         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1361
1362         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1363         m_currentAlternativeIndex = beginTerm + 1;
1364     }
1365
1366     unsigned popParenthesesStack()
1367     {
1368         ASSERT(m_parenthesesStack.size());
1369         int stackEnd = m_parenthesesStack.size() - 1;
1370         unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1371         m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1372         m_parenthesesStack.shrink(stackEnd);
1373
1374         ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1375         ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1376
1377         return beginTerm;
1378     }
1379
1380 #ifndef NDEBUG
1381     void dumpDisjunction(ByteDisjunction* disjunction)
1382     {
1383         printf("ByteDisjunction(%p):\n\t", disjunction);
1384         for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1385             printf("{ %d } ", disjunction->terms[i].type);
1386         printf("\n");
1387     }
1388 #endif
1389
1390     void closeAlternative(int beginTerm)
1391     {
1392         int origBeginTerm = beginTerm;
1393         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1394         int endIndex = m_bodyDisjunction->terms.size();
1395
1396         unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1397
1398         if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1399             m_bodyDisjunction->terms.remove(beginTerm);
1400         else {
1401             while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1402                 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1403                 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1404                 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1405                 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1406             }
1407
1408             m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1409
1410             m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1411             m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1412         }
1413     }
1414
1415     void closeBodyAlternative()
1416     {
1417         int beginTerm = 0;
1418         int origBeginTerm = 0;
1419         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1420         int endIndex = m_bodyDisjunction->terms.size();
1421
1422         unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1423
1424         while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1425             beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1426             ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1427             m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1428             m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1429         }
1430
1431         m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1432
1433         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1434         m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1435     }
1436
1437     void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1438     {
1439         unsigned beginTerm = popParenthesesStack();
1440         closeAlternative(beginTerm + 1);
1441         unsigned endTerm = m_bodyDisjunction->terms.size();
1442
1443         bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1444         bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
1445         unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1446
1447         m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1448         m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1449         m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1450         m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1451
1452         if (doInline) {
1453             m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1454             m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1455             m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1456             m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1457         } else {
1458             ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1459             ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1460
1461             bool invertOrCapture = parenthesesBegin.invertOrCapture;
1462             unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1463
1464             unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1465             ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1466
1467             parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1468             for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1469                 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1470             parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1471
1472             m_bodyDisjunction->terms.shrink(beginTerm);
1473
1474             m_allParenthesesInfo.append(parenthesesDisjunction);
1475             m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1476
1477             m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1478             m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1479             m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1480         }
1481     }
1482
1483     void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1484     {
1485         m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1486         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1487         m_bodyDisjunction->terms[0].frameLocation = 0;
1488         m_currentAlternativeIndex = 0;
1489     }
1490
1491     void regexEnd()
1492     {
1493         closeBodyAlternative();
1494     }
1495
1496     void alternativeBodyDisjunction(bool onceThrough)
1497     {
1498         int newAlternativeIndex = m_bodyDisjunction->terms.size();
1499         m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1500         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1501
1502         m_currentAlternativeIndex = newAlternativeIndex;
1503     }
1504
1505     void alternativeDisjunction()
1506     {
1507         int newAlternativeIndex = m_bodyDisjunction->terms.size();
1508         m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1509         m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1510
1511         m_currentAlternativeIndex = newAlternativeIndex;
1512     }
1513
1514     void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
1515     {
1516         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1517             unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1518
1519             PatternAlternative* alternative = disjunction->m_alternatives[alt];
1520
1521             if (alt) {
1522                 if (disjunction == m_pattern.m_body)
1523                     alternativeBodyDisjunction(alternative->onceThrough());
1524                 else
1525                     alternativeDisjunction();
1526             }
1527
1528             unsigned minimumSize = alternative->m_minimumSize;
1529             int countToCheck;
1530
1531             if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
1532                 countToCheck = 0;
1533             else
1534                 countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1535
1536             ASSERT(countToCheck >= 0);
1537             if (countToCheck) {
1538                 checkInput(countToCheck);
1539                 currentCountAlreadyChecked += countToCheck;
1540             }
1541
1542             for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1543                 PatternTerm& term = alternative->m_terms[i];
1544
1545                 switch (term.type) {
1546                 case PatternTerm::TypeAssertionBOL:
1547                     assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1548                     break;
1549
1550                 case PatternTerm::TypeAssertionEOL:
1551                     assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1552                     break;
1553
1554                 case PatternTerm::TypeAssertionWordBoundary:
1555                     assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1556                     break;
1557
1558                 case PatternTerm::TypePatternCharacter:
1559                     atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1560                     break;
1561
1562                 case PatternTerm::TypeCharacterClass:
1563                     atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1564                     break;
1565
1566                 case PatternTerm::TypeBackReference:
1567                     atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1568                         break;
1569
1570                 case PatternTerm::TypeForwardReference:
1571                     break;
1572
1573                 case PatternTerm::TypeParenthesesSubpattern: {
1574                     unsigned disjunctionAlreadyCheckedCount = 0;
1575                     if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1576                         if (term.quantityType == QuantifierFixedCount) {
1577                             disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1578                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1579                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1580                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1581                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1582                         } else {
1583                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1584                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1585                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1586                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1587                         }
1588                     } else {
1589                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1590                         atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1591                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1592                         atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1593                     }
1594                     break;
1595                 }
1596
1597                 case PatternTerm::TypeParentheticalAssertion: {
1598                     unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1599
1600                     ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition);
1601                     int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
1602
1603                     atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1604                     emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
1605                     atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1606                     break;
1607                 }
1608                 }
1609             }
1610         }
1611     }
1612
1613 private:
1614     RegexPattern& m_pattern;
1615     OwnPtr<ByteDisjunction> m_bodyDisjunction;
1616     unsigned m_currentAlternativeIndex;
1617     Vector<ParenthesesStackEntry> m_parenthesesStack;
1618     Vector<ByteDisjunction*> m_allParenthesesInfo;
1619 };
1620
1621
1622 PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline)
1623 {
1624     RegexPattern pattern(ignoreCase, multiline);
1625
1626     if ((error = compileRegex(patternString, pattern)))
1627         return PassOwnPtr<BytecodePattern>();
1628
1629     numSubpatterns = pattern.m_numSubpatterns;
1630
1631     return ByteCompiler(pattern).compile(allocator);
1632 }
1633
1634 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1635 {
1636     return Interpreter(regex, output, input, start, length).interpret();
1637 }
1638
1639
1640 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1641 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1642 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1643 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1644 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1645 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1646 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1647
1648
1649 } }
1650
1651 #endif