33f3c89220e57db40b6855ab0f35c7f9d6f8281a
[WebKit-https.git] / JavaScriptCore / yarr / RegexInterpreter.cpp
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "RegexInterpreter.h"
28
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31 #include <wtf/BumpPointerAllocator.h>
32
33 #ifndef NDEBUG
34 #include <stdio.h>
35 #endif
36
37 #if ENABLE(YARR)
38
39 using namespace WTF;
40
41 namespace JSC { namespace Yarr {
42
43 class Interpreter {
44 public:
45     struct ParenthesesDisjunctionContext;
46
47     struct BackTrackInfoPatternCharacter {
48         uintptr_t matchAmount;
49     };
50     struct BackTrackInfoCharacterClass {
51         uintptr_t matchAmount;
52     };
53     struct BackTrackInfoBackReference {
54         uintptr_t begin; // Not really needed for greedy quantifiers.
55         uintptr_t matchAmount; // Not really needed for fixed quantifiers.
56     };
57     struct BackTrackInfoAlternative {
58         uintptr_t offset;
59     };
60     struct BackTrackInfoParentheticalAssertion {
61         uintptr_t begin;
62     };
63     struct BackTrackInfoParenthesesOnce {
64         uintptr_t inParentheses;
65     };
66     struct BackTrackInfoParentheses {
67         uintptr_t matchAmount;
68         ParenthesesDisjunctionContext* lastContext;
69         uintptr_t prevBegin;
70         uintptr_t prevEnd;
71     };
72
73     static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
74     {
75         context->next = backTrack->lastContext;
76         backTrack->lastContext = context;
77         ++backTrack->matchAmount;
78     }
79
80     static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
81     {
82         ASSERT(backTrack->matchAmount);
83         ASSERT(backTrack->lastContext);
84         backTrack->lastContext = backTrack->lastContext->next;
85         --backTrack->matchAmount;
86     }
87
88     struct DisjunctionContext
89     {
90         DisjunctionContext()
91             : term(0)
92         {
93         }
94
95         void* operator new(size_t, void* where)
96         {
97             return where;
98         }
99
100         int term;
101         unsigned matchBegin;
102         unsigned matchEnd;
103         uintptr_t frame[1];
104     };
105
106     DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
107     {
108         size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
109         allocatorPool = allocatorPool->ensureCapacity(size);
110         if (!allocatorPool)
111             CRASH();
112         return new(allocatorPool->alloc(size)) DisjunctionContext();
113     }
114
115     void freeDisjunctionContext(DisjunctionContext* context)
116     {
117         allocatorPool = allocatorPool->dealloc(context);
118     }
119
120     struct ParenthesesDisjunctionContext
121     {
122         ParenthesesDisjunctionContext(int* output, ByteTerm& term)
123             : next(0)
124         {
125             unsigned firstSubpatternId = term.atom.subpatternId;
126             unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
127
128             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
129                 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
130                 output[(firstSubpatternId << 1) + i] = -1;
131             }
132
133             new(getDisjunctionContext(term)) DisjunctionContext();
134         }
135
136         void* operator new(size_t, void* where)
137         {
138             return where;
139         }
140
141         void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
142         {
143             for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
144                 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
145         }
146
147         DisjunctionContext* getDisjunctionContext(ByteTerm& term)
148         {
149             return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
150         }
151
152         ParenthesesDisjunctionContext* next;
153         int subpatternBackup[1];
154     };
155
156     ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
157     {
158         size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
159         allocatorPool = allocatorPool->ensureCapacity(size);
160         if (!allocatorPool)
161             CRASH();
162         return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
163     }
164
165     void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
166     {
167         allocatorPool = allocatorPool->dealloc(context);
168     }
169
170     class InputStream {
171     public:
172         InputStream(const UChar* input, unsigned start, unsigned length)
173             : input(input)
174             , pos(start)
175             , length(length)
176         {
177         }
178
179         void next()
180         {
181             ++pos;
182         }
183
184         void rewind(unsigned amount)
185         {
186             ASSERT(pos >= amount);
187             pos -= amount;
188         }
189
190         int read()
191         {
192             ASSERT(pos < length);
193             if (pos < length)
194                 return input[pos];
195             return -1;
196         }
197
198         int readChecked(int position)
199         {
200             ASSERT(position < 0);
201             ASSERT((unsigned)-position <= pos);
202             unsigned p = pos + position;
203             ASSERT(p < length);
204             return input[p];
205         }
206
207         int reread(unsigned from)
208         {
209             ASSERT(from < length);
210             return input[from];
211         }
212
213         int prev()
214         {
215             ASSERT(!(pos > length));
216             if (pos && length)
217                 return input[pos - 1];
218             return -1;
219         }
220
221         unsigned getPos()
222         {
223             return pos;
224         }
225
226         void setPos(unsigned p)
227         {
228             pos = p;
229         }
230
231         bool atStart()
232         {
233             return pos == 0;
234         }
235
236         bool atEnd()
237         {
238             return pos == length;
239         }
240
241         bool checkInput(int count)
242         {
243             if ((pos + count) <= length) {
244                 pos += count;
245                 return true;
246             } else
247                 return false;
248         }
249
250         void uncheckInput(int count)
251         {
252             pos -= count;
253         }
254
255         bool atStart(int position)
256         {
257             return (pos + position) == 0;
258         }
259
260         bool atEnd(int position)
261         {
262             return (pos + position) == length;
263         }
264
265     private:
266         const UChar* input;
267         unsigned pos;
268         unsigned length;
269     };
270
271     bool testCharacterClass(CharacterClass* characterClass, int ch)
272     {
273         if (ch & 0xFF80) {
274             for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
275                 if (ch == characterClass->m_matchesUnicode[i])
276                     return true;
277             for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
278                 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
279                     return true;
280         } else {
281             for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
282                 if (ch == characterClass->m_matches[i])
283                     return true;
284             for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
285                 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
286                     return true;
287         }
288
289         return false;
290     }
291
292     bool checkCharacter(int testChar, int inputPosition)
293     {
294         return testChar == input.readChecked(inputPosition);
295     }
296
297     bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
298     {
299         int ch = input.readChecked(inputPosition);
300         return (loChar == ch) || (hiChar == ch);
301     }
302
303     bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
304     {
305         bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
306         return invert ? !match : match;
307     }
308
309     bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
310     {
311         int matchSize = matchEnd - matchBegin;
312
313         if (!input.checkInput(matchSize))
314             return false;
315
316         if (pattern->m_ignoreCase) {
317             for (int i = 0; i < matchSize; ++i) {
318                 int ch = input.reread(matchBegin + i);
319
320                 int lo = Unicode::toLower(ch);
321                 int hi = Unicode::toUpper(ch);
322
323                 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
324                     input.uncheckInput(matchSize);
325                     return false;
326                 }
327             }
328         } else {
329             for (int i = 0; i < matchSize; ++i) {
330                 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
331                     input.uncheckInput(matchSize);
332                     return false;
333                 }
334             }
335         }
336
337         return true;
338     }
339
340     bool matchAssertionBOL(ByteTerm& term)
341     {
342         return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
343     }
344
345     bool matchAssertionEOL(ByteTerm& term)
346     {
347         if (term.inputPosition)
348             return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
349         else
350             return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
351     }
352
353     bool matchAssertionWordBoundary(ByteTerm& term)
354     {
355         bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
356         bool readIsWordchar;
357         if (term.inputPosition)
358             readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
359         else
360             readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
361
362         bool wordBoundary = prevIsWordchar != readIsWordchar;
363         return term.invert() ? !wordBoundary : wordBoundary;
364     }
365
366     bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
367     {
368         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
369
370         switch (term.atom.quantityType) {
371         case QuantifierFixedCount:
372             break;
373
374         case QuantifierGreedy:
375             if (backTrack->matchAmount) {
376                 --backTrack->matchAmount;
377                 input.uncheckInput(1);
378                 return true;
379             }
380             break;
381
382         case QuantifierNonGreedy:
383             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
384                 ++backTrack->matchAmount;
385                 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
386                     return true;
387             }
388             input.uncheckInput(backTrack->matchAmount);
389             break;
390         }
391
392         return false;
393     }
394
395     bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
396     {
397         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
398
399         switch (term.atom.quantityType) {
400         case QuantifierFixedCount:
401             break;
402
403         case QuantifierGreedy:
404             if (backTrack->matchAmount) {
405                 --backTrack->matchAmount;
406                 input.uncheckInput(1);
407                 return true;
408             }
409             break;
410
411         case QuantifierNonGreedy:
412             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
413                 ++backTrack->matchAmount;
414                 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
415                     return true;
416             }
417             input.uncheckInput(backTrack->matchAmount);
418             break;
419         }
420
421         return false;
422     }
423
424     bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
425     {
426         ASSERT(term.type == ByteTerm::TypeCharacterClass);
427         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
428
429         switch (term.atom.quantityType) {
430         case QuantifierFixedCount: {
431             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
432                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
433                     return false;
434             }
435             return true;
436         }
437
438         case QuantifierGreedy: {
439             unsigned matchAmount = 0;
440             while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
441                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
442                     input.uncheckInput(1);
443                     break;
444                 }
445                 ++matchAmount;
446             }
447             backTrack->matchAmount = matchAmount;
448
449             return true;
450         }
451
452         case QuantifierNonGreedy:
453             backTrack->matchAmount = 0;
454             return true;
455         }
456
457         ASSERT_NOT_REACHED();
458         return false;
459     }
460
461     bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
462     {
463         ASSERT(term.type == ByteTerm::TypeCharacterClass);
464         BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
465
466         switch (term.atom.quantityType) {
467         case QuantifierFixedCount:
468             break;
469
470         case QuantifierGreedy:
471             if (backTrack->matchAmount) {
472                 --backTrack->matchAmount;
473                 input.uncheckInput(1);
474                 return true;
475             }
476             break;
477
478         case QuantifierNonGreedy:
479             if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
480                 ++backTrack->matchAmount;
481                 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
482                     return true;
483             }
484             input.uncheckInput(backTrack->matchAmount);
485             break;
486         }
487
488         return false;
489     }
490
491     bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
492     {
493         ASSERT(term.type == ByteTerm::TypeBackReference);
494         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
495
496         int matchBegin = output[(term.atom.subpatternId << 1)];
497         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
498
499         // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
500         // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
501         // Eg.: /(a\1)/
502         if (matchEnd == -1)
503             return true;
504
505         ASSERT((matchBegin == -1) == (matchEnd == -1));
506         ASSERT(matchBegin <= matchEnd);
507
508         if (matchBegin == matchEnd)
509             return true;
510
511         switch (term.atom.quantityType) {
512         case QuantifierFixedCount: {
513             backTrack->begin = input.getPos();
514             for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
515                 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
516                     input.setPos(backTrack->begin);
517                     return false;
518                 }
519             }
520             return true;
521         }
522
523         case QuantifierGreedy: {
524             unsigned matchAmount = 0;
525             while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
526                 ++matchAmount;
527             backTrack->matchAmount = matchAmount;
528             return true;
529         }
530
531         case QuantifierNonGreedy:
532             backTrack->begin = input.getPos();
533             backTrack->matchAmount = 0;
534             return true;
535         }
536
537         ASSERT_NOT_REACHED();
538         return false;
539     }
540
541     bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
542     {
543         ASSERT(term.type == ByteTerm::TypeBackReference);
544         BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
545
546         int matchBegin = output[(term.atom.subpatternId << 1)];
547         int matchEnd = output[(term.atom.subpatternId << 1) + 1];
548         ASSERT((matchBegin == -1) == (matchEnd == -1));
549         ASSERT(matchBegin <= matchEnd);
550
551         if (matchBegin == matchEnd)
552             return false;
553
554         switch (term.atom.quantityType) {
555         case QuantifierFixedCount:
556             // for quantityCount == 1, could rewind.
557             input.setPos(backTrack->begin);
558             break;
559
560         case QuantifierGreedy:
561             if (backTrack->matchAmount) {
562                 --backTrack->matchAmount;
563                 input.rewind(matchEnd - matchBegin);
564                 return true;
565             }
566             break;
567
568         case QuantifierNonGreedy:
569             if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
570                 ++backTrack->matchAmount;
571                 return true;
572             } else
573                 input.setPos(backTrack->begin);
574             break;
575         }
576
577         return false;
578     }
579
580     void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
581     {
582         if (term.capture()) {
583             unsigned subpatternId = term.atom.subpatternId;
584             output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
585             output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
586         }
587     }
588     void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
589     {
590         unsigned firstSubpatternId = term.atom.subpatternId;
591         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
592         context->restoreOutput(output, firstSubpatternId, count);
593     }
594     void resetAssertionMatches(ByteTerm& term)
595     {
596         unsigned firstSubpatternId = term.atom.subpatternId;
597         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
598         for (unsigned i = 0; i < (count << 1); ++i)
599             output[(firstSubpatternId << 1) + i] = -1;
600     }
601     bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
602     {
603         while (backTrack->matchAmount) {
604             ParenthesesDisjunctionContext* context = backTrack->lastContext;
605
606             if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
607                 return true;
608
609             resetMatches(term, context);
610             popParenthesesDisjunctionContext(backTrack);
611             freeParenthesesDisjunctionContext(context);
612         }
613
614         return false;
615     }
616
617     bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
618     {
619         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
620         ASSERT(term.atom.quantityCount == 1);
621
622         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
623
624         switch (term.atom.quantityType) {
625         case QuantifierGreedy: {
626             // set this speculatively; if we get to the parens end this will be true.
627             backTrack->inParentheses = 1;
628             break;
629         }
630         case QuantifierNonGreedy: {
631             backTrack->inParentheses = 0;
632             context->term += term.atom.parenthesesWidth;
633             return true;
634         }
635         case QuantifierFixedCount:
636             break;
637         }
638
639         if (term.capture()) {
640             unsigned subpatternId = term.atom.subpatternId;
641             output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
642         }
643
644         return true;
645     }
646
647     bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
648     {
649         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
650         ASSERT(term.atom.quantityCount == 1);
651
652         if (term.capture()) {
653             unsigned subpatternId = term.atom.subpatternId;
654             output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
655         }
656         return true;
657     }
658
659     bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
660     {
661         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
662         ASSERT(term.atom.quantityCount == 1);
663
664         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
665
666         if (term.capture()) {
667             unsigned subpatternId = term.atom.subpatternId;
668             output[(subpatternId << 1)] = -1;
669             output[(subpatternId << 1) + 1] = -1;
670         }
671
672         switch (term.atom.quantityType) {
673         case QuantifierGreedy:
674             // if we backtrack to this point, there is another chance - try matching nothing.
675             ASSERT(backTrack->inParentheses);
676             backTrack->inParentheses = 0;
677             context->term += term.atom.parenthesesWidth;
678             return true;
679         case QuantifierNonGreedy:
680             ASSERT(backTrack->inParentheses);
681         case QuantifierFixedCount:
682             break;
683         }
684
685         return false;
686     }
687
688     bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
689     {
690         ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
691         ASSERT(term.atom.quantityCount == 1);
692
693         BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
694
695         switch (term.atom.quantityType) {
696         case QuantifierGreedy:
697             if (!backTrack->inParentheses) {
698                 context->term -= term.atom.parenthesesWidth;
699                 return false;
700             }
701         case QuantifierNonGreedy:
702             if (!backTrack->inParentheses) {
703                 // now try to match the parens; set this speculatively.
704                 backTrack->inParentheses = 1;
705                 if (term.capture()) {
706                     unsigned subpatternId = term.atom.subpatternId;
707                     output[subpatternId << 1] = input.getPos() + term.inputPosition;
708                 }
709                 context->term -= term.atom.parenthesesWidth;
710                 return true;
711             }
712         case QuantifierFixedCount:
713             break;
714         }
715
716         return false;
717     }
718
719     bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
720     {
721         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
722         ASSERT(term.atom.quantityCount == 1);
723
724         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
725
726         backTrack->begin = input.getPos();
727         return true;
728     }
729
730     bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
731     {
732         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
733         ASSERT(term.atom.quantityCount == 1);
734
735         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
736
737         input.setPos(backTrack->begin);
738
739         // We've reached the end of the parens; if they are inverted, this is failure.
740         if (term.invert()) {
741             context->term -= term.atom.parenthesesWidth;
742             return false;
743         }
744
745         return true;
746     }
747
748     bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
749     {
750         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
751         ASSERT(term.atom.quantityCount == 1);
752
753         // We've failed to match parens; if they are inverted, this is win!
754         if (term.invert()) {
755             context->term += term.atom.parenthesesWidth;
756             return true;
757         }
758
759         return false;
760     }
761
762     bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
763     {
764         ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
765         ASSERT(term.atom.quantityCount == 1);
766
767         BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
768
769         input.setPos(backTrack->begin);
770
771         context->term -= term.atom.parenthesesWidth;
772         return false;
773     }
774
775     bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
776     {
777         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
778
779         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
780
781         unsigned subpatternId = term.atom.subpatternId;
782         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
783
784         backTrack->prevBegin = output[(subpatternId << 1)];
785         backTrack->prevEnd = output[(subpatternId << 1) + 1];
786
787         backTrack->matchAmount = 0;
788         backTrack->lastContext = 0;
789
790         switch (term.atom.quantityType) {
791         case QuantifierFixedCount: {
792             // While we haven't yet reached our fixed limit,
793             while (backTrack->matchAmount < term.atom.quantityCount) {
794                 // Try to do a match, and it it succeeds, add it to the list.
795                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
796                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
797                     appendParenthesesDisjunctionContext(backTrack, context);
798                 else {
799                     // The match failed; try to find an alternate point to carry on from.
800                     resetMatches(term, context);
801                     freeParenthesesDisjunctionContext(context);
802                     if (!parenthesesDoBacktrack(term, backTrack))
803                         return false;
804                 }
805             }
806
807             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
808             ParenthesesDisjunctionContext* context = backTrack->lastContext;
809             recordParenthesesMatch(term, context);
810             return true;
811         }
812
813         case QuantifierGreedy: {
814             while (backTrack->matchAmount < term.atom.quantityCount) {
815                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
816                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
817                     appendParenthesesDisjunctionContext(backTrack, context);
818                 else {
819                     resetMatches(term, context);
820                     freeParenthesesDisjunctionContext(context);
821                     break;
822                 }
823             }
824
825             if (backTrack->matchAmount) {
826                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
827                 recordParenthesesMatch(term, context);
828             }
829             return true;
830         }
831
832         case QuantifierNonGreedy:
833             return true;
834         }
835
836         ASSERT_NOT_REACHED();
837         return false;
838     }
839
840     // Rules for backtracking differ depending on whether this is greedy or non-greedy.
841     //
842     // Greedy matches never should try just adding more - you should already have done
843     // the 'more' cases.  Always backtrack, at least a leetle bit.  However cases where
844     // you backtrack an item off the list needs checking, since we'll never have matched
845     // the one less case.  Tracking forwards, still add as much as possible.
846     //
847     // Non-greedy, we've already done the one less case, so don't match on popping.
848     // We haven't done the one more case, so always try to add that.
849     //
850     bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
851     {
852         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
853
854         BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
855
856         if (term.capture()) {
857             unsigned subpatternId = term.atom.subpatternId;
858             output[(subpatternId << 1)] = backTrack->prevBegin;
859             output[(subpatternId << 1) + 1] = backTrack->prevEnd;
860         }
861
862         ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
863
864         switch (term.atom.quantityType) {
865         case QuantifierFixedCount: {
866             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
867
868             ParenthesesDisjunctionContext* context = 0;
869
870             if (!parenthesesDoBacktrack(term, backTrack))
871                 return false;
872
873             // While we haven't yet reached our fixed limit,
874             while (backTrack->matchAmount < term.atom.quantityCount) {
875                 // Try to do a match, and it it succeeds, add it to the list.
876                 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
877                 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
878                     appendParenthesesDisjunctionContext(backTrack, context);
879                 else {
880                     // The match failed; try to find an alternate point to carry on from.
881                     resetMatches(term, context);
882                     freeParenthesesDisjunctionContext(context);
883                     if (!parenthesesDoBacktrack(term, backTrack))
884                         return false;
885                 }
886             }
887
888             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
889             context = backTrack->lastContext;
890             recordParenthesesMatch(term, context);
891             return true;
892         }
893
894         case QuantifierGreedy: {
895             if (!backTrack->matchAmount)
896                 return false;
897
898             ParenthesesDisjunctionContext* context = backTrack->lastContext;
899             if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
900                 while (backTrack->matchAmount < term.atom.quantityCount) {
901                     ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
902                     if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
903                         appendParenthesesDisjunctionContext(backTrack, context);
904                     else {
905                         resetMatches(term, context);
906                         freeParenthesesDisjunctionContext(context);
907                         break;
908                     }
909                 }
910             } else {
911                 resetMatches(term, context);
912                 popParenthesesDisjunctionContext(backTrack);
913                 freeParenthesesDisjunctionContext(context);
914             }
915
916             if (backTrack->matchAmount) {
917                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
918                 recordParenthesesMatch(term, context);
919             }
920             return true;
921         }
922
923         case QuantifierNonGreedy: {
924             // If we've not reached the limit, try to add one more match.
925             if (backTrack->matchAmount < term.atom.quantityCount) {
926                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
927                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
928                     appendParenthesesDisjunctionContext(backTrack, context);
929                     recordParenthesesMatch(term, context);
930                     return true;
931                 } else {
932                     resetMatches(term, context);
933                     freeParenthesesDisjunctionContext(context);
934                 }
935             }
936
937             // Nope - okay backtrack looking for an alternative.
938             while (backTrack->matchAmount) {
939                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
940                 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
941                     // successful backtrack! we're back in the game!
942                     if (backTrack->matchAmount) {
943                         context = backTrack->lastContext;
944                         recordParenthesesMatch(term, context);
945                     }
946                     return true;
947                 }
948
949                 // pop a match off the stack
950                 resetMatches(term, context);
951                 popParenthesesDisjunctionContext(backTrack);
952                 freeParenthesesDisjunctionContext(context);
953             }
954
955             return false;
956         }
957         }
958
959         ASSERT_NOT_REACHED();
960         return false;
961     }
962
963 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
964 #define BACKTRACK() { --context->term; goto backtrack; }
965 #define currentTerm() (disjunction->terms[context->term])
966     bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
967     {
968         if (btrack)
969             BACKTRACK();
970
971         context->matchBegin = input.getPos();
972         context->term = 0;
973
974     matchAgain:
975         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
976
977         switch (currentTerm().type) {
978         case ByteTerm::TypeSubpatternBegin:
979             MATCH_NEXT();
980         case ByteTerm::TypeSubpatternEnd:
981             context->matchEnd = input.getPos();
982             return true;
983
984         case ByteTerm::TypeBodyAlternativeBegin:
985             MATCH_NEXT();
986         case ByteTerm::TypeBodyAlternativeDisjunction:
987         case ByteTerm::TypeBodyAlternativeEnd:
988             context->matchEnd = input.getPos();
989             return true;
990
991         case ByteTerm::TypeAlternativeBegin:
992             MATCH_NEXT();
993         case ByteTerm::TypeAlternativeDisjunction:
994         case ByteTerm::TypeAlternativeEnd: {
995             int offset = currentTerm().alternative.end;
996             BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
997             backTrack->offset = offset;
998             context->term += offset;
999             MATCH_NEXT();
1000         }
1001
1002         case ByteTerm::TypeAssertionBOL:
1003             if (matchAssertionBOL(currentTerm()))
1004                 MATCH_NEXT();
1005             BACKTRACK();
1006         case ByteTerm::TypeAssertionEOL:
1007             if (matchAssertionEOL(currentTerm()))
1008                 MATCH_NEXT();
1009             BACKTRACK();
1010         case ByteTerm::TypeAssertionWordBoundary:
1011             if (matchAssertionWordBoundary(currentTerm()))
1012                 MATCH_NEXT();
1013             BACKTRACK();
1014
1015         case ByteTerm::TypePatternCharacterOnce:
1016         case ByteTerm::TypePatternCharacterFixed: {
1017             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1018                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1019                     BACKTRACK();
1020             }
1021             MATCH_NEXT();
1022         }
1023         case ByteTerm::TypePatternCharacterGreedy: {
1024             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1025             unsigned matchAmount = 0;
1026             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1027                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1028                     input.uncheckInput(1);
1029                     break;
1030                 }
1031                 ++matchAmount;
1032             }
1033             backTrack->matchAmount = matchAmount;
1034
1035             MATCH_NEXT();
1036         }
1037         case ByteTerm::TypePatternCharacterNonGreedy: {
1038             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1039             backTrack->matchAmount = 0;
1040             MATCH_NEXT();
1041         }
1042
1043         case ByteTerm::TypePatternCasedCharacterOnce:
1044         case ByteTerm::TypePatternCasedCharacterFixed: {
1045             for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1046                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1047                     BACKTRACK();
1048             }
1049             MATCH_NEXT();
1050         }
1051         case ByteTerm::TypePatternCasedCharacterGreedy: {
1052             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1053             unsigned matchAmount = 0;
1054             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1055                 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1056                     input.uncheckInput(1);
1057                     break;
1058                 }
1059                 ++matchAmount;
1060             }
1061             backTrack->matchAmount = matchAmount;
1062
1063             MATCH_NEXT();
1064         }
1065         case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1066             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1067             backTrack->matchAmount = 0;
1068             MATCH_NEXT();
1069         }
1070
1071         case ByteTerm::TypeCharacterClass:
1072             if (matchCharacterClass(currentTerm(), context))
1073                 MATCH_NEXT();
1074             BACKTRACK();
1075         case ByteTerm::TypeBackReference:
1076             if (matchBackReference(currentTerm(), context))
1077                 MATCH_NEXT();
1078             BACKTRACK();
1079         case ByteTerm::TypeParenthesesSubpattern:
1080             if (matchParentheses(currentTerm(), context))
1081                 MATCH_NEXT();
1082             BACKTRACK();
1083         case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1084             if (matchParenthesesOnceBegin(currentTerm(), context))
1085                 MATCH_NEXT();
1086             BACKTRACK();
1087         case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1088             if (matchParenthesesOnceEnd(currentTerm(), context))
1089                 MATCH_NEXT();
1090             BACKTRACK();
1091         case ByteTerm::TypeParentheticalAssertionBegin:
1092             if (matchParentheticalAssertionBegin(currentTerm(), context))
1093                 MATCH_NEXT();
1094             BACKTRACK();
1095         case ByteTerm::TypeParentheticalAssertionEnd:
1096             if (matchParentheticalAssertionEnd(currentTerm(), context))
1097                 MATCH_NEXT();
1098             BACKTRACK();
1099
1100         case ByteTerm::TypeCheckInput:
1101             if (input.checkInput(currentTerm().checkInputCount))
1102                 MATCH_NEXT();
1103             BACKTRACK();
1104         }
1105
1106         // We should never fall-through to here.
1107         ASSERT_NOT_REACHED();
1108
1109     backtrack:
1110         ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1111
1112         switch (currentTerm().type) {
1113         case ByteTerm::TypeSubpatternBegin:
1114             return false;
1115         case ByteTerm::TypeSubpatternEnd:
1116             ASSERT_NOT_REACHED();
1117
1118         case ByteTerm::TypeBodyAlternativeBegin:
1119         case ByteTerm::TypeBodyAlternativeDisjunction: {
1120             int offset = currentTerm().alternative.next;
1121             context->term += offset;
1122             if (offset > 0)
1123                 MATCH_NEXT();
1124
1125             if (input.atEnd())
1126                 return false;
1127
1128             input.next();
1129             context->matchBegin = input.getPos();
1130
1131             if (currentTerm().alternative.onceThrough)
1132                 context->term += currentTerm().alternative.next;
1133
1134             MATCH_NEXT();
1135         }
1136         case ByteTerm::TypeBodyAlternativeEnd:
1137             ASSERT_NOT_REACHED();
1138
1139             case ByteTerm::TypeAlternativeBegin:
1140             case ByteTerm::TypeAlternativeDisjunction: {
1141                 int offset = currentTerm().alternative.next;
1142                 context->term += offset;
1143                 if (offset > 0)
1144                     MATCH_NEXT();
1145                 BACKTRACK();
1146             }
1147             case ByteTerm::TypeAlternativeEnd: {
1148                 // We should never backtrack back into an alternative of the main body of the regex.
1149                 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1150                 unsigned offset = backTrack->offset;
1151                 context->term -= offset;
1152                 BACKTRACK();
1153             }
1154
1155             case ByteTerm::TypeAssertionBOL:
1156             case ByteTerm::TypeAssertionEOL:
1157             case ByteTerm::TypeAssertionWordBoundary:
1158                 BACKTRACK();
1159
1160             case ByteTerm::TypePatternCharacterOnce:
1161             case ByteTerm::TypePatternCharacterFixed:
1162             case ByteTerm::TypePatternCharacterGreedy:
1163             case ByteTerm::TypePatternCharacterNonGreedy:
1164                 if (backtrackPatternCharacter(currentTerm(), context))
1165                     MATCH_NEXT();
1166                 BACKTRACK();
1167             case ByteTerm::TypePatternCasedCharacterOnce:
1168             case ByteTerm::TypePatternCasedCharacterFixed:
1169             case ByteTerm::TypePatternCasedCharacterGreedy:
1170             case ByteTerm::TypePatternCasedCharacterNonGreedy:
1171                 if (backtrackPatternCasedCharacter(currentTerm(), context))
1172                     MATCH_NEXT();
1173                 BACKTRACK();
1174             case ByteTerm::TypeCharacterClass:
1175                 if (backtrackCharacterClass(currentTerm(), context))
1176                     MATCH_NEXT();
1177                 BACKTRACK();
1178             case ByteTerm::TypeBackReference:
1179                 if (backtrackBackReference(currentTerm(), context))
1180                     MATCH_NEXT();
1181                 BACKTRACK();
1182             case ByteTerm::TypeParenthesesSubpattern:
1183                 if (backtrackParentheses(currentTerm(), context))
1184                     MATCH_NEXT();
1185                 BACKTRACK();
1186             case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1187                 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1188                     MATCH_NEXT();
1189                 BACKTRACK();
1190             case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1191                 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1192                     MATCH_NEXT();
1193                 BACKTRACK();
1194             case ByteTerm::TypeParentheticalAssertionBegin:
1195                 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1196                     MATCH_NEXT();
1197                 BACKTRACK();
1198             case ByteTerm::TypeParentheticalAssertionEnd:
1199                 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1200                     MATCH_NEXT();
1201                 BACKTRACK();
1202
1203             case ByteTerm::TypeCheckInput:
1204                 input.uncheckInput(currentTerm().checkInputCount);
1205                 BACKTRACK();
1206         }
1207
1208         ASSERT_NOT_REACHED();
1209         return false;
1210     }
1211
1212     bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1213     {
1214         if (matchDisjunction(disjunction, context, btrack)) {
1215             while (context->matchBegin == context->matchEnd) {
1216                 if (!matchDisjunction(disjunction, context, true))
1217                     return false;
1218             }
1219             return true;
1220         }
1221
1222         return false;
1223     }
1224
1225     int interpret()
1226     {
1227         allocatorPool = pattern->m_allocator->startAllocator();
1228         if (!allocatorPool)
1229             CRASH();
1230
1231         for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1232             output[i] = -1;
1233
1234         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1235
1236         if (matchDisjunction(pattern->m_body.get(), context)) {
1237             output[0] = context->matchBegin;
1238             output[1] = context->matchEnd;
1239         }
1240
1241         freeDisjunctionContext(context);
1242
1243         pattern->m_allocator->stopAllocator();
1244
1245         return output[0];
1246     }
1247
1248     Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1249         : pattern(pattern)
1250         , output(output)
1251         , input(inputChar, start, length)
1252         , allocatorPool(0)
1253     {
1254     }
1255
1256 private:
1257     BytecodePattern *pattern;
1258     int* output;
1259     InputStream input;
1260     BumpPointerPool* allocatorPool;
1261 };
1262
1263
1264
1265 class ByteCompiler {
1266     struct ParenthesesStackEntry {
1267         unsigned beginTerm;
1268         unsigned savedAlternativeIndex;
1269         ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1270             : beginTerm(beginTerm)
1271             , savedAlternativeIndex(savedAlternativeIndex)
1272         {
1273         }
1274     };
1275
1276 public:
1277     ByteCompiler(RegexPattern& pattern)
1278         : m_pattern(pattern)
1279     {
1280         m_currentAlternativeIndex = 0;
1281     }
1282
1283     PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1284     {
1285         regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1286         emitDisjunction(m_pattern.m_body);
1287         regexEnd();
1288
1289         return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1290     }
1291
1292     void checkInput(unsigned count)
1293     {
1294         m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1295     }
1296
1297     void assertionBOL(int inputPosition)
1298     {
1299         m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1300     }
1301
1302     void assertionEOL(int inputPosition)
1303     {
1304         m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1305     }
1306
1307     void assertionWordBoundary(bool invert, int inputPosition)
1308     {
1309         m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1310     }
1311
1312     void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1313     {
1314         if (m_pattern.m_ignoreCase) {
1315             UChar lo = Unicode::toLower(ch);
1316             UChar hi = Unicode::toUpper(ch);
1317
1318             if (lo != hi) {
1319                 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1320                 return;
1321             }
1322         }
1323
1324         m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1325     }
1326
1327     void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1328     {
1329         m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1330
1331         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1332         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1333         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1334     }
1335
1336     void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1337     {
1338         ASSERT(subpatternId);
1339
1340         m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1341
1342         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1343         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1344         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1345     }
1346
1347     void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1348     {
1349         int beginTerm = m_bodyDisjunction->terms.size();
1350
1351         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1352         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1353         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1354         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1355
1356         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1357         m_currentAlternativeIndex = beginTerm + 1;
1358     }
1359
1360     void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1361     {
1362         int beginTerm = m_bodyDisjunction->terms.size();
1363
1364         m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1365         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1366         m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1367         m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1368
1369         m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1370         m_currentAlternativeIndex = beginTerm + 1;
1371     }
1372
1373     unsigned popParenthesesStack()
1374     {
1375         ASSERT(m_parenthesesStack.size());
1376         int stackEnd = m_parenthesesStack.size() - 1;
1377         unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1378         m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1379         m_parenthesesStack.shrink(stackEnd);
1380
1381         ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1382         ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1383
1384         return beginTerm;
1385     }
1386
1387 #ifndef NDEBUG
1388     void dumpDisjunction(ByteDisjunction* disjunction)
1389     {
1390         printf("ByteDisjunction(%p):\n\t", disjunction);
1391         for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1392             printf("{ %d } ", disjunction->terms[i].type);
1393         printf("\n");
1394     }
1395 #endif
1396
1397     void closeAlternative(int beginTerm)
1398     {
1399         int origBeginTerm = beginTerm;
1400         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1401         int endIndex = m_bodyDisjunction->terms.size();
1402
1403         unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1404
1405         if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1406             m_bodyDisjunction->terms.remove(beginTerm);
1407         else {
1408             while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1409                 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1410                 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1411                 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1412                 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1413             }
1414
1415             m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1416
1417             m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1418             m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1419         }
1420     }
1421
1422     void closeBodyAlternative()
1423     {
1424         int beginTerm = 0;
1425         int origBeginTerm = 0;
1426         ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1427         int endIndex = m_bodyDisjunction->terms.size();
1428
1429         unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1430
1431         while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1432             beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1433             ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1434             m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1435             m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1436         }
1437
1438         m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1439
1440         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1441         m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1442     }
1443
1444     void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1445     {
1446         unsigned beginTerm = popParenthesesStack();
1447         closeAlternative(beginTerm + 1);
1448         unsigned endTerm = m_bodyDisjunction->terms.size();
1449
1450         bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1451         bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
1452         unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1453
1454         m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1455         m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1456         m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1457         m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1458
1459         if (doInline) {
1460             m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1461             m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1462             m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1463             m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1464         } else {
1465             ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1466             ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1467
1468             bool invertOrCapture = parenthesesBegin.invertOrCapture;
1469             unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1470
1471             unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1472             ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1473
1474             parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1475             for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1476                 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1477             parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1478
1479             m_bodyDisjunction->terms.shrink(beginTerm);
1480
1481             m_allParenthesesInfo.append(parenthesesDisjunction);
1482             m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1483
1484             m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1485             m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1486             m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1487         }
1488     }
1489
1490     void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1491     {
1492         m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1493         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1494         m_bodyDisjunction->terms[0].frameLocation = 0;
1495         m_currentAlternativeIndex = 0;
1496     }
1497
1498     void regexEnd()
1499     {
1500         closeBodyAlternative();
1501     }
1502
1503     void alternativeBodyDisjunction(bool onceThrough)
1504     {
1505         int newAlternativeIndex = m_bodyDisjunction->terms.size();
1506         m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1507         m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1508
1509         m_currentAlternativeIndex = newAlternativeIndex;
1510     }
1511
1512     void alternativeDisjunction()
1513     {
1514         int newAlternativeIndex = m_bodyDisjunction->terms.size();
1515         m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1516         m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1517
1518         m_currentAlternativeIndex = newAlternativeIndex;
1519     }
1520
1521     void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
1522     {
1523         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1524             unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1525
1526             PatternAlternative* alternative = disjunction->m_alternatives[alt];
1527
1528             if (alt) {
1529                 if (disjunction == m_pattern.m_body)
1530                     alternativeBodyDisjunction(alternative->onceThrough());
1531                 else
1532                     alternativeDisjunction();
1533             }
1534
1535             unsigned minimumSize = alternative->m_minimumSize;
1536             int countToCheck;
1537
1538             if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
1539                 countToCheck = 0;
1540             else
1541                 countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1542
1543             ASSERT(countToCheck >= 0);
1544             if (countToCheck) {
1545                 checkInput(countToCheck);
1546                 currentCountAlreadyChecked += countToCheck;
1547             }
1548
1549             for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1550                 PatternTerm& term = alternative->m_terms[i];
1551
1552                 switch (term.type) {
1553                 case PatternTerm::TypeAssertionBOL:
1554                     assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1555                     break;
1556
1557                 case PatternTerm::TypeAssertionEOL:
1558                     assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1559                     break;
1560
1561                 case PatternTerm::TypeAssertionWordBoundary:
1562                     assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1563                     break;
1564
1565                 case PatternTerm::TypePatternCharacter:
1566                     atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1567                     break;
1568
1569                 case PatternTerm::TypeCharacterClass:
1570                     atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1571                     break;
1572
1573                 case PatternTerm::TypeBackReference:
1574                     atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1575                         break;
1576
1577                 case PatternTerm::TypeForwardReference:
1578                     break;
1579
1580                 case PatternTerm::TypeParenthesesSubpattern: {
1581                     unsigned disjunctionAlreadyCheckedCount = 0;
1582                     if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1583                         if (term.quantityType == QuantifierFixedCount) {
1584                             disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1585                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1586                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1587                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1588                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1589                         } else {
1590                             unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1591                             atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1592                             emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1593                             atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1594                         }
1595                     } else {
1596                         unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1597                         atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1598                         emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1599                         atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1600                     }
1601                     break;
1602                 }
1603
1604                 case PatternTerm::TypeParentheticalAssertion: {
1605                     unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1606
1607                     ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition);
1608                     int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
1609
1610                     atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1611                     emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
1612                     atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1613                     break;
1614                 }
1615                 }
1616             }
1617         }
1618     }
1619
1620 private:
1621     RegexPattern& m_pattern;
1622     OwnPtr<ByteDisjunction> m_bodyDisjunction;
1623     unsigned m_currentAlternativeIndex;
1624     Vector<ParenthesesStackEntry> m_parenthesesStack;
1625     Vector<ByteDisjunction*> m_allParenthesesInfo;
1626 };
1627
1628
1629 PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline)
1630 {
1631     RegexPattern pattern(ignoreCase, multiline);
1632
1633     if ((error = compileRegex(patternString, pattern)))
1634         return PassOwnPtr<BytecodePattern>();
1635
1636     numSubpatterns = pattern.m_numSubpatterns;
1637
1638     return ByteCompiler(pattern).compile(allocator);
1639 }
1640
1641 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1642 {
1643     return Interpreter(regex, output, input, start, length).interpret();
1644 }
1645
1646
1647 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1648 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1649 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1650 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1651 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1652 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1653 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1654
1655
1656 } }
1657
1658 #endif