2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "RegexInterpreter.h"
29 #include "RegexCompiler.h"
30 #include "RegexPattern.h"
31 #include <wtf/BumpPointerAllocator.h>
41 namespace JSC { namespace Yarr {
45 struct ParenthesesDisjunctionContext;
47 struct BackTrackInfoPatternCharacter {
48 uintptr_t matchAmount;
50 struct BackTrackInfoCharacterClass {
51 uintptr_t matchAmount;
53 struct BackTrackInfoBackReference {
54 uintptr_t begin; // Not really needed for greedy quantifiers.
55 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
57 struct BackTrackInfoAlternative {
60 struct BackTrackInfoParentheticalAssertion {
63 struct BackTrackInfoParenthesesOnce {
64 uintptr_t inParentheses;
66 struct BackTrackInfoParentheses {
67 uintptr_t matchAmount;
68 ParenthesesDisjunctionContext* lastContext;
73 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
75 context->next = backTrack->lastContext;
76 backTrack->lastContext = context;
77 ++backTrack->matchAmount;
80 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
82 ASSERT(backTrack->matchAmount);
83 ASSERT(backTrack->lastContext);
84 backTrack->lastContext = backTrack->lastContext->next;
85 --backTrack->matchAmount;
88 struct DisjunctionContext
95 void* operator new(size_t, void* where)
106 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
108 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
109 allocatorPool = allocatorPool->ensureCapacity(size);
112 return new(allocatorPool->alloc(size)) DisjunctionContext();
115 void freeDisjunctionContext(DisjunctionContext* context)
117 allocatorPool = allocatorPool->dealloc(context);
120 struct ParenthesesDisjunctionContext
122 ParenthesesDisjunctionContext(int* output, ByteTerm& term)
125 unsigned firstSubpatternId = term.atom.subpatternId;
126 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
128 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
129 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
130 output[(firstSubpatternId << 1) + i] = -1;
133 new(getDisjunctionContext(term)) DisjunctionContext();
136 void* operator new(size_t, void* where)
141 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
143 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
144 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
147 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
149 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
152 ParenthesesDisjunctionContext* next;
153 int subpatternBackup[1];
156 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
158 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
159 allocatorPool = allocatorPool->ensureCapacity(size);
162 return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
165 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
167 allocatorPool = allocatorPool->dealloc(context);
172 InputStream(const UChar* input, unsigned start, unsigned length)
184 void rewind(unsigned amount)
186 ASSERT(pos >= amount);
192 ASSERT(pos < length);
198 int readChecked(int position)
200 ASSERT(position < 0);
201 ASSERT((unsigned)-position <= pos);
202 unsigned p = pos + position;
207 int reread(unsigned from)
209 ASSERT(from < length);
215 ASSERT(!(pos > length));
217 return input[pos - 1];
226 void setPos(unsigned p)
238 return pos == length;
241 bool checkInput(int count)
243 if ((pos + count) <= length) {
250 void uncheckInput(int count)
255 bool atStart(int position)
257 return (pos + position) == 0;
260 bool atEnd(int position)
262 return (pos + position) == length;
271 bool testCharacterClass(CharacterClass* characterClass, int ch)
274 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
275 if (ch == characterClass->m_matchesUnicode[i])
277 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
278 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
281 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
282 if (ch == characterClass->m_matches[i])
284 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
285 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
292 bool checkCharacter(int testChar, int inputPosition)
294 return testChar == input.readChecked(inputPosition);
297 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
299 int ch = input.readChecked(inputPosition);
300 return (loChar == ch) || (hiChar == ch);
303 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
305 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
306 return invert ? !match : match;
309 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
311 int matchSize = matchEnd - matchBegin;
313 if (!input.checkInput(matchSize))
316 if (pattern->m_ignoreCase) {
317 for (int i = 0; i < matchSize; ++i) {
318 int ch = input.reread(matchBegin + i);
320 int lo = Unicode::toLower(ch);
321 int hi = Unicode::toUpper(ch);
323 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
324 input.uncheckInput(matchSize);
329 for (int i = 0; i < matchSize; ++i) {
330 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
331 input.uncheckInput(matchSize);
340 bool matchAssertionBOL(ByteTerm& term)
342 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
345 bool matchAssertionEOL(ByteTerm& term)
347 if (term.inputPosition)
348 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
350 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
353 bool matchAssertionWordBoundary(ByteTerm& term)
355 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
357 if (term.inputPosition)
358 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
360 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
362 bool wordBoundary = prevIsWordchar != readIsWordchar;
363 return term.invert() ? !wordBoundary : wordBoundary;
366 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
368 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
370 switch (term.atom.quantityType) {
371 case QuantifierFixedCount:
374 case QuantifierGreedy:
375 if (backTrack->matchAmount) {
376 --backTrack->matchAmount;
377 input.uncheckInput(1);
382 case QuantifierNonGreedy:
383 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
384 ++backTrack->matchAmount;
385 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
388 input.uncheckInput(backTrack->matchAmount);
395 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
397 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
399 switch (term.atom.quantityType) {
400 case QuantifierFixedCount:
403 case QuantifierGreedy:
404 if (backTrack->matchAmount) {
405 --backTrack->matchAmount;
406 input.uncheckInput(1);
411 case QuantifierNonGreedy:
412 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
413 ++backTrack->matchAmount;
414 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
417 input.uncheckInput(backTrack->matchAmount);
424 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
426 ASSERT(term.type == ByteTerm::TypeCharacterClass);
427 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
429 switch (term.atom.quantityType) {
430 case QuantifierFixedCount: {
431 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
432 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
438 case QuantifierGreedy: {
439 unsigned matchAmount = 0;
440 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
441 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
442 input.uncheckInput(1);
447 backTrack->matchAmount = matchAmount;
452 case QuantifierNonGreedy:
453 backTrack->matchAmount = 0;
457 ASSERT_NOT_REACHED();
461 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
463 ASSERT(term.type == ByteTerm::TypeCharacterClass);
464 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
466 switch (term.atom.quantityType) {
467 case QuantifierFixedCount:
470 case QuantifierGreedy:
471 if (backTrack->matchAmount) {
472 --backTrack->matchAmount;
473 input.uncheckInput(1);
478 case QuantifierNonGreedy:
479 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
480 ++backTrack->matchAmount;
481 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
484 input.uncheckInput(backTrack->matchAmount);
491 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
493 ASSERT(term.type == ByteTerm::TypeBackReference);
494 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
496 int matchBegin = output[(term.atom.subpatternId << 1)];
497 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
499 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
500 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
505 ASSERT((matchBegin == -1) == (matchEnd == -1));
506 ASSERT(matchBegin <= matchEnd);
508 if (matchBegin == matchEnd)
511 switch (term.atom.quantityType) {
512 case QuantifierFixedCount: {
513 backTrack->begin = input.getPos();
514 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
515 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
516 input.setPos(backTrack->begin);
523 case QuantifierGreedy: {
524 unsigned matchAmount = 0;
525 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
527 backTrack->matchAmount = matchAmount;
531 case QuantifierNonGreedy:
532 backTrack->begin = input.getPos();
533 backTrack->matchAmount = 0;
537 ASSERT_NOT_REACHED();
541 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
543 ASSERT(term.type == ByteTerm::TypeBackReference);
544 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
546 int matchBegin = output[(term.atom.subpatternId << 1)];
547 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
548 ASSERT((matchBegin == -1) == (matchEnd == -1));
549 ASSERT(matchBegin <= matchEnd);
551 if (matchBegin == matchEnd)
554 switch (term.atom.quantityType) {
555 case QuantifierFixedCount:
556 // for quantityCount == 1, could rewind.
557 input.setPos(backTrack->begin);
560 case QuantifierGreedy:
561 if (backTrack->matchAmount) {
562 --backTrack->matchAmount;
563 input.rewind(matchEnd - matchBegin);
568 case QuantifierNonGreedy:
569 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
570 ++backTrack->matchAmount;
573 input.setPos(backTrack->begin);
580 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
582 if (term.capture()) {
583 unsigned subpatternId = term.atom.subpatternId;
584 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
585 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
588 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
590 unsigned firstSubpatternId = term.atom.subpatternId;
591 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
592 context->restoreOutput(output, firstSubpatternId, count);
594 void resetAssertionMatches(ByteTerm& term)
596 unsigned firstSubpatternId = term.atom.subpatternId;
597 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
598 for (unsigned i = 0; i < (count << 1); ++i)
599 output[(firstSubpatternId << 1) + i] = -1;
601 bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
603 while (backTrack->matchAmount) {
604 ParenthesesDisjunctionContext* context = backTrack->lastContext;
606 if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
609 resetMatches(term, context);
610 popParenthesesDisjunctionContext(backTrack);
611 freeParenthesesDisjunctionContext(context);
617 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
619 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
620 ASSERT(term.atom.quantityCount == 1);
622 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
624 switch (term.atom.quantityType) {
625 case QuantifierGreedy: {
626 // set this speculatively; if we get to the parens end this will be true.
627 backTrack->inParentheses = 1;
630 case QuantifierNonGreedy: {
631 backTrack->inParentheses = 0;
632 context->term += term.atom.parenthesesWidth;
635 case QuantifierFixedCount:
639 if (term.capture()) {
640 unsigned subpatternId = term.atom.subpatternId;
641 output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
647 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
649 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
650 ASSERT(term.atom.quantityCount == 1);
652 if (term.capture()) {
653 unsigned subpatternId = term.atom.subpatternId;
654 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
659 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
661 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
662 ASSERT(term.atom.quantityCount == 1);
664 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
666 if (term.capture()) {
667 unsigned subpatternId = term.atom.subpatternId;
668 output[(subpatternId << 1)] = -1;
669 output[(subpatternId << 1) + 1] = -1;
672 switch (term.atom.quantityType) {
673 case QuantifierGreedy:
674 // if we backtrack to this point, there is another chance - try matching nothing.
675 ASSERT(backTrack->inParentheses);
676 backTrack->inParentheses = 0;
677 context->term += term.atom.parenthesesWidth;
679 case QuantifierNonGreedy:
680 ASSERT(backTrack->inParentheses);
681 case QuantifierFixedCount:
688 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
690 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
691 ASSERT(term.atom.quantityCount == 1);
693 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
695 switch (term.atom.quantityType) {
696 case QuantifierGreedy:
697 if (!backTrack->inParentheses) {
698 context->term -= term.atom.parenthesesWidth;
701 case QuantifierNonGreedy:
702 if (!backTrack->inParentheses) {
703 // now try to match the parens; set this speculatively.
704 backTrack->inParentheses = 1;
705 if (term.capture()) {
706 unsigned subpatternId = term.atom.subpatternId;
707 output[subpatternId << 1] = input.getPos() + term.inputPosition;
709 context->term -= term.atom.parenthesesWidth;
712 case QuantifierFixedCount:
719 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
721 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
722 ASSERT(term.atom.quantityCount == 1);
724 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
726 backTrack->begin = input.getPos();
730 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
732 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
733 ASSERT(term.atom.quantityCount == 1);
735 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
737 input.setPos(backTrack->begin);
739 // We've reached the end of the parens; if they are inverted, this is failure.
741 context->term -= term.atom.parenthesesWidth;
748 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
750 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
751 ASSERT(term.atom.quantityCount == 1);
753 // We've failed to match parens; if they are inverted, this is win!
755 context->term += term.atom.parenthesesWidth;
762 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
764 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
765 ASSERT(term.atom.quantityCount == 1);
767 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
769 input.setPos(backTrack->begin);
771 context->term -= term.atom.parenthesesWidth;
775 bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
777 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
779 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
781 unsigned subpatternId = term.atom.subpatternId;
782 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
784 backTrack->prevBegin = output[(subpatternId << 1)];
785 backTrack->prevEnd = output[(subpatternId << 1) + 1];
787 backTrack->matchAmount = 0;
788 backTrack->lastContext = 0;
790 switch (term.atom.quantityType) {
791 case QuantifierFixedCount: {
792 // While we haven't yet reached our fixed limit,
793 while (backTrack->matchAmount < term.atom.quantityCount) {
794 // Try to do a match, and it it succeeds, add it to the list.
795 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
796 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
797 appendParenthesesDisjunctionContext(backTrack, context);
799 // The match failed; try to find an alternate point to carry on from.
800 resetMatches(term, context);
801 freeParenthesesDisjunctionContext(context);
802 if (!parenthesesDoBacktrack(term, backTrack))
807 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
808 ParenthesesDisjunctionContext* context = backTrack->lastContext;
809 recordParenthesesMatch(term, context);
813 case QuantifierGreedy: {
814 while (backTrack->matchAmount < term.atom.quantityCount) {
815 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
816 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
817 appendParenthesesDisjunctionContext(backTrack, context);
819 resetMatches(term, context);
820 freeParenthesesDisjunctionContext(context);
825 if (backTrack->matchAmount) {
826 ParenthesesDisjunctionContext* context = backTrack->lastContext;
827 recordParenthesesMatch(term, context);
832 case QuantifierNonGreedy:
836 ASSERT_NOT_REACHED();
840 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
842 // Greedy matches never should try just adding more - you should already have done
843 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
844 // you backtrack an item off the list needs checking, since we'll never have matched
845 // the one less case. Tracking forwards, still add as much as possible.
847 // Non-greedy, we've already done the one less case, so don't match on popping.
848 // We haven't done the one more case, so always try to add that.
850 bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
852 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
854 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
856 if (term.capture()) {
857 unsigned subpatternId = term.atom.subpatternId;
858 output[(subpatternId << 1)] = backTrack->prevBegin;
859 output[(subpatternId << 1) + 1] = backTrack->prevEnd;
862 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
864 switch (term.atom.quantityType) {
865 case QuantifierFixedCount: {
866 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
868 ParenthesesDisjunctionContext* context = 0;
870 if (!parenthesesDoBacktrack(term, backTrack))
873 // While we haven't yet reached our fixed limit,
874 while (backTrack->matchAmount < term.atom.quantityCount) {
875 // Try to do a match, and it it succeeds, add it to the list.
876 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
877 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
878 appendParenthesesDisjunctionContext(backTrack, context);
880 // The match failed; try to find an alternate point to carry on from.
881 resetMatches(term, context);
882 freeParenthesesDisjunctionContext(context);
883 if (!parenthesesDoBacktrack(term, backTrack))
888 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
889 context = backTrack->lastContext;
890 recordParenthesesMatch(term, context);
894 case QuantifierGreedy: {
895 if (!backTrack->matchAmount)
898 ParenthesesDisjunctionContext* context = backTrack->lastContext;
899 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
900 while (backTrack->matchAmount < term.atom.quantityCount) {
901 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
902 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
903 appendParenthesesDisjunctionContext(backTrack, context);
905 resetMatches(term, context);
906 freeParenthesesDisjunctionContext(context);
911 resetMatches(term, context);
912 popParenthesesDisjunctionContext(backTrack);
913 freeParenthesesDisjunctionContext(context);
916 if (backTrack->matchAmount) {
917 ParenthesesDisjunctionContext* context = backTrack->lastContext;
918 recordParenthesesMatch(term, context);
923 case QuantifierNonGreedy: {
924 // If we've not reached the limit, try to add one more match.
925 if (backTrack->matchAmount < term.atom.quantityCount) {
926 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
927 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
928 appendParenthesesDisjunctionContext(backTrack, context);
929 recordParenthesesMatch(term, context);
932 resetMatches(term, context);
933 freeParenthesesDisjunctionContext(context);
937 // Nope - okay backtrack looking for an alternative.
938 while (backTrack->matchAmount) {
939 ParenthesesDisjunctionContext* context = backTrack->lastContext;
940 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
941 // successful backtrack! we're back in the game!
942 if (backTrack->matchAmount) {
943 context = backTrack->lastContext;
944 recordParenthesesMatch(term, context);
949 // pop a match off the stack
950 resetMatches(term, context);
951 popParenthesesDisjunctionContext(backTrack);
952 freeParenthesesDisjunctionContext(context);
959 ASSERT_NOT_REACHED();
963 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
964 #define BACKTRACK() { --context->term; goto backtrack; }
965 #define currentTerm() (disjunction->terms[context->term])
966 bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
971 context->matchBegin = input.getPos();
975 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
977 switch (currentTerm().type) {
978 case ByteTerm::TypeSubpatternBegin:
980 case ByteTerm::TypeSubpatternEnd:
981 context->matchEnd = input.getPos();
984 case ByteTerm::TypeBodyAlternativeBegin:
986 case ByteTerm::TypeBodyAlternativeDisjunction:
987 case ByteTerm::TypeBodyAlternativeEnd:
988 context->matchEnd = input.getPos();
991 case ByteTerm::TypeAlternativeBegin:
993 case ByteTerm::TypeAlternativeDisjunction:
994 case ByteTerm::TypeAlternativeEnd: {
995 int offset = currentTerm().alternative.end;
996 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
997 backTrack->offset = offset;
998 context->term += offset;
1002 case ByteTerm::TypeAssertionBOL:
1003 if (matchAssertionBOL(currentTerm()))
1006 case ByteTerm::TypeAssertionEOL:
1007 if (matchAssertionEOL(currentTerm()))
1010 case ByteTerm::TypeAssertionWordBoundary:
1011 if (matchAssertionWordBoundary(currentTerm()))
1015 case ByteTerm::TypePatternCharacterOnce:
1016 case ByteTerm::TypePatternCharacterFixed: {
1017 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1018 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1023 case ByteTerm::TypePatternCharacterGreedy: {
1024 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1025 unsigned matchAmount = 0;
1026 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1027 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1028 input.uncheckInput(1);
1033 backTrack->matchAmount = matchAmount;
1037 case ByteTerm::TypePatternCharacterNonGreedy: {
1038 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1039 backTrack->matchAmount = 0;
1043 case ByteTerm::TypePatternCasedCharacterOnce:
1044 case ByteTerm::TypePatternCasedCharacterFixed: {
1045 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1046 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1051 case ByteTerm::TypePatternCasedCharacterGreedy: {
1052 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1053 unsigned matchAmount = 0;
1054 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1055 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1056 input.uncheckInput(1);
1061 backTrack->matchAmount = matchAmount;
1065 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1066 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1067 backTrack->matchAmount = 0;
1071 case ByteTerm::TypeCharacterClass:
1072 if (matchCharacterClass(currentTerm(), context))
1075 case ByteTerm::TypeBackReference:
1076 if (matchBackReference(currentTerm(), context))
1079 case ByteTerm::TypeParenthesesSubpattern:
1080 if (matchParentheses(currentTerm(), context))
1083 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1084 if (matchParenthesesOnceBegin(currentTerm(), context))
1087 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1088 if (matchParenthesesOnceEnd(currentTerm(), context))
1091 case ByteTerm::TypeParentheticalAssertionBegin:
1092 if (matchParentheticalAssertionBegin(currentTerm(), context))
1095 case ByteTerm::TypeParentheticalAssertionEnd:
1096 if (matchParentheticalAssertionEnd(currentTerm(), context))
1100 case ByteTerm::TypeCheckInput:
1101 if (input.checkInput(currentTerm().checkInputCount))
1106 // We should never fall-through to here.
1107 ASSERT_NOT_REACHED();
1110 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1112 switch (currentTerm().type) {
1113 case ByteTerm::TypeSubpatternBegin:
1115 case ByteTerm::TypeSubpatternEnd:
1116 ASSERT_NOT_REACHED();
1118 case ByteTerm::TypeBodyAlternativeBegin:
1119 case ByteTerm::TypeBodyAlternativeDisjunction: {
1120 int offset = currentTerm().alternative.next;
1121 context->term += offset;
1129 context->matchBegin = input.getPos();
1131 if (currentTerm().alternative.onceThrough)
1132 context->term += currentTerm().alternative.next;
1136 case ByteTerm::TypeBodyAlternativeEnd:
1137 ASSERT_NOT_REACHED();
1139 case ByteTerm::TypeAlternativeBegin:
1140 case ByteTerm::TypeAlternativeDisjunction: {
1141 int offset = currentTerm().alternative.next;
1142 context->term += offset;
1147 case ByteTerm::TypeAlternativeEnd: {
1148 // We should never backtrack back into an alternative of the main body of the regex.
1149 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1150 unsigned offset = backTrack->offset;
1151 context->term -= offset;
1155 case ByteTerm::TypeAssertionBOL:
1156 case ByteTerm::TypeAssertionEOL:
1157 case ByteTerm::TypeAssertionWordBoundary:
1160 case ByteTerm::TypePatternCharacterOnce:
1161 case ByteTerm::TypePatternCharacterFixed:
1162 case ByteTerm::TypePatternCharacterGreedy:
1163 case ByteTerm::TypePatternCharacterNonGreedy:
1164 if (backtrackPatternCharacter(currentTerm(), context))
1167 case ByteTerm::TypePatternCasedCharacterOnce:
1168 case ByteTerm::TypePatternCasedCharacterFixed:
1169 case ByteTerm::TypePatternCasedCharacterGreedy:
1170 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1171 if (backtrackPatternCasedCharacter(currentTerm(), context))
1174 case ByteTerm::TypeCharacterClass:
1175 if (backtrackCharacterClass(currentTerm(), context))
1178 case ByteTerm::TypeBackReference:
1179 if (backtrackBackReference(currentTerm(), context))
1182 case ByteTerm::TypeParenthesesSubpattern:
1183 if (backtrackParentheses(currentTerm(), context))
1186 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1187 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1190 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1191 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1194 case ByteTerm::TypeParentheticalAssertionBegin:
1195 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1198 case ByteTerm::TypeParentheticalAssertionEnd:
1199 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1203 case ByteTerm::TypeCheckInput:
1204 input.uncheckInput(currentTerm().checkInputCount);
1208 ASSERT_NOT_REACHED();
1212 bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1214 if (matchDisjunction(disjunction, context, btrack)) {
1215 while (context->matchBegin == context->matchEnd) {
1216 if (!matchDisjunction(disjunction, context, true))
1227 allocatorPool = pattern->m_allocator->startAllocator();
1231 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1234 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1236 if (matchDisjunction(pattern->m_body.get(), context)) {
1237 output[0] = context->matchBegin;
1238 output[1] = context->matchEnd;
1241 freeDisjunctionContext(context);
1243 pattern->m_allocator->stopAllocator();
1248 Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1251 , input(inputChar, start, length)
1257 BytecodePattern *pattern;
1260 BumpPointerPool* allocatorPool;
1265 class ByteCompiler {
1266 struct ParenthesesStackEntry {
1268 unsigned savedAlternativeIndex;
1269 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1270 : beginTerm(beginTerm)
1271 , savedAlternativeIndex(savedAlternativeIndex)
1277 ByteCompiler(RegexPattern& pattern)
1278 : m_pattern(pattern)
1280 m_currentAlternativeIndex = 0;
1283 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1285 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1286 emitDisjunction(m_pattern.m_body);
1289 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1292 void checkInput(unsigned count)
1294 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1297 void assertionBOL(int inputPosition)
1299 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1302 void assertionEOL(int inputPosition)
1304 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1307 void assertionWordBoundary(bool invert, int inputPosition)
1309 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1312 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1314 if (m_pattern.m_ignoreCase) {
1315 UChar lo = Unicode::toLower(ch);
1316 UChar hi = Unicode::toUpper(ch);
1319 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1324 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1327 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1329 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1331 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1332 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1333 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1336 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1338 ASSERT(subpatternId);
1340 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1342 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
1343 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1344 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1347 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1349 int beginTerm = m_bodyDisjunction->terms.size();
1351 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1352 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1353 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1354 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1356 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1357 m_currentAlternativeIndex = beginTerm + 1;
1360 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1362 int beginTerm = m_bodyDisjunction->terms.size();
1364 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
1365 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1366 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1367 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1369 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1370 m_currentAlternativeIndex = beginTerm + 1;
1373 unsigned popParenthesesStack()
1375 ASSERT(m_parenthesesStack.size());
1376 int stackEnd = m_parenthesesStack.size() - 1;
1377 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1378 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1379 m_parenthesesStack.shrink(stackEnd);
1381 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1382 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1388 void dumpDisjunction(ByteDisjunction* disjunction)
1390 printf("ByteDisjunction(%p):\n\t", disjunction);
1391 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1392 printf("{ %d } ", disjunction->terms[i].type);
1397 void closeAlternative(int beginTerm)
1399 int origBeginTerm = beginTerm;
1400 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1401 int endIndex = m_bodyDisjunction->terms.size();
1403 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1405 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1406 m_bodyDisjunction->terms.remove(beginTerm);
1408 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1409 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1410 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1411 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1412 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1415 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1417 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1418 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1422 void closeBodyAlternative()
1425 int origBeginTerm = 0;
1426 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1427 int endIndex = m_bodyDisjunction->terms.size();
1429 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1431 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1432 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1433 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1434 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1435 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1438 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1440 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1441 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1444 void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1446 unsigned beginTerm = popParenthesesStack();
1447 closeAlternative(beginTerm + 1);
1448 unsigned endTerm = m_bodyDisjunction->terms.size();
1450 bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1451 bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
1452 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1454 m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1455 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1456 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1457 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1460 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1461 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1462 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
1463 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1465 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1466 ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1468 bool invertOrCapture = parenthesesBegin.invertOrCapture;
1469 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1471 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1472 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1474 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1475 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1476 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1477 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1479 m_bodyDisjunction->terms.shrink(beginTerm);
1481 m_allParenthesesInfo.append(parenthesesDisjunction);
1482 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1484 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
1485 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1486 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1490 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1492 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1493 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1494 m_bodyDisjunction->terms[0].frameLocation = 0;
1495 m_currentAlternativeIndex = 0;
1500 closeBodyAlternative();
1503 void alternativeBodyDisjunction(bool onceThrough)
1505 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1506 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1507 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1509 m_currentAlternativeIndex = newAlternativeIndex;
1512 void alternativeDisjunction()
1514 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1515 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1516 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1518 m_currentAlternativeIndex = newAlternativeIndex;
1521 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
1523 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1524 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1526 PatternAlternative* alternative = disjunction->m_alternatives[alt];
1529 if (disjunction == m_pattern.m_body)
1530 alternativeBodyDisjunction(alternative->onceThrough());
1532 alternativeDisjunction();
1535 unsigned minimumSize = alternative->m_minimumSize;
1538 if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
1541 countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1543 ASSERT(countToCheck >= 0);
1545 checkInput(countToCheck);
1546 currentCountAlreadyChecked += countToCheck;
1549 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1550 PatternTerm& term = alternative->m_terms[i];
1552 switch (term.type) {
1553 case PatternTerm::TypeAssertionBOL:
1554 assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1557 case PatternTerm::TypeAssertionEOL:
1558 assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1561 case PatternTerm::TypeAssertionWordBoundary:
1562 assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
1565 case PatternTerm::TypePatternCharacter:
1566 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1569 case PatternTerm::TypeCharacterClass:
1570 atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1573 case PatternTerm::TypeBackReference:
1574 atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1577 case PatternTerm::TypeForwardReference:
1580 case PatternTerm::TypeParenthesesSubpattern: {
1581 unsigned disjunctionAlreadyCheckedCount = 0;
1582 if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
1583 if (term.quantityType == QuantifierFixedCount) {
1584 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1585 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1586 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
1587 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
1588 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1590 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1591 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1592 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1593 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1596 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1597 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1598 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1599 atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1604 case PatternTerm::TypeParentheticalAssertion: {
1605 unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1607 ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition);
1608 int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
1610 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
1611 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
1612 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
1621 RegexPattern& m_pattern;
1622 OwnPtr<ByteDisjunction> m_bodyDisjunction;
1623 unsigned m_currentAlternativeIndex;
1624 Vector<ParenthesesStackEntry> m_parenthesesStack;
1625 Vector<ByteDisjunction*> m_allParenthesesInfo;
1629 PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline)
1631 RegexPattern pattern(ignoreCase, multiline);
1633 if ((error = compileRegex(patternString, pattern)))
1634 return PassOwnPtr<BytecodePattern>();
1636 numSubpatterns = pattern.m_numSubpatterns;
1638 return ByteCompiler(pattern).compile(allocator);
1641 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1643 return Interpreter(regex, output, input, start, length).interpret();
1647 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1648 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1649 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1650 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1651 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1652 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1653 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);