2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "YarrInterpreter.h"
30 #include "SuperSampler.h"
32 #include "YarrCanonicalize.h"
33 #include <wtf/BumpPointerAllocator.h>
34 #include <wtf/DataLog.h>
35 #include <wtf/text/CString.h>
36 #include <wtf/text/WTFString.h>
40 namespace JSC { namespace Yarr {
42 template<typename CharType>
45 struct ParenthesesDisjunctionContext;
47 struct BackTrackInfoParentheses {
48 uintptr_t matchAmount;
49 ParenthesesDisjunctionContext* lastContext;
52 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
54 context->next = backTrack->lastContext;
55 backTrack->lastContext = context;
56 ++backTrack->matchAmount;
59 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
61 RELEASE_ASSERT(backTrack->matchAmount);
62 RELEASE_ASSERT(backTrack->lastContext);
63 backTrack->lastContext = backTrack->lastContext->next;
64 --backTrack->matchAmount;
67 struct DisjunctionContext
74 void* operator new(size_t, void* where)
85 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
87 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
88 allocatorPool = allocatorPool->ensureCapacity(size);
89 RELEASE_ASSERT(allocatorPool);
90 return new (allocatorPool->alloc(size)) DisjunctionContext();
93 void freeDisjunctionContext(DisjunctionContext* context)
95 allocatorPool = allocatorPool->dealloc(context);
98 struct ParenthesesDisjunctionContext
100 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
103 unsigned firstSubpatternId = term.atom.subpatternId;
104 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
106 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
107 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
108 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
111 new (getDisjunctionContext(term)) DisjunctionContext();
114 void* operator new(size_t, void* where)
119 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
121 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
122 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
125 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
127 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
130 ParenthesesDisjunctionContext* next;
131 unsigned subpatternBackup[1];
134 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
136 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + static_cast<size_t>(disjunction->m_frameSize) * sizeof(uintptr_t);
137 allocatorPool = allocatorPool->ensureCapacity(size);
138 RELEASE_ASSERT(allocatorPool);
139 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
142 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
144 allocatorPool = allocatorPool->dealloc(context);
149 InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs)
153 , decodeSurrogatePairs(decodeSurrogatePairs)
162 void rewind(unsigned amount)
164 ASSERT(pos >= amount);
170 ASSERT(pos < length);
178 ASSERT(pos + 1 < length);
179 return input[pos] | input[pos + 1] << 16;
182 int readChecked(unsigned negativePositionOffest)
184 RELEASE_ASSERT(pos >= negativePositionOffest);
185 unsigned p = pos - negativePositionOffest;
187 int result = input[p];
188 if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
192 result = U16_GET_SUPPLEMENTARY(result, input[p + 1]);
198 int readSurrogatePairChecked(unsigned negativePositionOffset)
200 RELEASE_ASSERT(pos >= negativePositionOffset);
201 unsigned p = pos - negativePositionOffset;
206 int first = input[p];
207 int second = input[p + 1];
208 if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
209 return U16_GET_SUPPLEMENTARY(first, second);
214 int reread(unsigned from)
216 ASSERT(from < length);
217 int result = input[from];
218 if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
219 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
225 ASSERT(!(pos > length));
227 return input[pos - 1];
236 void setPos(unsigned p)
248 return pos == length;
256 bool checkInput(unsigned count)
258 if (((pos + count) <= length) && ((pos + count) >= pos)) {
265 void uncheckInput(unsigned count)
267 RELEASE_ASSERT(pos >= count);
271 bool atStart(unsigned negativePositionOffset)
273 return pos == negativePositionOffset;
276 bool atEnd(unsigned negativePositionOffest)
278 RELEASE_ASSERT(pos >= negativePositionOffest);
279 return (pos - negativePositionOffest) == length;
282 bool isAvailableInput(unsigned offset)
284 return (((pos + offset) <= length) && ((pos + offset) >= pos));
288 const CharType* input;
291 bool decodeSurrogatePairs;
294 bool testCharacterClass(CharacterClass* characterClass, int ch)
296 auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) {
297 for (unsigned i = 0; i < matches.size(); ++i) {
298 if (ch == matches[i])
305 auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) {
307 size_t high = matches.size() - 1;
309 while (low <= high) {
310 size_t mid = low + (high - low) / 2;
311 int diff = ch - matches[mid];
325 auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
326 for (unsigned i = 0; i < ranges.size(); ++i) {
327 if ((ch >= ranges[i].begin) && (ch <= ranges[i].end))
334 auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
336 size_t high = ranges.size() - 1;
338 while (low <= high) {
339 size_t mid = low + (high - low) / 2;
340 int rangeBeginDiff = ch - ranges[mid].begin;
341 if (rangeBeginDiff >= 0 && ch <= ranges[mid].end)
344 if (rangeBeginDiff < 0) {
354 const size_t thresholdForBinarySearch = 6;
357 if (characterClass->m_matchesUnicode.size()) {
358 if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) {
359 if (binarySearchMatches(characterClass->m_matchesUnicode))
361 } else if (linearSearchMatches(characterClass->m_matchesUnicode))
365 if (characterClass->m_rangesUnicode.size()) {
366 if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) {
367 if (binarySearchRanges(characterClass->m_rangesUnicode))
369 } else if (linearSearchRanges(characterClass->m_rangesUnicode))
373 if (characterClass->m_matches.size()) {
374 if (characterClass->m_matches.size() > thresholdForBinarySearch) {
375 if (binarySearchMatches(characterClass->m_matches))
377 } else if (linearSearchMatches(characterClass->m_matches))
381 if (characterClass->m_ranges.size()) {
382 if (characterClass->m_ranges.size() > thresholdForBinarySearch) {
383 if (binarySearchRanges(characterClass->m_ranges))
385 } else if (linearSearchRanges(characterClass->m_ranges))
393 bool checkCharacter(int testChar, unsigned negativeInputOffset)
395 return testChar == input.readChecked(negativeInputOffset);
398 bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset)
400 return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset);
403 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
405 int ch = input.readChecked(negativeInputOffset);
406 return (loChar == ch) || (hiChar == ch);
409 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
411 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
412 return invert ? !match : match;
415 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
417 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
419 if (!input.checkInput(matchSize))
422 for (unsigned i = 0; i < matchSize; ++i) {
423 int oldCh = input.reread(matchBegin + i);
425 if (!U_IS_BMP(oldCh)) {
426 ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i);
429 ch = input.readChecked(negativeInputOffset + matchSize - i);
434 if (pattern->ignoreCase()) {
435 // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode
436 // patterns, Unicode values are never allowed to match against ASCII ones.
437 // For Unicode, we need to check all canonical equivalents of a character.
438 if (!unicode && (isASCII(oldCh) || isASCII(ch))) {
439 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
441 } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2))
445 input.uncheckInput(matchSize);
452 bool matchAssertionBOL(ByteTerm& term)
454 return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
457 bool matchAssertionEOL(ByteTerm& term)
459 if (term.inputPosition)
460 return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
462 return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read()));
465 bool matchAssertionWordBoundary(ByteTerm& term)
467 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
469 if (term.inputPosition)
470 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
472 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
474 bool wordBoundary = prevIsWordchar != readIsWordchar;
475 return term.invert() ? !wordBoundary : wordBoundary;
478 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
480 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
482 switch (term.atom.quantityType) {
483 case QuantifierFixedCount:
486 case QuantifierGreedy:
487 if (backTrack->matchAmount) {
488 --backTrack->matchAmount;
489 input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
494 case QuantifierNonGreedy:
495 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
496 ++backTrack->matchAmount;
497 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
500 input.setPos(backTrack->begin);
507 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
509 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
511 switch (term.atom.quantityType) {
512 case QuantifierFixedCount:
515 case QuantifierGreedy:
516 if (backTrack->matchAmount) {
517 --backTrack->matchAmount;
518 input.uncheckInput(1);
523 case QuantifierNonGreedy:
524 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
525 ++backTrack->matchAmount;
526 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
529 input.uncheckInput(backTrack->matchAmount);
536 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
538 ASSERT(term.type == ByteTerm::TypeCharacterClass);
539 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
541 switch (term.atom.quantityType) {
542 case QuantifierFixedCount: {
544 backTrack->begin = input.getPos();
545 unsigned matchAmount = 0;
546 for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
547 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) {
548 input.setPos(backTrack->begin);
556 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
557 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
563 case QuantifierGreedy: {
564 unsigned position = input.getPos();
565 backTrack->begin = position;
566 unsigned matchAmount = 0;
567 while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
568 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
569 input.setPos(position);
573 position = input.getPos();
575 backTrack->matchAmount = matchAmount;
580 case QuantifierNonGreedy:
581 backTrack->begin = input.getPos();
582 backTrack->matchAmount = 0;
586 RELEASE_ASSERT_NOT_REACHED();
590 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
592 ASSERT(term.type == ByteTerm::TypeCharacterClass);
593 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
595 switch (term.atom.quantityType) {
596 case QuantifierFixedCount:
598 input.setPos(backTrack->begin);
601 case QuantifierGreedy:
602 if (backTrack->matchAmount) {
604 // Rematch one less match
605 input.setPos(backTrack->begin);
606 --backTrack->matchAmount;
607 for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) {
608 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
609 input.uncheckInput(1);
615 --backTrack->matchAmount;
616 input.uncheckInput(1);
621 case QuantifierNonGreedy:
622 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
623 ++backTrack->matchAmount;
624 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
627 input.setPos(backTrack->begin);
634 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
636 ASSERT(term.type == ByteTerm::TypeBackReference);
637 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
639 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
640 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
642 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
643 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
645 if (matchEnd == offsetNoMatch)
648 if (matchBegin == offsetNoMatch)
651 ASSERT(matchBegin <= matchEnd);
653 if (matchBegin == matchEnd)
656 switch (term.atom.quantityType) {
657 case QuantifierFixedCount: {
658 backTrack->begin = input.getPos();
659 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
660 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
661 input.setPos(backTrack->begin);
668 case QuantifierGreedy: {
669 unsigned matchAmount = 0;
670 while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
672 backTrack->matchAmount = matchAmount;
676 case QuantifierNonGreedy:
677 backTrack->begin = input.getPos();
678 backTrack->matchAmount = 0;
682 RELEASE_ASSERT_NOT_REACHED();
686 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
688 ASSERT(term.type == ByteTerm::TypeBackReference);
689 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
691 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
692 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
694 if (matchBegin == offsetNoMatch)
697 ASSERT(matchBegin <= matchEnd);
699 if (matchBegin == matchEnd)
702 switch (term.atom.quantityType) {
703 case QuantifierFixedCount:
704 // for quantityMaxCount == 1, could rewind.
705 input.setPos(backTrack->begin);
708 case QuantifierGreedy:
709 if (backTrack->matchAmount) {
710 --backTrack->matchAmount;
711 input.rewind(matchEnd - matchBegin);
716 case QuantifierNonGreedy:
717 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
718 ++backTrack->matchAmount;
721 input.setPos(backTrack->begin);
728 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
730 if (term.capture()) {
731 unsigned subpatternId = term.atom.subpatternId;
732 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
733 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
736 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
738 unsigned firstSubpatternId = term.atom.subpatternId;
739 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
740 context->restoreOutput(output, firstSubpatternId, count);
742 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
744 while (backTrack->matchAmount) {
745 ParenthesesDisjunctionContext* context = backTrack->lastContext;
747 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
748 if (result == JSRegExpMatch)
749 return JSRegExpMatch;
751 resetMatches(term, context);
752 popParenthesesDisjunctionContext(backTrack);
753 freeParenthesesDisjunctionContext(context);
755 if (result != JSRegExpNoMatch)
759 return JSRegExpNoMatch;
762 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
764 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
765 ASSERT(term.atom.quantityMaxCount == 1);
767 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
769 switch (term.atom.quantityType) {
770 case QuantifierGreedy: {
771 // set this speculatively; if we get to the parens end this will be true.
772 backTrack->begin = input.getPos();
775 case QuantifierNonGreedy: {
776 backTrack->begin = notFound;
777 context->term += term.atom.parenthesesWidth;
780 case QuantifierFixedCount:
784 if (term.capture()) {
785 unsigned subpatternId = term.atom.subpatternId;
786 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
792 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
794 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
795 ASSERT(term.atom.quantityMaxCount == 1);
797 if (term.capture()) {
798 unsigned subpatternId = term.atom.subpatternId;
799 output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition;
802 if (term.atom.quantityType == QuantifierFixedCount)
805 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
806 return backTrack->begin != input.getPos();
809 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
811 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
812 ASSERT(term.atom.quantityMaxCount == 1);
814 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
816 if (term.capture()) {
817 unsigned subpatternId = term.atom.subpatternId;
818 output[(subpatternId << 1)] = offsetNoMatch;
819 output[(subpatternId << 1) + 1] = offsetNoMatch;
822 switch (term.atom.quantityType) {
823 case QuantifierGreedy:
824 // if we backtrack to this point, there is another chance - try matching nothing.
825 ASSERT(backTrack->begin != notFound);
826 backTrack->begin = notFound;
827 context->term += term.atom.parenthesesWidth;
829 case QuantifierNonGreedy:
830 ASSERT(backTrack->begin != notFound);
832 case QuantifierFixedCount:
839 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
841 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
842 ASSERT(term.atom.quantityMaxCount == 1);
844 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
846 switch (term.atom.quantityType) {
847 case QuantifierGreedy:
848 if (backTrack->begin == notFound) {
849 context->term -= term.atom.parenthesesWidth;
853 case QuantifierNonGreedy:
854 if (backTrack->begin == notFound) {
855 backTrack->begin = input.getPos();
856 if (term.capture()) {
857 // Technically this access to inputPosition should be accessing the begin term's
858 // inputPosition, but for repeats other than fixed these values should be
859 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
860 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
861 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
862 unsigned subpatternId = term.atom.subpatternId;
863 output[subpatternId << 1] = input.getPos() - term.inputPosition;
865 context->term -= term.atom.parenthesesWidth;
869 case QuantifierFixedCount:
876 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
878 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
879 ASSERT(term.atom.quantityType == QuantifierGreedy);
880 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
881 ASSERT(!term.capture());
883 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
884 backTrack->begin = input.getPos();
888 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
890 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
892 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
893 // Empty match is a failed match.
894 if (backTrack->begin == input.getPos())
897 // Successful match! Okay, what's next? - loop around and try to match more!
898 context->term -= (term.atom.parenthesesWidth + 1);
902 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
904 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
905 ASSERT(term.atom.quantityType == QuantifierGreedy);
906 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
907 ASSERT(!term.capture());
909 // If we backtrack to this point, we have failed to match this iteration of the parens.
910 // Since this is greedy / zero minimum a failed is also accepted as a match!
911 context->term += term.atom.parenthesesWidth;
915 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
917 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
918 // should always be returned as a successful match - we should never backtrack to here.
919 RELEASE_ASSERT_NOT_REACHED();
923 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
925 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
926 ASSERT(term.atom.quantityMaxCount == 1);
928 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
930 backTrack->begin = input.getPos();
934 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
936 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
937 ASSERT(term.atom.quantityMaxCount == 1);
939 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
941 input.setPos(backTrack->begin);
943 // We've reached the end of the parens; if they are inverted, this is failure.
945 context->term -= term.atom.parenthesesWidth;
952 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
954 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
955 ASSERT(term.atom.quantityMaxCount == 1);
957 // We've failed to match parens; if they are inverted, this is win!
959 context->term += term.atom.parenthesesWidth;
966 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
968 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
969 ASSERT(term.atom.quantityMaxCount == 1);
971 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
973 input.setPos(backTrack->begin);
975 context->term -= term.atom.parenthesesWidth;
979 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
981 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
983 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
984 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
986 backTrack->matchAmount = 0;
987 backTrack->lastContext = 0;
989 ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount);
991 unsigned minimumMatchCount = term.atom.quantityMinCount;
992 JSRegExpResult fixedMatchResult;
994 // Handle fixed matches and the minimum part of a variable length match.
995 if (minimumMatchCount) {
996 // While we haven't yet reached our fixed limit,
997 while (backTrack->matchAmount < minimumMatchCount) {
998 // Try to do a match, and it it succeeds, add it to the list.
999 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1000 fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1001 if (fixedMatchResult == JSRegExpMatch)
1002 appendParenthesesDisjunctionContext(backTrack, context);
1004 // The match failed; try to find an alternate point to carry on from.
1005 resetMatches(term, context);
1006 freeParenthesesDisjunctionContext(context);
1008 if (fixedMatchResult != JSRegExpNoMatch)
1009 return fixedMatchResult;
1010 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1011 if (backtrackResult != JSRegExpMatch)
1012 return backtrackResult;
1016 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1017 recordParenthesesMatch(term, context);
1020 switch (term.atom.quantityType) {
1021 case QuantifierFixedCount: {
1022 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1023 return JSRegExpMatch;
1026 case QuantifierGreedy: {
1027 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1028 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1029 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1030 if (result == JSRegExpMatch)
1031 appendParenthesesDisjunctionContext(backTrack, context);
1033 resetMatches(term, context);
1034 freeParenthesesDisjunctionContext(context);
1036 if (result != JSRegExpNoMatch)
1043 if (backTrack->matchAmount) {
1044 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1045 recordParenthesesMatch(term, context);
1047 return JSRegExpMatch;
1050 case QuantifierNonGreedy:
1051 return JSRegExpMatch;
1054 RELEASE_ASSERT_NOT_REACHED();
1055 return JSRegExpErrorNoMatch;
1058 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
1060 // Greedy matches never should try just adding more - you should already have done
1061 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
1062 // you backtrack an item off the list needs checking, since we'll never have matched
1063 // the one less case. Tracking forwards, still add as much as possible.
1065 // Non-greedy, we've already done the one less case, so don't match on popping.
1066 // We haven't done the one more case, so always try to add that.
1068 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
1070 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1072 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1073 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1075 switch (term.atom.quantityType) {
1076 case QuantifierFixedCount: {
1077 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1079 ParenthesesDisjunctionContext* context = 0;
1080 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
1082 if (result != JSRegExpMatch)
1085 // While we haven't yet reached our fixed limit,
1086 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1087 // Try to do a match, and it it succeeds, add it to the list.
1088 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1089 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1091 if (result == JSRegExpMatch)
1092 appendParenthesesDisjunctionContext(backTrack, context);
1094 // The match failed; try to find an alternate point to carry on from.
1095 resetMatches(term, context);
1096 freeParenthesesDisjunctionContext(context);
1098 if (result != JSRegExpNoMatch)
1100 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1101 if (backtrackResult != JSRegExpMatch)
1102 return backtrackResult;
1106 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1107 context = backTrack->lastContext;
1108 recordParenthesesMatch(term, context);
1109 return JSRegExpMatch;
1112 case QuantifierGreedy: {
1113 if (!backTrack->matchAmount)
1114 return JSRegExpNoMatch;
1116 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1117 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1118 if (result == JSRegExpMatch) {
1119 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1120 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1121 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1122 if (parenthesesResult == JSRegExpMatch)
1123 appendParenthesesDisjunctionContext(backTrack, context);
1125 resetMatches(term, context);
1126 freeParenthesesDisjunctionContext(context);
1128 if (parenthesesResult != JSRegExpNoMatch)
1129 return parenthesesResult;
1135 resetMatches(term, context);
1136 popParenthesesDisjunctionContext(backTrack);
1137 freeParenthesesDisjunctionContext(context);
1139 if (result != JSRegExpNoMatch || backTrack->matchAmount < term.atom.quantityMinCount)
1143 if (backTrack->matchAmount) {
1144 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1145 recordParenthesesMatch(term, context);
1147 return JSRegExpMatch;
1150 case QuantifierNonGreedy: {
1151 // If we've not reached the limit, try to add one more match.
1152 if (backTrack->matchAmount < term.atom.quantityMaxCount) {
1153 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1154 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1155 if (result == JSRegExpMatch) {
1156 appendParenthesesDisjunctionContext(backTrack, context);
1157 recordParenthesesMatch(term, context);
1158 return JSRegExpMatch;
1161 resetMatches(term, context);
1162 freeParenthesesDisjunctionContext(context);
1164 if (result != JSRegExpNoMatch)
1168 // Nope - okay backtrack looking for an alternative.
1169 while (backTrack->matchAmount) {
1170 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1171 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1172 if (result == JSRegExpMatch) {
1173 // successful backtrack! we're back in the game!
1174 if (backTrack->matchAmount) {
1175 context = backTrack->lastContext;
1176 recordParenthesesMatch(term, context);
1178 return JSRegExpMatch;
1181 // pop a match off the stack
1182 resetMatches(term, context);
1183 popParenthesesDisjunctionContext(backTrack);
1184 freeParenthesesDisjunctionContext(context);
1186 if (result != JSRegExpNoMatch)
1190 return JSRegExpNoMatch;
1194 RELEASE_ASSERT_NOT_REACHED();
1195 return JSRegExpErrorNoMatch;
1198 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1202 if (pattern->dotAll()) {
1203 context->matchBegin = startOffset;
1204 context->matchEnd = input.end();
1208 unsigned matchBegin = context->matchBegin;
1210 if (matchBegin > startOffset) {
1211 for (matchBegin--; true; matchBegin--) {
1212 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1217 if (matchBegin == startOffset)
1222 unsigned matchEnd = input.getPos();
1224 for (; (matchEnd != input.end())
1225 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1227 if (((matchBegin && term.anchors.m_bol)
1228 || ((matchEnd != input.end()) && term.anchors.m_eol))
1229 && !pattern->multiline())
1232 context->matchBegin = matchBegin;
1233 context->matchEnd = matchEnd;
1237 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
1238 #define BACKTRACK() { --context->term; goto backtrack; }
1239 #define currentTerm() (disjunction->terms[context->term])
1240 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1242 if (!--remainingMatchCount)
1243 return JSRegExpErrorHitLimit;
1248 context->matchBegin = input.getPos();
1252 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1254 switch (currentTerm().type) {
1255 case ByteTerm::TypeSubpatternBegin:
1257 case ByteTerm::TypeSubpatternEnd:
1258 context->matchEnd = input.getPos();
1259 return JSRegExpMatch;
1261 case ByteTerm::TypeBodyAlternativeBegin:
1263 case ByteTerm::TypeBodyAlternativeDisjunction:
1264 case ByteTerm::TypeBodyAlternativeEnd:
1265 context->matchEnd = input.getPos();
1266 return JSRegExpMatch;
1268 case ByteTerm::TypeAlternativeBegin:
1270 case ByteTerm::TypeAlternativeDisjunction:
1271 case ByteTerm::TypeAlternativeEnd: {
1272 int offset = currentTerm().alternative.end;
1273 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1274 backTrack->offset = offset;
1275 context->term += offset;
1279 case ByteTerm::TypeAssertionBOL:
1280 if (matchAssertionBOL(currentTerm()))
1283 case ByteTerm::TypeAssertionEOL:
1284 if (matchAssertionEOL(currentTerm()))
1287 case ByteTerm::TypeAssertionWordBoundary:
1288 if (matchAssertionWordBoundary(currentTerm()))
1292 case ByteTerm::TypePatternCharacterOnce:
1293 case ByteTerm::TypePatternCharacterFixed: {
1295 if (!U_IS_BMP(currentTerm().atom.patternCharacter)) {
1296 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1297 if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) {
1304 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1306 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1307 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) {
1308 input.setPos(position);
1314 case ByteTerm::TypePatternCharacterGreedy: {
1315 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1316 unsigned matchAmount = 0;
1317 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1318 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1319 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1320 input.setPos(position);
1324 position = input.getPos();
1326 backTrack->matchAmount = matchAmount;
1330 case ByteTerm::TypePatternCharacterNonGreedy: {
1331 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1332 backTrack->begin = input.getPos();
1333 backTrack->matchAmount = 0;
1337 case ByteTerm::TypePatternCasedCharacterOnce:
1338 case ByteTerm::TypePatternCasedCharacterFixed: {
1340 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1341 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
1343 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1345 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1346 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) {
1347 input.setPos(position);
1354 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1355 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1360 case ByteTerm::TypePatternCasedCharacterGreedy: {
1361 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1363 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1364 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1366 unsigned matchAmount = 0;
1367 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1368 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1369 input.uncheckInput(1);
1374 backTrack->matchAmount = matchAmount;
1378 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1379 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1381 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1382 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1384 backTrack->matchAmount = 0;
1388 case ByteTerm::TypeCharacterClass:
1389 if (matchCharacterClass(currentTerm(), context))
1392 case ByteTerm::TypeBackReference:
1393 if (matchBackReference(currentTerm(), context))
1396 case ByteTerm::TypeParenthesesSubpattern: {
1397 JSRegExpResult result = matchParentheses(currentTerm(), context);
1399 if (result == JSRegExpMatch) {
1401 } else if (result != JSRegExpNoMatch)
1406 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1407 if (matchParenthesesOnceBegin(currentTerm(), context))
1410 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1411 if (matchParenthesesOnceEnd(currentTerm(), context))
1414 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1415 if (matchParenthesesTerminalBegin(currentTerm(), context))
1418 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1419 if (matchParenthesesTerminalEnd(currentTerm(), context))
1422 case ByteTerm::TypeParentheticalAssertionBegin:
1423 if (matchParentheticalAssertionBegin(currentTerm(), context))
1426 case ByteTerm::TypeParentheticalAssertionEnd:
1427 if (matchParentheticalAssertionEnd(currentTerm(), context))
1431 case ByteTerm::TypeCheckInput:
1432 if (input.checkInput(currentTerm().checkInputCount))
1436 case ByteTerm::TypeUncheckInput:
1437 input.uncheckInput(currentTerm().checkInputCount);
1440 case ByteTerm::TypeDotStarEnclosure:
1441 if (matchDotStarEnclosure(currentTerm(), context))
1442 return JSRegExpMatch;
1446 // We should never fall-through to here.
1447 RELEASE_ASSERT_NOT_REACHED();
1450 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1452 switch (currentTerm().type) {
1453 case ByteTerm::TypeSubpatternBegin:
1454 return JSRegExpNoMatch;
1455 case ByteTerm::TypeSubpatternEnd:
1456 RELEASE_ASSERT_NOT_REACHED();
1458 case ByteTerm::TypeBodyAlternativeBegin:
1459 case ByteTerm::TypeBodyAlternativeDisjunction: {
1460 int offset = currentTerm().alternative.next;
1461 context->term += offset;
1465 if (input.atEnd() || pattern->sticky())
1466 return JSRegExpNoMatch;
1470 context->matchBegin = input.getPos();
1472 if (currentTerm().alternative.onceThrough)
1473 context->term += currentTerm().alternative.next;
1477 case ByteTerm::TypeBodyAlternativeEnd:
1478 RELEASE_ASSERT_NOT_REACHED();
1480 case ByteTerm::TypeAlternativeBegin:
1481 case ByteTerm::TypeAlternativeDisjunction: {
1482 int offset = currentTerm().alternative.next;
1483 context->term += offset;
1488 case ByteTerm::TypeAlternativeEnd: {
1489 // We should never backtrack back into an alternative of the main body of the regex.
1490 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1491 unsigned offset = backTrack->offset;
1492 context->term -= offset;
1496 case ByteTerm::TypeAssertionBOL:
1497 case ByteTerm::TypeAssertionEOL:
1498 case ByteTerm::TypeAssertionWordBoundary:
1501 case ByteTerm::TypePatternCharacterOnce:
1502 case ByteTerm::TypePatternCharacterFixed:
1503 case ByteTerm::TypePatternCharacterGreedy:
1504 case ByteTerm::TypePatternCharacterNonGreedy:
1505 if (backtrackPatternCharacter(currentTerm(), context))
1508 case ByteTerm::TypePatternCasedCharacterOnce:
1509 case ByteTerm::TypePatternCasedCharacterFixed:
1510 case ByteTerm::TypePatternCasedCharacterGreedy:
1511 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1512 if (backtrackPatternCasedCharacter(currentTerm(), context))
1515 case ByteTerm::TypeCharacterClass:
1516 if (backtrackCharacterClass(currentTerm(), context))
1519 case ByteTerm::TypeBackReference:
1520 if (backtrackBackReference(currentTerm(), context))
1523 case ByteTerm::TypeParenthesesSubpattern: {
1524 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1526 if (result == JSRegExpMatch) {
1528 } else if (result != JSRegExpNoMatch)
1533 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1534 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1537 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1538 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1541 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1542 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1545 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1546 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1549 case ByteTerm::TypeParentheticalAssertionBegin:
1550 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1553 case ByteTerm::TypeParentheticalAssertionEnd:
1554 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1558 case ByteTerm::TypeCheckInput:
1559 input.uncheckInput(currentTerm().checkInputCount);
1562 case ByteTerm::TypeUncheckInput:
1563 input.checkInput(currentTerm().checkInputCount);
1566 case ByteTerm::TypeDotStarEnclosure:
1567 RELEASE_ASSERT_NOT_REACHED();
1570 RELEASE_ASSERT_NOT_REACHED();
1571 return JSRegExpErrorNoMatch;
1574 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1576 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1578 if (result == JSRegExpMatch) {
1579 while (context->matchBegin == context->matchEnd) {
1580 result = matchDisjunction(disjunction, context, true);
1581 if (result != JSRegExpMatch)
1584 return JSRegExpMatch;
1590 unsigned interpret()
1592 if (!input.isAvailableInput(0))
1593 return offsetNoMatch;
1595 if (pattern->m_lock)
1596 pattern->m_lock->lock();
1598 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1599 output[i << 1] = offsetNoMatch;
1601 allocatorPool = pattern->m_allocator->startAllocator();
1602 RELEASE_ASSERT(allocatorPool);
1604 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1606 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1607 if (result == JSRegExpMatch) {
1608 output[0] = context->matchBegin;
1609 output[1] = context->matchEnd;
1612 freeDisjunctionContext(context);
1614 pattern->m_allocator->stopAllocator();
1616 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1618 if (pattern->m_lock)
1619 pattern->m_lock->unlock();
1624 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1626 , unicode(pattern->unicode())
1628 , input(input, start, length, pattern->unicode())
1630 , startOffset(start)
1631 , remainingMatchCount(matchLimit)
1636 BytecodePattern* pattern;
1640 BumpPointerPool* allocatorPool;
1641 unsigned startOffset;
1642 unsigned remainingMatchCount;
1645 class ByteCompiler {
1646 struct ParenthesesStackEntry {
1648 unsigned savedAlternativeIndex;
1649 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1650 : beginTerm(beginTerm)
1651 , savedAlternativeIndex(savedAlternativeIndex)
1657 ByteCompiler(YarrPattern& pattern)
1658 : m_pattern(pattern)
1660 m_currentAlternativeIndex = 0;
1663 std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
1665 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1666 emitDisjunction(m_pattern.m_body);
1669 return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock);
1672 void checkInput(unsigned count)
1674 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1677 void uncheckInput(unsigned count)
1679 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1682 void assertionBOL(unsigned inputPosition)
1684 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1687 void assertionEOL(unsigned inputPosition)
1689 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1692 void assertionWordBoundary(bool invert, unsigned inputPosition)
1694 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1697 void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1699 if (m_pattern.ignoreCase()) {
1700 UChar32 lo = u_tolower(ch);
1701 UChar32 hi = u_toupper(ch);
1704 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType));
1709 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType));
1712 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1714 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1716 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1717 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1718 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1721 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1723 ASSERT(subpatternId);
1725 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1727 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1728 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1729 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1732 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1734 int beginTerm = m_bodyDisjunction->terms.size();
1736 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1737 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1738 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1739 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1741 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1742 m_currentAlternativeIndex = beginTerm + 1;
1745 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1747 int beginTerm = m_bodyDisjunction->terms.size();
1749 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1750 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1751 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1752 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1754 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1755 m_currentAlternativeIndex = beginTerm + 1;
1758 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1760 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1761 // then fix this up at the end! - simplifying this should make it much clearer.
1762 // https://bugs.webkit.org/show_bug.cgi?id=50136
1764 int beginTerm = m_bodyDisjunction->terms.size();
1766 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1767 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1768 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1769 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1771 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1772 m_currentAlternativeIndex = beginTerm + 1;
1775 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1777 int beginTerm = m_bodyDisjunction->terms.size();
1779 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1780 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1781 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1782 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1784 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1785 m_currentAlternativeIndex = beginTerm + 1;
1788 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1790 unsigned beginTerm = popParenthesesStack();
1791 closeAlternative(beginTerm + 1);
1792 unsigned endTerm = m_bodyDisjunction->terms.size();
1794 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1796 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1797 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1799 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1800 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1801 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1802 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1804 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1805 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1806 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1807 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1810 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1812 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1815 unsigned popParenthesesStack()
1817 ASSERT(m_parenthesesStack.size());
1818 int stackEnd = m_parenthesesStack.size() - 1;
1819 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1820 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1821 m_parenthesesStack.shrink(stackEnd);
1823 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1824 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1830 void dumpDisjunction(ByteDisjunction* disjunction)
1832 dataLogF("ByteDisjunction(%p):\n\t", disjunction);
1833 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1834 dataLogF("{ %d } ", disjunction->terms[i].type);
1839 void closeAlternative(int beginTerm)
1841 int origBeginTerm = beginTerm;
1842 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1843 int endIndex = m_bodyDisjunction->terms.size();
1845 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1847 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1848 m_bodyDisjunction->terms.remove(beginTerm);
1850 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1851 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1852 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1853 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1854 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1857 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1859 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1860 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1864 void closeBodyAlternative()
1867 int origBeginTerm = 0;
1868 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1869 int endIndex = m_bodyDisjunction->terms.size();
1871 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1873 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1874 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1875 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1876 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1877 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1880 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1882 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1883 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1886 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1888 unsigned beginTerm = popParenthesesStack();
1889 closeAlternative(beginTerm + 1);
1890 unsigned endTerm = m_bodyDisjunction->terms.size();
1892 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1894 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1896 bool capture = parenthesesBegin.capture();
1897 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1899 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1900 auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
1902 unsigned firstTermInParentheses = beginTerm + 1;
1903 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
1905 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1906 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1907 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1908 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1910 m_bodyDisjunction->terms.shrink(beginTerm);
1912 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1913 m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction));
1915 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1916 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1917 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1918 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1921 void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1923 unsigned beginTerm = popParenthesesStack();
1924 closeAlternative(beginTerm + 1);
1925 unsigned endTerm = m_bodyDisjunction->terms.size();
1927 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1929 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1930 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1932 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1933 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1934 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1935 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1937 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1938 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1939 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1940 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1941 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1942 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1945 void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1947 unsigned beginTerm = popParenthesesStack();
1948 closeAlternative(beginTerm + 1);
1949 unsigned endTerm = m_bodyDisjunction->terms.size();
1951 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1953 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1954 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1956 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1957 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1958 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1959 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1961 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1962 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1963 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1964 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1965 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1966 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1969 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1971 m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
1972 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1973 m_bodyDisjunction->terms[0].frameLocation = 0;
1974 m_currentAlternativeIndex = 0;
1979 closeBodyAlternative();
1982 void alternativeBodyDisjunction(bool onceThrough)
1984 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1985 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1986 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1988 m_currentAlternativeIndex = newAlternativeIndex;
1991 void alternativeDisjunction()
1993 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1994 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1995 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1997 m_currentAlternativeIndex = newAlternativeIndex;
2000 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
2002 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
2003 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
2005 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
2008 if (disjunction == m_pattern.m_body)
2009 alternativeBodyDisjunction(alternative->onceThrough());
2011 alternativeDisjunction();
2014 unsigned minimumSize = alternative->m_minimumSize;
2015 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
2016 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
2019 checkInput(countToCheck);
2020 currentCountAlreadyChecked += countToCheck;
2023 for (auto& term : alternative->m_terms) {
2024 switch (term.type) {
2025 case PatternTerm::TypeAssertionBOL:
2026 assertionBOL(currentCountAlreadyChecked - term.inputPosition);
2029 case PatternTerm::TypeAssertionEOL:
2030 assertionEOL(currentCountAlreadyChecked - term.inputPosition);
2033 case PatternTerm::TypeAssertionWordBoundary:
2034 assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
2037 case PatternTerm::TypePatternCharacter:
2038 atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType);
2041 case PatternTerm::TypeCharacterClass:
2042 atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType);
2045 case PatternTerm::TypeBackReference:
2046 atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType);
2049 case PatternTerm::TypeForwardReference:
2052 case PatternTerm::TypeParenthesesSubpattern: {
2053 unsigned disjunctionAlreadyCheckedCount = 0;
2054 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
2055 unsigned alternativeFrameLocation = term.frameLocation;
2056 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
2057 if (term.quantityType == QuantifierFixedCount)
2058 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
2060 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
2061 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2062 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2063 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
2064 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
2065 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2066 } else if (term.parentheses.isTerminal) {
2067 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2068 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2069 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
2070 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
2071 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2073 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2074 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2075 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0);
2076 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
2077 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
2082 case PatternTerm::TypeParentheticalAssertion: {
2083 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
2085 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2086 unsigned positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
2087 unsigned uncheckAmount = 0;
2088 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
2089 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
2090 uncheckInput(uncheckAmount);
2091 currentCountAlreadyChecked -= uncheckAmount;
2094 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
2095 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
2096 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
2097 if (uncheckAmount) {
2098 checkInput(uncheckAmount);
2099 currentCountAlreadyChecked += uncheckAmount;
2104 case PatternTerm::TypeDotStarEnclosure:
2105 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
2113 YarrPattern& m_pattern;
2114 std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
2115 unsigned m_currentAlternativeIndex;
2116 Vector<ParenthesesStackEntry> m_parenthesesStack;
2117 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
2120 std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
2122 return ByteCompiler(pattern).compile(allocator, lock);
2125 unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
2127 SuperSamplerScope superSamplerScope(false);
2129 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
2130 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
2133 unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
2135 SuperSamplerScope superSamplerScope(false);
2136 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
2139 unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
2141 SuperSamplerScope superSamplerScope(false);
2142 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
2145 // These should be the same for both UChar & LChar.
2146 COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
2147 COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
2148 COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
2149 COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
2150 COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
2151 COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
2152 COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);