REGRESSION (r225695): Repro crash on yahoo login page
[WebKit-https.git] / Source / JavaScriptCore / yarr / YarrPattern.cpp
1 /*
2  * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3  * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25  */
26
27 #include "config.h"
28 #include "YarrPattern.h"
29
30 #include "Options.h"
31 #include "Yarr.h"
32 #include "YarrCanonicalize.h"
33 #include "YarrParser.h"
34 #include <wtf/DataLog.h>
35 #include <wtf/Optional.h>
36 #include <wtf/Threading.h>
37 #include <wtf/Vector.h>
38 #include <wtf/text/WTFString.h>
39
40 using namespace WTF;
41
42 namespace JSC { namespace Yarr {
43
44 #include "RegExpJitTables.h"
45
46 class CharacterClassConstructor {
47 public:
48     CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
49         : m_isCaseInsensitive(isCaseInsensitive)
50         , m_hasNonBMPCharacters(false)
51         , m_anyCharacter(false)
52         , m_canonicalMode(canonicalMode)
53     {
54     }
55     
56     void reset()
57     {
58         m_matches.clear();
59         m_ranges.clear();
60         m_matchesUnicode.clear();
61         m_rangesUnicode.clear();
62         m_hasNonBMPCharacters = false;
63         m_anyCharacter = false;
64     }
65
66     void append(const CharacterClass* other)
67     {
68         for (size_t i = 0; i < other->m_matches.size(); ++i)
69             addSorted(m_matches, other->m_matches[i]);
70         for (size_t i = 0; i < other->m_ranges.size(); ++i)
71             addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
72         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
73             addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
74         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
75             addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
76     }
77
78     void appendInverted(const CharacterClass* other)
79     {
80         auto addSortedInverted = [&](UChar32 min, UChar32 max,
81             const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
82             Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
83
84             auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
85                 if (lo < hiPlusOne) {
86                     if (lo + 1 == hiPlusOne)
87                         addSorted(destMatches, lo);
88                     else
89                         addSortedRange(destRanges, lo, hiPlusOne - 1);
90                 }
91             };
92
93             UChar32 lo = min;
94             size_t matchesIndex = 0;
95             size_t rangesIndex = 0;
96             bool matchesRemaining = matchesIndex < srcMatches.size();
97             bool rangesRemaining = rangesIndex < srcRanges.size();
98
99             if (!matchesRemaining && !rangesRemaining) {
100                 addSortedMatchOrRange(min, max + 1);
101                 return;
102             }
103
104             while (matchesRemaining || rangesRemaining) {
105                 UChar32 hiPlusOne;
106                 UChar32 nextLo;
107
108                 if (matchesRemaining
109                     && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
110                     hiPlusOne = srcMatches[matchesIndex];
111                     nextLo = hiPlusOne + 1;
112                     ++matchesIndex;
113                     matchesRemaining = matchesIndex < srcMatches.size();
114                 } else {
115                     hiPlusOne = srcRanges[rangesIndex].begin;
116                     nextLo = srcRanges[rangesIndex].end + 1;
117                     ++rangesIndex;
118                     rangesRemaining = rangesIndex < srcRanges.size();
119                 }
120
121                 addSortedMatchOrRange(lo, hiPlusOne);
122
123                 lo = nextLo;
124             }
125
126             addSortedMatchOrRange(lo, max + 1);
127         };
128
129         addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges);
130         addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
131     }
132
133     void putChar(UChar32 ch)
134     {
135         if (!m_isCaseInsensitive) {
136             addSorted(ch);
137             return;
138         }
139
140         if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
141             // Handle ASCII cases.
142             if (isASCIIAlpha(ch)) {
143                 addSorted(m_matches, toASCIIUpper(ch));
144                 addSorted(m_matches, toASCIILower(ch));
145             } else
146                 addSorted(m_matches, ch);
147             return;
148         }
149
150         // Add multiple matches, if necessary.
151         const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
152         if (info->type == CanonicalizeUnique)
153             addSorted(ch);
154         else
155             putUnicodeIgnoreCase(ch, info);
156     }
157
158     void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
159     {
160         ASSERT(m_isCaseInsensitive);
161         ASSERT(ch >= info->begin && ch <= info->end);
162         ASSERT(info->type != CanonicalizeUnique);
163         if (info->type == CanonicalizeSet) {
164             for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
165                 addSorted(ch);
166         } else {
167             addSorted(ch);
168             addSorted(getCanonicalPair(info, ch));
169         }
170     }
171
172     void putRange(UChar32 lo, UChar32 hi)
173     {
174         if (isASCII(lo)) {
175             char asciiLo = lo;
176             char asciiHi = std::min(hi, (UChar32)0x7f);
177             addSortedRange(m_ranges, lo, asciiHi);
178             
179             if (m_isCaseInsensitive) {
180                 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
181                     addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
182                 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
183                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
184             }
185         }
186         if (isASCII(hi))
187             return;
188
189         lo = std::max(lo, (UChar32)0x80);
190         addSortedRange(m_rangesUnicode, lo, hi);
191         
192         if (!m_isCaseInsensitive)
193             return;
194
195         const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
196         while (true) {
197             // Handle the range [lo .. end]
198             UChar32 end = std::min<UChar32>(info->end, hi);
199
200             switch (info->type) {
201             case CanonicalizeUnique:
202                 // Nothing to do - no canonical equivalents.
203                 break;
204             case CanonicalizeSet: {
205                 UChar ch;
206                 for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
207                     addSorted(m_matchesUnicode, ch);
208                 break;
209             }
210             case CanonicalizeRangeLo:
211                 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
212                 break;
213             case CanonicalizeRangeHi:
214                 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
215                 break;
216             case CanonicalizeAlternatingAligned:
217                 // Use addSortedRange since there is likely an abutting range to combine with.
218                 if (lo & 1)
219                     addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
220                 if (!(end & 1))
221                     addSortedRange(m_rangesUnicode, end + 1, end + 1);
222                 break;
223             case CanonicalizeAlternatingUnaligned:
224                 // Use addSortedRange since there is likely an abutting range to combine with.
225                 if (!(lo & 1))
226                     addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
227                 if (end & 1)
228                     addSortedRange(m_rangesUnicode, end + 1, end + 1);
229                 break;
230             }
231
232             if (hi == end)
233                 return;
234
235             ++info;
236             lo = info->begin;
237         };
238
239     }
240
241     std::unique_ptr<CharacterClass> charClass()
242     {
243         coalesceTables();
244
245         auto characterClass = std::make_unique<CharacterClass>();
246
247         characterClass->m_matches.swap(m_matches);
248         characterClass->m_ranges.swap(m_ranges);
249         characterClass->m_matchesUnicode.swap(m_matchesUnicode);
250         characterClass->m_rangesUnicode.swap(m_rangesUnicode);
251         characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
252         characterClass->m_anyCharacter = anyCharacter();
253
254         m_hasNonBMPCharacters = false;
255         m_anyCharacter = false;
256
257         return characterClass;
258     }
259
260 private:
261     void addSorted(UChar32 ch)
262     {
263         addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
264     }
265
266     void addSorted(Vector<UChar32>& matches, UChar32 ch)
267     {
268         unsigned pos = 0;
269         unsigned range = matches.size();
270
271         if (!U_IS_BMP(ch))
272             m_hasNonBMPCharacters = true;
273
274         // binary chop, find position to insert char.
275         while (range) {
276             unsigned index = range >> 1;
277
278             int val = matches[pos+index] - ch;
279             if (!val)
280                 return;
281             else if (val > 0) {
282                 if (val == 1) {
283                     UChar32 lo = ch;
284                     UChar32 hi = ch + 1;
285                     matches.remove(pos + index);
286                     if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
287                         lo = ch - 1;
288                         matches.remove(pos + index - 1);
289                     }
290                     addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
291                     return;
292                 }
293                 range = index;
294             } else {
295                 if (val == -1) {
296                     UChar32 lo = ch - 1;
297                     UChar32 hi = ch;
298                     matches.remove(pos + index);
299                     if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
300                         hi = ch + 1;
301                         matches.remove(pos + index + 1);
302                     }
303                     addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
304                     return;
305                 }
306                 pos += (index+1);
307                 range -= (index+1);
308             }
309         }
310         
311         if (pos == matches.size())
312             matches.append(ch);
313         else
314             matches.insert(pos, ch);
315     }
316
317     void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
318     {
319         size_t end = ranges.size();
320
321         if (!U_IS_BMP(hi))
322             m_hasNonBMPCharacters = true;
323
324         // Simple linear scan - I doubt there are that many ranges anyway...
325         // feel free to fix this with something faster (eg binary chop).
326         for (size_t i = 0; i < end; ++i) {
327             // does the new range fall before the current position in the array
328             if (hi < ranges[i].begin) {
329                 // Concatenate appending ranges.
330                 if (hi == (ranges[i].begin - 1)) {
331                     ranges[i].begin = lo;
332                     return;
333                 }
334                 ranges.insert(i, CharacterRange(lo, hi));
335                 return;
336             }
337             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
338             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
339             // end of the last range they concatenate, which is just as good.
340             if (lo <= (ranges[i].end + 1)) {
341                 // found an intersect! we'll replace this entry in the array.
342                 ranges[i].begin = std::min(ranges[i].begin, lo);
343                 ranges[i].end = std::max(ranges[i].end, hi);
344
345                 mergeRangesFrom(ranges, i);
346                 return;
347             }
348         }
349
350         // CharacterRange comes after all existing ranges.
351         ranges.append(CharacterRange(lo, hi));
352     }
353
354     void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
355     {
356         unsigned next = index + 1;
357
358         // each iteration of the loop we will either remove something from the list, or break out of the loop.
359         while (next < ranges.size()) {
360             if (ranges[next].begin <= (ranges[index].end + 1)) {
361                 // the next entry now overlaps / concatenates with this one.
362                 ranges[index].end = std::max(ranges[index].end, ranges[next].end);
363                 ranges.remove(next);
364             } else
365                 break;
366         }
367
368     }
369
370     void coalesceTables()
371     {
372         auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
373
374             size_t matchesIndex = 0;
375             size_t rangesIndex = 0;
376
377             while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
378                 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
379                     matchesIndex++;
380
381                 if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
382                     ranges[rangesIndex].begin = matches[matchesIndex];
383                     matches.remove(matchesIndex);
384                 }
385
386                 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
387                     matchesIndex++;
388
389                 if (matchesIndex < matches.size()) {
390                     if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
391                         ranges[rangesIndex].end = matches[matchesIndex];
392                         matches.remove(matchesIndex);
393
394                         mergeRangesFrom(ranges, rangesIndex);
395                     } else
396                         matchesIndex++;
397                 }
398             }
399         };
400
401         coalesceMatchesAndRanges(m_matches, m_ranges);
402         coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
403
404         if (!m_matches.size() && !m_matchesUnicode.size()
405             && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
406             && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
407             && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
408             m_anyCharacter = true;
409     }
410
411     bool hasNonBMPCharacters()
412     {
413         return m_hasNonBMPCharacters;
414     }
415
416     bool anyCharacter()
417     {
418         return m_anyCharacter;
419     }
420
421     bool m_isCaseInsensitive : 1;
422     bool m_hasNonBMPCharacters : 1;
423     bool m_anyCharacter : 1;
424     CanonicalMode m_canonicalMode;
425
426     Vector<UChar32> m_matches;
427     Vector<CharacterRange> m_ranges;
428     Vector<UChar32> m_matchesUnicode;
429     Vector<CharacterRange> m_rangesUnicode;
430 };
431
432 class YarrPatternConstructor {
433 public:
434     YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
435         : m_pattern(pattern)
436         , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
437         , m_stackLimit(stackLimit)
438         , m_invertParentheticalAssertion(false)
439     {
440         auto body = std::make_unique<PatternDisjunction>();
441         m_pattern.m_body = body.get();
442         m_alternative = body->addNewAlternative();
443         m_pattern.m_disjunctions.append(WTFMove(body));
444     }
445
446     ~YarrPatternConstructor()
447     {
448     }
449
450     void reset()
451     {
452         m_pattern.reset();
453         m_characterClassConstructor.reset();
454
455         auto body = std::make_unique<PatternDisjunction>();
456         m_pattern.m_body = body.get();
457         m_alternative = body->addNewAlternative();
458         m_pattern.m_disjunctions.append(WTFMove(body));
459     }
460     
461     void assertionBOL()
462     {
463         if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
464             m_alternative->m_startsWithBOL = true;
465             m_alternative->m_containsBOL = true;
466             m_pattern.m_containsBOL = true;
467         }
468         m_alternative->m_terms.append(PatternTerm::BOL());
469     }
470     void assertionEOL()
471     {
472         m_alternative->m_terms.append(PatternTerm::EOL());
473     }
474     void assertionWordBoundary(bool invert)
475     {
476         m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
477     }
478
479     void atomPatternCharacter(UChar32 ch)
480     {
481         // We handle case-insensitive checking of unicode characters which do have both
482         // cases by handling them as if they were defined using a CharacterClass.
483         if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
484             m_alternative->m_terms.append(PatternTerm(ch));
485             return;
486         }
487
488         const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
489         if (info->type == CanonicalizeUnique) {
490             m_alternative->m_terms.append(PatternTerm(ch));
491             return;
492         }
493
494         m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
495         auto newCharacterClass = m_characterClassConstructor.charClass();
496         m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
497         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
498     }
499
500     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
501     {
502         switch (classID) {
503         case BuiltInCharacterClassID::DigitClassID:
504             m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
505             break;
506         case BuiltInCharacterClassID::SpaceClassID:
507             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
508             break;
509         case BuiltInCharacterClassID::WordClassID:
510             if (m_pattern.unicode() && m_pattern.ignoreCase())
511                 m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
512             else
513                 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
514             break;
515         case BuiltInCharacterClassID::DotClassID:
516             ASSERT(!invert);
517             if (m_pattern.dotAll())
518                 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
519             else
520                 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true));
521             break;
522         default:
523             m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert));
524             break;
525         }
526     }
527
528     void atomCharacterClassBegin(bool invert = false)
529     {
530         m_invertCharacterClass = invert;
531     }
532
533     void atomCharacterClassAtom(UChar32 ch)
534     {
535         m_characterClassConstructor.putChar(ch);
536     }
537
538     void atomCharacterClassRange(UChar32 begin, UChar32 end)
539     {
540         m_characterClassConstructor.putRange(begin, end);
541     }
542
543     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
544     {
545         ASSERT(classID != BuiltInCharacterClassID::DotClassID);
546
547         switch (classID) {
548         case BuiltInCharacterClassID::DigitClassID:
549             m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
550             break;
551         
552         case BuiltInCharacterClassID::SpaceClassID:
553             m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
554             break;
555         
556         case BuiltInCharacterClassID::WordClassID:
557             if (m_pattern.unicode() && m_pattern.ignoreCase())
558                 m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
559             else
560                 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
561             break;
562         
563         default:
564             if (!invert)
565                 m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
566             else
567                 m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
568         }
569     }
570
571     void atomCharacterClassEnd()
572     {
573         auto newCharacterClass = m_characterClassConstructor.charClass();
574
575         if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
576             m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
577             return;
578         }
579         m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
580         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
581     }
582
583     void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt)
584     {
585         unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
586         if (capture) {
587             m_pattern.m_numSubpatterns++;
588             if (optGroupName) {
589                 while (m_pattern.m_captureGroupNames.size() < subpatternId)
590                     m_pattern.m_captureGroupNames.append(String());
591                 m_pattern.m_captureGroupNames.append(optGroupName.value());
592                 m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
593             }
594         } else
595             ASSERT(!optGroupName);
596
597         auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
598         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
599         m_alternative = parenthesesDisjunction->addNewAlternative();
600         m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
601     }
602
603     void atomParentheticalAssertionBegin(bool invert = false)
604     {
605         auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
606         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
607         m_alternative = parenthesesDisjunction->addNewAlternative();
608         m_invertParentheticalAssertion = invert;
609         m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
610     }
611
612     void atomParenthesesEnd()
613     {
614         ASSERT(m_alternative->m_parent);
615         ASSERT(m_alternative->m_parent->m_parent);
616
617         PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
618         m_alternative = m_alternative->m_parent->m_parent;
619
620         PatternTerm& lastTerm = m_alternative->lastTerm();
621
622         unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
623         unsigned numBOLAnchoredAlts = 0;
624
625         for (unsigned i = 0; i < numParenAlternatives; i++) {
626             // Bubble up BOL flags
627             if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
628                 numBOLAnchoredAlts++;
629         }
630
631         if (numBOLAnchoredAlts) {
632             m_alternative->m_containsBOL = true;
633             // If all the alternatives in parens start with BOL, then so does this one
634             if (numBOLAnchoredAlts == numParenAlternatives)
635                 m_alternative->m_startsWithBOL = true;
636         }
637
638         lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
639         m_invertParentheticalAssertion = false;
640     }
641
642     void atomBackReference(unsigned subpatternId)
643     {
644         ASSERT(subpatternId);
645         m_pattern.m_containsBackreferences = true;
646         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
647
648         if (subpatternId > m_pattern.m_numSubpatterns) {
649             m_alternative->m_terms.append(PatternTerm::ForwardReference());
650             return;
651         }
652
653         PatternAlternative* currentAlternative = m_alternative;
654         ASSERT(currentAlternative);
655
656         // Note to self: if we waited until the AST was baked, we could also remove forwards refs 
657         while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
658             PatternTerm& term = currentAlternative->lastTerm();
659             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
660
661             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
662                 m_alternative->m_terms.append(PatternTerm::ForwardReference());
663                 return;
664             }
665         }
666
667         m_alternative->m_terms.append(PatternTerm(subpatternId));
668     }
669
670     void atomNamedBackReference(String subpatternName)
671     {
672         ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
673         atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
674     }
675
676     // deep copy the argument disjunction.  If filterStartsWithBOL is true,
677     // skip alternatives with m_startsWithBOL set true.
678     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
679     {
680         std::unique_ptr<PatternDisjunction> newDisjunction;
681         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
682             PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
683             if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
684                 if (!newDisjunction) {
685                     newDisjunction = std::make_unique<PatternDisjunction>();
686                     newDisjunction->m_parent = disjunction->m_parent;
687                 }
688                 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
689                 newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
690                 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
691                     newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
692             }
693         }
694         
695         if (!newDisjunction)
696             return 0;
697
698         PatternDisjunction* copiedDisjunction = newDisjunction.get();
699         m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
700         return copiedDisjunction;
701     }
702     
703     PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
704     {
705         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
706             return PatternTerm(term);
707         
708         PatternTerm termCopy = term;
709         termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
710         m_pattern.m_hasCopiedParenSubexpressions = true;
711         return termCopy;
712     }
713     
714     void quantifyAtom(unsigned min, unsigned max, bool greedy)
715     {
716         ASSERT(min <= max);
717         ASSERT(m_alternative->m_terms.size());
718
719         if (!max) {
720             m_alternative->removeLastTerm();
721             return;
722         }
723
724         PatternTerm& term = m_alternative->lastTerm();
725         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
726         ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
727
728         if (term.type == PatternTerm::TypeParentheticalAssertion) {
729             // If an assertion is quantified with a minimum count of zero, it can simply be removed.
730             // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
731             // results in any input being consumed, however the continuation passed to the assertion
732             // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
733             // reject all zero length matches (see step 2.1). A match from the continuation of the
734             // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
735             // this is that matches from the assertion are not required, and won't be accepted anyway,
736             // so no need to ever run it.
737             if (!min)
738                 m_alternative->removeLastTerm();
739             // We never need to run an assertion more than once. Subsequent interations will be run
740             // with the same start index (since assertions are non-capturing) and the same captures
741             // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
742             // same result and captures. If the first match succeeds then the subsequent (min - 1)
743             // matches will too. Any additional optional matches will fail (on the same basis as the
744             // minimum zero quantified assertions, above), but this will still result in a match.
745             return;
746         }
747
748         if (min == max)
749             term.quantify(min, max, QuantifierFixedCount);
750         else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
751             term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
752         else {
753             term.quantify(min, min, QuantifierFixedCount);
754             m_alternative->m_terms.append(copyTerm(term));
755             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
756             m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
757             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
758                 m_alternative->lastTerm().parentheses.isCopy = true;
759         }
760     }
761
762     void disjunction()
763     {
764         m_alternative = m_alternative->m_parent->addNewAlternative();
765     }
766
767     YarrPattern::ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
768     {
769         if (UNLIKELY(!isSafeToRecurse()))
770             return YarrPattern::TooManyDisjunctions;
771
772         YarrPattern::ErrorCode error = YarrPattern::NoError;
773         alternative->m_hasFixedSize = true;
774         Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
775
776         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
777             PatternTerm& term = alternative->m_terms[i];
778
779             switch (term.type) {
780             case PatternTerm::TypeAssertionBOL:
781             case PatternTerm::TypeAssertionEOL:
782             case PatternTerm::TypeAssertionWordBoundary:
783                 term.inputPosition = currentInputPosition.unsafeGet();
784                 break;
785
786             case PatternTerm::TypeBackReference:
787                 term.inputPosition = currentInputPosition.unsafeGet();
788                 term.frameLocation = currentCallFrameSize;
789                 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
790                 alternative->m_hasFixedSize = false;
791                 break;
792
793             case PatternTerm::TypeForwardReference:
794                 break;
795
796             case PatternTerm::TypePatternCharacter:
797                 term.inputPosition = currentInputPosition.unsafeGet();
798                 if (term.quantityType != QuantifierFixedCount) {
799                     term.frameLocation = currentCallFrameSize;
800                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
801                     alternative->m_hasFixedSize = false;
802                 } else if (m_pattern.unicode()) {
803                     Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
804                     tempCount *= U16_LENGTH(term.patternCharacter);
805                     if (tempCount.hasOverflowed())
806                         return YarrPattern::OffsetTooLarge;
807                     currentInputPosition += tempCount;
808                 } else
809                     currentInputPosition += term.quantityMaxCount;
810                 break;
811
812             case PatternTerm::TypeCharacterClass:
813                 term.inputPosition = currentInputPosition.unsafeGet();
814                 if (term.quantityType != QuantifierFixedCount) {
815                     term.frameLocation = currentCallFrameSize;
816                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
817                     alternative->m_hasFixedSize = false;
818                 } else if (m_pattern.unicode()) {
819                     term.frameLocation = currentCallFrameSize;
820                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
821                     currentInputPosition += term.quantityMaxCount;
822                     alternative->m_hasFixedSize = false;
823                 } else
824                     currentInputPosition += term.quantityMaxCount;
825                 break;
826
827             case PatternTerm::TypeParenthesesSubpattern:
828                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
829                 term.frameLocation = currentCallFrameSize;
830                 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
831                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
832                     error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
833                     if (error)
834                         return error;
835                     // If quantity is fixed, then pre-check its minimum size.
836                     if (term.quantityType == QuantifierFixedCount)
837                         currentInputPosition += term.parentheses.disjunction->m_minimumSize;
838                     term.inputPosition = currentInputPosition.unsafeGet();
839                 } else if (term.parentheses.isTerminal) {
840                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
841                     error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
842                     if (error)
843                         return error;
844                     term.inputPosition = currentInputPosition.unsafeGet();
845                 } else {
846                     term.inputPosition = currentInputPosition.unsafeGet();
847                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
848                     error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
849                     if (error)
850                         return error;
851                 }
852                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
853                 alternative->m_hasFixedSize = false;
854                 break;
855
856             case PatternTerm::TypeParentheticalAssertion:
857                 term.inputPosition = currentInputPosition.unsafeGet();
858                 term.frameLocation = currentCallFrameSize;
859                 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
860                 if (error)
861                     return error;
862                 break;
863
864             case PatternTerm::TypeDotStarEnclosure:
865                 ASSERT(!m_pattern.m_saveInitialStartValue);
866                 alternative->m_hasFixedSize = false;
867                 term.inputPosition = initialInputPosition;
868                 m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
869                 currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
870                 m_pattern.m_saveInitialStartValue = true;
871                 break;
872             }
873             if (currentInputPosition.hasOverflowed())
874                 return YarrPattern::OffsetTooLarge;
875         }
876
877         alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
878         newCallFrameSize = currentCallFrameSize;
879         return error;
880     }
881
882     YarrPattern::ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
883     {
884         if (UNLIKELY(!isSafeToRecurse()))
885             return YarrPattern::TooManyDisjunctions;
886
887         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
888             initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
889
890         unsigned minimumInputSize = UINT_MAX;
891         unsigned maximumCallFrameSize = 0;
892         bool hasFixedSize = true;
893         YarrPattern::ErrorCode error = YarrPattern::NoError;
894
895         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
896             PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
897             unsigned currentAlternativeCallFrameSize;
898             error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
899             if (error)
900                 return error;
901             minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
902             maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
903             hasFixedSize &= alternative->m_hasFixedSize;
904             if (alternative->m_minimumSize > INT_MAX)
905                 m_pattern.m_containsUnsignedLengthPattern = true;
906         }
907         
908         ASSERT(minimumInputSize != UINT_MAX);
909         ASSERT(maximumCallFrameSize >= initialCallFrameSize);
910
911         disjunction->m_hasFixedSize = hasFixedSize;
912         disjunction->m_minimumSize = minimumInputSize;
913         disjunction->m_callFrameSize = maximumCallFrameSize;
914         callFrameSize = maximumCallFrameSize;
915         return error;
916     }
917
918     const char* setupOffsets()
919     {
920         // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
921         unsigned ignoredCallFrameSize;
922         YarrPattern::ErrorCode error = setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
923         if (error)
924             return YarrPattern::errorMessage(error);
925         return nullptr;
926     }
927
928     // This optimization identifies sets of parentheses that we will never need to backtrack.
929     // In these cases we do not need to store state from prior iterations.
930     // We can presently avoid backtracking for:
931     //   * where the parens are at the end of the regular expression (last term in any of the
932     //     alternatives of the main body disjunction).
933     //   * where the parens are non-capturing, and quantified unbounded greedy (*).
934     //   * where the parens do not contain any capturing subpatterns.
935     void checkForTerminalParentheses()
936     {
937         // This check is much too crude; should be just checking whether the candidate
938         // node contains nested capturing subpatterns, not the whole expression!
939         if (m_pattern.m_numSubpatterns)
940             return;
941
942         Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
943         for (size_t i = 0; i < alternatives.size(); ++i) {
944             Vector<PatternTerm>& terms = alternatives[i]->m_terms;
945             if (terms.size()) {
946                 PatternTerm& term = terms.last();
947                 if (term.type == PatternTerm::TypeParenthesesSubpattern
948                     && term.quantityType == QuantifierGreedy
949                     && term.quantityMinCount == 0
950                     && term.quantityMaxCount == quantifyInfinite
951                     && !term.capture())
952                     term.parentheses.isTerminal = true;
953             }
954         }
955     }
956
957     void optimizeBOL()
958     {
959         // Look for expressions containing beginning of line (^) anchoring and unroll them.
960         // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
961         // This code relies on the parsing code tagging alternatives with m_containsBOL and
962         // m_startsWithBOL and rolling those up to containing alternatives.
963         // At this point, this is only valid for non-multiline expressions.
964         PatternDisjunction* disjunction = m_pattern.m_body;
965         
966         if (!m_pattern.m_containsBOL || m_pattern.multiline())
967             return;
968         
969         PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
970
971         // Set alternatives in disjunction to "onceThrough"
972         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
973             disjunction->m_alternatives[alt]->setOnceThrough();
974
975         if (loopDisjunction) {
976             // Move alternatives from loopDisjunction to disjunction
977             for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
978                 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
979                 
980             loopDisjunction->m_alternatives.clear();
981         }
982     }
983
984     bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
985     {
986         Vector<PatternTerm>& terms = alternative->m_terms;
987
988         ASSERT(endIndex <= terms.size());
989         for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
990             PatternTerm& term = terms[termIndex];
991
992             if (term.m_capture)
993                 return true;
994
995             if (term.type == PatternTerm::TypeParenthesesSubpattern) {
996                 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
997                 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
998                     if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
999                         return true;
1000                 }
1001             }
1002         }
1003
1004         return false;
1005     }
1006
1007     // This optimization identifies alternatives in the form of 
1008     // [^].*[?]<expression>.*[$] for expressions that don't have any 
1009     // capturing terms. The alternative is changed to <expression> 
1010     // followed by processing of the dot stars to find and adjust the 
1011     // beginning and the end of the match.
1012     void optimizeDotStarWrappedExpressions()
1013     {
1014         Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1015         if (alternatives.size() != 1)
1016             return;
1017
1018         CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1019         PatternAlternative* alternative = alternatives[0].get();
1020         Vector<PatternTerm>& terms = alternative->m_terms;
1021         if (terms.size() >= 3) {
1022             bool startsWithBOL = false;
1023             bool endsWithEOL = false;
1024             size_t termIndex, firstExpressionTerm;
1025
1026             termIndex = 0;
1027             if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
1028                 startsWithBOL = true;
1029                 ++termIndex;
1030             }
1031             
1032             PatternTerm& firstNonAnchorTerm = terms[termIndex];
1033             if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass)
1034                 || (firstNonAnchorTerm.characterClass != dotCharacterClass)
1035                 || !((firstNonAnchorTerm.quantityType == QuantifierGreedy)
1036                     || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
1037                 return;
1038             
1039             firstExpressionTerm = termIndex + 1;
1040             
1041             termIndex = terms.size() - 1;
1042             if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
1043                 endsWithEOL = true;
1044                 --termIndex;
1045             }
1046             
1047             PatternTerm& lastNonAnchorTerm = terms[termIndex];
1048             if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass)
1049                 || (lastNonAnchorTerm.characterClass != dotCharacterClass)
1050                 || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
1051                 return;
1052
1053             size_t endIndex = termIndex;
1054             if (firstExpressionTerm >= endIndex)
1055                 return;
1056
1057             if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
1058                 for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
1059                     terms.remove(termIndex);
1060
1061                 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
1062                     terms.remove(termIndex - 1);
1063
1064                 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
1065                 
1066                 m_pattern.m_containsBOL = false;
1067             }
1068         }
1069     }
1070
1071 private:
1072     bool isSafeToRecurse() const
1073     {
1074         if (!m_stackLimit)
1075             return true;
1076         ASSERT(Thread::current().stack().isGrowingDownward());
1077         int8_t* curr = reinterpret_cast<int8_t*>(&curr);
1078         int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1079         return curr >= limit;
1080     }
1081
1082     YarrPattern& m_pattern;
1083     PatternAlternative* m_alternative;
1084     CharacterClassConstructor m_characterClassConstructor;
1085     void* m_stackLimit;
1086     bool m_invertCharacterClass;
1087     bool m_invertParentheticalAssertion;
1088 };
1089
1090 const char* YarrPattern::errorMessage(YarrPattern::ErrorCode error)
1091 {
1092 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
1093     // The order of this array must match the ErrorCode enum.
1094     static const char* errorMessages[NumberOfErrorCodes] = {
1095         nullptr,                                                              // NoError
1096         REGEXP_ERROR_PREFIX "regular expression too large",                   // PatternTooLarge     
1097         REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",          // QuantifierOutOfOrder
1098         REGEXP_ERROR_PREFIX "nothing to repeat",                              // QuantifierWithoutAtom
1099         REGEXP_ERROR_PREFIX "number too large in {} quantifier",              // QuantifierTooLarge
1100         REGEXP_ERROR_PREFIX "missing )",                                      // MissingParentheses
1101         REGEXP_ERROR_PREFIX "unmatched parentheses",                          // ParenthesesUnmatched
1102         REGEXP_ERROR_PREFIX "unrecognized character after (?",                // ParenthesesTypeInvalid
1103         REGEXP_ERROR_PREFIX "invalid group specifier name",                   // InvalidGroupName
1104         REGEXP_ERROR_PREFIX "duplicate group specifier name",                 // DuplicateGroupName
1105         REGEXP_ERROR_PREFIX "missing terminating ] for character class",      // CharacterClassUnmatched
1106         REGEXP_ERROR_PREFIX "range out of order in character class",          // CharacterClassOutOfOrder
1107         REGEXP_ERROR_PREFIX "\\ at end of pattern",                           // EscapeUnterminated
1108         REGEXP_ERROR_PREFIX "invalid unicode {} escape",                      // InvalidUnicodeEscape
1109         REGEXP_ERROR_PREFIX "invalid backreference for unicode pattern",      // InvalidBackreference
1110         REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern",  // InvalidIdentityEscape
1111         REGEXP_ERROR_PREFIX "invalid property expression",                    // InvalidUnicodePropertyExpression
1112         REGEXP_ERROR_PREFIX "too many nested disjunctions",                   // TooManyDisjunctions
1113         REGEXP_ERROR_PREFIX "pattern exceeds string length limits",           // OffsetTooLarge
1114         REGEXP_ERROR_PREFIX "invalid flags"                                   // InvalidRegularExpressionFlags
1115     };
1116
1117     return errorMessages[error];
1118 }
1119
1120 const char* YarrPattern::compile(const String& patternString, void* stackLimit)
1121 {
1122     YarrPatternConstructor constructor(*this, stackLimit);
1123
1124     if (m_flags == InvalidFlags)
1125         return errorMessage(InvalidRegularExpressionFlags);
1126
1127     if (const char* error = parse(constructor, patternString, unicode()))
1128         return error;
1129     
1130     // If the pattern contains illegal backreferences reset & reparse.
1131     // Quoting Netscape's "What's new in JavaScript 1.2",
1132     //      "Note: if the number of left parentheses is less than the number specified
1133     //       in \#, the \# is taken as an octal escape as described in the next row."
1134     if (containsIllegalBackReference()) {
1135         if (unicode())
1136             return errorMessage(InvalidBackreference);
1137
1138         unsigned numSubpatterns = m_numSubpatterns;
1139
1140         constructor.reset();
1141 #if !ASSERT_DISABLED
1142         const char* error =
1143 #endif
1144             parse(constructor, patternString, unicode(), numSubpatterns);
1145
1146         ASSERT(!error);
1147         ASSERT(numSubpatterns == m_numSubpatterns);
1148     }
1149
1150     constructor.checkForTerminalParentheses();
1151     constructor.optimizeDotStarWrappedExpressions();
1152     constructor.optimizeBOL();
1153         
1154     if (const char* error = constructor.setupOffsets())
1155         return error;
1156
1157     if (Options::dumpCompiledRegExpPatterns())
1158         dumpPattern(patternString);
1159
1160     return nullptr;
1161 }
1162
1163 YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, const char** error, void* stackLimit)
1164     : m_containsBackreferences(false)
1165     , m_containsBOL(false)
1166     , m_containsUnsignedLengthPattern(false)
1167     , m_hasCopiedParenSubexpressions(false)
1168     , m_saveInitialStartValue(false)
1169     , m_flags(flags)
1170     , m_numSubpatterns(0)
1171     , m_maxBackReference(0)
1172     , anycharCached(0)
1173     , newlineCached(0)
1174     , digitsCached(0)
1175     , spacesCached(0)
1176     , wordcharCached(0)
1177     , wordUnicodeIgnoreCaseCharCached(0)
1178     , nondigitsCached(0)
1179     , nonspacesCached(0)
1180     , nonwordcharCached(0)
1181     , nonwordUnicodeIgnoreCasecharCached(0)
1182 {
1183     *error = compile(pattern, stackLimit);
1184 }
1185
1186 void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1187 {
1188     out.print("    ");
1189     for (; nestingDepth; --nestingDepth)
1190         out.print("  ");
1191 }
1192
1193 void dumpUChar32(PrintStream& out, UChar32 c)
1194 {
1195     if (c >= ' '&& c <= 0xff)
1196         out.printf("'%c'", static_cast<char>(c));
1197     else
1198         out.printf("0x%04x", c);
1199 }
1200
1201 void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1202 {
1203     if (characterClass == pattern->anyCharacterClass())
1204         out.print("<any character>");
1205     else if (characterClass == pattern->newlineCharacterClass())
1206         out.print("<newline>");
1207     else if (characterClass == pattern->digitsCharacterClass())
1208         out.print("<digits>");
1209     else if (characterClass == pattern->spacesCharacterClass())
1210         out.print("<whitespace>");
1211     else if (characterClass == pattern->wordcharCharacterClass())
1212         out.print("<word>");
1213     else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1214         out.print("<unicode ignore case>");
1215     else if (characterClass == pattern->nondigitsCharacterClass())
1216         out.print("<non-digits>");
1217     else if (characterClass == pattern->nonspacesCharacterClass())
1218         out.print("<non-whitespace>");
1219     else if (characterClass == pattern->nonwordcharCharacterClass())
1220         out.print("<non-word>");
1221     else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1222         out.print("<unicode non-ignore case>");
1223     else {
1224         bool needMatchesRangesSeperator = false;
1225
1226         auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1227             size_t matchesSize = matches.size();
1228             if (matchesSize) {
1229                 if (needMatchesRangesSeperator)
1230                     out.print(",");
1231                 needMatchesRangesSeperator = true;
1232
1233                 out.print(prefix, ":(");
1234                 for (size_t i = 0; i < matchesSize; ++i) {
1235                     if (i)
1236                         out.print(",");
1237                     dumpUChar32(out, matches[i]);
1238                 }
1239                 out.print(")");
1240             }
1241         };
1242
1243         auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1244             size_t rangeSize = ranges.size();
1245             if (rangeSize) {
1246                 if (needMatchesRangesSeperator)
1247                     out.print(",");
1248                 needMatchesRangesSeperator = true;
1249
1250                 out.print(prefix, " ranges:(");
1251                 for (size_t i = 0; i < rangeSize; ++i) {
1252                     if (i)
1253                         out.print(",");
1254                     CharacterRange range = ranges[i];
1255                     out.print("(");
1256                     dumpUChar32(out, range.begin);
1257                     out.print("..");
1258                     dumpUChar32(out, range.end);
1259                     out.print(")");
1260                 }
1261                 out.print(")");
1262             }
1263         };
1264
1265         out.print("[");
1266         dumpMatches("ASCII", characterClass->m_matches);
1267         dumpRanges("ASCII", characterClass->m_ranges);
1268         dumpMatches("Unicode", characterClass->m_matchesUnicode);
1269         dumpRanges("Unicode", characterClass->m_rangesUnicode);
1270         out.print("]");
1271     }
1272 }
1273
1274 void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1275 {
1276     out.print("minimum size: ", m_minimumSize);
1277     if (m_hasFixedSize)
1278         out.print(",fixed size");
1279     if (m_onceThrough)
1280         out.print(",once through");
1281     if (m_startsWithBOL)
1282         out.print(",starts with ^");
1283     if (m_containsBOL)
1284         out.print(",contains ^");
1285     out.print("\n");
1286
1287     for (size_t i = 0; i < m_terms.size(); ++i)
1288         m_terms[i].dump(out, thisPattern, nestingDepth);
1289 }
1290
1291 void PatternTerm::dumpQuantifier(PrintStream& out)
1292 {
1293     if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
1294         return;
1295     out.print(" {", quantityMinCount.unsafeGet());
1296     if (quantityMinCount != quantityMaxCount) {
1297         if (quantityMaxCount == UINT_MAX)
1298             out.print(",...");
1299         else
1300             out.print(",", quantityMaxCount.unsafeGet());
1301     }
1302     out.print("}");
1303     if (quantityType == QuantifierGreedy)
1304         out.print(" greedy");
1305     else if (quantityType == QuantifierNonGreedy)
1306         out.print(" non-greedy");
1307 }
1308
1309 void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1310 {
1311     indentForNestingLevel(out, nestingDepth);
1312
1313     if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1314         if (invert())
1315             out.print("not ");
1316     }
1317
1318     switch (type) {
1319     case TypeAssertionBOL:
1320         out.println("BOL");
1321         break;
1322     case TypeAssertionEOL:
1323         out.println("EOL");
1324         break;
1325     case TypeAssertionWordBoundary:
1326         out.println("word boundary");
1327         break;
1328     case TypePatternCharacter:
1329         out.printf("character ");
1330         out.printf("inputPosition %u ", inputPosition);
1331         if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1332             dumpUChar32(out, toASCIIUpper(patternCharacter));
1333             out.print("/");
1334             dumpUChar32(out, toASCIILower(patternCharacter));
1335         } else
1336             dumpUChar32(out, patternCharacter);
1337         dumpQuantifier(out);
1338         if (quantityType != QuantifierFixedCount)
1339             out.print(",frame location ", frameLocation);
1340         out.println();
1341         break;
1342     case TypeCharacterClass:
1343         out.print("character class ");
1344         if (characterClass->m_anyCharacter)
1345             out.print("<any character>");
1346         else if (characterClass == thisPattern->newlineCharacterClass())
1347             out.print("<newline>");
1348         else if (characterClass == thisPattern->digitsCharacterClass())
1349             out.print("<digits>");
1350         else if (characterClass == thisPattern->spacesCharacterClass())
1351             out.print("<whitespace>");
1352         else if (characterClass == thisPattern->wordcharCharacterClass())
1353             out.print("<word>");
1354         else if (characterClass == thisPattern->wordUnicodeIgnoreCaseCharCharacterClass())
1355             out.print("<unicode ignore case>");
1356         else if (characterClass == thisPattern->nondigitsCharacterClass())
1357             out.print("<non-digits>");
1358         else if (characterClass == thisPattern->nonspacesCharacterClass())
1359             out.print("<non-whitespace>");
1360         else if (characterClass == thisPattern->nonwordcharCharacterClass())
1361             out.print("<non-word>");
1362         else if (characterClass == thisPattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1363             out.print("<unicode non-ignore case>");
1364         else {
1365             bool needMatchesRangesSeperator = false;
1366
1367             auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1368                 size_t matchesSize = matches.size();
1369                 if (matchesSize) {
1370                     if (needMatchesRangesSeperator)
1371                         out.print(",");
1372                     needMatchesRangesSeperator = true;
1373
1374                     out.print(prefix, ":(");
1375                     for (size_t i = 0; i < matchesSize; ++i) {
1376                         if (i)
1377                             out.print(",");
1378                         dumpUChar32(out, matches[i]);
1379                     }
1380                     out.print(")");
1381                 }
1382             };
1383
1384             auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1385                 size_t rangeSize = ranges.size();
1386                 if (rangeSize) {
1387                     if (needMatchesRangesSeperator)
1388                         out.print(",");
1389                     needMatchesRangesSeperator = true;
1390
1391                     out.print(prefix, " ranges:(");
1392                     for (size_t i = 0; i < rangeSize; ++i) {
1393                         if (i)
1394                             out.print(",");
1395                         CharacterRange range = ranges[i];
1396                         out.print("(");
1397                         dumpUChar32(out, range.begin);
1398                         out.print("..");
1399                         dumpUChar32(out, range.end);
1400                         out.print(")");
1401                     }
1402                     out.print(")");
1403                 }
1404             };
1405
1406             out.print("[");
1407             dumpMatches("ASCII", characterClass->m_matches);
1408             dumpRanges("ASCII", characterClass->m_ranges);
1409             dumpMatches("Unicode", characterClass->m_matchesUnicode);
1410             dumpRanges("Unicode", characterClass->m_rangesUnicode);
1411             out.print("]");
1412         }
1413         dumpQuantifier(out);
1414         if (quantityType != QuantifierFixedCount || thisPattern->unicode())
1415             out.print(",frame location ", frameLocation);
1416         out.println();
1417         break;
1418     case TypeBackReference:
1419         out.print("back reference to subpattern #", backReferenceSubpatternId);
1420         out.println(",frame location ", frameLocation);
1421         break;
1422     case TypeForwardReference:
1423         out.println("forward reference");
1424         break;
1425     case TypeParenthesesSubpattern:
1426         if (m_capture)
1427             out.print("captured ");
1428         else
1429             out.print("non-captured ");
1430
1431         FALLTHROUGH;
1432     case TypeParentheticalAssertion:
1433         if (m_invert)
1434             out.print("inverted ");
1435
1436         if (type == TypeParenthesesSubpattern)
1437             out.print("subpattern");
1438         else if (type == TypeParentheticalAssertion)
1439             out.print("assertion");
1440
1441         if (m_capture)
1442             out.print(" #", parentheses.subpatternId);
1443
1444         dumpQuantifier(out);
1445
1446         if (parentheses.isCopy)
1447             out.print(",copy");
1448
1449         if (parentheses.isTerminal)
1450             out.print(",terminal");
1451
1452         out.println(",frame location ", frameLocation);
1453
1454         if (parentheses.disjunction->m_alternatives.size() > 1) {
1455             indentForNestingLevel(out, nestingDepth + 1);
1456             unsigned alternativeFrameLocation = frameLocation;
1457             if (quantityMaxCount == 1 && !parentheses.isCopy)
1458                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1459             else if (parentheses.isTerminal)
1460                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1461             else
1462                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1463             out.println("alternative list,frame location ", alternativeFrameLocation);
1464         }
1465
1466         parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
1467         break;
1468     case TypeDotStarEnclosure:
1469         out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1470         break;
1471     }
1472 }
1473
1474 void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
1475 {
1476     unsigned alternativeCount = m_alternatives.size();
1477     for (unsigned i = 0; i < alternativeCount; ++i) {
1478         indentForNestingLevel(out, nestingDepth);
1479         if (alternativeCount > 1)
1480             out.print("alternative #", i, ": ");
1481         m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
1482     }
1483 }
1484
1485 void YarrPattern::dumpPattern(const String& patternString)
1486 {
1487     dumpPattern(WTF::dataFile(), patternString);
1488 }
1489
1490 void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1491 {
1492     out.print("RegExp pattern for /");
1493     out.print(patternString);
1494     out.print("/");
1495     if (global())
1496         out.print("g");
1497     if (ignoreCase())
1498         out.print("i");
1499     if (multiline())
1500         out.print("m");
1501     if (unicode())
1502         out.print("u");
1503     if (sticky())
1504         out.print("y");
1505     if (m_flags != NoFlags) {
1506         bool printSeperator = false;
1507         out.print(" (");
1508         if (global()) {
1509             out.print("global");
1510             printSeperator = true;
1511         }
1512         if (ignoreCase()) {
1513             if (printSeperator)
1514                 out.print("|");
1515             out.print("ignore case");
1516             printSeperator = true;
1517         }
1518         if (multiline()) {
1519             if (printSeperator)
1520                 out.print("|");
1521             out.print("multiline");
1522             printSeperator = true;
1523         }
1524         if (unicode()) {
1525             if (printSeperator)
1526                 out.print("|");
1527             out.print("unicode");
1528             printSeperator = true;
1529         }
1530         if (sticky()) {
1531             if (printSeperator)
1532                 out.print("|");
1533             out.print("sticky");
1534             printSeperator = true;
1535         }
1536         out.print(")");
1537     }
1538     out.print(":\n");
1539     if (m_body->m_callFrameSize)
1540         out.print("    callframe size: ", m_body->m_callFrameSize, "\n");
1541     m_body->dump(out, this);
1542 }
1543
1544 std::unique_ptr<CharacterClass> anycharCreate()
1545 {
1546     auto characterClass = std::make_unique<CharacterClass>();
1547     characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
1548     characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
1549     characterClass->m_hasNonBMPCharacters = true;
1550     characterClass->m_anyCharacter = true;
1551     return characterClass;
1552 }
1553
1554 } } // namespace JSC::Yarr