Add support for RegExp named capture groups
[WebKit-https.git] / Source / JavaScriptCore / yarr / YarrPattern.cpp
1 /*
2  * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3  * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25  */
26
27 #include "config.h"
28 #include "YarrPattern.h"
29
30 #include "Options.h"
31 #include "Yarr.h"
32 #include "YarrCanonicalize.h"
33 #include "YarrParser.h"
34 #include <wtf/DataLog.h>
35 #include <wtf/Optional.h>
36 #include <wtf/Threading.h>
37 #include <wtf/Vector.h>
38 #include <wtf/text/WTFString.h>
39
40 using namespace WTF;
41
42 namespace JSC { namespace Yarr {
43
44 #include "RegExpJitTables.h"
45
46 class CharacterClassConstructor {
47 public:
48     CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
49         : m_isCaseInsensitive(isCaseInsensitive)
50         , m_hasNonBMPCharacters(false)
51         , m_canonicalMode(canonicalMode)
52     {
53     }
54     
55     void reset()
56     {
57         m_matches.clear();
58         m_ranges.clear();
59         m_matchesUnicode.clear();
60         m_rangesUnicode.clear();
61         m_hasNonBMPCharacters = false;
62     }
63
64     void append(const CharacterClass* other)
65     {
66         for (size_t i = 0; i < other->m_matches.size(); ++i)
67             addSorted(m_matches, other->m_matches[i]);
68         for (size_t i = 0; i < other->m_ranges.size(); ++i)
69             addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
70         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
71             addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
72         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
73             addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
74     }
75
76     void putChar(UChar32 ch)
77     {
78         if (!m_isCaseInsensitive) {
79             addSorted(ch);
80             return;
81         }
82
83         if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
84             // Handle ASCII cases.
85             if (isASCIIAlpha(ch)) {
86                 addSorted(m_matches, toASCIIUpper(ch));
87                 addSorted(m_matches, toASCIILower(ch));
88             } else
89                 addSorted(m_matches, ch);
90             return;
91         }
92
93         // Add multiple matches, if necessary.
94         const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
95         if (info->type == CanonicalizeUnique)
96             addSorted(ch);
97         else
98             putUnicodeIgnoreCase(ch, info);
99     }
100
101     void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
102     {
103         ASSERT(m_isCaseInsensitive);
104         ASSERT(ch >= info->begin && ch <= info->end);
105         ASSERT(info->type != CanonicalizeUnique);
106         if (info->type == CanonicalizeSet) {
107             for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
108                 addSorted(ch);
109         } else {
110             addSorted(ch);
111             addSorted(getCanonicalPair(info, ch));
112         }
113     }
114
115     void putRange(UChar32 lo, UChar32 hi)
116     {
117         if (isASCII(lo)) {
118             char asciiLo = lo;
119             char asciiHi = std::min(hi, (UChar32)0x7f);
120             addSortedRange(m_ranges, lo, asciiHi);
121             
122             if (m_isCaseInsensitive) {
123                 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
124                     addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
125                 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
126                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
127             }
128         }
129         if (isASCII(hi))
130             return;
131
132         lo = std::max(lo, (UChar32)0x80);
133         addSortedRange(m_rangesUnicode, lo, hi);
134         
135         if (!m_isCaseInsensitive)
136             return;
137
138         const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
139         while (true) {
140             // Handle the range [lo .. end]
141             UChar32 end = std::min<UChar32>(info->end, hi);
142
143             switch (info->type) {
144             case CanonicalizeUnique:
145                 // Nothing to do - no canonical equivalents.
146                 break;
147             case CanonicalizeSet: {
148                 UChar ch;
149                 for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
150                     addSorted(m_matchesUnicode, ch);
151                 break;
152             }
153             case CanonicalizeRangeLo:
154                 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
155                 break;
156             case CanonicalizeRangeHi:
157                 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
158                 break;
159             case CanonicalizeAlternatingAligned:
160                 // Use addSortedRange since there is likely an abutting range to combine with.
161                 if (lo & 1)
162                     addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
163                 if (!(end & 1))
164                     addSortedRange(m_rangesUnicode, end + 1, end + 1);
165                 break;
166             case CanonicalizeAlternatingUnaligned:
167                 // Use addSortedRange since there is likely an abutting range to combine with.
168                 if (!(lo & 1))
169                     addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
170                 if (end & 1)
171                     addSortedRange(m_rangesUnicode, end + 1, end + 1);
172                 break;
173             }
174
175             if (hi == end)
176                 return;
177
178             ++info;
179             lo = info->begin;
180         };
181
182     }
183
184     std::unique_ptr<CharacterClass> charClass()
185     {
186         auto characterClass = std::make_unique<CharacterClass>();
187
188         characterClass->m_matches.swap(m_matches);
189         characterClass->m_ranges.swap(m_ranges);
190         characterClass->m_matchesUnicode.swap(m_matchesUnicode);
191         characterClass->m_rangesUnicode.swap(m_rangesUnicode);
192         characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
193
194         return characterClass;
195     }
196
197 private:
198     void addSorted(UChar32 ch)
199     {
200         addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
201     }
202
203     void addSorted(Vector<UChar32>& matches, UChar32 ch)
204     {
205         unsigned pos = 0;
206         unsigned range = matches.size();
207
208         if (!U_IS_BMP(ch))
209             m_hasNonBMPCharacters = true;
210
211         // binary chop, find position to insert char.
212         while (range) {
213             unsigned index = range >> 1;
214
215             int val = matches[pos+index] - ch;
216             if (!val)
217                 return;
218             else if (val > 0)
219                 range = index;
220             else {
221                 pos += (index+1);
222                 range -= (index+1);
223             }
224         }
225         
226         if (pos == matches.size())
227             matches.append(ch);
228         else
229             matches.insert(pos, ch);
230     }
231
232     void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
233     {
234         unsigned end = ranges.size();
235
236         if (!U_IS_BMP(hi))
237             m_hasNonBMPCharacters = true;
238
239         // Simple linear scan - I doubt there are that many ranges anyway...
240         // feel free to fix this with something faster (eg binary chop).
241         for (unsigned i = 0; i < end; ++i) {
242             // does the new range fall before the current position in the array
243             if (hi < ranges[i].begin) {
244                 // optional optimization: concatenate appending ranges? - may not be worthwhile.
245                 if (hi == (ranges[i].begin - 1)) {
246                     ranges[i].begin = lo;
247                     return;
248                 }
249                 ranges.insert(i, CharacterRange(lo, hi));
250                 return;
251             }
252             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
253             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
254             // end of the last range they concatenate, which is just as good.
255             if (lo <= (ranges[i].end + 1)) {
256                 // found an intersect! we'll replace this entry in the array.
257                 ranges[i].begin = std::min(ranges[i].begin, lo);
258                 ranges[i].end = std::max(ranges[i].end, hi);
259
260                 // now check if the new range can subsume any subsequent ranges.
261                 unsigned next = i+1;
262                 // each iteration of the loop we will either remove something from the list, or break the loop.
263                 while (next < ranges.size()) {
264                     if (ranges[next].begin <= (ranges[i].end + 1)) {
265                         // the next entry now overlaps / concatenates this one.
266                         ranges[i].end = std::max(ranges[i].end, ranges[next].end);
267                         ranges.remove(next);
268                     } else
269                         break;
270                 }
271                 
272                 return;
273             }
274         }
275
276         // CharacterRange comes after all existing ranges.
277         ranges.append(CharacterRange(lo, hi));
278     }
279
280     bool hasNonBMPCharacters()
281     {
282         return m_hasNonBMPCharacters;
283     }
284
285     bool m_isCaseInsensitive;
286     bool m_hasNonBMPCharacters;
287     CanonicalMode m_canonicalMode;
288
289     Vector<UChar32> m_matches;
290     Vector<CharacterRange> m_ranges;
291     Vector<UChar32> m_matchesUnicode;
292     Vector<CharacterRange> m_rangesUnicode;
293 };
294
295 class YarrPatternConstructor {
296 public:
297     YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
298         : m_pattern(pattern)
299         , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
300         , m_stackLimit(stackLimit)
301         , m_invertParentheticalAssertion(false)
302     {
303         auto body = std::make_unique<PatternDisjunction>();
304         m_pattern.m_body = body.get();
305         m_alternative = body->addNewAlternative();
306         m_pattern.m_disjunctions.append(WTFMove(body));
307     }
308
309     ~YarrPatternConstructor()
310     {
311     }
312
313     void reset()
314     {
315         m_pattern.reset();
316         m_characterClassConstructor.reset();
317
318         auto body = std::make_unique<PatternDisjunction>();
319         m_pattern.m_body = body.get();
320         m_alternative = body->addNewAlternative();
321         m_pattern.m_disjunctions.append(WTFMove(body));
322     }
323     
324     void assertionBOL()
325     {
326         if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
327             m_alternative->m_startsWithBOL = true;
328             m_alternative->m_containsBOL = true;
329             m_pattern.m_containsBOL = true;
330         }
331         m_alternative->m_terms.append(PatternTerm::BOL());
332     }
333     void assertionEOL()
334     {
335         m_alternative->m_terms.append(PatternTerm::EOL());
336     }
337     void assertionWordBoundary(bool invert)
338     {
339         m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
340     }
341
342     void atomPatternCharacter(UChar32 ch)
343     {
344         // We handle case-insensitive checking of unicode characters which do have both
345         // cases by handling them as if they were defined using a CharacterClass.
346         if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
347             m_alternative->m_terms.append(PatternTerm(ch));
348             return;
349         }
350
351         const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
352         if (info->type == CanonicalizeUnique) {
353             m_alternative->m_terms.append(PatternTerm(ch));
354             return;
355         }
356
357         m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
358         auto newCharacterClass = m_characterClassConstructor.charClass();
359         m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
360         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
361     }
362
363     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
364     {
365         switch (classID) {
366         case DigitClassID:
367             m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
368             break;
369         case SpaceClassID:
370             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
371             break;
372         case WordClassID:
373             if (m_pattern.unicode() && m_pattern.ignoreCase())
374                 m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
375             else
376                 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
377             break;
378         case DotClassID:
379             ASSERT(!invert);
380             if (m_pattern.dotAll())
381                 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
382             else
383                 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true));
384             break;
385         }
386     }
387
388     void atomCharacterClassBegin(bool invert = false)
389     {
390         m_invertCharacterClass = invert;
391     }
392
393     void atomCharacterClassAtom(UChar32 ch)
394     {
395         m_characterClassConstructor.putChar(ch);
396     }
397
398     void atomCharacterClassRange(UChar32 begin, UChar32 end)
399     {
400         m_characterClassConstructor.putRange(begin, end);
401     }
402
403     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
404     {
405         ASSERT(classID != DotClassID);
406
407         switch (classID) {
408         case DigitClassID:
409             m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
410             break;
411         
412         case SpaceClassID:
413             m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
414             break;
415         
416         case WordClassID:
417             if (m_pattern.unicode() && m_pattern.ignoreCase())
418                 m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
419             else
420                 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
421             break;
422         
423         default:
424             RELEASE_ASSERT_NOT_REACHED();
425         }
426     }
427
428     void atomCharacterClassEnd()
429     {
430         auto newCharacterClass = m_characterClassConstructor.charClass();
431         m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
432         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
433     }
434
435     void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt)
436     {
437         unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
438         if (capture) {
439             m_pattern.m_numSubpatterns++;
440             if (optGroupName) {
441                 while (m_pattern.m_captureGroupNames.size() < subpatternId)
442                     m_pattern.m_captureGroupNames.append(String());
443                 m_pattern.m_captureGroupNames.append(optGroupName.value());
444                 m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
445             }
446         } else
447             ASSERT(!optGroupName);
448
449         auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
450         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
451         m_alternative = parenthesesDisjunction->addNewAlternative();
452         m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
453     }
454
455     void atomParentheticalAssertionBegin(bool invert = false)
456     {
457         auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
458         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
459         m_alternative = parenthesesDisjunction->addNewAlternative();
460         m_invertParentheticalAssertion = invert;
461         m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
462     }
463
464     void atomParenthesesEnd()
465     {
466         ASSERT(m_alternative->m_parent);
467         ASSERT(m_alternative->m_parent->m_parent);
468
469         PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
470         m_alternative = m_alternative->m_parent->m_parent;
471
472         PatternTerm& lastTerm = m_alternative->lastTerm();
473
474         unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
475         unsigned numBOLAnchoredAlts = 0;
476
477         for (unsigned i = 0; i < numParenAlternatives; i++) {
478             // Bubble up BOL flags
479             if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
480                 numBOLAnchoredAlts++;
481         }
482
483         if (numBOLAnchoredAlts) {
484             m_alternative->m_containsBOL = true;
485             // If all the alternatives in parens start with BOL, then so does this one
486             if (numBOLAnchoredAlts == numParenAlternatives)
487                 m_alternative->m_startsWithBOL = true;
488         }
489
490         lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
491         m_invertParentheticalAssertion = false;
492     }
493
494     void atomBackReference(unsigned subpatternId)
495     {
496         ASSERT(subpatternId);
497         m_pattern.m_containsBackreferences = true;
498         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
499
500         if (subpatternId > m_pattern.m_numSubpatterns) {
501             m_alternative->m_terms.append(PatternTerm::ForwardReference());
502             return;
503         }
504
505         PatternAlternative* currentAlternative = m_alternative;
506         ASSERT(currentAlternative);
507
508         // Note to self: if we waited until the AST was baked, we could also remove forwards refs 
509         while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
510             PatternTerm& term = currentAlternative->lastTerm();
511             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
512
513             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
514                 m_alternative->m_terms.append(PatternTerm::ForwardReference());
515                 return;
516             }
517         }
518
519         m_alternative->m_terms.append(PatternTerm(subpatternId));
520     }
521
522     void atomNamedBackReference(String subpatternName)
523     {
524         ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
525         atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
526     }
527
528     // deep copy the argument disjunction.  If filterStartsWithBOL is true,
529     // skip alternatives with m_startsWithBOL set true.
530     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
531     {
532         std::unique_ptr<PatternDisjunction> newDisjunction;
533         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
534             PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
535             if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
536                 if (!newDisjunction) {
537                     newDisjunction = std::make_unique<PatternDisjunction>();
538                     newDisjunction->m_parent = disjunction->m_parent;
539                 }
540                 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
541                 newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
542                 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
543                     newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
544             }
545         }
546         
547         if (!newDisjunction)
548             return 0;
549
550         PatternDisjunction* copiedDisjunction = newDisjunction.get();
551         m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
552         return copiedDisjunction;
553     }
554     
555     PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
556     {
557         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
558             return PatternTerm(term);
559         
560         PatternTerm termCopy = term;
561         termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
562         m_pattern.m_hasCopiedParenSubexpressions = true;
563         return termCopy;
564     }
565     
566     void quantifyAtom(unsigned min, unsigned max, bool greedy)
567     {
568         ASSERT(min <= max);
569         ASSERT(m_alternative->m_terms.size());
570
571         if (!max) {
572             m_alternative->removeLastTerm();
573             return;
574         }
575
576         PatternTerm& term = m_alternative->lastTerm();
577         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
578         ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
579
580         if (term.type == PatternTerm::TypeParentheticalAssertion) {
581             // If an assertion is quantified with a minimum count of zero, it can simply be removed.
582             // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
583             // results in any input being consumed, however the continuation passed to the assertion
584             // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
585             // reject all zero length matches (see step 2.1). A match from the continuation of the
586             // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
587             // this is that matches from the assertion are not required, and won't be accepted anyway,
588             // so no need to ever run it.
589             if (!min)
590                 m_alternative->removeLastTerm();
591             // We never need to run an assertion more than once. Subsequent interations will be run
592             // with the same start index (since assertions are non-capturing) and the same captures
593             // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
594             // same result and captures. If the first match succeeds then the subsequent (min - 1)
595             // matches will too. Any additional optional matches will fail (on the same basis as the
596             // minimum zero quantified assertions, above), but this will still result in a match.
597             return;
598         }
599
600         if (min == max)
601             term.quantify(min, max, QuantifierFixedCount);
602         else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
603             term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
604         else {
605             term.quantify(min, min, QuantifierFixedCount);
606             m_alternative->m_terms.append(copyTerm(term));
607             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
608             m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
609             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
610                 m_alternative->lastTerm().parentheses.isCopy = true;
611         }
612     }
613
614     void disjunction()
615     {
616         m_alternative = m_alternative->m_parent->addNewAlternative();
617     }
618
619     YarrPattern::ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
620     {
621         if (UNLIKELY(!isSafeToRecurse()))
622             return YarrPattern::TooManyDisjunctions;
623
624         YarrPattern::ErrorCode error = YarrPattern::NoError;
625         alternative->m_hasFixedSize = true;
626         Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
627
628         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
629             PatternTerm& term = alternative->m_terms[i];
630
631             switch (term.type) {
632             case PatternTerm::TypeAssertionBOL:
633             case PatternTerm::TypeAssertionEOL:
634             case PatternTerm::TypeAssertionWordBoundary:
635                 term.inputPosition = currentInputPosition.unsafeGet();
636                 break;
637
638             case PatternTerm::TypeBackReference:
639                 term.inputPosition = currentInputPosition.unsafeGet();
640                 term.frameLocation = currentCallFrameSize;
641                 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
642                 alternative->m_hasFixedSize = false;
643                 break;
644
645             case PatternTerm::TypeForwardReference:
646                 break;
647
648             case PatternTerm::TypePatternCharacter:
649                 term.inputPosition = currentInputPosition.unsafeGet();
650                 if (term.quantityType != QuantifierFixedCount) {
651                     term.frameLocation = currentCallFrameSize;
652                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
653                     alternative->m_hasFixedSize = false;
654                 } else if (m_pattern.unicode()) {
655                     Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
656                     tempCount *= U16_LENGTH(term.patternCharacter);
657                     if (tempCount.hasOverflowed())
658                         return YarrPattern::OffsetTooLarge;
659                     currentInputPosition += tempCount;
660                 } else
661                     currentInputPosition += term.quantityMaxCount;
662                 break;
663
664             case PatternTerm::TypeCharacterClass:
665                 term.inputPosition = currentInputPosition.unsafeGet();
666                 if (term.quantityType != QuantifierFixedCount) {
667                     term.frameLocation = currentCallFrameSize;
668                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
669                     alternative->m_hasFixedSize = false;
670                 } else if (m_pattern.unicode()) {
671                     term.frameLocation = currentCallFrameSize;
672                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
673                     currentInputPosition += term.quantityMaxCount;
674                     alternative->m_hasFixedSize = false;
675                 } else
676                     currentInputPosition += term.quantityMaxCount;
677                 break;
678
679             case PatternTerm::TypeParenthesesSubpattern:
680                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
681                 term.frameLocation = currentCallFrameSize;
682                 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
683                     if (term.quantityType != QuantifierFixedCount)
684                         currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
685                     error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
686                     if (error)
687                         return error;
688                     // If quantity is fixed, then pre-check its minimum size.
689                     if (term.quantityType == QuantifierFixedCount)
690                         currentInputPosition += term.parentheses.disjunction->m_minimumSize;
691                     term.inputPosition = currentInputPosition.unsafeGet();
692                 } else if (term.parentheses.isTerminal) {
693                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
694                     error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
695                     if (error)
696                         return error;
697                     term.inputPosition = currentInputPosition.unsafeGet();
698                 } else {
699                     term.inputPosition = currentInputPosition.unsafeGet();
700                     unsigned ignoredCallFrameSize;
701                     error = setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet(), ignoredCallFrameSize);
702                     if (error)
703                         return error;
704                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
705                 }
706                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
707                 alternative->m_hasFixedSize = false;
708                 break;
709
710             case PatternTerm::TypeParentheticalAssertion:
711                 term.inputPosition = currentInputPosition.unsafeGet();
712                 term.frameLocation = currentCallFrameSize;
713                 error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
714                 if (error)
715                     return error;
716                 break;
717
718             case PatternTerm::TypeDotStarEnclosure:
719                 ASSERT(!m_pattern.m_saveInitialStartValue);
720                 alternative->m_hasFixedSize = false;
721                 term.inputPosition = initialInputPosition;
722                 m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
723                 currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
724                 m_pattern.m_saveInitialStartValue = true;
725                 break;
726             }
727             if (currentInputPosition.hasOverflowed())
728                 return YarrPattern::OffsetTooLarge;
729         }
730
731         alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
732         newCallFrameSize = currentCallFrameSize;
733         return error;
734     }
735
736     YarrPattern::ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
737     {
738         if (UNLIKELY(!isSafeToRecurse()))
739             return YarrPattern::TooManyDisjunctions;
740
741         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
742             initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
743
744         unsigned minimumInputSize = UINT_MAX;
745         unsigned maximumCallFrameSize = 0;
746         bool hasFixedSize = true;
747         YarrPattern::ErrorCode error = YarrPattern::NoError;
748
749         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
750             PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
751             unsigned currentAlternativeCallFrameSize;
752             error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
753             if (error)
754                 return error;
755             minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
756             maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
757             hasFixedSize &= alternative->m_hasFixedSize;
758             if (alternative->m_minimumSize > INT_MAX)
759                 m_pattern.m_containsUnsignedLengthPattern = true;
760         }
761         
762         ASSERT(minimumInputSize != UINT_MAX);
763         ASSERT(maximumCallFrameSize >= initialCallFrameSize);
764
765         disjunction->m_hasFixedSize = hasFixedSize;
766         disjunction->m_minimumSize = minimumInputSize;
767         disjunction->m_callFrameSize = maximumCallFrameSize;
768         callFrameSize = maximumCallFrameSize;
769         return error;
770     }
771
772     const char* setupOffsets()
773     {
774         // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
775         unsigned ignoredCallFrameSize;
776         YarrPattern::ErrorCode error = setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
777         if (error)
778             return YarrPattern::errorMessage(error);
779         return nullptr;
780     }
781
782     // This optimization identifies sets of parentheses that we will never need to backtrack.
783     // In these cases we do not need to store state from prior iterations.
784     // We can presently avoid backtracking for:
785     //   * where the parens are at the end of the regular expression (last term in any of the
786     //     alternatives of the main body disjunction).
787     //   * where the parens are non-capturing, and quantified unbounded greedy (*).
788     //   * where the parens do not contain any capturing subpatterns.
789     void checkForTerminalParentheses()
790     {
791         // This check is much too crude; should be just checking whether the candidate
792         // node contains nested capturing subpatterns, not the whole expression!
793         if (m_pattern.m_numSubpatterns)
794             return;
795
796         Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
797         for (size_t i = 0; i < alternatives.size(); ++i) {
798             Vector<PatternTerm>& terms = alternatives[i]->m_terms;
799             if (terms.size()) {
800                 PatternTerm& term = terms.last();
801                 if (term.type == PatternTerm::TypeParenthesesSubpattern
802                     && term.quantityType == QuantifierGreedy
803                     && term.quantityMinCount == 0
804                     && term.quantityMaxCount == quantifyInfinite
805                     && !term.capture())
806                     term.parentheses.isTerminal = true;
807             }
808         }
809     }
810
811     void optimizeBOL()
812     {
813         // Look for expressions containing beginning of line (^) anchoring and unroll them.
814         // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
815         // This code relies on the parsing code tagging alternatives with m_containsBOL and
816         // m_startsWithBOL and rolling those up to containing alternatives.
817         // At this point, this is only valid for non-multiline expressions.
818         PatternDisjunction* disjunction = m_pattern.m_body;
819         
820         if (!m_pattern.m_containsBOL || m_pattern.multiline())
821             return;
822         
823         PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
824
825         // Set alternatives in disjunction to "onceThrough"
826         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
827             disjunction->m_alternatives[alt]->setOnceThrough();
828
829         if (loopDisjunction) {
830             // Move alternatives from loopDisjunction to disjunction
831             for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
832                 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
833                 
834             loopDisjunction->m_alternatives.clear();
835         }
836     }
837
838     bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
839     {
840         Vector<PatternTerm>& terms = alternative->m_terms;
841
842         ASSERT(endIndex <= terms.size());
843         for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
844             PatternTerm& term = terms[termIndex];
845
846             if (term.m_capture)
847                 return true;
848
849             if (term.type == PatternTerm::TypeParenthesesSubpattern) {
850                 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
851                 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
852                     if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
853                         return true;
854                 }
855             }
856         }
857
858         return false;
859     }
860
861     // This optimization identifies alternatives in the form of 
862     // [^].*[?]<expression>.*[$] for expressions that don't have any 
863     // capturing terms. The alternative is changed to <expression> 
864     // followed by processing of the dot stars to find and adjust the 
865     // beginning and the end of the match.
866     void optimizeDotStarWrappedExpressions()
867     {
868         Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
869         if (alternatives.size() != 1)
870             return;
871
872         CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
873         PatternAlternative* alternative = alternatives[0].get();
874         Vector<PatternTerm>& terms = alternative->m_terms;
875         if (terms.size() >= 3) {
876             bool startsWithBOL = false;
877             bool endsWithEOL = false;
878             size_t termIndex, firstExpressionTerm;
879
880             termIndex = 0;
881             if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
882                 startsWithBOL = true;
883                 ++termIndex;
884             }
885             
886             PatternTerm& firstNonAnchorTerm = terms[termIndex];
887             if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass)
888                 || (firstNonAnchorTerm.characterClass != dotCharacterClass)
889                 || !((firstNonAnchorTerm.quantityType == QuantifierGreedy)
890                     || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
891                 return;
892             
893             firstExpressionTerm = termIndex + 1;
894             
895             termIndex = terms.size() - 1;
896             if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
897                 endsWithEOL = true;
898                 --termIndex;
899             }
900             
901             PatternTerm& lastNonAnchorTerm = terms[termIndex];
902             if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass)
903                 || (lastNonAnchorTerm.characterClass != dotCharacterClass)
904                 || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
905                 return;
906
907             size_t endIndex = termIndex;
908             if (firstExpressionTerm >= endIndex)
909                 return;
910
911             if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
912                 for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
913                     terms.remove(termIndex);
914
915                 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
916                     terms.remove(termIndex - 1);
917
918                 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
919                 
920                 m_pattern.m_containsBOL = false;
921             }
922         }
923     }
924
925 private:
926     bool isSafeToRecurse() const
927     {
928         if (!m_stackLimit)
929             return true;
930         ASSERT(Thread::current().stack().isGrowingDownward());
931         int8_t* curr = reinterpret_cast<int8_t*>(&curr);
932         int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
933         return curr >= limit;
934     }
935
936     YarrPattern& m_pattern;
937     PatternAlternative* m_alternative;
938     CharacterClassConstructor m_characterClassConstructor;
939     void* m_stackLimit;
940     bool m_invertCharacterClass;
941     bool m_invertParentheticalAssertion;
942 };
943
944 const char* YarrPattern::errorMessage(YarrPattern::ErrorCode error)
945 {
946 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
947     // The order of this array must match the ErrorCode enum.
948     static const char* errorMessages[NumberOfErrorCodes] = {
949         nullptr,                                                              // NoError
950         REGEXP_ERROR_PREFIX "regular expression too large",                   // PatternTooLarge     
951         REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",          // QuantifierOutOfOrder
952         REGEXP_ERROR_PREFIX "nothing to repeat",                              // QuantifierWithoutAtom
953         REGEXP_ERROR_PREFIX "number too large in {} quantifier",              // QuantifierTooLarge
954         REGEXP_ERROR_PREFIX "missing )",                                      // MissingParentheses
955         REGEXP_ERROR_PREFIX "unmatched parentheses",                          // ParenthesesUnmatched
956         REGEXP_ERROR_PREFIX "unrecognized character after (?",                // ParenthesesTypeInvalid
957         REGEXP_ERROR_PREFIX "invalid group specifier name",                   // InvalidGroupName
958         REGEXP_ERROR_PREFIX "duplicate group specifier name",                 // DuplicateGroupName
959         REGEXP_ERROR_PREFIX "missing terminating ] for character class",      // CharacterClassUnmatched
960         REGEXP_ERROR_PREFIX "range out of order in character class",          // CharacterClassOutOfOrder
961         REGEXP_ERROR_PREFIX "\\ at end of pattern",                           // EscapeUnterminated
962         REGEXP_ERROR_PREFIX "invalid unicode {} escape",                      // InvalidUnicodeEscape
963         REGEXP_ERROR_PREFIX "invalid backreference for unicode pattern",      // InvalidBackreference
964         REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern",  // InvalidIdentityEscape
965         REGEXP_ERROR_PREFIX "too many nested disjunctions",                   // TooManyDisjunctions
966         REGEXP_ERROR_PREFIX "pattern exceeds string length limits",           // OffsetTooLarge
967         REGEXP_ERROR_PREFIX "invalid flags"                                   // InvalidRegularExpressionFlags
968     };
969
970     return errorMessages[error];
971 }
972
973 const char* YarrPattern::compile(const String& patternString, void* stackLimit)
974 {
975     YarrPatternConstructor constructor(*this, stackLimit);
976
977     if (m_flags == InvalidFlags)
978         return errorMessage(InvalidRegularExpressionFlags);
979
980     if (const char* error = parse(constructor, patternString, unicode()))
981         return error;
982     
983     // If the pattern contains illegal backreferences reset & reparse.
984     // Quoting Netscape's "What's new in JavaScript 1.2",
985     //      "Note: if the number of left parentheses is less than the number specified
986     //       in \#, the \# is taken as an octal escape as described in the next row."
987     if (containsIllegalBackReference()) {
988         if (unicode())
989             return errorMessage(InvalidBackreference);
990
991         unsigned numSubpatterns = m_numSubpatterns;
992
993         constructor.reset();
994 #if !ASSERT_DISABLED
995         const char* error =
996 #endif
997             parse(constructor, patternString, unicode(), numSubpatterns);
998
999         ASSERT(!error);
1000         ASSERT(numSubpatterns == m_numSubpatterns);
1001     }
1002
1003     constructor.checkForTerminalParentheses();
1004     constructor.optimizeDotStarWrappedExpressions();
1005     constructor.optimizeBOL();
1006         
1007     if (const char* error = constructor.setupOffsets())
1008         return error;
1009
1010     if (Options::dumpCompiledRegExpPatterns())
1011         dumpPattern(patternString);
1012
1013     return nullptr;
1014 }
1015
1016 YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, const char** error, void* stackLimit)
1017     : m_containsBackreferences(false)
1018     , m_containsBOL(false)
1019     , m_containsUnsignedLengthPattern(false)
1020     , m_hasCopiedParenSubexpressions(false)
1021     , m_saveInitialStartValue(false)
1022     , m_flags(flags)
1023     , m_numSubpatterns(0)
1024     , m_maxBackReference(0)
1025     , anycharCached(0)
1026     , newlineCached(0)
1027     , digitsCached(0)
1028     , spacesCached(0)
1029     , wordcharCached(0)
1030     , wordUnicodeIgnoreCaseCharCached(0)
1031     , nondigitsCached(0)
1032     , nonspacesCached(0)
1033     , nonwordcharCached(0)
1034     , nonwordUnicodeIgnoreCasecharCached(0)
1035 {
1036     *error = compile(pattern, stackLimit);
1037 }
1038
1039 static void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1040 {
1041     out.print("    ");
1042     for (; nestingDepth; --nestingDepth)
1043         out.print("  ");
1044 }
1045
1046 static void dumpUChar32(PrintStream& out, UChar32 c)
1047 {
1048     if (c >= ' '&& c <= 0xff)
1049         out.printf("'%c'", static_cast<char>(c));
1050     else
1051         out.printf("0x%04x", c);
1052 }
1053
1054 void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1055 {
1056     out.print("minimum size: ", m_minimumSize);
1057     if (m_hasFixedSize)
1058         out.print(",fixed size");
1059     if (m_onceThrough)
1060         out.print(",once through");
1061     if (m_startsWithBOL)
1062         out.print(",starts with ^");
1063     if (m_containsBOL)
1064         out.print(",contains ^");
1065     out.print("\n");
1066
1067     for (size_t i = 0; i < m_terms.size(); ++i)
1068         m_terms[i].dump(out, thisPattern, nestingDepth);
1069 }
1070
1071 void PatternTerm::dumpQuantifier(PrintStream& out)
1072 {
1073     if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
1074         return;
1075     out.print(" {", quantityMinCount.unsafeGet());
1076     if (quantityMinCount != quantityMaxCount) {
1077         if (quantityMaxCount == UINT_MAX)
1078             out.print(",...");
1079         else
1080             out.print(",", quantityMaxCount.unsafeGet());
1081     }
1082     out.print("}");
1083     if (quantityType == QuantifierGreedy)
1084         out.print(" greedy");
1085     else if (quantityType == QuantifierNonGreedy)
1086         out.print(" non-greedy");
1087 }
1088
1089 void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1090 {
1091     indentForNestingLevel(out, nestingDepth);
1092
1093     if (invert() && (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion))
1094         out.print("not ");
1095
1096     switch (type) {
1097     case TypeAssertionBOL:
1098         out.println("BOL");
1099         break;
1100     case TypeAssertionEOL:
1101         out.println("EOL");
1102         break;
1103     case TypeAssertionWordBoundary:
1104         out.println("word boundary");
1105         break;
1106     case TypePatternCharacter:
1107         out.printf("character ");
1108         if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1109             dumpUChar32(out, toASCIIUpper(patternCharacter));
1110             out.print("/");
1111             dumpUChar32(out, toASCIILower(patternCharacter));
1112         } else
1113             dumpUChar32(out, patternCharacter);
1114         dumpQuantifier(out);
1115         if (quantityType != QuantifierFixedCount)
1116             out.print(",frame location ", frameLocation);
1117         out.println();
1118         break;
1119     case TypeCharacterClass:
1120         out.print("character class ");
1121         if (characterClass == thisPattern->anyCharacterClass())
1122             out.print("<any character>");
1123         else if (characterClass == thisPattern->newlineCharacterClass())
1124             out.print("<newline>");
1125         else if (characterClass == thisPattern->digitsCharacterClass())
1126             out.print("<digits>");
1127         else if (characterClass == thisPattern->spacesCharacterClass())
1128             out.print("<whitespace>");
1129         else if (characterClass == thisPattern->wordcharCharacterClass())
1130             out.print("<word>");
1131         else if (characterClass == thisPattern->wordUnicodeIgnoreCaseCharCharacterClass())
1132             out.print("<unicode ignore case>");
1133         else if (characterClass == thisPattern->nondigitsCharacterClass())
1134             out.print("<non-digits>");
1135         else if (characterClass == thisPattern->nonspacesCharacterClass())
1136             out.print("<non-whitespace>");
1137         else if (characterClass == thisPattern->nonwordcharCharacterClass())
1138             out.print("<non-word>");
1139         else if (characterClass == thisPattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1140             out.print("<unicode non-ignore case>");
1141         else {
1142             bool needMatchesRangesSeperator = false;
1143
1144             auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1145                 size_t matchesSize = matches.size();
1146                 if (matchesSize) {
1147                     if (needMatchesRangesSeperator)
1148                         out.print(",");
1149                     needMatchesRangesSeperator = true;
1150
1151                     out.print(prefix, ":(");
1152                     for (size_t i = 0; i < matchesSize; ++i) {
1153                         if (i)
1154                             out.print(",");
1155                         dumpUChar32(out, matches[i]);
1156                     }
1157                     out.print(")");
1158                 }
1159             };
1160
1161             auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1162                 size_t rangeSize = ranges.size();
1163                 if (rangeSize) {
1164                     if (needMatchesRangesSeperator)
1165                         out.print(",");
1166                     needMatchesRangesSeperator = true;
1167
1168                     out.print(prefix, "ranges:(");
1169                     for (size_t i = 0; i < rangeSize; ++i) {
1170                         if (i)
1171                             out.print(",");
1172                         CharacterRange range = ranges[i];
1173                         out.print("(");
1174                         dumpUChar32(out, range.begin);
1175                         out.print("..");
1176                         dumpUChar32(out, range.end);
1177                         out.print(")");
1178                     }
1179                     out.print(")");
1180                 }
1181             };
1182
1183             out.print("[");
1184             dumpMatches("ASCII", characterClass->m_matches);
1185             dumpRanges("ASCII", characterClass->m_ranges);
1186             dumpMatches("Unicode", characterClass->m_matchesUnicode);
1187             dumpRanges("Unicode", characterClass->m_rangesUnicode);
1188             out.print("]");
1189         }
1190         dumpQuantifier(out);
1191         if (quantityType != QuantifierFixedCount || thisPattern->unicode())
1192             out.print(",frame location ", frameLocation);
1193         out.println();
1194         break;
1195     case TypeBackReference:
1196         out.print("back reference to subpattern #", backReferenceSubpatternId);
1197         out.println(",frame location ", frameLocation);
1198         break;
1199     case TypeForwardReference:
1200         out.println("forward reference");
1201         break;
1202     case TypeParenthesesSubpattern:
1203         if (m_capture)
1204             out.print("captured ");
1205         else
1206             out.print("non-captured ");
1207
1208         FALLTHROUGH;
1209     case TypeParentheticalAssertion:
1210         if (m_invert)
1211             out.print("inverted ");
1212
1213         if (type == TypeParenthesesSubpattern)
1214             out.print("subpattern");
1215         else if (type == TypeParentheticalAssertion)
1216             out.print("assertion");
1217
1218         if (m_capture)
1219             out.print(" #", parentheses.subpatternId);
1220
1221         dumpQuantifier(out);
1222
1223         if (parentheses.isCopy)
1224             out.print(",copy");
1225
1226         if (parentheses.isTerminal)
1227             out.print(",terminal");
1228
1229         if (quantityMaxCount != 1 || parentheses.isCopy || quantityType != QuantifierFixedCount)
1230             out.println(",frame location ", frameLocation);
1231         else
1232             out.println();
1233
1234         if (parentheses.disjunction->m_alternatives.size() > 1) {
1235             indentForNestingLevel(out, nestingDepth + 1);
1236             unsigned alternativeFrameLocation = frameLocation;
1237             if (quantityType != QuantifierFixedCount)
1238                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1239             out.println("alternative list,frame location ", alternativeFrameLocation);
1240         }
1241
1242         parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
1243         break;
1244     case TypeDotStarEnclosure:
1245         out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1246         break;
1247     }
1248 }
1249
1250 void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
1251 {
1252     unsigned alternativeCount = m_alternatives.size();
1253     for (unsigned i = 0; i < alternativeCount; ++i) {
1254         indentForNestingLevel(out, nestingDepth);
1255         if (alternativeCount > 1)
1256             out.print("alternative #", i, ": ");
1257         m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
1258     }
1259 }
1260
1261 void YarrPattern::dumpPattern(const String& patternString)
1262 {
1263     dumpPattern(WTF::dataFile(), patternString);
1264 }
1265
1266 void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1267 {
1268     out.print("RegExp pattern for /");
1269     out.print(patternString);
1270     out.print("/");
1271     if (global())
1272         out.print("g");
1273     if (ignoreCase())
1274         out.print("i");
1275     if (multiline())
1276         out.print("m");
1277     if (unicode())
1278         out.print("u");
1279     if (sticky())
1280         out.print("y");
1281     if (m_flags != NoFlags) {
1282         bool printSeperator = false;
1283         out.print(" (");
1284         if (global()) {
1285             out.print("global");
1286             printSeperator = true;
1287         }
1288         if (ignoreCase()) {
1289             if (printSeperator)
1290                 out.print("|");
1291             out.print("ignore case");
1292             printSeperator = true;
1293         }
1294         if (multiline()) {
1295             if (printSeperator)
1296                 out.print("|");
1297             out.print("multiline");
1298             printSeperator = true;
1299         }
1300         if (unicode()) {
1301             if (printSeperator)
1302                 out.print("|");
1303             out.print("unicode");
1304             printSeperator = true;
1305         }
1306         if (sticky()) {
1307             if (printSeperator)
1308                 out.print("|");
1309             out.print("sticky");
1310             printSeperator = true;
1311         }
1312         out.print(")");
1313     }
1314     out.print(":\n");
1315     m_body->dump(out, this);
1316 }
1317
1318 std::unique_ptr<CharacterClass> anycharCreate()
1319 {
1320     auto characterClass = std::make_unique<CharacterClass>();
1321     characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
1322     characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
1323     characterClass->m_hasNonBMPCharacters = true;
1324     return characterClass;
1325 }
1326
1327 } }