JavaScriptCore:
[WebKit-https.git] / JavaScriptCore / yarr / RegexCompiler.cpp
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24  */
25
26 #include "config.h"
27 #include "RegexCompiler.h"
28
29 #include "RegexInterpreter.h"
30 #include "RegexPattern.h"
31 #include <wtf/Vector.h>
32
33 #if ENABLE(YARR)
34
35 using namespace WTF;
36
37 namespace JSC { namespace Yarr {
38
39 class CharacterClassConstructor {
40 public:
41     CharacterClassConstructor(bool isCaseInsensitive = false)
42         : m_isCaseInsensitive(isCaseInsensitive)
43     {
44     }
45     
46     void reset()
47     {
48         m_matches.clear();
49         m_ranges.clear();
50         m_matchesUnicode.clear();
51         m_rangesUnicode.clear();
52     }
53
54     void append(const CharacterClass* other)
55     {
56         for (size_t i = 0; i < other->m_matches.size(); ++i)
57             addSorted(m_matches, other->m_matches[i]);
58         for (size_t i = 0; i < other->m_ranges.size(); ++i)
59             addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
60         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
61             addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
62         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
63             addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
64     }
65
66     void putChar(UChar ch)
67     {
68         if (ch <= 0x7f) {
69             if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
70                 addSorted(m_matches, toASCIIUpper(ch));
71                 addSorted(m_matches, toASCIILower(ch));
72             } else
73                 addSorted(m_matches, ch);
74         } else {
75             UChar upper, lower;
76             if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
77                 addSorted(m_matchesUnicode, upper);
78                 addSorted(m_matchesUnicode, lower);
79             } else
80                 addSorted(m_matchesUnicode, ch);
81         }
82     }
83
84     // returns true if this character has another case, and 'ch' is the upper case form.
85     static inline bool isUnicodeUpper(UChar ch)
86     {
87         return ch != Unicode::toLower(ch);
88     }
89
90     // returns true if this character has another case, and 'ch' is the lower case form.
91     static inline bool isUnicodeLower(UChar ch)
92     {
93         return ch != Unicode::toUpper(ch);
94     }
95
96     void putRange(UChar lo, UChar hi)
97     {
98         if (lo <= 0x7f) {
99             char asciiLo = lo;
100             char asciiHi = std::min(hi, (UChar)0x7f);
101             addSortedRange(m_ranges, lo, asciiHi);
102             
103             if (m_isCaseInsensitive) {
104                 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
105                     addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
106                 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
107                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
108             }
109         }
110         if (hi >= 0x80) {
111             UChar unicodeCurr = std::max(lo, (UChar)0x80);
112             addSortedRange(m_rangesUnicode, unicodeCurr, hi);
113             
114             if (m_isCaseInsensitive) {
115                 while (unicodeCurr <= hi) {
116                     if (isUnicodeUpper(unicodeCurr)) {
117                         UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
118                         UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
119                         while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
120                             lowerCaseRangeEnd++;
121                         addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
122                     } else if (isUnicodeLower(unicodeCurr)) {
123                         UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
124                         UChar upperCaseRangeEnd = upperCaseRangeBegin;
125                         while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
126                             upperCaseRangeEnd++;
127                         addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
128                     } else
129                         ++unicodeCurr;
130                 }
131             }
132         }
133     }
134
135     CharacterClass* charClass()
136     {
137         CharacterClass* characterClass = new CharacterClass();
138
139         characterClass->m_matches.append(m_matches);
140         characterClass->m_ranges.append(m_ranges);
141         characterClass->m_matchesUnicode.append(m_matchesUnicode);
142         characterClass->m_rangesUnicode.append(m_rangesUnicode);
143
144         reset();
145
146         return characterClass;
147     }
148
149 private:
150     void addSorted(Vector<UChar>& matches, UChar ch)
151     {
152         unsigned pos = 0;
153         unsigned range = matches.size();
154
155         // binary chop, find position to insert char.
156         while (range) {
157             unsigned index = range >> 1;
158
159             int val = matches[pos+index] - ch;
160             if (!val)
161                 return;
162             else if (val > 0)
163                 range = index;
164             else {
165                 pos += (index+1);
166                 range -= (index+1);
167             }
168         }
169         
170         if (pos == matches.size())
171             matches.append(ch);
172         else
173             matches.insert(pos, ch);
174     }
175
176     void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
177     {
178         unsigned end = ranges.size();
179         
180         // Simple linear scan - I doubt there are that many ranges anyway...
181         // feel free to fix this with something faster (eg binary chop).
182         for (unsigned i = 0; i < end; ++i) {
183             // does the new range fall before the current position in the array
184             if (hi < ranges[i].begin) {
185                 // optional optimization: concatenate appending ranges? - may not be worthwhile.
186                 if (hi == (ranges[i].begin - 1)) {
187                     ranges[i].begin = lo;
188                     return;
189                 }
190                 ranges.insert(i, CharacterRange(lo, hi));
191                 return;
192             }
193             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
194             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
195             // end of the last range they concatenate, which is just as good.
196             if (lo <= (ranges[i].end + 1)) {
197                 // found an intersect! we'll replace this entry in the array.
198                 ranges[i].begin = std::min(ranges[i].begin, lo);
199                 ranges[i].end = std::max(ranges[i].end, hi);
200
201                 // now check if the new range can subsume any subsequent ranges.
202                 unsigned next = i+1;
203                 // each iteration of the loop we will either remove something from the list, or break the loop.
204                 while (next < ranges.size()) {
205                     if (ranges[next].begin <= (ranges[i].end + 1)) {
206                         // the next entry now overlaps / concatenates this one.
207                         ranges[i].end = std::max(ranges[i].end, ranges[next].end);
208                         ranges.remove(next);
209                     } else
210                         break;
211                 }
212                 
213                 return;
214             }
215         }
216
217         // CharacterRange comes after all existing ranges.
218         ranges.append(CharacterRange(lo, hi));
219     }
220
221     bool m_isCaseInsensitive;
222
223     Vector<UChar> m_matches;
224     Vector<CharacterRange> m_ranges;
225     Vector<UChar> m_matchesUnicode;
226     Vector<CharacterRange> m_rangesUnicode;
227 };
228
229
230 CharacterClass* newlineCreate()
231 {
232     CharacterClass* characterClass = new CharacterClass();
233
234     characterClass->m_matches.append('\n');
235     characterClass->m_matches.append('\r');
236     characterClass->m_matchesUnicode.append(0x2028);
237     characterClass->m_matchesUnicode.append(0x2029);
238     
239     return characterClass;
240 }
241
242 CharacterClass* digitsCreate()
243 {
244     CharacterClass* characterClass = new CharacterClass();
245
246     characterClass->m_ranges.append(CharacterRange('0', '9'));
247     
248     return characterClass;
249 }
250
251 CharacterClass* spacesCreate()
252 {
253     CharacterClass* characterClass = new CharacterClass();
254
255     characterClass->m_matches.append(' ');
256     characterClass->m_ranges.append(CharacterRange('\t', '\r'));
257     characterClass->m_matchesUnicode.append(0x00a0);
258     characterClass->m_matchesUnicode.append(0x1680);
259     characterClass->m_matchesUnicode.append(0x180e);
260     characterClass->m_matchesUnicode.append(0x2028);
261     characterClass->m_matchesUnicode.append(0x2029);
262     characterClass->m_matchesUnicode.append(0x202f);
263     characterClass->m_matchesUnicode.append(0x205f);
264     characterClass->m_matchesUnicode.append(0x3000);
265     characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
266     
267     return characterClass;
268 }
269
270 CharacterClass* wordcharCreate()
271 {
272     CharacterClass* characterClass = new CharacterClass();
273
274     characterClass->m_matches.append('_');
275     characterClass->m_ranges.append(CharacterRange('0', '9'));
276     characterClass->m_ranges.append(CharacterRange('A', 'Z'));
277     characterClass->m_ranges.append(CharacterRange('a', 'z'));
278     
279     return characterClass;
280 }
281
282 CharacterClass* nondigitsCreate()
283 {
284     CharacterClass* characterClass = new CharacterClass();
285
286     characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
287     characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
288     characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
289     
290     return characterClass;
291 }
292
293 CharacterClass* nonspacesCreate()
294 {
295     CharacterClass* characterClass = new CharacterClass();
296
297     characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
298     characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
299     characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
300     characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
301     characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
302     characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
303     characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
304     characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
305     characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
306     characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
307     characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
308     characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
309     
310     return characterClass;
311 }
312
313 CharacterClass* nonwordcharCreate()
314 {
315     CharacterClass* characterClass = new CharacterClass();
316
317     characterClass->m_matches.append('`');
318     characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
319     characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
320     characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
321     characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
322     characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
323
324     return characterClass;
325 }
326
327
328 class RegexPatternConstructor {
329 public:
330     RegexPatternConstructor(RegexPattern& pattern)
331         : m_pattern(pattern)
332         , m_characterClassConstructor(pattern.m_ignoreCase)
333     {
334     }
335
336     ~RegexPatternConstructor()
337     {
338     }
339
340     void reset()
341     {
342         m_pattern.reset();
343         m_characterClassConstructor.reset();
344     }
345     
346     void assertionBOL()
347     {
348         m_alternative->m_terms.append(PatternTerm::BOL());
349     }
350     void assertionEOL()
351     {
352         m_alternative->m_terms.append(PatternTerm::EOL());
353     }
354     void assertionWordBoundary(bool invert)
355     {
356         m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
357     }
358
359     void atomPatternCharacter(UChar ch)
360     {
361         // We handle case-insensitive checking of unicode characters which do have both
362         // cases by handling them as if they were defined using a CharacterClass.
363         if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
364             atomCharacterClassBegin();
365             atomCharacterClassAtom(ch);
366             atomCharacterClassEnd();
367         } else
368             m_alternative->m_terms.append(PatternTerm(ch));
369     }
370
371     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
372     {
373         switch (classID) {
374         case DigitClassID:
375             m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
376             break;
377         case SpaceClassID:
378             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
379             break;
380         case WordClassID:
381             m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
382             break;
383         case NewlineClassID:
384             m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
385             break;
386         }
387     }
388
389     void atomCharacterClassBegin(bool invert = false)
390     {
391         m_invertCharacterClass = invert;
392     }
393
394     void atomCharacterClassAtom(UChar ch)
395     {
396         m_characterClassConstructor.putChar(ch);
397     }
398
399     void atomCharacterClassRange(UChar begin, UChar end)
400     {
401         m_characterClassConstructor.putRange(begin, end);
402     }
403
404     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
405     {
406         ASSERT(classID != NewlineClassID);
407
408         switch (classID) {
409         case DigitClassID:
410             m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
411             break;
412         
413         case SpaceClassID:
414             m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
415             break;
416         
417         case WordClassID:
418             m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
419             break;
420         
421         default:
422             ASSERT_NOT_REACHED();
423         }
424     }
425
426     void atomCharacterClassEnd()
427     {
428         CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
429         m_pattern.m_userCharacterClasses.append(newCharacterClass);
430         m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
431     }
432
433     void atomParenthesesSubpatternBegin(bool capture = true)
434     {
435         unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
436         if (capture)
437             m_pattern.m_numSubpatterns++;
438
439         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
440         m_pattern.m_disjunctions.append(parenthesesDisjunction);
441         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
442         m_alternative = parenthesesDisjunction->addNewAlternative();
443     }
444
445     void atomParentheticalAssertionBegin(bool invert = false)
446     {
447         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
448         m_pattern.m_disjunctions.append(parenthesesDisjunction);
449         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
450         m_alternative = parenthesesDisjunction->addNewAlternative();
451     }
452
453     void atomParenthesesEnd()
454     {
455         ASSERT(m_alternative->m_parent);
456         ASSERT(m_alternative->m_parent->m_parent);
457         m_alternative = m_alternative->m_parent->m_parent;
458         
459         m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
460     }
461
462     void atomBackReference(unsigned subpatternId)
463     {
464         ASSERT(subpatternId);
465         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
466
467         if (subpatternId > m_pattern.m_numSubpatterns)
468             return;
469
470         PatternAlternative* currentAlternative = m_alternative;
471         ASSERT(currentAlternative);
472
473         // Note to self: if we waited until the AST was baked, we could also remove forwards refs 
474         while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
475             PatternTerm& term = currentAlternative->lastTerm();
476             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
477
478             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId))
479                 return;
480         }
481
482         m_alternative->m_terms.append(PatternTerm(subpatternId));
483     }
484
485     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
486     {
487         PatternDisjunction* newDisjunction = new PatternDisjunction();
488
489         newDisjunction->m_parent = disjunction->m_parent;
490         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
491             PatternAlternative* alternative = disjunction->m_alternatives[alt];
492             PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
493             for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
494                 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
495         }
496
497         return newDisjunction;
498     }
499
500     PatternTerm copyTerm(PatternTerm& term)
501     {
502         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
503             return PatternTerm(term);
504
505         PatternTerm termCopy = term;
506         termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
507         return termCopy;
508     }
509
510     void quantifyAtom(unsigned min, unsigned max, bool greedy)
511     {
512         ASSERT(min <= max);
513         ASSERT(m_alternative->m_terms.size());
514
515         if (!max) {
516             m_alternative->removeLastTerm();
517             return;
518         }
519
520         PatternTerm& term = m_alternative->lastTerm();
521         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
522         ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
523
524         // For any assertion with a zero minimum, not matching is valid and has no effect,
525         // remove it.  Otherwise, we need to match as least once, but there is no point
526         // matching more than once, so remove the quantifier.  It is not entirely clear
527         // from the spec whether or not this behavior is correct, but I believe this
528         // matches Firefox. :-/
529         if (term.type == PatternTerm::TypeParentheticalAssertion) {
530             if (!min)
531                 m_alternative->removeLastTerm();
532             return;
533         }
534
535         if (min == 0)
536             term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
537         else if (min == max)
538             term.quantify(min, QuantifierFixedCount);
539         else {
540             term.quantify(min, QuantifierFixedCount);
541             m_alternative->m_terms.append(copyTerm(term));
542             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
543             m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
544             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
545                 m_alternative->lastTerm().parentheses.isCopy = true;
546         }
547     }
548
549     void disjunction()
550     {
551         m_alternative = m_alternative->m_parent->addNewAlternative();
552     }
553
554     void regexBegin()
555     {
556         m_pattern.m_body = new PatternDisjunction();
557         m_alternative = m_pattern.m_body->addNewAlternative();
558         m_pattern.m_disjunctions.append(m_pattern.m_body);
559     }
560     void regexEnd()
561     {
562     }
563     void regexError()
564     {
565     }
566
567     unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
568     {
569         alternative->m_hasFixedSize = true;
570         unsigned currentInputPosition = initialInputPosition;
571
572         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
573             PatternTerm& term = alternative->m_terms[i];
574
575             switch (term.type) {
576             case PatternTerm::TypeAssertionBOL:
577             case PatternTerm::TypeAssertionEOL:
578             case PatternTerm::TypeAssertionWordBoundary:
579                 term.inputPosition = currentInputPosition;
580                 break;
581
582             case PatternTerm::TypeBackReference:
583                 term.inputPosition = currentInputPosition;
584                 term.frameLocation = currentCallFrameSize;
585                 currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
586                 alternative->m_hasFixedSize = false;
587                 break;
588
589             case PatternTerm::TypePatternCharacter:
590                 term.inputPosition = currentInputPosition;
591                 if (term.quantityType != QuantifierFixedCount) {
592                     term.frameLocation = currentCallFrameSize;
593                     currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
594                     alternative->m_hasFixedSize = false;
595                 } else
596                     currentInputPosition += term.quantityCount;
597                 break;
598
599             case PatternTerm::TypeCharacterClass:
600                 term.inputPosition = currentInputPosition;
601                 if (term.quantityType != QuantifierFixedCount) {
602                     term.frameLocation = currentCallFrameSize;
603                     currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
604                     alternative->m_hasFixedSize = false;
605                 } else
606                     currentInputPosition += term.quantityCount;
607                 break;
608
609             case PatternTerm::TypeParenthesesSubpattern:
610                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
611                 term.frameLocation = currentCallFrameSize;
612                 if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
613                     if (term.quantityType == QuantifierFixedCount) {
614                         currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
615                         currentInputPosition += term.parentheses.disjunction->m_minimumSize;
616                     } else {
617                         currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
618                         currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
619                     }
620                     term.inputPosition = currentInputPosition;
621                 } else {
622                     term.inputPosition = currentInputPosition;
623                     setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
624                     currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
625                 }
626                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
627                 alternative->m_hasFixedSize = false;
628                 break;
629
630             case PatternTerm::TypeParentheticalAssertion:
631                 term.inputPosition = currentInputPosition;
632                 term.frameLocation = currentCallFrameSize;
633                 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertionOnce, currentInputPosition);
634                 break;
635             }
636         }
637
638         alternative->m_minimumSize = currentInputPosition - initialInputPosition;
639         return currentCallFrameSize;
640     }
641
642     unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
643     {
644         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
645             initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
646
647         unsigned minimumInputSize = UINT_MAX;
648         unsigned maximumCallFrameSize = 0;
649         bool hasFixedSize = true;
650
651         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
652             PatternAlternative* alternative = disjunction->m_alternatives[alt];
653             unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
654             minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
655             maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
656             hasFixedSize &= alternative->m_hasFixedSize;
657         }
658         
659         ASSERT(minimumInputSize != UINT_MAX);
660         ASSERT(maximumCallFrameSize >= initialCallFrameSize);
661
662         disjunction->m_hasFixedSize = hasFixedSize;
663         disjunction->m_minimumSize = minimumInputSize;
664         disjunction->m_callFrameSize = maximumCallFrameSize;
665         return maximumCallFrameSize;
666     }
667
668     void setupOffsets()
669     {
670         setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
671     }
672
673 private:
674     RegexPattern& m_pattern;
675     PatternAlternative* m_alternative;
676     CharacterClassConstructor m_characterClassConstructor;
677     bool m_invertCharacterClass;
678 };
679
680
681 const char* compileRegex(const UString& patternString, RegexPattern& pattern)
682 {
683     RegexPatternConstructor constructor(pattern);
684
685     if (const char* error = parse(constructor, patternString))
686         return error;
687     
688     // If the pattern contains illegal backreferences reset & reparse.
689     // Quoting Netscape's "What's new in JavaScript 1.2",
690     //      "Note: if the number of left parentheses is less than the number specified
691     //       in \#, the \# is taken as an octal escape as described in the next row."
692     if (pattern.containsIllegalBackReference()) {
693         unsigned numSubpatterns = pattern.m_numSubpatterns;
694
695         constructor.reset();
696 #ifndef NDEBUG
697         const char* error =
698 #endif
699             parse(constructor, patternString, numSubpatterns);
700
701         ASSERT(!error);
702         ASSERT(numSubpatterns == pattern.m_numSubpatterns);
703     }
704
705     constructor.setupOffsets();
706
707     return false;
708 };
709
710
711 } }
712
713 #endif