2009-04-13 Gavin Barraclough <barraclough@apple.com>
[WebKit-https.git] / JavaScriptCore / yarr / RegexPattern.h
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24  */
25
26 #ifndef RegexPattern_h
27 #define RegexPattern_h
28
29 #include <wtf/Platform.h>
30
31 #if ENABLE(YARR)
32
33 #include <wtf/Vector.h>
34 #include <wtf/unicode/Unicode.h>
35
36
37 namespace JSC { namespace Yarr {
38
39 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
40 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
41 #define RegexStackSpaceForBackTrackInfoBackReference 2
42 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
43 #define RegexStackSpaceForBackTrackInfoParentheticalAssertionOnce 1
44 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
45 #define RegexStackSpaceForBackTrackInfoParentheses 4
46
47 struct PatternDisjunction;
48
49 struct CharacterRange {
50     UChar begin;
51     UChar end;
52
53     CharacterRange(UChar begin, UChar end)
54         : begin(begin)
55         , end(end)
56     {
57     }
58 };
59
60 struct CharacterClass {
61     Vector<UChar> m_matches;
62     Vector<CharacterRange> m_ranges;
63     Vector<UChar> m_matchesUnicode;
64     Vector<CharacterRange> m_rangesUnicode;
65 };
66
67 enum QuantifierType {
68     QuantifierFixedCount,
69     QuantifierGreedy,
70     QuantifierNonGreedy,
71 };
72
73 struct PatternTerm {
74     enum Type {
75         TypeAssertionBOL,
76         TypeAssertionEOL,
77         TypeAssertionWordBoundary,
78         TypePatternCharacter,
79         TypeCharacterClass,
80         TypeBackReference,
81         TypeParenthesesSubpattern,
82         TypeParentheticalAssertion,
83     } type;
84     bool invertOrCapture;
85     union {
86         UChar patternCharacter;
87         CharacterClass* characterClass;
88         unsigned subpatternId;
89         struct {
90             PatternDisjunction* disjunction;
91             unsigned subpatternId;
92             unsigned lastSubpatternId;
93         } parentheses;
94     };
95     QuantifierType quantityType;
96     unsigned quantityCount;
97     int inputPosition;
98     unsigned frameLocation;
99
100     PatternTerm(UChar ch)
101         : type(PatternTerm::TypePatternCharacter)
102     {
103         patternCharacter = ch;
104         quantityType = QuantifierFixedCount;
105         quantityCount = 1;
106     }
107
108     PatternTerm(CharacterClass* charClass, bool invert)
109         : type(PatternTerm::TypeCharacterClass)
110         , invertOrCapture(invert)
111     {
112         characterClass = charClass;
113         quantityType = QuantifierFixedCount;
114         quantityCount = 1;
115     }
116
117     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
118         : type(type)
119         , invertOrCapture(invertOrCapture)
120     {
121         parentheses.disjunction = disjunction;
122         parentheses.subpatternId = subpatternId;
123         quantityType = QuantifierFixedCount;
124         quantityCount = 1;
125     }
126     
127     PatternTerm(Type type, bool invert = false)
128         : type(type)
129         , invertOrCapture(invert)
130     {
131         quantityType = QuantifierFixedCount;
132         quantityCount = 1;
133     }
134
135     PatternTerm(unsigned spatternId)
136         : type(TypeBackReference)
137         , invertOrCapture(invertOrCapture)
138     {
139         subpatternId = spatternId;
140         quantityType = QuantifierFixedCount;
141         quantityCount = 1;
142     }
143
144     static PatternTerm BOL()
145     {
146         return PatternTerm(TypeAssertionBOL);
147     }
148
149     static PatternTerm EOL()
150     {
151         return PatternTerm(TypeAssertionEOL);
152     }
153
154     static PatternTerm WordBoundary(bool invert)
155     {
156         return PatternTerm(TypeAssertionWordBoundary, invert);
157     }
158     
159     bool invert()
160     {
161         return invertOrCapture;
162     }
163
164     bool capture()
165     {
166         return invertOrCapture;
167     }
168     
169     void quantify(unsigned count, QuantifierType type)
170     {
171         quantityCount = count;
172         quantityType = type;
173     }
174 };
175
176 struct PatternAlternative {
177     PatternAlternative(PatternDisjunction* disjunction)
178         : m_parent(disjunction)
179     {
180     }
181
182     PatternTerm& lastTerm()
183     {
184         ASSERT(m_terms.size());
185         return m_terms[m_terms.size() - 1];
186     }
187     
188     void removeLastTerm()
189     {
190         ASSERT(m_terms.size());
191         m_terms.shrink(m_terms.size() - 1);
192     }
193
194     Vector<PatternTerm> m_terms;
195     PatternDisjunction* m_parent;
196     unsigned m_minimumSize;
197     bool m_hasFixedSize;
198 };
199
200 struct PatternDisjunction {
201     PatternDisjunction(PatternAlternative* parent = 0)
202         : m_parent(parent)
203     {
204     }
205     
206     ~PatternDisjunction()
207     {
208         deleteAllValues(m_alternatives);
209     }
210
211     PatternAlternative* addNewAlternative()
212     {
213         PatternAlternative* alternative = new PatternAlternative(this);
214         m_alternatives.append(alternative);
215         return alternative;
216     }
217
218     Vector<PatternAlternative*> m_alternatives;
219     PatternAlternative* m_parent;
220     unsigned m_minimumSize;
221     unsigned m_callFrameSize;
222     bool m_hasFixedSize;
223 };
224
225 // You probably don't want to be calling these functions directly
226 // (please to be calling newlineCharacterClass() et al on your
227 // friendly neighborhood RegexPattern instance to get nicely
228 // cached copies).
229 CharacterClass* newlineCreate();
230 CharacterClass* digitsCreate();
231 CharacterClass* spacesCreate();
232 CharacterClass* wordcharCreate();
233 CharacterClass* nondigitsCreate();
234 CharacterClass* nonspacesCreate();
235 CharacterClass* nonwordcharCreate();
236
237 struct RegexPattern {
238     RegexPattern(bool ignoreCase, bool multiline)
239         : m_ignoreCase(ignoreCase)
240         , m_multiline(multiline)
241         , m_numSubpatterns(0)
242         , m_maxBackReference(0)
243         , newlineCached(0)
244         , digitsCached(0)
245         , spacesCached(0)
246         , wordcharCached(0)
247         , nondigitsCached(0)
248         , nonspacesCached(0)
249         , nonwordcharCached(0)
250     {
251     }
252
253     ~RegexPattern()
254     {
255         deleteAllValues(m_disjunctions);
256         deleteAllValues(m_userCharacterClasses);
257     }
258
259     void reset()
260     {
261         m_numSubpatterns = 0;
262         m_maxBackReference = 0;
263
264         deleteAllValues(m_disjunctions);
265         m_disjunctions.clear();
266         deleteAllValues(m_userCharacterClasses);
267         m_userCharacterClasses.clear();
268     }
269
270     bool containsIllegalBackReference()
271     {
272         return m_maxBackReference > m_numSubpatterns;
273     }
274
275     CharacterClass* newlineCharacterClass()
276     {
277         if (!newlineCached)
278             m_userCharacterClasses.append(newlineCached = newlineCreate());
279         return newlineCached;
280     }
281     CharacterClass* digitsCharacterClass()
282     {
283         if (!digitsCached)
284             m_userCharacterClasses.append(digitsCached = digitsCreate());
285         return digitsCached;
286     }
287     CharacterClass* spacesCharacterClass()
288     {
289         if (!spacesCached)
290             m_userCharacterClasses.append(spacesCached = spacesCreate());
291         return spacesCached;
292     }
293     CharacterClass* wordcharCharacterClass()
294     {
295         if (!wordcharCached)
296             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
297         return wordcharCached;
298     }
299     CharacterClass* nondigitsCharacterClass()
300     {
301         if (!nondigitsCached)
302             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
303         return nondigitsCached;
304     }
305     CharacterClass* nonspacesCharacterClass()
306     {
307         if (!nonspacesCached)
308             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
309         return nonspacesCached;
310     }
311     CharacterClass* nonwordcharCharacterClass()
312     {
313         if (!nonwordcharCached)
314             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
315         return nonwordcharCached;
316     }
317
318     bool m_ignoreCase;
319     bool m_multiline;
320     unsigned m_numSubpatterns;
321     unsigned m_maxBackReference;
322     PatternDisjunction* m_body;
323     Vector<PatternDisjunction*, 4> m_disjunctions;
324     Vector<CharacterClass*> m_userCharacterClasses;
325
326 private:
327     CharacterClass* newlineCached;
328     CharacterClass* digitsCached;
329     CharacterClass* spacesCached;
330     CharacterClass* wordcharCached;
331     CharacterClass* nondigitsCached;
332     CharacterClass* nonspacesCached;
333     CharacterClass* nonwordcharCached;
334 };
335
336 } } // namespace JSC::Yarr
337
338 #endif
339
340 #endif // RegexPattern_h