2009-05-26 Gavin Barraclough <barraclough@apple.com>
[WebKit-https.git] / JavaScriptCore / yarr / RegexPattern.h
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24  */
25
26 #ifndef RegexPattern_h
27 #define RegexPattern_h
28
29 #include <wtf/Platform.h>
30
31 #if ENABLE(YARR)
32
33 #include <wtf/Vector.h>
34 #include <wtf/unicode/Unicode.h>
35
36
37 namespace JSC { namespace Yarr {
38
39 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
40 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
41 #define RegexStackSpaceForBackTrackInfoBackReference 2
42 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
43 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
44 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
45 #define RegexStackSpaceForBackTrackInfoParentheses 4
46
47 struct PatternDisjunction;
48
49 struct CharacterRange {
50     UChar begin;
51     UChar end;
52
53     CharacterRange(UChar begin, UChar end)
54         : begin(begin)
55         , end(end)
56     {
57     }
58 };
59
60 struct CharacterClass {
61     Vector<UChar> m_matches;
62     Vector<CharacterRange> m_ranges;
63     Vector<UChar> m_matchesUnicode;
64     Vector<CharacterRange> m_rangesUnicode;
65 };
66
67 enum QuantifierType {
68     QuantifierFixedCount,
69     QuantifierGreedy,
70     QuantifierNonGreedy,
71 };
72
73 struct PatternTerm {
74     enum Type {
75         TypeAssertionBOL,
76         TypeAssertionEOL,
77         TypeAssertionWordBoundary,
78         TypePatternCharacter,
79         TypeCharacterClass,
80         TypeBackReference,
81         TypeParenthesesSubpattern,
82         TypeParentheticalAssertion,
83     } type;
84     bool invertOrCapture;
85     union {
86         UChar patternCharacter;
87         CharacterClass* characterClass;
88         unsigned subpatternId;
89         struct {
90             PatternDisjunction* disjunction;
91             unsigned subpatternId;
92             unsigned lastSubpatternId;
93             bool isCopy;
94         } parentheses;
95     };
96     QuantifierType quantityType;
97     unsigned quantityCount;
98     int inputPosition;
99     unsigned frameLocation;
100
101     PatternTerm(UChar ch)
102         : type(PatternTerm::TypePatternCharacter)
103     {
104         patternCharacter = ch;
105         quantityType = QuantifierFixedCount;
106         quantityCount = 1;
107     }
108
109     PatternTerm(CharacterClass* charClass, bool invert)
110         : type(PatternTerm::TypeCharacterClass)
111         , invertOrCapture(invert)
112     {
113         characterClass = charClass;
114         quantityType = QuantifierFixedCount;
115         quantityCount = 1;
116     }
117
118     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
119         : type(type)
120         , invertOrCapture(invertOrCapture)
121     {
122         parentheses.disjunction = disjunction;
123         parentheses.subpatternId = subpatternId;
124         parentheses.isCopy = false;
125         quantityType = QuantifierFixedCount;
126         quantityCount = 1;
127     }
128     
129     PatternTerm(Type type, bool invert = false)
130         : type(type)
131         , invertOrCapture(invert)
132     {
133         quantityType = QuantifierFixedCount;
134         quantityCount = 1;
135     }
136
137     PatternTerm(unsigned spatternId)
138         : type(TypeBackReference)
139         , invertOrCapture(invertOrCapture)
140     {
141         subpatternId = spatternId;
142         quantityType = QuantifierFixedCount;
143         quantityCount = 1;
144     }
145
146     static PatternTerm BOL()
147     {
148         return PatternTerm(TypeAssertionBOL);
149     }
150
151     static PatternTerm EOL()
152     {
153         return PatternTerm(TypeAssertionEOL);
154     }
155
156     static PatternTerm WordBoundary(bool invert)
157     {
158         return PatternTerm(TypeAssertionWordBoundary, invert);
159     }
160     
161     bool invert()
162     {
163         return invertOrCapture;
164     }
165
166     bool capture()
167     {
168         return invertOrCapture;
169     }
170     
171     void quantify(unsigned count, QuantifierType type)
172     {
173         quantityCount = count;
174         quantityType = type;
175     }
176 };
177
178 struct PatternAlternative {
179     PatternAlternative(PatternDisjunction* disjunction)
180         : m_parent(disjunction)
181     {
182     }
183
184     PatternTerm& lastTerm()
185     {
186         ASSERT(m_terms.size());
187         return m_terms[m_terms.size() - 1];
188     }
189     
190     void removeLastTerm()
191     {
192         ASSERT(m_terms.size());
193         m_terms.shrink(m_terms.size() - 1);
194     }
195
196     Vector<PatternTerm> m_terms;
197     PatternDisjunction* m_parent;
198     unsigned m_minimumSize;
199     bool m_hasFixedSize;
200 };
201
202 struct PatternDisjunction {
203     PatternDisjunction(PatternAlternative* parent = 0)
204         : m_parent(parent)
205     {
206     }
207     
208     ~PatternDisjunction()
209     {
210         deleteAllValues(m_alternatives);
211     }
212
213     PatternAlternative* addNewAlternative()
214     {
215         PatternAlternative* alternative = new PatternAlternative(this);
216         m_alternatives.append(alternative);
217         return alternative;
218     }
219
220     Vector<PatternAlternative*> m_alternatives;
221     PatternAlternative* m_parent;
222     unsigned m_minimumSize;
223     unsigned m_callFrameSize;
224     bool m_hasFixedSize;
225 };
226
227 // You probably don't want to be calling these functions directly
228 // (please to be calling newlineCharacterClass() et al on your
229 // friendly neighborhood RegexPattern instance to get nicely
230 // cached copies).
231 CharacterClass* newlineCreate();
232 CharacterClass* digitsCreate();
233 CharacterClass* spacesCreate();
234 CharacterClass* wordcharCreate();
235 CharacterClass* nondigitsCreate();
236 CharacterClass* nonspacesCreate();
237 CharacterClass* nonwordcharCreate();
238
239 struct RegexPattern {
240     RegexPattern(bool ignoreCase, bool multiline)
241         : m_ignoreCase(ignoreCase)
242         , m_multiline(multiline)
243         , m_numSubpatterns(0)
244         , m_maxBackReference(0)
245         , newlineCached(0)
246         , digitsCached(0)
247         , spacesCached(0)
248         , wordcharCached(0)
249         , nondigitsCached(0)
250         , nonspacesCached(0)
251         , nonwordcharCached(0)
252     {
253     }
254
255     ~RegexPattern()
256     {
257         deleteAllValues(m_disjunctions);
258         deleteAllValues(m_userCharacterClasses);
259     }
260
261     void reset()
262     {
263         m_numSubpatterns = 0;
264         m_maxBackReference = 0;
265
266         newlineCached = 0;
267         digitsCached = 0;
268         spacesCached = 0;
269         wordcharCached = 0;
270         nondigitsCached = 0;
271         nonspacesCached = 0;
272         nonwordcharCached = 0;
273
274         deleteAllValues(m_disjunctions);
275         m_disjunctions.clear();
276         deleteAllValues(m_userCharacterClasses);
277         m_userCharacterClasses.clear();
278     }
279
280     bool containsIllegalBackReference()
281     {
282         return m_maxBackReference > m_numSubpatterns;
283     }
284
285     CharacterClass* newlineCharacterClass()
286     {
287         if (!newlineCached)
288             m_userCharacterClasses.append(newlineCached = newlineCreate());
289         return newlineCached;
290     }
291     CharacterClass* digitsCharacterClass()
292     {
293         if (!digitsCached)
294             m_userCharacterClasses.append(digitsCached = digitsCreate());
295         return digitsCached;
296     }
297     CharacterClass* spacesCharacterClass()
298     {
299         if (!spacesCached)
300             m_userCharacterClasses.append(spacesCached = spacesCreate());
301         return spacesCached;
302     }
303     CharacterClass* wordcharCharacterClass()
304     {
305         if (!wordcharCached)
306             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
307         return wordcharCached;
308     }
309     CharacterClass* nondigitsCharacterClass()
310     {
311         if (!nondigitsCached)
312             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
313         return nondigitsCached;
314     }
315     CharacterClass* nonspacesCharacterClass()
316     {
317         if (!nonspacesCached)
318             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
319         return nonspacesCached;
320     }
321     CharacterClass* nonwordcharCharacterClass()
322     {
323         if (!nonwordcharCached)
324             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
325         return nonwordcharCached;
326     }
327
328     bool m_ignoreCase;
329     bool m_multiline;
330     unsigned m_numSubpatterns;
331     unsigned m_maxBackReference;
332     PatternDisjunction* m_body;
333     Vector<PatternDisjunction*, 4> m_disjunctions;
334     Vector<CharacterClass*> m_userCharacterClasses;
335
336 private:
337     CharacterClass* newlineCached;
338     CharacterClass* digitsCached;
339     CharacterClass* spacesCached;
340     CharacterClass* wordcharCached;
341     CharacterClass* nondigitsCached;
342     CharacterClass* nonspacesCached;
343     CharacterClass* nonwordcharCached;
344 };
345
346 } } // namespace JSC::Yarr
347
348 #endif
349
350 #endif // RegexPattern_h