2009-04-24 Gavin Barraclough <barraclough@apple.com>
[WebKit-https.git] / JavaScriptCore / yarr / RegexPattern.h
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24  */
25
26 #ifndef RegexPattern_h
27 #define RegexPattern_h
28
29 #include <wtf/Platform.h>
30
31 #if ENABLE(YARR)
32
33 #include <wtf/Vector.h>
34 #include <wtf/unicode/Unicode.h>
35
36
37 namespace JSC { namespace Yarr {
38
39 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
40 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
41 #define RegexStackSpaceForBackTrackInfoBackReference 2
42 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
43 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
44 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
45 #define RegexStackSpaceForBackTrackInfoParentheses 4
46
47 struct PatternDisjunction;
48
49 struct CharacterRange {
50     UChar begin;
51     UChar end;
52
53     CharacterRange(UChar begin, UChar end)
54         : begin(begin)
55         , end(end)
56     {
57     }
58 };
59
60 struct CharacterClass {
61     Vector<UChar> m_matches;
62     Vector<CharacterRange> m_ranges;
63     Vector<UChar> m_matchesUnicode;
64     Vector<CharacterRange> m_rangesUnicode;
65 };
66
67 enum QuantifierType {
68     QuantifierFixedCount,
69     QuantifierGreedy,
70     QuantifierNonGreedy,
71 };
72
73 struct PatternTerm {
74     enum Type {
75         TypeAssertionBOL,
76         TypeAssertionEOL,
77         TypeAssertionWordBoundary,
78         TypePatternCharacter,
79         TypeCharacterClass,
80         TypeBackReference,
81         TypeParenthesesSubpattern,
82         TypeParentheticalAssertion,
83     } type;
84     bool invertOrCapture;
85     union {
86         UChar patternCharacter;
87         CharacterClass* characterClass;
88         unsigned subpatternId;
89         struct {
90             PatternDisjunction* disjunction;
91             unsigned subpatternId;
92             unsigned lastSubpatternId;
93             bool isCopy;
94         } parentheses;
95     };
96     QuantifierType quantityType;
97     unsigned quantityCount;
98     int inputPosition;
99     unsigned frameLocation;
100
101     PatternTerm(UChar ch)
102         : type(PatternTerm::TypePatternCharacter)
103     {
104         patternCharacter = ch;
105         quantityType = QuantifierFixedCount;
106         quantityCount = 1;
107     }
108
109     PatternTerm(CharacterClass* charClass, bool invert)
110         : type(PatternTerm::TypeCharacterClass)
111         , invertOrCapture(invert)
112     {
113         characterClass = charClass;
114         quantityType = QuantifierFixedCount;
115         quantityCount = 1;
116     }
117
118     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
119         : type(type)
120         , invertOrCapture(invertOrCapture)
121     {
122         parentheses.disjunction = disjunction;
123         parentheses.subpatternId = subpatternId;
124         parentheses.isCopy = false;
125         quantityType = QuantifierFixedCount;
126         quantityCount = 1;
127     }
128     
129     PatternTerm(Type type, bool invert = false)
130         : type(type)
131         , invertOrCapture(invert)
132     {
133         quantityType = QuantifierFixedCount;
134         quantityCount = 1;
135     }
136
137     PatternTerm(unsigned spatternId)
138         : type(TypeBackReference)
139         , invertOrCapture(invertOrCapture)
140     {
141         subpatternId = spatternId;
142         quantityType = QuantifierFixedCount;
143         quantityCount = 1;
144     }
145
146     static PatternTerm BOL()
147     {
148         return PatternTerm(TypeAssertionBOL);
149     }
150
151     static PatternTerm EOL()
152     {
153         return PatternTerm(TypeAssertionEOL);
154     }
155
156     static PatternTerm WordBoundary(bool invert)
157     {
158         return PatternTerm(TypeAssertionWordBoundary, invert);
159     }
160     
161     bool invert()
162     {
163         return invertOrCapture;
164     }
165
166     bool capture()
167     {
168         return invertOrCapture;
169     }
170     
171     void quantify(unsigned count, QuantifierType type)
172     {
173         quantityCount = count;
174         quantityType = type;
175     }
176 };
177
178 struct PatternAlternative {
179     PatternAlternative(PatternDisjunction* disjunction)
180         : m_parent(disjunction)
181     {
182     }
183
184     PatternTerm& lastTerm()
185     {
186         ASSERT(m_terms.size());
187         return m_terms[m_terms.size() - 1];
188     }
189     
190     void removeLastTerm()
191     {
192         ASSERT(m_terms.size());
193         m_terms.shrink(m_terms.size() - 1);
194     }
195
196     Vector<PatternTerm> m_terms;
197     PatternDisjunction* m_parent;
198     unsigned m_minimumSize;
199     bool m_hasFixedSize;
200 };
201
202 struct PatternDisjunction {
203     PatternDisjunction(PatternAlternative* parent = 0)
204         : m_parent(parent)
205     {
206     }
207     
208     ~PatternDisjunction()
209     {
210         deleteAllValues(m_alternatives);
211     }
212
213     PatternAlternative* addNewAlternative()
214     {
215         PatternAlternative* alternative = new PatternAlternative(this);
216         m_alternatives.append(alternative);
217         return alternative;
218     }
219
220     Vector<PatternAlternative*> m_alternatives;
221     PatternAlternative* m_parent;
222     unsigned m_minimumSize;
223     unsigned m_callFrameSize;
224     bool m_hasFixedSize;
225 };
226
227 // You probably don't want to be calling these functions directly
228 // (please to be calling newlineCharacterClass() et al on your
229 // friendly neighborhood RegexPattern instance to get nicely
230 // cached copies).
231 CharacterClass* newlineCreate();
232 CharacterClass* digitsCreate();
233 CharacterClass* spacesCreate();
234 CharacterClass* wordcharCreate();
235 CharacterClass* nondigitsCreate();
236 CharacterClass* nonspacesCreate();
237 CharacterClass* nonwordcharCreate();
238
239 struct RegexPattern {
240     RegexPattern(bool ignoreCase, bool multiline)
241         : m_ignoreCase(ignoreCase)
242         , m_multiline(multiline)
243         , m_numSubpatterns(0)
244         , m_maxBackReference(0)
245         , newlineCached(0)
246         , digitsCached(0)
247         , spacesCached(0)
248         , wordcharCached(0)
249         , nondigitsCached(0)
250         , nonspacesCached(0)
251         , nonwordcharCached(0)
252     {
253     }
254
255     ~RegexPattern()
256     {
257         deleteAllValues(m_disjunctions);
258         deleteAllValues(m_userCharacterClasses);
259     }
260
261     void reset()
262     {
263         m_numSubpatterns = 0;
264         m_maxBackReference = 0;
265
266         deleteAllValues(m_disjunctions);
267         m_disjunctions.clear();
268         deleteAllValues(m_userCharacterClasses);
269         m_userCharacterClasses.clear();
270     }
271
272     bool containsIllegalBackReference()
273     {
274         return m_maxBackReference > m_numSubpatterns;
275     }
276
277     CharacterClass* newlineCharacterClass()
278     {
279         if (!newlineCached)
280             m_userCharacterClasses.append(newlineCached = newlineCreate());
281         return newlineCached;
282     }
283     CharacterClass* digitsCharacterClass()
284     {
285         if (!digitsCached)
286             m_userCharacterClasses.append(digitsCached = digitsCreate());
287         return digitsCached;
288     }
289     CharacterClass* spacesCharacterClass()
290     {
291         if (!spacesCached)
292             m_userCharacterClasses.append(spacesCached = spacesCreate());
293         return spacesCached;
294     }
295     CharacterClass* wordcharCharacterClass()
296     {
297         if (!wordcharCached)
298             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
299         return wordcharCached;
300     }
301     CharacterClass* nondigitsCharacterClass()
302     {
303         if (!nondigitsCached)
304             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
305         return nondigitsCached;
306     }
307     CharacterClass* nonspacesCharacterClass()
308     {
309         if (!nonspacesCached)
310             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
311         return nonspacesCached;
312     }
313     CharacterClass* nonwordcharCharacterClass()
314     {
315         if (!nonwordcharCached)
316             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
317         return nonwordcharCached;
318     }
319
320     bool m_ignoreCase;
321     bool m_multiline;
322     unsigned m_numSubpatterns;
323     unsigned m_maxBackReference;
324     PatternDisjunction* m_body;
325     Vector<PatternDisjunction*, 4> m_disjunctions;
326     Vector<CharacterClass*> m_userCharacterClasses;
327
328 private:
329     CharacterClass* newlineCached;
330     CharacterClass* digitsCached;
331     CharacterClass* spacesCached;
332     CharacterClass* wordcharCached;
333     CharacterClass* nondigitsCached;
334     CharacterClass* nonspacesCached;
335     CharacterClass* nonwordcharCached;
336 };
337
338 } } // namespace JSC::Yarr
339
340 #endif
341
342 #endif // RegexPattern_h