1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This header contains definitions that are shared between the different
41 modules, but which are not relevant to the exported API. This includes some
42 functions whose names all begin with "_pcre_". */
44 #ifndef PCRE_INTERNAL_H
45 #define PCRE_INTERNAL_H
47 /* Bit definitions for entries in the pcre_ctypes table. */
49 #define ctype_space 0x01
50 #define ctype_xdigit 0x08
51 #define ctype_word 0x10 /* alphameric or '_' */
53 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
54 of bits for a class map. Some classes are built by combining these tables. */
56 #define cbit_space 0 /* \s */
57 #define cbit_digit 32 /* \d */
58 #define cbit_word 64 /* \w */
59 #define cbit_length 96 /* Length of the cbits table */
61 /* Offsets of the various tables from the base tables pointer, and
65 #define fcc_offset 128
66 #define cbits_offset 256
67 #define ctypes_offset (cbits_offset + cbit_length)
68 #define tables_length (ctypes_offset + 128)
72 #include "Assertions.h"
75 #pragma warning(disable: 4232)
76 #pragma warning(disable: 4244)
79 /* The value of LINK_SIZE determines the number of bytes used to store links as
80 offsets within the compiled regex. The default is 2, which allows for compiled
81 patterns up to 64K long. This covers the vast majority of cases. However, PCRE
82 can also be compiled to use 3 or 4 bytes instead. This allows for longer
83 patterns in extreme cases. On systems that support it, "configure" can be used
84 to override this default. */
88 /* The below limit restricts the number of recursive match calls in order to
89 limit the maximum amount of stack (or heap, if NO_RECURSE is defined) that is used. The
90 value of MATCH_LIMIT_RECURSION applies only to recursive calls of match().
92 This limit is tied to the size of MatchFrame. Right now we allow PCRE to allocate up
93 to MATCH_LIMIT_RECURSION - 16 * sizeof(MatchFrame) bytes of "stack" space before we give up.
94 Currently that's 100000 - 16 * (23 * 4) ~ 90MB
97 #define MATCH_LIMIT_RECURSION 100000
99 #define _pcre_default_tables kjs_pcre_default_tables
100 #define _pcre_ord2utf8 kjs_pcre_ord2utf8
101 #define _pcre_utf8_table1 kjs_pcre_utf8_table1
102 #define _pcre_utf8_table2 kjs_pcre_utf8_table2
103 #define _pcre_utf8_table3 kjs_pcre_utf8_table3
104 #define _pcre_utf8_table4 kjs_pcre_utf8_table4
105 #define _pcre_xclass kjs_pcre_xclass
107 /* Define DEBUG to get debugging output on stdout. */
113 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
114 inline, and there are *still* stupid compilers about that don't like indented
115 pre-processor statements, or at least there were when I first wrote this. After
116 all, it had only been about 10 years then... */
119 #define DPRINTF(p) printf p
121 #define DPRINTF(p) /*nothing*/
124 /* Standard C headers plus the external interface definition. The only time
125 setjmp and stdarg are used is when NO_RECURSE is set. */
136 /* Include the public PCRE header and the definitions of UCP character property
141 typedef unsigned short pcre_uint16;
142 typedef unsigned pcre_uint32;
143 typedef unsigned char uschar;
145 /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
146 in big-endian order) by default. These are used, for example, to link from the
147 start of a subpattern to its alternatives and its end. The use of 2 bytes per
148 offset limits the size of the compiled regex to around 64K, which is big enough
149 for almost everybody. However, I received a request for an even bigger limit.
150 For this reason, and also to make the code easier to maintain, the storing and
151 loading of offsets from the byte string is now handled by the macros that are
154 The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
155 the config.h file, but can be overridden by using -D on the command line. This
156 is automated on Unix systems via the "configure" command. */
160 static inline void putOpcodeValueAtOffset(uschar* opcodePtr, size_t offset, unsigned short value)
162 opcodePtr[offset] = value >> 8;
163 opcodePtr[offset + 1] = value & 255;
166 static inline short getOpcodeValueAtOffset(const uschar* opcodePtr, size_t offset)
168 return ((opcodePtr[offset] << 8) | opcodePtr[offset + 1]);
171 #define MAX_PATTERN_SIZE (1 << 16)
175 static inline void putOpcodeValueAtOffset(uschar* opcodePtr, size_t offset, unsigned value)
177 ASSERT(!(value & 0xFF000000)); // This function only allows values < 2^24
178 opcodePtr[offset] = value >> 16;
179 opcodePtr[offset + 1] = value >> 8;
180 opcodePtr[offset + 2] = value & 255;
183 static inline int getOpcodeValueAtOffset(const uschar* opcodePtr, size_t offset)
185 return ((opcodePtr[offset] << 16) | (opcodePtr[offset + 1] << 8) | opcodePtr[offset + 2]);
188 #define MAX_PATTERN_SIZE (1 << 24)
192 static inline void putOpcodeValueAtOffset(uschar* opcodePtr, size_t offset, unsigned value)
194 opcodePtr[offset] = value >> 24;
195 opcodePtr[offset + 1] = value >> 16;
196 opcodePtr[offset + 2] = value >> 8;
197 opcodePtr[offset + 3] = value & 255;
200 static inline int getOpcodeValueAtOffset(const uschar* opcodePtr, size_t offset)
202 return ((opcodePtr[offset] << 24) | (opcodePtr[offset + 1] << 16) | (opcodePtr[offset + 2] << 8) | opcodePtr[offset + 3]);
205 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
208 #error LINK_SIZE must be either 2, 3, or 4
211 static inline void putOpcodeValueAtOffsetAndAdvance(uschar*& opcodePtr, size_t offset, unsigned short value)
213 putOpcodeValueAtOffset(opcodePtr, offset, value);
214 opcodePtr += LINK_SIZE;
217 /* PCRE uses some other 2-byte quantities that do not change when the size of
218 offsets changes. There are used for repeat counts and for other things such as
219 capturing parenthesis numbers in back references. */
221 static inline void put2ByteOpcodeValueAtOffset(uschar* opcodePtr, size_t offset, unsigned short value)
223 opcodePtr[offset] = value >> 8;
224 opcodePtr[offset + 1] = value & 255;
227 static inline short get2ByteOpcodeValueAtOffset(const uschar* opcodePtr, size_t offset)
229 return ((opcodePtr[offset] << 8) | opcodePtr[offset + 1]);
232 static inline void put2ByteOpcodeValueAtOffsetAndAdvance(uschar*& opcodePtr, size_t offset, unsigned short value)
234 put2ByteOpcodeValueAtOffset(opcodePtr, offset, value);
238 // FIXME: These are really more of a "compiled regexp state" than "regexp options"
240 UseFirstByteOptimizationOption = 0x40000000, /* first_byte is set */
241 UseRequiredByteOptimizationOption = 0x20000000, /* req_byte is set */
242 UseMultiLineFirstByteOptimizationOption = 0x10000000, /* start after \n for multiline */
243 IsAnchoredOption = 0x02000000, /* can't use partial with this regex */
244 IgnoreCaseOption = 0x00000001,
245 MatchAcrossMultipleLinesOption = 0x00000002
248 /* Negative values for the firstchar and reqchar variables */
250 #define REQ_UNSET (-2)
251 #define REQ_NONE (-1)
253 /* The maximum remaining length of subject we are prepared to search for a
256 #define REQ_BYTE_MAX 1000
258 /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
259 variable-length repeat, or a anything other than literal characters. */
261 #define REQ_IGNORE_CASE 0x0100 /* indicates should ignore case */
262 #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
264 /* Miscellaneous definitions */
266 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
267 contain UTF-8 characters with values greater than 255. */
269 #define XCL_NOT 0x01 /* Flag: this is a negative class */
270 #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
272 #define XCL_END 0 /* Marks end of individual items */
273 #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
274 #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
276 /* These are escaped items that aren't just an encoding of a particular data
277 value such as \n. They must have non-zero values, as check_escape() returns
278 their negation. Also, they must appear in the same order as in the opcode
279 definitions below, up to ESC_w. The final one must be
280 ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two
281 tests in the code for an escape > ESC_b and <= ESC_w to
282 detect the types that may be repeated. These are the types that consume
283 characters. If any new escapes are put in between that don't consume a
284 character, that code will have to change. */
286 enum { ESC_B = 1, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_REF };
288 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
289 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
290 OP_EOD must correspond in order to the list of escapes immediately above.
291 Note that whenever this list is updated, the two macro definitions that follow
292 must also be updated to match. */
294 #define FOR_EACH_OPCODE(macro) \
297 macro(NOT_WORD_BOUNDARY) \
298 macro(WORD_BOUNDARY) \
301 macro(NOT_WHITESPACE) \
303 macro(NOT_WORDCHAR) \
311 macro(CHAR_IGNORING_CASE) \
313 macro(ASCII_LETTER_IGNORING_CASE) \
341 macro(TYPEMINQUERY) \
376 #define OPCODE_ENUM_VALUE(opcode) OP_##opcode,
377 enum { FOR_EACH_OPCODE(OPCODE_ENUM_VALUE) };
379 /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
380 study.c that all opcodes are less than 128 in value. This makes handling UTF-8
381 character sequences easier. */
383 /* The highest extraction number before we have to start using additional
384 bytes. (Originally PCRE didn't have support for extraction counts highter than
385 this number.) The value is limited by the number of opcodes left after OP_BRA,
386 i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
389 #define EXTRACT_BASIC_MAX 100
391 /* This macro defines the length of fixed length operations in the compiled
392 regex. The lengths are used when searching for specific things, and also in the
393 debugging printing of a compiled regex. We use a macro so that it can be
394 defined close to the definitions of the opcodes themselves.
396 As things have been extended, some of these are no longer fixed lenths, but are
397 minima instead. For example, the length of a single-character repeat may vary
398 in UTF-8 mode. The code that uses this table must know about such things. */
402 1, 1, 1, 1, 1, 1, 1, 1, /* \B, \b, \D, \d, \S, \s, \W, \w */ \
405 2, 2, /* Char, Charnc - minimum lengths */ \
406 2, 2, /* ASCII char or non-cased */ \
408 /* Positive single-char repeats ** These are */ \
409 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
410 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
411 /* Negative single-char repeats - only for chars < 256 */ \
412 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
413 4, 4, 4, /* NOT upto, minupto, exact */ \
414 /* Positive type repeats */ \
415 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
416 4, 4, 4, /* Type upto, minupto, exact */ \
417 /* Character class & ref repeats */ \
418 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
419 5, 5, /* CRRANGE, CRMINRANGE */ \
422 0, /* XCLASS - variable length */ \
424 1 + LINK_SIZE, /* Alt */ \
425 1 + LINK_SIZE, /* Ket */ \
426 1 + LINK_SIZE, /* KetRmax */ \
427 1 + LINK_SIZE, /* KetRmin */ \
428 1 + LINK_SIZE, /* Assert */ \
429 1 + LINK_SIZE, /* Assert not */ \
430 1 + LINK_SIZE, /* Once */ \
431 1, 1, /* BRAZERO, BRAMINZERO */ \
433 1 + LINK_SIZE /* BRA */ \
436 /* The index of names and the
437 code vector run on as long as necessary after the end. We store an explicit
438 offset to the name table so that if a regex is compiled on one host, saved, and
439 then run on another where the size of pointers is different, all might still
440 be well. For the case of compiled-on-4 and run-on-8, we include an extra
441 pointer that is always NULL.
445 pcre_uint32 size; /* Total that was malloced */
448 pcre_uint16 top_bracket;
449 pcre_uint16 top_backref;
451 // jsRegExpExecute && jsRegExpCompile currently only how to handle ASCII
452 // chars for thse optimizations, however it would be trivial to add support
453 // for optimized UChar first_byte/req_byte scans
454 pcre_uint16 first_byte;
455 pcre_uint16 req_byte;
458 /* Internal shared data tables. These are tables that are used by more than one
459 of the exported public functions. They have to be "external" in the C sense,
460 but are not part of the PCRE public API. The data for these tables is in the
461 pcre_tables.c module. */
463 #define _pcre_utf8_table1_size 6
465 extern const int _pcre_utf8_table1[6];
466 extern const int _pcre_utf8_table2[6];
467 extern const int _pcre_utf8_table3[6];
468 extern const uschar _pcre_utf8_table4[0x40];
470 extern const uschar _pcre_default_tables[tables_length];
472 static inline uschar toLowerCase(uschar c)
474 static const uschar* lowerCaseChars = _pcre_default_tables + lcc_offset;
475 return lowerCaseChars[c];
478 static inline uschar flipCase(uschar c)
480 static const uschar* flippedCaseChars = _pcre_default_tables + fcc_offset;
481 return flippedCaseChars[c];
484 static inline uschar classBitmapForChar(uschar c)
486 static const uschar* charClassBitmaps = _pcre_default_tables + cbits_offset;
487 return charClassBitmaps[c];
490 static inline uschar charTypeForChar(uschar c)
492 const uschar* charTypeMap = _pcre_default_tables + ctypes_offset;
493 return charTypeMap[c];
496 static inline bool isWordChar(UChar c)
498 /* UTF8 Characters > 128 are assumed to be "non-word" characters. */
499 return (c < 128 && (charTypeForChar(c) & ctype_word));
502 static inline bool isSpaceChar(UChar c)
504 return (c < 128 && (charTypeForChar(c) & ctype_space));
507 /* Structure for passing "static" information around between the functions
508 doing the compiling, so that they are thread-safe. */
518 const uschar* start_code; /* The start of the compiled code */
519 const UChar* start_pattern; /* The start of the pattern */
520 int top_backref; /* Maximum back reference */
521 unsigned backref_map; /* Bitmap of low back refs */
522 int req_varyopt; /* "After variable item" flag for reqbyte */
525 /* Internal shared functions. These are functions that are used by more than
526 one of the exported public functions. They have to be "external" in the C
527 sense, but are not part of the PCRE public API. */
529 extern int _pcre_ucp_othercase(const unsigned int);
530 extern bool _pcre_xclass(int, const uschar*);
532 static inline bool isNewline(UChar nl)
534 return (nl == 0xA || nl == 0xD || nl == 0x2028 || nl == 0x2029);
537 // FIXME: It's unclear to me if this moves the opcode ptr to the start of all branches
538 // or to the end of all branches -- ecs
539 // FIXME: This abstraction is poor since it assumes that you want to jump based on whatever
540 // the next value in the stream is, and *then* follow any OP_ALT branches.
541 static inline void moveOpcodePtrPastAnyAlternateBranches(const uschar*& opcodePtr)
544 opcodePtr += getOpcodeValueAtOffset(opcodePtr, 1);
545 } while (*opcodePtr == OP_ALT);
552 /* End of pcre_internal.h */