4925656a71bbf86d52051d9a436de1bc9cd47167
[WebKit-https.git] / Source / JavaScriptCore / parser / Lexer.cpp
1 /*
2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3  *  Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All Rights Reserved.
4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6  *  Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be)
7  *
8  *  This library is free software; you can redistribute it and/or
9  *  modify it under the terms of the GNU Library General Public
10  *  License as published by the Free Software Foundation; either
11  *  version 2 of the License, or (at your option) any later version.
12  *
13  *  This library is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  *  Library General Public License for more details.
17  *
18  *  You should have received a copy of the GNU Library General Public License
19  *  along with this library; see the file COPYING.LIB.  If not, write to
20  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21  *  Boston, MA 02110-1301, USA.
22  *
23  */
24
25 #include "config.h"
26 #include "Lexer.h"
27
28 #include "JSFunction.h"
29
30 #include "JSGlobalObjectFunctions.h"
31 #include "Identifier.h"
32 #include "NodeInfo.h"
33 #include "Nodes.h"
34 #include <wtf/dtoa.h>
35 #include <ctype.h>
36 #include <limits.h>
37 #include <string.h>
38 #include <wtf/Assertions.h>
39
40 using namespace WTF;
41 using namespace Unicode;
42
43 #include "KeywordLookup.h"
44 #include "Lexer.lut.h"
45 #include "Parser.h"
46
47 namespace JSC {
48
49 Keywords::Keywords(VM* vm)
50     : m_vm(vm)
51     , m_keywordTable(JSC::mainTable)
52 {
53 }
54
// Classification assigned to each character by the lexer's dispatch table.
//
// Ordering matters: the three leading enumerators must keep the values 0, 1 and 2,
// because isIdentStart() tests for equality with CharacterIdentifierStart and
// isIdentPart() tests for "<= CharacterNumber".
enum CharacterType {
    // Characters that may appear inside an identifier (keep these first).
    CharacterIdentifierStart = 0,
    CharacterZero,
    CharacterNumber,

    // Characters handled individually by the main switch.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Operator characters.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Remaining types (currently just white space).
    CharacterWhiteSpace,
};
97
98 // 256 Latin-1 codes
99 static const unsigned short typesOfLatin1Characters[256] = {
100 /*   0 - Null               */ CharacterInvalid,
101 /*   1 - Start of Heading   */ CharacterInvalid,
102 /*   2 - Start of Text      */ CharacterInvalid,
103 /*   3 - End of Text        */ CharacterInvalid,
104 /*   4 - End of Transm.     */ CharacterInvalid,
105 /*   5 - Enquiry            */ CharacterInvalid,
106 /*   6 - Acknowledgment     */ CharacterInvalid,
107 /*   7 - Bell               */ CharacterInvalid,
108 /*   8 - Back Space         */ CharacterInvalid,
109 /*   9 - Horizontal Tab     */ CharacterWhiteSpace,
110 /*  10 - Line Feed          */ CharacterLineTerminator,
111 /*  11 - Vertical Tab       */ CharacterWhiteSpace,
112 /*  12 - Form Feed          */ CharacterWhiteSpace,
113 /*  13 - Carriage Return    */ CharacterLineTerminator,
114 /*  14 - Shift Out          */ CharacterInvalid,
115 /*  15 - Shift In           */ CharacterInvalid,
116 /*  16 - Data Line Escape   */ CharacterInvalid,
117 /*  17 - Device Control 1   */ CharacterInvalid,
118 /*  18 - Device Control 2   */ CharacterInvalid,
119 /*  19 - Device Control 3   */ CharacterInvalid,
120 /*  20 - Device Control 4   */ CharacterInvalid,
121 /*  21 - Negative Ack.      */ CharacterInvalid,
122 /*  22 - Synchronous Idle   */ CharacterInvalid,
123 /*  23 - End of Transmit    */ CharacterInvalid,
124 /*  24 - Cancel             */ CharacterInvalid,
125 /*  25 - End of Medium      */ CharacterInvalid,
126 /*  26 - Substitute         */ CharacterInvalid,
127 /*  27 - Escape             */ CharacterInvalid,
128 /*  28 - File Separator     */ CharacterInvalid,
129 /*  29 - Group Separator    */ CharacterInvalid,
130 /*  30 - Record Separator   */ CharacterInvalid,
131 /*  31 - Unit Separator     */ CharacterInvalid,
132 /*  32 - Space              */ CharacterWhiteSpace,
133 /*  33 - !                  */ CharacterExclamationMark,
134 /*  34 - "                  */ CharacterQuote,
135 /*  35 - #                  */ CharacterInvalid,
136 /*  36 - $                  */ CharacterIdentifierStart,
137 /*  37 - %                  */ CharacterModulo,
138 /*  38 - &                  */ CharacterAnd,
139 /*  39 - '                  */ CharacterQuote,
140 /*  40 - (                  */ CharacterOpenParen,
141 /*  41 - )                  */ CharacterCloseParen,
142 /*  42 - *                  */ CharacterMultiply,
143 /*  43 - +                  */ CharacterAdd,
144 /*  44 - ,                  */ CharacterComma,
145 /*  45 - -                  */ CharacterSub,
146 /*  46 - .                  */ CharacterDot,
147 /*  47 - /                  */ CharacterSlash,
148 /*  48 - 0                  */ CharacterZero,
149 /*  49 - 1                  */ CharacterNumber,
150 /*  50 - 2                  */ CharacterNumber,
151 /*  51 - 3                  */ CharacterNumber,
152 /*  52 - 4                  */ CharacterNumber,
153 /*  53 - 5                  */ CharacterNumber,
154 /*  54 - 6                  */ CharacterNumber,
155 /*  55 - 7                  */ CharacterNumber,
156 /*  56 - 8                  */ CharacterNumber,
157 /*  57 - 9                  */ CharacterNumber,
158 /*  58 - :                  */ CharacterColon,
159 /*  59 - ;                  */ CharacterSemicolon,
160 /*  60 - <                  */ CharacterLess,
161 /*  61 - =                  */ CharacterEqual,
162 /*  62 - >                  */ CharacterGreater,
163 /*  63 - ?                  */ CharacterQuestion,
164 /*  64 - @                  */ CharacterInvalid,
165 /*  65 - A                  */ CharacterIdentifierStart,
166 /*  66 - B                  */ CharacterIdentifierStart,
167 /*  67 - C                  */ CharacterIdentifierStart,
168 /*  68 - D                  */ CharacterIdentifierStart,
169 /*  69 - E                  */ CharacterIdentifierStart,
170 /*  70 - F                  */ CharacterIdentifierStart,
171 /*  71 - G                  */ CharacterIdentifierStart,
172 /*  72 - H                  */ CharacterIdentifierStart,
173 /*  73 - I                  */ CharacterIdentifierStart,
174 /*  74 - J                  */ CharacterIdentifierStart,
175 /*  75 - K                  */ CharacterIdentifierStart,
176 /*  76 - L                  */ CharacterIdentifierStart,
177 /*  77 - M                  */ CharacterIdentifierStart,
178 /*  78 - N                  */ CharacterIdentifierStart,
179 /*  79 - O                  */ CharacterIdentifierStart,
180 /*  80 - P                  */ CharacterIdentifierStart,
181 /*  81 - Q                  */ CharacterIdentifierStart,
182 /*  82 - R                  */ CharacterIdentifierStart,
183 /*  83 - S                  */ CharacterIdentifierStart,
184 /*  84 - T                  */ CharacterIdentifierStart,
185 /*  85 - U                  */ CharacterIdentifierStart,
186 /*  86 - V                  */ CharacterIdentifierStart,
187 /*  87 - W                  */ CharacterIdentifierStart,
188 /*  88 - X                  */ CharacterIdentifierStart,
189 /*  89 - Y                  */ CharacterIdentifierStart,
190 /*  90 - Z                  */ CharacterIdentifierStart,
191 /*  91 - [                  */ CharacterOpenBracket,
192 /*  92 - \                  */ CharacterBackSlash,
193 /*  93 - ]                  */ CharacterCloseBracket,
194 /*  94 - ^                  */ CharacterXor,
195 /*  95 - _                  */ CharacterIdentifierStart,
196 /*  96 - `                  */ CharacterInvalid,
197 /*  97 - a                  */ CharacterIdentifierStart,
198 /*  98 - b                  */ CharacterIdentifierStart,
199 /*  99 - c                  */ CharacterIdentifierStart,
200 /* 100 - d                  */ CharacterIdentifierStart,
201 /* 101 - e                  */ CharacterIdentifierStart,
202 /* 102 - f                  */ CharacterIdentifierStart,
203 /* 103 - g                  */ CharacterIdentifierStart,
204 /* 104 - h                  */ CharacterIdentifierStart,
205 /* 105 - i                  */ CharacterIdentifierStart,
206 /* 106 - j                  */ CharacterIdentifierStart,
207 /* 107 - k                  */ CharacterIdentifierStart,
208 /* 108 - l                  */ CharacterIdentifierStart,
209 /* 109 - m                  */ CharacterIdentifierStart,
210 /* 110 - n                  */ CharacterIdentifierStart,
211 /* 111 - o                  */ CharacterIdentifierStart,
212 /* 112 - p                  */ CharacterIdentifierStart,
213 /* 113 - q                  */ CharacterIdentifierStart,
214 /* 114 - r                  */ CharacterIdentifierStart,
215 /* 115 - s                  */ CharacterIdentifierStart,
216 /* 116 - t                  */ CharacterIdentifierStart,
217 /* 117 - u                  */ CharacterIdentifierStart,
218 /* 118 - v                  */ CharacterIdentifierStart,
219 /* 119 - w                  */ CharacterIdentifierStart,
220 /* 120 - x                  */ CharacterIdentifierStart,
221 /* 121 - y                  */ CharacterIdentifierStart,
222 /* 122 - z                  */ CharacterIdentifierStart,
223 /* 123 - {                  */ CharacterOpenBrace,
224 /* 124 - |                  */ CharacterOr,
225 /* 125 - }                  */ CharacterCloseBrace,
226 /* 126 - ~                  */ CharacterTilde,
227 /* 127 - Delete             */ CharacterInvalid,
228 /* 128 - Cc category        */ CharacterInvalid,
229 /* 129 - Cc category        */ CharacterInvalid,
230 /* 130 - Cc category        */ CharacterInvalid,
231 /* 131 - Cc category        */ CharacterInvalid,
232 /* 132 - Cc category        */ CharacterInvalid,
233 /* 133 - Cc category        */ CharacterInvalid,
234 /* 134 - Cc category        */ CharacterInvalid,
235 /* 135 - Cc category        */ CharacterInvalid,
236 /* 136 - Cc category        */ CharacterInvalid,
237 /* 137 - Cc category        */ CharacterInvalid,
238 /* 138 - Cc category        */ CharacterInvalid,
239 /* 139 - Cc category        */ CharacterInvalid,
240 /* 140 - Cc category        */ CharacterInvalid,
241 /* 141 - Cc category        */ CharacterInvalid,
242 /* 142 - Cc category        */ CharacterInvalid,
243 /* 143 - Cc category        */ CharacterInvalid,
244 /* 144 - Cc category        */ CharacterInvalid,
245 /* 145 - Cc category        */ CharacterInvalid,
246 /* 146 - Cc category        */ CharacterInvalid,
247 /* 147 - Cc category        */ CharacterInvalid,
248 /* 148 - Cc category        */ CharacterInvalid,
249 /* 149 - Cc category        */ CharacterInvalid,
250 /* 150 - Cc category        */ CharacterInvalid,
251 /* 151 - Cc category        */ CharacterInvalid,
252 /* 152 - Cc category        */ CharacterInvalid,
253 /* 153 - Cc category        */ CharacterInvalid,
254 /* 154 - Cc category        */ CharacterInvalid,
255 /* 155 - Cc category        */ CharacterInvalid,
256 /* 156 - Cc category        */ CharacterInvalid,
257 /* 157 - Cc category        */ CharacterInvalid,
258 /* 158 - Cc category        */ CharacterInvalid,
259 /* 159 - Cc category        */ CharacterInvalid,
260 /* 160 - Zs category (nbsp) */ CharacterWhiteSpace,
261 /* 161 - Po category        */ CharacterInvalid,
262 /* 162 - Sc category        */ CharacterInvalid,
263 /* 163 - Sc category        */ CharacterInvalid,
264 /* 164 - Sc category        */ CharacterInvalid,
265 /* 165 - Sc category        */ CharacterInvalid,
266 /* 166 - So category        */ CharacterInvalid,
267 /* 167 - So category        */ CharacterInvalid,
268 /* 168 - Sk category        */ CharacterInvalid,
269 /* 169 - So category        */ CharacterInvalid,
270 /* 170 - Ll category        */ CharacterIdentifierStart,
271 /* 171 - Pi category        */ CharacterInvalid,
272 /* 172 - Sm category        */ CharacterInvalid,
273 /* 173 - Cf category        */ CharacterInvalid,
274 /* 174 - So category        */ CharacterInvalid,
275 /* 175 - Sk category        */ CharacterInvalid,
276 /* 176 - So category        */ CharacterInvalid,
277 /* 177 - Sm category        */ CharacterInvalid,
278 /* 178 - No category        */ CharacterInvalid,
279 /* 179 - No category        */ CharacterInvalid,
280 /* 180 - Sk category        */ CharacterInvalid,
281 /* 181 - Ll category        */ CharacterIdentifierStart,
282 /* 182 - So category        */ CharacterInvalid,
283 /* 183 - Po category        */ CharacterInvalid,
284 /* 184 - Sk category        */ CharacterInvalid,
285 /* 185 - No category        */ CharacterInvalid,
286 /* 186 - Ll category        */ CharacterIdentifierStart,
287 /* 187 - Pf category        */ CharacterInvalid,
288 /* 188 - No category        */ CharacterInvalid,
289 /* 189 - No category        */ CharacterInvalid,
290 /* 190 - No category        */ CharacterInvalid,
291 /* 191 - Po category        */ CharacterInvalid,
292 /* 192 - Lu category        */ CharacterIdentifierStart,
293 /* 193 - Lu category        */ CharacterIdentifierStart,
294 /* 194 - Lu category        */ CharacterIdentifierStart,
295 /* 195 - Lu category        */ CharacterIdentifierStart,
296 /* 196 - Lu category        */ CharacterIdentifierStart,
297 /* 197 - Lu category        */ CharacterIdentifierStart,
298 /* 198 - Lu category        */ CharacterIdentifierStart,
299 /* 199 - Lu category        */ CharacterIdentifierStart,
300 /* 200 - Lu category        */ CharacterIdentifierStart,
301 /* 201 - Lu category        */ CharacterIdentifierStart,
302 /* 202 - Lu category        */ CharacterIdentifierStart,
303 /* 203 - Lu category        */ CharacterIdentifierStart,
304 /* 204 - Lu category        */ CharacterIdentifierStart,
305 /* 205 - Lu category        */ CharacterIdentifierStart,
306 /* 206 - Lu category        */ CharacterIdentifierStart,
307 /* 207 - Lu category        */ CharacterIdentifierStart,
308 /* 208 - Lu category        */ CharacterIdentifierStart,
309 /* 209 - Lu category        */ CharacterIdentifierStart,
310 /* 210 - Lu category        */ CharacterIdentifierStart,
311 /* 211 - Lu category        */ CharacterIdentifierStart,
312 /* 212 - Lu category        */ CharacterIdentifierStart,
313 /* 213 - Lu category        */ CharacterIdentifierStart,
314 /* 214 - Lu category        */ CharacterIdentifierStart,
315 /* 215 - Sm category        */ CharacterInvalid,
316 /* 216 - Lu category        */ CharacterIdentifierStart,
317 /* 217 - Lu category        */ CharacterIdentifierStart,
318 /* 218 - Lu category        */ CharacterIdentifierStart,
319 /* 219 - Lu category        */ CharacterIdentifierStart,
320 /* 220 - Lu category        */ CharacterIdentifierStart,
321 /* 221 - Lu category        */ CharacterIdentifierStart,
322 /* 222 - Lu category        */ CharacterIdentifierStart,
323 /* 223 - Ll category        */ CharacterIdentifierStart,
324 /* 224 - Ll category        */ CharacterIdentifierStart,
325 /* 225 - Ll category        */ CharacterIdentifierStart,
326 /* 226 - Ll category        */ CharacterIdentifierStart,
327 /* 227 - Ll category        */ CharacterIdentifierStart,
328 /* 228 - Ll category        */ CharacterIdentifierStart,
329 /* 229 - Ll category        */ CharacterIdentifierStart,
330 /* 230 - Ll category        */ CharacterIdentifierStart,
331 /* 231 - Ll category        */ CharacterIdentifierStart,
332 /* 232 - Ll category        */ CharacterIdentifierStart,
333 /* 233 - Ll category        */ CharacterIdentifierStart,
334 /* 234 - Ll category        */ CharacterIdentifierStart,
335 /* 235 - Ll category        */ CharacterIdentifierStart,
336 /* 236 - Ll category        */ CharacterIdentifierStart,
337 /* 237 - Ll category        */ CharacterIdentifierStart,
338 /* 238 - Ll category        */ CharacterIdentifierStart,
339 /* 239 - Ll category        */ CharacterIdentifierStart,
340 /* 240 - Ll category        */ CharacterIdentifierStart,
341 /* 241 - Ll category        */ CharacterIdentifierStart,
342 /* 242 - Ll category        */ CharacterIdentifierStart,
343 /* 243 - Ll category        */ CharacterIdentifierStart,
344 /* 244 - Ll category        */ CharacterIdentifierStart,
345 /* 245 - Ll category        */ CharacterIdentifierStart,
346 /* 246 - Ll category        */ CharacterIdentifierStart,
347 /* 247 - Sm category        */ CharacterInvalid,
348 /* 248 - Ll category        */ CharacterIdentifierStart,
349 /* 249 - Ll category        */ CharacterIdentifierStart,
350 /* 250 - Ll category        */ CharacterIdentifierStart,
351 /* 251 - Ll category        */ CharacterIdentifierStart,
352 /* 252 - Ll category        */ CharacterIdentifierStart,
353 /* 253 - Ll category        */ CharacterIdentifierStart,
354 /* 254 - Ll category        */ CharacterIdentifierStart,
355 /* 255 - Ll category        */ CharacterIdentifierStart
356 };
357
358 // This table provides the character that results from \X where X is the index in the table beginning
359 // with SPACE. A table value of 0 means that more processing needs to be done.
360 static const LChar singleCharacterEscapeValuesForASCII[128] = {
361 /*   0 - Null               */ 0,
362 /*   1 - Start of Heading   */ 0,
363 /*   2 - Start of Text      */ 0,
364 /*   3 - End of Text        */ 0,
365 /*   4 - End of Transm.     */ 0,
366 /*   5 - Enquiry            */ 0,
367 /*   6 - Acknowledgment     */ 0,
368 /*   7 - Bell               */ 0,
369 /*   8 - Back Space         */ 0,
370 /*   9 - Horizontal Tab     */ 0,
371 /*  10 - Line Feed          */ 0,
372 /*  11 - Vertical Tab       */ 0,
373 /*  12 - Form Feed          */ 0,
374 /*  13 - Carriage Return    */ 0,
375 /*  14 - Shift Out          */ 0,
376 /*  15 - Shift In           */ 0,
377 /*  16 - Data Line Escape   */ 0,
378 /*  17 - Device Control 1   */ 0,
379 /*  18 - Device Control 2   */ 0,
380 /*  19 - Device Control 3   */ 0,
381 /*  20 - Device Control 4   */ 0,
382 /*  21 - Negative Ack.      */ 0,
383 /*  22 - Synchronous Idle   */ 0,
384 /*  23 - End of Transmit    */ 0,
385 /*  24 - Cancel             */ 0,
386 /*  25 - End of Medium      */ 0,
387 /*  26 - Substitute         */ 0,
388 /*  27 - Escape             */ 0,
389 /*  28 - File Separator     */ 0,
390 /*  29 - Group Separator    */ 0,
391 /*  30 - Record Separator   */ 0,
392 /*  31 - Unit Separator     */ 0,
393 /*  32 - Space              */ ' ',
394 /*  33 - !                  */ '!',
395 /*  34 - "                  */ '"',
396 /*  35 - #                  */ '#',
397 /*  36 - $                  */ '$',
398 /*  37 - %                  */ '%',
399 /*  38 - &                  */ '&',
400 /*  39 - '                  */ '\'',
401 /*  40 - (                  */ '(',
402 /*  41 - )                  */ ')',
403 /*  42 - *                  */ '*',
404 /*  43 - +                  */ '+',
405 /*  44 - ,                  */ ',',
406 /*  45 - -                  */ '-',
407 /*  46 - .                  */ '.',
408 /*  47 - /                  */ '/',
409 /*  48 - 0                  */ 0,
410 /*  49 - 1                  */ 0,
411 /*  50 - 2                  */ 0,
412 /*  51 - 3                  */ 0,
413 /*  52 - 4                  */ 0,
414 /*  53 - 5                  */ 0,
415 /*  54 - 6                  */ 0,
416 /*  55 - 7                  */ 0,
417 /*  56 - 8                  */ 0,
418 /*  57 - 9                  */ 0,
419 /*  58 - :                  */ ':',
420 /*  59 - ;                  */ ';',
421 /*  60 - <                  */ '<',
422 /*  61 - =                  */ '=',
423 /*  62 - >                  */ '>',
424 /*  63 - ?                  */ '?',
425 /*  64 - @                  */ '@',
426 /*  65 - A                  */ 'A',
427 /*  66 - B                  */ 'B',
428 /*  67 - C                  */ 'C',
429 /*  68 - D                  */ 'D',
430 /*  69 - E                  */ 'E',
431 /*  70 - F                  */ 'F',
432 /*  71 - G                  */ 'G',
433 /*  72 - H                  */ 'H',
434 /*  73 - I                  */ 'I',
435 /*  74 - J                  */ 'J',
436 /*  75 - K                  */ 'K',
437 /*  76 - L                  */ 'L',
438 /*  77 - M                  */ 'M',
439 /*  78 - N                  */ 'N',
440 /*  79 - O                  */ 'O',
441 /*  80 - P                  */ 'P',
442 /*  81 - Q                  */ 'Q',
443 /*  82 - R                  */ 'R',
444 /*  83 - S                  */ 'S',
445 /*  84 - T                  */ 'T',
446 /*  85 - U                  */ 'U',
447 /*  86 - V                  */ 'V',
448 /*  87 - W                  */ 'W',
449 /*  88 - X                  */ 'X',
450 /*  89 - Y                  */ 'Y',
451 /*  90 - Z                  */ 'Z',
452 /*  91 - [                  */ '[',
453 /*  92 - \                  */ '\\',
454 /*  93 - ]                  */ ']',
455 /*  94 - ^                  */ '^',
456 /*  95 - _                  */ '_',
457 /*  96 - `                  */ '`',
458 /*  97 - a                  */ 'a',
459 /*  98 - b                  */ 0x08,
460 /*  99 - c                  */ 'c',
461 /* 100 - d                  */ 'd',
462 /* 101 - e                  */ 'e',
463 /* 102 - f                  */ 0x0C,
464 /* 103 - g                  */ 'g',
465 /* 104 - h                  */ 'h',
466 /* 105 - i                  */ 'i',
467 /* 106 - j                  */ 'j',
468 /* 107 - k                  */ 'k',
469 /* 108 - l                  */ 'l',
470 /* 109 - m                  */ 'm',
471 /* 110 - n                  */ 0x0A,
472 /* 111 - o                  */ 'o',
473 /* 112 - p                  */ 'p',
474 /* 113 - q                  */ 'q',
475 /* 114 - r                  */ 0x0D,
476 /* 115 - s                  */ 's',
477 /* 116 - t                  */ 0x09,
478 /* 117 - u                  */ 0,
479 /* 118 - v                  */ 0x0B,
480 /* 119 - w                  */ 'w',
481 /* 120 - x                  */ 0,
482 /* 121 - y                  */ 'y',
483 /* 122 - z                  */ 'z',
484 /* 123 - {                  */ '{',
485 /* 124 - |                  */ '|',
486 /* 125 - }                  */ '}',
487 /* 126 - ~                  */ '~',
488 /* 127 - Delete             */ 0
489 };
490
491 template <typename T>
492 Lexer<T>::Lexer(VM* vm)
493     : m_isReparsing(false)
494     , m_vm(vm)
495 {
496 }
497
498 template <typename T>
499 Lexer<T>::~Lexer()
500 {
501 }
502
503 template <typename T>
504 String Lexer<T>::invalidCharacterMessage() const
505 {
506     switch (m_current) {
507     case 0:
508         return "Invalid character: '\\0'";
509     case 10:
510         return "Invalid character: '\\n'";
511     case 11:
512         return "Invalid character: '\\v'";
513     case 13:
514         return "Invalid character: '\\r'";
515     case 35:
516         return "Invalid character: '#'";
517     case 64:
518         return "Invalid character: '@'";
519     case 96:
520         return "Invalid character: '`'";
521     default:
522         return String::format("Invalid character '\\u%04u'", static_cast<unsigned>(m_current)).impl();
523     }
524 }
525
526 template <typename T>
527 ALWAYS_INLINE const T* Lexer<T>::currentSourcePtr() const
528 {
529     ASSERT(m_code <= m_codeEnd);
530     return m_code;
531 }
532
533 template <typename T>
534 void Lexer<T>::setCode(const SourceCode& source, ParserArena* arena)
535 {
536     m_arena = &arena->identifierArena();
537     
538     m_lineNumber = source.firstLine();
539     m_lastToken = -1;
540     
541     const String& sourceString = source.provider()->source();
542
543     if (!sourceString.isNull())
544         setCodeStart(sourceString.impl());
545     else
546         m_codeStart = 0;
547
548     m_source = &source;
549     m_sourceOffset = source.startOffset();
550     m_codeStartPlusOffset = m_codeStart + source.startOffset();
551     m_code = m_codeStartPlusOffset;
552     m_codeEnd = m_codeStart + source.endOffset();
553     m_error = false;
554     m_atLineStart = true;
555     m_lineStart = m_code;
556     m_lexErrorMessage = String();
557     
558     m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
559     m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
560     
561     if (LIKELY(m_code < m_codeEnd))
562         m_current = *m_code;
563     else
564         m_current = 0;
565     ASSERT(currentOffset() == source.startOffset());
566 }
567
568 template <typename T>
569 template <int shiftAmount> ALWAYS_INLINE void Lexer<T>::internalShift()
570 {
571     m_code += shiftAmount;
572     ASSERT(currentOffset() >= currentLineStartOffset());
573     m_current = *m_code;
574 }
575
576 template <typename T>
577 ALWAYS_INLINE void Lexer<T>::shift()
578 {
579     // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
580     m_current = 0;
581     ++m_code;
582     if (LIKELY(m_code < m_codeEnd))
583         m_current = *m_code;
584 }
585
586 template <typename T>
587 ALWAYS_INLINE bool Lexer<T>::atEnd() const
588 {
589     ASSERT(!m_current || m_code < m_codeEnd);
590     return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
591 }
592
593 template <typename T>
594 ALWAYS_INLINE T Lexer<T>::peek(int offset) const
595 {
596     ASSERT(offset > 0 && offset < 5);
597     const T* code = m_code + offset;
598     return (code < m_codeEnd) ? *code : 0;
599 }
600
601 template <typename T>
602 typename Lexer<T>::UnicodeHexValue Lexer<T>::parseFourDigitUnicodeHex()
603 {
604     T char1 = peek(1);
605     T char2 = peek(2);
606     T char3 = peek(3);
607
608     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
609         return UnicodeHexValue((m_code + 4) >= m_codeEnd ? UnicodeHexValue::IncompleteHex : UnicodeHexValue::InvalidHex);
610
611     int result = convertUnicode(m_current, char1, char2, char3);
612     shift();
613     shift();
614     shift();
615     shift();
616     return UnicodeHexValue(result);
617 }
618
619 template <typename T>
620 void Lexer<T>::shiftLineTerminator()
621 {
622     ASSERT(isLineTerminator(m_current));
623
624     T prev = m_current;
625     shift();
626
627     // Allow both CRLF and LFCR.
628     if (prev + m_current == '\n' + '\r')
629         shift();
630
631     ++m_lineNumber;
632 }
633
634 template <typename T>
635 ALWAYS_INLINE bool Lexer<T>::lastTokenWasRestrKeyword() const
636 {
637     return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
638 }
639
640 static NEVER_INLINE bool isNonLatin1IdentStart(int c)
641 {
642     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
643 }
644
645 static ALWAYS_INLINE bool isLatin1(LChar)
646 {
647     return true;
648 }
649
650 static ALWAYS_INLINE bool isLatin1(UChar c)
651 {
652     return c < 256;
653 }
654
655 static inline bool isIdentStart(LChar c)
656 {
657     return typesOfLatin1Characters[c] == CharacterIdentifierStart;
658 }
659
660 static inline bool isIdentStart(UChar c)
661 {
662     return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
663 }
664
665 static NEVER_INLINE bool isNonLatin1IdentPart(int c)
666 {
667     return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
668         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector)) || c == 0x200C || c == 0x200D;
669 }
670
671 static ALWAYS_INLINE bool isIdentPart(LChar c)
672 {
673     // Character types are divided into two groups depending on whether they can be part of an
674     // identifier or not. Those whose type value is less or equal than CharacterNumber can be
675     // part of an identifier. (See the CharacterType definition for more details.)
676     return typesOfLatin1Characters[c] <= CharacterNumber;
677 }
678
679 static ALWAYS_INLINE bool isIdentPart(UChar c)
680 {
681     return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
682 }
683
684 static inline LChar singleEscape(int c)
685 {
686     if (c < 128) {
687         ASSERT(static_cast<size_t>(c) < ARRAY_SIZE(singleCharacterEscapeValuesForASCII));
688         return singleCharacterEscapeValuesForASCII[c];
689     }
690     return 0;
691 }
692
693 template <typename T>
694 inline void Lexer<T>::record8(int c)
695 {
696     ASSERT(c >= 0);
697     ASSERT(c <= 0xFF);
698     m_buffer8.append(static_cast<LChar>(c));
699 }
700
701 template <typename T>
702 inline void assertCharIsIn8BitRange(T c)
703 {
704     UNUSED_PARAM(c);
705     ASSERT(c >= 0);
706     ASSERT(c <= 0xFF);
707 }
708
709 template <>
710 inline void assertCharIsIn8BitRange(UChar c)
711 {
712     UNUSED_PARAM(c);
713     ASSERT(c <= 0xFF);
714 }
715
716 template <>
717 inline void assertCharIsIn8BitRange(LChar)
718 {
719 }
720
721 template <typename T>
722 inline void Lexer<T>::append8(const T* p, size_t length)
723 {
724     size_t currentSize = m_buffer8.size();
725     m_buffer8.grow(currentSize + length);
726     LChar* rawBuffer = m_buffer8.data() + currentSize;
727
728     for (size_t i = 0; i < length; i++) {
729         T c = p[i];
730         assertCharIsIn8BitRange(c);
731         rawBuffer[i] = c;
732     }
733 }
734
735 template <typename T>
736 inline void Lexer<T>::append16(const LChar* p, size_t length)
737 {
738     size_t currentSize = m_buffer16.size();
739     m_buffer16.grow(currentSize + length);
740     UChar* rawBuffer = m_buffer16.data() + currentSize;
741
742     for (size_t i = 0; i < length; i++)
743         rawBuffer[i] = p[i];
744 }
745
746 template <typename T>
747 inline void Lexer<T>::record16(T c)
748 {
749     m_buffer16.append(c);
750 }
751
752 template <typename T>
753 inline void Lexer<T>::record16(int c)
754 {
755     ASSERT(c >= 0);
756     ASSERT(c <= static_cast<int>(USHRT_MAX));
757     m_buffer16.append(static_cast<UChar>(c));
758 }
759
760 template <>
761 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<LChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
762 {
763     const ptrdiff_t remaining = m_codeEnd - m_code;
764     if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
765         JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
766         if (keyword != IDENT) {
767             ASSERT((!shouldCreateIdentifier) || tokenData->ident);
768             return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
769         }
770     }
771
772     const LChar* identifierStart = currentSourcePtr();
773     unsigned identifierLineStart = currentLineStartOffset();
774     
775     while (isIdentPart(m_current))
776         shift();
777     
778     if (UNLIKELY(m_current == '\\')) {
779         setOffsetFromSourcePtr(identifierStart, identifierLineStart);
780         return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
781     }
782
783     const Identifier* ident = 0;
784     
785     if (shouldCreateIdentifier) {
786         int identifierLength = currentSourcePtr() - identifierStart;
787         ident = makeIdentifier(identifierStart, identifierLength);
788
789         tokenData->ident = ident;
790     } else
791         tokenData->ident = 0;
792
793     if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
794         ASSERT(shouldCreateIdentifier);
795         if (remaining < maxTokenLength) {
796             const HashEntry* entry = m_vm->keywords->getKeyword(*ident);
797             ASSERT((remaining < maxTokenLength) || !entry);
798             if (!entry)
799                 return IDENT;
800             JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
801             return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
802         }
803         return IDENT;
804     }
805
806     return IDENT;
807 }
808
809 template <>
810 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
811 {
812     const ptrdiff_t remaining = m_codeEnd - m_code;
813     if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
814         JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
815         if (keyword != IDENT) {
816             ASSERT((!shouldCreateIdentifier) || tokenData->ident);
817             return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
818         }
819     }
820
821     const UChar* identifierStart = currentSourcePtr();
822     int identifierLineStart = currentLineStartOffset();
823
824     UChar orAllChars = 0;
825     
826     while (isIdentPart(m_current)) {
827         orAllChars |= m_current;
828         shift();
829     }
830     
831     if (UNLIKELY(m_current == '\\')) {
832         setOffsetFromSourcePtr(identifierStart, identifierLineStart);
833         return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
834     }
835
836     bool isAll8Bit = false;
837
838     if (!(orAllChars & ~0xff))
839         isAll8Bit = true;
840
841     const Identifier* ident = 0;
842     
843     if (shouldCreateIdentifier) {
844         int identifierLength = currentSourcePtr() - identifierStart;
845         if (isAll8Bit)
846             ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
847         else
848             ident = makeIdentifier(identifierStart, identifierLength);
849         
850         tokenData->ident = ident;
851     } else
852         tokenData->ident = 0;
853     
854     if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
855         ASSERT(shouldCreateIdentifier);
856         if (remaining < maxTokenLength) {
857             const HashEntry* entry = m_vm->keywords->getKeyword(*ident);
858             ASSERT((remaining < maxTokenLength) || !entry);
859             if (!entry)
860                 return IDENT;
861             JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
862             return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
863         }
864         return IDENT;
865     }
866
867     return IDENT;
868 }
869
870 template <typename T>
871 template <bool shouldCreateIdentifier> JSTokenType Lexer<T>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
872 {
873     const ptrdiff_t remaining = m_codeEnd - m_code;
874     const T* identifierStart = currentSourcePtr();
875     bool bufferRequired = false;
876
877     while (true) {
878         if (LIKELY(isIdentPart(m_current))) {
879             shift();
880             continue;
881         }
882         if (LIKELY(m_current != '\\'))
883             break;
884
885         // \uXXXX unicode characters.
886         bufferRequired = true;
887         if (identifierStart != currentSourcePtr())
888             m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
889         shift();
890         if (UNLIKELY(m_current != 'u'))
891             return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
892         shift();
893         UnicodeHexValue character = parseFourDigitUnicodeHex();
894         if (UNLIKELY(!character.isValid()))
895             return character.valueType() == UnicodeHexValue::IncompleteHex ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
896         UChar ucharacter = static_cast<UChar>(character.value());
897         if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter)))
898             return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
899         if (shouldCreateIdentifier)
900             record16(ucharacter);
901         identifierStart = currentSourcePtr();
902     }
903
904     int identifierLength;
905     const Identifier* ident = 0;
906     if (shouldCreateIdentifier) {
907         if (!bufferRequired) {
908             identifierLength = currentSourcePtr() - identifierStart;
909             ident = makeIdentifier(identifierStart, identifierLength);
910         } else {
911             if (identifierStart != currentSourcePtr())
912                 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
913             ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
914         }
915
916         tokenData->ident = ident;
917     } else
918         tokenData->ident = 0;
919
920     if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
921         ASSERT(shouldCreateIdentifier);
922         // Keywords must not be recognized if there was an \uXXXX in the identifier.
923         if (remaining < maxTokenLength) {
924             const HashEntry* entry = m_vm->keywords->getKeyword(*ident);
925             ASSERT((remaining < maxTokenLength) || !entry);
926             if (!entry)
927                 return IDENT;
928             JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
929             return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
930         }
931         return IDENT;
932     }
933
934     m_buffer16.resize(0);
935     return IDENT;
936 }
937
938 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
939 {
940     return character < 0xE;
941 }
942
943 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
944 {
945     return character < 0xE || character > 0xFF;
946 }
947
948 template <typename T>
949 template <bool shouldBuildStrings> ALWAYS_INLINE typename Lexer<T>::StringParseResult Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
950 {
951     int startingOffset = currentOffset();
952     int startingLineStartOffset = currentLineStartOffset();
953     int startingLineNumber = lineNumber();
954     T stringQuoteCharacter = m_current;
955     shift();
956
957     const T* stringStart = currentSourcePtr();
958
959     while (m_current != stringQuoteCharacter) {
960         if (UNLIKELY(m_current == '\\')) {
961             if (stringStart != currentSourcePtr() && shouldBuildStrings)
962                 append8(stringStart, currentSourcePtr() - stringStart);
963             shift();
964
965             LChar escape = singleEscape(m_current);
966
967             // Most common escape sequences first
968             if (escape) {
969                 if (shouldBuildStrings)
970                     record8(escape);
971                 shift();
972             } else if (UNLIKELY(isLineTerminator(m_current)))
973                 shiftLineTerminator();
974             else if (m_current == 'x') {
975                 shift();
976                 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
977                     m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
978                     return (atEnd() || (isASCIIHexDigit(m_current) && (m_code + 1 == m_codeEnd))) ? StringUnterminated : StringCannotBeParsed;
979                 }
980                 T prev = m_current;
981                 shift();
982                 if (shouldBuildStrings)
983                     record8(convertHex(prev, m_current));
984                 shift();
985             } else {
986                 setOffset(startingOffset, startingLineStartOffset);
987                 setLineNumber(startingLineNumber);
988                 m_buffer8.resize(0);
989                 return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
990             }
991             stringStart = currentSourcePtr();
992             continue;
993         }
994
995         if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
996             setOffset(startingOffset, startingLineStartOffset);
997             setLineNumber(startingLineNumber);
998             m_buffer8.resize(0);
999             return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
1000         }
1001
1002         shift();
1003     }
1004
1005     if (currentSourcePtr() != stringStart && shouldBuildStrings)
1006         append8(stringStart, currentSourcePtr() - stringStart);
1007     if (shouldBuildStrings) {
1008         tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size());
1009         m_buffer8.resize(0);
1010     } else
1011         tokenData->ident = 0;
1012
1013     return StringParsedSuccessfully;
1014 }
1015
1016 template <typename T>
1017 template <bool shouldBuildStrings> typename Lexer<T>::StringParseResult Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode)
1018 {
1019     T stringQuoteCharacter = m_current;
1020     shift();
1021
1022     const T* stringStart = currentSourcePtr();
1023
1024     while (m_current != stringQuoteCharacter) {
1025         if (UNLIKELY(m_current == '\\')) {
1026             if (stringStart != currentSourcePtr() && shouldBuildStrings)
1027                 append16(stringStart, currentSourcePtr() - stringStart);
1028             shift();
1029
1030             LChar escape = singleEscape(m_current);
1031
1032             // Most common escape sequences first
1033             if (escape) {
1034                 if (shouldBuildStrings)
1035                     record16(escape);
1036                 shift();
1037             } else if (UNLIKELY(isLineTerminator(m_current)))
1038                 shiftLineTerminator();
1039             else if (m_current == 'x') {
1040                 shift();
1041                 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
1042                     m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
1043                     return StringCannotBeParsed;
1044                 }
1045                 T prev = m_current;
1046                 shift();
1047                 if (shouldBuildStrings)
1048                     record16(convertHex(prev, m_current));
1049                 shift();
1050             } else if (m_current == 'u') {
1051                 shift();
1052                 UnicodeHexValue character = parseFourDigitUnicodeHex();
1053                 if (character.isValid()) {
1054                     if (shouldBuildStrings)
1055                         record16(character.value());
1056                 } else if (m_current == stringQuoteCharacter) {
1057                     if (shouldBuildStrings)
1058                         record16('u');
1059                 } else {
1060                     m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence";
1061                     return character.valueType() == UnicodeHexValue::IncompleteHex ? StringUnterminated : StringCannotBeParsed;
1062                 }
1063             } else if (strictMode && isASCIIDigit(m_current)) {
1064                 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
1065                 int character1 = m_current;
1066                 shift();
1067                 if (character1 != '0' || isASCIIDigit(m_current)) {
1068                     m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'";
1069                     return StringCannotBeParsed;
1070                 }
1071                 if (shouldBuildStrings)
1072                     record16(0);
1073             } else if (!strictMode && isASCIIOctalDigit(m_current)) {
1074                 // Octal character sequences
1075                 T character1 = m_current;
1076                 shift();
1077                 if (isASCIIOctalDigit(m_current)) {
1078                     // Two octal characters
1079                     T character2 = m_current;
1080                     shift();
1081                     if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
1082                         if (shouldBuildStrings)
1083                             record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
1084                         shift();
1085                     } else {
1086                         if (shouldBuildStrings)
1087                             record16((character1 - '0') * 8 + character2 - '0');
1088                     }
1089                 } else {
1090                     if (shouldBuildStrings)
1091                         record16(character1 - '0');
1092                 }
1093             } else if (!atEnd()) {
1094                 if (shouldBuildStrings)
1095                     record16(m_current);
1096                 shift();
1097             } else {
1098                 m_lexErrorMessage = "Unterminated string constant";
1099                 return StringUnterminated;
1100             }
1101
1102             stringStart = currentSourcePtr();
1103             continue;
1104         }
1105         // Fast check for characters that require special handling.
1106         // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
1107         // as possible, and lets through all common ASCII characters.
1108         if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
1109             // New-line or end of input is not allowed
1110             if (atEnd() || isLineTerminator(m_current)) {
1111                 m_lexErrorMessage = "Unexpected EOF";
1112                 return atEnd() ? StringUnterminated : StringCannotBeParsed;
1113             }
1114             // Anything else is just a normal character
1115         }
1116         shift();
1117     }
1118
1119     if (currentSourcePtr() != stringStart && shouldBuildStrings)
1120         append16(stringStart, currentSourcePtr() - stringStart);
1121     if (shouldBuildStrings)
1122         tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1123     else
1124         tokenData->ident = 0;
1125
1126     m_buffer16.resize(0);
1127     return StringParsedSuccessfully;
1128 }
1129
1130 template <typename T>
1131 ALWAYS_INLINE void Lexer<T>::parseHex(double& returnValue)
1132 {
1133     // Optimization: most hexadecimal values fit into 4 bytes.
1134     uint32_t hexValue = 0;
1135     int maximumDigits = 7;
1136
1137     // Shift out the 'x' prefix.
1138     shift();
1139
1140     do {
1141         hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
1142         shift();
1143         --maximumDigits;
1144     } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
1145
1146     if (maximumDigits >= 0) {
1147         returnValue = hexValue;
1148         return;
1149     }
1150
1151     // No more place in the hexValue buffer.
1152     // The values are shifted out and placed into the m_buffer8 vector.
1153     for (int i = 0; i < 8; ++i) {
1154          int digit = hexValue >> 28;
1155          if (digit < 10)
1156              record8(digit + '0');
1157          else
1158              record8(digit - 10 + 'a');
1159          hexValue <<= 4;
1160     }
1161
1162     while (isASCIIHexDigit(m_current)) {
1163         record8(m_current);
1164         shift();
1165     }
1166
1167     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
1168 }
1169
1170 template <typename T>
1171 ALWAYS_INLINE bool Lexer<T>::parseOctal(double& returnValue)
1172 {
1173     // Optimization: most octal values fit into 4 bytes.
1174     uint32_t octalValue = 0;
1175     int maximumDigits = 9;
1176     // Temporary buffer for the digits. Makes easier
1177     // to reconstruct the input characters when needed.
1178     LChar digits[10];
1179
1180     do {
1181         octalValue = octalValue * 8 + (m_current - '0');
1182         digits[maximumDigits] = m_current;
1183         shift();
1184         --maximumDigits;
1185     } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
1186
1187     if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
1188         returnValue = octalValue;
1189         return true;
1190     }
1191
1192     for (int i = 9; i > maximumDigits; --i)
1193          record8(digits[i]);
1194
1195     while (isASCIIOctalDigit(m_current)) {
1196         record8(m_current);
1197         shift();
1198     }
1199
1200     if (isASCIIDigit(m_current))
1201         return false;
1202
1203     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
1204     return true;
1205 }
1206
1207 template <typename T>
1208 ALWAYS_INLINE bool Lexer<T>::parseDecimal(double& returnValue)
1209 {
1210     // Optimization: most decimal values fit into 4 bytes.
1211     uint32_t decimalValue = 0;
1212
1213     // Since parseOctal may be executed before parseDecimal,
1214     // the m_buffer8 may hold ascii digits.
1215     if (!m_buffer8.size()) {
1216         int maximumDigits = 9;
1217         // Temporary buffer for the digits. Makes easier
1218         // to reconstruct the input characters when needed.
1219         LChar digits[10];
1220
1221         do {
1222             decimalValue = decimalValue * 10 + (m_current - '0');
1223             digits[maximumDigits] = m_current;
1224             shift();
1225             --maximumDigits;
1226         } while (isASCIIDigit(m_current) && maximumDigits >= 0);
1227
1228         if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
1229             returnValue = decimalValue;
1230             return true;
1231         }
1232
1233         for (int i = 9; i > maximumDigits; --i)
1234             record8(digits[i]);
1235     }
1236
1237     while (isASCIIDigit(m_current)) {
1238         record8(m_current);
1239         shift();
1240     }
1241
1242     return false;
1243 }
1244
1245 template <typename T>
1246 ALWAYS_INLINE void Lexer<T>::parseNumberAfterDecimalPoint()
1247 {
1248     record8('.');
1249     while (isASCIIDigit(m_current)) {
1250         record8(m_current);
1251         shift();
1252     }
1253 }
1254
1255 template <typename T>
1256 ALWAYS_INLINE bool Lexer<T>::parseNumberAfterExponentIndicator()
1257 {
1258     record8('e');
1259     shift();
1260     if (m_current == '+' || m_current == '-') {
1261         record8(m_current);
1262         shift();
1263     }
1264
1265     if (!isASCIIDigit(m_current))
1266         return false;
1267
1268     do {
1269         record8(m_current);
1270         shift();
1271     } while (isASCIIDigit(m_current));
1272     return true;
1273 }
1274
1275 template <typename T>
1276 ALWAYS_INLINE bool Lexer<T>::parseMultilineComment()
1277 {
1278     while (true) {
1279         while (UNLIKELY(m_current == '*')) {
1280             shift();
1281             if (m_current == '/') {
1282                 shift();
1283                 return true;
1284             }
1285         }
1286
1287         if (atEnd())
1288             return false;
1289
1290         if (isLineTerminator(m_current)) {
1291             shiftLineTerminator();
1292             m_terminator = true;
1293         } else
1294             shift();
1295     }
1296 }
1297
1298 template <typename T>
1299 bool Lexer<T>::nextTokenIsColon()
1300 {
1301     const T* code = m_code;
1302     while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
1303         code++;
1304     
1305     return code < m_codeEnd && *code == ':';
1306 }
1307
1308 template <typename T>
1309 JSTokenType Lexer<T>::lex(JSTokenData* tokenData, JSTokenLocation* tokenLocation, unsigned lexerFlags, bool strictMode)
1310 {
1311     ASSERT(!m_error);
1312     ASSERT(m_buffer8.isEmpty());
1313     ASSERT(m_buffer16.isEmpty());
1314
1315     JSTokenType token = ERRORTOK;
1316     m_terminator = false;
1317
1318 start:
1319     while (isWhiteSpace(m_current))
1320         shift();
1321
1322     if (atEnd())
1323         return EOFTOK;
1324     
1325     tokenLocation->startOffset = currentOffset();
1326     ASSERT(currentOffset() >= currentLineStartOffset());
1327
1328     CharacterType type;
1329     if (LIKELY(isLatin1(m_current)))
1330         type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
1331     else if (isNonLatin1IdentStart(m_current))
1332         type = CharacterIdentifierStart;
1333     else if (isLineTerminator(m_current))
1334         type = CharacterLineTerminator;
1335     else
1336         type = CharacterInvalid;
1337
1338     switch (type) {
1339     case CharacterGreater:
1340         shift();
1341         if (m_current == '>') {
1342             shift();
1343             if (m_current == '>') {
1344                 shift();
1345                 if (m_current == '=') {
1346                     shift();
1347                     token = URSHIFTEQUAL;
1348                     break;
1349                 }
1350                 token = URSHIFT;
1351                 break;
1352             }
1353             if (m_current == '=') {
1354                 shift();
1355                 token = RSHIFTEQUAL;
1356                 break;
1357             }
1358             token = RSHIFT;
1359             break;
1360         }
1361         if (m_current == '=') {
1362             shift();
1363             token = GE;
1364             break;
1365         }
1366         token = GT;
1367         break;
1368     case CharacterEqual:
1369         shift();
1370         if (m_current == '=') {
1371             shift();
1372             if (m_current == '=') {
1373                 shift();
1374                 token = STREQ;
1375                 break;
1376             }
1377             token = EQEQ;
1378             break;
1379         }
1380         token = EQUAL;
1381         break;
1382     case CharacterLess:
1383         shift();
1384         if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
1385             // <!-- marks the beginning of a line comment (for www usage)
1386             goto inSingleLineComment;
1387         }
1388         if (m_current == '<') {
1389             shift();
1390             if (m_current == '=') {
1391                 shift();
1392                 token = LSHIFTEQUAL;
1393                 break;
1394             }
1395             token = LSHIFT;
1396             break;
1397         }
1398         if (m_current == '=') {
1399             shift();
1400             token = LE;
1401             break;
1402         }
1403         token = LT;
1404         break;
1405     case CharacterExclamationMark:
1406         shift();
1407         if (m_current == '=') {
1408             shift();
1409             if (m_current == '=') {
1410                 shift();
1411                 token = STRNEQ;
1412                 break;
1413             }
1414             token = NE;
1415             break;
1416         }
1417         token = EXCLAMATION;
1418         break;
1419     case CharacterAdd:
1420         shift();
1421         if (m_current == '+') {
1422             shift();
1423             token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
1424             break;
1425         }
1426         if (m_current == '=') {
1427             shift();
1428             token = PLUSEQUAL;
1429             break;
1430         }
1431         token = PLUS;
1432         break;
1433     case CharacterSub:
1434         shift();
1435         if (m_current == '-') {
1436             shift();
1437             if (m_atLineStart && m_current == '>') {
1438                 shift();
1439                 goto inSingleLineComment;
1440             }
1441             token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
1442             break;
1443         }
1444         if (m_current == '=') {
1445             shift();
1446             token = MINUSEQUAL;
1447             break;
1448         }
1449         token = MINUS;
1450         break;
1451     case CharacterMultiply:
1452         shift();
1453         if (m_current == '=') {
1454             shift();
1455             token = MULTEQUAL;
1456             break;
1457         }
1458         token = TIMES;
1459         break;
1460     case CharacterSlash:
1461         shift();
1462         if (m_current == '/') {
1463             shift();
1464             goto inSingleLineComment;
1465         }
1466         if (m_current == '*') {
1467             shift();
1468             if (parseMultilineComment())
1469                 goto start;
1470             m_lexErrorMessage = "Multiline comment was not closed properly";
1471             token = UNTERMINATED_MULTILINE_COMMENT_ERRORTOK;
1472             goto returnError;
1473         }
1474         if (m_current == '=') {
1475             shift();
1476             token = DIVEQUAL;
1477             break;
1478         }
1479         token = DIVIDE;
1480         break;
1481     case CharacterAnd:
1482         shift();
1483         if (m_current == '&') {
1484             shift();
1485             token = AND;
1486             break;
1487         }
1488         if (m_current == '=') {
1489             shift();
1490             token = ANDEQUAL;
1491             break;
1492         }
1493         token = BITAND;
1494         break;
1495     case CharacterXor:
1496         shift();
1497         if (m_current == '=') {
1498             shift();
1499             token = XOREQUAL;
1500             break;
1501         }
1502         token = BITXOR;
1503         break;
1504     case CharacterModulo:
1505         shift();
1506         if (m_current == '=') {
1507             shift();
1508             token = MODEQUAL;
1509             break;
1510         }
1511         token = MOD;
1512         break;
1513     case CharacterOr:
1514         shift();
1515         if (m_current == '=') {
1516             shift();
1517             token = OREQUAL;
1518             break;
1519         }
1520         if (m_current == '|') {
1521             shift();
1522             token = OR;
1523             break;
1524         }
1525         token = BITOR;
1526         break;
1527     case CharacterOpenParen:
1528         token = OPENPAREN;
1529         shift();
1530         break;
1531     case CharacterCloseParen:
1532         token = CLOSEPAREN;
1533         shift();
1534         break;
1535     case CharacterOpenBracket:
1536         token = OPENBRACKET;
1537         shift();
1538         break;
1539     case CharacterCloseBracket:
1540         token = CLOSEBRACKET;
1541         shift();
1542         break;
1543     case CharacterComma:
1544         token = COMMA;
1545         shift();
1546         break;
1547     case CharacterColon:
1548         token = COLON;
1549         shift();
1550         break;
1551     case CharacterQuestion:
1552         token = QUESTION;
1553         shift();
1554         break;
1555     case CharacterTilde:
1556         token = TILDE;
1557         shift();
1558         break;
1559     case CharacterSemicolon:
1560         shift();
1561         token = SEMICOLON;
1562         break;
1563     case CharacterOpenBrace:
1564         tokenData->line = lineNumber();
1565         tokenData->offset = currentOffset();
1566         tokenData->lineStartOffset = currentLineStartOffset();
1567         ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1568         shift();
1569         token = OPENBRACE;
1570         break;
1571     case CharacterCloseBrace:
1572         tokenData->line = lineNumber();
1573         tokenData->offset = currentOffset();
1574         tokenData->lineStartOffset = currentLineStartOffset();
1575         ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1576         shift();
1577         token = CLOSEBRACE;
1578         break;
1579     case CharacterDot:
1580         shift();
1581         if (!isASCIIDigit(m_current)) {
1582             token = DOT;
1583             break;
1584         }
1585         goto inNumberAfterDecimalPoint;
1586     case CharacterZero:
1587         shift();
1588         if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
1589             parseHex(tokenData->doubleValue);
1590             token = NUMBER;
1591         } else {
1592             record8('0');
1593             if (isASCIIOctalDigit(m_current)) {
1594                 if (parseOctal(tokenData->doubleValue)) {
1595                     if (strictMode) {
1596                         m_lexErrorMessage = "Octal escapes are forbidden in strict mode";
1597                         token = INVALID_OCTAL_NUMBER_ERRORTOK;
1598                         goto returnError;
1599                     }
1600                     token = NUMBER;
1601                 }
1602             }
1603         }
1604         // Fall through into CharacterNumber
1605     case CharacterNumber:
1606         if (LIKELY(token != NUMBER)) {
1607             if (!parseDecimal(tokenData->doubleValue)) {
1608                 if (m_current == '.') {
1609                     shift();
1610 inNumberAfterDecimalPoint:
1611                     parseNumberAfterDecimalPoint();
1612                 }
1613                 if ((m_current | 0x20) == 'e') {
1614                     if (!parseNumberAfterExponentIndicator()) {
1615                         m_lexErrorMessage = "Non-number found after exponent indicator";
1616                         token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1617                         goto returnError;
1618                     }
1619                 }
1620                 size_t parsedLength;
1621                 tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
1622             }
1623             token = NUMBER;
1624         }
1625
1626         // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1627         if (UNLIKELY(isIdentStart(m_current))) {
1628             m_lexErrorMessage = "At least one digit must occur after a decimal point";
1629             token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1630             goto returnError;
1631         }
1632         m_buffer8.resize(0);
1633         break;
1634     case CharacterQuote:
1635         if (lexerFlags & LexerFlagsDontBuildStrings) {
1636             StringParseResult result = parseString<false>(tokenData, strictMode);
1637             if (UNLIKELY(result != StringParsedSuccessfully)) {
1638                 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1639                 goto returnError;
1640             }
1641         } else {
1642             StringParseResult result = parseString<true>(tokenData, strictMode);
1643             if (UNLIKELY(result != StringParsedSuccessfully)) {
1644                 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1645                 goto returnError;
1646             }
1647         }
1648         shift();
1649         token = STRING;
1650         break;
1651     case CharacterIdentifierStart:
1652         ASSERT(isIdentStart(m_current));
1653         // Fall through into CharacterBackSlash.
1654     case CharacterBackSlash:
1655         if (lexerFlags & LexexFlagsDontBuildKeywords)
1656             token = parseIdentifier<false>(tokenData, lexerFlags, strictMode);
1657         else
1658             token = parseIdentifier<true>(tokenData, lexerFlags, strictMode);
1659         break;
1660     case CharacterLineTerminator:
1661         ASSERT(isLineTerminator(m_current));
1662         shiftLineTerminator();
1663         m_atLineStart = true;
1664         m_terminator = true;
1665         m_lineStart = m_code;
1666         goto start;
1667     case CharacterInvalid:
1668         m_lexErrorMessage = invalidCharacterMessage();
1669         token = ERRORTOK;
1670         goto returnError;
1671     default:
1672         RELEASE_ASSERT_NOT_REACHED();
1673         m_lexErrorMessage = "Internal Error";
1674         token = ERRORTOK;
1675         goto returnError;
1676     }
1677
1678     m_atLineStart = false;
1679     goto returnToken;
1680
1681 inSingleLineComment:
1682     while (!isLineTerminator(m_current)) {
1683         if (atEnd())
1684             return EOFTOK;
1685         shift();
1686     }
1687     shiftLineTerminator();
1688     m_atLineStart = true;
1689     m_terminator = true;
1690     m_lineStart = m_code;
1691     if (!lastTokenWasRestrKeyword())
1692         goto start;
1693
1694     token = SEMICOLON;
1695     // Fall through into returnToken.
1696
1697 returnToken:
1698     tokenLocation->line = m_lineNumber;
1699     tokenLocation->endOffset = currentOffset();
1700     tokenLocation->lineStartOffset = currentLineStartOffset();
1701     ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1702     m_lastToken = token;
1703     return token;
1704
1705 returnError:
1706     m_error = true;
1707     tokenLocation->line = m_lineNumber;
1708     tokenLocation->endOffset = currentOffset();
1709     tokenLocation->lineStartOffset = currentLineStartOffset();
1710     ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1711     RELEASE_ASSERT(token & ErrorTokenFlag);
1712     return token;
1713 }
1714
1715 template <typename T>
1716 static inline void orCharacter(UChar&, UChar);
1717
1718 template <>
1719 inline void orCharacter<LChar>(UChar&, UChar) { }
1720
1721 template <>
1722 inline void orCharacter<UChar>(UChar& orAccumulator, UChar character)
1723 {
1724     orAccumulator |= character;
1725 }
1726
1727 template <typename T>
1728 bool Lexer<T>::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1729 {
1730     ASSERT(m_buffer16.isEmpty());
1731
1732     bool lastWasEscape = false;
1733     bool inBrackets = false;
1734     UChar charactersOredTogether = 0;
1735
1736     if (patternPrefix) {
1737         ASSERT(!isLineTerminator(patternPrefix));
1738         ASSERT(patternPrefix != '/');
1739         ASSERT(patternPrefix != '[');
1740         record16(patternPrefix);
1741     }
1742
1743     while (true) {
1744         if (isLineTerminator(m_current) || atEnd()) {
1745             m_buffer16.resize(0);
1746             return false;
1747         }
1748
1749         T prev = m_current;
1750         
1751         shift();
1752
1753         if (prev == '/' && !lastWasEscape && !inBrackets)
1754             break;
1755
1756         record16(prev);
1757         orCharacter<T>(charactersOredTogether, prev);
1758
1759         if (lastWasEscape) {
1760             lastWasEscape = false;
1761             continue;
1762         }
1763
1764         switch (prev) {
1765         case '[':
1766             inBrackets = true;
1767             break;
1768         case ']':
1769             inBrackets = false;
1770             break;
1771         case '\\':
1772             lastWasEscape = true;
1773             break;
1774         }
1775     }
1776
1777     pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1778
1779     m_buffer16.resize(0);
1780     charactersOredTogether = 0;
1781
1782     while (isIdentPart(m_current)) {
1783         record16(m_current);
1784         orCharacter<T>(charactersOredTogether, m_current);
1785         shift();
1786     }
1787
1788     flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1789     m_buffer16.resize(0);
1790
1791     return true;
1792 }
1793
1794 template <typename T>
1795 bool Lexer<T>::skipRegExp()
1796 {
1797     bool lastWasEscape = false;
1798     bool inBrackets = false;
1799
1800     while (true) {
1801         if (isLineTerminator(m_current) || atEnd())
1802             return false;
1803
1804         T prev = m_current;
1805         
1806         shift();
1807
1808         if (prev == '/' && !lastWasEscape && !inBrackets)
1809             break;
1810
1811         if (lastWasEscape) {
1812             lastWasEscape = false;
1813             continue;
1814         }
1815
1816         switch (prev) {
1817         case '[':
1818             inBrackets = true;
1819             break;
1820         case ']':
1821             inBrackets = false;
1822             break;
1823         case '\\':
1824             lastWasEscape = true;
1825             break;
1826         }
1827     }
1828
1829     while (isIdentPart(m_current))
1830         shift();
1831
1832     return true;
1833 }
1834
1835 template <typename T>
1836 void Lexer<T>::clear()
1837 {
1838     m_arena = 0;
1839
1840     Vector<LChar> newBuffer8;
1841     m_buffer8.swap(newBuffer8);
1842
1843     Vector<UChar> newBuffer16;
1844     m_buffer16.swap(newBuffer16);
1845
1846     m_isReparsing = false;
1847 }
1848
1849 template <typename T>
1850 SourceCode Lexer<T>::sourceCode(int openBrace, int closeBrace, int firstLine, unsigned startColumn)
1851 {
1852     ASSERT(m_source->provider()->source()[openBrace] == '{');
1853     ASSERT(m_source->provider()->source()[closeBrace] == '}');
1854     return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine, startColumn);
1855 }
1856
1857 // Instantiate the two flavors of Lexer we need instead of putting most of this file in Lexer.h
1858 template class Lexer<LChar>;
1859 template class Lexer<UChar>;
1860
1861 } // namespace JSC