1 // -*- c-basic-offset: 2 -*-
3 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
4 * Copyright (C) 2006, 2007 Apple Inc. All Rights Reserved.
5 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
28 #include "interpreter.h"
33 #include <wtf/Assertions.h>
34 #include <wtf/unicode/Unicode.h>
37 using namespace Unicode;
39 // we can't specify the namespace in yacc's C output, so do it here
47 #include "lexer.lut.h"
49 extern YYLTYPE kjsyylloc; // global bison variable holding token info
51 // a bridge for yacc from the C world to C++
59 static bool isDecimalDigit(int);
// Fragment of the routine that hands out the shared Lexer instance.
// NOTE(review): the function header and return statement are not visible in
// this excerpt — confirm the signature against the full file.
// The calling thread is expected to hold the JSLock already; that is only
// asserted here, not acquired.
63 ASSERT(JSLock::currentThreadIsHoldingLock());
65 // FIXME: We'd like to avoid calling new here, but we don't currently
66 // support tearing down the Lexer at app quit time, since that would involve
67 // tearing down its UString data members without holding the JSLock.
68 static Lexer* staticLexer = new Lexer;
// Member-initializer list and body of a constructor whose header is not
// visible in this excerpt (NOTE(review): presumably Lexer::Lexer() — confirm).
// stackToken starts at -1, which the scanning code treats as "no pushed-back
// token" (it tests stackToken >= 0). The string and identifier tables start
// empty with zero capacity.
74 size8(128), size16(128), restrKeyword(false),
75 eatNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0),
80 current(0), next1(0), next2(0), next3(0),
81 strings(0), numStrings(0), stringsCapacity(0),
82 identifiers(0), numIdentifiers(0), identifiersCapacity(0)
84 // allocate space for read buffers
// Both scratch buffers start with 128 elements (size8 / size16 above);
// record8() and record16() allocate a buffer twice as large when they fill.
85 buffer8 = new char[size8];
86 buffer16 = new KJS::UChar[size16];
// Prepares the lexer to scan a new piece of source text.
//   sourceURL          - copied into m_sourceURL.
//   startingLineNumber - the line counter (yylineno) starts at this value + 1.
//   c, len             - the source characters and their count.
//                        NOTE(review): the statements that store these (the
//                        code below reads `code` and `length`) are not visible
//                        in this excerpt — confirm.
95 void Lexer::setCode(const UString &sourceURL, int startingLineNumber, const KJS::UChar *c, unsigned int len)
97 yylineno = 1 + startingLineNumber;
98 m_sourceURL = sourceURL;
101 eatNextIdentifier = false;
110 #ifndef KJS_PURE_ECMA
114 // read first characters
// Prime the four-character lookahead window. -1 marks a position at or past
// the end of the input.
115 current = (length > 0) ? code[0].uc : -1;
116 next1 = (length > 1) ? code[1].uc : -1;
117 next2 = (length > 2) ? code[2].uc : -1;
118 next3 = (length > 3) ? code[3].uc : -1;
// Moves the lookahead window forward.
// NOTE(review): most of the body (including how `p` is used) is not visible in
// this excerpt; presumably p is the number of characters to advance — confirm.
121 void Lexer::shift(unsigned int p)
123 // Here would be a good place to strip Cf characters, but that has caused compatibility problems:
124 // <http://bugs.webkit.org/show_bug.cgi?id=10183>.
// Refill the last lookahead slot from code[pos + 3], or use -1 when that
// index lies beyond the input.
130 next3 = (pos + 3 < length) ? code[pos + 3].uc : -1;
134 // called on each new line
// NOTE(review): the body is not visible in this excerpt. Only the opening of a
// section compiled when KJS_PURE_ECMA is not defined can be seen.
135 void Lexer::nextLine()
138 #ifndef KJS_PURE_ECMA
// NOTE(review): the body is not visible in this excerpt. The scanning code
// calls this with a State value such as IdentifierOrKeyword or Identifier when
// an identifier run ends; presumably it records the final state of the token —
// confirm against the full file.
143 void Lexer::setDone(State s)
// Fragment of the main scanning routine.
// NOTE(review): the function header is not visible in this excerpt
// (presumably Lexer::lex() — confirm). Many interior lines are also missing,
// including most case labels, braces and the statements that consume or record
// characters. The comments below describe only what the visible lines show.
153 unsigned short stringType = 0; // either single or double quotes
160 // did we push a token on the stack previously ?
161 // (after an automatic semicolon insertion)
162 if (stackToken >= 0) {
169 if (skipLF && current != '\n') // found \r but not \n afterwards
171 if (skipCR && current != '\r') // found \n but not \r afterwards
173 if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
// Dispatch on the first character of a token (presumably the Start state; its
// case label is not visible).
181 if (isWhiteSpace()) {
183 } else if (current == '/' && next1 == '/') {
185 state = InSingleLineComment;
186 } else if (current == '/' && next1 == '*') {
188 state = InMultiLineComment;
// -1 is the end-of-input marker placed in the lookahead window by setCode()
// and shift().
189 } else if (current == -1) {
190 if (!terminator && !delimited) {
191 // automatic semicolon insertion if program incomplete
197 } else if (isLineTerminator()) {
204 } else if (current == '"' || current == '\'') {
// Remember which quote opened the string so only the same quote closes it.
206 stringType = static_cast<unsigned short>(current);
207 } else if (isIdentStart(current)) {
209 state = InIdentifierOrKeyword;
210 } else if (current == '\\') {
211 state = InIdentifierUnicodeEscapeStart;
212 } else if (current == '0') {
215 } else if (isDecimalDigit(current)) {
218 } else if (current == '.' && isDecimalDigit(next1)) {
221 #ifndef KJS_PURE_ECMA
222 // <!-- marks the beginning of a line comment (for www usage)
223 } else if (current == '<' && next1 == '!' &&
224 next2 == '-' && next3 == '-') {
226 state = InSingleLineComment;
// "-->" is treated as a line comment too, but only when `bol` is set.
228 } else if (bol && current == '-' && next1 == '-' && next2 == '>') {
230 state = InSingleLineComment;
233 token = matchPunctuator(current, next1, next2, next3);
237 // cerr << "encountered unknown character" << endl;
// String body (presumably the InString state; its case label is not visible).
243 if (current == stringType) {
246 } else if (isLineTerminator() || current == -1) {
248 } else if (current == '\\') {
249 state = InEscapeSequence;
254 // Escape Sequences inside of strings
255 case InEscapeSequence:
256 if (isOctalDigit(current)) {
// Octal escapes of one to three digits. The three-digit form requires a first
// digit of 0-3, which keeps the value at or below 3*64 + 7*8 + 7 = 255.
257 if (current >= '0' && current <= '3' &&
258 isOctalDigit(next1) && isOctalDigit(next2)) {
259 record16(convertOctal(current, next1, next2));
262 } else if (isOctalDigit(current) && isOctalDigit(next1)) {
263 record16(convertOctal('0', current, next1));
266 } else if (isOctalDigit(current)) {
267 record16(convertOctal('0', '0', current));
272 } else if (current == 'x')
274 else if (current == 'u')
275 state = InUnicodeEscape;
276 else if (isLineTerminator()) {
280 record16(singleEscape(static_cast<unsigned short>(current)));
// Two hex digits form one code unit (presumably the \x escape state; its case
// label is not visible).
285 if (isHexDigit(current) && isHexDigit(next1)) {
287 record16(convertHex(current, next1));
289 } else if (current == stringType) {
299 case InUnicodeEscape:
300 if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
301 record16(convertUnicode(current, next1, next2, next3));
304 } else if (current == stringType) {
312 case InSingleLineComment:
313 if (isLineTerminator()) {
321 } else if (current == -1) {
325 case InMultiLineComment:
328 } else if (isLineTerminator()) {
330 } else if (current == '*' && next1 == '/') {
335 case InIdentifierOrKeyword:
337 if (isIdentPart(current))
339 else if (current == '\\')
340 state = InIdentifierUnicodeEscapeStart;
// An identifier that never left InIdentifierOrKeyword may still be a keyword.
// Any other state here is reported as a plain Identifier.
342 setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
// Numeric literal handling (case labels not visible). This chain presumably
// follows a leading '0': it checks for x/X, '.', an exponent, then further
// octal or decimal digits.
345 if (current == 'x' || current == 'X') {
348 } else if (current == '.') {
351 } else if (current == 'e' || current == 'E') {
353 state = InExponentIndicator;
354 } else if (isOctalDigit(current)) {
357 } else if (isDecimalDigit(current)) {
365 if (isHexDigit(current)) {
372 if (isOctalDigit(current)) {
375 else if (isDecimalDigit(current)) {
382 if (isDecimalDigit(current)) {
384 } else if (current == '.') {
387 } else if (current == 'e' || current == 'E') {
389 state = InExponentIndicator;
394 if (isDecimalDigit(current)) {
396 } else if (current == 'e' || current == 'E') {
398 state = InExponentIndicator;
402 case InExponentIndicator:
403 if (current == '+' || current == '-') {
405 } else if (isDecimalDigit(current)) {
412 if (isDecimalDigit(current)) {
417 case InIdentifierUnicodeEscapeStart:
419 state = InIdentifierUnicodeEscape;
423 case InIdentifierUnicodeEscape:
424 if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
425 record16(convertUnicode(current, next1, next2, next3));
427 state = InIdentifier;
433 ASSERT(!"Unhandled state in switch statement");
436 // move on to the next character
439 #ifndef KJS_PURE_ECMA
440 if (state != Start && state != InSingleLineComment)
445 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
446 if ((state == Number || state == Octal || state == Hex) && isIdentStart(current))
// NUL-terminate the 8-bit buffer so it can be used as a C string below
// (strtod, fprintf "%s", and the digit loops).
450 buffer8[pos8] = '\0';
453 fprintf(stderr, "line: %d ", lineNo());
454 fprintf(stderr, "yytext (%x): ", buffer8[0]);
455 fprintf(stderr, "%s ", buffer8);
// Convert the recorded digits to a double according to the literal's kind.
459 if (state == Number) {
460 dval = strtod(buffer8, 0L);
461 } else if (state == Hex) { // scan hex numbers
// Skip the first two recorded characters (presumably the "0x" prefix).
462 const char *p = buffer8 + 2;
463 while (char c = *p++) {
465 dval += convertHex(c);
// When the loop ends, p points one past the terminating NUL, so
// p - (buffer8 + 3) is the number of digits after the two skipped characters.
// Values at or above mantissaOverflowLowerBound are re-parsed by
// parseIntOverflow().
468 if (dval >= mantissaOverflowLowerBound)
469 dval = parseIntOverflow(buffer8 + 2, p - (buffer8 + 3), 16);
472 } else if (state == Octal) { // scan octal number
// Same scheme as the hex case, skipping one leading character.
473 const char *p = buffer8 + 1;
474 while (char c = *p++) {
479 if (dval >= mantissaOverflowLowerBound)
480 dval = parseIntOverflow(buffer8 + 1, p - (buffer8 + 2), 8);
494 printf("(Identifier)/(Keyword)\n");
497 printf("(String)\n");
500 printf("(Number)\n");
507 if (state != Identifier && eatNextIdentifier)
508 eatNextIdentifier = false;
510 restrKeyword = false;
512 kjsyylloc.first_line = yylineno; // ???
513 kjsyylloc.last_line = yylineno;
520 if(token == '}' || token == ';') {
524 case IdentifierOrKeyword:
// Lookup::find() yields a negative value when the text is not in mainTable.
525 if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
527 // Lookup for keyword failed, means this is an identifier
528 // Apply anonymous-function hack below (eat the identifier)
529 if (eatNextIdentifier) {
530 eatNextIdentifier = false;
534 kjsyylval.ident = makeIdentifier(buffer16, pos16);
539 eatNextIdentifier = false;
540 // Hack for "f = function somename() { ... }", too hard to get into the grammar
541 if (token == FUNCTION && lastToken == '=' )
542 eatNextIdentifier = true;
// NOTE(review): the statement guarded by this condition is not visible in this
// excerpt. Presumably it sets restrKeyword for these four keywords — confirm.
544 if (token == CONTINUE || token == BREAK ||
545 token == RETURN || token == THROW)
549 kjsyylval.string = makeUString(buffer16, pos16);
553 kjsyylval.doubleValue = dval;
558 fprintf(stderr, "yylex: ERROR.\n");
563 ASSERT(!"unhandled numeration value in switch");
571 bool Lexer::isWhiteSpace() const
573 return current == '\t' || current == 0x0b || current == 0x0c || isSeparatorSpace(current);
// Returns true when the current lookahead character is a line terminator:
// carriage return, line feed, U+2028 or U+2029.
// NOTE(review): the lines between the `lf` assignment and the return are not
// visible in this excerpt. The function is not const and the scanning code
// reads skipLF / skipCR after a terminator, so presumably those flags are
// updated here — confirm against the full file.
576 bool Lexer::isLineTerminator()
578 bool cr = (current == '\r');
579 bool lf = (current == '\n');
584 return cr || lf || current == 0x2028 || current == 0x2029;
587 bool Lexer::isIdentStart(int c)
589 return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other))
590 || c == '$' || c == '_';
593 bool Lexer::isIdentPart(int c)
595 return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
596 | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector))
597 || c == '$' || c == '_';
// Returns true when c is an ASCII decimal digit ('0'..'9').
// Takes an int so the lexer's -1 end-of-input marker can be passed directly;
// it is simply rejected.
static bool isDecimalDigit(int c)
{
  return (c >= '0' && c <= '9');
}
605 bool Lexer::isHexDigit(int c)
607 return (c >= '0' && c <= '9' ||
608 c >= 'a' && c <= 'f' ||
609 c >= 'A' && c <= 'F');
612 bool Lexer::isOctalDigit(int c)
614 return (c >= '0' && c <= '7');
// Identifies the punctuator that starts at c1, using up to three further
// lookahead characters (c2..c4).
// Longer operators are tested before their prefixes: the four-character
// ">>>=" first, then the three-character forms, then the two-character forms.
// That way an operator is never split into a shorter one plus a remainder.
// NOTE(review): the body of almost every branch (the characters consumed and
// the token returned) is not visible in this excerpt.
617 int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
619 if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
622 } else if (c1 == '=' && c2 == '=' && c3 == '=') {
625 } else if (c1 == '!' && c2 == '=' && c3 == '=') {
628 } else if (c1 == '>' && c2 == '>' && c3 == '>') {
631 } else if (c1 == '<' && c2 == '<' && c3 == '=') {
634 } else if (c1 == '>' && c2 == '>' && c3 == '=') {
637 } else if (c1 == '<' && c2 == '=') {
640 } else if (c1 == '>' && c2 == '=') {
643 } else if (c1 == '!' && c2 == '=') {
646 } else if (c1 == '+' && c2 == '+') {
652 } else if (c1 == '-' && c2 == '-') {
// One possible result for "--". The condition that selects it is not visible.
655 return AUTOMINUSMINUS;
658 } else if (c1 == '=' && c2 == '=') {
661 } else if (c1 == '+' && c2 == '=') {
664 } else if (c1 == '-' && c2 == '=') {
667 } else if (c1 == '*' && c2 == '=') {
670 } else if (c1 == '/' && c2 == '=') {
673 } else if (c1 == '&' && c2 == '=') {
676 } else if (c1 == '^' && c2 == '=') {
679 } else if (c1 == '%' && c2 == '=') {
682 } else if (c1 == '|' && c2 == '=') {
685 } else if (c1 == '<' && c2 == '<') {
688 } else if (c1 == '>' && c2 == '>') {
691 } else if (c1 == '&' && c2 == '&') {
694 } else if (c1 == '|' && c2 == '|') {
// Here the character code itself is returned as the token. The conditions
// guarding this return are not visible in this excerpt.
725 return static_cast<int>(c1);
// NOTE(review): the body is not visible in this excerpt. The escape-sequence
// handling in the scanning code records the value this returns for the
// character that follows a backslash; presumably it maps escape letters to
// the characters they denote — confirm against the full file.
731 unsigned short Lexer::singleEscape(unsigned short c)
757 unsigned short Lexer::convertOctal(int c1, int c2, int c3)
759 return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
762 unsigned char Lexer::convertHex(int c)
764 if (c >= '0' && c <= '9')
765 return static_cast<unsigned char>(c - '0');
766 if (c >= 'a' && c <= 'f')
767 return static_cast<unsigned char>(c - 'a' + 10);
768 return static_cast<unsigned char>(c - 'A' + 10);
771 unsigned char Lexer::convertHex(int c1, int c2)
773 return ((convertHex(c1) << 4) + convertHex(c2));
776 KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
778 // FIXME: This conversion is lossy. See http://bugs.webkit.org/show_bug.cgi?id=4920.
779 return KJS::UChar((convertHex(c1) << 4) + convertHex(c2),
780 (convertHex(c3) << 4) + convertHex(c4));
// Appends one character to the 8-bit scratch buffer (buffer8).
// NOTE(review): several lines are not visible in this excerpt — anything
// between the header and the growth check, and the statements that follow the
// memcpy (presumably releasing the old buffer and updating buffer8 / size8).
783 void Lexer::record8(int c)
788 // enlarge buffer if full
// Comparing against size8 - 1 keeps one slot free. The scanning code later
// writes a terminating '\0' at buffer8[pos8].
789 if (pos8 >= size8 - 1) {
790 char *tmp = new char[2 * size8];
791 memcpy(tmp, buffer8, size8 * sizeof(char));
797 buffer8[pos8++] = (char) c;
// Convenience overload: records an int character value by converting it to a
// UChar and forwarding to record16(KJS::UChar).
// The value must fit in an unsigned short; this is asserted, not enforced.
// NOTE(review): one line before the visible ASSERT is missing from this
// excerpt.
800 void Lexer::record16(int c)
803 ASSERT(c <= USHRT_MAX);
804 record16(UChar(static_cast<unsigned short>(c)));
// Appends one UChar to the 16-bit scratch buffer (buffer16), which collects
// the text of identifiers, strings and regular expressions.
// NOTE(review): the statements following the memcpy (presumably releasing the
// old buffer and updating buffer16 / size16) are not visible in this excerpt.
807 void Lexer::record16(KJS::UChar c)
809 // enlarge buffer if full
// Grow into a buffer twice as large and copy the existing contents across.
810 if (pos16 >= size16 - 1) {
811 KJS::UChar *tmp = new KJS::UChar[2 * size16];
812 memcpy(tmp, buffer16, size16 * sizeof(KJS::UChar));
818 buffer16[pos16++] = c;
// Scans a regular-expression literal starting at the current character and
// stores its text in m_pattern and its trailing flags in m_flags.
// NOTE(review): the loop header, the statements that record and consume
// characters, and the return statements are not visible in this excerpt.
821 bool Lexer::scanRegExp()
824 bool lastWasEscape = false;
825 bool inBrackets = false;
// A line terminator or end of input (-1) inside the literal is tested first.
// Its handling is not visible; presumably the scan fails.
828 if (isLineTerminator() || current == -1)
// A '/' ends the pattern only when it is neither escaped nor inside a [...]
// character class. Every other character belongs to the pattern.
830 else if (current != '/' || lastWasEscape == true || inBrackets == true)
832 // keep track of '[' and ']'
833 if (!lastWasEscape) {
834 if ( current == '[' && !inBrackets )
836 if ( current == ']' && inBrackets )
// Continuation of an assignment whose first line is not visible: a backslash
// counts as an escape only when it is not itself escaped.
841 !lastWasEscape && (current == '\\');
843 else { // end of regexp
844 m_pattern = UString(buffer16, pos16);
// Characters accepted by isIdentPart() after the closing '/' are the flags.
852 while (isIdentPart(current)) {
856 m_flags = UString(buffer16, pos16);
// Fragment of a cleanup routine whose header is not visible in this excerpt
// (NOTE(review): presumably Lexer::clear() — confirm).
// Deletes every Identifier recorded in the identifiers table, then releases
// the table itself with fastFree (it is grown with fastRealloc in
// makeIdentifier()) and resets its capacity.
863 for (unsigned i = 0; i < numIdentifiers; i++)
864 delete identifiers[i];
865 fastFree(identifiers);
868 identifiersCapacity = 0;
// The loop body for the strings table is not visible; presumably it mirrors
// the identifier cleanup above.
870 for (unsigned i = 0; i < numStrings; i++)
// Growth policy for the identifier and string pointer tables used by
// makeIdentifier() and makeUString(): the first allocation holds
// initialCapacity entries, and each later growth multiplies the capacity by
// growthFactor.
const int initialCapacity = 64;
const int growthFactor = 2;
// Creates a heap-allocated Identifier from `pos` characters of `buffer` and
// records the pointer in the lexer's identifiers table, so the cleanup code
// (which runs `delete identifiers[i]` over the table) can release it later.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the new Identifier is returned — confirm.
885 Identifier* Lexer::makeIdentifier(KJS::UChar* buffer, unsigned int pos)
// Grow the table when full: initialCapacity entries the first time, then
// growthFactor times the previous capacity.
887 if (numIdentifiers == identifiersCapacity) {
888 identifiersCapacity = (identifiersCapacity == 0) ? initialCapacity : identifiersCapacity *growthFactor;
889 identifiers = (KJS::Identifier **)fastRealloc(identifiers, sizeof(KJS::Identifier *) * identifiersCapacity);
892 KJS::Identifier *identifier = new KJS::Identifier(buffer, pos);
893 identifiers[numIdentifiers++] = identifier;
// Creates a heap-allocated UString from `pos` characters of `buffer` and
// records the pointer in the lexer's strings table.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the new UString is returned — confirm.
897 UString* Lexer::makeUString(KJS::UChar* buffer, unsigned int pos)
// Grow the table when full: initialCapacity entries the first time, then
// growthFactor times the previous capacity.
899 if (numStrings == stringsCapacity) {
900 stringsCapacity = (stringsCapacity == 0) ? initialCapacity : stringsCapacity *growthFactor;
901 strings = (UString **)fastRealloc(strings, sizeof(UString *) * stringsCapacity);
904 UString *string = new UString(buffer, pos);
905 strings[numStrings++] = string;