1 // -*- c-basic-offset: 2 -*-
3 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
4 * Copyright (C) 2006, 2007 Apple Inc. All Rights Reserved.
5 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
32 #include <wtf/Assertions.h>
33 #include <wtf/unicode/Unicode.h>
36 using namespace Unicode;
38 // we can't specify the namespace in yacc's C output, so do it here
46 #include "lexer.lut.h"
48 extern YYLTYPE kjsyylloc; // global bison variable holding token info
50 // a bridge for yacc from the C world to C++
58 static bool isDecimalDigit(int);
60 static const size_t initialReadBufferCapacity = 32;
61 static const size_t initialStringTableCapacity = 64;
65 ASSERT(JSLock::currentThreadIsHoldingLock());
67 // FIXME: We'd like to avoid calling new here, but we don't currently
68 // support tearing down the Lexer at app quit time, since that would involve
69 // tearing down its UString data members without holding the JSLock.
70 static Lexer* staticLexer = new Lexer;
77 , eatNextIdentifier(false)
91 m_buffer8.reserveCapacity(initialReadBufferCapacity);
92 m_buffer16.reserveCapacity(initialReadBufferCapacity);
93 m_strings.reserveCapacity(initialStringTableCapacity);
94 m_identifiers.reserveCapacity(initialStringTableCapacity);
// Prepares the lexer to scan a new piece of source text.
// The visible statements record the source URL, set yylineno to
// 1 + startingLineNumber, clear the restrKeyword and eatNextIdentifier flags,
// and prime the four-character lookahead window (current, next1..next3) from
// code[0..3], using -1 for any slot that lies past the end of the input.
// NOTE(review): several statements are missing from this view (including
// whatever stores `c`/`len` into `code`/`length` and the body of the
// KJS_PURE_ECMA conditional) -- confirm against the full file.
97 void Lexer::setCode(const UString &sourceURL, int startingLineNumber, const KJS::UChar *c, unsigned int len)
99 yylineno = 1 + startingLineNumber;
100 m_sourceURL = sourceURL;
101 restrKeyword = false;
103 eatNextIdentifier = false;
112 #ifndef KJS_PURE_ECMA
116 // read first characters
117 current = (length > 0) ? code[0].uc : -1;
118 next1 = (length > 1) ? code[1].uc : -1;
119 next2 = (length > 2) ? code[2].uc : -1;
120 next3 = (length > 3) ? code[3].uc : -1;
// Advances the lookahead window.
// The one visible statement refills next3 from code[pos + 3], or -1 when that
// index is at or past `length`.
// NOTE(review): the loop over `p` and the updates of pos/current/next1/next2
// are not visible in this view -- presumably the window slides one character
// per step; confirm against the full file.
123 void Lexer::shift(unsigned int p)
125 // Here would be a good place to strip Cf characters, but that has caused compatibility problems:
126 // <http://bugs.webkit.org/show_bug.cgi?id=10183>.
132 next3 = (pos + 3 < length) ? code[pos + 3].uc : -1;
136 // called on each new line
137 void Lexer::nextLine()
140 #ifndef KJS_PURE_ECMA
145 void Lexer::setDone(State s)
155 unsigned short stringType = 0; // either single or double quotes
163 // did we push a token on the stack previously ?
164 // (after an automatic semicolon insertion)
165 if (stackToken >= 0) {
172 if (skipLF && current != '\n') // found \r but not \n afterwards
174 if (skipCR && current != '\r') // found \n but not \r afterwards
176 if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
184 if (isWhiteSpace()) {
186 } else if (current == '/' && next1 == '/') {
188 state = InSingleLineComment;
189 } else if (current == '/' && next1 == '*') {
191 state = InMultiLineComment;
192 } else if (current == -1) {
193 if (!terminator && !delimited) {
194 // automatic semicolon insertion if program incomplete
200 } else if (isLineTerminator()) {
207 } else if (current == '"' || current == '\'') {
209 stringType = static_cast<unsigned short>(current);
210 } else if (isIdentStart(current)) {
212 state = InIdentifierOrKeyword;
213 } else if (current == '\\') {
214 state = InIdentifierUnicodeEscapeStart;
215 } else if (current == '0') {
218 } else if (isDecimalDigit(current)) {
221 } else if (current == '.' && isDecimalDigit(next1)) {
224 #ifndef KJS_PURE_ECMA
225 // <!-- marks the beginning of a line comment (for www usage)
226 } else if (current == '<' && next1 == '!' &&
227 next2 == '-' && next3 == '-') {
229 state = InSingleLineComment;
231 } else if (bol && current == '-' && next1 == '-' && next2 == '>') {
233 state = InSingleLineComment;
236 token = matchPunctuator(current, next1, next2, next3);
240 // cerr << "encountered unknown character" << endl;
246 if (current == stringType) {
249 } else if (isLineTerminator() || current == -1) {
251 } else if (current == '\\') {
252 state = InEscapeSequence;
257 // Escape Sequences inside of strings
258 case InEscapeSequence:
259 if (isOctalDigit(current)) {
260 if (current >= '0' && current <= '3' &&
261 isOctalDigit(next1) && isOctalDigit(next2)) {
262 record16(convertOctal(current, next1, next2));
265 } else if (isOctalDigit(current) && isOctalDigit(next1)) {
266 record16(convertOctal('0', current, next1));
269 } else if (isOctalDigit(current)) {
270 record16(convertOctal('0', '0', current));
275 } else if (current == 'x')
277 else if (current == 'u')
278 state = InUnicodeEscape;
279 else if (isLineTerminator()) {
283 record16(singleEscape(static_cast<unsigned short>(current)));
288 if (isHexDigit(current) && isHexDigit(next1)) {
290 record16(convertHex(current, next1));
292 } else if (current == stringType) {
302 case InUnicodeEscape:
303 if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
304 record16(convertUnicode(current, next1, next2, next3));
307 } else if (current == stringType) {
315 case InSingleLineComment:
316 if (isLineTerminator()) {
324 } else if (current == -1) {
328 case InMultiLineComment:
331 } else if (isLineTerminator()) {
333 } else if (current == '*' && next1 == '/') {
338 case InIdentifierOrKeyword:
340 if (isIdentPart(current))
342 else if (current == '\\')
343 state = InIdentifierUnicodeEscapeStart;
345 setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
348 if (current == 'x' || current == 'X') {
351 } else if (current == '.') {
354 } else if (current == 'e' || current == 'E') {
356 state = InExponentIndicator;
357 } else if (isOctalDigit(current)) {
360 } else if (isDecimalDigit(current)) {
368 if (isHexDigit(current)) {
375 if (isOctalDigit(current)) {
378 else if (isDecimalDigit(current)) {
385 if (isDecimalDigit(current)) {
387 } else if (current == '.') {
390 } else if (current == 'e' || current == 'E') {
392 state = InExponentIndicator;
397 if (isDecimalDigit(current)) {
399 } else if (current == 'e' || current == 'E') {
401 state = InExponentIndicator;
405 case InExponentIndicator:
406 if (current == '+' || current == '-') {
408 } else if (isDecimalDigit(current)) {
415 if (isDecimalDigit(current)) {
420 case InIdentifierUnicodeEscapeStart:
422 state = InIdentifierUnicodeEscape;
426 case InIdentifierUnicodeEscape:
427 if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
428 record16(convertUnicode(current, next1, next2, next3));
430 state = InIdentifier;
436 ASSERT(!"Unhandled state in switch statement");
439 // move on to the next character
442 #ifndef KJS_PURE_ECMA
443 if (state != Start && state != InSingleLineComment)
448 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
449 if ((state == Number || state == Octal || state == Hex) && isIdentStart(current))
453 m_buffer8.append('\0');
456 fprintf(stderr, "line: %d ", lineNo());
457 fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
458 fprintf(stderr, "%s ", buffer8.data());
462 if (state == Number) {
463 dval = strtod(m_buffer8.data(), 0L);
464 } else if (state == Hex) { // scan hex numbers
465 const char* p = m_buffer8.data() + 2;
466 while (char c = *p++) {
468 dval += convertHex(c);
471 if (dval >= mantissaOverflowLowerBound)
472 dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16);
475 } else if (state == Octal) { // scan octal number
476 const char* p = m_buffer8.data() + 1;
477 while (char c = *p++) {
482 if (dval >= mantissaOverflowLowerBound)
483 dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8);
497 printf("(Identifier)/(Keyword)\n");
500 printf("(String)\n");
503 printf("(Number)\n");
510 if (state != Identifier && eatNextIdentifier)
511 eatNextIdentifier = false;
513 restrKeyword = false;
515 kjsyylloc.first_line = yylineno; // ???
516 kjsyylloc.last_line = yylineno;
523 if(token == '}' || token == ';') {
527 case IdentifierOrKeyword:
528 if ((token = Lookup::find(&mainTable, m_buffer16.data(), m_buffer16.size())) < 0) {
530 // Lookup for keyword failed, means this is an identifier
531 // Apply anonymous-function hack below (eat the identifier)
532 if (eatNextIdentifier) {
533 eatNextIdentifier = false;
537 kjsyylval.ident = makeIdentifier(m_buffer16);
542 eatNextIdentifier = false;
543 // Hack for "f = function somename() { ... }", too hard to get into the grammar
544 if (token == FUNCTION && lastToken == '=' )
545 eatNextIdentifier = true;
547 if (token == CONTINUE || token == BREAK ||
548 token == RETURN || token == THROW)
552 kjsyylval.string = makeUString(m_buffer16);
556 kjsyylval.doubleValue = dval;
561 fprintf(stderr, "yylex: ERROR.\n");
566 ASSERT(!"unhandled numeration value in switch");
574 bool Lexer::isWhiteSpace() const
576 return current == '\t' || current == 0x0b || current == 0x0c || isSeparatorSpace(current);
// Reports whether the lexer's current character is a line terminator:
// carriage return, line feed, U+2028 or U+2029.
// NOTE(review): the lines between the `lf` computation and the return are
// missing from this view. The function is non-const, so `cr`/`lf` are
// presumably used there to update the skipLF/skipCR flags that the main lex
// loop checks when pairing \r\n and \n\r -- confirm against the full file.
579 bool Lexer::isLineTerminator()
581 bool cr = (current == '\r');
582 bool lf = (current == '\n');
587 return cr || lf || current == 0x2028 || current == 0x2029;
590 bool Lexer::isIdentStart(int c)
592 return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other))
593 || c == '$' || c == '_';
596 bool Lexer::isIdentPart(int c)
598 return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
599 | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector))
600 || c == '$' || c == '_';
// Reports whether c is one of the ASCII decimal digits '0' through '9'.
// Any other value, including the lexer's -1 end-of-input marker, yields false.
static bool isDecimalDigit(int c)
{
  return (c >= '0' && c <= '9');
}
608 bool Lexer::isHexDigit(int c)
610 return (c >= '0' && c <= '9' ||
611 c >= 'a' && c <= 'f' ||
612 c >= 'A' && c <= 'F');
615 bool Lexer::isOctalDigit(int c)
617 return (c >= '0' && c <= '7');
// Matches a punctuator against up to four lookahead characters (c1..c4).
// The tests run longest-first -- the four-character ">>>=" precedes the
// three-character forms, which precede the two-character forms -- so an
// operator is never mistaken for one of its own prefixes (e.g. "===" is tried
// before "==", ">>>" before ">>").
// NOTE(review): the bodies of most branches (the token each one returns and how
// many characters it consumes) are missing from this view, as is the code
// guarding the final `return static_cast<int>(c1)`, which presumably handles
// single-character punctuators -- confirm against the full file.
620 int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
622 if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
625 } else if (c1 == '=' && c2 == '=' && c3 == '=') {
628 } else if (c1 == '!' && c2 == '=' && c3 == '=') {
631 } else if (c1 == '>' && c2 == '>' && c3 == '>') {
634 } else if (c1 == '<' && c2 == '<' && c3 == '=') {
637 } else if (c1 == '>' && c2 == '>' && c3 == '=') {
640 } else if (c1 == '<' && c2 == '=') {
643 } else if (c1 == '>' && c2 == '=') {
646 } else if (c1 == '!' && c2 == '=') {
649 } else if (c1 == '+' && c2 == '+') {
655 } else if (c1 == '-' && c2 == '-') {
658 return AUTOMINUSMINUS;
661 } else if (c1 == '=' && c2 == '=') {
664 } else if (c1 == '+' && c2 == '=') {
667 } else if (c1 == '-' && c2 == '=') {
670 } else if (c1 == '*' && c2 == '=') {
673 } else if (c1 == '/' && c2 == '=') {
676 } else if (c1 == '&' && c2 == '=') {
679 } else if (c1 == '^' && c2 == '=') {
682 } else if (c1 == '%' && c2 == '=') {
685 } else if (c1 == '|' && c2 == '=') {
688 } else if (c1 == '<' && c2 == '<') {
691 } else if (c1 == '>' && c2 == '>') {
694 } else if (c1 == '&' && c2 == '&') {
697 } else if (c1 == '|' && c2 == '|') {
728 return static_cast<int>(c1);
734 unsigned short Lexer::singleEscape(unsigned short c)
760 unsigned short Lexer::convertOctal(int c1, int c2, int c3)
762 return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
765 unsigned char Lexer::convertHex(int c)
767 if (c >= '0' && c <= '9')
768 return static_cast<unsigned char>(c - '0');
769 if (c >= 'a' && c <= 'f')
770 return static_cast<unsigned char>(c - 'a' + 10);
771 return static_cast<unsigned char>(c - 'A' + 10);
774 unsigned char Lexer::convertHex(int c1, int c2)
776 return ((convertHex(c1) << 4) + convertHex(c2));
779 KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
781 // FIXME: This conversion is lossy. See http://bugs.webkit.org/show_bug.cgi?id=4920.
782 return KJS::UChar((convertHex(c1) << 4) + convertHex(c2),
783 (convertHex(c3) << 4) + convertHex(c4));
// Appends c, narrowed to char, to the 8-bit token buffer m_buffer8.
// NOTE(review): two lines ahead of the append are missing from this view
// (presumably range assertions on c) -- confirm against the full file.
786 void Lexer::record8(int c)
790 m_buffer8.append(static_cast<char>(c));
// Appends the code unit c to the 16-bit token buffer by forwarding to the
// UChar overload. Asserts that c fits in an unsigned short before narrowing.
// NOTE(review): one line ahead of the visible assertion is missing from this
// view (presumably a lower-bound check) -- confirm against the full file.
793 void Lexer::record16(int c)
796 ASSERT(c <= USHRT_MAX);
797 record16(UChar(static_cast<unsigned short>(c)));
800 void Lexer::record16(KJS::UChar c)
802 m_buffer16.append(c);
// Scans a regular-expression literal: first the pattern, then the flags.
// A '/' ends the pattern only when it is neither preceded by an unescaped
// backslash (lastWasEscape) nor inside a '[' ... ']' character class
// (inBrackets). The pattern text is stored in m_pattern; the run of
// identifier-part characters that follows is stored in m_flags.
// NOTE(review): the loop construct, the buffer recording/shift calls, the
// bracket-state assignments and every return statement are missing from this
// view. A line terminator or end of input (-1) inside the pattern presumably
// makes the scan fail -- confirm against the full file.
805 bool Lexer::scanRegExp()
808 bool lastWasEscape = false;
809 bool inBrackets = false;
812 if (isLineTerminator() || current == -1)
814 else if (current != '/' || lastWasEscape == true || inBrackets == true)
816 // keep track of '[' and ']'
817 if (!lastWasEscape) {
818 if ( current == '[' && !inBrackets )
820 if ( current == ']' && inBrackets )
825 !lastWasEscape && (current == '\\');
826 } else { // end of regexp
827 m_pattern = UString(m_buffer16);
835 while (isIdentPart(current)) {
839 m_flags = UString(m_buffer16);
846 deleteAllValues(m_strings);
847 Vector<UString*> newStrings;
848 newStrings.reserveCapacity(initialStringTableCapacity);
849 m_strings.swap(newStrings);
851 deleteAllValues(m_identifiers);
852 Vector<KJS::Identifier*> newIdentifiers;
853 newIdentifiers.reserveCapacity(initialStringTableCapacity);
854 m_identifiers.swap(newIdentifiers);
856 Vector<char> newBuffer8;
857 newBuffer8.reserveCapacity(initialReadBufferCapacity);
858 m_buffer8.swap(newBuffer8);
860 Vector<UChar> newBuffer16;
861 newBuffer16.reserveCapacity(initialReadBufferCapacity);
862 m_buffer16.swap(newBuffer16);
// Heap-allocates a KJS::Identifier from the characters in `buffer` and records
// the pointer in m_identifiers; the lexer keeps ownership (entries are released
// with deleteAllValues(m_identifiers) elsewhere in this file).
// NOTE(review): the return statement is missing from this view; presumably the
// newly created identifier is returned -- confirm against the full file.
869 Identifier* Lexer::makeIdentifier(const Vector<KJS::UChar>& buffer)
871 KJS::Identifier* identifier = new KJS::Identifier(buffer.data(), buffer.size());
872 m_identifiers.append(identifier);
// Heap-allocates a UString from `buffer` and records the pointer in m_strings;
// the lexer keeps ownership (entries are released with
// deleteAllValues(m_strings) elsewhere in this file).
// NOTE(review): the return statement is missing from this view; presumably the
// newly created string is returned -- confirm against the full file.
876 UString* Lexer::makeUString(const Vector<KJS::UChar>& buffer)
878 UString* string = new UString(buffer);
879 m_strings.append(string);