2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
26 #include "JSFunction.h"
27 #include "JSGlobalObjectFunctions.h"
34 #include <wtf/ASCIICType.h>
35 #include <wtf/Assertions.h>
36 #include <wtf/unicode/Unicode.h>
39 using namespace Unicode;
41 // we can't specify the namespace in yacc's C output, so do it here
49 #include "Lexer.lut.h"
// Free-function entry point for the yacc-generated parser. The opaque
// globalData pointer is cast back to JSGlobalData, and lvalp / llocp are
// forwarded untouched to that object's lexer; the result of Lexer::lex()
// is returned as-is.
51 // a bridge for yacc from the C world to C++
52 int kjsyylex(void* lvalp, void* llocp, void* globalData)
54 return static_cast<JSGlobalData*>(globalData)->lexer->lex(lvalp, llocp);
// Forward declaration: lex() uses this helper before its definition, which
// appears further down in this file as a thin wrapper over isASCIIDigit().
59 static bool isDecimalDigit(int);
// Constructs a lexer bound to globalData.
// The visible initializers clear the restricted-keyword, eat-next-identifier
// and reparsing flags, remember globalData, and initialize m_mainTable from
// JSC::mainTable (the table lex() consults to tell keywords from identifiers).
// The body pre-reserves initialReadBufferCapacity elements in both the 8-bit
// and the 16-bit read buffer.
61 Lexer::Lexer(JSGlobalData* globalData)
63 , m_restrKeyword(false)
64 , m_eatNextIdentifier(false)
70 , m_isReparsing(false)
80 , m_globalData(globalData)
81 , m_mainTable(JSC::mainTable)
83 m_buffer8.reserveCapacity(initialReadBufferCapacity);
84 m_buffer16.reserveCapacity(initialReadBufferCapacity);
89 m_mainTable.deleteTable();
// Points the lexer at a new piece of source text and resets per-parse state.
// source supplies the starting line number, the start and end offsets, and
// (through its provider) the character data to scan.
92 void Lexer::setCode(const SourceCode& source)
94 yylineno = source.firstLine();
95 m_restrKeyword = false;
97 m_eatNextIdentifier = false;
101 m_position = source.startOffset();
103 m_code = source.provider()->data();
// m_length holds the end offset into m_code rather than a character count;
// shift() stops reading once m_position reaches it.
104 m_length = source.endOffset();
108 m_atLineStart = true;
110 // read first characters
// Slides the lookahead window forward. The visible statements move each
// recorded offset one slot toward "current" and refill the last slot
// (m_next3 / m_nextOffset3) from m_code at m_position.
// NOTE(review): p is presumably the number of characters to advance —
// confirm against the enclosing loop.
114 void Lexer::shift(unsigned p)
116 // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM,
117 // see <https://bugs.webkit.org/show_bug.cgi?id=4931>.
123 m_currentOffset = m_nextOffset1;
124 m_nextOffset1 = m_nextOffset2;
125 m_nextOffset2 = m_nextOffset3;
// Once m_position has reached the end offset there is no character left to
// read; the offset is still recorded.
127 if (m_position >= m_length) {
128 m_nextOffset3 = m_position;
// Otherwise read the next character, repeating while it is a byte-order
// mark (U+FEFF) so that BOMs never reach the scanner.
133 m_nextOffset3 = m_position;
134 m_next3 = m_code[m_position++];
135 } while (m_next3 == 0xFEFF);
// Bookkeeping for the start of a new source line. Setting m_atLineStart
// re-enables the rule in lex() that treats "-->" as a comment opener only
// at the beginning of a line.
139 // called on each new line
140 void Lexer::nextLine()
143 m_atLineStart = true;
// Called by lex() with the final state of a token (for example
// IdentifierOrKeyword or Identifier once an identifier ends).
// NOTE(review): presumably stores s in m_state and tells the scan loop to
// stop — confirm against the body.
146 void Lexer::setDone(State s)
// Scans the next token from the current source text.
//   p1: pointer to the parser's YYSTYPE value; receives the token's payload
//       (ident, doubleValue, or the intValue filled in by matchPunctuator).
//   p2: pointer to the parser's YYLTYPE location; receives the current line
//       number and the start / end offsets of the token.
// The token code is a keyword value from m_mainTable, a punctuator code from
// matchPunctuator(), or another code chosen by the final state. The function
// is a character-driven state machine over m_state: the first group of tests
// classifies the first character of a token, the case labels continue
// multi-character tokens, and the code after the state machine converts the
// buffered text into a value.
152 int Lexer::lex(void* p1, void* p2)
154 YYSTYPE* lvalp = static_cast<YYSTYPE*>(p1);
155 YYLTYPE* llocp = static_cast<YYLTYPE*>(p2);
158 unsigned short stringType = 0; // either single or double quotes
162 m_terminator = false;
166 // did we push a token on the stack previously ?
167 // (after an automatic semicolon insertion)
168 if (m_stackToken >= 0) {
170 token = m_stackToken;
173 int startOffset = m_currentOffset;
// m_skipLF / m_skipCR record that the previous character was half of a
// possible two-character line ending, so that the pair is handled once.
175 if (m_skipLF && m_current != '\n') // found \r but not \n afterwards
177 if (m_skipCR && m_current != '\r') // found \n but not \r afterwards
179 if (m_skipLF || m_skipCR) { // found \r\n or \n\r -> eat the second one
// --- Classification of the first character of a token ---
186 startOffset = m_currentOffset;
187 if (isWhiteSpace()) {
189 } else if (m_current == '/' && m_next1 == '/') {
191 m_state = InSingleLineComment;
192 } else if (m_current == '/' && m_next1 == '*') {
194 m_state = InMultiLineComment;
// NOTE(review): -1 appears to be the end-of-input sentinel for m_current — confirm.
195 } else if (m_current == -1) {
196 if (!m_terminator && !m_delimited && !m_isReparsing) {
197 // automatic semicolon insertion if program incomplete
203 } else if (isLineTerminator()) {
206 if (m_restrKeyword) {
210 } else if (m_current == '"' || m_current == '\'') {
// Remember which quote opened the string so only the same quote closes it.
212 stringType = static_cast<unsigned short>(m_current);
213 } else if (isIdentStart(m_current)) {
215 m_state = InIdentifierOrKeyword;
216 } else if (m_current == '\\')
217 m_state = InIdentifierStartUnicodeEscapeStart;
218 else if (m_current == '0') {
221 } else if (isDecimalDigit(m_current)) {
224 } else if (m_current == '.' && isDecimalDigit(m_next1)) {
227 // <!-- marks the beginning of a line comment (for www usage)
228 } else if (m_current == '<' && m_next1 == '!' && m_next2 == '-' && m_next3 == '-') {
230 m_state = InSingleLineComment;
// "-->" opens a line comment only when nothing but whitespace/comments
// preceded it on the line (m_atLineStart).
232 } else if (m_atLineStart && m_current == '-' && m_next1 == '-' && m_next2 == '>') {
234 m_state = InSingleLineComment;
236 token = matchPunctuator(lvalp->intValue, m_current, m_next1, m_next2, m_next3);
// --- Inside a string literal ---
244 if (m_current == stringType) {
247 } else if (isLineTerminator() || m_current == -1)
249 else if (m_current == '\\')
250 m_state = InEscapeSequence;
254 // Escape Sequences inside of strings
255 case InEscapeSequence:
256 if (isOctalDigit(m_current)) {
// A three-digit octal escape is accepted only when the first digit is 0-3,
// which keeps the value at or below 0377 (255).
257 if (m_current >= '0' && m_current <= '3' &&
258 isOctalDigit(m_next1) && isOctalDigit(m_next2)) {
259 record16(convertOctal(m_current, m_next1, m_next2));
262 } else if (isOctalDigit(m_current) && isOctalDigit(m_next1)) {
263 record16(convertOctal('0', m_current, m_next1));
266 } else if (isOctalDigit(m_current)) {
267 record16(convertOctal('0', '0', m_current));
271 } else if (m_current == 'x')
272 m_state = InHexEscape;
273 else if (m_current == 'u')
274 m_state = InUnicodeEscape;
275 else if (isLineTerminator()) {
279 record16(singleEscape(static_cast<unsigned short>(m_current)));
// \xHH needs exactly two hex digits.
284 if (isHexDigit(m_current) && isHexDigit(m_next1)) {
286 record16(convertHex(m_current, m_next1));
288 } else if (m_current == stringType) {
// \uHHHH needs exactly four hex digits.
298 case InUnicodeEscape:
299 if (isHexDigit(m_current) && isHexDigit(m_next1) && isHexDigit(m_next2) && isHexDigit(m_next3)) {
300 record16(convertUnicode(m_current, m_next1, m_next2, m_next3));
303 } else if (m_current == stringType) {
310 case InSingleLineComment:
311 if (isLineTerminator()) {
314 if (m_restrKeyword) {
319 } else if (m_current == -1)
322 case InMultiLineComment:
325 else if (isLineTerminator())
327 else if (m_current == '*' && m_next1 == '/') {
332 case InIdentifierOrKeyword:
334 if (isIdentPart(m_current))
336 else if (m_current == '\\')
337 m_state = InIdentifierPartUnicodeEscapeStart;
// An identifier that never left InIdentifierOrKeyword may still be a
// keyword; otherwise it is finished as a plain Identifier.
339 setDone(m_state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
// --- Numeric literals ---
342 if (m_current == 'x' || m_current == 'X') {
345 } else if (m_current == '.') {
348 } else if (m_current == 'e' || m_current == 'E') {
350 m_state = InExponentIndicator;
351 } else if (isOctalDigit(m_current)) {
354 } else if (isDecimalDigit(m_current)) {
361 if (isHexDigit(m_current))
367 if (isOctalDigit(m_current))
369 else if (isDecimalDigit(m_current)) {
376 if (isDecimalDigit(m_current))
378 else if (m_current == '.') {
381 } else if (m_current == 'e' || m_current == 'E') {
383 m_state = InExponentIndicator;
388 if (isDecimalDigit(m_current))
390 else if (m_current == 'e' || m_current == 'E') {
392 m_state = InExponentIndicator;
396 case InExponentIndicator:
397 if (m_current == '+' || m_current == '-')
399 else if (isDecimalDigit(m_current)) {
401 m_state = InExponent;
406 if (isDecimalDigit(m_current))
// --- \uHHHH escapes inside identifiers ---
411 case InIdentifierStartUnicodeEscapeStart:
412 if (m_current == 'u')
413 m_state = InIdentifierStartUnicodeEscape;
417 case InIdentifierPartUnicodeEscapeStart:
418 if (m_current == 'u')
419 m_state = InIdentifierPartUnicodeEscape;
423 case InIdentifierStartUnicodeEscape:
424 if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) {
// token is reused here as scratch storage for the decoded character.
428 token = convertUnicode(m_current, m_next1, m_next2, m_next3);
430 if (!isIdentStart(token)) {
435 m_state = InIdentifier;
437 case InIdentifierPartUnicodeEscape:
438 if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) {
442 token = convertUnicode(m_current, m_next1, m_next2, m_next3);
444 if (!isIdentPart(token)) {
449 m_state = InIdentifier;
452 ASSERT(!"Unhandled state in switch statement");
455 // move on to the next character
// Anything other than leading whitespace or a single-line comment means the
// lexer is no longer at the start of the line.
458 if (m_state != Start && m_state != InSingleLineComment)
459 m_atLineStart = false;
462 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
463 if ((m_state == Number || m_state == Octal || m_state == Hex) && isIdentStart(m_current))
// NUL-terminate the 8-bit buffer so it can be read as a C string below.
467 m_buffer8.append('\0');
470 fprintf(stderr, "line: %d ", lineNo());
471 fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
472 fprintf(stderr, "%s ", m_buffer8.data());
// --- Convert buffered digits into a double ---
476 if (m_state == Number)
477 dval = WTF::strtod(m_buffer8.data(), 0L);
478 else if (m_state == Hex) { // scan hex numbers
// Start two characters in, past the prefix that put the lexer in Hex state.
479 const char* p = m_buffer8.data() + 2;
480 while (char c = *p++) {
482 dval += convertHex(c);
// The loop leaves p one past the terminating NUL, so p - (data + 3) is the
// number of digits after the two-character prefix.
485 if (dval >= mantissaOverflowLowerBound)
486 dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16);
489 } else if (m_state == Octal) { // scan octal number
490 const char* p = m_buffer8.data() + 1;
491 while (char c = *p++) {
// Same length arithmetic as the hex case, with a one-character prefix.
496 if (dval >= mantissaOverflowLowerBound)
497 dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8);
511 printf("(Identifier)/(Keyword)\n");
514 printf("(String)\n");
517 printf("(Number)\n");
524 if (m_state != Identifier)
525 m_eatNextIdentifier = false;
527 m_restrKeyword = false;
// Report the token's location: both line fields carry the current line; the
// column fields carry source offsets (startOffset / m_currentOffset).
529 llocp->first_line = yylineno;
530 llocp->last_line = yylineno;
531 llocp->first_column = startOffset;
532 llocp->last_column = m_currentOffset;
538 if (token == '}' || token == ';')
542 // Apply anonymous-function hack below (eat the identifier).
543 if (m_eatNextIdentifier) {
544 m_eatNextIdentifier = false;
// Discard this identifier by lexing again and reporting the following token.
545 token = lex(lvalp, llocp);
548 lvalp->ident = makeIdentifier(m_buffer16);
551 case IdentifierOrKeyword: {
552 lvalp->ident = makeIdentifier(m_buffer16);
553 const HashEntry* entry = m_mainTable.entry(m_globalData, *lvalp->ident);
555 // Lookup for keyword failed, means this is an identifier.
559 token = entry->lexerValue();
560 // Hack for "f = function somename() { ... }"; too hard to get into the grammar.
561 m_eatNextIdentifier = token == FUNCTION && m_lastToken == '=';
// After these keywords a following line terminator is significant;
// m_restrKeyword is tested wherever lex() meets a line terminator.
562 if (token == CONTINUE || token == BREAK || token == RETURN || token == THROW)
563 m_restrKeyword = true;
567 // Atomize constant strings in case they're later used in property lookup.
568 lvalp->ident = makeIdentifier(m_buffer16);
572 lvalp->doubleValue = dval;
577 fprintf(stderr, "yylex: ERROR.\n");
582 ASSERT(!"unhandled numeration value in switch");
// Returns true when the current character is a tab, vertical tab (0x0b),
// form feed (0x0c), or any character accepted by isSeparatorSpace().
// Line terminators are not covered here; see isLineTerminator().
590 bool Lexer::isWhiteSpace() const
592 return m_current == '\t' || m_current == 0x0b || m_current == 0x0c || isSeparatorSpace(m_current);
// Returns true when the current character is a line terminator: carriage
// return, line feed, U+2028 (LINE SEPARATOR) or U+2029 (PARAGRAPH SEPARATOR).
// NOTE(review): unlike isWhiteSpace() this is not const, presumably because
// it also updates the \r\n skip flags used by lex() — confirm.
595 bool Lexer::isLineTerminator()
597 bool cr = (m_current == '\r');
598 bool lf = (m_current == '\n');
603 return cr || lf || m_current == 0x2028 || m_current == 0x2029;
// Returns true when c may begin an identifier: an ASCII letter, '$', '_', or
// a non-ASCII character whose Unicode category is one of the letter
// categories (uppercase, lowercase, titlecase, modifier, other).
// The !isASCII(c) test keeps ASCII input away from the category() lookup.
606 bool Lexer::isIdentStart(int c)
608 return isASCIIAlpha(c) || c == '$' || c == '_' || (!isASCII(c) && (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other)));
// Returns true when c may continue an identifier: an ASCII letter or digit,
// '$', '_', or a non-ASCII character in any letter category or in the
// non-spacing mark, spacing combining mark, decimal digit number, or
// connector punctuation category.
611 bool Lexer::isIdentPart(int c)
613 return isASCIIAlphanumeric(c) || c == '$' || c == '_' || (!isASCII(c) && (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
614 | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector)));
// File-local helper: reports whether c is a decimal digit by delegating to
// isASCIIDigit().
617 static bool isDecimalDigit(int c)
619 return isASCIIDigit(c);
// Reports whether c is a hexadecimal digit by delegating to
// isASCIIHexDigit(). Callers use it to validate input before convertHex().
622 bool Lexer::isHexDigit(int c)
624 return isASCIIHexDigit(c);
// Reports whether c is an octal digit by delegating to isASCIIOctalDigit().
// Callers use it to validate input before convertOctal().
627 bool Lexer::isOctalDigit(int c)
629 return isASCIIOctalDigit(c);
// Matches a punctuator starting at c1, using c2..c4 as lookahead.
//   charPos: out-parameter; in the visible cases it is set to m_position - 4.
//            NOTE(review): presumably the offset of the current character,
//            since the window holds the current character plus three
//            lookahead characters — confirm.
//   c1..c4:  the current character and the next three characters.
// Among the visible return statements, "--" can yield AUTOMINUSMINUS and some
// single-character punctuators are returned as c1 itself.
// The tests run from the longest operator to the shortest so that a shorter
// operator never shadows a longer one sharing its prefix (">>>=" before
// ">>>" and ">>=", those before ">>" and ">=", and so on).
632 int Lexer::matchPunctuator(int& charPos, int c1, int c2, int c3, int c4)
// Four-character operator.
634 if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
// Three-character operators.
638 if (c1 == '=' && c2 == '=' && c3 == '=') {
642 if (c1 == '!' && c2 == '=' && c3 == '=') {
646 if (c1 == '>' && c2 == '>' && c3 == '>') {
650 if (c1 == '<' && c2 == '<' && c3 == '=') {
654 if (c1 == '>' && c2 == '>' && c3 == '=') {
// Two-character operators.
658 if (c1 == '<' && c2 == '=') {
662 if (c1 == '>' && c2 == '=') {
666 if (c1 == '!' && c2 == '=') {
670 if (c1 == '+' && c2 == '+') {
676 if (c1 == '-' && c2 == '-') {
679 return AUTOMINUSMINUS;
682 if (c1 == '=' && c2 == '=') {
686 if (c1 == '+' && c2 == '=') {
690 if (c1 == '-' && c2 == '=') {
694 if (c1 == '*' && c2 == '=') {
698 if (c1 == '/' && c2 == '=') {
702 if (c1 == '&' && c2 == '=') {
706 if (c1 == '^' && c2 == '=') {
710 if (c1 == '%' && c2 == '=') {
714 if (c1 == '|' && c2 == '=') {
718 if (c1 == '<' && c2 == '<') {
722 if (c1 == '>' && c2 == '>') {
726 if (c1 == '&' && c2 == '&') {
730 if (c1 == '|' && c2 == '|') {
// Single-character punctuators.
759 return static_cast<int>(c1);
761 charPos = m_position - 4;
765 charPos = m_position - 4;
// Called by lex() for a backslash followed by a character that is not an
// octal digit, 'x', 'u' or a line terminator; the result is recorded into the
// 16-bit buffer.
// NOTE(review): presumably maps escape letters such as 'n' or 't' to the
// character they denote and returns other characters unchanged — confirm
// against the body.
773 unsigned short Lexer::singleEscape(unsigned short c)
// Combines three octal digit characters (most significant first) into their
// numeric value: c1 * 64 + c2 * 8 + c3. No validation is performed; callers
// are expected to have checked each argument with isOctalDigit(), and pass
// '0' for unused leading digits.
799 unsigned short Lexer::convertOctal(int c1, int c2, int c3)
801 return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
// Returns the numeric value (0-15) of a single hexadecimal digit character.
// Input is not validated: anything that is neither '0'-'9' nor 'a'-'f' is
// treated as if it were in 'A'-'F', so callers must check isHexDigit() first.
804 unsigned char Lexer::convertHex(int c)
806 if (c >= '0' && c <= '9')
807 return static_cast<unsigned char>(c - '0');
808 if (c >= 'a' && c <= 'f')
809 return static_cast<unsigned char>(c - 'a' + 10);
// Fallthrough: assumed to be an uppercase hex digit.
810 return static_cast<unsigned char>(c - 'A' + 10);
// Combines two hexadecimal digit characters into one byte: c1 supplies the
// high nibble and c2 the low nibble. Both must already be valid hex digits.
813 unsigned char Lexer::convertHex(int c1, int c2)
815 return ((convertHex(c1) << 4) + convertHex(c2));
// Combines four hexadecimal digit characters (most significant first) into a
// 16-bit UChar: c1/c2 form the high byte and c3/c4 the low byte. All four
// must already be valid hex digits.
818 UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
820 unsigned char highByte = (convertHex(c1) << 4) + convertHex(c2);
821 unsigned char lowByte = (convertHex(c3) << 4) + convertHex(c4);
822 return (highByte << 8 | lowByte);
// Appends c, narrowed to char, to the 8-bit buffer (m_buffer8). lex() reads
// that buffer back as a C string when converting numeric literals.
825 void Lexer::record8(int c)
829 m_buffer8.append(static_cast<char>(c));
// int overload: asserts that c does not exceed USHRT_MAX, then narrows it to
// unsigned short and forwards to the UChar overload.
832 void Lexer::record16(int c)
835 ASSERT(c <= USHRT_MAX);
836 record16(UChar(static_cast<unsigned short>(c)));
// Appends one UTF-16 code unit to the 16-bit buffer (m_buffer16), from which
// identifiers, strings, and regular-expression text are later built.
839 void Lexer::record16(UChar c)
841 m_buffer16.append(c);
// Scans the remainder of a regular-expression literal: first the pattern,
// stored in m_pattern, then the trailing flags, stored in m_flags.
// The pattern ends at the first '/' that is neither escaped by a backslash
// nor inside a [...] character class. A line terminator or end of input
// (-1) before that point is handled as a separate case.
// NOTE(review): presumably returns false for an unterminated literal and
// true on success — confirm against the return statements.
844 bool Lexer::scanRegExp()
847 bool lastWasEscape = false;
848 bool inBrackets = false;
851 if (isLineTerminator() || m_current == -1)
853 else if (m_current != '/' || lastWasEscape == true || inBrackets == true) {
854 // keep track of '[' and ']'
// An escaped bracket does not open or close a character class.
855 if (!lastWasEscape) {
856 if ( m_current == '[' && !inBrackets )
858 if ( m_current == ']' && inBrackets )
// A backslash escapes only the next character, so an escaped backslash does
// not itself start a new escape.
863 !lastWasEscape && (m_current == '\\');
864 } else { // end of regexp
865 m_pattern = UString(m_buffer16);
// Flags: every identifier-part character following the closing '/'.
873 while (isIdentPart(m_current)) {
877 m_flags = UString(m_buffer16);
884 m_identifiers.clear();
886 Vector<char> newBuffer8;
887 newBuffer8.reserveCapacity(initialReadBufferCapacity);
888 m_buffer8.swap(newBuffer8);
890 Vector<UChar> newBuffer16;
891 newBuffer16.reserveCapacity(initialReadBufferCapacity);
892 m_buffer16.swap(newBuffer16);
894 m_isReparsing = false;