2010-05-31 Adam Barth <abarth@webkit.org>
[WebKit-https.git] / WebCore / html / HTML5Lexer.cpp
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26  */
27
28 #include "config.h"
29 #include "HTML5Lexer.h"
30
31 #include "AtomicString.h"
32 #include "HTML5Token.h"
33 #include "HTMLNames.h"
34 #include "NotImplemented.h"
35 #include <wtf/CurrentTime.h>
36 #include <wtf/UnusedParam.h>
37 #include <wtf/text/CString.h>
38 #include <wtf/unicode/Unicode.h>
39
40
41 // Use __GNUC__ instead of PLATFORM(GCC) to stay consistent with the gperf generated c file
42 #ifdef __GNUC__
43 // The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
44 #include "HTMLEntityNames.c"
45 #else
46 // Not inlined for non-GCC compilers
// One row of the named character reference table: the entity name and the
// code point it expands to. Declared here only for compilers that do not
// textually include the gperf-generated HTMLEntityNames.c.
struct Entity {
    const char* name;
    int code;
};

// Looks up the entity whose name is the first |len| characters of |str|.
// Callers in this file treat a null return as "no such entity".
// The obsolete 'register' storage-class specifier is intentionally omitted:
// it is not part of the function's type, so this declaration still matches
// the gperf-generated definition, and C++17 no longer accepts the keyword.
const struct Entity* findEntity(const char* str, unsigned int len);
52 #endif
53
54 using namespace WTF;
55
56 namespace WebCore {
57
58 using namespace HTMLNames;
59
60 namespace {
61
62 static const UChar windowsLatin1ExtensionArray[32] = {
63     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
64     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
65     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
66     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
67 };
68
69 inline UChar toLowerCase(UChar cc)
70 {
71     ASSERT(cc >= 'A' && cc <= 'Z');
72     const int lowerCaseOffset = 0x20;
73     return cc + lowerCaseOffset;
74 }
75
76 inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
77 {
78     while (*expectedCharacters)
79         source.advanceAndASSERTIgnoringCase(*expectedCharacters++);
80 }
81
82 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
83 {
84     if (vector.size() != string.length())
85         return false;
86     const UChar* stringData = string.characters();
87     const UChar* vectorData = vector.data();
88     // FIXME: Is there a higher-level function we should be calling here?
89     return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
90 }
91
92 inline UChar adjustEntity(unsigned value)
93 {
94     if ((value & ~0x1F) != 0x0080)
95         return value;
96     return windowsLatin1ExtensionArray[value - 0x80];
97 }
98
99 inline unsigned legalEntityFor(unsigned value)
100 {
101     // FIXME: A number of specific entity values generate parse errors.
102     if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
103         return 0xFFFD;
104     if (value < 0xFFFF)
105         return adjustEntity(value);
106     return value;
107 }
108
109 inline bool isHexDigit(UChar cc)
110 {
111     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
112 }
113
114 inline bool isAlphaNumeric(UChar cc)
115 {
116     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
117 }
118
119 void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
120 {
121     if (consumedCharacters.size() == 1)
122         source.push(consumedCharacters[0]);
123     else if (consumedCharacters.size() == 2) {
124         source.push(consumedCharacters[0]);
125         source.push(consumedCharacters[1]);
126     } else
127         source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
128 }
129
130 inline bool isEndTagBufferingState(HTML5Lexer::State state)
131 {
132     return state == HTML5Lexer::RCDATAEndTagOpenState
133         || state == HTML5Lexer::RCDATAEndTagNameState
134         || state == HTML5Lexer::RAWTEXTEndTagOpenState
135         || state == HTML5Lexer::RAWTEXTEndTagNameState
136         || state == HTML5Lexer::ScriptDataEndTagOpenState
137         || state == HTML5Lexer::ScriptDataEndTagNameState
138         || state == HTML5Lexer::ScriptDataEscapedEndTagOpenState
139         || state == HTML5Lexer::ScriptDataEscapedEndTagNameState;
140 }
141
142 }
143
144 HTML5Lexer::HTML5Lexer()
145 {
146     reset();
147 }
148
149 HTML5Lexer::~HTML5Lexer()
150 {
151 }
152
153 void HTML5Lexer::reset()
154 {
155     m_state = DataState;
156     m_token = 0;
157     m_skipLeadingNewLineForListing = false;
158     m_emitPending = false;
159     m_additionalAllowedCharacter = '\0';
160 }
161
162 unsigned HTML5Lexer::consumeEntity(SegmentedString& source, bool& notEnoughCharacters)
163 {
164     ASSERT(m_state != CharacterReferenceInAttributeValueState || m_additionalAllowedCharacter == '"' || m_additionalAllowedCharacter == '\'' || m_additionalAllowedCharacter == '>');
165     ASSERT(!notEnoughCharacters);
166
167     enum EntityState {
168         Initial,
169         NumberType,
170         MaybeHexLowerCaseX,
171         MaybeHexUpperCaseX,
172         Hex,
173         Decimal,
174         Named
175     };
176     EntityState entityState = Initial;
177     unsigned result = 0;
178     Vector<UChar, 10> consumedCharacters;
179     Vector<char, 10> entityName;
180
181     while (!source.isEmpty()) {
182         UChar cc = *source;
183         switch (entityState) {
184         case Initial: {
185             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
186                 return 0;
187             if (m_state == CharacterReferenceInAttributeValueState && cc == m_additionalAllowedCharacter)
188                 return 0;
189             if (cc == '#') {
190                 entityState = NumberType;
191                 break;
192             }
193             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
194                 entityState = Named;
195                 continue;
196             }
197             return 0;
198         }
199         case NumberType: {
200             if (cc == 'x') {
201                 entityState = MaybeHexLowerCaseX;
202                 break;
203             }
204             if (cc == 'X') {
205                 entityState = MaybeHexUpperCaseX;
206                 break;
207             }
208             if (cc >= '0' && cc <= '9') {
209                 entityState = Decimal;
210                 continue;
211             }
212             source.push('#');
213             return 0;
214         }
215         case MaybeHexLowerCaseX: {
216             if (isHexDigit(cc)) {
217                 entityState = Hex;
218                 continue;
219             }
220             source.push('#');
221             source.push('x');
222             return 0;
223         }
224         case MaybeHexUpperCaseX: {
225             if (isHexDigit(cc)) {
226                 entityState = Hex;
227                 continue;
228             }
229             source.push('#');
230             source.push('X');
231             return 0;
232         }
233         case Hex: {
234             if (cc >= '0' && cc <= '9')
235                 result = result * 16 + cc - '0';
236             else if (cc >= 'a' && cc <= 'f')
237                 result = result * 16 + 10 + cc - 'a';
238             else if (cc >= 'A' && cc <= 'F')
239                 result = result * 16 + 10 + cc - 'A';
240             else if (cc == ';') {
241                 source.advance();
242                 return legalEntityFor(result);
243             } else 
244                 return legalEntityFor(result);
245             break;
246         }
247         case Decimal: {
248             if (cc >= '0' && cc <= '9')
249                 result = result * 10 + cc - '0';
250             else if (cc == ';') {
251                 source.advance();
252                 return legalEntityFor(result);
253             } else
254                 return legalEntityFor(result);
255             break;
256         }
257         case Named: {
258             // FIXME: This code is wrong. We need to find the longest matching entity.
259             //        The examples from the spec are:
260             //            I'm &notit; I tell you
261             //            I'm &notin; I tell you
262             //        In the first case, "&not" is the entity.  In the second
263             //        case, "&notin;" is the entity.
264             // FIXME: Our list of HTML entities is incomplete.
265             // FIXME: The number 8 below is bogus.
266             while (!source.isEmpty() && entityName.size() <= 8) {
267                 cc = *source;
268                 if (cc == ';') {
269                     const Entity* entity = findEntity(entityName.data(), entityName.size());
270                     if (entity) {
271                         source.advanceAndASSERT(';');
272                         return entity->code;
273                     }
274                     emitParseError();
275                     break;
276                 }
277                 if (!isAlphaNumeric(cc)) {
278                     const Entity* entity = findEntity(entityName.data(), entityName.size());
279                     if (entity) {
280                         // HTML5 tells us to ignore this entity, for historical reasons,
281                         // if the lookhead character is '='.
282                         if (m_state == CharacterReferenceInAttributeValueState && cc == '=')
283                             break;
284                         emitParseError();
285                         return entity->code;
286                     }
287                     break;
288                 }
289                 entityName.append(cc);
290                 consumedCharacters.append(cc);
291                 source.advanceAndASSERT(cc);
292             }
293             notEnoughCharacters = source.isEmpty();
294             unconsumeCharacters(source, consumedCharacters);
295             return 0;
296         }
297         }
298         consumedCharacters.append(cc);
299         source.advanceAndASSERT(cc);
300     }
301     ASSERT(source.isEmpty());
302     notEnoughCharacters = true;
303     unconsumeCharacters(source, consumedCharacters);
304     return 0;
305 }
306
307 inline bool HTML5Lexer::processEntity(SegmentedString& source)
308 {
309     bool notEnoughCharacters = false;
310     unsigned value = consumeEntity(source, notEnoughCharacters);
311     if (notEnoughCharacters)
312         return false;
313     if (!value)
314         emitCharacter('&');
315     else
316         emitCodePoint(value);
317     return true;
318 }
319
320 bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
321 {
322     // If we have a token in progress, then we're supposed to be called back
323     // with the same token so we can finish it.
324     ASSERT(!m_token || m_token == &token || token.type() == HTML5Token::Uninitialized);
325     m_token = &token;
326
327     if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
328         // FIXME: This should call flushBufferedEndTag().
329         // We started an end tag during our last iteration.
330         m_token->beginEndTag(m_bufferedEndTagName);
331         m_bufferedEndTagName.clear();
332         if (m_state == DataState) {
333             // We're back in the data state, so we must be done with the tag.
334             return true;
335         }
336     }
337
338     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
339     if (m_skipLeadingNewLineForListing && m_state == DataState && !source.isEmpty() && *source == '\x0A')
340         source.advanceAndASSERT('\x0A');
341     m_skipLeadingNewLineForListing = false;
342
343     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
344     // FIXME: This while should stop as soon as we have a token to return.
345     while (!source.isEmpty()) {
346         UChar cc = *source;
347         switch (m_state) {
348         case DataState: {
349             if (cc == '&')
350                 m_state = CharacterReferenceInDataState;
351             else if (cc == '<') {
352                 if (m_token->type() == HTML5Token::Character) {
353                     // We have a bunch of character tokens queued up that we
354                     // are emitting lazily here.
355                     return true;
356                 }
357                 m_state = TagOpenState;
358             } else
359                 emitCharacter(cc);
360             break;
361         }
362         case CharacterReferenceInDataState: {
363             if (!processEntity(source))
364                 return shouldEmitBufferedCharacterToken(source);
365             m_state = DataState;
366             continue;
367         }
368         case RCDATAState: {
369             if (cc == '&')
370                 m_state = CharacterReferenceInRCDATAState;
371             else if (cc == '<')
372                 m_state = RCDATALessThanSignState;
373             else
374                 emitCharacter(cc);
375             break;
376         }
377         case CharacterReferenceInRCDATAState: {
378             if (!processEntity(source))
379                 return shouldEmitBufferedCharacterToken(source);
380             m_state = RCDATAState;
381             continue;
382         }
383         case RAWTEXTState: {
384             if (cc == '<')
385                 m_state = RAWTEXTLessThanSignState;
386             else
387                 emitCharacter(cc);
388             break;
389         }
390         case ScriptDataState: {
391             if (cc == '<')
392                 m_state = ScriptDataLessThanSignState;
393             else
394                 emitCharacter(cc);
395             break;
396         }
397         case PLAINTEXTState: {
398             emitCharacter(cc);
399             break;
400         }
401         case TagOpenState: {
402             if (cc == '!')
403                 m_state = MarkupDeclarationOpenState;
404             else if (cc == '/')
405                 m_state = EndTagOpenState;
406             else if (cc >= 'A' && cc <= 'Z') {
407                 m_token->beginStartTag(toLowerCase(cc));
408                 m_state = TagNameState;
409             } else if (cc >= 'a' && cc <= 'z') {
410                 m_token->beginStartTag(cc);
411                 m_state = TagNameState;
412             } else if (cc == '?') {
413                 emitParseError();
414                 m_state = BogusCommentState;
415                 // The spec consumes the current character before switching
416                 // to the bogus comment state, but it's easier to implement
417                 // if we reconsume the current character.
418                 continue;
419             } else {
420                 emitParseError();
421                 m_state = DataState;
422                 emitCharacter('<');
423                 continue;
424             }
425             break;
426         }
427         case EndTagOpenState: {
428             if (cc >= 'A' && cc <= 'Z') {
429                 m_token->beginEndTag(toLowerCase(cc));
430                 m_state = TagNameState;
431             } else if (cc >= 'a' && cc <= 'z') {
432                 m_token->beginEndTag(cc);
433                 m_state = TagNameState;
434             } else if (cc == '>') {
435                 emitParseError();
436                 m_state = DataState;
437             } else {
438                 emitParseError();
439                 m_state = BogusCommentState;
440                 continue;
441             }
442             // FIXME: Handle EOF properly.
443             break;
444         }
445         case TagNameState: {
446             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
447                 m_state = BeforeAttributeNameState;
448             else if (cc == '/')
449                 m_state = SelfClosingStartTagState;
450             else if (cc == '>') {
451                 emitCurrentToken();
452                 m_state = DataState;
453             } else if (cc >= 'A' && cc <= 'Z')
454                 m_token->appendToName(toLowerCase(cc));
455             else
456                 m_token->appendToName(cc);
457             // FIXME: Handle EOF properly.
458             break;
459         }
460         case RCDATALessThanSignState: {
461             if (cc == '/') {
462                 m_temporaryBuffer.clear();
463                 ASSERT(m_bufferedEndTagName.isEmpty());
464                 m_state = RCDATAEndTagOpenState;
465             } else {
466                 emitCharacter('<');
467                 m_state = RCDATAState;
468                 continue;
469             }
470             break;
471         }
472         case RCDATAEndTagOpenState: {
473             if (cc >= 'A' && cc <= 'Z') {
474                 m_temporaryBuffer.append(cc);
475                 addToPossibleEndTag(toLowerCase(cc));
476                 m_state = RCDATAEndTagNameState;
477             } else if (cc >= 'a' && cc <= 'z') {
478                 m_temporaryBuffer.append(cc);
479                 addToPossibleEndTag(cc);
480                 m_state = RCDATAEndTagNameState;
481             } else {
482                 emitCharacter('<');
483                 emitCharacter('/');
484                 m_state = RCDATAState;
485                 continue;
486             }
487             break;
488         }
489         case RCDATAEndTagNameState: {
490             if (cc >= 'A' && cc <= 'Z') {
491                 m_temporaryBuffer.append(cc);
492                 addToPossibleEndTag(toLowerCase(cc));
493             } else if (cc >= 'a' && cc <= 'z') {
494                 m_temporaryBuffer.append(cc);
495                 addToPossibleEndTag(cc);
496             } else {
497                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
498                     if (isAppropriateEndTag()) {
499                         m_state = BeforeAttributeNameState;
500                         maybeFlushBufferedEndTag();
501                         break;
502                     }
503                 } else if (cc == '/') {
504                     if (isAppropriateEndTag()) {
505                         m_state = SelfClosingStartTagState;
506                         maybeFlushBufferedEndTag();
507                         break;
508                     }
509                 } else if (cc == '>') {
510                     if (isAppropriateEndTag()) {
511                         m_state = DataState;
512                         maybeFlushBufferedEndTag();
513                         break;
514                     }
515                 }
516                 emitCharacter('<');
517                 emitCharacter('/');
518                 m_token->appendToCharacter(m_temporaryBuffer);
519                 m_bufferedEndTagName.clear();
520                 m_state = RCDATAState;
521                 continue;
522             }
523             break;
524         }
525         case RAWTEXTLessThanSignState: {
526             if (cc == '/') {
527                 m_temporaryBuffer.clear();
528                 ASSERT(m_bufferedEndTagName.isEmpty());
529                 m_state = RAWTEXTEndTagOpenState;
530             } else {
531                 emitCharacter('<');
532                 m_state = RAWTEXTState;
533                 continue;
534             }
535             break;
536         }
537         case RAWTEXTEndTagOpenState: {
538             if (cc >= 'A' && cc <= 'Z') {
539                 m_temporaryBuffer.append(cc);
540                 addToPossibleEndTag(toLowerCase(cc));
541                 m_state = RAWTEXTEndTagNameState;
542             } else if (cc >= 'a' && cc <= 'z') {
543                 m_temporaryBuffer.append(cc);
544                 addToPossibleEndTag(cc);
545                 m_state = RAWTEXTEndTagNameState;
546             } else {
547                 emitCharacter('<');
548                 emitCharacter('/');
549                 m_state = RAWTEXTState;
550                 continue;
551             }
552             break;
553         }
554         case RAWTEXTEndTagNameState: {
555             if (cc >= 'A' && cc <= 'Z') {
556                 m_temporaryBuffer.append(cc);
557                 addToPossibleEndTag(toLowerCase(cc));
558             } else if (cc >= 'a' && cc <= 'z') {
559                 m_temporaryBuffer.append(cc);
560                 addToPossibleEndTag(cc);
561             } else {
562                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
563                     if (isAppropriateEndTag()) {
564                         m_state = BeforeAttributeNameState;
565                         maybeFlushBufferedEndTag();
566                         break;
567                     }
568                 } else if (cc == '/') {
569                     if (isAppropriateEndTag()) {
570                         m_state = SelfClosingStartTagState;
571                         maybeFlushBufferedEndTag();
572                         break;
573                     }
574                 } else if (cc == '>') {
575                     if (isAppropriateEndTag()) {
576                         m_state = DataState;
577                         maybeFlushBufferedEndTag();
578                         break;
579                     }
580                 }
581                 emitCharacter('<');
582                 emitCharacter('/');
583                 m_token->appendToCharacter(m_temporaryBuffer);
584                 m_bufferedEndTagName.clear();
585                 m_state = RAWTEXTState;
586                 continue;
587             }
588             break;
589         }
590         case ScriptDataLessThanSignState: {
591             if (cc == '/') {
592                 m_temporaryBuffer.clear();
593                 ASSERT(m_bufferedEndTagName.isEmpty());
594                 m_state = ScriptDataEndTagOpenState;
595             } else if (cc == '!') {
596                 emitCharacter('<');
597                 emitCharacter('!');
598                 m_state = ScriptDataEscapeStartState;
599             } else {
600                 emitCharacter('<');
601                 m_state = ScriptDataState;
602                 continue;
603             }
604             break;
605         }
606         case ScriptDataEndTagOpenState: {
607             if (cc >= 'A' && cc <= 'Z') {
608                 m_temporaryBuffer.append(cc);
609                 addToPossibleEndTag(toLowerCase(cc));
610                 m_state = ScriptDataEndTagNameState;
611             } else if (cc >= 'a' && cc <= 'z') {
612                 m_temporaryBuffer.append(cc);
613                 addToPossibleEndTag(cc);
614                 m_state = ScriptDataEndTagNameState;
615             } else {
616                 emitCharacter('<');
617                 emitCharacter('/');
618                 m_state = ScriptDataState;
619                 continue;
620             }
621             break;
622         }
623         case ScriptDataEndTagNameState: {
624             if (cc >= 'A' && cc <= 'Z') {
625                 m_temporaryBuffer.append(cc);
626                 addToPossibleEndTag(toLowerCase(cc));
627             } else if (cc >= 'a' && cc <= 'z') {
628                 m_temporaryBuffer.append(cc);
629                 addToPossibleEndTag(cc);
630             } else {
631                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
632                     if (isAppropriateEndTag()) {
633                         m_state = BeforeAttributeNameState;
634                         maybeFlushBufferedEndTag();
635                         break;
636                     }
637                 } else if (cc == '/') {
638                     if (isAppropriateEndTag()) {
639                         m_state = SelfClosingStartTagState;
640                         maybeFlushBufferedEndTag();
641                         break;
642                     }
643                 } else if (cc == '>') {
644                     if (isAppropriateEndTag()) {
645                         m_state = DataState;
646                         maybeFlushBufferedEndTag();
647                         break;
648                     }
649                 }
650                 emitCharacter('<');
651                 emitCharacter('/');
652                 m_token->appendToCharacter(m_temporaryBuffer);
653                 m_bufferedEndTagName.clear();
654                 m_state = ScriptDataState;
655                 continue;
656             }
657             break;
658         }
659         case ScriptDataEscapeStartState: {
660             if (cc == '-') {
661                 emitCharacter(cc);
662                 m_state = ScriptDataEscapeStartDashState;
663             } else {
664                 m_state = ScriptDataState;
665                 continue;
666             }
667             break;
668         }
669         case ScriptDataEscapeStartDashState: {
670             if (cc == '-') {
671                 emitCharacter(cc);
672                 m_state = ScriptDataEscapedDashDashState;
673             } else {
674                 m_state = ScriptDataState;
675                 continue;
676             }
677             break;
678         }
679         case ScriptDataEscapedState: {
680             if (cc == '-') {
681                 emitCharacter(cc);
682                 m_state = ScriptDataEscapedDashState;
683             } else if (cc == '<')
684                 m_state = ScriptDataEscapedLessThanSignState;
685             else
686                 emitCharacter(cc);
687             // FIXME: Handle EOF properly.
688             break;
689         }
690         case ScriptDataEscapedDashState: {
691             if (cc == '-') {
692                 emitCharacter(cc);
693                 m_state = ScriptDataEscapedDashDashState;
694             } else if (cc == '<')
695                 m_state = ScriptDataEscapedLessThanSignState;
696             else {
697                 emitCharacter(cc);
698                 m_state = ScriptDataEscapedState;
699             }
700             // FIXME: Handle EOF properly.
701             break;
702         }
703         case ScriptDataEscapedDashDashState: {
704             if (cc == '-')
705                 emitCharacter(cc);
706             else if (cc == '<')
707                 m_state = ScriptDataEscapedLessThanSignState;
708             else if (cc == '>') {
709                 emitCharacter(cc);
710                 m_state = ScriptDataState;
711             } else {
712                 emitCharacter(cc);
713                 m_state = ScriptDataEscapedState;
714             }
715             // FIXME: Handle EOF properly.
716             break;
717         }
718         case ScriptDataEscapedLessThanSignState: {
719             if (cc == '/') {
720                 m_temporaryBuffer.clear();
721                 ASSERT(m_bufferedEndTagName.isEmpty());
722                 m_state = ScriptDataEscapedEndTagOpenState;
723             } else if (cc >= 'A' && cc <= 'Z') {
724                 emitCharacter('<');
725                 emitCharacter(cc);
726                 m_temporaryBuffer.clear();
727                 m_temporaryBuffer.append(toLowerCase(cc));
728                 m_state = ScriptDataDoubleEscapeStartState;
729             } else if (cc >= 'a' && cc <= 'z') {
730                 emitCharacter('<');
731                 emitCharacter(cc);
732                 m_temporaryBuffer.clear();
733                 m_temporaryBuffer.append(cc);
734                 m_state = ScriptDataDoubleEscapeStartState;
735             } else {
736                 emitCharacter('<');
737                 m_state = ScriptDataEscapedState;
738                 continue;
739             }
740             break;
741         }
742         case ScriptDataEscapedEndTagOpenState: {
743             if (cc >= 'A' && cc <= 'Z') {
744                 m_temporaryBuffer.append(cc);
745                 addToPossibleEndTag(toLowerCase(cc));
746                 m_state = ScriptDataEscapedEndTagNameState;
747             } else if (cc >= 'a' && cc <= 'z') {
748                 m_temporaryBuffer.append(cc);
749                 addToPossibleEndTag(cc);
750                 m_state = ScriptDataEscapedEndTagNameState;
751             } else {
752                 emitCharacter('<');
753                 emitCharacter('/');
754                 m_state = ScriptDataEscapedState;
755                 continue;
756             }
757             break;
758         }
759         case ScriptDataEscapedEndTagNameState: {
760             if (cc >= 'A' && cc <= 'Z') {
761                 m_temporaryBuffer.append(cc);
762                 addToPossibleEndTag(toLowerCase(cc));
763             } else if (cc >= 'a' && cc <= 'z') {
764                 m_temporaryBuffer.append(cc);
765                 addToPossibleEndTag(cc);
766             } else {
767                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
768                     if (isAppropriateEndTag()) {
769                         m_state = BeforeAttributeNameState;
770                         maybeFlushBufferedEndTag();
771                         break;
772                     }
773                 } else if (cc == '/') {
774                     if (isAppropriateEndTag()) {
775                         m_state = SelfClosingStartTagState;
776                         maybeFlushBufferedEndTag();
777                         break;
778                     }
779                 } else if (cc == '>') {
780                     if (isAppropriateEndTag()) {
781                         m_state = DataState;
782                         maybeFlushBufferedEndTag();
783                         break;
784                     }
785                 }
786                 emitCharacter('<');
787                 emitCharacter('/');
788                 m_token->appendToCharacter(m_temporaryBuffer);
789                 m_bufferedEndTagName.clear();
790                 m_state = ScriptDataEscapedState;
791                 continue;
792             }
793             break;
794         }
795         case ScriptDataDoubleEscapeStartState: {
796             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '/' || cc == '>') {
797                 emitCharacter(cc);
798                 if (temporaryBufferIs(scriptTag.localName()))
799                     m_state = ScriptDataDoubleEscapedState;
800                 else
801                     m_state = ScriptDataEscapedState;
802             } else if (cc >= 'A' && cc <= 'Z') {
803                 emitCharacter(cc);
804                 m_temporaryBuffer.append(toLowerCase(cc));
805             } else if (cc >= 'a' && cc <= 'z') {
806                 emitCharacter(cc);
807                 m_temporaryBuffer.append(cc);
808             } else {
809                 m_state = ScriptDataEscapedState;
810                 continue;
811             }
812             break;
813         }
814         case ScriptDataDoubleEscapedState: {
815             if (cc == '-') {
816                 emitCharacter(cc);
817                 m_state = ScriptDataDoubleEscapedDashState;
818             } else if (cc == '<') {
819                 emitCharacter(cc);
820                 m_state = ScriptDataDoubleEscapedLessThanSignState;
821             } else
822                 emitCharacter(cc);
823             // FIXME: Handle EOF properly.
824             break;
825         }
826         case ScriptDataDoubleEscapedDashState: {
827             if (cc == '-') {
828                 emitCharacter(cc);
829                 m_state = ScriptDataDoubleEscapedDashDashState;
830             } else if (cc == '<') {
831                 emitCharacter(cc);
832                 m_state = ScriptDataDoubleEscapedLessThanSignState;
833             } else {
834                 emitCharacter(cc);
835                 m_state = ScriptDataDoubleEscapedState;
836             }
837             // FIXME: Handle EOF properly.
838             break;
839         }
840         case ScriptDataDoubleEscapedDashDashState: {
841             if (cc == '-')
842                 emitCharacter(cc);
843             else if (cc == '<') {
844                 emitCharacter(cc);
845                 m_state = ScriptDataDoubleEscapedLessThanSignState;
846             } else if (cc == '>') {
847                 emitCharacter(cc);
848                 m_state = ScriptDataState;
849             } else {
850                 emitCharacter(cc);
851                 m_state = ScriptDataDoubleEscapedState;
852             }
853             // FIXME: Handle EOF properly.
854             break;
855         }
856         case ScriptDataDoubleEscapedLessThanSignState: {
857             if (cc == '/') {
858                 emitCharacter(cc);
859                 m_temporaryBuffer.clear();
860                 m_state = ScriptDataDoubleEscapeEndState;
861             } else {
862                 m_state = ScriptDataDoubleEscapedState;
863                 continue;
864             }
865             break;
866         }
867         case ScriptDataDoubleEscapeEndState: {
868             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '/' || cc == '>') {
869                 emitCharacter(cc);
870                 if (temporaryBufferIs(scriptTag.localName()))
871                     m_state = ScriptDataEscapedState;
872                 else
873                     m_state = ScriptDataDoubleEscapedState;
874             } else if (cc >= 'A' && cc <= 'Z') {
875                 emitCharacter(cc);
876                 m_temporaryBuffer.append(toLowerCase(cc));
877             } else if (cc >= 'a' && cc <= 'z') {
878                 emitCharacter(cc);
879                 m_temporaryBuffer.append(cc);
880             } else {
881                 m_state = ScriptDataDoubleEscapedState;
882                 continue;
883             }
884             break;
885         }
886         case BeforeAttributeNameState: {
887             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
888                 break;
889             else if (cc == '/')
890                 m_state = SelfClosingStartTagState;
891             else if (cc == '>') {
892                 emitCurrentToken();
893                 m_state = DataState;
894             } else if (cc >= 'A' && cc <= 'Z') {
895                 m_token->addNewAttribute();
896                 m_token->appendToAttributeName(toLowerCase(cc));
897                 m_state = AttributeNameState;
898             } else {
899                 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
900                     emitParseError();
901                 m_token->addNewAttribute();
902                 m_token->appendToAttributeName(cc);
903                 m_state = AttributeNameState;
904             }
905             // FIXME: Handle EOF properly.
906             break;
907         }
908         case AttributeNameState: {
909             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
910                 m_state = AfterAttributeNameState;
911             else if (cc == '/')
912                 m_state = SelfClosingStartTagState;
913             else if (cc == '=')
914                 m_state = BeforeAttributeValueState;
915             else if (cc == '>') {
916                 emitCurrentToken();
917                 m_state = DataState;
918             } else if (cc >= 'A' && cc <= 'Z')
919                 m_token->appendToAttributeName(toLowerCase(cc));
920             else {
921                 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
922                     emitParseError();
923                 m_token->appendToAttributeName(cc);
924                 m_state = AttributeNameState;
925             }
926             // FIXME: Handle EOF properly.
927             break;
928         }
929         case AfterAttributeNameState: {
930             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
931                 break;
932             else if (cc == '/')
933                 m_state = SelfClosingStartTagState;
934             else if (cc == '=')
935                 m_state = BeforeAttributeValueState;
936             else if (cc == '=') {
937                 emitCurrentToken();
938                 m_state = DataState;
939             } else if (cc >= 'A' && cc <= 'Z') {
940                 m_token->addNewAttribute();
941                 m_token->appendToAttributeName(toLowerCase(cc));
942                 m_state = AttributeNameState;
943             } else {
944                 if (cc == '"' || cc == '\'' || cc == '<')
945                     emitParseError();
946                 m_token->addNewAttribute();
947                 m_token->appendToAttributeName(cc);
948                 m_state = AttributeNameState;
949             }
950             // FIXME: Handle EOF properly.
951             break;
952         }
953         case BeforeAttributeValueState: {
954             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
955                 break;
956             else if (cc == '"')
957                 m_state = AttributeValueDoubleQuotedState;
958             else if (cc == '&') {
959                 m_state = AttributeValueUnquotedState;
960                 continue;
961             } else if (cc == '\'')
962                 m_state = AttributeValueSingleQuotedState;
963             else if (cc == '>') {
964                 emitParseError();
965                 emitCurrentToken();
966                 m_state = DataState;
967             } else {
968                 if (cc == '<' || cc == '=' || cc == '`')
969                     emitParseError();
970                 m_token->appendToAttributeValue(cc);
971                 m_state = AttributeValueUnquotedState;
972             }
973             break;
974         }
975         case AttributeValueDoubleQuotedState: {
976             if (cc == '"')
977                 m_state = AfterAttributeValueQuotedState;
978             else if (cc == '&') {
979                 m_state = CharacterReferenceInAttributeValueState;
980                 m_additionalAllowedCharacter = '"';
981             } else
982                 m_token->appendToAttributeValue(cc);
983             // FIXME: Handle EOF properly.
984             break;
985         }
986         case AttributeValueSingleQuotedState: {
987             if (cc == '\'')
988                 m_state = AfterAttributeValueQuotedState;
989             else if (cc == '&') {
990                 m_state = CharacterReferenceInAttributeValueState;
991                 m_additionalAllowedCharacter = '\'';
992             } else
993                 m_token->appendToAttributeValue(cc);
994             // FIXME: Handle EOF properly.
995             break;
996         }
997         case AttributeValueUnquotedState: {
998             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
999                 m_state = BeforeAttributeNameState;
1000             else if (cc == '&') {
1001                 m_state = CharacterReferenceInAttributeValueState;
1002                 m_additionalAllowedCharacter = '>';
1003             } else if (cc == '>') {
1004                 emitCurrentToken();
1005                 m_state = DataState;
1006             } else {
1007                 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
1008                     emitParseError();
1009                 m_token->appendToAttributeValue(cc);
1010             }
1011             // FIXME: Handle EOF properly.
1012             break;
1013         }
1014         case CharacterReferenceInAttributeValueState: {
1015             bool notEnoughCharacters = false;
1016             unsigned value = consumeEntity(source, notEnoughCharacters);
1017             if (notEnoughCharacters)
1018                 return shouldEmitBufferedCharacterToken(source);
1019             if (!value)
1020                 m_token->appendToAttributeValue('&');
1021             else if (value < 0xFFFF)
1022                 m_token->appendToAttributeValue(value);
1023             else {
1024                 m_token->appendToAttributeValue(U16_LEAD(value));
1025                 m_token->appendToAttributeValue(U16_TRAIL(value));
1026             }
1027             // We're supposed to switch back to the attribute value state that
1028             // we were in when we were switched into this state.  Rather than
1029             // keeping track of this explictly, we observe that the previous
1030             // state can be determined by m_additionalAllowedCharacter.
1031             if (m_additionalAllowedCharacter == '"')
1032                 m_state = AttributeValueDoubleQuotedState;
1033             else if (m_additionalAllowedCharacter == '\'')
1034                 m_state = AttributeValueSingleQuotedState;
1035             else if (m_additionalAllowedCharacter == '>')
1036                 m_state = AttributeValueUnquotedState;
1037             else
1038                 ASSERT_NOT_REACHED();
1039             continue;
1040         }
1041         case AfterAttributeValueQuotedState: {
1042             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1043                 m_state = BeforeAttributeNameState;
1044             else if (cc == '/')
1045                 m_state = SelfClosingStartTagState;
1046             else if (cc == '>') {
1047                 emitCurrentToken();
1048                 m_state = DataState;
1049             } else {
1050                 emitParseError();
1051                 m_state = BeforeAttributeNameState;
1052                 continue;
1053             }
1054             // FIXME: Handle EOF properly.
1055             break;
1056         }
1057         case SelfClosingStartTagState: {
1058             if (cc == '>') {
1059                 notImplemented();
1060                 emitCurrentToken();
1061                 m_state = DataState;
1062             } else {
1063                 emitParseError();
1064                 m_state = BeforeAttributeNameState;
1065                 continue;
1066             }
1067             // FIXME: Handle EOF properly.
1068             break;
1069         }
1070         case BogusCommentState: {
1071             m_token->beginComment();
1072             while (!source.isEmpty()) {
1073                 cc = *source;
1074                 if (cc == '>')
1075                     break;
1076                 m_token->appendToComment(cc);
1077                 source.advance();
1078             }
1079             emitCurrentToken();
1080             m_state = DataState;
1081             if (source.isEmpty())
1082                 return true;
1083             // FIXME: Handle EOF properly.
1084             break;
1085         }
1086         case MarkupDeclarationOpenState: {
1087             DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1088             DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1089             if (cc == '-') {
1090                 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1091                 if (result == SegmentedString::DidMatch) {
1092                     source.advanceAndASSERT('-');
1093                     source.advanceAndASSERT('-');
1094                     m_token->beginComment();
1095                     m_state = CommentStartState;
1096                     continue;
1097                 } else if (result == SegmentedString::NotEnoughCharacters)
1098                     return shouldEmitBufferedCharacterToken(source);
1099             } else if (cc == 'D' || cc == 'd') {
1100                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1101                 if (result == SegmentedString::DidMatch) {
1102                     advanceStringAndASSERTIgnoringCase(source, "doctype");
1103                     m_state = DOCTYPEState;
1104                     continue;
1105                 } else if (result == SegmentedString::NotEnoughCharacters)
1106                     return shouldEmitBufferedCharacterToken(source);
1107             }
1108             notImplemented();
1109             // FIXME: We're still missing the bits about the insertion mode being in foreign content:
1110             // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
1111             emitParseError();
1112             m_state = BogusCommentState;
1113             continue;
1114         }
1115         case CommentStartState: {
1116             if (cc == '-')
1117                 m_state = CommentStartDashState;
1118             else if (cc == '>') {
1119                 emitParseError();
1120                 emitCurrentToken();
1121                 m_state = DataState;
1122             } else {
1123                 m_token->appendToComment(cc);
1124                 m_state = CommentState;
1125             }
1126             // FIXME: Handle EOF properly.
1127             break;
1128         }
1129         case CommentStartDashState: {
1130             if (cc == '-')
1131                 m_state = CommentEndState;
1132             else if (cc == '>') {
1133                 emitParseError();
1134                 emitCurrentToken();
1135                 m_state = DataState;
1136             } else {
1137                 m_token->appendToComment('-');
1138                 m_token->appendToComment(cc);
1139                 m_state = CommentState;
1140             }
1141             // FIXME: Handle EOF properly.
1142             break;
1143         }
1144         case CommentState: {
1145             if (cc == '-')
1146                 m_state = CommentEndDashState;
1147             else
1148                 m_token->appendToComment(cc);
1149             // FIXME: Handle EOF properly.
1150             break;
1151         }
1152         case CommentEndDashState: {
1153             if (cc == '-')
1154                 m_state = CommentEndState;
1155             else {
1156                 m_token->appendToComment('-');
1157                 m_token->appendToComment(cc);
1158                 m_state = CommentState;
1159             }
1160             // FIXME: Handle EOF properly.
1161             break;
1162         }
1163         case CommentEndState: {
1164             if (cc == '>') {
1165                 emitCurrentToken();
1166                 m_state = DataState;
1167             } else if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
1168                 emitParseError();
1169                 m_token->appendToComment('-');
1170                 m_token->appendToComment('-');
1171                 m_token->appendToComment(cc);
1172                 m_state = CommentEndSpaceState;
1173             } else if (cc == '!') {
1174                 emitParseError();
1175                 m_state = CommentEndBangState;
1176             } else if (cc == '-') {
1177                 emitParseError();
1178                 m_token->appendToComment('-');
1179                 m_token->appendToComment(cc);
1180             } else {
1181                 emitParseError();
1182                 m_token->appendToComment('-');
1183                 m_token->appendToComment('-');
1184                 m_token->appendToComment(cc);
1185                 m_state = CommentState;
1186             }
1187             // FIXME: Handle EOF properly.
1188             break;
1189         }
1190         case CommentEndBangState: {
1191             if (cc == '-') {
1192                 m_token->appendToComment('-');
1193                 m_token->appendToComment('-');
1194                 m_token->appendToComment('!');
1195                 m_state = CommentEndDashState;
1196             } else if (cc == '>') {
1197                 emitCurrentToken();
1198                 m_state = DataState;
1199             } else {
1200                 m_token->appendToComment('-');
1201                 m_token->appendToComment('-');
1202                 m_token->appendToComment('!');
1203                 m_token->appendToComment(cc);
1204                 m_state = CommentState;
1205             }
1206             // FIXME: Handle EOF properly.
1207             break;
1208         }
1209         case CommentEndSpaceState: {
1210             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1211                 m_token->appendToComment(cc);
1212             else if (cc == '-')
1213                 m_state = CommentEndDashState;
1214             else if (cc == '>') {
1215                 emitCurrentToken();
1216                 m_state = DataState;
1217             } else {
1218                 m_token->appendToComment(cc);
1219                 m_state = CommentState;
1220             }
1221             // FIXME: Handle EOF properly.
1222             break;
1223         }
1224         case DOCTYPEState: {
1225             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1226                 m_state = BeforeDOCTYPENameState;
1227             else {
1228                 emitParseError();
1229                 m_state = BeforeDOCTYPENameState;
1230                 continue;
1231             }
1232             // FIXME: Handle EOF properly.
1233             break;
1234         }
1235         case BeforeDOCTYPENameState: {
1236             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1237                 break;
1238             else if (cc >= 'A' && cc <= 'Z') {
1239                 m_token->beginDOCTYPE(toLowerCase(cc));
1240                 m_state = DOCTYPENameState;
1241             } else if (cc == '>') {
1242                 emitParseError();
1243                 m_token->beginDOCTYPE();
1244                 notImplemented();
1245                 emitCurrentToken();
1246                 m_state = DataState;
1247             } else {
1248                 m_token->beginDOCTYPE(cc);
1249                 m_state = DOCTYPENameState;
1250             }
1251             // FIXME: Handle EOF properly.
1252             break;
1253         }
1254         case DOCTYPENameState: {
1255             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1256                 m_state = AfterDOCTYPENameState;
1257             else if (cc == '>') {
1258                 emitCurrentToken();
1259                 m_state = DataState;
1260             } else if (cc >= 'A' && cc <= 'Z')
1261                 m_token->appendToName(toLowerCase(cc));
1262             else
1263                 m_token->appendToName(cc);
1264             // FIXME: Handle EOF properly.
1265             break;
1266         }
1267         case AfterDOCTYPENameState: {
1268             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1269                 break;
1270             if (cc == '>') {
1271                 emitCurrentToken();
1272                 m_state = DataState;
1273             } else {
1274                 DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1275                 DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1276                 if (cc == 'P' || cc == 'p') {
1277                     SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1278                     if (result == SegmentedString::DidMatch) {
1279                         advanceStringAndASSERTIgnoringCase(source, "public");
1280                         m_state = AfterDOCTYPEPublicKeywordState;
1281                         continue;
1282                     } else if (result == SegmentedString::NotEnoughCharacters)
1283                         return shouldEmitBufferedCharacterToken(source);
1284                 } else if (cc == 'S' || cc == 's') {
1285                     SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1286                     if (result == SegmentedString::DidMatch) {
1287                         advanceStringAndASSERTIgnoringCase(source, "system");
1288                         m_state = AfterDOCTYPESystemKeywordState;
1289                         continue;
1290                     } else if (result == SegmentedString::NotEnoughCharacters)
1291                         return shouldEmitBufferedCharacterToken(source);
1292                 }
1293                 emitParseError();
1294                 notImplemented();
1295                 m_state = BogusDOCTYPEState;
1296             }
1297             // FIXME: Handle EOF properly.
1298             break;
1299         }
1300         case AfterDOCTYPEPublicKeywordState: {
1301             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1302                 m_state = BeforeDOCTYPEPublicIdentifierState;
1303             else if (cc == '"') {
1304                 emitParseError();
1305                 m_token->setPublicIdentifierToEmptyString();
1306                 m_state = DOCTYPEPublicIdentifierDoubleQuotedState;
1307             } else if (cc == '\'') {
1308                 emitParseError();
1309                 m_token->setPublicIdentifierToEmptyString();
1310                 m_state = DOCTYPEPublicIdentifierSingleQuotedState;
1311             } else if (cc == '>') {
1312                 emitParseError();
1313                 notImplemented();
1314                 emitCurrentToken();
1315                 m_state = DataState;
1316             } else {
1317                 emitParseError();
1318                 notImplemented();
1319                 m_state = BogusDOCTYPEState;
1320             }
1321             // FIXME: Handle EOF properly.
1322             break;
1323         }
1324         case BeforeDOCTYPEPublicIdentifierState: {
1325             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1326                 break;
1327             else if (cc == '"') {
1328                 m_token->setPublicIdentifierToEmptyString();
1329                 m_state = DOCTYPEPublicIdentifierDoubleQuotedState;
1330             } else if (cc == '\'') {
1331                 m_token->setPublicIdentifierToEmptyString();
1332                 m_state = DOCTYPEPublicIdentifierSingleQuotedState;
1333             } else if (cc == '>') {
1334                 emitParseError();
1335                 notImplemented();
1336                 emitCurrentToken();
1337                 m_state = DataState;
1338             } else {
1339                 emitParseError();
1340                 notImplemented();
1341                 m_state = BogusDOCTYPEState;
1342             }
1343             // FIXME: Handle EOF properly.
1344             break;
1345         }
1346         case DOCTYPEPublicIdentifierDoubleQuotedState: {
1347             if (cc == '"')
1348                 m_state = AfterDOCTYPEPublicIdentifierState;
1349             else if (cc == '>') {
1350                 emitParseError();
1351                 notImplemented();
1352                 emitCurrentToken();
1353                 m_state = DataState;
1354             } else
1355                 m_token->appendToPublicIdentifier(cc);
1356             // FIXME: Handle EOF properly.
1357             break;
1358         }
1359         case DOCTYPEPublicIdentifierSingleQuotedState: {
1360             if (cc == '\'')
1361                 m_state = AfterDOCTYPEPublicIdentifierState;
1362             else if (cc == '>') {
1363                 emitParseError();
1364                 notImplemented();
1365                 emitCurrentToken();
1366                 m_state = DataState;
1367             } else
1368                 m_token->appendToPublicIdentifier(cc);
1369             // FIXME: Handle EOF properly.
1370             break;
1371         }
1372         case AfterDOCTYPEPublicIdentifierState: {
1373             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1374                 m_state = BetweenDOCTYPEPublicAndSystemIdentifiersState;
1375             else if (cc == '>') {
1376                 emitCurrentToken();
1377                 m_state = DataState;
1378             } else if (cc == '"') {
1379                 emitParseError();
1380                 m_token->setPublicIdentifierToEmptyString();
1381                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1382             } else if (cc == '\'') {
1383                 emitParseError();
1384                 m_token->setPublicIdentifierToEmptyString();
1385                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1386             } else {
1387                 emitParseError();
1388                 notImplemented();
1389                 m_state = BogusDOCTYPEState;
1390             }
1391             // FIXME: Handle EOF properly.
1392             break;
1393         }
1394         case BetweenDOCTYPEPublicAndSystemIdentifiersState: {
1395             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1396                 m_state = BetweenDOCTYPEPublicAndSystemIdentifiersState;
1397             else if (cc == '>') {
1398                 emitCurrentToken();
1399                 m_state = DataState;
1400             } else if (cc == '"') {
1401                 m_token->setSystemIdentifierToEmptyString();
1402                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1403             } else if (cc == '\'') {
1404                 m_token->setSystemIdentifierToEmptyString();
1405                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1406             } else {
1407                 emitParseError();
1408                 notImplemented();
1409                 m_state = BogusDOCTYPEState;
1410             }
1411             // FIXME: Handle EOF properly.
1412             break;
1413         }
1414         case AfterDOCTYPESystemKeywordState: {
1415             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1416                 m_state = BeforeDOCTYPESystemIdentifierState;
1417             else if (cc == '"') {
1418                 emitParseError();
1419                 m_token->setSystemIdentifierToEmptyString();
1420                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1421             } else if (cc == '\'') {
1422                 emitParseError();
1423                 m_token->setSystemIdentifierToEmptyString();
1424                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1425             } else if (cc == '>') {
1426                 emitParseError();
1427                 notImplemented();
1428                 emitCurrentToken();
1429                 m_state = DataState;
1430             } else {
1431                 emitParseError();
1432                 notImplemented();
1433                 m_state = BogusDOCTYPEState;
1434             }
1435             // FIXME: Handle EOF properly.
1436             break;
1437         }
1438         case BeforeDOCTYPESystemIdentifierState: {
1439             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1440                 break;
1441             if (cc == '"') {
1442                 m_token->setSystemIdentifierToEmptyString();
1443                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1444             } else if (cc == '\'') {
1445                 m_token->setSystemIdentifierToEmptyString();
1446                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1447             } else if (cc == '>') {
1448                 emitParseError();
1449                 notImplemented();
1450                 emitCurrentToken();
1451                 m_state = DataState;
1452             } else {
1453                 emitParseError();
1454                 notImplemented();
1455                 m_state = BogusDOCTYPEState;
1456             }
1457             // FIXME: Handle EOF properly.
1458             break;
1459         }
1460         case DOCTYPESystemIdentifierDoubleQuotedState: {
1461             if (cc == '"')
1462                 m_state = AfterDOCTYPESystemIdentifierState;
1463             else if (cc == '>') {
1464                 emitParseError();
1465                 notImplemented();
1466                 emitCurrentToken();
1467                 m_state = DataState;
1468             } else
1469                 m_token->appendToSystemIdentifier(cc);
1470             // FIXME: Handle EOF properly.
1471             break;
1472         }
1473         case DOCTYPESystemIdentifierSingleQuotedState: {
1474             if (cc == '\'')
1475                 m_state = AfterDOCTYPESystemIdentifierState;
1476             else if (cc == '>') {
1477                 emitParseError();
1478                 notImplemented();
1479                 emitCurrentToken();
1480                 m_state = DataState;
1481             } else
1482                 m_token->appendToSystemIdentifier(cc);
1483             // FIXME: Handle EOF properly.
1484             break;
1485         }
1486         case AfterDOCTYPESystemIdentifierState: {
1487             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1488                 break;
1489             else if (cc == '>') {
1490                 emitCurrentToken();
1491                 m_state = DataState;
1492             } else {
1493                 emitParseError();
1494                 m_state = BogusDOCTYPEState;
1495             }
1496             // FIXME: Handle EOF properly.
1497             break;
1498         }
1499         case BogusDOCTYPEState: {
1500             if (cc == '>') {
1501                 emitCurrentToken();
1502                 m_state = DataState;
1503             }
1504             // FIXME: Handle EOF properly.
1505             break;
1506         }
1507         case CDATASectionState: {
1508             notImplemented();
1509             break;
1510         }
1511         }
1512         source.advance();
1513         if (m_emitPending) {
1514             m_emitPending = false;
1515             return true;
1516         }
1517     }
1518     // We've reached the end of the input stream.  If we have a character
1519     // token buffered, we should emit it.
1520     return shouldEmitBufferedCharacterToken(source);
1521 }
1522
1523 inline bool HTML5Lexer::temporaryBufferIs(const String& expectedString)
1524 {
1525     return vectorEqualsString(m_temporaryBuffer, expectedString);
1526 }
1527
1528 inline void HTML5Lexer::addToPossibleEndTag(UChar cc)
1529 {
1530     ASSERT(isEndTagBufferingState(m_state));
1531     m_bufferedEndTagName.append(cc);
1532 }
1533
1534 inline bool HTML5Lexer::isAppropriateEndTag()
1535 {
1536     return vectorEqualsString(m_bufferedEndTagName, m_appropriateEndTagName);
1537 }
1538
1539 inline void HTML5Lexer::emitCharacter(UChar character)
1540 {
1541     if (m_token->type() != HTML5Token::Character) {
1542         m_token->beginCharacter(character);
1543         return;
1544     }
1545     m_token->appendToCharacter(character);
1546 }
1547
1548 inline void HTML5Lexer::emitCodePoint(unsigned value)
1549 {
1550     if (value < 0xFFFF) {
1551         emitCharacter(value);
1552         return;
1553     }
1554     emitCharacter(U16_LEAD(value));
1555     emitCharacter(U16_TRAIL(value));
1556 }
1557
1558 inline void HTML5Lexer::emitParseError()
1559 {
1560     notImplemented();
1561 }
1562
1563 inline void HTML5Lexer::maybeFlushBufferedEndTag()
1564 {
1565     ASSERT(m_token->type() == HTML5Token::Character || m_token->type() == HTML5Token::Uninitialized);
1566     if (m_token->type() == HTML5Token::Character) {
1567         // We have a character token queued up.  We need to emit it before we
1568         // can start begin the buffered end tag token.
1569         emitCurrentToken();
1570         return;
1571     }
1572     flushBufferedEndTag();
1573 }
1574
1575 inline void HTML5Lexer::flushBufferedEndTag()
1576 {
1577     m_token->beginEndTag(m_bufferedEndTagName);
1578     m_bufferedEndTagName.clear();
1579     if (m_state == DataState)
1580         emitCurrentToken();
1581 }
1582
1583 inline void HTML5Lexer::emitCurrentToken()
1584 {
1585     ASSERT(m_token->type() != HTML5Token::Uninitialized);
1586     m_emitPending = true;
1587     if (m_token->type() == HTML5Token::StartTag)
1588         m_appropriateEndTagName = m_token->name();
1589 }
1590
1591 inline bool HTML5Lexer::shouldEmitBufferedCharacterToken(const SegmentedString& source)
1592 {
1593     return source.isClosed() && m_token->type() == HTML5Token::Character;
1594 }
1595
1596 }