098ae53dde4b47f9bef119516976c317d8a9d7fc
[WebKit-https.git] / WebCore / html / HTML5Lexer.cpp
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26  */
27
28 #include "config.h"
29 #include "HTML5Lexer.h"
30
31 #include "AtomicString.h"
32 #include "HTML5Token.h"
33 #include "HTMLNames.h"
34 #include "NotImplemented.h"
35 #include <wtf/CurrentTime.h>
36 #include <wtf/UnusedParam.h>
37 #include <wtf/text/CString.h>
38 #include <wtf/unicode/Unicode.h>
39
40
41 // Use __GNUC__ instead of PLATFORM(GCC) to stay consistent with the gperf generated c file
42 #ifdef __GNUC__
43 // The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
44 #include "HTMLEntityNames.c"
45 #else
46 // Not inlined for non-GCC compilers
// One record of the named-entity table, declared here for builds that do not
// include HTMLEntityNames.c directly.
struct Entity {
    // Entity name as looked up by findEntity().
    const char* name;
    // Value returned to the caller of consumeEntity() when this entity matches.
    int code;
};
// Looks up an entity by name. |str| need not be null-terminated; |len| gives
// its length. Callers in this file treat a null return as "no such entity".
const struct Entity* findEntity(register const char* str, register unsigned int len);
52 #endif
53
54 using namespace WTF;
55
56 namespace WebCore {
57
58 using namespace HTMLNames;
59
60 namespace {
61
// Replacement table consulted by adjustEntity() for numeric character
// references in the range 0x80-0x9F. The entry at index (value - 0x80) is the
// character substituted for |value|; the trailing comments give the source
// range covered by each row.
static const UChar windowsLatin1ExtensionArray[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
};
68
69 inline UChar toLowerCase(UChar cc)
70 {
71     ASSERT(cc >= 'A' && cc <= 'Z');
72     const int lowerCaseOffset = 0x20;
73     return cc + lowerCaseOffset;
74 }
75
76 inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
77 {
78     while (*expectedCharacters)
79         source.advanceAndASSERTIgnoringCase(*expectedCharacters++);
80 }
81
82 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
83 {
84     if (vector.size() != string.length())
85         return false;
86     const UChar* stringData = string.characters();
87     const UChar* vectorData = vector.data();
88     // FIXME: Is there a higher-level function we should be calling here?
89     return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
90 }
91
92 inline UChar adjustEntity(unsigned value)
93 {
94     if ((value & ~0x1F) != 0x0080)
95         return value;
96     return windowsLatin1ExtensionArray[value - 0x80];
97 }
98
99 inline unsigned legalEntityFor(unsigned value)
100 {
101     // FIXME: A number of specific entity values generate parse errors.
102     if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
103         return 0xFFFD;
104     if (value < 0xFFFF)
105         return adjustEntity(value);
106     return value;
107 }
108
109 inline bool isHexDigit(UChar cc)
110 {
111     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
112 }
113
114 inline bool isAlphaNumeric(UChar cc)
115 {
116     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
117 }
118
119 void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
120 {
121     if (consumedCharacters.size() == 1)
122         source.push(consumedCharacters[0]);
123     else if (consumedCharacters.size() == 2) {
124         source.push(consumedCharacters[0]);
125         source.push(consumedCharacters[1]);
126     } else
127         source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
128 }
129
130 inline bool isEndTagBufferingState(HTML5Lexer::State state)
131 {
132     return state == HTML5Lexer::RCDATAEndTagOpenState
133         || state == HTML5Lexer::RCDATAEndTagNameState
134         || state == HTML5Lexer::RAWTEXTEndTagOpenState
135         || state == HTML5Lexer::RAWTEXTEndTagNameState
136         || state == HTML5Lexer::ScriptDataEndTagOpenState
137         || state == HTML5Lexer::ScriptDataEndTagNameState
138         || state == HTML5Lexer::ScriptDataEscapedEndTagOpenState
139         || state == HTML5Lexer::ScriptDataEscapedEndTagNameState;
140 }
141
142 }
143
// Constructs a lexer and puts it in its initial state by calling reset().
HTML5Lexer::HTML5Lexer()
{
    reset();
}
148
// The destructor body is intentionally empty; it performs no explicit cleanup.
HTML5Lexer::~HTML5Lexer()
{
}
152
153 void HTML5Lexer::reset()
154 {
155     m_state = DataState;
156     m_token = 0;
157     m_skipLeadingNewLineForListing = false;
158     m_emitPending = false;
159     m_additionalAllowedCharacter = '\0';
160 }
161
162 unsigned HTML5Lexer::consumeEntity(SegmentedString& source, bool& notEnoughCharacters)
163 {
164     ASSERT(m_state != CharacterReferenceInAttributeValueState || m_additionalAllowedCharacter == '"' || m_additionalAllowedCharacter == '\'' || m_additionalAllowedCharacter == '>');
165     ASSERT(!notEnoughCharacters);
166
167     enum EntityState {
168         Initial,
169         NumberType,
170         MaybeHexLowerCaseX,
171         MaybeHexUpperCaseX,
172         Hex,
173         Decimal,
174         Named
175     };
176     EntityState entityState = Initial;
177     unsigned result = 0;
178     Vector<UChar, 10> consumedCharacters;
179     Vector<char, 10> entityName;
180
181     while (!source.isEmpty()) {
182         UChar cc = *source;
183         switch (entityState) {
184         case Initial: {
185             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
186                 return 0;
187             if (m_state == CharacterReferenceInAttributeValueState && cc == m_additionalAllowedCharacter)
188                 return 0;
189             if (cc == '#') {
190                 entityState = NumberType;
191                 break;
192             }
193             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
194                 entityState = Named;
195                 continue;
196             }
197             return 0;
198         }
199         case NumberType: {
200             if (cc == 'x') {
201                 entityState = MaybeHexLowerCaseX;
202                 break;
203             }
204             if (cc == 'X') {
205                 entityState = MaybeHexUpperCaseX;
206                 break;
207             }
208             if (cc >= '0' && cc <= '9') {
209                 entityState = Decimal;
210                 continue;
211             }
212             source.push('#');
213             return 0;
214         }
215         case MaybeHexLowerCaseX: {
216             if (isHexDigit(cc)) {
217                 entityState = Hex;
218                 continue;
219             }
220             source.push('#');
221             source.push('x');
222             return 0;
223         }
224         case MaybeHexUpperCaseX: {
225             if (isHexDigit(cc)) {
226                 entityState = Hex;
227                 continue;
228             }
229             source.push('#');
230             source.push('X');
231             return 0;
232         }
233         case Hex: {
234             if (cc >= '0' && cc <= '9')
235                 result = result * 16 + cc - '0';
236             else if (cc >= 'a' && cc <= 'f')
237                 result = result * 16 + 10 + cc - 'a';
238             else if (cc >= 'A' && cc <= 'F')
239                 result = result * 16 + 10 + cc - 'A';
240             else if (cc == ';') {
241                 source.advance();
242                 return legalEntityFor(result);
243             } else 
244                 return legalEntityFor(result);
245             break;
246         }
247         case Decimal: {
248             if (cc >= '0' && cc <= '9')
249                 result = result * 10 + cc - '0';
250             else if (cc == ';') {
251                 source.advance();
252                 return legalEntityFor(result);
253             } else
254                 return legalEntityFor(result);
255             break;
256         }
257         case Named: {
258             // FIXME: This code is wrong. We need to find the longest matching entity.
259             //        The examples from the spec are:
260             //            I'm &notit; I tell you
261             //            I'm &notin; I tell you
262             //        In the first case, "&not" is the entity.  In the second
263             //        case, "&notin;" is the entity.
264             // FIXME: Our list of HTML entities is incomplete.
265             // FIXME: The number 8 below is bogus.
266             while (!source.isEmpty() && entityName.size() <= 8) {
267                 cc = *source;
268                 if (cc == ';') {
269                     const Entity* entity = findEntity(entityName.data(), entityName.size());
270                     if (entity) {
271                         source.advanceAndASSERT(';');
272                         return entity->code;
273                     }
274                     emitParseError();
275                     break;
276                 }
277                 if (!isAlphaNumeric(cc)) {
278                     const Entity* entity = findEntity(entityName.data(), entityName.size());
279                     if (entity) {
280                         // HTML5 tells us to ignore this entity, for historical reasons,
281                         // if the lookhead character is '='.
282                         if (m_state == CharacterReferenceInAttributeValueState && cc == '=')
283                             break;
284                         emitParseError();
285                         return entity->code;
286                     }
287                     break;
288                 }
289                 entityName.append(cc);
290                 consumedCharacters.append(cc);
291                 source.advanceAndASSERT(cc);
292             }
293             notEnoughCharacters = source.isEmpty();
294             unconsumeCharacters(source, consumedCharacters);
295             return 0;
296         }
297         }
298         consumedCharacters.append(cc);
299         source.advanceAndASSERT(cc);
300     }
301     ASSERT(source.isEmpty());
302     notEnoughCharacters = true;
303     unconsumeCharacters(source, consumedCharacters);
304     return 0;
305 }
306
307 inline bool HTML5Lexer::processEntity(SegmentedString& source)
308 {
309     bool notEnoughCharacters = false;
310     unsigned value = consumeEntity(source, notEnoughCharacters);
311     if (notEnoughCharacters)
312         return false;
313     if (!value)
314         emitCharacter('&');
315     else
316         emitCodePoint(value);
317     return true;
318 }
319
320 bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
321 {
322     // If we have a token in progress, then we're supposed to be called back
323     // with the same token so we can finish it.
324     ASSERT(!m_token || m_token == &token || token.type() == HTML5Token::Uninitialized);
325     m_token = &token;
326
327     if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
328         // FIXME: This should call flushBufferedEndTag().
329         // We started an end tag during our last iteration.
330         m_token->beginEndTag(m_bufferedEndTagName);
331         m_bufferedEndTagName.clear();
332         if (m_state == DataState) {
333             // We're back in the data state, so we must be done with the tag.
334             return true;
335         }
336     }
337
338     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
339     if (m_skipLeadingNewLineForListing && m_state == DataState && !source.isEmpty() && *source == '\x0A')
340         source.advanceAndASSERT('\x0A');
341     m_skipLeadingNewLineForListing = false;
342
343     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
344     // FIXME: This while should stop as soon as we have a token to return.
345     while (!source.isEmpty()) {
346         UChar cc = *source;
347         switch (m_state) {
348         case DataState: {
349             if (cc == '&')
350                 m_state = CharacterReferenceInDataState;
351             else if (cc == '<') {
352                 if (m_token->type() == HTML5Token::Character) {
353                     // We have a bunch of character tokens queued up that we
354                     // are emitting lazily here.
355                     return true;
356                 }
357                 m_state = TagOpenState;
358             } else
359                 emitCharacter(cc);
360             break;
361         }
362         case CharacterReferenceInDataState: {
363             if (!processEntity(source))
364                 return shouldEmitBufferedCharacterToken(source);
365             m_state = DataState;
366             continue;
367         }
368         case RCDATAState: {
369             if (cc == '&')
370                 m_state = CharacterReferenceInRCDATAState;
371             else if (cc == '<')
372                 m_state = RCDATALessThanSignState;
373             else
374                 emitCharacter(cc);
375             break;
376         }
377         case CharacterReferenceInRCDATAState: {
378             if (!processEntity(source))
379                 return shouldEmitBufferedCharacterToken(source);
380             m_state = RCDATAState;
381             continue;
382         }
383         case RAWTEXTState: {
384             if (cc == '<')
385                 m_state = RAWTEXTLessThanSignState;
386             else
387                 emitCharacter(cc);
388             break;
389         }
390         case ScriptDataState: {
391             if (cc == '<')
392                 m_state = ScriptDataLessThanSignState;
393             else
394                 emitCharacter(cc);
395             break;
396         }
397         case PLAINTEXTState: {
398             emitCharacter(cc);
399             break;
400         }
401         case TagOpenState: {
402             if (cc == '!')
403                 m_state = MarkupDeclarationOpenState;
404             else if (cc == '/')
405                 m_state = EndTagOpenState;
406             else if (cc >= 'A' && cc <= 'Z') {
407                 m_token->beginStartTag(toLowerCase(cc));
408                 m_state = TagNameState;
409             } else if (cc >= 'a' && cc <= 'z') {
410                 m_token->beginStartTag(cc);
411                 m_state = TagNameState;
412             } else if (cc == '?') {
413                 emitParseError();
414                 m_state = BogusCommentState;
415                 // The spec consumes the current character before switching
416                 // to the bogus comment state, but it's easier to implement
417                 // if we reconsume the current character.
418                 continue;
419             } else {
420                 emitParseError();
421                 m_state = DataState;
422                 emitCharacter('<');
423                 continue;
424             }
425             break;
426         }
427         case EndTagOpenState: {
428             if (cc >= 'A' && cc <= 'Z') {
429                 m_token->beginEndTag(toLowerCase(cc));
430                 m_state = TagNameState;
431             } else if (cc >= 'a' && cc <= 'z') {
432                 m_token->beginEndTag(cc);
433                 m_state = TagNameState;
434             } else if (cc == '>') {
435                 emitParseError();
436                 m_state = DataState;
437             } else {
438                 emitParseError();
439                 m_state = DataState;
440             }
441             // FIXME: Handle EOF properly.
442             break;
443         }
444         case TagNameState: {
445             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
446                 m_state = BeforeAttributeNameState;
447             else if (cc == '/')
448                 m_state = SelfClosingStartTagState;
449             else if (cc == '>') {
450                 emitCurrentToken();
451                 m_state = DataState;
452             } else if (cc >= 'A' && cc <= 'Z')
453                 m_token->appendToName(toLowerCase(cc));
454             else
455                 m_token->appendToName(cc);
456             // FIXME: Handle EOF properly.
457             break;
458         }
459         case RCDATALessThanSignState: {
460             if (cc == '/') {
461                 m_temporaryBuffer.clear();
462                 ASSERT(m_bufferedEndTagName.isEmpty());
463                 m_state = RCDATAEndTagOpenState;
464             } else {
465                 emitCharacter('<');
466                 m_state = RCDATAState;
467                 continue;
468             }
469             break;
470         }
471         case RCDATAEndTagOpenState: {
472             if (cc >= 'A' && cc <= 'Z') {
473                 m_temporaryBuffer.append(cc);
474                 addToPossibleEndTag(toLowerCase(cc));
475                 m_state = RCDATAEndTagNameState;
476             } else if (cc >= 'a' && cc <= 'z') {
477                 m_temporaryBuffer.append(cc);
478                 addToPossibleEndTag(cc);
479                 m_state = RCDATAEndTagNameState;
480             } else {
481                 emitCharacter('<');
482                 emitCharacter('/');
483                 m_state = RCDATAState;
484                 continue;
485             }
486             break;
487         }
488         case RCDATAEndTagNameState: {
489             if (cc >= 'A' && cc <= 'Z') {
490                 m_temporaryBuffer.append(cc);
491                 addToPossibleEndTag(toLowerCase(cc));
492             } else if (cc >= 'a' && cc <= 'z') {
493                 m_temporaryBuffer.append(cc);
494                 addToPossibleEndTag(cc);
495             } else {
496                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
497                     if (isAppropriateEndTag()) {
498                         m_state = BeforeAttributeNameState;
499                         maybeFlushBufferedEndTag();
500                         break;
501                     }
502                 } else if (cc == '/') {
503                     if (isAppropriateEndTag()) {
504                         m_state = SelfClosingStartTagState;
505                         maybeFlushBufferedEndTag();
506                         break;
507                     }
508                 } else if (cc == '>') {
509                     if (isAppropriateEndTag()) {
510                         m_state = DataState;
511                         maybeFlushBufferedEndTag();
512                         break;
513                     }
514                 }
515                 emitCharacter('<');
516                 emitCharacter('/');
517                 m_token->appendToCharacter(m_temporaryBuffer);
518                 m_bufferedEndTagName.clear();
519                 m_state = RCDATAState;
520                 continue;
521             }
522             break;
523         }
524         case RAWTEXTLessThanSignState: {
525             if (cc == '/') {
526                 m_temporaryBuffer.clear();
527                 ASSERT(m_bufferedEndTagName.isEmpty());
528                 m_state = RAWTEXTEndTagOpenState;
529             } else {
530                 emitCharacter('<');
531                 m_state = RAWTEXTState;
532                 continue;
533             }
534             break;
535         }
536         case RAWTEXTEndTagOpenState: {
537             if (cc >= 'A' && cc <= 'Z') {
538                 m_temporaryBuffer.append(cc);
539                 addToPossibleEndTag(toLowerCase(cc));
540                 m_state = RAWTEXTEndTagNameState;
541             } else if (cc >= 'a' && cc <= 'z') {
542                 m_temporaryBuffer.append(cc);
543                 addToPossibleEndTag(cc);
544                 m_state = RAWTEXTEndTagNameState;
545             } else {
546                 emitCharacter('<');
547                 emitCharacter('/');
548                 m_state = RAWTEXTState;
549                 continue;
550             }
551             break;
552         }
553         case RAWTEXTEndTagNameState: {
554             if (cc >= 'A' && cc <= 'Z') {
555                 m_temporaryBuffer.append(cc);
556                 addToPossibleEndTag(toLowerCase(cc));
557             } else if (cc >= 'a' && cc <= 'z') {
558                 m_temporaryBuffer.append(cc);
559                 addToPossibleEndTag(cc);
560             } else {
561                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
562                     if (isAppropriateEndTag()) {
563                         m_state = BeforeAttributeNameState;
564                         maybeFlushBufferedEndTag();
565                         break;
566                     }
567                 } else if (cc == '/') {
568                     if (isAppropriateEndTag()) {
569                         m_state = SelfClosingStartTagState;
570                         maybeFlushBufferedEndTag();
571                         break;
572                     }
573                 } else if (cc == '>') {
574                     if (isAppropriateEndTag()) {
575                         m_state = DataState;
576                         maybeFlushBufferedEndTag();
577                         break;
578                     }
579                 }
580                 emitCharacter('<');
581                 emitCharacter('/');
582                 m_token->appendToCharacter(m_temporaryBuffer);
583                 m_bufferedEndTagName.clear();
584                 m_state = RAWTEXTState;
585                 continue;
586             }
587             break;
588         }
589         case ScriptDataLessThanSignState: {
590             if (cc == '/') {
591                 m_temporaryBuffer.clear();
592                 ASSERT(m_bufferedEndTagName.isEmpty());
593                 m_state = ScriptDataEndTagOpenState;
594             } else if (cc == '!') {
595                 emitCharacter('<');
596                 emitCharacter('!');
597                 m_state = ScriptDataEscapeStartState;
598             } else {
599                 emitCharacter('<');
600                 m_state = ScriptDataState;
601                 continue;
602             }
603             break;
604         }
605         case ScriptDataEndTagOpenState: {
606             if (cc >= 'A' && cc <= 'Z') {
607                 m_temporaryBuffer.append(cc);
608                 addToPossibleEndTag(toLowerCase(cc));
609                 m_state = ScriptDataEndTagNameState;
610             } else if (cc >= 'a' && cc <= 'z') {
611                 m_temporaryBuffer.append(cc);
612                 addToPossibleEndTag(cc);
613                 m_state = ScriptDataEndTagNameState;
614             } else {
615                 emitCharacter('<');
616                 emitCharacter('/');
617                 m_state = ScriptDataState;
618                 continue;
619             }
620             break;
621         }
622         case ScriptDataEndTagNameState: {
623             if (cc >= 'A' && cc <= 'Z') {
624                 m_temporaryBuffer.append(cc);
625                 addToPossibleEndTag(toLowerCase(cc));
626             } else if (cc >= 'a' && cc <= 'z') {
627                 m_temporaryBuffer.append(cc);
628                 addToPossibleEndTag(cc);
629             } else {
630                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
631                     if (isAppropriateEndTag()) {
632                         m_state = BeforeAttributeNameState;
633                         maybeFlushBufferedEndTag();
634                         break;
635                     }
636                 } else if (cc == '/') {
637                     if (isAppropriateEndTag()) {
638                         m_state = SelfClosingStartTagState;
639                         maybeFlushBufferedEndTag();
640                         break;
641                     }
642                 } else if (cc == '>') {
643                     if (isAppropriateEndTag()) {
644                         m_state = DataState;
645                         maybeFlushBufferedEndTag();
646                         break;
647                     }
648                 }
649                 emitCharacter('<');
650                 emitCharacter('/');
651                 m_token->appendToCharacter(m_temporaryBuffer);
652                 m_bufferedEndTagName.clear();
653                 m_state = ScriptDataState;
654                 continue;
655             }
656             break;
657         }
658         case ScriptDataEscapeStartState: {
659             if (cc == '-') {
660                 emitCharacter(cc);
661                 m_state = ScriptDataEscapeStartDashState;
662             } else {
663                 m_state = ScriptDataState;
664                 continue;
665             }
666             break;
667         }
668         case ScriptDataEscapeStartDashState: {
669             if (cc == '-') {
670                 emitCharacter(cc);
671                 m_state = ScriptDataEscapedDashDashState;
672             } else {
673                 m_state = ScriptDataState;
674                 continue;
675             }
676             break;
677         }
678         case ScriptDataEscapedState: {
679             if (cc == '-') {
680                 emitCharacter(cc);
681                 m_state = ScriptDataEscapedDashState;
682             } else if (cc == '<')
683                 m_state = ScriptDataEscapedLessThanSignState;
684             else
685                 emitCharacter(cc);
686             // FIXME: Handle EOF properly.
687             break;
688         }
689         case ScriptDataEscapedDashState: {
690             if (cc == '-') {
691                 emitCharacter(cc);
692                 m_state = ScriptDataEscapedDashDashState;
693             } else if (cc == '<')
694                 m_state = ScriptDataEscapedLessThanSignState;
695             else {
696                 emitCharacter(cc);
697                 m_state = ScriptDataEscapedState;
698             }
699             // FIXME: Handle EOF properly.
700             break;
701         }
702         case ScriptDataEscapedDashDashState: {
703             if (cc == '-')
704                 emitCharacter(cc);
705             else if (cc == '<')
706                 m_state = ScriptDataEscapedLessThanSignState;
707             else if (cc == '>') {
708                 emitCharacter(cc);
709                 m_state = ScriptDataState;
710             } else {
711                 emitCharacter(cc);
712                 m_state = ScriptDataEscapedState;
713             }
714             // FIXME: Handle EOF properly.
715             break;
716         }
717         case ScriptDataEscapedLessThanSignState: {
718             if (cc == '/') {
719                 m_temporaryBuffer.clear();
720                 ASSERT(m_bufferedEndTagName.isEmpty());
721                 m_state = ScriptDataEscapedEndTagOpenState;
722             } else if (cc >= 'A' && cc <= 'Z') {
723                 emitCharacter('<');
724                 emitCharacter(cc);
725                 m_temporaryBuffer.clear();
726                 m_temporaryBuffer.append(toLowerCase(cc));
727                 m_state = ScriptDataDoubleEscapeStartState;
728             } else if (cc >= 'a' && cc <= 'z') {
729                 emitCharacter('<');
730                 emitCharacter(cc);
731                 m_temporaryBuffer.clear();
732                 m_temporaryBuffer.append(cc);
733                 m_state = ScriptDataDoubleEscapeStartState;
734             } else {
735                 emitCharacter('<');
736                 m_state = ScriptDataEscapedState;
737                 continue;
738             }
739             break;
740         }
741         case ScriptDataEscapedEndTagOpenState: {
742             if (cc >= 'A' && cc <= 'Z') {
743                 m_temporaryBuffer.append(cc);
744                 addToPossibleEndTag(toLowerCase(cc));
745                 m_state = ScriptDataEscapedEndTagNameState;
746             } else if (cc >= 'a' && cc <= 'z') {
747                 m_temporaryBuffer.append(cc);
748                 addToPossibleEndTag(cc);
749                 m_state = ScriptDataEscapedEndTagNameState;
750             } else {
751                 emitCharacter('<');
752                 emitCharacter('/');
753                 m_state = ScriptDataEscapedState;
754                 continue;
755             }
756             break;
757         }
758         case ScriptDataEscapedEndTagNameState: {
759             if (cc >= 'A' && cc <= 'Z') {
760                 m_temporaryBuffer.append(cc);
761                 addToPossibleEndTag(toLowerCase(cc));
762             } else if (cc >= 'a' && cc <= 'z') {
763                 m_temporaryBuffer.append(cc);
764                 addToPossibleEndTag(cc);
765             } else {
766                 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
767                     if (isAppropriateEndTag()) {
768                         m_state = BeforeAttributeNameState;
769                         maybeFlushBufferedEndTag();
770                         break;
771                     }
772                 } else if (cc == '/') {
773                     if (isAppropriateEndTag()) {
774                         m_state = SelfClosingStartTagState;
775                         maybeFlushBufferedEndTag();
776                         break;
777                     }
778                 } else if (cc == '>') {
779                     if (isAppropriateEndTag()) {
780                         m_state = DataState;
781                         maybeFlushBufferedEndTag();
782                         break;
783                     }
784                 }
785                 emitCharacter('<');
786                 emitCharacter('/');
787                 m_token->appendToCharacter(m_temporaryBuffer);
788                 m_bufferedEndTagName.clear();
789                 m_state = ScriptDataEscapedState;
790                 continue;
791             }
792             break;
793         }
794         case ScriptDataDoubleEscapeStartState: {
795             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '/' || cc == '>') {
796                 emitCharacter(cc);
797                 if (temporaryBufferIs(scriptTag.localName()))
798                     m_state = ScriptDataDoubleEscapedState;
799                 else
800                     m_state = ScriptDataEscapedState;
801             } else if (cc >= 'A' && cc <= 'Z') {
802                 emitCharacter(cc);
803                 m_temporaryBuffer.append(toLowerCase(cc));
804             } else if (cc >= 'a' && cc <= 'z') {
805                 emitCharacter(cc);
806                 m_temporaryBuffer.append(cc);
807             } else {
808                 m_state = ScriptDataEscapedState;
809                 continue;
810             }
811             break;
812         }
813         case ScriptDataDoubleEscapedState: {
814             if (cc == '-') {
815                 emitCharacter(cc);
816                 m_state = ScriptDataDoubleEscapedDashState;
817             } else if (cc == '<') {
818                 emitCharacter(cc);
819                 m_state = ScriptDataDoubleEscapedLessThanSignState;
820             } else
821                 emitCharacter(cc);
822             // FIXME: Handle EOF properly.
823             break;
824         }
825         case ScriptDataDoubleEscapedDashState: {
826             if (cc == '-') {
827                 emitCharacter(cc);
828                 m_state = ScriptDataDoubleEscapedDashDashState;
829             } else if (cc == '<') {
830                 emitCharacter(cc);
831                 m_state = ScriptDataDoubleEscapedLessThanSignState;
832             } else {
833                 emitCharacter(cc);
834                 m_state = ScriptDataDoubleEscapedState;
835             }
836             // FIXME: Handle EOF properly.
837             break;
838         }
839         case ScriptDataDoubleEscapedDashDashState: {
840             if (cc == '-')
841                 emitCharacter(cc);
842             else if (cc == '<') {
843                 emitCharacter(cc);
844                 m_state = ScriptDataDoubleEscapedLessThanSignState;
845             } else if (cc == '>') {
846                 emitCharacter(cc);
847                 m_state = ScriptDataState;
848             } else {
849                 emitCharacter(cc);
850                 m_state = ScriptDataDoubleEscapedState;
851             }
852             // FIXME: Handle EOF properly.
853             break;
854         }
855         case ScriptDataDoubleEscapedLessThanSignState: {
856             if (cc == '/') {
857                 emitCharacter(cc);
858                 m_temporaryBuffer.clear();
859                 m_state = ScriptDataDoubleEscapeEndState;
860             } else {
861                 m_state = ScriptDataDoubleEscapedState;
862                 continue;
863             }
864             break;
865         }
866         case ScriptDataDoubleEscapeEndState: {
867             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '/' || cc == '>') {
868                 emitCharacter(cc);
869                 if (temporaryBufferIs(scriptTag.localName()))
870                     m_state = ScriptDataEscapedState;
871                 else
872                     m_state = ScriptDataDoubleEscapedState;
873             } else if (cc >= 'A' && cc <= 'Z') {
874                 emitCharacter(cc);
875                 m_temporaryBuffer.append(toLowerCase(cc));
876             } else if (cc >= 'a' && cc <= 'z') {
877                 emitCharacter(cc);
878                 m_temporaryBuffer.append(cc);
879             } else {
880                 m_state = ScriptDataDoubleEscapedState;
881                 continue;
882             }
883             break;
884         }
885         case BeforeAttributeNameState: {
886             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
887                 break;
888             else if (cc == '/')
889                 m_state = SelfClosingStartTagState;
890             else if (cc == '>') {
891                 emitCurrentToken();
892                 m_state = DataState;
893             } else if (cc >= 'A' && cc <= 'Z') {
894                 m_token->addNewAttribute();
895                 m_token->appendToAttributeName(toLowerCase(cc));
896                 m_state = AttributeNameState;
897             } else {
898                 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
899                     emitParseError();
900                 m_token->addNewAttribute();
901                 m_token->appendToAttributeName(cc);
902                 m_state = AttributeNameState;
903             }
904             // FIXME: Handle EOF properly.
905             break;
906         }
907         case AttributeNameState: {
908             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
909                 m_state = AfterAttributeNameState;
910             else if (cc == '/')
911                 m_state = SelfClosingStartTagState;
912             else if (cc == '=')
913                 m_state = BeforeAttributeValueState;
914             else if (cc == '>') {
915                 emitCurrentToken();
916                 m_state = DataState;
917             } else if (cc >= 'A' && cc <= 'Z')
918                 m_token->appendToAttributeName(toLowerCase(cc));
919             else {
920                 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
921                     emitParseError();
922                 m_token->appendToAttributeName(cc);
923                 m_state = AttributeNameState;
924             }
925             // FIXME: Handle EOF properly.
926             break;
927         }
928         case AfterAttributeNameState: {
929             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
930                 break;
931             else if (cc == '/')
932                 m_state = SelfClosingStartTagState;
933             else if (cc == '=')
934                 m_state = BeforeAttributeValueState;
935             else if (cc == '=') {
936                 emitCurrentToken();
937                 m_state = DataState;
938             } else if (cc >= 'A' && cc <= 'Z') {
939                 m_token->addNewAttribute();
940                 m_token->appendToAttributeName(toLowerCase(cc));
941                 m_state = AttributeNameState;
942             } else {
943                 if (cc == '"' || cc == '\'' || cc == '<')
944                     emitParseError();
945                 m_token->addNewAttribute();
946                 m_token->appendToAttributeName(cc);
947                 m_state = AttributeNameState;
948             }
949             // FIXME: Handle EOF properly.
950             break;
951         }
952         case BeforeAttributeValueState: {
953             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
954                 break;
955             else if (cc == '"')
956                 m_state = AttributeValueDoubleQuotedState;
957             else if (cc == '&') {
958                 m_state = AttributeValueUnquotedState;
959                 continue;
960             } else if (cc == '\'')
961                 m_state = AttributeValueSingleQuotedState;
962             else if (cc == '>') {
963                 emitParseError();
964                 emitCurrentToken();
965                 m_state = DataState;
966             } else {
967                 if (cc == '<' || cc == '=' || cc == '`')
968                     emitParseError();
969                 m_token->appendToAttributeValue(cc);
970                 m_state = AttributeValueUnquotedState;
971             }
972             break;
973         }
974         case AttributeValueDoubleQuotedState: {
975             if (cc == '"')
976                 m_state = AfterAttributeValueQuotedState;
977             else if (cc == '&') {
978                 m_state = CharacterReferenceInAttributeValueState;
979                 m_additionalAllowedCharacter = '"';
980             } else
981                 m_token->appendToAttributeValue(cc);
982             // FIXME: Handle EOF properly.
983             break;
984         }
985         case AttributeValueSingleQuotedState: {
986             if (cc == '\'')
987                 m_state = AfterAttributeValueQuotedState;
988             else if (cc == '&') {
989                 m_state = CharacterReferenceInAttributeValueState;
990                 m_additionalAllowedCharacter = '\'';
991             } else
992                 m_token->appendToAttributeValue(cc);
993             // FIXME: Handle EOF properly.
994             break;
995         }
996         case AttributeValueUnquotedState: {
997             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
998                 m_state = BeforeAttributeNameState;
999             else if (cc == '&') {
1000                 m_state = CharacterReferenceInAttributeValueState;
1001                 m_additionalAllowedCharacter = '>';
1002             } else if (cc == '>') {
1003                 emitCurrentToken();
1004                 m_state = DataState;
1005             } else {
1006                 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
1007                     emitParseError();
1008                 m_token->appendToAttributeValue(cc);
1009             }
1010             // FIXME: Handle EOF properly.
1011             break;
1012         }
1013         case CharacterReferenceInAttributeValueState: {
1014             bool notEnoughCharacters = false;
1015             unsigned value = consumeEntity(source, notEnoughCharacters);
1016             if (notEnoughCharacters)
1017                 return shouldEmitBufferedCharacterToken(source);
1018             if (!value)
1019                 m_token->appendToAttributeValue('&');
1020             else if (value < 0xFFFF)
1021                 m_token->appendToAttributeValue(value);
1022             else {
1023                 m_token->appendToAttributeValue(U16_LEAD(value));
1024                 m_token->appendToAttributeValue(U16_TRAIL(value));
1025             }
1026             // We're supposed to switch back to the attribute value state that
1027             // we were in when we were switched into this state.  Rather than
1028             // keeping track of this explicitly, we observe that the previous
1029             // state can be determined by m_additionalAllowedCharacter.
1030             if (m_additionalAllowedCharacter == '"')
1031                 m_state = AttributeValueDoubleQuotedState;
1032             else if (m_additionalAllowedCharacter == '\'')
1033                 m_state = AttributeValueSingleQuotedState;
1034             else if (m_additionalAllowedCharacter == '>')
1035                 m_state = AttributeValueUnquotedState;
1036             else
1037                 ASSERT_NOT_REACHED();
1038             continue;
1039         }
1040         case AfterAttributeValueQuotedState: {
1041             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1042                 m_state = BeforeAttributeNameState;
1043             else if (cc == '/')
1044                 m_state = SelfClosingStartTagState;
1045             else if (cc == '>') {
1046                 emitCurrentToken();
1047                 m_state = DataState;
1048             } else {
1049                 emitParseError();
1050                 m_state = BeforeAttributeNameState;
1051                 continue;
1052             }
1053             // FIXME: Handle EOF properly.
1054             break;
1055         }
1056         case SelfClosingStartTagState: {
1057             if (cc == '>') {
1058                 notImplemented();
1059                 emitCurrentToken();
1060                 m_state = DataState;
1061             } else {
1062                 emitParseError();
1063                 m_state = BeforeAttributeNameState;
1064                 continue;
1065             }
1066             // FIXME: Handle EOF properly.
1067             break;
1068         }
1069         case BogusCommentState: {
1070             m_token->beginComment();
1071             while (!source.isEmpty()) {
1072                 cc = *source;
1073                 if (cc == '>')
1074                     break;
1075                 m_token->appendToComment(cc);
1076                 source.advance();
1077             }
1078             emitCurrentToken();
1079             m_state = DataState;
1080             if (source.isEmpty())
1081                 return true;
1082             // FIXME: Handle EOF properly.
1083             break;
1084         }
1085         case MarkupDeclarationOpenState: {
1086             DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1087             DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1088             if (cc == '-') {
1089                 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1090                 if (result == SegmentedString::DidMatch) {
1091                     source.advanceAndASSERT('-');
1092                     source.advanceAndASSERT('-');
1093                     m_token->beginComment();
1094                     m_state = CommentStartState;
1095                     continue;
1096                 } else if (result == SegmentedString::NotEnoughCharacters)
1097                     return shouldEmitBufferedCharacterToken(source);
1098             } else if (cc == 'D' || cc == 'd') {
1099                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1100                 if (result == SegmentedString::DidMatch) {
1101                     advanceStringAndASSERTIgnoringCase(source, "doctype");
1102                     m_state = DOCTYPEState;
1103                     continue;
1104                 } else if (result == SegmentedString::NotEnoughCharacters)
1105                     return shouldEmitBufferedCharacterToken(source);
1106             }
1107             notImplemented();
1108             // FIXME: We're still missing the bits about the insertion mode being in foreign content:
1109             // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
1110             emitParseError();
1111             m_state = BogusCommentState;
1112             continue;
1113         }
1114         case CommentStartState: {
1115             if (cc == '-')
1116                 m_state = CommentStartDashState;
1117             else if (cc == '>') {
1118                 emitParseError();
1119                 emitCurrentToken();
1120                 m_state = DataState;
1121             } else {
1122                 m_token->appendToComment(cc);
1123                 m_state = CommentState;
1124             }
1125             // FIXME: Handle EOF properly.
1126             break;
1127         }
1128         case CommentStartDashState: {
1129             if (cc == '-')
1130                 m_state = CommentEndState;
1131             else if (cc == '>') {
1132                 emitParseError();
1133                 emitCurrentToken();
1134                 m_state = DataState;
1135             } else {
1136                 m_token->appendToComment('-');
1137                 m_token->appendToComment(cc);
1138                 m_state = CommentState;
1139             }
1140             // FIXME: Handle EOF properly.
1141             break;
1142         }
1143         case CommentState: {
1144             if (cc == '-')
1145                 m_state = CommentEndDashState;
1146             else
1147                 m_token->appendToComment(cc);
1148             // FIXME: Handle EOF properly.
1149             break;
1150         }
1151         case CommentEndDashState: {
1152             if (cc == '-')
1153                 m_state = CommentEndState;
1154             else {
1155                 m_token->appendToComment('-');
1156                 m_token->appendToComment(cc);
1157                 m_state = CommentState;
1158             }
1159             // FIXME: Handle EOF properly.
1160             break;
1161         }
1162         case CommentEndState: {
1163             if (cc == '>') {
1164                 emitCurrentToken();
1165                 m_state = DataState;
1166             } else if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
1167                 emitParseError();
1168                 m_token->appendToComment('-');
1169                 m_token->appendToComment('-');
1170                 m_token->appendToComment(cc);
1171                 m_state = CommentEndSpaceState;
1172             } else if (cc == '!') {
1173                 emitParseError();
1174                 m_state = CommentEndBangState;
1175             } else if (cc == '-') {
1176                 emitParseError();
1177                 m_token->appendToComment('-');
1178                 m_token->appendToComment(cc);
1179             } else {
1180                 emitParseError();
1181                 m_token->appendToComment('-');
1182                 m_token->appendToComment('-');
1183                 m_token->appendToComment(cc);
1184                 m_state = CommentState;
1185             }
1186             // FIXME: Handle EOF properly.
1187             break;
1188         }
1189         case CommentEndBangState: {
1190             if (cc == '-') {
1191                 m_token->appendToComment('-');
1192                 m_token->appendToComment('-');
1193                 m_token->appendToComment('!');
1194                 m_state = CommentEndDashState;
1195             } else if (cc == '>') {
1196                 emitCurrentToken();
1197                 m_state = DataState;
1198             } else {
1199                 m_token->appendToComment('-');
1200                 m_token->appendToComment('-');
1201                 m_token->appendToComment('!');
1202                 m_token->appendToComment(cc);
1203                 m_state = CommentState;
1204             }
1205             // FIXME: Handle EOF properly.
1206             break;
1207         }
1208         case CommentEndSpaceState: {
1209             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1210                 m_token->appendToComment(cc);
1211             else if (cc == '-')
1212                 m_state = CommentEndDashState;
1213             else if (cc == '>') {
1214                 emitCurrentToken();
1215                 m_state = DataState;
1216             } else {
1217                 m_token->appendToComment(cc);
1218                 m_state = CommentState;
1219             }
1220             // FIXME: Handle EOF properly.
1221             break;
1222         }
1223         case DOCTYPEState: {
1224             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1225                 m_state = BeforeDOCTYPENameState;
1226             else {
1227                 emitParseError();
1228                 m_state = BeforeDOCTYPENameState;
1229                 continue;
1230             }
1231             // FIXME: Handle EOF properly.
1232             break;
1233         }
1234         case BeforeDOCTYPENameState: {
1235             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1236                 break;
1237             else if (cc >= 'A' && cc <= 'Z') {
1238                 m_token->beginDOCTYPE(toLowerCase(cc));
1239                 m_state = DOCTYPENameState;
1240             } else if (cc == '>') {
1241                 emitParseError();
1242                 m_token->beginDOCTYPE();
1243                 notImplemented();
1244                 emitCurrentToken();
1245                 m_state = DataState;
1246             } else {
1247                 m_token->beginDOCTYPE(cc);
1248                 m_state = DOCTYPENameState;
1249             }
1250             // FIXME: Handle EOF properly.
1251             break;
1252         }
1253         case DOCTYPENameState: {
1254             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1255                 m_state = AfterDOCTYPENameState;
1256             else if (cc == '>') {
1257                 emitCurrentToken();
1258                 m_state = DataState;
1259             } else if (cc >= 'A' && cc <= 'Z')
1260                 m_token->appendToName(toLowerCase(cc));
1261             else
1262                 m_token->appendToName(cc);
1263             // FIXME: Handle EOF properly.
1264             break;
1265         }
1266         case AfterDOCTYPENameState: {
1267             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1268                 break;
1269             if (cc == '>') {
1270                 emitCurrentToken();
1271                 m_state = DataState;
1272             } else {
1273                 DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1274                 DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1275                 if (cc == 'P' || cc == 'p') {
1276                     SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1277                     if (result == SegmentedString::DidMatch) {
1278                         advanceStringAndASSERTIgnoringCase(source, "public");
1279                         m_state = AfterDOCTYPEPublicKeywordState;
1280                         continue;
1281                     } else if (result == SegmentedString::NotEnoughCharacters)
1282                         return shouldEmitBufferedCharacterToken(source);
1283                 } else if (cc == 'S' || cc == 's') {
1284                     SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1285                     if (result == SegmentedString::DidMatch) {
1286                         advanceStringAndASSERTIgnoringCase(source, "system");
1287                         m_state = AfterDOCTYPESystemKeywordState;
1288                         continue;
1289                     } else if (result == SegmentedString::NotEnoughCharacters)
1290                         return shouldEmitBufferedCharacterToken(source);
1291                 }
1292                 emitParseError();
1293                 notImplemented();
1294                 m_state = BogusDOCTYPEState;
1295             }
1296             // FIXME: Handle EOF properly.
1297             break;
1298         }
1299         case AfterDOCTYPEPublicKeywordState: {
1300             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1301                 m_state = BeforeDOCTYPEPublicIdentifierState;
1302             else if (cc == '"') {
1303                 emitParseError();
1304                 m_token->setPublicIdentifierToEmptyString();
1305                 m_state = DOCTYPEPublicIdentifierDoubleQuotedState;
1306             } else if (cc == '\'') {
1307                 emitParseError();
1308                 m_token->setPublicIdentifierToEmptyString();
1309                 m_state = DOCTYPEPublicIdentifierSingleQuotedState;
1310             } else if (cc == '>') {
1311                 emitParseError();
1312                 notImplemented();
1313                 emitCurrentToken();
1314                 m_state = DataState;
1315             } else {
1316                 emitParseError();
1317                 notImplemented();
1318                 m_state = BogusDOCTYPEState;
1319             }
1320             // FIXME: Handle EOF properly.
1321             break;
1322         }
1323         case BeforeDOCTYPEPublicIdentifierState: {
1324             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1325                 break;
1326             else if (cc == '"') {
1327                 m_token->setPublicIdentifierToEmptyString();
1328                 m_state = DOCTYPEPublicIdentifierDoubleQuotedState;
1329             } else if (cc == '\'') {
1330                 m_token->setPublicIdentifierToEmptyString();
1331                 m_state = DOCTYPEPublicIdentifierSingleQuotedState;
1332             } else if (cc == '>') {
1333                 emitParseError();
1334                 notImplemented();
1335                 emitCurrentToken();
1336                 m_state = DataState;
1337             } else {
1338                 emitParseError();
1339                 notImplemented();
1340                 m_state = BogusDOCTYPEState;
1341             }
1342             // FIXME: Handle EOF properly.
1343             break;
1344         }
1345         case DOCTYPEPublicIdentifierDoubleQuotedState: {
1346             if (cc == '"')
1347                 m_state = AfterDOCTYPEPublicIdentifierState;
1348             else if (cc == '>') {
1349                 emitParseError();
1350                 notImplemented();
1351                 emitCurrentToken();
1352                 m_state = DataState;
1353             } else
1354                 m_token->appendToPublicIdentifier(cc);
1355             // FIXME: Handle EOF properly.
1356             break;
1357         }
1358         case DOCTYPEPublicIdentifierSingleQuotedState: {
1359             if (cc == '\'')
1360                 m_state = AfterDOCTYPEPublicIdentifierState;
1361             else if (cc == '>') {
1362                 emitParseError();
1363                 notImplemented();
1364                 emitCurrentToken();
1365                 m_state = DataState;
1366             } else
1367                 m_token->appendToPublicIdentifier(cc);
1368             // FIXME: Handle EOF properly.
1369             break;
1370         }
1371         case AfterDOCTYPEPublicIdentifierState: {
1372             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1373                 m_state = BetweenDOCTYPEPublicAndSystemIdentifiersState;
1374             else if (cc == '>') {
1375                 emitCurrentToken();
1376                 m_state = DataState;
1377             } else if (cc == '"') {
1378                 emitParseError();
1379                 m_token->setPublicIdentifierToEmptyString();
1380                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1381             } else if (cc == '\'') {
1382                 emitParseError();
1383                 m_token->setPublicIdentifierToEmptyString();
1384                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1385             } else {
1386                 emitParseError();
1387                 notImplemented();
1388                 m_state = BogusDOCTYPEState;
1389             }
1390             // FIXME: Handle EOF properly.
1391             break;
1392         }
1393         case BetweenDOCTYPEPublicAndSystemIdentifiersState: {
1394             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1395                 m_state = BetweenDOCTYPEPublicAndSystemIdentifiersState;
1396             else if (cc == '>') {
1397                 emitCurrentToken();
1398                 m_state = DataState;
1399             } else if (cc == '"') {
1400                 m_token->setSystemIdentifierToEmptyString();
1401                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1402             } else if (cc == '\'') {
1403                 m_token->setSystemIdentifierToEmptyString();
1404                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1405             } else {
1406                 emitParseError();
1407                 notImplemented();
1408                 m_state = BogusDOCTYPEState;
1409             }
1410             // FIXME: Handle EOF properly.
1411             break;
1412         }
1413         case AfterDOCTYPESystemKeywordState: {
1414             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1415                 m_state = BeforeDOCTYPESystemIdentifierState;
1416             else if (cc == '"') {
1417                 emitParseError();
1418                 m_token->setSystemIdentifierToEmptyString();
1419                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1420             } else if (cc == '\'') {
1421                 emitParseError();
1422                 m_token->setSystemIdentifierToEmptyString();
1423                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1424             } else if (cc == '>') {
1425                 emitParseError();
1426                 notImplemented();
1427                 emitCurrentToken();
1428                 m_state = DataState;
1429             } else {
1430                 emitParseError();
1431                 notImplemented();
1432                 m_state = BogusDOCTYPEState;
1433             }
1434             // FIXME: Handle EOF properly.
1435             break;
1436         }
1437         case BeforeDOCTYPESystemIdentifierState: {
1438             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1439                 break;
1440             if (cc == '"') {
1441                 m_token->setSystemIdentifierToEmptyString();
1442                 m_state = DOCTYPESystemIdentifierDoubleQuotedState;
1443             } else if (cc == '\'') {
1444                 m_token->setSystemIdentifierToEmptyString();
1445                 m_state = DOCTYPESystemIdentifierSingleQuotedState;
1446             } else if (cc == '>') {
1447                 emitParseError();
1448                 notImplemented();
1449                 emitCurrentToken();
1450                 m_state = DataState;
1451             } else {
1452                 emitParseError();
1453                 notImplemented();
1454                 m_state = BogusDOCTYPEState;
1455             }
1456             // FIXME: Handle EOF properly.
1457             break;
1458         }
1459         case DOCTYPESystemIdentifierDoubleQuotedState: {
1460             if (cc == '"')
1461                 m_state = AfterDOCTYPESystemIdentifierState;
1462             else if (cc == '>') {
1463                 emitParseError();
1464                 notImplemented();
1465                 emitCurrentToken();
1466                 m_state = DataState;
1467             } else
1468                 m_token->appendToSystemIdentifier(cc);
1469             // FIXME: Handle EOF properly.
1470             break;
1471         }
1472         case DOCTYPESystemIdentifierSingleQuotedState: {
1473             if (cc == '\'')
1474                 m_state = AfterDOCTYPESystemIdentifierState;
1475             else if (cc == '>') {
1476                 emitParseError();
1477                 notImplemented();
1478                 emitCurrentToken();
1479                 m_state = DataState;
1480             } else
1481                 m_token->appendToSystemIdentifier(cc);
1482             // FIXME: Handle EOF properly.
1483             break;
1484         }
1485         case AfterDOCTYPESystemIdentifierState: {
1486             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
1487                 break;
1488             else if (cc == '>') {
1489                 emitCurrentToken();
1490                 m_state = DataState;
1491             } else {
1492                 emitParseError();
1493                 m_state = BogusDOCTYPEState;
1494             }
1495             // FIXME: Handle EOF properly.
1496             break;
1497         }
1498         case BogusDOCTYPEState: {
1499             if (cc == '>') {
1500                 emitCurrentToken();
1501                 m_state = DataState;
1502             }
1503             // FIXME: Handle EOF properly.
1504             break;
1505         }
1506         case CDATASectionState: {
1507             notImplemented();
1508             break;
1509         }
1510         }
1511         source.advance();
1512         if (m_emitPending) {
1513             m_emitPending = false;
1514             return true;
1515         }
1516     }
1517     // We've reached the end of the input stream.  If we have a character
1518     // token buffered, we should emit it.
1519     return shouldEmitBufferedCharacterToken(source);
1520 }
1521
1522 inline bool HTML5Lexer::temporaryBufferIs(const String& expectedString)
1523 {
1524     return vectorEqualsString(m_temporaryBuffer, expectedString);
1525 }
1526
1527 inline void HTML5Lexer::addToPossibleEndTag(UChar cc)
1528 {
1529     ASSERT(isEndTagBufferingState(m_state));
1530     m_bufferedEndTagName.append(cc);
1531 }
1532
1533 inline bool HTML5Lexer::isAppropriateEndTag()
1534 {
1535     return vectorEqualsString(m_bufferedEndTagName, m_appropriateEndTagName);
1536 }
1537
1538 inline void HTML5Lexer::emitCharacter(UChar character)
1539 {
1540     if (m_token->type() != HTML5Token::Character) {
1541         m_token->beginCharacter(character);
1542         return;
1543     }
1544     m_token->appendToCharacter(character);
1545 }
1546
1547 inline void HTML5Lexer::emitCodePoint(unsigned value)
1548 {
1549     if (value < 0xFFFF) {
1550         emitCharacter(value);
1551         return;
1552     }
1553     emitCharacter(U16_LEAD(value));
1554     emitCharacter(U16_TRAIL(value));
1555 }
1556
1557 inline void HTML5Lexer::emitParseError()
1558 {
1559     notImplemented();
1560 }
1561
1562 inline void HTML5Lexer::maybeFlushBufferedEndTag()
1563 {
1564     ASSERT(m_token->type() == HTML5Token::Character || m_token->type() == HTML5Token::Uninitialized);
1565     if (m_token->type() == HTML5Token::Character) {
1566         // We have a character token queued up.  We need to emit it before we
1567         // can start begin the buffered end tag token.
1568         emitCurrentToken();
1569         return;
1570     }
1571     flushBufferedEndTag();
1572 }
1573
1574 inline void HTML5Lexer::flushBufferedEndTag()
1575 {
1576     m_token->beginEndTag(m_bufferedEndTagName);
1577     m_bufferedEndTagName.clear();
1578     if (m_state == DataState)
1579         emitCurrentToken();
1580 }
1581
1582 inline void HTML5Lexer::emitCurrentToken()
1583 {
1584     ASSERT(m_token->type() != HTML5Token::Uninitialized);
1585     m_emitPending = true;
1586     if (m_token->type() == HTML5Token::StartTag)
1587         m_appropriateEndTagName = m_token->name();
1588 }
1589
1590 inline bool HTML5Lexer::shouldEmitBufferedCharacterToken(const SegmentedString& source)
1591 {
1592     return source.isClosed() && m_token->type() == HTML5Token::Character;
1593 }
1594
1595 }