Fix for bug 12751, doctype nodes aren't part of the Document (Acid3).
[WebKit-https.git] / WebCore / html / HTMLTokenizer.cpp
1 /*
2     Copyright (C) 1997 Martin Jones (mjones@kde.org)
3               (C) 1997 Torben Weis (weis@kde.org)
4               (C) 1998 Waldo Bastian (bastian@kde.org)
5               (C) 1999 Lars Knoll (knoll@kde.org)
6               (C) 1999 Antti Koivisto (koivisto@kde.org)
7               (C) 2001 Dirk Mueller (mueller@kde.org)
8     Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
9     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
10
11     This library is free software; you can redistribute it and/or
12     modify it under the terms of the GNU Library General Public
13     License as published by the Free Software Foundation; either
14     version 2 of the License, or (at your option) any later version.
15
16     This library is distributed in the hope that it will be useful,
17     but WITHOUT ANY WARRANTY; without even the implied warranty of
18     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19     Library General Public License for more details.
20
21     You should have received a copy of the GNU Library General Public License
22     along with this library; see the file COPYING.LIB.  If not, write to
23     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24     Boston, MA 02110-1301, USA.
25 */
26
27 #include "config.h"
28 #include "HTMLTokenizer.h"
29
30 #include "CSSHelper.h"
31 #include "Cache.h"
32 #include "CachedScript.h"
33 #include "DocLoader.h"
34 #include "DocumentFragment.h"
35 #include "EventNames.h"
36 #include "Frame.h"
37 #include "FrameLoader.h"
38 #include "FrameView.h"
39 #include "HTMLElement.h"
40 #include "HTMLNames.h"
41 #include "HTMLParser.h"
42 #include "HTMLScriptElement.h"
43 #include "HTMLViewSourceDocument.h"
44 #include "Settings.h"
45 #include "SystemTime.h"
46 #include "kjs_proxy.h"
47 #include <wtf/ASCIICType.h>
48
49 #include "HTMLEntityNames.c"
50
51 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
52
53 #if MOBILE
54 // The mobile device needs to be responsive, so the tokenizer chunk size is reduced.
55 // This value defines how many characters the tokenizer will process before
56 // yielding control.
57 #define TOKENIZER_CHUNK_SIZE  256
58 #else
59 #define TOKENIZER_CHUNK_SIZE  4096
60 #endif
61
62 using namespace std;
63 using namespace WTF;
64
65 namespace WebCore {
66
67 using namespace HTMLNames;
68 using namespace EventNames;
69
70 #if MOBILE
71 // As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise
72 // it will take way too long to load a page.
73 const double tokenizerTimeDelay = 0.300;
74
75 #else
76 // FIXME: We would like this constant to be 200ms.
77 // Yielding more aggressively results in increased responsiveness and better incremental rendering.
78 // It slows down overall page-load on slower machines, though, so for now we set a value of 500.
79 const double tokenizerTimeDelay = 0.500;
80 #endif
81
// Lowercase ASCII marker strings recognized by the tokenizer. The *End strings are the
// end-tag prefixes of the elements whose content parseSpecial() collects verbatim.
// NOTE(review): presumably searchStopper is pointed at one of the *End strings and compared
// with tagMatch() (which also accepts the upper-case form) -- confirm where searchStopper is set.
82 static const char commentStart [] = "<!--";
83 static const char doctypeStart [] = "<!doctype";
84 static const char publicStart [] = "public";
85 static const char systemStart [] = "system";
86 static const char scriptEnd [] = "</script";
87 static const char xmpEnd [] = "</xmp";
88 static const char styleEnd [] =  "</style";
89 static const char textareaEnd [] = "</textarea";
90 static const char titleEnd [] = "</title";
91 static const char iframeEnd [] = "</iframe";
92
93 // Full support for MS Windows extensions to Latin-1.
94 // Technically these extensions should only be activated for pages
95 // marked "windows-1252" or "cp1252", but
96 // in the standard Microsoft way, these extensions infect hundreds of thousands
97 // of web pages.  Note that people with non-latin-1 Microsoft extensions
98 // are SOL.
99 //
100 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
101 //      http://www.bbsinc.com/iso8859.html
102 //      http://www.obviously.com/
103 //
104 // There may be better equivalents
105
106 // We only need this for entities. For non-entity text, we handle this in the text encoding.
107
// Replacement table for code points 0x80-0x9F, indexed by (code point - 0x80); see fixUpChar().
// Each row comment gives the range of source code points the row covers.
108 static const UChar windowsLatin1ExtensionArray[32] = {
109     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
110     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
111     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
112     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
113 };
114
115 static inline UChar fixUpChar(UChar c)
116 {
117     if ((c & ~0x1F) != 0x0080)
118         return c;
119     return windowsLatin1ExtensionArray[c - 0x80];
120 }
121
122 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
123 {
124     for (unsigned i = 0; i != length; ++i) {
125         unsigned char c1 = s1[i];
126         unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
127         UChar c2 = s2[i];
128         if (c1 != c2 && uc1 != c2)
129             return false;
130     }
131     return true;
132 }
133
134 inline void Token::addAttribute(Document* doc, AtomicString& attrName, const AtomicString& v, bool viewSourceMode)
135 {
136     if (!attrName.isEmpty()) {
137         ASSERT(!attrName.contains('/'));
138         RefPtr<MappedAttribute> a = new MappedAttribute(attrName, v);
139         if (!attrs)
140             attrs = new NamedMappedAttrMap(0);
141         attrs->insertAttribute(a.release(), viewSourceMode);
142     }
143     
144     attrName = emptyAtom;
145 }
146
147 // ----------------------------------------------------------------------------
148
// Constructs the tokenizer for an HTMLDocument: all buffers start out null/empty, an
// HTMLParser is created for |doc| (forwarding |reportErrors|), m_fragment is false, and
// begin() then puts the tokenizer into its initial state.
149 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
150     : Tokenizer()
151     , buffer(0)
152     , scriptCode(0)
153     , scriptCodeSize(0)
154     , scriptCodeMaxSize(0)
155     , scriptCodeResync(0)
156     , m_executingScript(0)
157     , m_requestingScript(false)
158     , m_hasScriptsWaitingForStylesheets(false)
159     , m_timer(this, &HTMLTokenizer::timerFired)
160     , m_doc(doc)
161     , parser(new HTMLParser(doc, reportErrors))
162     , inWrite(false)
163     , m_fragment(false)
164 {
165     begin();
166 }
167
// Constructs the tokenizer for an HTMLViewSourceDocument. Unlike the HTMLDocument
// constructor it passes true to the Tokenizer base constructor and creates no HTMLParser
// (parser is 0), so code that dereferences parser must not run in this configuration.
// NOTE(review): the `true` base argument presumably enables view-source mode -- confirm in Tokenizer.h.
168 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
169     : Tokenizer(true)
170     , buffer(0)
171     , scriptCode(0)
172     , scriptCodeSize(0)
173     , scriptCodeMaxSize(0)
174     , scriptCodeResync(0)
175     , m_executingScript(0)
176     , m_requestingScript(false)
177     , m_hasScriptsWaitingForStylesheets(false)
178     , m_timer(this, &HTMLTokenizer::timerFired)
179     , m_doc(doc)
180     , parser(0)
181     , inWrite(false)
182     , m_fragment(false)
183 {
184     begin();
185 }
186
187 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)
188     : buffer(0)
189     , scriptCode(0)
190     , scriptCodeSize(0)
191     , scriptCodeMaxSize(0)
192     , scriptCodeResync(0)
193     , m_executingScript(0)
194     , m_requestingScript(false)
195     , m_hasScriptsWaitingForStylesheets(false)
196     , m_timer(this, &HTMLTokenizer::timerFired)
197     , m_doc(frag->document())
198     , inWrite(false)
199     , m_fragment(true)
200 {
201     parser = new HTMLParser(frag);
202     begin();
203 }
204
205 void HTMLTokenizer::reset()
206 {
207     ASSERT(m_executingScript == 0);
208
209     while (!pendingScripts.isEmpty()) {
210       CachedScript *cs = pendingScripts.dequeue();
211       ASSERT(cache()->disabled() || cs->accessCount() > 0);
212       cs->deref(this);
213     }
214     
215     fastFree(buffer);
216     buffer = dest = 0;
217     size = 0;
218
219     fastFree(scriptCode);
220     scriptCode = 0;
221     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
222
223     m_timer.stop();
224     m_state.setAllowYield(false);
225     m_state.setForceSynchronous(false);
226
227     currToken.reset();
228     m_doctypeToken.reset();
229     m_doctypeSearchCount = 0;
230     m_doctypeSecondarySearchCount = 0;
231 }
232
233 void HTMLTokenizer::begin()
234 {
235     m_executingScript = 0;
236     m_requestingScript = false;
237     m_hasScriptsWaitingForStylesheets = false;
238     m_state.setLoadingExtScript(false);
239     reset();
240     size = 254;
241     buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
242     dest = buffer;
243     tquote = NoQuote;
244     searchCount = 0;
245     m_state.setEntityState(NoEntity);
246     scriptSrc = String();
247     pendingSrc.clear();
248     currentPrependingSrc = 0;
249     noMoreData = false;
250     brokenComments = false;
251     brokenServer = false;
252     m_lineNumber = 0;
253     scriptStartLineno = 0;
254     tagStartLineno = 0;
255     m_state.setForceSynchronous(false);
256 }
257
// Stores |force| in the tokenizer state's forceSynchronous flag. Note that reset() and
// begin() both clear this flag again.
258 void HTMLTokenizer::setForceSynchronous(bool force)
259 {
260     m_state.setForceSynchronous(force);
261 }
262
263 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
264 {
265     // This function adds the listing 'list' as
266     // preformatted text-tokens to the token-collection
267     while (!list.isEmpty()) {
268         if (state.skipLF()) {
269             state.setSkipLF(false);
270             if (*list == '\n') {
271                 list.advance();
272                 continue;
273             }
274         }
275
276         checkBuffer();
277
278         if (*list == '\n' || *list == '\r') {
279             if (state.discardLF())
280                 // Ignore this LF
281                 state.setDiscardLF(false); // We have discarded 1 LF
282             else
283                 *dest++ = '\n';
284
285             /* Check for MS-DOS CRLF sequence */
286             if (*list == '\r')
287                 state.setSkipLF(true);
288
289             list.advance();
290         } else {
291             state.setDiscardLF(false);
292             *dest++ = *list;
293             list.advance();
294         }
295     }
296
297     return state;
298 }
299
// Consumes the raw content of a "special" element (exactly one of xmp, textarea, title,
// style, script or iframe must be active in |state|) from |src|, accumulating it in
// scriptCode until the end tag named by searchStopper has been read up to its '>'.
// At that point script content is handed to scriptHandler(); any other content is emitted
// through processListing()/processToken() followed by a synthesized end-tag token, and the
// in-element flags are cleared. Returns the updated state; if |src| runs out first, the
// partial content stays in scriptCode and the function returns with the flags still set.
300 HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString &src, State state)
301 {
302     ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());
303     ASSERT(!state.hasTagState());
304     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1 );
305     if (state.inScript())
306         scriptStartLineno = m_lineNumber;
307
    // A comment was already open on entry; let parseComment() continue with it first.
308     if (state.inComment()) 
309         state = parseComment(src, state);
310
311     while ( !src.isEmpty() ) {
312         checkScriptBuffer();
313         UChar ch = *src;
314
        // scriptCode ends in "<!-" and the next character is '-': a comment starts here
        // (not recognized inside textarea/xmp, while skipping an end tag, or when brokenComments is set).
315         if (!scriptCodeResync && !brokenComments && !state.inTextArea() && !state.inXmp() && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-') {
316             state.setInComment(true);
317             state = parseComment(src, state);
318             continue;
319         }
        // The end-tag name was already matched (scriptCodeResync != 0) and this is an
        // unquoted '>': the end tag is complete.
320         if (scriptCodeResync && !tquote && ch == '>') {
321             src.advancePastNonNewline();
            // Truncate scriptCode to the content that preceded the end tag.
322             scriptCodeSize = scriptCodeResync-1;
323             scriptCodeResync = 0;
324             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
325             if (state.inScript())
326                 state = scriptHandler(state);
327             else {
                // Emit the collected text, then a synthesized end tag for the element we were in.
328                 state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
329                 processToken();
330                 if (state.inStyle()) { 
331                     currToken.tagName = styleTag.localName(); 
332                     currToken.beginTag = false; 
333                 } else if (state.inTextArea()) { 
334                     currToken.tagName = textareaTag.localName(); 
335                     currToken.beginTag = false; 
336                 } else if (state.inTitle()) { 
337                     currToken.tagName = titleTag.localName(); 
338                     currToken.beginTag = false; 
339                 } else if (state.inXmp()) {
340                     currToken.tagName = xmpTag.localName(); 
341                     currToken.beginTag = false; 
342                 } else if (state.inIFrame()) {
343                     currToken.tagName = iframeTag.localName();
344                     currToken.beginTag = false;
345                 }
346                 processToken();
347                 state.setInStyle(false);
348                 state.setInScript(false);
349                 state.setInTextArea(false);
350                 state.setInTitle(false);
351                 state.setInXmp(false);
352                 state.setInIFrame(false);
353                 tquote = NoQuote;
354                 scriptCodeSize = scriptCodeResync = 0;
355             }
356             return state;
357         }
358         // possible end of tagname, lets check.
359         if (!scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
360              scriptCodeSize >= searchStopperLen &&
361              tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
            // Record (1-based) where the end tag begins in scriptCode; a non-zero value also
            // means "now skipping to the end tag's '>'". The current character is re-examined.
362             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
363             tquote = NoQuote;
364             continue;
365         }
        // While skipping the remainder of the end tag, track quotes so that a '>' inside a
        // quoted value does not end the tag; a line break drops any open quote.
366         if (scriptCodeResync && !state.escaped()) {
367             if (ch == '\"')
368                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
369             else if (ch == '\'')
370                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
371             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
372                 tquote = NoQuote;
373         }
        // A backslash escapes the following character unless it is itself escaped.
374         state.setEscaped(!state.escaped() && ch == '\\');
        // Entities are decoded only in textarea, title and iframe content; parseEntity()
        // writes its output into scriptCode through scriptCodeDest.
375         if (!scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
376             UChar* scriptCodeDest = scriptCode+scriptCodeSize;
377             src.advancePastNonNewline();
378             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
379             scriptCodeSize = scriptCodeDest - scriptCode;
380         } else {
            // Everything else is copied verbatim.
381             scriptCode[scriptCodeSize++] = ch;
382             src.advance(m_lineNumber);
383         }
384     }
385
386     return state;
387 }
388
// Called by parseSpecial() once a complete </script> end tag has been read. It
//  1. decides whether to request an external script (scriptSrc set and the document has a
//     frame) or whether the inline text should be executed,
//  2. emits the collected script text and the </script> end-tag token,
//  3. starts the external load or executes the inline script, parking the input that follows
//     the script in pendingSrc / the enclosing prepending buffer meanwhile, and
//  4. restores the parked input when no script is executing or loading.
// Nothing is loaded or executed in view-source mode. Returns the updated state.
389 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
390 {
391     // We are inside a <script>
392     bool doScriptExec = false;
393
394     // (Bugzilla 3837) Scripts following a frameset element should not execute or, 
395     // in the case of extern scripts, even load.
396     bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
397   
    // Non-null only when an external script request was issued below.
398     CachedScript* cs = 0;
399     // don't load external scripts for standalone documents (for now)
400     if (!inViewSourceMode()) {
401         if (!scriptSrc.isEmpty() && m_doc->frame()) {
402             // forget what we just got; load from src url instead
403             if (!parser->skipMode() && !followingFrameset) {
404 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
405                 if (!m_doc->ownerElement())
406                     printf("Requesting script at time %d\n", m_doc->elapsedTime());
407 #endif
408                 // The parser might have been stopped by for example a window.close call in an earlier script.
409                 // If so, we don't want to load scripts.
410                 if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(scriptSrc, scriptSrcCharset)))
411                     pendingScripts.enqueue(cs);
412                 else
413                     scriptNode = 0;
414             } else
415                 scriptNode = 0;
416             scriptSrc = String();
417         } else {
418             // Parse scriptCode containing <script> info
419 #if USE(LOW_BANDWIDTH_DISPLAY)
420             if (m_doc->inLowBandwidthDisplay()) {
421                 // ideal solution is only skipping internal JavaScript if there is external JavaScript.
422                 // but internal JavaScript can use document.write() to create an external JavaScript,
423                 // so we have to skip internal JavaScript all the time.
424                 m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();
425                 doScriptExec = false;
426             } else
427 #endif
            // NOTE(review): scriptNode is dereferenced without a null check -- confirm it is always set here.
428             doScriptExec = static_cast<HTMLScriptElement*>(scriptNode.get())->shouldExecuteAsJavaScript();
429             scriptNode = 0;
430         }
431     }
432
    // Move the collected text into the token buffer and keep a String copy of it for
    // execution. Note that this local `scriptCode` shadows the member buffer of the same
    // name for the rest of the function. Then emit the text token and the </script> end tag.
433     state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
434     String scriptCode(buffer, dest - buffer);
435     processToken();
436     currToken.tagName = scriptTag.localName();
437     currToken.beginTag = false;
438     processToken();
439
440     state.setInScript(false);
441     
442     // FIXME: The script should be syntax highlighted.
443     if (inViewSourceMode())
444         return state;
445
    // Install a local prepending buffer for the duration of this call; the previous one is
    // restored before returning.
    // NOTE(review): presumably write() appends to currentPrependingSrc while a script runs -- confirm.
446     SegmentedString *savedPrependingSrc = currentPrependingSrc;
447     SegmentedString prependingSrc;
448     currentPrependingSrc = &prependingSrc;
449     scriptCodeSize = scriptCodeResync = 0;
450
451     if (!parser->skipMode() && !followingFrameset) {
452         if (cs) {
            // External script: park the rest of the input so it is parsed after the script,
            // and leave src empty in the meantime.
453             if (savedPrependingSrc)
454                 savedPrependingSrc->append(src);
455             else
456                 pendingSrc.prepend(src);
457             setSrc(SegmentedString());
458
459             // the ref() call below may call notifyFinished if the script is already in cache,
460             // and that mucks with the state directly, so we must write it back to the object.
461             m_state = state;
462             bool savedRequestingScript = m_requestingScript;
463             m_requestingScript = true;
464             cs->ref(this);
465             m_requestingScript = savedRequestingScript;
466             state = m_state;
467             // will be 0 if script was already loaded and ref() executed it
468             if (!pendingScripts.isEmpty())
469                 state.setLoadingExtScript(true);
470         } else if (!m_fragment && doScriptExec) {
            // Inline script: park the rest of the input, then execute the text we copied above.
471             if (!m_executingScript)
472                 pendingSrc.prepend(src);
473             else
474                 prependingSrc = src;
475             setSrc(SegmentedString());
476             state = scriptExecution(scriptCode, state, String(), scriptStartLineno);
477         }
478     }
479
    // Nothing is executing or loading any more: resume with the parked input.
480     if (!m_executingScript && !state.loadingExtScript()) {
481         src.append(pendingSrc);
482         pendingSrc.clear();
483     } else if (!prependingSrc.isEmpty()) {
484         // restore first so that the write appends in the right place
485         // (does not hurt to do it again below)
486         currentPrependingSrc = savedPrependingSrc;
487
488         // we need to do this slightly modified bit of one of the write() cases
489         // because we want to prepend to pendingSrc rather than appending
490         // if there's no previous prependingSrc
491         if (state.loadingExtScript()) {
492             if (currentPrependingSrc) {
493                 currentPrependingSrc->append(prependingSrc);
494             } else {
495                 pendingSrc.prepend(prependingSrc);
496             }
497         } else {
            // write() works on m_state, so synchronize the local copy around the call.
498             m_state = state;
499             write(prependingSrc, false);
500             state = m_state;
501         }
502     }
503
504     currentPrependingSrc = savedPrependingSrc;
505
506     return state;
507 }
508
// Executes the script text |str| through the frame loader and returns the updated state.
//  - Does nothing for fragment tokenizers or when the document has no frame.
//  - |scriptURL| is the URL reported for the script; when null, the URL of the frame's
//    document is used instead. |baseLine| is forwarded to executeScript().
//  - m_executingScript counts nested executions; m_state is synchronized with the local
//    |state| around calls that operate on the member.
//  - After the script ran, allowYield is set, and any input collected in the local
//    prepending buffer is either merged back into src/pendingSrc or written immediately.
509 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const String& str, State state, const String& scriptURL, int baseLine)
510 {
511     if (m_fragment || !m_doc->frame())
512         return state;
513     m_executingScript++;
514     String url = scriptURL.isNull() ? m_doc->frame()->document()->url().string() : scriptURL;
515
    // Install a local prepending buffer while the script runs; the previous one is restored below.
516     SegmentedString *savedPrependingSrc = currentPrependingSrc;
517     SegmentedString prependingSrc;
518     currentPrependingSrc = &prependingSrc;
519
520 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
521     if (!m_doc->ownerElement())
522         printf("beginning script execution at %d\n", m_doc->elapsedTime());
523 #endif
524
    // Publish the local state before running the script and pick up whatever it was changed to afterwards.
    // NOTE(review): presumably the script can re-enter the tokenizer (e.g. via document.write) -- confirm.
525     m_state = state;
526     m_doc->frame()->loader()->executeScript(url, baseLine, str);
527     state = m_state;
528
529     state.setAllowYield(true);
530
531 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
532     if (!m_doc->ownerElement())
533         printf("ending script execution at %d\n", m_doc->elapsedTime());
534 #endif
535     
536     m_executingScript--;
537
    // Outermost execution finished and no external script is loading: queue the collected
    // input ahead of the parked input and hand everything back to src.
538     if (!m_executingScript && !state.loadingExtScript()) {
539         pendingSrc.prepend(prependingSrc);        
540         src.append(pendingSrc);
541         pendingSrc.clear();
542     } else if (!prependingSrc.isEmpty()) {
543         // restore first so that the write appends in the right place
544         // (does not hurt to do it again below)
545         currentPrependingSrc = savedPrependingSrc;
546
547         // we need to do this slightly modified bit of one of the write() cases
548         // because we want to prepend to pendingSrc rather than appending
549         // if there's no previous prependingSrc
550         if (state.loadingExtScript()) {
551             if (currentPrependingSrc)
552                 currentPrependingSrc->append(prependingSrc);
553             else
554                 pendingSrc.prepend(prependingSrc);
555         } else {
            // write() works on m_state, so synchronize the local copy around the call.
556             m_state = state;
557             write(prependingSrc, false);
558             state = m_state;
559         }
560     }
561
562     currentPrependingSrc = savedPrependingSrc;
563
564     return state;
565 }
566
// Appends comment text from |src| to scriptCode until the comment terminator is found:
// a '>' preceded by "--" or by "--!", or -- when brokenComments is set and we are not inside
// script/style -- any '>'.
// When the comment is not inside a special element (title, script, xmp, textarea, style,
// iframe), its text, minus the terminator characters, is emitted between a commentAtom begin
// token and a commentAtom end token and scriptCode is emptied; inside a special element the
// text simply remains in scriptCode. In both cases inComment is cleared.
// Returns the updated state; if |src| is exhausted first, inComment stays set.
567 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString &src, State state)
568 {
569     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
    // Reserve room for everything that may be copied from src in this call.
570     checkScriptBuffer(src.length());
571     while (!src.isEmpty()) {
572         UChar ch = *src;
573         scriptCode[scriptCodeSize++] = ch;
574         if (ch == '>') {
575             bool handleBrokenComments = brokenComments && !(state.inScript() || state.inStyle());
576             int endCharsCount = 1; // start off with one for the '>' character
            // "-->" : three terminator characters.
577             if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-') {
578                 endCharsCount = 3;
579             } else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' && 
580                 scriptCode[scriptCodeSize-2] == '!') {
581                 // Other browsers will accept --!> as a close comment, even though it's
582                 // not technically valid.
583                 endCharsCount = 4;
584             }
585             if (handleBrokenComments || endCharsCount > 1) {
586                 src.advancePastNonNewline();
587                 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
588                     checkScriptBuffer();
589                     scriptCode[scriptCodeSize] = 0;
590                     scriptCode[scriptCodeSize + 1] = 0;
591                     currToken.tagName = commentAtom;
592                     currToken.beginTag = true;
                    // Emit the comment text without its terminator characters.
593                     state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state);
594                     processToken();
595                     currToken.tagName = commentAtom;
596                     currToken.beginTag = false;
597                     processToken();
598                     scriptCodeSize = 0;
599                 }
600                 state.setInComment(false);
601                 return state; // Finished parsing comment
602             }
603         }
604         src.advance(m_lineNumber);
605     }
606
607     return state;
608 }
609
610 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
611 {
612     checkScriptBuffer(src.length());
613     while (!src.isEmpty()) {
614         UChar ch = *src;
615         scriptCode[scriptCodeSize++] = ch;
616         if (ch == '>' && scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
617             src.advancePastNonNewline();
618             state.setInServer(false);
619             scriptCodeSize = 0;
620             return state; // Finished parsing server include
621         }
622         src.advance(m_lineNumber);
623     }
624     return state;
625 }
626
627 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state)
628 {
629     UChar oldchar = 0;
630     while (!src.isEmpty()) {
631         UChar chbegin = *src;
632         if (chbegin == '\'')
633             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
634         else if (chbegin == '\"')
635             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
636         // Look for '?>'
637         // Some crappy sites omit the "?" before it, so
638         // we look for an unquoted '>' instead. (IE compatible)
639         else if (chbegin == '>' && (!tquote || oldchar == '?')) {
640             // We got a '?>' sequence
641             state.setInProcessingInstruction(false);
642             src.advancePastNonNewline();
643             state.setDiscardLF(true);
644             return state; // Finished parsing comment!
645         }
646         src.advance(m_lineNumber);
647         oldchar = chbegin;
648     }
649     
650     return state;
651 }
652
653 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
654 {
655     while (!src.isEmpty()) {
656         UChar cc = *src;
657
658         if (state.skipLF()) {
659             state.setSkipLF(false);
660             if (cc == '\n') {
661                 src.advancePastNewline(m_lineNumber);
662                 continue;
663             }
664         }
665
666         // do we need to enlarge the buffer?
667         checkBuffer();
668
669         if (cc == '\r') {
670             state.setSkipLF(true);
671             *dest++ = '\n';
672         } else
673             *dest++ = cc;
674         src.advance(m_lineNumber);
675     }
676
677     return state;
678 }
679
680
// Incrementally parses a character reference; the caller has already consumed the '&'.
//  - |start| true begins a new entity: cBufferPos and EntityUnicodeValue are reset and the
//    entity state becomes SearchEntity. With |start| false the state machine continues from
//    the entity state stored in |state| (used when input ran out mid-entity).
//  - The characters consumed so far are recorded in cBuffer / |cBufferPos|.
//  - When the reference resolves to a value in 1..0x10FFFF and we are not in view-source
//    mode, the decoded character (or surrogate pair) is pushed back onto |src| so that the
//    caller reads it as ordinary input. In view-source mode, or when the reference is not
//    valid, the literal text ('&' followed by the recorded characters) is written through |dest|.
//  - |parsingTag| enables an IE-compatibility rule for named entities above 255 that lack ';'.
// Returns the updated state; the entity state is NoEntity once the reference is finished.
// NOTE: the |dest| parameter shadows the member of the same name inside this function.
// NOTE(review): checkBuffer() presumably guards the member token buffer, not the buffer
// |dest| points into -- confirm callers guarantee enough room behind |dest|.
681 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, UChar*& dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
682 {
683     if (start)
684     {
685         cBufferPos = 0;
686         state.setEntityState(SearchEntity);
687         EntityUnicodeValue = 0;
688     }
689
690     while(!src.isEmpty())
691     {
692         UChar cc = *src;
693         switch(state.entityState()) {
694         case NoEntity:
695             ASSERT(state.entityState() != NoEntity);
696             return state;
697         
        // First character after '&': '#' introduces a numeric reference, anything else a name.
698         case SearchEntity:
699             if (cc == '#') {
700                 cBuffer[cBufferPos++] = cc;
701                 src.advancePastNonNewline();
702                 state.setEntityState(NumericSearch);
703             } else
704                 state.setEntityState(EntityName);
705             break;
706
        // After "&#": 'x'/'X' selects hexadecimal, a digit selects decimal, anything else ends the reference.
707         case NumericSearch:
708             if (cc == 'x' || cc == 'X') {
709                 cBuffer[cBufferPos++] = cc;
710                 src.advancePastNonNewline();
711                 state.setEntityState(Hexadecimal);
712             } else if (cc >= '0' && cc <= '9')
713                 state.setEntityState(Decimal);
714             else
715                 state.setEntityState(SearchSemicolon);
716             break;
717
        // Accumulate hex digits until a non-digit is seen or cBuffer holds 10 characters.
718         case Hexadecimal: {
719             int ll = min(src.length(), 10 - cBufferPos);
720             while (ll--) {
721                 cc = *src;
722                 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
723                     state.setEntityState(SearchSemicolon);
724                     break;
725                 }
726                 int digit;
727                 if (cc < 'A')
728                     digit = cc - '0';
729                 else
730                     digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
731                 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
732                 cBuffer[cBufferPos++] = cc;
733                 src.advancePastNonNewline();
734             }
735             if (cBufferPos == 10)  
736                 state.setEntityState(SearchSemicolon);
737             break;
738         }
        // Accumulate decimal digits until a non-digit is seen or cBuffer holds 9 characters.
739         case Decimal:
740         {
741             int ll = min(src.length(), 9-cBufferPos);
742             while(ll--) {
743                 cc = *src;
744
745                 if (!(cc >= '0' && cc <= '9')) {
746                     state.setEntityState(SearchSemicolon);
747                     break;
748                 }
749
750                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
751                 cBuffer[cBufferPos++] = cc;
752                 src.advancePastNonNewline();
753             }
754             if (cBufferPos == 9)  
755                 state.setEntityState(SearchSemicolon);
756             break;
757         }
        // Collect up to 9 alphanumeric characters of an entity name, then look the name up.
758         case EntityName:
759         {
760             int ll = min(src.length(), 9-cBufferPos);
761             while(ll--) {
762                 cc = *src;
763
764                 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
765                     state.setEntityState(SearchSemicolon);
766                     break;
767                 }
768
769                 cBuffer[cBufferPos++] = cc;
770                 src.advancePastNonNewline();
771             }
772             if (cBufferPos == 9) 
773                 state.setEntityState(SearchSemicolon);
774             if (state.entityState() == SearchSemicolon) {
775                 if(cBufferPos > 1) {
776                     // Since the maximum length of entity name is 9,
777                     // so a single char array which is allocated on
778                     // the stack, its length is 10, should be OK.
779                     // Also if we have an illegal character, we treat it
780                     // as illegal entity name.
781                     unsigned testedEntityNameLen = 0;
782                     char tmpEntityNameBuffer[10];
783
784                     ASSERT(cBufferPos < 10);
785                     for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
786                         if (cBuffer[testedEntityNameLen] > 0x7e)
787                             break;
788                         tmpEntityNameBuffer[testedEntityNameLen] = cBuffer[testedEntityNameLen];
789                     }
790
791                     const Entity *e;
792
793                     if (testedEntityNameLen == cBufferPos)
794                         e = findEntity(tmpEntityNameBuffer, cBufferPos);
795                     else
796                         e = 0;
797
798                     if(e)
799                         EntityUnicodeValue = e->code;
800
801                     // be IE compatible
802                     if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
803                         EntityUnicodeValue = 0;
804                 }
805             }
806             else
807                 break;
            // Intentional fall-through: the name is complete, so finish the reference below.
808         }
809         case SearchSemicolon:
810             // Don't allow values that are more than 21 bits.
811             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
812                 if (!inViewSourceMode()) {
813                     if (*src == ';')
814                         src.advancePastNonNewline();
                    // Push the decoded character(s) back onto the input so the caller reads
                    // them as ordinary text; values in 0x80-0x9F are remapped by fixUpChar().
815                     if (EntityUnicodeValue <= 0xFFFF) {
816                         checkBuffer();
817                         src.push(fixUpChar(EntityUnicodeValue));
818                     } else {
819                         // Convert to UTF-16, using surrogate code points.
820                         checkBuffer(2);
821                         src.push(U16_LEAD(EntityUnicodeValue));
822                         src.push(U16_TRAIL(EntityUnicodeValue));
823                     }
824                 } else {
825                     // FIXME: We should eventually colorize entities by sending them as a special token.
826                     checkBuffer(11);
827                     *dest++ = '&';
828                     for (unsigned i = 0; i < cBufferPos; i++)
829                         dest[i] = cBuffer[i];
830                     dest += cBufferPos;
831                     if (*src == ';') {
832                         *dest++ = ';';
833                         src.advancePastNonNewline();
834                     }
835                 }
836             } else {
837                 checkBuffer(10);
838                 // ignore the sequence, add it to the buffer as plaintext
839                 *dest++ = '&';
840                 for (unsigned i = 0; i < cBufferPos; i++)
841                     dest[i] = cBuffer[i];
842                 dest += cBufferPos;
843             }
844
845             state.setEntityState(NoEntity);
846             return state;
847         }
848     }
849
    // Input ran out before the reference was finished; the entity state is kept in |state|.
850     return state;
851 }
852
853 HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
854 {
855     ASSERT(state.inDoctype());
856     while (!src.isEmpty() && state.inDoctype()) {
857         UChar c = *src;
858         bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
859         switch (m_doctypeToken.state()) {
860             case DoctypeBegin: {
861                 m_doctypeToken.setState(DoctypeBeforeName);
862                 if (isWhitespace) {
863                     src.advance(m_lineNumber);
864                     if (inViewSourceMode())
865                         m_doctypeToken.m_source.append(c);
866                 }
867                 break;
868             }
869             case DoctypeBeforeName: {
870                 if (c == '>') {
871                     // Malformed.  Just exit.
872                     src.advancePastNonNewline();
873                     state.setInDoctype(false);
874                     if (inViewSourceMode())
875                         processDoctypeToken();
876                 } else if (isWhitespace) {
877                     src.advance(m_lineNumber);
878                     if (inViewSourceMode())
879                         m_doctypeToken.m_source.append(c);
880                 } else
881                     m_doctypeToken.setState(DoctypeName);
882                 break;
883             }
884             case DoctypeName: {
885                 if (c == '>') {
886                     // Valid doctype. Emit it.
887                     src.advancePastNonNewline();
888                     state.setInDoctype(false);
889                     processDoctypeToken();
890                 } else if (isWhitespace) {
891                     m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
892                     m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
893                     m_doctypeToken.setState(DoctypeAfterName);
894                     src.advance(m_lineNumber);
895                     if (inViewSourceMode())
896                         m_doctypeToken.m_source.append(c);
897                 } else {
898                     src.advancePastNonNewline();
899                     m_doctypeToken.m_name.append(c);
900                     if (inViewSourceMode())
901                         m_doctypeToken.m_source.append(c);
902                 }
903                 break;
904             }
905             case DoctypeAfterName: {
906                 if (c == '>') {
907                     // Valid doctype. Emit it.
908                     src.advancePastNonNewline();
909                     state.setInDoctype(false);
910                     processDoctypeToken();
911                 } else if (!isWhitespace) {
912                     src.advancePastNonNewline();
913                     if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
914                         m_doctypeSearchCount++;
915                         if (m_doctypeSearchCount == 6)
916                             // Found 'PUBLIC' sequence
917                             m_doctypeToken.setState(DoctypeBeforePublicID);
918                     } else if (m_doctypeSearchCount > 0) {
919                         m_doctypeSearchCount = 0;
920                         m_doctypeToken.setState(DoctypeBogus);
921                     } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
922                         m_doctypeSecondarySearchCount++;
923                         if (m_doctypeSecondarySearchCount == 6)
924                             // Found 'SYSTEM' sequence
925                             m_doctypeToken.setState(DoctypeBeforeSystemID);
926                     } else {
927                         m_doctypeSecondarySearchCount = 0;
928                         m_doctypeToken.setState(DoctypeBogus);
929                     }
930                     if (inViewSourceMode())
931                         m_doctypeToken.m_source.append(c);
932                 } else {
933                     src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
934                     if (inViewSourceMode())
935                         m_doctypeToken.m_source.append(c);
936                 }
937                 break;
938             }
939             case DoctypeBeforePublicID: {
940                 if (c == '\"' || c == '\'') {
941                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
942                     m_doctypeToken.setState(DoctypePublicID);
943                     src.advancePastNonNewline();
944                     if (inViewSourceMode())
945                         m_doctypeToken.m_source.append(c);
946                 } else if (c == '>') {
947                     // Considered bogus.  Don't process the doctype.
948                     src.advancePastNonNewline();
949                     state.setInDoctype(false);
950                     if (inViewSourceMode())
951                         processDoctypeToken();
952                 } else if (isWhitespace) {
953                     src.advance(m_lineNumber);
954                     if (inViewSourceMode())
955                         m_doctypeToken.m_source.append(c);
956                 } else
957                     m_doctypeToken.setState(DoctypeBogus);
958                 break;
959             }
960             case DoctypePublicID: {
961                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
962                     src.advancePastNonNewline();
963                     m_doctypeToken.setState(DoctypeAfterPublicID);
964                     if (inViewSourceMode())
965                         m_doctypeToken.m_source.append(c);
966                 } else if (c == '>') {
967                      // Considered bogus.  Don't process the doctype.
968                     src.advancePastNonNewline();
969                     state.setInDoctype(false);
970                     if (inViewSourceMode())
971                         processDoctypeToken();
972                 } else {
973                     m_doctypeToken.m_publicID.append(c);
974                     src.advance(m_lineNumber);
975                     if (inViewSourceMode())
976                         m_doctypeToken.m_source.append(c);
977                 }
978                 break;
979             }
980             case DoctypeAfterPublicID:
981                 if (c == '\"' || c == '\'') {
982                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
983                     m_doctypeToken.setState(DoctypeSystemID);
984                     src.advancePastNonNewline();
985                     if (inViewSourceMode())
986                         m_doctypeToken.m_source.append(c);
987                 } else if (c == '>') {
988                     // Valid doctype. Emit it now.
989                     src.advancePastNonNewline();
990                     state.setInDoctype(false);
991                     processDoctypeToken();
992                 } else if (isWhitespace) {
993                     src.advance(m_lineNumber);
994                     if (inViewSourceMode())
995                         m_doctypeToken.m_source.append(c);
996                 } else
997                     m_doctypeToken.setState(DoctypeBogus);
998                 break;
999             case DoctypeBeforeSystemID:
1000                 if (c == '\"' || c == '\'') {
1001                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1002                     m_doctypeToken.setState(DoctypeSystemID);
1003                     src.advancePastNonNewline();
1004                     if (inViewSourceMode())
1005                         m_doctypeToken.m_source.append(c);
1006                 } else if (c == '>') {
1007                     // Considered bogus.  Don't process the doctype.
1008                     src.advancePastNonNewline();
1009                     state.setInDoctype(false);
1010                 } else if (isWhitespace) {
1011                     src.advance(m_lineNumber);
1012                     if (inViewSourceMode())
1013                         m_doctypeToken.m_source.append(c);
1014                 } else
1015                     m_doctypeToken.setState(DoctypeBogus);
1016                 break;
1017             case DoctypeSystemID:
1018                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1019                     src.advancePastNonNewline();
1020                     m_doctypeToken.setState(DoctypeAfterSystemID);
1021                     if (inViewSourceMode())
1022                         m_doctypeToken.m_source.append(c);
1023                 } else if (c == '>') {
1024                      // Considered bogus.  Don't process the doctype.
1025                     src.advancePastNonNewline();
1026                     state.setInDoctype(false);
1027                     if (inViewSourceMode())
1028                         processDoctypeToken();
1029                 } else {
1030                     m_doctypeToken.m_systemID.append(c);
1031                     src.advance(m_lineNumber);
1032                     if (inViewSourceMode())
1033                         m_doctypeToken.m_source.append(c);
1034                 }
1035                 break;
1036             case DoctypeAfterSystemID:
1037                 if (c == '>') {
1038                     // Valid doctype. Emit it now.
1039                     src.advancePastNonNewline();
1040                     state.setInDoctype(false);
1041                     processDoctypeToken();
1042                 } else if (isWhitespace) {
1043                     src.advance(m_lineNumber);
1044                     if (inViewSourceMode())
1045                         m_doctypeToken.m_source.append(c);
1046                 } else
1047                     m_doctypeToken.setState(DoctypeBogus);
1048                 break;
1049             case DoctypeBogus:
1050                 if (c == '>') {
1051                     // Done with the bogus doctype.
1052                     src.advancePastNonNewline();
1053                     state.setInDoctype(false);
1054                     if (inViewSourceMode())
1055                        processDoctypeToken();
1056                 } else {
1057                     src.advance(m_lineNumber); // Just keep scanning for '>'
1058                     if (inViewSourceMode())
1059                         m_doctypeToken.m_source.append(c);
1060                 }
1061                 break;
1062             default:
1063                 break;
1064         }
1065     }
1066     return state;
1067 }
1068
1069 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
1070 {
1071     ASSERT(!state.hasEntityState());
1072
1073     unsigned cBufferPos = m_cBufferPos;
1074
1075     bool lastIsSlash = false;
1076
1077     while (!src.isEmpty()) {
1078         checkBuffer();
1079         switch(state.tagState()) {
1080         case NoTag:
1081         {
1082             m_cBufferPos = cBufferPos;
1083             return state;
1084         }
1085         case TagName:
1086         {
1087             if (searchCount > 0) {
1088                 if (*src == commentStart[searchCount]) {
1089                     searchCount++;
1090                     if (searchCount == 2)
1091                         m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
1092                     if (searchCount == 4) {
1093                         // Found '<!--' sequence
1094                         src.advancePastNonNewline();
1095                         dest = buffer; // ignore the previous part of this tag
1096                         state.setInComment(true);
1097                         state.setTagState(NoTag);
1098
1099                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
1100                         // <!--> as a valid comment, since both mozilla and IE on windows
1101                         // can handle this case.  Only do this in quirks mode. -dwh
1102                         if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
1103                             state.setInComment(false);
1104                             src.advancePastNonNewline();
1105                             if (!src.isEmpty())
1106                                 cBuffer[cBufferPos++] = *src;
1107                         } else
1108                           state = parseComment(src, state);
1109
1110                         m_cBufferPos = cBufferPos;
1111                         return state; // Finished parsing tag!
1112                     }
1113                     cBuffer[cBufferPos++] = *src;
1114                     src.advancePastNonNewline();
1115                     break;
1116                 } else
1117                     searchCount = 0; // Stop looking for '<!--' sequence
1118             }
1119             
1120             if (m_doctypeSearchCount > 0) {
1121                 if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
1122                     m_doctypeSearchCount++;
1123                     cBuffer[cBufferPos++] = *src;
1124                     src.advancePastNonNewline();
1125                     if (m_doctypeSearchCount == 9) {
1126                         // Found '<!DOCTYPE' sequence
1127                         state.setInDoctype(true);
1128                         state.setTagState(NoTag);
1129                         m_doctypeToken.reset();
1130                         if (inViewSourceMode())
1131                             m_doctypeToken.m_source.append(cBuffer, cBufferPos);
1132                         state = parseDoctype(src, state);
1133                         m_cBufferPos = cBufferPos;
1134                         return state;
1135                     }
1136                     break;
1137                 }
1138                 else
1139                     m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1140             }
1141
1142             bool finish = false;
1143             unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
1144             while (ll--) {
1145                 UChar curchar = *src;
1146                 if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
1147                     finish = true;
1148                     break;
1149                 }
1150                 
1151                 // tolower() shows up on profiles. This is faster!
1152                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1153                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1154                 else
1155                     cBuffer[cBufferPos++] = curchar;
1156                 src.advancePastNonNewline();
1157             }
1158
1159             // Disadvantage: we add the possible rest of the tag
1160             // as attribute names. ### judge if this causes problems
1161             if(finish || CBUFLEN == cBufferPos) {
1162                 bool beginTag;
1163                 UChar* ptr = cBuffer;
1164                 unsigned int len = cBufferPos;
1165                 cBuffer[cBufferPos] = '\0';
1166                 if ((cBufferPos > 0) && (*ptr == '/')) {
1167                     // End Tag
1168                     beginTag = false;
1169                     ptr++;
1170                     len--;
1171                 }
1172                 else
1173                     // Start Tag
1174                     beginTag = true;
1175
1176                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
1177                 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
1178                     ptr[--len] = '\0';
1179
1180                 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
1181                 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
1182                 if (ptr[0] != '!' || inViewSourceMode()) {
1183                     currToken.tagName = AtomicString(ptr);
1184                     currToken.beginTag = beginTag;
1185                 }
1186                 dest = buffer;
1187                 state.setTagState(SearchAttribute);
1188                 cBufferPos = 0;
1189             }
1190             break;
1191         }
1192         case SearchAttribute:
1193 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1194             qDebug("SearchAttribute");
1195 #endif
1196             while(!src.isEmpty()) {
1197                 UChar curchar = *src;
1198                 // In this mode just ignore any quotes we encounter and treat them like spaces.
1199                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
1200                     if (curchar == '<' || curchar == '>')
1201                         state.setTagState(SearchEnd);
1202                     else
1203                         state.setTagState(AttributeName);
1204
1205                     cBufferPos = 0;
1206                     break;
1207                 }
1208                 if (inViewSourceMode())
1209                     currToken.addViewSourceChar(curchar);
1210                 src.advance(m_lineNumber);
1211             }
1212             break;
1213         case AttributeName:
1214         {
1215 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1216             qDebug("AttributeName");
1217 #endif
1218             int ll = min(src.length(), CBUFLEN-cBufferPos);
1219             while(ll--) {
1220                 UChar curchar = *src;
1221                 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the 
1222                 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
1223                 if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
1224                     cBuffer[cBufferPos] = '\0';
1225                     attrName = AtomicString(cBuffer);
1226                     dest = buffer;
1227                     *dest++ = 0;
1228                     state.setTagState(SearchEqual);
1229                     if (inViewSourceMode())
1230                         currToken.addViewSourceChar('a');
1231                     break;
1232                 }
1233                 
1234                 // tolower() shows up on profiles. This is faster!
1235                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1236                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1237                 else
1238                     cBuffer[cBufferPos++] = curchar;
1239                     
1240                 src.advance(m_lineNumber);
1241             }
1242             if ( cBufferPos == CBUFLEN ) {
1243                 cBuffer[cBufferPos] = '\0';
1244                 attrName = AtomicString(cBuffer);
1245                 dest = buffer;
1246                 *dest++ = 0;
1247                 state.setTagState(SearchEqual);
1248                 if (inViewSourceMode())
1249                     currToken.addViewSourceChar('a');
1250             }
1251             break;
1252         }
1253         case SearchEqual:
1254 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1255             qDebug("SearchEqual");
1256 #endif
1257             while(!src.isEmpty()) {
1258                 UChar curchar = *src;
1259
1260                 if (lastIsSlash && curchar == '>') {
1261                     // This is a quirk (with a long sad history).  We have to do this
1262                     // since widgets do <script src="foo.js"/> and expect the tag to close.
1263                     if (currToken.tagName == scriptTag)
1264                         currToken.flat = true;
1265                     currToken.brokenXMLStyle = true;
1266                 }
1267
1268                 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
1269                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
1270                     if (curchar == '=') {
1271 #ifdef TOKEN_DEBUG
1272                         kdDebug(6036) << "found equal" << endl;
1273 #endif
1274                         state.setTagState(SearchValue);
1275                         if (inViewSourceMode())
1276                             currToken.addViewSourceChar(curchar);
1277                         src.advancePastNonNewline();
1278                     } else {
1279                         currToken.addAttribute(m_doc, attrName, emptyAtom, inViewSourceMode());
1280                         dest = buffer;
1281                         state.setTagState(SearchAttribute);
1282                         lastIsSlash = false;
1283                     }
1284                     break;
1285                 }
1286                 if (inViewSourceMode())
1287                     currToken.addViewSourceChar(curchar);
1288                     
1289                 lastIsSlash = curchar == '/';
1290
1291                 src.advance(m_lineNumber);
1292             }
1293             break;
1294         case SearchValue:
1295             while (!src.isEmpty()) {
1296                 UChar curchar = *src;
1297                 if (!isASCIISpace(curchar)) {
1298                     if (curchar == '\'' || curchar == '\"') {
1299                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1300                         state.setTagState(QuotedValue);
1301                         if (inViewSourceMode())
1302                             currToken.addViewSourceChar(curchar);
1303                         src.advancePastNonNewline();
1304                     } else
1305                         state.setTagState(Value);
1306
1307                     break;
1308                 }
1309                 if (inViewSourceMode())
1310                     currToken.addViewSourceChar(curchar);
1311                 src.advance(m_lineNumber);
1312             }
1313             break;
1314         case QuotedValue:
1315 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1316             qDebug("QuotedValue");
1317 #endif
1318             while (!src.isEmpty()) {
1319                 checkBuffer();
1320
1321                 UChar curchar = *src;
1322                 if (curchar <= '>' && !src.escaped()) {
1323                     if (curchar == '>' && attrName.isEmpty()) {
1324                         // Handle a case like <img '>.  Just go ahead and be willing
1325                         // to close the whole tag.  Don't consume the character and
1326                         // just go back into SearchEnd while ignoring the whole
1327                         // value.
1328                         // FIXME: Note that this is actually not a very good solution.
1329                         // It doesn't handle the general case of
1330                         // unmatched quotes among attributes that have names. -dwh
1331                         while (dest > buffer + 1 && (dest[-1] == '\n' || dest[-1] == '\r'))
1332                             dest--; // remove trailing newlines
1333                         AtomicString v(buffer + 1, dest - buffer - 1);
1334                         attrName = v; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
1335                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1336                         if (inViewSourceMode())
1337                             currToken.addViewSourceChar('x');
1338                         state.setTagState(SearchAttribute);
1339                         dest = buffer;
1340                         tquote = NoQuote;
1341                         break;
1342                     }
1343                     
1344                     if (curchar == '&') {
1345                         src.advancePastNonNewline();
1346                         state = parseEntity(src, dest, state, cBufferPos, true, true);
1347                         break;
1348                     }
1349
1350                     if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
1351                         // some <input type=hidden> rely on trailing spaces. argh
1352                         while (dest > buffer + 1 && (dest[-1] == '\n' || dest[-1] == '\r'))
1353                             dest--; // remove trailing newlines
1354                         AtomicString v(buffer + 1, dest - buffer - 1);
1355                         if (attrName.isEmpty()) {
1356                             attrName = v; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
1357                             if (inViewSourceMode())
1358                                 currToken.addViewSourceChar('x');
1359                         } else if (inViewSourceMode())
1360                             currToken.addViewSourceChar('v');
1361                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1362                         dest = buffer;
1363                         state.setTagState(SearchAttribute);
1364                         tquote = NoQuote;
1365                         if (inViewSourceMode())
1366                             currToken.addViewSourceChar(curchar);
1367                         src.advancePastNonNewline();
1368                         break;
1369                     }
1370                 }
1371
1372                 *dest++ = curchar;
1373                 src.advance(m_lineNumber);
1374             }
1375             break;
1376         case Value:
1377 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1378             qDebug("Value");
1379 #endif
1380             while(!src.isEmpty()) {
1381                 checkBuffer();
1382                 UChar curchar = *src;
1383                 if (curchar <= '>' && !src.escaped()) {
1384                     // parse Entities
1385                     if (curchar == '&') {
1386                         src.advancePastNonNewline();
1387                         state = parseEntity(src, dest, state, cBufferPos, true, true);
1388                         break;
1389                     }
1390                     // no quotes. Every space means end of value
1391                     // '/' does not delimit in IE!
1392                     if (isASCIISpace(curchar) || curchar == '>') {
1393                         AtomicString v(buffer+1, dest-buffer-1);
1394                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1395                         if (inViewSourceMode())
1396                             currToken.addViewSourceChar('v');
1397                         dest = buffer;
1398                         state.setTagState(SearchAttribute);
1399                         break;
1400                     }
1401                 }
1402
1403                 *dest++ = curchar;
1404                 src.advance(m_lineNumber);
1405             }
1406             break;
1407         case SearchEnd:
1408         {
1409 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1410                 qDebug("SearchEnd");
1411 #endif
1412             while (!src.isEmpty()) {
1413                 UChar ch = *src;
1414                 if (ch == '>' || ch == '<')
1415                     break;
1416                 if (ch == '/')
1417                     currToken.flat = true;
1418                 if (inViewSourceMode())
1419                     currToken.addViewSourceChar(ch);
1420                 src.advance(m_lineNumber);
1421             }
1422             if (src.isEmpty()) break;
1423
1424             searchCount = 0; // Stop looking for '<!--' sequence
1425             state.setTagState(NoTag);
1426             tquote = NoQuote;
1427
1428             if (*src != '<')
1429                 src.advance(m_lineNumber);
1430
1431             if (currToken.tagName == nullAtom) { //stop if tag is unknown
1432                 m_cBufferPos = cBufferPos;
1433                 return state;
1434             }
1435
1436             AtomicString tagName = currToken.tagName;
1437
1438             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
1439             // compatibility.
1440             bool isSelfClosingScript = currToken.flat && currToken.beginTag && currToken.tagName == scriptTag;
1441             bool beginTag = !currToken.flat && currToken.beginTag;
1442             if (currToken.beginTag && currToken.tagName == scriptTag && !inViewSourceMode() && !parser->skipMode()) {
1443                 Attribute* a = 0;
1444                 scriptSrc = String();
1445                 scriptSrcCharset = String();
1446                 if (currToken.attrs && !m_fragment) {
1447                     if (m_doc->frame()->scriptProxy()->isEnabled()) {
1448                         if ((a = currToken.attrs->getAttributeItem(srcAttr)))
1449                             scriptSrc = m_doc->completeURL(parseURL(a->value())).string();
1450                         if ((a = currToken.attrs->getAttributeItem(charsetAttr)))
1451                             scriptSrcCharset = a->value().domString().stripWhiteSpace();
1452                         if (scriptSrcCharset.isEmpty())
1453                             scriptSrcCharset = m_doc->frame()->loader()->encoding();
1454                     }
1455                 }
1456             }
1457
1458             RefPtr<Node> n = processToken();
1459             m_cBufferPos = cBufferPos;
1460             if (n) {
1461                 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
1462                     if (beginTag)
1463                         state.setDiscardLF(true); // Discard the first LF after we open a pre.
1464                 } else if (tagName == scriptTag && n) {
1465                     ASSERT(!scriptNode);
1466                     scriptNode = n;
1467                     if (beginTag) {
1468                         searchStopper = scriptEnd;
1469                         searchStopperLen = 8;
1470                         state.setInScript(true);
1471                         state = parseSpecial(src, state);
1472                     } else if (isSelfClosingScript) { // Handle <script src="foo"/>
1473                         state.setInScript(true);
1474                         state = scriptHandler(state);
1475                     }
1476                 } else if (tagName == styleTag) {
1477                     if (beginTag) {
1478                         searchStopper = styleEnd;
1479                         searchStopperLen = 7;
1480                         state.setInStyle(true);
1481                         state = parseSpecial(src, state);
1482                     }
1483                 } else if (tagName == textareaTag) {
1484                     if (beginTag) {
1485                         searchStopper = textareaEnd;
1486                         searchStopperLen = 10;
1487                         state.setInTextArea(true);
1488                         state = parseSpecial(src, state);
1489                     }
1490                 } else if (tagName == titleTag) {
1491                     if (beginTag) {
1492                         searchStopper = titleEnd;
1493                         searchStopperLen = 7;
1494                         State savedState = state;
1495                         SegmentedString savedSrc = src;
1496                         long savedLineno = m_lineNumber;
1497                         state.setInTitle(true);
1498                         state = parseSpecial(src, state);
1499                         if (state.inTitle() && src.isEmpty()) {
1500                             // We just ate the rest of the document as the title #text node!
1501                             // Reset the state then retokenize without special title handling.
1502                             // Let the parser clean up the missing </title> tag.
1503                             // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
1504                             // at the end of the document unless noMoreData is also true. We need
1505                             // to detect this case elsewhere, and save the state somewhere other
1506                             // than a local variable.
1507                             state = savedState;
1508                             src = savedSrc;
1509                             m_lineNumber = savedLineno;
1510                             scriptCodeSize = 0;
1511                         }
1512                     }
1513                 } else if (tagName == xmpTag) {
1514                     if (beginTag) {
1515                         searchStopper = xmpEnd;
1516                         searchStopperLen = 5;
1517                         state.setInXmp(true);
1518                         state = parseSpecial(src, state);
1519                     }
1520                 } else if (tagName == iframeTag) {
1521                     if (beginTag) {
1522                         searchStopper = iframeEnd;
1523                         searchStopperLen = 8;
1524                         state.setInIFrame(true);
1525                         state = parseSpecial(src, state);
1526                     }
1527                 }
1528             }
1529             if (tagName == plaintextTag)
1530                 state.setInPlainText(beginTag);
1531             return state; // Finished parsing tag!
1532         }
1533         } // end switch
1534     }
1535     m_cBufferPos = cBufferPos;
1536     return state;
1537 }
1538
// Decides whether the tokenizing loop in write() may handle another character or must yield.
// Returns true (after incrementing processedCount) to keep going. Returns false after starting
// a zero-delay one-shot timer when more than tokenizerTimeDelay has elapsed since startTime.
// The elapsed-time check only runs when no external script is loading, the state is not forced
// synchronous, no script is executing, and either processedCount exceeds TOKENIZER_CHUNK_SIZE
// or the state's allow-yield flag was set on entry.
1539 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1540 {
1541     // We don't want to be checking elapsed time with every character, so we only check after we've
1542     // processed a certain number of characters.
         // The allow-yield flag is read once and then cleared on every call.
1543     bool allowedYield = state.allowYield();
1544     state.setAllowYield(false);
1545     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > TOKENIZER_CHUNK_SIZE || allowedYield)) {
             // Restart the character count whether or not we end up yielding.
1546         processedCount = 0;
1547         if (currentTime() - startTime > tokenizerTimeDelay) {
1548             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1549                load, but this hurts overall performance on slower machines.  For now turn this
1550                off.
1551             || (!m_doc->haveStylesheetsLoaded() && 
1552                 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1553             // Schedule the timer to keep processing as soon as possible.
1554             m_timer.startOneShot(0);
1555 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1556             if (currentTime() - startTime > tokenizerTimeDelay)
1557                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1558 #endif
1559             return false;
1560         }
1561     }
1562     
1563     processedCount++;
1564     return true;
1565 }
1566
// Feeds str into the tokenizer and tokenizes as much of the pending source as possible.
// Returns true only when this call finished the document by calling end(); every other
// path returns false: no buffer, parser stopped, input deferred behind a script, a yield
// timer already active, or tokenizing simply not finished yet.
// appendData only matters while a script is executing: in that case the input is queued
// instead of being tokenized.
1567 bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1568 {
1569 #ifdef TOKEN_DEBUG
1570     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
1571 #endif
1572
         // end() frees the buffer and sets it to 0, so a write after that is ignored.
1573     if (!buffer)
1574         return false;
1575     
1576     if (m_parserStopped)
1577         return false;
1578
1579     SegmentedString source(str);
1580     if (m_executingScript)
1581         source.setExcludeLineNumbers();
1582
1583     if ((m_executingScript && appendData) || !pendingScripts.isEmpty()) {
1584         // don't parse; we will do this later
1585         if (currentPrependingSrc)
1586             currentPrependingSrc->append(source);
1587         else
1588             pendingSrc.append(source);
1589         return false;
1590     }
1591
1592     if (!src.isEmpty())
1593         src.append(source);
1594     else
1595         setSrc(source);
1596
1597     // Once a timer is set, it has control of when the tokenizer continues.
1598     if (m_timer.isActive())
1599         return false;
1600
         // Remember the previous value so a nested write() restores it rather than clearing it.
1601     bool wasInWrite = inWrite;
1602     inWrite = true;
1603     
1604 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1605     if (!m_doc->ownerElement())
1606         printf("Beginning write at time %d\n", m_doc->elapsedTime());
1607 #endif
1608     
1609     int processedCount = 0;
1610     double startTime = currentTime();
1611
1612     Frame *frame = m_doc->frame();
1613
         // Work on a local copy of the state; it is stored back into m_state after the loop.
1614     State state = m_state;
1615
         // Stop when the source runs dry or the frame has a scheduled location change pending.
1616     while (!src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
1617         if (!continueProcessing(processedCount, startTime, state))
1618             break;
1619
1620         // do we need to enlarge the buffer?
1621         checkBuffer();
1622
1623         UChar cc = *src;
1624
             // skipLF is set after a '\r' (see below) so that the '\n' of a CRLF pair is dropped.
             // The flag is consumed here regardless of what the current character is.
1625         bool wasSkipLF = state.skipLF();
1626         if (wasSkipLF)
1627             state.setSkipLF(false);
1628
1629         if (wasSkipLF && (cc == '\n'))
1630             src.advance();
1631         else if (state.needsSpecialWriteHandling()) {
1632             // it's important to keep needsSpecialWriteHandling with the flags this block tests
1633             if (state.hasEntityState())
1634                 state = parseEntity(src, dest, state, m_cBufferPos, false, state.hasTagState());
1635             else if (state.inPlainText())
1636                 state = parseText(src, state);
1637             else if (state.inAnySpecial())
1638                 state = parseSpecial(src, state);
1639             else if (state.inComment())
1640                 state = parseComment(src, state);
1641             else if (state.inDoctype())
1642                 state = parseDoctype(src, state);
1643             else if (state.inServer())
1644                 state = parseServer(src, state);
1645             else if (state.inProcessingInstruction())
1646                 state = parseProcessingInstruction(src, state);
1647             else if (state.hasTagState())
1648                 state = parseTag(src, state);
1649             else if (state.startTag()) {
                     // The previous character was '<'; cc decides what kind of markup follows.
1650                 state.setStartTag(false);
1651                 
1652                 switch(cc) {
1653                 case '/':
1654                     break;
1655                 case '!': {
1656                     // <!-- comment --> or <!DOCTYPE ...>
1657                     searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
1658                     m_doctypeSearchCount = 1;
1659                     break;
1660                 }
1661                 case '?': {
1662                     // xml processing instruction
1663                     state.setInProcessingInstruction(true);
1664                     tquote = NoQuote;
1665                     state = parseProcessingInstruction(src, state);
1666                     continue;
1667
                         // Not reached: the continue above always leaves the switch.
1668                     break;
1669                 }
1670                 case '%':
1671                     if (!brokenServer) {
1672                         // <% server stuff, handle as comment %>
1673                         state.setInServer(true);
1674                         tquote = NoQuote;
1675                         state = parseServer(src, state);
1676                         continue;
1677                     }
1678                     // else fall through
1679                 default: {
1680                     if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1681                         // Start of a Start-Tag
1682                     } else {
1683                         // Invalid tag
1684                         // Add as is
1685                         *dest = '<';
1686                         dest++;
1687                         continue;
1688                     }
1689                 }
1690                 }; // end case
1691
                     // Emit whatever is buffered (processToken() turns buffered characters into a
                     // text token) before the buffer is reused for the tag.
1692                 processToken();
1693
1694                 m_cBufferPos = 0;
1695                 state.setTagState(TagName);
1696                 state = parseTag(src, state);
1697             }
1698         } else if (cc == '&' && !src.escaped()) {
1699             src.advancePastNonNewline();
1700             state = parseEntity(src, dest, state, m_cBufferPos, true, state.hasTagState());
1701         } else if (cc == '<' && !src.escaped()) {
                 // Remember the line the tag starts on, then let the next iteration classify it.
1702             tagStartLineno = m_lineNumber;
1703             src.advancePastNonNewline();
1704             state.setStartTag(true);
1705         } else if (cc == '\n' || cc == '\r') {
1706             if (state.discardLF())
1707                 // Ignore this LF
1708                 state.setDiscardLF(false); // We have discarded 1 LF
1709             else {
1710                 // Process this LF
                     // Both '\n' and '\r' are written to the buffer as '\n'.
1711                 *dest++ = '\n';
                     // NOTE(review): only '\r' is counted here; '\n' is presumably counted by
                     // src.advance(m_lineNumber) below -- confirm against SegmentedString.
1712                 if (cc == '\r' && !src.excludeLineNumbers())
1713                     m_lineNumber++;
1714             }
1715
1716             /* Check for MS-DOS CRLF sequence */
1717             if (cc == '\r')
1718                 state.setSkipLF(true);
1719             src.advance(m_lineNumber);
1720         } else {
                 // Ordinary character: copy it to the buffer.
1721             state.setDiscardLF(false);
1722             *dest++ = cc;
1723             src.advancePastNonNewline();
1724         }
1725     }
1726     
1727 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1728     if (!m_doc->ownerElement())
1729         printf("Ending write at time %d\n", m_doc->elapsedTime());
1730 #endif
1731     
1732     inWrite = wasInWrite;
1733
1734     m_state = state;
1735
         // Only the outermost write() may finish the document, and only when no more data is
         // expected, no external script is loading, no script is executing and no yield timer
         // is pending.
1736     if (noMoreData && !inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
1737         end(); // this actually causes us to be deleted
1738         return true;
1739     }
1740     return false;
1741 }
1742
// Stops tokenizing: runs the base-class Tokenizer::stopParsing(), cancels the yield timer,
// and, unless this is a fragment tokenizer or the document has no frame, tells the frame's
// loader that the tokenizer has processed its data.
1743 void HTMLTokenizer::stopParsing()
1744 {
1745     Tokenizer::stopParsing();
1746     m_timer.stop();
1747
1748     // The part needs to know that the tokenizer has finished with its data,
1749     // regardless of whether it happened naturally or due to manual intervention.
1750     if (!m_fragment && m_doc->frame())
1751         m_doc->frame()->loader()->tokenizerProcessedData();
1752 }
1753
// Returns true while the yield timer is active or a write() call is in progress.
1754 bool HTMLTokenizer::processingData() const
1755 {
1756     return m_timer.isActive() || inWrite;
1757 }
1758
// Timer callback for m_timer, which continueProcessing() starts when it yields.
// Re-arms the timer if the view has a layout pending and the document reports no minimum
// layout delay; otherwise resumes tokenizing by calling write() with empty input.
1759 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1760 {
1761 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1762     if (!m_doc->ownerElement())
1763         printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1764 #endif
1765
1766     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1767         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
1768         // timer has higher priority than our timer.
1769         m_timer.startOneShot(0);
1770         return;
1771     }
1772
1773     // Invoke write() as though more data came in. This might cause us to get deleted.
1774     write(SegmentedString(), true);
1775 }
1776
// Finishes tokenizing. If the buffer still exists, emits any pending token (unless a tag is
// partly parsed), then frees the script-code buffer and the token buffer. Finally notifies
// the parser, or the document itself in view-source mode, that parsing is done.
// Callers in this file note that this call causes the tokenizer to be deleted.
1777 void HTMLTokenizer::end()
1778 {
1779     ASSERT(!m_timer.isActive());
1780     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1781
1782     if (buffer) {
1783         // parseTag is using the buffer for different matters
1784         if (!m_state.hasTagState())
1785             processToken();
1786
1787         fastFree(scriptCode);
1788         scriptCode = 0;
1789         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1790
             // write() checks for a null buffer and returns early, so later writes are ignored.
1791         fastFree(buffer);
1792         buffer = 0;
1793     }
1794
1795     if (!inViewSourceMode())
1796         parser->finished();
1797     else
1798         m_doc->finishedParsing();
1799 }
1800
// Called when no more input will arrive. While the tokenizer is still inside an unterminated
// comment or server block with buffered script code, marks that construct as broken, leaves
// that state, and feeds the buffered text back through write(). Then sets noMoreData and calls
// end(), unless a write() is in progress, an external script is loading, a script is executing
// or the yield timer is active.
1801 void HTMLTokenizer::finish()
1802 {
1803     // do this as long as we don't find matching comment ends
1804     while ((m_state.inComment() || m_state.inServer()) && scriptCode && scriptCodeSize) {
1805         // we've found an unmatched comment start
1806         if (m_state.inComment())
1807             brokenComments = true;
1808         else
1809             brokenServer = true;
1810         checkScriptBuffer();
1811         scriptCode[scriptCodeSize] = 0;
1812         scriptCode[scriptCodeSize + 1] = 0;
1813         int pos;
             // "food" is the text that will be tokenized again.
1814         String food;
1815         if (m_state.inScript() || m_state.inStyle())
1816             food = String(scriptCode, scriptCodeSize);
1817         else if (m_state.inServer()) {
                 // Put back the '<' in front of the buffered server text.
1818             food = "<";
1819             food.append(String(scriptCode, scriptCodeSize));
1820         } else {
                 // Re-feed only what follows the first '>' in the buffered text.
1821             pos = DeprecatedConstString(reinterpret_cast<DeprecatedChar*>(scriptCode), scriptCodeSize).string().find('>');
1822             food = String(scriptCode + pos + 1, scriptCodeSize - pos - 1);
1823         }
1824         fastFree(scriptCode);
1825         scriptCode = 0;
1826         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1827         m_state.setInComment(false);
1828         m_state.setInServer(false);
1829         if (!food.isEmpty())
1830             write(food, true);
1831     }
1832     // this indicates we will not receive any more data... but if we are waiting on
1833     // an external script to load, we can't finish parsing until that is done
1834     noMoreData = true;
1835     if (!inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1836         end(); // this actually causes us to be deleted
1837 }
1838
1839 PassRefPtr<Node> HTMLTokenizer::processToken()
1840 {
1841     KJSProxy* jsProxy = (!m_fragment && m_doc->frame()) ? m_doc->frame()->scriptProxy() : 0;
1842     if (jsProxy && m_doc->frame()->scriptProxy()->isEnabled())
1843         jsProxy->setEventHandlerLineno(tagStartLineno);
1844     if (dest > buffer) {
1845         currToken.text = StringImpl::createStrippingNullCharacters(buffer, dest - buffer);
1846         if (currToken.tagName != commentAtom)
1847             currToken.tagName = textAtom;
1848     } else if (currToken.tagName == nullAtom) {
1849         currToken.reset();
1850         if (jsProxy)
1851             jsProxy->setEventHandlerLineno(m_lineNumber);
1852         return 0;
1853     }
1854
1855     dest = buffer;
1856
1857     RefPtr<Node> n;
1858     
1859     if (!m_parserStopped) {
1860         if (inViewSourceMode())
1861             static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&currToken);
1862         else
1863             // pass the token over to the parser, the parser DOES NOT delete the token
1864             n = parser->parseToken(&currToken);
1865     }
1866     currToken.reset();
1867     if (jsProxy)
1868         jsProxy->setEventHandlerLineno(0);
1869
1870     return n.release();
1871 }
1872
// Hands the accumulated doctype token (m_doctypeToken) to the view-source document when in
// view-source mode, otherwise to the parser.
1873 void HTMLTokenizer::processDoctypeToken()
1874 {
1875     if (inViewSourceMode())
1876         static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
1877     else
1878         parser->parseDoctypeToken(&m_doctypeToken);
1879 }
1880
// Destructor: asserts that no write() is in progress, calls reset(), and deletes the parser.
1881 HTMLTokenizer::~HTMLTokenizer()
1882 {
1883     ASSERT(!inWrite);
1884     reset();
1885     delete parser;
1886 }
1887
1888
// Grows the token character buffer to max(size * 2, size + len) UChars.
// fastRealloc may move the storage, so dest is re-pointed at the same offset in the new buffer.
1889 void HTMLTokenizer::enlargeBuffer(int len)
1890 {
1891     int newSize = max(size * 2, size + len);
1892     int oldOffset = dest - buffer;
1893     buffer = static_cast<UChar*>(fastRealloc(buffer, newSize * sizeof(UChar)));
1894     dest = buffer + oldOffset;
1895     size = newSize;
1896 }
1897
// Grows the script-code buffer to max(scriptCodeMaxSize * 2, scriptCodeMaxSize + len) UChars
// and records the new capacity in scriptCodeMaxSize.
1898 void HTMLTokenizer::enlargeScriptBuffer(int len)
1899 {
1900     int newSize = max(scriptCodeMaxSize * 2, scriptCodeMaxSize + len);
1901     scriptCode = static_cast<UChar*>(fastRealloc(scriptCode, newSize * sizeof(UChar)));
1902     scriptCodeMaxSize = newSize;
1903 }
1904     
// Resumes scripts that notifyFinished() held back until stylesheets had loaded.
// Asserts that the document's stylesheets are loaded; does nothing unless
// m_hasScriptsWaitingForStylesheets is set.
1905 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
1906 {
1907     ASSERT(m_doc->haveStylesheetsLoaded());
1908
1909     if (m_hasScriptsWaitingForStylesheets)
1910         notifyFinished(0);
1911 }
1912
// Runs pending external scripts that have finished loading; the CachedResource argument is
// unused. Scripts are taken from the front of pendingScripts for as long as the head is loaded.
// For each one, an error event is dispatched on the script node if loading failed; otherwise
// the script is executed (when the element says it should run as JavaScript) and a load event
// is dispatched. Unless called synchronously from scriptHandler(), the source queued in
// pendingSrc is then fed back through write().
1913 void HTMLTokenizer::notifyFinished(CachedResource*)
1914 {
1915 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1916     if (!m_doc->ownerElement())
1917         printf("script loaded at %d\n", m_doc->elapsedTime());
1918 #endif
1919
1920     ASSERT(!pendingScripts.isEmpty());
1921
1922     // Make scripts loaded from file URLs wait for stylesheets to match Tiger behavior where
1923     // file loads were serialized in lower level.
1924     // FIXME: this should really be done for all script loads or the same effect should be achieved by other
1925     // means, like javascript suspend/resume
1926     m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded() && protocolIs(pendingScripts.head()->url(), "file");
1927     if (m_hasScriptsWaitingForStylesheets)
1928         return;
1929
1930     bool finished = false;
1931     while (!finished && pendingScripts.head()->isLoaded()) {
1932         CachedScript* cs = pendingScripts.dequeue();
1933         ASSERT(cache()->disabled() || cs->accessCount() > 0);
1934
1935         String scriptSource = cs->script();
1936         setSrc(SegmentedString());
1937
1938         // make sure we forget about the script before we execute the new one
1939         // infinite recursion might happen otherwise
             // Copy what is still needed from cs before dropping this tokenizer's reference to it.
1940         String cachedScriptUrl(cs->url());
1941         bool errorOccurred = cs->errorOccurred();
1942         cs->deref(this);
1943         RefPtr<Node> n = scriptNode.release();
1944
1945 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1946         if (!m_doc->ownerElement())
1947             printf("external script beginning execution at %d\n", m_doc->elapsedTime());
1948 #endif
1949
1950         if (errorOccurred)
1951             EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, true, false);
1952         else {
1953             if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
1954                 m_state = scriptExecution(scriptSource, m_state, cachedScriptUrl);
1955             EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false);
1956         }
1957
1958         // The state of pendingScripts.isEmpty() can change inside the scriptExecution()
1959         // call above, so test afterwards.
1960         finished = pendingScripts.isEmpty();
1961         if (finished) {
1962             m_state.setLoadingExtScript(false);
1963 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1964             if (!m_doc->ownerElement())
1965                 printf("external script finished execution at %d\n", m_doc->elapsedTime());
1966 #endif
1967         }
1968
1969         // 'm_requestingScript' is true when we are called synchronously from
1970         // scriptHandler(). In that case scriptHandler() will take care
1971         // of pendingSrc.
1972         if (!m_requestingScript) {
1973             SegmentedString rest = pendingSrc;
1974             pendingSrc.clear();
1975             write(rest, false);
1976             // we might be deleted at this point, do not access any members.
                 // When finished is true the loop condition exits on the local flag alone.
                 // When it is false, pendingScripts is non-empty, so write() only re-queued
                 // the data and returned without tokenizing.
1977         }
1978     }
1979 }
1980
// Returns true while the tokenizer state is flagged as loading an external script.
1981 bool HTMLTokenizer::isWaitingForScripts() const
1982 {
1983     return m_state.loadingExtScript();
1984 }
1985
// Replaces the pending source (src) with a copy of the given source.
1986 void HTMLTokenizer::setSrc(const SegmentedString &source)
1987 {
1988     src = source;
1989 }
1990
// Parses source as HTML into the given fragment using a temporary tokenizer that is forced
// to run synchronously, so tokenizing is expected to be complete when this returns.
1991 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)
1992 {
1993     HTMLTokenizer tok(fragment);
1994     tok.setForceSynchronous(true);
1995     tok.write(source, true);
1996     tok.finish();
1997     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
1998 }
1999
// Looks up the NUL-terminated entity name with findEntity() and returns the entity's code,
// or 0 when the name is not found.
2000 UChar decodeNamedEntity(const char* name)
2001 {
2002     const Entity* e = findEntity(name, strlen(name));
2003     return e ? e->code : 0;
2004 }
2005
2006 }