3c7f8a4f6295b8707ef3c99ace3a44854d148baa
[WebKit-https.git] / WebCore / html / HTMLTokenizer.cpp
1 /*
2     Copyright (C) 1997 Martin Jones (mjones@kde.org)
3               (C) 1997 Torben Weis (weis@kde.org)
4               (C) 1998 Waldo Bastian (bastian@kde.org)
5               (C) 1999 Lars Knoll (knoll@kde.org)
6               (C) 1999 Antti Koivisto (koivisto@kde.org)
7               (C) 2001 Dirk Mueller (mueller@kde.org)
8     Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
9     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
10
11     This library is free software; you can redistribute it and/or
12     modify it under the terms of the GNU Library General Public
13     License as published by the Free Software Foundation; either
14     version 2 of the License, or (at your option) any later version.
15
16     This library is distributed in the hope that it will be useful,
17     but WITHOUT ANY WARRANTY; without even the implied warranty of
18     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19     Library General Public License for more details.
20
21     You should have received a copy of the GNU Library General Public License
22     along with this library; see the file COPYING.LIB.  If not, write to
23     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24     Boston, MA 02110-1301, USA.
25 */
26
27 #include "config.h"
28 #include "HTMLTokenizer.h"
29
30 #include "CSSHelper.h"
31 #include "Cache.h"
32 #include "CachedScript.h"
33 #include "DocLoader.h"
34 #include "DocumentFragment.h"
35 #include "EventNames.h"
36 #include "Frame.h"
37 #include "FrameLoader.h"
38 #include "FrameView.h"
39 #include "HTMLElement.h"
40 #include "HTMLNames.h"
41 #include "HTMLParser.h"
42 #include "HTMLScriptElement.h"
43 #include "HTMLViewSourceDocument.h"
44 #include "Settings.h"
45 #include "SystemTime.h"
46 #include "kjs_proxy.h"
47 #include <wtf/ASCIICType.h>
48
49 #include "HTMLEntityNames.c"
50
51 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
52
53 #if MOBILE
54 // Mobile devices need to stay responsive, so the tokenizer chunk size is reduced.
55 // This value defines how many characters the tokenizer will process before
56 // yielding control.
57 #define TOKENIZER_CHUNK_SIZE  256
58 #else
59 #define TOKENIZER_CHUNK_SIZE  4096
60 #endif
61
62 using namespace std;
63 using namespace WTF;
64
65 namespace WebCore {
66
67 using namespace HTMLNames;
68 using namespace EventNames;
69
// NOTE(review): the value is presumably expressed in seconds (0.500 matching the
// "500" ms mentioned below) -- confirm against the timer code that consumes it.
70 #if MOBILE
71 // As the chunks are smaller (above), the tokenizer should not yield for as long a period; otherwise
72 // it will take way too long to load a page.
73 const double tokenizerTimeDelay = 0.300;
74
75 #else
76 // FIXME: We would like this constant to be 200ms.
77 // Yielding more aggressively results in increased responsiveness and better incremental rendering.
78 // It slows down overall page-load on slower machines, though, so for now we set a value of 500.
79 const double tokenizerTimeDelay = 0.500;
80 #endif
81
// Literal markup prefixes. commentStart is matched one character at a time (indexed by
// searchCount) while a tag name is being scanned. The *End strings are presumably the
// closing-tag "stoppers" handed to parseSpecial() through searchStopper; the assignment
// is not visible in this part of the file -- confirm.
82 static const char commentStart [] = "<!--";
83 static const char scriptEnd [] = "</script";
84 static const char xmpEnd [] = "</xmp";
85 static const char styleEnd [] =  "</style";
86 static const char textareaEnd [] = "</textarea";
87 static const char titleEnd [] = "</title";
88
89 // Full support for MS Windows extensions to Latin-1.
90 // Technically these extensions should only be activated for pages
91 // marked "windows-1252" or "cp1252", but
92 // in the standard Microsoft way, these extensions infect hundreds of thousands
93 // of web pages.  Note that people with non-latin-1 Microsoft extensions
94 // are SOL.
95 //
96 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
97 //      http://www.bbsinc.com/iso8859.html
98 //      http://www.obviously.com/
99 //
100 // There may be better equivalents
101
102 // We only need this for entities. For non-entity text, we handle this in the text encoding.
103
// Replacement code points for the 32 values 0x80-0x9F, indexed by (value - 0x80);
// see fixUpChar(). Entries equal to their own index + 0x80 (0x81, 0x8D, 0x8F, 0x90, 0x9D)
// leave the character unchanged.
104 static const UChar windowsLatin1ExtensionArray[32] = {
105     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
106     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
107     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
108     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
109 };
110
111 static inline UChar fixUpChar(UChar c)
112 {
113     if ((c & ~0x1F) != 0x0080)
114         return c;
115     return windowsLatin1ExtensionArray[c - 0x80];
116 }
117
118 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
119 {
120     for (unsigned i = 0; i != length; ++i) {
121         unsigned char c1 = s1[i];
122         unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
123         UChar c2 = s2[i];
124         if (c1 != c2 && uc1 != c2)
125             return false;
126     }
127     return true;
128 }
129
130 inline void Token::addAttribute(Document* doc, AtomicString& attrName, const AtomicString& v, bool viewSourceMode)
131 {
132     if (!attrName.isEmpty()) {
133         ASSERT(!attrName.contains('/'));
134         Attribute* a = new MappedAttribute(attrName, v);
135         if (!attrs)
136             attrs = new NamedMappedAttrMap(0);
137         attrs->insertAttribute(a, viewSourceMode);
138     }
139     
140     attrName = emptyAtom;
141 }
142
143 // ----------------------------------------------------------------------------
144
// Constructs the tokenizer for a regular HTML document: creates an HTMLParser for
// doc (forwarding reportErrors), zero-initializes the buffers and script
// bookkeeping, and calls begin() to establish the initial parsing state.
145 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
146     : Tokenizer()
147     , buffer(0)
148     , scriptCode(0)
149     , scriptCodeSize(0)
150     , scriptCodeMaxSize(0)
151     , scriptCodeResync(0)
152     , m_executingScript(0)
153     , m_requestingScript(false)
154     , m_hasScriptsWaitingForStylesheets(false)
155     , m_timer(this, &HTMLTokenizer::timerFired)
156     , m_doc(doc)
157     , parser(new HTMLParser(doc, reportErrors))
158     , inWrite(false)
159     , m_fragment(false)
160 {
161     begin();
162 }
163
// Constructs the tokenizer for a view-source document. No HTMLParser is created
// (parser is 0), and the Tokenizer base is constructed with `true` -- presumably the
// flag reported by inViewSourceMode(); confirm in Tokenizer.h.
164 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
165     : Tokenizer(true)
166     , buffer(0)
167     , scriptCode(0)
168     , scriptCodeSize(0)
169     , scriptCodeMaxSize(0)
170     , scriptCodeResync(0)
171     , m_executingScript(0)
172     , m_requestingScript(false)
173     , m_hasScriptsWaitingForStylesheets(false)
174     , m_timer(this, &HTMLTokenizer::timerFired)
175     , m_doc(doc)
176     , parser(0)
177     , inWrite(false)
178     , m_fragment(false)
179 {
180     begin();
181 }
182
183 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)
184     : buffer(0)
185     , scriptCode(0)
186     , scriptCodeSize(0)
187     , scriptCodeMaxSize(0)
188     , scriptCodeResync(0)
189     , m_executingScript(0)
190     , m_requestingScript(false)
191     , m_hasScriptsWaitingForStylesheets(false)
192     , m_timer(this, &HTMLTokenizer::timerFired)
193     , m_doc(frag->document())
194     , inWrite(false)
195     , m_fragment(true)
196 {
197     parser = new HTMLParser(frag);
198     begin();
199 }
200
201 void HTMLTokenizer::reset()
202 {
203     ASSERT(m_executingScript == 0);
204
205     while (!pendingScripts.isEmpty()) {
206       CachedScript *cs = pendingScripts.dequeue();
207       ASSERT(cache()->disabled() || cs->accessCount() > 0);
208       cs->deref(this);
209     }
210     
211     fastFree(buffer);
212     buffer = dest = 0;
213     size = 0;
214
215     fastFree(scriptCode);
216     scriptCode = 0;
217     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
218
219     m_timer.stop();
220     m_state.setAllowYield(false);
221     m_state.setForceSynchronous(false);
222
223     currToken.reset();
224 }
225
226 void HTMLTokenizer::begin()
227 {
228     m_executingScript = 0;
229     m_requestingScript = false;
230     m_hasScriptsWaitingForStylesheets = false;
231     m_state.setLoadingExtScript(false);
232     reset();
233     size = 254;
234     buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
235     dest = buffer;
236     tquote = NoQuote;
237     searchCount = 0;
238     m_state.setEntityState(NoEntity);
239     scriptSrc = String();
240     pendingSrc.clear();
241     currentPrependingSrc = 0;
242     noMoreData = false;
243     brokenComments = false;
244     brokenServer = false;
245     m_lineNumber = 0;
246     scriptStartLineno = 0;
247     tagStartLineno = 0;
248     m_state.setForceSynchronous(false);
249 }
250
// Sets the force-synchronous flag on the tokenizer state. Note that reset() and
// begin() both clear this flag again.
251 void HTMLTokenizer::setForceSynchronous(bool force)
252 {
253     m_state.setForceSynchronous(force);
254 }
255
256 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
257 {
258     // This function adds the listing 'list' as
259     // preformatted text-tokens to the token-collection
260     while (!list.isEmpty()) {
261         if (state.skipLF()) {
262             state.setSkipLF(false);
263             if (*list == '\n') {
264                 list.advance();
265                 continue;
266             }
267         }
268
269         checkBuffer();
270
271         if (*list == '\n' || *list == '\r') {
272             if (state.discardLF())
273                 // Ignore this LF
274                 state.setDiscardLF(false); // We have discarded 1 LF
275             else
276                 *dest++ = '\n';
277
278             /* Check for MS-DOS CRLF sequence */
279             if (*list == '\r')
280                 state.setSkipLF(true);
281
282             list.advance();
283         } else {
284             state.setDiscardLF(false);
285             *dest++ = *list;
286             list.advance();
287         }
288     }
289
290     return state;
291 }
292
// Consumes the raw content of a <script>, <style>, <textarea>, <title> or <xmp>
// element. Exactly one of those state flags must be set (see the ASSERT). Characters
// are accumulated in scriptCode until the end tag named by searchStopper has been
// matched and closed by an unquoted '>'. At that point the collected text is handed
// to scriptHandler() (for scripts) or emitted as a text token followed by the
// closing-tag token, and the function returns. If src runs out first, the state is
// returned so that a later call can continue where this one stopped.
293 HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString &src, State state)
294 {
295     ASSERT(state.inTextArea() || state.inTitle() || !state.hasEntityState());
296     ASSERT(!state.hasTagState());
297     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() == 1 );
298     if (state.inScript())
299         scriptStartLineno = m_lineNumber;
300
301     if (state.inComment()) 
302         state = parseComment(src, state);
303
304     while ( !src.isEmpty() ) {
305         checkScriptBuffer();
306         UChar ch = *src;
307
        // The last three collected characters are "<!-" and this one is '-': a comment
        // starts inside the element. Not applied to <textarea>/<xmp>, when
        // brokenComments is set, or once the end tag has already been matched.
308         if (!scriptCodeResync && !brokenComments && !state.inTextArea() && !state.inXmp() && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-') {
309             state.setInComment(true);
310             state = parseComment(src, state);
311             continue;
312         }
        // scriptCodeResync != 0 means the end-tag name has been matched (see below);
        // an unquoted '>' now closes the element. scriptCodeResync - 1 is the index of
        // the end tag's '<', so the collected text is truncated to exclude the end tag.
313         if (scriptCodeResync && !tquote && ch == '>') {
314             src.advance(m_lineNumber);
315             scriptCodeSize = scriptCodeResync-1;
316             scriptCodeResync = 0;
317             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
318             if (state.inScript())
319                 state = scriptHandler(state);
320             else {
                // Emit the element's text, then build and emit its closing tag.
321                 state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
322                 processToken();
323                 if (state.inStyle()) { 
324                     currToken.tagName = styleTag.localName(); 
325                     currToken.beginTag = false; 
326                 } else if (state.inTextArea()) { 
327                     currToken.tagName = textareaTag.localName(); 
328                     currToken.beginTag = false; 
329                 } else if (state.inTitle()) { 
330                     currToken.tagName = titleTag.localName(); 
331                     currToken.beginTag = false; 
332                 } else if (state.inXmp()) {
333                     currToken.tagName = xmpTag.localName(); 
334                     currToken.beginTag = false; 
335                 }
336                 processToken();
337                 state.setInStyle(false);
338                 state.setInScript(false);
339                 state.setInTextArea(false);
340                 state.setInTitle(false);
341                 state.setInXmp(false);
342                 tquote = NoQuote;
343                 scriptCodeSize = scriptCodeResync = 0;
344             }
345             return state;
346         }
347         // possible end of tagname, lets check.
        // The current character ('>', '/', or whitespace/control but not NUL) could
        // terminate a tag name; if the text collected so far ends with searchStopper,
        // record where the end tag started (index of its '<' plus one). The character
        // itself is not consumed here -- the loop re-examines it on the next pass.
348         if (!scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || ch <= ' ') && ch &&
349              scriptCodeSize >= searchStopperLen &&
350              tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
351             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
352             tquote = NoQuote;
353             continue;
354         }
        // Inside the matched end tag: track quoting so that a quoted '>' does not close
        // it. A line break resets the quote state.
355         if (scriptCodeResync && !state.escaped()) {
356             if (ch == '\"')
357                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
358             else if (ch == '\'')
359                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
360             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
361                 tquote = NoQuote;
362         }
        // Escaped is true only for a backslash that is not itself escaped.
363         state.setEscaped(!state.escaped() && ch == '\\');
        // Character references are decoded only in <textarea> and <title> content; the
        // decoded output goes through scriptCodeDest, after which the size is recomputed.
364         if (!scriptCodeResync && (state.inTextArea() || state.inTitle()) && !src.escaped() && ch == '&') {
365             UChar* scriptCodeDest = scriptCode+scriptCodeSize;
366             src.advance(m_lineNumber);
367             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
368             scriptCodeSize = scriptCodeDest-scriptCode;
369         } else {
370             scriptCode[scriptCodeSize++] = *src;
371             src.advance(m_lineNumber);
372         }
373     }
374
375     return state;
376 }
377
// Called by parseSpecial() once the closing tag of a <script> element has been
// found. Emits the script text and the closing </script> token, then either
// requests the external script named by scriptSrc (queuing it in pendingScripts) or
// executes the inline script text via scriptExecution(). Input that follows the
// script is parked in pendingSrc (or in the enclosing prepending string) while a
// script is loading or running, and is put back when nothing is pending.
// Note: `src` in this function is not a parameter; it is the tokenizer's own input
// stream. In view-source mode only the tokens are emitted.
378 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
379 {
380     // We are inside a <script>
381     bool doScriptExec = false;
382
383     // (Bugzilla 3837) Scripts following a frameset element should not execute or, 
384     // in the case of extern scripts, even load.
385     bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
386   
    // cs stays 0 unless an external script is successfully requested below.
387     CachedScript* cs = 0;
388     // don't load external scripts for standalone documents (for now)
389     if (!inViewSourceMode()) {
390         if (!scriptSrc.isEmpty() && m_doc->frame()) {
391             // forget what we just got; load from src url instead
392             if (!parser->skipMode() && !followingFrameset) {
393 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
394                 if (!m_doc->ownerElement())
395                     printf("Requesting script at time %d\n", m_doc->elapsedTime());
396 #endif
397                 // The parser might have been stopped by for example a window.close call in an earlier script.
398                 // If so, we don't want to load scripts.
399                 if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(scriptSrc, scriptSrcCharset)))
400                     pendingScripts.enqueue(cs);
401                 else
402                     scriptNode = 0;
403             } else
404                 scriptNode = 0;
405             scriptSrc = String();
406         } else {
407 #ifdef TOKEN_DEBUG
408             kdDebug( 6036 ) << "---START SCRIPT---" << endl;
409             kdDebug( 6036 ) << DeprecatedString(scriptCode, scriptCodeSize) << endl;
410             kdDebug( 6036 ) << "---END SCRIPT---" << endl;
411 #endif
412             // Parse scriptCode containing <script> info
413 #if USE(LOW_BANDWIDTH_DISPLAY)
414             if (m_doc->inLowBandwidthDisplay()) {
415                 // ideal solution is only skipping internal JavaScript if there is external JavaScript.
416                 // but internal JavaScript can use document.write() to create an external JavaScript,
417                 // so we have to skip internal JavaScript all the time.
418                 m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();
419                 doScriptExec = false;
420             } else
421 #endif
            // NOTE(review): scriptNode is dereferenced without a null check here --
            // confirm it is always set when an inline script reaches this point.
422             doScriptExec = static_cast<HTMLScriptElement*>(scriptNode.get())->shouldExecuteAsJavaScript();
423             scriptNode = 0;
424         }
425     }
426
    // Copy the script text into the token buffer and capture it from [buffer, dest)
    // before the tokens are processed; then emit the text token and the closing
    // </script> token.
427     state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
428     DeprecatedString exScript(reinterpret_cast<DeprecatedChar*>(buffer), dest - buffer);
429     processToken();
430     currToken.tagName = scriptTag.localName();
431     currToken.beginTag = false;
432     processToken();
433
434     state.setInScript(false);
435     
436     // FIXME: The script should be syntax highlighted.
437     if (inViewSourceMode())
438         return state;
439
    // Point currentPrependingSrc at a local string for the duration of this call;
    // the previous target is restored before returning.
440     SegmentedString *savedPrependingSrc = currentPrependingSrc;
441     SegmentedString prependingSrc;
442     currentPrependingSrc = &prependingSrc;
443     scriptCodeSize = scriptCodeResync = 0;
444
445     if (!parser->skipMode() && !followingFrameset) {
446         if (cs) {
            // External script: park the remaining input and register as a client.
447             if (savedPrependingSrc)
448                 savedPrependingSrc->append(src);
449             else
450                 pendingSrc.prepend(src);
451             setSrc(SegmentedString());
452
453             // the ref() call below may call notifyFinished if the script is already in cache,
454             // and that mucks with the state directly, so we must write it back to the object.
455             m_state = state;
456             bool savedRequestingScript = m_requestingScript;
457             m_requestingScript = true;
458             cs->ref(this);
459             m_requestingScript = savedRequestingScript;
460             state = m_state;
461             // will be 0 if script was already loaded and ref() executed it
462             if (!pendingScripts.isEmpty())
463                 state.setLoadingExtScript(true);
464         } else if (!m_fragment && doScriptExec) {
            // Inline script: park the remaining input, then run the captured text.
465             if (!m_executingScript)
466                 pendingSrc.prepend(src);
467             else
468                 prependingSrc = src;
469             setSrc(SegmentedString());
470             state = scriptExecution(exScript, state, DeprecatedString::null, scriptStartLineno);
471         }
472     }
473
    // Nothing is executing or loading any more: resume with the parked input.
474     if (!m_executingScript && !state.loadingExtScript()) {
475         src.append(pendingSrc);
476         pendingSrc.clear();
477     } else if (!prependingSrc.isEmpty()) {
478         // restore first so that the write appends in the right place
479         // (does not hurt to do it again below)
480         currentPrependingSrc = savedPrependingSrc;
481
482         // we need to do this slightly modified bit of one of the write() cases
483         // because we want to prepend to pendingSrc rather than appending
484         // if there's no previous prependingSrc
485         if (state.loadingExtScript()) {
486             if (currentPrependingSrc) {
487                 currentPrependingSrc->append(prependingSrc);
488             } else {
489                 pendingSrc.prepend(prependingSrc);
490             }
491         } else {
            // write() works on m_state, so the local state is synced around the call.
492             m_state = state;
493             write(prependingSrc, false);
494             state = m_state;
495         }
496     }
497
498     currentPrependingSrc = savedPrependingSrc;
499
500     return state;
501 }
502
// Executes the script text `str` through the frame loader and returns the updated
// tokenizer state.
//   str       - script source to run.
//   state     - current tokenizer state; written to m_state before the script runs
//               and read back afterwards (presumably because the running script can
//               re-enter the tokenizer, e.g. through write() -- confirm).
//   scriptURL - URL reported for the script; when null, the URL of the frame's
//               document is used.
//   baseLine  - line number passed to executeScript() for the start of the script.
// Does nothing for fragment tokenizers or when the document has no frame.
// m_executingScript counts nesting depth. Once the outermost script has finished
// and no external script is loading, any parked input is appended back to src.
503 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const DeprecatedString& str, State state, DeprecatedString scriptURL, int baseLine)
504 {
505     if (m_fragment || !m_doc->frame())
506         return state;
507     m_executingScript++;
508     DeprecatedString url = scriptURL.isNull() ? m_doc->frame()->document()->URL() : scriptURL;
509
    // Point currentPrependingSrc at a local string while the script runs; the
    // previous target is restored before returning.
510     SegmentedString *savedPrependingSrc = currentPrependingSrc;
511     SegmentedString prependingSrc;
512     currentPrependingSrc = &prependingSrc;
513
514 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
515     if (!m_doc->ownerElement())
516         printf("beginning script execution at %d\n", m_doc->elapsedTime());
517 #endif
518
519     m_state = state;
520     m_doc->frame()->loader()->executeScript(url, baseLine, str);
521     state = m_state;
522
523     state.setAllowYield(true);
524
525 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
526     if (!m_doc->ownerElement())
527         printf("ending script execution at %d\n", m_doc->elapsedTime());
528 #endif
529     
530     m_executingScript--;
531
532     if (!m_executingScript && !state.loadingExtScript()) {
        // Outermost script finished and nothing is loading: text collected in the
        // local prependingSrc goes first, followed by the previously parked input.
533         pendingSrc.prepend(prependingSrc);        
534         src.append(pendingSrc);
535         pendingSrc.clear();
536     } else if (!prependingSrc.isEmpty()) {
537         // restore first so that the write appends in the right place
538         // (does not hurt to do it again below)
539         currentPrependingSrc = savedPrependingSrc;
540
541         // we need to do this slightly modified bit of one of the write() cases
542         // because we want to prepend to pendingSrc rather than appending
543         // if there's no previous prependingSrc
544         if (state.loadingExtScript()) {
545             if (currentPrependingSrc)
546                 currentPrependingSrc->append(prependingSrc);
547             else
548                 pendingSrc.prepend(prependingSrc);
549         } else {
            // write() works on m_state, so the local state is synced around the call.
550             m_state = state;
551             write(prependingSrc, false);
552             state = m_state;
553         }
554     }
555
556     currentPrependingSrc = savedPrependingSrc;
557
558     return state;
559 }
560
561 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString &src, State state)
562 {
563     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
564     checkScriptBuffer(src.length());
565     while ( !src.isEmpty() ) {
566         scriptCode[ scriptCodeSize++ ] = *src;
567
568         if (*src == '>') {
569             bool handleBrokenComments = brokenComments && !(state.inScript() || state.inStyle());
570             int endCharsCount = 1; // start off with one for the '>' character
571             if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-') {
572                 endCharsCount = 3;
573             }
574             else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' && 
575                 scriptCode[scriptCodeSize-2] == '!') {
576                 // Other browsers will accept --!> as a close comment, even though it's
577                 // not technically valid.
578                 endCharsCount = 4;
579             }
580             if (handleBrokenComments || endCharsCount > 1) {
581                 src.advance(m_lineNumber);
582                 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle())) {
583                     checkScriptBuffer();
584                     scriptCode[scriptCodeSize] = 0;
585                     scriptCode[scriptCodeSize + 1] = 0;
586                     currToken.tagName = commentAtom;
587                     currToken.beginTag = true;
588                     state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state);
589                     processToken();
590                     currToken.tagName = commentAtom;
591                     currToken.beginTag = false;
592                     processToken();
593                     scriptCodeSize = 0;
594                 }
595                 state.setInComment(false);
596                 return state; // Finished parsing comment
597             }
598         }
599         src.advance(m_lineNumber);
600     }
601
602     return state;
603 }
604
605 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
606 {
607     checkScriptBuffer(src.length());
608     while (!src.isEmpty()) {
609         scriptCode[scriptCodeSize++] = *src;
610         if (*src == '>' &&
611             scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
612             src.advance(m_lineNumber);
613             state.setInServer(false);
614             scriptCodeSize = 0;
615             return state; // Finished parsing server include
616         }
617         src.advance(m_lineNumber);
618     }
619     return state;
620 }
621
622 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state)
623 {
624     UChar oldchar = 0;
625     while (!src.isEmpty()) {
626         UChar chbegin = *src;
627         if (chbegin == '\'')
628             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
629         else if (chbegin == '\"')
630             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
631         // Look for '?>'
632         // Some crappy sites omit the "?" before it, so
633         // we look for an unquoted '>' instead. (IE compatible)
634         else if (chbegin == '>' && (!tquote || oldchar == '?')) {
635             // We got a '?>' sequence
636             state.setInProcessingInstruction(false);
637             src.advance(m_lineNumber);
638             state.setDiscardLF(true);
639             return state; // Finished parsing comment!
640         }
641         src.advance(m_lineNumber);
642         oldchar = chbegin;
643     }
644     
645     return state;
646 }
647
648 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
649 {
650     while (!src.isEmpty()) {
651         UChar cc = *src;
652
653         if (state.skipLF()) {
654             state.setSkipLF(false);
655             if (cc == '\n') {
656                 src.advance(m_lineNumber);
657                 continue;
658             }
659         }
660
661         // do we need to enlarge the buffer?
662         checkBuffer();
663
664         if (cc == '\r') {
665             state.setSkipLF(true);
666             *dest++ = '\n';
667         } else
668             *dest++ = cc;
669         src.advance(m_lineNumber);
670     }
671
672     return state;
673 }
674
675
// Incrementally decodes a character reference (the visible caller consumes the '&'
// before calling).
//   src        - input stream; it may run out in the middle of an entity, in which
//                case the entity state is kept in the returned State so that a later
//                call with start == false can continue.
//   dest       - output cursor (by reference); advanced past anything written here.
//                Note that this parameter hides the tokenizer's own `dest` member.
//   state      - current tokenizer state; its entity sub-state drives the switch.
//   cBufferPos - number of characters of the entity text collected in cBuffer.
//   start      - true for the first call on an entity: resets cBufferPos and
//                EntityUnicodeValue and enters SearchEntity.
//   parsingTag - enables an IE-compatibility rule for named entities (see below).
// A successfully decoded character is pushed back onto src rather than written to
// dest. In view-source mode, or when the entity is not recognized, the original
// text ('&' plus the collected characters) is written to dest instead.
// NOTE(review): the checkBuffer() calls appear to guard the tokenizer's member
// buffer, while writes go through the `dest` parameter (parseSpecial passes a
// pointer into scriptCode) -- confirm the capacity check covers that case.
676 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, UChar*& dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
677 {
678     if (start)
679     {
680         cBufferPos = 0;
681         state.setEntityState(SearchEntity);
682         EntityUnicodeValue = 0;
683     }
684
685     while(!src.isEmpty())
686     {
687         UChar cc = *src;
688         switch(state.entityState()) {
689         case NoEntity:
690             ASSERT(state.entityState() != NoEntity);
691             return state;
692         
        // First character after '&': '#' selects a numeric reference, anything else
        // is treated as the start of an entity name (and is not consumed here).
693         case SearchEntity:
694             if(cc == '#') {
695                 cBuffer[cBufferPos++] = cc;
696                 src.advance(m_lineNumber);
697                 state.setEntityState(NumericSearch);
698             }
699             else
700                 state.setEntityState(EntityName);
701
702             break;
703
        // After "&#": 'x'/'X' selects hexadecimal, a digit selects decimal.
704         case NumericSearch:
705             if (cc == 'x' || cc == 'X') {
706                 cBuffer[cBufferPos++] = cc;
707                 src.advance(m_lineNumber);
708                 state.setEntityState(Hexadecimal);
709             } else if (cc >= '0' && cc <= '9')
710                 state.setEntityState(Decimal);
711             else
712                 state.setEntityState(SearchSemicolon);
713             break;
714
        // cBuffer is capped at 10 characters here: '#', 'x' and at most 8 hex digits.
715         case Hexadecimal: {
716             int ll = min(src.length(), 10 - cBufferPos);
717             while (ll--) {
718                 cc = *src;
719                 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
720                     state.setEntityState(SearchSemicolon);
721                     break;
722                 }
723                 int digit;
724                 if (cc < 'A')
725                     digit = cc - '0';
726                 else
727                     digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
728                 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
729                 cBuffer[cBufferPos++] = cc;
730                 src.advance(m_lineNumber);
731             }
732             if (cBufferPos == 10)  
733                 state.setEntityState(SearchSemicolon);
734             break;
735         }
        // cBuffer is capped at 9 characters here: '#' and at most 8 decimal digits.
736         case Decimal:
737         {
738             int ll = min(src.length(), 9-cBufferPos);
739             while(ll--) {
740                 cc = *src;
741
742                 if (!(cc >= '0' && cc <= '9')) {
743                     state.setEntityState(SearchSemicolon);
744                     break;
745                 }
746
747                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
748                 cBuffer[cBufferPos++] = cc;
749                 src.advance(m_lineNumber);
750             }
751             if (cBufferPos == 9)  
752                 state.setEntityState(SearchSemicolon);
753             break;
754         }
        // Collects up to 9 ASCII alphanumeric characters of an entity name.
755         case EntityName:
756         {
757             int ll = min(src.length(), 9-cBufferPos);
758             while(ll--) {
759                 cc = *src;
760
761                 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
762                     state.setEntityState(SearchSemicolon);
763                     break;
764                 }
765
766                 cBuffer[cBufferPos++] = cc;
767                 src.advance(m_lineNumber);
768             }
769             if (cBufferPos == 9) 
770                 state.setEntityState(SearchSemicolon);
771             if (state.entityState() == SearchSemicolon) {
772                 if(cBufferPos > 1) {
773                     const Entity *e = findEntity(cBuffer, cBufferPos);
774                     if(e)
775                         EntityUnicodeValue = e->code;
776
777                     // be IE compatible
                    // Inside a tag, a named entity above 255 is only honoured when
                    // it is terminated by ';'.
                    // NOTE(review): *src is read without an isEmpty() check; if the
                    // loop above consumed the last available character, src may be
                    // empty here -- confirm SegmentedString tolerates that.
778                     if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
779                         EntityUnicodeValue = 0;
780                 }
781             }
782             else
783                 break;
            // Deliberate fall-through: the name is complete, continue with
            // SearchSemicolon below.
784         }
785         case SearchSemicolon:
786             // Don't allow values that are more than 21 bits.
787             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
788                 if (!inViewSourceMode()) {
                    // Consume an optional ';' and push the decoded character(s) back
                    // onto src. Values in the BMP go through fixUpChar(); larger
                    // values are pushed as a surrogate pair.
789                     if (*src == ';')
790                         src.advance(m_lineNumber);
791                     if (EntityUnicodeValue <= 0xFFFF) {
792                         checkBuffer();
793                         src.push(fixUpChar(EntityUnicodeValue));
794                     } else {
795                         // Convert to UTF-16, using surrogate code points.
796                         checkBuffer(2);
797                         src.push(U16_LEAD(EntityUnicodeValue));
798                         src.push(U16_TRAIL(EntityUnicodeValue));
799                     }
800                 } else {
801                     // FIXME: We should eventually colorize entities by sending them as a special token.
                    // View-source mode: reproduce the entity text verbatim.
802                     checkBuffer(11);
803                     *dest++ = '&';
804                     for (unsigned i = 0; i < cBufferPos; i++)
805                         dest[i] = cBuffer[i];
806                     dest += cBufferPos;
807                     if (*src == ';') {
808                         *dest++ = ';';
809                         src.advance(m_lineNumber);
810                     }
811                 }
812             } else {
813                 checkBuffer(10);
814                 // ignore the sequence, add it to the buffer as plaintext
815                 *dest++ = '&';
816                 for (unsigned i = 0; i < cBufferPos; i++)
817                     dest[i] = cBuffer[i];
818                 dest += cBufferPos;
819             }
820
821             state.setEntityState(NoEntity);
822             return state;
823         }
824     }
825
826     return state;
827 }
828
// Tokenizes the inside of a tag as a state machine keyed on state.tagState().
// Characters are consumed from src until the tag is finished or src is empty.
// The write position in cBuffer is kept in a local and stored back into
// m_cBufferPos before every return, so an unfinished tag can be resumed by a
// later call. Returns the updated tokenizer state.
829 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
830 {
831     ASSERT(!state.hasEntityState());
832
        // Local copy of the cBuffer write position; written back to m_cBufferPos on every exit path.
833     unsigned cBufferPos = m_cBufferPos;
834
        // Tracks whether the previous character seen in SearchEqual was '/', to recognize "/>".
835     bool lastIsSlash = false;
836
837     while (!src.isEmpty()) {
838         checkBuffer();
839         switch(state.tagState()) {
840         case NoTag:
841         {
                // No tag is in progress any more: save the buffer position and return to the caller.
842             m_cBufferPos = cBufferPos;
843             return state;
844         }
            // Collect the tag name into cBuffer, while also matching the "<!--" comment opener.
845         case TagName:
846         {
847 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
848             qDebug("TagName");
849 #endif
                // searchCount is the number of characters of commentStart matched so far.
850             if (searchCount > 0)
851             {
852                 if (*src == commentStart[searchCount])
853                 {
854                     searchCount++;
855                     if (searchCount == 4)
856                     {
857 #ifdef TOKEN_DEBUG
858                         kdDebug( 6036 ) << "Found comment" << endl;
859 #endif
860                         // Found '<!--' sequence
861                         src.advance(m_lineNumber);
862                         dest = buffer; // ignore the previous part of this tag
863                         state.setInComment(true);
864                         state.setTagState(NoTag);
865
866                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
867                         // <!--> as a valid comment, since both mozilla and IE on windows
868                         // can handle this case.  Only do this in quirks mode. -dwh
869                         if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
870                           state.setInComment(false);
871                           src.advance(m_lineNumber);
                              // NOTE(review): the next character is copied into cBuffer but src is
                              // not advanced past it here - confirm this is intended.
872                           if (!src.isEmpty())
873                               // cuts off high bits, which is okay
874                               cBuffer[cBufferPos++] = *src;
875                         }
876                         else
877                           state = parseComment(src, state);
878
879                         m_cBufferPos = cBufferPos;
880                         return state; // Finished parsing tag!
881                     }
882                     // cuts off high bits, which is okay
883                     cBuffer[cBufferPos++] = *src;
884                     src.advance(m_lineNumber);
885                     break;
886                 }
887                 else
888                     searchCount = 0; // Stop looking for '<!--' sequence
889             }
890
891             bool finish = false;
                // Copy no more characters than src currently holds and cBuffer has room for.
892             unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
893             while (ll--) {
894                 UChar curchar = *src;
                    // The name ends at '<', '>' or any character at or below space.
895                 if (curchar <= ' ' || curchar == '>' || curchar == '<') {
896                     finish = true;
897                     break;
898                 }
899                 
900                 // tolower() shows up on profiles. This is faster!
901                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
902                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
903                 else
904                     cBuffer[cBufferPos++] = curchar;
905                 src.advance(m_lineNumber);
906             }
907
908             // Disadvantage: we add the possible rest of the tag
909             // as attribute names. ### judge if this causes problems
910             if(finish || CBUFLEN == cBufferPos) {
911                 bool beginTag;
912                 char* ptr = cBuffer;
913                 unsigned int len = cBufferPos;
914                 cBuffer[cBufferPos] = '\0';
915                 if ((cBufferPos > 0) && (*ptr == '/')) {
916                     // End Tag
917                     beginTag = false;
918                     ptr++;
919                     len--;
920                 }
921                 else
922                     // Start Tag
923                     beginTag = true;
924
925                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
926                 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
927                     ptr[--len] = '\0';
928
929                 // Now that we've shaved off any invalid / that might have followed the name, make the tag.
930                 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
931                 if (ptr[0] != '!' || inViewSourceMode()) {
932                     currToken.tagName = AtomicString(ptr);
933                     currToken.beginTag = beginTag;
934                 }
935                 dest = buffer;
936                 state.setTagState(SearchAttribute);
937                 cBufferPos = 0;
938             }
939             break;
940         }
            // Skip whitespace and stray quotes until an attribute name or the end of the tag begins.
941         case SearchAttribute:
942 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
943             qDebug("SearchAttribute");
944 #endif
945             while(!src.isEmpty()) {
946                 UChar curchar = *src;
947                 // In this mode just ignore any quotes we encounter and treat them like spaces.
948                 if (curchar > ' ' && curchar != '\'' && curchar != '"') {
949                     if (curchar == '<' || curchar == '>')
950                         state.setTagState(SearchEnd);
951                     else
952                         state.setTagState(AttributeName);
953
954                     cBufferPos = 0;
955                     break;
956                 }
957                 if (inViewSourceMode())
958                     currToken.addViewSourceChar(curchar);
959                 src.advance(m_lineNumber);
960             }
961             break;
            // Collect the attribute name into cBuffer until a delimiter is seen or cBuffer is full.
962         case AttributeName:
963         {
964 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
965             qDebug("AttributeName");
966 #endif
967             int ll = min(src.length(), CBUFLEN-cBufferPos);
968             while(ll--) {
969                 UChar curchar = *src;
970                 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the 
971                 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
                    // The test below matches '<', '=', '>', '/' and any character at or below space.
972                 if (curchar <= '>' && (curchar >= '<' || curchar <= ' ' || curchar == '/')) {
973                     cBuffer[cBufferPos] = '\0';
974                     attrName = AtomicString(cBuffer);
                        // Reset dest and write a 0 into buffer[0]; attribute values are later read from buffer+1.
975                     dest = buffer;
976                     *dest++ = 0;
977                     state.setTagState(SearchEqual);
978                     if (inViewSourceMode())
979                         currToken.addViewSourceChar('a');
980                     break;
981                 }
982                 
983                 // tolower() shows up on profiles. This is faster!
984                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
985                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
986                 else
987                     cBuffer[cBufferPos++] = curchar;
988                     
989                 src.advance(m_lineNumber);
990             }
                // cBuffer is full: end the attribute name here, exactly as if a delimiter had been found.
991             if ( cBufferPos == CBUFLEN ) {
992                 cBuffer[cBufferPos] = '\0';
993                 attrName = AtomicString(cBuffer);
994                 dest = buffer;
995                 *dest++ = 0;
996                 state.setTagState(SearchEqual);
997                 if (inViewSourceMode())
998                     currToken.addViewSourceChar('a');
999             }
1000             break;
1001         }
             // After an attribute name: look for '='; any other significant character means the attribute has no value.
1002         case SearchEqual:
1003 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1004             qDebug("SearchEqual");
1005 #endif
1006             while(!src.isEmpty()) {
1007                 UChar curchar = *src;
1008
1009                 if (lastIsSlash && curchar == '>') {
1010                     // This is a quirk (with a long sad history).  We have to do this
1011                     // since widgets do <script src="foo.js"/> and expect the tag to close.
1012                     if (currToken.tagName == scriptTag)
1013                         currToken.flat = true;
1014                     currToken.brokenXMLStyle = true;
1015                 }
1016
1017                 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
1018                 if (curchar > ' ' && curchar != '\'' && curchar != '"' && curchar != '/') {
1019                     if(curchar == '=') {
1020 #ifdef TOKEN_DEBUG
1021                         kdDebug(6036) << "found equal" << endl;
1022 #endif
1023                         state.setTagState(SearchValue);
1024                         if (inViewSourceMode())
1025                             currToken.addViewSourceChar(curchar);
1026                         src.advance(m_lineNumber);
1027                     }
1028                     else {
                             // Valueless attribute: add it with an empty value; the current character is not consumed.
1029                         currToken.addAttribute(m_doc, attrName, emptyAtom, inViewSourceMode());
1030                         dest = buffer;
1031                         state.setTagState(SearchAttribute);
1032                         lastIsSlash = false;
1033                     }
1034                     break;
1035                 }
1036                 if (inViewSourceMode())
1037                     currToken.addViewSourceChar(curchar);
1038                     
1039                 lastIsSlash = curchar == '/';
1040
1041                 src.advance(m_lineNumber);
1042             }
1043             break;
             // After '=': skip characters at or below space, then choose between a quoted and an unquoted value.
1044         case SearchValue:
1045             while(!src.isEmpty()) {
1046                 UChar curchar = *src;
1047                 if(curchar > ' ') {
1048                     if(( curchar == '\'' || curchar == '\"' )) {
1049                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1050                         state.setTagState(QuotedValue);
1051                         if (inViewSourceMode())
1052                             currToken.addViewSourceChar(curchar);
1053                         src.advance(m_lineNumber);
1054                     } else
1055                         state.setTagState(Value);
1056
1057                     break;
1058                 }
1059                 if (inViewSourceMode())
1060                     currToken.addViewSourceChar(curchar);
1061                 src.advance(m_lineNumber);
1062             }
1063             break;
             // Copy a quoted attribute value into buffer (from buffer+1) until the matching quote is found.
1064         case QuotedValue:
1065 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1066             qDebug("QuotedValue");
1067 #endif
1068             while(!src.isEmpty()) {
1069                 checkBuffer();
1070
1071                 UChar curchar = *src;
1072                 if (curchar == '>' && attrName.isEmpty()) {
1073                     // Handle a case like <img '>.  Just go ahead and be willing
1074                     // to close the whole tag.  Don't consume the character and
1075                     // just go back into SearchEnd while ignoring the whole
1076                     // value.
1077                     // FIXME: Note that this is actually not a very good solution. It's
1078                     // an interim hack and doesn't handle the general case of
1079                     // unmatched quotes among attributes that have names. -dwh
1080                     while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1081                         dest--; // remove trailing newlines
1082                     AtomicString v(buffer+1, dest-buffer-1);
1083                     attrName = v; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
1084                     currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1085                     if (inViewSourceMode())
1086                         currToken.addViewSourceChar('x');
1087                     state.setTagState(SearchAttribute);
1088                     dest = buffer;
1089                     tquote = NoQuote;
1090                     break;
1091                 }
1092                 
                     // Cheap pre-filter: '&', '"' and '\'' all compare <= '\''.
1093                 if(curchar <= '\'' && !src.escaped()) {
1094                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1095                     if ( curchar == '&' )
1096                     {
1097                         src.advance(m_lineNumber);
1098                         state = parseEntity(src, dest, state, cBufferPos, true, true);
1099                         break;
1100                     }
1101                     else if ( (tquote == SingleQuote && curchar == '\'') ||
1102                               (tquote == DoubleQuote && curchar == '\"') )
1103                     {
1104                         // some <input type=hidden> rely on trailing spaces. argh
1105                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1106                             dest--; // remove trailing newlines
1107                         AtomicString v(buffer+1, dest-buffer-1);
1108                         if (attrName.isEmpty()) {
1109                             attrName = v; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
1110                             if (inViewSourceMode())
1111                                 currToken.addViewSourceChar('x');
1112                         } else if (inViewSourceMode())
1113                             currToken.addViewSourceChar('v');
1114                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1115                         dest = buffer;
1116                         state.setTagState(SearchAttribute);
1117                         tquote = NoQuote;
1118                         if (inViewSourceMode())
1119                             currToken.addViewSourceChar(curchar);
1120                         src.advance(m_lineNumber);
1121                         break;
1122                     }
1123                 }
1124                 *dest++ = *src;
1125                 src.advance(m_lineNumber);
1126             }
1127             break;
             // Copy an unquoted attribute value into buffer until '>' or a character at or below space.
1128         case Value:
1129 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1130             qDebug("Value");
1131 #endif
1132             while(!src.isEmpty()) {
1133                 checkBuffer();
1134                 UChar curchar = *src;
1135                 if(curchar <= '>' && !src.escaped()) {
1136                     // parse Entities
1137                     if ( curchar == '&' )
1138                     {
1139                         src.advance(m_lineNumber);
1140                         state = parseEntity(src, dest, state, cBufferPos, true, true);
1141                         break;
1142                     }
1143                     // no quotes. Every space means end of value
1144                     // '/' does not delimit in IE!
1145                     if ( curchar <= ' ' || curchar == '>' )
1146                     {
1147                         AtomicString v(buffer+1, dest-buffer-1);
1148                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1149                         if (inViewSourceMode())
1150                             currToken.addViewSourceChar('v');
1151                         dest = buffer;
1152                         state.setTagState(SearchAttribute);
1153                         break;
1154                     }
1155                 }
1156
1157                 *dest++ = *src;
1158                 src.advance(m_lineNumber);
1159             }
1160             break;
             // Skip to the end of the tag, emit the token, and enter any special content mode the tag requires.
1161         case SearchEnd:
1162         {
1163 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1164                 qDebug("SearchEnd");
1165 #endif
1166             while(!src.isEmpty()) {
1167                 if (*src == '>' || *src == '<')
1168                     break;
1169
1170                 if (*src == '/')
1171                     currToken.flat = true;
1172
1173                 if (inViewSourceMode())
1174                     currToken.addViewSourceChar(*src);
1175                 src.advance(m_lineNumber);
1176             }
                 // Ran out of input before the tag ended; stay in SearchEnd and resume on the next call.
1177             if (src.isEmpty()) break;
1178
1179             searchCount = 0; // Stop looking for '<!--' sequence
1180             state.setTagState(NoTag);
1181             tquote = NoQuote;
1182
                 // Consume the closing '>'; a '<' is left in src for the caller to process.
1183             if (*src != '<')
1184                 src.advance(m_lineNumber);
1185
1186             if (currToken.tagName == nullAtom) { //stop if tag is unknown
1187                 m_cBufferPos = cBufferPos;
1188                 return state;
1189             }
1190
                 // Copy the tag name into a local before processToken() is called further down.
1191             AtomicString tagName = currToken.tagName;
1192 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1193             kdDebug( 6036 ) << "appending Tag: " << tagName.deprecatedString() << endl;
1194 #endif
1195
1196             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
1197             // compatibility.
1198             bool isSelfClosingScript = currToken.flat && currToken.beginTag && currToken.tagName == scriptTag;
                 // A flat start tag does not count as a begin tag for the special-content handling below.
1199             bool beginTag = !currToken.flat && currToken.beginTag;
                 // For a <script> start tag, record the resolved src URL and charset before the token is processed.
1200             if (currToken.beginTag && currToken.tagName == scriptTag && !inViewSourceMode() && !parser->skipMode()) {
1201                 Attribute* a = 0;
1202                 scriptSrc = String();
1203                 scriptSrcCharset = String();
1204                 if (currToken.attrs && !m_fragment) {
1205                     Settings* settings = m_doc->settings();
1206                     if (settings && settings->isJavaScriptEnabled()) {
1207                         if ((a = currToken.attrs->getAttributeItem(srcAttr)))
1208                             scriptSrc = m_doc->completeURL(parseURL(a->value()));
1209                         if ((a = currToken.attrs->getAttributeItem(charsetAttr)))
1210                             scriptSrcCharset = a->value().domString().stripWhiteSpace();
                             // No charset attribute (or an empty one): fall back to the frame loader's encoding.
1211                         if (scriptSrcCharset.isEmpty())
1212                             scriptSrcCharset = m_doc->frame()->loader()->encoding();
1213                     }
1214                 }
1215             }
1216
1217             RefPtr<Node> n = processToken();
1218             m_cBufferPos = cBufferPos;
1219             if (n) {
1220                 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
1221                     if (beginTag)
1222                         state.setDiscardLF(true); // Discard the first LF after we open a pre.
                     // n is already known to be non-null here, so the "&& n" test below is redundant.
1223                 } else if (tagName == scriptTag && n) {
1224                     ASSERT(!scriptNode);
1225                     scriptNode = n;
1226                     if (beginTag) {
1227                         searchStopper = scriptEnd;
1228                         searchStopperLen = 8;
1229                         state.setInScript(true);
1230                         state = parseSpecial(src, state);
1231                     } else if (isSelfClosingScript) { // Handle <script src="foo"/>
1232                         state.setInScript(true);
1233                         state = scriptHandler(state);
1234                     }
1235                 } else if (tagName == styleTag) {
1236                     if (beginTag) {
1237                         searchStopper = styleEnd;
1238                         searchStopperLen = 7;
1239                         state.setInStyle(true);
1240                         state = parseSpecial(src, state);
1241                     }
1242                 } else if (tagName == textareaTag) {
1243                     if (beginTag) {
1244                         searchStopper = textareaEnd;
1245                         searchStopperLen = 10;
1246                         state.setInTextArea(true);
1247                         state = parseSpecial(src, state);
1248                     }
1249                 } else if (tagName == titleTag) {
1250                     if (beginTag) {
1251                         searchStopper = titleEnd;
1252                         searchStopperLen = 7;
                             // Snapshot state, source and line number so the title parse can be undone below.
1253                         State savedState = state;
1254                         SegmentedString savedSrc = src;
1255                         long savedLineno = m_lineNumber;
1256                         state.setInTitle(true);
1257                         state = parseSpecial(src, state);
1258                         if (state.inTitle() && src.isEmpty()) {
1259                             // We just ate the rest of the document as the title #text node!
1260                             // Reset the state then retokenize without special title handling.
1261                             // Let the parser clean up the missing </title> tag.
1262                             // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
1263                             // at the end of the document unless noMoreData is also true. We need
1264                             // to detect this case elsewhere, and save the state somewhere other
1265                             // than a local variable.
1266                             state = savedState;
1267                             src = savedSrc;
1268                             m_lineNumber = savedLineno;
1269                             scriptCodeSize = 0;
1270                         }
1271                     }
1272                 } else if (tagName == xmpTag) {
1273                     if (beginTag) {
1274                         searchStopper = xmpEnd;
1275                         searchStopperLen = 5;
1276                         state.setInXmp(true);
1277                         state = parseSpecial(src, state);
1278                     }
1279                 }
1280             }
                 // Unlike the tags above, <plaintext> is handled even when processToken() returned no node.
1281             if (tagName == plaintextTag)
1282                 state.setInPlainText(beginTag);
1283             return state; // Finished parsing tag!
1284         }
1285         } // end switch
1286     }
         // src ran dry in the middle of a tag: remember the buffer position for the next call.
1287     m_cBufferPos = cBufferPos;
1288     return state;
1289 }
1290
1291 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1292 {
1293     // We don't want to be checking elapsed time with every character, so we only check after we've
1294     // processed a certain number of characters.
1295     bool allowedYield = state.allowYield();
1296     state.setAllowYield(false);
1297     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > TOKENIZER_CHUNK_SIZE || allowedYield)) {
1298         processedCount = 0;
1299         if (currentTime() - startTime > tokenizerTimeDelay) {
1300             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1301                load, but this hurts overall performance on slower machines.  For now turn this
1302                off.
1303             || (!m_doc->haveStylesheetsLoaded() && 
1304                 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1305             // Schedule the timer to keep processing as soon as possible.
1306             m_timer.startOneShot(0);
1307 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1308             if (currentTime() - startTime > tokenizerTimeDelay)
1309                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1310 #endif
1311             return false;
1312         }
1313     }
1314     
1315     processedCount++;
1316     return true;
1317 }
1318
// Feeds str to the tokenizer. The data is either queued for later (while a script is
// executing with appendData set, or while scripts are pending) or appended to src and
// tokenized character by character. The loop stops when src is empty, when a scheduled
// location change is pending on the frame, or when continueProcessing() returns false.
// Returns true only if end() was called at the end of this write; false in every other case.
1319 bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1320 {
1321 #ifdef TOKEN_DEBUG
1322     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
1323 #endif
1324
         // Without a buffer there is nothing to tokenize into.
1325     if (!buffer)
1326         return false;
1327     
1328     if (m_parserStopped)
1329         return false;
1330
1331     SegmentedString source(str);
1332     if (m_executingScript)
1333         source.setExcludeLineNumbers();
1334
1335     if ((m_executingScript && appendData) || !pendingScripts.isEmpty()) {
1336         // don't parse; we will do this later
1337         if (currentPrependingSrc)
1338             currentPrependingSrc->append(source);
1339         else
1340             pendingSrc.append(source);
1341         return false;
1342     }
1343
1344     if (!src.isEmpty())
1345         src.append(source);
1346     else
1347         setSrc(source);
1348
1349     // Once a timer is set, it has control of when the tokenizer continues.
1350     if (m_timer.isActive())
1351         return false;
1352
         // Remember the previous inWrite value; it is restored after the loop.
1353     bool wasInWrite = inWrite;
1354     inWrite = true;
1355     
1356 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1357     if (!m_doc->ownerElement())
1358         printf("Beginning write at time %d\n", m_doc->elapsedTime());
1359 #endif
1360     
1361     int processedCount = 0;
1362     double startTime = currentTime();
1363
1364     Frame *frame = m_doc->frame();
1365
         // Work on a local copy of the state; it is written back to m_state after the loop.
1366     State state = m_state;
1367
1368     while (!src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
1369         if (!continueProcessing(processedCount, startTime, state))
1370             break;
1371
1372         // do we need to enlarge the buffer?
1373         checkBuffer();
1374
1375         UChar cc = *src;
1376
1377         bool wasSkipLF = state.skipLF();
1378         if (wasSkipLF)
1379             state.setSkipLF(false);
1380
             // Drop the LF of a CR LF pair; the CR was already emitted as '\n' (see the newline branch below).
1381         if (wasSkipLF && (cc == '\n'))
1382             src.advance();
1383         else if (state.needsSpecialWriteHandling()) {
1384             // it's important to keep needsSpecialWriteHandling with the flags this block tests
1385             if (state.hasEntityState())
1386                 state = parseEntity(src, dest, state, m_cBufferPos, false, state.hasTagState());
1387             else if (state.inPlainText())
1388                 state = parseText(src, state);
1389             else if (state.inAnySpecial())
1390                 state = parseSpecial(src, state);
1391             else if (state.inComment())
1392                 state = parseComment(src, state);
1393             else if (state.inServer())
1394                 state = parseServer(src, state);
1395             else if (state.inProcessingInstruction())
1396                 state = parseProcessingInstruction(src, state);
1397             else if (state.hasTagState())
1398                 state = parseTag(src, state);
1399             else if (state.startTag()) {
                     // The previous character was '<'; cc decides what kind of construct begins here.
1400                 state.setStartTag(false);
1401                 
1402                 switch(cc) {
                     // '/' falls out of the switch to the common tag handling below.
1403                 case '/':
1404                     break;
1405                 case '!': {
1406                     // <!-- comment -->
1407                     searchCount = 1; // Look for '<!--' sequence to start comment
1408                     
1409                     break;
1410                 }
1411                 case '?': {
1412                     // xml processing instruction
1413                     state.setInProcessingInstruction(true);
1414                     tquote = NoQuote;
1415                     state = parseProcessingInstruction(src, state);
1416                     continue;
1417
                         // Unreachable: the continue above always leaves this case.
1418                     break;
1419                 }
1420                 case '%':
1421                     if (!brokenServer) {
1422                         // <% server stuff, handle as comment %>
1423                         state.setInServer(true);
1424                         tquote = NoQuote;
1425                         state = parseServer(src, state);
1426                         continue;
1427                     }
1428                     // else fall through
1429                 default: {
1430                     if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1431                         // Start of a Start-Tag
1432                     } else {
1433                         // Invalid tag
1434                         // Add as is
                             // Only the '<' is emitted; cc is not consumed and is handled by the next iteration.
1435                         *dest = '<';
1436                         dest++;
1437                         continue;
1438                     }
1439                 }
1440                 }; // end case
1441
                     // NOTE(review): presumably flushes text accumulated so far before the tag starts - confirm in processToken().
1442                 processToken();
1443
1444                 m_cBufferPos = 0;
1445                 state.setTagState(TagName);
1446                 state = parseTag(src, state);
1447             }
1448         } else if (cc == '&' && !src.escaped()) {
1449             src.advance(m_lineNumber);
1450             state = parseEntity(src, dest, state, m_cBufferPos, true, state.hasTagState());
1451         } else if (cc == '<' && !src.escaped()) {
                 // Record the line the tag starts on, consume the '<', and let the next iteration classify the tag.
1452             tagStartLineno = m_lineNumber;
1453             src.advance(m_lineNumber);
1454             state.setStartTag(true);
1455         } else if (cc == '\n' || cc == '\r') {
                 // Both CR and LF are emitted as '\n' unless a discard of the next LF was requested.
1456             if (state.discardLF())
1457                 // Ignore this LF
1458                 state.setDiscardLF(false); // We have discarded 1 LF
1459             else {
1460                 // Process this LF
1461                 *dest++ = '\n';
1462                 if (cc == '\r' && !src.excludeLineNumbers())
1463                     m_lineNumber++;
1464             }
1465
1466             /* Check for MS-DOS CRLF sequence */
1467             if (cc == '\r')
1468                 state.setSkipLF(true);
1469             src.advance(m_lineNumber);
1470         } else {
                 // Ordinary character: copy it to the output buffer.
1471             state.setDiscardLF(false);
1472             *dest++ = cc;
1473             src.advance(m_lineNumber);
1474         }
1475     }
1476     
1477 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1478     if (!m_doc->ownerElement())
1479         printf("Ending write at time %d\n", m_doc->elapsedTime());
1480 #endif
1481     
1482     inWrite = wasInWrite;
1483
1484     m_state = state;
1485
         // All data has arrived and nothing else is pending (no outer write, external script, running script or timer): finish.
1486     if (noMoreData && !inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
1487         end(); // this actually causes us to be deleted
1488         return true;
1489     }
1490     return false;
1491 }
1492
1493 void HTMLTokenizer::stopParsing()
1494 {
1495     Tokenizer::stopParsing();
1496     m_timer.stop();
1497
1498     // The part needs to know that the tokenizer has finished with its data,
1499     // regardless of whether it happened naturally or due to manual intervention.
1500     if (!m_fragment && m_doc->frame())
1501         m_doc->frame()->loader()->tokenizerProcessedData();
1502 }
1503
1504 bool HTMLTokenizer::processingData() const
1505 {
1506     return m_timer.isActive() || inWrite;
1507 }
1508
1509 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1510 {
1511 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1512     if (!m_doc->ownerElement())
1513         printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1514 #endif
1515
1516     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1517         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
1518         // timer has higher priority than our timer.
1519         m_timer.startOneShot(0);
1520         return;
1521     }
1522     
1523     RefPtr<Frame> frame = m_fragment ? 0 : m_doc->frame();
1524
1525     // Invoke write() as though more data came in.
1526     bool didCallEnd = write(SegmentedString(), true);
1527   
1528     // If we called end() during the write,  we need to let WebKit know that we're done processing the data.
1529     if (didCallEnd && frame)
1530         frame->loader()->tokenizerProcessedData();
1531 }
1532
1533 void HTMLTokenizer::end()
1534 {
1535     ASSERT(!m_timer.isActive());
1536     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1537
1538     if (buffer) {
1539         // parseTag is using the buffer for different matters
1540         if (!m_state.hasTagState())
1541             processToken();
1542
1543         fastFree(scriptCode);
1544         scriptCode = 0;
1545         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1546
1547         fastFree(buffer);
1548         buffer = 0;
1549     }
1550
1551     if (!inViewSourceMode())
1552         parser->finished();
1553     else
1554         m_doc->finishedParsing();
1555 }
1556
1557 void HTMLTokenizer::finish()
1558 {
1559     // do this as long as we don't find matching comment ends
1560     while((m_state.inComment() || m_state.inServer()) && scriptCode && scriptCodeSize) {
1561         // we've found an unmatched comment start
1562         if (m_state.inComment())
1563             brokenComments = true;
1564         else
1565             brokenServer = true;
1566         checkScriptBuffer();
1567         scriptCode[scriptCodeSize] = 0;
1568         scriptCode[scriptCodeSize + 1] = 0;
1569         int pos;
1570         String food;
1571         if (m_state.inScript() || m_state.inStyle())
1572             food = String(scriptCode, scriptCodeSize);
1573         else if (m_state.inServer()) {
1574             food = "<";
1575             food.append(String(scriptCode, scriptCodeSize));
1576         } else {
1577             pos = DeprecatedConstString(reinterpret_cast<DeprecatedChar*>(scriptCode), scriptCodeSize).string().find('>');
1578             food = String(scriptCode + pos + 1, scriptCodeSize - pos - 1);
1579         }
1580         fastFree(scriptCode);
1581         scriptCode = 0;
1582         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1583         m_state.setInComment(false);
1584         m_state.setInServer(false);
1585         if (!food.isEmpty())
1586             write(food, true);
1587     }
1588     // this indicates we will not receive any more data... but if we are waiting on
1589     // an external script to load, we can't finish parsing until that is done
1590     noMoreData = true;
1591     if (!inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1592         end(); // this actually causes us to be deleted
1593 }
1594
1595 PassRefPtr<Node> HTMLTokenizer::processToken()
1596 {
1597     KJSProxy* jsProxy = (!m_fragment && m_doc->frame()) ? m_doc->frame()->scriptProxy() : 0;
1598     if (jsProxy)
1599         jsProxy->setEventHandlerLineno(tagStartLineno);
1600     if (dest > buffer) {
1601 #ifdef TOKEN_DEBUG
1602         if(currToken.tagName.length()) {
1603             qDebug( "unexpected token: %s, str: *%s*", currToken.tagName.deprecatedString().latin1(),DeprecatedConstString( buffer,dest-buffer ).deprecatedString().latin1() );
1604             ASSERT(0);
1605         }
1606
1607 #endif
1608         currToken.text = StringImpl::createStrippingNull(buffer, dest - buffer);
1609         if (currToken.tagName != commentAtom)
1610             currToken.tagName = textAtom;
1611     } else if (currToken.tagName == nullAtom) {
1612         currToken.reset();
1613         if (jsProxy)
1614             jsProxy->setEventHandlerLineno(m_lineNumber);
1615         return 0;
1616     }
1617
1618     dest = buffer;
1619
1620 #ifdef TOKEN_DEBUG
1621     DeprecatedString name = currToken.tagName.deprecatedString();
1622     DeprecatedString text;
1623     if(currToken.text)
1624         text = DeprecatedConstString(currToken.text->unicode(), currToken.text->length()).deprecatedString();
1625
1626     kdDebug( 6036 ) << "Token --> " << name << endl;
1627     if (currToken.flat)
1628         kdDebug( 6036 ) << "Token is FLAT!" << endl;
1629     if(!text.isNull())
1630         kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
1631     unsigned l = currToken.attrs ? currToken.attrs->length() : 0;
1632     if(l) {
1633         kdDebug( 6036 ) << "Attributes: " << l << endl;
1634         for (unsigned i = 0; i < l; ++i) {
1635             Attribute* c = currToken.attrs->attributeItem(i);
1636             kdDebug( 6036 ) << "    " << c->localName().deprecatedString()
1637                             << "=\"" << c->value().deprecatedString() << "\"" << endl;
1638         }
1639     }
1640     kdDebug( 6036 ) << endl;
1641 #endif
1642
1643     RefPtr<Node> n;
1644     
1645     if (!m_parserStopped) {
1646         if (inViewSourceMode())
1647             static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&currToken);
1648         else
1649             // pass the token over to the parser, the parser DOES NOT delete the token
1650             n = parser->parseToken(&currToken);
1651     }
1652     currToken.reset();
1653     if (jsProxy)
1654         jsProxy->setEventHandlerLineno(0);
1655
1656     return n.release();
1657 }
1658
1659 HTMLTokenizer::~HTMLTokenizer()
1660 {
1661     ASSERT(!inWrite);
1662     reset();
1663     delete parser;
1664 }
1665
1666
1667 void HTMLTokenizer::enlargeBuffer(int len)
1668 {
1669     int newSize = max(size * 2, size + len);
1670     int oldOffset = dest - buffer;
1671     buffer = static_cast<UChar*>(fastRealloc(buffer, newSize * sizeof(UChar)));
1672     dest = buffer + oldOffset;
1673     size = newSize;
1674 }
1675
1676 void HTMLTokenizer::enlargeScriptBuffer(int len)
1677 {
1678     int newSize = max(scriptCodeMaxSize * 2, scriptCodeMaxSize + len);
1679     scriptCode = static_cast<UChar*>(fastRealloc(scriptCode, newSize * sizeof(UChar)));
1680     scriptCodeMaxSize = newSize;
1681 }
1682     
1683 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
1684 {
1685     ASSERT(m_doc->haveStylesheetsLoaded());
1686
1687     if (m_hasScriptsWaitingForStylesheets)
1688         notifyFinished(0);
1689 }
1690
1691 void HTMLTokenizer::notifyFinished(CachedResource*)
1692 {
1693 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1694     if (!m_doc->ownerElement())
1695         printf("script loaded at %d\n", m_doc->elapsedTime());
1696 #endif
1697
1698     ASSERT(!pendingScripts.isEmpty());
1699
1700     // Make scripts loaded from file URLs wait for stylesheets to match Tiger behavior where
1701     // file loads were serialized in lower level.
1702     // FIXME: this should really be done for all script loads or the same effect should be achieved by other
1703     // means, like javascript suspend/resume
1704     m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded() && pendingScripts.head()->url().startsWith("file:", false);
1705     if (m_hasScriptsWaitingForStylesheets)
1706         return;
1707
1708     bool finished = false;
1709     while (!finished && pendingScripts.head()->isLoaded()) {
1710 #ifdef TOKEN_DEBUG
1711         kdDebug( 6036 ) << "Finished loading an external script" << endl;
1712 #endif
1713         CachedScript* cs = pendingScripts.dequeue();
1714         ASSERT(cache()->disabled() || cs->accessCount() > 0);
1715
1716         String scriptSource = cs->script();
1717 #ifdef TOKEN_DEBUG
1718         kdDebug( 6036 ) << "External script is:" << endl << scriptSource.deprecatedString() << endl;
1719 #endif
1720         setSrc(SegmentedString());
1721
1722         // make sure we forget about the script before we execute the new one
1723         // infinite recursion might happen otherwise
1724         DeprecatedString cachedScriptUrl( cs->url().deprecatedString() );
1725         bool errorOccurred = cs->errorOccurred();
1726         cs->deref(this);
1727         RefPtr<Node> n = scriptNode.release();
1728
1729 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1730         if (!m_doc->ownerElement())
1731             printf("external script beginning execution at %d\n", m_doc->elapsedTime());
1732 #endif
1733
1734         if (errorOccurred)
1735             EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, true, false);
1736         else {
1737             if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
1738                 m_state = scriptExecution(scriptSource.deprecatedString(), m_state, cachedScriptUrl);
1739             EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false);
1740         }
1741
1742         // The state of pendingScripts.isEmpty() can change inside the scriptExecution()
1743         // call above, so test afterwards.
1744         finished = pendingScripts.isEmpty();
1745         if (finished) {
1746             m_state.setLoadingExtScript(false);
1747 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1748             if (!m_doc->ownerElement())
1749                 printf("external script finished execution at %d\n", m_doc->elapsedTime());
1750 #endif
1751         }
1752
1753         // 'm_requestingScript' is true when we are called synchronously from
1754         // scriptHandler(). In that case scriptHandler() will take care
1755         // of pendingSrc.
1756         if (!m_requestingScript) {
1757             SegmentedString rest = pendingSrc;
1758             pendingSrc.clear();
1759             write(rest, false);
1760             // we might be deleted at this point, do not
1761             // access any members.
1762         }
1763     }
1764 }
1765
1766 bool HTMLTokenizer::isWaitingForScripts() const
1767 {
1768     return m_state.loadingExtScript();
1769 }
1770
1771 void HTMLTokenizer::setSrc(const SegmentedString &source)
1772 {
1773     src = source;
1774 }
1775
1776 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)
1777 {
1778     HTMLTokenizer tok(fragment);
1779     tok.setForceSynchronous(true);
1780     tok.write(source, true);
1781     tok.finish();
1782     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
1783 }
1784
1785 UChar decodeNamedEntity(const char* name)
1786 {
1787     const Entity* e = findEntity(name, strlen(name));
1788     return e ? e->code : 0;
1789 }
1790
1791 }