3b4b6a8db90ff56844b166cdbbc27238a55261af
[WebKit-https.git] / WebCore / html / HTMLTokenizer.cpp
1 /*
2     This file is part of the KDE libraries
3
4     Copyright (C) 1997 Martin Jones (mjones@kde.org)
5               (C) 1997 Torben Weis (weis@kde.org)
6               (C) 1998 Waldo Bastian (bastian@kde.org)
7               (C) 1999 Lars Knoll (knoll@kde.org)
8               (C) 1999 Antti Koivisto (koivisto@kde.org)
9               (C) 2001 Dirk Mueller (mueller@kde.org)
10     Copyright (C) 2004, 2005, 2006, 2007 Apple Inc.
11     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
12
13     This library is free software; you can redistribute it and/or
14     modify it under the terms of the GNU Library General Public
15     License as published by the Free Software Foundation; either
16     version 2 of the License, or (at your option) any later version.
17
18     This library is distributed in the hope that it will be useful,
19     but WITHOUT ANY WARRANTY; without even the implied warranty of
20     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21     Library General Public License for more details.
22
23     You should have received a copy of the GNU Library General Public License
24     along with this library; see the file COPYING.LIB.  If not, write to
25     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26     Boston, MA 02111-1307, USA.
27 */
28
29 #include "config.h"
30 #include "HTMLTokenizer.h"
31
32 #include "CSSHelper.h"
33 #include "Cache.h"
34 #include "CachedScript.h"
35 #include "DocLoader.h"
36 #include "DocumentFragment.h"
37 #include "EventNames.h"
38 #include "Frame.h"
39 #include "FrameLoader.h"
40 #include "FrameView.h"
41 #include "HTMLElement.h"
42 #include "HTMLNames.h"
43 #include "HTMLParser.h"
44 #include "HTMLScriptElement.h"
45 #include "HTMLViewSourceDocument.h"
46 #include "Settings.h"
47 #include "SystemTime.h"
48 #include "kjs_proxy.h"
49
50 #include "HTMLEntityNames.c"
51
52 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
53
54 #if MOBILE
55 // A mobile device needs to stay responsive, so the tokenizer chunk size is reduced.
56 // This value defines how many characters the tokenizer will process before
57 // yielding control.
58 #define TOKENIZER_CHUNK_SIZE  256
59 #else
60 #define TOKENIZER_CHUNK_SIZE  4096
61 #endif
62
63 using namespace std;
64
65 namespace WebCore {
66
67 using namespace HTMLNames;
68 using namespace EventNames;
69
70 #if MOBILE
71 // As the chunks are smaller (above), the tokenizer should not yield for as long a period; otherwise
72 // it will take way too long to load a page.
73 const double tokenizerTimeDelay = 0.300;
74
75 #else
76 // FIXME: We would like this constant to be 200ms.
77 // Yielding more aggressively results in increased responsiveness and better incremental rendering.
78 // It slows down overall page-load on slower machines, though, so for now we set a value of 500.
79 const double tokenizerTimeDelay = 0.500;
80 #endif
81
// Literal markers the tokenizer matches against the input. commentStart is compared
// one character at a time (indexed by searchCount) while a tag name is being read.
// The *End strings are lowercase end-tag prefixes; presumably one of them is installed
// as searchStopper for parseSpecial() -- confirm at the call sites.
82 static const char commentStart [] = "<!--";
83 static const char scriptEnd [] = "</script";
84 static const char xmpEnd [] = "</xmp";
85 static const char styleEnd [] =  "</style";
86 static const char textareaEnd [] = "</textarea";
87 static const char titleEnd [] = "</title";
88
89 // Full support for MS Windows extensions to Latin-1.
90 // Technically these extensions should only be activated for pages
91 // marked "windows-1252" or "cp1252", but
92 // in the standard Microsoft way, these extensions infect hundreds of thousands
93 // of web pages.  Note that people with non-latin-1 Microsoft extensions
94 // are SOL.
95 //
96 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
97 //      http://www.bbsinc.com/iso8859.html
98 //      http://www.obviously.com/
99 //
100 // There may be better equivalents
101
102 // We only need this for entities. For non-entity text, we handle this in the text encoding.
103
// Replacement code points for the 32 values 0x80-0x9F; fixUpChar() indexes this
// table with (c - 0x80). Entries equal to their own index + 0x80 are left unmapped.
104 static const UChar windowsLatin1ExtensionArray[32] = {
105     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
106     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
107     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
108     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
109 };
110
111 static inline UChar fixUpChar(UChar c)
112 {
113     if ((c & ~0x1F) != 0x0080)
114         return c;
115     return windowsLatin1ExtensionArray[c - 0x80];
116 }
117
// Returns true when the first `length` code units of s2 spell s1, accepting for each
// position either the character from s1 itself or its ASCII uppercase form.
//   s1     - 8-bit pattern; callers are expected to pass a lowercase ASCII tag prefix.
//   s2     - UTF-16 text to test; must provide at least `length` code units.
//   length - number of positions to compare (0 always matches).
static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
{
    for (unsigned i = 0; i != length; ++i) {
        const unsigned char expected = s1[i];
        // Fold with plain ASCII arithmetic instead of toupper(): toupper() obeys the
        // current C locale (a Turkish locale, for instance, does not map 'i' to 'I'),
        // whereas HTML tag names must be matched as ASCII regardless of locale.
        const unsigned char expectedUpper = (expected >= 'a' && expected <= 'z')
            ? static_cast<unsigned char>(expected - ('a' - 'A'))
            : expected;
        const UChar actual = s2[i];
        if (actual != expected && actual != expectedUpper)
            return false;
    }
    return true;
}
129
130 inline void Token::addAttribute(Document* doc, AtomicString& attrName, const AtomicString& v, bool viewSourceMode)
131 {
132     if (!attrName.isEmpty()) {
133         ASSERT(!attrName.contains('/'));
134         Attribute* a = new MappedAttribute(attrName, v);
135         if (!attrs)
136             attrs = new NamedMappedAttrMap(0);
137         attrs->insertAttribute(a, viewSourceMode);
138     }
139     
140     attrName = emptyAtom;
141 }
142
143 // ----------------------------------------------------------------------------
144
// Creates a tokenizer for a regular HTML document.
// A new HTMLParser is created for `doc` (with `reportErrors` forwarded to it), all
// buffers start out null, fragment mode is off, and begin() then sets up the
// initial tokenizer state and allocates the token buffer.
145 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
146     : Tokenizer()
147     , buffer(0)
148     , scriptCode(0)
149     , scriptCodeSize(0)
150     , scriptCodeMaxSize(0)
151     , scriptCodeResync(0)
152     , m_executingScript(0)
153     , m_requestingScript(false)
154     , m_timer(this, &HTMLTokenizer::timerFired)
155     , m_doc(doc)
156     , parser(new HTMLParser(doc, reportErrors))
157     , inWrite(false)
158     , m_fragment(false)
159 {
160     begin();
161 }
162
// Creates a tokenizer for a view-source document.
// No HTMLParser is created (parser stays null). The `true` passed to the Tokenizer
// base is presumably the flag reported by inViewSourceMode() -- confirm in Tokenizer.
// begin() then sets up the initial tokenizer state and allocates the token buffer.
163 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
164     : Tokenizer(true)
165     , buffer(0)
166     , scriptCode(0)
167     , scriptCodeSize(0)
168     , scriptCodeMaxSize(0)
169     , scriptCodeResync(0)
170     , m_executingScript(0)
171     , m_requestingScript(false)
172     , m_timer(this, &HTMLTokenizer::timerFired)
173     , m_doc(doc)
174     , parser(0)
175     , inWrite(false)
176     , m_fragment(false)
177 {
178     begin();
179 }
180
181 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)
182     : buffer(0)
183     , scriptCode(0)
184     , scriptCodeSize(0)
185     , scriptCodeMaxSize(0)
186     , scriptCodeResync(0)
187     , m_executingScript(0)
188     , m_requestingScript(false)
189     , m_timer(this, &HTMLTokenizer::timerFired)
190     , m_doc(frag->document())
191     , inWrite(false)
192     , m_fragment(true)
193 {
194     parser = new HTMLParser(frag);
195     begin();
196 }
197
198 void HTMLTokenizer::reset()
199 {
200     ASSERT(m_executingScript == 0);
201
202     while (!pendingScripts.isEmpty()) {
203       CachedScript *cs = pendingScripts.dequeue();
204       ASSERT(cache()->disabled() || cs->accessCount() > 0);
205       cs->deref(this);
206     }
207     
208     fastFree(buffer);
209     buffer = dest = 0;
210     size = 0;
211
212     fastFree(scriptCode);
213     scriptCode = 0;
214     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
215
216     m_timer.stop();
217     m_state.setAllowYield(false);
218     m_state.setForceSynchronous(false);
219
220     currToken.reset();
221 }
222
223 void HTMLTokenizer::begin()
224 {
225     m_executingScript = 0;
226     m_requestingScript = false;
227     m_state.setLoadingExtScript(false);
228     reset();
229     size = 254;
230     buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
231     dest = buffer;
232     tquote = NoQuote;
233     searchCount = 0;
234     m_state.setEntityState(NoEntity);
235     scriptSrc = String();
236     pendingSrc.clear();
237     currentPrependingSrc = 0;
238     noMoreData = false;
239     brokenComments = false;
240     brokenServer = false;
241     lineno = 0;
242     scriptStartLineno = 0;
243     tagStartLineno = 0;
244     m_state.setForceSynchronous(false);
245 }
246
// Stores `force` as the force-synchronous flag of the tokenizer state.
// Note that reset() and begin() both clear this flag again.
247 void HTMLTokenizer::setForceSynchronous(bool force)
248 {
249     m_state.setForceSynchronous(force);
250 }
251
252 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
253 {
254     // This function adds the listing 'list' as
255     // preformatted text-tokens to the token-collection
256     while (!list.isEmpty()) {
257         if (state.skipLF()) {
258             state.setSkipLF(false);
259             if (*list == '\n') {
260                 list.advance(0);
261                 continue;
262             }
263         }
264
265         checkBuffer();
266
267         if (*list == '\n' || *list == '\r') {
268             if (state.discardLF())
269                 // Ignore this LF
270                 state.setDiscardLF(false); // We have discarded 1 LF
271             else
272                 *dest++ = '\n';
273
274             /* Check for MS-DOS CRLF sequence */
275             if (*list == '\r')
276                 state.setSkipLF(true);
277
278             list.advance(0);
279         } else {
280             state.setDiscardLF(false);
281             *dest++ = *list;
282             list.advance(0);
283         }
284     }
285
286     return state;
287 }
288
// Consumes the raw-text content of exactly one of <script>, <style>, <textarea>,
// <title> or <xmp> (asserted below), accumulating it in scriptCode.
//
// The loop looks for the searchStopper end-tag prefix followed by '>', '/' or a
// character <= ' '. Once found, scriptCodeResync remembers where the end tag starts and
// the loop keeps reading (tracking quotes) until an unquoted '>' is seen. At that
// point scriptCode is cut back to the content before the end tag and either
// scriptHandler() runs (for scripts) or the text plus an end-tag token are emitted.
//
// Returns the updated state. If src runs out first, the function returns with the
// in-element flag still set so that parsing can continue with more input.
289 HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString &src, State state)
290 {
291     ASSERT(state.inTextArea() || state.inTitle() || !state.hasEntityState());
292     ASSERT(!state.hasTagState());
293     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() == 1 );
294     if (state.inScript())
295         scriptStartLineno = lineno;
296
    // Resume a comment that was still open when the previous chunk of input ended.
297     if (state.inComment()) 
298         state = parseComment(src, state);
299
300     while ( !src.isEmpty() ) {
301         checkScriptBuffer();
302         UChar ch = *src;
303
        // "<!-" is already in scriptCode and the next character is '-': a comment starts.
        // Not done for <textarea>/<xmp>, in broken-comments mode, or once the end tag was found.
304         if (!scriptCodeResync && !brokenComments && !state.inTextArea() && !state.inXmp() && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-') {
305             state.setInComment(true);
306             state = parseComment(src, state);
307             continue;
308         }
        // The end tag was found earlier and this unquoted '>' closes it.
309         if (scriptCodeResync && !tquote && ch == '>') {
310             src.advance(lineNumberPtr());
            // Drop the end tag text from scriptCode; scriptCodeResync is (start index + 1).
311             scriptCodeSize = scriptCodeResync-1;
312             scriptCodeResync = 0;
313             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
314             if (state.inScript())
315                 state = scriptHandler(state);
316             else {
                // Emit the accumulated text, then a synthesized end tag for the element.
317                 state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
318                 processToken();
319                 if (state.inStyle()) { 
320                     currToken.tagName = styleTag.localName(); 
321                     currToken.beginTag = false; 
322                 } else if (state.inTextArea()) { 
323                     currToken.tagName = textareaTag.localName(); 
324                     currToken.beginTag = false; 
325                 } else if (state.inTitle()) { 
326                     currToken.tagName = titleTag.localName(); 
327                     currToken.beginTag = false; 
328                 } else if (state.inXmp()) {
329                     currToken.tagName = xmpTag.localName(); 
330                     currToken.beginTag = false; 
331                 }
332                 processToken();
333                 state.setInStyle(false);
334                 state.setInScript(false);
335                 state.setInTextArea(false);
336                 state.setInTitle(false);
337                 state.setInXmp(false);
338                 tquote = NoQuote;
339                 scriptCodeSize = scriptCodeResync = 0;
340             }
341             return state;
342         }
343         // possible end of tagname, lets check.
344         if (!scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || ch <= ' ') && ch &&
345              scriptCodeSize >= searchStopperLen &&
346              tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
            // Remember (start of end tag + 1); the current character is re-examined on the next pass.
347             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
348             tquote = NoQuote;
349             continue;
350         }
        // Inside the end tag: track quoting so that a '>' inside a quoted value does not close it.
        // A line break ends any open quote.
351         if (scriptCodeResync && !state.escaped()) {
352             if (ch == '\"')
353                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
354             else if (ch == '\'')
355                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
356             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
357                 tquote = NoQuote;
358         }
        // A backslash escapes the following character unless it is itself escaped.
359         state.setEscaped(!state.escaped() && ch == '\\');
        // Entities are decoded only in <textarea> and <title>; the result is written into scriptCode.
360         if (!scriptCodeResync && (state.inTextArea() || state.inTitle()) && !src.escaped() && ch == '&') {
361             UChar* scriptCodeDest = scriptCode+scriptCodeSize;
362             src.advance(lineNumberPtr());
363             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
364             scriptCodeSize = scriptCodeDest-scriptCode;
365         } else {
366             scriptCode[scriptCodeSize++] = *src;
367             src.advance(lineNumberPtr());
368         }
369     }
370
371     return state;
372 }
373
// Called by parseSpecial() once the end of a <script> element has been reached;
// scriptCode holds the element's text.
//
// Outside view-source mode it first decides what to do with the script:
//  - with a src attribute and a frame: request the external script and queue it in
//    pendingScripts (unless the parser is skipping, stopped, or a frameset body exists);
//  - otherwise: ask the script element whether the inline text should run as JavaScript.
// It then emits the script text and the </script> end tag as tokens. Finally, for an
// external script the rest of the input is parked and the tokenizer registers as a client
// of the cached script; for an inline script the text is run through scriptExecution().
//
// Returns the updated state. m_state is written back and re-read around calls that
// can re-enter the tokenizer.
374 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
375 {
376     // We are inside a <script>
377     bool doScriptExec = false;
378
379     // (Bugzilla 3837) Scripts following a frameset element should not execute or, 
380     // in the case of extern scripts, even load.
381     bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
382   
383     CachedScript* cs = 0;
384     // don't load external scripts for standalone documents (for now)
385     if (!inViewSourceMode()) {
386         if (!scriptSrc.isEmpty() && m_doc->frame()) {
387             // forget what we just got; load from src url instead
388             if (!parser->skipMode() && !followingFrameset) {
389 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
390                 if (!m_doc->ownerElement())
391                     printf("Requesting script at time %d\n", m_doc->elapsedTime());
392 #endif
393                 // The parser might have been stopped by for example a window.close call in an earlier script.
394                 // If so, we don't want to load scripts.
395                 if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(scriptSrc, scriptSrcCharset)))
396                     pendingScripts.enqueue(cs);
397                 else
398                     scriptNode = 0;
399             } else
400                 scriptNode = 0;
401             scriptSrc = String();
402         } else {
403 #ifdef TOKEN_DEBUG
404             kdDebug( 6036 ) << "---START SCRIPT---" << endl;
405             kdDebug( 6036 ) << DeprecatedString(scriptCode, scriptCodeSize) << endl;
406             kdDebug( 6036 ) << "---END SCRIPT---" << endl;
407 #endif
408             // Parse scriptCode containing <script> info
409 #if USE(LOW_BANDWIDTH_DISPLAY)
410             if (m_doc->inLowBandwidthDisplay()) {
411                 // ideal solution is only skipping internal JavaScript if there is external JavaScript.
412                 // but internal JavaScript can use document.write() to create an external JavaScript,
413                 // so we have to skip internal JavaScript all the time.
414                 m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();
415                 doScriptExec = false;
416             } else
417 #endif
            // NOTE(review): scriptNode is dereferenced without a null check here -- confirm it is always set on this path.
418             doScriptExec = static_cast<HTMLScriptElement*>(scriptNode.get())->shouldExecuteAsJavaScript();
419             scriptNode = 0;
420         }
421     }
422
    // Emit the script text as a token, keeping a copy of it (exScript) for execution,
    // then emit the </script> end tag.
423     state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
424     DeprecatedString exScript(reinterpret_cast<DeprecatedChar*>(buffer), dest - buffer);
425     processToken();
426     currToken.tagName = scriptTag.localName();
427     currToken.beginTag = false;
428     processToken();
429
430     state.setInScript(false);
431     
432     // FIXME: The script should be syntax highlighted.
433     if (inViewSourceMode())
434         return state;
435
    // Install a local SegmentedString as the current prepending source for the
    // duration of this call; the previous one is restored before returning.
436     SegmentedString *savedPrependingSrc = currentPrependingSrc;
437     SegmentedString prependingSrc;
438     currentPrependingSrc = &prependingSrc;
439     scriptCodeSize = scriptCodeResync = 0;
440
441     if (!parser->skipMode() && !followingFrameset) {
442         if (cs) {
            // External script: park the remaining input until the script has loaded.
443             if (savedPrependingSrc)
444                 savedPrependingSrc->append(src);
445             else
446                 pendingSrc.prepend(src);
447             setSrc(SegmentedString());
448
449             // the ref() call below may call notifyFinished if the script is already in cache,
450             // and that mucks with the state directly, so we must write it back to the object.
451             m_state = state;
452             bool savedRequestingScript = m_requestingScript;
453             m_requestingScript = true;
454             cs->ref(this);
455             m_requestingScript = savedRequestingScript;
456             state = m_state;
457             // will be 0 if script was already loaded and ref() executed it
458             if (!pendingScripts.isEmpty())
459                 state.setLoadingExtScript(true);
460         } else if (!m_fragment && doScriptExec) {
            // Inline script: park the remaining input, then run the script text.
461             if (!m_executingScript)
462                 pendingSrc.prepend(src);
463             else
464                 prependingSrc = src;
465             setSrc(SegmentedString());
466             state = scriptExecution(exScript, state, DeprecatedString::null, scriptStartLineno);
467         }
468     }
469
    // Nothing is executing or loading any more: resume with the parked input.
470     if (!m_executingScript && !state.loadingExtScript()) {
471         src.append(pendingSrc);
472         pendingSrc.clear();
473     } else if (!prependingSrc.isEmpty()) {
474         // restore first so that the write appends in the right place
475         // (does not hurt to do it again below)
476         currentPrependingSrc = savedPrependingSrc;
477
478         // we need to do this slightly modified bit of one of the write() cases
479         // because we want to prepend to pendingSrc rather than appending
480         // if there's no previous prependingSrc
481         if (state.loadingExtScript()) {
482             if (currentPrependingSrc) {
483                 currentPrependingSrc->append(prependingSrc);
484             } else {
485                 pendingSrc.prepend(prependingSrc);
486             }
487         } else {
488             m_state = state;
489             write(prependingSrc, false);
490             state = m_state;
491         }
492     }
493
494     currentPrependingSrc = savedPrependingSrc;
495
496     return state;
497 }
498
// Runs the script text `str` through the frame loader and returns the updated state.
//   str       - script source to execute.
//   state     - tokenizer state on entry; written to m_state before the script runs and
//               read back afterwards, since execution can re-enter the tokenizer.
//   scriptURL - URL reported for the script; the frame document's URL is used when null.
//   baseLine  - line number passed on to executeScript().
// Does nothing for fragment tokenizers or when the document has no frame.
// m_executingScript is incremented for the duration of the call, and the allow-yield
// flag is set on the returned state.
499 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const DeprecatedString& str, State state, DeprecatedString scriptURL, int baseLine)
500 {
501     if (m_fragment || !m_doc->frame())
502         return state;
503     m_executingScript++;
504     DeprecatedString url = scriptURL.isNull() ? m_doc->frame()->document()->URL() : scriptURL;
505
    // Install a local prepending source while the script runs (presumably filled by
    // write() calls made during execution -- confirm in write()); restored before returning.
506     SegmentedString *savedPrependingSrc = currentPrependingSrc;
507     SegmentedString prependingSrc;
508     currentPrependingSrc = &prependingSrc;
509
510 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
511     if (!m_doc->ownerElement())
512         printf("beginning script execution at %d\n", m_doc->elapsedTime());
513 #endif
514
515     m_state = state;
516     m_doc->frame()->loader()->executeScript(url, baseLine, str);
517     state = m_state;
518
519     state.setAllowYield(true);
520
521 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
522     if (!m_doc->ownerElement())
523         printf("ending script execution at %d\n", m_doc->elapsedTime());
524 #endif
525     
526     m_executingScript--;
527
    // Outermost script finished and nothing is loading: queue the prepended text
    // ahead of the parked input and resume with both.
528     if (!m_executingScript && !state.loadingExtScript()) {
529         pendingSrc.prepend(prependingSrc);        
530         src.append(pendingSrc);
531         pendingSrc.clear();
532     } else if (!prependingSrc.isEmpty()) {
533         // restore first so that the write appends in the right place
534         // (does not hurt to do it again below)
535         currentPrependingSrc = savedPrependingSrc;
536
537         // we need to do this slightly modified bit of one of the write() cases
538         // because we want to prepend to pendingSrc rather than appending
539         // if there's no previous prependingSrc
540         if (state.loadingExtScript()) {
541             if (currentPrependingSrc)
542                 currentPrependingSrc->append(prependingSrc);
543             else
544                 pendingSrc.prepend(prependingSrc);
545         } else {
546             m_state = state;
547             write(prependingSrc, false);
548             state = m_state;
549         }
550     }
551
552     currentPrependingSrc = savedPrependingSrc;
553
554     return state;
555 }
556
557 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString &src, State state)
558 {
559     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
560     checkScriptBuffer(src.length());
561     while ( !src.isEmpty() ) {
562         scriptCode[ scriptCodeSize++ ] = *src;
563
564         if (*src == '>') {
565             bool handleBrokenComments = brokenComments && !(state.inScript() || state.inStyle());
566             int endCharsCount = 1; // start off with one for the '>' character
567             if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-') {
568                 endCharsCount = 3;
569             }
570             else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' && 
571                 scriptCode[scriptCodeSize-2] == '!') {
572                 // Other browsers will accept --!> as a close comment, even though it's
573                 // not technically valid.
574                 endCharsCount = 4;
575             }
576             if (handleBrokenComments || endCharsCount > 1) {
577                 src.advance(lineNumberPtr());
578                 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle())) {
579                     checkScriptBuffer();
580                     scriptCode[scriptCodeSize] = 0;
581                     scriptCode[scriptCodeSize + 1] = 0;
582                     currToken.tagName = commentAtom;
583                     currToken.beginTag = true;
584                     state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state);
585                     processToken();
586                     currToken.tagName = commentAtom;
587                     currToken.beginTag = false;
588                     processToken();
589                     scriptCodeSize = 0;
590                 }
591                 state.setInComment(false);
592                 return state; // Finished parsing comment
593             }
594         }
595         src.advance(lineNumberPtr());
596     }
597
598     return state;
599 }
600
601 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
602 {
603     checkScriptBuffer(src.length());
604     while (!src.isEmpty()) {
605         scriptCode[scriptCodeSize++] = *src;
606         if (*src == '>' &&
607             scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
608             src.advance(lineNumberPtr());
609             state.setInServer(false);
610             scriptCodeSize = 0;
611             return state; // Finished parsing server include
612         }
613         src.advance(lineNumberPtr());
614     }
615     return state;
616 }
617
618 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state)
619 {
620     UChar oldchar = 0;
621     while (!src.isEmpty()) {
622         UChar chbegin = *src;
623         if (chbegin == '\'')
624             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
625         else if (chbegin == '\"')
626             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
627         // Look for '?>'
628         // Some crappy sites omit the "?" before it, so
629         // we look for an unquoted '>' instead. (IE compatible)
630         else if (chbegin == '>' && (!tquote || oldchar == '?')) {
631             // We got a '?>' sequence
632             state.setInProcessingInstruction(false);
633             src.advance(lineNumberPtr());
634             state.setDiscardLF(true);
635             return state; // Finished parsing comment!
636         }
637         src.advance(lineNumberPtr());
638         oldchar = chbegin;
639     }
640     
641     return state;
642 }
643
644 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
645 {
646     while (!src.isEmpty()) {
647         UChar cc = *src;
648
649         if (state.skipLF()) {
650             state.setSkipLF(false);
651             if (cc == '\n') {
652                 src.advance(lineNumberPtr());
653                 continue;
654             }
655         }
656
657         // do we need to enlarge the buffer?
658         checkBuffer();
659
660         if (cc == '\r') {
661             state.setSkipLF(true);
662             *dest++ = '\n';
663         } else
664             *dest++ = cc;
665         src.advance(lineNumberPtr());
666     }
667
668     return state;
669 }
670
671
// Incrementally parses a character reference. The '&' itself is not part of the input
// seen here (parseSpecial() advances past it before calling with start == true).
//
//   src        - input; consumed as far as the reference extends.
//   dest       - output cursor (by reference); raw text is written through it when the
//                reference is not resolved, and in view-source mode.
//   state      - tokenizer state; its entity state selects where parsing resumes.
//   cBufferPos - number of characters of the reference collected in cBuffer so far.
//   start      - true to begin a new reference (resets cBufferPos, the entity state and
//                EntityUnicodeValue); false to continue one after more input arrived.
//   parsingTag - true inside a tag; enables the IE quirk that rejects named
//                entities above 255 when no ';' follows.
//
// A resolved value is pushed back onto src rather than written to dest, except in
// view-source mode where the original text is reproduced. Returns the updated state;
// the entity state is NoEntity once a reference has been completed.
672 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, UChar*& dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
673 {
674     if (start)
675     {
676         cBufferPos = 0;
677         state.setEntityState(SearchEntity);
678         EntityUnicodeValue = 0;
679     }
680
681     while(!src.isEmpty())
682     {
683         UChar cc = *src;
684         switch(state.entityState()) {
685         case NoEntity:
686             ASSERT(state.entityState() != NoEntity);
687             return state;
688         
        // First character after '&': '#' introduces a numeric reference, anything else a name.
689         case SearchEntity:
690             if(cc == '#') {
691                 cBuffer[cBufferPos++] = cc;
692                 src.advance(lineNumberPtr());
693                 state.setEntityState(NumericSearch);
694             }
695             else
696                 state.setEntityState(EntityName);
697
698             break;
699
        // After "&#": 'x'/'X' selects hexadecimal, a digit selects decimal.
700         case NumericSearch:
701             if (cc == 'x' || cc == 'X') {
702                 cBuffer[cBufferPos++] = cc;
703                 src.advance(lineNumberPtr());
704                 state.setEntityState(Hexadecimal);
705             } else if (cc >= '0' && cc <= '9')
706                 state.setEntityState(Decimal);
707             else
708                 state.setEntityState(SearchSemicolon);
709             break;
710
        // Hex digits are accumulated until cBuffer holds 10 characters ("#x" plus 8 digits).
711         case Hexadecimal: {
712             int ll = min(src.length(), 10 - cBufferPos);
713             while (ll--) {
714                 cc = *src;
715                 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
716                     state.setEntityState(SearchSemicolon);
717                     break;
718                 }
719                 int digit;
720                 if (cc < 'A')
721                     digit = cc - '0';
722                 else
723                     digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
724                 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
725                 cBuffer[cBufferPos++] = cc;
726                 src.advance(lineNumberPtr());
727             }
728             if (cBufferPos == 10)  
729                 state.setEntityState(SearchSemicolon);
730             break;
731         }
        // Decimal digits are accumulated until cBuffer holds 9 characters ("#" plus 8 digits).
732         case Decimal:
733         {
734             int ll = min(src.length(), 9-cBufferPos);
735             while(ll--) {
736                 cc = *src;
737
738                 if (!(cc >= '0' && cc <= '9')) {
739                     state.setEntityState(SearchSemicolon);
740                     break;
741                 }
742
743                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
744                 cBuffer[cBufferPos++] = cc;
745                 src.advance(lineNumberPtr());
746             }
747             if (cBufferPos == 9)  
748                 state.setEntityState(SearchSemicolon);
749             break;
750         }
        // Named reference: collect up to 9 ASCII alphanumerics, then look the name up.
751         case EntityName:
752         {
753             int ll = min(src.length(), 9-cBufferPos);
754             while(ll--) {
755                 cc = *src;
756
757                 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
758                     state.setEntityState(SearchSemicolon);
759                     break;
760                 }
761
762                 cBuffer[cBufferPos++] = cc;
763                 src.advance(lineNumberPtr());
764             }
765             if (cBufferPos == 9) 
766                 state.setEntityState(SearchSemicolon);
767             if (state.entityState() == SearchSemicolon) {
768                 if(cBufferPos > 1) {
769                     const Entity *e = findEntity(cBuffer, cBufferPos);
770                     if(e)
771                         EntityUnicodeValue = e->code;
772
773                     // be IE compatible
                    // NOTE(review): src may be empty here if the 9-character limit was reached on the last input character -- confirm *src is safe then.
774                     if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
775                         EntityUnicodeValue = 0;
776                 }
777             }
778             else
779                 break;
            // The name is complete: fall through to SearchSemicolon (no break on purpose).
780         }
781         case SearchSemicolon:
782             // Don't allow values that are more than 21 bits.
783             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
784                 if (!inViewSourceMode()) {
785                     if (*src == ';')
786                         src.advance(lineNumberPtr());
                    // The decoded character(s) are pushed back onto src, not written to dest.
787                     if (EntityUnicodeValue <= 0xFFFF) {
788                         checkBuffer();
789                         src.push(fixUpChar(EntityUnicodeValue));
790                     } else {
791                         // Convert to UTF-16, using surrogate code points.
792                         checkBuffer(2);
793                         src.push(U16_LEAD(EntityUnicodeValue));
794                         src.push(U16_TRAIL(EntityUnicodeValue));
795                     }
796                 } else {
797                     // FIXME: We should eventually colorize entities by sending them as a special token.
                    // NOTE(review): checkBuffer() takes no reference to this function's dest parameter; when the caller
                    // passes a cursor into scriptCode (as parseSpecial does), confirm the capacity check covers that buffer.
798                     checkBuffer(11);
799                     *dest++ = '&';
800                     for (unsigned i = 0; i < cBufferPos; i++)
801                         dest[i] = cBuffer[i];
802                     dest += cBufferPos;
803                     if (*src == ';') {
804                         *dest++ = ';';
805                         src.advance(lineNumberPtr());
806                     }
807                 }
808             } else {
                // NOTE(review): up to 1 + cBufferPos units are written below and cBufferPos can reach 10 in the
                // Hexadecimal state, while only 10 are requested here -- confirm checkBuffer leaves enough slack.
809                 checkBuffer(10);
810                 // ignore the sequence, add it to the buffer as plaintext
811                 *dest++ = '&';
812                 for (unsigned i = 0; i < cBufferPos; i++)
813                     dest[i] = cBuffer[i];
814                 dest += cBufferPos;
815             }
816
817             state.setEntityState(NoEntity);
818             return state;
819         }
820     }
821
822     return state;
823 }
824
// Tag-parsing state machine.
//
// Consumes characters from src for as long as the tokenizer is inside a tag,
// dispatching on state.tagState() (TagName, SearchAttribute, AttributeName,
// SearchEqual, SearchValue, QuotedValue, Value, SearchEnd). Tag and attribute
// names are collected in cBuffer; attribute values are collected in buffer
// starting at buffer + 1.
//
// Returns the updated state. If src runs dry in the middle of a tag, the loop
// exits with the tag state still set and the cBuffer position saved in
// m_cBufferPos.
825 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
826 {
827     ASSERT(!state.hasEntityState());
828
        // Work on a local copy of the cBuffer position; each return below is preceded
        // by a store back into m_cBufferPos.
829     unsigned cBufferPos = m_cBufferPos;
830
831     int* lineNoPtr = lineNumberPtr();
        // Set in SearchEqual when the most recently skipped character was '/'.
        // It is a local, so it starts out false on every call.
832     bool lastIsSlash = false;
833
834     while (!src.isEmpty()) {
835         checkBuffer();
836         switch(state.tagState()) {
837         case NoTag:
838         {
                // No longer inside a tag: save the buffer position and hand control back.
839             m_cBufferPos = cBufferPos;
840             return state;
841         }
842         case TagName:
843         {
844 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
845             qDebug("TagName");
846 #endif
                // A non-zero searchCount means we are still matching the input against
                // commentStart; searchCount is the number of characters matched so far.
847             if (searchCount > 0)
848             {
849                 if (*src == commentStart[searchCount])
850                 {
851                     searchCount++;
852                     if (searchCount == 4)
853                     {
854 #ifdef TOKEN_DEBUG
855                         kdDebug( 6036 ) << "Found comment" << endl;
856 #endif
857                         // Found '<!--' sequence
858                         src.advance(lineNoPtr);
859                         dest = buffer; // ignore the previous part of this tag
860                         state.setInComment(true);
861                         state.setTagState(NoTag);
862
863                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
864                         // <!--> as a valid comment, since both mozilla and IE on windows
865                         // can handle this case.  Only do this in quirks mode. -dwh
866                         if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
867                           state.setInComment(false);
868                           src.advance(lineNoPtr);
                              // NOTE(review): the character following "<!-->" is copied into
                              // cBuffer here without advancing src, although the tag state is
                              // already NoTag. The purpose is not evident from this function;
                              // confirm whether it is intentional.
869                           if (!src.isEmpty())
870                               // cuts off high bits, which is okay
871                               cBuffer[cBufferPos++] = *src;
872                         }
873                         else
874                           state = parseComment(src, state);
875
876                         m_cBufferPos = cBufferPos;
877                         return state; // Finished parsing tag!
878                     }
                        // Partial match of the comment opener: keep the character as part of
                        // the (possible) tag name and go around the outer loop again.
879                     // cuts off high bits, which is okay
880                     cBuffer[cBufferPos++] = *src;
881                     src.advance(lineNoPtr);
882                     break;
883                 }
884                 else
885                     searchCount = 0; // Stop looking for '<!--' sequence
886             }
887
                // Copy tag-name characters into cBuffer until a delimiter is seen. The
                // count is bounded by both the available input and the room left in cBuffer.
888             bool finish = false;
889             unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
890             while (ll--) {
891                 UChar curchar = *src;
                    // Whitespace/control characters, '>' and '<' end the tag name. The
                    // delimiter itself is not consumed here.
892                 if (curchar <= ' ' || curchar == '>' || curchar == '<') {
893                     finish = true;
894                     break;
895                 }
896                 
                    // ASCII upper-case letters are folded to lower case, except in
                    // view-source mode where the original spelling is kept.
897                 // tolower() shows up on profiles. This is faster!
898                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
899                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
900                 else
901                     cBuffer[cBufferPos++] = curchar;
902                 src.advance(lineNoPtr);
903             }
904
                // The name is complete either because a delimiter was found or because
                // cBuffer is full. If neither holds we ran out of input and stay in
                // TagName for the next call.
905             // Disadvantage: we add the possible rest of the tag
906             // as attribute names. ### judge if this causes problems
907             if(finish || CBUFLEN == cBufferPos) {
908                 bool beginTag;
909                 char* ptr = cBuffer;
910                 unsigned int len = cBufferPos;
911                 cBuffer[cBufferPos] = '\0';
                    // A leading '/' marks an end tag; skip over it to get the bare name.
912                 if ((cBufferPos > 0) && (*ptr == '/')) {
913                     // End Tag
914                     beginTag = false;
915                     ptr++;
916                     len--;
917                 }
918                 else
919                     // Start Tag
920                     beginTag = true;
921
922                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
923                 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
924                     ptr[--len] = '\0';
925
926                 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
927                 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
                    // Names starting with '!' leave currToken.tagName unset (outside
                    // view-source mode); SearchEnd later returns early when tagName is nullAtom.
928                 if (ptr[0] != '!' || inViewSourceMode()) {
929                     currToken.tagName = AtomicString(ptr);
930                     currToken.beginTag = beginTag;
931                 }
932                 dest = buffer;
933                 state.setTagState(SearchAttribute);
934                 cBufferPos = 0;
935             }
936             break;
937         }
938         case SearchAttribute:
939 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
940             qDebug("SearchAttribute");
941 #endif
                // Skip whitespace and stray quotes until something that can start an
                // attribute name, or the end of the tag, shows up. The triggering
                // character is left in src for the next state.
942             while(!src.isEmpty()) {
943                 UChar curchar = *src;
944                 // In this mode just ignore any quotes we encounter and treat them like spaces.
945                 if (curchar > ' ' && curchar != '\'' && curchar != '"') {
946                     if (curchar == '<' || curchar == '>')
947                         state.setTagState(SearchEnd);
948                     else
949                         state.setTagState(AttributeName);
950
951                     cBufferPos = 0;
952                     break;
953                 }
954                 if (inViewSourceMode())
955                     currToken.addViewSourceChar(curchar);
956                 src.advance(lineNoPtr);
957             }
958             break;
959         case AttributeName:
960         {
961 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
962             qDebug("AttributeName");
963 #endif
                // Collect the attribute name into cBuffer, bounded by the available input
                // and by the room left in cBuffer.
964             int ll = min(src.length(), CBUFLEN-cBufferPos);
965             while(ll--) {
966                 UChar curchar = *src;
967                 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the 
968                 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
                    // The test below is true for '<', '=', '>', '/', and every character <= ' '.
969                 if (curchar <= '>' && (curchar >= '<' || curchar <= ' ' || curchar == '/')) {
970                     cBuffer[cBufferPos] = '\0';
971                     attrName = AtomicString(cBuffer);
                        // Reset the value buffer and write a 0 into buffer[0]; the value
                        // states below read the attribute value starting at buffer + 1.
972                     dest = buffer;
973                     *dest++ = 0;
974                     state.setTagState(SearchEqual);
                        // NOTE(review): 'a' (and 'x' / 'v' further down) look like marker codes
                        // for the view-source token stream; their exact meaning is defined by
                        // addViewSourceChar's consumer, not visible here - confirm.
975                     if (inViewSourceMode())
976                         currToken.addViewSourceChar('a');
977                     break;
978                 }
979                 
980                 // tolower() shows up on profiles. This is faster!
981                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
982                     cBuffer[cBufferPos++] = curchar + ('a' - 'A');
983                 else
984                     cBuffer[cBufferPos++] = curchar;
985                     
986                 src.advance(lineNoPtr);
987             }
                // cBuffer filled up before a delimiter was seen: finish the name with what
                // has been collected, exactly as in the delimiter case above.
988             if ( cBufferPos == CBUFLEN ) {
989                 cBuffer[cBufferPos] = '\0';
990                 attrName = AtomicString(cBuffer);
991                 dest = buffer;
992                 *dest++ = 0;
993                 state.setTagState(SearchEqual);
994                 if (inViewSourceMode())
995                     currToken.addViewSourceChar('a');
996             }
997             break;
998         }
999         case SearchEqual:
1000 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1001             qDebug("SearchEqual");
1002 #endif
                 // After an attribute name: look for '='. Whitespace, quotes and '/' are
                 // skipped; any other character means the attribute has no value.
1003             while(!src.isEmpty()) {
1004                 UChar curchar = *src;
1005
                     // "/>" seen right after an attribute name.
1006                 if (lastIsSlash && curchar == '>') {
1007                     // This is a quirk (with a long sad history).  We have to do this
1008                     // since widgets do <script src="foo.js"/> and expect the tag to close.
1009                     if (currToken.tagName == scriptTag)
1010                         currToken.flat = true;
1011                     currToken.brokenXMLStyle = true;
1012                 }
1013
1014                 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
1015                 if (curchar > ' ' && curchar != '\'' && curchar != '"' && curchar != '/') {
1016                     if(curchar == '=') {
1017 #ifdef TOKEN_DEBUG
1018                         kdDebug(6036) << "found equal" << endl;
1019 #endif
1020                         state.setTagState(SearchValue);
1021                         if (inViewSourceMode())
1022                             currToken.addViewSourceChar(curchar);
1023                         src.advance(lineNoPtr);
1024                     }
1025                     else {
                             // Value-less attribute: record it with an empty value and go look
                             // for the next attribute. The current character is not consumed.
1026                         currToken.addAttribute(m_doc, attrName, emptyAtom, inViewSourceMode());
1027                         dest = buffer;
1028                         state.setTagState(SearchAttribute);
1029                         lastIsSlash = false;
1030                     }
1031                     break;
1032                 }
1033                 if (inViewSourceMode())
1034                     currToken.addViewSourceChar(curchar);
1035                     
1036                 lastIsSlash = curchar == '/';
1037
1038                 src.advance(lineNoPtr);
1039             }
1040             break;
1041         case SearchValue:
                 // After '=': skip characters <= ' ', then decide between a quoted value
                 // (opening quote consumed, quote kind remembered in tquote) and an
                 // unquoted one (first character left in src for the Value state).
1042             while(!src.isEmpty()) {
1043                 UChar curchar = *src;
1044                 if(curchar > ' ') {
1045                     if(( curchar == '\'' || curchar == '\"' )) {
1046                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1047                         state.setTagState(QuotedValue);
1048                         if (inViewSourceMode())
1049                             currToken.addViewSourceChar(curchar);
1050                         src.advance(lineNoPtr);
1051                     } else
1052                         state.setTagState(Value);
1053
1054                     break;
1055                 }
1056                 if (inViewSourceMode())
1057                     currToken.addViewSourceChar(curchar);
1058                 src.advance(lineNoPtr);
1059             }
1060             break;
1061         case QuotedValue:
1062 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1063             qDebug("QuotedValue");
1064 #endif
                 // Accumulate the value into buffer (via dest) until the matching closing
                 // quote is found.
1065             while(!src.isEmpty()) {
1066                 checkBuffer();
1067
1068                 UChar curchar = *src;
1069                 if (curchar == '>' && attrName.isEmpty()) {
1070                     // Handle a case like <img '>.  Just go ahead and be willing
1071                     // to close the whole tag.  Don't consume the character and
1072                     // just go back into SearchEnd while ignoring the whole
1073                     // value.
1074                     // FIXME: Note that this is actually not a very good solution. It's
1075                     // an interim hack and doesn't handle the general case of
1076                     // unmatched quotes among attributes that have names. -dwh
1077                     while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1078                         dest--; // remove trailing newlines
1079                     AtomicString v(buffer+1, dest-buffer-1);
1080                     attrName = v; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
1081                     currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1082                     if (inViewSourceMode())
1083                         currToken.addViewSourceChar('x');
1084                     state.setTagState(SearchAttribute);
1085                     dest = buffer;
1086                     tquote = NoQuote;
1087                     break;
1088                 }
1089                 
                     // curchar <= '\'' is a cheap pre-filter: '&', '"' and '\'' all satisfy it,
                     // so characters above '\'' skip the checks below entirely.
1090                 if(curchar <= '\'' && !src.escaped()) {
1091                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1092                     if ( curchar == '&' )
1093                     {
                             // Consume the '&' and let parseEntity continue from there; it is
                             // handed dest and cBufferPos, and the state it returns replaces ours.
1094                         src.advance(lineNoPtr);
1095                         state = parseEntity(src, dest, state, cBufferPos, true, true);
1096                         break;
1097                     }
1098                     else if ( (tquote == SingleQuote && curchar == '\'') ||
1099                               (tquote == DoubleQuote && curchar == '\"') )
1100                     {
                             // Closing quote matching the opening one: the value is complete.
1101                         // some <input type=hidden> rely on trailing spaces. argh
1102                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1103                             dest--; // remove trailing newlines
1104                         AtomicString v(buffer+1, dest-buffer-1);
1105                         if (attrName.isEmpty()) {
1106                             attrName = v; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
1107                             if (inViewSourceMode())
1108                                 currToken.addViewSourceChar('x');
1109                         } else if (inViewSourceMode())
1110                             currToken.addViewSourceChar('v');
1111                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1112                         dest = buffer;
1113                         state.setTagState(SearchAttribute);
1114                         tquote = NoQuote;
1115                         if (inViewSourceMode())
1116                             currToken.addViewSourceChar(curchar);
1117                         src.advance(lineNoPtr);
1118                         break;
1119                     }
1120                 }
                     // Ordinary value character: append it to the value buffer.
1121                 *dest++ = *src;
1122                 src.advance(lineNoPtr);
1123             }
1124             break;
1125         case Value:
1126 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1127             qDebug("Value");
1128 #endif
                 // Unquoted value: accumulate into buffer until a character <= ' ' or '>'.
1129             while(!src.isEmpty()) {
1130                 checkBuffer();
1131                 UChar curchar = *src;
                     // curchar <= '>' is a cheap pre-filter: '&', '>' and every character
                     // <= ' ' satisfy it.
1132                 if(curchar <= '>' && !src.escaped()) {
1133                     // parse Entities
1134                     if ( curchar == '&' )
1135                     {
1136                         src.advance(lineNoPtr);
1137                         state = parseEntity(src, dest, state, cBufferPos, true, true);
1138                         break;
1139                     }
1140                     // no quotes. Every space means end of value
1141                     // '/' does not delimit in IE!
1142                     if ( curchar <= ' ' || curchar == '>' )
1143                     {
                             // The terminating character is left in src for SearchAttribute.
1144                         AtomicString v(buffer+1, dest-buffer-1);
1145                         currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
1146                         if (inViewSourceMode())
1147                             currToken.addViewSourceChar('v');
1148                         dest = buffer;
1149                         state.setTagState(SearchAttribute);
1150                         break;
1151                     }
1152                 }
1153
1154                 *dest++ = *src;
1155                 src.advance(lineNoPtr);
1156             }
1157             break;
1158         case SearchEnd:
1159         {
1160 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1161                 qDebug("SearchEnd");
1162 #endif
                 // Skip ahead to the '>' that closes the tag, or to a '<' that starts a new
                 // one. A '/' seen on the way marks the token as flat.
1163             while(!src.isEmpty()) {
1164                 if (*src == '>' || *src == '<')
1165                     break;
1166
1167                 if (*src == '/')
1168                     currToken.flat = true;
1169
1170                 if (inViewSourceMode())
1171                     currToken.addViewSourceChar(*src);
1172                 src.advance(lineNoPtr);
1173             }
                 // Out of input before the tag ended: stay in SearchEnd; the outer loop then
                 // exits because src is empty.
1174             if (src.isEmpty()) break;
1175
1176             searchCount = 0; // Stop looking for '<!--' sequence
1177             state.setTagState(NoTag);
1178             tquote = NoQuote;
1179
                 // Consume the closing '>', but leave a '<' in src so it is tokenized as the
                 // start of the next tag.
1180             if (*src != '<')
1181                 src.advance(lineNoPtr);
1182
1183             if (currToken.tagName == nullAtom) { //stop if tag is unknown
1184                 m_cBufferPos = cBufferPos;
1185                 return state;
1186             }
1187
                 // Copy the name now; it is still compared against below, after processToken().
1188             AtomicString tagName = currToken.tagName;
1189 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1190             kdDebug( 6036 ) << "appending Tag: " << tagName.deprecatedString() << endl;
1191 #endif
1192
1193             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
1194             // compatibility.
1195             bool isSelfClosingScript = currToken.flat && currToken.beginTag && currToken.tagName == scriptTag;
                 // A flat (self-closed) start tag is not treated as opening an element below.
1196             bool beginTag = !currToken.flat && currToken.beginTag;
                 // For a <script> start tag, reset scriptSrc / scriptSrcCharset and, when not
                 // parsing a fragment and JavaScript is enabled in the settings, fill them
                 // from the src and charset attributes.
1197             if (currToken.beginTag && currToken.tagName == scriptTag && !inViewSourceMode() && !parser->skipMode()) {
1198                 Attribute* a = 0;
1199                 scriptSrc = String();
1200                 scriptSrcCharset = String();
1201                 if (currToken.attrs && !m_fragment) {
1202                     Settings* settings = m_doc->settings();
1203                     if (settings && settings->isJavaScriptEnabled()) {
1204                         if ((a = currToken.attrs->getAttributeItem(srcAttr)))
1205                             scriptSrc = m_doc->completeURL(parseURL(a->value()));
1206                         if ((a = currToken.attrs->getAttributeItem(charsetAttr)))
1207                             scriptSrcCharset = a->value().domString().stripWhiteSpace();
                             // No usable charset attribute: fall back to the frame loader's encoding.
1208                         if (scriptSrcCharset.isEmpty())
1209                             scriptSrcCharset = m_doc->frame()->loader()->encoding();
1210                     }
1211                 }
1212             }
1213
1214             RefPtr<Node> n = processToken();
1215             m_cBufferPos = cBufferPos;
                 // The tag-specific handling below only runs when processToken() returned a node.
1216             if (n) {
1217                 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
1218                     if (beginTag)
1219                         state.setDiscardLF(true); // Discard the first LF after we open a pre.
1220                 } else if (tagName == scriptTag && n) {
1221                     ASSERT(!scriptNode);
1222                     scriptNode = n;
1223                     if (beginTag) {
                             // searchStopper / searchStopperLen tell parseSpecial which closing
                             // tag text ends the special content (here scriptEnd, length 8).
1224                         searchStopper = scriptEnd;
1225                         searchStopperLen = 8;
1226                         state.setInScript(true);
1227                         state = parseSpecial(src, state);
1228                     } else if (isSelfClosingScript) { // Handle <script src="foo"/>
1229                         state.setInScript(true);
1230                         state = scriptHandler(state);
1231                     }
1232                 } else if (tagName == styleTag) {
1233                     if (beginTag) {
1234                         searchStopper = styleEnd;
1235                         searchStopperLen = 7;
1236                         state.setInStyle(true);
1237                         state = parseSpecial(src, state);
1238                     }
1239                 } else if (tagName == textareaTag) {
1240                     if (beginTag) {
1241                         searchStopper = textareaEnd;
1242                         searchStopperLen = 10;
1243                         state.setInTextArea(true);
1244                         state = parseSpecial(src, state);
1245                     }
1246                 } else if (tagName == titleTag) {
1247                     if (beginTag) {
1248                         searchStopper = titleEnd;
1249                         searchStopperLen = 7;
                             // Snapshot the state, the source and the line number so the title
                             // content can be re-tokenized if no closing tag turns up (see below).
1250                         State savedState = state;
1251                         SegmentedString savedSrc = src;
1252                         long savedLineno = lineno;
1253                         state.setInTitle(true);
1254                         state = parseSpecial(src, state);
1255                         if (state.inTitle() && src.isEmpty()) {
1256                             // We just ate the rest of the document as the title #text node!
1257                             // Reset the state then retokenize without special title handling.
1258                             // Let the parser clean up the missing </title> tag.
1259                             // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
1260                             // at the end of the document unless noMoreData is also true. We need
1261                             // to detect this case elsewhere, and save the state somewhere other
1262                             // than a local variable.
1263                             state = savedState;
1264                             src = savedSrc;
1265                             lineno = savedLineno;
1266                             scriptCodeSize = 0;
1267                         }
1268                     }
1269                 } else if (tagName == xmpTag) {
1270                     if (beginTag) {
1271                         searchStopper = xmpEnd;
1272                         searchStopperLen = 5;
1273                         state.setInXmp(true);
1274                         state = parseSpecial(src, state);
1275                     }
1276                 }
1277             }
                 // <plaintext> is handled whether or not processToken() produced a node.
1278             if (tagName == plaintextTag)
1279                 state.setInPlainText(beginTag);
1280             return state; // Finished parsing tag!
1281         }
1282         } // end switch
1283     }
         // Input exhausted in the middle of a tag: save the buffer position; the tag state
         // remains set in the returned state.
1284     m_cBufferPos = cBufferPos;
1285     return state;
1286 }
1287
1288 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1289 {
1290     // We don't want to be checking elapsed time with every character, so we only check after we've
1291     // processed a certain number of characters.
1292     bool allowedYield = state.allowYield();
1293     state.setAllowYield(false);
1294     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > TOKENIZER_CHUNK_SIZE || allowedYield)) {
1295         processedCount = 0;
1296         if (currentTime() - startTime > tokenizerTimeDelay) {
1297             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1298                load, but this hurts overall performance on slower machines.  For now turn this
1299                off.
1300             || (!m_doc->haveStylesheetsLoaded() && 
1301                 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1302             // Schedule the timer to keep processing as soon as possible.
1303             m_timer.startOneShot(0);
1304 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1305             if (currentTime() - startTime > tokenizerTimeDelay)
1306                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1307 #endif
1308             return false;
1309         }
1310     }
1311     
1312     processedCount++;
1313     return true;
1314 }
1315
// Feeds str to the tokenizer and runs the main tokenizing loop.
//
// str:        the new input.
// appendData: when true while a script is executing (m_executingScript), the input is
//             queued instead of being parsed now. Input is also queued whenever
//             pendingScripts is not empty.
//
// Returns true only when this call ended up invoking end(); every other path
// (no buffer, parser stopped, input queued, timer active, normal completion) returns false.
1316 bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1317 {
1318 #ifdef TOKEN_DEBUG
1319     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
1320 #endif
1321
         // buffer is freed and zeroed by end(), so a null buffer means there is nothing to write into.
1322     if (!buffer)
1323         return false;
1324     
1325     if (m_parserStopped)
1326         return false;
1327
1328     SegmentedString source(str);
1329     if (m_executingScript)
1330         source.setExcludeLineNumbers();
1331
1332     if ((m_executingScript && appendData) || !pendingScripts.isEmpty()) {
1333         // don't parse; we will do this later
1334         if (currentPrependingSrc)
1335             currentPrependingSrc->append(source);
1336         else
1337             pendingSrc.append(source);
1338         return false;
1339     }
1340
         // Add the new input behind whatever is still unparsed.
1341     if (!src.isEmpty())
1342         src.append(source);
1343     else
1344         setSrc(source);
1345
1346     // Once a timer is set, it has control of when the tokenizer continues.
1347     if (m_timer.isActive())
1348         return false;
1349
         // Remember the previous value so a nested write() restores it correctly on exit.
1350     bool wasInWrite = inWrite;
1351     inWrite = true;
1352     
1353 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1354     if (!m_doc->ownerElement())
1355         printf("Beginning write at time %d\n", m_doc->elapsedTime());
1356 #endif
1357     
1358     int processedCount = 0;
1359     double startTime = currentTime();
1360
1361     Frame *frame = m_doc->frame();
1362
         // Work on a local copy of the state; it is stored back into m_state after the loop.
1363     State state = m_state;
1364
1365     int* lineNoPtr = lineNumberPtr();
1366
         // Main loop: runs until the input is used up, a scheduled location change is
         // pending on the frame, or continueProcessing() asks us to stop.
1367     while (!src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
1368         if (!continueProcessing(processedCount, startTime, state))
1369             break;
1370
1371         // do we need to enlarge the buffer?
1372         checkBuffer();
1373
1374         UChar cc = *src;
1375
             // skipLF is set after a '\r' (see below) so that the '\n' of a CR LF pair is dropped.
1376         bool wasSkipLF = state.skipLF();
1377         if (wasSkipLF)
1378             state.setSkipLF(false);
1379
             // Passes 0 rather than lineNoPtr - presumably so the LF of a CR LF pair is not
             // counted as a second line; confirm against SegmentedString::advance.
1380         if (wasSkipLF && (cc == '\n'))
1381             src.advance(0);
1382         else if (state.needsSpecialWriteHandling()) {
1383             // it's important to keep needsSpecialWriteHandling with the flags this block tests
                 // Resume whichever sub-parser the state says we are in the middle of.
1384             if (state.hasEntityState())
1385                 state = parseEntity(src, dest, state, m_cBufferPos, false, state.hasTagState());
1386             else if (state.inPlainText())
1387                 state = parseText(src, state);
1388             else if (state.inAnySpecial())
1389                 state = parseSpecial(src, state);
1390             else if (state.inComment())
1391                 state = parseComment(src, state);
1392             else if (state.inServer())
1393                 state = parseServer(src, state);
1394             else if (state.inProcessingInstruction())
1395                 state = parseProcessingInstruction(src, state);
1396             else if (state.hasTagState())
1397                 state = parseTag(src, state);
1398             else if (state.startTag()) {
                     // The previous character was '<' (see the '<' branch below); cc is the
                     // character right after it and decides what kind of construct this is.
1399                 state.setStartTag(false);
1400                 
1401                 switch(cc) {
1402                 case '/':
1403                     break;
1404                 case '!': {
1405                     // <!-- comment -->
1406                     searchCount = 1; // Look for '<!--' sequence to start comment
1407                     
1408                     break;
1409                 }
1410                 case '?': {
1411                     // xml processing instruction
1412                     state.setInProcessingInstruction(true);
1413                     tquote = NoQuote;
1414                     state = parseProcessingInstruction(src, state);
1415                     continue;
1416
                         // Unreachable: the continue above always leaves this case first.
1417                     break;
1418                 }
1419                 case '%':
1420                     if (!brokenServer) {
1421                         // <% server stuff, handle as comment %>
1422                         state.setInServer(true);
1423                         tquote = NoQuote;
1424                         state = parseServer(src, state);
1425                         continue;
1426                     }
1427                     // else fall through
1428                 default: {
1429                     if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1430                         // Start of a Start-Tag
1431                     } else {
1432                         // Invalid tag
1433                         // Add as is
                             // Only the '<' is emitted here; cc itself stays in src and is
                             // handled as ordinary input on the next iteration.
1434                         *dest = '<';
1435                         dest++;
1436                         continue;
1437                     }
1438                 }
1439                 }; // end case
1440
                     // A tag really starts here ('/', '!', a letter, or '%' when brokenServer is
                     // set): call processToken() first, then reset the cBuffer position and
                     // switch to tag parsing.
1441                 processToken();
1442
1443                 m_cBufferPos = 0;
1444                 state.setTagState(TagName);
1445                 state = parseTag(src, state);
1446             }
1447         } else if (cc == '&' && !src.escaped()) {
                 // Start of a character entity in text: consume the '&' and let parseEntity take over.
1448             src.advance(lineNoPtr);
1449             state = parseEntity(src, dest, state, m_cBufferPos, true, state.hasTagState());
1450         } else if (cc == '<' && !src.escaped()) {
                 // Possible tag start: remember the line it began on and look at the next
                 // character on the following iteration (startTag branch above).
1451             tagStartLineno = lineno;
1452             src.advance(lineNoPtr);
1453             state.setStartTag(true);
1454         } else if (cc == '\n' || cc == '\r') {
1455             if (state.discardLF())
1456                 // Ignore this LF
1457                 state.setDiscardLF(false); // We have discarded 1 LF
1458             else {
1459                 // Process this LF
                     // Both '\n' and '\r' are written to the output as '\n'.
1460                 *dest++ = '\n';
                     // The line count is bumped by hand for '\r' only (unless line numbers are
                     // excluded for this source).
1461                 if (cc == '\r' && !src.excludeLineNumbers())
1462                     lineno++;
1463             }
1464
1465             /* Check for MS-DOS CRLF sequence */
1466             if (cc == '\r')
1467                 state.setSkipLF(true);
1468             src.advance(lineNoPtr);
1469         } else {
                 // Ordinary text character: copy it to the output buffer.
1470             state.setDiscardLF(false);
1471             *dest++ = cc;
1472             src.advance(lineNoPtr);
1473         }
1474     }
1475     
1476 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1477     if (!m_doc->ownerElement())
1478         printf("Ending write at time %d\n", m_doc->elapsedTime());
1479 #endif
1480     
1481     inWrite = wasInWrite;
1482
1483     m_state = state;
1484
         // All input has been announced (noMoreData) and nothing else is pending - no outer
         // write(), no external script load, no executing script, no timer - so finish up.
1485     if (noMoreData && !inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
1486         end(); // this actually causes us to be deleted
1487         return true;
1488     }
1489     return false;
1490 }
1491
1492 void HTMLTokenizer::stopParsing()
1493 {
1494     Tokenizer::stopParsing();
1495     m_timer.stop();
1496
1497     // The part needs to know that the tokenizer has finished with its data,
1498     // regardless of whether it happened naturally or due to manual intervention.
1499     if (!m_fragment && m_doc->frame())
1500         m_doc->frame()->loader()->tokenizerProcessedData();
1501 }
1502
1503 bool HTMLTokenizer::processingData() const
1504 {
1505     return m_timer.isActive() || inWrite;
1506 }
1507
1508 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1509 {
1510 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1511     if (!m_doc->ownerElement())
1512         printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1513 #endif
1514
1515     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1516         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
1517         // timer has higher priority than our timer.
1518         m_timer.startOneShot(0);
1519         return;
1520     }
1521     
1522     RefPtr<Frame> frame = m_fragment ? 0 : m_doc->frame();
1523
1524     // Invoke write() as though more data came in.
1525     bool didCallEnd = write(SegmentedString(), true);
1526   
1527     // If we called end() during the write,  we need to let WebKit know that we're done processing the data.
1528     if (didCallEnd && frame)
1529         frame->loader()->tokenizerProcessedData();
1530 }
1531
1532 void HTMLTokenizer::end()
1533 {
1534     ASSERT(!m_timer.isActive());
1535     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1536
1537     if (buffer) {
1538         // parseTag is using the buffer for different matters
1539         if (!m_state.hasTagState())
1540             processToken();
1541
1542         fastFree(scriptCode);
1543         scriptCode = 0;
1544         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1545
1546         fastFree(buffer);
1547         buffer = 0;
1548     }
1549
1550     if (!inViewSourceMode())
1551         parser->finished();
1552     else
1553         m_doc->finishedParsing();
1554 }
1555
1556 void HTMLTokenizer::finish()
1557 {
1558     // do this as long as we don't find matching comment ends
1559     while((m_state.inComment() || m_state.inServer()) && scriptCode && scriptCodeSize) {
1560         // we've found an unmatched comment start
1561         if (m_state.inComment())
1562             brokenComments = true;
1563         else
1564             brokenServer = true;
1565         checkScriptBuffer();
1566         scriptCode[scriptCodeSize] = 0;
1567         scriptCode[scriptCodeSize + 1] = 0;
1568         int pos;
1569         String food;
1570         if (m_state.inScript() || m_state.inStyle())
1571             food = String(scriptCode, scriptCodeSize);
1572         else if (m_state.inServer()) {
1573             food = "<";
1574             food.append(String(scriptCode, scriptCodeSize));
1575         } else {
1576             pos = DeprecatedConstString(reinterpret_cast<DeprecatedChar*>(scriptCode), scriptCodeSize).string().find('>');
1577             food = String(scriptCode + pos + 1, scriptCodeSize - pos - 1);
1578         }
1579         fastFree(scriptCode);
1580         scriptCode = 0;
1581         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1582         m_state.setInComment(false);
1583         m_state.setInServer(false);
1584         if (!food.isEmpty())
1585             write(food, true);
1586     }
1587     // this indicates we will not receive any more data... but if we are waiting on
1588     // an external script to load, we can't finish parsing until that is done
1589     noMoreData = true;
1590     if (!inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1591         end(); // this actually causes us to be deleted
1592 }
1593
1594 PassRefPtr<Node> HTMLTokenizer::processToken()
1595 {
1596     KJSProxy* jsProxy = (!m_fragment && m_doc->frame()) ? m_doc->frame()->scriptProxy() : 0;
1597     if (jsProxy)
1598         jsProxy->setEventHandlerLineno(tagStartLineno);
1599     if (dest > buffer) {
1600 #ifdef TOKEN_DEBUG
1601         if(currToken.tagName.length()) {
1602             qDebug( "unexpected token: %s, str: *%s*", currToken.tagName.deprecatedString().latin1(),DeprecatedConstString( buffer,dest-buffer ).deprecatedString().latin1() );
1603             ASSERT(0);
1604         }
1605
1606 #endif
1607         currToken.text = StringImpl::createStrippingNull(buffer, dest - buffer);
1608         if (currToken.tagName != commentAtom)
1609             currToken.tagName = textAtom;
1610     } else if (currToken.tagName == nullAtom) {
1611         currToken.reset();
1612         if (jsProxy)
1613             jsProxy->setEventHandlerLineno(lineno);
1614         return 0;
1615     }
1616
1617     dest = buffer;
1618
1619 #ifdef TOKEN_DEBUG
1620     DeprecatedString name = currToken.tagName.deprecatedString();
1621     DeprecatedString text;
1622     if(currToken.text)
1623         text = DeprecatedConstString(currToken.text->unicode(), currToken.text->length()).deprecatedString();
1624
1625     kdDebug( 6036 ) << "Token --> " << name << endl;
1626     if (currToken.flat)
1627         kdDebug( 6036 ) << "Token is FLAT!" << endl;
1628     if(!text.isNull())
1629         kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
1630     unsigned l = currToken.attrs ? currToken.attrs->length() : 0;
1631     if(l) {
1632         kdDebug( 6036 ) << "Attributes: " << l << endl;
1633         for (unsigned i = 0; i < l; ++i) {
1634             Attribute* c = currToken.attrs->attributeItem(i);
1635             kdDebug( 6036 ) << "    " << c->localName().deprecatedString()
1636                             << "=\"" << c->value().deprecatedString() << "\"" << endl;
1637         }
1638     }
1639     kdDebug( 6036 ) << endl;
1640 #endif
1641
1642     RefPtr<Node> n;
1643     
1644     if (!m_parserStopped) {
1645         if (inViewSourceMode())
1646             static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&currToken);
1647         else
1648             // pass the token over to the parser, the parser DOES NOT delete the token
1649             n = parser->parseToken(&currToken);
1650     }
1651     currToken.reset();
1652     if (jsProxy)
1653         jsProxy->setEventHandlerLineno(0);
1654
1655     return n.release();
1656 }
1657
1658 HTMLTokenizer::~HTMLTokenizer()
1659 {
1660     ASSERT(!inWrite);
1661     reset();
1662     delete parser;
1663 }
1664
1665
1666 void HTMLTokenizer::enlargeBuffer(int len)
1667 {
1668     int newSize = max(size * 2, size + len);
1669     int oldOffset = dest - buffer;
1670     buffer = static_cast<UChar*>(fastRealloc(buffer, newSize * sizeof(UChar)));
1671     dest = buffer + oldOffset;
1672     size = newSize;
1673 }
1674
1675 void HTMLTokenizer::enlargeScriptBuffer(int len)
1676 {
1677     int newSize = max(scriptCodeMaxSize * 2, scriptCodeMaxSize + len);
1678     scriptCode = static_cast<UChar*>(fastRealloc(scriptCode, newSize * sizeof(UChar)));
1679     scriptCodeMaxSize = newSize;
1680 }
1681
1682 void HTMLTokenizer::notifyFinished(CachedResource*)
1683 {
1684 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1685     if (!m_doc->ownerElement())
1686         printf("script loaded at %d\n", m_doc->elapsedTime());
1687 #endif
1688
1689     ASSERT(!pendingScripts.isEmpty());
1690     bool finished = false;
1691     while (!finished && pendingScripts.head()->isLoaded()) {
1692 #ifdef TOKEN_DEBUG
1693         kdDebug( 6036 ) << "Finished loading an external script" << endl;
1694 #endif
1695         CachedScript* cs = pendingScripts.dequeue();
1696         ASSERT(cache()->disabled() || cs->accessCount() > 0);
1697
1698         String scriptSource = cs->script();
1699 #ifdef TOKEN_DEBUG
1700         kdDebug( 6036 ) << "External script is:" << endl << scriptSource.deprecatedString() << endl;
1701 #endif
1702         setSrc(SegmentedString());
1703
1704         // make sure we forget about the script before we execute the new one
1705         // infinite recursion might happen otherwise
1706         DeprecatedString cachedScriptUrl( cs->url().deprecatedString() );
1707         bool errorOccurred = cs->errorOccurred();
1708         cs->deref(this);
1709         RefPtr<Node> n = scriptNode.release();
1710
1711 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1712         if (!m_doc->ownerElement())
1713             printf("external script beginning execution at %d\n", m_doc->elapsedTime());
1714 #endif
1715
1716         if (errorOccurred)
1717             EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, true, false);
1718         else {
1719             if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
1720                 m_state = scriptExecution(scriptSource.deprecatedString(), m_state, cachedScriptUrl);
1721             EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false);
1722         }
1723
1724         // The state of pendingScripts.isEmpty() can change inside the scriptExecution()
1725         // call above, so test afterwards.
1726         finished = pendingScripts.isEmpty();
1727         if (finished) {
1728             m_state.setLoadingExtScript(false);
1729 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1730             if (!m_doc->ownerElement())
1731                 printf("external script finished execution at %d\n", m_doc->elapsedTime());
1732 #endif
1733         }
1734
1735         // 'm_requestingScript' is true when we are called synchronously from
1736         // scriptHandler(). In that case scriptHandler() will take care
1737         // of pendingSrc.
1738         if (!m_requestingScript) {
1739             SegmentedString rest = pendingSrc;
1740             pendingSrc.clear();
1741             write(rest, false);
1742             // we might be deleted at this point, do not
1743             // access any members.
1744         }
1745     }
1746 }
1747
1748 bool HTMLTokenizer::isWaitingForScripts() const
1749 {
1750     return m_state.loadingExtScript();
1751 }
1752
1753 void HTMLTokenizer::setSrc(const SegmentedString &source)
1754 {
1755     src = source;
1756 }
1757
1758 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)
1759 {
1760     HTMLTokenizer tok(fragment);
1761     tok.setForceSynchronous(true);
1762     tok.write(source, true);
1763     tok.finish();
1764     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
1765 }
1766
1767 UChar decodeNamedEntity(const char* name)
1768 {
1769     const Entity* e = findEntity(name, strlen(name));
1770     return e ? e->code : 0;
1771 }
1772
1773 }