2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1998 Waldo Bastian (bastian@kde.org)
5 (C) 2001 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003, 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
8 This library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Library General Public
10 License as published by the Free Software Foundation; either
11 version 2 of the License, or (at your option) any later version.
13 This library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Library General Public License for more details.
18 You should have received a copy of the GNU Library General Public License
19 along with this library; see the file COPYING.LIB. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA.
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
27 #include "DeprecatedPtrQueue.h"
28 #include "NamedMappedAttrMap.h"
29 #include "SegmentedString.h"
31 #include "Tokenizer.h"
32 #include "CachedResourceClient.h"
33 #include <wtf/Vector.h>
34 #include <wtf/OwnPtr.h>
39 class DocumentFragment;
42 class HTMLViewSourceDocument;
49 * represents one HTML tag. Consists of a numerical id, and the list
50 * of attributes. Can also represent text. In this case the id = 0 and
51 * text contains the text.
// Default state: a begin tag that is neither flat nor in broken XML style, with no view-source buffer allocated yet.
55 Token() : beginTag(true), flat(false), brokenXMLStyle(false), m_sourceInfo(0) { }
58 void addAttribute(Document*, AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
// True when this token is a begin tag whose name equals the local name of fullName.
60 bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
// True when this token is an end tag (beginTag is false) whose name equals the local name of fullName.
61 bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
70 brokenXMLStyle = false;
72 m_sourceInfo->clear();
// Appends c to the view-source character buffer, allocating the buffer the first time it is needed.
75 void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
77 RefPtr<NamedMappedAttrMap> attrs;
78 RefPtr<StringImpl> text;
83 OwnPtr<Vector<UChar> > m_sourceInfo;
86 //-----------------------------------------------------------------------------
88 class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
90 HTMLTokenizer(HTMLDocument*, bool reportErrors);
91 HTMLTokenizer(HTMLViewSourceDocument*);
92 HTMLTokenizer(DocumentFragment*);
93 virtual ~HTMLTokenizer();
95 virtual bool write(const SegmentedString&, bool appendData);
96 virtual void finish();
97 virtual void setForceSynchronous(bool force);
98 virtual bool isWaitingForScripts() const;
99 virtual void stopParsing();
100 virtual bool processingData() const;
// Returns m_executingScript as an int. NOTE(review): presumably a count of scripts currently running rather than a plain flag - confirm.
101 virtual int executingScript() const { return m_executingScript; }
// Returns the tokenizer's current line number (m_lineNumber).
103 virtual int lineNumber() const { return m_lineNumber; }
// Always reports column 1; this override does no per-column tracking.
104 virtual int columnNumber() const { return 1; }
// Forwards to src.excludeLineNumbers(). NOTE(review): presumably that flag marks input injected by a running script - confirm.
106 bool processingContentWrittenByScript() const { return src.excludeLineNumbers(); }
108 virtual void executeScriptsWaitingForStylesheets();
// Identifies this Tokenizer as the HTML implementation; always true.
110 virtual bool isHTMLTokenizer() const { return true; }
// Returns the raw HTMLParser pointer stored in the parser member.
111 HTMLParser* htmlParser() const { return parser; }
116 // Where we are in parsing a tag
121 PassRefPtr<Node> processToken();
123 State processListing(SegmentedString, State);
124 State parseComment(SegmentedString&, State);
125 State parseServer(SegmentedString&, State);
126 State parseText(SegmentedString&, State);
127 State parseSpecial(SegmentedString&, State);
128 State parseTag(SegmentedString&, State);
129 State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& _cBufferPos, bool start, bool parsingTag);
130 State parseProcessingInstruction(SegmentedString&, State);
131 State scriptHandler(State);
132 State scriptExecution(const DeprecatedString& script, State, DeprecatedString scriptURL, int baseLine = 0);
133 void setSrc(const SegmentedString&);
135 // check if we have enough space in the buffer.
137 inline void checkBuffer(int len = 10)
139 if ((dest - buffer) > size - len)
143 inline void checkScriptBuffer(int len = 10)
145 if (scriptCodeSize + len >= scriptCodeMaxSize)
146 enlargeScriptBuffer(len);
149 void enlargeBuffer(int len);
150 void enlargeScriptBuffer(int len);
152 bool continueProcessing(int& processedCount, double startTime, State&);
153 void timerFired(Timer<HTMLTokenizer>*);
154 void allDataProcessed();
156 // from CachedResourceClient
157 void notifyFinished(CachedResource *finishedObj);
166 // the size of buffer
171 // are we in quotes within a html tag
172 enum { NoQuote, SingleQuote, DoubleQuote } tquote;
174 // Are we in a &... character entity description?
184 unsigned EntityUnicodeValue;
// A fresh State has every bit of m_bits cleared, so both packed fields read as zero and every flag reads as false.
200 State() : m_bits(0) { }
// Extracts the tag state from the bits of m_bits selected by TagMask.
202 TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
// Replaces the TagMask bits of m_bits with t. The value is OR-ed in unmasked, so t must already fit within TagMask.
203 void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
// Extracts the entity state: the EntityMask bits of m_bits, shifted down by EntityShift.
204 EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
// Stores e in the EntityMask bits of m_bits, shifted up by EntityShift. The shifted value is not masked, so e must fit in that field.
205 void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
// Boolean flag accessors: every getter asks testBit() about one bit of m_bits, and the matching setter forwards its argument to setBit() for that same bit.
207 bool inScript() const { return testBit(InScript); }
208 void setInScript(bool v) { setBit(InScript, v); }
209 bool inStyle() const { return testBit(InStyle); }
210 void setInStyle(bool v) { setBit(InStyle, v); }
211 bool inXmp() const { return testBit(InXmp); }
212 void setInXmp(bool v) { setBit(InXmp, v); }
213 bool inTitle() const { return testBit(InTitle); }
214 void setInTitle(bool v) { setBit(InTitle, v); }
215 bool inPlainText() const { return testBit(InPlainText); }
216 void setInPlainText(bool v) { setBit(InPlainText, v); }
217 bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
// Forwards v to setBit() for the InProcessingInstruction bit. Written as a plain call, like the other flag setters, because setBit() returns void.
218 void setInProcessingInstruction(bool v) { setBit(InProcessingInstruction, v); }
// Further boolean flags: getters call testBit() and setters forward their argument to setBit() for the identically named bit.
219 bool inComment() const { return testBit(InComment); }
220 void setInComment(bool v) { setBit(InComment, v); }
221 bool inTextArea() const { return testBit(InTextArea); }
222 void setInTextArea(bool v) { setBit(InTextArea, v); }
223 bool escaped() const { return testBit(Escaped); }
224 void setEscaped(bool v) { setBit(Escaped, v); }
225 bool inServer() const { return testBit(InServer); }
226 void setInServer(bool v) { setBit(InServer, v); }
227 bool skipLF() const { return testBit(SkipLF); }
228 void setSkipLF(bool v) { setBit(SkipLF, v); }
229 bool startTag() const { return testBit(StartTag); }
230 void setStartTag(bool v) { setBit(StartTag, v); }
231 bool discardLF() const { return testBit(DiscardLF); }
232 void setDiscardLF(bool v) { setBit(DiscardLF, v); }
233 bool allowYield() const { return testBit(AllowYield); }
234 void setAllowYield(bool v) { setBit(AllowYield, v); }
235 bool loadingExtScript() const { return testBit(LoadingExtScript); }
236 void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
237 bool forceSynchronous() const { return testBit(ForceSynchronous); }
238 void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
// True while at least one of the InScript, InStyle, InXmp, InTextArea or InTitle bits is set.
240 bool inAnySpecial() const { return (m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle)) != 0; }
// True when the tag-state field (the TagMask bits of m_bits) is non-zero.
241 bool hasTagState() const { return (m_bits & TagMask) != 0; }
// True when the entity-state field (the EntityMask bits of m_bits) is non-zero.
242 bool hasEntityState() const { return (m_bits & EntityMask) != 0; }
// True when any of the listed flags is set or either packed field (tag state, entity state) is non-zero.
244 bool needsSpecialWriteHandling() const { return (m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | TagMask | EntityMask | InPlainText | InComment | InServer | InProcessingInstruction | StartTag)) != 0; }
247 static const int EntityShift = 4;
249 TagMask = (1 << 4) - 1,
250 EntityMask = (1 << 7) - (1 << 4),
256 InPlainText = 1 << 12,
257 InProcessingInstruction = 1 << 13,
259 InTextArea = 1 << 15,
264 DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
265 AllowYield = 1 << 21,
266 LoadingExtScript = 1 << 22,
267 ForceSynchronous = 1 << 23
270 void setBit(StateBits bit, bool value)
// Reports whether any bit of the given mask is set in m_bits.
277 bool testBit(StateBits bit) const { return (m_bits & bit) != 0; }
286 // Name of an attribute that we just scanned.
287 AtomicString attrName;
289 // Used to store the code of a scripting sequence
291 // Size of the script sequence stored in @ref #scriptCode
293 // Maximal size that can be stored in @ref #scriptCode
294 int scriptCodeMaxSize;
295 // resync point of script code size
296 int scriptCodeResync;
298 // Stores characters if we are scanning for a string like "</script>"
299 UChar searchBuffer[10];
300 // Counts where we are in the string we are scanning for
302 // The string we are searching for
303 const UChar* searchFor;
304 // the stopper string
305 const char* searchStopper;
307 int searchStopperLen;
308 // if no more data is coming, just parse what we have (including ext scripts that
309 // may be still downloading) and finish
311 // URL to get source code of script from
313 String scriptSrcCharset;
314 // the HTML code we will parse after the external script we are waiting for has loaded
315 SegmentedString pendingSrc;
317 // the HTML code we will parse after this particular script has
318 // loaded, but before all pending HTML
319 SegmentedString *currentPrependingSrc;
321 // true if we are executing a script while parsing a document. This causes the parsing of
322 // the output of the script to be postponed until after the script has finished executing
323 int m_executingScript;
324 DeprecatedPtrQueue<CachedScript> pendingScripts;
325 RefPtr<Node> scriptNode;
327 bool m_requestingScript;
328 bool m_hasScriptsWaitingForStylesheets;
330 // if we found one broken comment, there are most likely others as well
331 // store a flag to get rid of the O(n^2) behaviour in such a case.
333 // current line number
335 // line number at which the current <script> started
336 int scriptStartLineno;
339 // The timer for continued processing.
340 Timer<HTMLTokenizer> m_timer;
342 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
343 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
344 // we'll just make it large enough to handle all imaginable cases.
346 char cBuffer[CBUFLEN + 2];
347 unsigned int m_cBufferPos;
356 void parseHTMLDocumentFragment(const String&, DocumentFragment*);
358 UChar decodeNamedEntity(const char*);
360 } // namespace WebCore
362 #endif // HTMLTokenizer_h