49af60144b9a6eb347ab82bb87c77fc676e79f67
[WebKit-https.git] / Source / WebCore / html / parser / HTMLTokenizer.h
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25  */
26
27 #ifndef HTMLTokenizer_h
28 #define HTMLTokenizer_h
29
30 #include "HTMLToken.h"
31 #include "MarkupTokenizerBase.h"
32 #include "SegmentedString.h"
33
34 namespace WebCore {
35
36 class Frame; // Forward declaration: this header only names Frame as a pointer parameter (HTMLTokenizer::updateStateFor).
37
38 class HTMLTokenizerState { // Holder for the State enum; HTMLTokenizer passes this class as the second template argument of MarkupTokenizerBase.
39 public:
40     enum State { // One enumerator per tokenizer state. Values are implicit (0, 1, 2, ...) in declaration order, so inserting or reordering entries changes them.
41         DataState, // NOTE(review): unless flagged below, names presumably mirror the HTML5 spec's tokenizer states - confirm against the spec.
42         CharacterReferenceInDataState,
43         RCDATAState,
44         CharacterReferenceInRCDATAState,
45         RAWTEXTState,
46         ScriptDataState,
47         PLAINTEXTState,
48         TagOpenState,
49         EndTagOpenState,
50         TagNameState,
51         RCDATALessThanSignState,
52         RCDATAEndTagOpenState,
53         RCDATAEndTagNameState,
54         RAWTEXTLessThanSignState,
55         RAWTEXTEndTagOpenState,
56         RAWTEXTEndTagNameState,
57         ScriptDataLessThanSignState,
58         ScriptDataEndTagOpenState,
59         ScriptDataEndTagNameState,
60         ScriptDataEscapeStartState,
61         ScriptDataEscapeStartDashState,
62         ScriptDataEscapedState,
63         ScriptDataEscapedDashState,
64         ScriptDataEscapedDashDashState,
65         ScriptDataEscapedLessThanSignState,
66         ScriptDataEscapedEndTagOpenState,
67         ScriptDataEscapedEndTagNameState,
68         ScriptDataDoubleEscapeStartState,
69         ScriptDataDoubleEscapedState,
70         ScriptDataDoubleEscapedDashState,
71         ScriptDataDoubleEscapedDashDashState,
72         ScriptDataDoubleEscapedLessThanSignState,
73         ScriptDataDoubleEscapeEndState,
74         BeforeAttributeNameState,
75         AttributeNameState,
76         AfterAttributeNameState,
77         BeforeAttributeValueState,
78         AttributeValueDoubleQuotedState,
79         AttributeValueSingleQuotedState,
80         AttributeValueUnquotedState,
81         CharacterReferenceInAttributeValueState,
82         AfterAttributeValueQuotedState,
83         SelfClosingStartTagState,
84         BogusCommentState,
85         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
86         // it internally to keep track of whether we've started the bogus
87         // comment token yet.
88         ContinueBogusCommentState,
89         MarkupDeclarationOpenState,
90         CommentStartState,
91         CommentStartDashState,
92         CommentState,
93         CommentEndDashState,
94         CommentEndState,
95         CommentEndBangState,
96         DOCTYPEState,
97         BeforeDOCTYPENameState,
98         DOCTYPENameState,
99         AfterDOCTYPENameState,
100         AfterDOCTYPEPublicKeywordState,
101         BeforeDOCTYPEPublicIdentifierState,
102         DOCTYPEPublicIdentifierDoubleQuotedState,
103         DOCTYPEPublicIdentifierSingleQuotedState,
104         AfterDOCTYPEPublicIdentifierState,
105         BetweenDOCTYPEPublicAndSystemIdentifiersState,
106         AfterDOCTYPESystemKeywordState,
107         BeforeDOCTYPESystemIdentifierState,
108         DOCTYPESystemIdentifierDoubleQuotedState,
109         DOCTYPESystemIdentifierSingleQuotedState,
110         AfterDOCTYPESystemIdentifierState,
111         BogusDOCTYPEState,
112         CDATASectionState,
113         // These CDATA states are not in the HTML5 spec, but we use them internally.
114         CDATASectionRightSquareBracketState,
115         CDATASectionDoubleRightSquareBracketState,
116     };
117 };
118
119 class HTMLTokenizer : public MarkupTokenizerBase<HTMLToken, HTMLTokenizerState> {
120     WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
121     WTF_MAKE_FAST_ALLOCATED;
122 public:
123     static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
124     ~HTMLTokenizer();
125
126     void reset();
127
128     // This function returns true if it emits a token. Otherwise, callers
129     // must provide the same (in progress) token on the next call (unless
130     // they call reset() first).
131     bool nextToken(SegmentedString&, HTMLToken&);
132
133     // Updates the tokenizer's state according to the given tag name. This is
134     // an approximation of how the tree builder would update the tokenizer's
135     // state. This method is useful for approximating HTML tokenization. To
136     // get exactly the correct tokenization, you need the real tree builder.
137     //
138     // The main failures in the approximation are as follows:
139     //
140     //  * The first set of character tokens emitted for a <pre> element might
141     //    contain an extra leading newline.
142     //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
143     //    tree builder's insertion mode.
144     //  * CDATA sections in foreign content will be tokenized as bogus comments
145     //    instead of as character tokens.
146     //
147     void updateStateFor(const AtomicString& tagName, Frame*);
148
149     // Hack to skip leading newline in <pre>/<listing> for authoring ease.
150     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
151     void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
152
153     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
154     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
155
156     bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
157     void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
158
159 private:
160     HTMLTokenizer(bool usePreHTML5ParserQuirks);
161
162     inline bool processEntity(SegmentedString&);
163
164     inline void parseError();
165     
166     inline bool emitAndResumeIn(SegmentedString& source, HTMLTokenizerState::State state)
167     {
168         saveEndTagNameIfNeeded();
169         return MarkupTokenizerBase<HTMLToken, HTMLTokenizerState>::emitAndResumeIn(source, state);
170     }
171     
172     inline bool emitAndReconsumeIn(SegmentedString& source, HTMLTokenizerState::State state)
173     {
174         saveEndTagNameIfNeeded();
175         return MarkupTokenizerBase<HTMLToken, HTMLTokenizerState>::emitAndReconsumeIn(source, state);
176     }
177
178     inline bool flushEmitAndResumeIn(SegmentedString&, HTMLTokenizerState::State);
179
180     // Return whether we need to emit a character token before dealing with
181     // the buffered end tag.
182     inline bool flushBufferedEndTag(SegmentedString&);
183     inline bool temporaryBufferIs(const String&);
184
185     // Sometimes we speculatively consume input characters and we don't
186     // know whether they represent end tags or RCDATA, etc. These
187     // functions help manage these state.
188     inline void addToPossibleEndTag(UChar cc);
189
190     inline void saveEndTagNameIfNeeded()
191     {
192         ASSERT(m_token->type() != HTMLTokenTypes::Uninitialized);
193         if (m_token->type() == HTMLTokenTypes::StartTag)
194             m_appropriateEndTagName = m_token->name();
195     }
196     inline bool isAppropriateEndTag();
197
198     Vector<UChar, 32> m_appropriateEndTagName;
199
200     bool m_shouldAllowCDATA;
201
202     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
203     Vector<UChar, 32> m_temporaryBuffer;
204
205     // We occationally want to emit both a character token and an end tag
206     // token (e.g., when lexing script). We buffer the name of the end tag
207     // token here so we remember it next time we re-enter the tokenizer.
208     Vector<UChar, 32> m_bufferedEndTagName;
209     
210     bool m_usePreHTML5ParserQuirks;
211 };
212
213 }
214
215 #endif