<!DOCTYPE html><pre>&#x0a;&#x0a;A</pre> doesn't parse correctly
authorabarth@webkit.org <abarth@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 16 Dec 2011 21:44:45 +0000 (21:44 +0000)
committerabarth@webkit.org <abarth@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 16 Dec 2011 21:44:45 +0000 (21:44 +0000)
https://bugs.webkit.org/show_bug.cgi?id=74658

Reviewed by Darin Adler.

Source/WebCore:

Previously, we handled skipping newlines after <pre> in the tokenizer,
which isn't how the spec handles them.  Instead, the spec skips them in
the tree builder.  This isn't usually observable, except in the case of
an HTML entity.  In that case, the tokenzier sees '&' (because the
entity hasn't been decoded yet), but the tree builder sees '\n' (the
decoded entity).  This patch fixes the bug by more closely aligning our
implementation with the spec.

Test: html5lib/runner.html

* html/parser/HTMLTokenizer.cpp:
(WebCore::HTMLTokenizer::reset):
(WebCore::HTMLTokenizer::nextToken):
* html/parser/HTMLTokenizer.h:
* html/parser/HTMLTreeBuilder.cpp:
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::skipAtMostOneLeadingNewline):
(WebCore::HTMLTreeBuilder::HTMLTreeBuilder):
(WebCore::HTMLTreeBuilder::processStartTagForInBody):
(WebCore::HTMLTreeBuilder::processCharacterBuffer):
* html/parser/HTMLTreeBuilder.h:
* xml/parser/MarkupTokenizerBase.h:

LayoutTests:

Shows test progression.

* html5lib/runner-expected.txt:

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@103102 268f45cc-cd09-0410-ab3c-d52691b4dbfc

LayoutTests/ChangeLog
LayoutTests/fast/forms/textarea-newline-expected.txt
LayoutTests/fast/forms/textarea-newline.html
LayoutTests/html5lib/runner-expected.txt
Source/WebCore/ChangeLog
Source/WebCore/html/parser/HTMLTokenizer.cpp
Source/WebCore/html/parser/HTMLTokenizer.h
Source/WebCore/html/parser/HTMLTreeBuilder.cpp
Source/WebCore/html/parser/HTMLTreeBuilder.h
Source/WebCore/html/parser/TextDocumentParser.cpp
Source/WebCore/xml/parser/MarkupTokenizerBase.h

index 295af8d..583268d 100644 (file)
@@ -1,3 +1,14 @@
+2011-12-16  Adam Barth  <abarth@webkit.org>
+
+        <!DOCTYPE html><pre>&#x0a;&#x0a;A</pre> doesn't parse correctly
+        https://bugs.webkit.org/show_bug.cgi?id=74658
+
+        Reviewed by Darin Adler.
+
+        Shows test progression.
+
+        * html5lib/runner-expected.txt:
+
 2011-12-16  Adrienne Walker  <enne@google.com>
 
         [chromium] Further suppress media/video-transformed.html failures
index 761d1de..b84cdba 100644 (file)
@@ -3,8 +3,8 @@ These tests check if leading line feeds characters in textarea as default values
 PASS document.getElementById("no-line-feed").value is "Madoka"
 PASS document.getElementById("one-line-feed").value is "Sayaka"
 PASS document.getElementById("two-line-feeds").value is "\nMami"
-PASS document.getElementById("one-line-feed-escaped-char-and-one-line-feed").value is "\n\nKyoko"
-PASS document.getElementById("two-line-feed-escaped-chars").value is "\n\nHomura"
+PASS document.getElementById("one-line-feed-escaped-char-and-one-line-feed").value is "\nKyoko"
+PASS document.getElementById("two-line-feed-escaped-chars").value is "\nHomura"
 PASS successfullyParsed is true
 
 TEST COMPLETE
index 347b3b2..afc7d6c 100644 (file)
@@ -31,9 +31,9 @@ shouldBe('document.getElementById("one-line-feed").value', '"Sayaka"');
 
 shouldBe('document.getElementById("two-line-feeds").value', '"\\nMami"');
 
-shouldBe('document.getElementById("one-line-feed-escaped-char-and-one-line-feed").value', '"\\n\\nKyoko"');
+shouldBe('document.getElementById("one-line-feed-escaped-char-and-one-line-feed").value', '"\\nKyoko"');
 
-shouldBe('document.getElementById("two-line-feed-escaped-chars").value', '"\\n\\nHomura"');
+shouldBe('document.getElementById("two-line-feed-escaped-chars").value', '"\\nHomura"');
 </script>
 
 <script src="../../fast/js/resources/js-test-post.js"></script>
index 0b54695..e964ad5 100644 (file)
Binary files a/LayoutTests/html5lib/runner-expected.txt and b/LayoutTests/html5lib/runner-expected.txt differ
index 8c324ba..786b1ab 100644 (file)
@@ -1,3 +1,32 @@
+2011-12-16  Adam Barth  <abarth@webkit.org>
+
+        <!DOCTYPE html><pre>&#x0a;&#x0a;A</pre> doesn't parse correctly
+        https://bugs.webkit.org/show_bug.cgi?id=74658
+
+        Reviewed by Darin Adler.
+
+        Previously, we handled skipping newlines after <pre> in the tokenizer,
+        which isn't how the spec handles them.  Instead, the spec skips them in
+        the tree builder.  This isn't usually observable, except in the case of
+        an HTML entity.  In that case, the tokenzier sees '&' (because the
+        entity hasn't been decoded yet), but the tree builder sees '\n' (the
+        decoded entity).  This patch fixes the bug by more closely aligning our
+        implementation with the spec.
+
+        Test: html5lib/runner.html
+
+        * html/parser/HTMLTokenizer.cpp:
+        (WebCore::HTMLTokenizer::reset):
+        (WebCore::HTMLTokenizer::nextToken):
+        * html/parser/HTMLTokenizer.h:
+        * html/parser/HTMLTreeBuilder.cpp:
+        (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::skipAtMostOneLeadingNewline):
+        (WebCore::HTMLTreeBuilder::HTMLTreeBuilder):
+        (WebCore::HTMLTreeBuilder::processStartTagForInBody):
+        (WebCore::HTMLTreeBuilder::processCharacterBuffer):
+        * html/parser/HTMLTreeBuilder.h:
+        * xml/parser/MarkupTokenizerBase.h:
+
 2011-12-16  Joshua Bell  <jsbell@chromium.org>
 
         IndexedDB: Implement IDBObjectStore.count() and IDBIndex.count()
index 7a269be..9fb5ce3 100644 (file)
@@ -136,7 +136,6 @@ void HTMLTokenizer::reset()
     m_state = HTMLTokenizerState::DataState;
     m_token = 0;
     m_lineNumber = 0;
-    m_skipLeadingNewLineForListing = false;
     m_forceNullCharacterReplacement = false;
     m_shouldAllowCDATA = false;
     m_additionalAllowedCharacter = '\0';
@@ -211,30 +210,6 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
         return haveBufferedCharacterToken();
     UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
 
-    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
-    // Note that this logic is different than the generic \r\n collapsing
-    // handled in the input stream preprocessor. This logic is here as an
-    // "authoring convenience" so folks can write:
-    //
-    // <pre>
-    // lorem ipsum
-    // lorem ipsum
-    // </pre>
-    //
-    // without getting an extra newline at the start of their <pre> element.
-    if (m_skipLeadingNewLineForListing) {
-        m_skipLeadingNewLineForListing = false;
-        if (cc == '\n') {
-            if (m_state == HTMLTokenizerState::DataState)
-                HTML_ADVANCE_TO(DataState);
-            if (m_state == HTMLTokenizerState::RCDATAState)
-                HTML_ADVANCE_TO(RCDATAState);
-            // When parsing text/plain documents, we run the tokenizer in the
-            // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
-            ASSERT(m_state == HTMLTokenizerState::PLAINTEXTState);
-        }
-    }
-
     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
     switch (m_state) {
     HTML_BEGIN_STATE(DataState) {
index 49af601..1e4798b 100644 (file)
@@ -146,10 +146,6 @@ public:
     //
     void updateStateFor(const AtomicString& tagName, Frame*);
 
-    // Hack to skip leading newline in <pre>/<listing> for authoring ease.
-    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
-    void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
-
     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
 
index 98147c5..a1f7c25 100644 (file)
@@ -268,6 +268,13 @@ public:
 
     bool isEmpty() const { return m_current == m_end; }
 
+    void skipAtMostOneLeadingNewline()
+    {
+        ASSERT(!isEmpty());
+        if (*m_current == '\n')
+            ++m_current;
+    }
+
     void skipLeadingWhitespace()
     {
         skipLeading<isHTMLSpace>();
@@ -349,6 +356,7 @@ HTMLTreeBuilder::HTMLTreeBuilder(HTMLDocumentParser* parser, HTMLDocument* docum
     , m_isPaused(false)
     , m_insertionMode(InitialMode)
     , m_originalInsertionMode(InitialMode)
+    , m_shouldSkipLeadingNewline(false)
     , m_parser(parser)
     , m_scriptToProcessStartPosition(uninitializedPositionValue1())
     , m_lastScriptElementStartPosition(TextPosition::belowRangePosition())
@@ -367,6 +375,7 @@ HTMLTreeBuilder::HTMLTreeBuilder(HTMLDocumentParser* parser, DocumentFragment* f
     , m_isPaused(false)
     , m_insertionMode(InitialMode)
     , m_originalInsertionMode(InitialMode)
+    , m_shouldSkipLeadingNewline(false)
     , m_parser(parser)
     , m_scriptToProcessStartPosition(uninitializedPositionValue1())
     , m_lastScriptElementStartPosition(TextPosition::belowRangePosition())
@@ -476,21 +485,26 @@ void HTMLTreeBuilder::processToken(AtomicHTMLToken& token)
         ASSERT_NOT_REACHED();
         break;
     case HTMLTokenTypes::DOCTYPE:
+        m_shouldSkipLeadingNewline = false;
         processDoctypeToken(token);
         break;
     case HTMLTokenTypes::StartTag:
+        m_shouldSkipLeadingNewline = false;
         processStartTag(token);
         break;
     case HTMLTokenTypes::EndTag:
+        m_shouldSkipLeadingNewline = false;
         processEndTag(token);
         break;
     case HTMLTokenTypes::Comment:
+        m_shouldSkipLeadingNewline = false;
         processComment(token);
         return;
     case HTMLTokenTypes::Character:
         processCharacter(token);
         break;
     case HTMLTokenTypes::EndOfFile:
+        m_shouldSkipLeadingNewline = false;
         processEndOfFile(token);
         break;
     }
@@ -810,7 +824,7 @@ void HTMLTreeBuilder::processStartTagForInBody(AtomicHTMLToken& token)
     if (token.name() == preTag || token.name() == listingTag) {
         processFakePEndTagIfPInButtonScope();
         m_tree.insertHTMLElement(token);
-        m_parser->tokenizer()->setSkipLeadingNewLineForListing(true);
+        m_shouldSkipLeadingNewline = true;
         m_framesetOk = false;
         return;
     }
@@ -937,7 +951,7 @@ void HTMLTreeBuilder::processStartTagForInBody(AtomicHTMLToken& token)
     }
     if (token.name() == textareaTag) {
         m_tree.insertHTMLElement(token);
-        m_parser->tokenizer()->setSkipLeadingNewLineForListing(true);
+        m_shouldSkipLeadingNewline = true;
         m_parser->tokenizer()->setState(HTMLTokenizerState::RCDATAState);
         m_originalInsertionMode = m_insertionMode;
         m_framesetOk = false;
@@ -2272,6 +2286,24 @@ void HTMLTreeBuilder::processCharacter(AtomicHTMLToken& token)
 void HTMLTreeBuilder::processCharacterBuffer(ExternalCharacterTokenBuffer& buffer)
 {
 ReprocessBuffer:
+    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
+    // Note that this logic is different than the generic \r\n collapsing
+    // handled in the input stream preprocessor. This logic is here as an
+    // "authoring convenience" so folks can write:
+    //
+    // <pre>
+    // lorem ipsum
+    // lorem ipsum
+    // </pre>
+    //
+    // without getting an extra newline at the start of their <pre> element.
+    if (m_shouldSkipLeadingNewline) {
+        m_shouldSkipLeadingNewline = false;
+        buffer.skipAtMostOneLeadingNewline();
+        if (buffer.isEmpty())
+            return;
+    }
+
     switch (insertionMode()) {
     case InitialMode: {
         ASSERT(insertionMode() == InitialMode);
index 32a19a7..aceb669 100644 (file)
@@ -83,6 +83,8 @@ public:
     // Done, close any open tags, etc.
     void finished();
 
+    void setShouldSkipLeadingNewline(bool shouldSkip) { m_shouldSkipLeadingNewline = shouldSkip; }
+
     static bool scriptEnabled(Frame*);
     static bool pluginsEnabled(Frame*);
 
@@ -239,6 +241,8 @@ private:
     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#pending-table-character-tokens
     StringBuilder m_pendingTableCharacters;
 
+    bool m_shouldSkipLeadingNewline;
+
     // We access parser because HTML5 spec requires that we be able to change the state of the tokenizer
     // from within parser actions. We also need it to track the current position.
     HTMLDocumentParser* m_parser;
index ed5543e..de3c1d3 100644 (file)
@@ -66,6 +66,10 @@ void TextDocumentParser::insertFakePreElement()
     AtomicHTMLToken fakePre(HTMLTokenTypes::StartTag, preTag.localName(), attributes.release());
 
     treeBuilder()->constructTreeFromAtomicToken(fakePre);
+    // Normally we would skip the first \n after a <pre> element, but we don't
+    // want to skip the first \n for text documents!
+    treeBuilder()->setShouldSkipLeadingNewline(false);
+
     m_haveInsertedFakePreElement = true;
 }
 
index a1ca402..b8f9756 100644 (file)
@@ -197,7 +197,6 @@ protected:
     Token* m_token;
     int m_lineNumber;
 
-    bool m_skipLeadingNewLineForListing;
     bool m_forceNullCharacterReplacement;
 
     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character