Reviewed by Eric Seidel.
author ap@webkit.org <ap@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 13 May 2008 18:36:58 +0000 (18:36 +0000)
committer ap@webkit.org <ap@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 13 May 2008 18:36:58 +0000 (18:36 +0000)
        https://bugs.webkit.org/show_bug.cgi?id=18681
        <rdar://problem/5888130> WebKit should not remove BOM characters from content.

        We were only trying to match Firefox, and it doesn't do this any more.

        Tests: fast/encoding/bom-in-content.html
               fast/encoding/bom-in-content-utf16.html

        * platform/text/TextDecoder.cpp: (WebCore::TextDecoder::checkForBOM): Skip the BOM if it's
        at the start of input stream.

        * platform/text/TextCodec.cpp:
        * platform/text/TextCodec.h:
        * platform/text/TextCodecICU.cpp:
        (WebCore::TextCodecICU::decode):
        * platform/text/TextCodecUTF16.cpp:
        (WebCore::TextCodecUTF16::decode):
        * platform/text/mac/TextCodecMac.cpp:
        (WebCore::TextCodecMac::decode):
        Don't remove the BOM.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@33380 268f45cc-cd09-0410-ab3c-d52691b4dbfc

18 files changed:
LayoutTests/ChangeLog
LayoutTests/fast/encoding/bom-in-content-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/bom-in-content-utf16-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/bom-in-content-utf16.html [new file with mode: 0644]
LayoutTests/fast/encoding/bom-in-content.html [new file with mode: 0644]
LayoutTests/http/tests/incremental/resources/slow-utf8-css.pl [moved from LayoutTests/http/tests/incremental/slow-utf8-css.pl with 63% similarity]
LayoutTests/http/tests/incremental/slow-utf8-css-expected.txt [new file with mode: 0644]
LayoutTests/http/tests/incremental/slow-utf8-css.html [new file with mode: 0644]
LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.checksum [deleted file]
LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.png [deleted file]
LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.txt [deleted file]
WebCore/ChangeLog
WebCore/platform/text/TextCodec.cpp
WebCore/platform/text/TextCodec.h
WebCore/platform/text/TextCodecICU.cpp
WebCore/platform/text/TextCodecUTF16.cpp
WebCore/platform/text/TextDecoder.cpp
WebCore/platform/text/mac/TextCodecMac.cpp

index d0dac93..df0415f 100644 (file)
@@ -1,5 +1,29 @@
 2008-05-13  Alexey Proskuryakov  <ap@webkit.org>
 
+        Reviewed by Eric Seidel.
+
+        https://bugs.webkit.org/show_bug.cgi?id=18681
+        <rdar://problem/5888130> WebKit should not remove BOM characters from content.
+
+        * fast/encoding/bom-in-content-expected.txt: Added.
+        * fast/encoding/bom-in-content.html: Added.
+        * fast/encoding/bom-in-content-utf16-expected.txt: Added.
+        * fast/encoding/bom-in-content-utf16.html: Added.
+
+        * http/tests/incremental/resources: Added.
+        * http/tests/incremental/resources/slow-utf8-css.pl: Copied from LayoutTests/http/tests/incremental/slow-utf8-css.pl.
+        * http/tests/incremental/slow-utf8-css-expected.txt: Added.
+        * http/tests/incremental/slow-utf8-css.html: Added.
+        * http/tests/incremental/slow-utf8-css.pl: Removed.
+        * platform/mac/http/tests/incremental: Removed.
+        * platform/mac/http/tests/incremental/slow-utf8-css-expected.checksum: Removed.
+        * platform/mac/http/tests/incremental/slow-utf8-css-expected.png: Removed.
+        * platform/mac/http/tests/incremental/slow-utf8-css-expected.txt: Removed.
+        This test was relying on BOM characters being removed, but this was not what it tested for.
+        Rewrote it and made text-only.
+
+2008-05-13  Alexey Proskuryakov  <ap@webkit.org>
+
         Reviewed by Dan Bernstein.
 
         Add tests verifying that we don't mix up some similar, but incompatible encodings.
diff --git a/LayoutTests/fast/encoding/bom-in-content-expected.txt b/LayoutTests/fast/encoding/bom-in-content-expected.txt
new file mode 100644 (file)
index 0000000..11252db
--- /dev/null
@@ -0,0 +1,4 @@
+Test for bug 18681: BOM characters should not be removed from input stream.
+
+
+PASS
diff --git a/LayoutTests/fast/encoding/bom-in-content-utf16-expected.txt b/LayoutTests/fast/encoding/bom-in-content-utf16-expected.txt
new file mode 100644 (file)
index 0000000..11252db
--- /dev/null
@@ -0,0 +1,4 @@
+Test for bug 18681: BOM characters should not be removed from input stream.
+
+
+PASS
diff --git a/LayoutTests/fast/encoding/bom-in-content-utf16.html b/LayoutTests/fast/encoding/bom-in-content-utf16.html
new file mode 100644 (file)
index 0000000..3b55ad1
Binary files /dev/null and b/LayoutTests/fast/encoding/bom-in-content-utf16.html differ
diff --git a/LayoutTests/fast/encoding/bom-in-content.html b/LayoutTests/fast/encoding/bom-in-content.html
new file mode 100644 (file)
index 0000000..c30a914
--- /dev/null
@@ -0,0 +1,14 @@
+<head>
+<meta charset="utf-8">
+</head>
+<body>
+<p>Test for <a href="https://bugs.webkit.org/show_bug.cgi?id=18681">bug 18681</a>:
+BOM characters should not be removed from input stream.<p>
+<div id=BOMs></div>
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.write(document.getElementById("BOMs").innerHTML.length == 1 ? "PASS" : "FAIL");
+</script>
+</body>
@@ -10,12 +10,12 @@ print "Cache-Control: no-store, no-cache, must-revalidate\n";
 print "Pragma: no-cache\n";
 print "\n";
 
-print "\xef\xbb\xbfTest for bug 10753: The beginning of a CSS file is missing.\n\n";
-# Dump some BOMs to bypass CFNetwork buffering.
+print "\xef\xbb\xbf#result {color:green;}\n";
+# Dump some spaces to bypass CFNetwork buffering.
 for ($count = 1; $count < 4000; $count++) {
-    print "\xef\xbb\xbf";
+    print "   ";
 }
 
 # Delay to force the second line of text to be decoded as a separate chunk.
 sleep 1;
-print "You should see a bug description on a separate line above this one.";
+print "body {}";
diff --git a/LayoutTests/http/tests/incremental/slow-utf8-css-expected.txt b/LayoutTests/http/tests/incremental/slow-utf8-css-expected.txt
new file mode 100644 (file)
index 0000000..581f70b
--- /dev/null
@@ -0,0 +1,3 @@
+Test for bug 10753: The beginning of a CSS file is missing.
+
+PASS
diff --git a/LayoutTests/http/tests/incremental/slow-utf8-css.html b/LayoutTests/http/tests/incremental/slow-utf8-css.html
new file mode 100644 (file)
index 0000000..cd827a7
--- /dev/null
@@ -0,0 +1,16 @@
+<head>
+<link rel="stylesheet" href="resources/slow-utf8-css.pl" type="text/css" charset="utf-8">
+<script>
+function test() {
+    if (window.layoutTestController)
+        layoutTestController.dumpAsText();
+    document.getElementById("result").innerHTML = 
+        (document.styleSheets.item(0).cssRules.item(0).selectorText == "#result") ? "PASS" : "FAIL";
+}
+</script>
+</head>
+<body onload="test()">
+<p>Test for <a href="https://bugs.webkit.org/show_bug.cgi?id=10753">bug 10753</a>:
+The beginning of a CSS file is missing.
+<div id=result>Should be green</div>
+</body>
diff --git a/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.checksum b/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.checksum
deleted file mode 100644 (file)
index fcc75b9..0000000
+++ /dev/null
@@ -1 +0,0 @@
-b11756f779dae6e4e4852507da3ed50b
\ No newline at end of file
diff --git a/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.png b/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.png
deleted file mode 100644 (file)
index e2ece01..0000000
Binary files a/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.png and /dev/null differ
diff --git a/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.txt b/LayoutTests/platform/mac/http/tests/incremental/slow-utf8-css-expected.txt
deleted file mode 100644 (file)
index a77b220..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-layer at (0,0) size 800x600
-  RenderView at (0,0) size 800x600
-layer at (0,0) size 800x600
-  RenderBlock {HTML} at (0,0) size 800x600
-    RenderBody {BODY} at (8,8) size 784x579
-      RenderBlock {PRE} at (0,0) size 784x45
-        RenderText {#text} at (0,0) size 472x30
-          text run at (0,0) width 472: "Test for bug 10753: The beginning of a CSS file is missing."
-          text run at (472,0) width 0: " "
-          text run at (0,15) width 0: " "
-        RenderText {#text} at (0,30) size 536x15
-          text run at (0,30) width 536: "You should see a bug description on a separate line above this one."
index 7b729a9..a387072 100644 (file)
@@ -1,3 +1,28 @@
+2008-05-13  Alexey Proskuryakov  <ap@webkit.org>
+
+        Reviewed by Eric Seidel.
+
+        https://bugs.webkit.org/show_bug.cgi?id=18681
+        <rdar://problem/5888130> WebKit should not remove BOM characters from content.
+
+        We were only trying to match Firefox, and it doesn't do this any more.
+
+        Tests: fast/encoding/bom-in-content.html
+               fast/encoding/bom-in-content-utf16.html
+
+        * platform/text/TextDecoder.cpp: (WebCore::TextDecoder::checkForBOM): Skip the BOM if it's
+        at the start of input stream.
+
+        * platform/text/TextCodec.cpp:
+        * platform/text/TextCodec.h:
+        * platform/text/TextCodecICU.cpp:
+        (WebCore::TextCodecICU::decode):
+        * platform/text/TextCodecUTF16.cpp:
+        (WebCore::TextCodecUTF16::decode):
+        * platform/text/mac/TextCodecMac.cpp:
+        (WebCore::TextCodecMac::decode):
+        Don't remove the BOM.
+
 2008-05-13  Anders Carlsson  <andersca@apple.com>
 
         Reviewed by Darin.
index 83edf73..4222ee1 100644 (file)
 
 namespace WebCore {
 
-const UChar BOM = 0xFEFF;
-
 TextCodec::~TextCodec()
 {
 }
 
-// We strip BOM characters because they can show up both at the start of content
-// and inside content, and we never want them to end up in the decoded text.
-void TextCodec::appendOmittingBOM(Vector<UChar>& v, const UChar* characters, size_t length)
-{
-    size_t start = 0;
-    for (size_t i = 0; i != length; ++i) {
-        if (BOM == characters[i]) {
-            if (start != i)
-                v.append(&characters[start], i - start);
-            start = i + 1;
-        }
-    }
-    if (start != length)
-        v.append(&characters[start], length - start);
-}
-
 int TextCodec::getUnencodableReplacement(unsigned codePoint, UnencodableHandling handling, UnencodableReplacementArray replacement)
 {
     switch (handling) {
index 9654a2f..0a56262 100644 (file)
@@ -72,9 +72,6 @@ namespace WebCore {
         // unencodable character into the given replacement buffer. 
         // The length of the string (not including the null) will be returned.
         static int getUnencodableReplacement(unsigned codePoint, UnencodableHandling, UnencodableReplacementArray);
-
-    protected:
-        static void appendOmittingBOM(Vector<UChar>&, const UChar*, size_t length);
     };
 
     typedef void (*EncodingNameRegistrar)(const char* alias, const char* name);
index 68b59c6..dc60910 100644 (file)
@@ -282,7 +282,7 @@ String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool s
 
     do {
         int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush, err);
-        appendOmittingBOM(result, buffer, ucharsDecoded);
+        result.append(buffer, ucharsDecoded);
     } while (err == U_BUFFER_OVERFLOW_ERROR);
 
     if (U_FAILURE(err)) {
index 1148bce..88e4e73 100644 (file)
@@ -34,8 +34,6 @@ using std::auto_ptr;
 
 namespace WebCore {
 
-const UChar BOM = 0xFEFF;
-
 void TextCodecUTF16::registerEncodingNames(EncodingNameRegistrar registrar)
 {
     registrar("UTF-16LE", "UTF-16LE");
@@ -85,8 +83,7 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool stopO
             c = m_bufferedByte | (p[0] << 8);
         else
             c = (m_bufferedByte << 8) | p[0];
-        if (c != BOM)
-            *q++ = c;
+        *q++ = c;
         m_haveBufferedByte = false;
         p += 1;
         numChars -= 1;
@@ -96,15 +93,13 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool stopO
         for (size_t i = 0; i < numChars; ++i) {
             UChar c = p[0] | (p[1] << 8);
             p += 2;
-            if (c != BOM)
-                *q++ = c;
+            *q++ = c;
         }
     else
         for (size_t i = 0; i < numChars; ++i) {
             UChar c = (p[0] << 8) | p[1];
             p += 2;
-            if (c != BOM)
-                *q++ = c;
+            *q++ = c;
         }
 
     if (numBytes & 1) {
index 9016382..e39a6b7 100644 (file)
@@ -49,6 +49,8 @@ void TextDecoder::reset(const TextEncoding& encoding)
 
 String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
 {
+    ASSERT(!m_checkedForBOM);
+
     // Check to see if we found a BOM.
     size_t numBufferedBytes = m_numBufferedBytes;
     size_t buf1Len = numBufferedBytes;
@@ -62,22 +64,28 @@ String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, boo
 
     const TextEncoding* encodingConsideringBOM = &m_encoding;
     bool foundBOM = true;
+    size_t lengthOfBOM = 0;
     if (c1 == 0xFF && c2 == 0xFE) {
-        if (c3 != 0 || c4 != 0) 
+        if (c3 != 0 || c4 != 0)  {
             encodingConsideringBOM = &UTF16LittleEndianEncoding();
-        else if (numBufferedBytes + length > sizeof(m_bufferedBytes))
+            lengthOfBOM = 2;
+        } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
             encodingConsideringBOM = &UTF32LittleEndianEncoding();
-        else
+            lengthOfBOM = 4;
+        } else
             foundBOM = false;
-    }
-    else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
+    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
         encodingConsideringBOM = &UTF8Encoding();
-    else if (c1 == 0xFE && c2 == 0xFF)
+        lengthOfBOM = 3;
+    } else if (c1 == 0xFE && c2 == 0xFF) {
         encodingConsideringBOM = &UTF16BigEndianEncoding();
-    else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
+        lengthOfBOM = 2;
+    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
         encodingConsideringBOM = &UTF32BigEndianEncoding();
-    else
+        lengthOfBOM = 4;
+    } else
         foundBOM = false;
+
     if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
         // Continue to look for the BOM.
         memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
@@ -91,6 +99,18 @@ String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, boo
         return String();
     m_checkedForBOM = true;
 
+    // Skip the BOM.
+    if (foundBOM) {
+        ASSERT(numBufferedBytes < lengthOfBOM);
+        size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes;
+        ASSERT(numUnbufferedBOMBytes <= length);
+
+        data += numUnbufferedBOMBytes;
+        length -= numUnbufferedBOMBytes;
+        numBufferedBytes = 0;
+        m_numBufferedBytes = 0;
+    }
+
     // Handle case where we have some buffered bytes to deal with.
     if (numBufferedBytes) {
         char bufferedBytes[sizeof(m_bufferedBytes)];
index aae12f0..ac1f0fb 100644 (file)
@@ -243,7 +243,7 @@ String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool s
         }
 
         ASSERT(!(bytesWritten % sizeof(UChar)));
-        appendOmittingBOM(result, buffer, bytesWritten / sizeof(UChar));
+        result.append(buffer, bytesWritten / sizeof(UChar));
 
         bufferWasFull = status == kTECOutputBufferFullStatus;
     }
@@ -252,7 +252,7 @@ String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool s
         unsigned long bytesWritten = 0;
         TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
         ASSERT(!(bytesWritten % sizeof(UChar)));
-        appendOmittingBOM(result, buffer, bytesWritten / sizeof(UChar));
+        result.append(buffer, bytesWritten / sizeof(UChar));
     }
 
     String resultString = String::adopt(result);