Reviewed by Eric.
author ap <ap@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 1 Sep 2006 19:36:06 +0000 (19:36 +0000)
committer ap <ap@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 1 Sep 2006 19:36:06 +0000 (19:36 +0000)
        - http://bugzilla.opendarwin.org/show_bug.cgi?id=5620
        Should only honor encoding from <meta> in HTML

        - http://bugzilla.opendarwin.org/show_bug.cgi?id=9783
        An XML declaration without an explicit encoding incorrectly triggers
        UTF-8 encoding in an HTML document

        - http://bugzilla.opendarwin.org/show_bug.cgi?id=10155
        CSS2: @charset is not supported

WebCore:
        * loader/Decoder.cpp:
        (Decoder::Decoder): Decoder now knows what kind of content it is decoding.
        Also, the browser default encoding can now be passed directly to the constructor,
        to streamline the logic.
        (Decoder::decode): Add support for @charset, differentiate between HTML and XML.
        (Decoder::setEncodingName): Style cleanup.
        (Decoder::encodingName): Ditto.
        (Decoder::flush): Ditto.
        * loader/Decoder.h:
        (WebCore::Decoder::):

        * bridge/mac/WebCoreEncodings.mm: Pass a content type of text/html to Decoder.

        * loader/CachedCSSStyleSheet.cpp:
        (WebCore::CachedCSSStyleSheet::CachedCSSStyleSheet):
        (WebCore::CachedCSSStyleSheet::setCharset):
        (WebCore::CachedCSSStyleSheet::data):
        * loader/CachedCSSStyleSheet.h:
        Use Decoder instead of TextEncoding::toUnicode() to enable @charset support.

        * loader/CachedXBLDocument.cpp:
        (WebCore::CachedXBLDocument::CachedXBLDocument): Pass a content type.
        * loader/CachedXSLStyleSheet.cpp:
        (WebCore::CachedXSLStyleSheet::CachedXSLStyleSheet): Ditto.
        (WebCore::CachedXSLStyleSheet::data): Flush the decoder to be safe.

        * page/Frame.cpp:
        (WebCore::Frame::write): Pass a content type and a default encoding to
        the Decoder.
        * xml/XSLTProcessor.cpp:
        (WebCore::XSLTProcessor::createDocumentFromSource): Pass the output document
        MIME type.
        * xml/xmlhttprequest.cpp:
        (WebCore::XMLHttpRequest::receivedData): Ditto.

LayoutTests:
        * fast/encoding/css-charset-expected.txt: Added.
        * fast/encoding/css-charset.css: Added.
        * fast/encoding/css-charset.html: Added.
        * fast/encoding/css-charset-evil-expected.txt: Added.
        * fast/encoding/css-charset-evil.css: Added.
        * fast/encoding/css-charset-evil.html: Added.
        * fast/encoding/default-xhtml-encoding-expected.txt: Added.
        * fast/encoding/default-xhtml-encoding.xhtml: Added.
        * fast/encoding/meta-in-xhtml-expected.txt: Added.
        * fast/encoding/meta-in-xhtml.xhtml: Added.
        * fast/encoding/pseudo-xml-2-expected.txt: Added.
        * fast/encoding/pseudo-xml-2.html: Added.
        * fast/encoding/pseudo-xml-3-expected.txt: Added.
        * fast/encoding/pseudo-xml-3.html: Added.
        * fast/encoding/pseudo-xml-4-expected.txt: Added.
        * fast/encoding/pseudo-xml-4.html: Added.
        * fast/encoding/pseudo-xml-expected.txt: Added.
        * fast/encoding/pseudo-xml.html: Added.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@16175 268f45cc-cd09-0410-ab3c-d52691b4dbfc

30 files changed:
LayoutTests/ChangeLog
LayoutTests/fast/encoding/css-charset-evil-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/css-charset-evil.css [new file with mode: 0644]
LayoutTests/fast/encoding/css-charset-evil.html [new file with mode: 0644]
LayoutTests/fast/encoding/css-charset-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/css-charset.css [new file with mode: 0644]
LayoutTests/fast/encoding/css-charset.html [new file with mode: 0644]
LayoutTests/fast/encoding/default-xhtml-encoding-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/default-xhtml-encoding.xhtml [new file with mode: 0644]
LayoutTests/fast/encoding/meta-in-xhtml-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/meta-in-xhtml.xhtml [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-2-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-2.html [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-3-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-3.html [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-4-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-4.html [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/pseudo-xml.html [new file with mode: 0644]
WebCore/ChangeLog
WebCore/bridge/mac/WebCoreEncodings.mm
WebCore/loader/CachedCSSStyleSheet.cpp
WebCore/loader/CachedCSSStyleSheet.h
WebCore/loader/CachedXBLDocument.cpp
WebCore/loader/CachedXSLStyleSheet.cpp
WebCore/loader/Decoder.cpp
WebCore/loader/Decoder.h
WebCore/page/Frame.cpp
WebCore/xml/XSLTProcessor.cpp
WebCore/xml/xmlhttprequest.cpp

index c685948e7defdd954cda833a1eedac03c9f1cf67..45b0ff0ff0b773c1e165e6cb901c1a4933208b68 100644 (file)
@@ -1,3 +1,37 @@
+2006-09-01  Alexey Proskuryakov  <ap@nypop.com>
+
+        Reviewed by Eric.
+
+        Tests for:
+        - http://bugzilla.opendarwin.org/show_bug.cgi?id=5620
+        Should only honor encoding from <meta> in HTML
+
+        - http://bugzilla.opendarwin.org/show_bug.cgi?id=9783
+        An XML declaration without an explicit encoding incorrectly triggers
+        UTF-8 encoding in an HTML document
+
+        - http://bugzilla.opendarwin.org/show_bug.cgi?id=10155
+        CSS2: @charset is not supported
+
+        * fast/encoding/css-charset-expected.txt: Added.
+        * fast/encoding/css-charset.css: Added.
+        * fast/encoding/css-charset.html: Added.
+        * fast/encoding/css-charset-evil-expected.txt: Added.
+        * fast/encoding/css-charset-evil.css: Added.
+        * fast/encoding/css-charset-evil.html: Added.
+        * fast/encoding/default-xhtml-encoding-expected.txt: Added.
+        * fast/encoding/default-xhtml-encoding.xhtml: Added.
+        * fast/encoding/meta-in-xhtml-expected.txt: Added.
+        * fast/encoding/meta-in-xhtml.xhtml: Added.
+        * fast/encoding/pseudo-xml-2-expected.txt: Added.
+        * fast/encoding/pseudo-xml-2.html: Added.
+        * fast/encoding/pseudo-xml-3-expected.txt: Added.
+        * fast/encoding/pseudo-xml-3.html: Added.
+        * fast/encoding/pseudo-xml-4-expected.txt: Added.
+        * fast/encoding/pseudo-xml-4.html: Added.
+        * fast/encoding/pseudo-xml-expected.txt: Added.
+        * fast/encoding/pseudo-xml.html: Added.
+
 2006-08-31  Alice Liu  <alice.liu@apple.com>
 
         Reviewed by Darin.
diff --git a/LayoutTests/fast/encoding/css-charset-evil-expected.txt b/LayoutTests/fast/encoding/css-charset-evil-expected.txt
new file mode 100644 (file)
index 0000000..6a8d01a
--- /dev/null
@@ -0,0 +1,7 @@
+Test for bug 10155: CSS2: @charset is not supported
+
+CSS 2.1 says that @charset " must be written literally, but Firefox and IE allow extra spaces and single quotes.
+
+SUССЕSS
+
+
diff --git a/LayoutTests/fast/encoding/css-charset-evil.css b/LayoutTests/fast/encoding/css-charset-evil.css
new file mode 100644 (file)
index 0000000..e134cca
--- /dev/null
@@ -0,0 +1,2 @@
+@charset         'utf-8'       ;
+#dummy:before { content: "SUССЕSS"; }
diff --git a/LayoutTests/fast/encoding/css-charset-evil.html b/LayoutTests/fast/encoding/css-charset-evil.html
new file mode 100644 (file)
index 0000000..eaa633d
--- /dev/null
@@ -0,0 +1,31 @@
+<html>
+<head>
+    <meta content="text/html; charset=windows-1251" http-equiv="Content-Type"/>
+    <link rel="stylesheet" type="text/css" href="css-charset-evil.css">
+</head>
+<body onload="test()">
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=10155">bug 10155</a>:
+CSS2: @charset is not supported</p>
+
+<p>CSS 2.1 says that <code>@charset "</code> must be written literally, 
+but Firefox and IE allow extra spaces and single quotes.</p>
+
+<p id="result"></p>
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+function test() {
+  try {
+    text = document.styleSheets[0].rules[0].style.getPropertyValue("content");
+    text = text.replace(/"/g, "");
+    document.getElementById("result").textContent = text;
+  } catch (ex) {
+    document.getElementById("result").textContent = ex.toString();
+  }
+}
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/css-charset-expected.txt b/LayoutTests/fast/encoding/css-charset-expected.txt
new file mode 100644 (file)
index 0000000..637feae
--- /dev/null
@@ -0,0 +1,7 @@
+Test for bug 10155: CSS2: @charset is not supported
+
+Test that @charset works and that indexed rule access via an IE-specific rules property does not take it into account.
+
+SUCCESS
+
+
diff --git a/LayoutTests/fast/encoding/css-charset.css b/LayoutTests/fast/encoding/css-charset.css
new file mode 100644 (file)
index 0000000..675084b
--- /dev/null
@@ -0,0 +1,2 @@
+@charset "utf-8";
+#dummy:before { content: "SUССЕSS"; }
diff --git a/LayoutTests/fast/encoding/css-charset.html b/LayoutTests/fast/encoding/css-charset.html
new file mode 100644 (file)
index 0000000..9087d22
--- /dev/null
@@ -0,0 +1,34 @@
+<html>
+<head>
+    <meta content="text/html; charset=windows-1251" http-equiv="Content-Type"/>
+    <link rel="stylesheet" type="text/css" href="css-charset.css" charset="windows-1251">
+    <!-- The document charset and link charset have lower priority than 
+         @charset, so they shouldn't affect anything. -->
+</head>
+<body onload="test()">
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=10155">bug 10155</a>:
+CSS2: @charset is not supported</p>
+<p>Test that <code>@charset</code> works and that indexed rule access via 
+an IE-specific <code>rules</code> property does not take it into account.</p>
+
+<p id="result"></p>
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+function test() {
+  try {
+    text = document.styleSheets[0].rules[0].style.cssText;
+    if (text.match('.*SUÑÑÅSS.*'))
+      result.innerHTML = "SUCCESS";
+    else
+      result.innerHTML = "FAILURE: " + text;
+  } catch (ex) {
+    result.innerHTML = "FAILURE: " + ex;
+  }
+}
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/default-xhtml-encoding-expected.txt b/LayoutTests/fast/encoding/default-xhtml-encoding-expected.txt
new file mode 100644 (file)
index 0000000..5dffac1
--- /dev/null
@@ -0,0 +1,5 @@
+Test default XHTML encoding (in the absence of an XML declaration).
+
+Charset: UTF-8 (should be UTF-8)
+
+
diff --git a/LayoutTests/fast/encoding/default-xhtml-encoding.xhtml b/LayoutTests/fast/encoding/default-xhtml-encoding.xhtml
new file mode 100644 (file)
index 0000000..ffbc92c
--- /dev/null
@@ -0,0 +1,16 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+<p>Test default XHTML encoding (in the absence of an XML declaration).</p>
+
+<p id="result" />
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.getElementById("result").innerHTML = "Charset: " + 
+    (document.charset ? document.charset : document.characterSet) + " (should be UTF-8)";
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/meta-in-xhtml-expected.txt b/LayoutTests/fast/encoding/meta-in-xhtml-expected.txt
new file mode 100644 (file)
index 0000000..6fc06c9
--- /dev/null
@@ -0,0 +1,5 @@
+Test for bug 5620: Should only honor encoding from <meta> in HTML
+
+Charset: UTF-8 (should be UTF-8)
+
+
diff --git a/LayoutTests/fast/encoding/meta-in-xhtml.xhtml b/LayoutTests/fast/encoding/meta-in-xhtml.xhtml
new file mode 100644 (file)
index 0000000..d9c6065
--- /dev/null
@@ -0,0 +1,23 @@
+<?xml version="1.0" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta content="text/html; charset=windows-1251" http-equiv="Content-Type"/>
+</head>
+<body>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=5620">bug 5620</a>:
+Should only honor encoding from &lt;meta> in HTML</p>
+
+<p id="result" />
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.getElementById("result").innerHTML = "Charset: " + 
+    (document.charset ? document.charset : document.characterSet) + " (should be UTF-8)";
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/pseudo-xml-2-expected.txt b/LayoutTests/fast/encoding/pseudo-xml-2-expected.txt
new file mode 100644 (file)
index 0000000..2ece178
--- /dev/null
@@ -0,0 +1,5 @@
+Test for bug 9783: An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document
+
+Test that an XML declaration with an explicit encoding is still honored. This is what Firefox 1.5 and Safari 2.0 do, unlike WinIE 6.
+
+Charset: KOI8-R (should be KOI8-R)
diff --git a/LayoutTests/fast/encoding/pseudo-xml-2.html b/LayoutTests/fast/encoding/pseudo-xml-2.html
new file mode 100644 (file)
index 0000000..3d15529
--- /dev/null
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="koi8-r" ?>
+<html>
+<body>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=9783">bug 9783</a>:
+An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document</p>
+<p>Test that an XML declaration <b>with</b> an explicit encoding is still honored. This is what Firefox 1.5 and Safari 2.0 do, unlike WinIE 6.</p>
+<p id="result" />
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.getElementById("result").innerHTML = "Charset: " + 
+    (document.charset ? document.charset : document.characterSet) + " (should be KOI8-R)";
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/pseudo-xml-3-expected.txt b/LayoutTests/fast/encoding/pseudo-xml-3-expected.txt
new file mode 100644 (file)
index 0000000..74f20f2
--- /dev/null
@@ -0,0 +1,3 @@
+Test for bug 9783: An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document
+
+Charset: KOI8-R (should be KOI8-R)
diff --git a/LayoutTests/fast/encoding/pseudo-xml-3.html b/LayoutTests/fast/encoding/pseudo-xml-3.html
new file mode 100644 (file)
index 0000000..b9edc45
--- /dev/null
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta content="text/html; charset=KOI8-R" http-equiv="Content-Type"/>
+</head>
+<body>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=9783">bug 9783</a>:
+An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document</p>
+
+<p id="result" />
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.getElementById("result").innerHTML = "Charset: " + 
+    (document.charset ? document.charset : document.characterSet) + " (should be KOI8-R)";
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/pseudo-xml-4-expected.txt b/LayoutTests/fast/encoding/pseudo-xml-4-expected.txt
new file mode 100644 (file)
index 0000000..da4a985
--- /dev/null
@@ -0,0 +1,3 @@
+Test for bug 9783: An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document
+
+Charset: windows-1251 (should be windows-1251)
diff --git a/LayoutTests/fast/encoding/pseudo-xml-4.html b/LayoutTests/fast/encoding/pseudo-xml-4.html
new file mode 100644 (file)
index 0000000..64f6036
--- /dev/null
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="KOI8-R" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta content="text/html; charset=windows-1251" http-equiv="Content-Type"/>
+</head>
+<body>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=9783">bug 9783</a>:
+An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document</p>
+
+<p id="result" />
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.getElementById("result").innerHTML = "Charset: " + 
+    (document.charset ? document.charset : document.characterSet) + " (should be windows-1251)";
+</script>
+
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/pseudo-xml-expected.txt b/LayoutTests/fast/encoding/pseudo-xml-expected.txt
new file mode 100644 (file)
index 0000000..1b56ef3
--- /dev/null
@@ -0,0 +1,3 @@
+Test for bug 9783: An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document
+
+Charset: ISO-8859-1 (should be your browser default one)
diff --git a/LayoutTests/fast/encoding/pseudo-xml.html b/LayoutTests/fast/encoding/pseudo-xml.html
new file mode 100644 (file)
index 0000000..911d18c
--- /dev/null
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=9783">bug 9783</a>:
+An XML declaration without an explicit encoding incorrectly triggers UTF-8 encoding in an HTML document</p>
+
+<p id="result" />
+
+<script>
+if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+document.getElementById("result").innerHTML = "Charset: " + 
+    (document.charset ? document.charset : document.characterSet) + " (should be your browser default one)";
+</script>
+
+</body>
+</html>
index 9a2a4d9624fbc4cc8633cd277a4a16d694c02b02..1921c3fa960fde46cb6e20b545654964b6cb62e0 100644 (file)
@@ -1,3 +1,52 @@
+2006-09-01  Alexey Proskuryakov  <ap@nypop.com>
+
+        Reviewed by Eric.
+
+        - http://bugzilla.opendarwin.org/show_bug.cgi?id=5620
+        Should only honor encoding from <meta> in HTML
+
+        - http://bugzilla.opendarwin.org/show_bug.cgi?id=9783
+        An XML declaration without an explicit encoding incorrectly triggers
+        UTF-8 encoding in an HTML document
+
+        - http://bugzilla.opendarwin.org/show_bug.cgi?id=10155
+        CSS2: @charset is not supported
+        
+        * loader/Decoder.cpp:
+        (Decoder::Decoder): Decoder now knows what kind of content it is decoding.
+        Also, the browser default encoding can now be passed directly to the constructor,
+        to streamline the logic.
+        (Decoder::decode): Add support for @charset, differentiate between HTML and XML.
+        (Decoder::setEncodingName): Style cleanup.
+        (Decoder::encodingName): Ditto.
+        (Decoder::flush): Ditto.
+        * loader/Decoder.h:
+        (WebCore::Decoder::):
+
+        * bridge/mac/WebCoreEncodings.mm: Pass a content type of text/html to Decoder.
+
+        * loader/CachedCSSStyleSheet.cpp:
+        (WebCore::CachedCSSStyleSheet::CachedCSSStyleSheet):
+        (WebCore::CachedCSSStyleSheet::setCharset):
+        (WebCore::CachedCSSStyleSheet::data):
+        * loader/CachedCSSStyleSheet.h:
+        Use Decoder instead of TextEncoding::toUnicode() to enable @charset support.
+
+        * loader/CachedXBLDocument.cpp:
+        (WebCore::CachedXBLDocument::CachedXBLDocument): Pass a content type.
+        * loader/CachedXSLStyleSheet.cpp:
+        (WebCore::CachedXSLStyleSheet::CachedXSLStyleSheet): Ditto.
+        (WebCore::CachedXSLStyleSheet::data): Flush the decoder to be safe.
+
+        * page/Frame.cpp:
+        (WebCore::Frame::write): Pass a content type and a default encoding to 
+        the Decoder.
+        * xml/XSLTProcessor.cpp:
+        (WebCore::XSLTProcessor::createDocumentFromSource): Pass the output document
+        MIME type.
+        * xml/xmlhttprequest.cpp:
+        (WebCore::XMLHttpRequest::receivedData): Ditto.
+
 2006-09-01  Adele Peterson  <adele@apple.com>
 
         Reviewed by John.
index 06b26bc9c99f7f387c718bdcdbb23a94b40f4393..1666921f0da9ef5c29e9ab38dc640e8d69703020 100644 (file)
@@ -36,7 +36,7 @@ using namespace WebCore;
 + (NSString *)decodeData:(NSData *)data
 {
     HTMLNames::init(); // this method is used for importing bookmarks at startup, so HTMLNames are likely to be uninitialized yet
-    Decoder* decoder = new Decoder;
+    Decoder* decoder = new Decoder("text/html"); // bookmark files are HTML
     DeprecatedString result = decoder->decode(static_cast<const char *>([data bytes]), [data length]);
     result += decoder->flush();
     decoder->deref();
index 8645ee997c88bc03ebe3d171e8809ab4934fd825..ae7fedaf090041482be1f0d2a604919114c4a5de 100644 (file)
@@ -32,6 +32,7 @@
 #include "Cache.h"
 #include "CachedResourceClient.h"
 #include "CachedResourceClientWalker.h"
+#include "Decoder.h"
 #include "LoaderFunctions.h"
 #include "loader.h"
 #include <wtf/Vector.h>
@@ -40,20 +41,17 @@ namespace WebCore {
 
 CachedCSSStyleSheet::CachedCSSStyleSheet(DocLoader* dl, const String &url, CachePolicy cachePolicy, time_t _expireDate, const DeprecatedString& charset)
     : CachedResource(url, CSSStyleSheet, cachePolicy, _expireDate)
-    , m_encoding(charset.latin1())
+    , m_decoder(new Decoder("text/css"))
 {
     // It's css we want.
     setAccept("text/css");
     // load the file
     Cache::loader()->load(dl, this, false);
     m_loading = true;
-    if (!m_encoding.isValid())
-        m_encoding = TextEncoding(Latin1Encoding);
 }
 
 CachedCSSStyleSheet::CachedCSSStyleSheet(const String &url, const DeprecatedString &stylesheet_data)
     : CachedResource(url, CSSStyleSheet, CachePolicyVerify, 0, stylesheet_data.length())
-    , m_encoding(InvalidEncoding)
 {
     m_loading = false;
     m_status = Persistent;
@@ -82,11 +80,8 @@ void CachedCSSStyleSheet::deref(CachedResourceClient *c)
 
 void CachedCSSStyleSheet::setCharset(const DeprecatedString& chs)
 {
-    if (!chs.isEmpty()) {
-        TextEncoding encoding = TextEncoding(chs.latin1());
-        if (encoding.isValid())
-            m_encoding = encoding;
-    }
+    if (!chs.isEmpty())
+        m_decoder->setEncodingName(chs.latin1(), Decoder::EncodingFromHTTPHeader);
 }
 
 void CachedCSSStyleSheet::data(Vector<char>& data, bool allDataReceived)
@@ -95,7 +90,8 @@ void CachedCSSStyleSheet::data(Vector<char>& data, bool allDataReceived)
         return;
 
     setSize(data.size());
-    m_sheet = String(m_encoding.toUnicode(data.data(), size()));
+    m_sheet = m_decoder->decode(data.data(), size());
+    m_sheet += m_decoder->flush();
     m_loading = false;
     checkNotify();
 }
index 4dfdcc4f3d2e296221549e1c4ce945e209cef96d..8dad8ae797d8c84bd71099356eeda7bf24732f6a 100644 (file)
@@ -35,6 +35,7 @@
 namespace WebCore
 {
     class DocLoader;
+    class Decoder;
 
     class CachedCSSStyleSheet : public CachedResource
     {
@@ -58,7 +59,7 @@ namespace WebCore
 
     protected:
         String m_sheet;
-        TextEncoding m_encoding;
+        RefPtr<Decoder> m_decoder;
     };
 
 }
index 4c18d0f3d8b591494a4f1669eb7d6c6ffbc90710..3b7e3022e61d915d81b7a0db8ebc7483cd2bc55c 100644 (file)
@@ -49,7 +49,7 @@ CachedXBLDocument::CachedXBLDocument(DocLoader* dl, const String &url, CachePoli
     // Load the file
     Cache::loader()->load(dl, this, false);
     m_loading = true;
-    m_decoder = new Decoder;
+    m_decoder = new Decoder("application/xml");
 }
 
 CachedXBLDocument::~CachedXBLDocument()
index a0f9b99034e5ad432c522ceca396981df331b761..058c952f170cbc1acea931e48d4929c6d157ac26 100644 (file)
@@ -42,6 +42,7 @@ namespace WebCore {
 
 CachedXSLStyleSheet::CachedXSLStyleSheet(DocLoader* dl, const String &url, CachePolicy cachePolicy, time_t _expireDate)
     : CachedResource(url, XSLStyleSheet, cachePolicy, _expireDate)
+    , m_decoder(new Decoder("text/xsl"))
 {
     // It's XML we want.
     // FIXME: This should accept more general xml formats */*+xml, image/svg+xml for example.
@@ -50,7 +51,6 @@ CachedXSLStyleSheet::CachedXSLStyleSheet(DocLoader* dl, const String &url, Cache
     // load the file
     Cache::loader()->load(dl, this, false);
     m_loading = true;
-    m_decoder = new Decoder;
 }
 
 void CachedXSLStyleSheet::ref(CachedResourceClient *c)
@@ -82,6 +82,7 @@ void CachedXSLStyleSheet::data(Vector<char>& data, bool allDataReceived)
 
     setSize(data.size());
     m_sheet = String(m_decoder->decode(data.data(), size()));
+    m_sheet += m_decoder->flush();
     m_loading = false;
     checkNotify();
 }
index b73faa45efa95a014af463b720d71a6531463200..2b5944cd712856ca52d374de98b4a6ee2c83e08e 100644 (file)
@@ -3,6 +3,7 @@
 
     Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
     Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc.
+    Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Library General Public
@@ -24,6 +25,7 @@
 #include "config.h"
 #include "Decoder.h"
 
+#include "DOMImplementation.h"
 #include "HTMLNames.h"
 #include "StreamingTextDecoder.h"
 #include "RegularExpression.h"
@@ -248,33 +250,48 @@ breakBreak:
     return (code);
 }
 
-Decoder::Decoder() 
-  : m_encoding(Latin1Encoding)
-  , m_decoder(StreamingTextDecoder::create(m_encoding))
-  , enc(0)
+Decoder::Decoder(const String& mimeType, const String& defaultEncodingName)
+  : m_encoding(defaultEncodingName.isNull() ? "iso8859-1" : defaultEncodingName.ascii().data())
+  , m_encodingName(defaultEncodingName.isNull() ? "iso8859-1" : defaultEncodingName.ascii().data())
   , m_type(DefaultEncoding)
-  , body(false)
-  , beginning(true)
+  , m_reachedBody(false)
+  , m_checkedForCSSCharset(false)
+  , m_checkedForBOM(false)
 {
+    if (mimeType == "text/css")
+        m_contentType = CSS;
+    else if (mimeType == "text/html")
+        m_contentType = HTML;
+    else if (DOMImplementation::isXMLMIMEType(mimeType)) {
+        m_contentType = XML;
+        // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we do not assume us-ascii 
+        // for text/xml, to match Firefox.
+        m_encoding = TextEncoding(UTF8Encoding);
+        m_encodingName = "UTF-8";
+    } else
+        m_contentType = PlainText;
+
+    m_decoder.set(StreamingTextDecoder::create(m_encoding));
 }
 
 Decoder::~Decoder()
 {
 }
 
-void Decoder::setEncodingName(const char* _encoding, EncodingSource type)
+void Decoder::setEncodingName(const char* encodingName, EncodingSource type)
 {
-    enc = _encoding;
-    enc = enc.lower();
+    m_encodingName = encodingName;
+    m_encodingName = m_encodingName.lower();
 
-    if (enc.isEmpty())
+    if (m_encodingName.isEmpty())
         return;
 
-    TextEncoding encoding = TextEncoding(enc, type == EncodingFromMetaTag || type == EncodingFromXMLHeader);
+    bool eightBitOnly = type == EncodingFromMetaTag || type == EncodingFromXMLHeader || type == EncodingFromCSSCharset;
+    TextEncoding encoding = TextEncoding(m_encodingName, eightBitOnly);
 
     // in case the encoding didn't exist, we keep the old one (fixes some sites specifying invalid encodings)
     if (encoding.isValid()) {
-        enc = encoding.name();
+        m_encodingName = encoding.name();
         m_encoding = encoding;
         m_type = type;
         m_decoder.set(StreamingTextDecoder::create(m_encoding));
@@ -283,7 +300,7 @@ void Decoder::setEncodingName(const char* _encoding, EncodingSource type)
 
 const char* Decoder::encodingName() const
 {
-    return enc;
+    return m_encodingName;
 }
 
 // Other browsers allow comments in the head section, so we need to also.
@@ -355,19 +372,27 @@ static int findXMLEncoding(const DeprecatedCString &str, int &encodingLength)
     return pos;
 }
 
+// true if there is more to parse
+static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
+{
+    while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
+        ++pos;
+    return pos != dataEnd;
+}
+
 DeprecatedString Decoder::decode(const char *data, int len)
 {
     // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
-    int bufferLength = buffer.length();
+    int bufferLength = m_buffer.length();
     const int maximumBOMLength = 3;
-    if (beginning && bufferLength + len >= maximumBOMLength) {
+    if (!m_checkedForBOM && bufferLength + len >= maximumBOMLength) {
         if (m_type != UserChosenEncoding) {
             // Extract the first three bytes.
             // Handle the case where some of bytes are already in the buffer.
             // The last byte is always guaranteed to not be in the buffer.
             const unsigned char *udata = (const unsigned char *)data;
-            unsigned char c1 = bufferLength >= 1 ? buffer[0].unicode() : *udata++;
-            unsigned char c2 = bufferLength >= 2 ? buffer[1].unicode() : *udata++;
+            unsigned char c1 = bufferLength >= 1 ? m_buffer[0].unicode() : *udata++;
+            unsigned char c2 = bufferLength >= 2 ? m_buffer[1].unicode() : *udata++;
             ASSERT(bufferLength < 3);
             unsigned char c3 = *udata;
 
@@ -386,194 +411,239 @@ DeprecatedString Decoder::decode(const char *data, int len)
                 m_type = AutoDetectedEncoding;
                 m_encoding = TextEncoding(autoDetectedEncoding);
                 ASSERT(m_encoding.isValid());
-                enc = m_encoding.name();
+                m_encodingName = m_encoding.name();
                 m_decoder.set(StreamingTextDecoder::create(m_encoding));
             }
         }
-        beginning = false;
+        m_checkedForBOM = true;
     }
     
-    // this is not completely efficient, since the function might go
-    // through the html head several times...
+    bool currentChunkInBuffer = false;
+    
+    if (m_type == DefaultEncoding && m_contentType == CSS && !m_checkedForCSSCharset) {
+        m_buffer.append(data, len);
+        currentChunkInBuffer = true;
+
+        if (len > 8) { // strlen("@charset") == 8
+            const char* dataStart = m_buffer.latin1();
+            const char* dataEnd = dataStart + m_buffer.length();
+
+            if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' && 
+                dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
+        
+                dataStart += 8;
+                const char* pos = dataStart;
+                if (!skipWhitespace(pos, dataEnd))
+                    return DeprecatedString::null;
+
+                if (*pos == '"' || *pos == '\'') {
+                    char quotationMark = *pos;
+                    ++pos;
+                    dataStart = pos;
+                
+                    while (pos < dataEnd && *pos != quotationMark)
+                        ++pos;
+                    if (pos == dataEnd)
+                        return DeprecatedString::null;
+
+                    DeprecatedCString encodingName(dataStart, pos - dataStart + 1);
+                    
+                    ++pos;
+                    if (!skipWhitespace(pos, dataEnd))
+                        return DeprecatedString::null;
+
+                    if (*pos == ';')
+                        setEncodingName(encodingName, EncodingFromCSSCharset);
+                }
+            }
+            m_checkedForCSSCharset = true;
+        }
+        return DeprecatedString::null;
 
-    bool lookForMetaTag = m_type == DefaultEncoding && !body;
+    } else if (m_type == DefaultEncoding && m_contentType != PlainText && !m_reachedBody) { // HTML and XML
+        // this is not completely efficient, since the function might go
+        // through the html head several times...
     
-    if (lookForMetaTag) {
-#ifdef DECODE_DEBUG
-        kdDebug(6005) << "looking for charset definition" << endl;
-#endif
-        { // extra level of braces to keep indenting matching original for better diff'ing
-            buffer.append(data, len);
-            // we still don't have an encoding, and are in the head
-            // the following tags are allowed in <head>:
-            // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
-            
-            // We stop scanning when a tag that is not permitted in <head>
-            // is seen, rather when </head> is seen, because that more closely
-            // matches behavior in other browsers; more details in
-            // <http://bugzilla.opendarwin.org/show_bug.cgi?id=3590>.
-            
-            // Additionally, we ignore things that looks like tags in <title>; see
-            // <http://bugzilla.opendarwin.org/show_bug.cgi?id=4560>.
-            
-            bool withinTitle = false;
-
-            const char *ptr = buffer.latin1();
-            const char *pEnd = ptr + buffer.length();
-            while(ptr != pEnd)
-            {
-                if(*ptr == '<') {
-                    bool end = false;
-                    ptr++;
+        m_buffer.append(data, len);
+        currentChunkInBuffer = true;
+        
+        // we still don't have an encoding, and are in the head
+        // the following tags are allowed in <head>:
+        // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
+        
+        // We stop scanning when a tag that is not permitted in <head>
+        // is seen, rather than when </head> is seen, because that more closely
+        // matches behavior in other browsers; more details in
+        // <http://bugzilla.opendarwin.org/show_bug.cgi?id=3590>.
+        
+        // Additionally, we ignore things that look like tags in <title>; see
+        // <http://bugzilla.opendarwin.org/show_bug.cgi?id=4560>.
+        
+        bool withinTitle = false;
+
+        const char *ptr = m_buffer.latin1();
+        const char *pEnd = ptr + m_buffer.length();
+        while (ptr != pEnd) {
+            if (*ptr == '<') {
+                bool end = false;
+                ptr++;
+
+                // Handle comments.
+                if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
+                    ptr += 3;
+                    skipComment(ptr, pEnd);
+                    continue;
+                }
+                
+                // Handle XML declaration, which can have encoding in it.
+                // This encoding is honored even for HTML documents.
+                if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
+                    const char *end = ptr;
+                    while (*end != '>' && *end != '\0')
+                        end++;
+                    if (*end == '\0')
+                        break;
+                    DeprecatedCString str(ptr, end - ptr);
+                    int len;
+                    int pos = findXMLEncoding(str, len);
+                    if (pos != -1)
+                        setEncodingName(str.mid(pos, len), EncodingFromXMLHeader);
+                    // continue looking for a charset - it may be specified in an HTTP-Equiv meta
+                } else if (ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x' && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') {
+                    // UTF-16 without BOM
+                    setEncodingName(((ptr - m_buffer.latin1()) % 2) ? "UTF-16LE" : "UTF-16BE", AutoDetectedEncoding);
+                    goto found;
+                }
+                
+                // the HTTP-EQUIV meta has no effect on XHTML
+                if (m_contentType == XML)
+                    goto found;
+
+                if (*ptr == '/') {
+                    ++ptr;
+                    end=true;
+                }
 
-                    // Handle comments.
-                    if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
-                        ptr += 3;
-                        skipComment(ptr, pEnd);
-                        continue;
-                    }
-                    
-                    // Handle XML header, which can have encoding in it.
-                    if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
-                        const char *end = ptr;
-                        while (*end != '>' && *end != '\0') end++;
-                        if (*end == '\0')
+                char tmp[20];
+                int len = 0;
+                while (
+                    ((*ptr >= 'a') && (*ptr <= 'z') ||
+                     (*ptr >= 'A') && (*ptr <= 'Z') ||
+                     (*ptr >= '0') && (*ptr <= '9'))
+                    && len < 19 )
+                {
+                    tmp[len] = tolower(*ptr);
+                    ptr++;
+                    len++;
+                }
+                tmp[len] = 0;
+                AtomicString tag(tmp);
+                
+                if (tag == titleTag)
+                    withinTitle = !end;
+                
+                if (!end && tag == metaTag) {
+                    const char* end = ptr;
+                    while (*end != '>' && *end != '\0')
+                        end++;
+                    if (*end == '\0')
+                        break;
+                    DeprecatedCString str(ptr, (end-ptr)+1);
+                    str = str.lower();
+                    int pos = 0;
+                    while (pos < (int)str.length()) {
+                        if ((pos = str.find("charset", pos, false)) == -1)
+                            break;
+                        pos += 7;
+                        // skip whitespace..
+                        while (pos < (int)str.length() && str[pos] <= ' ')
+                            pos++;
+                        if (pos == (int)str.length())
+                            break;
+                        if (str[pos++] != '=')
+                            continue;
+                        while (pos < (int)str.length() &&
+                                (str[pos] <= ' ') || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
+                            pos++;
+
+                        // end ?
+                        if (pos == (int)str.length())
+                            break;
+                        unsigned endpos = pos;
+                        while (endpos < str.length() &&
+                               str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' &&
+                               str[endpos] != ';' && str[endpos] != '>')
+                            endpos++;
+                        setEncodingName(str.mid(pos, endpos-pos), EncodingFromMetaTag);
+                        if (m_type == EncodingFromMetaTag)
+                            goto found;
+
+                        if (endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>')
                             break;
-                        DeprecatedCString str(ptr, end - ptr);
-                        int len;
-                        int pos = findXMLEncoding(str, len);
-                        if (pos != -1)
-                            setEncodingName(str.mid(pos, len), EncodingFromXMLHeader);
-                        if (m_type != EncodingFromXMLHeader)
-                            setEncodingName("UTF-8", EncodingFromXMLHeader);
-                        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
-                    } else if (ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x' && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') {
-                        // UTF-16 without BOM
-                        setEncodingName(((ptr - buffer.latin1()) % 2) ? "UTF-16LE" : "UTF-16BE", AutoDetectedEncoding);
-                        goto found;
-                    }
 
-                    if(*ptr == '/') ptr++, end=true;
-                    char tmp[20];
-                    int len = 0;
-                    while (
-                        ((*ptr >= 'a') && (*ptr <= 'z') ||
-                         (*ptr >= 'A') && (*ptr <= 'Z') ||
-                         (*ptr >= '0') && (*ptr <= '9'))
-                        && len < 19 )
-                    {
-                        tmp[len] = tolower( *ptr );
-                        ptr++;
-                        len++;
-                    }
-                    tmp[len] = 0;
-                    AtomicString tag(tmp);
-                    
-                    if (tag == titleTag)
-                        withinTitle = !end;
-                    
-                    if (!end && tag == metaTag) {
-                        const char * end = ptr;
-                        while(*end != '>' && *end != '\0') end++;
-                        if ( *end == '\0' ) break;
-                        DeprecatedCString str( ptr, (end-ptr)+1);
-                        str = str.lower();
-                        int pos = 0;
-                        while( pos < ( int ) str.length() ) {
-                            if( (pos = str.find("charset", pos, false)) == -1) break;
-                            pos += 7;
-                            // skip whitespace..
-                            while(  pos < (int)str.length() && str[pos] <= ' ' ) pos++;
-                            if ( pos == ( int )str.length()) break;
-                            if ( str[pos++] != '=' ) continue;
-                            while ( pos < ( int )str.length() &&
-                                    ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
-                                pos++;
-
-                            // end ?
-                            if ( pos == ( int )str.length() ) break;
-                            unsigned endpos = pos;
-                            while( endpos < str.length() &&
-                                   (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
-                                    && str[endpos] != ';' && str[endpos] != '>') )
-                                endpos++;
-                            setEncodingName(str.mid(pos, endpos-pos), EncodingFromMetaTag);
-                            if( m_type == EncodingFromMetaTag ) goto found;
-
-                            if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;
-
-                            pos = endpos + 1;
-                        }
-                    } else if (tag != scriptTag && tag != noscriptTag && tag != styleTag &&
-                               tag != linkTag && tag != metaTag && tag != objectTag &&
-                               tag != titleTag && tag != baseTag && 
-                               (end || tag != htmlTag) && !withinTitle &&
-                               (tag != headTag) && isalpha(tmp[0])) {
-                        body = true;
-                        goto found;
+                        pos = endpos + 1;
                     }
+                } else if (tag != scriptTag && tag != noscriptTag && tag != styleTag &&
+                           tag != linkTag && tag != metaTag && tag != objectTag &&
+                           tag != titleTag && tag != baseTag && 
+                           (end || tag != htmlTag) && !withinTitle &&
+                           (tag != headTag) && isalpha(tmp[0])) {
+                    m_reachedBody = true;
+                    goto found;
                 }
-                else
-                    ptr++;
             }
-            return DeprecatedString::null;
+            else
+                ptr++;
         }
+        return DeprecatedString::null;
     }
 
  found:
     // Do the auto-detect if our default encoding is one of the Japanese ones.
-    if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_encoding.isJapanese())
-    {
+    if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_encoding.isJapanese()) {
         const char *autoDetectedEncoding;
         switch (KanjiCode::judge(data, len)) {
-        case KanjiCode::JIS:
-            autoDetectedEncoding = "jis7";
-            break;
-        case KanjiCode::EUC:
-            autoDetectedEncoding = "eucjp";
-            break;
-        case KanjiCode::SJIS:
-            autoDetectedEncoding = "sjis";
-            break;
-        default:
-            autoDetectedEncoding = NULL;
-            break;
+            case KanjiCode::JIS:
+                autoDetectedEncoding = "jis7";
+                break;
+            case KanjiCode::EUC:
+                autoDetectedEncoding = "eucjp";
+                break;
+            case KanjiCode::SJIS:
+                autoDetectedEncoding = "sjis";
+                break;
+            default:
+                autoDetectedEncoding = NULL;
+                break;
         }
-        if (autoDetectedEncoding != 0) {
+        if (autoDetectedEncoding)
             setEncodingName(autoDetectedEncoding, AutoDetectedEncoding);
-        }
     }
 
-    // if we still haven't found an encoding, assume latin1
-    if (!m_encoding.isValid())
-    {
-        if (enc.isEmpty()) 
-            enc = "iso8859-1";
-        m_encoding = TextEncoding(enc);
-        // be sure not to crash
-        if (!m_encoding.isValid()) {
-            enc = "iso8859-1";
-            m_encoding = TextEncoding(Latin1Encoding);
-        }
-        m_decoder.set(StreamingTextDecoder::create(m_encoding));
+    // If we still haven't found an encoding, assume latin1
+    // (this can happen if an empty name is passed from outside).
+    if (m_encodingName.isEmpty() || !m_encoding.isValid()) {
+        m_encodingName = "iso8859-1";
+        m_encoding = TextEncoding(Latin1Encoding);
     }
+    m_decoder.set(StreamingTextDecoder::create(m_encoding));
+
     DeprecatedString out;
 
-    if (!buffer.isEmpty()) {
-        if (!lookForMetaTag)
-            buffer.append(data, len);
-        out = m_decoder->toUnicode(buffer.latin1(), buffer.length());
-        buffer.truncate(0);
-    } else {
+    if (!m_buffer.isEmpty()) {
+        if (!currentChunkInBuffer)
+            m_buffer.append(data, len);
+        out = m_decoder->toUnicode(m_buffer.latin1(), m_buffer.length());
+        m_buffer.truncate(0);
+    } else
         out = m_decoder->toUnicode(data, len);
-    }
 
     return out;
 }
 
 DeprecatedString Decoder::flush() const
 {
-    return m_decoder->toUnicode(buffer.latin1(), buffer.length(), true);
+    return m_decoder->toUnicode(m_buffer.latin1(), m_buffer.length(), true);
 }
 
 // -----------------------------------------------------------------------------
index 270fc3ec3f0aebae79892f8f0595f8b3440ee446..7357eb043acad488e7b69a41a0bc9681c306f391 100644 (file)
@@ -24,6 +24,7 @@
 
 #include <wtf/OwnPtr.h>
 #include "TextEncoding.h"
+#include "PlatformString.h"
 
 namespace WebCore {
 
@@ -40,11 +41,12 @@ public:
         AutoDetectedEncoding,
         EncodingFromXMLHeader,
         EncodingFromMetaTag,
+        EncodingFromCSSCharset,
         EncodingFromHTTPHeader,
         UserChosenEncoding
     };
     
-    Decoder();
+    Decoder(const String& mimeType, const String& defaultEncodingName = String());
     ~Decoder();
 
     void setEncodingName(const char* encoding, EncodingSource type);
@@ -56,19 +58,28 @@ public:
     DeprecatedString decode(const char* data, int len);
     DeprecatedString flush() const;
 
-protected:
+private:
+    enum ContentType {
+        HTML,
+        XML,
+        CSS,
+        PlainText // Do not look inside the document (equivalent to directly using StreamingTextDecoder)
+    };
+
     // encoding used for decoding. default is Latin1.
     TextEncoding m_encoding;
+    ContentType m_contentType;
     OwnPtr<StreamingTextDecoder> m_decoder;
-    DeprecatedCString enc;
+    DeprecatedCString m_encodingName;
     EncodingSource m_type;
 
     // Our version of DeprecatedString works well for all-8-bit characters, and allows null characters.
     // This works better than DeprecatedCString when there are null characters involved.
-    DeprecatedString buffer;
+    DeprecatedString m_buffer;
 
-    bool body;
-    bool beginning;
+    bool m_reachedBody;
+    bool m_checkedForCSSCharset;
+    bool m_checkedForBOM;
 };
 
 }
index 89a8c4de965f0c431af11b4ddd66421ed6a4a005..8cd155981434cbe9f2e95070de840f94c45498b4 100644 (file)
@@ -643,12 +643,10 @@ void Frame::write(const char* str, int len)
     }
     
     if (!d->m_decoder) {
-        d->m_decoder = new Decoder;
+        d->m_decoder = new Decoder(d->m_request.m_responseMIMEType, settings()->encoding().latin1());
         if (!d->m_encoding.isNull())
             d->m_decoder->setEncodingName(d->m_encoding.latin1(),
                 d->m_haveEncoding ? Decoder::UserChosenEncoding : Decoder::EncodingFromHTTPHeader);
-        else
-            d->m_decoder->setEncodingName(settings()->encoding().latin1(), Decoder::DefaultEncoding);
 
         if (d->m_doc)
             d->m_doc->setDecoder(d->m_decoder.get());
index 96431cb9a396331d58bd79f504f447411f81ab7e..f7bffd5ea8f062f57e8e35b4ecc5373f2005fb21 100644 (file)
@@ -212,7 +212,7 @@ RefPtr<Document> XSLTProcessor::createDocumentFromSource(const DeprecatedString
     }
     result->determineParseMode(documentSource); // Make sure we parse in the correct mode.
     
-    RefPtr<Decoder> decoder = new Decoder;
+    RefPtr<Decoder> decoder = new Decoder(sourceMIMEType);
     decoder->setEncodingName(sourceEncoding.isEmpty() ? "UTF-8" : sourceEncoding.latin1(), Decoder::EncodingFromXMLHeader);
     result->setDecoder(decoder.get());
     
index 517aa021ae540b9b007b468f811bfff5e38889c0..5cab0dd691df13e47fd5fa670c1234081265e751 100644 (file)
@@ -539,12 +539,13 @@ void XMLHttpRequest::receivedData(ResourceLoader*, const char *data, int len)
         if (m_encoding.isEmpty() && m_job)
             m_encoding = m_job->queryMetaData("charset");
     
-        m_decoder = new Decoder;
         if (!m_encoding.isEmpty())
-            m_decoder->setEncodingName(m_encoding.deprecatedString().latin1(), Decoder::EncodingFromHTTPHeader);
+            m_decoder = new Decoder("text/plain", m_encoding);
+        else if (responseIsXML())
+            // allow Decoder to look inside the m_response if it's XML
+            m_decoder = new Decoder("application/xml");
         else
-            // only allow Decoder to look inside the m_response if it's XML
-            m_decoder->setEncodingName("UTF-8", responseIsXML() ? Decoder::DefaultEncoding : Decoder::EncodingFromHTTPHeader);
+            m_decoder = new Decoder("text/plain", "UTF-8");
     }
     if (len == 0)
         return;