Reviewed by Darin.
authorap <ap@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Sat, 15 Jul 2006 06:53:11 +0000 (06:53 +0000)
committerap <ap@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Sat, 15 Jul 2006 06:53:11 +0000 (06:53 +0000)
        Fix http://bugzilla.opendarwin.org/show_bug.cgi?id=4195
        REGRESSION: KOI8-U encoding no longer supported.

        Tests:
        * http/tests/misc/BOM-override.pl
        * http/tests/misc/BOM-override-script.html
        * fast/encoding/charset-koi8-u.html
        * fast/encoding/charset-x-nextstep.html

        Restored a TEC code path for encodings that are not supported by ICU (this code path
        currently passes all layout tests even by itself, with ICU disabled). A lot of refactoring
        is still needed - most importantly, round-tripping encoding names via CFStringEncoding
        makes little sense now.

        * WebCore.exp:
        * bridge/mac/WebCoreTextDecoder.h: Removed.
        * bridge/mac/WebCoreTextDecoder.mm: Removed.
        WebCoreTextDecoder has not been used anywhere since WebTextView was moved into WebCore.

        * loader/Decoder.cpp:
        (Decoder::Decoder):
        (Decoder::setEncodingName):
        (Decoder::decode):
        Use StreamingTextDecoder::create().

        * platform/StreamingTextDecoder.cpp:
        (WebCore::StreamingTextDecoder::create):
        (WebCore::StreamingTextDecoder::~StreamingTextDecoder):
        * platform/StreamingTextDecoder.h:
        StreamingTextDecoder is just an abstract interface to implementations now.

        * platform/StreamingTextDecoderICU.cpp: Added.
        (WebCore::StreamingTextDecoderICU::StreamingTextDecoderICU):
        (WebCore::StreamingTextDecoderICU::~StreamingTextDecoderICU):
        (WebCore::StreamingTextDecoderICU::releaseICUConverter):
        (WebCore::StreamingTextDecoderICU::textEncodingSupported):
        (WebCore::StreamingTextDecoderICU::convertUTF16):
        (WebCore::StreamingTextDecoderICU::convertIfASCII):
        (WebCore::StreamingTextDecoderICU::createICUConverter):
        (WebCore::StreamingTextDecoderICU::appendOmittingBOM):
        (WebCore::StreamingTextDecoderICU::convertUsingICU):
        (WebCore::StreamingTextDecoderICU::convert):
        (WebCore::StreamingTextDecoderICU::toUnicode):
        (WebCore::StreamingTextDecoderICU::fromUnicode):
        * platform/StreamingTextDecoderICU.h: Added.
        Renamed from StreamingTextDecoder; added a way to tell whether the encoding is actually
        supported by the decoder; minor cleanup.

        * platform/TextEncoding.cpp:
        (WebCore::TextEncoding::effectiveEncoding): Moved from StreamingTextDecoder.
        (WebCore::TextEncoding::toUnicode): Use StreamingTextDecoder::create().
        (WebCore::TextEncoding::fromUnicode): Moved to StreamingTextDecoderICU.

        * platform/TextEncoding.h: Changed __APPLE__ to PLATFORM(MAC); added effectiveEncoding().

        * platform/mac/StreamingTextDecoderMac.cpp: Added.
        (WebCore::StreamingTextDecoderMac::StreamingTextDecoderMac):
        (WebCore::StreamingTextDecoderMac::~StreamingTextDecoderMac):
        (WebCore::StreamingTextDecoderMac::releaseTECConverter):
        (WebCore::StreamingTextDecoderMac::textEncodingSupported):
        (WebCore::StreamingTextDecoderMac::convertUTF16):
        (WebCore::StreamingTextDecoderMac::convertIfASCII):
        (WebCore::StreamingTextDecoderMac::createTECConverter):
        (WebCore::StreamingTextDecoderMac::appendOmittingBOM):
        (WebCore::StreamingTextDecoderMac::convertOneChunkUsingTEC):
        (WebCore::StreamingTextDecoderMac::convertUsingTEC):
        (WebCore::StreamingTextDecoderMac::convert):
        (WebCore::StreamingTextDecoderMac::toUnicode):
        (WebCore::StreamingTextDecoderMac::fromUnicode):
        * platform/mac/StreamingTextDecoderMac.h: Added.
        (WebCore::StreamingTextDecoderMac::convert):
        This is a TEC+CFString code path for decoding, basically restored from a year-old revision.

        * platform/mac/TextEncodingMac.cpp: Removed. Code moved to StreamingTextDecoderMac.

        * WebCore.xcodeproj/project.pbxproj:

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@15449 268f45cc-cd09-0410-ab3c-d52691b4dbfc

26 files changed:
LayoutTests/ChangeLog
LayoutTests/fast/encoding/charset-koi8-u-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/charset-koi8-u.html [new file with mode: 0644]
LayoutTests/fast/encoding/charset-x-nextstep-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/charset-x-nextstep.html [new file with mode: 0644]
LayoutTests/http/tests/misc/BOM-override-expected.txt [new file with mode: 0644]
LayoutTests/http/tests/misc/BOM-override-script-expected.txt [new file with mode: 0644]
LayoutTests/http/tests/misc/BOM-override-script.html [new file with mode: 0644]
LayoutTests/http/tests/misc/BOM-override.pl [new file with mode: 0755]
LayoutTests/http/tests/misc/resources/BOM-override-2.pl [new file with mode: 0755]
LayoutTests/http/tests/misc/resources/BOM-override-3.pl [new file with mode: 0755]
WebCore/ChangeLog
WebCore/WebCore.exp
WebCore/WebCore.xcodeproj/project.pbxproj
WebCore/bridge/mac/WebCoreTextDecoder.h [deleted file]
WebCore/bridge/mac/WebCoreTextDecoder.mm [deleted file]
WebCore/loader/Decoder.cpp
WebCore/platform/StreamingTextDecoder.cpp
WebCore/platform/StreamingTextDecoder.h
WebCore/platform/StreamingTextDecoderICU.cpp [new file with mode: 0644]
WebCore/platform/StreamingTextDecoderICU.h [new file with mode: 0644]
WebCore/platform/TextEncoding.cpp
WebCore/platform/TextEncoding.h
WebCore/platform/mac/StreamingTextDecoderMac.cpp [new file with mode: 0644]
WebCore/platform/mac/StreamingTextDecoderMac.h [new file with mode: 0644]
WebCore/platform/mac/TextEncodingMac.cpp [deleted file]

index 7307277bf3514a6b0520ea33643b2049215440b3..68c57cdb9f26408086a38c6066d03361524fcd11 100644 (file)
@@ -1,3 +1,21 @@
+2006-07-14  Alexey Proskuryakov  <ap@nypop.com>
+
+        Reviewed by Darin.
+
+        Fix http://bugzilla.opendarwin.org/show_bug.cgi?id=4195
+        REGRESSION: KOI8-U encoding no longer supported.
+
+        * fast/encoding/charset-koi8-u-expected.txt: Added.
+        * fast/encoding/charset-koi8-u.html: Added.
+        * fast/encoding/charset-x-nextstep-expected.txt: Added.
+        * fast/encoding/charset-x-nextstep.html: Added.
+        * http/tests/misc/BOM-override-expected.txt: Added.
+        * http/tests/misc/BOM-override-script-expected.txt: Added.
+        * http/tests/misc/BOM-override-script.html: Added.
+        * http/tests/misc/BOM-override.pl: Added.
+        * http/tests/misc/resources/BOM-override-2.pl: Added.
+        * http/tests/misc/resources/BOM-override-3.pl: Added.
+
 2006-07-14  Timothy Hatcher  <timothy@apple.com>
  
         Rolling out this fix from r15358 since it isn't resolved.
diff --git a/LayoutTests/fast/encoding/charset-koi8-u-expected.txt b/LayoutTests/fast/encoding/charset-koi8-u-expected.txt
new file mode 100644 (file)
index 0000000..7e0ff00
--- /dev/null
@@ -0,0 +1,5 @@
+Test for bug 4195 - REGRESSION: KOI8-U encoding no longer supported.
+
+The test passes if these two letters look inverted along the vertical axis: "ЭЄ"
+
+
diff --git a/LayoutTests/fast/encoding/charset-koi8-u.html b/LayoutTests/fast/encoding/charset-koi8-u.html
new file mode 100644 (file)
index 0000000..61f8c63
--- /dev/null
@@ -0,0 +1,15 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=KOI8-U">
+<title>KOI8-U</title>
+</head>
+<body>
+<script>
+if (window.layoutTestController)
+       layoutTestController.dumpAsText();
+</script>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=4195"> bug 4195</a> - 
+REGRESSION: KOI8-U encoding no longer supported.</p>
+<p>The test passes if these two letters look inverted along the vertical axis: "ü´"</p>
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/charset-x-nextstep-expected.txt b/LayoutTests/fast/encoding/charset-x-nextstep-expected.txt
new file mode 100644 (file)
index 0000000..8d41b8a
--- /dev/null
@@ -0,0 +1,5 @@
+Test for bug 4195 - REGRESSION: KOI8-U encoding no longer supported.
+
+Testing for an encoding that's less likely to be in your version of ICU than KOI8-U. These two characters should look the same: "ÆÆ".
+
+
diff --git a/LayoutTests/fast/encoding/charset-x-nextstep.html b/LayoutTests/fast/encoding/charset-x-nextstep.html
new file mode 100644 (file)
index 0000000..7dd2250
--- /dev/null
@@ -0,0 +1,15 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=x-nextstep">
+<title>x-nextstep</title>
+</head>
+<body>
+<script>
+if (window.layoutTestController)
+       layoutTestController.dumpAsText();
+</script>
+<p>Test for <a href="http://bugzilla.opendarwin.org/show_bug.cgi?id=4195"> bug 4195</a> - 
+REGRESSION: KOI8-U encoding no longer supported.</p>
+<p>Testing for an encoding that's less likely to be in your version of ICU than KOI8-U. These two characters should look the same: "á&AElig;".</p>
+</body>
+</html>
diff --git a/LayoutTests/http/tests/misc/BOM-override-expected.txt b/LayoutTests/http/tests/misc/BOM-override-expected.txt
new file mode 100644 (file)
index 0000000..85bb6a7
--- /dev/null
@@ -0,0 +1 @@
+SUССESS
diff --git a/LayoutTests/http/tests/misc/BOM-override-script-expected.txt b/LayoutTests/http/tests/misc/BOM-override-script-expected.txt
new file mode 100644 (file)
index 0000000..b1c6ce9
--- /dev/null
@@ -0,0 +1,9 @@
+Test that BOM can override whatever charset was set in Content-Type (this is not the behavior of Firefox, nor expected by any standard).
+
+Should say SUCCESS twice
+
+SUССESS
+
+SUССESS
+
+
diff --git a/LayoutTests/http/tests/misc/BOM-override-script.html b/LayoutTests/http/tests/misc/BOM-override-script.html
new file mode 100644 (file)
index 0000000..41e76f8
--- /dev/null
@@ -0,0 +1,10 @@
+<script>
+  if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+</script>
+<p>Test that BOM can override whatever charset was set in Content-Type
+(this is not the behavior of Firefox, nor expected by any standard).</p>
+<p>Should say SUCCESS twice</p>
+
+<script src="resources/BOM-override-2.pl"></script>
+<script src="resources/BOM-override-3.pl"></script>
diff --git a/LayoutTests/http/tests/misc/BOM-override.pl b/LayoutTests/http/tests/misc/BOM-override.pl
new file mode 100755 (executable)
index 0000000..6157540
--- /dev/null
@@ -0,0 +1,10 @@
+#!/usr/bin/perl
+# Test that BOM can override whatever charset was set in Content-Type
+# (this is not the behavior of Firefox, nor expected by any standard).
+
+print "Content-type: text/html;charset=x-mac-hebrew\r\n";
+print "\r\n";
+
+print "\xef\xbb\xbf";
+print "SUССESS";
+print "<script>if (window.layoutTestController) layoutTestController.dumpAsText();</script>";
diff --git a/LayoutTests/http/tests/misc/resources/BOM-override-2.pl b/LayoutTests/http/tests/misc/resources/BOM-override-2.pl
new file mode 100755 (executable)
index 0000000..f72171c
--- /dev/null
@@ -0,0 +1,9 @@
+#!/usr/bin/perl
+# Test that BOM can override whatever charset was set in Content-Type
+# (this is not the behavior of Firefox, nor expected by any standard).
+
+print "Content-type: text/javascript;charset=x-mac-hebrew\r\n";
+print "\r\n";
+
+print "\xef\xbb\xbf";
+print "document.write('<p>SUССESS</p>');";
diff --git a/LayoutTests/http/tests/misc/resources/BOM-override-3.pl b/LayoutTests/http/tests/misc/resources/BOM-override-3.pl
new file mode 100755 (executable)
index 0000000..e5754de
--- /dev/null
@@ -0,0 +1,9 @@
+#!/usr/bin/perl
+# Test that BOM can override whatever charset was set in Content-Type
+# (this is not the behavior of Firefox, nor expected by any standard).
+
+print "Content-type: text/javascript;charset=iso-8859-1\r\n";
+print "\r\n";
+
+print "\xef\xbb\xbf";
+print "document.write('<p>SUССESS</p>');";
index 8cd340f154ebb832772d53d1c1997d7a90426047..33a5a412d5a80097ef1dabca58178d5dd9ab6e34 100644 (file)
@@ -1,3 +1,84 @@
+2006-07-14  Alexey Proskuryakov  <ap@nypop.com>
+
+        Reviewed by Darin.
+
+        Fix http://bugzilla.opendarwin.org/show_bug.cgi?id=4195
+        REGRESSION: KOI8-U encoding no longer supported.
+
+        Tests:
+        * http/tests/misc/BOM-override.pl
+        * http/tests/misc/BOM-override-script.html
+        * fast/encoding/charset-koi8-u.html
+        * fast/encoding/charset-x-nextstep.html
+
+        Restored a TEC code path for encodings that are not supported by ICU (but which currently
+        passes all layout tests even by itself with ICU disabled). A lot of refactoring is 
+        still needed - most importantly, round-tripping encoding names via CFStringEncoding 
+        makes little sense now.
+
+        * WebCore.exp:
+        * bridge/mac/WebCoreTextDecoder.h: Removed.
+        * bridge/mac/WebCoreTextDecoder.mm: Removed.
+        WebCoreTextDecoder was not used anywhere since WebTextView was moved into WebCore.
+
+        * loader/Decoder.cpp:
+        (Decoder::Decoder):
+        (Decoder::setEncodingName):
+        (Decoder::decode):
+        Use StreamingTextDecoder::create().
+
+        * platform/StreamingTextDecoder.cpp:
+        (WebCore::StreamingTextDecoder::create):
+        (WebCore::StreamingTextDecoder::~StreamingTextDecoder):
+        * platform/StreamingTextDecoder.h:
+        StreamingTextDecoder is just an abstract interface to implementations now.
+
+        * platform/StreamingTextDecoderICU.cpp: Added.
+        (WebCore::StreamingTextDecoderICU::StreamingTextDecoderICU):
+        (WebCore::StreamingTextDecoderICU::~StreamingTextDecoderICU):
+        (WebCore::StreamingTextDecoderICU::releaseICUConverter):
+        (WebCore::StreamingTextDecoderICU::textEncodingSupported):
+        (WebCore::StreamingTextDecoderICU::convertUTF16):
+        (WebCore::StreamingTextDecoderICU::convertIfASCII):
+        (WebCore::StreamingTextDecoderICU::createICUConverter):
+        (WebCore::StreamingTextDecoderICU::appendOmittingBOM):
+        (WebCore::StreamingTextDecoderICU::convertUsingICU):
+        (WebCore::StreamingTextDecoderICU::convert):
+        (WebCore::StreamingTextDecoderICU::toUnicode):
+        (WebCore::StreamingTextDecoderICU::fromUnicode):
+        * platform/StreamingTextDecoderICU.h: Added.
+        Renamed from StreamingTextDecoder; added a way to tell whether the encoding is actually
+        supported by the decoder; minor cleanup.
+
+        * platform/TextEncoding.cpp:
+        (WebCore::TextEncoding::effectiveEncoding): Moved from StreamingTextDecoder.
+        (WebCore::TextEncoding::toUnicode): Use StreamingTextDecoder::create().
+        (WebCore::TextEncoding::fromUnicode): Moved to StreamingTextDecoderICU.
+        
+        * platform/TextEncoding.h: Changed __APPLE__ to PLATFORM(MAC); added effectiveEncoding().
+
+        * platform/mac/StreamingTextDecoderMac.cpp: Added.
+        (WebCore::StreamingTextDecoderMac::StreamingTextDecoderMac):
+        (WebCore::StreamingTextDecoderMac::~StreamingTextDecoderMac):
+        (WebCore::StreamingTextDecoderMac::releaseTECConverter):
+        (WebCore::StreamingTextDecoderMac::textEncodingSupported):
+        (WebCore::StreamingTextDecoderMac::convertUTF16):
+        (WebCore::StreamingTextDecoderMac::convertIfASCII):
+        (WebCore::StreamingTextDecoderMac::createTECConverter):
+        (WebCore::StreamingTextDecoderMac::appendOmittingBOM):
+        (WebCore::StreamingTextDecoderMac::convertOneChunkUsingTEC):
+        (WebCore::StreamingTextDecoderMac::convertUsingTEC):
+        (WebCore::StreamingTextDecoderMac::convert):
+        (WebCore::StreamingTextDecoderMac::toUnicode):
+        (WebCore::StreamingTextDecoderMac::fromUnicode):
+        * platform/mac/StreamingTextDecoderMac.h: Added.
+        (WebCore::StreamingTextDecoderMac::convert):
+        This is a TEC+CFString code path for decoding, basically restored from a year-old revision.
+
+        * platform/mac/TextEncodingMac.cpp: Removed. Code moved to StreamingTextDecoderMac.
+
+        * WebCore.xcodeproj/project.pbxproj:
+
 === Safari-521.17 ===
 
 2006-07-14  Timothy Hatcher  <timothy@apple.com>
index 8d7717564b165d0f61e116513dcf069dec63aef3..05e432814ba24bbca315c8022d4f33dce62587bc 100644 (file)
 .objc_class_name_WebCoreScriptDebugger
 .objc_class_name_WebCoreSettings
 .objc_class_name_WebCoreStringTruncator
-.objc_class_name_WebCoreTextDecoder
 .objc_class_name_WebCoreViewFactory
 .objc_class_name_WebDashboardRegion
 _WebCoreDrawTextAtPoint
index dcb9de8e664a0fe72c95fc1998d003e1d1020a68..7401139de4670a56b2c2c753f01dfd0a92ae9087 100644 (file)
                6565821309D15111000E61D7 /* ksvgcssvalues.h in Headers */ = {isa = PBXBuildFile; fileRef = 6565820F09D15111000E61D7 /* ksvgcssvalues.h */; };
                65743B52097076F8001E7CEF /* RenderSVGText.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 65743B50097076F8001E7CEF /* RenderSVGText.cpp */; };
                65743B53097076F8001E7CEF /* RenderSVGText.h in Headers */ = {isa = PBXBuildFile; fileRef = 65743B51097076F8001E7CEF /* RenderSVGText.h */; };
-               6576F9D609B2484A000041F7 /* TextEncodingMac.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6576F9D509B2484A000041F7 /* TextEncodingMac.cpp */; };
                657BD74D09AFDC54005A2056 /* StreamingTextDecoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 657BD74909AFDC54005A2056 /* StreamingTextDecoder.cpp */; };
                657BD74E09AFDC54005A2056 /* StreamingTextDecoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 657BD74A09AFDC54005A2056 /* StreamingTextDecoder.h */; };
                657BD74F09AFDC54005A2056 /* TextEncoding.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 657BD74B09AFDC54005A2056 /* TextEncoding.cpp */; };
                DD763BB20992C2C900740B8E /* libxml2.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = DD763BB10992C2C900740B8E /* libxml2.dylib */; };
                DD7CDF250A23CF9800069928 /* CSSUnknownRule.h in Headers */ = {isa = PBXBuildFile; fileRef = A80E6CCE0A1989CA007FB8C5 /* CSSUnknownRule.h */; };
                E1052C320A4D70010072D99B /* DOMEventsNonstandard.mm in Sources */ = {isa = PBXBuildFile; fileRef = E1052C310A4D70010072D99B /* DOMEventsNonstandard.mm */; };
-               E1EE773708F1086C00166870 /* WebCoreTextDecoder.h in Headers */ = {isa = PBXBuildFile; fileRef = E1EE773508F1086C00166870 /* WebCoreTextDecoder.h */; settings = {ATTRIBUTES = (Private, ); }; };
-               E1EE773808F1086C00166870 /* WebCoreTextDecoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = E1EE773608F1086C00166870 /* WebCoreTextDecoder.mm */; };
+               E14842DE0A674934007E4D39 /* StreamingTextDecoderICU.h in Headers */ = {isa = PBXBuildFile; fileRef = E14842DD0A674934007E4D39 /* StreamingTextDecoderICU.h */; };
+               E14842FF0A674A31007E4D39 /* StreamingTextDecoderICU.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E14842FE0A674A31007E4D39 /* StreamingTextDecoderICU.cpp */; };
+               E148432F0A674FC2007E4D39 /* StreamingTextDecoderMac.h in Headers */ = {isa = PBXBuildFile; fileRef = E148432E0A674FC2007E4D39 /* StreamingTextDecoderMac.h */; };
+               E14843D60A6754A6007E4D39 /* StreamingTextDecoderMac.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E14843910A6752BF007E4D39 /* StreamingTextDecoderMac.cpp */; };
                E1F0424609839389006694EA /* xmlhttprequest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E1F0424409839389006694EA /* xmlhttprequest.cpp */; };
                E1F0424709839389006694EA /* xmlhttprequest.h in Headers */ = {isa = PBXBuildFile; fileRef = E1F0424509839389006694EA /* xmlhttprequest.h */; };
                ED048ABC0833F132006E1E67 /* textAreaResizeCorner.tiff in Resources */ = {isa = PBXBuildFile; fileRef = ED048ABB0833F132006E1E67 /* textAreaResizeCorner.tiff */; };
                6565821009D15111000E61D7 /* ksvgcssvalues.in */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = ksvgcssvalues.in; sourceTree = "<group>"; };
                65743B50097076F8001E7CEF /* RenderSVGText.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = RenderSVGText.cpp; sourceTree = "<group>"; };
                65743B51097076F8001E7CEF /* RenderSVGText.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = RenderSVGText.h; sourceTree = "<group>"; };
-               6576F9D509B2484A000041F7 /* TextEncodingMac.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = TextEncodingMac.cpp; sourceTree = "<group>"; };
-               657BD74909AFDC54005A2056 /* StreamingTextDecoder.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = StreamingTextDecoder.cpp; sourceTree = "<group>"; };
+               657BD74909AFDC54005A2056 /* StreamingTextDecoder.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = StreamingTextDecoder.cpp; sourceTree = "<group>"; };
                657BD74A09AFDC54005A2056 /* StreamingTextDecoder.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = StreamingTextDecoder.h; sourceTree = "<group>"; };
                657BD74B09AFDC54005A2056 /* TextEncoding.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = TextEncoding.cpp; sourceTree = "<group>"; };
                657BD74C09AFDC54005A2056 /* TextEncoding.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = TextEncoding.h; sourceTree = "<group>"; };
                DB23C2CA0A508D29002489EB /* IndentOutdentCommand.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = IndentOutdentCommand.h; sourceTree = "<group>"; };
                DD763BB10992C2C900740B8E /* libxml2.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libxml2.dylib; path = /usr/lib/libxml2.dylib; sourceTree = "<absolute>"; };
                E1052C310A4D70010072D99B /* DOMEventsNonstandard.mm */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.objcpp; path = DOMEventsNonstandard.mm; sourceTree = "<group>"; };
-               E1EE773508F1086C00166870 /* WebCoreTextDecoder.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = WebCoreTextDecoder.h; sourceTree = "<group>"; tabWidth = 8; usesTabs = 0; };
-               E1EE773608F1086C00166870 /* WebCoreTextDecoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = WebCoreTextDecoder.mm; sourceTree = "<group>"; tabWidth = 8; usesTabs = 0; };
+               E14842DD0A674934007E4D39 /* StreamingTextDecoderICU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = StreamingTextDecoderICU.h; sourceTree = "<group>"; };
+               E14842FE0A674A31007E4D39 /* StreamingTextDecoderICU.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StreamingTextDecoderICU.cpp; sourceTree = "<group>"; };
+               E148432E0A674FC2007E4D39 /* StreamingTextDecoderMac.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; name = StreamingTextDecoderMac.h; path = mac/StreamingTextDecoderMac.h; sourceTree = "<group>"; };
+               E14843910A6752BF007E4D39 /* StreamingTextDecoderMac.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; name = StreamingTextDecoderMac.cpp; path = mac/StreamingTextDecoderMac.cpp; sourceTree = "<group>"; };
                E1F0424409839389006694EA /* xmlhttprequest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = xmlhttprequest.cpp; sourceTree = "<group>"; };
                E1F0424509839389006694EA /* xmlhttprequest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = xmlhttprequest.h; sourceTree = "<group>"; };
                ED048ABB0833F132006E1E67 /* textAreaResizeCorner.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = textAreaResizeCorner.tiff; sourceTree = "<group>"; };
                                6582A15509999D6D00BEEB6D /* SystemTimeMac.cpp */,
                                9352071B09BD3BBB00F2038D /* TextBoundaries.mm */,
                                F587853802DE375901EA4122 /* TextBoxMac.mm */,
-                               6576F9D509B2484A000041F7 /* TextEncodingMac.cpp */,
                                F587851202DE375901EA4122 /* TextFieldMac.mm */,
                                6545F67509B830180013006F /* TransferJobMac.mm */,
                                F5517DC2031AB56301A80180 /* WebCoreHistory.h */,
                                F565AE8602ECA583018635CA /* WebCoreSettings.mm */,
                                65901A4209FC6039005BD752 /* WebCoreStringTruncator.h */,
                                65901A4309FC6039005BD752 /* WebCoreStringTruncator.mm */,
-                               E1EE773508F1086C00166870 /* WebCoreTextDecoder.h */,
-                               E1EE773608F1086C00166870 /* WebCoreTextDecoder.mm */,
                                F587855402DE375901EA4122 /* WebCoreViewFactory.h */,
                                F587855502DE375901EA4122 /* WebCoreViewFactory.m */,
                                5150C2A10702629000AF642C /* WebDashboardRegion.h */,
                                9352071709BD3BA500F2038D /* StaticConstructors.h */,
                                657BD74909AFDC54005A2056 /* StreamingTextDecoder.cpp */,
                                657BD74A09AFDC54005A2056 /* StreamingTextDecoder.h */,
+                               E14842FE0A674A31007E4D39 /* StreamingTextDecoderICU.cpp */,
+                               E14842DD0A674934007E4D39 /* StreamingTextDecoderICU.h */,
+                               E14843910A6752BF007E4D39 /* StreamingTextDecoderMac.cpp */,
+                               E148432E0A674FC2007E4D39 /* StreamingTextDecoderMac.h */,
                                93CD4FDA0995F9EA007ECC97 /* String.cpp */,
                                93126F6009D7A736008D9626 /* StringHash.h */,
                                93CD4FDC0995F9EA007ECC97 /* StringImpl.cpp */,
                                7E6FEED80898582300C44C3F /* WebCoreScriptDebugger.h in Headers */,
                                939885C408B7E3D100E707C4 /* EventNames.h in Headers */,
                                65C97AF308EA908800ACD273 /* config.h in Headers */,
-                               E1EE773708F1086C00166870 /* WebCoreTextDecoder.h in Headers */,
                                A88AD3870952486D001DD196 /* KRenderingDevice.h in Headers */,
                                A88AD3890952486D001DD196 /* KRenderingFillPainter.h in Headers */,
                                A88AD38A0952486D001DD196 /* KRenderingPaintServer.h in Headers */,
                                ABDDFE7C0A5C6E7000A3E11D /* RenderPopupMenu.h in Headers */,
                                ABDDFE7D0A5C6E7000A3E11D /* RenderPopupMenuMac.h in Headers */,
                                85217E030A5ECD4700DB8D00 /* XSLImportRule.h in Headers */,
+                               E14842DE0A674934007E4D39 /* StreamingTextDecoderICU.h in Headers */,
+                               E148432F0A674FC2007E4D39 /* StreamingTextDecoderMac.h in Headers */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
                                A8FD8B86087CB44C00DC3707 /* AffineTransform.cpp in Sources */,
                                7E6FEED90898582300C44C3F /* WebCoreScriptDebugger.mm in Sources */,
                                939885C308B7E3D100E707C4 /* EventNames.cpp in Sources */,
-                               E1EE773808F1086C00166870 /* WebCoreTextDecoder.mm in Sources */,
                                A88AD3860952486D001DD196 /* KRenderingDevice.cpp in Sources */,
                                A88AD3880952486D001DD196 /* KRenderingFillPainter.cpp in Sources */,
                                A88AD38B0952486D001DD196 /* KRenderingPaintServerGradient.cpp in Sources */,
                                935367E909AF77EF00D35CD6 /* GraphicsContextMac.mm in Sources */,
                                657BD74D09AFDC54005A2056 /* StreamingTextDecoder.cpp in Sources */,
                                657BD74F09AFDC54005A2056 /* TextEncoding.cpp in Sources */,
-                               6576F9D609B2484A000041F7 /* TextEncodingMac.cpp in Sources */,
                                65F5386B09B2C05E00F3DC4A /* CharsetNames.cpp in Sources */,
                                A82398A809B3ACF500B60641 /* PlugInInfoStoreMac.mm in Sources */,
                                A8239E0009B3CF8A00B60641 /* Logging.cpp in Sources */,
                                ABDDFE7E0A5C6E7000A3E11D /* RenderPopupMenuMac.mm in Sources */,
                                85217E020A5ECD4700DB8D00 /* XSLImportRule.cpp in Sources */,
                                93CF35AA0A6169F700543E52 /* AffineTransformCG.cpp in Sources */,
+                               E14842FF0A674A31007E4D39 /* StreamingTextDecoderICU.cpp in Sources */,
+                               E14843D60A6754A6007E4D39 /* StreamingTextDecoderMac.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/WebCore/bridge/mac/WebCoreTextDecoder.h b/WebCore/bridge/mac/WebCoreTextDecoder.h
deleted file mode 100644 (file)
index 7bc62f6..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2005 Apple Computer, Inc.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */
-
-#ifdef __cplusplus
-namespace WebCore {
-    class StreamingTextDecoder;
-}
-typedef WebCore::StreamingTextDecoder PlatformDecoder;
-#else
-    @class PlatformDecoder;
-#endif
-
-@interface WebCoreTextDecoder : NSObject
-{
-@private
-    PlatformDecoder *_decoder;
-}
-
-- (WebCoreTextDecoder *)initWithEncodingName:(NSString *)encodingName;
-+ (WebCoreTextDecoder *)decoderWithEncodingName:(NSString *)encodingName;
-
-- (NSString *)decodeData:(NSData *)data;
-- (NSString *)flush;
-
-@end
diff --git a/WebCore/bridge/mac/WebCoreTextDecoder.mm b/WebCore/bridge/mac/WebCoreTextDecoder.mm
deleted file mode 100644 (file)
index b8b7aa0..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2005 Apple Computer, Inc.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */
-
-#import "config.h"
-#import "WebCoreTextDecoder.h"
-
-#import "TextEncoding.h"
-#import "StreamingTextDecoder.h"
-
-using namespace WebCore;
-
-@implementation WebCoreTextDecoder
-
-- (WebCoreTextDecoder *)initWithEncodingName:(NSString *)encodingName
-{
-    self = [super init];
-    
-    WebCore::TextEncoding encoding = WebCore::TextEncoding([encodingName cStringUsingEncoding:NSASCIIStringEncoding]);
-    if (!encoding.isValid())
-        encoding = WebCore::TextEncoding(Latin1Encoding);
-    
-    _decoder = new StreamingTextDecoder(encoding);
-    
-    return self;
-}
-
-+ (WebCoreTextDecoder *)decoderWithEncodingName:(NSString *)encodingName
-{
-    return [[[WebCoreTextDecoder alloc] initWithEncodingName:encodingName] autorelease];
-}
-
-- (void)dealloc
-{
-    delete _decoder;
-    [super dealloc];
-}
-
-- (void)finalize
-{
-    delete _decoder;
-    [super finalize];
-}
-
-- (NSString *)decodeData:(NSData *)data
-{
-    return _decoder->toUnicode((const char *)[data bytes], [data length], false).getNSString();
-}
-
-- (NSString *)flush
-{
-    return _decoder->toUnicode("", 0, true).getNSString();
-}
-
-@end
index 63e427e79bba85b07e374f0e7ecce2fc486e85fb..b73faa45efa95a014af463b720d71a6531463200 100644 (file)
@@ -250,7 +250,7 @@ breakBreak:
 
 Decoder::Decoder() 
   : m_encoding(Latin1Encoding)
-  , m_decoder(new StreamingTextDecoder(m_encoding))
+  , m_decoder(StreamingTextDecoder::create(m_encoding))
   , enc(0)
   , m_type(DefaultEncoding)
   , body(false)
@@ -277,7 +277,7 @@ void Decoder::setEncodingName(const char* _encoding, EncodingSource type)
         enc = encoding.name();
         m_encoding = encoding;
         m_type = type;
-        m_decoder.set(new StreamingTextDecoder(m_encoding));
+        m_decoder.set(StreamingTextDecoder::create(m_encoding));
     }
 }
 
@@ -387,7 +387,7 @@ DeprecatedString Decoder::decode(const char *data, int len)
                 m_encoding = TextEncoding(autoDetectedEncoding);
                 ASSERT(m_encoding.isValid());
                 enc = m_encoding.name();
-                m_decoder.set(new StreamingTextDecoder(m_encoding));
+                m_decoder.set(StreamingTextDecoder::create(m_encoding));
             }
         }
         beginning = false;
@@ -555,7 +555,7 @@ DeprecatedString Decoder::decode(const char *data, int len)
             enc = "iso8859-1";
             m_encoding = TextEncoding(Latin1Encoding);
         }
-        m_decoder.set(new StreamingTextDecoder(m_encoding));
+        m_decoder.set(StreamingTextDecoder::create(m_encoding));
     }
     DeprecatedString out;
 
index fafdb51ca3bde1d2ffdb0e7d286edcf6a9c3871f..c6c6813d7a0c7de25996bd658d8dbe43cd48f796 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
 #include "config.h"
 #include "StreamingTextDecoder.h"
 
-#include <wtf/Assertions.h>
-
-using std::min;
-
-namespace WebCore {
-
-StreamingTextDecoder::StreamingTextDecoder(const TextEncoding& encoding)
-    : m_encoding(encoding)
-    , m_littleEndian(encoding.flags() & LittleEndian)
-    , m_atStart(true)
-    , m_error(false)
-    , m_numBufferedBytes(0)
-    , m_converterICU(0)
-{
-}
-
-static const UChar BOM = 0xFEFF;
-static const size_t ConversionBufferSize = 16384;
-    
-static UConverter* cachedConverterICU;
-static TextEncodingID cachedConverterEncoding = InvalidEncoding;
-
-StreamingTextDecoder::~StreamingTextDecoder()
-{
-    if (m_converterICU) {
-        if (cachedConverterICU != 0)
-            ucnv_close(cachedConverterICU);
-        cachedConverterICU = m_converterICU;
-        cachedConverterEncoding = m_encoding.encodingID();
-    }
-}
-
-DeprecatedString StreamingTextDecoder::convertUTF16(const unsigned char* s, int length)
-{
-    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
-
-    const unsigned char* p = s;
-    size_t len = length;
-    
-    DeprecatedString result("");
-    
-    result.reserve(length / 2);
-
-    if (m_numBufferedBytes != 0 && len != 0) {
-        ASSERT(m_numBufferedBytes == 1);
-        UChar c;
-        if (m_littleEndian)
-            c = m_bufferedBytes[0] | (p[0] << 8);
-        else
-            c = (m_bufferedBytes[0] << 8) | p[0];
-
-        if (c)
-            result.append(reinterpret_cast<DeprecatedChar*>(&c), 1);
-
-        m_numBufferedBytes = 0;
-        p += 1;
-        len -= 1;
-    }
-    
-    while (len > 1) {
-        UChar buffer[ConversionBufferSize];
-        int runLength = min(len / 2, ConversionBufferSize);
-        int bufferLength = 0;
-        if (m_littleEndian) {
-            for (int i = 0; i < runLength; ++i) {
-                UChar c = p[0] | (p[1] << 8);
-                p += 2;
-                if (c != BOM)
-                    buffer[bufferLength++] = c;
-            }
-        } else {
-            for (int i = 0; i < runLength; ++i) {
-                UChar c = (p[0] << 8) | p[1];
-                p += 2;
-                if (c != BOM)
-                    buffer[bufferLength++] = c;
-            }
-        }
-        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
-        len -= runLength * 2;
-    }
-    
-    if (len) {
-        ASSERT(m_numBufferedBytes == 0);
-        m_numBufferedBytes = 1;
-        m_bufferedBytes[0] = p[0];
-    }
-    
-    return result;
-}
-
-bool StreamingTextDecoder::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str)
-{
-    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
-
-    DeprecatedString result("");
-    result.reserve(length);
-
-    const unsigned char* p = s;
-    size_t len = length;
-    unsigned char ored = 0;
-    while (len) {
-        UChar buffer[ConversionBufferSize];
-        int runLength = min(len, ConversionBufferSize);
-        int bufferLength = 0;
-        for (int i = 0; i < runLength; ++i) {
-            unsigned char c = *p++;
-            ored |= c;
-            buffer[bufferLength++] = c;
-        }
-        if (ored & 0x80)
-            return false;
-        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
-        len -= runLength;
-    }
-
-    str = result;
-    return true;
-}
-
-static inline TextEncoding effectiveEncoding(const TextEncoding& encoding)
-{
-    TextEncodingID id = encoding.encodingID();
-    if (id == Latin1Encoding || id == ASCIIEncoding)
-        id = WinLatin1Encoding;
-    return TextEncoding(id, encoding.flags());
-}
-
-void StreamingTextDecoder::createICUConverter()
-{
-    TextEncoding encoding = effectiveEncoding(m_encoding);
-    const char* encodingName = encoding.name();
-
-    bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID();
-    cachedConverterEncoding = InvalidEncoding;
-
-    if (cachedEncodingEqual && cachedConverterICU) {
-        m_converterICU = cachedConverterICU;
-        cachedConverterICU = 0;
-    } else {    
-        UErrorCode err = U_ZERO_ERROR;
-        ASSERT(!m_converterICU);
-        m_converterICU = ucnv_open(encodingName, &err);
-#if !LOG_DISABLED
-        if (err == U_AMBIGUOUS_ALIAS_WARNING)
-            LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
-        if (!m_converterICU)
-            LOG_ERROR("the ICU Converter won't convert from text encoding 0x%X, error %d", encoding.encodingID(), err);
+#if USE(ICU_UNICODE)
+    #include "StreamingTextDecoderICU.h"
 #endif
-    }
-}
-
-// We strip BOM characters because they can show up both at the start of content
-// and inside content, and we never want them to end up in the decoded text.
-void StreamingTextDecoder::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount)
-{
-    ASSERT(byteCount % sizeof(UChar) == 0);
-    int start = 0;
-    int characterCount = byteCount / sizeof(UChar);
-    for (int i = 0; i != characterCount; ++i) {
-        if (BOM == characters[i]) {
-            if (start != i)
-                s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), i - start);
-            start = i + 1;
-        }
-    }
-    if (start != characterCount)
-        s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), characterCount - start);
-}
-
-DeprecatedString StreamingTextDecoder::convertUsingICU(const unsigned char* chs, int len, bool flush)
-{
-    // Get a converter for the passed-in encoding.
-    if (!m_converterICU) {
-        createICUConverter();
-        if (!m_converterICU)
-            return DeprecatedString();
-    }
-
-    DeprecatedString result("");
-    result.reserve(len);
 
-    UChar buffer[ConversionBufferSize];
-    const char* source = reinterpret_cast<const char*>(chs);
-    const char* sourceLimit = source + len;
-    int32_t* offsets = NULL;
-    UErrorCode err;
-
-    do {
-        UChar* target = buffer;
-        const UChar* targetLimit = target + ConversionBufferSize;
-        err = U_ZERO_ERROR;
-        ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
-        int count = target - buffer;
-        appendOmittingBOM(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
-    } while (err == U_BUFFER_OVERFLOW_ERROR);
+#if PLATFORM(MAC)
+    #include "StreamingTextDecoderMac.h"
+#endif
 
-    if (U_FAILURE(err)) {
-        // flush the converter so it can be reused, and not be bothered by this error.
-        do {
-            UChar *target = buffer;
-            const UChar *targetLimit = target + ConversionBufferSize;
-            err = U_ZERO_ERROR;
-            ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, true, &err);
-        } while (source < sourceLimit);
-        LOG_ERROR("ICU conversion error");
-        return DeprecatedString();
-    }
+#include <wtf/Assertions.h>
+#include <wtf/OwnPtr.h>
 
-    return result;
-}
+namespace WebCore {
 
-DeprecatedString StreamingTextDecoder::convert(const unsigned char* chs, int len, bool flush)
+StreamingTextDecoder* StreamingTextDecoder::create(const TextEncoding& encoding)
 {
-    switch (m_encoding.encodingID()) {
-        case UTF16Encoding:
-            return convertUTF16(chs, len);
-
-        case ASCIIEncoding:
-        case Latin1Encoding:
-        case WinLatin1Encoding: {
-            DeprecatedString result;
-            if (convertIfASCII(chs, len, result))
-                return result;
-            break;
-        }
+#if USE(ICU_UNICODE)
+    OwnPtr<StreamingTextDecoderICU> decoderICU(new StreamingTextDecoderICU(encoding));
+    if (decoderICU->textEncodingSupported())
+        return decoderICU.release();
+#endif
 
-        case UTF8Encoding:
-            // If a previous run used ICU, we might have a partly converted character.
-            // If so, don't use the optimized ASCII code path.
-            if (!m_converterICU) {
-                DeprecatedString result;
-                if (convertIfASCII(chs, len, result))
-                    return result;
-            }
-            break;
+#if PLATFORM(MAC)
+    OwnPtr<StreamingTextDecoderMac> decoderMac(new StreamingTextDecoderMac(encoding));
+    if (decoderMac->textEncodingSupported())
+        return decoderMac.release();
+#endif
 
-        default:
-            break;
-    }
+    LOG_ERROR("no converter can convert from text encoding 0x%X", encoding.encodingID());
 
-    //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000
-#if PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
-    DeprecatedString result;
-    int chunkSize;
-    for (int i = 0; i != len; i += chunkSize) {
-        chunkSize = len - i;
-        if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
-            chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
-        }
-        result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len));
-    }
-    return result;
-#else
-    return convertUsingICU(chs, len, flush);
+#if USE(ICU_UNICODE)
+    return decoderICU.release();
+#elif PLATFORM(MAC)
+    return decoderMac.release();
 #endif
 }
 
-DeprecatedString StreamingTextDecoder::toUnicode(const char* chs, int len, bool flush)
+StreamingTextDecoder::~StreamingTextDecoder()
 {
-    ASSERT_ARG(len, len >= 0);
-    
-    if (m_error || !chs)
-        return DeprecatedString();
-
-    if (len <= 0 && !flush)
-        return "";
-
-    // Handle normal case.
-    if (!m_atStart)
-        return convert(chs, len, flush);
-
-    // Check to see if we found a BOM.
-    int numBufferedBytes = m_numBufferedBytes;
-    int buf1Len = numBufferedBytes;
-    int buf2Len = len;
-    const unsigned char* buf1 = m_bufferedBytes;
-    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
-    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
-    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
-    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
-    int BOMLength = 0;
-    if (c1 == 0xFF && c2 == 0xFE) {
-        m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
-        m_littleEndian = true;
-        BOMLength = 2;
-    } else if (c1 == 0xFE && c2 == 0xFF) {
-        m_encoding = TextEncoding(UTF16Encoding, BigEndian);
-        m_littleEndian = false;
-        BOMLength = 2;
-    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
-        m_encoding = TextEncoding(UTF8Encoding);
-        BOMLength = 3;
-    }
-
-    // Handle case where we found a BOM.
-    if (BOMLength != 0) {
-        ASSERT(numBufferedBytes + len >= BOMLength);
-        int skip = BOMLength - numBufferedBytes;
-        m_numBufferedBytes = 0;
-        m_atStart = false;
-        return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
-    }
-
-    // Handle case where we know there is no BOM coming.
-    const int bufferSize = sizeof(m_bufferedBytes);
-    if (numBufferedBytes + len > bufferSize || flush) {
-        m_atStart = false;
-        if (numBufferedBytes == 0) {
-            return convert(chs, len, flush);
-        }
-        unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
-        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
-        m_numBufferedBytes = 0;
-        return convert(bufferedBytes, numBufferedBytes, false) + convert(chs, len, flush);
-    }
-
-    // Continue to look for the BOM.
-    memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
-    m_numBufferedBytes += len;
-    return "";
 }
-    
+
 } // namespace WebCore
index 90f2e27806f291edfb05e52b2bb36f398c7ea5a4..ba6206b63e1914eb826177ea268a23032fa2f62e 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
 
 #include "TextEncoding.h"
 #include <wtf/Noncopyable.h>
-#include <unicode/ucnv.h>
-#include <unicode/utypes.h>
 
 namespace WebCore {
 
     class StreamingTextDecoder : Noncopyable {
     public:
-        StreamingTextDecoder(const TextEncoding&);
-        ~StreamingTextDecoder();
+        static StreamingTextDecoder* create(const TextEncoding&);
+        virtual ~StreamingTextDecoder();
 
-        DeprecatedString toUnicode(const char* chs, int len, bool flush = false);
-
-    private:
-        DeprecatedString convert(const char* chs, int len, bool flush)
-            { return convert(reinterpret_cast<const unsigned char*>(chs), len, flush); }
-        DeprecatedString convert(const unsigned char* chs, int len, bool flush);
-
-        bool convertIfASCII(const unsigned char*, int len, DeprecatedString&);
-        DeprecatedString convertUTF16(const unsigned char*, int len);
-        DeprecatedString convertUsingICU(const unsigned char*, int len, bool flush);
-
-        void createICUConverter();
-
-        static void appendOmittingBOM(DeprecatedString&, const UChar* characters, int byteCount);
-
-        TextEncoding m_encoding;
-        bool m_littleEndian;
-        bool m_atStart;
-        bool m_error;
-        
-        unsigned m_numBufferedBytes;
-        unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character
-        
-        // ICU decoding.
-        UConverter* m_converterICU;
+        virtual DeprecatedString toUnicode(const char* chs, int len, bool flush = false) = 0;
+        virtual DeprecatedCString fromUnicode(const DeprecatedString&, bool allowEntities = false) = 0;
     };
     
 } // namespace WebCore
diff --git a/WebCore/platform/StreamingTextDecoderICU.cpp b/WebCore/platform/StreamingTextDecoderICU.cpp
new file mode 100644 (file)
index 0000000..1be1ef8
--- /dev/null
@@ -0,0 +1,438 @@
+/*
+ * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "StreamingTextDecoderICU.h"
+
+#include <wtf/Assertions.h>
+#include <unicode/unorm.h>
+
+using std::min;
+
+namespace WebCore {
+
+StreamingTextDecoderICU::StreamingTextDecoderICU(const TextEncoding& encoding)
+    : m_encoding(encoding)
+    , m_littleEndian(encoding.flags() & LittleEndian)
+    , m_atStart(true)
+    , m_numBufferedBytes(0)
+    , m_converterICU(0)
+{
+}
+
+static const UChar BOM = 0xFEFF;
+static const size_t ConversionBufferSize = 16384;
+    
+static UConverter* cachedConverterICU;
+static TextEncodingID cachedConverterEncoding = InvalidEncoding;
+
+StreamingTextDecoderICU::~StreamingTextDecoderICU()
+{
+    releaseICUConverter();
+}
+
+void StreamingTextDecoderICU::releaseICUConverter()
+{
+    if (m_converterICU) {
+        if (cachedConverterICU != 0)
+            ucnv_close(cachedConverterICU);
+        cachedConverterICU = m_converterICU;
+        cachedConverterEncoding = m_encoding.encodingID();
+        m_converterICU = 0;
+    }
+}
+
+bool StreamingTextDecoderICU::textEncodingSupported()
+{
+    if (!m_converterICU)
+        createICUConverter();
+    
+    return m_converterICU;
+}
+
+DeprecatedString StreamingTextDecoderICU::convertUTF16(const unsigned char* s, int length)
+{
+    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
+
+    const unsigned char* p = s;
+    size_t len = length;
+    
+    DeprecatedString result("");
+    
+    result.reserve(length / 2);
+
+    if (m_numBufferedBytes != 0 && len != 0) {
+        ASSERT(m_numBufferedBytes == 1);
+        UChar c;
+        if (m_littleEndian)
+            c = m_bufferedBytes[0] | (p[0] << 8);
+        else
+            c = (m_bufferedBytes[0] << 8) | p[0];
+
+        if (c)
+            result.append(reinterpret_cast<DeprecatedChar*>(&c), 1);
+
+        m_numBufferedBytes = 0;
+        p += 1;
+        len -= 1;
+    }
+    
+    while (len > 1) {
+        UChar buffer[ConversionBufferSize];
+        int runLength = min(len / 2, ConversionBufferSize);
+        int bufferLength = 0;
+        if (m_littleEndian) {
+            for (int i = 0; i < runLength; ++i) {
+                UChar c = p[0] | (p[1] << 8);
+                p += 2;
+                if (c != BOM)
+                    buffer[bufferLength++] = c;
+            }
+        } else {
+            for (int i = 0; i < runLength; ++i) {
+                UChar c = (p[0] << 8) | p[1];
+                p += 2;
+                if (c != BOM)
+                    buffer[bufferLength++] = c;
+            }
+        }
+        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
+        len -= runLength * 2;
+    }
+    
+    if (len) {
+        ASSERT(m_numBufferedBytes == 0);
+        m_numBufferedBytes = 1;
+        m_bufferedBytes[0] = p[0];
+    }
+    
+    return result;
+}
+
+bool StreamingTextDecoderICU::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str)
+{
+    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
+
+    DeprecatedString result("");
+    result.reserve(length);
+
+    const unsigned char* p = s;
+    size_t len = length;
+    unsigned char ored = 0;
+    while (len) {
+        UChar buffer[ConversionBufferSize];
+        int runLength = min(len, ConversionBufferSize);
+        int bufferLength = 0;
+        for (int i = 0; i < runLength; ++i) {
+            unsigned char c = *p++;
+            ored |= c;
+            buffer[bufferLength++] = c;
+        }
+        if (ored & 0x80)
+            return false;
+        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
+        len -= runLength;
+    }
+
+    str = result;
+    return true;
+}
+
+void StreamingTextDecoderICU::createICUConverter()
+{
+    TextEncoding encoding = m_encoding.effectiveEncoding();
+    const char* encodingName = encoding.name();
+
+    bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID();
+    cachedConverterEncoding = InvalidEncoding;
+
+    if (cachedEncodingEqual && cachedConverterICU) {
+        m_converterICU = cachedConverterICU;
+        cachedConverterICU = 0;
+    } else {
+        UErrorCode err = U_ZERO_ERROR;
+        ASSERT(!m_converterICU);
+        m_converterICU = ucnv_open(encodingName, &err);
+#if !LOG_DISABLED
+        if (err == U_AMBIGUOUS_ALIAS_WARNING)
+            LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
+#endif
+    }
+}
+
+// We strip BOM characters because they can show up both at the start of content
+// and inside content, and we never want them to end up in the decoded text.
+void StreamingTextDecoderICU::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount)
+{
+    ASSERT(byteCount % sizeof(UChar) == 0);
+    int start = 0;
+    int characterCount = byteCount / sizeof(UChar);
+    for (int i = 0; i != characterCount; ++i) {
+        if (BOM == characters[i]) {
+            if (start != i)
+                s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), i - start);
+            start = i + 1;
+        }
+    }
+    if (start != characterCount)
+        s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), characterCount - start);
+}
+
+DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush)
+{
+    // Get a converter for the passed-in encoding.
+    if (!m_converterICU) {
+        createICUConverter();
+        if (!m_converterICU)
+            return DeprecatedString();
+    }
+
+    DeprecatedString result("");
+    result.reserve(len);
+
+    UChar buffer[ConversionBufferSize];
+    const char* source = reinterpret_cast<const char*>(chs);
+    const char* sourceLimit = source + len;
+    int32_t* offsets = NULL;
+    UErrorCode err;
+
+    do {
+        UChar* target = buffer;
+        const UChar* targetLimit = target + ConversionBufferSize;
+        err = U_ZERO_ERROR;
+        ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
+        int count = target - buffer;
+        appendOmittingBOM(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
+    } while (err == U_BUFFER_OVERFLOW_ERROR);
+
+    if (U_FAILURE(err)) {
+        // flush the converter so it can be reused, and not be bothered by this error.
+        do {
+            UChar *target = buffer;
+            const UChar *targetLimit = target + ConversionBufferSize;
+            err = U_ZERO_ERROR;
+            ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, true, &err);
+        } while (source < sourceLimit);
+        LOG_ERROR("ICU conversion error");
+        return DeprecatedString();
+    }
+
+    return result;
+}
+
+DeprecatedString StreamingTextDecoderICU::convert(const unsigned char* chs, int len, bool flush)
+{
+    switch (m_encoding.encodingID()) {
+        case UTF16Encoding:
+            return convertUTF16(chs, len);
+
+        case ASCIIEncoding:
+        case Latin1Encoding:
+        case WinLatin1Encoding: {
+            DeprecatedString result;
+            if (convertIfASCII(chs, len, result))
+                return result;
+            break;
+        }
+
+        case UTF8Encoding:
+            // If a previous run used ICU, we might have a partly converted character.
+            // If so, don't use the optimized ASCII code path.
+            if (!m_converterICU) {
+                DeprecatedString result;
+                if (convertIfASCII(chs, len, result))
+                    return result;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000
+#if PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
+    DeprecatedString result;
+    int chunkSize;
+    for (int i = 0; i != len; i += chunkSize) {
+        chunkSize = len - i;
+        if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
+            chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
+        }
+        result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len));
+    }
+    return result;
+#else
+    return convertUsingICU(chs, len, flush);
+#endif
+}
+
+DeprecatedString StreamingTextDecoderICU::toUnicode(const char* chs, int len, bool flush)
+{
+    ASSERT_ARG(len, len >= 0);
+    
+    if (!chs)
+        return DeprecatedString();
+
+    if (len <= 0 && !flush)
+        return "";
+
+    // Handle normal case.
+    if (!m_atStart)
+        return convert(chs, len, flush);
+
+    // Check to see if we found a BOM.
+    int numBufferedBytes = m_numBufferedBytes;
+    int buf1Len = numBufferedBytes;
+    int buf2Len = len;
+    const unsigned char* buf1 = m_bufferedBytes;
+    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
+    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    int BOMLength = 0;
+    if (c1 == 0xFF && c2 == 0xFE) {
+        if (m_encoding != TextEncoding(UTF16Encoding, LittleEndian)) {
+            releaseICUConverter();
+            m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
+            m_littleEndian = true;
+        }
+        BOMLength = 2;
+    } else if (c1 == 0xFE && c2 == 0xFF) {
+        if (m_encoding != TextEncoding(UTF16Encoding, BigEndian)) {
+            releaseICUConverter();
+            m_encoding = TextEncoding(UTF16Encoding, BigEndian);
+            m_littleEndian = false;
+        }
+        BOMLength = 2;
+    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
+        if (m_encoding != TextEncoding(UTF8Encoding)) {
+            releaseICUConverter();
+            m_encoding = TextEncoding(UTF8Encoding);
+        }
+        BOMLength = 3;
+    }
+
+    // Handle case where we found a BOM.
+    if (BOMLength != 0) {
+        ASSERT(numBufferedBytes + len >= BOMLength);
+        int skip = BOMLength - numBufferedBytes;
+        m_numBufferedBytes = 0;
+        m_atStart = false;
+        return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
+    }
+
+    // Handle case where we know there is no BOM coming.
+    const int bufferSize = sizeof(m_bufferedBytes);
+    if (numBufferedBytes + len > bufferSize || flush) {
+        m_atStart = false;
+        if (numBufferedBytes == 0) {
+            return convert(chs, len, flush);
+        }
+        unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
+        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
+        m_numBufferedBytes = 0;
+        return convert(bufferedBytes, numBufferedBytes, false) + convert(chs, len, flush);
+    }
+
+    // Continue to look for the BOM.
+    memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
+    m_numBufferedBytes += len;
+    return "";
+}
+    
+DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
+{
+    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();
+
+    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
+        return qcs.latin1();
+
+    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding) 
+        && qcs.isAllASCII())
+        return qcs.ascii();
+
+    // FIXME: We should see if there is "force ASCII range" mode in ICU;
+    // until then, we change the backslash into a yen sign.
+    // Encoding will change the yen sign back into a backslash.
+    DeprecatedString copy = qcs;
+    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());
+
+    if (!m_converterICU)
+        createICUConverter();
+    if (!m_converterICU)
+        return DeprecatedCString();
+
+    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
+    // convert from that w/o having to allocate a unicode buffer
+
+    char buffer[ConversionBufferSize];
+    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
+    const UChar* sourceLimit = source + copy.length();
+
+    UErrorCode err = U_ZERO_ERROR;
+    DeprecatedString normalizedString;
+    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
+        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed
+        
+        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
+        if (err == U_BUFFER_OVERFLOW_ERROR) {
+            err = U_ZERO_ERROR;
+            normalizedString.truncate(normalizedLength);
+            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
+        }
+        
+        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
+        sourceLimit = source + normalizedLength;
+    }
+
+    DeprecatedCString result(1); // for trailing zero
+
+    if (allowEntities)
+        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
+    else {
+        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
+        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
+    }
+
+    ASSERT(U_SUCCESS(err));
+    if (U_FAILURE(err))
+        return DeprecatedCString();
+
+    do {
+        char* target = buffer;
+        char* targetLimit = target + ConversionBufferSize;
+        err = U_ZERO_ERROR;
+        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true,  &err);
+        int count = target - buffer;
+        buffer[count] = 0;
+        result.append(buffer);
+    } while (err == U_BUFFER_OVERFLOW_ERROR);
+
+    return result;
+}
+
+
+} // namespace WebCore
diff --git a/WebCore/platform/StreamingTextDecoderICU.h b/WebCore/platform/StreamingTextDecoderICU.h
new file mode 100644 (file)
index 0000000..93fc911
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef StreamingTextDecoderICU_H
+#define StreamingTextDecoderICU_H
+
+#include "StreamingTextDecoder.h"
+#include <unicode/ucnv.h>
+#include <unicode/utypes.h>
+
+namespace WebCore {
+
+    // StreamingTextDecoder implementation that converts through an ICU
+    // UConverter (m_converterICU). The member functions are defined in
+    // StreamingTextDecoderICU.cpp.
+    class StreamingTextDecoderICU : public StreamingTextDecoder {
+    public:
+        StreamingTextDecoderICU(const TextEncoding&);
+        virtual ~StreamingTextDecoderICU();
+
+        // NOTE(review): presumably reports whether an ICU converter can be
+        // obtained for m_encoding - confirm against the .cpp.
+        bool textEncodingSupported();
+
+        // Streaming decode of a run of bytes; `flush` marks the final run.
+        virtual DeprecatedString toUnicode(const char* chs, int len, bool flush = false);
+        // Encodes a Unicode string with m_converterICU. With allowEntities the
+        // converter is set up to escape unencodable characters as XML decimal
+        // entities; otherwise they are substituted with '?'.
+        virtual DeprecatedCString fromUnicode(const DeprecatedString&, bool allowEntities = false);
+
+    private:
+        // Convenience overload: forwards signed-char input to the unsigned one.
+        DeprecatedString convert(const char* chs, int len, bool flush)
+            { return convert(reinterpret_cast<const unsigned char*>(chs), len, flush); }
+        DeprecatedString convert(const unsigned char* chs, int len, bool flush);
+
+        bool convertIfASCII(const unsigned char*, int len, DeprecatedString&);
+        DeprecatedString convertUTF16(const unsigned char*, int len);
+        DeprecatedString convertUsingICU(const unsigned char*, int len, bool flush);
+
+        void createICUConverter();
+        void releaseICUConverter();
+
+        // Note that the last parameter is a byte count, not a character count.
+        static void appendOmittingBOM(DeprecatedString&, const UChar* characters, int byteCount);
+
+        TextEncoding m_encoding;
+        bool m_littleEndian;
+        bool m_atStart;
+        
+        // Bytes held back between calls, and how many of them are valid.
+        unsigned m_numBufferedBytes;
+        unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character
+        
+        UConverter* m_converterICU;
+    };
+    
+} // namespace WebCore
+
+#endif // StreamingTextDecoderICU_H
index 2cb37d72887f18378d4bd35a81d0bdc4699bc7f7..b34590d937dd5064458753a1cd0238f1eb4ab12e 100644 (file)
 #include "TextEncoding.h"
 
 #include "CharsetNames.h"
-#include <wtf/Assertions.h>
-#include <wtf/HashSet.h>
 #include "StreamingTextDecoder.h"
-#include <unicode/unorm.h>
+
+#include <wtf/OwnPtr.h>
 
 namespace WebCore {
 
@@ -41,10 +40,19 @@ TextEncoding::TextEncoding(const char* name, bool eightBitOnly)
         m_encodingID = UTF8Encoding;
 }
 
+// Returns the encoding that is actually used for conversion: Latin-1 and
+// ASCII are both mapped to Windows Latin-1, every other encoding is returned
+// unchanged. The flags are carried over as they are.
+TextEncoding TextEncoding::effectiveEncoding() const
+{
+    const bool treatAsWinLatin1 = m_encodingID == Latin1Encoding || m_encodingID == ASCIIEncoding;
+    if (treatAsWinLatin1)
+        return TextEncoding(WinLatin1Encoding, m_flags);
+    return TextEncoding(m_encodingID, m_flags);
+}
+
 const char* TextEncoding::name() const
 {
     return charsetNameFromTextEncodingID(m_encodingID);
 }
+
 UChar TextEncoding::backslashAsCurrencySymbol() const
 {
     if (m_flags & BackslashIsYen)
@@ -55,132 +63,14 @@ UChar TextEncoding::backslashAsCurrencySymbol() const
 
 DeprecatedString TextEncoding::toUnicode(const char* chs, int len) const
 {
-    return StreamingTextDecoder(*this).toUnicode(chs, len, true);
-}
-
-// We'd like to use ICU for this on OS X as well eventually, but we need to make sure
-// it covers all the encodings that we need
-#ifndef __APPLE__
-
-static UConverter* cachedConverter;
-static TextEncodingID cachedConverterEncoding = InvalidEncoding;
-
-static const int ConversionBufferSize = 16384;
-
-static inline UConverter* getConverter(TextEncodingID encoding, UErrorCode* status)
-{
-    if (cachedConverter && encoding == cachedConverterEncoding) {
-        UConverter* conv = cachedConverter;
-        cachedConverter = 0;
-        return conv;
-    }
-
-    const char* encodingName = charsetNameFromTextEncodingID(encoding);
-    UErrorCode err = U_ZERO_ERROR;
-    UConverter* conv = ucnv_open(encodingName, &err);
-    if (err == U_AMBIGUOUS_ALIAS_WARNING)
-        LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
-
-    if (!conv) {
-        LOG_ERROR("the ICU Converter won't convert to text encoding 0x%X, error %d", encoding, err);
-        *status = err;
-        return 0;
-    }
-
-    return conv;
-}
-
-static inline void cacheConverter(TextEncodingID id, UConverter* conv)
-{
-    if (conv) {
-        if (cachedConverter)
-            ucnv_close(cachedConverter);
-        cachedConverter = conv;
-        cachedConverterEncoding = id;
-    }
-}
-
-static inline TextEncodingID effectiveEncoding(TextEncodingID encoding)
-{
-    if (encoding == Latin1Encoding || encoding == ASCIIEncoding)
-        return WinLatin1Encoding;
-    return encoding;
+    OwnPtr<StreamingTextDecoder> decoder(StreamingTextDecoder::create(*this));
+    return decoder->toUnicode(chs, len, true);
 }
 
 DeprecatedCString TextEncoding::fromUnicode(const DeprecatedString &qcs, bool allowEntities) const
 {
-    TextEncodingID encoding = effectiveEncoding(m_encodingID);
-
-    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
-        return qcs.latin1();
-
-    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding) 
-        && qcs.isAllASCII())
-        return qcs.ascii();
-
-    // FIXME: We should see if there is "force ASCII range" mode in ICU;
-    // until then, we change the backslash into a yen sign.
-    // Encoding will change the yen sign back into a backslash.
-    DeprecatedString copy = qcs;
-    copy.replace('\\', backslashAsCurrencySymbol());
-
-    UErrorCode err = U_ZERO_ERROR;
-    UConverter* conv = getConverter(encoding, &err);
-    if (!conv && U_FAILURE(err))
-        return DeprecatedCString();
-
-    ASSERT(conv);
-
-    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
-    // convert from that w/o having to allocate a unicode buffer
-
-    char buffer[ConversionBufferSize];
-    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
-    const UChar* sourceLimit = source + copy.length();
-
-    DeprecatedString normalizedString;
-    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
-        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed
-        
-        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
-        if (err == U_BUFFER_OVERFLOW_ERROR) {
-            err = U_ZERO_ERROR;
-            normalizedString.truncate(normalizedLength);
-            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
-        }
-        
-        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
-        sourceLimit = source + normalizedLength;
-    }
-
-    DeprecatedCString result(1); // for trailing zero
-
-    if (allowEntities)
-        ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
-    else {
-        ucnv_setSubstChars(conv, "?", 1, &err);
-        ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
-    }
-
-    ASSERT(U_SUCCESS(err));
-    if (U_FAILURE(err))
-        return DeprecatedCString();
-
-    do {
-        char* target = buffer;
-        char* targetLimit = target + ConversionBufferSize;
-        err = U_ZERO_ERROR;
-        ucnv_fromUnicode(conv, &target, targetLimit, &source, sourceLimit, 0, true,  &err);
-        int count = target - buffer;
-        buffer[count] = 0;
-        result.append(buffer);
-    } while (err == U_BUFFER_OVERFLOW_ERROR);
-
-    cacheConverter(encoding, conv);
-
-    return result;
+    OwnPtr<StreamingTextDecoder> decoder(StreamingTextDecoder::create(*this));
+    return decoder->fromUnicode(qcs, allowEntities);
 }
-#endif
-
 
 } // namespace WebCore
index 69525dd8d9d435491edbce2dbfa91cfee21d8f3d..b6da64046a2e5a5aa6a128a67482ce69cd280c95 100644 (file)
@@ -30,9 +30,7 @@
 
 namespace WebCore {
 
-    class StreamingTextDecoder;
-
-#ifdef __APPLE__
+#if PLATFORM(MAC)
     typedef CFStringEncoding TextEncodingID;
     
     const TextEncodingID InvalidEncoding = kCFStringEncodingInvalidId;
@@ -77,6 +75,8 @@ namespace WebCore {
 
         explicit TextEncoding(const char*, bool eightBitOnly = false);
 
+        TextEncoding effectiveEncoding() const;
+
         bool isValid() const { return m_encodingID != InvalidEncoding; }
         const char* name() const;
         bool usesVisualOrdering() const { return m_flags & VisualOrdering; }
diff --git a/WebCore/platform/mac/StreamingTextDecoderMac.cpp b/WebCore/platform/mac/StreamingTextDecoderMac.cpp
new file mode 100644 (file)
index 0000000..85a98c8
--- /dev/null
@@ -0,0 +1,518 @@
+/*
+ * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "StreamingTextDecoderMac.h"
+
+#include <wtf/Assertions.h>
+
+using std::min;
+
+namespace WebCore {
+
+// We need to keep this version because ICU doesn't support some of the encodings that we need:
+// <http://bugzilla.opendarwin.org/show_bug.cgi?id=4195>.
+
+// Records the encoding and resets the streaming state. No TEC converter is
+// created here: m_converterTEC starts out null and is filled in later by
+// createTECConverter().
+StreamingTextDecoderMac::StreamingTextDecoderMac(const TextEncoding& encoding)
+    : m_encoding(encoding)
+    , m_littleEndian(encoding.flags() & LittleEndian)
+    , m_atStart(true)
+    , m_error(false)
+    , m_numBufferedBytes(0)
+    , m_converterTEC(0)
+{
+}
+
+// U+FEFF; removed from all decoded output.
+static const UChar BOM = 0xFEFF;
+// Element count of the stack buffers used for chunked conversion.
+static const size_t ConversionBufferSize = 16384;
+
+// One-entry converter cache shared by all decoders in this file: the most
+// recently released converter and the encoding it is filed under.
+static TECObjectRef cachedConverterTEC;
+static TextEncodingID cachedConverterEncoding = InvalidEncoding;
+
+// Hands the converter (if one was created) over to the file-level cache
+// instead of disposing of it directly.
+StreamingTextDecoderMac::~StreamingTextDecoderMac()
+{
+    releaseTECConverter();
+}
+
+// Gives up this decoder's TEC converter by parking it in the one-entry
+// file-level cache; a converter that was cached before is disposed of.
+// Does nothing when no converter has been created.
+void StreamingTextDecoderMac::releaseTECConverter()
+{
+    if (!m_converterTEC)
+        return;
+
+    if (cachedConverterTEC)
+        TECDisposeConverter(cachedConverterTEC);
+
+    cachedConverterTEC = m_converterTEC;
+    // createTECConverter() builds the converter for the effective encoding and
+    // looks the cache up by that same value, so the entry has to be filed
+    // under the effective encoding too. Filing it under the nominal encoding
+    // meant converters of Latin-1/ASCII decoders could never be reused.
+    cachedConverterEncoding = m_encoding.effectiveEncoding().encodingID();
+    m_converterTEC = 0;
+}
+
+// Reports whether this decoder can handle its encoding. The three UTF-16
+// encoding IDs are always accepted; for anything else the answer is whether
+// a TEC converter exists, creating one on demand.
+bool StreamingTextDecoderMac::textEncodingSupported()
+{
+    const TextEncodingID id = m_encoding.encodingID();
+    const bool isUTF16 = id == kCFStringEncodingUTF16
+        || id == kCFStringEncodingUTF16BE
+        || id == kCFStringEncodingUTF16LE;
+    if (isUTF16)
+        return true;
+
+    if (!m_converterTEC)
+        createTECConverter();
+
+    return m_converterTEC;
+}
+
+DeprecatedString StreamingTextDecoderMac::convertUTF16(const unsigned char* s, int length)
+{
+    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
+
+    const unsigned char* p = s;
+    size_t len = length;
+    
+    DeprecatedString result("");
+    
+    result.reserve(length / 2);
+
+    if (m_numBufferedBytes != 0 && len != 0) {
+        ASSERT(m_numBufferedBytes == 1);
+        UChar c;
+        if (m_littleEndian)
+            c = m_bufferedBytes[0] | (p[0] << 8);
+        else
+            c = (m_bufferedBytes[0] << 8) | p[0];
+
+        if (c)
+            result.append(reinterpret_cast<DeprecatedChar*>(&c), 1);
+
+        m_numBufferedBytes = 0;
+        p += 1;
+        len -= 1;
+    }
+    
+    while (len > 1) {
+        UChar buffer[ConversionBufferSize];
+        int runLength = min(len / 2, ConversionBufferSize);
+        int bufferLength = 0;
+        if (m_littleEndian) {
+            for (int i = 0; i < runLength; ++i) {
+                UChar c = p[0] | (p[1] << 8);
+                p += 2;
+                if (c != BOM)
+                    buffer[bufferLength++] = c;
+            }
+        } else {
+            for (int i = 0; i < runLength; ++i) {
+                UChar c = (p[0] << 8) | p[1];
+                p += 2;
+                if (c != BOM)
+                    buffer[bufferLength++] = c;
+            }
+        }
+        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
+        len -= runLength * 2;
+    }
+    
+    if (len) {
+        ASSERT(m_numBufferedBytes == 0);
+        m_numBufferedBytes = 1;
+        m_bufferedBytes[0] = p[0];
+    }
+    
+    return result;
+}
+
+bool StreamingTextDecoderMac::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str)
+{
+    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
+
+    DeprecatedString result("");
+    result.reserve(length);
+
+    const unsigned char* p = s;
+    size_t len = length;
+    unsigned char ored = 0;
+    while (len) {
+        UChar buffer[ConversionBufferSize];
+        int runLength = min(len, ConversionBufferSize);
+        int bufferLength = 0;
+        for (int i = 0; i < runLength; ++i) {
+            unsigned char c = *p++;
+            ored |= c;
+            buffer[bufferLength++] = c;
+        }
+        if (ored & 0x80)
+            return false;
+        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
+        len -= runLength;
+    }
+
+    str = result;
+    return true;
+}
+
+OSStatus StreamingTextDecoderMac::createTECConverter()
+{
+    const TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();
+
+    bool cachedEncodingEqual = cachedConverterEncoding == encoding;
+    cachedConverterEncoding = InvalidEncoding;
+
+    if (cachedEncodingEqual && cachedConverterTEC) {
+        m_converterTEC = cachedConverterTEC;
+        cachedConverterTEC = 0;
+        TECClearConverterContextInfo(m_converterTEC);
+    } else {
+        OSStatus status = TECCreateConverter(&m_converterTEC, encoding,
+            CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
+        if (status)
+            return status;
+
+        TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
+    }
+    
+    return noErr;
+}
+
+// We strip BOM characters because they can show up both at the start of content
+// and inside content, and we never want them to end up in the decoded text.
+void StreamingTextDecoderMac::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount)
+{
+    ASSERT(byteCount % sizeof(UChar) == 0);
+    int start = 0;
+    int characterCount = byteCount / sizeof(UChar);
+    for (int i = 0; i != characterCount; ++i) {
+        if (BOM == characters[i]) {
+            if (start != i)
+                s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), i - start);
+            start = i + 1;
+        }
+    }
+    if (start != characterCount)
+        s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), characterCount - start);
+}
+
+// Performs a single TECConvertText call, first dealing with a partial
+// character left in m_bufferedBytes by an earlier call, if there is one.
+//
+// inputBuffer / inputBufferLength: source bytes available to this call.
+// inputLength (out): number of bytes of inputBuffer that were consumed.
+// outputBuffer / outputBufferLength: destination buffer and its size in bytes.
+// outputLength (out): number of bytes written to outputBuffer.
+// Returns the (possibly adjusted) TEC status code.
+OSStatus StreamingTextDecoderMac::convertOneChunkUsingTEC(const unsigned char *inputBuffer, int inputBufferLength, int &inputLength,
+    void *outputBuffer, int outputBufferLength, int &outputLength)
+{
+    OSStatus status;
+    unsigned long bytesRead = 0;
+    unsigned long bytesWritten = 0;
+
+    if (m_numBufferedBytes != 0) {
+        // Finish converting a partial character that's in our buffer.
+        
+        // First, fill the partial character buffer with as many bytes as are available.
+        ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
+        const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
+        const int bytesToPutInBuffer = MIN(spaceInBuffer, inputBufferLength);
+        ASSERT(bytesToPutInBuffer != 0);
+        memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
+
+        // Now, do a conversion on the buffer.
+        status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
+            reinterpret_cast<unsigned char *>(outputBuffer), outputBufferLength, &bytesWritten);
+        ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
+
+        if (status == kTECPartialCharErr && bytesRead == 0) {
+            // Handle the case where the partial character was not converted.
+            if (bytesToPutInBuffer >= spaceInBuffer) {
+                LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %u bytes in the buffer", sizeof(m_bufferedBytes));
+                m_numBufferedBytes = 0;
+                status = kTECUnmappableElementErr; // should never happen, but use this error code
+            } else {
+                // Tell the caller we read all the source bytes and keep them in the buffer.
+                m_numBufferedBytes += bytesToPutInBuffer;
+                bytesRead = bytesToPutInBuffer;
+                status = noErr;
+            }
+        } else {
+            // We are done with the partial character buffer.
+            // Also, we have read some of the bytes from the main buffer.
+            // bytesRead counts bytes of m_bufferedBytes; subtracting the bytes that
+            // were already buffered leaves the number taken from inputBuffer.
+            if (bytesRead > m_numBufferedBytes) {
+                bytesRead -= m_numBufferedBytes;
+            } else {
+                LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
+                bytesRead = 0;
+            }
+            m_numBufferedBytes = 0;
+            if (status == kTECPartialCharErr) {
+                // While there may be a partial character problem in the small buffer,
+                // we have to try again and not get confused and think there is a partial
+                // character problem in the large buffer.
+                status = noErr;
+            }
+        }
+    } else {
+        // Nothing buffered: convert straight from the caller's input.
+        status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
+            static_cast<unsigned char *>(outputBuffer), outputBufferLength, &bytesWritten);
+        ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
+    }
+
+    // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
+    if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) {
+        status = kTECOutputBufferFullStatus;
+    }
+
+    inputLength = bytesRead;
+    outputLength = bytesWritten;
+    return status;
+}
+
+DeprecatedString StreamingTextDecoderMac::convertUsingTEC(const unsigned char *chs, int len, bool flush)
+{
+    // Get a converter for the passed-in encoding.
+    if (!m_converterTEC && createTECConverter() != noErr)
+        return DeprecatedString();
+    
+    DeprecatedString result("");
+
+    result.reserve(len);
+
+    const unsigned char *sourcePointer = chs;
+    int sourceLength = len;
+    bool bufferWasFull = false;
+    UniChar buffer[ConversionBufferSize];
+
+    while (sourceLength || bufferWasFull) {
+        int bytesRead = 0;
+        int bytesWritten = 0;
+        OSStatus status = convertOneChunkUsingTEC(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
+        ASSERT(bytesRead <= sourceLength);
+        sourcePointer += bytesRead;
+        sourceLength -= bytesRead;
+        
+        switch (status) {
+            case noErr:
+            case kTECOutputBufferFullStatus:
+                break;
+            case kTextMalformedInputErr:
+            case kTextUndefinedElementErr:
+                // FIXME: Put FFFD character into the output string in this case?
+                TECClearConverterContextInfo(m_converterTEC);
+                if (sourceLength) {
+                    sourcePointer += 1;
+                    sourceLength -= 1;
+                }
+                break;
+            case kTECPartialCharErr: {
+                // Put the partial character into the buffer.
+                ASSERT(m_numBufferedBytes == 0);
+                const int bufferSize = sizeof(m_numBufferedBytes);
+                if (sourceLength < bufferSize) {
+                    memcpy(m_bufferedBytes, sourcePointer, sourceLength);
+                    m_numBufferedBytes = sourceLength;
+                } else {
+                    LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
+                }
+                sourceLength = 0;
+                break;
+            }
+            default:
+                LOG_ERROR("text decoding failed with error %d", status);
+                m_error = true;
+                return DeprecatedString();
+        }
+
+        appendOmittingBOM(result, buffer, bytesWritten);
+
+        bufferWasFull = status == kTECOutputBufferFullStatus;
+    }
+    
+    if (flush) {
+        unsigned long bytesWritten = 0;
+        TECFlushText(m_converterTEC, reinterpret_cast<unsigned char *>(buffer), sizeof(buffer), &bytesWritten);
+        appendOmittingBOM(result, buffer, bytesWritten);
+    }
+
+    // Workaround for a bug in the Text Encoding Converter (see bug 3225472).
+    // Simplified Chinese pages use the code U+A3A0 to mean "full-width space".
+    // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
+    // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
+    if (m_encoding.encodingID() == kCFStringEncodingGB_18030_2000)
+        result.replace(0xE5E5, 0x3000);
+    
+    return result;
+}
+
+DeprecatedString StreamingTextDecoderMac::convert(const unsigned char *chs, int len, bool flush)
+{
+    switch (m_encoding.encodingID()) {
+        case UTF16Encoding:
+            return convertUTF16(chs, len);
+
+        case ASCIIEncoding:
+        case Latin1Encoding:
+        case WinLatin1Encoding: {
+            DeprecatedString result;
+            if (convertIfASCII(chs, len, result))
+                return result;
+            break;
+        }
+
+        case UTF8Encoding:
+            // If a previous run used TEC, we might have a partly converted character.
+            // If so, don't use the optimized ASCII code path.
+            if (!m_converterTEC) {
+                DeprecatedString result;
+                if (convertIfASCII(chs, len, result))
+                    return result;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    return convertUsingTEC(chs, len, flush);
+}
+
+// Streaming entry point: decodes the next run of bytes of the stream.
+//
+// chs / len: the bytes of this run.
+// flush: true for the last run of the stream.
+// Returns the text decoded so far. At the start of the stream the bytes are
+// inspected for a UTF-16 or UTF-8 BOM; a BOM overrides m_encoding and is not
+// part of the result. Until the BOM question is settled, input is held in
+// m_bufferedBytes and an empty string is returned.
+DeprecatedString StreamingTextDecoderMac::toUnicode(const char* chs, int len, bool flush)
+{
+    ASSERT_ARG(len, len >= 0);
+    
+    // After a decoding error, or without input, nothing is decoded.
+    if (m_error || !chs)
+        return DeprecatedString();
+
+    if (len <= 0 && !flush)
+        return "";
+
+    // Handle normal case.
+    if (!m_atStart)
+        return convert(chs, len, flush);
+
+    // Check to see if we found a BOM.
+    // c1..c3 are the first three bytes of the stream, taken first from the
+    // bytes buffered by earlier calls and then from chs; missing bytes read as 0.
+    int numBufferedBytes = m_numBufferedBytes;
+    int buf1Len = numBufferedBytes;
+    int buf2Len = len;
+    const unsigned char* buf1 = m_bufferedBytes;
+    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
+    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    int BOMLength = 0;
+    // FF FE: UTF-16 little endian.
+    if (c1 == 0xFF && c2 == 0xFE) {
+        if (m_encoding != TextEncoding(UTF16Encoding, LittleEndian)) {
+            releaseTECConverter();
+            m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
+            m_littleEndian = true;
+        }
+        BOMLength = 2;
+    // FE FF: UTF-16 big endian.
+    } else if (c1 == 0xFE && c2 == 0xFF) {
+        if (m_encoding != TextEncoding(UTF16Encoding, BigEndian)) {
+            releaseTECConverter();
+            m_encoding = TextEncoding(UTF16Encoding, BigEndian);
+            m_littleEndian = false;
+        }
+        BOMLength = 2;
+    // EF BB BF: UTF-8.
+    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
+        if (m_encoding != TextEncoding(UTF8Encoding)) {
+            releaseTECConverter();
+            m_encoding = TextEncoding(UTF8Encoding);
+        }
+        BOMLength = 3;
+    }
+
+    // Handle case where we found a BOM.
+    if (BOMLength != 0) {
+        ASSERT(numBufferedBytes + len >= BOMLength);
+        // Number of BOM bytes that are located in chs rather than in the buffer.
+        int skip = BOMLength - numBufferedBytes;
+        m_numBufferedBytes = 0;
+        m_atStart = false;
+        return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
+    }
+
+    // Handle case where we know there is no BOM coming.
+    const int bufferSize = sizeof(m_bufferedBytes);
+    if (numBufferedBytes + len > bufferSize || flush) {
+        m_atStart = false;
+        if (numBufferedBytes == 0) {
+            return convert(chs, len, flush);
+        }
+        // Decode from a copy of the held-back bytes, so that m_bufferedBytes is
+        // free for convert() to use, and then decode the new input.
+        unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
+        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
+        m_numBufferedBytes = 0;
+        return convert(bufferedBytes, numBufferedBytes, false) + convert(chs, len, flush);
+    }
+
+    // Continue to look for the BOM.
+    memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
+    m_numBufferedBytes += len;
+    return "";
+}
+
+// Encodes a Unicode string into the effective encoding of m_encoding.
+//
+// qcs: the text to encode; it is NFC-normalized before conversion.
+// allowEntities: when false, '?' is passed to CFStringGetBytes as the loss
+//     byte; when true, 0 is passed and each character CFStringGetBytes stops
+//     at is written as a decimal character reference ("&#N;").
+// Returns the encoded bytes.
+DeprecatedCString StreamingTextDecoderMac::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
+{
+    // FIXME: We should really use the same API in both directions.
+    // Currently we use TEC to decode and CFString to encode; it would be better to encode with TEC too.
+    
+    TextEncoding encoding = m_encoding.effectiveEncoding();
+
+    // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
+    // Encoding will change the yen sign back into a backslash.
+    DeprecatedString copy = qcs;
+    copy.replace('\\', encoding.backslashAsCurrencySymbol());
+    CFStringRef cfs = copy.getCFString();
+    CFMutableStringRef cfms = CFStringCreateMutableCopy(0, 0, cfs); // in rare cases, normalization can make the string longer, thus no limit on its length
+    CFStringNormalize(cfms, kCFStringNormalizationFormC);
+    
+    CFIndex startPos = 0;
+    CFIndex charactersLeft = CFStringGetLength(cfms);
+    DeprecatedCString result(1); // for trailing zero
+
+    // Each pass converts as many characters as CFStringGetBytes accepts and,
+    // if it stopped early, emits a character reference for the next character.
+    while (charactersLeft > 0) {
+        CFRange range = CFRangeMake(startPos, charactersLeft);
+        CFIndex bufferLength;
+        // First call with a NULL buffer: only computes the number of bytes needed.
+        CFStringGetBytes(cfms, range, encoding.encodingID(), allowEntities ? 0 : '?', false, NULL, 0x7FFFFFFF, &bufferLength);
+        
+        DeprecatedCString chunk(bufferLength + 1);
+        unsigned char *buffer = reinterpret_cast<unsigned char *>(chunk.data());
+        CFIndex charactersConverted = CFStringGetBytes(cfms, range, encoding.encodingID(), allowEntities ? 0 : '?', false, buffer, bufferLength, &bufferLength);
+        buffer[bufferLength] = 0;
+        result.append(chunk);
+        
+        if (charactersConverted != charactersLeft) {
+            // Conversion stopped before the end of the range; the character at
+            // the stopping position is written as "&#N;" instead.
+            unsigned int badChar = CFStringGetCharacterAtIndex(cfms, startPos + charactersConverted);
+            ++charactersConverted;
+
+            if ((badChar & 0xfc00) == 0xd800 &&     // is high surrogate
+                  charactersConverted != charactersLeft) {
+                UniChar low = CFStringGetCharacterAtIndex(cfms, startPos + charactersConverted);
+                if ((low & 0xfc00) == 0xdc00) {     // is low surrogate
+                    // Combine the surrogate pair into a single code point.
+                    badChar <<= 10;
+                    badChar += low;
+                    badChar += 0x10000 - (0xd800 << 10) - 0xdc00;
+                    ++charactersConverted;
+                }
+            }
+            char buf[16];
+            sprintf(buf, "&#%u;", badChar);
+            result.append(buf);
+        }
+        
+        startPos += charactersConverted;
+        charactersLeft -= charactersConverted;
+    }
+    CFRelease(cfms);
+    return result;
+}
+
+} // namespace WebCore
diff --git a/WebCore/platform/mac/StreamingTextDecoderMac.h b/WebCore/platform/mac/StreamingTextDecoderMac.h
new file mode 100644 (file)
index 0000000..097794a
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef StreamingTextDecoderMac_H
+#define StreamingTextDecoderMac_H
+
+#include "StreamingTextDecoder.h"
+
+namespace WebCore {
+
+    // StreamingTextDecoder implementation for Mac OS X that decodes with the
+    // Text Encoding Converter (TEC) and encodes with CFString. It is kept for
+    // encodings that ICU does not support.
+    class StreamingTextDecoderMac : public StreamingTextDecoder {
+    public:
+        StreamingTextDecoderMac(const TextEncoding&);
+        virtual ~StreamingTextDecoderMac();
+
+        // True for UTF-16, or when a TEC converter exists for the encoding.
+        bool textEncodingSupported();
+
+        // Streaming decode of a run of bytes; `flush` marks the final run.
+        virtual DeprecatedString toUnicode(const char* chs, int len, bool flush = false);
+        // Encodes a Unicode string; with allowEntities, characters that cannot
+        // be converted are written as decimal character references.
+        virtual DeprecatedCString fromUnicode(const DeprecatedString&, bool allowEntities = false);
+
+    private:
+        // Convenience overload: forwards signed-char input to the unsigned one.
+        DeprecatedString convert(const char* chs, int len, bool flush)
+            { return convert(reinterpret_cast<const unsigned char*>(chs), len, flush); }
+        // Chooses between the UTF-16, pure-ASCII and TEC decoding paths.
+        DeprecatedString convert(const unsigned char *chs, int len, bool flush);
+
+        bool convertIfASCII(const unsigned char*, int len, DeprecatedString&);
+        DeprecatedString convertUTF16(const unsigned char*, int len);
+        DeprecatedString convertUsingTEC(const unsigned char*, int len, bool flush);
+        OSStatus convertOneChunkUsingTEC(const unsigned char *inputBuffer, int inputBufferLength, int &inputLength, void *outputBuffer, int outputBufferLength, int &outputLength);
+
+        // Acquire m_converterTEC / hand it over to the file-level converter cache.
+        OSStatus createTECConverter();
+        void releaseTECConverter();
+
+        // Note that the last parameter is a byte count, not a character count.
+        static void appendOmittingBOM(DeprecatedString&, const UChar* characters, int byteCount);
+
+        TextEncoding m_encoding;
+        bool m_littleEndian; // byte order used by convertUTF16()
+        bool m_atStart; // true until toUnicode() has settled whether the stream starts with a BOM
+        bool m_error; // set when TEC reports an unhandled error; toUnicode() then returns nothing
+
+        // Bytes held back between calls, and how many of them are valid.
+        unsigned m_numBufferedBytes;
+        unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character
+
+        TECObjectRef m_converterTEC; // null until createTECConverter() succeeds
+    };
+    
+} // namespace WebCore
+
+#endif // StreamingTextDecoderMac_H
diff --git a/WebCore/platform/mac/TextEncodingMac.cpp b/WebCore/platform/mac/TextEncodingMac.cpp
deleted file mode 100644 (file)
index 1450ae8..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */
-
-#include "config.h"
-#include "TextEncoding.h"
-
-#include <wtf/Assertions.h>
-#include <wtf/HashSet.h>
-#include "StreamingTextDecoder.h"
-
-namespace WebCore {
-
-static inline TextEncodingID effectiveEncoding(TextEncodingID encoding)
-{
-    if (encoding == Latin1Encoding || encoding == ASCIIEncoding)
-        return WinLatin1Encoding;
-    return encoding;
-}
-
-// We'd like to use ICU for this on OS X as well eventually, but we need to make sure
-// it covers all the encodings that we need
-DeprecatedCString TextEncoding::fromUnicode(const DeprecatedString &qcs, bool allowEntities) const
-{
-    // FIXME: We should really use the same API in both directions.
-    // Currently we use ICU to decode and CFString to encode; it would be better to encode with ICU too.
-    
-    TextEncodingID encoding = effectiveEncoding(m_encodingID);
-
-    // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
-    // Encoding will change the yen sign back into a backslash.
-    DeprecatedString copy = qcs;
-    copy.replace('\\', backslashAsCurrencySymbol());
-    CFStringRef cfs = copy.getCFString();
-    CFMutableStringRef cfms = CFStringCreateMutableCopy(0, 0, cfs); // in rare cases, normalization can make the string longer, thus no limit on its length
-    CFStringNormalize(cfms, kCFStringNormalizationFormC);
-    
-    CFIndex startPos = 0;
-    CFIndex charactersLeft = CFStringGetLength(cfms);
-    DeprecatedCString result(1); // for trailing zero
-
-    while (charactersLeft > 0) {
-        CFRange range = CFRangeMake(startPos, charactersLeft);
-        CFIndex bufferLength;
-        CFStringGetBytes(cfms, range, encoding, allowEntities ? 0 : '?', false, NULL, 0x7FFFFFFF, &bufferLength);
-        
-        DeprecatedCString chunk(bufferLength + 1);
-        unsigned char *buffer = reinterpret_cast<unsigned char *>(chunk.data());
-        CFIndex charactersConverted = CFStringGetBytes(cfms, range, encoding, allowEntities ? 0 : '?', false, buffer, bufferLength, &bufferLength);
-        buffer[bufferLength] = 0;
-        result.append(chunk);
-        
-        if (charactersConverted != charactersLeft) {
-            unsigned int badChar = CFStringGetCharacterAtIndex(cfms, startPos + charactersConverted);
-            ++charactersConverted;
-
-            if ((badChar & 0xfc00) == 0xd800 &&     // is high surrogate
-                  charactersConverted != charactersLeft) {
-                UniChar low = CFStringGetCharacterAtIndex(cfms, startPos + charactersConverted);
-                if ((low & 0xfc00) == 0xdc00) {     // is low surrogate
-                    badChar <<= 10;
-                    badChar += low;
-                    badChar += 0x10000 - (0xd800 << 10) - 0xdc00;
-                    ++charactersConverted;
-                }
-            }
-            char buf[16];
-            sprintf(buf, "&#%u;", badChar);
-            result.append(buf);
-        }
-        
-        startPos += charactersConverted;
-        charactersLeft -= charactersConverted;
-    }
-    CFRelease(cfms);
-    return result;
-}
-
-} // namespace WebCore