2007-07-06 Jungshik Shin <jungshik.shin@gmail.com>
author bdash <bdash@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 6 Jul 2007 10:00:45 +0000 (10:00 +0000)
committer bdash <bdash@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 6 Jul 2007 10:00:45 +0000 (10:00 +0000)
        Reviewed by Alexey.

        - Add UTF-32 encoding support
          http://bugs.webkit.org/show_bug.cgi?id=13415

        Test:
         - fast/encoding/utf-32-big-endian-bom.html
         - fast/encoding/utf-32-big-endian-nobom.xml
         - fast/encoding/utf-32-little-endian-bom.html
         - fast/encoding/utf-32-little-endian-nobom.xml

        * loader/TextResourceDecoder.cpp:
        (WebCore::TextResourceDecoder::checkForBOM):
        (WebCore::TextResourceDecoder::checkForHeadCharset):
        * platform/TextDecoder.cpp:
        (WebCore::TextDecoder::checkForBOM):
        * platform/TextDecoder.h:
        * platform/TextEncoding.cpp:
        (WebCore::UTF32BigEndianEncoding):
        (WebCore::UTF32LittleEndianEncoding):
        * platform/TextEncoding.h:

2007-07-06  Jungshik Shin <jungshik.shin@gmail.com>

        Reviewed by Alexey.

       - test for http://bugs.webkit.org/show_bug.cgi?id=13415

        * fast/encoding/utf-32-big-endian-bom-expected.txt: Added.
        * fast/encoding/utf-32-big-endian-bom.html: Added.
        * fast/encoding/utf-32-big-endian-nobom-expected.txt: Added.
        * fast/encoding/utf-32-big-endian-nobom.xml: Added.
        * fast/encoding/utf-32-little-endian-bom-expected.txt: Added.
        * fast/encoding/utf-32-little-endian-bom.html: Added.
        * fast/encoding/utf-32-little-endian-nobom-expected.txt: Added.
        * fast/encoding/utf-32-little-endian-nobom.xml: Added.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@24052 268f45cc-cd09-0410-ab3c-d52691b4dbfc

15 files changed:
LayoutTests/ChangeLog
LayoutTests/fast/encoding/utf-32-big-endian-bom-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-big-endian-bom.html [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-big-endian-nobom-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-big-endian-nobom.xml [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-little-endian-bom-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-little-endian-bom.html [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-little-endian-nobom-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/utf-32-little-endian-nobom.xml [new file with mode: 0644]
WebCore/ChangeLog
WebCore/loader/TextResourceDecoder.cpp
WebCore/platform/TextDecoder.cpp
WebCore/platform/TextDecoder.h
WebCore/platform/TextEncoding.cpp
WebCore/platform/TextEncoding.h

diff --git a/LayoutTests/ChangeLog b/LayoutTests/ChangeLog
index b60ee43..65f695b 100644 (file)
@@ -1,3 +1,18 @@
+2007-07-06  Jungshik Shin <jungshik.shin@gmail.com>
+
+        Reviewed by Alexey.
+
+       - test for http://bugs.webkit.org/show_bug.cgi?id=13415
+
+        * fast/encoding/utf-32-big-endian-bom-expected.txt: Added.
+        * fast/encoding/utf-32-big-endian-bom.html: Added.
+        * fast/encoding/utf-32-big-endian-nobom-expected.txt: Added.
+        * fast/encoding/utf-32-big-endian-nobom.xml: Added.
+        * fast/encoding/utf-32-little-endian-bom-expected.txt: Added.
+        * fast/encoding/utf-32-little-endian-bom.html: Added.
+        * fast/encoding/utf-32-little-endian-nobom-expected.txt: Added.
+        * fast/encoding/utf-32-little-endian-nobom.xml: Added.
+
 2007-07-06  Rob Buis  <buis@kde.org>
 
         Reviewed by Adam.
diff --git a/LayoutTests/fast/encoding/utf-32-big-endian-bom-expected.txt b/LayoutTests/fast/encoding/utf-32-big-endian-bom-expected.txt
new file mode 100644 (file)
index 0000000..e7f2263
--- /dev/null
@@ -0,0 +1,7 @@
+When dumped into a text file by the test controller, this needs to be viewed as UTF-8:
+
+This is an em dash, —, this is is a delta symbol, ∆, this is an uppercase pi, ∏, and this is a lowercase pi, π.
+
+Success : UTF-32BE
+
+
diff --git a/LayoutTests/fast/encoding/utf-32-big-endian-bom.html b/LayoutTests/fast/encoding/utf-32-big-endian-bom.html
new file mode 100644 (file)
index 0000000..bab8e9c
Binary files /dev/null and b/LayoutTests/fast/encoding/utf-32-big-endian-bom.html differ
diff --git a/LayoutTests/fast/encoding/utf-32-big-endian-nobom-expected.txt b/LayoutTests/fast/encoding/utf-32-big-endian-nobom-expected.txt
new file mode 100644 (file)
index 0000000..e7f2263
--- /dev/null
@@ -0,0 +1,7 @@
+When dumped into a text file by the test controller, this needs to be viewed as UTF-8:
+
+This is an em dash, —, this is is a delta symbol, ∆, this is an uppercase pi, ∏, and this is a lowercase pi, π.
+
+Success : UTF-32BE
+
+
diff --git a/LayoutTests/fast/encoding/utf-32-big-endian-nobom.xml b/LayoutTests/fast/encoding/utf-32-big-endian-nobom.xml
new file mode 100644 (file)
index 0000000..78dc012
Binary files /dev/null and b/LayoutTests/fast/encoding/utf-32-big-endian-nobom.xml differ
diff --git a/LayoutTests/fast/encoding/utf-32-little-endian-bom-expected.txt b/LayoutTests/fast/encoding/utf-32-little-endian-bom-expected.txt
new file mode 100644 (file)
index 0000000..b8f61cf
--- /dev/null
@@ -0,0 +1,7 @@
+When dumped into a text file by the test controller, this needs to be viewed as UTF-8:
+
+This is an em dash, —, this is is a delta symbol, ∆, this is an uppercase pi, ∏, and this is a lowercase pi, π.
+
+Success : UTF-32LE
+
+
diff --git a/LayoutTests/fast/encoding/utf-32-little-endian-bom.html b/LayoutTests/fast/encoding/utf-32-little-endian-bom.html
new file mode 100644 (file)
index 0000000..2f566b4
Binary files /dev/null and b/LayoutTests/fast/encoding/utf-32-little-endian-bom.html differ
diff --git a/LayoutTests/fast/encoding/utf-32-little-endian-nobom-expected.txt b/LayoutTests/fast/encoding/utf-32-little-endian-nobom-expected.txt
new file mode 100644 (file)
index 0000000..b8f61cf
--- /dev/null
@@ -0,0 +1,7 @@
+When dumped into a text file by the test controller, this needs to be viewed as UTF-8:
+
+This is an em dash, —, this is is a delta symbol, ∆, this is an uppercase pi, ∏, and this is a lowercase pi, π.
+
+Success : UTF-32LE
+
+
diff --git a/LayoutTests/fast/encoding/utf-32-little-endian-nobom.xml b/LayoutTests/fast/encoding/utf-32-little-endian-nobom.xml
new file mode 100644 (file)
index 0000000..af7a8f9
Binary files /dev/null and b/LayoutTests/fast/encoding/utf-32-little-endian-nobom.xml differ
diff --git a/WebCore/ChangeLog b/WebCore/ChangeLog
index 370c465..8daa8c8 100644 (file)
@@ -1,3 +1,27 @@
+2007-07-06  Jungshik Shin  <jungshik.shin@gmail.com>
+
+        Reviewed by Alexey.
+
+        - Add UTF-32 encoding support 
+          http://bugs.webkit.org/show_bug.cgi?id=13415
+
+        Test:
+         - fast/encoding/utf-32-big-endian-bom.html
+         - fast/encoding/utf-32-big-endian-nobom.xml
+         - fast/encoding/utf-32-little-endian-bom.html
+         - fast/encoding/utf-32-little-endian-nobom.xml
+
+        * loader/TextResourceDecoder.cpp:
+        (WebCore::TextResourceDecoder::checkForBOM):
+        (WebCore::TextResourceDecoder::checkForHeadCharset):
+        * platform/TextDecoder.cpp:
+        (WebCore::TextDecoder::checkForBOM):
+        * platform/TextDecoder.h:
+        * platform/TextEncoding.cpp:
+        (WebCore::UTF32BigEndianEncoding):
+        (WebCore::UTF32LittleEndianEncoding):
+        * platform/TextEncoding.h:
+
 2007-07-06  Holger Hans Peter Freyther  <zecke@selfish.org>
 
         Reviewed by Maciej.
diff --git a/WebCore/loader/TextResourceDecoder.cpp b/WebCore/loader/TextResourceDecoder.cpp
index 6b6aa7d..d99efa2 100644 (file)
@@ -346,7 +346,7 @@ static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
 
 void TextResourceDecoder::checkForBOM(const char* data, size_t len)
 {
-    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
+    // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
 
     if (m_source == UserChosenEncoding) {
         // FIXME: Maybe a BOM should override even a user-chosen encoding.
@@ -356,27 +356,34 @@ void TextResourceDecoder::checkForBOM(const char* data, size_t len)
 
     // Check if we have enough data.
     size_t bufferLength = m_buffer.size();
-    if (bufferLength + len < 3)
+    if (bufferLength + len < 4)
         return;
 
     m_checkedForBOM = true;
 
-    // Extract the first three bytes.
+    // Extract the first four bytes.
     // Handle the case where some of bytes are already in the buffer.
     // The last byte is always guaranteed to not be in the buffer.
     const unsigned char* udata = reinterpret_cast<const unsigned char*>(data);
     unsigned char c1 = bufferLength >= 1 ? m_buffer[0] : *udata++;
     unsigned char c2 = bufferLength >= 2 ? m_buffer[1] : *udata++;
-    ASSERT(bufferLength < 3);
-    unsigned char c3 = *udata;
+    unsigned char c3 = bufferLength >= 3 ? m_buffer[2] : *udata++;
+    ASSERT(bufferLength < 4);
+    unsigned char c4 = *udata;
 
     // Check for the BOM.
-    if (c1 == 0xFE && c2 == 0xFF)
-        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
-    else if (c1 == 0xFF && c2 == 0xFE)
-        setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
+    if (c1 == 0xFF && c2 == 0xFE) {
+        if (c3 !=0 || c4 != 0)
+            setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
+        else 
+            setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
+    }
     else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
         setEncoding(UTF8Encoding(), AutoDetectedEncoding);
+    else if (c1 == 0xFE && c2 == 0xFF)
+        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
+    else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
+        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
 }
 
 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
@@ -519,7 +526,11 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool
                 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
             } else if (ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x' && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') {
                 // UTF-16 without BOM
-                setEncoding(((ptr - m_buffer.data()) % 2) ? "UTF-16LE" : "UTF-16BE", AutoDetectedEncoding);
+                setEncoding(((ptr - m_buffer.data()) % 2) ? UTF16LittleEndianEncoding() : UTF16BigEndianEncoding(), AutoDetectedEncoding);
+                return true;
+            } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 'x') {
+                // UTF-32 without BOM
+                setEncoding(((ptr - m_buffer.data()) % 4) ? UTF32LittleEndianEncoding() : UTF32BigEndianEncoding(), AutoDetectedEncoding);
                 return true;
             }
 
diff --git a/WebCore/platform/TextDecoder.cpp b/WebCore/platform/TextDecoder.cpp
index 23f82cb..8633e9f 100644 (file)
@@ -57,16 +57,28 @@ String TextDecoder::checkForBOM(const char* data, size_t length, bool flush)
     const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
     unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
     unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
-    unsigned char c3 = buf2Len ? (--buf2Len, *buf2++) : 0;
+    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
 
     const TextEncoding* encodingConsideringBOM = &m_encoding;
-    if (c1 == 0xFF && c2 == 0xFE)
-        encodingConsideringBOM = &UTF16LittleEndianEncoding();
-    else if (c1 == 0xFE && c2 == 0xFF)
-        encodingConsideringBOM = &UTF16BigEndianEncoding();
+    bool foundBOM = true;
+    if (c1 == 0xFF && c2 == 0xFE) {
+        if (c3 != 0 || c4 != 0) 
+            encodingConsideringBOM = &UTF16LittleEndianEncoding();
+        else if (numBufferedBytes + length > sizeof(m_bufferedBytes))
+            encodingConsideringBOM = &UTF32LittleEndianEncoding();
+        else
+            foundBOM = false;
+    }
     else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
         encodingConsideringBOM = &UTF8Encoding();
-    else if (numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
+    else if (c1 == 0xFE && c2 == 0xFF)
+        encodingConsideringBOM = &UTF16BigEndianEncoding();
+    else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
+        encodingConsideringBOM = &UTF32BigEndianEncoding();
+    else
+        foundBOM = false;
+    if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
         // Continue to look for the BOM.
         memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
         m_numBufferedBytes += length;
diff --git a/WebCore/platform/TextDecoder.h b/WebCore/platform/TextDecoder.h
index 2464ac9..3892032 100644 (file)
@@ -56,7 +56,7 @@ namespace WebCore {
 
         bool m_checkedForBOM;
         unsigned char m_numBufferedBytes;
-        unsigned char m_bufferedBytes[2];
+        unsigned char m_bufferedBytes[3];
     };
 
 } // namespace WebCore
diff --git a/WebCore/platform/TextEncoding.cpp b/WebCore/platform/TextEncoding.cpp
index ef73adc..d440b9d 100644 (file)
@@ -185,6 +185,19 @@ const TextEncoding& UTF16LittleEndianEncoding()
     return globalUTF16LittleEndianEncoding;
 }
 
+const TextEncoding& UTF32BigEndianEncoding()
+{
+    static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
+    return globalUTF32BigEndianEncoding;
+}
+
+const TextEncoding& UTF32LittleEndianEncoding()
+{
+    static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
+    return globalUTF32LittleEndianEncoding;
+}
+
+
 const TextEncoding& UTF8Encoding()
 {
     static TextEncoding globalUTF8Encoding("UTF-8");
diff --git a/WebCore/platform/TextEncoding.h b/WebCore/platform/TextEncoding.h
index 89b276b..59d225c 100644 (file)
@@ -60,6 +60,8 @@ namespace WebCore {
     const TextEncoding& Latin1Encoding();
     const TextEncoding& UTF16BigEndianEncoding();
     const TextEncoding& UTF16LittleEndianEncoding();
+    const TextEncoding& UTF32BigEndianEncoding();
+    const TextEncoding& UTF32LittleEndianEncoding();
     const TextEncoding& UTF8Encoding();
     const TextEncoding& WindowsLatin1Encoding();