WebCore:
authordarin@apple.com <darin@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Mon, 17 Mar 2008 03:47:03 +0000 (03:47 +0000)
committerdarin@apple.com <darin@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Mon, 17 Mar 2008 03:47:03 +0000 (03:47 +0000)
2008-03-16  Marvin Decker  <marv.decker@gmail.com>

        Reviewed by Darin.

        Fix bug 15119: URL query characters that are unencodable in the
        request's character set should be converted to XML entities with
        non-alphanumeric characters escaped.

        Test: http/tests/uri/escaped-entity.html

        * html/FormDataList.cpp:
        (WebCore::FormDataList::appendString):
        * html/HTMLFormElement.cpp:
        (WebCore::HTMLFormElement::formData):
        * platform/KURL.cpp:
        (WebCore::encodeRelativeString):
        * platform/text/String.cpp:
        (WebCore::String::latin1):
        (WebCore::String::utf8):
        * platform/text/TextCodec.cpp:
        (WebCore::TextCodec::unencodableCharReplacement):
        * platform/text/TextCodec.h:
        (WebCore::):
        * platform/text/TextCodecICU.cpp:
        (WebCore::urlEscapedEntityCallback):
        (WebCore::gbkUrlEscapedEntityCallack):
        (WebCore::TextCodecICU::encode):
        * platform/text/TextCodecICU.h:
        (WebCore::TextCodecICU::setNeedsGBKFallbacks):
        * platform/text/TextCodecLatin1.cpp:
        (WebCore::encodeComplexWindowsLatin1):
        (WebCore::TextCodecLatin1::encode):
        * platform/text/TextCodecLatin1.h:
        * platform/text/TextCodecUTF16.cpp:
        (WebCore::TextCodecUTF16::encode):
        * platform/text/TextCodecUTF16.h:
        * platform/text/TextCodecUserDefined.cpp:
        (WebCore::encodeComplexUserDefined):
        (WebCore::TextCodecUserDefined::encode):
        * platform/text/TextCodecUserDefined.h:
        * platform/text/TextEncoding.cpp:
        (WebCore::TextEncoding::encode):
        * platform/text/TextEncoding.h:
        * platform/text/mac/TextCodecMac.cpp:
        (WebCore::TextCodecMac::encode):
        * platform/text/mac/TextCodecMac.h:
        * xml/XMLHttpRequest.cpp:
        (WebCore::XMLHttpRequest::send):

LayoutTests:

2008-03-16  Marvin Decker  <marv.decker@gmail.com>

        Reviewed by Darin.

        Fix bug 15119, unencodable characters in URLs should be entity-escaped.

        * http/tests/uri/escaped-entity-expected.txt: Added.
        * http/tests/uri/escaped-entity.html: Added.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@31089 268f45cc-cd09-0410-ab3c-d52691b4dbfc

23 files changed:
LayoutTests/ChangeLog
LayoutTests/http/tests/uri/escaped-entity-expected.txt [new file with mode: 0644]
LayoutTests/http/tests/uri/escaped-entity.html [new file with mode: 0644]
WebCore/ChangeLog
WebCore/html/FormDataList.cpp
WebCore/html/HTMLFormElement.cpp
WebCore/platform/KURL.cpp
WebCore/platform/text/String.cpp
WebCore/platform/text/TextCodec.cpp
WebCore/platform/text/TextCodec.h
WebCore/platform/text/TextCodecICU.cpp
WebCore/platform/text/TextCodecICU.h
WebCore/platform/text/TextCodecLatin1.cpp
WebCore/platform/text/TextCodecLatin1.h
WebCore/platform/text/TextCodecUTF16.cpp
WebCore/platform/text/TextCodecUTF16.h
WebCore/platform/text/TextCodecUserDefined.cpp
WebCore/platform/text/TextCodecUserDefined.h
WebCore/platform/text/TextEncoding.cpp
WebCore/platform/text/TextEncoding.h
WebCore/platform/text/mac/TextCodecMac.cpp
WebCore/platform/text/mac/TextCodecMac.h
WebCore/xml/XMLHttpRequest.cpp

index f66e724..43a33bb 100644 (file)
@@ -1,3 +1,12 @@
+2008-03-16  Marvin Decker  <marv.decker@gmail.com>
+
+        Reviewed by Darin.
+
+        Fix bug 15119, unencodable characters in URLs should be entity-escaped.
+
+        * http/tests/uri/escaped-entity-expected.txt: Added.
+        * http/tests/uri/escaped-entity.html: Added.
+
 2008-03-16  Darin Adler  <darin@apple.com>
 
         Reviewed by Mark Rowe.
diff --git a/LayoutTests/http/tests/uri/escaped-entity-expected.txt b/LayoutTests/http/tests/uri/escaped-entity-expected.txt
new file mode 100644 (file)
index 0000000..e9931ee
--- /dev/null
@@ -0,0 +1,10 @@
+Test for bug 15119: Unrepresentable characters in a URL's character set should be converted to escaped entities. We use the character U+06DE (۞) which does not exist in Big-5.
+
+Note that this exact page won't work in IE or Firefox. Firefox seems to always use UTF-8 for local files, and IE actually preserves the Unicode in the URL when we get it from JS, so we don't know what would get sent over the wire. However, both browsers will send %26%231758%3B over HTTP for the query.
+
+"/uri/intercept/print/script.js?%26%231758%3B" (no target charset specified, should be Big5)
+"/uri/intercept/print/script.js?%26%231758%3B" (Big5 specified)
+Show the source attribute of the scripts.
+"http://127.0.0.1:8000/uri/intercept/print/script.js?%26%231758%3B"
+"http://127.0.0.1:8000/uri/intercept/print/script.js?%26%231758%3B"
+
diff --git a/LayoutTests/http/tests/uri/escaped-entity.html b/LayoutTests/http/tests/uri/escaped-entity.html
new file mode 100644 (file)
index 0000000..44cd0b1
--- /dev/null
@@ -0,0 +1,39 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=big5">
+</head>
+<body>
+
+<script>
+    if (window.layoutTestController)
+        layoutTestController.dumpAsText();
+</script>
+
+<p>Test for <a href='http://bugs.webkit.org/show_bug.cgi?id=15119'>bug 15119</a>:
+Unrepresentable characters in a URL's character set should be converted to
+escaped entities. We use the character U+06DE (&#x6de;) which does not exist in
+Big-5.</p>
+
+<p>Note that this exact page won't work in IE or Firefox. Firefox seems to
+always use UTF-8 for local files, and IE actually preserves the Unicode in the
+URL when we get it from JS, so we don't know what would get sent over the wire.
+However, both browsers will send <tt>%26%231758%3B</tt> over HTTP for the
+query.</p>
+
+"<script id=scr1 src="intercept/print/script.js?&#x6DE;"></script>" (no target charset specified, should be Big5)<br>
+"<script id=scr2 charset="big5" src="intercept/print/script.js?&#x6DE;"></script>" (Big5 specified)<br>
+
+Show the source attribute of the scripts.<br>
+<script>
+ try {
+  document.write('"' + document.scripts[1].src + '"<br>');
+  document.write('"' + document.scripts[2].src + '"<br>');
+ } catch (ex) {
+  document.write('"' + document.getElementById("scr1").src + '"<br>');
+  document.write('"' + document.getElementById("scr2").src + '"<br>');
+ }
+</script>
+
+</body>
+</html>
+
index 4e07a21..2cf8885 100644 (file)
@@ -1,3 +1,52 @@
+2008-03-16  Marvin Decker  <marv.decker@gmail.com>
+
+        Reviewed by Darin.
+
+        Fix bug 15119: URL query characters that are unencodable in the
+        request's character set should be converted to XML entities with
+        non-alphanumeric characters escaped.
+
+        Test: http/tests/uri/escaped-entity.html
+
+        * html/FormDataList.cpp:
+        (WebCore::FormDataList::appendString):
+        * html/HTMLFormElement.cpp:
+        (WebCore::HTMLFormElement::formData):
+        * platform/KURL.cpp:
+        (WebCore::encodeRelativeString):
+        * platform/text/String.cpp:
+        (WebCore::String::latin1):
+        (WebCore::String::utf8):
+        * platform/text/TextCodec.cpp:
+        (WebCore::TextCodec::unencodableCharReplacement):
+        * platform/text/TextCodec.h:
+        (WebCore::):
+        * platform/text/TextCodecICU.cpp:
+        (WebCore::urlEscapedEntityCallback):
+        (WebCore::gbkUrlEscapedEntityCallack):
+        (WebCore::TextCodecICU::encode):
+        * platform/text/TextCodecICU.h:
+        (WebCore::TextCodecICU::setNeedsGBKFallbacks):
+        * platform/text/TextCodecLatin1.cpp:
+        (WebCore::encodeComplexWindowsLatin1):
+        (WebCore::TextCodecLatin1::encode):
+        * platform/text/TextCodecLatin1.h:
+        * platform/text/TextCodecUTF16.cpp:
+        (WebCore::TextCodecUTF16::encode):
+        * platform/text/TextCodecUTF16.h:
+        * platform/text/TextCodecUserDefined.cpp:
+        (WebCore::encodeComplexUserDefined):
+        (WebCore::TextCodecUserDefined::encode):
+        * platform/text/TextCodecUserDefined.h:
+        * platform/text/TextEncoding.cpp:
+        (WebCore::TextEncoding::encode):
+        * platform/text/TextEncoding.h:
+        * platform/text/mac/TextCodecMac.cpp:
+        (WebCore::TextCodecMac::encode):
+        * platform/text/mac/TextCodecMac.h:
+        * xml/XMLHttpRequest.cpp:
+        (WebCore::XMLHttpRequest::send):
+
 2008-03-16  Kevin Ollivier  <kevino@theolliviers.com>
 
         Rubber stamped by Darin.
index 8698666..15ca9a8 100644 (file)
@@ -90,7 +90,7 @@ static CString fixLineBreaks(const CString &s)
 
 void FormDataList::appendString(const String& s)
 {
-    CString cstr = fixLineBreaks(m_encoding.encode(s.characters(), s.length(), true));
+    CString cstr = fixLineBreaks(m_encoding.encode(s.characters(), s.length(), EntitiesForUnencodables));
     m_list.append(cstr);
 }
 
index e2b0ea6..b35a575 100644 (file)
@@ -295,7 +295,7 @@ PassRefPtr<FormData> HTMLFormElement::formData(const char* boundary) const
                         // things if the filename includes characters you can't encode
                         // in the website's character set.
                         appendString(header, "; filename=\"");
-                        appendString(header, encoding.encode(filename.characters(), filename.length(), true));
+                        appendString(header, encoding.encode(filename.characters(), filename.length(), QuestionMarksForUnencodables));
                         header.append('"');
 
                         if (!path.isEmpty()) {
index f402888..84fdeb6 100644 (file)
@@ -1446,12 +1446,14 @@ static void encodeRelativeString(const String& rel, const TextEncoding& encoding
     }
 
     if (pathEnd == -1) {
-        CString decoded = pathEncoding.encode(s.data(), s.size());
+        CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables);
         output.resize(decoded.length());
         memcpy(output.data(), decoded.data(), decoded.length());
     } else {
-        CString pathDecoded = pathEncoding.encode(s.data(), pathEnd);
-        CString otherDecoded = otherEncoding.encode(s.data() + pathEnd, s.size() - pathEnd);
+        CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables);
+        // Unencodable characters in URLs are represented by converting
+        // them to XML entities and escaping non-alphanumeric characters.
+        CString otherDecoded = otherEncoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables);
 
         output.resize(pathDecoded.length() + otherDecoded.length());
         memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
index 9c3f4b1..6ceed2e 100644 (file)
@@ -570,12 +570,12 @@ Vector<char> String::ascii() const
 
 CString String::latin1() const
 {
-    return Latin1Encoding().encode(characters(), length());
+    return Latin1Encoding().encode(characters(), length(), QuestionMarksForUnencodables);
 }
     
 CString String::utf8() const
 {
-    return UTF8Encoding().encode(characters(), length());
+    return UTF8Encoding().encode(characters(), length(), QuestionMarksForUnencodables);
 }
 
 String String::fromUTF8(const char* string, size_t size)
index 1985c49..83edf73 100644 (file)
@@ -28,6 +28,7 @@
 #include "TextCodec.h"
 
 #include "PlatformString.h"
+#include <wtf/StringExtras.h>
 
 namespace WebCore {
 
@@ -53,4 +54,23 @@ void TextCodec::appendOmittingBOM(Vector<UChar>& v, const UChar* characters, siz
         v.append(&characters[start], length - start);
 }
 
+int TextCodec::getUnencodableReplacement(unsigned codePoint, UnencodableHandling handling, UnencodableReplacementArray replacement)
+{
+    switch (handling) {
+        case QuestionMarksForUnencodables:
+            replacement[0] = '?';
+            replacement[1] = 0;
+            return 1;
+        case EntitiesForUnencodables:
+            snprintf(replacement, sizeof(UnencodableReplacementArray), "&#%u;", codePoint);
+            return static_cast<int>(strlen(replacement));
+        case URLEncodedEntitiesForUnencodables:
+            snprintf(replacement, sizeof(UnencodableReplacementArray), "%%26%%23%u%%3B", codePoint);
+            return static_cast<int>(strlen(replacement));
+    }
+    ASSERT_NOT_REACHED();
+    replacement[0] = 0;
+    return 0;
+}
+
 } // namespace WebCore
index 77ffcf4..9d4f507 100644 (file)
@@ -38,12 +38,35 @@ namespace WebCore {
     class String;
     class TextEncoding;
 
+    // Specifies what will happen when a character is encountered that is
+    // not encodable in the character set.
+    enum UnencodableHandling {
+        // Substitutes the replacement character "?".
+        QuestionMarksForUnencodables,
+
+        // Encodes the character as an XML entity. For example, U+06DE
+        // would be "&#1758;" (0x6DE = 1758 in decimal).
+        EntitiesForUnencodables,
+
+        // Encodes the character as an entity as above, but with
+        // non-alphanumeric characters escaped. This is used in URLs.
+        // For example, U+06DE would be "%26%231758%3B".
+        URLEncodedEntitiesForUnencodables,
+    };
+
+    typedef char UnencodableReplacementArray[32];
+
     class TextCodec : Noncopyable {
     public:
         virtual ~TextCodec();
 
         virtual String decode(const char*, size_t length, bool flush = false) = 0;
-        virtual CString encode(const UChar*, size_t length, bool allowEntities = false) = 0;
+        virtual CString encode(const UChar*, size_t length, UnencodableHandling) = 0;
+
+        // Fills a null-terminated string representation of the given
+        // unencodable character into the given replacement buffer. 
+        // The length of the string (not including the null) will be returned.
+        static int getUnencodableReplacement(unsigned codePoint, UnencodableHandling, UnencodableReplacementArray);
 
     protected:
         static void appendOmittingBOM(Vector<UChar>&, const UChar*, size_t length);
index 39c0233..8579fff 100644 (file)
@@ -40,7 +40,7 @@ using std::min;
 namespace WebCore {
 
 const size_t ConversionBufferSize = 16384;
-    
+
 static UConverter* cachedConverterICU;
 
 static auto_ptr<TextCodec> newTextCodecICU(const TextEncoding& encoding, const void*)
@@ -285,6 +285,22 @@ static UChar getGbkEscape(UChar32 codePoint)
     }
 }
 
+// Invalid character handler when writing escaped entities for unrepresentable
+// characters. See URLEncodedEntitiesForUnencodables in TextCodec.h for more.
+static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
+                                     UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err)
+{
+    if (reason == UCNV_UNASSIGNED) {
+        *err = U_ZERO_ERROR;
+
+        UnencodableReplacementArray entity;
+        int entityLen = TextCodec::getUnencodableReplacement(codePoint, URLEncodedEntitiesForUnencodables, entity);
+        ucnv_cbFromUWriteBytes(fromUArgs, entity, entityLen, 0, err);
+    } else
+        UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
+}
+
+// Substitutes special GBK characters, escaping all other unassigned entities.
 static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                               UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 
 {
@@ -298,6 +314,23 @@ static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fr
     UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
 }
 
+// Combines both urlEscapedEntityCallback and GBK character substitution.
+static void gbkUrlEscapedEntityCallack(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
+                                       UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 
+{
+    if (reason == UCNV_UNASSIGNED) {
+        if (UChar outChar = getGbkEscape(codePoint)) {
+            const UChar* source = &outChar;
+            *err = U_ZERO_ERROR;
+            ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);
+            return;
+        }
+        urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, reason, err);
+        return;
+    }
+    UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
+}
+
 static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                                   UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 
 {
@@ -311,7 +344,7 @@ static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs
     UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
 }
 
-CString TextCodecICU::encode(const UChar* characters, size_t length, bool allowEntities)
+CString TextCodecICU::encode(const UChar* characters, size_t length, UnencodableHandling handling)
 {
     if (!length)
         return "";
@@ -329,14 +362,20 @@ CString TextCodecICU::encode(const UChar* characters, size_t length, bool allowE
 
     const UChar* source = copy.characters();
     const UChar* sourceLimit = source + copy.length();
-    
+
     UErrorCode err = U_ZERO_ERROR;
 
-    if (allowEntities)
-        ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
-    else {
-        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
-        ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
+    switch (handling) {
+        case QuestionMarksForUnencodables:
+            ucnv_setSubstChars(m_converterICU, "?", 1, &err);
+            ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
+            break;
+        case EntitiesForUnencodables:
+            ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
+            break;
+        case URLEncodedEntitiesForUnencodables:
+            ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err);
+            break;
     }
 
     ASSERT(U_SUCCESS(err));
index c2a30b1..f5c664d 100644 (file)
@@ -46,17 +46,17 @@ namespace WebCore {
         virtual ~TextCodecICU();
 
         virtual String decode(const char*, size_t length, bool flush = false);
-        virtual CString encode(const UChar*, size_t length, bool allowEntities = false);
+        virtual CString encode(const UChar*, size_t length, UnencodableHandling);
 
     private:
         void createICUConverter() const;
         void releaseICUConverter() const;
         bool needsGBKFallbacks() const { return m_needsGBKFallbacks; }
-        void setNeedsGBKFallbacks(bool needsFallbacks) { m_needsGBKFallbacks = needsFallbacks; } 
+        void setNeedsGBKFallbacks(bool needsFallbacks) { m_needsGBKFallbacks = needsFallbacks; }
 
         TextEncoding m_encoding;
         unsigned m_numBufferedBytes;
-        unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character        
+        unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character
         mutable UConverter* m_converterICU;
         mutable bool m_needsGBKFallbacks;
     };
index 2e9d116..e8ef690 100644 (file)
@@ -142,7 +142,7 @@ String TextCodecLatin1::decode(const char* bytes, size_t length, bool)
     return String::adopt(characters);
 }
 
-static CString encodeComplexWindowsLatin1(const UChar* characters, size_t length, bool allowEntities)
+static CString encodeComplexWindowsLatin1(const UChar* characters, size_t length, UnencodableHandling handling)
 {
     Vector<char> result(length);
     char* bytes = result.data();
@@ -159,17 +159,13 @@ static CString encodeComplexWindowsLatin1(const UChar* characters, size_t length
                 if (table[b] == c)
                     goto gotByte;
             // No way to encode this character with Windows Latin-1.
-            if (allowEntities) {
-                char entityBuffer[16];
-                sprintf(entityBuffer, "&#%u;", c);
-                size_t entityLength = strlen(entityBuffer);
-                result.grow(resultLength + entityLength + length - i);
-                bytes = result.data();
-                memcpy(bytes + resultLength, entityBuffer, entityLength);
-                resultLength += entityLength;
-                continue;
-            }
-            b = '?';
+            UnencodableReplacementArray replacement;
+            int replacementLength = TextCodec::getUnencodableReplacement(c, handling, replacement);
+            result.grow(resultLength + replacementLength + length - i);
+            bytes = result.data();
+            memcpy(bytes + resultLength, replacement, replacementLength);
+            resultLength += replacementLength;
+            continue;
         }
     gotByte:
         bytes[resultLength++] = b;
@@ -178,7 +174,7 @@ static CString encodeComplexWindowsLatin1(const UChar* characters, size_t length
     return CString(bytes, resultLength);
 }
 
-CString TextCodecLatin1::encode(const UChar* characters, size_t length, bool allowEntities)
+CString TextCodecLatin1::encode(const UChar* characters, size_t length, UnencodableHandling handling)
 {
     {
         char* bytes;
@@ -197,7 +193,7 @@ CString TextCodecLatin1::encode(const UChar* characters, size_t length, bool all
     }
 
     // If it wasn't all ASCII, call the function that handles more-complex cases.
-    return encodeComplexWindowsLatin1(characters, length, allowEntities);
+    return encodeComplexWindowsLatin1(characters, length, handling);
 }
 
 } // namespace WebCore
index 46d6e66..ac4a3ea 100644 (file)
@@ -36,7 +36,7 @@ namespace WebCore {
         static void registerCodecs(TextCodecRegistrar);
 
         virtual String decode(const char*, size_t length, bool flush = false);
-        virtual CString encode(const UChar*, size_t length, bool allowEntities = false);
+        virtual CString encode(const UChar*, size_t length, UnencodableHandling);
     };
 
 } // namespace WebCore
index 9ecd2a9..08ada74 100644 (file)
@@ -118,7 +118,7 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool)
     return String::adopt(buffer);
 }
 
-CString TextCodecUTF16::encode(const UChar* characters, size_t length, bool)
+CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling)
 {
     char* bytes;
     CString string = CString::newUninitialized(length * 2, bytes);
index 2bde221..66842ed 100644 (file)
@@ -38,7 +38,7 @@ namespace WebCore {
         TextCodecUTF16(bool littleEndian) : m_littleEndian(littleEndian), m_haveBufferedByte(false) { }
 
         virtual String decode(const char*, size_t length, bool flush = false);
-        virtual CString encode(const UChar*, size_t length, bool allowEntities = false);
+        virtual CString encode(const UChar*, size_t length, UnencodableHandling);
 
     private:
         bool m_littleEndian;
index 3ef1bc9..da4a49d 100644 (file)
@@ -62,7 +62,7 @@ String TextCodecUserDefined::decode(const char* bytes, size_t length, bool)
     return String::adopt(buffer);
 }
 
-static CString encodeComplexUserDefined(const UChar* characters, size_t length, bool allowEntities)
+static CString encodeComplexUserDefined(const UChar* characters, size_t length, UnencodableHandling handling)
 {
     Vector<char> result(length);
     char* bytes = result.data();
@@ -72,27 +72,23 @@ static CString encodeComplexUserDefined(const UChar* characters, size_t length,
         UChar32 c;
         U16_NEXT(characters, i, length, c);
         signed char signedByte = c;
-        if ((signedByte & 0xf7ff) == c)
+        if ((signedByte & 0xF7FF) == c)
             bytes[resultLength++] = signedByte;
         else {
             // No way to encode this character with x-user-defined.
-            if (allowEntities) {
-                char entityBuffer[16];
-                sprintf(entityBuffer, "&#%u;", c);
-                size_t entityLength = strlen(entityBuffer);
-                result.grow(resultLength + entityLength + length - i);
-                bytes = result.data();
-                memcpy(bytes + resultLength, entityBuffer, entityLength);
-                resultLength += entityLength;
-            } else
-                bytes[resultLength++] = '?';
+            UnencodableReplacementArray replacement;
+            int replacementLength = TextCodec::getUnencodableReplacement(c, handling, replacement);
+            result.grow(resultLength + replacementLength + length - i);
+            bytes = result.data();
+            memcpy(bytes + resultLength, replacement, replacementLength);
+            resultLength += replacementLength;
         }
     }
 
     return CString(bytes, resultLength);
 }
 
-CString TextCodecUserDefined::encode(const UChar* characters, size_t length, bool allowEntities)
+CString TextCodecUserDefined::encode(const UChar* characters, size_t length, UnencodableHandling handling)
 {
     char* bytes;
     CString string = CString::newUninitialized(length, bytes);
@@ -109,7 +105,7 @@ CString TextCodecUserDefined::encode(const UChar* characters, size_t length, boo
         return string;
 
     // If it wasn't all ASCII, call the function that handles more-complex cases.
-    return encodeComplexUserDefined(characters, length, allowEntities);
+    return encodeComplexUserDefined(characters, length, handling);
 }
 
 } // namespace WebCore
index 4fba907..2759471 100644 (file)
@@ -36,7 +36,7 @@ namespace WebCore {
         static void registerCodecs(TextCodecRegistrar);
 
         virtual String decode(const char*, size_t length, bool flush = false);
-        virtual CString encode(const UChar*, size_t length, bool allowEntities = false);
+        virtual CString encode(const UChar*, size_t length, UnencodableHandling);
     };
 
 } // namespace WebCore
index c7676e9..e4df842 100644 (file)
@@ -67,7 +67,7 @@ String TextEncoding::decode(const char* data, size_t length) const
     return TextDecoder(*this).decode(data, length, true);
 }
 
-CString TextEncoding::encode(const UChar* characters, size_t length, bool allowEntities) const
+CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
 {
     if (!m_name)
         return CString();
@@ -100,11 +100,11 @@ CString TextEncoding::encode(const UChar* characters, size_t length, bool allowE
         source = normalizedCharacters.data();
         sourceLength = normalizedLength;
     }
-    return newTextCodec(*this)->encode(source, sourceLength, allowEntities);
+    return newTextCodec(*this)->encode(source, sourceLength, handling);
 #elif USE(QT4_UNICODE)
     QString str(reinterpret_cast<const QChar*>(characters), length);
     str = str.normalized(QString::NormalizationForm_C);
-    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), allowEntities);
+    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
 #endif
 }
 
index 59d225c..b88ad33 100644 (file)
@@ -26,6 +26,7 @@
 #ifndef TextEncoding_h
 #define TextEncoding_h
 
+#include "TextCodec.h"
 #include <wtf/unicode/Unicode.h>
 
 namespace WebCore {
@@ -47,7 +48,7 @@ namespace WebCore {
         const TextEncoding& closest8BitEquivalent() const;
 
         String decode(const char*, size_t length) const;
-        CString encode(const UChar*, size_t length, bool allowEntities = false) const;
+        CString encode(const UChar*, size_t length, UnencodableHandling) const;
 
     private:
         const char* m_name;
index 7270a26..b552c4c 100644 (file)
@@ -266,7 +266,7 @@ String TextCodecMac::decode(const char* bytes, size_t length, bool flush)
     return resultString;
 }
 
-CString TextCodecMac::encode(const UChar* characters, size_t length, bool allowEntities)
+CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
 {
     // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
 
@@ -280,7 +280,7 @@ CString TextCodecMac::encode(const UChar* characters, size_t length, bool allowE
     CFIndex charactersLeft = CFStringGetLength(cfs);
     Vector<char> result;
     size_t size = 0;
-    UInt8 lossByte = allowEntities ? 0 : '?';
+    UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
     while (charactersLeft > 0) {
         CFRange range = CFRangeMake(startPos, charactersLeft);
         CFIndex bufferLength;
@@ -303,11 +303,10 @@ CString TextCodecMac::encode(const UChar* characters, size_t length, bool allowE
                     ++charactersConverted;
                 }
             }
-            char entityBuffer[16];
-            sprintf(entityBuffer, "&#%u;", badChar);
-            size_t entityLength = strlen(entityBuffer);
+            UnencodableReplacementArray entity;
+            int entityLength = getUnencodableReplacement(badChar, handling, entity);
             result.grow(size + entityLength);
-            memcpy(result.data() + size, entityBuffer, entityLength);
+            memcpy(result.data() + size, entity, entityLength);
             size += entityLength;
         }
 
index 639e214..d400659 100644 (file)
@@ -44,7 +44,7 @@ namespace WebCore {
         virtual ~TextCodecMac();
 
         virtual String decode(const char*, size_t length, bool flush = false);
-        virtual CString encode(const UChar*, size_t length, bool allowEntities = false);
+        virtual CString encode(const UChar*, size_t length, UnencodableHandling);
 
     private:
         OSStatus decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
index 4af6f4d..2608186 100644 (file)
@@ -421,7 +421,7 @@ void XMLHttpRequest::send(const String& body, ExceptionCode& ec)
         TextEncoding m_encoding(charset);
         if (!m_encoding.isValid()) // FIXME: report an error?
             m_encoding = UTF8Encoding();
-        request.setHTTPBody(FormData::create(m_encoding.encode(body.characters(), body.length())));
+        request.setHTTPBody(FormData::create(m_encoding.encode(body.characters(), body.length(), EntitiesForUnencodables)));
     }
 
     if (m_requestHeaders.size() > 0)