2007-07-06 Jungshik Shin <jungshik.shin@gmail.com>
[WebKit-https.git] / WebCore / platform / TextEncoding.cpp
index 3d5d7fd..d440b9d 100644 (file)
@@ -1,5 +1,6 @@
 /*
- * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
+ * Copyright (C) 2004, 2006, 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
 #include "config.h"
 #include "TextEncoding.h"
 
-#include "CharsetNames.h"
-#include <kxmlcore/Assertions.h>
-#include <kxmlcore/HashSet.h>
-#include "StreamingTextDecoder.h"
+#include "CString.h"
+#include "PlatformString.h"
+#include "TextCodec.h"
+#include "TextDecoder.h"
+#include "TextEncodingRegistry.h"
+#if USE(ICU_UNICODE)
+#include <unicode/unorm.h>
+#elif USE(QT4_UNICODE)
+#include <QString>
+#endif
+#include <wtf/HashSet.h>
+#include <wtf/OwnPtr.h>
 
 namespace WebCore {
 
-TextEncoding::TextEncoding(const char* name, bool eightBitOnly)
+// Resolves |name| through atomicCanonicalTextEncodingName() and, when the
+// result is non-null, inserts that pointer into |set|. A null result is ignored.
+static void addEncodingName(HashSet<const char*>& set, const char* name)
 {
-    m_encodingID = textEncodingIDFromCharsetName(name, &m_flags);
-    if (eightBitOnly && m_encodingID == UTF16Encoding)
-        m_encodingID = UTF8Encoding;
+    if (const char* canonicalName = atomicCanonicalTextEncodingName(name))
+        set.add(canonicalName);
 }
 
-const char* TextEncoding::name() const
+// Constructs an encoding from a C-string name. m_name is initialized with
+// whatever atomicCanonicalTextEncodingName() returns for |name|; decode() and
+// encode() in this file return empty results when m_name is null.
+TextEncoding::TextEncoding(const char* name)
+    : m_name(atomicCanonicalTextEncodingName(name))
 {
-    return charsetNameFromTextEncodingID(m_encodingID);
 }
 
-inline TextEncodingID effectiveEncoding(TextEncodingID encoding)
+// Constructs an encoding from a String name. The string's character buffer and
+// length are handed to the (characters, length) overload of
+// atomicCanonicalTextEncodingName(), and the result is stored in m_name.
+TextEncoding::TextEncoding(const String& name)
+    : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
 {
-    if (encoding == Latin1Encoding || encoding == ASCIIEncoding)
-        return WinLatin1Encoding;
-    return encoding;
 }
 
-QChar TextEncoding::backslashAsCurrencySymbol() const
+// Decodes |length| bytes at |data| with a temporary TextDecoder built from this
+// encoding (the decoder is called with `true` as its third argument).
+// Returns a default-constructed String when m_name is null.
+String TextEncoding::decode(const char* data, size_t length) const
 {
-    if (m_flags & BackslashIsYen)
-        return 0x00A5; // yen sign
-    return '\\';
+    if (m_name)
+        return TextDecoder(*this).decode(data, length, true);
+
+    return String();
+}
+
+// Encodes |length| UChars starting at |characters| using this encoding.
+//  - Returns a default-constructed CString when m_name is null.
+//  - Returns a CString built from "" when |length| is zero.
+// Otherwise the text is normalized to NFC (when an ICU or Qt4 Unicode backend
+// is compiled in) and handed, together with |allowEntities|, to a codec
+// obtained from newTextCodec(*this).
+CString TextEncoding::encode(const UChar* characters, size_t length, bool allowEntities) const
+{
+    if (!m_name)
+        return CString();
+
+    if (!length)
+        return "";
+
+#if USE(ICU_UNICODE)
+    // FIXME: What's the right place to do normalization?
+    // It's a little strange to do it inside the encode function.
+    // Perhaps normalization should be an explicit step done before calling encode.
+
+    const UChar* source = characters;
+    size_t sourceLength = length;
+
+    Vector<UChar> normalizedCharacters;
+
+    UErrorCode err = U_ZERO_ERROR;
+    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
+        // First try using the length of the original string, since normalization to NFC rarely increases length.
+        normalizedCharacters.resize(sourceLength);
+        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
+        if (err == U_BUFFER_OVERFLOW_ERROR) {
+            // The first attempt reported the required size; retry with a buffer of exactly that size.
+            err = U_ZERO_ERROR;
+            normalizedCharacters.resize(normalizedLength);
+            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
+        }
+        ASSERT(U_SUCCESS(err));
+
+        // Only adopt the normalized buffer when ICU reported success. If the
+        // assertion above is compiled out and normalization failed, encode the
+        // original, un-normalized text instead of a buffer of unknown content.
+        if (U_SUCCESS(err)) {
+            source = normalizedCharacters.data();
+            sourceLength = normalizedLength;
+        }
+    }
+    return newTextCodec(*this)->encode(source, sourceLength, allowEntities);
+#elif USE(QT4_UNICODE)
+    QString str(reinterpret_cast<const QChar*>(characters), length);
+    str = str.normalized(QString::NormalizationForm_C);
+    return newTextCodec(*this)->encode(str.utf16(), str.length(), allowEntities);
+#else
+    // No normalization backend is available: encode the text as given rather
+    // than flowing off the end of a non-void function.
+    return newTextCodec(*this)->encode(characters, length, allowEntities);
+#endif
+}
+
+// Reports whether this encoding's canonical name is the one registered for
+// "ISO-8859-8". Always false while noExtendedTextEncodingNameUsed() holds.
+bool TextEncoding::usesVisualOrdering() const
+{
+    if (!noExtendedTextEncodingNameUsed()) {
+        // Canonical names are compared by pointer, so the lookup is done once and cached.
+        static const char* const visualOrderingName = atomicCanonicalTextEncodingName("ISO-8859-8");
+        return m_name == visualOrderingName;
+    }
+
+    return false;
+}
+
+// Reports whether this encoding's canonical name is one of the Japanese
+// encodings listed below. Always false while noExtendedTextEncodingNameUsed()
+// holds, and false when m_name is null.
+bool TextEncoding::isJapanese() const
+{
+    if (noExtendedTextEncodingNameUsed())
+        return false;
+
+    static const char* const japaneseEncodingNames[] = {
+        "x-mac-japanese",
+        "cp932",
+        "JIS_X0201",
+        "JIS_X0208-1983",
+        "JIS_X0208-1990",
+        "JIS_X0212-1990",
+        "JIS_C6226-1978",
+        "Shift_JIS_X0213-2000",
+        "ISO-2022-JP",
+        "ISO-2022-JP-2",
+        "ISO-2022-JP-1",
+        "ISO-2022-JP-3",
+        "EUC-JP",
+        "Shift_JIS"
+    };
+
+    // The set of canonical-name pointers is filled lazily on first use.
+    static HashSet<const char*> japaneseEncodings;
+    if (japaneseEncodings.isEmpty()) {
+        const size_t nameCount = sizeof(japaneseEncodingNames) / sizeof(japaneseEncodingNames[0]);
+        for (size_t i = 0; i < nameCount; ++i)
+            addEncodingName(japaneseEncodings, japaneseEncodingNames[i]);
+    }
+
+    if (!m_name)
+        return false;
+    return japaneseEncodings.contains(m_name);
+}
+
+// Returns the character to use for a backslash in this encoding: U+00A5 for
+// the encodings registered as "Shift_JIS_X0213-2000" or "EUC-JP", and a plain
+// backslash otherwise (including whenever noExtendedTextEncodingNameUsed() holds).
+UChar TextEncoding::backslashAsCurrencySymbol() const
+{
+    if (noExtendedTextEncodingNameUsed())
+        return '\\';
+
+    // The text encodings below treat backslash as a currency symbol.
+    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
+    static const char* const shiftJISX0213Name = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
+    static const char* const eucJPName = atomicCanonicalTextEncodingName("EUC-JP");
+    if (m_name == shiftJISX0213Name || m_name == eucJPName)
+        return 0x00A5;
+    return '\\';
 }
 
-QString TextEncoding::toUnicode(const char *chs, int len) const
+// Returns UTF8Encoding() when this encoding equals either UTF-16 variant
+// (big- or little-endian); every other encoding is returned unchanged.
+const TextEncoding& TextEncoding::closest8BitEquivalent() const
+{
+    const bool isUTF16 = *this == UTF16BigEndianEncoding() || *this == UTF16LittleEndianEncoding();
+    if (!isUTF16)
+        return *this;
+    return UTF8Encoding();
+}
+
+// Returns a reference to a function-local static TextEncoding constructed from "ASCII".
+const TextEncoding& ASCIIEncoding()
+{
+    static TextEncoding asciiEncoding("ASCII");
+    return asciiEncoding;
+}
+
+// Returns a reference to a function-local static TextEncoding constructed from "Latin-1".
+const TextEncoding& Latin1Encoding()
+{
+    static TextEncoding latin1Encoding("Latin-1");
+    return latin1Encoding;
+}
+
+// Returns a reference to a function-local static TextEncoding constructed from "UTF-16BE".
+const TextEncoding& UTF16BigEndianEncoding()
+{
+    static TextEncoding utf16BEEncoding("UTF-16BE");
+    return utf16BEEncoding;
+}
+
+// Returns a reference to a function-local static TextEncoding constructed from "UTF-16LE".
+const TextEncoding& UTF16LittleEndianEncoding()
+{
+    static TextEncoding utf16LEEncoding("UTF-16LE");
+    return utf16LEEncoding;
+}
+
+// Returns a reference to a function-local static TextEncoding constructed from "UTF-32BE".
+const TextEncoding& UTF32BigEndianEncoding()
+{
+    static TextEncoding utf32BEEncoding("UTF-32BE");
+    return utf32BEEncoding;
+}
+
+// Returns a reference to a function-local static TextEncoding constructed from "UTF-32LE".
+const TextEncoding& UTF32LittleEndianEncoding()
+{
+    static TextEncoding utf32LEEncoding("UTF-32LE");
+    return utf32LEEncoding;
+}
+
+
+// Returns a reference to a function-local static TextEncoding constructed from "UTF-8".
+const TextEncoding& UTF8Encoding()
 {
-    return StreamingTextDecoder(*this).toUnicode(chs, len, true);
+    static TextEncoding utf8Encoding("UTF-8");
+    return utf8Encoding;
 }
 
-QString TextEncoding::toUnicode(const ByteArray &qba, int len) const
+// Returns a reference to a function-local static TextEncoding constructed from "WinLatin-1".
+const TextEncoding& WindowsLatin1Encoding()
 {
-    return StreamingTextDecoder(*this).toUnicode(qba, len, true);
+    static TextEncoding windowsLatin1Encoding("WinLatin-1");
+    return windowsLatin1Encoding;
 }
 
 } // namespace WebCore