2 * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "TextEncoding.h"
29 #include "CharsetNames.h"
30 #include <kxmlcore/Assertions.h>
31 #include <kxmlcore/HashSet.h>
32 #include "StreamingTextDecoder.h"
33 #include <unicode/unorm.h>
// Constructs a TextEncoding from a charset name.
// The encoding ID is looked up with textEncodingIDFromCharsetName(), which is
// also handed a pointer to m_flags (presumably so it can fill in per-encoding
// flags such as BackslashIsYen -- confirm against CharsetNames.h).
// When eightBitOnly is set and the name resolved to UTF16Encoding, the
// encoding is replaced with UTF8Encoding.
37 TextEncoding::TextEncoding(const char* name, bool eightBitOnly)
39 m_encodingID = textEncodingIDFromCharsetName(name, &m_flags);
// A UTF-16 result is not acceptable for an eight-bit-only caller; use UTF-8 instead.
40 if (eightBitOnly && m_encodingID == UTF16Encoding)
41 m_encodingID = UTF8Encoding;
// Returns the charset name that charsetNameFromTextEncodingID() reports for
// this object's encoding ID.
44 const char* TextEncoding::name() const
46 return charsetNameFromTextEncodingID(m_encodingID);
// Returns the character that a backslash should be shown as in this encoding.
// When the BackslashIsYen flag is set in m_flags this is U+00A5 (yen sign).
// NOTE(review): the return for the non-yen case is not visible in this listing;
// presumably it returns a plain backslash -- confirm.
49 QChar TextEncoding::backslashAsCurrencySymbol() const
51 if (m_flags & BackslashIsYen)
52 return 0x00A5; // yen sign
// Decodes len bytes starting at chs into a DeprecatedString using this encoding.
// The work is delegated to a temporary StreamingTextDecoder built from *this.
// NOTE(review): the trailing `true` is presumably a "flush" flag for one-shot
// decoding -- confirm against StreamingTextDecoder.h.
57 DeprecatedString TextEncoding::toUnicode(const char *chs, int len) const
59 return StreamingTextDecoder(*this).toUnicode(chs, len, true);
// Byte-array overload: decodes the first len bytes of qba into a DeprecatedString
// using this encoding, via a temporary StreamingTextDecoder built from *this.
// NOTE(review): the trailing `true` is presumably a "flush" flag for one-shot
// decoding -- confirm against StreamingTextDecoder.h.
62 DeprecatedString TextEncoding::toUnicode(const DeprecatedByteArray &qba, int len) const
64 return StreamingTextDecoder(*this).toUnicode(qba, len, true);
67 // We'd like to use ICU for this on OS X as well eventually, but we need to make sure
68 // it covers all the encodings that we need.
// Single-entry cache of an open ICU converter, shared by getConverter() and
// cacheConverter() in this file.
71 static UConverter* cachedConverter;
// Encoding that cachedConverter was opened for; InvalidEncoding until a
// converter has been cached.
72 static TextEncodingID cachedConverterEncoding = InvalidEncoding;
// Size in bytes of the on-stack output buffer used by TextEncoding::fromUnicode()
// for each ucnv_fromUnicode() pass.
74 static const int ConversionBufferSize = 16384;
// Obtains an ICU converter for `encoding`.
// If the cached converter was opened for the same encoding, that one is used;
// otherwise a new converter is opened with ucnv_open() using the charset name
// for the encoding.
// `status` is an out-parameter for the ICU error code.
// NOTE(review): several lines of this function (the tail of the cache-hit
// branch, the failure branch that sets *status, and the final return) are not
// visible in this listing; confirm them against the full source before
// relying on this description.
76 static inline UConverter* getConverter(TextEncodingID encoding, UErrorCode* status)
// Cache hit: reuse the converter that was previously stored by cacheConverter().
78 if (cachedConverter && encoding == cachedConverterEncoding) {
79 UConverter* conv = cachedConverter;
// Cache miss: open a fresh converter by charset name.
84 const char* encodingName = charsetNameFromTextEncodingID(encoding);
85 UErrorCode err = U_ZERO_ERROR;
86 UConverter* conv = ucnv_open(encodingName, &err);
// An ambiguous alias is only a warning from ICU; it is logged here.
87 if (err == U_AMBIGUOUS_ALIAS_WARNING)
88 LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
// Logged when ICU cannot provide a converter for this encoding (the guarding
// condition is not visible in this listing).
91 LOG_ERROR("the ICU Converter won't convert to text encoding 0x%X, error %d", encoding, err);
// Stores `conv` as the cached converter for encoding `id`, closing whatever
// converter was cached before.
// NOTE(review): the conditions guarding these statements are not visible in
// this listing -- confirm them against the full source.
99 static inline void cacheConverter(TextEncodingID id, UConverter* conv)
// Release the previously cached converter before replacing it.
103 ucnv_close(cachedConverter);
104 cachedConverter = conv;
105 cachedConverterEncoding = id;
// Maps an encoding to the one actually used for conversion: Latin1Encoding and
// ASCIIEncoding are both treated as WinLatin1Encoding.
// NOTE(review): the return for all other encodings is not visible in this
// listing; presumably the input is returned unchanged -- confirm.
109 static inline TextEncodingID effectiveEncoding(TextEncodingID encoding)
111 if (encoding == Latin1Encoding || encoding == ASCIIEncoding)
112 return WinLatin1Encoding;
// Encodes the Unicode string `qcs` into bytes in this object's encoding and
// returns them as a DeprecatedCString.
//
// Outline of the visible steps:
//   1. Resolve the effective encoding and take fast paths for strings that are
//      already Latin-1 / ASCII (the fast-path bodies are not visible here).
//   2. Replace backslashes with backslashAsCurrencySymbol() in a copy.
//   3. Obtain an ICU converter; return an empty string on failure.
//   4. Normalize the text to NFC when the quick check does not report UNORM_YES.
//   5. Configure how unencodable characters are handled (XML decimal escapes
//      or '?' substitution).
//   6. Convert in ConversionBufferSize-byte passes, appending each pass to the
//      result, until ICU stops reporting U_BUFFER_OVERFLOW_ERROR.
//   7. Hand the converter back to the cache.
//
// NOTE(review): this listing omits several original lines of the function
// (branch bodies, braces, error resets and the final return); verify details
// against the full source.
116 DeprecatedCString TextEncoding::fromUnicode(const DeprecatedString &qcs, bool allowEntities) const
118 TextEncodingID encoding = effectiveEncoding(m_encodingID);
// Fast path: text that is entirely Latin-1 needs no ICU round trip.
120 if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
// NOTE(review): effectiveEncoding() has already mapped ASCIIEncoding to
// WinLatin1Encoding, so the ASCIIEncoding comparison below looks redundant.
// The rest of this condition is not visible in this listing.
123 if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
127 // FIXME: We should see if there is "force ASCII range" mode in ICU;
128 // until then, we change the backslash into a yen sign.
129 // Encoding will change the yen sign back into a backslash.
130 DeprecatedString copy = qcs;
131 copy.replace(QChar('\\'), backslashAsCurrencySymbol());
133 UErrorCode err = U_ZERO_ERROR;
134 UConverter* conv = getConverter(encoding, &err);
// NOTE(review): a null converter with a non-failure status would pass this
// guard -- confirm getConverter() always sets a failure status when it
// returns null.
135 if (!conv && U_FAILURE(err))
136 return DeprecatedCString();
140 // FIXME: when DeprecatedString buffer is latin1, it would be nice to
141 // convert from that w/o having to allocate a unicode buffer
// On-stack output buffer reused for every conversion pass below.
143 char buffer[ConversionBufferSize];
// [source, sourceLimit) is the UTF-16 input range handed to ICU.
144 const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
145 const UChar* sourceLimit = source + copy.length();
// Holds the NFC-normalized text when normalization is needed; it is declared
// before the check so that `source` can keep pointing into it afterwards.
147 DeprecatedString normalizedString;
148 if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
149 normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed
// ICU writes directly into normalizedString's character buffer (hence the
// const_cast) and returns the length the normalized text requires.
151 int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<QChar*>(normalizedString.unicode())), copy.length(), &err);
// The first attempt did not fit: resize to the length ICU reported and retry.
152 if (err == U_BUFFER_OVERFLOW_ERROR) {
154 normalizedString.truncate(normalizedLength);
155 normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<QChar*>(normalizedString.unicode())), normalizedLength, &err);
// From here on, convert the normalized text instead of the original copy.
158 source = reinterpret_cast<const UChar*>(normalizedString.unicode());
159 sourceLimit = source + normalizedLength;
162 DeprecatedCString result(1); // for trailing zero
// Unencodable characters: either emit XML decimal character references...
// (the condition choosing between the two modes is not visible in this
// listing; presumably it is `allowEntities` -- confirm)
165 ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
// ...or substitute a literal '?' for each of them.
167 ucnv_setSubstChars(conv, "?", 1, &err);
168 ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
171 ASSERT(U_SUCCESS(err));
173 return DeprecatedCString();
// Conversion loop: each pass fills `buffer` from the start; ICU advances
// `source` and `target` as it consumes input and produces output.
176 char* target = buffer;
177 char* targetLimit = target + ConversionBufferSize;
// No offsets array is requested (0) and flush is true.
179 ucnv_fromUnicode(conv, &target, targetLimit, &source, sourceLimit, 0, true, &err);
// Number of bytes produced by this pass.
180 int count = target - buffer;
// NOTE(review): append(buffer) treats buffer as a NUL-terminated C string; the
// store that terminates it is not visible in this listing. Confirm it exists
// and that it cannot write one past the end when count == ConversionBufferSize.
// Also note that encoded output containing zero bytes would be cut short by
// a C-string append.
182 result.append(buffer);
// U_BUFFER_OVERFLOW_ERROR means the output buffer filled up with input still
// remaining, so run another pass.
183 } while (err == U_BUFFER_OVERFLOW_ERROR);
// Return the converter to the single-entry cache for reuse by the next call.
185 cacheConverter(encoding, conv);
192 } // namespace WebCore