2 * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "TextEncoding.h"
29 #include "CharsetNames.h"
30 #include <kxmlcore/Assertions.h>
31 #include <kxmlcore/HashSet.h>
32 #include "StreamingTextDecoder.h"
// Builds a TextEncoding from a charset name.
// The encoding ID is looked up with textEncodingIDFromCharsetName(), which is
// also handed &m_flags so it can fill in the per-encoding flags.
// When eightBitOnly is set and the name resolved to UTF16Encoding, the
// encoding is replaced with UTF8Encoding.
36 TextEncoding::TextEncoding(const char* name, bool eightBitOnly)
38 m_encodingID = textEncodingIDFromCharsetName(name, &m_flags);
39 if (eightBitOnly && m_encodingID == UTF16Encoding)
40 m_encodingID = UTF8Encoding;
// Returns the charset name that charsetNameFromTextEncodingID() reports for
// this object's encoding ID.
43 const char* TextEncoding::name() const
45 return charsetNameFromTextEncodingID(m_encodingID);
// Returns the character that should stand in for a backslash in this encoding.
// Encodings whose flags include BackslashIsYen yield U+00A5.
48 QChar TextEncoding::backslashAsCurrencySymbol() const
50 if (m_flags & BackslashIsYen)
51 return 0x00A5; // yen sign
// Decodes len bytes starting at chs into a DeprecatedString.
// A temporary StreamingTextDecoder is constructed for this encoding and the
// whole input is passed to it in a single call.
// NOTE(review): the trailing `true` presumably asks the decoder to flush, since
// no further input follows -- confirm against StreamingTextDecoder::toUnicode.
56 DeprecatedString TextEncoding::toUnicode(const char *chs, int len) const
58 return StreamingTextDecoder(*this).toUnicode(chs, len, true);
// Byte-array overload: decodes qba (with the given len) into a
// DeprecatedString through a temporary StreamingTextDecoder for this encoding.
// NOTE(review): the trailing `true` presumably asks the decoder to flush, since
// no further input follows -- confirm against StreamingTextDecoder::toUnicode.
61 DeprecatedString TextEncoding::toUnicode(const DeprecatedByteArray &qba, int len) const
63 return StreamingTextDecoder(*this).toUnicode(qba, len, true);
66 // We'd like to use ICU for this on OS X as well eventually, but we need to make sure
67 // it covers all the encodings that we need.
// One-entry converter cache: cachedConverter is the stored ICU converter and
// cachedConverterEncoding is the encoding it was stored for. getConverter()
// consults the pair and cacheConverter() updates it.
// NOTE(review): these are plain file-level statics with no locking visible
// here; presumably all callers run on one thread -- confirm.
70 static UConverter* cachedConverter;
71 static TextEncodingID cachedConverterEncoding = InvalidEncoding;
// Size, in bytes, of the on-stack output buffer that fromUnicode() converts into.
73 static const int ConversionBufferSize = 16384;
// Obtains an ICU converter for `encoding`.
// If the cache holds a converter for the same encoding, that one is used;
// otherwise a new converter is opened by charset name with ucnv_open().
// NOTE(review): ucnv_open() reports into the local `err`, not directly into
// *status -- confirm how *status is populated on failure.
75 static inline UConverter* getConverter(TextEncodingID encoding, UErrorCode* status)
77 if (cachedConverter && encoding == cachedConverterEncoding) {
78 UConverter* conv = cachedConverter;
// Cache miss: resolve the ICU charset name for this encoding and open a
// fresh converter.
83 const char* encodingName = charsetNameFromTextEncodingID(encoding);
84 UErrorCode err = U_ZERO_ERROR;
85 UConverter* conv = ucnv_open(encodingName, &err);
// An ambiguous alias is only logged; it does not by itself stop the open.
86 if (err == U_AMBIGUOUS_ALIAS_WARNING)
87 LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
// Logs the encoding ID and the ICU error code for an encoding ICU will not convert.
90 LOG_ERROR("the ICU Converter won't convert to text encoding 0x%X, error %d", encoding, err);
// Stores conv as the cached converter for encoding `id`.
// The converter previously held in the cache is released with ucnv_close()
// before the new one is recorded.
98 static inline void cacheConverter(TextEncodingID id, UConverter* conv)
102 ucnv_close(cachedConverter);
103 cachedConverter = conv;
104 cachedConverterEncoding = id;
// Maps an encoding to the one actually used for conversion:
// Latin1Encoding and ASCIIEncoding are both treated as WinLatin1Encoding.
108 static inline TextEncodingID effectiveEncoding(TextEncodingID encoding)
110 if (encoding == Latin1Encoding || encoding == ASCIIEncoding)
111 return WinLatin1Encoding;
// Encodes the Unicode string qcs into bytes of this encoding and returns them
// as a DeprecatedCString. The conversion is done by an ICU converter obtained
// from getConverter() and handed back to the cache when finished.
// Returns an empty DeprecatedCString when no converter could be obtained.
// NOTE(review): allowEntities presumably selects between the XML-escape and
// the "?" substitution callbacks configured below -- confirm.
115 DeprecatedCString TextEncoding::fromUnicode(const DeprecatedString &qcs, bool allowEntities) const
117 TextEncodingID encoding = effectiveEncoding(m_encodingID);
// Special-case check for WinLatin1 output when the input is entirely Latin-1.
119 if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
// NOTE(review): effectiveEncoding() maps ASCIIEncoding to WinLatin1Encoding,
// so the ASCIIEncoding comparison below looks unreachable -- confirm.
122 if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
126 // FIXME: We should see if there is "force ASCII range" mode in ICU;
127 // until then, we change the backslash into a yen sign.
128 // Encoding will change the yen sign back into a backslash.
129 DeprecatedString copy = qcs;
130 copy.replace(QChar('\\'), backslashAsCurrencySymbol());
// NOTE(review): this guard only bails out when conv is null AND err is a
// failure code; a null conv with a non-failure err would proceed -- confirm
// that combination cannot occur.
132 UErrorCode err = U_ZERO_ERROR;
133 UConverter* conv = getConverter(encoding, &err);
134 if (!conv && U_FAILURE(err))
135 return DeprecatedCString();
139 // FIXME: when DeprecatedString buffer is latin1, it would be nice to
140 // convert from that w/o having to allocate a unicode buffer
// The UTF-16 source range [source, sourceLimit) is converted through a
// fixed-size stack buffer, one chunk at a time.
142 char buffer[ConversionBufferSize];
143 const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
144 const UChar* sourceLimit = source + copy.length();
146 DeprecatedCString result(1); // for trailing zero
// Two policies for characters the target encoding cannot represent:
// either escape them as XML decimal character references (UCNV_ESCAPE_XML_DEC),
// or substitute a literal "?".
149 ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
151 ucnv_setSubstChars(conv, "?", 1, &err);
152 ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
// Each pass restarts the output at the beginning of buffer; ucnv_fromUnicode()
// advances `source` and `target`, and `count` is the number of bytes produced.
156 char* target = buffer;
157 char* targetLimit = target + ConversionBufferSize;
159 ucnv_fromUnicode(conv, &target, targetLimit, &source, sourceLimit, 0, true, &err);
160 int count = target - buffer;
// append() receives only the pointer, so buffer has to be NUL-terminated after
// `count` bytes.
// NOTE(review): if the converter fills the buffer completely, count equals
// ConversionBufferSize and a terminator at buffer[count] would be one past the
// end of the array; output containing zero bytes would also be cut short --
// confirm.
162 result.append(buffer);
// U_BUFFER_OVERFLOW_ERROR means the buffer filled before the input was
// consumed, so another pass is needed.
163 } while (err == U_BUFFER_OVERFLOW_ERROR);
// Hand the converter back to the cache so the next call can reuse it.
165 cacheConverter(encoding, conv);
172 } // namespace WebCore