2 * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "StreamingTextDecoder.h"
29 #include <kxmlcore/Assertions.h>
// Constructs a streaming decoder for the given text encoding.
// Stores the encoding, derives the byte-order flag from the LittleEndian bit of
// the encoding flags, and starts with no bytes buffered.
// NOTE(review): this listing skips original lines 38-39 and 41-43, so further
// member initializers and the constructor body are not visible here.
35 StreamingTextDecoder::StreamingTextDecoder(const TextEncoding& encoding)
36 : m_encoding(encoding)
// Non-zero (true) when the encoding flags include LittleEndian.
37 , m_littleEndian(encoding.flags() & LittleEndian)
40 , m_numBufferedBytes(0)
// U+FFFD, the Unicode replacement character; filtered out of decoded text by unwanted().
45 static const UChar replacementCharacter = 0xFFFD;
// U+FEFF, the byte order mark; also filtered out of decoded text by unwanted().
46 static const UChar BOM = 0xFEFF;
// Element count of the on-stack UChar buffers used by convertUTF16 and convertUsingICU.
47 static const int ConversionBufferSize = 16384;
// One-entry cache of an ICU converter. The destructor stores its converter here and
// createICUConverter takes it back out when the cached encoding ID matches.
49 static UConverter* cachedConverterICU;
// Encoding ID recorded for cachedConverterICU; InvalidEncoding when nothing usable is cached.
50 static TextEncodingID cachedConverterEncoding = InvalidEncoding;
// Destructor. Rather than closing the ICU converter owned by this decoder, it moves
// the converter into the file-level one-entry cache so that a later decoder can pick
// it up in createICUConverter. A converter already sitting in the cache is closed first.
// NOTE(review): original lines 53-54 are missing from this listing; presumably the
// statements below only run when m_converterICU is set - confirm.
// NOTE(review): the cache is tagged with m_encoding.encodingID(), while
// createICUConverter compares against effectiveEncoding(m_encoding).encodingID().
// For Latin1 and ASCII those IDs differ, so such a cached converter would never be
// matched and reused - confirm whether that is intended.
// NOTE(review): no reset of the converter state is visible before it is cached - confirm
// that leftover state from a partially decoded stream cannot leak into the next user.
52 StreamingTextDecoder::~StreamingTextDecoder()
// Evict and close whatever converter the cache currently holds.
55 if (cachedConverterICU != 0)
56 ucnv_close(cachedConverterICU);
// Hand this converter over to the cache together with its encoding tag.
57 cachedConverterICU = m_converterICU;
58 cachedConverterEncoding = m_encoding.encodingID();
// Decodes a chunk of UTF-16 bytes into a DeprecatedString without using ICU.
//   s      - pointer to the input bytes
//   length - number of input bytes
// Two bytes are combined into one 16-bit code unit, in either low-byte-first or
// high-byte-first order. A single leftover byte can be carried between calls in
// m_bufferedBytes[0], tracked by m_numBufferedBytes.
// NOTE(review): several original lines are missing from this listing (the byte-order
// branch conditions, pointer advances, loop headers, and the final return); the
// byte-order choice presumably follows m_littleEndian - confirm against the full file.
62 DeprecatedString StreamingTextDecoder::convertUTF16(const unsigned char *s, int length)
// At most one byte may be pending from a previous call.
64 ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
66 const unsigned char *p = s;
67 unsigned len = length;
// Start from an empty (non-null) string.
69 DeprecatedString result("");
// One code unit is produced per two input bytes.
71 result.reserve(length / 2);
// Complete the code unit whose first byte arrived in a previous call.
73 if (m_numBufferedBytes != 0 && len != 0) {
74 ASSERT(m_numBufferedBytes == 1);
// Buffered byte is the low byte, first new byte is the high byte.
77 c = m_bufferedBytes[0] | (p[0] << 8);
// Buffered byte is the high byte, first new byte is the low byte.
79 c = (m_bufferedBytes[0] << 8) | p[0];
82 result.append(reinterpret_cast<QChar *>(&c), 1);
84 m_numBufferedBytes = 0;
// Convert through a fixed-size stack buffer, one run at a time.
90 UChar buffer[ConversionBufferSize];
// Code units to convert in this run: all remaining whole pairs, capped at the buffer capacity.
91 int runLength = min(len / 2, (unsigned)(sizeof(buffer) / sizeof(buffer[0])));
// Low-byte-first assembly of each code unit.
94 for (int i = 0; i < runLength; ++i) {
95 UChar c = p[0] | (p[1] << 8);
98 buffer[bufferLength++] = c;
// High-byte-first assembly of each code unit.
101 for (int i = 0; i < runLength; ++i) {
102 UChar c = (p[0] << 8) | p[1];
105 buffer[bufferLength++] = c;
108 result.append(reinterpret_cast<QChar *>(buffer), bufferLength);
// Account for the bytes consumed by this run.
109 len -= runLength * 2;
// Stash one byte for the next call.
// NOTE(review): the guarding condition is not visible; presumably a single trailing byte remains - confirm.
113 ASSERT(m_numBufferedBytes == 0);
114 m_numBufferedBytes = 1;
115 m_bufferedBytes[0] = p[0];
// Returns the encoding that is actually used for conversion: Latin1Encoding and
// ASCIIEncoding are replaced by WinLatin1Encoding, every other encoding ID is kept.
// The flags of the input encoding are carried over unchanged.
// NOTE(review): the reason for the substitution is not stated in this file.
121 static inline TextEncoding effectiveEncoding(const TextEncoding& encoding)
123 TextEncodingID id = encoding.encodingID();
124 if (id == Latin1Encoding || id == ASCIIEncoding)
125 id = WinLatin1Encoding;
126 return TextEncoding(id, encoding.flags());
// Obtains an ICU converter for the effective encoding of this decoder and stores it
// in m_converterICU. A converter left in the file-level cache is reused when its
// recorded encoding ID matches; otherwise a new one is opened with ucnv_open.
// Returns a UErrorCode.
// NOTE(review): the return statements, the else branch between the cache path and the
// ucnv_open path, and the tail of the failure path are missing from this listing.
129 UErrorCode StreamingTextDecoder::createICUConverter()
131 TextEncoding encoding = effectiveEncoding(m_encoding);
132 const char* encodingName = encoding.name();
// Capture the comparison first, because the cached tag is invalidated right after,
// whether or not the cached converter ends up being taken.
134 bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID();
135 cachedConverterEncoding = InvalidEncoding;
// Cache hit: take ownership of the cached converter and empty the cache slot.
137 if (cachedEncodingEqual && cachedConverterICU) {
138 m_converterICU = cachedConverterICU;
139 cachedConverterICU = 0;
// Cache miss path (presumably the else branch): open a fresh converter by name.
141 UErrorCode err = U_ZERO_ERROR;
142 ASSERT(!m_converterICU);
143 m_converterICU = ucnv_open(encodingName, &err);
// An ambiguous alias is only logged; the opened converter is still used.
144 if (err == U_AMBIGUOUS_ALIAS_WARNING)
145 LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
// ucnv_open produced no converter: log the encoding ID and the ICU error code.
147 if (!m_converterICU) {
148 LOG_ERROR("the ICU Converter won't convert from text encoding 0x%X, error %d", encoding.encodingID(), err);
156 // We strip replacement characters because the ICU converter for UTF-8 converts
157 // invalid sequences into replacement characters, but other browsers discard them.
158 // We strip BOM characters because they can show up both at the start of content
159 // and inside content, and we never want them to end up in the decoded text.
// Returns true when c is U+FFFD (replacementCharacter) or U+FEFF (BOM).
160 static inline bool unwanted(UChar c)
162 return c == replacementCharacter || c == BOM;
// Appends the given UTF-16 code units to s, leaving out every code unit for which
// unwanted() returns true.
//   s          - string that receives the kept characters
//   characters - pointer to the code units to append
//   byteCount  - size of the input in bytes (not in characters); must be a multiple
//                of sizeof(UChar)
// Kept characters are appended in contiguous runs rather than one at a time.
// NOTE(review): the declaration of start and the lines that advance it past an
// unwanted character are missing from this listing; from its uses below it marks
// the first index of the current run of kept characters.
165 void StreamingTextDecoder::appendOmittingUnwanted(DeprecatedString &s, const UChar *characters, int byteCount)
167 ASSERT(byteCount % sizeof(UChar) == 0);
169 int characterCount = byteCount / sizeof(UChar);
170 for (int i = 0; i != characterCount; ++i) {
171 if (unwanted(characters[i])) {
// Flush the run of kept characters that ends just before index i.
173 s.append(reinterpret_cast<const QChar *>(&characters[start]), i - start);
// Flush the final run, if any characters remain after the last unwanted one.
177 if (start != characterCount)
178 s.append(reinterpret_cast<const QChar *>(&characters[start]), characterCount - start);
// Decodes a chunk of bytes to UTF-16 with the ICU converter of this decoder.
//   chs   - input bytes
//   len   - number of input bytes
//   flush - passed through to ucnv_toUnicode as its flush argument
// Returns the decoded text with replacement characters and BOMs removed, or a
// default-constructed DeprecatedString when no converter can be created or ICU
// reports a failure.
// NOTE(review): the declaration and resets of err, the do lines of both loops, and
// the final return of result are missing from this listing.
181 DeprecatedString StreamingTextDecoder::convertUsingICU(const unsigned char *chs, int len, bool flush)
183 // Get a converter for the passed-in encoding.
// The converter is created lazily on first use.
184 if (!m_converterICU && U_FAILURE(createICUConverter()))
185 return DeprecatedString();
187 ASSERT(m_converterICU);
189 DeprecatedString result("");
// Output is produced through a fixed-size stack buffer, one pass per buffer-full.
192 UChar buffer[ConversionBufferSize];
193 const char *source = reinterpret_cast<const char *>(chs);
194 const char *sourceLimit = source + len;
// No per-character source offsets are requested from ICU.
195 int32_t *offsets = NULL;
// Each pass restarts writing at the beginning of the buffer.
199 UChar *target = buffer;
200 const UChar *targetLimit = target + ConversionBufferSize;
202 ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
// Number of code units ICU wrote in this pass.
203 int count = target - buffer;
// appendOmittingUnwanted takes a byte count, hence the multiplication.
204 appendOmittingUnwanted(result, reinterpret_cast<const UChar *>(buffer), count * sizeof(UChar));
// Keep going while the output buffer filled up before the input was exhausted.
205 } while (err == U_BUFFER_OVERFLOW_ERROR);
207 if (U_FAILURE(err)) {
208 // flush the converter so it can be reused, and not be bothered by this error.
// The output of these passes is discarded; they only drain the remaining input.
210 UChar *target = buffer;
211 const UChar *targetLimit = target + ConversionBufferSize;
213 ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, true, &err);
214 } while (source < sourceLimit);
215 LOG_ERROR("ICU conversion error");
// Text decoded before the failure is dropped as well.
216 return DeprecatedString();
// Dispatches a chunk of bytes to the decoding routine for the current encoding:
// convertUTF16 for one case of the switch, convertUsingICU otherwise.
//   chs   - input bytes
//   len   - number of input bytes
//   flush - forwarded to convertUsingICU (convertUTF16 does not take it)
// NOTE(review): the case labels and the declaration of chunkSize are missing from
// this listing; presumably the convertUTF16 case is UTF16Encoding - confirm.
222 DeprecatedString StreamingTextDecoder::convert(const unsigned char *chs, int len, bool flush)
// Uncommenting the define below enables the chunked test path further down.
224 //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000
226 switch (m_encoding.encodingID()) {
228 return convertUTF16(chs, len);
// Test-only path: feed ICU in small chunks so that multi-byte characters get split
// across calls.
231 #if PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
232 DeprecatedString result;
234 for (int i = 0; i != len; i += chunkSize) {
236 if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
237 chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
// Only the last chunk carries the flush request of the caller.
239 result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len));
// Regular path: decode the whole input in one call.
243 return convertUsingICU(chs, len, flush);
// Fallback after the switch, asserted to be unreachable.
246 ASSERT_NOT_REACHED();
247 return DeprecatedString();
// Decodes the next chunk of a byte stream, sniffing for a byte order mark first.
//   chs   - input bytes
//   len   - number of input bytes; asserted to be non-negative
//   flush - forwarded to convert; also ends BOM sniffing even when fewer bytes have
//           arrived than m_bufferedBytes can hold
// While sniffing, the first bytes are collected in m_bufferedBytes until a BOM is
// recognized or enough bytes have arrived to rule one out. A recognized BOM replaces
// m_encoding and is not included in the returned text.
// NOTE(review): several original lines are missing from this listing: the conditions
// guarding the early return and the normal case, some return statements, and the
// declarations and assignments of buf2Len and BOMLength.
250 DeprecatedString StreamingTextDecoder::toUnicode(const char *chs, int len, bool flush)
252 ASSERT_ARG(len, len >= 0);
// Early exit with a default-constructed string; the guarding condition is not visible here.
255 return DeprecatedString();
// Nothing to decode and no flush requested; the statement executed is not visible here.
257 if (len <= 0 && !flush)
260 // Handle normal case.
// NOTE(review): the guard is not visible; presumably taken once BOM sniffing is over - confirm.
262 return convert(chs, len, flush);
264 // Check to see if we found a BOM.
265 int numBufferedBytes = m_numBufferedBytes;
266 int buf1Len = numBufferedBytes;
// buf1 walks the previously buffered bytes, buf2 walks the new input.
268 const unsigned char *buf1 = m_bufferedBytes;
269 const unsigned char *buf2 = reinterpret_cast<const unsigned char *>(chs);
// Read up to three leading bytes of the stream: buffered bytes first, then new
// input; positions with no byte available read as 0.
270 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
271 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
272 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
// FF FE: switch to little-endian UTF-16.
274 if (c1 == 0xFF && c2 == 0xFE) {
275 m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
276 m_littleEndian = true;
// FE FF: switch to big-endian UTF-16.
278 } else if (c1 == 0xFE && c2 == 0xFF) {
279 m_encoding = TextEncoding(UTF16Encoding, BigEndian);
280 m_littleEndian = false;
// EF BB BF: switch to UTF-8.
282 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
283 m_encoding = TextEncoding(UTF8Encoding);
287 // Handle case where we found a BOM.
288 if (BOMLength != 0) {
289 ASSERT(numBufferedBytes + len >= BOMLength);
// Part of the BOM may already sit in the buffered bytes; skip only the part in chs.
290 int skip = BOMLength - numBufferedBytes;
// The buffered bytes belonged to the BOM and are dropped.
291 m_numBufferedBytes = 0;
// When the input ends exactly at the end of the BOM there is nothing to decode yet.
293 return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
296 // Handle case where we know there is no BOM coming.
297 const int bufferSize = sizeof(m_bufferedBytes);
298 if (numBufferedBytes + len > bufferSize || flush) {
300 if (numBufferedBytes == 0) {
301 return convert(chs, len, flush);
// Copy the buffered bytes to a local array and clear the count before decoding:
// convertUTF16 uses m_bufferedBytes and m_numBufferedBytes for its own carry byte.
303 unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
304 memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
305 m_numBufferedBytes = 0;
// NOTE(review): the evaluation order of the two operands of + is unspecified in C++,
// and convert is stateful; if the second call runs first the stream is decoded out of
// order. Consider storing the first result in a local before making the second call.
306 return convert(bufferedBytes, numBufferedBytes, false) + convert(chs, len, flush);
309 // Continue to look for the BOM.
// Too few bytes so far to decide: stash the new bytes and wait for more input.
310 memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
311 m_numBufferedBytes += len;
315 } // namespace WebCore