2 * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "StreamingTextDecoder.h"
29 #include <wtf/Assertions.h>
// Constructs a decoder for `encoding`.
// The initializers keep a copy of the encoding, derive the byte-order flag from the
// encoding's LittleEndian flag bit, and start with no bytes held back in m_bufferedBytes.
35 StreamingTextDecoder::StreamingTextDecoder(const TextEncoding& encoding)
36 : m_encoding(encoding)
37 , m_littleEndian(encoding.flags() & LittleEndian)
40 , m_numBufferedBytes(0)
// Code units that unwanted() tests for and appendOmittingUnwanted() drops from decoded text.
45 static const UChar replacementCharacter = 0xFFFD;
46 static const UChar BOM = 0xFEFF;
// Capacity, in UChars, of the on-stack scratch buffers used by the conversion routines.
47 static const size_t ConversionBufferSize = 16384;
// Single-entry converter cache shared by all decoders in this file: the destructor stores
// its converter here, and createICUConverter() takes it back when the encoding ID matches.
49 static UConverter* cachedConverterICU;
// Encoding ID recorded for cachedConverterICU; InvalidEncoding when nothing usable is cached.
50 static TextEncodingID cachedConverterEncoding = InvalidEncoding;
// Destructor. Rather than closing this decoder's ICU converter, it is handed to the
// file-level cache so a later decoder can reuse it.
52 StreamingTextDecoder::~StreamingTextDecoder()
// Only one converter is cached at a time, so close whatever was cached before.
55 if (cachedConverterICU != 0)
56 ucnv_close(cachedConverterICU);
57 cachedConverterICU = m_converterICU;
// NOTE(review): the cache is keyed on m_encoding's ID here, while createICUConverter()
// compares against effectiveEncoding(m_encoding)'s ID. For Latin1/ASCII those differ, which
// looks like it makes the cached converter a guaranteed miss - confirm.
58 cachedConverterEncoding = m_encoding.encodingID();
// Decodes a chunk of raw UTF-16 bytes into a DeprecatedString.
//   s      - input bytes
//   length - number of input bytes
// A code unit may be split across two calls: at most one leftover byte is carried in
// m_bufferedBytes[0] between calls.
62 DeprecatedString StreamingTextDecoder::convertUTF16(const unsigned char* s, int length)
64 ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
66 const unsigned char* p = s;
69 DeprecatedString result("");
// Two input bytes produce one UChar, so reserve half the byte count.
71 result.reserve(length / 2);
// Finish the code unit whose first byte was held back by the previous call.
73 if (m_numBufferedBytes != 0 && len != 0) {
74 ASSERT(m_numBufferedBytes == 1);
// Buffered byte is the low-order half, new byte the high-order half.
77 c = m_bufferedBytes[0] | (p[0] << 8);
// Buffered byte is the high-order half, new byte the low-order half.
79 c = (m_bufferedBytes[0] << 8) | p[0];
82 result.append(reinterpret_cast<QChar*>(&c), 1);
84 m_numBufferedBytes = 0;
// Convert the remaining byte pairs through a fixed-size scratch buffer, at most
// ConversionBufferSize code units per run.
90 UChar buffer[ConversionBufferSize];
91 int runLength = min(len / 2, ConversionBufferSize);
// Byte pairs assembled low byte first.
94 for (int i = 0; i < runLength; ++i) {
95 UChar c = p[0] | (p[1] << 8);
98 buffer[bufferLength++] = c;
// Byte pairs assembled high byte first.
101 for (int i = 0; i < runLength; ++i) {
102 UChar c = (p[0] << 8) | p[1];
105 buffer[bufferLength++] = c;
108 result.append(reinterpret_cast<QChar*>(buffer), bufferLength);
// Each converted code unit consumed two input bytes.
109 len -= runLength * 2;
// Hold one byte back for the next call.
// NOTE(review): presumably reached only when a single odd byte remains - confirm the guard.
113 ASSERT(m_numBufferedBytes == 0);
114 m_numBufferedBytes = 1;
115 m_bufferedBytes[0] = p[0];
// Fast path that widens each input byte directly to a UChar, without using ICU.
//   s      - input bytes
//   length - number of input bytes
//   str    - output string
// NOTE(review): judging by the name, the bool return and the `ored` accumulator, this
// presumably returns false (leaving the work to the caller's slower path) when any byte
// has the high bit set, and true with the result in `str` otherwise - confirm.
121 bool StreamingTextDecoder::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str)
123 ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);
125 DeprecatedString result("");
// Each input byte becomes exactly one UChar.
126 result.reserve(length);
128 const unsigned char* p = s;
130 unsigned char ored = 0;
// Work through the input in runs of at most ConversionBufferSize bytes.
132 UChar buffer[ConversionBufferSize];
133 int runLength = min(len, ConversionBufferSize);
134 int bufferLength = 0;
135 for (int i = 0; i < runLength; ++i) {
136 unsigned char c = *p++;
// Implicit widening from unsigned char to UChar.
138 buffer[bufferLength++] = c;
142 result.append(reinterpret_cast<QChar*>(buffer), bufferLength);
// Returns the encoding that should actually be used to decode `encoding`: Latin1 and ASCII
// are both replaced by WinLatin1, any other ID is kept. The original flags are preserved.
150 static inline TextEncoding effectiveEncoding(const TextEncoding& encoding)
152 TextEncodingID id = encoding.encodingID();
153 if (id == Latin1Encoding || id == ASCIIEncoding)
154 id = WinLatin1Encoding;
155 return TextEncoding(id, encoding.flags());
// Sets m_converterICU for this decoder's effective encoding, reusing the file-level cached
// converter when its recorded encoding ID matches and opening a new one via ucnv_open()
// otherwise. Failures are logged with LOG_ERROR.
158 void StreamingTextDecoder::createICUConverter()
160 TextEncoding encoding = effectiveEncoding(m_encoding);
161 const char* encodingName = encoding.name();
// Check the cache key first, then invalidate it unconditionally.
163 bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID();
164 cachedConverterEncoding = InvalidEncoding;
// Cache hit: take the converter and clear the cache slot so nothing else can claim it.
166 if (cachedEncodingEqual && cachedConverterICU) {
167 m_converterICU = cachedConverterICU;
168 cachedConverterICU = 0;
// Otherwise open a fresh converter by encoding name.
170 UErrorCode err = U_ZERO_ERROR;
171 ASSERT(!m_converterICU);
172 m_converterICU = ucnv_open(encodingName, &err);
174 if (err == U_AMBIGUOUS_ALIAS_WARNING)
175 LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
// NOTE(review): presumably logged only when ucnv_open() yields no converter - confirm the guard.
177 LOG_ERROR("the ICU Converter won't convert from text encoding 0x%X, error %d", encoding.encodingID(), err);
182 // We strip replacement characters because the ICU converter for UTF-8 converts
183 // invalid sequences into replacement characters, but other browsers discard them.
184 // We strip BOM characters because they can show up both at the start of content
185 // and inside content, and we never want them to end up in the decoded text.
// Returns true when `c` is U+FFFD (replacement character) or U+FEFF (byte order mark).
186 static inline bool unwanted(UChar c)
188 return c == replacementCharacter || c == BOM;
// Appends `characters` to `s`, leaving out every code unit for which unwanted() is true.
//   s          - string to append to
//   characters - UTF-16 code units to copy from
//   byteCount  - size of `characters` in BYTES (not code units); must be a multiple of
//                sizeof(UChar)
// Wanted code units are appended as contiguous runs, not one at a time.
191 void StreamingTextDecoder::appendOmittingUnwanted(DeprecatedString& s, const UChar* characters, int byteCount)
193 ASSERT(byteCount % sizeof(UChar) == 0);
195 int characterCount = byteCount / sizeof(UChar);
196 for (int i = 0; i != characterCount; ++i) {
197 if (unwanted(characters[i])) {
// Append the run of wanted code units that ends just before the unwanted one.
199 s.append(reinterpret_cast<const QChar*>(&characters[start]), i - start);
// Append the tail that follows the last unwanted code unit, if there is one.
203 if (start != characterCount)
204 s.append(reinterpret_cast<const QChar*>(&characters[start]), characterCount - start);
// Decodes `len` bytes at `chs` with the ICU converter and returns the decoded text with
// U+FFFD and U+FEFF removed.
//   chs   - input bytes
//   len   - number of input bytes
//   flush - passed through to ucnv_toUnicode() as its flush argument
// Returns an empty (default-constructed) string when ICU reports a conversion failure.
207 DeprecatedString StreamingTextDecoder::convertUsingICU(const unsigned char* chs, int len, bool flush)
209 // Get a converter for the passed-in encoding.
210 if (!m_converterICU) {
211 createICUConverter();
// NOTE(review): presumably taken only when no converter could be created - confirm the guard.
213 return DeprecatedString();
216 DeprecatedString result("");
219 UChar buffer[ConversionBufferSize];
220 const char* source = reinterpret_cast<const char*>(chs);
221 const char* sourceLimit = source + len;
// No per-character source offsets are requested from ICU.
222 int32_t* offsets = NULL;
// Convert in passes: each pass fills the scratch buffer from the start and appends what
// it produced. The loop repeats while ICU reports that the output buffer filled up.
226 UChar* target = buffer;
227 const UChar* targetLimit = target + ConversionBufferSize;
229 ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
230 int count = target - buffer;
// appendOmittingUnwanted() takes a size in bytes, hence the multiplication.
231 appendOmittingUnwanted(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
232 } while (err == U_BUFFER_OVERFLOW_ERROR);
234 if (U_FAILURE(err)) {
235 // flush the converter so it can be reused, and not be bothered by this error.
// Output of these passes is discarded; they run until all remaining input is consumed.
237 UChar *target = buffer;
238 const UChar *targetLimit = target + ConversionBufferSize;
240 ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, true, &err);
241 } while (source < sourceLimit);
242 LOG_ERROR("ICU conversion error");
// Text decoded before the failure is dropped as well.
243 return DeprecatedString();
// Dispatches a chunk of bytes to the appropriate decoding routine based on the current
// encoding ID: the hand-written UTF-16 decoder, the ASCII fast path where it applies, and
// the ICU converter for everything else.
//   chs   - input bytes
//   len   - number of input bytes
//   flush - forwarded to convertUsingICU()
249 DeprecatedString StreamingTextDecoder::convert(const unsigned char* chs, int len, bool flush)
251 switch (m_encoding.encodingID()) {
// NOTE(review): presumably the UTF-16 case - confirm the case label.
253 return convertUTF16(chs, len);
// Try the ASCII fast path first.
257 case WinLatin1Encoding: {
258 DeprecatedString result;
259 if (convertIfASCII(chs, len, result))
265 // If a previous run used ICU, we might have a partly converted character.
266 // If so, don't use the optimized ASCII code path.
267 if (!m_converterICU) {
268 DeprecatedString result;
269 if (convertIfASCII(chs, len, result))
// Debug aid, off by default: when the macro below is defined to a non-zero chunk size, the
// input is fed to ICU in pieces of at most that many bytes, with flush passed only for the
// final piece.
278 //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000
279 #if PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
280 DeprecatedString result;
282 for (int i = 0; i != len; i += chunkSize) {
284 if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
285 chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
287 result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len));
// General path: hand the whole chunk to ICU.
291 return convertUsingICU(chs, len, flush);
// Entry point: decodes the next chunk of a byte stream.
//   chs   - input bytes
//   len   - number of input bytes; must be >= 0
//   flush - true when no more input will follow
// Before the first bytes are converted, the input is checked for a UTF-16 LE, UTF-16 BE or
// UTF-8 byte order mark. If one is found, it overrides m_encoding and is skipped. Bytes
// that could still be the beginning of a BOM are held in m_bufferedBytes until enough
// input has arrived to decide.
295 DeprecatedString StreamingTextDecoder::toUnicode(const char* chs, int len, bool flush)
297 ASSERT_ARG(len, len >= 0);
300 return DeprecatedString();
// Nothing to decode and nothing to flush.
302 if (len <= 0 && !flush)
305 // Handle normal case.
307 return convert(chs, len, flush);
309 // Check to see if we found a BOM.
310 int numBufferedBytes = m_numBufferedBytes;
311 int buf1Len = numBufferedBytes;
313 const unsigned char* buf1 = m_bufferedBytes;
314 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
// Read the first three bytes of the stream: previously buffered bytes first, then the new
// input. A position for which no byte is available yet reads as 0.
315 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
316 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
317 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
// FF FE: UTF-16 little-endian BOM.
319 if (c1 == 0xFF && c2 == 0xFE) {
320 m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
321 m_littleEndian = true;
// FE FF: UTF-16 big-endian BOM.
323 } else if (c1 == 0xFE && c2 == 0xFF) {
324 m_encoding = TextEncoding(UTF16Encoding, BigEndian);
325 m_littleEndian = false;
// EF BB BF: UTF-8 BOM.
327 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
328 m_encoding = TextEncoding(UTF8Encoding);
332 // Handle case where we found a BOM.
333 if (BOMLength != 0) {
334 ASSERT(numBufferedBytes + len >= BOMLength);
// Part of the BOM may already be in m_bufferedBytes; skip only what is in this chunk.
335 int skip = BOMLength - numBufferedBytes;
336 m_numBufferedBytes = 0;
338 return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
341 // Handle case where we know there is no BOM coming.
342 const int bufferSize = sizeof(m_bufferedBytes);
// Either more bytes have arrived than m_bufferedBytes can hold, or the stream is ending.
343 if (numBufferedBytes + len > bufferSize || flush) {
345 if (numBufferedBytes == 0) {
346 return convert(chs, len, flush);
// Move the held-back bytes to a local copy and reset the member state before converting.
// convertUTF16() asserts on m_numBufferedBytes and can write to m_bufferedBytes itself.
348 unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
349 memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
350 m_numBufferedBytes = 0;
// NOTE(review): both convert() calls mutate decoder state, and C++ leaves the evaluation
// order of the operands of `+` unspecified - consider sequencing them as two statements.
351 return convert(bufferedBytes, numBufferedBytes, false) + convert(chs, len, flush);
354 // Continue to look for the BOM.
// The check above guarantees the new bytes fit in m_bufferedBytes.
355 memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
356 m_numBufferedBytes += len;
360 } // namespace WebCore