2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
24 #include "TextResourceDecoder.h"
26 #include "DOMImplementation.h"
27 #include "HTMLNames.h"
28 #include "TextCodec.h"
29 #include "TextEncodingRegistry.h"
30 #include <wtf/ASCIICType.h>
31 #include <wtf/StringExtras.h>
37 using namespace HTMLNames;
39 // You might think we should put these find functions elsewhere, perhaps with the
40 // similar functions that operate on UChar, but arguably only the decoder has
41 // a reason to process strings of char rather than UChar.
// Searches the first subjectLength bytes of subject for the NUL-terminated string
// target, comparing byte-for-byte at each candidate offset.
// NOTE(review): the return statements are not visible in this excerpt; the result is an
// int, presumably the offset of the first match or -1 when there is none -- confirm.
43 static int find(const char* subject, size_t subjectLength, const char* target)
45 size_t targetLength = strlen(target);
// A target longer than the subject cannot match. Testing this first also keeps the
// unsigned subtraction in the loop bound below from wrapping around.
46 if (targetLength > subjectLength)
48 for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
// Compare target against the window of subject that starts at offset i.
50 for (size_t j = 0; j < targetLength; ++j) {
51 if (subject[i + j] != target[j]) {
// Case-insensitive counterpart of find(): every byte of subject is folded with
// toASCIILower() before it is compared with target. target is compared as-is, so it
// has to consist of lowercase ASCII letters, which the ASSERT loop below checks.
// NOTE(review): the return statements are not visible in this excerpt; a caller in this
// file compares the result against -1 -- confirm that is the "not found" value.
62 static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target)
64 size_t targetLength = strlen(target);
// A target longer than the subject cannot match; this also protects the unsigned
// subtraction in the search loop bound below.
65 if (targetLength > subjectLength)
// Debug-only check of the precondition on target.
68 for (size_t i = 0; i < targetLength; ++i)
69 ASSERT(isASCIILower(target[i]));
71 for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
// Compare target against the lowercased window of subject that starts at offset i.
73 for (size_t j = 0; j < targetLength; ++j) {
74 if (toASCIILower(subject[i + j]) != target[j]) {
// Makes a NUL-terminated copy of the first `length` bytes of encodingName in a local
// buffer. Callers in this file pass slices of a larger buffer, so the input is not
// NUL-terminated at `length` on its own.
// NOTE(review): the return statement is not visible in this excerpt; presumably the
// TextEncoding is constructed from this copy -- confirm.
85 static TextEncoding findTextEncoding(const char* encodingName, int length)
// One extra byte for the terminator written below.
87 Vector<char, 64> buffer(length + 1);
88 memcpy(buffer.data(), encodingName, length);
89 buffer[length] = '\0';
// Members of KanjiCode (the enclosing class header is not visible in this excerpt).
// Type enumerates the possible results of judge().
95 enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
// Classifies a byte buffer; defined out of line further down in this file.
96 static enum Type judge(const char* str, int length);
// The escape byte that judge() looks for at the start of three-byte sequences.
97 static const int ESC = 0x1b;
// Per-byte flag table consulted by ISkanji() and ISkana(); defined out of line below.
98 static const unsigned char sjisMap[256];
// Tests bit 0 of the sjisMap entry for the low byte of code; the result is nonzero
// when that bit is set.
// NOTE(review): the lines between the signature and this return are not visible in
// this excerpt, so any handling of codes above 0xff is not shown here.
99 static int ISkanji(int code)
103 return sjisMap[code & 0xff] & 1;
// Tests bit 1 (value 2) of the sjisMap entry for the low byte of code; the result is
// nonzero when that bit is set.
// NOTE(review): the lines between the signature and this return are not visible in
// this excerpt, so any handling of codes above 0xff is not shown here.
105 static int ISkana(int code)
109 return sjisMap[code & 0xff] & 2;
// Flag table indexed by byte value. Entries for 0x81-0x9f and 0xe0-0xfc are 1 (the bit
// ISkanji() tests), entries for 0xa1-0xdf are 2 (the bit ISkana() tests), and every
// other entry is 0. Each row below covers 16 consecutive byte values, starting at 0x00.
113 const unsigned char KanjiCode::sjisMap[256] = {
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
124 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
125 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
126 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
127 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
128 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
134 * [0xa1 - 0xfe][0xa1 - 0xfe]
135 * 0x8e[0xa1 - 0xfe](SS2)
136 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
139 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
141 * Shift_Jis Hankaku Kana is
146 * KanjiCode::judge() is based on judge_jcode() from jvim
147 * http://hp.vector.co.jp/authors/VA003457/vim/
149 * Special Thanks to Kenichi Tsuchida
// Heuristically classifies the bytes in str. The visible logic looks for ESC-introduced
// three-byte sequences, and otherwise keeps two running scores, `sjis` and `euc`, that
// are bumped when neighbouring bytes fit Shift_JIS or EUC patterns (kudokuten weigh 100,
// a preceding hiragana pair 40, a hiragana or katakana lead byte 1). The two scores are
// compared near the end of the function.
// NOTE(review): many statements of this function, including every return, are not
// visible in this excerpt, so the mapping from these checks to the returned Type is not
// shown here.
152 enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
156 int bfr = false; /* Kana Moji */
157 int bfk = 0; /* EUC Kana */
// View the input as unsigned bytes so that the comparisons against values above 0x7f
// below behave as written.
161 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
// An escape sequence needs ESC plus two more bytes inside the buffer.
167 if (ptr[i] == ESC && (size - i >= 3)) {
168 if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
169 || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
172 } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
173 || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
176 } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
179 } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
191 /* ?? check kudokuten ?? && ?? hiragana ?? */
192 if ((i >= 2) && (ptr[i - 2] == 0x81)
193 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
195 sjis += 100; /* kudokuten */
196 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
197 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
199 euc += 100; /* kudokuten */
200 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
201 sjis += 40; /* hiragana */
202 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
203 euc += 40; /* hiragana */
206 /* ?? check hiragana or katakana ?? */
207 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
208 sjis++; /* hiragana */
209 } else if ((size - i > 1) && (ptr[i] == 0x83)
210 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
211 sjis++; /* katakana */
212 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
213 euc++; /* hiragana */
214 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
215 euc++; /* katakana */
218 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
221 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
224 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
227 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
230 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
233 } else if (ptr[i] <= 0x7f) {
237 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
238 euc++; /* sjis hankaku kana kigo */
239 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
240 ; /* sjis hankaku kana */
241 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
243 } else if (0x8e == ptr[i]) {
245 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
251 } else if (0x8e == ptr[i]) {
254 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
255 /* EUC KANA or SJIS KANJI */
266 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
270 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
271 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
274 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
278 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
281 } else if (ptr[i] <= 0x7f) {
294 } else if (sjis < euc) {
// Classifies a MIME type with three checks, in this order: a case-insensitive match
// against "text/css", a case-insensitive match against "text/html", and
// DOMImplementation::isXMLMIMEType().
// NOTE(review): the return statements are not visible in this excerpt; presumably each
// check yields the matching ContentType (CSS, HTML and XML are used elsewhere in this
// file) -- confirm, including what is returned when none of the checks match.
302 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
304 if (equalIgnoringCase(mimeType, "text/css"))
306 if (equalIgnoringCase(mimeType, "text/html"))
308 if (DOMImplementation::isXMLMIMEType(mimeType))
// Chooses the encoding a decoder starts out with: UTF-8 for XML content, otherwise the
// caller-specified default, falling back to Latin-1 when that default is not valid.
313 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
315 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
316 // for text/xml. This matches Firefox.
317 if (contentType == XML)
318 return UTF8Encoding();
// No usable default was supplied by the caller.
319 if (!specifiedDefaultEncoding.isValid())
320 return Latin1Encoding();
321 return specifiedDefaultEncoding;
// Derives the content type from the MIME type and starts with the default encoding for
// that content type. The visible initializers mark the encoding source as
// DefaultEncoding, clear the "checked for BOM / CSS charset / head charset" flags, and
// turn lenient XML decoding off.
// NOTE(review): the constructor body and any further initializers are not visible in
// this excerpt.
324 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding)
325 : m_contentType(determineContentType(mimeType))
// Uses m_contentType, which is initialized on the line above.
326 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
327 , m_source(DefaultEncoding)
328 , m_checkedForBOM(false)
329 , m_checkedForCSSCharset(false)
330 , m_checkedForHeadCharset(false)
331 , m_useLenientXMLDecoding(false)
// Destructor. (Its body is not visible in this excerpt.)
336 TextResourceDecoder::~TextResourceDecoder()
// Updates m_encoding from `encoding`, taking into account where it came from (`source`):
//  - an invalid encoding is tested for first (see the comment below);
//  - "x-user-defined" coming from a meta tag is replaced with windows-1252;
//  - an encoding coming from a meta tag, an XML header or a CSS @charset is replaced
//    with its closest byte-based equivalent;
//  - the last visible assignment stores `encoding` unchanged.
// NOTE(review): several lines of this function (the early exit, the final branch
// keyword and anything after the assignments) are not visible in this excerpt.
340 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
342 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
343 if (!encoding.isValid())
346 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
347 // treat x-user-defined as windows-1252 (bug 18270)
348 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
349 m_encoding = "windows-1252";
350 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
351 m_encoding = encoding.closestByteBasedEquivalent();
353 m_encoding = encoding;
359 // Returns the position of the encoding string.
// Looks for "encoding" within the first len bytes of str, then expects optional
// whitespace/control bytes, an '=', more optional whitespace/control bytes and a value
// quoted with '"' or '\''. encodingLength receives the distance from pos to the closing
// quotation mark.
// NOTE(review): the early-exit returns, the statements that advance pos and the final
// return are not visible in this excerpt; a caller in this file adds the result to str
// to address the encoding name -- confirm the exact "not found" value.
360 static int findXMLEncoding(const char* str, int len, int& encodingLength)
362 int pos = find(str, len, "encoding");
367 // Skip spaces and stray control characters.
368 while (pos < len && str[pos] <= ' ')
// Anything other than '=' at this point means there is no usable encoding attribute.
372 if (pos >= len || str[pos] != '=')
376 // Skip spaces and stray control characters.
377 while (pos < len && str[pos] <= ' ')
380 // Skip quotation mark.
// Either quote style is accepted; the same character must close the value.
383 char quoteMark = str[pos];
384 if (quoteMark != '"' && quoteMark != '\'')
388 // Find the trailing quotation mark.
390 while (end < len && str[end] != quoteMark)
395 encodingLength = end - pos;
399 // true if there is more to parse
// Loops while pos (taken by reference) is before dataEnd and points at a tab or a space;
// no other characters (line breaks included) are treated as whitespace here. The result
// is whether pos stopped short of dataEnd.
400 static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
402 while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
404 return pos != dataEnd;
// Inspects the first bytes of the stream -- whatever is already in m_buffer followed by
// the new data -- for a UTF-8, UTF-16 or UTF-32 byte order mark and switches the
// encoding accordingly, with source AutoDetectedEncoding.
// NOTE(review): the statements that set lengthOfBOM and the return are not visible in
// this excerpt; decode() uses the returned value as the number of leading bytes to skip.
407 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
409 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
410 // We let it override even a user-chosen encoding.
411 ASSERT(!m_checkedForBOM);
413 size_t lengthOfBOM = 0;
415 size_t bufferLength = m_buffer.size();
// Take the first bytes from m_buffer while it has any left, then from data; a byte that
// is not available reads as 0. c4 is only ever taken from data.
417 size_t buf1Len = bufferLength;
418 size_t buf2Len = len;
419 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
420 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
421 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
422 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
423 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
424 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
426 // Check for the BOM.
// FF FE starts both the UTF-16LE and the UTF-32LE marks; the next two bytes tell them apart.
427 if (c1 == 0xFF && c2 == 0xFE) {
428 if (c3 != 0 || c4 != 0) {
429 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
432 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
435 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
436 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
438 } else if (c1 == 0xFE && c2 == 0xFF) {
439 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
441 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
442 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
// The check is finished once a BOM was found or at least four bytes have been seen in
// total; with fewer bytes it stays pending.
446 if (lengthOfBOM || bufferLength + len >= 4)
447 m_checkedForBOM = true;
// Looks for a leading @charset rule with a quoted encoding name in a style sheet and, if
// one is found, switches to that encoding with source EncodingFromCSSCharset. Incoming
// data is appended to m_buffer, and movedDataToBuffer is set to tell the caller so.
// NOTE(review): the return statements and several other lines are not visible in this
// excerpt, so the exact meaning of the bool result is not shown here.
452 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
// An encoding that did not come from the default makes this check unnecessary.
454 if (m_source != DefaultEncoding) {
455 m_checkedForCSSCharset = true;
// Append the new chunk so the rule can be matched across chunk boundaries.
459 size_t oldSize = m_buffer.size();
460 m_buffer.grow(oldSize + len);
461 memcpy(m_buffer.data() + oldSize, data, len);
463 movedDataToBuffer = true;
465 if (m_buffer.size() > 8) { // strlen("@charset") == 8
466 const char* dataStart = m_buffer.data();
467 const char* dataEnd = dataStart + m_buffer.size();
// The keyword is matched exactly (lowercase) and only at the very start of the buffer.
469 if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
470 dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
473 const char* pos = dataStart;
474 if (!skipWhitespace(pos, dataEnd))
// The encoding name has to be quoted; either quote style is accepted.
477 if (*pos == '"' || *pos == '\'') {
478 char quotationMark = *pos;
482 while (pos < dataEnd && *pos != quotationMark)
// NOTE(review): if dataStart points at the first byte of the name at this point (any
// reassignment of it is not visible in this excerpt), the + 1 makes the length include
// the closing quotation mark -- confirm this is intended.
487 int encodingNameLength = pos - dataStart + 1;
490 if (!skipWhitespace(pos, dataEnd))
494 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
497 m_checkedForCSSCharset = true;
503 // Other browsers allow comments in the head section, so we need to also.
504 // It's important not to look for tags inside the comments.
// Helper for stepping over an HTML comment; ptr is taken by reference and pEnd marks the
// end of the available data. The visible checks recognize three terminators: "<!-->",
// the regular "-->", and the lenient "--!>".
// NOTE(review): the statements that advance ptr and the loop structure are not visible
// in this excerpt, so where ptr ends up in each case is not shown here.
505 static inline void skipComment(const char*& ptr, const char* pEnd)
508 // Allow <!-->; other browsers do.
514 // This is the real end of comment, "-->".
515 if (p[1] == '-' && p[2] == '>') {
519 // This is the incorrect end of comment that other browsers allow, "--!>".
520 if (p[1] == '-' && p[2] == '!' && p[3] == '>') {
531 const int bytesToCheckUnconditionally = 1024; // This many input bytes are checked for a meta charset even after the <head> section is over.
// Scans the beginning of an HTML or XML document for an encoding declaration. Incoming
// data is appended to m_buffer (movedDataToBuffer is set to tell the caller), then:
//  1. the start of the buffer is checked for an XML declaration, whose encoding
//     attribute is applied with source EncodingFromXMLHeader, and for the byte patterns
//     that "<?x" / "<?" produce in UTF-16 and UTF-32;
//  2. tags are scanned for a <meta> tag carrying "charset", skipping comments and the
//     contents of <title>, <script> and <noscript>. A check of m_contentType against XML
//     precedes this scan.
// NOTE(review): the return statements and many other lines of this function are not
// visible in this excerpt, so the meaning of the bool result and the exact control flow
// are not shown here.
533 bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
// An encoding that did not come from the default makes this check unnecessary.
535 if (m_source != DefaultEncoding) {
536 m_checkedForHeadCharset = true;
540 // This is not completely efficient, since the function might go
541 // through the HTML head several times.
543 size_t oldSize = m_buffer.size();
544 m_buffer.grow(oldSize + len);
545 memcpy(m_buffer.data() + oldSize, data, len);
547 movedDataToBuffer = true;
549 const char* ptr = m_buffer.data();
550 const char* pEnd = ptr + m_buffer.size();
552 // Is there enough data available to check for XML declaration?
553 if (m_buffer.size() < 8)
556 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
557 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
558 if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
// Look for the '>' that ends the declaration within the buffered data.
559 const char* xmlDeclarationEnd = ptr;
560 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
562 if (xmlDeclarationEnd == pEnd)
564 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
566 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
568 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
569 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
// The remaining branches match "<?x" / "<?" as encoded in UTF-16LE, UTF-16BE, UTF-32LE
// and UTF-32BE respectively.
570 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
571 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
573 } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
574 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
576 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
577 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
579 } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
580 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
584 // we still don't have an encoding, and are in the head
585 // the following tags are allowed in <head>:
586 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
588 // We stop scanning when a tag that is not permitted in <head>
589 // is seen, rather than when </head> is seen, because that more closely
590 // matches behavior in other browsers; more details in
591 // <http://bugs.webkit.org/show_bug.cgi?id=3590>.
593 // Additionally, we ignore things that look like tags in <title>, <script> and <noscript>; see
594 // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
595 // and <http://bugs.webkit.org/show_bug.cgi?id=12389>.
597 // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>,
598 // we don't bail out until we've checked at least bytesToCheckUnconditionally bytes of input.
// Non-null while scanning inside <title>, <script> or <noscript>; holds that tag's name.
600 AtomicStringImpl* enclosingTagName = 0;
601 bool inHeadSection = true; // Becomes false when </head> or any tag not allowed in head is encountered.
603 // the HTTP-EQUIV meta has no effect on XHTML
604 if (m_contentType == XML)
607 while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
613 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
615 skipComment(ptr, pEnd);
616 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
617 // Some pages that test bandwidth from within the browser do it by having
618 // huge comments and measuring the time they take to load. Repeatedly scanning
619 // these comments can take a lot of CPU time.
620 m_checkedForHeadCharset = true;
631 // Grab the tag name, but mostly ignore namespaces.
632 bool sawNamespace = false;
645 if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
647 else if (c >= 'A' && c <= 'Z')
651 tagBuffer[len++] = c;
655 AtomicString tag(tagBuffer);
// Inside <title>/<script>/<noscript> only the matching end tag is of interest.
657 if (enclosingTagName) {
658 if (end && tag.impl() == enclosingTagName)
659 enclosingTagName = 0;
662 enclosingTagName = titleTag.localName().impl();
663 else if (tag == scriptTag)
664 enclosingTagName = scriptTag.localName().impl();
665 else if (tag == noscriptTag)
666 enclosingTagName = noscriptTag.localName().impl();
669 // Find where the opening tag ends.
670 const char* tagContentStart = ptr;
// Quoted attribute values are stepped over so that a '>' inside them does not end the tag.
672 while (ptr != pEnd && *ptr != '>') {
673 if (*ptr == '\'' || *ptr == '"') {
674 char quoteMark = *ptr;
676 while (ptr != pEnd && *ptr != quoteMark)
// Only start tags named meta, seen without a namespace, are searched for "charset".
688 if (!end && tag == metaTag && !sawNamespace) {
689 const char* str = tagContentStart;
690 int length = ptr - tagContentStart;
692 while (pos < length) {
693 int charsetPos = findIgnoringCase(str + pos, length - pos, "charset");
694 if (charsetPos == -1)
// 7 == strlen("charset")
696 pos += charsetPos + 7;
698 while (pos < length && str[pos] <= ' ')
702 if (str[pos++] != '=')
704 while ((pos < length) &&
705 (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
712 while (end < length &&
713 str[end] != ' ' && str[end] != '"' && str[end] != '\'' &&
714 str[end] != ';' && str[end] != '>')
716 setEncoding(findTextEncoding(str + pos, end - pos), EncodingFromMetaTag);
717 if (m_source == EncodingFromMetaTag)
720 if (end >= length || str[end] == '/' || str[end] == '>')
// Any other alphabetic tag (apart from the <html> and <head> start tags) ends the head section.
726 if (!enclosingTagName && tag != scriptTag && tag != noscriptTag && tag != styleTag
727 && tag != linkTag && tag != metaTag && tag != objectTag && tag != titleTag && tag != baseTag
728 && (end || tag != htmlTag) && (end || tag != headTag) && isASCIIAlpha(tagBuffer[0])) {
729 inHeadSection = false;
// Past the head section and past the unconditional window: the check is finished.
732 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
733 m_checkedForHeadCharset = true;
// Runs KanjiCode::judge() over the data and, depending on the result, switches to
// ISO-2022-JP, EUC-JP or Shift_JIS with source AutoDetectedEncoding. The ASCII, UTF16
// and UTF8 results share one case group.
// NOTE(review): the case labels in front of the first two setEncoding() calls and the
// body of the ASCII/UTF16/UTF8 group are not visible in this excerpt; presumably the
// labels are JIS and EUC -- confirm.
743 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
745 switch (KanjiCode::judge(data, len)) {
747 setEncoding("ISO-2022-JP", AutoDetectedEncoding);
750 setEncoding("EUC-JP", AutoDetectedEncoding);
752 case KanjiCode::SJIS:
753 setEncoding("Shift_JIS", AutoDetectedEncoding);
755 case KanjiCode::ASCII:
756 case KanjiCode::UTF16:
757 case KanjiCode::UTF8:
// Decodes the next chunk of the resource. Order of the visible steps:
//  1. the BOM check, while it is still pending;
//  2. the @charset check for CSS, while it is still pending;
//  3. the head / XML declaration check for HTML and XML, while it is still pending;
//  4. Japanese auto-detection, when the current encoding is a Japanese one and was
//     neither chosen by the user nor already auto-detected;
//  5. decoding with a codec created for m_encoding: the chunk is decoded directly when
//     nothing is buffered, otherwise it joins m_buffer (unless one of the checks already
//     moved it there) and the whole buffer is decoded.
// NOTE(review): the statements executed when a check returns false, the condition
// guarding the codec creation and the final return are not visible in this excerpt.
762 String TextResourceDecoder::decode(const char* data, size_t len)
764 size_t lengthOfBOM = 0;
765 if (!m_checkedForBOM)
766 lengthOfBOM = checkForBOM(data, len);
768 bool movedDataToBuffer = false;
770 if (m_contentType == CSS && !m_checkedForCSSCharset)
771 if (!checkForCSSCharset(data, len, movedDataToBuffer))
774 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
775 if (!checkForHeadCharset(data, len, movedDataToBuffer))
778 // Do the auto-detect if our default encoding is one of the Japanese ones.
779 // FIXME: It seems wrong to change our encoding downstream after we have already done some decoding.
780 if (m_source != UserChosenEncoding && m_source != AutoDetectedEncoding && m_encoding.isJapanese())
781 detectJapaneseEncoding(data, len);
783 ASSERT(m_encoding.isValid());
786 m_codec.set(newTextCodec(m_encoding).release());
// NOTE(review): this call passes `m_contentType == XML` as its fourth argument, while the
// buffered call below and flush() pass `m_contentType == XML && !m_useLenientXMLDecoding`
// -- confirm whether the lenient flag is meant to be ignored on this path.
788 if (m_buffer.isEmpty())
789 return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
// The chunk still has to join the buffered bytes if none of the checks above moved it.
791 if (!movedDataToBuffer) {
792 size_t oldSize = m_buffer.size();
793 m_buffer.grow(oldSize + len);
794 memcpy(m_buffer.data() + oldSize, data, len);
797 String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
// Decodes whatever is left in m_buffer, passing true as the third argument of the
// codec's decode() (decode() above passes false there), and re-arms the BOM check.
// NOTE(review): the condition guarding the codec creation, the buffer clean-up and the
// return are not visible in this excerpt.
802 String TextResourceDecoder::flush()
805 m_codec.set(newTextCodec(m_encoding).release());
807 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
810 m_checkedForBOM = false; // Skip BOM again when re-decoding.