--- /dev/null
+#!/usr/bin/perl -w
+
+# Enable autoflush on STDOUT ($| = 1) so every print below is written out immediately instead of being buffered.
+select (STDOUT);
+$| = 1;
+
+print "Content-Type: text/html\n";
+print "Expires: Thu, 01 Dec 2003 16:00:00 GMT\n";
+print "Cache-Control: no-store, no-cache, must-revalidate\n";
+print "Pragma: no-cache\n";
+print "\n";
+
+print "\xef\xbb\xbf<body><p>Test for bug 10697: Errors in incremental decoding of UTF-8.</p>\n";
+print "<p>Should be a blank page (except for this description).</p>\n";
+print "<script>\n";
+print "if (window.layoutTestController)\n";
+print " layoutTestController.dumpAsText();\n";
+print "</script>\n";
+
+# U+2003 (EM SPACE) is the UTF-8 byte sequence E2 80 83; the prints below end first after the E2 byte, then after the 80 byte.
+print "\xe2";
+for ($count=1; $count<4000; $count++) {
+ print "\x80\x83\xe2";
+}
+print "\x80";
+for ($count=1; $count<4000; $count++) {
+ print "\x83\xe2\x80";
+}
+print "\x83";
+2006-09-03 Alexey Proskuryakov <ap@nypop.com>
+
+ Reviewed by Eric.
+
+ http://bugzilla.opendarwin.org/show_bug.cgi?id=10697
+ REGRESSION (r16175): Errors in incremental decoding of UTF-8
+
+ Tests:
+ - http/tests/incremental/slow-utf8-html.pl
+ - fast/encoding/charset-invalid.html
+
+ * loader/Decoder.cpp:
+ (Decoder::Decoder): Ensure that we have a valid encoding. Get its name via TextEncoding, to match
+ the logic in setEncodingName().
+ (Decoder::setEncodingName): Only set m_encodingName if the encoding is valid. Rely on TextEncoding
+ constructor to lowercase it if necessary.
+ (Decoder::decode): Use setEncodingName() to apply the encoding from a BOM. Don't try to ensure the
+ validity of the encoding - it is enforced by the class constructor and setEncodingName().
+
2006-09-02 Sam Weinig <sam.weinig@gmail.com>
Reviewed by Tim H.
Decoder::Decoder(const String& mimeType, const String& defaultEncodingName)
: m_encoding(defaultEncodingName.isNull() ? "iso8859-1" : defaultEncodingName.latin1())
- , m_encodingName(defaultEncodingName.isNull() ? "iso8859-1" : defaultEncodingName.latin1())
+ , m_encodingName(m_encoding.name())
, m_type(DefaultEncoding)
, m_reachedBody(false)
, m_checkedForCSSCharset(false)
} else
m_contentType = PlainText;
- m_decoder.set(StreamingTextDecoder::create(m_encoding));
+ if (m_encoding.isValid())
+ m_decoder.set(StreamingTextDecoder::create(m_encoding));
+ else
+ setEncodingName("iso-8859-1", DefaultEncoding);
}
Decoder::~Decoder()
void Decoder::setEncodingName(const char* encodingName, EncodingSource type)
{
- m_encodingName = encodingName;
- m_encodingName = m_encodingName.lower();
-
- if (m_encodingName.isEmpty())
+ if (encodingName[0] == '\0')
return;
bool eightBitOnly = type == EncodingFromMetaTag || type == EncodingFromXMLHeader || type == EncodingFromCSSCharset;
- TextEncoding encoding = TextEncoding(m_encodingName, eightBitOnly);
+ TextEncoding encoding = TextEncoding(encodingName, eightBitOnly);
// in case the encoding didn't exist, we keep the old one (fixes some sites specifying invalid encodings)
if (encoding.isValid()) {
- m_encodingName = encoding.name();
+ m_encodingName = encoding.name(); // use a standard name for the encoding
m_encoding = encoding;
m_type = type;
m_decoder.set(StreamingTextDecoder::create(m_encoding));
}
// If we found a BOM, use the encoding it implies.
- if (autoDetectedEncoding != 0) {
- m_type = AutoDetectedEncoding;
- m_encoding = TextEncoding(autoDetectedEncoding);
- ASSERT(m_encoding.isValid());
- m_encodingName = m_encoding.name();
- m_decoder.set(StreamingTextDecoder::create(m_encoding));
- }
+ if (autoDetectedEncoding != 0)
+ setEncodingName(autoDetectedEncoding, AutoDetectedEncoding);
}
m_checkedForBOM = true;
}
setEncodingName(autoDetectedEncoding, AutoDetectedEncoding);
}
- // If we still haven't found an encoding, assume latin1
- // (this can happen if an empty name is passed from outside).
- if (m_encodingName.isEmpty() || !m_encoding.isValid()) {
- m_encodingName = "iso8859-1";
- m_encoding = TextEncoding(Latin1Encoding);
- }
- m_decoder.set(StreamingTextDecoder::create(m_encoding));
+ ASSERT(m_encoding.isValid());
DeprecatedString out;