- http://bugzilla.opendarwin.org/show_bug.cgi?id=8972
REGRESSION: invalid UTF-8 sequences are not displayed
Test: LayoutTests/fast/encoding/invalid-UTF-8.html
* dom/xml_tokenizer.cpp:
(WebCore::getXHTMLEntity): Properly null-terminate the result. This didn't matter
before, because the garbage at the end was guaranteed to be invalid UTF-8, and was
omitted in appendOmittingUnwanted().
* platform/StreamingTextDecoder.cpp:
(WebCore::StreamingTextDecoder::appendOmittingBOM): Only omit BOM characters.
Renamed back from appendOmittingUnwanted().
(WebCore::StreamingTextDecoder::convertUsingICU):
* platform/StreamingTextDecoder.h:
Update for the above function renaming.
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@14911 268f45cc-cd09-0410-ab3c-d52691b4dbfc
+2006-06-19 Alexey Proskuryakov <ap@nypop.com>
+
+ Reviewed by Darin.
+
+ - test for http://bugzilla.opendarwin.org/show_bug.cgi?id=8972
+ REGRESSION: invalid UTF-8 sequences are not displayed
+
+ * fast/encoding/invalid-UTF-8-expected.checksum:
+ * fast/encoding/invalid-UTF-8-expected.png:
+ * fast/encoding/invalid-UTF-8-expected.txt:
+ * fast/encoding/invalid-UTF-8.html:
+
2006-06-18 David Kilzer <ddkilzer@kilzer.net>
Reviewed by ggaren.
-d4d6443a0bb96d027ddc4b88c41e9a2a
\ No newline at end of file
+b99931093e6c8f66eab8d8d6bbcdf651
\ No newline at end of file
RenderBlock {HTML} at (0,0) size 800x600
RenderBody {BODY} at (8,8) size 784x576
RenderBlock {P} at (0,0) size 784x18
- RenderText {#text} at (0,0) size 721x18
- text run at (0,0) width 721: "This tests the rendering of invalid UTF-8 sequences. The way other browsers handle these is to omit them entirely."
- RenderBlock {P} at (0,34) size 784x36
- RenderText {#text} at (0,0) size 764x36
- text run at (0,0) width 764: "The text before should show the word \"everywhere\" right next to the word \"including\" and the word \"cube\" right next to"
- text run at (0,18) width 396: "the word \"showing\" without any visible characters in between."
- RenderBlock {HR} at (0,86) size 784x2 [border: (1px inset #000000)]
- RenderBlock {P} at (0,104) size 784x18
- RenderText {#text} at (0,0) size 346x18
- text run at (0,0) width 346: "everywhereincluding a 120-screen cubeshowing sports"
+ RenderText {#text} at (0,0) size 330x18
+ text run at (0,0) width 330: "This tests the rendering of invalid UTF-8 sequences."
+ RenderBlock {P} at (0,34) size 784x18
+ RenderText {#text} at (0,0) size 527x18
+ text run at (0,0) width 527: "The output should be: \"\x{442}??\x{442} ???\" (with black diamonds in place of question marks)."
+ RenderBlock {HR} at (0,68) size 784x2 [border: (1px inset #000000)]
+ RenderBlock {P} at (0,86) size 784x18
+ RenderText {#text} at (0,0) size 98x18
+ text run at (0,0) width 98: "\x{442}\x{FFFD}\x{FFFD}\x{442} \x{FFFD}\x{FFFD}\x{FFFD}"
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
-<p>This tests the rendering of invalid UTF-8 sequences. The way other browsers handle these is to omit them entirely.</p>
-<p>The text before should show the word "everywhere" right next to the word "including" and the word "cube" right next to the
-word "showing" without any visible characters in between.</p>
+<p>This tests the rendering of invalid UTF-8 sequences.</p>
+<p>The output should be: "т??т ???" (with black diamonds in place of question marks).</p>
<hr>
-<p>everywhere\97including a 120-screen cube\97showing sports</p>
+<p>Ñ\82µÑÑ\82 ð\90\80ð\80ð\90\90</p>
</body>
</html>
+2006-06-19 Alexey Proskuryakov <ap@nypop.com>
+
+ Reviewed by Darin.
+
+ - http://bugzilla.opendarwin.org/show_bug.cgi?id=8972
+ REGRESSION: invalid UTF-8 sequences are not displayed
+
+ Test: LayoutTests/fast/encoding/invalid-UTF-8.html
+
+ * dom/xml_tokenizer.cpp:
+ (WebCore::getXHTMLEntity): Properly null-terminate the result. This didn't matter
+ before, because the garbage at the end was guaranteed to be invalid UTF-8, and was
+ omitted in appendOmittingUnwanted();
+
+ * platform/StreamingTextDecoder.cpp:
+ (WebCore::StreamingTextDecoder::appendOmittingBOM): Only omit BOM characters.
+ Renamed back from appendOmittingUnwanted();
+
+ (WebCore::StreamingTextDecoder::convertUsingICU):
+ * platform/StreamingTextDecoder.h:
+ Update for the above function renaming.
+
2006-06-18 David Kilzer <ddkilzer@kilzer.net>
Reviewed by darin.
assert(value.length() < 5);
sharedXHTMLEntity.length = value.length();
sharedXHTMLEntity.name = name;
- memcpy(sharedXHTMLEntityResult, value.data(), sharedXHTMLEntity.length);
+ memcpy(sharedXHTMLEntityResult, value.data(), sharedXHTMLEntity.length + 1);
return &sharedXHTMLEntity;
}
{
}
-static const UChar replacementCharacter = 0xFFFD;
static const UChar BOM = 0xFEFF;
static const size_t ConversionBufferSize = 16384;
}
}
-// We strip replacement characters because the ICU converter for UTF-8 converts
-// invalid sequences into replacement characters, but other browsers discard them.
// We strip BOM characters because they can show up both at the start of content
// and inside content, and we never want them to end up in the decoded text.
-static inline bool unwanted(UChar c)
-{
- return c == replacementCharacter || c == BOM;
-}
-
-void StreamingTextDecoder::appendOmittingUnwanted(DeprecatedString& s, const UChar* characters, int byteCount)
+void StreamingTextDecoder::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount)
{
ASSERT(byteCount % sizeof(UChar) == 0);
int start = 0;
int characterCount = byteCount / sizeof(UChar);
for (int i = 0; i != characterCount; ++i) {
- if (unwanted(characters[i])) {
+ if (BOM == characters[i]) {
if (start != i)
s.append(reinterpret_cast<const QChar*>(&characters[start]), i - start);
start = i + 1;
err = U_ZERO_ERROR;
ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
int count = target - buffer;
- appendOmittingUnwanted(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
+ appendOmittingBOM(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
} while (err == U_BUFFER_OVERFLOW_ERROR);
if (U_FAILURE(err)) {
void createICUConverter();
- static void appendOmittingUnwanted(DeprecatedString&, const UChar* characters, int byteCount);
+ static void appendOmittingBOM(DeprecatedString&, const UChar* characters, int byteCount);
TextEncoding m_encoding;
bool m_littleEndian;