Reviewed by Maciej.
author ap@webkit.org <ap@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 27 Dec 2007 08:22:24 +0000 (08:22 +0000)
committer ap@webkit.org <ap@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 27 Dec 2007 08:22:24 +0000 (08:22 +0000)
        http://bugs.webkit.org/show_bug.cgi?id=14500
        need to be more generous about charset declaration with meta tag

        http://bugs.webkit.org/show_bug.cgi?id=12526
        <rdar://problem/4867183> Safari ignores encoding description "charset=Shift_JIS" in invalid html

        <rdar://problem/4892428> Unlike other browsers, WebKit ignores <meta> charset definitions outside the head

        <rdar://problem/5643774> REGRESSION: Text is garbled when clicking a link inside an Arabic website

        Tests: fast/encoding/ahram-org-eg.html
               fast/encoding/bandai-co-jp-releases.html
               fast/encoding/floraexpress-ru.html
               fast/encoding/hanarei-blog32-fc2-com.html
               fast/encoding/yahoo-mail.html

        * loader/TextResourceDecoder.cpp:
        (WebCore::TextResourceDecoder::checkForHeadCharset): Don't stop looking for <meta> until we've
        seen at least 512 bytes of input.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@28998 268f45cc-cd09-0410-ab3c-d52691b4dbfc

13 files changed:
LayoutTests/ChangeLog
LayoutTests/fast/encoding/ahram-org-eg-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/ahram-org-eg.html [new file with mode: 0644]
LayoutTests/fast/encoding/bandai-co-jp-releases-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/bandai-co-jp-releases.html [new file with mode: 0644]
LayoutTests/fast/encoding/floraexpress-ru-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/floraexpress-ru.html [new file with mode: 0644]
LayoutTests/fast/encoding/hanarei-blog32-fc2-com-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/hanarei-blog32-fc2-com.html [new file with mode: 0644]
LayoutTests/fast/encoding/yahoo-mail-expected.txt [new file with mode: 0644]
LayoutTests/fast/encoding/yahoo-mail.html [new file with mode: 0644]
WebCore/ChangeLog
WebCore/loader/TextResourceDecoder.cpp

index 6fc4a69..329a236 100644 (file)
@@ -1,3 +1,28 @@
+2007-12-27  Alexey Proskuryakov  <ap@webkit.org>
+
+        Reviewed by Maciej.
+
+        http://bugs.webkit.org/show_bug.cgi?id=14500
+        need to be more generous about charset declaration with meta tag
+        
+        http://bugs.webkit.org/show_bug.cgi?id=12526
+        <rdar://problem/4867183> Safari ignores encoding description "charset=Shift_JIS" in invalid html
+
+        <rdar://problem/4892428> Unlike other browsers, WebKit ignores <meta> charset definitions outside the head
+
+        <rdar://problem/5643774> REGRESSION: Text is garbled when clicking a link inside an Arabic website
+
+        * fast/encoding/ahram-org-eg-expected.txt: Added.
+        * fast/encoding/ahram-org-eg.html: Added.
+        * fast/encoding/bandai-co-jp-releases-expected.txt: Added.
+        * fast/encoding/bandai-co-jp-releases.html: Added.
+        * fast/encoding/floraexpress-ru-expected.txt: Added.
+        * fast/encoding/floraexpress-ru.html: Added.
+        * fast/encoding/hanarei-blog32-fc2-com-expected.txt: Added.
+        * fast/encoding/hanarei-blog32-fc2-com.html: Added.
+        * fast/encoding/yahoo-mail-expected.txt: Added.
+        * fast/encoding/yahoo-mail.html: Added.
+
 2007-12-26  Dan Bernstein  <mitz@apple.com>
 
         Rubber-stamped by Anders Carlsson.
diff --git a/LayoutTests/fast/encoding/ahram-org-eg-expected.txt b/LayoutTests/fast/encoding/ahram-org-eg-expected.txt
new file mode 100644 (file)
index 0000000..990472f
--- /dev/null
@@ -0,0 +1,3 @@
+Test a particular example of broken markup that was making us ignore a charset declaration.
+
+Encoding: windows-1256 (should be windows-1256)
diff --git a/LayoutTests/fast/encoding/ahram-org-eg.html b/LayoutTests/fast/encoding/ahram-org-eg.html
new file mode 100644 (file)
index 0000000..d75cf06
--- /dev/null
@@ -0,0 +1,28 @@
+<htm>
+
+<head>
+<meta http-equiv="content-language" content="en-us">
+<meta http-equiv='content-type' content='text/htm; charset=windows-1256'>
+<script language="javascript">
+       if (navigator.appName == "Netscape")
+       {
+               document.write("<meta http-equiv='content-type' content='text/htm; charset=x-user-defined'>");
+       }
+</script>
+</head>
+
+<body>
+<p>Test a particular example of broken markup that was making us ignore a charset declaration.</p>
+<script>
+  if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+  var charset = document.characterSet;
+  if (!charset)
+    charset = document.charset;
+  if (!charset)
+    charset = document.inputEncoding;
+  document.write("Encoding: " + charset + " (should be windows-1256)");
+</script>
+</body>
+</htm>
diff --git a/LayoutTests/fast/encoding/bandai-co-jp-releases-expected.txt b/LayoutTests/fast/encoding/bandai-co-jp-releases-expected.txt
new file mode 100644 (file)
index 0000000..f069c44
--- /dev/null
@@ -0,0 +1,3 @@
+Test a particular example of broken markup that was making us ignore a charset declaration.
+
+Encoding: Shift_JIS (should be Shift_JIS)
diff --git a/LayoutTests/fast/encoding/bandai-co-jp-releases.html b/LayoutTests/fast/encoding/bandai-co-jp-releases.html
new file mode 100644 (file)
index 0000000..c3c2fdd
--- /dev/null
@@ -0,0 +1,21 @@
+<div class="moz-text-flowed" style="font-family: -moz-fixed">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS">
+</head>
+
+<body>
+<p>Test a particular example of broken markup that was making us ignore a charset declaration.</p>
+<script>
+  if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+  var charset = document.characterSet;
+  if (!charset)
+    charset = document.charset;
+  if (!charset)
+    charset = document.inputEncoding;
+  document.write("Encoding: " + charset + " (should be Shift_JIS)");
+</script>
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/floraexpress-ru-expected.txt b/LayoutTests/fast/encoding/floraexpress-ru-expected.txt
new file mode 100644 (file)
index 0000000..5ba4587
--- /dev/null
@@ -0,0 +1,3 @@
+Test a particular example of broken markup that was making us ignore a charset declaration.
+
+Encoding: windows-1251 (should be windows-1251)
diff --git a/LayoutTests/fast/encoding/floraexpress-ru.html b/LayoutTests/fast/encoding/floraexpress-ru.html
new file mode 100644 (file)
index 0000000..8004ef7
--- /dev/null
@@ -0,0 +1,26 @@
+<input type=hidden name=test value=''><input type=hidden name=test1 value=''><input type=hidden name=test value=''><input type=hidden name=test1 value=''>
+<html>
+<head>
+<title>Floraexpress - çàêàç è äîñòàâêà öâåòîâ ïî Ìèðó, ìåæäóíàðîäíàÿ äîñòàâêà öâåòîâ, çàêàç è äîñòàâêà áóêåòîâ</title>
+<title>Çàêàç öâåòîâ è äîñòàâêà áóêåòîâ öâåòîâ ïî âñåìó ìèðó - Floraexpress - ìåæäóíàðîäíàÿ äîñòàâêà öâåòîâ</title>
+
+<meta http-equiv=Content-Type content="text/html; charset=windows-1251">
+</head>
+<body bgcolor=#F5F5EF text="#313131" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">
+</head>
+<body>
+
+<p>Test a particular example of broken markup that was making us ignore a charset declaration.</p>
+<script>
+  if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+  var charset = document.characterSet;
+  if (!charset)
+    charset = document.charset;
+  if (!charset)
+    charset = document.inputEncoding;
+  document.write("Encoding: " + charset + " (should be windows-1251)");
+</script>
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/hanarei-blog32-fc2-com-expected.txt b/LayoutTests/fast/encoding/hanarei-blog32-fc2-com-expected.txt
new file mode 100644 (file)
index 0000000..41080b5
--- /dev/null
@@ -0,0 +1,3 @@
+Test a particular example of broken markup that was making us ignore a charset declaration.
+
+Encoding: Extended_UNIX_Code_Packed_Format_for_Japanese (should be EUC-JP == Extended_UNIX_Code_Packed_Format_for_Japanese)
diff --git a/LayoutTests/fast/encoding/hanarei-blog32-fc2-com.html b/LayoutTests/fast/encoding/hanarei-blog32-fc2-com.html
new file mode 100644 (file)
index 0000000..fcae929
--- /dev/null
@@ -0,0 +1,42 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html lang="ja" dir="ltr" xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">
+<head>
+
+
+
+<HTML>
+<HEAD>
+<TITLE>̵ÎÁ¥ª¥ó¥é¥¤¥ó¥²¡¼¥à¤Ë»²²Ã¤·¤è¤¦¢ö</TITLE>
+</HEAD>
+
+<BODY>
+<script type="text/javascript"><!--
+var ID="100099131";
+var AD=1;
+var FRAME=0;
+// --></script>
+<script src="about:blank" type="text/javascript"></script>
+<noscript>
+<a href="http://w1.ax.xrea.com/c.f?id=100099131" target="_blank"><img src="about:blank" alt="AX" border="0"></a>
+</noscript>
+
+
+
+<meta http-equiv="Content-Type" content="text/html; charset=EUC-JP" />
+</head>
+<body>
+
+<p>Test a particular example of broken markup that was making us ignore a charset declaration.</p>
+<script>
+  if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+  var charset = document.characterSet;
+  if (!charset)
+    charset = document.charset;
+  if (!charset)
+    charset = document.inputEncoding;
+  document.write("Encoding: " + charset + " (should be EUC-JP == Extended_UNIX_Code_Packed_Format_for_Japanese)");
+</script>
+</body>
+</html>
diff --git a/LayoutTests/fast/encoding/yahoo-mail-expected.txt b/LayoutTests/fast/encoding/yahoo-mail-expected.txt
new file mode 100644 (file)
index 0000000..89ac9f6
--- /dev/null
@@ -0,0 +1,3 @@
+Test a particular example of broken markup that was making us ignore a charset declaration.
+
+Encoding: UTF-8 (should be UTF-8)
diff --git a/LayoutTests/fast/encoding/yahoo-mail.html b/LayoutTests/fast/encoding/yahoo-mail.html
new file mode 100644 (file)
index 0000000..82331c8
--- /dev/null
@@ -0,0 +1,26 @@
+<html>
+<head>
+               <title>
+Yahoo! Mail - someone@yahoo.com</title>
+</head>
+<body>
+
+<div id=yiv1034332614><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html
+><head><meta http-equiv="content-type"
+ content="text/html; charset=UTF-8"/></head>
+
+<p>Test a particular example of broken markup that was making us ignore a charset declaration.</p>
+<script>
+  if (window.layoutTestController)
+    layoutTestController.dumpAsText();
+
+  var charset = document.characterSet;
+  if (!charset)
+    charset = document.charset;
+  if (!charset)
+    charset = document.inputEncoding;
+  document.write("Encoding: " + charset + " (should be UTF-8)");
+</script>
+</body>
+</html>
index 424cf63..6d958a9 100644 (file)
@@ -1,3 +1,27 @@
+2007-12-27  Alexey Proskuryakov  <ap@webkit.org>
+
+        Reviewed by Maciej.
+
+        http://bugs.webkit.org/show_bug.cgi?id=14500
+        need to be more generous about charset declaration with meta tag
+        
+        http://bugs.webkit.org/show_bug.cgi?id=12526
+        <rdar://problem/4867183> Safari ignores encoding description "charset=Shift_JIS" in invalid html
+
+        <rdar://problem/4892428> Unlike other browsers, WebKit ignores <meta> charset definitions outside the head
+
+        <rdar://problem/5643774> REGRESSION: Text is garbled when clicking a link inside an Arabic website
+
+        Tests: fast/encoding/ahram-org-eg.html
+               fast/encoding/bandai-co-jp-releases.html
+               fast/encoding/floraexpress-ru.html
+               fast/encoding/hanarei-blog32-fc2-com.html
+               fast/encoding/yahoo-mail.html
+
+        * loader/TextResourceDecoder.cpp:
+        (WebCore::TextResourceDecoder::checkForHeadCharset): Don't stop looking for <meta> until we've
+        seen at least 512 bytes of input.
+
 2007-12-26  Jan Michael Alonzo  <jmalonzo@unpluggable.com>
 
         Reviewed by Alp Toker.
index d832ebd..ad99dd1 100644 (file)
@@ -529,7 +529,10 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool
     // Additionally, we ignore things that look like tags in <title>, <script> and <noscript>; see
     // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
     // and <http://bugs.webkit.org/show_bug.cgi?id=12389>.
-    
+
+    // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>,
+    // we don't bail out until we've checked at least 512 bytes of input.
+
     AtomicStringImpl* enclosingTagName = 0;
 
     while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
@@ -646,7 +649,7 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool
 
                     pos = endpos + 1;
                 }
-            } else if (tag != scriptTag && tag != noscriptTag && tag != styleTag &&
+            } else if (ptr - m_buffer.data() >= 512 && tag != scriptTag && tag != noscriptTag && tag != styleTag &&
                        tag != linkTag && tag != metaTag && tag != objectTag &&
                        tag != titleTag && tag != baseTag && 
                        (end || tag != htmlTag) && !enclosingTagName &&