Modernize some aspects of text codecs, eliminate WebKit use of strcasecmp
[WebKit-https.git] / Tools / TestWebKitAPI / Tests / WebCore / TextCodec.cpp
1 /*
2  * Copyright (C) 2017 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27
28 #include <WebCore/TextCodec.h>
29 #include <WebCore/TextEncoding.h>
30 #include <WebCore/TextEncodingRegistry.h>
31 #include <wtf/text/StringBuilder.h>
32
33 namespace TestWebKitAPI {
34
35 // Expects hex bytes with optional spaces between them.
36 // Returns an empty vector if it encounters non-hex-digit characters.
37 static Vector<char> decodeHexTestBytes(const char* input)
38 {
39     Vector<char> result;
40     for (size_t i = 0; input[i]; ) {
41         if (!isASCIIHexDigit(input[i]))
42             return { };
43         if (!isASCIIHexDigit(input[i + 1]))
44             return { };
45         result.append(toASCIIHexValue(input[i], input[i + 1]));
46         i += 2;
47         if (input[i] == ' ')
48             ++i;
49     }
50     return result;
51 }
52
53 // The word "escape" here just means a format easy to read in test results.
54 // It doesn't have to be a format suitable for other users or even one
55 // that is completely unambiguous.
56 static const char* escapeNonASCIIPrintableCharacters(StringView string)
57 {
58     static char resultBuffer[100];
59     size_t i = 0;
60     auto append = [&i] (char character) {
61         if (i < sizeof(resultBuffer))
62             resultBuffer[i++] = character;
63     };
64     auto appendNibble = [append] (char nibble) {
65         if (nibble)
66             append(lowerNibbleToASCIIHexDigit(nibble));
67     };
68     auto appendLastNibble = [append] (char nibble) {
69         append(lowerNibbleToASCIIHexDigit(nibble));
70     };
71     for (auto character : string.codePoints()) {
72         if (isASCIIPrintable(character))
73             append(character);
74         else {
75             append('{');
76             for (unsigned i = 32 - 4; i; i -= 4)
77                 appendNibble(character >> i);
78             appendLastNibble(character);
79             append('}');
80         }
81     }
82     if (i == sizeof(resultBuffer))
83         return "";
84     resultBuffer[i] = '\0';
85     return resultBuffer;
86 }
87
88 static const char* testDecode(const char* encodingName, std::initializer_list<const char*> inputs)
89 {
90     StringBuilder resultBuilder;
91     auto codec = newTextCodec(WebCore::TextEncoding { encodingName });
92     size_t size = inputs.size();
93     for (size_t i = 0; i < size; ++i) {
94         auto vector = decodeHexTestBytes(inputs.begin()[i]);
95         bool last = i == size - 1;
96         bool sawError = false;
97         resultBuilder.append(escapeNonASCIIPrintableCharacters(codec->decode(vector.data(), vector.size(), last, false, sawError)));
98         if (sawError)
99             resultBuilder.appendLiteral(" ERROR");
100     }
101     return escapeNonASCIIPrintableCharacters(resultBuilder.toString());
102 }
103
104 TEST(TextCodec, UTF8)
105 {
106     EXPECT_STREQ("", testDecode("UTF-8", { "" }));
107
108     EXPECT_STREQ("{0}", testDecode("UTF-8", { "00" }));
109
110     EXPECT_STREQ("a", testDecode("UTF-8", { "61" }));
111     EXPECT_STREQ("a", testDecode("UTF-8", { "", "61" }));
112     EXPECT_STREQ("a", testDecode("UTF-8", { "61", "" }));
113     EXPECT_STREQ("a", testDecode("UTF-8", { "", "61", "" }));
114
115     EXPECT_STREQ("{B6}", testDecode("UTF-8", { "C2 B6" }));
116     EXPECT_STREQ("{B6}", testDecode("UTF-8", { "C2", "B6" }));
117     EXPECT_STREQ("{B6}", testDecode("UTF-8", { "", "C2", "", "B6", "" }));
118     EXPECT_STREQ("x{B6}", testDecode("UTF-8", { "78 C2 B6" }));
119     EXPECT_STREQ("{B6}x", testDecode("UTF-8", { "C2 B6 78" }));
120
121     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2 98 83" }));
122     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2", "98", "83" }));
123     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "", "E2", "", "98", "83" }));
124     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2 98", "83" }));
125     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "", "E2 98", "", "83", "" }));
126     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2", "9883" }));
127     EXPECT_STREQ("{2603}", testDecode("UTF-8", { "", "E2", "", "9883", "" }));
128     EXPECT_STREQ("x{2603}", testDecode("UTF-8", { "78 E2 98 83" }));
129     EXPECT_STREQ("{2603}x", testDecode("UTF-8", { "E2 98 83 78" }));
130
131     EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "F0 9F 92 A9" }));
132     EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "F0", "9F", "92", "A9" }));
133     EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "", "F0", "", "9F", "", "92", "" , "A9", "" }));
134     EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "F09F92", "A9" }));
135     EXPECT_STREQ("x{1F4A9}", testDecode("UTF-8", { "78 F0 9F 92 A9" }));
136     EXPECT_STREQ("{1F4A9}x", testDecode("UTF-8", { "F0 9F 92 A9 78" }));
137 }
138
139 TEST(TextCodec, UTF8InvalidSequences)
140 {
141     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "E0 A5 3F" }));
142     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "E0 A5", "3F" }));
143     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "E0", "A5 3F" }));
144     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "E0", "A5", "3F" }));
145     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "E0", "", "A5", "", "3F" }));
146     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "E0", "", "A5", "", "3F", "" }));
147     EXPECT_STREQ("{FFFD}? ERROR", testDecode("UTF-8", { "", "E0", "", "A5", "", "3F", "" }));
148
149     EXPECT_STREQ("a{FFFD}? ERROR", testDecode("UTF-8", { "61 E0 A5 3F" }));
150     EXPECT_STREQ("a{FFFD}? ERROR", testDecode("UTF-8", { "61 E0 A5", "3F" }));
151     EXPECT_STREQ("a{FFFD}? ERROR", testDecode("UTF-8", { "61 E0", "A5 3F" }));
152     EXPECT_STREQ("a{FFFD}? ERROR", testDecode("UTF-8", { "61 E0", "A5", "3F" }));
153
154     EXPECT_STREQ("{B6}{FFFD}? ERROR", testDecode("UTF-8", { "C2 B6 E0 A5 3F" }));
155     EXPECT_STREQ("{B6}{FFFD}? ERROR", testDecode("UTF-8", { "C2 B6 E0 A5", "3F" }));
156     EXPECT_STREQ("{B6}{FFFD}? ERROR", testDecode("UTF-8", { "C2 B6 E0", "A5 3F" }));
157     EXPECT_STREQ("{B6}{FFFD}? ERROR", testDecode("UTF-8", { "C2 B6 E0", "A5", "3F" }));
158
159     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "C2" }));
160     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "E2" }));
161     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "E2 98" }));
162     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0" }));
163     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0 9F" }));
164     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0 9F 92" }));
165
166     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "E2", "98" }));
167     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0", "9F" }));
168     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0 9F", "92" }));
169     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0", "9F92" }));
170     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "F0", "9F", "92" }));
171
172     EXPECT_STREQ("{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "C0 80" }));
173     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "E0 80 80" }));
174     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 80 80 80" }));
175     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 80 80 80" }));
176     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 80 80 80" }));
177
178     EXPECT_STREQ("{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "C1 BF" }));
179     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "E0 81 BF" }));
180     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 80 81 BF" }));
181     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 80 81 BF" }));
182     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 80 81 BF" }));
183
184     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "E0 82 80" }));
185     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 80 82 80" }));
186     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 80 82 80" }));
187     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 80 82 80" }));
188
189     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "E0 9F BF" }));
190     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 80 9F BF" }));
191     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 80 9F BF" }));
192     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 80 9F BF" }));
193
194     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 80 A0 80" }));
195     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 80 A0 80" }));
196     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 80 A0 80" }));
197
198     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 8F BF BF" }));
199     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 8F BF BF" }));
200     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 8F BF BF" }));
201
202     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 80 90 80 80" }));
203     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 80 90 80 80" }));
204
205     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 84 8F BF BF" }));
206     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 84 8F BF BF" }));
207
208     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F4 90 80 80" }));
209     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FB BF BF BF BF" }));
210     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FD BF BF BF BF BF" }));
211     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "ED A0 80" }));
212     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "ED BF BF" }));
213     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "ED A0 BD ED B2 A9" }));
214
215     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F8 84 90 80 80" }));
216     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FC 80 84 90 80 80" }));
217     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 8D A0 80" }));
218     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 8D BF BF" }));
219     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "F0 8D A0 BD F0 8D B2 A9" }));
220
221     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "80" }));
222     EXPECT_STREQ("{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "80 80" }));
223     EXPECT_STREQ("{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "80 80 80" }));
224     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "80 80 80 80" }));
225     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "80 80 80 80 80" }));
226     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "80 80 80 80 80 80" }));
227     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "80 80 80 80 80 80 80" }));
228     EXPECT_STREQ("{B6}{FFFD} ERROR", testDecode("UTF-8", { "C2 B6 80" }));
229     EXPECT_STREQ("{2603}{FFFD} ERROR", testDecode("UTF-8", { "E2 98 83 80" }));
230     EXPECT_STREQ("{1F4A9}{FFFD} ERROR", testDecode("UTF-8", { "F0 9F 92 A9 80" }));
231     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FB BF BF BF BF 80" }));
232     EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FD BF BF BF BF BF 80" }));
233
234     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "FE" }));
235     EXPECT_STREQ("{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FE 80" }));
236     EXPECT_STREQ("{FFFD} ERROR", testDecode("UTF-8", { "FF" }));
237     EXPECT_STREQ("{FFFD}{FFFD} ERROR", testDecode("UTF-8", { "FF 80" }));
238 }
239
240 }