2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013, 2015-2016 Apple Inc. All rights reserved.
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #include "DecodeEscapeSequences.h"
31 #include "MIMETypeRegistry.h"
32 #include "TextEncoding.h"
33 #include "URLParser.h"
36 #include <unicode/uidna.h>
37 #include <wtf/HashMap.h>
38 #include <wtf/HexNumber.h>
39 #include <wtf/NeverDestroyed.h>
40 #include <wtf/StdLibExtras.h>
41 #include <wtf/text/CString.h>
42 #include <wtf/text/StringBuilder.h>
43 #include <wtf/text/StringHash.h>
45 // FIXME: This file makes too much use of the + operator on String.
46 // We either have to optimize that operator so it doesn't involve
47 // so many allocations, or change this to use StringBuffer instead.
53 typedef Vector<char, 512> CharBuffer;
54 typedef Vector<UChar, 512> UCharBuffer;
56 static const unsigned invalidPortNumber = 0xFFFF;
58 static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
60 ASSERT(isASCIILower(lowercaseLetter));
61 return (character | 0x20) == lowercaseLetter;
// Lowercase scheme names and the port digits that accompany them, stored as raw
// character arrays WITHOUT a terminating NUL. Use the array length (sizeof),
// never strlen(), when comparing against them.

// WebSocket schemes.
static const char wsScheme[] = {'w', 's'};
static const char wssScheme[] = {'w', 's', 's'};

// Local files.
static const char fileScheme[] = {'f', 'i', 'l', 'e'};

// FTP.
static const char ftpScheme[] = {'f', 't', 'p'};
static const char ftpPort[] = {'2', '1'};

// HTTP and HTTPS.
static const char httpScheme[] = {'h', 't', 't', 'p'};
static const char httpPort[] = {'8', '0'};
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
static const char httpsPort[] = {'4', '4', '3'};

// Gopher.
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
static const char gopherPort[] = {'7', '0'};
76 static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
78 ASSERT(isASCIILower(lowercaseLetter));
79 return (character | 0x20) == lowercaseLetter;
82 enum URLCharacterClasses {
84 SchemeFirstChar = 1 << 0,
86 // ( alpha | digit | "+" | "-" | "." )
89 // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
90 // unreserved = alphanum | mark
91 // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
92 UserInfoChar = 1 << 2,
94 // alnum | "." | "-" | "%"
95 // The above is what the specification says, but we are lenient to
96 // match existing practice and also allow:
98 HostnameChar = 1 << 3,
100 // hexdigit | ":" | "%"
103 // "#" | "?" | "/" | nul
104 PathSegmentEndChar = 1 << 5,
106 // not allowed in path
109 // "\t" | "\n" | "\r"
113 static const unsigned char characterClassTable[256] = {
114 /* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
115 /* 2 stx */ BadChar, /* 3 etx */ BadChar,
116 /* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
117 /* 8 bs */ BadChar, /* 9 ht */ BadChar | TabNewline, /* 10 nl */ BadChar | TabNewline,
118 /* 11 vt */ BadChar, /* 12 np */ BadChar, /* 13 cr */ BadChar | TabNewline,
119 /* 14 so */ BadChar, /* 15 si */ BadChar,
120 /* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
121 /* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
122 /* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
123 /* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
124 /* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
125 /* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
126 /* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
127 /* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
128 /* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
129 /* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
130 /* 44 , */ UserInfoChar,
131 /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
132 /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
133 /* 47 / */ PathSegmentEndChar,
134 /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
135 /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
136 /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
137 /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
138 /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
139 /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
140 /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
141 /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
142 /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
143 /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
144 /* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
145 /* 60 < */ BadChar, /* 61 = */ UserInfoChar,
146 /* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
148 /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
149 /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
150 /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
151 /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
152 /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
153 /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
154 /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
155 /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
156 /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
157 /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
158 /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
159 /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
160 /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
161 /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
162 /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
163 /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
164 /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
165 /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
166 /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
167 /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
168 /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
169 /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
170 /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
171 /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
172 /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
173 /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
175 /* 92 \ */ 0, /* 93 ] */ 0,
177 /* 95 _ */ UserInfoChar | HostnameChar,
179 /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
180 /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
181 /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
182 /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
183 /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
184 /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
185 /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
186 /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
187 /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
188 /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
189 /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
190 /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
191 /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
192 /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
193 /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
194 /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
195 /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
196 /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
197 /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
198 /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
199 /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
200 /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
201 /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
202 /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
203 /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
204 /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
206 /* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
207 /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
208 /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
209 /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
210 /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
211 /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
212 /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
213 /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
214 /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
215 /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
216 /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
217 /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
218 /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
219 /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
220 /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
221 /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
222 /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
223 /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
224 /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
225 /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
226 /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
227 /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
228 /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
229 /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
230 /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
231 /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
232 /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
233 /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
234 /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
235 /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
236 /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
237 /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
238 /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
// Values stored in percentEncodeClassTable and accepted by
// encodeWithURLEscapeSequences().
enum PercentEncodeCharacterClass {
    // Class names match the URL Standard; each class is a superset of the previous one.
    // Numerically, every value drops the top set bit of the one before it.
    PercentEncodeSimple = 0xFF,
    PercentEncodeDefault = 0x7F,
    PercentEncodePassword = 0x3F,
    PercentEncodeUsername = 0x1F,
};
249 static const unsigned char percentEncodeClassTable[256] = {
250 /* 0 nul */ PercentEncodeSimple, /* 1 soh */ PercentEncodeSimple, /* 2 stx */ PercentEncodeSimple, /* 3 etx */ PercentEncodeSimple,
251 /* 4 eot */ PercentEncodeSimple, /* 5 enq */ PercentEncodeSimple, /* 6 ack */ PercentEncodeSimple, /* 7 bel */ PercentEncodeSimple,
252 /* 8 bs */ PercentEncodeSimple, /* 9 ht */ PercentEncodeSimple, /* 10 nl */ PercentEncodeSimple, /* 11 vt */ PercentEncodeSimple,
253 /* 12 np */ PercentEncodeSimple, /* 13 cr */ PercentEncodeSimple, /* 14 so */ PercentEncodeSimple, /* 15 si */ PercentEncodeSimple,
254 /* 16 dle */ PercentEncodeSimple, /* 17 dc1 */ PercentEncodeSimple, /* 18 dc2 */ PercentEncodeSimple, /* 19 dc3 */ PercentEncodeSimple,
255 /* 20 dc4 */ PercentEncodeSimple, /* 21 nak */ PercentEncodeSimple, /* 22 syn */ PercentEncodeSimple, /* 23 etb */ PercentEncodeSimple,
256 /* 24 can */ PercentEncodeSimple, /* 25 em */ PercentEncodeSimple, /* 26 sub */ PercentEncodeSimple, /* 27 esc */ PercentEncodeSimple,
257 /* 28 fs */ PercentEncodeSimple, /* 29 gs */ PercentEncodeSimple, /* 30 rs */ PercentEncodeSimple, /* 31 us */ PercentEncodeSimple,
258 /* 32 sp */ PercentEncodeDefault,
260 /* 34 " */ PercentEncodeDefault,
261 /* 35 # */ PercentEncodeDefault,
273 /* 47 / */ PercentEncodePassword,
274 /* 48 0 */ 0, /* 49 1 */ 0, /* 50 2 */ 0, /* 51 3 */ 0,
275 /* 52 4 */ 0, /* 53 5 */ 0, /* 54 6 */ 0, /* 55 7 */ 0,
276 /* 56 8 */ 0, /* 57 9 */ 0,
277 /* 58 : */ PercentEncodeUsername,
279 /* 60 < */ PercentEncodeDefault,
281 /* 62 > */ PercentEncodeDefault,
282 /* 63 ? */ PercentEncodeDefault,
283 /* 64 @ */ PercentEncodePassword,
284 /* 65 A */ 0, /* 66 B */ 0, /* 67 C */ 0, /* 68 D */ 0,
285 /* 69 E */ 0, /* 70 F */ 0, /* 71 G */ 0, /* 72 H */ 0,
286 /* 73 I */ 0, /* 74 J */ 0, /* 75 K */ 0, /* 76 L */ 0,
287 /* 77 M */ 0, /* 78 N */ 0, /* 79 O */ 0, /* 80 P */ 0,
288 /* 81 Q */ 0, /* 82 R */ 0, /* 83 S */ 0, /* 84 T */ 0,
289 /* 85 U */ 0, /* 86 V */ 0, /* 87 W */ 0, /* 88 X */ 0,
290 /* 89 Y */ 0, /* 90 Z */ 0,
292 /* 92 \ */ PercentEncodePassword,
296 /* 96 ` */ PercentEncodeDefault,
297 /* 97 a */ 0, /* 98 b */ 0, /* 99 c */ 0, /* 100 d */ 0,
298 /* 101 e */ 0, /* 102 f */ 0, /* 103 g */ 0, /* 104 h */ 0,
299 /* 105 i */ 0, /* 106 j */ 0, /* 107 k */ 0, /* 108 l */ 0,
300 /* 109 m */ 0, /* 110 n */ 0, /* 111 o */ 0, /* 112 p */ 0,
301 /* 113 q */ 0, /* 114 r */ 0, /* 115 s */ 0, /* 116 t */ 0,
302 /* 117 u */ 0, /* 118 v */ 0, /* 119 w */ 0, /* 120 x */ 0,
303 /* 121 y */ 0, /* 122 z */ 0,
308 /* 127 del */ PercentEncodeSimple,
309 /* 128 */ PercentEncodeSimple, /* 129 */ PercentEncodeSimple, /* 130 */ PercentEncodeSimple, /* 131 */ PercentEncodeSimple,
310 /* 132 */ PercentEncodeSimple, /* 133 */ PercentEncodeSimple, /* 134 */ PercentEncodeSimple, /* 135 */ PercentEncodeSimple,
311 /* 136 */ PercentEncodeSimple, /* 137 */ PercentEncodeSimple, /* 138 */ PercentEncodeSimple, /* 139 */ PercentEncodeSimple,
312 /* 140 */ PercentEncodeSimple, /* 141 */ PercentEncodeSimple, /* 142 */ PercentEncodeSimple, /* 143 */ PercentEncodeSimple,
313 /* 144 */ PercentEncodeSimple, /* 145 */ PercentEncodeSimple, /* 146 */ PercentEncodeSimple, /* 147 */ PercentEncodeSimple,
314 /* 148 */ PercentEncodeSimple, /* 149 */ PercentEncodeSimple, /* 150 */ PercentEncodeSimple, /* 151 */ PercentEncodeSimple,
315 /* 152 */ PercentEncodeSimple, /* 153 */ PercentEncodeSimple, /* 154 */ PercentEncodeSimple, /* 155 */ PercentEncodeSimple,
316 /* 156 */ PercentEncodeSimple, /* 157 */ PercentEncodeSimple, /* 158 */ PercentEncodeSimple, /* 159 */ PercentEncodeSimple,
317 /* 160 */ PercentEncodeSimple, /* 161 */ PercentEncodeSimple, /* 162 */ PercentEncodeSimple, /* 163 */ PercentEncodeSimple,
318 /* 164 */ PercentEncodeSimple, /* 165 */ PercentEncodeSimple, /* 166 */ PercentEncodeSimple, /* 167 */ PercentEncodeSimple,
319 /* 168 */ PercentEncodeSimple, /* 169 */ PercentEncodeSimple, /* 170 */ PercentEncodeSimple, /* 171 */ PercentEncodeSimple,
320 /* 172 */ PercentEncodeSimple, /* 173 */ PercentEncodeSimple, /* 174 */ PercentEncodeSimple, /* 175 */ PercentEncodeSimple,
321 /* 176 */ PercentEncodeSimple, /* 177 */ PercentEncodeSimple, /* 178 */ PercentEncodeSimple, /* 179 */ PercentEncodeSimple,
322 /* 180 */ PercentEncodeSimple, /* 181 */ PercentEncodeSimple, /* 182 */ PercentEncodeSimple, /* 183 */ PercentEncodeSimple,
323 /* 184 */ PercentEncodeSimple, /* 185 */ PercentEncodeSimple, /* 186 */ PercentEncodeSimple, /* 187 */ PercentEncodeSimple,
324 /* 188 */ PercentEncodeSimple, /* 189 */ PercentEncodeSimple, /* 190 */ PercentEncodeSimple, /* 191 */ PercentEncodeSimple,
325 /* 192 */ PercentEncodeSimple, /* 193 */ PercentEncodeSimple, /* 194 */ PercentEncodeSimple, /* 195 */ PercentEncodeSimple,
326 /* 196 */ PercentEncodeSimple, /* 197 */ PercentEncodeSimple, /* 198 */ PercentEncodeSimple, /* 199 */ PercentEncodeSimple,
327 /* 200 */ PercentEncodeSimple, /* 201 */ PercentEncodeSimple, /* 202 */ PercentEncodeSimple, /* 203 */ PercentEncodeSimple,
328 /* 204 */ PercentEncodeSimple, /* 205 */ PercentEncodeSimple, /* 206 */ PercentEncodeSimple, /* 207 */ PercentEncodeSimple,
329 /* 208 */ PercentEncodeSimple, /* 209 */ PercentEncodeSimple, /* 210 */ PercentEncodeSimple, /* 211 */ PercentEncodeSimple,
330 /* 212 */ PercentEncodeSimple, /* 213 */ PercentEncodeSimple, /* 214 */ PercentEncodeSimple, /* 215 */ PercentEncodeSimple,
331 /* 216 */ PercentEncodeSimple, /* 217 */ PercentEncodeSimple, /* 218 */ PercentEncodeSimple, /* 219 */ PercentEncodeSimple,
332 /* 220 */ PercentEncodeSimple, /* 221 */ PercentEncodeSimple, /* 222 */ PercentEncodeSimple, /* 223 */ PercentEncodeSimple,
333 /* 224 */ PercentEncodeSimple, /* 225 */ PercentEncodeSimple, /* 226 */ PercentEncodeSimple, /* 227 */ PercentEncodeSimple,
334 /* 228 */ PercentEncodeSimple, /* 229 */ PercentEncodeSimple, /* 230 */ PercentEncodeSimple, /* 231 */ PercentEncodeSimple,
335 /* 232 */ PercentEncodeSimple, /* 233 */ PercentEncodeSimple, /* 234 */ PercentEncodeSimple, /* 235 */ PercentEncodeSimple,
336 /* 236 */ PercentEncodeSimple, /* 237 */ PercentEncodeSimple, /* 238 */ PercentEncodeSimple, /* 239 */ PercentEncodeSimple,
337 /* 240 */ PercentEncodeSimple, /* 241 */ PercentEncodeSimple, /* 242 */ PercentEncodeSimple, /* 243 */ PercentEncodeSimple,
338 /* 244 */ PercentEncodeSimple, /* 245 */ PercentEncodeSimple, /* 246 */ PercentEncodeSimple, /* 247 */ PercentEncodeSimple,
339 /* 248 */ PercentEncodeSimple, /* 249 */ PercentEncodeSimple, /* 250 */ PercentEncodeSimple, /* 251 */ PercentEncodeSimple,
340 /* 252 */ PercentEncodeSimple, /* 253 */ PercentEncodeSimple, /* 254 */ PercentEncodeSimple, /* 255 */ PercentEncodeSimple
343 static unsigned copyPathRemovingDots(char* dst, const char* src, unsigned srcStart, unsigned srcEnd);
344 static bool encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
345 static String substituteBackslashes(const String&);
347 static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
348 static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
349 static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
350 static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
351 static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
352 static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
353 static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
354 static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
355 static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
356 static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
357 static inline bool isTabNewline(UChar c) { return c <= 0xff && (characterClassTable[c] & TabNewline); }
359 static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
361 ASSERT(isSchemeChar(character));
362 ASSERT(schemeCharacter & 0x20);
363 ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
364 return (character | 0x20) == schemeCharacter;
367 String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode);
369 // Copies the source to the destination, assuming all the source characters are
370 // ASCII. The destination buffer must be large enough. Null characters are allowed
371 // in the source string, and no attempt is made to null-terminate the result.
372 static void copyASCII(const String& string, char* dest)
374 if (string.isEmpty())
378 memcpy(dest, string.characters8(), string.length());
380 const UChar* src = string.characters16();
381 size_t length = string.length();
382 for (size_t i = 0; i < length; i++)
383 dest[i] = static_cast<char>(src[i]);
387 static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
389 buffer.resize(base.length() + len + 1);
390 copyASCII(base, buffer.data());
391 memcpy(buffer.data() + base.length(), rel, len);
392 buffer[buffer.size() - 1] = '\0';
395 // FIXME: Move to WTFString.h eventually.
396 // Returns the index of the first index in string |s| of any of the characters
397 // in |toFind|. |toFind| should be a null-terminated string, all characters up
398 // to the null will be searched. Returns int if not found.
399 const unsigned notFoundUnsigned = std::numeric_limits<unsigned>::max();
400 static unsigned findFirstOf(StringView string, unsigned startPosition, const char* target)
402 unsigned length = string.length();
403 for (unsigned i = startPosition; i < length; ++i) {
404 for (unsigned j = 0; target[j]; ++j) {
405 if (string[i] == target[j])
409 return notFoundUnsigned;
412 static inline void checkEncodedString(const String& url)
414 ASSERT_UNUSED(url, url.containsOnlyASCII());
415 ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
418 inline bool URL::protocolIs(const String& string, const char* protocol)
420 return WebCore::protocolIs(string, protocol);
423 void URL::invalidate()
426 m_protocolIsInHTTPFamily = false;
427 m_cannotBeABaseURL = false;
435 m_pathAfterLastSlash = 0;
440 URL::URL(ParsedURLStringTag, const String& url)
442 if (URLParser::enabled()) {
443 URLParser parser(url);
444 *this = parser.result();
448 // FIXME(148598): Work around Windows local file handling bug in CFNetwork
449 ASSERT(isLocalFile() || url == m_string);
451 ASSERT(url == m_string);
455 URL::URL(const URL& base, const String& relative)
457 if (URLParser::enabled()) {
458 URLParser parser(relative, base);
459 *this = parser.result();
461 init(base, relative, UTF8Encoding());
464 URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
466 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
467 // we do when submitting a form. A form with GET method
468 // has its contents added to a URL as query params and it makes sense
470 if (URLParser::enabled()) {
471 URLParser parser(relative, base, encoding.encodingForFormSubmission());
472 *this = parser.result();
474 init(base, relative, encoding.encodingForFormSubmission());
478 static bool shouldTrimFromURL(UChar c)
480 // Browsers ignore leading/trailing whitespace and control
481 // characters from URLs. Note that c is an *unsigned* char here
482 // so this comparison should only catch control characters.
486 void URL::init(const URL& base, const String& relative, const TextEncoding& encoding)
488 if (URLParser::enabled())
489 ASSERT_NOT_REACHED();
491 // Allow resolutions with a null or empty base URL, but not with any other invalid one.
492 // FIXME: Is this a good rule?
493 if (!base.m_isValid && !base.isEmpty()) {
499 // Get rid of leading and trailing whitespace and control characters.
500 String rel = relative.stripWhiteSpace(shouldTrimFromURL);
502 // Get rid of any tabs and newlines.
503 rel = rel.removeCharacters(isTabNewline);
505 // For compatibility with Win IE, treat backslashes as if they were slashes,
506 // as long as we're not dealing with javascript: or data: URLs.
507 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
508 rel = substituteBackslashes(rel);
510 bool allASCII = rel.containsOnlyASCII();
511 CharBuffer strBuffer;
516 strBuffer.resize(len + 1);
517 copyASCII(rel, strBuffer.data());
519 str = strBuffer.data();
521 if (!encodeRelativeString(rel, encoding, strBuffer)) {
522 m_string = blankURL();
527 str = strBuffer.data();
531 // According to the RFC, the reference should be interpreted as an
532 // absolute URI if possible, using the "leftmost, longest"
533 // algorithm. If the URI reference is absolute it will have a
534 // scheme, meaning that it will have a colon before the first
535 // non-scheme element.
536 bool absolute = false;
538 if (isSchemeFirstChar(*p)) {
540 while (isSchemeChar(*p)) {
544 if (p[1] != '/' && equalIgnoringASCIICase(base.protocol(), StringView(reinterpret_cast<LChar*>(str), p - str)) && base.isHierarchical())
551 CharBuffer parseBuffer;
554 parse(str, &relative);
556 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
557 // unless the relative URL is a single fragment.
558 if (!base.isHierarchical()) {
560 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
561 parse(parseBuffer.data(), &relative);
571 // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
573 removeFragmentIdentifier();
576 // must be fragment-only reference
577 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
578 parse(parseBuffer.data(), &relative);
582 // query-only reference, special case needed for non-URL results
583 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
584 parse(parseBuffer.data(), &relative);
588 // must be net-path or absolute-path reference
591 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
592 parse(parseBuffer.data(), &relative);
595 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
596 parse(parseBuffer.data(), &relative);
601 // must be relative-path reference
603 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
604 const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
605 parseBuffer.resize(bufferSize);
607 char* bufferPos = parseBuffer.data();
608 char* bufferStart = bufferPos;
610 // first copy everything before the path from the base
611 CharBuffer baseStringBuffer(base.m_string.length());
612 copyASCII(base.m_string, baseStringBuffer.data());
613 const char* baseString = baseStringBuffer.data();
614 const char* baseStringStart = baseString;
615 const char* pathStart = baseStringStart + base.m_portEnd;
616 while (baseStringStart < pathStart)
617 *bufferPos++ = *baseStringStart++;
618 char* bufferPathStart = bufferPos;
620 // now copy the base path
621 const char* baseStringEnd = baseString + base.m_pathEnd;
623 // go back to the last slash
624 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
627 if (baseStringEnd == baseStringStart) {
628 // no path in base, add a path separator if necessary
629 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
632 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
635 const char* relStringStart = str;
636 const char* relStringPos = relStringStart;
638 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
639 if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
640 if (isPathSegmentEndChar(relStringPos[1])) {
641 // skip over "." segment
643 if (relStringPos[0] == '/')
646 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
647 // skip over ".." segment and rewind the last segment
648 // the RFC leaves it up to the app to decide what to do with excess
649 // ".." segments - we choose to drop them since some web content
652 if (relStringPos[0] == '/')
654 if (bufferPos > bufferPathStart + 1)
656 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/')
662 *bufferPos = *relStringPos;
667 // all done with the path work, now copy any remainder
668 // of the relative reference; this will also add a null terminator
669 strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
671 parse(parseBuffer.data(), &relative);
673 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
680 URL URL::isolatedCopy() const
683 result.m_string = result.m_string.isolatedCopy();
687 String URL::lastPathComponent() const
692 unsigned end = m_pathEnd - 1;
693 if (m_string[end] == '/')
696 size_t start = m_string.reverseFind('/', end);
697 if (start < static_cast<unsigned>(m_portEnd))
701 return m_string.substring(start, end - start + 1);
704 StringView URL::protocol() const
706 return StringView(m_string).substring(0, m_schemeEnd);
709 String URL::host() const
711 unsigned start = hostStart();
712 return m_string.substring(start, m_hostEnd - start);
715 Optional<uint16_t> URL::port() const
717 if (!m_portEnd || m_hostEnd >= m_portEnd - 1)
722 if (m_string.is8Bit())
723 number = charactersToUIntStrict(m_string.characters8() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
725 number = charactersToUIntStrict(m_string.characters16() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
726 if (!ok || number > std::numeric_limits<uint16_t>::max())
731 String URL::user() const
733 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
736 String URL::pass() const
738 if (m_passwordEnd == m_userEnd)
741 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
744 String URL::encodedUser() const
746 return m_string.substring(m_userStart, m_userEnd - m_userStart);
749 String URL::encodedPass() const
751 if (m_passwordEnd == m_userEnd)
754 return m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1);
757 String URL::fragmentIdentifier() const
759 if (m_fragmentEnd == m_queryEnd)
762 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
765 bool URL::hasFragmentIdentifier() const
767 return m_fragmentEnd != m_queryEnd;
770 String URL::baseAsString() const
772 return m_string.left(m_pathAfterLastSlash);
776 String URL::fileSystemPath() const
778 if (!isValid() || !isLocalFile())
781 return decodeURLEscapeSequences(path());
// Debug-time sanity check applied to protocol strings before comparison.
// NOTE(review): two definitions share this name; presumably they sit in
// opposite branches of a preprocessor conditional whose lines (listing
// 785-792) are not visible here. Confirm against the full file.
// Variant that ignores its argument.
787 static inline void assertProtocolIsGood(StringView)
// Variant that inspects every character of `protocol`.
793 static void assertProtocolIsGood(StringView protocol)
795 for (size_t i = 0; i < protocol.length(); ++i) {
796 const char c = protocol[i];
// Each character must lie strictly between ' ' and 0x7F and must not be an
// uppercase ASCII letter.
797 ASSERT(c > ' ' && c < 0x7F && !(c >= 'A' && c <= 'Z'));
// Compares this URL's scheme (the first m_schemeEnd characters of m_string)
// with the NUL-terminated `protocol`, using
// isSchemeCharacterMatchIgnoringCase for each character.
// NOTE(review): listing lines 810-813 and 817-818 are not visible, so any
// guard before the loop and the statement taken on a mismatch are not shown.
803 bool URL::protocolIs(const char* protocol) const
805 assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));
807 // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
808 // The free function protocolIsJavaScript() should be used instead.
809 ASSERT(!equalLettersIgnoringASCIICase(StringView(protocol), "javascript"));
814 // Do the comparison without making a new string object.
815 for (unsigned i = 0; i < m_schemeEnd; ++i) {
// A mismatch is either the argument ending early or a differing character.
816 if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
819 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
// StringView overload: compares this URL's scheme with `protocol` character
// by character using isSchemeCharacterMatchIgnoringCase.
// NOTE(review): listing lines 825-828, 830-831 and 835-838 are not visible,
// so the return statements (and any guard after the assertion) are not shown.
822 bool URL::protocolIs(StringView protocol) const
824 assertProtocolIsGood(protocol);
// A scheme of a different length cannot match.
829 if (m_schemeEnd != protocol.length())
832 // Do the comparison without making a new string object.
833 for (unsigned i = 0; i < m_schemeEnd; ++i) {
834 if (!isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
// Returns the query text: the characters after the one at m_pathEnd, up to
// m_queryEnd.
// NOTE(review): the listing jumps from 842 to 845, so the statement taken
// when there is no query is not visible here.
840 String URL::query() const
// Equal offsets mean the query range is empty.
842 if (m_queryEnd == m_pathEnd)
845 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
848 String URL::path() const
850 return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
// Replaces the scheme with the part of `s` that precedes its first ':' and
// re-parses the rebuilt string, through URLParser when it is enabled and
// through parse() otherwise.
// NOTE(review): listing lines 860-862, 866, 868-870, 874 and 876-878 are not
// visible. That hides the condition choosing between the two rebuild forms
// below, the else keywords before the parse() calls, and every return.
853 bool URL::setProtocol(const String& s)
855 // Firefox and IE remove everything after the first ':'.
856 size_t separatorPosition = s.find(':');
857 String newProtocol = s.substring(0, separatorPosition);
// Only syntactically valid protocols are accepted.
859 if (!isValidProtocol(newProtocol))
// First rebuild form: "<newProtocol>:" is prepended to the whole current string.
863 if (URLParser::enabled()) {
864 URLParser parser(makeString(newProtocol, ":", m_string));
865 *this = parser.result();
867 parse(newProtocol + ':' + m_string);
// Second rebuild form: the new protocol replaces the first m_schemeEnd
// characters and everything from m_schemeEnd onward is kept.
871 if (URLParser::enabled()) {
872 URLParser parser(makeString(newProtocol, m_string.substring(m_schemeEnd)));
873 *this = parser.result();
875 parse(newProtocol + m_string.substring(m_schemeEnd));
// Returns whether every character of `string` is ASCII, checking either the
// 8-bit or the 16-bit character buffer.
// NOTE(review): the condition selecting between the two returns (listing
// line 882) is not visible; presumably a character-width test.
880 static bool containsOnlyASCII(StringView string)
883 return charactersAreAllASCII(string.characters8(), string.length());
884 return charactersAreAllASCII(string.characters16(), string.length());
887 // Appends the punycoded hostname identified by the given string and length to
888 // the output buffer. The result will not be null terminated.
889 // Return value of false means error in encoding.
// NOTE(review): listing lines 898-900, 907, 912-913 and 916-919 are not
// visible, so the return statements and the directives closing the two
// compiler conditionals are not shown here.
890 static bool appendEncodedHostname(UCharBuffer& buffer, StringView string)
892 // Needs to be big enough to hold an IDN-encoded name.
893 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
894 const unsigned hostnameBufferLength = 2048;
// Oversized or pure-ASCII host names are appended unchanged.
896 if (string.length() > hostnameBufferLength || containsOnlyASCII(string)) {
897 append(buffer, string);
// Fixed-size scratch buffer that receives the converted name.
901 UChar hostnameBuffer[hostnameBufferLength];
902 UErrorCode error = U_ZERO_ERROR;
// The ICU call below is wrapped in a pragma that silences
// deprecated-declaration warnings on GCC/Clang.
904 #if COMPILER(GCC_OR_CLANG)
905 #pragma GCC diagnostic push
906 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
908 int32_t numCharactersConverted = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer,
909 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
910 #if COMPILER(GCC_OR_CLANG)
911 #pragma GCC diagnostic pop
// The converted characters are appended only when ICU reports exactly
// U_ZERO_ERROR (warning codes are not treated as success here).
914 if (error == U_ZERO_ERROR) {
915 buffer.append(hostnameBuffer, numCharactersConverted);
// Replaces the host component with the encoded form of `s` (see
// appendEncodedHostname) and re-parses the rebuilt string.
// NOTE(review): listing lines 922-925, 928-929, 932-933 and 946 are not
// visible, so the statements guarded by the checks below and the else keyword
// before the parse() call are not shown here.
921 void URL::setHost(const String& s)
// A ':' in the requested host is special-cased (guarded statement not visible).
926 auto colonIndex = s.find(':');
927 if (colonIndex != notFound)
930 UCharBuffer encodedHostName;
931 if (!appendEncodedHostname(encodedHostName, s))
// "//" has to be inserted when the user-info offset sits immediately after
// the scheme's ':'.
934 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
// Rebuild: prefix up to hostStart(), optional "//", new host, then everything
// from the old m_hostEnd onward.
936 StringBuilder builder;
937 builder.append(m_string.left(hostStart()));
938 if (slashSlashNeeded)
939 builder.appendLiteral("//");
940 builder.append(StringView(encodedHostName.data(), encodedHostName.size()));
941 builder.append(m_string.substring(m_hostEnd));
943 if (URLParser::enabled()) {
944 URLParser parser(builder.toString());
945 *this = parser.result();
947 parse(builder.toString());
// Drops the port by joining the text before m_hostEnd with the text from
// m_portEnd onward, then re-parses the result.
// NOTE(review): listing lines 953 and 957 are not visible, so the statement
// guarded by the first check and the else keyword before parse() are not shown.
950 void URL::removePort()
// Equal offsets mean there is no port text to remove.
952 if (m_hostEnd == m_portEnd)
954 if (URLParser::enabled()) {
955 URLParser parser(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
956 *this = parser.result();
958 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
// Sets the port to `i` by splicing its decimal text into m_string and
// re-parsing the result.
// NOTE(review): listing lines 962-965 and 972 are not visible, so any guard at
// the top and the else keyword before parse() are not shown here.
961 void URL::setPort(unsigned short i)
// With no existing port text a ':' separator must be added; otherwise the
// existing separator at m_hostEnd is kept and only the digits are replaced.
966 bool colonNeeded = m_portEnd == m_hostEnd;
967 unsigned portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
969 if (URLParser::enabled()) {
970 URLParser parser(makeString(m_string.left(portStart), (colonNeeded ? ":" : ""), String::number(i), m_string.substring(m_portEnd)));
971 *this = parser.result();
973 parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
// Replaces host and port together from a "host[:port]" string and re-parses
// the rebuilt URL.
// NOTE(review): many listing lines are not visible (977-980, 982-983, 987,
// 990, 992-993, 995-996, 999-1000, 1011, 1013, 1017). That hides the
// declarations of `port` and `ok`, the statements guarded by the checks below
// and the else keyword before parse().
976 void URL::setHostAndPort(const String& hostAndPort)
981 StringView hostName(hostAndPort);
// Split at the first ':'; the text after it is the port candidate.
984 auto colonIndex = hostName.find(':');
985 if (colonIndex != notFound) {
986 port = hostName.substring(colonIndex + 1);
// The port text must parse strictly as a non-negative integer.
988 int portInt = port.toIntStrict(ok);
989 if (!ok || portInt < 0)
991 hostName = hostName.substring(0, colonIndex);
994 if (hostName.isEmpty())
997 UCharBuffer encodedHostName;
998 if (!appendEncodedHostname(encodedHostName, hostName))
// "//" has to be inserted when the user-info offset sits immediately after
// the scheme's ':'.
1001 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
// Rebuild: prefix up to hostStart(), optional "//", new host, optional
// ":port", then everything from the old m_portEnd onward.
1003 StringBuilder builder;
1004 builder.append(m_string.left(hostStart()));
1005 if (slashSlashNeeded)
1006 builder.appendLiteral("//");
1007 builder.append(StringView(encodedHostName.data(), encodedHostName.size()));
1008 if (!port.isEmpty()) {
1009 builder.appendLiteral(":");
1010 builder.append(port);
1012 builder.append(m_string.substring(m_portEnd));
1014 if (URLParser::enabled()) {
1015 URLParser parser(builder.toString());
1016 *this = parser.result();
1018 parse(builder.toString());
// Sets or clears the username. A non-empty `user` is percent-encoded with
// PercentEncodeUsername and spliced in at m_userStart; an empty `user`
// removes the existing username. The URL is re-parsed after either change.
// NOTE(review): listing lines 1022-1025, 1033, 1036, 1040, 1042, 1045, 1051
// and 1053-1055 are not visible, so the statements guarded by several checks
// below (including whatever adjusts `u` and `end`) and the else keywords are
// not shown here.
1021 void URL::setUser(const String& user)
1026 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
1027 // and to avoid changing more than just the user login.
// `end` marks where the text kept after the username resumes.
1029 unsigned end = m_userEnd;
1030 if (!user.isEmpty()) {
1031 String u = encodeWithURLEscapeSequences(user, PercentEncodeUsername);
// The user-info offset sits right after the scheme's ':' (guarded statement
// not visible).
1032 if (m_userStart == m_schemeEnd + 1)
1034 // Add '@' if we didn't have one before.
1035 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
1037 if (URLParser::enabled()) {
1038 URLParser parser(makeString(m_string.left(m_userStart), u, m_string.substring(end)));
1039 *this = parser.result();
1041 parse(m_string.left(m_userStart) + u + m_string.substring(end));
1043 // Remove '@' if we now have neither user nor password.
1044 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
1046 // We don't want to parse in the extremely common case where we are not going to make a change.
1047 if (m_userStart != end) {
1048 if (URLParser::enabled()) {
1049 URLParser parser(makeString(m_string.left(m_userStart), m_string.substring(end)));
1050 *this = parser.result();
1052 parse(m_string.left(m_userStart) + m_string.substring(end));
// Sets or clears the password. A non-empty `password` is percent-encoded with
// PercentEncodePassword, wrapped as ":<password>@" and spliced in at
// m_userEnd; an empty `password` removes the existing one. The URL is
// re-parsed after either change.
// NOTE(review): listing lines 1058-1061, 1066, 1069, 1073, 1075, 1078, 1084
// and 1086-1088 are not visible, so the statements guarded by several checks
// below (including whatever adjusts `p` and `end`) and the else keywords are
// not shown here.
1057 void URL::setPass(const String& password)
// `end` marks where the text kept after the password resumes.
1062 unsigned end = m_passwordEnd;
1063 if (!password.isEmpty()) {
1064 String p = ":" + encodeWithURLEscapeSequences(password, PercentEncodePassword) + "@";
// The username end sits right after the scheme's ':' (guarded statement not
// visible).
1065 if (m_userEnd == m_schemeEnd + 1)
1067 // Eat the existing '@' since we are going to add our own.
1068 if (end != m_hostEnd && m_string[end] == '@')
1070 if (URLParser::enabled()) {
1071 URLParser parser(makeString(m_string.left(m_userEnd), p, m_string.substring(end)));
1072 *this = parser.result();
1074 parse(m_string.left(m_userEnd) + p + m_string.substring(end));
1076 // Remove '@' if we now have neither user nor password.
1077 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
1079 // We don't want to parse in the extremely common case where we are not going to make a change.
1080 if (m_userEnd != end) {
1081 if (URLParser::enabled()) {
1082 URLParser parser(makeString(m_string.left(m_userEnd), m_string.substring(end)));
1083 *this = parser.result();
1085 parse(m_string.left(m_userEnd) + m_string.substring(end));
// Replaces everything after m_queryEnd with "#" followed by `s`, then
// re-parses the result. `s` is appended as given, without escaping.
// NOTE(review): listing lines 1091-1094 and 1099 are not visible, so any guard
// at the top and the else keyword before parse() are not shown here.
1090 void URL::setFragmentIdentifier(const String& s)
1095 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
1096 if (URLParser::enabled()) {
1097 URLParser parser(makeString(m_string.left(m_queryEnd), "#", s));
1098 *this = parser.result();
1100 parse(m_string.left(m_queryEnd) + "#" + s);
// Strips the fragment by truncating m_string at m_queryEnd and collapsing
// m_fragmentEnd onto m_queryEnd. No re-parse is performed.
// NOTE(review): listing lines 1104-1105 and 1108-1109 are not visible;
// presumably the two assertions belong to an early-exit branch, so confirm
// the guard against the full file.
1103 void URL::removeFragmentIdentifier()
1106 ASSERT(!m_fragmentEnd);
1107 ASSERT(!m_queryEnd);
// Truncate only when fragment text actually exists.
1110 if (m_fragmentEnd > m_queryEnd)
1111 m_string = m_string.left(m_queryEnd);
1112 m_fragmentEnd = m_queryEnd;
// Replaces the query component (m_pathEnd up to m_queryEnd) with `query` and
// re-parses the result; text from m_queryEnd onward is preserved.
// NOTE(review): listing lines 1116-1119, 1128, 1130, 1134 and 1136-1138 are
// not visible, so any guard at the top and the else keywords are not shown.
1115 void URL::setQuery(const String& query)
1120 // FIXME: '#' and non-ASCII characters must be encoded and escaped.
1121 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
1122 // access to the document in this function.
1123 // https://webkit.org/b/161176
// A non-null query that does not already start with '?' gets one prepended.
1124 if ((query.isEmpty() || query[0] != '?') && !query.isNull()) {
1125 if (URLParser::enabled()) {
1126 URLParser parser(makeString(m_string.left(m_pathEnd), "?", query, m_string.substring(m_queryEnd)));
1127 *this = parser.result();
1129 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
// Otherwise the query is spliced in as given.
1131 if (URLParser::enabled()) {
1132 URLParser parser(makeString(m_string.left(m_pathEnd), query, m_string.substring(m_queryEnd)));
1133 *this = parser.result();
1135 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
// Replaces the path component (m_portEnd up to m_pathEnd) with the
// percent-encoded form of the new path and re-parses the result.
// NOTE(review): listing lines 1141-1144, 1147, 1149-1150 and 1154 are not
// visible. That hides the declaration of the local `path`, the statement
// applied when it lacks a leading '/', and the else keyword before parse().
1140 void URL::setPath(const String& s)
1145 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
1146 // may be inadvertently affected.
// Paths that are empty or do not start with '/' are special-cased.
1148 if (path.isEmpty() || path[0] != '/')
1151 if (URLParser::enabled()) {
1152 URLParser parser(makeString(m_string.left(m_portEnd), encodeWithURLEscapeSequences(path), m_string.substring(m_pathEnd)));
1153 *this = parser.result();
1155 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
1158 String decodeURLEscapeSequences(const String& string)
1160 return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
1163 String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
1165 return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
// Writes an escaped form of byte `c` at `buffer`; the hex digits are produced
// by placeByteAsHex.
// NOTE(review): listing lines 1170-1171 and 1173 are not visible, so the
// statements around the placeByteAsHex call (presumably the '%' prefix and
// the pointer advance) are not shown here.
1168 // Caution: This function does not bounds check.
1169 static void appendEscapedChar(char*& buffer, unsigned char c)
1172 placeByteAsHex(c, buffer);
// Copies `length` bytes starting at `strStart` to `buffer`, escaping
// characters that need it.
// NOTE(review): listing lines 1176-1178, 1183, 1185 and 1188-1193 are not
// visible. That hides the declaration of `p`, the test enclosing the branches
// below, the statement taken for '%' and '?', and the write-back to `buffer`.
1175 static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
1179 const char* str = strStart;
1180 const char* strEnd = strStart + length;
1181 while (str < strEnd) {
1182 unsigned char c = *str++;
// '%' and '?' are special-cased (guarded statement not visible).
1184 if (c == '%' || c == '?')
// Tab, LF and CR are excluded from this escape call.
1186 else if (c != 0x09 && c != 0x0a && c != 0x0d)
1187 appendEscapedChar(p, c);
// Copies `length` bytes starting at `strStart` to `buffer`, skipping tab, LF
// and CR and escaping control and non-ASCII bytes.
// NOTE(review): listing lines 1196-1198, 1206-1207 and 1212-1218 are not
// visible. That hides the declaration of `p`, the statement taken for
// stripped characters, the plain-copy branch, and the write-back to `buffer`.
1195 static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
1199 const char* str = strStart;
1200 const char* strEnd = strStart + length;
1201 while (str < strEnd) {
1202 unsigned char c = *str++;
1203 // Strip CR, LF and Tab from fragments, per:
1204 // https://bugs.webkit.org/show_bug.cgi?id=8770
1205 if (c == 0x09 || c == 0x0a || c == 0x0d)
1208 // Chrome and IE allow non-ascii characters in fragments, however doing
1209 // so would hit an ASSERT in checkEncodedString, so for now we don't.
// Bytes below 0x20 or at/above 127 are percent-escaped.
1210 if (c < 0x20 || c >= 127) {
1211 appendEscapedChar(p, c);
// Copies src[srcStart, srcEnd) into `dst` while resolving "." and ".."
// segments, and returns the number of characters written.
// NOTE(review): many listing lines are not visible (1237-1239, 1244-1245,
// 1251-1252, 1254, 1256-1260, 1262-1266), so the pointer increments, the
// continue statements and the rewinding of `dst` are not shown here.
1220 // copy a path, accounting for "." and ".." segments
1221 static unsigned copyPathRemovingDots(char* dst, const char* src, unsigned srcStart, unsigned srcEnd)
// Remember where the output begins so the written length can be computed.
1223 char* bufferPathStart = dst;
1225 // empty path is a special case, and need not have a leading slash
1226 if (srcStart != srcEnd) {
1227 const char* baseStringStart = src + srcStart;
1228 const char* baseStringEnd = src + srcEnd;
1229 const char* baseStringPos = baseStringStart;
1231 // this code is unprepared for paths that do not begin with a
1232 // slash and we should always have one in the source string
1233 ASSERT(baseStringPos[0] == '/');
1235 // copy the leading slash into the destination
1236 *dst = *baseStringPos;
1240 while (baseStringPos < baseStringEnd) {
// A '.' only starts a dot segment when the previous output character is '/'.
1241 if (baseStringPos[0] == '.' && dst[-1] == '/') {
1242 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
1243 // skip over "." segment
1246 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
1247 baseStringPos + 2 == baseStringEnd)) {
1248 // skip over ".." segment and rewind the last segment
1249 // the RFC leaves it up to the app to decide what to do with excess
1250 // ".." segments - we choose to drop them since some web content
// Rewinding never goes past the leading slash at bufferPathStart.
1253 if (dst > bufferPathStart + 1)
1255 while (dst > bufferPathStart && dst[-1] != '/')
// Ordinary character: copied through unchanged.
1261 *dst = *baseStringPos;
1267 return dst - bufferPathStart;
// Scans the NUL-terminated `str` for a '.' that directly follows a '/' or
// another '.', i.e. a possible "/." or ".." sequence.
// NOTE(review): listing lines 1273-1274 and 1278-1282 are not visible, so any
// guard before the first read, the update of `pc` and the return statements
// are not shown here.
1270 static inline bool hasSlashDotOrDotDot(const char* str)
1272 const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
// `pc` holds the character preceding the one currently examined.
1275 unsigned char pc = *p;
1276 while (unsigned char c = *++p) {
1277 if (c == '.' && (pc == '/' || pc == '.'))
1284 void URL::parse(const String& string)
1286 if (URLParser::enabled())
1287 ASSERT_NOT_REACHED();
1288 checkEncodedString(string);
1290 CharBuffer buffer(string.length() + 1);
1291 copyASCII(string, buffer.data());
1292 buffer[string.length()] = '\0';
1293 parse(buffer.data(), &string);
// Approximates the URL spec's "cannot-be-a-base-URL" flag by testing whether
// the scheme is one of ftp, file, gopher, http, https, ws or wss.
// NOTE(review): listing lines 1302-1304 are not visible, so the values
// returned for the two outcomes are not shown here.
1296 static inline bool cannotBeABaseURL(const URL& url)
1298 // FIXME: Support https://url.spec.whatwg.org/#url-cannot-be-a-base-url-flag properly
1299 // According spec, this should be computed at parsing time.
1300 // For the moment, we just check whether the scheme is special or not.
1301 if (url.protocolIs("ftp") || url.protocolIs("file") || url.protocolIs("gopher") || url.protocolIs("http") || url.protocolIs("https") || url.protocolIs("ws") || url.protocolIs("wss"))
// Builds the serialized text of this URL from its component ranges. When
// `omitFragment` is true the fragment is not appended.
// NOTE(review): listing lines 1308, 1310, 1312-1317, 1330, 1332, 1335, 1338,
// 1343, 1348-1350, 1353 and 1357 are not visible. That hides the guards on
// the URLParser early return, on the port and on the query, plus an else
// keyword and closing braces.
1306 // Implementation of https://url.spec.whatwg.org/#url-serializing
1307 String URL::serialize(bool omitFragment) const
// With URLParser enabled the stored string is used; this return yields it
// truncated at m_queryEnd (its guard is not visible).
1309 if (URLParser::enabled()) {
1311 return m_string.left(m_queryEnd);
// Scheme followed by ':'.
1318 StringBuilder urlBuilder;
1319 urlBuilder.append(m_string, 0, m_schemeEnd);
1320 urlBuilder.appendLiteral(":");
1321 unsigned start = hostStart();
// The authority is emitted only when the host range is non-empty.
1322 if (start < m_hostEnd) {
1323 urlBuilder.appendLiteral("//");
1324 if (hasUsername()) {
1325 urlBuilder.append(m_string, m_userStart, m_userEnd - m_userStart);
1326 unsigned passwordStart = m_userEnd + 1;
1327 if (hasPassword()) {
1328 urlBuilder.appendLiteral(":");
1329 urlBuilder.append(m_string, passwordStart, m_passwordEnd - passwordStart);
1331 urlBuilder.appendLiteral("@");
1333 // FIXME: Serialize host according https://url.spec.whatwg.org/#concept-host-serializer for IPv4 and IPv6 addresses.
1334 urlBuilder.append(m_string, start, m_hostEnd - start);
// Port, written as ':' plus its number (guarding condition not visible).
1336 urlBuilder.appendLiteral(":");
1337 urlBuilder.appendNumber(port().value());
// file URLs get "//" even without a host.
1339 } else if (protocolIs("file"))
1340 urlBuilder.appendLiteral("//");
// When cannotBeABaseURL() holds, the stored path is copied verbatim.
1341 if (cannotBeABaseURL(*this))
1342 urlBuilder.append(m_string, m_portEnd, m_pathEnd - m_portEnd);
// Otherwise a "/" is written, followed by the stored path minus its first
// character.
1344 urlBuilder.appendLiteral("/");
1345 if (m_pathEnd > m_portEnd) {
1346 unsigned pathStart = m_portEnd + 1;
1347 urlBuilder.append(m_string, pathStart, m_pathEnd - pathStart);
// Query, written as '?' plus its text (guarding condition not visible).
1351 urlBuilder.appendLiteral("?");
1352 urlBuilder.append(m_string, m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
1354 if (!omitFragment && hasFragment()) {
1355 urlBuilder.appendLiteral("#");
1356 urlBuilder.append(m_string, m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
1358 return urlBuilder.toString();
// File-level switch, on by default, consulted by the legacy parser and the
// equal() helper to decide how scheme case is handled.
// NOTE(review): listing lines 1359-1361, 1363, 1365 and 1367-1369 are not
// visible; any preprocessor conditional around these definitions is not shown.
1362 static bool shouldCanonicalizeScheme = true;
// Sets the switch above to the given value.
1364 void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
1366 shouldCanonicalizeScheme = enableSchemeCanonicalization;
// Compares the first `length` characters at `a` with the array `b`.
// NOTE(review): listing lines 1372-1373, 1377-1381 and 1383-1387 are not
// visible. That hides the comparison inside the second loop, every return,
// and any preprocessor conditional around the first loop.
1370 template<size_t length>
1371 static inline bool equal(const char* a, const char (&b)[length])
// With scheme canonicalization disabled, `a` is lower-cased before comparing.
1374 if (!shouldCanonicalizeScheme) {
1375 for (size_t i = 0; i < length; ++i) {
1376 if (toASCIILower(a[i]) != b[i])
// Default comparison loop (its body is not visible).
1382 for (size_t i = 0; i < length; ++i) {
// Length-checked overload: true only when `lengthA` equals the extent of
// `stringB` and the element-wise equal() overload reports a match.
template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    return lengthA == lengthB && equal(stringA, stringB);
}
// Returns whether `port` is the default port for `scheme`, dispatching on the
// scheme's length before comparing scheme and port text.
// NOTE(review): the case labels of the switch (listing lines 1402, 1404,
// 1409-1410, 1412, 1414, 1416-1418) are not visible; each return below
// presumably belongs to the case for the matching scheme length.
1395 // List of default schemes is taken from google-url:
1396 // http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1397 static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1399 // This switch is theoretically a performance optimization. It came over when
1400 // the code was moved from google-url, but may be removed later.
1401 switch (schemeLength) {
// ws shares the http port.
1403 return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1405 if (equal(scheme, ftpScheme))
1406 return equal(port, portLength, ftpPort);
// wss shares the https port.
1407 if (equal(scheme, wssScheme))
1408 return equal(port, portLength, httpsPort);
1411 return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1413 return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1415 return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
// Returns true when the character that ended the user info is '@' while the
// host/port range is empty (hostStart equals portEnd), i.e. credentials were
// given without any host or port.
static inline bool hostPortIsEmptyButCredentialsArePresent(unsigned hostStart, unsigned portEnd, char userinfoEndChar)
{
    return userinfoEndChar == '@' && hostStart == portEnd;
}
// Returns whether `scheme` is one of ws, ftp, wss, http, https or gopher,
// dispatching on the scheme's length first.
// NOTE(review): the case labels and any default branch of the switch (listing
// lines 1428, 1430, 1432, 1434, 1436, 1438-1440) are not visible.
1425 static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1427 switch (schemeLength) {
1429 return equal(scheme, wsScheme);
1431 return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1433 return equal(scheme, httpScheme);
1435 return equal(scheme, httpsScheme);
1437 return equal(scheme, gopherScheme);
// Returns whether host names are lower-cased during canonicalization for
// `scheme`: ws, ftp, wss, http, file, https or gopher, dispatching on the
// scheme's length first.
// NOTE(review): the case labels and any default branch of the switch (listing
// lines 1445, 1447, 1449, 1451, 1453, 1455-1457) are not visible.
1442 static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1444 switch (schemeLength) {
1446 return equal(scheme, wsScheme);
1448 return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
// Unlike isNonFileHierarchicalScheme, "file" is included here.
1450 return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1452 return equal(scheme, httpsScheme);
1454 return equal(scheme, gopherScheme);
// Legacy parser. Splits the NUL-terminated `url` into scheme, user, password,
// host, port, path, query and fragment ranges, writes a canonicalized copy
// into a scratch buffer while recording the m_*End offsets, and finally stores
// the result in m_string. `originalString`, when non-null, is stored as-is on
// the failure paths and is reused for m_string when canonicalization produced
// identical text.
// NOTE(review): this listing omits many lines (see the gaps in the embedded
// numbering). Increments, else keywords, closing braces, several declarations
// and the statements that follow each "m_string = ..." failure assignment are
// not visible here.
1459 void URL::parse(const char* url, const String* originalString)
1461 if (URLParser::enabled())
1462 ASSERT_NOT_REACHED();
1463 if (!url || url[0] == '\0') {
1464 // valid URL must be non-empty
1465 m_string = originalString ? *originalString : url;
1470 if (!isSchemeFirstChar(url[0])) {
1471 // scheme must start with an alphabetic character
1472 m_string = originalString ? *originalString : url;
// Phase 1: locate component boundaries in the input. The scheme runs up to
// the first non-scheme character, which must be ':'.
1477 unsigned schemeEnd = 0;
1478 while (isSchemeChar(url[schemeEnd]))
1481 if (url[schemeEnd] != ':') {
1482 m_string = originalString ? *originalString : url;
1487 unsigned userStart = schemeEnd + 1;
1489 unsigned passwordStart;
1490 unsigned passwordEnd;
// "scheme:/" marks a hierarchical URL; "scheme://" may introduce an authority.
1496 bool hierarchical = url[schemeEnd + 1] == '/';
1497 bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
1499 bool isFile = schemeEnd == 4
1500 && isLetterMatchIgnoringCase(url[0], 'f')
1501 && isLetterMatchIgnoringCase(url[1], 'i')
1502 && isLetterMatchIgnoringCase(url[2], 'l')
1503 && isLetterMatchIgnoringCase(url[3], 'e');
// True for "http:" and "https:" in any letter case.
1505 m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1506 && isLetterMatchIgnoringCase(url[1], 't')
1507 && isLetterMatchIgnoringCase(url[2], 't')
1508 && isLetterMatchIgnoringCase(url[3], 'p')
1509 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1511 if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1512 // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1513 // Attempt to find an authority.
1514 // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1518 if (hasSecondSlash) {
// For the non-file hierarchical schemes, any run of '/' after the scheme is
// scanned over.
1520 if (isNonFileHierarchicalScheme(url, schemeEnd)) {
1521 while (url[userStart] == '/')
1527 userEnd = userStart;
// Scan the candidate user info, watching for the first ':'.
1529 unsigned colonPos = 0;
1530 while (isUserInfoChar(url[userEnd])) {
1531 if (url[userEnd] == ':' && colonPos == 0)
1536 if (url[userEnd] == '@') {
1537 // actual end of the userinfo, start on the host
1538 if (colonPos != 0) {
1539 passwordEnd = userEnd;
1541 passwordStart = colonPos + 1;
1543 passwordStart = passwordEnd = userEnd;
1545 hostStart = passwordEnd + 1;
1546 } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1547 // hit the end of the authority, must have been no user
1548 // or looks like an IPv6 hostname
1549 // either way, try to parse it as a hostname
1550 userEnd = userStart;
1551 passwordStart = passwordEnd = userEnd;
1552 hostStart = userStart;
1554 // invalid character
1555 m_string = originalString ? *originalString : url;
1560 hostEnd = hostStart;
// A host starting with '[' is scanned with isIPv6Char and must end with ']'.
1563 if (url[hostEnd] == '[') {
1565 while (isIPv6Char(url[hostEnd]))
1567 if (url[hostEnd] == ']')
1570 // invalid character
1571 m_string = originalString ? *originalString : url;
1576 while (isHostnameChar(url[hostEnd]))
1580 if (url[hostEnd] == ':') {
1581 portStart = portEnd = hostEnd + 1;
1583 // possible start of port
1584 portEnd = portStart;
1585 while (isASCIIDigit(url[portEnd]))
1588 portStart = portEnd = hostEnd;
// The authority must be followed by a path-segment terminator.
1590 if (!isPathSegmentEndChar(url[portEnd])) {
1591 // invalid character
1592 m_string = originalString ? *originalString : url;
// Credentials followed by an empty host/port are rejected.
1597 if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
1598 m_string = originalString ? *originalString : url;
1603 if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1604 // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1605 // path segments are empty. For file, http and https only, an empty authority is allowed.
1607 userEnd = userStart;
1608 passwordStart = userEnd;
1609 passwordEnd = passwordStart;
1610 hostStart = passwordEnd;
1611 hostEnd = hostStart;
1612 portStart = hostEnd;
1616 // the part after the scheme must be an opaque_part or an abs_path
1617 userEnd = userStart;
1618 passwordStart = passwordEnd = userEnd;
1619 hostStart = hostEnd = passwordEnd;
1620 portStart = portEnd = hostEnd;
// The path runs to the first '?', '#' or the terminating NUL.
1623 unsigned pathStart = portEnd;
1624 unsigned pathEnd = pathStart;
1625 while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
1628 unsigned queryStart = pathEnd;
1629 unsigned queryEnd = queryStart;
1630 if (url[queryStart] == '?') {
1631 while (url[queryEnd] && url[queryEnd] != '#')
1635 unsigned fragmentStart = queryEnd;
1636 unsigned fragmentEnd = fragmentStart;
1637 if (url[fragmentStart] == '#') {
1639 fragmentEnd = fragmentStart;
1640 while (url[fragmentEnd])
// Phase 2: write the canonical form. The scratch buffer is sized at three
// output bytes per input character plus one.
1644 // assemble it all, remembering the real ranges
1646 Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1648 char* p = buffer.data();
1649 const char* strPtr = url;
1651 // copy in the scheme
1652 const char* schemeEndPtr = url + schemeEnd;
// NOTE(review): the three copy loops below presumably sit in different
// preprocessor/else branches whose lines are not visible.
1654 if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) {
1655 while (strPtr < schemeEndPtr)
1656 *p++ = toASCIILower(*strPtr++);
1658 while (strPtr < schemeEndPtr)
1662 while (strPtr < schemeEndPtr)
1663 *p++ = toASCIILower(*strPtr++);
1665 m_schemeEnd = p - buffer.data();
// True when the whole authority is exactly "localhost" (9 characters, any
// letter case).
1667 bool hostIsLocalHost = portEnd - userStart == 9
1668 && isLetterMatchIgnoringCase(url[userStart], 'l')
1669 && isLetterMatchIgnoringCase(url[userStart+1], 'o')
1670 && isLetterMatchIgnoringCase(url[userStart+2], 'c')
1671 && isLetterMatchIgnoringCase(url[userStart+3], 'a')
1672 && isLetterMatchIgnoringCase(url[userStart+4], 'l')
1673 && isLetterMatchIgnoringCase(url[userStart+5], 'h')
1674 && isLetterMatchIgnoringCase(url[userStart+6], 'o')
1675 && isLetterMatchIgnoringCase(url[userStart+7], 's')
1676 && isLetterMatchIgnoringCase(url[userStart+8], 't');
1678 // File URLs need a host part unless it is just file:// or file://localhost
1679 bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1681 // We drop empty credentials, but keep a colon in an empty host/port pair.
1682 // Removing hostname completely would change the structure of the URL on re-parsing.
1683 bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
1685 // add ":" after scheme
1688 // if we have at least one authority part or a file URL - add "//" and authority
1689 if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1693 m_userStart = p - buffer.data();
1696 strPtr = url + userStart;
1697 const char* userEndPtr = url + userEnd;
1698 while (strPtr < userEndPtr) {
1700 ASSERT(isUserInfoChar(c));
1703 m_userEnd = p - buffer.data();
1705 // copy in the password
1706 if (passwordEnd != passwordStart) {
1708 strPtr = url + passwordStart;
1709 const char* passwordEndPtr = url + passwordEnd;
1710 while (strPtr < passwordEndPtr) {
1712 ASSERT(isUserInfoChar(c));
1716 m_passwordEnd = p - buffer.data();
1718 // If we had any user info, add "@"
1719 if (static_cast<unsigned>(p - buffer.data()) != m_userStart)
1722 // copy in the host, except in the case of a file URL with authority="localhost"
1723 if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1724 strPtr = url + hostStart;
1725 const char* hostEndPtr = url + hostEnd;
// Host names are lower-cased only for the schemes accepted by
// isCanonicalHostnameLowercaseForScheme.
1726 if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1727 while (strPtr < hostEndPtr) {
1728 char c = toASCIILower(*strPtr++);
1729 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1733 while (strPtr < hostEndPtr) {
1735 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1740 m_hostEnd = p - buffer.data();
1742 // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
1743 if (hostEnd != portStart) {
1744 const char* portStr = url + portStart;
1745 size_t portLength = portEnd - portStart;
1746 if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
1747 || (hostStart == hostEnd && hostEnd != portStart)) {
1749 const char* portEndPtr = url + portEnd;
1750 while (portStr < portEndPtr)
1754 m_portEnd = p - buffer.data();
1757 ASSERT(degenerateFilePath);
// No authority written: all authority offsets collapse to the current position.
1761 m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1764 // For canonicalization, ensure we have a '/' for no path.
1765 // Do this only for URL with protocol file, http or https.
1766 if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1769 // add path, escaping bad characters
1771 escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1772 else if (!hasSlashDotOrDotDot(url))
1773 appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
// Paths that may contain dot segments are normalized into a temporary buffer
// first.
1775 CharBuffer pathBuffer(pathEnd - pathStart + 1);
1776 unsigned length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1777 appendEscapingBadChars(p, pathBuffer.data(), length);
1780 m_pathEnd = p - buffer.data();
1782 // Find the position after the last slash in the path, or
1783 // the position before the path if there are no slashes in it.
1785 for (i = m_pathEnd; i > m_portEnd; --i) {
1786 if (buffer[i - 1] == '/')
1789 m_pathAfterLastSlash = i;
1791 // add query, escaping bad characters
1792 appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1793 m_queryEnd = p - buffer.data();
1795 // add fragment, escaping bad characters
1796 if (fragmentEnd != queryEnd) {
1798 escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1800 m_fragmentEnd = p - buffer.data();
1802 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1803 ASSERT(buffer.size() > 0);
1805 // If we didn't end up actually changing the original string and
1806 // it was already in a String, reuse it to avoid extra allocation.
1807 if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
1808 m_string = *originalString;
1810 m_string = String(buffer.data(), m_fragmentEnd);
// Compares two URLs over everything before the fragment: both must have the
// same m_queryEnd and identical characters up to that offset.
// NOTE(review): listing lines 1818 and 1822-1824 are not visible, so the
// return statements are not shown here.
1815 bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
// Different pre-fragment lengths cannot match.
1817 if (a.m_queryEnd != b.m_queryEnd)
1819 unsigned queryLength = a.m_queryEnd;
1820 for (unsigned i = 0; i < queryLength; ++i)
1821 if (a.string()[i] != b.string()[i])
// Compares the scheme, host and port of two URLs. Scheme and host are
// compared character by character without case folding; ports are compared
// through port().
// NOTE(review): listing lines 1829-1830, 1836-1838, 1841-1844, 1847-1849 and
// 1851-1854 are not visible, so the return statements are not shown here.
1826 bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
// Cheap length checks first.
1828 if (a.m_schemeEnd != b.m_schemeEnd)
1831 unsigned hostStartA = a.hostStart();
1832 unsigned hostLengthA = a.hostEnd() - hostStartA;
1833 unsigned hostStartB = b.hostStart();
1834 unsigned hostLengthB = b.hostEnd() - b.hostStart();
1835 if (hostLengthA != hostLengthB)
// Scheme characters.
1839 for (unsigned i = 0; i < a.m_schemeEnd; ++i) {
1840 if (a.string()[i] != b.string()[i])
// Host characters.
1845 for (unsigned i = 0; i < hostLengthA; ++i) {
1846 if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1850 if (a.port() != b.port())
// Compares the host components of two URLs character by character, without
// case folding.
// NOTE(review): listing lines 1863-1864 and 1867-1871 are not visible, so the
// return statements are not shown here.
1856 bool hostsAreEqual(const URL& a, const URL& b)
1858 unsigned hostStartA = a.hostStart();
1859 unsigned hostLengthA = a.hostEnd() - hostStartA;
1860 unsigned hostStartB = b.hostStart();
1861 unsigned hostLengthB = b.hostEnd() - hostStartB;
// Hosts of different length cannot match.
1862 if (hostLengthA != hostLengthB)
1865 for (unsigned i = 0; i < hostLengthA; ++i) {
1866 if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
// Percent-encodes the UTF-8 form of `notEncodedString`: a byte is escaped
// when its percentEncodeClassTable entry is at or above `whatToEncode`.
// NOTE(review): listing lines 1886-1889 are not visible, so the branch that
// copies unescaped bytes is not shown here.
1873 String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode)
1875 CString asUTF8 = notEncodedString.utf8();
// Worst case: every input byte becomes a three-character escape.
1877 CharBuffer buffer(asUTF8.length() * 3 + 1);
1878 char* p = buffer.data();
1880 const char* str = asUTF8.data();
1881 const char* strEnd = str + asUTF8.length();
1882 while (str < strEnd) {
1883 unsigned char c = *str++;
1884 if (percentEncodeClassTable[c] >= whatToEncode)
1885 appendEscapedChar(p, c);
1890 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1892 return String(buffer.data(), p - buffer.data());
// Percent-encodes the UTF-8 form of `notEncodedString` using this overload's
// built-in choice of which bytes to escape.
// NOTE(review): listing lines 1906 and 1908-1911 are not visible, so the
// condition deciding which bytes are escaped and the branch copying the rest
// are not shown here.
1895 String encodeWithURLEscapeSequences(const String& notEncodedString)
1897 CString asUTF8 = notEncodedString.utf8();
// Worst case: every input byte becomes a three-character escape.
1899 CharBuffer buffer(asUTF8.length() * 3 + 1);
1900 char* p = buffer.data();
1902 const char* str = asUTF8.data();
1903 const char* strEnd = str + asUTF8.length();
1904 while (str < strEnd) {
1905 unsigned char c = *str++;
1907 appendEscapedChar(p, c);
1912 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1914 return String(buffer.data(), p - buffer.data());
// Free-function check that the raw text `stringURL` begins with `protocol`
// (letters matched ignoring case) immediately followed by ':'.
// NOTE(review): listing lines 1922 and 1925-1928 are not visible. That hides
// the condition guarding the ':' test (presumably the end of `protocol`) and
// the remaining return statements.
1917 static bool protocolIs(StringView stringURL, const char* protocol)
1919 assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));
1920 unsigned length = stringURL.length();
1921 for (unsigned i = 0; i < length; ++i) {
1923 return stringURL[i] == ':';
1924 if (!isLetterMatchIgnoringCase(stringURL[i], protocol[i]))
// Collects the [start, end) offsets of every host name in a mailto: URL into
// `nameRanges`.
// NOTE(review): many listing lines are not visible (1931, 1935-1939, 1943,
// 1946-1950, 1954, 1957-1962, 1964-1967, 1969-1970, 1973-1974, 1977,
// 1979-1981, 1983, 1985-1991). That hides the declaration of `p`, the loop
// headers, the tests on `c` that choose between the host-name and
// quoted-string branches, and the return statements.
1930 static void findHostnamesInMailToURL(StringView string, Vector<std::pair<unsigned, unsigned>>& nameRanges)
1932 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
1933 // Skip quoted strings so that characters in them don't confuse us.
1934 // When we find a '?' character, we are past the part of the URL that contains host names.
1940 // Find start of host name or of quoted string.
1941 unsigned hostnameOrStringStart = findFirstOf(string, p, "\"@?");
1942 if (hostnameOrStringStart == notFoundUnsigned)
1944 UChar c = string[hostnameOrStringStart];
1945 p = hostnameOrStringStart + 1;
1951 // Find end of host name.
1952 unsigned hostnameStart = p;
1953 unsigned hostnameEnd = findFirstOf(string, p, ">,?");
// With no terminator the host name runs to the end of the string.
1955 if (hostnameEnd == notFoundUnsigned) {
1956 hostnameEnd = string.length();
1963 nameRanges.append(std::make_pair(hostnameStart, hostnameEnd));
1968 // Skip quoted string.
1971 unsigned escapedCharacterOrStringEnd = findFirstOf(string, p, "\"\\");
1972 if (escapedCharacterOrStringEnd == notFoundUnsigned)
1975 c = string[escapedCharacterOrStringEnd];
1976 p = escapedCharacterOrStringEnd + 1;
1978 // If we are the end of the string, then break from the string loop back to the host name loop.
1982 // Skip escaped character.
1984 if (p == string.length())
// Locates the host name in "scheme://[userinfo@]host..." text and reports its
// [startOffset, endOffset) range through the out-parameters.
// NOTE(review): listing lines 1994, 2002-2003, 2006, 2009-2011, 2014,
// 2020-2024, 2030, 2032 and 2035-2036 are not visible. That hides the return
// statements, the statements executed when a terminator is found, and the
// else keyword between the two hostnameStart assignments.
1993 static bool findHostnameInHierarchicalURL(StringView string, unsigned& startOffset, unsigned& endOffset)
1995 // Find the host name in a hierarchical URL.
1996 // It comes after a "://" sequence, with scheme characters preceding, and
1997 // this should be the first colon in the string.
1998 // It ends with the end of the string or a ":" or a path segment ending character.
1999 // If there is a "@" character, the host part is just the part after the "@".
2000 unsigned separator = findFirstOf(string, 0, ":");
// The first ':' must be followed by "//" and at least one more character.
2001 if (separator == notFoundUnsigned || separator + 2 >= string.length() || string[separator + 1] != '/' || string[separator + 2] != '/')
2004 // Check that all characters before the :// are valid scheme characters.
2005 if (!isSchemeFirstChar(string[0]))
2007 for (unsigned i = 1; i < separator; ++i) {
2008 if (!isSchemeChar(string[i]))
2012 // Start after the separator.
2013 unsigned authorityStart = separator + 3;
2015 // Find terminating character.
2016 unsigned hostnameEnd = string.length();
2017 for (unsigned i = authorityStart; i < hostnameEnd; ++i) {
2018 UChar c = string[i];
// A ':' or a non-NUL path-segment terminator ends the host name.
2019 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
2025 // Find "@" for the start of the host name.
2026 unsigned userInfoTerminator = findFirstOf(string, authorityStart, "@");
2027 unsigned hostnameStart;
// An '@' beyond the host's end does not belong to the authority.
2028 if (userInfoTerminator == notFoundUnsigned || userInfoTerminator > hostnameEnd)
2029 hostnameStart = authorityStart;
2031 hostnameStart = userInfoTerminator + 1;
2033 startOffset = hostnameStart;
2034 endOffset = hostnameEnd;
2038 // Converts all hostnames found in the given input to punycode, preserving the
2039 // rest of the URL unchanged. The output will NOT be null-terminated.
2040 // Return value of false means error in encoding.
2041 static bool encodeHostnames(StringView string, UCharBuffer& buffer)
2045 if (protocolIs(string, "mailto")) {
2046 Vector<std::pair<unsigned, unsigned>> hostnameRanges;
2047 findHostnamesInMailToURL(string, hostnameRanges);
2048 unsigned n = hostnameRanges.size();
2050 for (unsigned i = 0; i < n; ++i) {
2051 const std::pair<unsigned, unsigned>& r = hostnameRanges[i];
2052 append(buffer, string.substring(p, r.first - p));
2053 if (!appendEncodedHostname(buffer, string.substring(r.first, r.second - r.first)))
2057 // This will copy either everything after the last hostname, or the
2058 // whole thing if there is no hostname.
2059 append(buffer, string.substring(p));
2061 unsigned hostStart, hostEnd;
2062 if (findHostnameInHierarchicalURL(string, hostStart, hostEnd)) {
2063 append(buffer, string.substring(0, hostStart)); // Before hostname.
2064 if (!appendEncodedHostname(buffer, string.substring(hostStart, hostEnd - hostStart)))
2066 append(buffer, string.substring(hostEnd)); // After hostname.
2068 // No hostname to encode, return the input.
2069 append(buffer, string);
2076 // Return value of false means error in encoding.
2077 static bool encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
2080 if (!encodeHostnames(rel, s))
2083 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
2085 unsigned pathEnd = notFoundUnsigned;
2086 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
2087 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
2088 pathEnd = findFirstOf(StringView(s.data(), s.size()), 0, "#?");
2091 if (pathEnd == notFoundUnsigned) {
2092 CString decoded = pathEncoding.encode(StringView(s.data(), s.size()), URLEncodedEntitiesForUnencodables);
2093 output.resize(decoded.length());
2094 memcpy(output.data(), decoded.data(), decoded.length());
2096 CString pathDecoded = pathEncoding.encode(StringView(s.data(), pathEnd), URLEncodedEntitiesForUnencodables);
2097 // Unencodable characters in URLs are represented by converting
2098 // them to XML entities and escaping non-alphanumeric characters.
2099 CString otherDecoded = encoding.encode(StringView(s.data() + pathEnd, s.size() - pathEnd), URLEncodedEntitiesForUnencodables);
2101 output.resize(pathDecoded.length() + otherDecoded.length());
2102 memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
2103 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
2105 output.append('\0'); // null-terminate the output.
2110 static String substituteBackslashes(const String& string)
2112 size_t questionPos = string.find('?');
2113 size_t hashPos = string.find('#');
2116 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
2118 else if (questionPos != notFound)
2119 pathEnd = questionPos;
2121 pathEnd = string.length();
2123 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
2126 bool URL::isHierarchical() const
2130 ASSERT(m_string[m_schemeEnd] == ':');
2131 return m_string[m_schemeEnd + 1] == '/';
2134 void URL::copyToBuffer(Vector<char, 512>& buffer) const
2136 // FIXME: This throws away the high bytes of all the characters in the string!
2137 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
2138 buffer.resize(m_string.length());
2139 copyASCII(m_string, buffer.data());
2142 // FIXME: Why is this different than protocolIs(StringView, const char*)?
2143 bool protocolIs(const String& url, const char* protocol)
2145 // Do the comparison without making a new string object.
2146 assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));
2147 bool isLeading = true;
2148 for (unsigned i = 0, j = 0; url[i]; ++i) {
2149 // skip leading whitespace and control characters.
2150 if (isLeading && shouldTrimFromURL(url[i]))
2154 // skip any tabs and newlines.
2155 if (isTabNewline(url[i]))
2159 return url[i] == ':';
2160 if (!isLetterMatchIgnoringCase(url[i], protocol[j]))
2169 bool isValidProtocol(const String& protocol)
2171 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
2172 if (protocol.isEmpty())
2174 if (!isSchemeFirstChar(protocol[0]))
2176 unsigned protocolLength = protocol.length();
2177 for (unsigned i = 1; i < protocolLength; i++) {
2178 if (!isSchemeChar(protocol[i]))
2185 void URL::print() const
2187 printf("%s\n", m_string.utf8().data());
2191 String URL::strippedForUseAsReferrer() const
2193 URL referrer(*this);
2194 referrer.setUser(String());
2195 referrer.setPass(String());
2196 referrer.removeFragmentIdentifier();
2197 return referrer.string();
2200 bool URL::isLocalFile() const
2202 // Including feed here might be a bad idea since drag and drop uses this check
2203 // and including feed would allow feeds to potentially let someone's blog
2204 // read the contents of the clipboard on a drag, even without a drop.
2205 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
2206 return protocolIs("file");
2209 bool protocolIsJavaScript(const String& url)
2211 return protocolIs(url, "javascript");
2214 bool protocolIsInHTTPFamily(const String& url)
2216 // Do the comparison without making a new string object.
2217 return isLetterMatchIgnoringCase(url[0], 'h')
2218 && isLetterMatchIgnoringCase(url[1], 't')
2219 && isLetterMatchIgnoringCase(url[2], 't')
2220 && isLetterMatchIgnoringCase(url[3], 'p')
2221 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
2224 const URL& blankURL()
2226 static NeverDestroyed<URL> staticBlankURL(ParsedURLString, "about:blank");
2227 return staticBlankURL;
2230 bool URL::isBlankURL() const
2232 return protocolIs("about");
// Decides, from the port number alone, whether the given URL may be used.
// The URL's port is looked up in a table of blocked ports; the comments below describe
// exceptions for ftp: URLs on ports 21 and 22 and for file: URLs.
// NOTE(review): only part of this function is visible in this listing (most table entries
// and the statements under each condition are not shown), so these notes are limited to
// what the visible lines and their existing comments establish.
2235 bool portAllowed(const URL& url)
// url.port() yields an Optional, so a URL may have no port at all.
2237 Optional<uint16_t> port = url.port();
2239 // Since most URLs don't have a port, return early for the "no port" case.
2243 // This blocked port list matches the port blocking that Mozilla implements.
2244 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
// The table is searched with std::binary_search further down, so its entries have to be in
// strictly ascending order.
2245 static const uint16_t blockedPortList[] = {
2279 135, // loc-srv / epmap
2285 512, // print / exec
2302 3659, // apple-sasl / PasswordServer [Apple addition]
2304 4190, // ManageSieve [Apple addition]
2306 6665, // Alternate IRC [Apple addition]
2307 6666, // Alternate IRC [Apple addition]
2308 6667, // Standard IRC [Apple addition]
2309 6668, // Alternate IRC [Apple addition]
2310 6669, // Alternate IRC [Apple addition]
2311 invalidPortNumber, // Used to block all invalid port numbers
// One-past-the-end pointer for the table; it bounds both the ordering check and the search.
2313 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
2316 // The port list must be sorted for binary_search to work.
// checkedPortList is a function-local static flag, so the ordering check below runs only
// until it has completed once. Each entry is asserted to be strictly smaller than its successor.
2317 static bool checkedPortList = false;
2318 if (!checkedPortList) {
2319 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
2320 ASSERT(*p < *(p + 1));
2321 checkedPortList = true;
2325 // If the port is not in the blocked port list, allow it.
2326 if (!std::binary_search(blockedPortList, blockedPortListEnd, port.value()))
2329 // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
// The port comparison comes first, so the protocol is only examined for ports 21 and 22.
2330 if ((port.value() == 21 || port.value() == 22) && url.protocolIs("ftp"))
2333 // Allow any port number in a file URL, since the port number is ignored.
2334 if (url.protocolIs("file"))
2340 String mimeTypeFromDataURL(const String& url)
2342 ASSERT(protocolIs(url, "data"));
2344 // FIXME: What's the right behavior when the URL has a comma first, but a semicolon later?
2345 // Currently this code will break at the semicolon in that case. Not sure that's correct.
2346 auto index = url.find(';', 5);
2347 if (index == notFound)
2348 index = url.find(',', 5);
2349 if (index == notFound) {
2350 // FIXME: There was an old comment here that made it sound like this should be returning text/plain.
2351 // But we have been returning empty string here for some time, so not changing its behavior at this time.
2352 return emptyString();
2355 return ASCIILiteral("text/plain");
2357 return url.substring(5, index - 5).convertToASCIILowercase();
2360 String mimeTypeFromURL(const URL& url)
2362 String decodedPath = decodeURLEscapeSequences(url.path());
2363 String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
2365 // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
2366 return MIMETypeRegistry::getMIMETypeForExtension(extension);
2369 String URL::stringCenterEllipsizedToLength(unsigned length) const
2371 if (string().length() <= length)
2374 return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
2377 URL URL::fakeURLWithRelativePart(const String& relativePart)
2379 return URL(URL(), "webkit-fake-url://" + createCanonicalUUIDString() + '/' + relativePart);
2382 URL URL::fileURLWithFileSystemPath(const String& filePath)
2384 return URL(URL(), "file:///" + filePath);