2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013, 2015 Apple Inc. All rights reserved.
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #include "DecodeEscapeSequences.h"
31 #include "MIMETypeRegistry.h"
32 #include "TextEncoding.h"
35 #include <unicode/uidna.h>
36 #include <wtf/HashMap.h>
37 #include <wtf/HexNumber.h>
38 #include <wtf/StdLibExtras.h>
39 #include <wtf/text/CString.h>
40 #include <wtf/text/StringBuilder.h>
41 #include <wtf/text/StringHash.h>
43 // FIXME: This file makes too much use of the + operator on String.
44 // We either have to optimize that operator so it doesn't involve
45 // so many allocations, or change this to use StringBuffer instead.
51 typedef Vector<char, 512> CharBuffer;
52 typedef Vector<UChar, 512> UCharBuffer;
54 static const unsigned maximumValidPortNumber = 0xFFFE;
55 static const unsigned invalidPortNumber = 0xFFFF;
57 static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
59 ASSERT(isASCIILower(lowercaseLetter));
60 return (character | 0x20) == lowercaseLetter;
// Scheme names and port strings spelled out as character arrays. The initializer
// lists contain no terminating '\0', so each array's sizeof() equals its character
// count and the arrays must not be handed to functions that expect C strings.
// NOTE(review): the *Port arrays presumably hold each scheme's default port —
// confirm at the use sites.
63 static const char wsScheme[] = {'w', 's'};
64 static const char ftpScheme[] = {'f', 't', 'p'};
65 static const char ftpPort[] = {'2', '1'};
66 static const char wssScheme[] = {'w', 's', 's'};
67 static const char fileScheme[] = {'f', 'i', 'l', 'e'};
68 static const char httpScheme[] = {'h', 't', 't', 'p'};
69 static const char httpPort[] = {'8', '0'};
70 static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
71 static const char httpsPort[] = {'4', '4', '3'};
72 static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
73 static const char gopherPort[] = {'7', '0'};
75 static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
77 ASSERT(isASCIILower(lowercaseLetter));
78 return (character | 0x20) == lowercaseLetter;
// Bit flags describing which parts of a URL a character may appear in.
// Entries of characterClassTable are formed by OR-ing these flags together, and
// the is*Char() helpers test a single flag against the table entry.
81 enum URLCharacterClasses {
83 SchemeFirstChar = 1 << 0,
85 // ( alpha | digit | "+" | "-" | "." )
88 // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
89 // unreserved = alphanum | mark
90 // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
91 UserInfoChar = 1 << 2,
93 // alnum | "." | "-" | "%"
94 // The above is what the specification says, but we are lenient to
95 // match existing practice and also allow:
97 HostnameChar = 1 << 3,
99 // hexdigit | ":" | "%"
102 // "#" | "?" | "/" | nul
103 PathSegmentEndChar = 1 << 5,
105 // not allowed in path
109 static const unsigned char characterClassTable[256] = {
110 /* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
111 /* 2 stx */ BadChar, /* 3 etx */ BadChar,
112 /* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
113 /* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar,
114 /* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar,
115 /* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
116 /* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
117 /* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
118 /* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
119 /* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
120 /* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
121 /* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
122 /* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
123 /* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
124 /* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
125 /* 44 , */ UserInfoChar,
126 /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
127 /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
128 /* 47 / */ PathSegmentEndChar,
129 /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
130 /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
131 /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
132 /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
133 /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
134 /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
135 /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
136 /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
137 /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
138 /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
139 /* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
140 /* 60 < */ BadChar, /* 61 = */ UserInfoChar,
141 /* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
143 /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
144 /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
145 /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
146 /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
147 /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
148 /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
149 /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
150 /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
151 /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
152 /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
153 /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
154 /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
155 /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
156 /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
157 /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
158 /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
159 /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
160 /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
161 /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
162 /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
163 /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
164 /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
165 /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
166 /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
167 /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
168 /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
170 /* 92 \ */ 0, /* 93 ] */ 0,
172 /* 95 _ */ UserInfoChar | HostnameChar,
174 /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
175 /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
176 /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
177 /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
178 /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
179 /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
180 /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
181 /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
182 /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
183 /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
184 /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
185 /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
186 /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
187 /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
188 /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
189 /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
190 /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
191 /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
192 /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
193 /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
194 /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
195 /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
196 /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
197 /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
198 /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
199 /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
201 /* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
202 /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
203 /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
204 /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
205 /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
206 /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
207 /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
208 /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
209 /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
210 /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
211 /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
212 /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
213 /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
214 /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
215 /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
216 /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
217 /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
218 /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
219 /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
220 /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
221 /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
222 /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
223 /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
224 /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
225 /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
226 /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
227 /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
228 /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
229 /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
230 /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
231 /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
232 /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
233 /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
// Percent-encoding classes; these are the entry values of percentEncodeClassTable.
// The numeric values are bit masks that lose their top set bit at each step
// (0xFF, 0x7F, 0x3F, 0x1F).
236 enum PercentEncodeCharacterClass {
237 // Class names match the URL Standard; each class is a superset of the previous one.
238 PercentEncodeSimple = 255,
239 PercentEncodeDefault = 127,
240 PercentEncodePassword = 63,
241 PercentEncodeUsername = 31,
244 static const unsigned char percentEncodeClassTable[256] = {
245 /* 0 nul */ PercentEncodeSimple, /* 1 soh */ PercentEncodeSimple, /* 2 stx */ PercentEncodeSimple, /* 3 etx */ PercentEncodeSimple,
246 /* 4 eot */ PercentEncodeSimple, /* 5 enq */ PercentEncodeSimple, /* 6 ack */ PercentEncodeSimple, /* 7 bel */ PercentEncodeSimple,
247 /* 8 bs */ PercentEncodeSimple, /* 9 ht */ PercentEncodeSimple, /* 10 nl */ PercentEncodeSimple, /* 11 vt */ PercentEncodeSimple,
248 /* 12 np */ PercentEncodeSimple, /* 13 cr */ PercentEncodeSimple, /* 14 so */ PercentEncodeSimple, /* 15 si */ PercentEncodeSimple,
249 /* 16 dle */ PercentEncodeSimple, /* 17 dc1 */ PercentEncodeSimple, /* 18 dc2 */ PercentEncodeSimple, /* 19 dc3 */ PercentEncodeSimple,
250 /* 20 dc4 */ PercentEncodeSimple, /* 21 nak */ PercentEncodeSimple, /* 22 syn */ PercentEncodeSimple, /* 23 etb */ PercentEncodeSimple,
251 /* 24 can */ PercentEncodeSimple, /* 25 em */ PercentEncodeSimple, /* 26 sub */ PercentEncodeSimple, /* 27 esc */ PercentEncodeSimple,
252 /* 28 fs */ PercentEncodeSimple, /* 29 gs */ PercentEncodeSimple, /* 30 rs */ PercentEncodeSimple, /* 31 us */ PercentEncodeSimple,
253 /* 32 sp */ PercentEncodeDefault,
255 /* 34 " */ PercentEncodeDefault,
256 /* 35 # */ PercentEncodeDefault,
268 /* 47 / */ PercentEncodePassword,
269 /* 48 0 */ 0, /* 49 1 */ 0, /* 50 2 */ 0, /* 51 3 */ 0,
270 /* 52 4 */ 0, /* 53 5 */ 0, /* 54 6 */ 0, /* 55 7 */ 0,
271 /* 56 8 */ 0, /* 57 9 */ 0,
272 /* 58 : */ PercentEncodeUsername,
274 /* 60 < */ PercentEncodeDefault,
276 /* 62 > */ PercentEncodeDefault,
277 /* 63 ? */ PercentEncodeDefault,
278 /* 64 @ */ PercentEncodePassword,
279 /* 65 A */ 0, /* 66 B */ 0, /* 67 C */ 0, /* 68 D */ 0,
280 /* 69 E */ 0, /* 70 F */ 0, /* 71 G */ 0, /* 72 H */ 0,
281 /* 73 I */ 0, /* 74 J */ 0, /* 75 K */ 0, /* 76 L */ 0,
282 /* 77 M */ 0, /* 78 N */ 0, /* 79 O */ 0, /* 80 P */ 0,
283 /* 81 Q */ 0, /* 82 R */ 0, /* 83 S */ 0, /* 84 T */ 0,
284 /* 85 U */ 0, /* 86 V */ 0, /* 87 W */ 0, /* 88 X */ 0,
285 /* 89 Y */ 0, /* 90 Z */ 0,
287 /* 92 \ */ PercentEncodePassword,
291 /* 96 ` */ PercentEncodeDefault,
292 /* 97 a */ 0, /* 98 b */ 0, /* 99 c */ 0, /* 100 d */ 0,
293 /* 101 e */ 0, /* 102 f */ 0, /* 103 g */ 0, /* 104 h */ 0,
294 /* 105 i */ 0, /* 106 j */ 0, /* 107 k */ 0, /* 108 l */ 0,
295 /* 109 m */ 0, /* 110 n */ 0, /* 111 o */ 0, /* 112 p */ 0,
296 /* 113 q */ 0, /* 114 r */ 0, /* 115 s */ 0, /* 116 t */ 0,
297 /* 117 u */ 0, /* 118 v */ 0, /* 119 w */ 0, /* 120 x */ 0,
298 /* 121 y */ 0, /* 122 z */ 0,
303 /* 127 del */ PercentEncodeSimple,
304 /* 128 */ PercentEncodeSimple, /* 129 */ PercentEncodeSimple, /* 130 */ PercentEncodeSimple, /* 131 */ PercentEncodeSimple,
305 /* 132 */ PercentEncodeSimple, /* 133 */ PercentEncodeSimple, /* 134 */ PercentEncodeSimple, /* 135 */ PercentEncodeSimple,
306 /* 136 */ PercentEncodeSimple, /* 137 */ PercentEncodeSimple, /* 138 */ PercentEncodeSimple, /* 139 */ PercentEncodeSimple,
307 /* 140 */ PercentEncodeSimple, /* 141 */ PercentEncodeSimple, /* 142 */ PercentEncodeSimple, /* 143 */ PercentEncodeSimple,
308 /* 144 */ PercentEncodeSimple, /* 145 */ PercentEncodeSimple, /* 146 */ PercentEncodeSimple, /* 147 */ PercentEncodeSimple,
309 /* 148 */ PercentEncodeSimple, /* 149 */ PercentEncodeSimple, /* 150 */ PercentEncodeSimple, /* 151 */ PercentEncodeSimple,
310 /* 152 */ PercentEncodeSimple, /* 153 */ PercentEncodeSimple, /* 154 */ PercentEncodeSimple, /* 155 */ PercentEncodeSimple,
311 /* 156 */ PercentEncodeSimple, /* 157 */ PercentEncodeSimple, /* 158 */ PercentEncodeSimple, /* 159 */ PercentEncodeSimple,
312 /* 160 */ PercentEncodeSimple, /* 161 */ PercentEncodeSimple, /* 162 */ PercentEncodeSimple, /* 163 */ PercentEncodeSimple,
313 /* 164 */ PercentEncodeSimple, /* 165 */ PercentEncodeSimple, /* 166 */ PercentEncodeSimple, /* 167 */ PercentEncodeSimple,
314 /* 168 */ PercentEncodeSimple, /* 169 */ PercentEncodeSimple, /* 170 */ PercentEncodeSimple, /* 171 */ PercentEncodeSimple,
315 /* 172 */ PercentEncodeSimple, /* 173 */ PercentEncodeSimple, /* 174 */ PercentEncodeSimple, /* 175 */ PercentEncodeSimple,
316 /* 176 */ PercentEncodeSimple, /* 177 */ PercentEncodeSimple, /* 178 */ PercentEncodeSimple, /* 179 */ PercentEncodeSimple,
317 /* 180 */ PercentEncodeSimple, /* 181 */ PercentEncodeSimple, /* 182 */ PercentEncodeSimple, /* 183 */ PercentEncodeSimple,
318 /* 184 */ PercentEncodeSimple, /* 185 */ PercentEncodeSimple, /* 186 */ PercentEncodeSimple, /* 187 */ PercentEncodeSimple,
319 /* 188 */ PercentEncodeSimple, /* 189 */ PercentEncodeSimple, /* 190 */ PercentEncodeSimple, /* 191 */ PercentEncodeSimple,
320 /* 192 */ PercentEncodeSimple, /* 193 */ PercentEncodeSimple, /* 194 */ PercentEncodeSimple, /* 195 */ PercentEncodeSimple,
321 /* 196 */ PercentEncodeSimple, /* 197 */ PercentEncodeSimple, /* 198 */ PercentEncodeSimple, /* 199 */ PercentEncodeSimple,
322 /* 200 */ PercentEncodeSimple, /* 201 */ PercentEncodeSimple, /* 202 */ PercentEncodeSimple, /* 203 */ PercentEncodeSimple,
323 /* 204 */ PercentEncodeSimple, /* 205 */ PercentEncodeSimple, /* 206 */ PercentEncodeSimple, /* 207 */ PercentEncodeSimple,
324 /* 208 */ PercentEncodeSimple, /* 209 */ PercentEncodeSimple, /* 210 */ PercentEncodeSimple, /* 211 */ PercentEncodeSimple,
325 /* 212 */ PercentEncodeSimple, /* 213 */ PercentEncodeSimple, /* 214 */ PercentEncodeSimple, /* 215 */ PercentEncodeSimple,
326 /* 216 */ PercentEncodeSimple, /* 217 */ PercentEncodeSimple, /* 218 */ PercentEncodeSimple, /* 219 */ PercentEncodeSimple,
327 /* 220 */ PercentEncodeSimple, /* 221 */ PercentEncodeSimple, /* 222 */ PercentEncodeSimple, /* 223 */ PercentEncodeSimple,
328 /* 224 */ PercentEncodeSimple, /* 225 */ PercentEncodeSimple, /* 226 */ PercentEncodeSimple, /* 227 */ PercentEncodeSimple,
329 /* 228 */ PercentEncodeSimple, /* 229 */ PercentEncodeSimple, /* 230 */ PercentEncodeSimple, /* 231 */ PercentEncodeSimple,
330 /* 232 */ PercentEncodeSimple, /* 233 */ PercentEncodeSimple, /* 234 */ PercentEncodeSimple, /* 235 */ PercentEncodeSimple,
331 /* 236 */ PercentEncodeSimple, /* 237 */ PercentEncodeSimple, /* 238 */ PercentEncodeSimple, /* 239 */ PercentEncodeSimple,
332 /* 240 */ PercentEncodeSimple, /* 241 */ PercentEncodeSimple, /* 242 */ PercentEncodeSimple, /* 243 */ PercentEncodeSimple,
333 /* 244 */ PercentEncodeSimple, /* 245 */ PercentEncodeSimple, /* 246 */ PercentEncodeSimple, /* 247 */ PercentEncodeSimple,
334 /* 248 */ PercentEncodeSimple, /* 249 */ PercentEncodeSimple, /* 250 */ PercentEncodeSimple, /* 251 */ PercentEncodeSimple,
335 /* 252 */ PercentEncodeSimple, /* 253 */ PercentEncodeSimple, /* 254 */ PercentEncodeSimple, /* 255 */ PercentEncodeSimple
338 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
339 static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
340 static String substituteBackslashes(const String&);
342 static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
343 static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
344 static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
345 static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
346 static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
347 static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
348 static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
349 static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
350 static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
351 static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
353 static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
355 ASSERT(isSchemeChar(character));
356 ASSERT(schemeCharacter & 0x20);
357 ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
358 return (character | 0x20) == schemeCharacter;
// Declaration only; the definition is not in this part of the file.
// URL::setUser() calls it with PercentEncodeUsername.
// NOTE(review): presumably percent-encodes the characters of |notEncodedString|
// that fall into the |whatToEncode| class — confirm at the definition.
361 String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode);
363 // Copies the source to the destination, assuming all the source characters are
364 // ASCII. The destination buffer must be large enough. Null characters are allowed
365 // in the source string, and no attempt is made to null-terminate the result.
// |dest| must have room for string.length() chars.
366 static void copyASCII(const String& string, char* dest)
368 if (string.isEmpty())
// 8-bit backing store: the bytes are copied unchanged.
372 memcpy(dest, string.characters8(), string.length());
// 16-bit backing store: each code unit is narrowed to char, keeping only its low byte.
374 const UChar* src = string.characters16();
375 size_t length = string.length();
376 for (size_t i = 0; i < length; i++)
377 dest[i] = static_cast<char>(src[i]);
381 static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
383 buffer.resize(base.length() + len + 1);
384 copyASCII(base, buffer.data());
385 memcpy(buffer.data() + base.length(), rel, len);
386 buffer[buffer.size() - 1] = '\0';
389 // FIXME: Move to WTFString.h eventually.
390 // Returns the index of the first character in |string|, at or after
391 // |startPosition|, that equals any of the characters in |target|. |target| must
392 // be a null-terminated string; every character up to the null is searched.
// NOTE(review): this comment previously read "Returns int if not found"; the
// not-found sentinel is presumably -1 — confirm against the return statements.
393 static int findFirstOf(StringView string, unsigned startPosition, const char* target)
395 unsigned length = string.length();
396 for (unsigned i = startPosition; i < length; ++i) {
// Compare the current character against each candidate in |target|.
397 for (unsigned j = 0; target[j]; ++j) {
398 if (string[i] == target[j])
405 static inline void checkEncodedString(const String& url)
407 ASSERT_UNUSED(url, url.containsOnlyASCII());
408 ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
411 inline bool URL::protocolIs(const String& string, const char* protocol)
413 return WebCore::protocolIs(string, protocol);
// Resets parsed state on this URL. The assignments shown clear the HTTP-family
// flag and zero m_pathAfterLastSlash.
416 void URL::invalidate()
419 m_protocolIsInHTTPFamily = false;
427 m_pathAfterLastSlash = 0;
// Constructs a URL from |url|. The assertion requires that the stored m_string
// ends up identical to the input.
// NOTE(review): ParsedURLStringTag presumably signals that |url| is already in
// canonical parsed form — confirm with the class declaration.
432 URL::URL(ParsedURLStringTag, const String& url)
435 ASSERT(url == m_string);
438 URL::URL(const URL& base, const String& relative)
440 init(base, relative, UTF8Encoding());
443 URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
445 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
446 // we do when submitting a form. A form with GET method
447 // has its contents added to a URL as query params and it makes sense
449 init(base, relative, encoding.encodingForFormSubmission());
// Predicate used by URL::init() to strip characters from both ends of the input.
452 static bool shouldTrimFromURL(unsigned char c)
454 // Browsers ignore leading/trailing whitespace and control
455 // characters from URLs. Note that c is an *unsigned* char here
456 // so this comparison should only catch control characters.
// Initializes this URL by resolving |relative| against |base|.
// |relative| is first converted to a char buffer (copied directly when it is all
// ASCII, otherwise encoded with |encoding|) and trimmed at both ends. The
// reference is then classified — absolute, empty, fragment-only, query-only,
// net-path/absolute-path or relative-path — and the matching string is assembled
// in a scratch buffer and handed to parse().
460 void URL::init(const URL& base, const String& relative, const TextEncoding& encoding)
462 // Allow resolutions with a null or empty base URL, but not with any other invalid one.
463 // FIXME: Is this a good rule?
464 if (!base.m_isValid && !base.isEmpty()) {
470 // For compatibility with Win IE, treat backslashes as if they were slashes,
471 // as long as we're not dealing with javascript: or data: URLs.
472 String rel = relative;
473 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
474 rel = substituteBackslashes(rel);
476 bool allASCII = rel.containsOnlyASCII();
477 CharBuffer strBuffer;
482 strBuffer.resize(len + 1);
483 copyASCII(rel, strBuffer.data());
485 str = strBuffer.data();
487 encodeRelativeString(rel, encoding, strBuffer);
488 str = strBuffer.data();
492 // Get rid of leading whitespace and control characters.
493 while (len && shouldTrimFromURL(*str)) {
498 // Get rid of trailing whitespace and control characters.
499 while (len && shouldTrimFromURL(str[len - 1]))
502 // According to the RFC, the reference should be interpreted as an
503 // absolute URI if possible, using the "leftmost, longest"
504 // algorithm. If the URI reference is absolute it will have a
505 // scheme, meaning that it will have a colon before the first
506 // non-scheme element.
507 bool absolute = false;
509 if (isSchemeFirstChar(*p)) {
511 while (isSchemeChar(*p)) {
515 if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical())
522 CharBuffer parseBuffer;
525 parse(str, &relative);
527 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
528 // unless the relative URL is a single fragment.
529 if (!base.isHierarchical()) {
531 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
532 parse(parseBuffer.data(), &relative);
542 // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
544 removeFragmentIdentifier();
547 // must be fragment-only reference
548 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
549 parse(parseBuffer.data(), &relative);
553 // query-only reference, special case needed for non-URL results
554 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
555 parse(parseBuffer.data(), &relative);
559 // must be net-path or absolute-path reference
562 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
563 parse(parseBuffer.data(), &relative);
566 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
567 parse(parseBuffer.data(), &relative);
572 // must be relative-path reference
574 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
575 const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
576 parseBuffer.resize(bufferSize);
578 char* bufferPos = parseBuffer.data();
579 char* bufferStart = bufferPos;
581 // first copy everything before the path from the base
582 CharBuffer baseStringBuffer(base.m_string.length());
583 copyASCII(base.m_string, baseStringBuffer.data());
584 const char* baseString = baseStringBuffer.data();
585 const char* baseStringStart = baseString;
586 const char* pathStart = baseStringStart + base.m_portEnd;
587 while (baseStringStart < pathStart)
588 *bufferPos++ = *baseStringStart++;
589 char* bufferPathStart = bufferPos;
591 // now copy the base path
592 const char* baseStringEnd = baseString + base.m_pathEnd;
594 // go back to the last slash
595 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
598 if (baseStringEnd == baseStringStart) {
599 // no path in base, add a path separator if necessary
600 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
603 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
606 const char* relStringStart = str;
607 const char* relStringPos = relStringStart;
609 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
610 if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
611 if (isPathSegmentEndChar(relStringPos[1])) {
612 // skip over "." segment
614 if (relStringPos[0] == '/')
617 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
618 // skip over ".." segment and rewind the last segment
619 // the RFC leaves it up to the app to decide what to do with excess
620 // ".." segments - we choose to drop them since some web content
623 if (relStringPos[0] == '/')
625 if (bufferPos > bufferPathStart + 1)
627 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/')
633 *bufferPos = *relStringPos;
638 // all done with the path work, now copy any remainder
639 // of the relative reference; this will also add a null terminator
640 strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
642 parse(parseBuffer.data(), &relative);
644 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
// Produces a copy of this URL in which m_string has been replaced by its own
// isolatedCopy().
// NOTE(review): isolatedCopy() presumably yields a string that shares no buffer
// with the original (e.g. for use on another thread) — confirm.
651 URL URL::copy() const
654 result.m_string = result.m_string.isolatedCopy();
// Returns the final '/'-delimited component of the path.
// |end| starts at the last character of the path; a '/' in that position is
// special-cased. The component's start is located by searching backwards for '/'
// from |end|, and a slash found before m_portEnd (i.e. before the path begins)
// is handled separately.
658 String URL::lastPathComponent() const
663 unsigned end = m_pathEnd - 1;
664 if (m_string[end] == '/')
667 size_t start = m_string.reverseFind('/', end);
668 if (start < static_cast<unsigned>(m_portEnd))
672 return m_string.substring(start, end - start + 1);
675 String URL::protocol() const
677 return m_string.left(m_schemeEnd);
680 String URL::host() const
682 int start = hostStart();
683 return m_string.substring(start, m_hostEnd - start);
// Returns the port number written between the host and the path.
// The digits are the characters from m_hostEnd + 1 up to m_portEnd and are parsed
// strictly as an unsigned integer. invalidPortNumber is returned when parsing
// fails or the value exceeds maximumValidPortNumber.
686 unsigned short URL::port() const
688 // We return a port of 0 if there is no port specified. This can happen in two situations:
689 // 1) The URL contains no colon after the host name and before the path component of the URL.
690 // 2) The URL contains a colon but there's no port number before the path component of the URL begins.
691 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
// The 8-bit and 16-bit string representations need separate calls; both parse the same range.
696 if (m_string.is8Bit())
697 number = charactersToUIntStrict(m_string.characters8() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
699 number = charactersToUIntStrict(m_string.characters16() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
700 if (!ok || number > maximumValidPortNumber)
701 return invalidPortNumber;
705 String URL::user() const
707 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
// Returns the password with URL escape sequences decoded.
// The encoded password is the text from m_userEnd + 1 (skipping the one character
// at m_userEnd) up to m_passwordEnd. m_passwordEnd == m_userEnd is the case where
// no password text is present.
710 String URL::pass() const
712 if (m_passwordEnd == m_userEnd)
715 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
718 String URL::encodedUser() const
720 return m_string.substring(m_userStart, m_userEnd - m_userStart);
// Returns the password exactly as it appears in the URL string, without decoding
// escape sequences: the text from m_userEnd + 1 up to m_passwordEnd.
// m_passwordEnd == m_userEnd is the case where no password text is present.
723 String URL::encodedPass() const
725 if (m_passwordEnd == m_userEnd)
728 return m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1);
// Returns the fragment identifier: the text from m_queryEnd + 1 (skipping the one
// character at m_queryEnd) up to m_fragmentEnd. m_fragmentEnd == m_queryEnd is
// the case where the URL has no fragment (compare hasFragmentIdentifier()).
731 String URL::fragmentIdentifier() const
733 if (m_fragmentEnd == m_queryEnd)
736 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
739 bool URL::hasFragmentIdentifier() const
741 return m_fragmentEnd != m_queryEnd;
744 String URL::baseAsString() const
746 return m_string.left(m_pathAfterLastSlash);
// Returns the URL's path with URL escape sequences decoded.
// URLs that are invalid or are not local-file URLs are handled separately by the
// guard at the top.
750 String URL::fileSystemPath() const
752 if (!isValid() || !isLocalFile())
755 return decodeURLEscapeSequences(path());
// No-op variant: ignores its argument.
// NOTE(review): the two variants are presumably selected by a preprocessor
// condition (e.g. release vs. debug builds) that is not shown here — confirm.
761 static inline void assertProtocolIsGood(const char*)
// Checking variant: asserts that each examined character of |protocol| is
// printable ASCII (above ' ' and below 0x7F) and is not an uppercase letter.
767 static void assertProtocolIsGood(const char* protocol)
769 const char* p = protocol;
771 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
// Reports whether this URL's scheme equals |protocol|, ignoring ASCII case.
// |protocol| must satisfy assertProtocolIsGood() and must not be "javascript".
// The first m_schemeEnd characters of m_string are compared one by one with
// |protocol|; the final check requires |protocol| to end exactly at that length.
778 bool URL::protocolIs(const char* protocol) const
780 assertProtocolIsGood(protocol);
782 // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
783 // The free function protocolIsJavaScript() should be used instead.
784 ASSERT(!equalIgnoringCase(protocol, String("javascript")));
789 // Do the comparison without making a new string object.
790 for (int i = 0; i < m_schemeEnd; ++i) {
791 if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
794 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
// Returns the query component without its leading separator character.
797 String URL::query() const
// The query range is empty when it ends where the path ends.
799 if (m_queryEnd == m_pathEnd)
// m_pathEnd indexes the separator, so the query text starts one past it.
802 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
805 String URL::path() const
807 return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
// Replaces the URL's scheme with the part of |s| before its first ':' and
// re-parses the result. The candidate scheme is checked with isValidProtocol.
// NOTE(review): the statements that choose between the two parse() calls below, and
// the return values, are not visible in this view.
810 bool URL::setProtocol(const String& s)
812 // Firefox and IE remove everything after the first ':'.
813 size_t separatorPosition = s.find(':');
814 String newProtocol = s.substring(0, separatorPosition);
816 if (!isValidProtocol(newProtocol))
// Variant 1: prefix the whole current string with "<newProtocol>:".
820 parse(newProtocol + ':' + m_string);
// Variant 2: keep everything from m_schemeEnd on (which starts with the ':')
// and replace only the scheme text in front of it.
824 parse(newProtocol + m_string.substring(m_schemeEnd));
// Replaces the host component with |s| and re-parses the resulting string.
828 void URL::setHost(const String& s)
833 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
834 // and to avoid changing more than just the host.
// "//" is inserted when the user-info range starts directly after the
// character at m_schemeEnd, i.e. nothing currently separates scheme and authority.
836 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
// Keep everything before the host, splice in |s|, keep everything from m_hostEnd on.
838 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
// Removes the port component, if present, by re-parsing the string without
// the characters between m_hostEnd and m_portEnd.
841 void URL::removePort()
// Equal offsets mean there is no port text to remove.
843 if (m_hostEnd == m_portEnd)
845 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
// Sets the port to |i| and re-parses the resulting string.
848 void URL::setPort(unsigned short i)
// With no existing port text a ':' separator has to be added.
853 bool colonNeeded = m_portEnd == m_hostEnd;
// When a port already exists, keep the separator at m_hostEnd and replace
// only what follows it.
854 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
856 parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
// Replaces host and port together with |hostAndPort| and re-parses the
// resulting string.
859 void URL::setHostAndPort(const String& hostAndPort)
864 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
865 // and to avoid changing more than just host and port.
// "//" is inserted when the user-info range starts directly after the
// character at m_schemeEnd.
867 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
// Keep everything before the host, splice in |hostAndPort|, keep everything
// from m_portEnd on.
869 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
// Sets or clears the user-name component and re-parses the resulting string.
// A non-empty |user| is percent-encoded with PercentEncodeUsername first.
// NOTE(review): the declaration of |end| and the statements guarded by several of the
// conditions below are not visible in this view.
872 void URL::setUser(const String& user)
877 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
878 // and to avoid changing more than just the user login.
881 if (!user.isEmpty()) {
882 String u = encodeWithURLEscapeSequences(user, PercentEncodeUsername);
// True when the user-info range starts directly after the character at m_schemeEnd.
883 if (m_userStart == m_schemeEnd + 1)
885 // Add '@' if we didn't have one before.
886 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
// Replace the characters between m_userStart and |end| with the encoded name.
888 parse(m_string.left(m_userStart) + u + m_string.substring(end));
890 // Remove '@' if we now have neither user nor password.
891 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
893 // We don't want to parse in the extremely common case where we are not going to make a change.
894 if (m_userStart != end)
895 parse(m_string.left(m_userStart) + m_string.substring(end));
// Sets or clears the password component and re-parses the resulting string.
// A non-empty |password| is percent-encoded with PercentEncodePassword and
// wrapped as ":<password>@".
// NOTE(review): the statements guarded by several of the conditions below are not
// visible in this view.
899 void URL::setPass(const String& password)
904 int end = m_passwordEnd;
905 if (!password.isEmpty()) {
906 String p = ":" + encodeWithURLEscapeSequences(password, PercentEncodePassword) + "@";
// True when the user name ends directly after the character at m_schemeEnd.
907 if (m_userEnd == m_schemeEnd + 1)
909 // Eat the existing '@' since we are going to add our own.
910 if (end != m_hostEnd && m_string[end] == '@')
// Replace the characters between m_userEnd and |end| with the new password text.
912 parse(m_string.left(m_userEnd) + p + m_string.substring(end));
914 // Remove '@' if we now have neither user nor password.
915 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
917 // We don't want to parse in the extremely common case where we are not going to make a change.
918 if (m_userEnd != end)
919 parse(m_string.left(m_userEnd) + m_string.substring(end));
// Replaces the fragment with |s|: everything up to m_queryEnd is kept and
// "#" + |s| is appended before re-parsing.
923 void URL::setFragmentIdentifier(const String& s)
928 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
929 parse(m_string.left(m_queryEnd) + "#" + s);
// Drops the fragment by re-parsing only the part of the string that precedes
// m_queryEnd.
932 void URL::removeFragmentIdentifier()
936 parse(m_string.left(m_queryEnd));
// Replaces the query component with |query| and re-parses the resulting
// string. Everything from m_queryEnd on (the fragment, if any) is preserved.
939 void URL::setQuery(const String& query)
944 // FIXME: '#' and non-ASCII characters must be encoded and escaped.
945 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
946 // access to the document in this function.
// A non-null query that does not already start with '?' gets one prepended;
// an empty (but non-null) query therefore becomes a bare "?".
947 if ((query.isEmpty() || query[0] != '?') && !query.isNull())
948 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
// Otherwise |query| is spliced in unchanged.
950 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
// Replaces the path component and re-parses the resulting string. The new
// path is percent-encoded with encodeWithURLEscapeSequences.
// NOTE(review): the declaration of the local |path| and the statement guarded by the
// leading-slash check are not visible in this view.
954 void URL::setPath(const String& s)
959 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
960 // may be inadvertently affected.
// Detects a path that is empty or does not begin with '/'.
962 if (path.isEmpty() || path[0] != '/')
// Keep everything before the path and everything from m_pathEnd on.
965 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
968 String decodeURLEscapeSequences(const String& string)
970 return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
973 String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
975 return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
// Writes an escaped form of byte |c| through |buffer|, using placeByteAsHex
// for the hexadecimal digits. |buffer| is taken by reference so the caller's
// write position can be updated.
978 // Caution: This function does not bounds check.
979 static void appendEscapedChar(char*& buffer, unsigned char c)
982 placeByteAsHex(c, buffer);
// Copies |length| bytes starting at |strStart| to the output position
// |buffer|, escaping some of them via appendEscapedChar.
// NOTE(review): the declaration of |p|, the test that selects the copy-through path and
// the final write-back to |buffer| are not visible in this view.
985 static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
989 const char* str = strStart;
990 const char* strEnd = strStart + length;
991 while (str < strEnd) {
992 unsigned char c = *str++;
// '%' and '?' get their own branch.
994 if (c == '%' || c == '?')
// Tab (0x09), LF (0x0a) and CR (0x0d) are never passed to appendEscapedChar.
996 else if (c != 0x09 && c != 0x0a && c != 0x0d)
997 appendEscapedChar(p, c);
// Copies |length| bytes starting at |strStart| to the output position
// |buffer|. Tab, LF and CR get a dedicated branch, and control characters
// (< 0x20) as well as bytes >= 127 are percent-escaped.
// NOTE(review): the declaration of |p|, the copy-through branch and the final write-back
// to |buffer| are not visible in this view.
1005 static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
1009 const char* str = strStart;
1010 const char* strEnd = strStart + length;
1011 while (str < strEnd) {
1012 unsigned char c = *str++;
1013 // Strip CR, LF and Tab from fragments, per:
1014 // https://bugs.webkit.org/show_bug.cgi?id=8770
1015 if (c == 0x09 || c == 0x0a || c == 0x0d)
1018 // Chrome and IE allow non-ascii characters in fragments, however doing
1019 // so would hit an ASSERT in checkEncodedString, so for now we don't.
1020 if (c < 0x20 || c >= 127) {
1021 appendEscapedChar(p, c);
// Copies the path src[srcStart, srcEnd) into |dst| while resolving "." and
// ".." segments, and returns the number of characters written to |dst|.
// A non-empty source path must begin with '/' (asserted below).
// NOTE(review): the pointer-advancing statements inside the loop are not visible in this
// view.
1030 // copy a path, accounting for "." and ".." segments
1031 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
// Remembered so the written length can be computed at the end and so that
// rewinding never goes before the start of the output.
1033 char* bufferPathStart = dst;
1035 // empty path is a special case, and need not have a leading slash
1036 if (srcStart != srcEnd) {
1037 const char* baseStringStart = src + srcStart;
1038 const char* baseStringEnd = src + srcEnd;
1039 const char* baseStringPos = baseStringStart;
1041 // this code is unprepared for paths that do not begin with a
1042 // slash and we should always have one in the source string
1043 ASSERT(baseStringPos[0] == '/');
1045 // copy the leading slash into the destination
1046 *dst = *baseStringPos;
1050 while (baseStringPos < baseStringEnd) {
// A '.' only starts a dot-segment when the previously written character is '/'.
1051 if (baseStringPos[0] == '.' && dst[-1] == '/') {
1052 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
1053 // skip over "." segment
1056 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
1057 baseStringPos + 2 == baseStringEnd)) {
1058 // skip over ".." segment and rewind the last segment
1059 // the RFC leaves it up to the app to decide what to do with excess
1060 // ".." segments - we choose to drop them since some web content
// Rewinding is bounded by the start of the output buffer.
1063 if (dst > bufferPathStart + 1)
1065 while (dst > bufferPathStart && dst[-1] != '/')
// Ordinary character: copy it through.
1071 *dst = *baseStringPos;
1077 return dst - bufferPathStart;
// Scans the NUL-terminated string |str| for a '.' that directly follows a
// '/' or another '.', i.e. for the two-character sequences "/." or "..".
// The characters are read as unsigned char; |pc| holds the character at the
// start of the string before the loop begins.
// NOTE(review): the return statements and the update of |pc| are not visible in this
// view.
1080 static inline bool hasSlashDotOrDotDot(const char* str)
1082 const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
1085 unsigned char pc = *p;
// The loop stops at the terminating NUL.
1086 while (unsigned char c = *++p) {
1087 if (c == '.' && (pc == '/' || pc == '.'))
1094 void URL::parse(const String& string)
1096 checkEncodedString(string);
1098 CharBuffer buffer(string.length() + 1);
1099 copyASCII(string, buffer.data());
1100 buffer[string.length()] = '\0';
1101 parse(buffer.data(), &string);
// File-local switch read by the scheme comparison helpers and by
// URL::parse() when copying the scheme. Defaults to true.
1105 static bool shouldCanonicalizeScheme = true;
// Sets the shouldCanonicalizeScheme flag to |enableSchemeCanonicalization|.
1107 void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
1109 shouldCanonicalizeScheme = enableSchemeCanonicalization;
// Compares the characters of |a| against the fixed-size character array |b|.
// When scheme canonicalization is disabled, each character of |a| is
// lower-cased with toASCIILower before being compared.
// NOTE(review): the body of the second loop and the return statements are not visible
// in this view.
1113 template<size_t length>
1114 static inline bool equal(const char* a, const char (&b)[length])
1117 if (!shouldCanonicalizeScheme) {
1118 for (size_t i = 0; i < length; ++i) {
1119 if (toASCIILower(a[i]) != b[i])
1125 for (size_t i = 0; i < length; ++i) {
// Length-checked comparison: |stringA| (of |lengthA| characters) can only
// equal the fixed-size array |stringB| when the lengths agree; the character
// comparison itself is delegated to the two-argument equal().
template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    if (lengthA != lengthB)
        return false;
    return equal(stringA, stringB);
}
// Reports whether |port| (of |portLength| characters) is the default port
// for |scheme| (of |schemeLength| characters). Pairings visible below:
// ws -> http port, ftp -> ftp port, wss -> https port, http -> http port,
// https -> https port, gopher -> gopher port.
// NOTE(review): the case labels of the switch are not visible in this view.
1138 // List of default schemes is taken from google-url:
1139 // http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1140 static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1142 // This switch is theoretically a performance optimization. It came over when
1143 // the code was moved from google-url, but may be removed later.
1144 switch (schemeLength) {
1146 return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1148 if (equal(scheme, ftpScheme))
1149 return equal(port, portLength, ftpPort);
1150 if (equal(scheme, wssScheme))
1151 return equal(port, portLength, httpsPort);
1154 return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1156 return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1158 return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
// True when the user-info section was terminated by '@' but nothing follows
// it before the end of the port, i.e. host and port are both empty.
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar)
{
    if (userinfoEndChar != '@')
        return false;
    return hostStart == portEnd;
}
// Reports whether |scheme| (of |schemeLength| characters) is one of ws, ftp,
// wss, http, https or gopher. The switch on the length narrows down which
// comparisons are attempted.
// NOTE(review): the case labels of the switch are not visible in this view.
1168 static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1170 switch (schemeLength) {
1172 return equal(scheme, wsScheme);
1174 return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1176 return equal(scheme, httpScheme);
1178 return equal(scheme, httpsScheme);
1180 return equal(scheme, gopherScheme);
// Reports whether |scheme| (of |schemeLength| characters) is one of ws, ftp,
// wss, http, file, https or gopher. URL::parse() lower-cases the host name
// when this returns true.
// NOTE(review): the case labels of the switch are not visible in this view.
1185 static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1187 switch (schemeLength) {
1189 return equal(scheme, wsScheme);
1191 return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1193 return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1195 return equal(scheme, httpsScheme);
1197 return equal(scheme, gopherScheme);
// Parses the NUL-terminated string |url|, builds a canonicalized copy in a
// local buffer and stores it in m_string together with the component offsets
// (m_schemeEnd, m_userStart, m_userEnd, m_passwordEnd, m_hostEnd, m_portEnd,
// m_pathEnd, m_pathAfterLastSlash, m_queryEnd, m_fragmentEnd).
// |originalString|, when non-null, is stored unchanged on the early-exit
// paths below and is reused at the end when the canonical text equals it.
// The function works in two phases: first the component boundaries are
// located in |url|, then the output is assembled component by component.
// NOTE(review): many short statements (increments, early returns, braces) are not
// visible in this view; comments describe only what the visible lines show.
1202 void URL::parse(const char* url, const String* originalString)
1204 if (!url || url[0] == '\0') {
1205 // valid URL must be non-empty
1206 m_string = originalString ? *originalString : url;
1211 if (!isSchemeFirstChar(url[0])) {
1212 // scheme must start with an alphabetic character
1213 m_string = originalString ? *originalString : url;
// Scan over the scheme characters.
1219 while (isSchemeChar(url[schemeEnd]))
// The scheme has to be terminated by ':'.
1222 if (url[schemeEnd] != ':') {
1223 m_string = originalString ? *originalString : url;
1228 int userStart = schemeEnd + 1;
// "scheme:/" marks a hierarchical URL; "scheme://" additionally has the second slash.
1237 bool hierarchical = url[schemeEnd + 1] == '/';
1238 bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
// Case-insensitive check for the "file" scheme.
1240 bool isFile = schemeEnd == 4
1241 && isLetterMatchIgnoringCase(url[0], 'f')
1242 && isLetterMatchIgnoringCase(url[1], 'i')
1243 && isLetterMatchIgnoringCase(url[2], 'l')
1244 && isLetterMatchIgnoringCase(url[3], 'e');
// Case-insensitive check for "http:" or "https:".
1246 m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1247 && isLetterMatchIgnoringCase(url[1], 't')
1248 && isLetterMatchIgnoringCase(url[2], 't')
1249 && isLetterMatchIgnoringCase(url[3], 'p')
1250 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1252 if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1253 // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1254 // Attempt to find an authority.
1255 // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1259 if (hasSecondSlash) {
// For the non-file hierarchical schemes, slashes at userStart are scanned over.
1261 if (isNonFileHierarchicalScheme(url, schemeEnd)) {
1262 while (url[userStart] == '/')
1268 userEnd = userStart;
// Scan the user-info characters, noting the first ':' seen.
1271 while (isUserInfoChar(url[userEnd])) {
1272 if (url[userEnd] == ':' && colonPos == 0)
1277 if (url[userEnd] == '@') {
1278 // actual end of the userinfo, start on the host
1279 if (colonPos != 0) {
1280 passwordEnd = userEnd;
1282 passwordStart = colonPos + 1;
1284 passwordStart = passwordEnd = userEnd;
1286 hostStart = passwordEnd + 1;
1287 } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1288 // hit the end of the authority, must have been no user
1289 // or looks like an IPv6 hostname
1290 // either way, try to parse it as a hostname
1291 userEnd = userStart;
1292 passwordStart = passwordEnd = userEnd;
1293 hostStart = userStart;
1295 // invalid character
1296 m_string = originalString ? *originalString : url;
1301 hostEnd = hostStart;
// Bracketed (IPv6-style) host: scan IPv6 characters up to the closing ']'.
1304 if (url[hostEnd] == '[') {
1306 while (isIPv6Char(url[hostEnd]))
1308 if (url[hostEnd] == ']')
1311 // invalid character
1312 m_string = originalString ? *originalString : url;
// Otherwise scan ordinary host name characters.
1317 while (isHostnameChar(url[hostEnd]))
1321 if (url[hostEnd] == ':') {
1322 portStart = portEnd = hostEnd + 1;
1324 // possible start of port
1325 portEnd = portStart;
1326 while (isASCIIDigit(url[portEnd]))
1329 portStart = portEnd = hostEnd;
// The authority has to be followed by a path-segment-ending character.
1331 if (!isPathSegmentEndChar(url[portEnd])) {
1332 // invalid character
1333 m_string = originalString ? *originalString : url;
// Credentials followed by an empty host and port are treated like the other
// failure paths: the input string is stored as-is.
1338 if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
1339 m_string = originalString ? *originalString : url;
1344 if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1345 // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1346 // path segments are empty. For file, http and https only, an empty authority is allowed.
1348 userEnd = userStart;
1349 passwordStart = userEnd;
1350 passwordEnd = passwordStart;
1351 hostStart = passwordEnd;
1352 hostEnd = hostStart;
1353 portStart = hostEnd;
1357 // the part after the scheme must be an opaque_part or an abs_path
1358 userEnd = userStart;
1359 passwordStart = passwordEnd = userEnd;
1360 hostStart = hostEnd = passwordEnd;
1361 portStart = portEnd = hostEnd;
// The path runs up to the first '?' or '#', or to the end of the string.
1364 int pathStart = portEnd;
1365 int pathEnd = pathStart;
1366 while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
// The query, if any, runs up to the first '#', or to the end of the string.
1369 int queryStart = pathEnd;
1370 int queryEnd = queryStart;
1371 if (url[queryStart] == '?') {
1372 while (url[queryEnd] && url[queryEnd] != '#')
// The fragment, if any, runs to the end of the string.
1376 int fragmentStart = queryEnd;
1377 int fragmentEnd = fragmentStart;
1378 if (url[fragmentStart] == '#') {
1380 fragmentEnd = fragmentStart;
1381 while (url[fragmentEnd])
1385 // assemble it all, remembering the real ranges
// Output buffer sized at three bytes per input character plus one.
// NOTE(review): presumably sized for the case where every character is percent-escaped.
1387 Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1389 char *p = buffer.data();
1390 const char *strPtr = url;
1392 // copy in the scheme
1393 const char *schemeEndPtr = url + schemeEnd;
1395 if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) {
1396 while (strPtr < schemeEndPtr)
1397 *p++ = toASCIILower(*strPtr++);
1399 while (strPtr < schemeEndPtr)
1403 while (strPtr < schemeEndPtr)
1404 *p++ = toASCIILower(*strPtr++);
1406 m_schemeEnd = p - buffer.data();
// Case-insensitive check for an authority that is exactly "localhost".
1408 bool hostIsLocalHost = portEnd - userStart == 9
1409 && isLetterMatchIgnoringCase(url[userStart], 'l')
1410 && isLetterMatchIgnoringCase(url[userStart+1], 'o')
1411 && isLetterMatchIgnoringCase(url[userStart+2], 'c')
1412 && isLetterMatchIgnoringCase(url[userStart+3], 'a')
1413 && isLetterMatchIgnoringCase(url[userStart+4], 'l')
1414 && isLetterMatchIgnoringCase(url[userStart+5], 'h')
1415 && isLetterMatchIgnoringCase(url[userStart+6], 'o')
1416 && isLetterMatchIgnoringCase(url[userStart+7], 's')
1417 && isLetterMatchIgnoringCase(url[userStart+8], 't');
1419 // File URLs need a host part unless it is just file:// or file://localhost
1420 bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1422 // We drop empty credentials, but keep a colon in an empty host/port pair.
1423 // Removing hostname completely would change the structure of the URL on re-parsing.
1424 bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
1426 // add ":" after scheme
1429 // if we have at least one authority part or a file URL - add "//" and authority
1430 if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1434 m_userStart = p - buffer.data();
// Copy in the user name.
1437 strPtr = url + userStart;
1438 const char* userEndPtr = url + userEnd;
1439 while (strPtr < userEndPtr) {
1441 ASSERT(isUserInfoChar(c));
1444 m_userEnd = p - buffer.data();
1446 // copy in the password
1447 if (passwordEnd != passwordStart) {
1449 strPtr = url + passwordStart;
1450 const char* passwordEndPtr = url + passwordEnd;
1451 while (strPtr < passwordEndPtr) {
1453 ASSERT(isUserInfoChar(c));
1457 m_passwordEnd = p - buffer.data();
1459 // If we had any user info, add "@"
1460 if (p - buffer.data() != m_userStart)
1463 // copy in the host, except in the case of a file URL with authority="localhost"
1464 if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1465 strPtr = url + hostStart;
1466 const char* hostEndPtr = url + hostEnd;
// The host is lower-cased only for the schemes accepted by
// isCanonicalHostnameLowercaseForScheme.
1467 if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1468 while (strPtr < hostEndPtr) {
1469 char c = toASCIILower(*strPtr++);
1470 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1474 while (strPtr < hostEndPtr) {
1476 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1481 m_hostEnd = p - buffer.data();
1483 // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
1484 if (hostEnd != portStart) {
1485 const char* portStr = url + portStart;
1486 size_t portLength = portEnd - portStart;
1487 if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
1488 || (hostStart == hostEnd && hostEnd != portStart)) {
1490 const char* portEndPtr = url + portEnd;
1491 while (portStr < portEndPtr)
1495 m_portEnd = p - buffer.data();
// No authority is written: all authority offsets collapse onto the current
// output position.
1498 ASSERT(degenerateFilePath);
1502 m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1505 // For canonicalization, ensure we have a '/' for no path.
1506 // Do this only for URL with protocol file, http or https.
1507 if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1510 // add path, escaping bad characters
// Three strategies: non-hierarchical escaping, plain escaping when the input
// contains no "/." or ".." sequence, or dot-segment removal followed by escaping.
1512 escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1513 else if (!hasSlashDotOrDotDot(url))
1514 appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
1516 CharBuffer pathBuffer(pathEnd - pathStart + 1);
1517 size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1518 appendEscapingBadChars(p, pathBuffer.data(), length);
1521 m_pathEnd = p - buffer.data();
1523 // Find the position after the last slash in the path, or
1524 // the position before the path if there are no slashes in it.
1526 for (i = m_pathEnd; i > m_portEnd; --i) {
1527 if (buffer[i - 1] == '/')
1530 m_pathAfterLastSlash = i;
1532 // add query, escaping bad characters
1533 appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1534 m_queryEnd = p - buffer.data();
1536 // add fragment, escaping bad characters
1537 if (fragmentEnd != queryEnd) {
1539 escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1541 m_fragmentEnd = p - buffer.data();
// Sanity checks: the write position must not have passed the end of the buffer.
1543 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1544 ASSERT(buffer.size() > 0);
1546 // If we didn't end up actually changing the original string and
1547 // it was already in a String, reuse it to avoid extra allocation.
1548 if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
1549 m_string = *originalString;
1551 m_string = String(buffer.data(), m_fragmentEnd);
// Compares two URLs while ignoring their fragments: the query-end offsets
// are compared first, then the first m_queryEnd characters of both strings
// are compared one by one.
1556 bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
// Different offsets mean the pre-fragment parts differ in length.
1558 if (a.m_queryEnd != b.m_queryEnd)
1560 unsigned queryLength = a.m_queryEnd;
1561 for (unsigned i = 0; i < queryLength; ++i)
1562 if (a.string()[i] != b.string()[i])
// Compares scheme, host and port of two URLs. Lengths are compared first so
// that the character loops only run over ranges of equal size; characters
// are compared exactly (no case folding is applied here).
1567 bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
1569 if (a.m_schemeEnd != b.m_schemeEnd)
1572 int hostStartA = a.hostStart();
1573 int hostLengthA = a.hostEnd() - hostStartA;
1574 int hostStartB = b.hostStart();
1575 int hostLengthB = b.hostEnd() - b.hostStart();
1576 if (hostLengthA != hostLengthB)
// Scheme characters.
1580 for (int i = 0; i < a.m_schemeEnd; ++i)
1581 if (a.string()[i] != b.string()[i])
// Host characters; the hosts may start at different offsets in each string.
1585 for (int i = 0; i < hostLengthA; ++i)
1586 if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1589 if (a.port() != b.port())
// Converts |notEncodedString| to UTF-8 and escapes every byte whose entry in
// percentEncodeClassTable is >= |whatToEncode|.
// NOTE(review): the branch taken for bytes that are not escaped is not visible in this
// view.
1595 String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode)
1597 CString asUTF8 = notEncodedString.utf8();
// Three output bytes per input byte, plus one.
1599 CharBuffer buffer(asUTF8.length() * 3 + 1);
1600 char* p = buffer.data();
1602 const char* str = asUTF8.data();
1603 const char* strEnd = str + asUTF8.length();
1604 while (str < strEnd) {
1605 unsigned char c = *str++;
1606 if (percentEncodeClassTable[c] >= whatToEncode)
1607 appendEscapedChar(p, c);
1612 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1614 return String(buffer.data(), p - buffer.data());
// Converts |notEncodedString| to UTF-8 and returns a copy in which selected
// bytes are escaped via appendEscapedChar.
// NOTE(review): the condition that decides which bytes are escaped is not visible in this
// view.
1617 String encodeWithURLEscapeSequences(const String& notEncodedString)
1619 CString asUTF8 = notEncodedString.utf8();
// Three output bytes per input byte, plus one.
1621 CharBuffer buffer(asUTF8.length() * 3 + 1);
1622 char* p = buffer.data();
1624 const char* str = asUTF8.data();
1625 const char* strEnd = str + asUTF8.length();
1626 while (str < strEnd) {
1627 unsigned char c = *str++;
1629 appendEscapedChar(p, c);
1634 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1636 return String(buffer.data(), p - buffer.data());
1639 static bool containsOnlyASCII(StringView string)
1641 if (string.is8Bit())
1642 return charactersAreAllASCII(string.characters8(), string.length());
1643 return charactersAreAllASCII(string.characters16(), string.length());
// Reports whether |stringURL| starts with |protocol| followed by ':',
// matching letters with isLetterMatchIgnoringCase.
// NOTE(review): the test guarding the ':' check and the remaining return statements are
// not visible in this view.
1646 static bool protocolIs(StringView stringURL, const char* protocol)
1648 assertProtocolIsGood(protocol);
1649 unsigned length = stringURL.length();
1650 for (unsigned i = 0; i < length; ++i) {
1652 return stringURL[i] == ':';
1653 if (!isLetterMatchIgnoringCase(stringURL[i], protocol[i]))
1659 // Appends the punycoded hostname identified by the given string and length to
1660 // the output buffer. The result will not be null terminated.
1661 static void appendEncodedHostname(UCharBuffer& buffer, StringView string)
1663 // Needs to be big enough to hold an IDN-encoded name.
1664 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
1665 const unsigned hostnameBufferLength = 2048;
1667 if (string.length() > hostnameBufferLength || containsOnlyASCII(string)) {
1668 append(buffer, string);
1672 UChar hostnameBuffer[hostnameBufferLength];
1673 UErrorCode error = U_ZERO_ERROR;
1674 int32_t numCharactersConverted = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer,
1675 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
1676 if (error == U_ZERO_ERROR)
1677 buffer.append(hostnameBuffer, numCharactersConverted);
// Collects the [start, end) offsets of all host names found in the mailto:
// URL |string| and appends them to |nameRanges|.
// NOTE(review): the loop headers, the declaration of |p| and the dispatch on the found
// character are not visible in this view.
1680 static void findHostnamesInMailToURL(StringView string, Vector<std::pair<int, int>>& nameRanges)
1682 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
1683 // Skip quoted strings so that characters in them don't confuse us.
1684 // When we find a '?' character, we are past the part of the URL that contains host names.
1690 // Find start of host name or of quoted string.
1691 int hostnameOrStringStart = findFirstOf(string, p, "\"@?");
1692 if (hostnameOrStringStart == -1)
1694 UChar c = string[hostnameOrStringStart];
1695 p = hostnameOrStringStart + 1;
1701 // Find end of host name.
1702 int hostnameStart = p;
1703 int hostnameEnd = findFirstOf(string, p, ">,?");
// No terminator found: the host name runs to the end of the string.
1705 if (hostnameEnd == -1) {
1706 hostnameEnd = string.length();
1713 nameRanges.append(std::make_pair(hostnameStart, hostnameEnd));
1718 // Skip quoted string.
// Look for the closing quote or a backslash escape.
1721 int escapedCharacterOrStringEnd = findFirstOf(string, p, "\"\\");
1722 if (escapedCharacterOrStringEnd == -1)
1725 c = string[escapedCharacterOrStringEnd];
1726 p = escapedCharacterOrStringEnd + 1;
1728 // If we are at the end of the string, then break from the string loop back to the host name loop.
1732 // Skip escaped character.
1734 if (p == static_cast<int>(string.length()))
// Locates the host name inside a hierarchical URL and reports its range
// through |startOffset| (inclusive) and |endOffset| (exclusive).
// NOTE(review): the return statements, the declaration of |hostnameStart| and the body
// of the terminator branch are not visible in this view.
1743 static bool findHostnameInHierarchicalURL(StringView string, int& startOffset, int& endOffset)
1745 // Find the host name in a hierarchical URL.
1746 // It comes after a "://" sequence, with scheme characters preceding, and
1747 // this should be the first colon in the string.
1748 // It ends with the end of the string or a ":" or a path segment ending character.
1749 // If there is a "@" character, the host part is just the part after the "@".
1750 int separator = findFirstOf(string, 0, ":");
// The first ':' has to exist and be followed by "//" plus at least one more character.
1751 if (separator == -1 || separator + 2 >= static_cast<int>(string.length()) || string[separator + 1] != '/' || string[separator + 2] != '/')
1754 // Check that all characters before the :// are valid scheme characters.
1755 if (!isSchemeFirstChar(string[0]))
1757 for (int i = 1; i < separator; ++i) {
1758 if (!isSchemeChar(string[i]))
1762 // Start after the separator.
1763 int authorityStart = separator + 3;
1765 // Find terminating character.
1766 int hostnameEnd = string.length();
1767 for (int i = authorityStart; i < hostnameEnd; ++i) {
1768 UChar c = string[i];
// NUL is excluded explicitly even though isPathSegmentEndChar is consulted.
1769 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
1775 // Find "@" for the start of the host name.
1776 int userInfoTerminator = findFirstOf(string, authorityStart, "@");
// An '@' located after the end of the host name does not delimit user info.
1778 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
1779 hostnameStart = authorityStart;
1781 hostnameStart = userInfoTerminator + 1;
1783 startOffset = hostnameStart;
1784 endOffset = hostnameEnd;
// mailto: URLs may contain several host names, which are located with
// findHostnamesInMailToURL; other URLs are handled through
// findHostnameInHierarchicalURL. Text outside the host names is copied to
// |buffer| unchanged.
// NOTE(review): the declaration and updates of |p| and the else keywords are not visible
// in this view.
1788 // Converts all hostnames found in the given input to punycode, preserving the
1789 // rest of the URL unchanged. The output will NOT be null-terminated.
1790 static void encodeHostnames(StringView string, UCharBuffer& buffer)
1794 if (protocolIs(string, "mailto")) {
1795 Vector<std::pair<int, int>> hostnameRanges;
1796 findHostnamesInMailToURL(string, hostnameRanges);
1797 int n = hostnameRanges.size();
1799 for (int i = 0; i < n; ++i) {
1800 const std::pair<int, int>& r = hostnameRanges[i];
// Copy the text before this host name, then the encoded host name itself.
1801 append(buffer, string.substring(p, r.first - p));
1802 appendEncodedHostname(buffer, string.substring(r.first, r.second - r.first));
1805 // This will copy either everything after the last hostname, or the
1806 // whole thing if there is no hostname.
1807 append(buffer, string.substring(p));
1809 int hostStart, hostEnd;
1810 if (findHostnameInHierarchicalURL(string, hostStart, hostEnd)) {
1811 append(buffer, string.substring(0, hostStart)); // Before hostname.
1812 appendEncodedHostname(buffer, string.substring(hostStart, hostEnd - hostStart));
1813 append(buffer, string.substring(hostEnd)); // After hostname.
1815 // No hostname to encode, return the input.
1816 append(buffer, string);
// Encodes the relative URL |rel| into the byte buffer |output|, which is
// NUL-terminated at the end. Host names are converted with encodeHostnames
// first. The part before the first '#' or '?' is encoded as UTF-8; the rest
// is encoded with |encoding| when that is a valid non-UTF-8 encoding and the
// URL is not a mailto:, data: or javascript: URL.
// NOTE(review): the declarations of |s| and |pathEnd| are not visible in this view.
1821 static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
1824 encodeHostnames(rel, s);
1826 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
1829 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
1830 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
1831 pathEnd = findFirstOf(StringView(s.data(), s.size()), 0, "#?");
// No split point: the whole string is encoded as UTF-8.
1834 if (pathEnd == -1) {
1835 CString decoded = pathEncoding.encode(StringView(s.data(), s.size()), URLEncodedEntitiesForUnencodables);
1836 output.resize(decoded.length());
1837 memcpy(output.data(), decoded.data(), decoded.length());
// Split: UTF-8 for the path, |encoding| for everything from pathEnd on.
1839 CString pathDecoded = pathEncoding.encode(StringView(s.data(), pathEnd), URLEncodedEntitiesForUnencodables);
1840 // Unencodable characters in URLs are represented by converting
1841 // them to XML entities and escaping non-alphanumeric characters.
1842 CString otherDecoded = encoding.encode(StringView(s.data() + pathEnd, s.size() - pathEnd), URLEncodedEntitiesForUnencodables);
1844 output.resize(pathDecoded.length() + otherDecoded.length());
1845 memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
1846 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
1848 output.append('\0'); // null-terminate the output.
// Returns a copy of |string| in which every backslash before the path end is
// replaced by '/'. The path end is the earlier of the first '?' and the first
// '#', or the end of the string when neither is present; everything from the
// path end on is left unchanged.
// NOTE(review): the declaration of |pathEnd| and the assignment in the first branch are
// not visible in this view.
1851 static String substituteBackslashes(const String& string)
1853 size_t questionPos = string.find('?');
1854 size_t hashPos = string.find('#');
// '#' comes first (or there is no '?').
1857 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
1859 else if (questionPos != notFound)
1860 pathEnd = questionPos;
1862 pathEnd = string.length();
1864 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
// Reports whether the character following the scheme's ':' is '/'.
1867 bool URL::isHierarchical() const
// m_schemeEnd is expected to index the ':' that terminates the scheme.
1871 ASSERT(m_string[m_schemeEnd] == ':');
1872 return m_string[m_schemeEnd + 1] == '/';
1875 void URL::copyToBuffer(Vector<char, 512>& buffer) const
1877 // FIXME: This throws away the high bytes of all the characters in the string!
1878 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
1879 buffer.resize(m_string.length());
1880 copyASCII(m_string, buffer.data());
// Reports whether |url| starts with |protocol| followed by ':', matching
// letters with isLetterMatchIgnoringCase.
// NOTE(review): the test guarding the ':' check and the mismatch return are not visible
// in this view; the loop has no bound of its own and exits through them.
1883 bool protocolIs(const String& url, const char* protocol)
1885 // Do the comparison without making a new string object.
1886 assertProtocolIsGood(protocol);
1887 for (int i = 0; ; ++i) {
1889 return url[i] == ':';
1890 if (!isLetterMatchIgnoringCase(url[i], protocol[i]))
// Validates |protocol| against the scheme grammar quoted below: the first
// character is checked with isSchemeFirstChar and every following character
// with isSchemeChar. An empty string gets its own early branch.
1895 bool isValidProtocol(const String& protocol)
1897 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
1898 if (protocol.isEmpty())
1900 if (!isSchemeFirstChar(protocol[0]))
1902 unsigned protocolLength = protocol.length();
1903 for (unsigned i = 1; i < protocolLength; i++) {
1904 if (!isSchemeChar(protocol[i]))
1911 void URL::print() const
1913 printf("%s\n", m_string.utf8().data());
1917 String URL::strippedForUseAsReferrer() const
1919 URL referrer(*this);
1920 referrer.setUser(String());
1921 referrer.setPass(String());
1922 referrer.removeFragmentIdentifier();
1923 return referrer.string();
1926 bool URL::isLocalFile() const
1928 // Including feed here might be a bad idea since drag and drop uses this check
1929 // and including feed would allow feeds to potentially let someone's blog
1930 // read the contents of the clipboard on a drag, even without a drop.
1931 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
1932 return protocolIs("file");
1935 bool protocolIsJavaScript(const String& url)
1937 return protocolIs(url, "javascript");
1940 bool protocolIsInHTTPFamily(const String& url)
1942 // Do the comparison without making a new string object.
1943 return isLetterMatchIgnoringCase(url[0], 'h')
1944 && isLetterMatchIgnoringCase(url[1], 't')
1945 && isLetterMatchIgnoringCase(url[2], 't')
1946 && isLetterMatchIgnoringCase(url[3], 'p')
1947 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
// Returns a reference to a function-local static URL constructed from the
// already-parsed string "about:blank".
1950 const URL& blankURL()
1952 DEPRECATED_DEFINE_STATIC_LOCAL(URL, staticBlankURL, (ParsedURLString, "about:blank"));
1953 return staticBlankURL;
1956 bool URL::isBlankURL() const
1958 return protocolIs("about");
// Reports whether |port| is the default port registered for |protocol|.
// Known protocols: http (80), https (443), ftp (21), ftps (990). Lookups use
// CaseFoldingHash, so the protocol name is matched case-insensitively.
// NOTE(review): the table is filled lazily on first use with no visible
// synchronization; confirm callers are single-threaded.
// NOTE(review): if get() yields 0 for a protocol that is not in the table, port 0
// would compare equal for unknown protocols; confirm against HashMap::get.
1961 bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
1963 if (protocol.isEmpty())
1966 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
1967 DEPRECATED_DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
// First call: populate the table.
1968 if (defaultPorts.isEmpty()) {
1969 defaultPorts.set("http", 80);
1970 defaultPorts.set("https", 443);
1971 defaultPorts.set("ftp", 21);
1972 defaultPorts.set("ftps", 990);
1974 return defaultPorts.get(protocol) == port;
// Decides whether the port of |url| may be used, based on a sorted list of
// blocked ports that is searched with std::binary_search. Ports 21 and 22 on
// ftp URLs, and any port on file URLs, get dedicated exception branches.
// NOTE(review): most entries of the blocked list, the "no port" early exit and the
// return statements are not visible in this view.
1977 bool portAllowed(const URL& url)
1979 unsigned short port = url.port();
1981 // Since most URLs don't have a port, return early for the "no port" case.
1985 // This blocked port list matches the port blocking that Mozilla implements.
1986 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
1987 static const unsigned short blockedPortList[] = {
2021 135, // loc-srv / epmap
2027 512, // print / exec
2044 3659, // apple-sasl / PasswordServer [Apple addition]
2047 6665, // Alternate IRC [Apple addition]
2048 6666, // Alternate IRC [Apple addition]
2049 6667, // Standard IRC [Apple addition]
2050 6668, // Alternate IRC [Apple addition]
2051 6669, // Alternate IRC [Apple addition]
2052 invalidPortNumber, // Used to block all invalid port numbers
2054 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
2057 // The port list must be sorted for binary_search to work.
// One-time check that the list is strictly increasing.
2058 static bool checkedPortList = false;
2059 if (!checkedPortList) {
2060 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
2061 ASSERT(*p < *(p + 1));
2062 checkedPortList = true;
2066 // If the port is not in the blocked port list, allow it.
2067 if (!std::binary_search(blockedPortList, blockedPortListEnd, port))
2070 // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
2071 if ((port == 21 || port == 22) && url.protocolIs("ftp"))
2074 // Allow any port number in a file URL, since the port number is ignored.
2075 if (url.protocolIs("file"))
// Extracts the MIME type portion of a "data:" URL string. The type is the
// text between offset 5 (the length of "data:") and the first delimiter
// found, returned in lower case; "text/plain" is returned when the URL names
// no MIME type. The caller must pass a data URL (asserted below).
// NOTE(review): one line inside the `if` block and the code after it are not
// visible in this excerpt, so the exact condition that selects between the
// two returns below cannot be stated here.
2081 String mimeTypeFromDataURL(const String& url)
2083 ASSERT(protocolIs(url, "data"));
// The delimiter is the first ';' anywhere in the string; ',' is consulted
// only when there is no ';' at all.
// NOTE(review): because the ';' search is not limited to the part before the
// first ',', a ';' inside the data payload would be picked up - confirm
// whether that is intended.
2084 size_t index = url.find(';');
2085 if (index == notFound)
2086 index = url.find(',');
2087 if (index != notFound) {
2089 return url.substring(5, index - 5).lower();
2090 return "text/plain"; // Data URLs with no MIME type are considered text/plain.
2095 String mimeTypeFromURL(const URL& url)
2097 String decodedPath = decodeURLEscapeSequences(url.path());
2098 String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
2100 // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
2101 return MIMETypeRegistry::getMIMETypeForExtension(extension);
2104 bool URL::isSafeToSendToAnotherThread() const
2106 return m_string.isSafeToSendToAnotherThread();
// Produces a shortened form of the URL string in which the middle is replaced
// by "...": the result is the leading (length / 2 - 1) characters, the
// literal "...", and the trailing (length / 2 - 2) characters.
2109 String URL::stringCenterEllipsizedToLength(unsigned length) const
// Strings that already fit within `length` are special-cased first.
// NOTE(review): the statement controlled by this guard is not visible in
// this excerpt - presumably it returns string() unchanged; confirm.
2111 if (string().length() <= length)
// NOTE(review): `length` is unsigned, so (length / 2 - 1) wraps around for
// length < 2 and (length / 2 - 2) wraps around for length < 4 - confirm that
// callers never pass such small values.
2114 return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
2117 URL URL::fakeURLWithRelativePart(const String& relativePart)
2119 return URL(URL(), "webkit-fake-url://" + createCanonicalUUIDString() + '/' + relativePart);
2122 URL URL::fileURLWithFileSystemPath(const String& filePath)
2124 return URL(URL(), "file:///" + filePath);