URLParser should fail when parsing invalid relative URLs with no schemes
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include <array>
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
33
34 namespace WebCore {
35
36 template<typename CharacterType>
37 class CodePointIterator {
38 public:
39     CodePointIterator() { }
40     CodePointIterator(const CharacterType* begin, const CharacterType* end)
41         : m_begin(begin)
42         , m_end(end)
43     {
44     }
45     
46     CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
47         : CodePointIterator(begin.m_begin, end.m_begin)
48     {
49         ASSERT(end.m_begin >= begin.m_begin);
50     }
51     
52     UChar32 operator*() const;
53     CodePointIterator& operator++();
54
55     bool operator==(const CodePointIterator& other) const
56     {
57         return m_begin == other.m_begin
58             && m_end == other.m_end;
59     }
60     bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
61     
62     CodePointIterator& operator=(const CodePointIterator& other)
63     {
64         m_begin = other.m_begin;
65         m_end = other.m_end;
66         return *this;
67     }
68
69     bool atEnd() const
70     {
71         ASSERT(m_begin <= m_end);
72         return m_begin >= m_end;
73     }
74     
75 private:
76     const CharacterType* m_begin { nullptr };
77     const CharacterType* m_end { nullptr };
78 };
79
80 template<>
81 UChar32 CodePointIterator<LChar>::operator*() const
82 {
83     ASSERT(!atEnd());
84     return *m_begin;
85 }
86
87 template<>
88 auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
89 {
90     ASSERT(!atEnd());
91     m_begin++;
92     return *this;
93 }
94
95 template<>
96 UChar32 CodePointIterator<UChar>::operator*() const
97 {
98     ASSERT(!atEnd());
99     UChar32 c;
100     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
101     return c;
102 }
103
104 template<>
105 auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
106 {
107     ASSERT(!atEnd());
108     unsigned i = 0;
109     size_t length = m_end - m_begin;
110     U16_FWD_1(m_begin, i, length);
111     m_begin += i;
112     return *this;
113 }
114     
115 static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
116 {
117     if (U_IS_BMP(codePoint)) {
118         destination.append(static_cast<UChar>(codePoint));
119         return;
120     }
121     destination.reserveCapacity(destination.size() + 2);
122     destination.uncheckedAppend(U16_LEAD(codePoint));
123     destination.uncheckedAppend(U16_TRAIL(codePoint));
124 }
125
// Bit flags stored in characterClassTable. Each flag names one character set
// that the classification helpers in this file test for.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    Scheme = 1 << 5,
};

// One entry per byte value (0x00 - 0xFF); each entry is the OR of the
// URLCharacterClass flags that apply to that byte. Runs of identical entries
// are written several to a line, with the covered range in the trailing comment.
static const uint8_t characterClassTable[] = {
    // 0x00 - 0x1F
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1 - 0x2
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x3 - 0x4
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x5 - 0x6
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x7 - 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0xB - 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0xE - 0xF
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x10 - 0x11
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x12 - 0x13
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x14 - 0x15
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x16 - 0x17
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x18 - 0x19
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1A - 0x1B
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1C - 0x1D
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1E - 0x1F

    // 0x20 - 0x2F
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, // '&'
    0, // '''
    0, // '('
    0, // ')'
    0, // '*'
    Scheme, // '+'
    0, // ','
    Scheme, // '-'
    Scheme, // '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'

    // Digits
    Scheme, Scheme, Scheme, Scheme, Scheme, // '0' - '4'
    Scheme, Scheme, Scheme, Scheme, Scheme, // '5' - '9'

    // 0x3A - 0x40
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'

    // Uppercase letters
    Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, // 'A' - 'H'
    Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, // 'I' - 'P'
    Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, // 'Q' - 'X'
    Scheme, Scheme, // 'Y' - 'Z'

    // 0x5B - 0x60
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'

    // Lowercase letters
    Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, // 'a' - 'h'
    Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, // 'i' - 'p'
    Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, Scheme, // 'q' - 'x'
    Scheme, Scheme, // 'y' - 'z'

    // 0x7B - 0x7F
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F

    // 0x80 - 0xFF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x80 - 0x87
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x88 - 0x8F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x90 - 0x97
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x98 - 0x9F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA0 - 0xA7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA8 - 0xAF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB0 - 0xB7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB8 - 0xBF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC0 - 0xC7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC8 - 0xCF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD0 - 0xD7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD8 - 0xDF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE0 - 0xE7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE8 - 0xEF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF0 - 0xF7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF8 - 0xFF
};
static_assert(sizeof(characterClassTable) == 256, "characterClassTable needs exactly one entry per byte value");
393
// True for code units 0x00 - 0x1F.
template<typename CharacterType>
inline static bool isC0Control(CharacterType character)
{
    return character < 0x20;
}

// True for code units 0x00 - 0x20, i.e. the C0 range plus U+0020 SPACE.
template<typename CharacterType>
inline static bool isC0ControlOrSpace(CharacterType character)
{
    return isC0Control(character) || character == 0x20;
}

// True only for 0x9, 0xA and 0xD.
template<typename CharacterType>
inline static bool isTabOrNewline(CharacterType character)
{
    return character == 0x9 || character == 0xA || character == 0xD;
}

// True for C0 controls and for everything above 0x7E.
template<typename CharacterType>
inline static bool isInSimpleEncodeSet(CharacterType character)
{
    return isC0Control(character) || character > 0x7E;
}
398 template<typename CharacterType> inline static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
399 template<typename CharacterType> inline static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
400 template<typename CharacterType> inline static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
401 template<typename CharacterType> inline static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
402 template<typename CharacterType> inline static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
403 template<typename CharacterType> inline static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & Scheme; }
404 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
405
406 template<bool serialized, typename CharacterType>
407 void incrementIteratorSkippingTabAndNewLine(CodePointIterator<CharacterType>& iterator)
408 {
409     ++iterator;
410     while (!serialized && !iterator.atEnd() && isTabOrNewline(*iterator))
411         ++iterator;
412 }
413
414 template<bool serialized, typename CharacterType>
415 inline static bool isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
416 {
417     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
418         return false;
419     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
420     if (iterator.atEnd())
421         return false;
422     return *iterator == ':' || *iterator == '|';
423 }
424
425 inline static bool isWindowsDriveLetter(const Vector<LChar>& buffer, size_t index)
426 {
427     if (buffer.size() < index + 2)
428         return false;
429     return isASCIIAlpha(buffer[index]) && (buffer[index + 1] == ':' || buffer[index + 1] == '|');
430 }
431
432 template<bool serialized, typename CharacterType>
433 inline static void checkWindowsDriveLetter(CodePointIterator<CharacterType>& iterator, Vector<LChar>& asciiBuffer)
434 {
435     if (isWindowsDriveLetter<serialized>(iterator)) {
436         asciiBuffer.reserveCapacity(asciiBuffer.size() + 2);
437         asciiBuffer.uncheckedAppend(*iterator);
438         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
439         ASSERT(!iterator.atEnd());
440         ASSERT(*iterator == ':' || *iterator == '|');
441         asciiBuffer.uncheckedAppend(':');
442         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
443     }
444 }
445
446 template<bool serialized, typename CharacterType>
447 inline static bool shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
448 {
449     if (!isWindowsDriveLetter<serialized>(iterator))
450         return true;
451     if (iterator.atEnd())
452         return false;
453     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
454     if (iterator.atEnd())
455         return true;
456     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
457     if (iterator.atEnd())
458         return true;
459     return !isSlashQuestionOrHash(*iterator);
460 }
461
462 inline static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
463 {
464     buffer.append('%');
465     buffer.append(upperNibbleToASCIIHexDigit(byte));
466     buffer.append(lowerNibbleToASCIIHexDigit(byte));
467 }
468
// Text appended in place of a code point that fails U_IS_UNICODE_CHAR: the
// bytes EF BF BD, percent-encoded. The length is the character count of the
// string below and must be kept in sync with it.
const size_t replacementCharacterUTF8PercentEncodedLength = 9;
const char* replacementCharacterUTF8PercentEncoded = "%EF%BF%BD";
471
472 template<bool serialized>
473 inline static void utf8PercentEncode(UChar32 codePoint, Vector<LChar>& destination, bool(*isInCodeSet)(UChar32))
474 {
475     if (serialized) {
476         ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
477         ASSERT_WITH_SECURITY_IMPLICATION(!isInCodeSet(codePoint));
478         destination.append(codePoint);
479     } else {
480         if (isASCII(codePoint)) {
481             if (isInCodeSet(codePoint))
482                 percentEncodeByte(codePoint, destination);
483             else
484                 destination.append(codePoint);
485             return;
486         }
487         ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
488         
489         if (!U_IS_UNICODE_CHAR(codePoint)) {
490             destination.append(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
491             return;
492         }
493         
494         uint8_t buffer[U8_MAX_LENGTH];
495         int32_t offset = 0;
496         U8_APPEND_UNSAFE(buffer, offset, codePoint);
497         for (int32_t i = 0; i < offset; ++i)
498             percentEncodeByte(buffer[i], destination);
499     }
500 }
501
502 template<bool serialized>
503 inline static void utf8QueryEncode(UChar32 codePoint, Vector<LChar>& destination)
504 {
505     if (serialized) {
506         ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
507         ASSERT_WITH_SECURITY_IMPLICATION(!shouldPercentEncodeQueryByte(codePoint));
508         destination.append(codePoint);
509     } else {
510         if (isASCII(codePoint)) {
511             if (shouldPercentEncodeQueryByte(codePoint))
512                 percentEncodeByte(codePoint, destination);
513             else
514                 destination.append(codePoint);
515             return;
516         }
517         
518         if (!U_IS_UNICODE_CHAR(codePoint)) {
519             destination.append(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
520             return;
521         }
522
523         uint8_t buffer[U8_MAX_LENGTH];
524         int32_t offset = 0;
525         U8_APPEND_UNSAFE(buffer, offset, codePoint);
526         for (int32_t i = 0; i < offset; ++i) {
527             auto byte = buffer[i];
528             if (shouldPercentEncodeQueryByte(byte))
529                 percentEncodeByte(byte, destination);
530             else
531                 destination.append(byte);
532         }
533     }
534 }
535     
536 inline static void encodeQuery(const Vector<UChar>& source, Vector<LChar>& destination, const TextEncoding& encoding)
537 {
538     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
539     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
540     const char* data = encoded.data();
541     size_t length = encoded.length();
542     for (size_t i = 0; i < length; ++i) {
543         uint8_t byte = data[i];
544         if (shouldPercentEncodeQueryByte(byte))
545             percentEncodeByte(byte, destination);
546         else
547             destination.append(byte);
548     }
549 }
550
551 inline static bool isDefaultPort(StringView scheme, uint16_t port)
552 {
553     static const uint16_t ftpPort = 21;
554     static const uint16_t gopherPort = 70;
555     static const uint16_t httpPort = 80;
556     static const uint16_t httpsPort = 443;
557     static const uint16_t wsPort = 80;
558     static const uint16_t wssPort = 443;
559     
560     auto length = scheme.length();
561     if (!length)
562         return false;
563     switch (scheme[0]) {
564     case 'w':
565         switch (length) {
566         case 2:
567             return scheme[1] == 's'
568                 && port == wsPort;
569         case 3:
570             return scheme[1] == 's'
571                 && scheme[2] == 's'
572                 && port == wssPort;
573         default:
574             return false;
575         }
576     case 'h':
577         switch (length) {
578         case 4:
579             return scheme[1] == 't'
580                 && scheme[2] == 't'
581                 && scheme[3] == 'p'
582                 && port == httpPort;
583         case 5:
584             return scheme[1] == 't'
585                 && scheme[2] == 't'
586                 && scheme[3] == 'p'
587                 && scheme[4] == 's'
588                 && port == httpsPort;
589         default:
590             return false;
591         }
592     case 'g':
593         return length == 6
594             && scheme[1] == 'o'
595             && scheme[2] == 'p'
596             && scheme[3] == 'h'
597             && scheme[4] == 'e'
598             && scheme[5] == 'r'
599             && port == gopherPort;
600     case 'f':
601         return length == 3
602             && scheme[1] == 't'
603             && scheme[2] == 'p'
604             && port == ftpPort;
605         return false;
606     default:
607         return false;
608     }
609 }
610
611 inline static bool isSpecialScheme(StringView scheme)
612 {
613     auto length = scheme.length();
614     if (!length)
615         return false;
616     switch (scheme[0]) {
617     case 'f':
618         switch (length) {
619         case 3:
620             return scheme[1] == 't'
621                 && scheme[2] == 'p';
622         case 4:
623             return scheme[1] == 'i'
624                 && scheme[2] == 'l'
625                 && scheme[3] == 'e';
626         default:
627             return false;
628         }
629     case 'g':
630         return length == 6
631             && scheme[1] == 'o'
632             && scheme[2] == 'p'
633             && scheme[3] == 'h'
634             && scheme[4] == 'e'
635             && scheme[5] == 'r';
636     case 'h':
637         switch (length) {
638         case 4:
639             return scheme[1] == 't'
640                 && scheme[2] == 't'
641                 && scheme[3] == 'p';
642         case 5:
643             return scheme[1] == 't'
644                 && scheme[2] == 't'
645                 && scheme[3] == 'p'
646                 && scheme[4] == 's';
647         default:
648             return false;
649         }
650     case 'w':
651         switch (length) {
652         case 2:
653             return scheme[1] == 's';
654         case 3:
655             return scheme[1] == 's'
656                 && scheme[2] == 's';
657         default:
658             return false;
659         }
660     default:
661         return false;
662     }
663 }
664
665 enum class URLParser::URLPart {
666     SchemeEnd,
667     UserStart,
668     UserEnd,
669     PasswordEnd,
670     HostEnd,
671     PortEnd,
672     PathAfterLastSlash,
673     PathEnd,
674     QueryEnd,
675     FragmentEnd,
676 };
677
678 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
679 {
680     switch (part) {
681     case URLPart::FragmentEnd:
682         return url.m_fragmentEnd;
683     case URLPart::QueryEnd:
684         return url.m_queryEnd;
685     case URLPart::PathEnd:
686         return url.m_pathEnd;
687     case URLPart::PathAfterLastSlash:
688         return url.m_pathAfterLastSlash;
689     case URLPart::PortEnd:
690         return url.m_portEnd;
691     case URLPart::HostEnd:
692         return url.m_hostEnd;
693     case URLPart::PasswordEnd:
694         return url.m_passwordEnd;
695     case URLPart::UserEnd:
696         return url.m_userEnd;
697     case URLPart::UserStart:
698         return url.m_userStart;
699     case URLPart::SchemeEnd:
700         return url.m_schemeEnd;
701     }
702     ASSERT_NOT_REACHED();
703     return 0;
704 }
705
706 inline static void copyASCIIStringUntil(Vector<LChar>& destination, const String& string, size_t lengthIf8Bit, size_t lengthIf16Bit)
707 {
708     if (string.isNull()) {
709         ASSERT(!lengthIf8Bit);
710         ASSERT(!lengthIf16Bit);
711         return;
712     }
713     ASSERT(destination.isEmpty());
714     if (string.is8Bit()) {
715         RELEASE_ASSERT(lengthIf8Bit <= string.length());
716         destination.append(string.characters8(), lengthIf8Bit);
717     } else {
718         RELEASE_ASSERT(lengthIf16Bit <= string.length());
719         destination.reserveCapacity(lengthIf16Bit);
720         const UChar* characters = string.characters16();
721         for (size_t i = 0; i < lengthIf16Bit; ++i) {
722             UChar c = characters[i];
723             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
724             destination.uncheckedAppend(c);
725         }
726     }
727 }
728
729 void URLParser::copyURLPartsUntil(const URL& base, URLPart part)
730 {
731     m_asciiBuffer.clear();
732     m_unicodeFragmentBuffer.clear();
733     if (part == URLPart::FragmentEnd) {
734         copyASCIIStringUntil(m_asciiBuffer, base.m_string, urlLengthUntilPart(base, URLPart::FragmentEnd), urlLengthUntilPart(base, URLPart::QueryEnd));
735         if (!base.m_string.is8Bit()) {
736             const String& fragment = base.m_string;
737             bool seenUnicode = false;
738             for (size_t i = base.m_queryEnd; i < base.m_fragmentEnd; ++i) {
739                 if (!seenUnicode && !isASCII(fragment[i]))
740                     seenUnicode = true;
741                 if (seenUnicode)
742                     m_unicodeFragmentBuffer.uncheckedAppend(fragment[i]);
743                 else
744                     m_asciiBuffer.uncheckedAppend(fragment[i]);
745             }
746         }
747     } else {
748         size_t length = urlLengthUntilPart(base, part);
749         copyASCIIStringUntil(m_asciiBuffer, base.m_string, length, length);
750     }
751     switch (part) {
752     case URLPart::FragmentEnd:
753         m_url.m_fragmentEnd = base.m_fragmentEnd;
754         FALLTHROUGH;
755     case URLPart::QueryEnd:
756         m_url.m_queryEnd = base.m_queryEnd;
757         FALLTHROUGH;
758     case URLPart::PathEnd:
759         m_url.m_pathEnd = base.m_pathEnd;
760         FALLTHROUGH;
761     case URLPart::PathAfterLastSlash:
762         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
763         FALLTHROUGH;
764     case URLPart::PortEnd:
765         m_url.m_portEnd = base.m_portEnd;
766         FALLTHROUGH;
767     case URLPart::HostEnd:
768         m_url.m_hostEnd = base.m_hostEnd;
769         FALLTHROUGH;
770     case URLPart::PasswordEnd:
771         m_url.m_passwordEnd = base.m_passwordEnd;
772         FALLTHROUGH;
773     case URLPart::UserEnd:
774         m_url.m_userEnd = base.m_userEnd;
775         FALLTHROUGH;
776     case URLPart::UserStart:
777         m_url.m_userStart = base.m_userStart;
778         FALLTHROUGH;
779     case URLPart::SchemeEnd:
780         m_url.m_isValid = base.m_isValid;
781         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
782         m_url.m_schemeEnd = base.m_schemeEnd;
783     }
784     m_urlIsSpecial = isSpecialScheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd));
785 }
786
787 static const char* dotASCIICode = "2e";
788
789 template<typename CharacterType>
790 inline static bool isPercentEncodedDot(CodePointIterator<CharacterType> c)
791 {
792     if (c.atEnd())
793         return false;
794     if (*c != '%')
795         return false;
796     ++c;
797     if (c.atEnd())
798         return false;
799     if (*c != dotASCIICode[0])
800         return false;
801     ++c;
802     if (c.atEnd())
803         return false;
804     return toASCIILower(*c) == dotASCIICode[1];
805 }
806
807 template<typename CharacterType>
808 inline static bool isSingleDotPathSegment(CodePointIterator<CharacterType> c)
809 {
810     if (c.atEnd())
811         return false;
812     if (*c == '.') {
813         ++c;
814         return c.atEnd() || isSlashQuestionOrHash(*c);
815     }
816     if (*c != '%')
817         return false;
818     ++c;
819     if (c.atEnd() || *c != dotASCIICode[0])
820         return false;
821     ++c;
822     if (c.atEnd())
823         return false;
824     if (toASCIILower(*c) == dotASCIICode[1]) {
825         ++c;
826         return c.atEnd() || isSlashQuestionOrHash(*c);
827     }
828     return false;
829 }
830
831 template<typename CharacterType>
832 inline static bool isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
833 {
834     if (c.atEnd())
835         return false;
836     if (*c == '.') {
837         ++c;
838         return isSingleDotPathSegment(c);
839     }
840     if (*c != '%')
841         return false;
842     ++c;
843     if (c.atEnd() || *c != dotASCIICode[0])
844         return false;
845     ++c;
846     if (c.atEnd())
847         return false;
848     if (toASCIILower(*c) == dotASCIICode[1]) {
849         ++c;
850         return isSingleDotPathSegment(c);
851     }
852     return false;
853 }
854
855 template<typename CharacterType>
856 inline static void consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
857 {
858     ASSERT(isSingleDotPathSegment(c));
859     if (*c == '.') {
860         ++c;
861         if (!c.atEnd()) {
862             if (*c == '/' || *c == '\\')
863                 ++c;
864             else
865                 ASSERT(*c == '?' || *c == '#');
866         }
867     } else {
868         ASSERT(*c == '%');
869         ++c;
870         ASSERT(*c == dotASCIICode[0]);
871         ++c;
872         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
873         ++c;
874         if (!c.atEnd()) {
875             if (*c == '/' || *c == '\\')
876                 ++c;
877             else
878                 ASSERT(*c == '?' || *c == '#');
879         }
880     }
881 }
882
883 template<typename CharacterType>
884 inline static void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
885 {
886     ASSERT(isDoubleDotPathSegment(c));
887     if (*c == '.')
888         ++c;
889     else {
890         ASSERT(*c == '%');
891         ++c;
892         ASSERT(*c == dotASCIICode[0]);
893         ++c;
894         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
895         ++c;
896     }
897     consumeSingleDotPathSegment(c);
898 }
899
900 void URLParser::popPath()
901 {
902     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
903         m_url.m_pathAfterLastSlash--;
904         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
905             m_url.m_pathAfterLastSlash--;
906         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
907             m_url.m_pathAfterLastSlash--;
908         m_url.m_pathAfterLastSlash++;
909     }
910     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
911 }
912
913 template<typename CharacterType>
914 URL URLParser::failure(const CharacterType* input, unsigned length)
915 {
916     URL url;
917     url.m_isValid = false;
918     url.m_protocolIsInHTTPFamily = false;
919     url.m_cannotBeABaseURL = false;
920     url.m_schemeEnd = 0;
921     url.m_userStart = 0;
922     url.m_userEnd = 0;
923     url.m_passwordEnd = 0;
924     url.m_hostEnd = 0;
925     url.m_portEnd = 0;
926     url.m_pathAfterLastSlash = 0;
927     url.m_pathEnd = 0;
928     url.m_queryEnd = 0;
929     url.m_fragmentEnd = 0;
930     url.m_string = String(input, length);
931     return url;
932 }
933
934 URL URLParser::parse(const String& input, const URL& base, const TextEncoding& encoding)
935 {
936     const bool serialized = false;
937     if (input.is8Bit())
938         return parse<serialized>(input.characters8(), input.length(), base, encoding);
939     return parse<serialized>(input.characters16(), input.length(), base, encoding);
940 }
941
942 URL URLParser::parseSerializedURL(const String& input)
943 {
944     const bool serialized = true;
945     if (input.is8Bit())
946         return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
947     return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
948 }
949
950 template<bool serialized, typename CharacterType>
951 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
952 {
953     LOG(URLParser, "Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
954     m_url = { };
955     ASSERT(m_asciiBuffer.isEmpty());
956     ASSERT(m_unicodeFragmentBuffer.isEmpty());
957     m_asciiBuffer.reserveInitialCapacity(length);
958     
959     bool isUTF8Encoding = encoding == UTF8Encoding();
960     Vector<UChar> queryBuffer;
961
962     unsigned endIndex = length;
963     while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
964         endIndex--;
965     CodePointIterator<CharacterType> c(input, input + endIndex);
966     CodePointIterator<CharacterType> authorityOrHostBegin;
967     while (!c.atEnd() && isC0ControlOrSpace(*c))
968         ++c;
969     auto beginAfterControlAndSpace = c;
970
971     enum class State : uint8_t {
972         SchemeStart,
973         Scheme,
974         NoScheme,
975         SpecialRelativeOrAuthority,
976         PathOrAuthority,
977         Relative,
978         RelativeSlash,
979         SpecialAuthoritySlashes,
980         SpecialAuthorityIgnoreSlashes,
981         AuthorityOrHost,
982         Host,
983         File,
984         FileSlash,
985         FileHost,
986         PathStart,
987         Path,
988         CannotBeABaseURLPath,
989         Query,
990         Fragment,
991     };
992
993 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, asciiBuffer size %zu", x, *c, m_asciiBuffer.size())
994 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
995
996     State state = State::SchemeStart;
997     while (!c.atEnd()) {
998         if (!serialized && isTabOrNewline(*c)) {
999             ++c;
1000             continue;
1001         }
1002
1003         switch (state) {
1004         case State::SchemeStart:
1005             LOG_STATE("SchemeStart");
1006             if (isASCIIAlpha(*c)) {
1007                 m_asciiBuffer.uncheckedAppend(toASCIILower(*c));
1008                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1009                 if (c.atEnd()) {
1010                     m_asciiBuffer.clear();
1011                     state = State::NoScheme;
1012                     c = beginAfterControlAndSpace;
1013                 }
1014                 state = State::Scheme;
1015             } else
1016                 state = State::NoScheme;
1017             break;
1018         case State::Scheme:
1019             LOG_STATE("Scheme");
1020             if (isValidSchemeCharacter(*c))
1021                 m_asciiBuffer.append(toASCIILower(*c));
1022             else if (*c == ':') {
1023                 m_url.m_schemeEnd = m_asciiBuffer.size();
1024                 StringView urlScheme = StringView(m_asciiBuffer.data(), m_url.m_schemeEnd);
1025                 m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
1026                 if (urlScheme == "file") {
1027                     m_urlIsSpecial = true;
1028                     state = State::File;
1029                     m_asciiBuffer.append(':');
1030                     ++c;
1031                     break;
1032                 }
1033                 m_asciiBuffer.append(':');
1034                 if (isSpecialScheme(urlScheme)) {
1035                     m_urlIsSpecial = true;
1036                     if (base.protocolIs(m_asciiBuffer.data(), m_asciiBuffer.size() - 1))
1037                         state = State::SpecialRelativeOrAuthority;
1038                     else
1039                         state = State::SpecialAuthoritySlashes;
1040                 } else {
1041                     auto maybeSlash = c;
1042                     incrementIteratorSkippingTabAndNewLine<serialized>(maybeSlash);
1043                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1044                         m_asciiBuffer.append('/');
1045                         m_url.m_userStart = m_asciiBuffer.size();
1046                         state = State::PathOrAuthority;
1047                         c = maybeSlash;
1048                         ASSERT(*c == '/');
1049                     } else {
1050                         m_url.m_userStart = m_asciiBuffer.size();
1051                         m_url.m_userEnd = m_url.m_userStart;
1052                         m_url.m_passwordEnd = m_url.m_userStart;
1053                         m_url.m_hostEnd = m_url.m_userStart;
1054                         m_url.m_portEnd = m_url.m_userStart;
1055                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1056                         m_url.m_cannotBeABaseURL = true;
1057                         state = State::CannotBeABaseURLPath;
1058                     }
1059                 }
1060                 ++c;
1061                 break;
1062             } else {
1063                 m_asciiBuffer.clear();
1064                 state = State::NoScheme;
1065                 c = beginAfterControlAndSpace;
1066                 break;
1067             }
1068             incrementIteratorSkippingTabAndNewLine<serialized>(c);
1069             if (c.atEnd()) {
1070                 m_asciiBuffer.clear();
1071                 state = State::NoScheme;
1072                 c = beginAfterControlAndSpace;
1073             }
1074             break;
1075         case State::NoScheme:
1076             LOG_STATE("NoScheme");
1077             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#'))
1078                 return failure(input, length);
1079             if (base.m_cannotBeABaseURL && *c == '#') {
1080                 copyURLPartsUntil(base, URLPart::QueryEnd);
1081                 state = State::Fragment;
1082                 m_asciiBuffer.append('#');
1083                 ++c;
1084                 break;
1085             }
1086             if (!base.protocolIs("file")) {
1087                 state = State::Relative;
1088                 break;
1089             }
1090             copyURLPartsUntil(base, URLPart::SchemeEnd);
1091             m_asciiBuffer.append(':');
1092             state = State::File;
1093             break;
1094         case State::SpecialRelativeOrAuthority:
1095             LOG_STATE("SpecialRelativeOrAuthority");
1096             if (*c == '/') {
1097                 m_asciiBuffer.append('/');
1098                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1099                 if (c.atEnd())
1100                     return failure(input, length);
1101                 if (*c == '/') {
1102                     m_asciiBuffer.append('/');
1103                     state = State::SpecialAuthorityIgnoreSlashes;
1104                     ++c;
1105                 } else
1106                     state = State::RelativeSlash;
1107             } else
1108                 state = State::Relative;
1109             break;
1110         case State::PathOrAuthority:
1111             LOG_STATE("PathOrAuthority");
1112             if (*c == '/') {
1113                 m_asciiBuffer.append('/');
1114                 m_url.m_userStart = m_asciiBuffer.size();
1115                 state = State::AuthorityOrHost;
1116                 ++c;
1117                 authorityOrHostBegin = c;
1118             } else {
1119                 ASSERT(m_asciiBuffer.last() == '/');
1120                 m_url.m_userStart = m_asciiBuffer.size() - 1;
1121                 m_url.m_userEnd = m_url.m_userStart;
1122                 m_url.m_passwordEnd = m_url.m_userStart;
1123                 m_url.m_hostEnd = m_url.m_userStart;
1124                 m_url.m_portEnd = m_url.m_userStart;
1125                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1126                 state = State::Path;
1127             }
1128             break;
1129         case State::Relative:
1130             LOG_STATE("Relative");
1131             switch (*c) {
1132             case '/':
1133             case '\\':
1134                 state = State::RelativeSlash;
1135                 ++c;
1136                 break;
1137             case '?':
1138                 copyURLPartsUntil(base, URLPart::PathEnd);
1139                 m_asciiBuffer.append('?');
1140                 state = State::Query;
1141                 ++c;
1142                 break;
1143             case '#':
1144                 copyURLPartsUntil(base, URLPart::QueryEnd);
1145                 m_asciiBuffer.append('#');
1146                 state = State::Fragment;
1147                 ++c;
1148                 break;
1149             default:
1150                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1151                 state = State::Path;
1152                 break;
1153             }
1154             break;
1155         case State::RelativeSlash:
1156             LOG_STATE("RelativeSlash");
1157             if (*c == '/' || *c == '\\') {
1158                 ++c;
1159                 copyURLPartsUntil(base, URLPart::SchemeEnd);
1160                 m_asciiBuffer.append("://", 3);
1161                 state = State::SpecialAuthorityIgnoreSlashes;
1162             } else {
1163                 copyURLPartsUntil(base, URLPart::PortEnd);
1164                 m_asciiBuffer.append('/');
1165                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1166                 state = State::Path;
1167             }
1168             break;
1169         case State::SpecialAuthoritySlashes:
1170             LOG_STATE("SpecialAuthoritySlashes");
1171             m_asciiBuffer.append("//", 2);
1172             if (*c == '/' || *c == '\\') {
1173                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1174                 if (!c.atEnd() && (*c == '/' || *c == '\\'))
1175                     ++c;
1176             }
1177             state = State::SpecialAuthorityIgnoreSlashes;
1178             break;
1179         case State::SpecialAuthorityIgnoreSlashes:
1180             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1181             if (*c == '/' || *c == '\\') {
1182                 m_asciiBuffer.append('/');
1183                 ++c;
1184             }
1185             m_url.m_userStart = m_asciiBuffer.size();
1186             state = State::AuthorityOrHost;
1187             authorityOrHostBegin = c;
1188             break;
1189         case State::AuthorityOrHost:
1190             LOG_STATE("AuthorityOrHost");
1191             {
1192                 if (*c == '@') {
1193                     auto lastAt = c;
1194                     auto findLastAt = c;
1195                     while (!findLastAt.atEnd()) {
1196                         if (*findLastAt == '@')
1197                             lastAt = findLastAt;
1198                         ++findLastAt;
1199                     }
1200                     parseAuthority<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1201                     c = lastAt;
1202                     incrementIteratorSkippingTabAndNewLine<serialized>(c);
1203                     authorityOrHostBegin = c;
1204                     state = State::Host;
1205                     m_hostHasPercentOrNonASCII = false;
1206                     break;
1207                 }
1208                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1209                 if (isSlash || *c == '?' || *c == '#') {
1210                     m_url.m_userEnd = m_asciiBuffer.size();
1211                     m_url.m_passwordEnd = m_url.m_userEnd;
1212                     if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1213                         return failure(input, length);
1214                     if (!isSlash) {
1215                         m_asciiBuffer.append('/');
1216                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1217                     }
1218                     state = State::Path;
1219                     break;
1220                 }
1221                 if (isPercentOrNonASCII(*c))
1222                     m_hostHasPercentOrNonASCII = true;
1223                 ++c;
1224             }
1225             break;
1226         case State::Host:
1227             LOG_STATE("Host");
1228             if (*c == '/' || *c == '?' || *c == '#') {
1229                 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1230                     return failure(input, length);
1231                 state = State::Path;
1232                 break;
1233             }
1234             if (isPercentOrNonASCII(*c))
1235                 m_hostHasPercentOrNonASCII = true;
1236             ++c;
1237             break;
1238         case State::File:
1239             LOG_STATE("File");
1240             switch (*c) {
1241             case '/':
1242             case '\\':
1243                 m_asciiBuffer.append('/');
1244                 state = State::FileSlash;
1245                 ++c;
1246                 break;
1247             case '?':
1248                 if (base.isValid() && base.protocolIs("file"))
1249                     copyURLPartsUntil(base, URLPart::PathEnd);
1250                 m_asciiBuffer.append("///?", 4);
1251                 m_url.m_userStart = m_asciiBuffer.size() - 2;
1252                 m_url.m_userEnd = m_url.m_userStart;
1253                 m_url.m_passwordEnd = m_url.m_userStart;
1254                 m_url.m_hostEnd = m_url.m_userStart;
1255                 m_url.m_portEnd = m_url.m_userStart;
1256                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1257                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1258                 state = State::Query;
1259                 ++c;
1260                 break;
1261             case '#':
1262                 if (base.isValid() && base.protocolIs("file"))
1263                     copyURLPartsUntil(base, URLPart::QueryEnd);
1264                 m_asciiBuffer.append("///#", 4);
1265                 m_url.m_userStart = m_asciiBuffer.size() - 2;
1266                 m_url.m_userEnd = m_url.m_userStart;
1267                 m_url.m_passwordEnd = m_url.m_userStart;
1268                 m_url.m_hostEnd = m_url.m_userStart;
1269                 m_url.m_portEnd = m_url.m_userStart;
1270                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1271                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1272                 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1273                 state = State::Fragment;
1274                 ++c;
1275                 break;
1276             default:
1277                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL<serialized>(c))
1278                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1279                 else {
1280                     m_asciiBuffer.append("///", 3);
1281                     m_url.m_userStart = m_asciiBuffer.size() - 1;
1282                     m_url.m_userEnd = m_url.m_userStart;
1283                     m_url.m_passwordEnd = m_url.m_userStart;
1284                     m_url.m_hostEnd = m_url.m_userStart;
1285                     m_url.m_portEnd = m_url.m_userStart;
1286                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1287                     checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1288                 }
1289                 state = State::Path;
1290                 break;
1291             }
1292             break;
1293         case State::FileSlash:
1294             LOG_STATE("FileSlash");
1295             if (*c == '/' || *c == '\\') {
1296                 ++c;
1297                 m_asciiBuffer.append('/');
1298                 m_url.m_userStart = m_asciiBuffer.size();
1299                 m_url.m_userEnd = m_url.m_userStart;
1300                 m_url.m_passwordEnd = m_url.m_userStart;
1301                 m_url.m_hostEnd = m_url.m_userStart;
1302                 m_url.m_portEnd = m_url.m_userStart;
1303                 authorityOrHostBegin = c;
1304                 state = State::FileHost;
1305                 break;
1306             }
1307             if (base.isValid() && base.protocolIs("file")) {
1308                 // FIXME: This String copy is unnecessary.
1309                 String basePath = base.path();
1310                 if (basePath.length() >= 2) {
1311                     bool windowsQuirk = basePath.is8Bit()
1312                         ? isWindowsDriveLetter<serialized>(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1313                         : isWindowsDriveLetter<serialized>(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1314                     if (windowsQuirk) {
1315                         m_asciiBuffer.append(basePath[0]);
1316                         m_asciiBuffer.append(basePath[1]);
1317                     }
1318                 }
1319             }
1320             m_asciiBuffer.append("//", 2);
1321             m_url.m_userStart = m_asciiBuffer.size() - 1;
1322             m_url.m_userEnd = m_url.m_userStart;
1323             m_url.m_passwordEnd = m_url.m_userStart;
1324             m_url.m_hostEnd = m_url.m_userStart;
1325             m_url.m_portEnd = m_url.m_userStart;
1326             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1327             checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1328             state = State::Path;
1329             break;
1330         case State::FileHost:
1331             LOG_STATE("FileHost");
1332             if (isSlashQuestionOrHash(*c)) {
1333                 if (isWindowsDriveLetter(m_asciiBuffer, m_url.m_portEnd + 1)) {
1334                     state = State::Path;
1335                     break;
1336                 }
1337                 if (authorityOrHostBegin == c) {
1338                     ASSERT(m_asciiBuffer[m_asciiBuffer.size() - 1] == '/');
1339                     if (*c == '?') {
1340                         m_asciiBuffer.append("/?", 2);
1341                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1342                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1343                         state = State::Query;
1344                         ++c;
1345                         break;
1346                     }
1347                     if (*c == '#') {
1348                         m_asciiBuffer.append("/#", 2);
1349                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1350                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1351                         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1352                         state = State::Fragment;
1353                         ++c;
1354                         break;
1355                     }
1356                     state = State::Path;
1357                     break;
1358                 }
1359                 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1360                     return failure(input, length);
1361                 
1362                 if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost")  {
1363                     m_asciiBuffer.shrink(m_url.m_passwordEnd);
1364                     m_url.m_hostEnd = m_asciiBuffer.size();
1365                     m_url.m_portEnd = m_url.m_hostEnd;
1366                 }
1367                 
1368                 state = State::PathStart;
1369                 break;
1370             }
1371             if (isPercentOrNonASCII(*c))
1372                 m_hostHasPercentOrNonASCII = true;
1373             ++c;
1374             break;
1375         case State::PathStart:
1376             LOG_STATE("PathStart");
1377             if (*c != '/' && *c != '\\')
1378                 ++c;
1379             state = State::Path;
1380             break;
1381         case State::Path:
1382             LOG_STATE("Path");
1383             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1384                 m_asciiBuffer.append('/');
1385                 m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1386                 ++c;
1387                 break;
1388             }
1389             if (m_asciiBuffer.size() && m_asciiBuffer[m_asciiBuffer.size() - 1] == '/') {
1390                 if (isDoubleDotPathSegment(c)) {
1391                     consumeDoubleDotPathSegment(c);
1392                     popPath();
1393                     break;
1394                 }
1395                 if (m_asciiBuffer[m_asciiBuffer.size() - 1] == '/' && isSingleDotPathSegment(c)) {
1396                     consumeSingleDotPathSegment(c);
1397                     break;
1398                 }
1399             }
1400             if (*c == '?') {
1401                 m_url.m_pathEnd = m_asciiBuffer.size();
1402                 state = State::Query;
1403                 break;
1404             }
1405             if (*c == '#') {
1406                 m_url.m_pathEnd = m_asciiBuffer.size();
1407                 m_url.m_queryEnd = m_url.m_pathEnd;
1408                 state = State::Fragment;
1409                 break;
1410             }
1411             if (isPercentEncodedDot(c)) {
1412                 m_asciiBuffer.append('.');
1413                 ASSERT(*c == '%');
1414                 ++c;
1415                 ASSERT(*c == dotASCIICode[0]);
1416                 ++c;
1417                 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1418                 ++c;
1419                 break;
1420             }
1421             utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInDefaultEncodeSet);
1422             ++c;
1423             break;
1424         case State::CannotBeABaseURLPath:
1425             LOG_STATE("CannotBeABaseURLPath");
1426             if (*c == '?') {
1427                 m_url.m_pathEnd = m_asciiBuffer.size();
1428                 state = State::Query;
1429             } else if (*c == '#') {
1430                 m_url.m_pathEnd = m_asciiBuffer.size();
1431                 m_url.m_queryEnd = m_url.m_pathEnd;
1432                 state = State::Fragment;
1433             } else {
1434                 utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInSimpleEncodeSet);
1435                 ++c;
1436             }
1437             break;
1438         case State::Query:
1439             LOG_STATE("Query");
1440             if (*c == '#') {
1441                 if (!isUTF8Encoding)
1442                     encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1443                 m_url.m_queryEnd = m_asciiBuffer.size();
1444                 state = State::Fragment;
1445                 break;
1446             }
1447             if (isUTF8Encoding)
1448                 utf8QueryEncode<serialized>(*c, m_asciiBuffer);
1449             else
1450                 appendCodePoint(queryBuffer, *c);
1451             ++c;
1452             break;
1453         case State::Fragment:
1454             LOG_STATE("Fragment");
1455             if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
1456                 m_asciiBuffer.append(*c);
1457             else
1458                 appendCodePoint(m_unicodeFragmentBuffer, *c);
1459             ++c;
1460             break;
1461         }
1462     }
1463
1464     switch (state) {
1465     case State::SchemeStart:
1466         LOG_FINAL_STATE("SchemeStart");
1467         if (!m_asciiBuffer.size() && base.isValid())
1468             return base;
1469         return failure(input, length);
1470     case State::Scheme:
1471         LOG_FINAL_STATE("Scheme");
1472         return failure(input, length);
1473     case State::NoScheme:
1474         LOG_FINAL_STATE("NoScheme");
1475         RELEASE_ASSERT_NOT_REACHED();
1476     case State::SpecialRelativeOrAuthority:
1477         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1478         copyURLPartsUntil(base, URLPart::QueryEnd);
1479         m_url.m_fragmentEnd = m_url.m_queryEnd;
1480         break;
1481     case State::PathOrAuthority:
1482         LOG_FINAL_STATE("PathOrAuthority");
1483         ASSERT(m_url.m_userStart);
1484         ASSERT(m_url.m_userStart == m_asciiBuffer.size());
1485         ASSERT(m_asciiBuffer.last() == '/');
1486         m_url.m_userStart--;
1487         m_url.m_userEnd = m_url.m_userStart;
1488         m_url.m_passwordEnd = m_url.m_userStart;
1489         m_url.m_hostEnd = m_url.m_userStart;
1490         m_url.m_portEnd = m_url.m_userStart;
1491         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1492         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1493         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1494         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1495         break;
1496     case State::Relative:
1497         LOG_FINAL_STATE("Relative");
1498         copyURLPartsUntil(base, URLPart::FragmentEnd);
1499         break;
1500     case State::RelativeSlash:
1501         LOG_FINAL_STATE("RelativeSlash");
1502         copyURLPartsUntil(base, URLPart::PortEnd);
1503         m_asciiBuffer.append('/');
1504         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1505         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1506         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1507         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1508         break;
1509     case State::SpecialAuthoritySlashes:
1510         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1511         m_url.m_userStart = m_asciiBuffer.size();
1512         m_url.m_userEnd = m_url.m_userStart;
1513         m_url.m_passwordEnd = m_url.m_userStart;
1514         m_url.m_hostEnd = m_url.m_userStart;
1515         m_url.m_portEnd = m_url.m_userStart;
1516         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1517         m_url.m_pathEnd = m_url.m_userStart;
1518         m_url.m_queryEnd = m_url.m_userStart;
1519         m_url.m_fragmentEnd = m_url.m_userStart;
1520         break;
1521     case State::SpecialAuthorityIgnoreSlashes:
1522         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1523         return failure(input, length);
1524         break;
1525     case State::AuthorityOrHost:
1526         LOG_FINAL_STATE("AuthorityOrHost");
1527         m_url.m_userEnd = m_asciiBuffer.size();
1528         m_url.m_passwordEnd = m_url.m_userEnd;
1529         if (authorityOrHostBegin.atEnd()) {
1530             m_url.m_hostEnd = m_url.m_userEnd;
1531             m_url.m_portEnd = m_url.m_userEnd;
1532         } else if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1533             return failure(input, length);
1534         m_asciiBuffer.append('/');
1535         m_url.m_pathEnd = m_url.m_portEnd + 1;
1536         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1537         m_url.m_queryEnd = m_url.m_pathEnd;
1538         m_url.m_fragmentEnd = m_url.m_pathEnd;
1539         break;
1540     case State::Host:
1541         LOG_FINAL_STATE("Host");
1542         if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1543             return failure(input, length);
1544         m_asciiBuffer.append('/');
1545         m_url.m_pathEnd = m_url.m_portEnd + 1;
1546         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1547         m_url.m_queryEnd = m_url.m_pathEnd;
1548         m_url.m_fragmentEnd = m_url.m_pathEnd;
1549         break;
1550     case State::File:
1551         LOG_FINAL_STATE("File");
1552         if (base.isValid() && base.protocolIs("file")) {
1553             copyURLPartsUntil(base, URLPart::QueryEnd);
1554             m_asciiBuffer.append(':');
1555         }
1556         m_asciiBuffer.append("///", 3);
1557         m_url.m_userStart = m_asciiBuffer.size() - 1;
1558         m_url.m_userEnd = m_url.m_userStart;
1559         m_url.m_passwordEnd = m_url.m_userStart;
1560         m_url.m_hostEnd = m_url.m_userStart;
1561         m_url.m_portEnd = m_url.m_userStart;
1562         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1563         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1564         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1565         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1566         break;
1567     case State::FileSlash:
1568         LOG_FINAL_STATE("FileSlash");
1569         m_asciiBuffer.append("//", 2);
1570         m_url.m_userStart = m_asciiBuffer.size() - 1;
1571         m_url.m_userEnd = m_url.m_userStart;
1572         m_url.m_passwordEnd = m_url.m_userStart;
1573         m_url.m_hostEnd = m_url.m_userStart;
1574         m_url.m_portEnd = m_url.m_userStart;
1575         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1576         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1577         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1578         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1579         break;
1580     case State::FileHost:
1581         LOG_FINAL_STATE("FileHost");
1582         if (authorityOrHostBegin == c) {
1583             m_asciiBuffer.append('/');
1584             m_url.m_userStart = m_asciiBuffer.size() - 1;
1585             m_url.m_userEnd = m_url.m_userStart;
1586             m_url.m_passwordEnd = m_url.m_userStart;
1587             m_url.m_hostEnd = m_url.m_userStart;
1588             m_url.m_portEnd = m_url.m_userStart;
1589             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1590             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1591             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1592             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1593             break;
1594         }
1595
1596         if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1597             return failure(input, length);
1598
1599         if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost")  {
1600             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1601             m_url.m_hostEnd = m_asciiBuffer.size();
1602             m_url.m_portEnd = m_url.m_hostEnd;
1603         }
1604         m_asciiBuffer.append('/');
1605         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1606         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1607         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1608         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1609         break;
1610     case State::PathStart:
1611         LOG_FINAL_STATE("PathStart");
1612         RELEASE_ASSERT_NOT_REACHED();
1613     case State::Path:
1614         LOG_FINAL_STATE("Path");
1615         m_url.m_pathEnd = m_asciiBuffer.size();
1616         m_url.m_queryEnd = m_url.m_pathEnd;
1617         m_url.m_fragmentEnd = m_url.m_pathEnd;
1618         break;
1619     case State::CannotBeABaseURLPath:
1620         LOG_FINAL_STATE("CannotBeABaseURLPath");
1621         m_url.m_pathEnd = m_asciiBuffer.size();
1622         m_url.m_queryEnd = m_url.m_pathEnd;
1623         m_url.m_fragmentEnd = m_url.m_pathEnd;
1624         break;
1625     case State::Query:
1626         LOG_FINAL_STATE("Query");
1627         if (!isUTF8Encoding)
1628             encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1629         m_url.m_queryEnd = m_asciiBuffer.size();
1630         m_url.m_fragmentEnd = m_url.m_queryEnd;
1631         break;
1632     case State::Fragment:
1633         LOG_FINAL_STATE("Fragment");
1634         m_url.m_fragmentEnd = m_asciiBuffer.size() + m_unicodeFragmentBuffer.size();
1635         break;
1636     }
1637
1638     if (m_unicodeFragmentBuffer.isEmpty())
1639         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1640     else {
1641         Vector<UChar> buffer;
1642         buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
1643         buffer.appendVector(m_asciiBuffer);
1644         buffer.appendVector(m_unicodeFragmentBuffer);
1645         m_url.m_string = String::adopt(WTFMove(buffer));
1646     }
1647     m_url.m_isValid = true;
1648     LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
1649     ASSERT(internalValuesConsistent(m_url));
1650     return m_url;
1651 }
1652
1653 template<bool serialized, typename CharacterType>
1654 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1655 {
1656     if (iterator.atEnd()) {
1657         m_url.m_userEnd = m_asciiBuffer.size();
1658         m_url.m_passwordEnd = m_url.m_userEnd;
1659         return;
1660     }
1661     for (; !iterator.atEnd(); ++iterator) {
1662         if (*iterator == ':') {
1663             ++iterator;
1664             m_url.m_userEnd = m_asciiBuffer.size();
1665             if (iterator.atEnd()) {
1666                 m_url.m_passwordEnd = m_url.m_userEnd;
1667                 if (m_url.m_userEnd > m_url.m_userStart)
1668                     m_asciiBuffer.append('@');
1669                 return;
1670             }
1671             m_asciiBuffer.append(':');
1672             break;
1673         }
1674         utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1675     }
1676     for (; !iterator.atEnd(); ++iterator)
1677         utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1678     m_url.m_passwordEnd = m_asciiBuffer.size();
1679     if (!m_url.m_userEnd)
1680         m_url.m_userEnd = m_url.m_passwordEnd;
1681     m_asciiBuffer.append('@');
1682 }
1683
1684 template<typename UnsignedIntegerType>
1685 void append(Vector<LChar>& destination, UnsignedIntegerType number)
1686 {
1687     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1688     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1689     LChar* p = end;
1690     do {
1691         *--p = (number % 10) + '0';
1692         number /= 10;
1693     } while (number);
1694     destination.append(p, end - p);
1695 }
1696
1697 inline static void serializeIPv4(uint32_t address, Vector<LChar>& buffer)
1698 {
1699     append<uint8_t>(buffer, address >> 24);
1700     buffer.append('.');
1701     append<uint8_t>(buffer, address >> 16);
1702     buffer.append('.');
1703     append<uint8_t>(buffer, address >> 8);
1704     buffer.append('.');
1705     append<uint8_t>(buffer, address);
1706 }
1707     
// Returns how many consecutive zero pieces address contains starting at index begin.
// Returns 0 when address[begin] is non-zero or begin is not below 8.
inline static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
1717
1718 inline static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
1719 {
1720     Optional<size_t> longest;
1721     size_t longestLength = 0;
1722     for (size_t i = 0; i < 8; i++) {
1723         size_t length = zeroSequenceLength(address, i);
1724         if (length) {
1725             if (length > 1 && (!longest || longestLength < length)) {
1726                 longest = i;
1727                 longestLength = length;
1728             }
1729             i += length;
1730         }
1731     }
1732     return longest;
1733 }
1734     
1735 inline static void serializeIPv6Piece(uint16_t piece, Vector<LChar>& buffer)
1736 {
1737     bool printed = false;
1738     if (auto nibble0 = piece >> 12) {
1739         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
1740         printed = true;
1741     }
1742     auto nibble1 = piece >> 8 & 0xF;
1743     if (printed || nibble1) {
1744         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
1745         printed = true;
1746     }
1747     auto nibble2 = piece >> 4 & 0xF;
1748     if (printed || nibble2)
1749         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
1750     buffer.append(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
1751 }
1752
1753 inline static void serializeIPv6(std::array<uint16_t, 8> address, Vector<LChar>& buffer)
1754 {
1755     buffer.append('[');
1756     auto compressPointer = findLongestZeroSequence(address);
1757     for (size_t piece = 0; piece < 8; piece++) {
1758         if (compressPointer && compressPointer.value() == piece) {
1759             ASSERT(!address[piece]);
1760             if (piece)
1761                 buffer.append(':');
1762             else
1763                 buffer.append("::", 2);
1764             while (piece < 8 && !address[piece])
1765                 piece++;
1766             if (piece == 8)
1767                 break;
1768         }
1769         serializeIPv6Piece(address[piece], buffer);
1770         if (piece < 7)
1771             buffer.append(':');
1772     }
1773     buffer.append(']');
1774 }
1775
1776 template<typename CharacterType>
1777 inline static Optional<uint32_t> parseIPv4Number(CodePointIterator<CharacterType>& iterator)
1778 {
1779     // FIXME: Check for overflow.
1780     enum class State : uint8_t {
1781         UnknownBase,
1782         Decimal,
1783         OctalOrHex,
1784         Octal,
1785         Hex,
1786     };
1787     State state = State::UnknownBase;
1788     uint32_t value = 0;
1789     while (!iterator.atEnd()) {
1790         if (*iterator == '.') {
1791             ++iterator;
1792             return value;
1793         }
1794         switch (state) {
1795         case State::UnknownBase:
1796             if (*iterator == '0') {
1797                 ++iterator;
1798                 state = State::OctalOrHex;
1799                 break;
1800             }
1801             state = State::Decimal;
1802             break;
1803         case State::OctalOrHex:
1804             if (*iterator == 'x' || *iterator == 'X') {
1805                 ++iterator;
1806                 state = State::Hex;
1807                 break;
1808             }
1809             state = State::Octal;
1810             break;
1811         case State::Decimal:
1812             if (*iterator < '0' || *iterator > '9')
1813                 return Nullopt;
1814             value *= 10;
1815             value += *iterator - '0';
1816             ++iterator;
1817             break;
1818         case State::Octal:
1819             if (*iterator < '0' || *iterator > '7')
1820                 return Nullopt;
1821             value *= 8;
1822             value += *iterator - '0';
1823             ++iterator;
1824             break;
1825         case State::Hex:
1826             if (!isASCIIHexDigit(*iterator))
1827                 return Nullopt;
1828             value *= 16;
1829             value += toASCIIHexValue(*iterator);
1830             ++iterator;
1831             break;
1832         }
1833     }
1834     return value;
1835 }
1836
1837 inline static uint64_t pow256(size_t exponent)
1838 {
1839     RELEASE_ASSERT(exponent <= 4);
1840     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
1841     return values[exponent];
1842 }
1843
1844 template<typename CharacterType>
1845 inline static Optional<uint32_t> parseIPv4Host(CodePointIterator<CharacterType> iterator)
1846 {
1847     Vector<uint32_t, 4> items;
1848     items.reserveInitialCapacity(4);
1849     while (!iterator.atEnd()) {
1850         if (items.size() >= 4)
1851             return Nullopt;
1852         if (auto item = parseIPv4Number(iterator))
1853             items.append(item.value());
1854         else
1855             return Nullopt;
1856     }
1857     if (!items.size() || items.size() > 4)
1858         return Nullopt;
1859     if (items.size() > 2) {
1860         for (size_t i = 0; i < items.size() - 2; i++) {
1861             if (items[i] > 255)
1862                 return Nullopt;
1863         }
1864     }
1865     if (items[items.size() - 1] >= pow256(5 - items.size()))
1866         return Nullopt;
1867     for (auto item : items) {
1868         if (item > 255)
1869             return Nullopt;
1870     }
1871     uint32_t ipv4 = items.takeLast();
1872     for (size_t counter = 0; counter < items.size(); ++counter)
1873         ipv4 += items[counter] * pow256(3 - counter);
1874     return ipv4;
1875 }
1876     
1877 template<typename CharacterType>
1878 inline static Optional<std::array<uint16_t, 8>> parseIPv6Host(CodePointIterator<CharacterType> c)
1879 {
1880     if (c.atEnd())
1881         return Nullopt;
1882
1883     std::array<uint16_t, 8> address = {{0, 0, 0, 0, 0, 0, 0, 0}};
1884     size_t piecePointer = 0;
1885     Optional<size_t> compressPointer;
1886
1887     if (*c == ':') {
1888         ++c;
1889         if (c.atEnd())
1890             return Nullopt;
1891         if (*c != ':')
1892             return Nullopt;
1893         ++c;
1894         ++piecePointer;
1895         compressPointer = piecePointer;
1896     }
1897     
1898     while (!c.atEnd()) {
1899         if (piecePointer == 8)
1900             return Nullopt;
1901         if (*c == ':') {
1902             if (compressPointer)
1903                 return Nullopt;
1904             ++c;
1905             ++piecePointer;
1906             compressPointer = piecePointer;
1907             continue;
1908         }
1909         uint16_t value = 0;
1910         for (size_t length = 0; length < 4; length++) {
1911             if (c.atEnd())
1912                 break;
1913             if (!isASCIIHexDigit(*c))
1914                 break;
1915             value = value * 0x10 + toASCIIHexValue(*c);
1916             ++c;
1917         }
1918         address[piecePointer++] = value;
1919         if (c.atEnd())
1920             break;
1921         if (*c != ':')
1922             return Nullopt;
1923         ++c;
1924     }
1925     
1926     if (!c.atEnd()) {
1927         if (piecePointer > 6)
1928             return Nullopt;
1929         size_t dotsSeen = 0;
1930         while (!c.atEnd()) {
1931             Optional<uint16_t> value;
1932             if (!isASCIIDigit(*c))
1933                 return Nullopt;
1934             while (isASCIIDigit(*c)) {
1935                 auto number = *c - '0';
1936                 if (!value)
1937                     value = number;
1938                 else if (!value.value())
1939                     return Nullopt;
1940                 else
1941                     value = value.value() * 10 + number;
1942                 ++c;
1943                 if (c.atEnd())
1944                     return Nullopt;
1945                 if (value.value() > 255)
1946                     return Nullopt;
1947             }
1948             if (dotsSeen < 3 && *c != '.')
1949                 return Nullopt;
1950             address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
1951             if (dotsSeen == 1 || dotsSeen == 3)
1952                 piecePointer++;
1953             if (!c.atEnd())
1954                 ++c;
1955             if (dotsSeen == 3 && !c.atEnd())
1956                 return Nullopt;
1957             dotsSeen++;
1958         }
1959     }
1960     if (compressPointer) {
1961         size_t swaps = piecePointer - compressPointer.value();
1962         piecePointer = 7;
1963         while (swaps)
1964             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
1965     } else if (piecePointer != 8)
1966         return Nullopt;
1967     return address;
1968 }
1969
1970 const size_t defaultInlineBufferSize = 2048;
1971
1972 inline static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
1973 {
1974     Vector<LChar, defaultInlineBufferSize> output;
1975     output.reserveInitialCapacity(length);
1976     
1977     for (size_t i = 0; i < length; ++i) {
1978         uint8_t byte = input[i];
1979         if (byte != '%')
1980             output.uncheckedAppend(byte);
1981         else if (i < length - 2) {
1982             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
1983                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
1984                 i += 2;
1985             } else
1986                 output.uncheckedAppend(byte);
1987         } else
1988             output.uncheckedAppend(byte);
1989     }
1990     return output;
1991 }
1992
1993 inline static bool containsOnlyASCII(const String& string)
1994 {
1995     if (string.is8Bit())
1996         return charactersAreAllASCII(string.characters8(), string.length());
1997     return charactersAreAllASCII(string.characters16(), string.length());
1998 }
1999
2000 inline static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
2001 {
2002     Vector<LChar, defaultInlineBufferSize> ascii;
2003     if (containsOnlyASCII(domain)) {
2004         size_t length = domain.length();
2005         if (domain.is8Bit()) {
2006             const LChar* characters = domain.characters8();
2007             ascii.reserveInitialCapacity(length);
2008             for (size_t i = 0; i < length; ++i)
2009                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2010         } else {
2011             const UChar* characters = domain.characters16();
2012             ascii.reserveInitialCapacity(length);
2013             for (size_t i = 0; i < length; ++i)
2014                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2015         }
2016         return ascii;
2017     }
2018     
2019     UChar hostnameBuffer[defaultInlineBufferSize];
2020     UErrorCode error = U_ZERO_ERROR;
2021
2022 #if COMPILER(GCC) || COMPILER(CLANG)
2023 #pragma GCC diagnostic push
2024 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2025 #endif
2026     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2027     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2028 #if COMPILER(GCC) || COMPILER(CLANG)
2029 #pragma GCC diagnostic pop
2030 #endif
2031     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2032
2033     if (error == U_ZERO_ERROR) {
2034         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2035             ASSERT(isASCII(hostnameBuffer[i]));
2036             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2037         }
2038         ascii.append(hostnameBuffer, numCharactersConverted);
2039         return ascii;
2040     }
2041
2042     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2043     return Nullopt;
2044 }
2045
2046 inline static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
2047 {
2048     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2049         if (isInvalidDomainCharacter(asciiDomain[i]))
2050             return true;
2051     }
2052     return false;
2053 }
2054
2055 template<bool serialized, typename CharacterType>
2056 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2057 {
2058     uint32_t port = 0;
2059     if (iterator.atEnd()) {
2060         m_url.m_portEnd = m_asciiBuffer.size();
2061         return true;
2062     }
2063     m_asciiBuffer.append(':');
2064     for (; !iterator.atEnd(); ++iterator) {
2065         if (!serialized && isTabOrNewline(*iterator))
2066             continue;
2067         if (isASCIIDigit(*iterator)) {
2068             port = port * 10 + *iterator - '0';
2069             if (port > std::numeric_limits<uint16_t>::max())
2070                 return false;
2071         } else
2072             return false;
2073     }
2074
2075     if (isDefaultPort(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd), port)) {
2076         ASSERT(m_asciiBuffer.last() == ':');
2077         m_asciiBuffer.shrink(m_asciiBuffer.size() - 1);
2078     } else
2079         append<uint16_t>(m_asciiBuffer, static_cast<uint16_t>(port));
2080
2081     m_url.m_portEnd = m_asciiBuffer.size();
2082     return true;
2083 }
2084
2085 template<bool serialized, typename CharacterType>
2086 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2087 {
2088     if (iterator.atEnd())
2089         return false;
2090     if (*iterator == '[') {
2091         ++iterator;
2092         auto ipv6End = iterator;
2093         while (!ipv6End.atEnd() && *ipv6End != ']')
2094             ++ipv6End;
2095         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2096             serializeIPv6(address.value(), m_asciiBuffer);
2097             m_url.m_hostEnd = m_asciiBuffer.size();
2098             if (!ipv6End.atEnd()) {
2099                 ++ipv6End;
2100                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2101                     ++ipv6End;
2102                     return parsePort<serialized>(ipv6End);
2103                 }
2104                 m_url.m_portEnd = m_asciiBuffer.size();
2105                 return true;
2106             }
2107             return true;
2108         }
2109     }
2110     
2111     ASSERT(!serialized || !m_hostHasPercentOrNonASCII);
2112     if (!m_hostHasPercentOrNonASCII) {
2113         auto hostIterator = iterator;
2114         for (; !iterator.atEnd(); ++iterator) {
2115             if (!serialized && isTabOrNewline(*iterator))
2116                 continue;
2117             if (*iterator == ':')
2118                 break;
2119             if (isInvalidDomainCharacter(*iterator))
2120                 return false;
2121         }
2122         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2123             serializeIPv4(address.value(), m_asciiBuffer);
2124             m_url.m_hostEnd = m_asciiBuffer.size();
2125             if (iterator.atEnd()) {
2126                 m_url.m_portEnd = m_asciiBuffer.size();
2127                 return true;
2128             }
2129             ++iterator;
2130             return parsePort<serialized>(iterator);
2131         }
2132         for (; hostIterator != iterator; ++hostIterator) {
2133             if (serialized) {
2134                 ASSERT(!isASCIIUpper(*hostIterator));
2135                 m_asciiBuffer.append(*hostIterator);
2136             } else if (!isTabOrNewline(*hostIterator))
2137                 m_asciiBuffer.append(toASCIILower(*hostIterator));
2138         }
2139         m_url.m_hostEnd = m_asciiBuffer.size();
2140         if (!hostIterator.atEnd()) {
2141             ASSERT(*hostIterator == ':');
2142             incrementIteratorSkippingTabAndNewLine<serialized>(hostIterator);
2143             return parsePort<serialized>(hostIterator);
2144         }
2145         m_url.m_portEnd = m_asciiBuffer.size();
2146         return true;
2147     }
2148     
2149     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2150     for (; !iterator.atEnd(); ++iterator) {
2151         if (!serialized && isTabOrNewline(*iterator))
2152             continue;
2153         if (*iterator == ':')
2154             break;
2155         uint8_t buffer[U8_MAX_LENGTH];
2156         int32_t offset = 0;
2157         UBool error = false;
2158         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2159         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2160         // FIXME: Check error.
2161         utf8Encoded.append(buffer, offset);
2162     }
2163     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
2164     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2165     auto asciiDomain = domainToASCII(domain);
2166     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2167         return false;
2168     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2169     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2170
2171     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2172         serializeIPv4(address.value(), m_asciiBuffer);
2173         m_url.m_hostEnd = m_asciiBuffer.size();
2174         if (iterator.atEnd()) {
2175             m_url.m_portEnd = m_asciiBuffer.size();
2176             return true;
2177         }
2178         ++iterator;
2179         return parsePort<serialized>(iterator);
2180     }
2181
2182     m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size());
2183     m_url.m_hostEnd = m_asciiBuffer.size();
2184     if (!iterator.atEnd()) {
2185         ASSERT(*iterator == ':');
2186         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
2187         return parsePort<serialized>(iterator);
2188     }
2189     m_url.m_portEnd = m_asciiBuffer.size();
2190     return true;
2191 }
2192
2193 inline static Optional<String> formURLDecode(StringView input)
2194 {
2195     auto utf8 = input.utf8(StrictConversion);
2196     if (utf8.isNull())
2197         return Nullopt;
2198     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2199     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2200 }
2201
2202 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2203 {
2204     Vector<StringView> sequences = input.split('&');
2205
2206     URLEncodedForm output;
2207     for (auto& bytes : sequences) {
2208         auto valueStart = bytes.find('=');
2209         if (valueStart == notFound) {
2210             if (auto name = formURLDecode(bytes))
2211                 output.append({name.value().replace('+', 0x20), emptyString()});
2212         } else {
2213             auto name = formURLDecode(bytes.substring(0, valueStart));
2214             auto value = formURLDecode(bytes.substring(valueStart + 1));
2215             if (name && value)
2216                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2217         }
2218     }
2219     return output;
2220 }
2221
2222 inline static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2223 {
2224     auto utf8 = input.utf8(StrictConversion);
2225     const char* data = utf8.data();
2226     for (size_t i = 0; i < utf8.length(); ++i) {
2227         const char byte = data[i];
2228         if (byte == 0x20)
2229             output.append(0x2B);
2230         else if (byte == 0x2A
2231             || byte == 0x2D
2232             || byte == 0x2E
2233             || (byte >= 0x30 && byte <= 0x39)
2234             || (byte >= 0x41 && byte <= 0x5A)
2235             || byte == 0x5F
2236             || (byte >= 0x61 && byte <= 0x7A))
2237             output.append(byte);
2238         else
2239             percentEncodeByte(byte, output);
2240     }
2241 }
2242     
2243 String URLParser::serialize(const URLEncodedForm& tuples)
2244 {
2245     Vector<LChar> output;
2246     for (auto& tuple : tuples) {
2247         if (!output.isEmpty())
2248             output.append('&');
2249         serializeURLEncodedForm(tuple.first, output);
2250         output.append('=');
2251         serializeURLEncodedForm(tuple.second, output);
2252     }
2253     return String::adopt(WTFMove(output));
2254 }
2255
2256 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2257 {
2258     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2259     // but once we get rid of URL::parse its value should be tested.
2260     LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2261         a.m_isValid,
2262         a.m_protocolIsInHTTPFamily,
2263         a.m_schemeEnd,
2264         a.m_userStart,
2265         a.m_userEnd,
2266         a.m_passwordEnd,
2267         a.m_hostEnd,
2268         a.m_portEnd,
2269         a.m_pathAfterLastSlash,
2270         a.m_pathEnd,
2271         a.m_queryEnd,
2272         a.m_fragmentEnd,
2273         a.m_string.utf8().data(),
2274         b.m_isValid,
2275         b.m_protocolIsInHTTPFamily,
2276         b.m_schemeEnd,
2277         b.m_userStart,
2278         b.m_userEnd,
2279         b.m_passwordEnd,
2280         b.m_hostEnd,
2281         b.m_portEnd,
2282         b.m_pathAfterLastSlash,
2283         b.m_pathEnd,
2284         b.m_queryEnd,
2285         b.m_fragmentEnd,
2286         b.m_string.utf8().data());
2287
2288     return a.m_string == b.m_string
2289         && a.m_isValid == b.m_isValid
2290         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2291         && a.m_schemeEnd == b.m_schemeEnd
2292         && a.m_userStart == b.m_userStart
2293         && a.m_userEnd == b.m_userEnd
2294         && a.m_passwordEnd == b.m_passwordEnd
2295         && a.m_hostEnd == b.m_hostEnd
2296         && a.m_portEnd == b.m_portEnd
2297         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2298         && a.m_pathEnd == b.m_pathEnd
2299         && a.m_queryEnd == b.m_queryEnd
2300         && a.m_fragmentEnd == b.m_fragmentEnd;
2301 }
2302
2303 bool URLParser::internalValuesConsistent(const URL& url)
2304 {    
2305     return url.m_schemeEnd <= url.m_userStart
2306         && url.m_userStart <= url.m_userEnd
2307         && url.m_userEnd <= url.m_passwordEnd
2308         && url.m_passwordEnd <= url.m_hostEnd
2309         && url.m_hostEnd <= url.m_hostEnd
2310         && url.m_portEnd <= url.m_pathAfterLastSlash
2311         && url.m_pathAfterLastSlash <= url.m_pathEnd
2312         && url.m_pathEnd <= url.m_queryEnd
2313         && url.m_queryEnd <= url.m_fragmentEnd
2314         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2315     // FIXME: Why do we even store m_fragmentEnd?
2316     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2317 }
2318
2319 static bool urlParserEnabled = false;
2320
2321 void URLParser::setEnabled(bool enabled)
2322 {
2323     urlParserEnabled = enabled;
2324 }
2325
2326 bool URLParser::enabled()
2327 {
2328     return urlParserEnabled;
2329 }
2330
2331 } // namespace WebCore