URLParser should correctly strip unnecessary 0's in IPv6 addresses
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include <array>
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
33
34 namespace WebCore {
35
36 template<typename CharacterType>
37 class CodePointIterator {
38 public:
39     ALWAYS_INLINE CodePointIterator() { }
40     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
41         : m_begin(begin)
42         , m_end(end)
43     {
44     }
45     
46     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
47         : CodePointIterator(begin.m_begin, end.m_begin)
48     {
49         ASSERT(end.m_begin >= begin.m_begin);
50     }
51     
52     ALWAYS_INLINE UChar32 operator*() const;
53     ALWAYS_INLINE CodePointIterator& operator++();
54
55     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
56     {
57         return m_begin == other.m_begin
58             && m_end == other.m_end;
59     }
60     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
61     
62     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
63     {
64         m_begin = other.m_begin;
65         m_end = other.m_end;
66         return *this;
67     }
68
69     ALWAYS_INLINE bool atEnd() const
70     {
71         ASSERT(m_begin <= m_end);
72         return m_begin >= m_end;
73     }
74     
75     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
76     {
77         ASSERT(m_begin >= reference);
78         return m_begin - reference;
79     }
80
81     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
82     {
83         return codeUnitsSince(other.m_begin);
84     }
85     
86 private:
87     const CharacterType* m_begin { nullptr };
88     const CharacterType* m_end { nullptr };
89 };
90
91 template<>
92 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
93 {
94     ASSERT(!atEnd());
95     return *m_begin;
96 }
97
98 template<>
99 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
100 {
101     ASSERT(!atEnd());
102     m_begin++;
103     return *this;
104 }
105
106 template<>
107 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
108 {
109     ASSERT(!atEnd());
110     UChar32 c;
111     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
112     return c;
113 }
114
115 template<>
116 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
117 {
118     ASSERT(!atEnd());
119     unsigned i = 0;
120     size_t length = m_end - m_begin;
121     U16_FWD_1(m_begin, i, length);
122     m_begin += i;
123     return *this;
124 }
125     
126 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
127 {
128     if (U_IS_BMP(codePoint)) {
129         destination.append(static_cast<UChar>(codePoint));
130         return;
131     }
132     destination.reserveCapacity(destination.size() + 2);
133     destination.uncheckedAppend(U16_LEAD(codePoint));
134     destination.uncheckedAppend(U16_TRAIL(codePoint));
135 }
136
// Bit flags stored in characterClassTable; each flag marks membership in one character class.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};

// Character class flags for every 8-bit value, indexed by that value.
// Runs of identical entries are written several per line; the trailing comment names the range.
static const uint8_t characterClassTable[256] = {
    // 0x00 - 0x1F
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1, 0x2
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x3, 0x4
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x5, 0x6
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x7, 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0xB, 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0xE, 0xF
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x10, 0x11
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x12, 0x13
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x14, 0x15
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x16, 0x17
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x18, 0x19
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1A, 0x1B
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1C, 0x1D
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1E, 0x1F

    // 0x20 - 0x2F
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, 0, 0, 0, 0, // '&', ''', '(', ')', '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, ValidScheme, // '-', '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'

    // 0x30 - 0x3F
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '0' - '4'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '5' - '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'

    // 0x40 - 0x5F
    UserInfo | InvalidDomain, // '@'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'A' - 'H'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'I' - 'P'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'Q' - 'X'
    ValidScheme, ValidScheme, // 'Y', 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'

    // 0x60 - 0x7F
    UserInfo | Default, // '`'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'a' - 'h'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'i' - 'p'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'q' - 'x'
    ValidScheme, ValidScheme, // 'y', 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F

    // 0x80 - 0xFF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x80 - 0x87
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x88 - 0x8F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x90 - 0x97
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x98 - 0x9F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA0 - 0xA7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA8 - 0xAF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB0 - 0xB7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB8 - 0xBF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC0 - 0xC7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC8 - 0xCF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD0 - 0xD7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD8 - 0xDF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE0 - 0xE7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE8 - 0xEF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF0 - 0xF7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF8 - 0xFF
};
404
405 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
406 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
407 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
408 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
409 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
410 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
411 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
412 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
413 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
415 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
416
417 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
418 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
419 {
420     ++iterator;
421     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
422         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
423             syntaxViolation(iteratorForSyntaxViolationPosition);
424         ++iterator;
425     }
426 }
427
428 template<typename CharacterType>
429 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
430 {
431     if (iterator.atEnd())
432         return false;
433     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
434     if (iterator.atEnd())
435         return false;
436     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
437     return iterator.atEnd();
438 }
439
440 template<typename CharacterType>
441 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
442 {
443     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
444         return false;
445     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446     if (iterator.atEnd())
447         return false;
448     if (*iterator == ':')
449         return true;
450     if (UNLIKELY(*iterator == '|'))
451         return true;
452     return false;
453 }
454
455 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
456 {
457     ASSERT(m_unicodeFragmentBuffer.isEmpty());
458     ASSERT(isASCII(codePoint));
459     if (UNLIKELY(m_didSeeSyntaxViolation))
460         m_asciiBuffer.append(codePoint);
461 }
462
463 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
464 {
465     ASSERT(m_unicodeFragmentBuffer.isEmpty());
466     if (UNLIKELY(m_didSeeSyntaxViolation))
467         m_asciiBuffer.append(characters, length);
468 }
469
470 template<typename CharacterType>
471 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
472 {
473     ASSERT(isWindowsDriveLetter(iterator));
474     appendToASCIIBuffer(*iterator);
475     advance(iterator);
476     ASSERT(!iterator.atEnd());
477     ASSERT(*iterator == ':' || *iterator == '|');
478     if (*iterator == '|')
479         syntaxViolation(iterator);
480     appendToASCIIBuffer(':');
481     advance(iterator);
482 }
483
484 template<typename CharacterType>
485 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
486 {
487     if (!isWindowsDriveLetter(iterator))
488         return true;
489     if (iterator.atEnd())
490         return false;
491     advance(iterator);
492     if (iterator.atEnd())
493         return true;
494     advance(iterator);
495     if (iterator.atEnd())
496         return true;
497     return !isSlashQuestionOrHash(*iterator);
498 }
499
500 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
501 {
502     buffer.append('%');
503     buffer.append(upperNibbleToASCIIHexDigit(byte));
504     buffer.append(lowerNibbleToASCIIHexDigit(byte));
505 }
506
507 void URLParser::percentEncodeByte(uint8_t byte)
508 {
509     ASSERT(m_didSeeSyntaxViolation);
510     appendToASCIIBuffer('%');
511     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
512     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
513 }
514
// Percent-encoded form of the bytes EF BF BD, which is U+FFFD REPLACEMENT CHARACTER in UTF-8.
const char replacementCharacterUTF8PercentEncoded[] = "%EF%BF%BD";
// Length of the string above, not counting the terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
517
518 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
519 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
520 {
521     ASSERT(!iterator.atEnd());
522     UChar32 codePoint = *iterator;
523     if (LIKELY(isASCII(codePoint))) {
524         if (UNLIKELY(isInCodeSet(codePoint))) {
525             syntaxViolation(iterator);
526             percentEncodeByte(codePoint);
527         } else
528             appendToASCIIBuffer(codePoint);
529         return;
530     }
531     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
532     syntaxViolation(iterator);
533     
534     if (!U_IS_UNICODE_CHAR(codePoint)) {
535         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
536         return;
537     }
538     
539     uint8_t buffer[U8_MAX_LENGTH];
540     int32_t offset = 0;
541     U8_APPEND_UNSAFE(buffer, offset, codePoint);
542     for (int32_t i = 0; i < offset; ++i)
543         percentEncodeByte(buffer[i]);
544 }
545
546 template<typename CharacterType>
547 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
548 {
549     ASSERT(!iterator.atEnd());
550     UChar32 codePoint = *iterator;
551     if (LIKELY(isASCII(codePoint))) {
552         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
553             syntaxViolation(iterator);
554             percentEncodeByte(codePoint);
555         } else
556             appendToASCIIBuffer(codePoint);
557         return;
558     }
559     
560     syntaxViolation(iterator);
561     
562     if (!U_IS_UNICODE_CHAR(codePoint)) {
563         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
564         return;
565     }
566
567     uint8_t buffer[U8_MAX_LENGTH];
568     int32_t offset = 0;
569     U8_APPEND_UNSAFE(buffer, offset, codePoint);
570     for (int32_t i = 0; i < offset; ++i) {
571         auto byte = buffer[i];
572         if (shouldPercentEncodeQueryByte(byte))
573             percentEncodeByte(byte);
574         else
575             appendToASCIIBuffer(byte);
576     }
577 }
578
579 template<typename CharacterType>
580 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
581 {
582     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
583     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
584     const char* data = encoded.data();
585     size_t length = encoded.length();
586     
587     if (!length == !iterator.atEnd()) {
588         syntaxViolation(iterator);
589         return;
590     }
591     
592     size_t i = 0;
593     for (; i < length; ++i) {
594         ASSERT(!iterator.atEnd());
595         uint8_t byte = data[i];
596         if (UNLIKELY(byte != *iterator)) {
597             syntaxViolation(iterator);
598             break;
599         }
600         if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
601             syntaxViolation(iterator);
602             break;
603         }
604         appendToASCIIBuffer(byte);
605         ++iterator;
606     }
607     ASSERT((i == length) == iterator.atEnd());
608     for (; i < length; ++i) {
609         ASSERT(m_didSeeSyntaxViolation);
610         uint8_t byte = data[i];
611         if (shouldPercentEncodeQueryByte(byte))
612             percentEncodeByte(byte);
613         else
614             appendToASCIIBuffer(byte);
615     }
616 }
617
618 ALWAYS_INLINE static bool isDefaultPort(StringView scheme, uint16_t port)
619 {
620     static const uint16_t ftpPort = 21;
621     static const uint16_t gopherPort = 70;
622     static const uint16_t httpPort = 80;
623     static const uint16_t httpsPort = 443;
624     static const uint16_t wsPort = 80;
625     static const uint16_t wssPort = 443;
626     
627     auto length = scheme.length();
628     if (!length)
629         return false;
630     switch (scheme[0]) {
631     case 'w':
632         switch (length) {
633         case 2:
634             return scheme[1] == 's'
635                 && port == wsPort;
636         case 3:
637             return scheme[1] == 's'
638                 && scheme[2] == 's'
639                 && port == wssPort;
640         default:
641             return false;
642         }
643     case 'h':
644         switch (length) {
645         case 4:
646             return scheme[1] == 't'
647                 && scheme[2] == 't'
648                 && scheme[3] == 'p'
649                 && port == httpPort;
650         case 5:
651             return scheme[1] == 't'
652                 && scheme[2] == 't'
653                 && scheme[3] == 'p'
654                 && scheme[4] == 's'
655                 && port == httpsPort;
656         default:
657             return false;
658         }
659     case 'g':
660         return length == 6
661             && scheme[1] == 'o'
662             && scheme[2] == 'p'
663             && scheme[3] == 'h'
664             && scheme[4] == 'e'
665             && scheme[5] == 'r'
666             && port == gopherPort;
667     case 'f':
668         return length == 3
669             && scheme[1] == 't'
670             && scheme[2] == 'p'
671             && port == ftpPort;
672         return false;
673     default:
674         return false;
675     }
676 }
677
// The schemes told apart by scheme(). Everything that is not one of the named
// schemes is NonSpecial.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    Gopher, // "gopher"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial,
};
688
689 ALWAYS_INLINE bool isSpecial(Scheme scheme)
690 {
691     switch (scheme) {
692     case Scheme::WS:
693     case Scheme::WSS:
694     case Scheme::File:
695     case Scheme::FTP:
696     case Scheme::Gopher:
697     case Scheme::HTTP:
698     case Scheme::HTTPS:
699         return true;
700     case Scheme::NonSpecial:
701         return false;
702     }
703     ASSERT_NOT_REACHED();
704     return false;
705 }
706
707 ALWAYS_INLINE static Scheme scheme(StringView scheme)
708 {
709     auto length = scheme.length();
710     if (!length)
711         return Scheme::NonSpecial;
712     switch (scheme[0]) {
713     case 'f':
714         switch (length) {
715         case 3:
716             if (scheme[1] == 't'
717                 && scheme[2] == 'p')
718                 return Scheme::FTP;
719             return Scheme::NonSpecial;
720         case 4:
721             if (scheme[1] == 'i'
722                 && scheme[2] == 'l'
723                 && scheme[3] == 'e')
724                 return Scheme::File;
725             return Scheme::NonSpecial;
726         default:
727             return Scheme::NonSpecial;
728         }
729     case 'g':
730         if (length == 6
731             && scheme[1] == 'o'
732             && scheme[2] == 'p'
733             && scheme[3] == 'h'
734             && scheme[4] == 'e'
735             && scheme[5] == 'r')
736             return Scheme::Gopher;
737         return Scheme::NonSpecial;
738     case 'h':
739         switch (length) {
740         case 4:
741             if (scheme[1] == 't'
742                 && scheme[2] == 't'
743                 && scheme[3] == 'p')
744                 return Scheme::HTTP;
745             return Scheme::NonSpecial;
746         case 5:
747             if (scheme[1] == 't'
748                 && scheme[2] == 't'
749                 && scheme[3] == 'p'
750                 && scheme[4] == 's')
751                 return Scheme::HTTPS;
752             return Scheme::NonSpecial;
753         default:
754             return Scheme::NonSpecial;
755         }
756     case 'w':
757         switch (length) {
758         case 2:
759             if (scheme[1] == 's')
760                 return Scheme::WS;
761             return Scheme::NonSpecial;
762         case 3:
763             if (scheme[1] == 's'
764                 && scheme[2] == 's')
765                 return Scheme::WSS;
766             return Scheme::NonSpecial;
767         default:
768             return Scheme::NonSpecial;
769         }
770     default:
771         return Scheme::NonSpecial;
772     }
773 }
774
775 enum class URLParser::URLPart {
776     SchemeEnd,
777     UserStart,
778     UserEnd,
779     PasswordEnd,
780     HostEnd,
781     PortEnd,
782     PathAfterLastSlash,
783     PathEnd,
784     QueryEnd,
785     FragmentEnd,
786 };
787
788 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
789 {
790     switch (part) {
791     case URLPart::FragmentEnd:
792         return url.m_fragmentEnd;
793     case URLPart::QueryEnd:
794         return url.m_queryEnd;
795     case URLPart::PathEnd:
796         return url.m_pathEnd;
797     case URLPart::PathAfterLastSlash:
798         return url.m_pathAfterLastSlash;
799     case URLPart::PortEnd:
800         return url.m_portEnd;
801     case URLPart::HostEnd:
802         return url.m_hostEnd;
803     case URLPart::PasswordEnd:
804         return url.m_passwordEnd;
805     case URLPart::UserEnd:
806         return url.m_userEnd;
807     case URLPart::UserStart:
808         return url.m_userStart;
809     case URLPart::SchemeEnd:
810         return url.m_schemeEnd;
811     }
812     ASSERT_NOT_REACHED();
813     return 0;
814 }
815
816 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
817 {
818     RELEASE_ASSERT(length <= string.length());
819     if (string.isNull())
820         return;
821     ASSERT(m_asciiBuffer.isEmpty());
822     if (string.is8Bit()) {
823         appendToASCIIBuffer(string.characters8(), length);
824     } else {
825         const UChar* characters = string.characters16();
826         for (size_t i = 0; i < length; ++i) {
827             UChar c = characters[i];
828             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
829             appendToASCIIBuffer(c);
830         }
831     }
832 }
833
834 template<typename CharacterType>
835 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator)
836 {
837     syntaxViolation(iterator);
838
839     m_asciiBuffer.clear();
840     m_unicodeFragmentBuffer.clear();
841     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
842     switch (part) {
843     case URLPart::FragmentEnd:
844         RELEASE_ASSERT_NOT_REACHED();
845     case URLPart::QueryEnd:
846         m_url.m_queryEnd = base.m_queryEnd;
847         FALLTHROUGH;
848     case URLPart::PathEnd:
849         m_url.m_pathEnd = base.m_pathEnd;
850         FALLTHROUGH;
851     case URLPart::PathAfterLastSlash:
852         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
853         FALLTHROUGH;
854     case URLPart::PortEnd:
855         m_url.m_portEnd = base.m_portEnd;
856         FALLTHROUGH;
857     case URLPart::HostEnd:
858         m_url.m_hostEnd = base.m_hostEnd;
859         FALLTHROUGH;
860     case URLPart::PasswordEnd:
861         m_url.m_passwordEnd = base.m_passwordEnd;
862         FALLTHROUGH;
863     case URLPart::UserEnd:
864         m_url.m_userEnd = base.m_userEnd;
865         FALLTHROUGH;
866     case URLPart::UserStart:
867         m_url.m_userStart = base.m_userStart;
868         FALLTHROUGH;
869     case URLPart::SchemeEnd:
870         m_url.m_isValid = base.m_isValid;
871         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
872         m_url.m_schemeEnd = base.m_schemeEnd;
873     }
874     m_urlIsSpecial = isSpecial(scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd)));
875 }
876
// The two hex digits of '.' (0x2E) as they appear after '%' in a percent-encoded dot.
// Declared as a const array so the global cannot be re-pointed at other data.
static const char dotASCIICode[] = "2e";
878
879 template<typename CharacterType>
880 ALWAYS_INLINE bool URLParser::isPercentEncodedDot(CodePointIterator<CharacterType> c)
881 {
882     if (c.atEnd())
883         return false;
884     if (*c != '%')
885         return false;
886     advance<CharacterType, ReportSyntaxViolation::No>(c);
887     if (c.atEnd())
888         return false;
889     if (*c != dotASCIICode[0])
890         return false;
891     advance<CharacterType, ReportSyntaxViolation::No>(c);
892     if (c.atEnd())
893         return false;
894     return toASCIILower(*c) == dotASCIICode[1];
895 }
896
897 template<typename CharacterType>
898 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
899 {
900     if (c.atEnd())
901         return false;
902     if (*c == '.') {
903         advance<CharacterType, ReportSyntaxViolation::No>(c);
904         return c.atEnd() || isSlashQuestionOrHash(*c);
905     }
906     if (*c != '%')
907         return false;
908     advance<CharacterType, ReportSyntaxViolation::No>(c);
909     if (c.atEnd() || *c != dotASCIICode[0])
910         return false;
911     advance<CharacterType, ReportSyntaxViolation::No>(c);
912     if (c.atEnd())
913         return false;
914     if (toASCIILower(*c) == dotASCIICode[1]) {
915         advance<CharacterType, ReportSyntaxViolation::No>(c);
916         return c.atEnd() || isSlashQuestionOrHash(*c);
917     }
918     return false;
919 }
920
921 template<typename CharacterType>
922 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
923 {
924     if (c.atEnd())
925         return false;
926     if (*c == '.') {
927         advance<CharacterType, ReportSyntaxViolation::No>(c);
928         return isSingleDotPathSegment(c);
929     }
930     if (*c != '%')
931         return false;
932     advance<CharacterType, ReportSyntaxViolation::No>(c);
933     if (c.atEnd() || *c != dotASCIICode[0])
934         return false;
935     advance<CharacterType, ReportSyntaxViolation::No>(c);
936     if (c.atEnd())
937         return false;
938     if (toASCIILower(*c) == dotASCIICode[1]) {
939         advance<CharacterType, ReportSyntaxViolation::No>(c);
940         return isSingleDotPathSegment(c);
941     }
942     return false;
943 }
944
945 template<typename CharacterType>
946 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
947 {
948     ASSERT(isSingleDotPathSegment(c));
949     if (*c == '.') {
950         advance(c);
951         if (!c.atEnd()) {
952             if (*c == '/' || *c == '\\')
953                 advance(c);
954             else
955                 ASSERT(*c == '?' || *c == '#');
956         }
957     } else {
958         ASSERT(*c == '%');
959         advance(c);
960         ASSERT(*c == dotASCIICode[0]);
961         advance(c);
962         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
963         advance(c);
964         if (!c.atEnd()) {
965             if (*c == '/' || *c == '\\')
966                 advance(c);
967             else
968                 ASSERT(*c == '?' || *c == '#');
969         }
970     }
971 }
972
973 template<typename CharacterType>
974 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
975 {
976     ASSERT(isDoubleDotPathSegment(c));
977     if (*c == '.')
978         advance(c);
979     else {
980         ASSERT(*c == '%');
981         advance(c);
982         ASSERT(*c == dotASCIICode[0]);
983         advance(c);
984         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
985         advance(c);
986     }
987     consumeSingleDotPathSegment(c);
988 }
989
990 void URLParser::popPath()
991 {
992     ASSERT(m_didSeeSyntaxViolation);
993     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
994         m_url.m_pathAfterLastSlash--;
995         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
996             m_url.m_pathAfterLastSlash--;
997         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
998             m_url.m_pathAfterLastSlash--;
999         m_url.m_pathAfterLastSlash++;
1000     }
1001     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1002 }
1003
1004 template<typename CharacterType>
1005 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1006 {
1007     if (m_didSeeSyntaxViolation)
1008         return;
1009     m_didSeeSyntaxViolation = true;
1010     
1011     ASSERT(m_asciiBuffer.isEmpty());
1012     ASSERT(m_unicodeFragmentBuffer.isEmpty());
1013     ASSERT_WITH_MESSAGE(!m_url.m_queryEnd, "syntaxViolation should not be used in the fragment, which might contain non-ASCII code points when serialized");
1014     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1015     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1016     m_asciiBuffer.reserveCapacity(m_inputString.length());
1017     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1018         ASSERT(isASCII(m_inputString[i]));
1019         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1020     }
1021 }
1022
1023 template<typename CharacterType>
1024 void URLParser::fragmentSyntaxViolation(const CodePointIterator<CharacterType>& iterator)
1025 {
1026     if (m_didSeeSyntaxViolation)
1027         return;
1028     m_didSeeSyntaxViolation = true;
1029     m_didSeeUnicodeFragmentCodePoint = true;
1030
1031     ASSERT(m_asciiBuffer.isEmpty());
1032     ASSERT(m_unicodeFragmentBuffer.isEmpty());
1033     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1034     size_t asciiCodeUnitsToCopy = m_url.m_queryEnd;
1035     size_t unicodeCodeUnitsToCopy = codeUnitsToCopy - asciiCodeUnitsToCopy;
1036     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1037     m_asciiBuffer.reserveCapacity(asciiCodeUnitsToCopy);
1038     for (size_t i = 0; i < asciiCodeUnitsToCopy; ++i) {
1039         ASSERT(isASCII(m_inputString[i]));
1040         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1041     }
1042     m_unicodeFragmentBuffer.reserveCapacity(m_inputString.length() - asciiCodeUnitsToCopy);
1043     for (size_t i = asciiCodeUnitsToCopy; i < asciiCodeUnitsToCopy + unicodeCodeUnitsToCopy; ++i)
1044         m_unicodeFragmentBuffer.uncheckedAppend(m_inputString[i]);
1045 }
1046
1047 void URLParser::failure()
1048 {
1049     m_url.invalidate();
1050     m_url.m_string = m_inputString;
1051 }
1052
1053 template<typename CharacterType>
1054 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1055 {
1056     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1057         return false;
1058     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1059     return true;
1060 }
1061
1062 template<typename CharacterType>
1063 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1064 {
1065     if (!checkLocalhostCodePoint(iterator, 'l'))
1066         return false;
1067     if (!checkLocalhostCodePoint(iterator, 'o'))
1068         return false;
1069     if (!checkLocalhostCodePoint(iterator, 'c'))
1070         return false;
1071     if (!checkLocalhostCodePoint(iterator, 'a'))
1072         return false;
1073     if (!checkLocalhostCodePoint(iterator, 'l'))
1074         return false;
1075     if (!checkLocalhostCodePoint(iterator, 'h'))
1076         return false;
1077     if (!checkLocalhostCodePoint(iterator, 'o'))
1078         return false;
1079     if (!checkLocalhostCodePoint(iterator, 's'))
1080         return false;
1081     if (!checkLocalhostCodePoint(iterator, 't'))
1082         return false;
1083     return iterator.atEnd();
1084 }
1085
1086 bool URLParser::isLocalhost(StringView view)
1087 {
1088     if (view.is8Bit())
1089         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1090     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1091 }
1092
1093 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1094 {
1095     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1096         ASSERT(start + length <= m_asciiBuffer.size());
1097         return StringView(m_asciiBuffer.data() + start, length);
1098     }
1099     ASSERT(start + length <= m_inputString.length());
1100     return StringView(m_inputString).substring(start, length);
1101 }
1102
1103 template<typename CharacterType>
1104 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1105 {
1106     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1107         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1108         return m_asciiBuffer.size();
1109     }
1110     
1111     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1112 }
1113
1114 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1115     : m_inputString(input)
1116 {
1117     if (input.isNull()) {
1118         if (base.isValid() && !base.m_cannotBeABaseURL) {
1119             m_url = base;
1120             m_url.removeFragmentIdentifier();
1121         }
1122         return;
1123     }
1124
1125     if (input.is8Bit()) {
1126         m_inputBegin = input.characters8();
1127         parse(input.characters8(), input.length(), base, encoding);
1128     } else {
1129         m_inputBegin = input.characters16();
1130         parse(input.characters16(), input.length(), base, encoding);
1131     }
1132
1133     ASSERT(!m_url.m_isValid
1134         || m_didSeeSyntaxViolation == (m_url.string() != input)
1135         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1136             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1137     ASSERT(internalValuesConsistent(m_url));
1138 #if !ASSERT_DISABLED
1139     if (!m_didSeeSyntaxViolation) {
1140         // Force a syntax violation at the beginning to make sure we get the same result.
1141         URLParser parser(makeString(" ", input), base, encoding);
1142         URL parsed = parser.result();
1143         if (parsed.isValid())
1144             ASSERT(allValuesEqual(parser.result(), m_url));
1145     }
1146 #endif
1147 }
1148
1149 template<typename CharacterType>
1150 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1151 {
1152     LOG(URLParser, "Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1153     m_url = { };
1154     ASSERT(m_asciiBuffer.isEmpty());
1155     ASSERT(m_unicodeFragmentBuffer.isEmpty());
1156     
1157     bool isUTF8Encoding = encoding == UTF8Encoding();
1158     Vector<UChar> queryBuffer;
1159
1160     unsigned endIndex = length;
1161     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1162         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1163         endIndex--;
1164     }
1165     CodePointIterator<CharacterType> c(input, input + endIndex);
1166     CodePointIterator<CharacterType> authorityOrHostBegin;
1167     CodePointIterator<CharacterType> queryBegin;
1168     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1169         syntaxViolation(c);
1170         ++c;
1171     }
1172     auto beginAfterControlAndSpace = c;
1173
1174     enum class State : uint8_t {
1175         SchemeStart,
1176         Scheme,
1177         NoScheme,
1178         SpecialRelativeOrAuthority,
1179         PathOrAuthority,
1180         Relative,
1181         RelativeSlash,
1182         SpecialAuthoritySlashes,
1183         SpecialAuthorityIgnoreSlashes,
1184         AuthorityOrHost,
1185         Host,
1186         File,
1187         FileSlash,
1188         FileHost,
1189         PathStart,
1190         Path,
1191         CannotBeABaseURLPath,
1192         UTF8Query,
1193         NonUTF8Query,
1194         Fragment,
1195     };
1196
1197 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1198 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
1199
1200     State state = State::SchemeStart;
1201     while (!c.atEnd()) {
1202         if (UNLIKELY(isTabOrNewline(*c))) {
1203             syntaxViolation(c);
1204             ++c;
1205             continue;
1206         }
1207
1208         switch (state) {
1209         case State::SchemeStart:
1210             LOG_STATE("SchemeStart");
1211             if (isASCIIAlpha(*c)) {
1212                 if (UNLIKELY(isASCIIUpper(*c)))
1213                     syntaxViolation(c);
1214                 appendToASCIIBuffer(toASCIILower(*c));
1215                 advance(c);
1216                 if (c.atEnd()) {
1217                     m_asciiBuffer.clear();
1218                     state = State::NoScheme;
1219                     c = beginAfterControlAndSpace;
1220                 }
1221                 state = State::Scheme;
1222             } else
1223                 state = State::NoScheme;
1224             break;
1225         case State::Scheme:
1226             LOG_STATE("Scheme");
1227             if (isValidSchemeCharacter(*c)) {
1228                 if (UNLIKELY(isASCIIUpper(*c)))
1229                     syntaxViolation(c);
1230                 appendToASCIIBuffer(toASCIILower(*c));
1231             } else if (*c == ':') {
1232                 m_url.m_schemeEnd = currentPosition(c);
1233                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1234                 appendToASCIIBuffer(':');
1235                 switch (scheme(urlScheme)) {
1236                 case Scheme::File:
1237                     m_urlIsSpecial = true;
1238                     state = State::File;
1239                     ++c;
1240                     break;
1241                 case Scheme::WS:
1242                 case Scheme::WSS:
1243                     isUTF8Encoding = true;
1244                     m_urlIsSpecial = true;
1245                     if (base.protocolIs(urlScheme))
1246                         state = State::SpecialRelativeOrAuthority;
1247                     else
1248                         state = State::SpecialAuthoritySlashes;
1249                     ++c;
1250                     break;
1251                 case Scheme::HTTP:
1252                 case Scheme::HTTPS:
1253                     m_url.m_protocolIsInHTTPFamily = true;
1254                     FALLTHROUGH;
1255                 case Scheme::FTP:
1256                 case Scheme::Gopher:
1257                     m_urlIsSpecial = true;
1258                     if (base.protocolIs(urlScheme))
1259                         state = State::SpecialRelativeOrAuthority;
1260                     else
1261                         state = State::SpecialAuthoritySlashes;
1262                     ++c;
1263                     break;
1264                 case Scheme::NonSpecial:
1265                     isUTF8Encoding = true;
1266                     auto maybeSlash = c;
1267                     advance(maybeSlash);
1268                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1269                         appendToASCIIBuffer('/');
1270                         c = maybeSlash;
1271                         state = State::PathOrAuthority;
1272                         ASSERT(*c == '/');
1273                         ++c;
1274                         m_url.m_userStart = currentPosition(c);
1275                     } else {
1276                         ++c;
1277                         m_url.m_userStart = currentPosition(c);
1278                         m_url.m_userEnd = m_url.m_userStart;
1279                         m_url.m_passwordEnd = m_url.m_userStart;
1280                         m_url.m_hostEnd = m_url.m_userStart;
1281                         m_url.m_portEnd = m_url.m_userStart;
1282                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1283                         m_url.m_cannotBeABaseURL = true;
1284                         state = State::CannotBeABaseURLPath;
1285                     }
1286                     break;
1287                 }
1288                 break;
1289             } else {
1290                 m_asciiBuffer.clear();
1291                 state = State::NoScheme;
1292                 c = beginAfterControlAndSpace;
1293                 break;
1294             }
1295             advance(c);
1296             if (c.atEnd()) {
1297                 m_asciiBuffer.clear();
1298                 state = State::NoScheme;
1299                 c = beginAfterControlAndSpace;
1300             }
1301             break;
1302         case State::NoScheme:
1303             LOG_STATE("NoScheme");
1304             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1305                 failure();
1306                 return;
1307             }
1308             if (base.m_cannotBeABaseURL && *c == '#') {
1309                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
1310                 state = State::Fragment;
1311                 appendToASCIIBuffer('#');
1312                 ++c;
1313                 break;
1314             }
1315             if (!base.protocolIs("file")) {
1316                 state = State::Relative;
1317                 break;
1318             }
1319             copyURLPartsUntil(base, URLPart::SchemeEnd, c);
1320             appendToASCIIBuffer(':');
1321             state = State::File;
1322             break;
1323         case State::SpecialRelativeOrAuthority:
1324             LOG_STATE("SpecialRelativeOrAuthority");
1325             if (*c == '/') {
1326                 appendToASCIIBuffer('/');
1327                 advance(c);
1328                 if (c.atEnd()) {
1329                     failure();
1330                     return;
1331                 }
1332                 if (*c == '/') {
1333                     appendToASCIIBuffer('/');
1334                     state = State::SpecialAuthorityIgnoreSlashes;
1335                     ++c;
1336                 } else
1337                     state = State::RelativeSlash;
1338             } else
1339                 state = State::Relative;
1340             break;
1341         case State::PathOrAuthority:
1342             LOG_STATE("PathOrAuthority");
1343             if (*c == '/') {
1344                 appendToASCIIBuffer('/');
1345                 state = State::AuthorityOrHost;
1346                 advance(c);
1347                 m_url.m_userStart = currentPosition(c);
1348                 authorityOrHostBegin = c;
1349             } else {
1350                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1351                 m_url.m_userStart = currentPosition(c) - 1;
1352                 m_url.m_userEnd = m_url.m_userStart;
1353                 m_url.m_passwordEnd = m_url.m_userStart;
1354                 m_url.m_hostEnd = m_url.m_userStart;
1355                 m_url.m_portEnd = m_url.m_userStart;
1356                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1357                 state = State::Path;
1358             }
1359             break;
1360         case State::Relative:
1361             LOG_STATE("Relative");
1362             switch (*c) {
1363             case '/':
1364             case '\\':
1365                 state = State::RelativeSlash;
1366                 ++c;
1367                 break;
1368             case '?':
1369                 copyURLPartsUntil(base, URLPart::PathEnd, c);
1370                 appendToASCIIBuffer('?');
1371                 ++c;
1372                 if (isUTF8Encoding)
1373                     state = State::UTF8Query;
1374                 else {
1375                     queryBegin = c;
1376                     state = State::NonUTF8Query;
1377                 }
1378                 break;
1379             case '#':
1380                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
1381                 appendToASCIIBuffer('#');
1382                 state = State::Fragment;
1383                 ++c;
1384                 break;
1385             default:
1386                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c);
1387                 state = State::Path;
1388                 break;
1389             }
1390             break;
1391         case State::RelativeSlash:
1392             LOG_STATE("RelativeSlash");
1393             if (*c == '/' || *c == '\\') {
1394                 ++c;
1395                 copyURLPartsUntil(base, URLPart::SchemeEnd, c);
1396                 appendToASCIIBuffer("://", 3);
1397                 state = State::SpecialAuthorityIgnoreSlashes;
1398             } else {
1399                 copyURLPartsUntil(base, URLPart::PortEnd, c);
1400                 appendToASCIIBuffer('/');
1401                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1402                 state = State::Path;
1403             }
1404             break;
1405         case State::SpecialAuthoritySlashes:
1406             LOG_STATE("SpecialAuthoritySlashes");
1407             if (LIKELY(*c == '/' || *c == '\\')) {
1408                 if (UNLIKELY(*c == '\\'))
1409                     syntaxViolation(c);
1410                 appendToASCIIBuffer('/');
1411                 advance(c);
1412                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1413                     if (UNLIKELY(*c == '\\'))
1414                         syntaxViolation(c);
1415                     ++c;
1416                     appendToASCIIBuffer('/');
1417                 } else {
1418                     syntaxViolation(c);
1419                     appendToASCIIBuffer('/');
1420                 }
1421             } else {
1422                 syntaxViolation(c);
1423                 appendToASCIIBuffer("//", 2);
1424             }
1425             state = State::SpecialAuthorityIgnoreSlashes;
1426             break;
1427         case State::SpecialAuthorityIgnoreSlashes:
1428             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1429             if (*c == '/' || *c == '\\') {
1430                 syntaxViolation(c);
1431                 ++c;
1432             } else {
1433                 m_url.m_userStart = currentPosition(c);
1434                 state = State::AuthorityOrHost;
1435                 authorityOrHostBegin = c;
1436             }
1437             break;
1438         case State::AuthorityOrHost:
1439             do {
1440                 LOG_STATE("AuthorityOrHost");
1441                 if (*c == '@') {
1442                     auto lastAt = c;
1443                     auto findLastAt = c;
1444                     while (!findLastAt.atEnd()) {
1445                         LOG(URLParser, "Finding last @: %c", *findLastAt);
1446                         if (*findLastAt == '@')
1447                             lastAt = findLastAt;
1448                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1449                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1450                             break;
1451                         ++findLastAt;
1452                     }
1453                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1454                     c = lastAt;
1455                     advance(c);
1456                     authorityOrHostBegin = c;
1457                     state = State::Host;
1458                     m_hostHasPercentOrNonASCII = false;
1459                     break;
1460                 }
1461                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1462                 if (isSlash || *c == '?' || *c == '#') {
1463                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1464                     if (iterator.atEnd()) {
1465                         size_t position = currentPosition(c);
1466                         ASSERT(m_url.m_userStart == position);
1467                         RELEASE_ASSERT(position >= 2);
1468                         position -= 2;
1469                         ASSERT(parsedDataView(position, 2) == "//");
1470                         m_url.m_userStart = position;
1471                         m_url.m_userEnd = position;
1472                         m_url.m_passwordEnd = position;
1473                         m_url.m_hostEnd = position;
1474                         m_url.m_portEnd = position;
1475                         m_url.m_pathAfterLastSlash = position + 2;
1476                     } else {
1477                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1478                         m_url.m_passwordEnd = m_url.m_userEnd;
1479                         if (!parseHostAndPort(iterator)) {
1480                             failure();
1481                             return;
1482                         }
1483                         if (UNLIKELY(!isSlash)) {
1484                             syntaxViolation(c);
1485                             appendToASCIIBuffer('/');
1486                             m_url.m_pathAfterLastSlash = currentPosition(c);
1487                         }
1488                     }
1489                     state = State::Path;
1490                     break;
1491                 }
1492                 if (isPercentOrNonASCII(*c))
1493                     m_hostHasPercentOrNonASCII = true;
1494                 ++c;
1495             } while (!c.atEnd());
1496             break;
1497         case State::Host:
1498             do {
1499                 LOG_STATE("Host");
1500                 if (*c == '/' || *c == '?' || *c == '#') {
1501                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1502                         failure();
1503                         return;
1504                     }
1505                     if (*c == '?' || *c == '#') {
1506                         syntaxViolation(c);
1507                         appendToASCIIBuffer('/');
1508                         m_url.m_pathAfterLastSlash = currentPosition(c);
1509                     }
1510                     state = State::Path;
1511                     break;
1512                 }
1513                 if (isPercentOrNonASCII(*c))
1514                     m_hostHasPercentOrNonASCII = true;
1515                 ++c;
1516             } while (!c.atEnd());
1517             break;
1518         case State::File:
1519             LOG_STATE("File");
1520             switch (*c) {
1521             case '\\':
1522                 syntaxViolation(c);
1523                 FALLTHROUGH;
1524             case '/':
1525                 appendToASCIIBuffer('/');
1526                 state = State::FileSlash;
1527                 ++c;
1528                 break;
1529             case '?':
1530                 syntaxViolation(c);
1531                 if (base.isValid() && base.protocolIs("file")) {
1532                     copyURLPartsUntil(base, URLPart::PathEnd, c);
1533                     appendToASCIIBuffer('?');
1534                     ++c;
1535                 } else {
1536                     appendToASCIIBuffer("///?", 4);
1537                     ++c;
1538                     m_url.m_userStart = currentPosition(c) - 2;
1539                     m_url.m_userEnd = m_url.m_userStart;
1540                     m_url.m_passwordEnd = m_url.m_userStart;
1541                     m_url.m_hostEnd = m_url.m_userStart;
1542                     m_url.m_portEnd = m_url.m_userStart;
1543                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1544                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1545                 }
1546                 if (isUTF8Encoding)
1547                     state = State::UTF8Query;
1548                 else {
1549                     queryBegin = c;
1550                     state = State::NonUTF8Query;
1551                 }
1552                 break;
1553             case '#':
1554                 syntaxViolation(c);
1555                 if (base.isValid() && base.protocolIs("file")) {
1556                     copyURLPartsUntil(base, URLPart::QueryEnd, c);
1557                     appendToASCIIBuffer('#');
1558                 } else {
1559                     appendToASCIIBuffer("///#", 4);
1560                     m_url.m_userStart = currentPosition(c) - 2;
1561                     m_url.m_userEnd = m_url.m_userStart;
1562                     m_url.m_passwordEnd = m_url.m_userStart;
1563                     m_url.m_hostEnd = m_url.m_userStart;
1564                     m_url.m_portEnd = m_url.m_userStart;
1565                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1566                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1567                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1568                 }
1569                 state = State::Fragment;
1570                 ++c;
1571                 break;
1572             default:
1573                 syntaxViolation(c);
1574                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1575                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c);
1576                 else {
1577                     appendToASCIIBuffer("///", 3);
1578                     m_url.m_userStart = currentPosition(c) - 1;
1579                     m_url.m_userEnd = m_url.m_userStart;
1580                     m_url.m_passwordEnd = m_url.m_userStart;
1581                     m_url.m_hostEnd = m_url.m_userStart;
1582                     m_url.m_portEnd = m_url.m_userStart;
1583                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1584                     if (isWindowsDriveLetter(c))
1585                         appendWindowsDriveLetter(c);
1586                 }
1587                 state = State::Path;
1588                 break;
1589             }
1590             break;
1591         case State::FileSlash:
1592             LOG_STATE("FileSlash");
1593             if (LIKELY(*c == '/' || *c == '\\')) {
1594                 if (UNLIKELY(*c == '\\'))
1595                     syntaxViolation(c);
1596                 appendToASCIIBuffer('/');
1597                 advance(c);
1598                 m_url.m_userStart = currentPosition(c);
1599                 m_url.m_userEnd = m_url.m_userStart;
1600                 m_url.m_passwordEnd = m_url.m_userStart;
1601                 m_url.m_hostEnd = m_url.m_userStart;
1602                 m_url.m_portEnd = m_url.m_userStart;
1603                 authorityOrHostBegin = c;
1604                 state = State::FileHost;
1605                 break;
1606             }
1607             if (base.isValid() && base.protocolIs("file")) {
1608                 // FIXME: This String copy is unnecessary.
1609                 String basePath = base.path();
1610                 if (basePath.length() >= 2) {
1611                     bool windowsQuirk = basePath.is8Bit()
1612                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1613                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1614                     if (windowsQuirk) {
1615                         appendToASCIIBuffer(basePath[0]);
1616                         appendToASCIIBuffer(basePath[1]);
1617                     }
1618                 }
1619             }
1620             syntaxViolation(c);
1621             appendToASCIIBuffer("//", 2);
1622             m_url.m_userStart = currentPosition(c) - 1;
1623             m_url.m_userEnd = m_url.m_userStart;
1624             m_url.m_passwordEnd = m_url.m_userStart;
1625             m_url.m_hostEnd = m_url.m_userStart;
1626             m_url.m_portEnd = m_url.m_userStart;
1627             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1628             if (isWindowsDriveLetter(c))
1629                 appendWindowsDriveLetter(c);
1630             state = State::Path;
1631             break;
1632         case State::FileHost:
1633             do {
1634                 LOG_STATE("FileHost");
1635                 if (isSlashQuestionOrHash(*c)) {
1636                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1637                         && isWindowsDriveLetter(authorityOrHostBegin);
1638                     if (windowsQuirk) {
1639                         syntaxViolation(authorityOrHostBegin);
1640                         appendToASCIIBuffer('/');
1641                         appendWindowsDriveLetter(authorityOrHostBegin);
1642                     }
1643                     if (windowsQuirk || authorityOrHostBegin == c) {
1644                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1645                         if (UNLIKELY(*c == '?')) {
1646                             syntaxViolation(c);
1647                             appendToASCIIBuffer("/?", 2);
1648                             ++c;
1649                             if (isUTF8Encoding)
1650                                 state = State::UTF8Query;
1651                             else {
1652                                 queryBegin = c;
1653                                 state = State::NonUTF8Query;
1654                             }
1655                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1656                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1657                             break;
1658                         }
1659                         if (UNLIKELY(*c == '#')) {
1660                             syntaxViolation(c);
1661                             appendToASCIIBuffer("/#", 2);
1662                             ++c;
1663                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1664                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1665                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1666                             state = State::Fragment;
1667                             break;
1668                         }
1669                         state = State::Path;
1670                         break;
1671                     }
1672                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1673                         failure();
1674                         return;
1675                     }
1676                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1677                         syntaxViolation(c);
1678                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1679                         m_url.m_hostEnd = currentPosition(c);
1680                         m_url.m_portEnd = m_url.m_hostEnd;
1681                     }
1682                     
1683                     state = State::PathStart;
1684                     break;
1685                 }
1686                 if (isPercentOrNonASCII(*c))
1687                     m_hostHasPercentOrNonASCII = true;
1688                 ++c;
1689             } while (!c.atEnd());
1690             break;
1691         case State::PathStart:
1692             LOG_STATE("PathStart");
1693             if (*c != '/' && *c != '\\')
1694                 ++c;
1695             state = State::Path;
1696             break;
1697         case State::Path:
1698             LOG_STATE("Path");
1699             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1700                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1701                     syntaxViolation(c);
1702                 appendToASCIIBuffer('/');
1703                 ++c;
1704                 m_url.m_pathAfterLastSlash = currentPosition(c);
1705                 break;
1706             }
1707             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1708                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1709                     syntaxViolation(c);
1710                     consumeDoubleDotPathSegment(c);
1711                     popPath();
1712                     break;
1713                 }
1714                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1715                     syntaxViolation(c);
1716                     consumeSingleDotPathSegment(c);
1717                     break;
1718                 }
1719             }
1720             if (*c == '?') {
1721                 m_url.m_pathEnd = currentPosition(c);
1722                 appendToASCIIBuffer('?');
1723                 ++c;
1724                 if (isUTF8Encoding)
1725                     state = State::UTF8Query;
1726                 else {
1727                     queryBegin = c;
1728                     state = State::NonUTF8Query;
1729                 }
1730                 break;
1731             }
1732             if (*c == '#') {
1733                 m_url.m_pathEnd = currentPosition(c);
1734                 m_url.m_queryEnd = m_url.m_pathEnd;
1735                 state = State::Fragment;
1736                 break;
1737             }
1738             if (UNLIKELY(isPercentEncodedDot(c))) {
1739                 syntaxViolation(c);
1740                 appendToASCIIBuffer('.');
1741                 ASSERT(*c == '%');
1742                 advance(c);
1743                 ASSERT(*c == dotASCIICode[0]);
1744                 advance(c);
1745                 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1746                 advance(c);
1747                 break;
1748             }
1749             utf8PercentEncode<isInDefaultEncodeSet>(c);
1750             ++c;
1751             break;
1752         case State::CannotBeABaseURLPath:
1753             LOG_STATE("CannotBeABaseURLPath");
1754             if (*c == '?') {
1755                 m_url.m_pathEnd = currentPosition(c);
1756                 appendToASCIIBuffer('?');
1757                 ++c;
1758                 if (isUTF8Encoding)
1759                     state = State::UTF8Query;
1760                 else {
1761                     queryBegin = c;
1762                     state = State::NonUTF8Query;
1763                 }
1764             } else if (*c == '#') {
1765                 m_url.m_pathEnd = currentPosition(c);
1766                 m_url.m_queryEnd = m_url.m_pathEnd;
1767                 state = State::Fragment;
1768             } else if (*c == '/') {
1769                 appendToASCIIBuffer('/');
1770                 ++c;
1771                 m_url.m_pathAfterLastSlash = currentPosition(c);
1772             } else {
1773                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1774                 ++c;
1775             }
1776             break;
1777         case State::UTF8Query:
1778             LOG_STATE("UTF8Query");
1779             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1780             if (*c == '#') {
1781                 m_url.m_queryEnd = currentPosition(c);
1782                 state = State::Fragment;
1783                 break;
1784             }
1785             if (isUTF8Encoding)
1786                 utf8QueryEncode(c);
1787             else
1788                 appendCodePoint(queryBuffer, *c);
1789             ++c;
1790             break;
1791         case State::NonUTF8Query:
1792             do {
1793                 LOG_STATE("NonUTF8Query");
1794                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1795                 if (*c == '#') {
1796                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1797                     m_url.m_queryEnd = currentPosition(c);
1798                     state = State::Fragment;
1799                     break;
1800                 }
1801                 appendCodePoint(queryBuffer, *c);
1802                 advance(c, queryBegin);
1803             } while (!c.atEnd());
1804             break;
1805         case State::Fragment:
1806             do {
1807                 LOG(URLParser, "State Fragment");
1808                 if (!m_didSeeUnicodeFragmentCodePoint && isASCII(*c))
1809                     appendToASCIIBuffer(*c);
1810                 else {
1811                     m_didSeeUnicodeFragmentCodePoint = true;
1812                     if (UNLIKELY(m_didSeeSyntaxViolation))
1813                         appendCodePoint(m_unicodeFragmentBuffer, *c);
1814                     else {
1815                         ASSERT(m_asciiBuffer.isEmpty());
1816                         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1817                     }
1818                 }
1819                 ++c;
1820                 while (UNLIKELY(!c.atEnd() && isTabOrNewline(*c))) {
1821                     fragmentSyntaxViolation(c);
1822                     ++c;
1823                 }
1824             } while (!c.atEnd());
1825             break;
1826         }
1827     }
1828
1829     switch (state) {
1830     case State::SchemeStart:
1831         LOG_FINAL_STATE("SchemeStart");
1832         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1833             m_url = base;
1834             m_url.removeFragmentIdentifier();
1835             return;
1836         }
1837         failure();
1838         return;
1839     case State::Scheme:
1840         LOG_FINAL_STATE("Scheme");
1841         failure();
1842         return;
1843     case State::NoScheme:
1844         LOG_FINAL_STATE("NoScheme");
1845         RELEASE_ASSERT_NOT_REACHED();
1846     case State::SpecialRelativeOrAuthority:
1847         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1848         copyURLPartsUntil(base, URLPart::QueryEnd, c);
1849         m_url.m_fragmentEnd = m_url.m_queryEnd;
1850         break;
1851     case State::PathOrAuthority:
1852         LOG_FINAL_STATE("PathOrAuthority");
1853         ASSERT(m_url.m_userStart);
1854         ASSERT(m_url.m_userStart == currentPosition(c));
1855         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1856         m_url.m_userStart--;
1857         m_url.m_userEnd = m_url.m_userStart;
1858         m_url.m_passwordEnd = m_url.m_userStart;
1859         m_url.m_hostEnd = m_url.m_userStart;
1860         m_url.m_portEnd = m_url.m_userStart;
1861         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1862         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1863         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1864         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1865         break;
1866     case State::Relative:
1867         LOG_FINAL_STATE("Relative");
1868         RELEASE_ASSERT_NOT_REACHED();
1869     case State::RelativeSlash:
1870         LOG_FINAL_STATE("RelativeSlash");
1871         copyURLPartsUntil(base, URLPart::PortEnd, c);
1872         appendToASCIIBuffer('/');
1873         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1874         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1875         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1876         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1877         break;
1878     case State::SpecialAuthoritySlashes:
1879         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1880         m_url.m_userStart = currentPosition(c);
1881         m_url.m_userEnd = m_url.m_userStart;
1882         m_url.m_passwordEnd = m_url.m_userStart;
1883         m_url.m_hostEnd = m_url.m_userStart;
1884         m_url.m_portEnd = m_url.m_userStart;
1885         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1886         m_url.m_pathEnd = m_url.m_userStart;
1887         m_url.m_queryEnd = m_url.m_userStart;
1888         m_url.m_fragmentEnd = m_url.m_userStart;
1889         break;
1890     case State::SpecialAuthorityIgnoreSlashes:
1891         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1892         failure();
1893         return;
1894         break;
1895     case State::AuthorityOrHost:
1896         LOG_FINAL_STATE("AuthorityOrHost");
1897         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1898         m_url.m_passwordEnd = m_url.m_userEnd;
1899         if (authorityOrHostBegin.atEnd()) {
1900             RELEASE_ASSERT(m_url.m_userStart >= 2);
1901             ASSERT(parsedDataView(m_url.m_userStart - 2, 2) == "//");
1902             m_url.m_userStart -= 2;
1903             m_url.m_userEnd = m_url.m_userStart;
1904             m_url.m_passwordEnd = m_url.m_userStart;
1905             m_url.m_hostEnd = m_url.m_userStart;
1906             m_url.m_portEnd = m_url.m_userStart;
1907             m_url.m_pathEnd = m_url.m_userStart + 2;
1908         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1909             failure();
1910             return;
1911         } else {
1912             syntaxViolation(c);
1913             appendToASCIIBuffer('/');
1914             m_url.m_pathEnd = m_url.m_portEnd + 1;
1915         }
1916         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1917         m_url.m_queryEnd = m_url.m_pathEnd;
1918         m_url.m_fragmentEnd = m_url.m_pathEnd;
1919         break;
1920     case State::Host:
1921         LOG_FINAL_STATE("Host");
1922         if (!parseHostAndPort(authorityOrHostBegin)) {
1923             failure();
1924             return;
1925         }
1926         syntaxViolation(c);
1927         appendToASCIIBuffer('/');
1928         m_url.m_pathEnd = m_url.m_portEnd + 1;
1929         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1930         m_url.m_queryEnd = m_url.m_pathEnd;
1931         m_url.m_fragmentEnd = m_url.m_pathEnd;
1932         break;
1933     case State::File:
1934         LOG_FINAL_STATE("File");
1935         if (base.isValid() && base.protocolIs("file")) {
1936             copyURLPartsUntil(base, URLPart::QueryEnd, c);
1937             appendToASCIIBuffer(':');
1938         }
1939         syntaxViolation(c);
1940         appendToASCIIBuffer("///", 3);
1941         m_url.m_userStart = currentPosition(c) - 1;
1942         m_url.m_userEnd = m_url.m_userStart;
1943         m_url.m_passwordEnd = m_url.m_userStart;
1944         m_url.m_hostEnd = m_url.m_userStart;
1945         m_url.m_portEnd = m_url.m_userStart;
1946         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1947         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1948         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1949         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1950         break;
1951     case State::FileSlash:
1952         LOG_FINAL_STATE("FileSlash");
1953         syntaxViolation(c);
1954         m_url.m_userStart = currentPosition(c) + 1;
1955         appendToASCIIBuffer("//", 2);
1956         m_url.m_userEnd = m_url.m_userStart;
1957         m_url.m_passwordEnd = m_url.m_userStart;
1958         m_url.m_hostEnd = m_url.m_userStart;
1959         m_url.m_portEnd = m_url.m_userStart;
1960         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1961         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1962         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1963         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1964         break;
1965     case State::FileHost:
1966         LOG_FINAL_STATE("FileHost");
1967         if (authorityOrHostBegin == c) {
1968             syntaxViolation(c);
1969             appendToASCIIBuffer('/');
1970             m_url.m_userStart = currentPosition(c) - 1;
1971             m_url.m_userEnd = m_url.m_userStart;
1972             m_url.m_passwordEnd = m_url.m_userStart;
1973             m_url.m_hostEnd = m_url.m_userStart;
1974             m_url.m_portEnd = m_url.m_userStart;
1975             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1976             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1977             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1978             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1979             break;
1980         }
1981
1982         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1983             failure();
1984             return;
1985         }
1986
1987         syntaxViolation(c);
1988         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1989             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1990             m_url.m_hostEnd = currentPosition(c);
1991             m_url.m_portEnd = m_url.m_hostEnd;
1992         }
1993         appendToASCIIBuffer('/');
1994         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1995         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1996         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1997         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1998         break;
1999     case State::PathStart:
2000         LOG_FINAL_STATE("PathStart");
2001         RELEASE_ASSERT_NOT_REACHED();
2002     case State::Path:
2003         LOG_FINAL_STATE("Path");
2004         m_url.m_pathEnd = currentPosition(c);
2005         m_url.m_queryEnd = m_url.m_pathEnd;
2006         m_url.m_fragmentEnd = m_url.m_pathEnd;
2007         break;
2008     case State::CannotBeABaseURLPath:
2009         LOG_FINAL_STATE("CannotBeABaseURLPath");
2010         m_url.m_pathEnd = currentPosition(c);
2011         m_url.m_queryEnd = m_url.m_pathEnd;
2012         m_url.m_fragmentEnd = m_url.m_pathEnd;
2013         break;
2014     case State::UTF8Query:
2015         LOG_FINAL_STATE("UTF8Query");
2016         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2017         m_url.m_queryEnd = currentPosition(c);
2018         m_url.m_fragmentEnd = m_url.m_queryEnd;
2019         break;
2020     case State::NonUTF8Query:
2021         LOG_FINAL_STATE("NonUTF8Query");
2022         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2023         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2024         m_url.m_queryEnd = currentPosition(c);
2025         m_url.m_fragmentEnd = m_url.m_queryEnd;
2026         break;
2027     case State::Fragment:
2028         {
2029             LOG_FINAL_STATE("Fragment");
2030             size_t length = m_didSeeSyntaxViolation ? m_asciiBuffer.size() + m_unicodeFragmentBuffer.size() : c.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
2031             m_url.m_fragmentEnd = length;
2032             break;
2033         }
2034     }
2035
2036     if (LIKELY(!m_didSeeSyntaxViolation)) {
2037         m_url.m_string = m_inputString;
2038         ASSERT(m_asciiBuffer.isEmpty());
2039         ASSERT(m_unicodeFragmentBuffer.isEmpty());
2040     } else if (!m_didSeeUnicodeFragmentCodePoint) {
2041         ASSERT(m_unicodeFragmentBuffer.isEmpty());
2042         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2043     } else {
2044         Vector<UChar> buffer;
2045         buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
2046         buffer.appendVector(m_asciiBuffer);
2047         buffer.appendVector(m_unicodeFragmentBuffer);
2048         m_url.m_string = String::adopt(WTFMove(buffer));
2049     }
2050     m_url.m_isValid = true;
2051     LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
2052 }
2053
2054 template<typename CharacterType>
2055 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2056 {
2057     if (UNLIKELY(iterator.atEnd())) {
2058         syntaxViolation(iterator);
2059         m_url.m_userEnd = currentPosition(iterator);
2060         m_url.m_passwordEnd = m_url.m_userEnd;
2061         return;
2062     }
2063     for (; !iterator.atEnd(); advance(iterator)) {
2064         if (*iterator == ':') {
2065             m_url.m_userEnd = currentPosition(iterator);
2066             auto iteratorAtColon = iterator;
2067             ++iterator;
2068             bool tabOrNewlineAfterColon = false;
2069             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2070                 tabOrNewlineAfterColon = true;
2071                 ++iterator;
2072             }
2073             if (UNLIKELY(iterator.atEnd())) {
2074                 syntaxViolation(iteratorAtColon);
2075                 m_url.m_passwordEnd = m_url.m_userEnd;
2076                 if (m_url.m_userEnd > m_url.m_userStart)
2077                     appendToASCIIBuffer('@');
2078                 return;
2079             }
2080             if (tabOrNewlineAfterColon)
2081                 syntaxViolation(iteratorAtColon);
2082             appendToASCIIBuffer(':');
2083             break;
2084         }
2085         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2086     }
2087     for (; !iterator.atEnd(); advance(iterator))
2088         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2089     m_url.m_passwordEnd = currentPosition(iterator);
2090     if (!m_url.m_userEnd)
2091         m_url.m_userEnd = m_url.m_passwordEnd;
2092     appendToASCIIBuffer('@');
2093 }
2094
2095 template<typename UnsignedIntegerType>
2096 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2097 {
2098     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2099     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
2100     LChar* p = end;
2101     do {
2102         *--p = (number % 10) + '0';
2103         number /= 10;
2104     } while (number);
2105     appendToASCIIBuffer(p, end - p);
2106 }
2107
2108 void URLParser::serializeIPv4(IPv4Address address)
2109 {
2110     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2111     appendToASCIIBuffer('.');
2112     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2113     appendToASCIIBuffer('.');
2114     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2115     appendToASCIIBuffer('.');
2116     appendNumberToASCIIBuffer<uint8_t>(address);
2117 }
2118     
// Returns how many consecutive zero pieces |address| contains starting at
// index |begin| (0 when the piece at |begin| is non-zero or |begin| >= 8).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2128
2129 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2130 {
2131     Optional<size_t> longest;
2132     size_t longestLength = 0;
2133     for (size_t i = 0; i < 8; i++) {
2134         size_t length = zeroSequenceLength(address, i);
2135         if (length) {
2136             if (length > 1 && (!longest || longestLength < length)) {
2137                 longest = i;
2138                 longestLength = length;
2139             }
2140             i += length;
2141         }
2142     }
2143     return longest;
2144 }
2145
2146 void URLParser::serializeIPv6Piece(uint16_t piece)
2147 {
2148     bool printed = false;
2149     if (auto nibble0 = piece >> 12) {
2150         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2151         printed = true;
2152     }
2153     auto nibble1 = piece >> 8 & 0xF;
2154     if (printed || nibble1) {
2155         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2156         printed = true;
2157     }
2158     auto nibble2 = piece >> 4 & 0xF;
2159     if (printed || nibble2)
2160         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2161     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2162 }
2163
2164 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2165 {
2166     appendToASCIIBuffer('[');
2167     auto compressPointer = findLongestZeroSequence(address);
2168     for (size_t piece = 0; piece < 8; piece++) {
2169         if (compressPointer && compressPointer.value() == piece) {
2170             ASSERT(!address[piece]);
2171             if (piece)
2172                 appendToASCIIBuffer(':');
2173             else
2174                 appendToASCIIBuffer("::", 2);
2175             while (piece < 8 && !address[piece])
2176                 piece++;
2177             if (piece == 8)
2178                 break;
2179         }
2180         serializeIPv6Piece(address[piece]);
2181         if (piece < 7)
2182             appendToASCIIBuffer(':');
2183     }
2184     appendToASCIIBuffer(']');
2185 }
2186
// Parses a single numeric component of an IPv4 host, starting at |iterator|.
//
// On success the parsed value is returned and |iterator| is left either at
// the terminating '.' (which is not consumed) or at the end of the input.
// Nullopt is returned when the component starts with '.', contains a
// character that is not a digit of the detected base, or overflows the
// 32-bit checked accumulator.
//
// |didSeeSyntaxViolation| is set to true (never reset) when a tab or newline
// is skipped, or when a leading '0' is followed by further characters.
2187 template<typename CharacterType>
2188 Optional<uint32_t> URLParser::parseIPv4Number(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2189 {
2190     enum class State : uint8_t {
2191         UnknownBase,
2192         Decimal,
2193         OctalOrHex,
2194         Octal,
2195         Hex,
2196     };
2197     State state = State::UnknownBase;
2198     Checked<uint32_t, RecordOverflow> value = 0;
         // An empty component (input starting with '.') is rejected.
2199     if (!iterator.atEnd() && *iterator == '.')
2200         return Nullopt;
2201     while (!iterator.atEnd()) {
             // Tabs and newlines are skipped wherever they appear.
2202         if (isTabOrNewline(*iterator)) {
2203             didSeeSyntaxViolation = true;
2204             ++iterator;
2205             continue;
2206         }
             // A '.' ends the component; it is left for the caller to consume.
2207         if (*iterator == '.') {
2208             ASSERT(!value.hasOverflowed());
2209             return value.unsafeGet();
2210         }
2211         switch (state) {
2212         case State::UnknownBase:
                 // A leading '0' is consumed and the base is decided by the next
                 // character; anything else is re-examined as a decimal digit
                 // on the next loop iteration (the iterator is not advanced).
2213             if (UNLIKELY(*iterator == '0')) {
2214                 ++iterator;
2215                 state = State::OctalOrHex;
2216                 break;
2217             }
2218             state = State::Decimal;
2219             break;
2220         case State::OctalOrHex:
                 // Only reached when something other than '.' follows the
                 // leading '0': 'x'/'X' is consumed and selects hex, anything
                 // else is re-examined as an octal digit.
2221             didSeeSyntaxViolation = true;
2222             if (*iterator == 'x' || *iterator == 'X') {
2223                 ++iterator;
2224                 state = State::Hex;
2225                 break;
2226             }
2227             state = State::Octal;
2228             break;
2229         case State::Decimal:
2230             if (*iterator < '0' || *iterator > '9')
2231                 return Nullopt;
2232             value *= 10;
2233             value += *iterator - '0';
2234             if (UNLIKELY(value.hasOverflowed()))
2235                 return Nullopt;
2236             ++iterator;
2237             break;
2238         case State::Octal:
2239             ASSERT(didSeeSyntaxViolation);
2240             if (*iterator < '0' || *iterator > '7')
2241                 return Nullopt;
2242             value *= 8;
2243             value += *iterator - '0';
2244             if (UNLIKELY(value.hasOverflowed()))
2245                 return Nullopt;
2246             ++iterator;
2247             break;
2248         case State::Hex:
2249             ASSERT(didSeeSyntaxViolation);
2250             if (!isASCIIHexDigit(*iterator))
2251                 return Nullopt;
2252             value *= 16;
2253             value += toASCIIHexValue(*iterator);
2254             if (UNLIKELY(value.hasOverflowed()))
2255                 return Nullopt;
2256             ++iterator;
2257             break;
2258         }
2259     }
         // End of input: whatever has been accumulated is the result (0 when
         // only a base prefix such as "0" or "0x" was seen).
2260     ASSERT(!value.hasOverflowed());
2261     return value.unsafeGet();
2262 }
2263
2264 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2265 {
2266     RELEASE_ASSERT(exponent <= 4);
2267     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2268     return values[exponent];
2269 }
2270
2271 template<typename CharacterType>
2272 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2273 {
2274     auto hostBegin = iterator;
2275
2276     Vector<uint32_t, 4> items;
2277     items.reserveInitialCapacity(4);
2278     bool didSeeSyntaxViolation = false;
2279     while (!iterator.atEnd()) {
2280         if (isTabOrNewline(*iterator)) {
2281             didSeeSyntaxViolation = true;
2282             ++iterator;
2283             continue;
2284         }
2285         if (items.size() >= 4)
2286             return Nullopt;
2287         if (auto item = parseIPv4Number(iterator, didSeeSyntaxViolation))
2288             items.append(item.value());
2289         else
2290             return Nullopt;
2291         if (!iterator.atEnd()) {
2292             if (items.size() >= 4)
2293                 return Nullopt;
2294             if (*iterator == '.')
2295                 ++iterator;
2296             else
2297                 return Nullopt;
2298         }
2299     }
2300     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2301         return Nullopt;
2302     if (items.size() > 1) {
2303         for (size_t i = 0; i < items.size() - 1; i++) {
2304             if (items[i] > 255)
2305                 return Nullopt;
2306         }
2307     }
2308     if (items[items.size() - 1] >= pow256(5 - items.size()))
2309         return Nullopt;
2310
2311     if (didSeeSyntaxViolation)
2312         syntaxViolation(hostBegin);
2313     for (auto item : items) {
2314         if (item > 255)
2315             syntaxViolation(hostBegin);
2316     }
2317
2318     if (UNLIKELY(items.size() != 4))
2319         syntaxViolation(hostBegin);
2320
2321     IPv4Address ipv4 = items.takeLast();
2322     for (size_t counter = 0; counter < items.size(); ++counter)
2323         ipv4 += items[counter] * pow256(3 - counter);
2324     return ipv4;
2325 }
2326     
// Parses a bracketed IPv6 host. |c| must point at the opening '[' (asserted).
// Returns the eight 16-bit pieces on success and Nullopt when the text is not
// accepted. Syntax violations are reported (anchored at the '[') for input
// that is accepted but not in the form serializeIPv6 would produce:
// uppercase hex digits, redundant leading zeros in a piece, or a "::" that is
// not where findLongestZeroSequence would place it.
2327 template<typename CharacterType>
2328 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2329 {
2330     ASSERT(*c == '[');
2331     auto hostBegin = c;
2332     advance(c, hostBegin);
2333     if (c.atEnd())
2334         return Nullopt;
2335
2336     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
         // Index of the next piece to fill in.
2337     size_t piecePointer = 0;
         // Set when "::" is seen. It holds piecePointer *after* the increment
         // done for the "::", which is why the final comparison below adds one
         // to findLongestZeroSequence's result.
2338     Optional<size_t> compressPointer;
2339
         // A leading ':' is only valid as the first half of "::".
2340     if (*c == ':') {
2341         advance(c, hostBegin);
2342         if (c.atEnd())
2343             return Nullopt;
2344         if (*c != ':')
2345             return Nullopt;
2346         advance(c, hostBegin);
2347         ++piecePointer;
2348         compressPointer = piecePointer;
2349     }
2350     
2351     while (!c.atEnd()) {
2352         if (piecePointer == 8)
2353             return Nullopt;
             // A ':' at the start of a piece is the second colon of "::";
             // only one "::" is allowed per address.
2354         if (*c == ':') {
2355             if (compressPointer)
2356                 return Nullopt;
2357             advance(c, hostBegin);
2358             ++piecePointer;
2359             compressPointer = piecePointer;
2360             continue;
2361         }
             // Read up to four hex digits into |value|.
2362         uint16_t value = 0;
2363         size_t length = 0;
2364         bool leadingZeros = false;
2365         for (; length < 4; length++) {
2366             if (c.atEnd())
2367                 break;
2368             if (!isASCIIHexDigit(*c))
2369                 break;
2370             if (isASCIIUpper(*c))
2371                 syntaxViolation(hostBegin);
2372             if (*c == '0' && !length)
2373                 leadingZeros = true;
2374             value = value * 0x10 + toASCIIHexValue(*c);
2375             advance(c, hostBegin);
2376         }
2377         
             // "01" (non-zero value with a leading zero) and "00" (zero written
             // with more than one digit) are accepted but flagged.
2378         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2379             syntaxViolation(hostBegin);
2380
2381         address[piecePointer++] = value;
2382         if (c.atEnd())
2383             break;
             // After a piece only ':' may follow, and not after the eighth piece.
2384         if (piecePointer == 8 || *c != ':')
2385             return Nullopt;
2386         advance(c, hostBegin);
2387     }
2388     
         // NOTE(review): the loop above is only left (other than by return)
         // once c.atEnd() is true, so this branch -- which reads a dotted
         // decimal IPv4 tail into the next two pieces -- cannot currently be
         // entered; a '.' after a piece makes the loop return Nullopt instead.
2389     if (!c.atEnd()) {
2390         if (piecePointer > 6)
2391             return Nullopt;
2392         size_t dotsSeen = 0;
2393         while (!c.atEnd()) {
2394             Optional<uint16_t> value;
2395             if (!isASCIIDigit(*c))
2396                 return Nullopt;
2397             while (isASCIIDigit(*c)) {
2398                 auto number = *c - '0';
2399                 if (!value)
2400                     value = number;
                     // A further digit after a leading 0 is rejected.
2401                 else if (!value.value())
2402                     return Nullopt;
2403                 else
2404                     value = value.value() * 10 + number;
2405                 advance(c, hostBegin);
2406                 if (c.atEnd())
2407                     return Nullopt;
2408                 if (value.value() > 255)
2409                     return Nullopt;
2410             }
2411             if (dotsSeen < 3 && *c != '.')
2412                 return Nullopt;
                 // Two decimal numbers are packed into each 16-bit piece.
2413             address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
2414             if (dotsSeen == 1 || dotsSeen == 3)
2415                 piecePointer++;
2416             if (!c.atEnd())
2417                 advance(c, hostBegin);
2418             if (dotsSeen == 3 && !c.atEnd())
2419                 return Nullopt;
2420             dotsSeen++;
2421         }
2422     }
         // With a "::", move the pieces that were parsed after it to the end
         // of the address (preserving their order); the slots they leave
         // behind receive the zeros that were at the tail. Without a "::" all
         // eight pieces must have been given explicitly.
2423     if (compressPointer) {
2424         size_t swaps = piecePointer - compressPointer.value();
2425         piecePointer = 7;
2426         while (swaps)
2427             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2428     } else if (piecePointer != 8)
2429         return Nullopt;
2430
         // Compare where the input had its "::" (if anywhere) with where
         // findLongestZeroSequence -- the function serializeIPv6 uses -- would
         // put it; any difference is a syntax violation.
2431     Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2432     if (possibleCompressPointer)
2433         possibleCompressPointer.value()++;
2434     if (UNLIKELY(compressPointer != possibleCompressPointer))
2435         syntaxViolation(hostBegin);
2436     
2437     return address;
2438 }
2439
// Element count shared by the buffers below: it is the second template
// argument of the Vector<LChar, ...> buffers and the length of the UChar
// hostname array in domainToASCII.
2440 const size_t defaultInlineBufferSize = 2048;
2441
2442 static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
2443 {
2444     Vector<LChar, defaultInlineBufferSize> output;
2445     output.reserveInitialCapacity(length);
2446     
2447     for (size_t i = 0; i < length; ++i) {
2448         uint8_t byte = input[i];
2449         if (byte != '%')
2450             output.uncheckedAppend(byte);
2451         else if (i < length - 2) {
2452             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2453                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2454                 i += 2;
2455             } else
2456                 output.uncheckedAppend(byte);
2457         } else
2458             output.uncheckedAppend(byte);
2459     }
2460     return output;
2461 }
2462
2463 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2464 {
2465     if (string.is8Bit())
2466         return charactersAreAllASCII(string.characters8(), string.length());
2467     return charactersAreAllASCII(string.characters16(), string.length());
2468 }
2469
2470 static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
2471 {
2472     Vector<LChar, defaultInlineBufferSize> ascii;
2473     if (containsOnlyASCII(domain)) {
2474         size_t length = domain.length();
2475         if (domain.is8Bit()) {
2476             const LChar* characters = domain.characters8();
2477             ascii.reserveInitialCapacity(length);
2478             for (size_t i = 0; i < length; ++i)
2479                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2480         } else {
2481             const UChar* characters = domain.characters16();
2482             ascii.reserveInitialCapacity(length);
2483             for (size_t i = 0; i < length; ++i)
2484                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2485         }
2486         return ascii;
2487     }
2488     
2489     UChar hostnameBuffer[defaultInlineBufferSize];
2490     UErrorCode error = U_ZERO_ERROR;
2491
2492 #if COMPILER(GCC) || COMPILER(CLANG)
2493 #pragma GCC diagnostic push
2494 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2495 #endif
2496     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2497     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2498 #if COMPILER(GCC) || COMPILER(CLANG)
2499 #pragma GCC diagnostic pop
2500 #endif
2501     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2502
2503     if (error == U_ZERO_ERROR) {
2504         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2505             ASSERT(isASCII(hostnameBuffer[i]));
2506             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2507         }
2508         ascii.append(hostnameBuffer, numCharactersConverted);
2509         return ascii;
2510     }
2511
2512     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2513     return Nullopt;
2514 }
2515
2516 static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
2517 {
2518     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2519         if (isInvalidDomainCharacter(asciiDomain[i]))
2520             return true;
2521     }
2522     return false;
2523 }
2524
2525 template<typename CharacterType>
2526 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2527 {
2528     ASSERT(*iterator == ':');
2529     auto colonIterator = iterator;
2530     advance(iterator, colonIterator);
2531     uint32_t port = 0;
2532     if (UNLIKELY(iterator.atEnd())) {
2533         m_url.m_portEnd = currentPosition(colonIterator);
2534         syntaxViolation(colonIterator);
2535         return true;
2536     }
2537     size_t digitCount = 0;
2538     bool leadingZeros = false;
2539     for (; !iterator.atEnd(); ++iterator) {
2540         if (UNLIKELY(isTabOrNewline(*iterator))) {
2541             syntaxViolation(colonIterator);
2542             continue;
2543         }
2544         if (isASCIIDigit(*iterator)) {
2545             if (*iterator == '0' && !digitCount)
2546                 leadingZeros = true;
2547             ++digitCount;
2548             port = port * 10 + *iterator - '0';
2549             if (port > std::numeric_limits<uint16_t>::max())
2550                 return false;
2551         } else
2552             return false;
2553     }
2554
2555     if (port && leadingZeros)
2556         syntaxViolation(colonIterator);
2557     
2558     if (!port && digitCount > 1)
2559         syntaxViolation(colonIterator);
2560
2561     if (UNLIKELY(isDefaultPort(parsedDataView(0, m_url.m_schemeEnd), port)))
2562         syntaxViolation(colonIterator);
2563     else {
2564         appendToASCIIBuffer(':');
2565         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2566         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2567     }
2568
2569     m_url.m_portEnd = currentPosition(iterator);
2570     return true;
2571 }
2572
2573 template<typename CharacterType>
2574 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2575 {
2576     if (iterator.atEnd())
2577         return false;
2578     if (*iterator == ':')
2579         return false;
2580     if (*iterator == '[') {
2581         auto ipv6End = iterator;
2582         while (!ipv6End.atEnd() && *ipv6End != ']')
2583             ++ipv6End;
2584         if (ipv6End.atEnd())
2585             return false;
2586         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2587             serializeIPv6(address.value());
2588             if (!ipv6End.atEnd()) {
2589                 advance(ipv6End);
2590                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2591                     m_url.m_hostEnd = currentPosition(ipv6End);
2592                     return parsePort(ipv6End);
2593                 }
2594                 m_url.m_hostEnd = currentPosition(ipv6End);
2595                 m_url.m_portEnd = m_url.m_hostEnd;
2596                 return true;
2597             }
2598             m_url.m_hostEnd = currentPosition(ipv6End);
2599             return true;
2600         }
2601     }
2602
2603     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2604         auto hostIterator = iterator;
2605         for (; !iterator.atEnd(); ++iterator) {
2606             if (isTabOrNewline(*iterator))
2607                 continue;
2608             if (*iterator == ':')
2609                 break;
2610             if (isInvalidDomainCharacter(*iterator))
2611                 return false;
2612         }
2613         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2614             serializeIPv4(address.value());
2615             m_url.m_hostEnd = currentPosition(iterator);
2616             if (iterator.atEnd()) {
2617                 m_url.m_portEnd = currentPosition(iterator);
2618                 return true;
2619             }
2620             return parsePort(iterator);
2621         }
2622         for (; hostIterator != iterator; ++hostIterator) {
2623             if (LIKELY(!isTabOrNewline(*hostIterator))) {
2624                 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2625                     syntaxViolation(hostIterator);
2626                 appendToASCIIBuffer(toASCIILower(*hostIterator));
2627             } else
2628                 syntaxViolation(hostIterator);
2629         }
2630         m_url.m_hostEnd = currentPosition(iterator);
2631         if (!hostIterator.atEnd())
2632             return parsePort(hostIterator);
2633         m_url.m_portEnd = currentPosition(iterator);
2634         return true;
2635     }
2636     
2637     syntaxViolation(iterator);
2638     
2639     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2640     for (; !iterator.atEnd(); ++iterator) {
2641         if (isTabOrNewline(*iterator))
2642             continue;
2643         if (*iterator == ':')
2644             break;
2645         uint8_t buffer[U8_MAX_LENGTH];
2646         int32_t offset = 0;
2647         UBool error = false;
2648         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2649         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2650         // FIXME: Check error.
2651         utf8Encoded.append(buffer, offset);
2652     }
2653     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
2654     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2655     auto asciiDomain = domainToASCII(domain);
2656     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2657         return false;
2658     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2659     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2660
2661     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2662         serializeIPv4(address.value());
2663         m_url.m_hostEnd = currentPosition(iterator);
2664         if (iterator.atEnd()) {
2665             m_url.m_portEnd = currentPosition(iterator);
2666             return true;
2667         }
2668         return parsePort(iterator);
2669     }
2670
2671     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2672     m_url.m_hostEnd = currentPosition(iterator);
2673     if (!iterator.atEnd())
2674         return parsePort(iterator);
2675     m_url.m_portEnd = currentPosition(iterator);
2676     return true;
2677 }
2678
2679 static Optional<String> formURLDecode(StringView input)
2680 {
2681     auto utf8 = input.utf8(StrictConversion);
2682     if (utf8.isNull())
2683         return Nullopt;
2684     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2685     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2686 }
2687
2688 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2689 {
2690     Vector<StringView> sequences = input.split('&');
2691
2692     URLEncodedForm output;
2693     for (auto& bytes : sequences) {
2694         auto valueStart = bytes.find('=');
2695         if (valueStart == notFound) {
2696             if (auto name = formURLDecode(bytes))
2697                 output.append({name.value().replace('+', 0x20), emptyString()});
2698         } else {
2699             auto name = formURLDecode(bytes.substring(0, valueStart));
2700             auto value = formURLDecode(bytes.substring(valueStart + 1));
2701             if (name && value)
2702                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2703         }
2704     }
2705     return output;
2706 }
2707
2708 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2709 {
2710     auto utf8 = input.utf8(StrictConversion);
2711     const char* data = utf8.data();
2712     for (size_t i = 0; i < utf8.length(); ++i) {
2713         const char byte = data[i];
2714         if (byte == 0x20)
2715             output.append(0x2B);
2716         else if (byte == 0x2A
2717             || byte == 0x2D
2718             || byte == 0x2E
2719             || (byte >= 0x30 && byte <= 0x39)
2720             || (byte >= 0x41 && byte <= 0x5A)
2721             || byte == 0x5F
2722             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2723             output.append(byte);
2724         else
2725             percentEncodeByte(byte, output);
2726     }
2727 }
2728     
2729 String URLParser::serialize(const URLEncodedForm& tuples)
2730 {
2731     Vector<LChar> output;
2732     for (auto& tuple : tuples) {
2733         if (!output.isEmpty())
2734             output.append('&');
2735         serializeURLEncodedForm(tuple.first, output);
2736         output.append('=');
2737         serializeURLEncodedForm(tuple.second, output);
2738     }
2739     return String::adopt(WTFMove(output));
2740 }
2741
2742 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2743 {
2744     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2745     // but once we get rid of URL::parse its value should be tested.
2746     LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2747         a.m_isValid,
2748         a.m_protocolIsInHTTPFamily,
2749         a.m_schemeEnd,
2750         a.m_userStart,
2751         a.m_userEnd,
2752         a.m_passwordEnd,
2753         a.m_hostEnd,
2754         a.m_portEnd,
2755         a.m_pathAfterLastSlash,
2756         a.m_pathEnd,
2757         a.m_queryEnd,
2758         a.m_fragmentEnd,
2759         a.m_string.utf8().data(),
2760         b.m_isValid,
2761         b.m_protocolIsInHTTPFamily,
2762         b.m_schemeEnd,
2763         b.m_userStart,
2764         b.m_userEnd,
2765         b.m_passwordEnd,
2766         b.m_hostEnd,
2767         b.m_portEnd,
2768         b.m_pathAfterLastSlash,
2769         b.m_pathEnd,
2770         b.m_queryEnd,
2771         b.m_fragmentEnd,
2772         b.m_string.utf8().data());
2773
2774     return a.m_string == b.m_string
2775         && a.m_isValid == b.m_isValid
2776         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2777         && a.m_schemeEnd == b.m_schemeEnd
2778         && a.m_userStart == b.m_userStart
2779         && a.m_userEnd == b.m_userEnd
2780         && a.m_passwordEnd == b.m_passwordEnd
2781         && a.m_hostEnd == b.m_hostEnd
2782         && a.m_portEnd == b.m_portEnd
2783         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2784         && a.m_pathEnd == b.m_pathEnd
2785         && a.m_queryEnd == b.m_queryEnd
2786         && a.m_fragmentEnd == b.m_fragmentEnd;
2787 }
2788
2789 bool URLParser::internalValuesConsistent(const URL& url)
2790 {
2791     return url.m_schemeEnd <= url.m_userStart
2792         && url.m_userStart <= url.m_userEnd
2793         && url.m_userEnd <= url.m_passwordEnd
2794         && url.m_passwordEnd <= url.m_hostEnd
2795         && url.m_hostEnd <= url.m_portEnd
2796         && url.m_portEnd <= url.m_pathAfterLastSlash
2797         && url.m_pathAfterLastSlash <= url.m_pathEnd
2798         && url.m_pathEnd <= url.m_queryEnd
2799         && url.m_queryEnd <= url.m_fragmentEnd
2800         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2801     // FIXME: Why do we even store m_fragmentEnd?
2802     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2803 }
2804
// File-local on/off flag behind URLParser::setEnabled() and URLParser::enabled(); starts out off.
static bool urlParserEnabled { false };
2806
// Stores the given value in the file-local urlParserEnabled flag, which URLParser::enabled()
// reports back.
void URLParser::setEnabled(bool enabled)
{
    urlParserEnabled = enabled;
}
2811
// Returns the current value of the file-local urlParserEnabled flag (false until
// URLParser::setEnabled(true) is called).
bool URLParser::enabled()
{
    return urlParserEnabled;
}
2816
2817 } // namespace WebCore