44ad4ec0a32a7ef1be0cf5c14c985b632953ab14
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016-2018 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <mutex>
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
35
36 namespace WebCore {
37
// Compile-time switch for parser tracing. With the value 0 every
// URL_PARSER_LOG(...) expands to nothing; with a non-zero value it forwards
// its arguments to LOG on the URLParser channel.
#define URL_PARSER_DEBUGGING 0

#if !URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...)
#else
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#endif
45     
46 template<typename CharacterType>
47 class CodePointIterator {
48 public:
49     ALWAYS_INLINE CodePointIterator() { }
50     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51         : m_begin(begin)
52         , m_end(end)
53     {
54     }
55     
56     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57         : CodePointIterator(begin.m_begin, end.m_begin)
58     {
59         ASSERT(end.m_begin >= begin.m_begin);
60     }
61     
62     ALWAYS_INLINE UChar32 operator*() const;
63     ALWAYS_INLINE CodePointIterator& operator++();
64
65     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66     {
67         return m_begin == other.m_begin
68             && m_end == other.m_end;
69     }
70     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71     
72     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73     {
74         m_begin = other.m_begin;
75         m_end = other.m_end;
76         return *this;
77     }
78
79     ALWAYS_INLINE bool atEnd() const
80     {
81         ASSERT(m_begin <= m_end);
82         return m_begin >= m_end;
83     }
84     
85     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86     {
87         ASSERT(m_begin >= reference);
88         return m_begin - reference;
89     }
90
91     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92     {
93         return codeUnitsSince(other.m_begin);
94     }
95     
96 private:
97     const CharacterType* m_begin { nullptr };
98     const CharacterType* m_end { nullptr };
99 };
100
101 template<>
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
103 {
104     ASSERT(!atEnd());
105     return *m_begin;
106 }
107
108 template<>
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
110 {
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     unsigned i = 0;
128     size_t length = m_end - m_begin;
129     U16_FWD_1(m_begin, i, length);
130     m_begin += i;
131     return *this;
132 }
133     
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
135 {
136     if (U_IS_BMP(codePoint)) {
137         destination.append(static_cast<UChar>(codePoint));
138         return;
139     }
140     destination.reserveCapacity(destination.size() + 2);
141     destination.uncheckedAppend(U16_LEAD(codePoint));
142     destination.uncheckedAppend(U16_TRAIL(codePoint));
143 }
144
// Bit flags naming the character classes the parser distinguishes. Each entry
// of characterClassTable below is a bitwise OR of these flags.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};

// Character-class flags for every 8-bit code unit, indexed by the code unit's
// value. Runs of identical entries are packed several to a line and labelled
// with the index range they cover.
static const uint8_t characterClassTable[256] = {
    // 0x00
    UserInfo | Default | QueryPercent | ForbiddenHost,
    // 0x01 - 0x08
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    // 0x09, 0x0A
    UserInfo | Default | QueryPercent | ForbiddenHost,
    UserInfo | Default | QueryPercent | ForbiddenHost,
    // 0x0B, 0x0C
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    // 0x0D
    UserInfo | Default | QueryPercent | ForbiddenHost,
    // 0x0E - 0x1F
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, 0, 0, 0, 0, // '&' through '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, ValidScheme, // '-' and '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '0' - '4'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '5' - '9'
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'A' - 'H'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'I' - 'P'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'Q' - 'X'
    ValidScheme, ValidScheme, // 'Y' - 'Z'
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'a' - 'h'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'i' - 'p'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'q' - 'x'
    ValidScheme, ValidScheme, // 'y' - 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x80 - 0x87
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x88 - 0x8F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x90 - 0x97
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x98 - 0x9F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA0 - 0xA7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA8 - 0xAF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB0 - 0xB7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB8 - 0xBF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC0 - 0xC7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC8 - 0xCF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD0 - 0xD7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD8 - 0xDF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE0 - 0xE7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE8 - 0xEF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF0 - 0xF7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF8 - 0xFF
};
412
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
423 ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
424 {
425     if (characterClassTable[byte] & QueryPercent)
426         return true;
427     if (byte == '\'' && urlIsSpecial)
428         return true;
429     return false;
430 }
431
432 bool URLParser::isInUserInfoEncodeSet(UChar c)
433 {
434     return WebCore::isInUserInfoEncodeSet(c);
435 }
436
437 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
438 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
439 {
440     ++iterator;
441     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
442         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
443             syntaxViolation(iteratorForSyntaxViolationPosition);
444         ++iterator;
445     }
446 }
447
448 template<typename CharacterType>
449 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
450 {
451     if (iterator.atEnd())
452         return false;
453     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
454     if (iterator.atEnd())
455         return false;
456     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
457     return iterator.atEnd();
458 }
459
460 template<typename CharacterType>
461 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
462 {
463     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
464         return false;
465     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
466     if (iterator.atEnd())
467         return false;
468     if (*iterator == ':')
469         return true;
470     if (UNLIKELY(*iterator == '|'))
471         return true;
472     return false;
473 }
474
475 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
476 {
477     ASSERT(isASCII(codePoint));
478     if (UNLIKELY(m_didSeeSyntaxViolation))
479         m_asciiBuffer.append(codePoint);
480 }
481
482 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
483 {
484     if (UNLIKELY(m_didSeeSyntaxViolation))
485         m_asciiBuffer.append(characters, length);
486 }
487
488 template<typename CharacterType>
489 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
490 {
491     ASSERT(isWindowsDriveLetter(iterator));
492     appendToASCIIBuffer(*iterator);
493     advance(iterator);
494     ASSERT(!iterator.atEnd());
495     ASSERT(*iterator == ':' || *iterator == '|');
496     if (*iterator == '|')
497         syntaxViolation(iterator);
498     appendToASCIIBuffer(':');
499     advance(iterator);
500 }
501
502 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
503 {
504     if (base.protocolIs("file")) {
505         RELEASE_ASSERT(base.m_portEnd < base.m_string.length());
506         if (base.m_string.is8Bit()) {
507             const LChar* begin = base.m_string.characters8();
508             CodePointIterator<LChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
509             if (isWindowsDriveLetter(c)) {
510                 appendWindowsDriveLetter(c);
511                 return true;
512             }
513         } else {
514             const UChar* begin = base.m_string.characters16();
515             CodePointIterator<UChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
516             if (isWindowsDriveLetter(c)) {
517                 appendWindowsDriveLetter(c);
518                 return true;
519             }
520         }
521     }
522     return false;
523 }
524
525 template<typename CharacterType>
526 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
527 {
528     if (!isWindowsDriveLetter(iterator))
529         return true;
530     if (iterator.atEnd())
531         return false;
532     advance(iterator);
533     if (iterator.atEnd())
534         return true;
535     advance(iterator);
536     if (iterator.atEnd())
537         return true;
538     return !isSlashQuestionOrHash(*iterator);
539 }
540
541 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
542 {
543     buffer.append('%');
544     buffer.append(upperNibbleToASCIIHexDigit(byte));
545     buffer.append(lowerNibbleToASCIIHexDigit(byte));
546 }
547
548 void URLParser::percentEncodeByte(uint8_t byte)
549 {
550     ASSERT(m_didSeeSyntaxViolation);
551     appendToASCIIBuffer('%');
552     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
553     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
554 }
555
// Percent-encoded UTF-8 bytes of U+FFFD REPLACEMENT CHARACTER (EF BF BD), and
// the string's length without its terminating NUL.
const char replacementCharacterUTF8PercentEncoded[] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
558
559 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
560 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
561 {
562     ASSERT(!iterator.atEnd());
563     UChar32 codePoint = *iterator;
564     if (LIKELY(isASCII(codePoint))) {
565         if (UNLIKELY(isInCodeSet(codePoint))) {
566             syntaxViolation(iterator);
567             percentEncodeByte(codePoint);
568         } else
569             appendToASCIIBuffer(codePoint);
570         return;
571     }
572     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
573     syntaxViolation(iterator);
574     
575     if (!U_IS_UNICODE_CHAR(codePoint)) {
576         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
577         return;
578     }
579     
580     uint8_t buffer[U8_MAX_LENGTH];
581     int32_t offset = 0;
582     U8_APPEND_UNSAFE(buffer, offset, codePoint);
583     for (int32_t i = 0; i < offset; ++i)
584         percentEncodeByte(buffer[i]);
585 }
586
587 template<typename CharacterType>
588 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
589 {
590     ASSERT(!iterator.atEnd());
591     UChar32 codePoint = *iterator;
592     if (LIKELY(isASCII(codePoint))) {
593         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
594             syntaxViolation(iterator);
595             percentEncodeByte(codePoint);
596         } else
597             appendToASCIIBuffer(codePoint);
598         return;
599     }
600     
601     syntaxViolation(iterator);
602     
603     if (!U_IS_UNICODE_CHAR(codePoint)) {
604         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
605         return;
606     }
607
608     uint8_t buffer[U8_MAX_LENGTH];
609     int32_t offset = 0;
610     U8_APPEND_UNSAFE(buffer, offset, codePoint);
611     for (int32_t i = 0; i < offset; ++i) {
612         auto byte = buffer[i];
613         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
614             percentEncodeByte(byte);
615         else
616             appendToASCIIBuffer(byte);
617     }
618 }
619
620 template<typename CharacterType>
621 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
622 {
623     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
624     auto encoded = encoding.encode(StringView(source.data(), source.size()), UnencodableHandling::URLEncodedEntities);
625     auto* data = encoded.data();
626     size_t length = encoded.size();
627     
628     if (!length == !iterator.atEnd()) {
629         syntaxViolation(iterator);
630         return;
631     }
632     
633     size_t i = 0;
634     for (; i < length; ++i) {
635         ASSERT(!iterator.atEnd());
636         uint8_t byte = data[i];
637         if (UNLIKELY(byte != *iterator)) {
638             syntaxViolation(iterator);
639             break;
640         }
641         if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
642             syntaxViolation(iterator);
643             break;
644         }
645         appendToASCIIBuffer(byte);
646         ++iterator;
647     }
648     while (!iterator.atEnd() && isTabOrNewline(*iterator))
649         ++iterator;
650     ASSERT((i == length) == iterator.atEnd());
651     for (; i < length; ++i) {
652         ASSERT(m_didSeeSyntaxViolation);
653         uint8_t byte = data[i];
654         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
655             percentEncodeByte(byte);
656         else
657             appendToASCIIBuffer(byte);
658     }
659 }
660
661 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
662 {
663     static const uint16_t ftpPort = 21;
664     static const uint16_t gopherPort = 70;
665     static const uint16_t httpPort = 80;
666     static const uint16_t httpsPort = 443;
667     static const uint16_t wsPort = 80;
668     static const uint16_t wssPort = 443;
669     
670     auto length = scheme.length();
671     if (!length)
672         return std::nullopt;
673     switch (scheme[0]) {
674     case 'w':
675         switch (length) {
676         case 2:
677             if (scheme[1] == 's')
678                 return wsPort;
679             return std::nullopt;
680         case 3:
681             if (scheme[1] == 's'
682                 && scheme[2] == 's')
683                 return wssPort;
684             return std::nullopt;
685         default:
686             return false;
687         }
688     case 'h':
689         switch (length) {
690         case 4:
691             if (scheme[1] == 't'
692                 && scheme[2] == 't'
693                 && scheme[3] == 'p')
694                 return httpPort;
695             return std::nullopt;
696         case 5:
697             if (scheme[1] == 't'
698                 && scheme[2] == 't'
699                 && scheme[3] == 'p'
700                 && scheme[4] == 's')
701                 return httpsPort;
702             return std::nullopt;
703         default:
704             return std::nullopt;
705         }
706     case 'g':
707         if (length == 6
708             && scheme[1] == 'o'
709             && scheme[2] == 'p'
710             && scheme[3] == 'h'
711             && scheme[4] == 'e'
712             && scheme[5] == 'r')
713             return gopherPort;
714         return std::nullopt;
715     case 'f':
716         if (length == 3
717             && scheme[1] == 't'
718             && scheme[2] == 'p')
719             return ftpPort;
720         return std::nullopt;
721     default:
722         return std::nullopt;
723     }
724 }
725
// Result of classifying a scheme string: one value per scheme the parser
// treats specially, plus NonSpecial for everything else.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    Gopher, // "gopher"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial, // any other scheme, including the empty string
};
736
737 ALWAYS_INLINE static Scheme scheme(StringView scheme)
738 {
739     auto length = scheme.length();
740     if (!length)
741         return Scheme::NonSpecial;
742     switch (scheme[0]) {
743     case 'f':
744         switch (length) {
745         case 3:
746             if (scheme[1] == 't'
747                 && scheme[2] == 'p')
748                 return Scheme::FTP;
749             return Scheme::NonSpecial;
750         case 4:
751             if (scheme[1] == 'i'
752                 && scheme[2] == 'l'
753                 && scheme[3] == 'e')
754                 return Scheme::File;
755             return Scheme::NonSpecial;
756         default:
757             return Scheme::NonSpecial;
758         }
759     case 'g':
760         if (length == 6
761             && scheme[1] == 'o'
762             && scheme[2] == 'p'
763             && scheme[3] == 'h'
764             && scheme[4] == 'e'
765             && scheme[5] == 'r')
766             return Scheme::Gopher;
767         return Scheme::NonSpecial;
768     case 'h':
769         switch (length) {
770         case 4:
771             if (scheme[1] == 't'
772                 && scheme[2] == 't'
773                 && scheme[3] == 'p')
774                 return Scheme::HTTP;
775             return Scheme::NonSpecial;
776         case 5:
777             if (scheme[1] == 't'
778                 && scheme[2] == 't'
779                 && scheme[3] == 'p'
780                 && scheme[4] == 's')
781                 return Scheme::HTTPS;
782             return Scheme::NonSpecial;
783         default:
784             return Scheme::NonSpecial;
785         }
786     case 'w':
787         switch (length) {
788         case 2:
789             if (scheme[1] == 's')
790                 return Scheme::WS;
791             return Scheme::NonSpecial;
792         case 3:
793             if (scheme[1] == 's'
794                 && scheme[2] == 's')
795                 return Scheme::WSS;
796             return Scheme::NonSpecial;
797         default:
798             return Scheme::NonSpecial;
799         }
800     default:
801         return Scheme::NonSpecial;
802     }
803 }
804
805 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
806 {
807     if (scheme.isEmpty())
808         return std::nullopt;
809
810     if (!isASCIIAlpha(scheme[0]))
811         return std::nullopt;
812
813     for (size_t i = 1; i < scheme.length(); ++i) {
814         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
815             continue;
816         return std::nullopt;
817     }
818
819     return scheme.convertToASCIILowercase();
820 }
821
822 bool URLParser::isSpecialScheme(const String& schemeArg)
823 {
824     return scheme(schemeArg) != Scheme::NonSpecial;
825 }
826
827 enum class URLParser::URLPart {
828     SchemeEnd,
829     UserStart,
830     UserEnd,
831     PasswordEnd,
832     HostEnd,
833     PortEnd,
834     PathAfterLastSlash,
835     PathEnd,
836     QueryEnd,
837 };
838
839 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
840 {
841     switch (part) {
842     case URLPart::QueryEnd:
843         return url.m_queryEnd;
844     case URLPart::PathEnd:
845         return url.m_pathEnd;
846     case URLPart::PathAfterLastSlash:
847         return url.m_pathAfterLastSlash;
848     case URLPart::PortEnd:
849         return url.m_portEnd;
850     case URLPart::HostEnd:
851         return url.m_hostEnd;
852     case URLPart::PasswordEnd:
853         return url.m_passwordEnd;
854     case URLPart::UserEnd:
855         return url.m_userEnd;
856     case URLPart::UserStart:
857         return url.m_userStart;
858     case URLPart::SchemeEnd:
859         return url.m_schemeEnd;
860     }
861     ASSERT_NOT_REACHED();
862     return 0;
863 }
864
865 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
866 {
867     RELEASE_ASSERT(length <= string.length());
868     if (string.isNull())
869         return;
870     ASSERT(m_asciiBuffer.isEmpty());
871     if (string.is8Bit())
872         appendToASCIIBuffer(string.characters8(), length);
873     else {
874         const UChar* characters = string.characters16();
875         for (size_t i = 0; i < length; ++i) {
876             UChar c = characters[i];
877             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
878             appendToASCIIBuffer(c);
879         }
880     }
881 }
882
883 template<typename CharacterType>
884 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
885 {
886     syntaxViolation(iterator);
887
888     m_asciiBuffer.clear();
889     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
890     switch (part) {
891     case URLPart::QueryEnd:
892         m_url.m_queryEnd = base.m_queryEnd;
893         FALLTHROUGH;
894     case URLPart::PathEnd:
895         m_url.m_pathEnd = base.m_pathEnd;
896         FALLTHROUGH;
897     case URLPart::PathAfterLastSlash:
898         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
899         FALLTHROUGH;
900     case URLPart::PortEnd:
901         m_url.m_portEnd = base.m_portEnd;
902         FALLTHROUGH;
903     case URLPart::HostEnd:
904         m_url.m_hostEnd = base.m_hostEnd;
905         FALLTHROUGH;
906     case URLPart::PasswordEnd:
907         m_url.m_passwordEnd = base.m_passwordEnd;
908         FALLTHROUGH;
909     case URLPart::UserEnd:
910         m_url.m_userEnd = base.m_userEnd;
911         FALLTHROUGH;
912     case URLPart::UserStart:
913         m_url.m_userStart = base.m_userStart;
914         FALLTHROUGH;
915     case URLPart::SchemeEnd:
916         m_url.m_isValid = base.m_isValid;
917         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
918         m_url.m_schemeEnd = base.m_schemeEnd;
919     }
920     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
921     case Scheme::WS:
922     case Scheme::WSS:
923         isUTF8Encoding = true;
924         m_urlIsSpecial = true;
925         return;
926     case Scheme::File:
927         m_urlIsFile = true;
928         FALLTHROUGH;
929     case Scheme::FTP:
930     case Scheme::Gopher:
931     case Scheme::HTTP:
932     case Scheme::HTTPS:
933         m_urlIsSpecial = true;
934         return;
935     case Scheme::NonSpecial:
936         m_urlIsSpecial = false;
937         isUTF8Encoding = true;
938         return;
939     }
940     ASSERT_NOT_REACHED();
941 }
942
// The two hex digits that follow '%' in the percent-encoded spelling of '.' ("%2e").
// Code in this file compares the second digit after lowercasing the input with toASCIILower.
static constexpr char dotASCIICode[2] = { '2', 'e' };
944
945 template<typename CharacterType>
946 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
947 {
948     if (c.atEnd())
949         return false;
950     if (*c == '.') {
951         advance<CharacterType, ReportSyntaxViolation::No>(c);
952         return c.atEnd() || isSlashQuestionOrHash(*c);
953     }
954     if (*c != '%')
955         return false;
956     advance<CharacterType, ReportSyntaxViolation::No>(c);
957     if (c.atEnd() || *c != dotASCIICode[0])
958         return false;
959     advance<CharacterType, ReportSyntaxViolation::No>(c);
960     if (c.atEnd())
961         return false;
962     if (toASCIILower(*c) == dotASCIICode[1]) {
963         advance<CharacterType, ReportSyntaxViolation::No>(c);
964         return c.atEnd() || isSlashQuestionOrHash(*c);
965     }
966     return false;
967 }
968
969 template<typename CharacterType>
970 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
971 {
972     if (c.atEnd())
973         return false;
974     if (*c == '.') {
975         advance<CharacterType, ReportSyntaxViolation::No>(c);
976         return isSingleDotPathSegment(c);
977     }
978     if (*c != '%')
979         return false;
980     advance<CharacterType, ReportSyntaxViolation::No>(c);
981     if (c.atEnd() || *c != dotASCIICode[0])
982         return false;
983     advance<CharacterType, ReportSyntaxViolation::No>(c);
984     if (c.atEnd())
985         return false;
986     if (toASCIILower(*c) == dotASCIICode[1]) {
987         advance<CharacterType, ReportSyntaxViolation::No>(c);
988         return isSingleDotPathSegment(c);
989     }
990     return false;
991 }
992
993 template<typename CharacterType>
994 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
995 {
996     ASSERT(isSingleDotPathSegment(c));
997     if (*c == '.') {
998         advance(c);
999         if (!c.atEnd()) {
1000             if (*c == '/' || *c == '\\')
1001                 advance(c);
1002             else
1003                 ASSERT(*c == '?' || *c == '#');
1004         }
1005     } else {
1006         ASSERT(*c == '%');
1007         advance(c);
1008         ASSERT(*c == dotASCIICode[0]);
1009         advance(c);
1010         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1011         advance(c);
1012         if (!c.atEnd()) {
1013             if (*c == '/' || *c == '\\')
1014                 advance(c);
1015             else
1016                 ASSERT(*c == '?' || *c == '#');
1017         }
1018     }
1019 }
1020
1021 template<typename CharacterType>
1022 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1023 {
1024     ASSERT(isDoubleDotPathSegment(c));
1025     if (*c == '.')
1026         advance(c);
1027     else {
1028         ASSERT(*c == '%');
1029         advance(c);
1030         ASSERT(*c == dotASCIICode[0]);
1031         advance(c);
1032         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1033         advance(c);
1034     }
1035     consumeSingleDotPathSegment(c);
1036 }
1037
1038 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1039 {
1040     ASSERT(m_didSeeSyntaxViolation);
1041     if (!m_urlIsFile)
1042         return true;
1043
1044     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1045     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1046     if (newPathAfterLastSlash == m_url.m_portEnd + 1 && isWindowsDriveLetter(componentToPop))
1047         return false;
1048     return true;
1049 }
1050
1051 void URLParser::popPath()
1052 {
1053     ASSERT(m_didSeeSyntaxViolation);
1054     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
1055         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1056         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1057             newPathAfterLastSlash--;
1058         while (newPathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[newPathAfterLastSlash] != '/')
1059             newPathAfterLastSlash--;
1060         newPathAfterLastSlash++;
1061         if (shouldPopPath(newPathAfterLastSlash))
1062             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1063     }
1064     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1065 }
1066
1067 template<typename CharacterType>
1068 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1069 {
1070     if (m_didSeeSyntaxViolation)
1071         return;
1072     m_didSeeSyntaxViolation = true;
1073     
1074     ASSERT(m_asciiBuffer.isEmpty());
1075     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1076     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1077     m_asciiBuffer.reserveCapacity(m_inputString.length());
1078     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1079         ASSERT(isASCII(m_inputString[i]));
1080         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1081     }
1082 }
1083
1084 void URLParser::failure()
1085 {
1086     m_url.invalidate();
1087     m_url.m_string = m_inputString;
1088 }
1089
1090 template<typename CharacterType>
1091 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1092 {
1093     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1094         return false;
1095     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1096     return true;
1097 }
1098
1099 template<typename CharacterType>
1100 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1101 {
1102     if (!checkLocalhostCodePoint(iterator, 'l'))
1103         return false;
1104     if (!checkLocalhostCodePoint(iterator, 'o'))
1105         return false;
1106     if (!checkLocalhostCodePoint(iterator, 'c'))
1107         return false;
1108     if (!checkLocalhostCodePoint(iterator, 'a'))
1109         return false;
1110     if (!checkLocalhostCodePoint(iterator, 'l'))
1111         return false;
1112     if (!checkLocalhostCodePoint(iterator, 'h'))
1113         return false;
1114     if (!checkLocalhostCodePoint(iterator, 'o'))
1115         return false;
1116     if (!checkLocalhostCodePoint(iterator, 's'))
1117         return false;
1118     if (!checkLocalhostCodePoint(iterator, 't'))
1119         return false;
1120     return iterator.atEnd();
1121 }
1122
1123 bool URLParser::isLocalhost(StringView view)
1124 {
1125     if (view.is8Bit())
1126         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1127     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1128 }
1129
1130 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1131 {
1132     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1133         ASSERT(start + length <= m_asciiBuffer.size());
1134         return StringView(m_asciiBuffer.data() + start, length);
1135     }
1136     ASSERT(start + length <= m_inputString.length());
1137     return StringView(m_inputString).substring(start, length);
1138 }
1139
1140 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1141 {
1142     if (UNLIKELY(m_didSeeSyntaxViolation))
1143         return m_asciiBuffer[position];
1144     return m_inputString[position];
1145 }
1146
1147 template<typename CharacterType>
1148 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1149 {
1150     if (UNLIKELY(m_didSeeSyntaxViolation))
1151         return m_asciiBuffer.size();
1152     
1153     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1154 }
1155
1156 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1157     : m_inputString(input)
1158 {
1159     if (input.isNull()) {
1160         if (base.isValid() && !base.m_cannotBeABaseURL) {
1161             m_url = base;
1162             m_url.removeFragmentIdentifier();
1163         }
1164         return;
1165     }
1166
1167     if (input.is8Bit()) {
1168         m_inputBegin = input.characters8();
1169         parse(input.characters8(), input.length(), base, encoding);
1170     } else {
1171         m_inputBegin = input.characters16();
1172         parse(input.characters16(), input.length(), base, encoding);
1173     }
1174
1175     ASSERT(!m_url.m_isValid
1176         || m_didSeeSyntaxViolation == (m_url.string() != input)
1177         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1178             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1179     ASSERT(internalValuesConsistent(m_url));
1180 #if !ASSERT_DISABLED
1181     if (!m_didSeeSyntaxViolation) {
1182         // Force a syntax violation at the beginning to make sure we get the same result.
1183         URLParser parser(makeString(" ", input), base, encoding);
1184         URL parsed = parser.result();
1185         if (parsed.isValid())
1186             ASSERT(allValuesEqual(parser.result(), m_url));
1187     }
1188 #endif
1189 }
1190
1191 template<typename CharacterType>
1192 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1193 {
1194     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1195     m_url = { };
1196     ASSERT(m_asciiBuffer.isEmpty());
1197     
1198     bool isUTF8Encoding = encoding == UTF8Encoding();
1199     Vector<UChar> queryBuffer;
1200
1201     unsigned endIndex = length;
1202     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1203         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1204         endIndex--;
1205     }
1206     CodePointIterator<CharacterType> c(input, input + endIndex);
1207     CodePointIterator<CharacterType> authorityOrHostBegin;
1208     CodePointIterator<CharacterType> queryBegin;
1209     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1210         syntaxViolation(c);
1211         ++c;
1212     }
1213     auto beginAfterControlAndSpace = c;
1214
1215     enum class State : uint8_t {
1216         SchemeStart,
1217         Scheme,
1218         NoScheme,
1219         SpecialRelativeOrAuthority,
1220         PathOrAuthority,
1221         Relative,
1222         RelativeSlash,
1223         SpecialAuthoritySlashes,
1224         SpecialAuthorityIgnoreSlashes,
1225         AuthorityOrHost,
1226         Host,
1227         File,
1228         FileSlash,
1229         FileHost,
1230         PathStart,
1231         Path,
1232         CannotBeABaseURLPath,
1233         UTF8Query,
1234         NonUTF8Query,
1235         Fragment,
1236     };
1237
1238 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1239 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1240
1241     State state = State::SchemeStart;
1242     while (!c.atEnd()) {
1243         if (UNLIKELY(isTabOrNewline(*c))) {
1244             syntaxViolation(c);
1245             ++c;
1246             continue;
1247         }
1248
1249         switch (state) {
1250         case State::SchemeStart:
1251             LOG_STATE("SchemeStart");
1252             if (isASCIIAlpha(*c)) {
1253                 if (UNLIKELY(isASCIIUpper(*c)))
1254                     syntaxViolation(c);
1255                 appendToASCIIBuffer(toASCIILower(*c));
1256                 advance(c);
1257                 if (c.atEnd()) {
1258                     m_asciiBuffer.clear();
1259                     state = State::NoScheme;
1260                     c = beginAfterControlAndSpace;
1261                     break;
1262                 }
1263                 state = State::Scheme;
1264             } else
1265                 state = State::NoScheme;
1266             break;
1267         case State::Scheme:
1268             LOG_STATE("Scheme");
1269             if (isValidSchemeCharacter(*c)) {
1270                 if (UNLIKELY(isASCIIUpper(*c)))
1271                     syntaxViolation(c);
1272                 appendToASCIIBuffer(toASCIILower(*c));
1273             } else if (*c == ':') {
1274                 m_url.m_schemeEnd = currentPosition(c);
1275                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1276                 appendToASCIIBuffer(':');
1277                 switch (scheme(urlScheme)) {
1278                 case Scheme::File:
1279                     m_urlIsSpecial = true;
1280                     m_urlIsFile = true;
1281                     state = State::File;
1282                     ++c;
1283                     break;
1284                 case Scheme::WS:
1285                 case Scheme::WSS:
1286                     isUTF8Encoding = true;
1287                     m_urlIsSpecial = true;
1288                     if (base.protocolIs(urlScheme))
1289                         state = State::SpecialRelativeOrAuthority;
1290                     else
1291                         state = State::SpecialAuthoritySlashes;
1292                     ++c;
1293                     break;
1294                 case Scheme::HTTP:
1295                 case Scheme::HTTPS:
1296                     m_url.m_protocolIsInHTTPFamily = true;
1297                     FALLTHROUGH;
1298                 case Scheme::FTP:
1299                 case Scheme::Gopher:
1300                     m_urlIsSpecial = true;
1301                     if (base.protocolIs(urlScheme))
1302                         state = State::SpecialRelativeOrAuthority;
1303                     else
1304                         state = State::SpecialAuthoritySlashes;
1305                     ++c;
1306                     break;
1307                 case Scheme::NonSpecial:
1308                     isUTF8Encoding = true;
1309                     auto maybeSlash = c;
1310                     advance(maybeSlash);
1311                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1312                         appendToASCIIBuffer('/');
1313                         c = maybeSlash;
1314                         state = State::PathOrAuthority;
1315                         ASSERT(*c == '/');
1316                         ++c;
1317                         m_url.m_userStart = currentPosition(c);
1318                     } else {
1319                         ++c;
1320                         m_url.m_userStart = currentPosition(c);
1321                         m_url.m_userEnd = m_url.m_userStart;
1322                         m_url.m_passwordEnd = m_url.m_userStart;
1323                         m_url.m_hostEnd = m_url.m_userStart;
1324                         m_url.m_portEnd = m_url.m_userStart;
1325                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1326                         m_url.m_cannotBeABaseURL = true;
1327                         state = State::CannotBeABaseURLPath;
1328                     }
1329                     break;
1330                 }
1331                 break;
1332             } else {
1333                 m_asciiBuffer.clear();
1334                 state = State::NoScheme;
1335                 c = beginAfterControlAndSpace;
1336                 break;
1337             }
1338             advance(c);
1339             if (c.atEnd()) {
1340                 m_asciiBuffer.clear();
1341                 state = State::NoScheme;
1342                 c = beginAfterControlAndSpace;
1343             }
1344             break;
1345         case State::NoScheme:
1346             LOG_STATE("NoScheme");
1347             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1348                 failure();
1349                 return;
1350             }
1351             if (base.m_cannotBeABaseURL && *c == '#') {
1352                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1353                 state = State::Fragment;
1354                 appendToASCIIBuffer('#');
1355                 ++c;
1356                 break;
1357             }
1358             if (!base.protocolIs("file")) {
1359                 state = State::Relative;
1360                 break;
1361             }
1362             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1363             appendToASCIIBuffer(':');
1364             state = State::File;
1365             break;
1366         case State::SpecialRelativeOrAuthority:
1367             LOG_STATE("SpecialRelativeOrAuthority");
1368             if (*c == '/') {
1369                 appendToASCIIBuffer('/');
1370                 advance(c);
1371                 if (c.atEnd()) {
1372                     failure();
1373                     return;
1374                 }
1375                 if (*c == '/') {
1376                     appendToASCIIBuffer('/');
1377                     state = State::SpecialAuthorityIgnoreSlashes;
1378                     ++c;
1379                 } else
1380                     state = State::RelativeSlash;
1381             } else
1382                 state = State::Relative;
1383             break;
1384         case State::PathOrAuthority:
1385             LOG_STATE("PathOrAuthority");
1386             if (*c == '/') {
1387                 appendToASCIIBuffer('/');
1388                 state = State::AuthorityOrHost;
1389                 advance(c);
1390                 m_url.m_userStart = currentPosition(c);
1391                 authorityOrHostBegin = c;
1392             } else {
1393                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1394                 m_url.m_userStart = currentPosition(c) - 1;
1395                 m_url.m_userEnd = m_url.m_userStart;
1396                 m_url.m_passwordEnd = m_url.m_userStart;
1397                 m_url.m_hostEnd = m_url.m_userStart;
1398                 m_url.m_portEnd = m_url.m_userStart;
1399                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1400                 state = State::Path;
1401             }
1402             break;
1403         case State::Relative:
1404             LOG_STATE("Relative");
1405             switch (*c) {
1406             case '/':
1407             case '\\':
1408                 state = State::RelativeSlash;
1409                 ++c;
1410                 break;
1411             case '?':
1412                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1413                 appendToASCIIBuffer('?');
1414                 ++c;
1415                 if (isUTF8Encoding)
1416                     state = State::UTF8Query;
1417                 else {
1418                     queryBegin = c;
1419                     state = State::NonUTF8Query;
1420                 }
1421                 break;
1422             case '#':
1423                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1424                 appendToASCIIBuffer('#');
1425                 state = State::Fragment;
1426                 ++c;
1427                 break;
1428             default:
1429                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1430                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1431                     appendToASCIIBuffer('/');
1432                     m_url.m_pathAfterLastSlash = currentPosition(c);
1433                 }
1434                 state = State::Path;
1435                 break;
1436             }
1437             break;
1438         case State::RelativeSlash:
1439             LOG_STATE("RelativeSlash");
1440             if (*c == '/' || *c == '\\') {
1441                 ++c;
1442                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1443                 appendToASCIIBuffer("://", 3);
1444                 if (m_urlIsSpecial)
1445                     state = State::SpecialAuthorityIgnoreSlashes;
1446                 else {
1447                     m_url.m_userStart = currentPosition(c);
1448                     state = State::AuthorityOrHost;
1449                     authorityOrHostBegin = c;
1450                 }
1451             } else {
1452                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1453                 appendToASCIIBuffer('/');
1454                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1455                 state = State::Path;
1456             }
1457             break;
1458         case State::SpecialAuthoritySlashes:
1459             LOG_STATE("SpecialAuthoritySlashes");
1460             if (LIKELY(*c == '/' || *c == '\\')) {
1461                 if (UNLIKELY(*c == '\\'))
1462                     syntaxViolation(c);
1463                 appendToASCIIBuffer('/');
1464                 advance(c);
1465                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1466                     if (UNLIKELY(*c == '\\'))
1467                         syntaxViolation(c);
1468                     ++c;
1469                     appendToASCIIBuffer('/');
1470                 } else {
1471                     syntaxViolation(c);
1472                     appendToASCIIBuffer('/');
1473                 }
1474             } else {
1475                 syntaxViolation(c);
1476                 appendToASCIIBuffer("//", 2);
1477             }
1478             state = State::SpecialAuthorityIgnoreSlashes;
1479             break;
1480         case State::SpecialAuthorityIgnoreSlashes:
1481             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1482             if (*c == '/' || *c == '\\') {
1483                 syntaxViolation(c);
1484                 ++c;
1485             } else {
1486                 m_url.m_userStart = currentPosition(c);
1487                 state = State::AuthorityOrHost;
1488                 authorityOrHostBegin = c;
1489             }
1490             break;
1491         case State::AuthorityOrHost:
1492             do {
1493                 LOG_STATE("AuthorityOrHost");
1494                 if (*c == '@') {
1495                     auto lastAt = c;
1496                     auto findLastAt = c;
1497                     while (!findLastAt.atEnd()) {
1498                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1499                         if (*findLastAt == '@')
1500                             lastAt = findLastAt;
1501                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1502                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1503                             break;
1504                         ++findLastAt;
1505                     }
1506                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1507                     c = lastAt;
1508                     advance(c);
1509                     authorityOrHostBegin = c;
1510                     state = State::Host;
1511                     m_hostHasPercentOrNonASCII = false;
1512                     break;
1513                 }
1514                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1515                 if (isSlash || *c == '?' || *c == '#') {
1516                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1517                     if (iterator.atEnd()) {
1518                         if (m_urlIsSpecial)
1519                             return failure();
1520                         m_url.m_userEnd = currentPosition(c);
1521                         m_url.m_passwordEnd = m_url.m_userEnd;
1522                         m_url.m_hostEnd = m_url.m_userEnd;
1523                         m_url.m_portEnd = m_url.m_userEnd;
1524                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1525                     } else {
1526                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1527                         m_url.m_passwordEnd = m_url.m_userEnd;
1528                         if (!parseHostAndPort(iterator)) {
1529                             failure();
1530                             return;
1531                         }
1532                         if (UNLIKELY(!isSlash)) {
1533                             if (m_urlIsSpecial) {
1534                                 syntaxViolation(c);
1535                                 appendToASCIIBuffer('/');
1536                             }
1537                             m_url.m_pathAfterLastSlash = currentPosition(c);
1538                         }
1539                     }
1540                     state = State::Path;
1541                     break;
1542                 }
1543                 if (isPercentOrNonASCII(*c))
1544                     m_hostHasPercentOrNonASCII = true;
1545                 ++c;
1546             } while (!c.atEnd());
1547             break;
1548         case State::Host:
1549             do {
1550                 LOG_STATE("Host");
1551                 if (*c == '/' || *c == '?' || *c == '#') {
1552                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1553                         failure();
1554                         return;
1555                     }
1556                     if (*c == '?' || *c == '#') {
1557                         syntaxViolation(c);
1558                         appendToASCIIBuffer('/');
1559                         m_url.m_pathAfterLastSlash = currentPosition(c);
1560                     }
1561                     state = State::Path;
1562                     break;
1563                 }
1564                 if (isPercentOrNonASCII(*c))
1565                     m_hostHasPercentOrNonASCII = true;
1566                 ++c;
1567             } while (!c.atEnd());
1568             break;
1569         case State::File:
1570             LOG_STATE("File");
1571             switch (*c) {
1572             case '\\':
1573                 syntaxViolation(c);
1574                 FALLTHROUGH;
1575             case '/':
1576                 appendToASCIIBuffer('/');
1577                 state = State::FileSlash;
1578                 ++c;
1579                 break;
1580             case '?':
1581                 syntaxViolation(c);
1582                 if (base.isValid() && base.protocolIs("file")) {
1583                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1584                     appendToASCIIBuffer('?');
1585                     ++c;
1586                 } else {
1587                     appendToASCIIBuffer("///?", 4);
1588                     ++c;
1589                     m_url.m_userStart = currentPosition(c) - 2;
1590                     m_url.m_userEnd = m_url.m_userStart;
1591                     m_url.m_passwordEnd = m_url.m_userStart;
1592                     m_url.m_hostEnd = m_url.m_userStart;
1593                     m_url.m_portEnd = m_url.m_userStart;
1594                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1595                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1596                 }
1597                 if (isUTF8Encoding)
1598                     state = State::UTF8Query;
1599                 else {
1600                     queryBegin = c;
1601                     state = State::NonUTF8Query;
1602                 }
1603                 break;
1604             case '#':
1605                 syntaxViolation(c);
1606                 if (base.isValid() && base.protocolIs("file")) {
1607                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1608                     appendToASCIIBuffer('#');
1609                 } else {
1610                     appendToASCIIBuffer("///#", 4);
1611                     m_url.m_userStart = currentPosition(c) - 2;
1612                     m_url.m_userEnd = m_url.m_userStart;
1613                     m_url.m_passwordEnd = m_url.m_userStart;
1614                     m_url.m_hostEnd = m_url.m_userStart;
1615                     m_url.m_portEnd = m_url.m_userStart;
1616                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1617                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1618                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1619                 }
1620                 state = State::Fragment;
1621                 ++c;
1622                 break;
1623             default:
1624                 syntaxViolation(c);
1625                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1626                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1627                 else {
1628                     appendToASCIIBuffer("///", 3);
1629                     m_url.m_userStart = currentPosition(c) - 1;
1630                     m_url.m_userEnd = m_url.m_userStart;
1631                     m_url.m_passwordEnd = m_url.m_userStart;
1632                     m_url.m_hostEnd = m_url.m_userStart;
1633                     m_url.m_portEnd = m_url.m_userStart;
1634                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1635                     if (isWindowsDriveLetter(c))
1636                         appendWindowsDriveLetter(c);
1637                 }
1638                 state = State::Path;
1639                 break;
1640             }
1641             break;
1642         case State::FileSlash:
1643             LOG_STATE("FileSlash");
1644             if (LIKELY(*c == '/' || *c == '\\')) {
1645                 if (UNLIKELY(*c == '\\'))
1646                     syntaxViolation(c);
1647                 appendToASCIIBuffer('/');
1648                 advance(c);
1649                 m_url.m_userStart = currentPosition(c);
1650                 m_url.m_userEnd = m_url.m_userStart;
1651                 m_url.m_passwordEnd = m_url.m_userStart;
1652                 m_url.m_hostEnd = m_url.m_userStart;
1653                 m_url.m_portEnd = m_url.m_userStart;
1654                 authorityOrHostBegin = c;
1655                 state = State::FileHost;
1656                 break;
1657             }
1658             syntaxViolation(c);
1659             appendToASCIIBuffer("//", 2);
1660             m_url.m_userStart = currentPosition(c) - 1;
1661             m_url.m_userEnd = m_url.m_userStart;
1662             m_url.m_passwordEnd = m_url.m_userStart;
1663             m_url.m_hostEnd = m_url.m_userStart;
1664             m_url.m_portEnd = m_url.m_userStart;
1665             if (isWindowsDriveLetter(c)) {
1666                 appendWindowsDriveLetter(c);
1667                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1668             } else if (copyBaseWindowsDriveLetter(base)) {
1669                 appendToASCIIBuffer('/');
1670                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1671             } else
1672                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1673             state = State::Path;
1674             break;
1675         case State::FileHost:
1676             do {
1677                 LOG_STATE("FileHost");
1678                 if (isSlashQuestionOrHash(*c)) {
1679                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1680                         && isWindowsDriveLetter(authorityOrHostBegin);
1681                     if (windowsQuirk) {
1682                         syntaxViolation(authorityOrHostBegin);
1683                         appendToASCIIBuffer('/');
1684                         appendWindowsDriveLetter(authorityOrHostBegin);
1685                     }
1686                     if (windowsQuirk || authorityOrHostBegin == c) {
1687                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1688                         if (UNLIKELY(*c == '?')) {
1689                             syntaxViolation(c);
1690                             appendToASCIIBuffer("/?", 2);
1691                             ++c;
1692                             if (isUTF8Encoding)
1693                                 state = State::UTF8Query;
1694                             else {
1695                                 queryBegin = c;
1696                                 state = State::NonUTF8Query;
1697                             }
1698                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1699                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1700                             break;
1701                         }
1702                         if (UNLIKELY(*c == '#')) {
1703                             syntaxViolation(c);
1704                             appendToASCIIBuffer("/#", 2);
1705                             ++c;
1706                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1707                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1708                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1709                             state = State::Fragment;
1710                             break;
1711                         }
1712                         state = State::Path;
1713                         break;
1714                     }
1715                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1716                         failure();
1717                         return;
1718                     }
1719                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1720                         syntaxViolation(c);
1721                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1722                         m_url.m_hostEnd = currentPosition(c);
1723                         m_url.m_portEnd = m_url.m_hostEnd;
1724                     }
1725                     
1726                     state = State::PathStart;
1727                     break;
1728                 }
1729                 if (isPercentOrNonASCII(*c))
1730                     m_hostHasPercentOrNonASCII = true;
1731                 ++c;
1732             } while (!c.atEnd());
1733             break;
1734         case State::PathStart:
1735             LOG_STATE("PathStart");
1736             if (*c != '/' && *c != '\\') {
1737                 syntaxViolation(c);
1738                 appendToASCIIBuffer('/');
1739             }
1740             m_url.m_pathAfterLastSlash = currentPosition(c);
1741             state = State::Path;
1742             break;
1743         case State::Path:
1744             LOG_STATE("Path");
1745             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1746                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1747                     syntaxViolation(c);
1748                 appendToASCIIBuffer('/');
1749                 ++c;
1750                 m_url.m_pathAfterLastSlash = currentPosition(c);
1751                 break;
1752             }
1753             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1754                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1755                     syntaxViolation(c);
1756                     consumeDoubleDotPathSegment(c);
1757                     popPath();
1758                     break;
1759                 }
1760                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1761                     syntaxViolation(c);
1762                     consumeSingleDotPathSegment(c);
1763                     break;
1764                 }
1765             }
1766             if (*c == '?') {
1767                 m_url.m_pathEnd = currentPosition(c);
1768                 appendToASCIIBuffer('?');
1769                 ++c;
1770                 if (isUTF8Encoding)
1771                     state = State::UTF8Query;
1772                 else {
1773                     queryBegin = c;
1774                     state = State::NonUTF8Query;
1775                 }
1776                 break;
1777             }
1778             if (*c == '#') {
1779                 m_url.m_pathEnd = currentPosition(c);
1780                 m_url.m_queryEnd = m_url.m_pathEnd;
1781                 state = State::Fragment;
1782                 break;
1783             }
1784             utf8PercentEncode<isInDefaultEncodeSet>(c);
1785             ++c;
1786             break;
1787         case State::CannotBeABaseURLPath:
1788             LOG_STATE("CannotBeABaseURLPath");
1789             if (*c == '?') {
1790                 m_url.m_pathEnd = currentPosition(c);
1791                 appendToASCIIBuffer('?');
1792                 ++c;
1793                 if (isUTF8Encoding)
1794                     state = State::UTF8Query;
1795                 else {
1796                     queryBegin = c;
1797                     state = State::NonUTF8Query;
1798                 }
1799             } else if (*c == '#') {
1800                 m_url.m_pathEnd = currentPosition(c);
1801                 m_url.m_queryEnd = m_url.m_pathEnd;
1802                 state = State::Fragment;
1803             } else if (*c == '/') {
1804                 appendToASCIIBuffer('/');
1805                 ++c;
1806                 m_url.m_pathAfterLastSlash = currentPosition(c);
1807             } else {
1808                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1809                 ++c;
1810             }
1811             break;
1812         case State::UTF8Query:
1813             LOG_STATE("UTF8Query");
1814             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1815             if (*c == '#') {
1816                 m_url.m_queryEnd = currentPosition(c);
1817                 state = State::Fragment;
1818                 break;
1819             }
1820             if (isUTF8Encoding)
1821                 utf8QueryEncode(c);
1822             else
1823                 appendCodePoint(queryBuffer, *c);
1824             ++c;
1825             break;
1826         case State::NonUTF8Query:
1827             do {
1828                 LOG_STATE("NonUTF8Query");
1829                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1830                 if (*c == '#') {
1831                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1832                     m_url.m_queryEnd = currentPosition(c);
1833                     state = State::Fragment;
1834                     break;
1835                 }
1836                 appendCodePoint(queryBuffer, *c);
1837                 advance(c, queryBegin);
1838             } while (!c.atEnd());
1839             break;
1840         case State::Fragment:
1841             URL_PARSER_LOG("State Fragment");
1842             utf8PercentEncode<isInSimpleEncodeSet>(c);
1843             ++c;
1844             break;
1845         }
1846     }
1847
1848     switch (state) {
1849     case State::SchemeStart:
1850         LOG_FINAL_STATE("SchemeStart");
1851         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1852             m_url = base;
1853             m_url.removeFragmentIdentifier();
1854             return;
1855         }
1856         failure();
1857         return;
1858     case State::Scheme:
1859         LOG_FINAL_STATE("Scheme");
1860         failure();
1861         return;
1862     case State::NoScheme:
1863         LOG_FINAL_STATE("NoScheme");
1864         RELEASE_ASSERT_NOT_REACHED();
1865     case State::SpecialRelativeOrAuthority:
1866         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1867         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1868         break;
1869     case State::PathOrAuthority:
1870         LOG_FINAL_STATE("PathOrAuthority");
1871         ASSERT(m_url.m_userStart);
1872         ASSERT(m_url.m_userStart == currentPosition(c));
1873         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1874         m_url.m_userStart--;
1875         m_url.m_userEnd = m_url.m_userStart;
1876         m_url.m_passwordEnd = m_url.m_userStart;
1877         m_url.m_hostEnd = m_url.m_userStart;
1878         m_url.m_portEnd = m_url.m_userStart;
1879         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1880         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1881         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1882         break;
1883     case State::Relative:
1884         LOG_FINAL_STATE("Relative");
1885         RELEASE_ASSERT_NOT_REACHED();
1886     case State::RelativeSlash:
1887         LOG_FINAL_STATE("RelativeSlash");
1888         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1889         appendToASCIIBuffer('/');
1890         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1891         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1892         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1893         break;
1894     case State::SpecialAuthoritySlashes:
1895         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1896         m_url.m_userStart = currentPosition(c);
1897         m_url.m_userEnd = m_url.m_userStart;
1898         m_url.m_passwordEnd = m_url.m_userStart;
1899         m_url.m_hostEnd = m_url.m_userStart;
1900         m_url.m_portEnd = m_url.m_userStart;
1901         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1902         m_url.m_pathEnd = m_url.m_userStart;
1903         m_url.m_queryEnd = m_url.m_userStart;
1904         break;
1905     case State::SpecialAuthorityIgnoreSlashes:
1906         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1907         failure();
1908         return;
1909     case State::AuthorityOrHost:
1910         LOG_FINAL_STATE("AuthorityOrHost");
1911         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1912         m_url.m_passwordEnd = m_url.m_userEnd;
1913         if (authorityOrHostBegin.atEnd()) {
1914             m_url.m_userEnd = m_url.m_userStart;
1915             m_url.m_passwordEnd = m_url.m_userStart;
1916             m_url.m_hostEnd = m_url.m_userStart;
1917             m_url.m_portEnd = m_url.m_userStart;
1918             m_url.m_pathEnd = m_url.m_userStart;
1919         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1920             failure();
1921             return;
1922         } else {
1923             if (m_urlIsSpecial) {
1924                 syntaxViolation(c);
1925                 appendToASCIIBuffer('/');
1926                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1927             } else
1928                 m_url.m_pathEnd = m_url.m_portEnd;
1929         }
1930         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1931         m_url.m_queryEnd = m_url.m_pathEnd;
1932         break;
1933     case State::Host:
1934         LOG_FINAL_STATE("Host");
1935         if (!parseHostAndPort(authorityOrHostBegin)) {
1936             failure();
1937             return;
1938         }
1939         if (m_urlIsSpecial) {
1940             syntaxViolation(c);
1941             appendToASCIIBuffer('/');
1942             m_url.m_pathEnd = m_url.m_portEnd + 1;
1943         } else
1944             m_url.m_pathEnd = m_url.m_portEnd;
1945         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1946         m_url.m_queryEnd = m_url.m_pathEnd;
1947         break;
1948     case State::File:
1949         LOG_FINAL_STATE("File");
1950         if (base.isValid() && base.protocolIs("file")) {
1951             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1952             break;
1953         }
1954         syntaxViolation(c);
1955         appendToASCIIBuffer("///", 3);
1956         m_url.m_userStart = currentPosition(c) - 1;
1957         m_url.m_userEnd = m_url.m_userStart;
1958         m_url.m_passwordEnd = m_url.m_userStart;
1959         m_url.m_hostEnd = m_url.m_userStart;
1960         m_url.m_portEnd = m_url.m_userStart;
1961         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1962         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1963         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1964         break;
1965     case State::FileSlash:
1966         LOG_FINAL_STATE("FileSlash");
1967         syntaxViolation(c);
1968         m_url.m_userStart = currentPosition(c) + 1;
1969         appendToASCIIBuffer("//", 2);
1970         m_url.m_userEnd = m_url.m_userStart;
1971         m_url.m_passwordEnd = m_url.m_userStart;
1972         m_url.m_hostEnd = m_url.m_userStart;
1973         m_url.m_portEnd = m_url.m_userStart;
1974         if (copyBaseWindowsDriveLetter(base)) {
1975             appendToASCIIBuffer('/');
1976             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1977         } else
1978             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1979         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1980         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1981         break;
1982     case State::FileHost:
1983         LOG_FINAL_STATE("FileHost");
1984         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1985             && isWindowsDriveLetter(authorityOrHostBegin)) {
1986             syntaxViolation(authorityOrHostBegin);
1987             appendToASCIIBuffer('/');
1988             appendWindowsDriveLetter(authorityOrHostBegin);
1989             m_url.m_pathAfterLastSlash = currentPosition(c);
1990             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1991             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1992             break;
1993         }
1994         
1995         if (authorityOrHostBegin == c) {
1996             syntaxViolation(c);
1997             appendToASCIIBuffer('/');
1998             m_url.m_userStart = currentPosition(c) - 1;
1999             m_url.m_userEnd = m_url.m_userStart;
2000             m_url.m_passwordEnd = m_url.m_userStart;
2001             m_url.m_hostEnd = m_url.m_userStart;
2002             m_url.m_portEnd = m_url.m_userStart;
2003             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
2004             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2005             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2006             break;
2007         }
2008
2009         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2010             failure();
2011             return;
2012         }
2013
2014         syntaxViolation(c);
2015         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2016             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2017             m_url.m_hostEnd = currentPosition(c);
2018             m_url.m_portEnd = m_url.m_hostEnd;
2019         }
2020         appendToASCIIBuffer('/');
2021         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
2022         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2023         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2024         break;
2025     case State::PathStart:
2026         LOG_FINAL_STATE("PathStart");
2027         RELEASE_ASSERT_NOT_REACHED();
2028     case State::Path:
2029         LOG_FINAL_STATE("Path");
2030         m_url.m_pathEnd = currentPosition(c);
2031         m_url.m_queryEnd = m_url.m_pathEnd;
2032         break;
2033     case State::CannotBeABaseURLPath:
2034         LOG_FINAL_STATE("CannotBeABaseURLPath");
2035         m_url.m_pathEnd = currentPosition(c);
2036         m_url.m_queryEnd = m_url.m_pathEnd;
2037         break;
2038     case State::UTF8Query:
2039         LOG_FINAL_STATE("UTF8Query");
2040         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2041         m_url.m_queryEnd = currentPosition(c);
2042         break;
2043     case State::NonUTF8Query:
2044         LOG_FINAL_STATE("NonUTF8Query");
2045         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2046         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2047         m_url.m_queryEnd = currentPosition(c);
2048         break;
2049     case State::Fragment:
2050         LOG_FINAL_STATE("Fragment");
2051         break;
2052     }
2053
2054     if (LIKELY(!m_didSeeSyntaxViolation)) {
2055         m_url.m_string = m_inputString;
2056         ASSERT(m_asciiBuffer.isEmpty());
2057     } else
2058         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2059     m_url.m_isValid = true;
2060     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2061 }
2062
2063 template<typename CharacterType>
2064 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2065 {
2066     if (UNLIKELY(iterator.atEnd())) {
2067         syntaxViolation(iterator);
2068         m_url.m_userEnd = currentPosition(iterator);
2069         m_url.m_passwordEnd = m_url.m_userEnd;
2070         return;
2071     }
2072     for (; !iterator.atEnd(); advance(iterator)) {
2073         if (*iterator == ':') {
2074             m_url.m_userEnd = currentPosition(iterator);
2075             auto iteratorAtColon = iterator;
2076             ++iterator;
2077             bool tabOrNewlineAfterColon = false;
2078             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2079                 tabOrNewlineAfterColon = true;
2080                 ++iterator;
2081             }
2082             if (UNLIKELY(iterator.atEnd())) {
2083                 syntaxViolation(iteratorAtColon);
2084                 m_url.m_passwordEnd = m_url.m_userEnd;
2085                 if (m_url.m_userEnd > m_url.m_userStart)
2086                     appendToASCIIBuffer('@');
2087                 return;
2088             }
2089             if (tabOrNewlineAfterColon)
2090                 syntaxViolation(iteratorAtColon);
2091             appendToASCIIBuffer(':');
2092             break;
2093         }
2094         utf8PercentEncode<WebCore::isInUserInfoEncodeSet>(iterator);
2095     }
2096     for (; !iterator.atEnd(); advance(iterator))
2097         utf8PercentEncode<WebCore::isInUserInfoEncodeSet>(iterator);
2098     m_url.m_passwordEnd = currentPosition(iterator);
2099     if (!m_url.m_userEnd)
2100         m_url.m_userEnd = m_url.m_passwordEnd;
2101     appendToASCIIBuffer('@');
2102 }
2103
2104 template<typename UnsignedIntegerType>
2105 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2106 {
2107     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2108     LChar* end = std::end(buf);
2109     LChar* p = end;
2110     do {
2111         *--p = (number % 10) + '0';
2112         number /= 10;
2113     } while (number);
2114     appendToASCIIBuffer(p, end - p);
2115 }
2116
2117 void URLParser::serializeIPv4(IPv4Address address)
2118 {
2119     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2120     appendToASCIIBuffer('.');
2121     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2122     appendToASCIIBuffer('.');
2123     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2124     appendToASCIIBuffer('.');
2125     appendNumberToASCIIBuffer<uint8_t>(address);
2126 }
2127     
// Returns how many consecutive zero pieces |address| contains starting at
// index |begin| (0 when the piece at |begin| is non-zero or |begin| is past
// the end).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < address.size() && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2137
// Finds the start index of the longest run of two or more consecutive zero
// pieces in |address|. When several runs share the maximum length the earliest
// one wins; std::nullopt means no run of length >= 2 exists.
static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
{
    std::optional<size_t> bestStart;
    size_t bestLength = 0;

    size_t index = 0;
    while (index < 8) {
        // Measure the zero run beginning at |index| (possibly empty).
        size_t runEnd = index;
        while (runEnd < 8 && !address[runEnd])
            ++runEnd;
        const size_t runLength = runEnd - index;

        // Strictly-greater comparison keeps the first of equally long runs.
        if (runLength > 1 && runLength > bestLength) {
            bestStart = index;
            bestLength = runLength;
        }

        // The piece at |runEnd| is non-zero (or out of range), so resume after it.
        index = runEnd + 1;
    }
    return bestStart;
}
2154
2155 void URLParser::serializeIPv6Piece(uint16_t piece)
2156 {
2157     bool printed = false;
2158     if (auto nibble0 = piece >> 12) {
2159         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2160         printed = true;
2161     }
2162     auto nibble1 = piece >> 8 & 0xF;
2163     if (printed || nibble1) {
2164         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2165         printed = true;
2166     }
2167     auto nibble2 = piece >> 4 & 0xF;
2168     if (printed || nibble2)
2169         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2170     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2171 }
2172
2173 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2174 {
2175     appendToASCIIBuffer('[');
2176     auto compressPointer = findLongestZeroSequence(address);
2177     for (size_t piece = 0; piece < 8; piece++) {
2178         if (compressPointer && compressPointer.value() == piece) {
2179             ASSERT(!address[piece]);
2180             if (piece)
2181                 appendToASCIIBuffer(':');
2182             else
2183                 appendToASCIIBuffer("::", 2);
2184             while (piece < 8 && !address[piece])
2185                 piece++;
2186             if (piece == 8)
2187                 break;
2188         }
2189         serializeIPv6Piece(address[piece]);
2190         if (piece < 7)
2191             appendToASCIIBuffer(':');
2192     }
2193     appendToASCIIBuffer(']');
2194 }
2195
2196 enum class URLParser::IPv4PieceParsingError {
2197     Failure,
2198     Overflow,
2199 };
2200
2201 template<typename CharacterType>
2202 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2203 {
2204     enum class State : uint8_t {
2205         UnknownBase,
2206         Decimal,
2207         OctalOrHex,
2208         Octal,
2209         Hex,
2210     };
2211     State state = State::UnknownBase;
2212     Checked<uint32_t, RecordOverflow> value = 0;
2213     if (!iterator.atEnd() && *iterator == '.')
2214         return makeUnexpected(IPv4PieceParsingError::Failure);
2215     while (!iterator.atEnd()) {
2216         if (isTabOrNewline(*iterator)) {
2217             didSeeSyntaxViolation = true;
2218             ++iterator;
2219             continue;
2220         }
2221         if (*iterator == '.') {
2222             ASSERT(!value.hasOverflowed());
2223             return value.unsafeGet();
2224         }
2225         switch (state) {
2226         case State::UnknownBase:
2227             if (UNLIKELY(*iterator == '0')) {
2228                 ++iterator;
2229                 state = State::OctalOrHex;
2230                 break;
2231             }
2232             state = State::Decimal;
2233             break;
2234         case State::OctalOrHex:
2235             didSeeSyntaxViolation = true;
2236             if (*iterator == 'x' || *iterator == 'X') {
2237                 ++iterator;
2238                 state = State::Hex;
2239                 break;
2240             }
2241             state = State::Octal;
2242             break;
2243         case State::Decimal:
2244             if (!isASCIIDigit(*iterator))
2245                 return makeUnexpected(IPv4PieceParsingError::Failure);
2246             value *= 10;
2247             value += *iterator - '0';
2248             if (UNLIKELY(value.hasOverflowed()))
2249                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2250             ++iterator;
2251             break;
2252         case State::Octal:
2253             ASSERT(didSeeSyntaxViolation);
2254             if (*iterator < '0' || *iterator > '7')
2255                 return makeUnexpected(IPv4PieceParsingError::Failure);
2256             value *= 8;
2257             value += *iterator - '0';
2258             if (UNLIKELY(value.hasOverflowed()))
2259                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2260             ++iterator;
2261             break;
2262         case State::Hex:
2263             ASSERT(didSeeSyntaxViolation);
2264             if (!isASCIIHexDigit(*iterator))
2265                 return makeUnexpected(IPv4PieceParsingError::Failure);
2266             value *= 16;
2267             value += toASCIIHexValue(*iterator);
2268             if (UNLIKELY(value.hasOverflowed()))
2269                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2270             ++iterator;
2271             break;
2272         }
2273     }
2274     ASSERT(!value.hasOverflowed());
2275     return value.unsafeGet();
2276 }
2277
2278 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2279 {
2280     RELEASE_ASSERT(exponent <= 4);
2281     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2282     return values[exponent];
2283 }
2284
2285 enum class URLParser::IPv4ParsingError {
2286     Failure,
2287     NotIPv4,
2288 };
2289
2290 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2291 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2292 {
2293     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2294     bool didSeeSyntaxViolation = false;
2295     if (!iterator.atEnd() && *iterator == '.')
2296         return makeUnexpected(IPv4ParsingError::NotIPv4);
2297     while (!iterator.atEnd()) {
2298         if (isTabOrNewline(*iterator)) {
2299             didSeeSyntaxViolation = true;
2300             ++iterator;
2301             continue;
2302         }
2303         if (items.size() >= 4)
2304             return makeUnexpected(IPv4ParsingError::NotIPv4);
2305         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2306         if (!iterator.atEnd() && *iterator == '.') {
2307             ++iterator;
2308             if (iterator.atEnd())
2309                 syntaxViolation(iteratorForSyntaxViolationPosition);
2310             else if (*iterator == '.')
2311                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2312         }
2313     }
2314     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2315         return makeUnexpected(IPv4ParsingError::NotIPv4);
2316     for (const auto& item : items) {
2317         if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2318             return makeUnexpected(IPv4ParsingError::NotIPv4);
2319     }
2320     for (const auto& item : items) {
2321         if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2322             return makeUnexpected(IPv4ParsingError::Failure);
2323     }
2324     if (items.size() > 1) {
2325         for (size_t i = 0; i < items.size() - 1; i++) {
2326             if (items[i].value() > 255)
2327                 return makeUnexpected(IPv4ParsingError::Failure);
2328         }
2329     }
2330     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2331         return makeUnexpected(IPv4ParsingError::Failure);
2332
2333     if (didSeeSyntaxViolation)
2334         syntaxViolation(iteratorForSyntaxViolationPosition);
2335     for (const auto& item : items) {
2336         if (item.value() > 255)
2337             syntaxViolation(iteratorForSyntaxViolationPosition);
2338     }
2339
2340     if (UNLIKELY(items.size() != 4))
2341         syntaxViolation(iteratorForSyntaxViolationPosition);
2342
2343     IPv4Address ipv4 = items.takeLast().value();
2344     for (size_t counter = 0; counter < items.size(); ++counter)
2345         ipv4 += items[counter].value() * pow256(3 - counter);
2346     return ipv4;
2347 }
2348
2349 template<typename CharacterType>
2350 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2351 {
2352     if (iterator.atEnd())
2353         return std::nullopt;
2354     uint32_t piece = 0;
2355     bool leadingZeros = false;
2356     size_t digitCount = 0;
2357     while (!iterator.atEnd()) {
2358         if (!isASCIIDigit(*iterator))
2359             return std::nullopt;
2360         ++digitCount;
2361         if (!piece && *iterator == '0') {
2362             if (leadingZeros)
2363                 return std::nullopt;
2364             leadingZeros = true;
2365         }
2366         if (!piece && *iterator == '0')
2367             leadingZeros = true;
2368         piece = piece * 10 + *iterator - '0';
2369         if (piece > 255)
2370             return std::nullopt;
2371         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2372         if (iterator.atEnd())
2373             break;
2374         if (*iterator == '.')
2375             break;
2376     }
2377     if (piece && leadingZeros)
2378         return std::nullopt;
2379     return piece;
2380 }
2381
2382 template<typename CharacterType>
2383 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2384 {
2385     IPv4Address address = 0;
2386     for (size_t i = 0; i < 4; ++i) {
2387         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2388             address = (address << 8) + piece.value();
2389         else
2390             return std::nullopt;
2391         if (i < 3) {
2392             if (iterator.atEnd())
2393                 return std::nullopt;
2394             if (*iterator != '.')
2395                 return std::nullopt;
2396             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2397         } else if (!iterator.atEnd())
2398             return std::nullopt;
2399     }
2400     ASSERT(iterator.atEnd());
2401     return address;
2402 }
2403
2404 template<typename CharacterType>
2405 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2406 {
2407     ASSERT(*c == '[');
2408     const auto hostBegin = c;
2409     advance(c, hostBegin);
2410     if (c.atEnd())
2411         return std::nullopt;
2412
2413     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2414     size_t piecePointer = 0;
2415     std::optional<size_t> compressPointer;
2416
2417     if (*c == ':') {
2418         advance(c, hostBegin);
2419         if (c.atEnd())
2420             return std::nullopt;
2421         if (*c != ':')
2422             return std::nullopt;
2423         advance(c, hostBegin);
2424         ++piecePointer;
2425         compressPointer = piecePointer;
2426     }
2427     
2428     while (!c.atEnd()) {
2429         if (piecePointer == 8)
2430             return std::nullopt;
2431         if (*c == ':') {
2432             if (compressPointer)
2433                 return std::nullopt;
2434             advance(c, hostBegin);
2435             ++piecePointer;
2436             compressPointer = piecePointer;
2437             continue;
2438         }
2439         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2440             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2441                 if (compressPointer && piecePointer == 5)
2442                     return std::nullopt;
2443                 syntaxViolation(hostBegin);
2444                 address[piecePointer++] = ipv4Address.value() >> 16;
2445                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2446                 c = { };
2447                 break;
2448             }
2449         }
2450         uint16_t value = 0;
2451         size_t length = 0;
2452         bool leadingZeros = false;
2453         for (; length < 4; length++) {
2454             if (c.atEnd())
2455                 break;
2456             if (!isASCIIHexDigit(*c))
2457                 break;
2458             if (isASCIIUpper(*c))
2459                 syntaxViolation(hostBegin);
2460             if (*c == '0' && !length)
2461                 leadingZeros = true;
2462             value = value * 0x10 + toASCIIHexValue(*c);
2463             advance(c, hostBegin);
2464         }
2465         
2466         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2467             syntaxViolation(hostBegin);
2468
2469         address[piecePointer++] = value;
2470         if (c.atEnd())
2471             break;
2472         if (piecePointer == 8 || *c != ':')
2473             return std::nullopt;
2474         advance(c, hostBegin);
2475     }
2476     
2477     if (!c.atEnd())
2478         return std::nullopt;
2479     
2480     if (compressPointer) {
2481         size_t swaps = piecePointer - compressPointer.value();
2482         piecePointer = 7;
2483         while (swaps)
2484             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2485     } else if (piecePointer != 8)
2486         return std::nullopt;
2487
2488     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2489     if (possibleCompressPointer)
2490         possibleCompressPointer.value()++;
2491     if (UNLIKELY(compressPointer != possibleCompressPointer))
2492         syntaxViolation(hostBegin);
2493     
2494     return address;
2495 }
2496
2497 template<typename CharacterType>
2498 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2499 {
2500     LCharBuffer output;
2501     output.reserveInitialCapacity(length);
2502     
2503     for (size_t i = 0; i < length; ++i) {
2504         uint8_t byte = input[i];
2505         if (byte != '%')
2506             output.uncheckedAppend(byte);
2507         else if (length > 2 && i < length - 2) {
2508             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2509                 syntaxViolation(iteratorForSyntaxViolationPosition);
2510                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2511                 i += 2;
2512             } else
2513                 output.uncheckedAppend(byte);
2514         } else
2515             output.uncheckedAppend(byte);
2516     }
2517     return output;
2518 }
2519     
2520 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2521 {
2522     LCharBuffer output;
2523     output.reserveInitialCapacity(length);
2524     
2525     for (size_t i = 0; i < length; ++i) {
2526         uint8_t byte = input[i];
2527         if (byte != '%')
2528             output.uncheckedAppend(byte);
2529         else if (length > 2 && i < length - 2) {
2530             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2531                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2532                 i += 2;
2533             } else
2534                 output.uncheckedAppend(byte);
2535         } else
2536             output.uncheckedAppend(byte);
2537     }
2538     return output;
2539 }
2540
2541 template<typename CharacterType> std::optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2542 {
2543     LCharBuffer ascii;
2544     if (domain.isAllASCII()) {
2545         size_t length = domain.length();
2546         if (domain.is8Bit()) {
2547             const LChar* characters = domain.characters8();
2548             ascii.reserveInitialCapacity(length);
2549             for (size_t i = 0; i < length; ++i) {
2550                 if (UNLIKELY(isASCIIUpper(characters[i])))
2551                     syntaxViolation(iteratorForSyntaxViolationPosition);
2552                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2553             }
2554         } else {
2555             const UChar* characters = domain.characters16();
2556             ascii.reserveInitialCapacity(length);
2557             for (size_t i = 0; i < length; ++i) {
2558                 if (UNLIKELY(isASCIIUpper(characters[i])))
2559                     syntaxViolation(iteratorForSyntaxViolationPosition);
2560                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2561             }
2562         }
2563         return ascii;
2564     }
2565     
2566     UChar hostnameBuffer[defaultInlineBufferSize];
2567     UErrorCode error = U_ZERO_ERROR;
2568     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2569     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error);
2570     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2571
2572     if (U_SUCCESS(error) && !processingDetails.errors) {
2573         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2574             ASSERT(isASCII(hostnameBuffer[i]));
2575             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2576         }
2577         ascii.append(hostnameBuffer, numCharactersConverted);
2578         if (domain != StringView(ascii.data(), ascii.size()))
2579             syntaxViolation(iteratorForSyntaxViolationPosition);
2580         return ascii;
2581     }
2582
2583     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2584     return std::nullopt;
2585 }
2586
2587 bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2588 {
2589     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2590         if (isForbiddenHostCodePoint(asciiDomain[i]))
2591             return true;
2592     }
2593     return false;
2594 }
2595
2596 template<typename CharacterType>
2597 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2598 {
2599     ASSERT(*iterator == ':');
2600     auto colonIterator = iterator;
2601     advance(iterator, colonIterator);
2602     uint32_t port = 0;
2603     if (UNLIKELY(iterator.atEnd())) {
2604         m_url.m_portEnd = currentPosition(colonIterator);
2605         syntaxViolation(colonIterator);
2606         return true;
2607     }
2608     size_t digitCount = 0;
2609     bool leadingZeros = false;
2610     for (; !iterator.atEnd(); ++iterator) {
2611         if (UNLIKELY(isTabOrNewline(*iterator))) {
2612             syntaxViolation(colonIterator);
2613             continue;
2614         }
2615         if (isASCIIDigit(*iterator)) {
2616             if (*iterator == '0' && !digitCount)
2617                 leadingZeros = true;
2618             ++digitCount;
2619             port = port * 10 + *iterator - '0';
2620             if (port > std::numeric_limits<uint16_t>::max())
2621                 return false;
2622         } else
2623             return false;
2624     }
2625
2626     if (port && leadingZeros)
2627         syntaxViolation(colonIterator);
2628     
2629     if (!port && digitCount > 1)
2630         syntaxViolation(colonIterator);
2631
2632     ASSERT(port == static_cast<uint16_t>(port));
2633     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2634         syntaxViolation(colonIterator);
2635     else {
2636         appendToASCIIBuffer(':');
2637         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2638         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2639     }
2640
2641     m_url.m_portEnd = currentPosition(iterator);
2642     return true;
2643 }
2644
2645 template<typename CharacterType>
2646 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2647 {
2648     if (iterator.atEnd())
2649         return false;
2650     if (*iterator == ':')
2651         return false;
2652     if (*iterator == '[') {
2653         auto ipv6End = iterator;
2654         while (!ipv6End.atEnd() && *ipv6End != ']')
2655             ++ipv6End;
2656         if (ipv6End.atEnd())
2657             return false;
2658         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2659             serializeIPv6(address.value());
2660             if (!ipv6End.atEnd()) {
2661                 advance(ipv6End);
2662                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2663                     m_url.m_hostEnd = currentPosition(ipv6End);
2664                     return parsePort(ipv6End);
2665                 }
2666                 m_url.m_hostEnd = currentPosition(ipv6End);
2667                 m_url.m_portEnd = m_url.m_hostEnd;
2668                 return true;
2669             }
2670             m_url.m_hostEnd = currentPosition(ipv6End);
2671             return true;
2672         }
2673         return false;
2674     }
2675
2676     if (!m_urlIsSpecial) {
2677         for (; !iterator.atEnd(); ++iterator) {
2678             if (UNLIKELY(isTabOrNewline(*iterator))) {
2679                 syntaxViolation(iterator);
2680                 continue;
2681             }
2682             if (*iterator == ':')
2683                 break;
2684             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2685                 return false;
2686             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2687         }
2688         m_url.m_hostEnd = currentPosition(iterator);
2689         if (iterator.atEnd()) {
2690             m_url.m_portEnd = currentPosition(iterator);
2691             return true;
2692         }
2693         return parsePort(iterator);
2694     }
2695     
2696     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2697         auto hostIterator = iterator;
2698         for (; !iterator.atEnd(); ++iterator) {
2699             if (isTabOrNewline(*iterator))
2700                 continue;
2701             if (*iterator == ':')
2702                 break;
2703             if (isForbiddenHostCodePoint(*iterator))
2704                 return false;
2705         }
2706         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2707         if (address) {
2708             serializeIPv4(address.value());
2709             m_url.m_hostEnd = currentPosition(iterator);
2710             if (iterator.atEnd()) {
2711                 m_url.m_portEnd = currentPosition(iterator);
2712                 return true;
2713             }
2714             return parsePort(iterator);
2715         }
2716         if (address.error() == IPv4ParsingError::Failure)
2717             return false;
2718         for (; hostIterator != iterator; ++hostIterator) {
2719             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2720                 syntaxViolation(hostIterator);
2721                 continue;
2722             }
2723             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2724                 syntaxViolation(hostIterator);
2725             appendToASCIIBuffer(toASCIILower(*hostIterator));
2726         }
2727         m_url.m_hostEnd = currentPosition(iterator);
2728         if (!hostIterator.atEnd())
2729             return parsePort(hostIterator);
2730         m_url.m_portEnd = currentPosition(iterator);
2731         return true;
2732     }
2733     
2734     const auto hostBegin = iterator;
2735     
2736     LCharBuffer utf8Encoded;
2737     for (; !iterator.atEnd(); ++iterator) {
2738         if (UNLIKELY(isTabOrNewline(*iterator))) {
2739             syntaxViolation(hostBegin);
2740             continue;
2741         }
2742         if (*iterator == ':')
2743             break;
2744         if (UNLIKELY(!isASCII(*iterator)))
2745             syntaxViolation(hostBegin);
2746
2747         uint8_t buffer[U8_MAX_LENGTH];
2748         int32_t offset = 0;
2749         UBool error = false;
2750         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2751         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2752         // FIXME: Check error.
2753         utf8Encoded.append(buffer, offset);
2754     }
2755     LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2756     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2757     if (domain.isNull())
2758         return false;
2759     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2760         syntaxViolation(hostBegin);
2761     auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2762     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2763         return false;
2764     LCharBuffer& asciiDomainValue = asciiDomain.value();
2765     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2766
2767     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2768     if (address) {
2769         serializeIPv4(address.value());
2770         m_url.m_hostEnd = currentPosition(iterator);
2771         if (iterator.atEnd()) {
2772             m_url.m_portEnd = currentPosition(iterator);
2773             return true;
2774         }
2775         return parsePort(iterator);
2776     }
2777     if (address.error() == IPv4ParsingError::Failure)
2778         return false;
2779
2780     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2781     m_url.m_hostEnd = currentPosition(iterator);
2782     if (!iterator.atEnd())
2783         return parsePort(iterator);
2784     m_url.m_portEnd = currentPosition(iterator);
2785     return true;
2786 }
2787
2788 std::optional<String> URLParser::formURLDecode(StringView input)
2789 {
2790     auto utf8 = input.utf8(StrictConversion);
2791     if (utf8.isNull())
2792         return std::nullopt;
2793     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2794     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2795 }
2796
2797 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2798 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2799 {
2800     URLEncodedForm output;
2801     for (StringView bytes : input.split('&')) {
2802         auto equalIndex = bytes.find('=');
2803         if (equalIndex == notFound) {
2804             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2805             if (name)
2806                 output.append({ name.value(), emptyString() });
2807         } else {
2808             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2809             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2810             if (name && value)
2811                 output.append({ name.value(), value.value() });
2812         }
2813     }
2814     return output;
2815 }
2816
2817 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2818 {
2819     auto utf8 = input.utf8(StrictConversion);
2820     const char* data = utf8.data();
2821     for (size_t i = 0; i < utf8.length(); ++i) {
2822         const char byte = data[i];
2823         if (byte == 0x20)
2824             output.append(0x2B);
2825         else if (byte == 0x2A
2826             || byte == 0x2D
2827             || byte == 0x2E
2828             || (byte >= 0x30 && byte <= 0x39)
2829             || (byte >= 0x41 && byte <= 0x5A)
2830             || byte == 0x5F
2831             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2832             output.append(byte);
2833         else
2834             percentEncodeByte(byte, output);
2835     }
2836 }
2837     
2838 String URLParser::serialize(const URLEncodedForm& tuples)
2839 {
2840     if (tuples.isEmpty())
2841         return { };
2842
2843     Vector<LChar> output;
2844     for (auto& tuple : tuples) {
2845         if (!output.isEmpty())
2846             output.append('&');
2847         serializeURLEncodedForm(tuple.key, output);
2848         output.append('=');
2849         serializeURLEncodedForm(tuple.value, output);
2850     }
2851     return String::adopt(WTFMove(output));
2852 }
2853
2854 const UIDNA& URLParser::internationalDomainNameTranscoder()
2855 {
2856     static UIDNA* encoder;
2857     static std::once_flag onceFlag;
2858     std::call_once(onceFlag, [] {
2859         UErrorCode error = U_ZERO_ERROR;
2860         // Warning: Please contact a WebKitGTK+ developer if changing these flags.
2861         // They should be synced with ephy_uri_decode() in ephy-uri-helpers.c.
2862         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2863         RELEASE_ASSERT(U_SUCCESS(error));
2864         RELEASE_ASSERT(encoder);
2865     });
2866     return *encoder;
2867 }
2868
2869 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2870 {
2871     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2872     // but once we get rid of URL::parse its value should be tested.
2873     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %s",
2874         a.m_isValid,
2875         a.m_protocolIsInHTTPFamily,
2876         a.m_schemeEnd,
2877         a.m_userStart,
2878         a.m_userEnd,
2879         a.m_passwordEnd,
2880         a.m_hostEnd,
2881         a.m_portEnd,
2882         a.m_pathAfterLastSlash,
2883         a.m_pathEnd,
2884         a.m_queryEnd,
2885         a.m_string.utf8().data(),
2886         b.m_isValid,
2887         b.m_protocolIsInHTTPFamily,
2888         b.m_schemeEnd,
2889         b.m_userStart,
2890         b.m_userEnd,
2891         b.m_passwordEnd,
2892         b.m_hostEnd,
2893         b.m_portEnd,
2894         b.m_pathAfterLastSlash,
2895         b.m_pathEnd,
2896         b.m_queryEnd,
2897         b.m_string.utf8().data());
2898
2899     return a.m_string == b.m_string
2900         && a.m_isValid == b.m_isValid
2901         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2902         && a.m_schemeEnd == b.m_schemeEnd
2903         && a.m_userStart == b.m_userStart
2904         && a.m_userEnd == b.m_userEnd
2905         && a.m_passwordEnd == b.m_passwordEnd
2906         && a.m_hostEnd == b.m_hostEnd
2907         && a.m_portEnd == b.m_portEnd
2908         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2909         && a.m_pathEnd == b.m_pathEnd
2910         && a.m_queryEnd == b.m_queryEnd;
2911 }
2912
2913 bool URLParser::internalValuesConsistent(const URL& url)
2914 {
2915     return url.m_schemeEnd <= url.m_userStart
2916         && url.m_userStart <= url.m_userEnd
2917         && url.m_userEnd <= url.m_passwordEnd
2918         && url.m_passwordEnd <= url.m_hostEnd
2919         && url.m_hostEnd <= url.m_portEnd
2920         && url.m_portEnd <= url.m_pathAfterLastSlash
2921         && url.m_pathAfterLastSlash <= url.m_pathEnd
2922         && url.m_pathEnd <= url.m_queryEnd
2923         && url.m_queryEnd <= url.m_string.length();
2924 }
2925
2926 } // namespace WebCore