Test that CSS subresource loading are exposed to resource timing in case of a CORS...
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016-2018 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <mutex>
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
35
36 namespace WebCore {
37
// Set to 1 to forward URL_PARSER_LOG to LOG(URLParser, ...); with 0 the macro expands to nothing.
#define URL_PARSER_DEBUGGING 0

#if !URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...)
#else
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#endif
45     
46 template<typename CharacterType>
47 class CodePointIterator {
48 public:
49     ALWAYS_INLINE CodePointIterator() { }
50     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51         : m_begin(begin)
52         , m_end(end)
53     {
54     }
55     
56     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57         : CodePointIterator(begin.m_begin, end.m_begin)
58     {
59         ASSERT(end.m_begin >= begin.m_begin);
60     }
61     
62     ALWAYS_INLINE UChar32 operator*() const;
63     ALWAYS_INLINE CodePointIterator& operator++();
64
65     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66     {
67         return m_begin == other.m_begin
68             && m_end == other.m_end;
69     }
70     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71     
72     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73     {
74         m_begin = other.m_begin;
75         m_end = other.m_end;
76         return *this;
77     }
78
79     ALWAYS_INLINE bool atEnd() const
80     {
81         ASSERT(m_begin <= m_end);
82         return m_begin >= m_end;
83     }
84     
85     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86     {
87         ASSERT(m_begin >= reference);
88         return m_begin - reference;
89     }
90
91     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92     {
93         return codeUnitsSince(other.m_begin);
94     }
95     
96 private:
97     const CharacterType* m_begin { nullptr };
98     const CharacterType* m_end { nullptr };
99 };
100
101 template<>
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
103 {
104     ASSERT(!atEnd());
105     return *m_begin;
106 }
107
108 template<>
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
110 {
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     unsigned i = 0;
128     size_t length = m_end - m_begin;
129     U16_FWD_1(m_begin, i, length);
130     m_begin += i;
131     return *this;
132 }
133     
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
135 {
136     if (U_IS_BMP(codePoint)) {
137         destination.append(static_cast<UChar>(codePoint));
138         return;
139     }
140     destination.reserveCapacity(destination.size() + 2);
141     destination.uncheckedAppend(U16_LEAD(codePoint));
142     destination.uncheckedAppend(U16_TRAIL(codePoint));
143 }
144
// Bit flags stored per code point in characterClassTable.
enum URLCharacterClass {
    UserInfo = 1 << 0, // Tested by isInUserInfoEncodeSet.
    Default = 1 << 1, // Tested by isInDefaultEncodeSet.
    ForbiddenHost = 1 << 2, // Tested by isForbiddenHostCodePoint.
    QueryPercent = 1 << 3, // Tested by shouldPercentEncodeQueryByte.
    SlashQuestionOrHash = 1 << 4, // Tested by isSlashQuestionOrHash.
    ValidScheme = 1 << 5, // Tested by isValidSchemeCharacter.
};
153
154 static const uint8_t characterClassTable[256] = {
155     UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
156     UserInfo | Default | QueryPercent, // 0x1
157     UserInfo | Default | QueryPercent, // 0x2
158     UserInfo | Default | QueryPercent, // 0x3
159     UserInfo | Default | QueryPercent, // 0x4
160     UserInfo | Default | QueryPercent, // 0x5
161     UserInfo | Default | QueryPercent, // 0x6
162     UserInfo | Default | QueryPercent, // 0x7
163     UserInfo | Default | QueryPercent, // 0x8
164     UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
165     UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
166     UserInfo | Default | QueryPercent, // 0xB
167     UserInfo | Default | QueryPercent, // 0xC
168     UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
169     UserInfo | Default | QueryPercent, // 0xE
170     UserInfo | Default | QueryPercent, // 0xF
171     UserInfo | Default | QueryPercent, // 0x10
172     UserInfo | Default | QueryPercent, // 0x11
173     UserInfo | Default | QueryPercent, // 0x12
174     UserInfo | Default | QueryPercent, // 0x13
175     UserInfo | Default | QueryPercent, // 0x14
176     UserInfo | Default | QueryPercent, // 0x15
177     UserInfo | Default | QueryPercent, // 0x16
178     UserInfo | Default | QueryPercent, // 0x17
179     UserInfo | Default | QueryPercent, // 0x18
180     UserInfo | Default | QueryPercent, // 0x19
181     UserInfo | Default | QueryPercent, // 0x1A
182     UserInfo | Default | QueryPercent, // 0x1B
183     UserInfo | Default | QueryPercent, // 0x1C
184     UserInfo | Default | QueryPercent, // 0x1D
185     UserInfo | Default | QueryPercent, // 0x1E
186     UserInfo | Default | QueryPercent, // 0x1F
187     UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
188     0, // '!'
189     UserInfo | Default | QueryPercent, // '"'
190     UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
191     0, // '$'
192     ForbiddenHost, // '%'
193     0, // '&'
194     0, // '\''
195     0, // '('
196     0, // ')'
197     0, // '*'
198     ValidScheme, // '+'
199     0, // ','
200     ValidScheme, // '-'
201     ValidScheme, // '.'
202     UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
203     ValidScheme, // '0'
204     ValidScheme, // '1'
205     ValidScheme, // '2'
206     ValidScheme, // '3'
207     ValidScheme, // '4'
208     ValidScheme, // '5'
209     ValidScheme, // '6'
210     ValidScheme, // '7'
211     ValidScheme, // '8'
212     ValidScheme, // '9'
213     UserInfo | ForbiddenHost, // ':'
214     UserInfo, // ';'
215     UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
216     UserInfo, // '='
217     UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
218     UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
219     UserInfo | ForbiddenHost, // '@'
220     ValidScheme, // 'A'
221     ValidScheme, // 'B'
222     ValidScheme, // 'C'
223     ValidScheme, // 'D'
224     ValidScheme, // 'E'
225     ValidScheme, // 'F'
226     ValidScheme, // 'G'
227     ValidScheme, // 'H'
228     ValidScheme, // 'I'
229     ValidScheme, // 'J'
230     ValidScheme, // 'K'
231     ValidScheme, // 'L'
232     ValidScheme, // 'M'
233     ValidScheme, // 'N'
234     ValidScheme, // 'O'
235     ValidScheme, // 'P'
236     ValidScheme, // 'Q'
237     ValidScheme, // 'R'
238     ValidScheme, // 'S'
239     ValidScheme, // 'T'
240     ValidScheme, // 'U'
241     ValidScheme, // 'V'
242     ValidScheme, // 'W'
243     ValidScheme, // 'X'
244     ValidScheme, // 'Y'
245     ValidScheme, // 'Z'
246     UserInfo | ForbiddenHost, // '['
247     UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
248     UserInfo | ForbiddenHost, // ']'
249     UserInfo, // '^'
250     0, // '_'
251     UserInfo | Default, // '`'
252     ValidScheme, // 'a'
253     ValidScheme, // 'b'
254     ValidScheme, // 'c'
255     ValidScheme, // 'd'
256     ValidScheme, // 'e'
257     ValidScheme, // 'f'
258     ValidScheme, // 'g'
259     ValidScheme, // 'h'
260     ValidScheme, // 'i'
261     ValidScheme, // 'j'
262     ValidScheme, // 'k'
263     ValidScheme, // 'l'
264     ValidScheme, // 'm'
265     ValidScheme, // 'n'
266     ValidScheme, // 'o'
267     ValidScheme, // 'p'
268     ValidScheme, // 'q'
269     ValidScheme, // 'r'
270     ValidScheme, // 's'
271     ValidScheme, // 't'
272     ValidScheme, // 'u'
273     ValidScheme, // 'v'
274     ValidScheme, // 'w'
275     ValidScheme, // 'x'
276     ValidScheme, // 'y'
277     ValidScheme, // 'z'
278     UserInfo | Default, // '{'
279     UserInfo, // '|'
280     UserInfo | Default, // '}'
281     0, // '~'
282     QueryPercent, // 0x7F
283     QueryPercent, // 0x80
284     QueryPercent, // 0x81
285     QueryPercent, // 0x82
286     QueryPercent, // 0x83
287     QueryPercent, // 0x84
288     QueryPercent, // 0x85
289     QueryPercent, // 0x86
290     QueryPercent, // 0x87
291     QueryPercent, // 0x88
292     QueryPercent, // 0x89
293     QueryPercent, // 0x8A
294     QueryPercent, // 0x8B
295     QueryPercent, // 0x8C
296     QueryPercent, // 0x8D
297     QueryPercent, // 0x8E
298     QueryPercent, // 0x8F
299     QueryPercent, // 0x90
300     QueryPercent, // 0x91
301     QueryPercent, // 0x92
302     QueryPercent, // 0x93
303     QueryPercent, // 0x94
304     QueryPercent, // 0x95
305     QueryPercent, // 0x96
306     QueryPercent, // 0x97
307     QueryPercent, // 0x98
308     QueryPercent, // 0x99
309     QueryPercent, // 0x9A
310     QueryPercent, // 0x9B
311     QueryPercent, // 0x9C
312     QueryPercent, // 0x9D
313     QueryPercent, // 0x9E
314     QueryPercent, // 0x9F
315     QueryPercent, // 0xA0
316     QueryPercent, // 0xA1
317     QueryPercent, // 0xA2
318     QueryPercent, // 0xA3
319     QueryPercent, // 0xA4
320     QueryPercent, // 0xA5
321     QueryPercent, // 0xA6
322     QueryPercent, // 0xA7
323     QueryPercent, // 0xA8
324     QueryPercent, // 0xA9
325     QueryPercent, // 0xAA
326     QueryPercent, // 0xAB
327     QueryPercent, // 0xAC
328     QueryPercent, // 0xAD
329     QueryPercent, // 0xAE
330     QueryPercent, // 0xAF
331     QueryPercent, // 0xB0
332     QueryPercent, // 0xB1
333     QueryPercent, // 0xB2
334     QueryPercent, // 0xB3
335     QueryPercent, // 0xB4
336     QueryPercent, // 0xB5
337     QueryPercent, // 0xB6
338     QueryPercent, // 0xB7
339     QueryPercent, // 0xB8
340     QueryPercent, // 0xB9
341     QueryPercent, // 0xBA
342     QueryPercent, // 0xBB
343     QueryPercent, // 0xBC
344     QueryPercent, // 0xBD
345     QueryPercent, // 0xBE
346     QueryPercent, // 0xBF
347     QueryPercent, // 0xC0
348     QueryPercent, // 0xC1
349     QueryPercent, // 0xC2
350     QueryPercent, // 0xC3
351     QueryPercent, // 0xC4
352     QueryPercent, // 0xC5
353     QueryPercent, // 0xC6
354     QueryPercent, // 0xC7
355     QueryPercent, // 0xC8
356     QueryPercent, // 0xC9
357     QueryPercent, // 0xCA
358     QueryPercent, // 0xCB
359     QueryPercent, // 0xCC
360     QueryPercent, // 0xCD
361     QueryPercent, // 0xCE
362     QueryPercent, // 0xCF
363     QueryPercent, // 0xD0
364     QueryPercent, // 0xD1
365     QueryPercent, // 0xD2
366     QueryPercent, // 0xD3
367     QueryPercent, // 0xD4
368     QueryPercent, // 0xD5
369     QueryPercent, // 0xD6
370     QueryPercent, // 0xD7
371     QueryPercent, // 0xD8
372     QueryPercent, // 0xD9
373     QueryPercent, // 0xDA
374     QueryPercent, // 0xDB
375     QueryPercent, // 0xDC
376     QueryPercent, // 0xDD
377     QueryPercent, // 0xDE
378     QueryPercent, // 0xDF
379     QueryPercent, // 0xE0
380     QueryPercent, // 0xE1
381     QueryPercent, // 0xE2
382     QueryPercent, // 0xE3
383     QueryPercent, // 0xE4
384     QueryPercent, // 0xE5
385     QueryPercent, // 0xE6
386     QueryPercent, // 0xE7
387     QueryPercent, // 0xE8
388     QueryPercent, // 0xE9
389     QueryPercent, // 0xEA
390     QueryPercent, // 0xEB
391     QueryPercent, // 0xEC
392     QueryPercent, // 0xED
393     QueryPercent, // 0xEE
394     QueryPercent, // 0xEF
395     QueryPercent, // 0xF0
396     QueryPercent, // 0xF1
397     QueryPercent, // 0xF2
398     QueryPercent, // 0xF3
399     QueryPercent, // 0xF4
400     QueryPercent, // 0xF5
401     QueryPercent, // 0xF6
402     QueryPercent, // 0xF7
403     QueryPercent, // 0xF8
404     QueryPercent, // 0xF9
405     QueryPercent, // 0xFA
406     QueryPercent, // 0xFB
407     QueryPercent, // 0xFC
408     QueryPercent, // 0xFD
409     QueryPercent, // 0xFE
410     QueryPercent, // 0xFF
411 };
412
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
423 ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
424 {
425     if (characterClassTable[byte] & QueryPercent)
426         return true;
427     if (byte == '\'' && urlIsSpecial)
428         return true;
429     return false;
430 }
431
432 bool URLParser::isInUserInfoEncodeSet(UChar c)
433 {
434     return WebCore::isInUserInfoEncodeSet(c);
435 }
436
437 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
438 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
439 {
440     ++iterator;
441     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
442         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
443             syntaxViolation(iteratorForSyntaxViolationPosition);
444         ++iterator;
445     }
446 }
447
448 template<typename CharacterType>
449 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
450 {
451     if (iterator.atEnd())
452         return false;
453     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
454     if (iterator.atEnd())
455         return false;
456     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
457     return iterator.atEnd();
458 }
459
460 template<typename CharacterType>
461 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
462 {
463     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
464         return false;
465     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
466     if (iterator.atEnd())
467         return false;
468     if (*iterator == ':')
469         return true;
470     if (UNLIKELY(*iterator == '|'))
471         return true;
472     return false;
473 }
474
475 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
476 {
477     ASSERT(isASCII(codePoint));
478     if (UNLIKELY(m_didSeeSyntaxViolation))
479         m_asciiBuffer.append(codePoint);
480 }
481
482 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
483 {
484     if (UNLIKELY(m_didSeeSyntaxViolation))
485         m_asciiBuffer.append(characters, length);
486 }
487
488 template<typename CharacterType>
489 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
490 {
491     ASSERT(isWindowsDriveLetter(iterator));
492     appendToASCIIBuffer(*iterator);
493     advance(iterator);
494     ASSERT(!iterator.atEnd());
495     ASSERT(*iterator == ':' || *iterator == '|');
496     if (*iterator == '|')
497         syntaxViolation(iterator);
498     appendToASCIIBuffer(':');
499     advance(iterator);
500 }
501
502 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
503 {
504     if (base.protocolIs("file")) {
505         RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
506         if (base.m_string.is8Bit()) {
507             const LChar* begin = base.m_string.characters8();
508             CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
509             if (isWindowsDriveLetter(c)) {
510                 appendWindowsDriveLetter(c);
511                 return true;
512             }
513         } else {
514             const UChar* begin = base.m_string.characters16();
515             CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
516             if (isWindowsDriveLetter(c)) {
517                 appendWindowsDriveLetter(c);
518                 return true;
519             }
520         }
521     }
522     return false;
523 }
524
525 template<typename CharacterType>
526 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
527 {
528     if (!isWindowsDriveLetter(iterator))
529         return true;
530     if (iterator.atEnd())
531         return false;
532     advance(iterator);
533     if (iterator.atEnd())
534         return true;
535     advance(iterator);
536     if (iterator.atEnd())
537         return true;
538     return !isSlashQuestionOrHash(*iterator);
539 }
540
541 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
542 {
543     buffer.append('%');
544     buffer.append(upperNibbleToASCIIHexDigit(byte));
545     buffer.append(lowerNibbleToASCIIHexDigit(byte));
546 }
547
548 void URLParser::percentEncodeByte(uint8_t byte)
549 {
550     ASSERT(m_didSeeSyntaxViolation);
551     appendToASCIIBuffer('%');
552     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
553     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
554 }
555
// Percent-encoded form appended in place of code points that fail U_IS_UNICODE_CHAR.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string above, not counting its terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
558
559 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
560 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
561 {
562     ASSERT(!iterator.atEnd());
563     UChar32 codePoint = *iterator;
564     if (LIKELY(isASCII(codePoint))) {
565         if (UNLIKELY(isInCodeSet(codePoint))) {
566             syntaxViolation(iterator);
567             percentEncodeByte(codePoint);
568         } else
569             appendToASCIIBuffer(codePoint);
570         return;
571     }
572     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
573     syntaxViolation(iterator);
574     
575     if (!U_IS_UNICODE_CHAR(codePoint)) {
576         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
577         return;
578     }
579     
580     uint8_t buffer[U8_MAX_LENGTH];
581     int32_t offset = 0;
582     U8_APPEND_UNSAFE(buffer, offset, codePoint);
583     for (int32_t i = 0; i < offset; ++i)
584         percentEncodeByte(buffer[i]);
585 }
586
587 template<typename CharacterType>
588 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
589 {
590     ASSERT(!iterator.atEnd());
591     UChar32 codePoint = *iterator;
592     if (LIKELY(isASCII(codePoint))) {
593         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
594             syntaxViolation(iterator);
595             percentEncodeByte(codePoint);
596         } else
597             appendToASCIIBuffer(codePoint);
598         return;
599     }
600     
601     syntaxViolation(iterator);
602     
603     if (!U_IS_UNICODE_CHAR(codePoint)) {
604         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
605         return;
606     }
607
608     uint8_t buffer[U8_MAX_LENGTH];
609     int32_t offset = 0;
610     U8_APPEND_UNSAFE(buffer, offset, codePoint);
611     for (int32_t i = 0; i < offset; ++i) {
612         auto byte = buffer[i];
613         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
614             percentEncodeByte(byte);
615         else
616             appendToASCIIBuffer(byte);
617     }
618 }
619
620 template<typename CharacterType>
621 void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator)
622 {
623     auto encoded = encoding.encodeForURLParsing(StringView(source.data(), source.size()));
624     auto* data = encoded.data();
625     size_t length = encoded.size();
626     
627     if (!length == !iterator.atEnd()) {
628         syntaxViolation(iterator);
629         return;
630     }
631     
632     size_t i = 0;
633     for (; i < length; ++i) {
634         ASSERT(!iterator.atEnd());
635         uint8_t byte = data[i];
636         if (UNLIKELY(byte != *iterator)) {
637             syntaxViolation(iterator);
638             break;
639         }
640         if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
641             syntaxViolation(iterator);
642             break;
643         }
644         appendToASCIIBuffer(byte);
645         ++iterator;
646     }
647     while (!iterator.atEnd() && isTabOrNewline(*iterator))
648         ++iterator;
649     ASSERT((i == length) == iterator.atEnd());
650     for (; i < length; ++i) {
651         ASSERT(m_didSeeSyntaxViolation);
652         uint8_t byte = data[i];
653         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
654             percentEncodeByte(byte);
655         else
656             appendToASCIIBuffer(byte);
657     }
658 }
659
660 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
661 {
662     static const uint16_t ftpPort = 21;
663     static const uint16_t gopherPort = 70;
664     static const uint16_t httpPort = 80;
665     static const uint16_t httpsPort = 443;
666     static const uint16_t wsPort = 80;
667     static const uint16_t wssPort = 443;
668     
669     auto length = scheme.length();
670     if (!length)
671         return std::nullopt;
672     switch (scheme[0]) {
673     case 'w':
674         switch (length) {
675         case 2:
676             if (scheme[1] == 's')
677                 return wsPort;
678             return std::nullopt;
679         case 3:
680             if (scheme[1] == 's'
681                 && scheme[2] == 's')
682                 return wssPort;
683             return std::nullopt;
684         default:
685             return false;
686         }
687     case 'h':
688         switch (length) {
689         case 4:
690             if (scheme[1] == 't'
691                 && scheme[2] == 't'
692                 && scheme[3] == 'p')
693                 return httpPort;
694             return std::nullopt;
695         case 5:
696             if (scheme[1] == 't'
697                 && scheme[2] == 't'
698                 && scheme[3] == 'p'
699                 && scheme[4] == 's')
700                 return httpsPort;
701             return std::nullopt;
702         default:
703             return std::nullopt;
704         }
705     case 'g':
706         if (length == 6
707             && scheme[1] == 'o'
708             && scheme[2] == 'p'
709             && scheme[3] == 'h'
710             && scheme[4] == 'e'
711             && scheme[5] == 'r')
712             return gopherPort;
713         return std::nullopt;
714     case 'f':
715         if (length == 3
716             && scheme[1] == 't'
717             && scheme[2] == 'p')
718             return ftpPort;
719         return std::nullopt;
720     default:
721         return std::nullopt;
722     }
723 }
724
// Result of classifying a scheme string with scheme(): one enumerator per recognized
// scheme, and NonSpecial for everything else.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    Gopher, // "gopher"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial,
};
735
736 ALWAYS_INLINE static Scheme scheme(StringView scheme)
737 {
738     auto length = scheme.length();
739     if (!length)
740         return Scheme::NonSpecial;
741     switch (scheme[0]) {
742     case 'f':
743         switch (length) {
744         case 3:
745             if (scheme[1] == 't'
746                 && scheme[2] == 'p')
747                 return Scheme::FTP;
748             return Scheme::NonSpecial;
749         case 4:
750             if (scheme[1] == 'i'
751                 && scheme[2] == 'l'
752                 && scheme[3] == 'e')
753                 return Scheme::File;
754             return Scheme::NonSpecial;
755         default:
756             return Scheme::NonSpecial;
757         }
758     case 'g':
759         if (length == 6
760             && scheme[1] == 'o'
761             && scheme[2] == 'p'
762             && scheme[3] == 'h'
763             && scheme[4] == 'e'
764             && scheme[5] == 'r')
765             return Scheme::Gopher;
766         return Scheme::NonSpecial;
767     case 'h':
768         switch (length) {
769         case 4:
770             if (scheme[1] == 't'
771                 && scheme[2] == 't'
772                 && scheme[3] == 'p')
773                 return Scheme::HTTP;
774             return Scheme::NonSpecial;
775         case 5:
776             if (scheme[1] == 't'
777                 && scheme[2] == 't'
778                 && scheme[3] == 'p'
779                 && scheme[4] == 's')
780                 return Scheme::HTTPS;
781             return Scheme::NonSpecial;
782         default:
783             return Scheme::NonSpecial;
784         }
785     case 'w':
786         switch (length) {
787         case 2:
788             if (scheme[1] == 's')
789                 return Scheme::WS;
790             return Scheme::NonSpecial;
791         case 3:
792             if (scheme[1] == 's'
793                 && scheme[2] == 's')
794                 return Scheme::WSS;
795             return Scheme::NonSpecial;
796         default:
797             return Scheme::NonSpecial;
798         }
799     default:
800         return Scheme::NonSpecial;
801     }
802 }
803
804 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
805 {
806     if (scheme.isEmpty())
807         return std::nullopt;
808
809     if (!isASCIIAlpha(scheme[0]))
810         return std::nullopt;
811
812     for (size_t i = 1; i < scheme.length(); ++i) {
813         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
814             continue;
815         return std::nullopt;
816     }
817
818     return scheme.convertToASCIILowercase();
819 }
820
821 bool URLParser::isSpecialScheme(const String& schemeArg)
822 {
823     return scheme(schemeArg) != Scheme::NonSpecial;
824 }
825
826 enum class URLParser::URLPart {
827     SchemeEnd,
828     UserStart,
829     UserEnd,
830     PasswordEnd,
831     HostEnd,
832     PortEnd,
833     PathAfterLastSlash,
834     PathEnd,
835     QueryEnd,
836 };
837
838 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
839 {
840     switch (part) {
841     case URLPart::QueryEnd:
842         return url.m_queryEnd;
843     case URLPart::PathEnd:
844         return url.m_pathEnd;
845     case URLPart::PathAfterLastSlash:
846         return url.m_pathAfterLastSlash;
847     case URLPart::PortEnd:
848         return url.m_hostEnd + url.m_portLength;
849     case URLPart::HostEnd:
850         return url.m_hostEnd;
851     case URLPart::PasswordEnd:
852         return url.m_passwordEnd;
853     case URLPart::UserEnd:
854         return url.m_userEnd;
855     case URLPart::UserStart:
856         return url.m_userStart;
857     case URLPart::SchemeEnd:
858         return url.m_schemeEnd;
859     }
860     ASSERT_NOT_REACHED();
861     return 0;
862 }
863
864 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
865 {
866     RELEASE_ASSERT(length <= string.length());
867     if (string.isNull())
868         return;
869     ASSERT(m_asciiBuffer.isEmpty());
870     if (string.is8Bit())
871         appendToASCIIBuffer(string.characters8(), length);
872     else {
873         const UChar* characters = string.characters16();
874         for (size_t i = 0; i < length; ++i) {
875             UChar c = characters[i];
876             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
877             appendToASCIIBuffer(c);
878         }
879     }
880 }
881
882 template<typename CharacterType>
883 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding)
884 {
885     syntaxViolation(iterator);
886
887     m_asciiBuffer.clear();
888     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
889     switch (part) {
890     case URLPart::QueryEnd:
891         m_url.m_queryEnd = base.m_queryEnd;
892         FALLTHROUGH;
893     case URLPart::PathEnd:
894         m_url.m_pathEnd = base.m_pathEnd;
895         FALLTHROUGH;
896     case URLPart::PathAfterLastSlash:
897         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
898         FALLTHROUGH;
899     case URLPart::PortEnd:
900         m_url.m_portLength = base.m_portLength;
901         FALLTHROUGH;
902     case URLPart::HostEnd:
903         m_url.m_hostEnd = base.m_hostEnd;
904         FALLTHROUGH;
905     case URLPart::PasswordEnd:
906         m_url.m_passwordEnd = base.m_passwordEnd;
907         FALLTHROUGH;
908     case URLPart::UserEnd:
909         m_url.m_userEnd = base.m_userEnd;
910         FALLTHROUGH;
911     case URLPart::UserStart:
912         m_url.m_userStart = base.m_userStart;
913         FALLTHROUGH;
914     case URLPart::SchemeEnd:
915         m_url.m_isValid = base.m_isValid;
916         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
917         m_url.m_schemeEnd = base.m_schemeEnd;
918     }
919     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
920     case Scheme::WS:
921     case Scheme::WSS:
922         nonUTF8QueryEncoding = nullptr;
923         m_urlIsSpecial = true;
924         return;
925     case Scheme::File:
926         m_urlIsFile = true;
927         FALLTHROUGH;
928     case Scheme::FTP:
929     case Scheme::Gopher:
930     case Scheme::HTTP:
931     case Scheme::HTTPS:
932         m_urlIsSpecial = true;
933         return;
934     case Scheme::NonSpecial:
935         m_urlIsSpecial = false;
936         nonUTF8QueryEncoding = nullptr;
937         return;
938     }
939     ASSERT_NOT_REACHED();
940 }
941
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e").
// The second digit is compared case-insensitively by the callers below.
static const char dotASCIICode[2] = { '2', 'e' };
943
944 template<typename CharacterType>
945 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
946 {
947     if (c.atEnd())
948         return false;
949     if (*c == '.') {
950         advance<CharacterType, ReportSyntaxViolation::No>(c);
951         return c.atEnd() || isSlashQuestionOrHash(*c);
952     }
953     if (*c != '%')
954         return false;
955     advance<CharacterType, ReportSyntaxViolation::No>(c);
956     if (c.atEnd() || *c != dotASCIICode[0])
957         return false;
958     advance<CharacterType, ReportSyntaxViolation::No>(c);
959     if (c.atEnd())
960         return false;
961     if (toASCIILower(*c) == dotASCIICode[1]) {
962         advance<CharacterType, ReportSyntaxViolation::No>(c);
963         return c.atEnd() || isSlashQuestionOrHash(*c);
964     }
965     return false;
966 }
967
968 template<typename CharacterType>
969 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
970 {
971     if (c.atEnd())
972         return false;
973     if (*c == '.') {
974         advance<CharacterType, ReportSyntaxViolation::No>(c);
975         return isSingleDotPathSegment(c);
976     }
977     if (*c != '%')
978         return false;
979     advance<CharacterType, ReportSyntaxViolation::No>(c);
980     if (c.atEnd() || *c != dotASCIICode[0])
981         return false;
982     advance<CharacterType, ReportSyntaxViolation::No>(c);
983     if (c.atEnd())
984         return false;
985     if (toASCIILower(*c) == dotASCIICode[1]) {
986         advance<CharacterType, ReportSyntaxViolation::No>(c);
987         return isSingleDotPathSegment(c);
988     }
989     return false;
990 }
991
992 template<typename CharacterType>
993 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
994 {
995     ASSERT(isSingleDotPathSegment(c));
996     if (*c == '.') {
997         advance(c);
998         if (!c.atEnd()) {
999             if (*c == '/' || *c == '\\')
1000                 advance(c);
1001             else
1002                 ASSERT(*c == '?' || *c == '#');
1003         }
1004     } else {
1005         ASSERT(*c == '%');
1006         advance(c);
1007         ASSERT(*c == dotASCIICode[0]);
1008         advance(c);
1009         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1010         advance(c);
1011         if (!c.atEnd()) {
1012             if (*c == '/' || *c == '\\')
1013                 advance(c);
1014             else
1015                 ASSERT(*c == '?' || *c == '#');
1016         }
1017     }
1018 }
1019
1020 template<typename CharacterType>
1021 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1022 {
1023     ASSERT(isDoubleDotPathSegment(c));
1024     if (*c == '.')
1025         advance(c);
1026     else {
1027         ASSERT(*c == '%');
1028         advance(c);
1029         ASSERT(*c == dotASCIICode[0]);
1030         advance(c);
1031         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1032         advance(c);
1033     }
1034     consumeSingleDotPathSegment(c);
1035 }
1036
1037 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1038 {
1039     ASSERT(m_didSeeSyntaxViolation);
1040     if (!m_urlIsFile)
1041         return true;
1042
1043     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1044     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1045     if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + 1 && isWindowsDriveLetter(componentToPop))
1046         return false;
1047     return true;
1048 }
1049
1050 void URLParser::popPath()
1051 {
1052     ASSERT(m_didSeeSyntaxViolation);
1053     if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + 1) {
1054         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1055         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1056             newPathAfterLastSlash--;
1057         while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer[newPathAfterLastSlash] != '/')
1058             newPathAfterLastSlash--;
1059         newPathAfterLastSlash++;
1060         if (shouldPopPath(newPathAfterLastSlash))
1061             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1062     }
1063     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1064 }
1065
1066 template<typename CharacterType>
1067 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1068 {
1069     if (m_didSeeSyntaxViolation)
1070         return;
1071     m_didSeeSyntaxViolation = true;
1072     
1073     ASSERT(m_asciiBuffer.isEmpty());
1074     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1075     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1076     m_asciiBuffer.reserveCapacity(m_inputString.length());
1077     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1078         ASSERT(isASCII(m_inputString[i]));
1079         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1080     }
1081 }
1082
1083 void URLParser::failure()
1084 {
1085     m_url.invalidate();
1086     m_url.m_string = m_inputString;
1087 }
1088
1089 template<typename CharacterType>
1090 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1091 {
1092     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1093         return false;
1094     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1095     return true;
1096 }
1097
1098 template<typename CharacterType>
1099 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1100 {
1101     if (!checkLocalhostCodePoint(iterator, 'l'))
1102         return false;
1103     if (!checkLocalhostCodePoint(iterator, 'o'))
1104         return false;
1105     if (!checkLocalhostCodePoint(iterator, 'c'))
1106         return false;
1107     if (!checkLocalhostCodePoint(iterator, 'a'))
1108         return false;
1109     if (!checkLocalhostCodePoint(iterator, 'l'))
1110         return false;
1111     if (!checkLocalhostCodePoint(iterator, 'h'))
1112         return false;
1113     if (!checkLocalhostCodePoint(iterator, 'o'))
1114         return false;
1115     if (!checkLocalhostCodePoint(iterator, 's'))
1116         return false;
1117     if (!checkLocalhostCodePoint(iterator, 't'))
1118         return false;
1119     return iterator.atEnd();
1120 }
1121
1122 bool URLParser::isLocalhost(StringView view)
1123 {
1124     if (view.is8Bit())
1125         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1126     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1127 }
1128
1129 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1130 {
1131     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1132         ASSERT(start + length <= m_asciiBuffer.size());
1133         return StringView(m_asciiBuffer.data() + start, length);
1134     }
1135     ASSERT(start + length <= m_inputString.length());
1136     return StringView(m_inputString).substring(start, length);
1137 }
1138
1139 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1140 {
1141     if (UNLIKELY(m_didSeeSyntaxViolation))
1142         return m_asciiBuffer[position];
1143     return m_inputString[position];
1144 }
1145
1146 template<typename CharacterType>
1147 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1148 {
1149     if (UNLIKELY(m_didSeeSyntaxViolation))
1150         return m_asciiBuffer.size();
1151     
1152     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1153 }
1154
1155 URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1156     : m_inputString(input)
1157 {
1158     if (input.isNull()) {
1159         if (base.isValid() && !base.m_cannotBeABaseURL) {
1160             m_url = base;
1161             m_url.removeFragmentIdentifier();
1162         }
1163         return;
1164     }
1165
1166     if (input.is8Bit()) {
1167         m_inputBegin = input.characters8();
1168         parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding);
1169     } else {
1170         m_inputBegin = input.characters16();
1171         parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding);
1172     }
1173
1174     ASSERT(!m_url.m_isValid
1175         || m_didSeeSyntaxViolation == (m_url.string() != input)
1176         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1177             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1178     ASSERT(internalValuesConsistent(m_url));
1179 #if !ASSERT_DISABLED
1180     if (!m_didSeeSyntaxViolation) {
1181         // Force a syntax violation at the beginning to make sure we get the same result.
1182         URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding);
1183         URL parsed = parser.result();
1184         if (parsed.isValid())
1185             ASSERT(allValuesEqual(parser.result(), m_url));
1186     }
1187 #endif
1188 }
1189
1190 template<typename CharacterType>
1191 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1192 {
1193     URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1194     m_url = { };
1195     ASSERT(m_asciiBuffer.isEmpty());
1196
1197     Vector<UChar> queryBuffer;
1198
1199     unsigned endIndex = length;
1200     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1201         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1202         endIndex--;
1203     }
1204     CodePointIterator<CharacterType> c(input, input + endIndex);
1205     CodePointIterator<CharacterType> authorityOrHostBegin;
1206     CodePointIterator<CharacterType> queryBegin;
1207     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1208         syntaxViolation(c);
1209         ++c;
1210     }
1211     auto beginAfterControlAndSpace = c;
1212
1213     enum class State : uint8_t {
1214         SchemeStart,
1215         Scheme,
1216         NoScheme,
1217         SpecialRelativeOrAuthority,
1218         PathOrAuthority,
1219         Relative,
1220         RelativeSlash,
1221         SpecialAuthoritySlashes,
1222         SpecialAuthorityIgnoreSlashes,
1223         AuthorityOrHost,
1224         Host,
1225         File,
1226         FileSlash,
1227         FileHost,
1228         PathStart,
1229         Path,
1230         CannotBeABaseURLPath,
1231         UTF8Query,
1232         NonUTF8Query,
1233         Fragment,
1234     };
1235
1236 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1237 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1238
1239     State state = State::SchemeStart;
1240     while (!c.atEnd()) {
1241         if (UNLIKELY(isTabOrNewline(*c))) {
1242             syntaxViolation(c);
1243             ++c;
1244             continue;
1245         }
1246
1247         switch (state) {
1248         case State::SchemeStart:
1249             LOG_STATE("SchemeStart");
1250             if (isASCIIAlpha(*c)) {
1251                 if (UNLIKELY(isASCIIUpper(*c)))
1252                     syntaxViolation(c);
1253                 appendToASCIIBuffer(toASCIILower(*c));
1254                 advance(c);
1255                 if (c.atEnd()) {
1256                     m_asciiBuffer.clear();
1257                     state = State::NoScheme;
1258                     c = beginAfterControlAndSpace;
1259                     break;
1260                 }
1261                 state = State::Scheme;
1262             } else
1263                 state = State::NoScheme;
1264             break;
1265         case State::Scheme:
1266             LOG_STATE("Scheme");
1267             if (isValidSchemeCharacter(*c)) {
1268                 if (UNLIKELY(isASCIIUpper(*c)))
1269                     syntaxViolation(c);
1270                 appendToASCIIBuffer(toASCIILower(*c));
1271             } else if (*c == ':') {
1272                 unsigned schemeEnd = currentPosition(c);
1273                 if (schemeEnd > URL::maxSchemeLength) {
1274                     failure();
1275                     return;
1276                 }
1277                 m_url.m_schemeEnd = schemeEnd;
1278                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1279                 appendToASCIIBuffer(':');
1280                 switch (scheme(urlScheme)) {
1281                 case Scheme::File:
1282                     m_urlIsSpecial = true;
1283                     m_urlIsFile = true;
1284                     state = State::File;
1285                     ++c;
1286                     break;
1287                 case Scheme::WS:
1288                 case Scheme::WSS:
1289                     nonUTF8QueryEncoding = nullptr;
1290                     m_urlIsSpecial = true;
1291                     if (base.protocolIs(urlScheme))
1292                         state = State::SpecialRelativeOrAuthority;
1293                     else
1294                         state = State::SpecialAuthoritySlashes;
1295                     ++c;
1296                     break;
1297                 case Scheme::HTTP:
1298                 case Scheme::HTTPS:
1299                     m_url.m_protocolIsInHTTPFamily = true;
1300                     FALLTHROUGH;
1301                 case Scheme::FTP:
1302                 case Scheme::Gopher:
1303                     m_urlIsSpecial = true;
1304                     if (base.protocolIs(urlScheme))
1305                         state = State::SpecialRelativeOrAuthority;
1306                     else
1307                         state = State::SpecialAuthoritySlashes;
1308                     ++c;
1309                     break;
1310                 case Scheme::NonSpecial:
1311                     nonUTF8QueryEncoding = nullptr;
1312                     auto maybeSlash = c;
1313                     advance(maybeSlash);
1314                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1315                         appendToASCIIBuffer('/');
1316                         c = maybeSlash;
1317                         state = State::PathOrAuthority;
1318                         ASSERT(*c == '/');
1319                         ++c;
1320                         m_url.m_userStart = currentPosition(c);
1321                     } else {
1322                         ++c;
1323                         m_url.m_userStart = currentPosition(c);
1324                         m_url.m_userEnd = m_url.m_userStart;
1325                         m_url.m_passwordEnd = m_url.m_userStart;
1326                         m_url.m_hostEnd = m_url.m_userStart;
1327                         m_url.m_portLength = 0;
1328                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1329                         m_url.m_cannotBeABaseURL = true;
1330                         state = State::CannotBeABaseURLPath;
1331                     }
1332                     break;
1333                 }
1334                 break;
1335             } else {
1336                 m_asciiBuffer.clear();
1337                 state = State::NoScheme;
1338                 c = beginAfterControlAndSpace;
1339                 break;
1340             }
1341             advance(c);
1342             if (c.atEnd()) {
1343                 m_asciiBuffer.clear();
1344                 state = State::NoScheme;
1345                 c = beginAfterControlAndSpace;
1346             }
1347             break;
1348         case State::NoScheme:
1349             LOG_STATE("NoScheme");
1350             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1351                 failure();
1352                 return;
1353             }
1354             if (base.m_cannotBeABaseURL && *c == '#') {
1355                 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1356                 state = State::Fragment;
1357                 appendToASCIIBuffer('#');
1358                 ++c;
1359                 break;
1360             }
1361             if (!base.protocolIs("file")) {
1362                 state = State::Relative;
1363                 break;
1364             }
1365             copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1366             appendToASCIIBuffer(':');
1367             state = State::File;
1368             break;
1369         case State::SpecialRelativeOrAuthority:
1370             LOG_STATE("SpecialRelativeOrAuthority");
1371             if (*c == '/') {
1372                 appendToASCIIBuffer('/');
1373                 advance(c);
1374                 if (c.atEnd()) {
1375                     failure();
1376                     return;
1377                 }
1378                 if (*c == '/') {
1379                     appendToASCIIBuffer('/');
1380                     state = State::SpecialAuthorityIgnoreSlashes;
1381                     ++c;
1382                 } else
1383                     state = State::RelativeSlash;
1384             } else
1385                 state = State::Relative;
1386             break;
1387         case State::PathOrAuthority:
1388             LOG_STATE("PathOrAuthority");
1389             if (*c == '/') {
1390                 appendToASCIIBuffer('/');
1391                 state = State::AuthorityOrHost;
1392                 advance(c);
1393                 m_url.m_userStart = currentPosition(c);
1394                 authorityOrHostBegin = c;
1395             } else {
1396                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1397                 m_url.m_userStart = currentPosition(c) - 1;
1398                 m_url.m_userEnd = m_url.m_userStart;
1399                 m_url.m_passwordEnd = m_url.m_userStart;
1400                 m_url.m_hostEnd = m_url.m_userStart;
1401                 m_url.m_portLength = 0;
1402                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1403                 state = State::Path;
1404             }
1405             break;
1406         case State::Relative:
1407             LOG_STATE("Relative");
1408             switch (*c) {
1409             case '/':
1410             case '\\':
1411                 state = State::RelativeSlash;
1412                 ++c;
1413                 break;
1414             case '?':
1415                 copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1416                 appendToASCIIBuffer('?');
1417                 ++c;
1418                 if (nonUTF8QueryEncoding) {
1419                     queryBegin = c;
1420                     state = State::NonUTF8Query;
1421                 } else
1422                     state = State::UTF8Query;
1423                 break;
1424             case '#':
1425                 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1426                 appendToASCIIBuffer('#');
1427                 state = State::Fragment;
1428                 ++c;
1429                 break;
1430             default:
1431                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1432                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1433                     appendToASCIIBuffer('/');
1434                     m_url.m_pathAfterLastSlash = currentPosition(c);
1435                 }
1436                 state = State::Path;
1437                 break;
1438             }
1439             break;
1440         case State::RelativeSlash:
1441             LOG_STATE("RelativeSlash");
1442             if (*c == '/' || *c == '\\') {
1443                 ++c;
1444                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1445                 appendToASCIIBuffer("://", 3);
1446                 if (m_urlIsSpecial)
1447                     state = State::SpecialAuthorityIgnoreSlashes;
1448                 else {
1449                     m_url.m_userStart = currentPosition(c);
1450                     state = State::AuthorityOrHost;
1451                     authorityOrHostBegin = c;
1452                 }
1453             } else {
1454                 copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1455                 appendToASCIIBuffer('/');
1456                 m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + 1;
1457                 state = State::Path;
1458             }
1459             break;
1460         case State::SpecialAuthoritySlashes:
1461             LOG_STATE("SpecialAuthoritySlashes");
1462             if (LIKELY(*c == '/' || *c == '\\')) {
1463                 if (UNLIKELY(*c == '\\'))
1464                     syntaxViolation(c);
1465                 appendToASCIIBuffer('/');
1466                 advance(c);
1467                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1468                     if (UNLIKELY(*c == '\\'))
1469                         syntaxViolation(c);
1470                     ++c;
1471                     appendToASCIIBuffer('/');
1472                 } else {
1473                     syntaxViolation(c);
1474                     appendToASCIIBuffer('/');
1475                 }
1476             } else {
1477                 syntaxViolation(c);
1478                 appendToASCIIBuffer("//", 2);
1479             }
1480             state = State::SpecialAuthorityIgnoreSlashes;
1481             break;
1482         case State::SpecialAuthorityIgnoreSlashes:
1483             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1484             if (*c == '/' || *c == '\\') {
1485                 syntaxViolation(c);
1486                 ++c;
1487             } else {
1488                 m_url.m_userStart = currentPosition(c);
1489                 state = State::AuthorityOrHost;
1490                 authorityOrHostBegin = c;
1491             }
1492             break;
1493         case State::AuthorityOrHost:
1494             do {
1495                 LOG_STATE("AuthorityOrHost");
1496                 if (*c == '@') {
1497                     auto lastAt = c;
1498                     auto findLastAt = c;
1499                     while (!findLastAt.atEnd()) {
1500                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1501                         if (*findLastAt == '@')
1502                             lastAt = findLastAt;
1503                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1504                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1505                             break;
1506                         ++findLastAt;
1507                     }
1508                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1509                     c = lastAt;
1510                     advance(c);
1511                     authorityOrHostBegin = c;
1512                     state = State::Host;
1513                     m_hostHasPercentOrNonASCII = false;
1514                     break;
1515                 }
1516                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1517                 if (isSlash || *c == '?' || *c == '#') {
1518                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1519                     if (iterator.atEnd()) {
1520                         if (m_urlIsSpecial)
1521                             return failure();
1522                         m_url.m_userEnd = currentPosition(c);
1523                         m_url.m_passwordEnd = m_url.m_userEnd;
1524                         m_url.m_hostEnd = m_url.m_userEnd;
1525                         m_url.m_portLength = 0;
1526                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1527                     } else {
1528                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1529                         m_url.m_passwordEnd = m_url.m_userEnd;
1530                         if (!parseHostAndPort(iterator)) {
1531                             failure();
1532                             return;
1533                         }
1534                         if (UNLIKELY(!isSlash)) {
1535                             if (m_urlIsSpecial) {
1536                                 syntaxViolation(c);
1537                                 appendToASCIIBuffer('/');
1538                             }
1539                             m_url.m_pathAfterLastSlash = currentPosition(c);
1540                         }
1541                     }
1542                     state = State::Path;
1543                     break;
1544                 }
1545                 if (isPercentOrNonASCII(*c))
1546                     m_hostHasPercentOrNonASCII = true;
1547                 ++c;
1548             } while (!c.atEnd());
1549             break;
1550         case State::Host:
1551             do {
1552                 LOG_STATE("Host");
1553                 if (*c == '/' || *c == '?' || *c == '#') {
1554                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1555                         failure();
1556                         return;
1557                     }
1558                     if (*c == '?' || *c == '#') {
1559                         syntaxViolation(c);
1560                         appendToASCIIBuffer('/');
1561                         m_url.m_pathAfterLastSlash = currentPosition(c);
1562                     }
1563                     state = State::Path;
1564                     break;
1565                 }
1566                 if (isPercentOrNonASCII(*c))
1567                     m_hostHasPercentOrNonASCII = true;
1568                 ++c;
1569             } while (!c.atEnd());
1570             break;
1571         case State::File:
1572             LOG_STATE("File");
1573             switch (*c) {
1574             case '\\':
1575                 syntaxViolation(c);
1576                 FALLTHROUGH;
1577             case '/':
1578                 appendToASCIIBuffer('/');
1579                 state = State::FileSlash;
1580                 ++c;
1581                 break;
1582             case '?':
1583                 syntaxViolation(c);
1584                 if (base.isValid() && base.protocolIs("file")) {
1585                     copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1586                     appendToASCIIBuffer('?');
1587                     ++c;
1588                 } else {
1589                     appendToASCIIBuffer("///?", 4);
1590                     ++c;
1591                     m_url.m_userStart = currentPosition(c) - 2;
1592                     m_url.m_userEnd = m_url.m_userStart;
1593                     m_url.m_passwordEnd = m_url.m_userStart;
1594                     m_url.m_hostEnd = m_url.m_userStart;
1595                     m_url.m_portLength = 0;
1596                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1597                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1598                 }
1599                 if (nonUTF8QueryEncoding) {
1600                     queryBegin = c;
1601                     state = State::NonUTF8Query;
1602                 } else
1603                     state = State::UTF8Query;
1604                 break;
1605             case '#':
1606                 syntaxViolation(c);
1607                 if (base.isValid() && base.protocolIs("file")) {
1608                     copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1609                     appendToASCIIBuffer('#');
1610                 } else {
1611                     appendToASCIIBuffer("///#", 4);
1612                     m_url.m_userStart = currentPosition(c) - 2;
1613                     m_url.m_userEnd = m_url.m_userStart;
1614                     m_url.m_passwordEnd = m_url.m_userStart;
1615                     m_url.m_hostEnd = m_url.m_userStart;
1616                     m_url.m_portLength = 0;
1617                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1618                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1619                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1620                 }
1621                 state = State::Fragment;
1622                 ++c;
1623                 break;
1624             default:
1625                 syntaxViolation(c);
1626                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1627                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1628                 else {
1629                     appendToASCIIBuffer("///", 3);
1630                     m_url.m_userStart = currentPosition(c) - 1;
1631                     m_url.m_userEnd = m_url.m_userStart;
1632                     m_url.m_passwordEnd = m_url.m_userStart;
1633                     m_url.m_hostEnd = m_url.m_userStart;
1634                     m_url.m_portLength = 0;
1635                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1636                     if (isWindowsDriveLetter(c))
1637                         appendWindowsDriveLetter(c);
1638                 }
1639                 state = State::Path;
1640                 break;
1641             }
1642             break;
1643         case State::FileSlash:
1644             LOG_STATE("FileSlash");
1645             if (LIKELY(*c == '/' || *c == '\\')) {
1646                 if (UNLIKELY(*c == '\\'))
1647                     syntaxViolation(c);
1648                 appendToASCIIBuffer('/');
1649                 advance(c);
1650                 m_url.m_userStart = currentPosition(c);
1651                 m_url.m_userEnd = m_url.m_userStart;
1652                 m_url.m_passwordEnd = m_url.m_userStart;
1653                 m_url.m_hostEnd = m_url.m_userStart;
1654                 m_url.m_portLength = 0;
1655                 authorityOrHostBegin = c;
1656                 state = State::FileHost;
1657                 break;
1658             }
1659             syntaxViolation(c);
1660             appendToASCIIBuffer("//", 2);
1661             m_url.m_userStart = currentPosition(c) - 1;
1662             m_url.m_userEnd = m_url.m_userStart;
1663             m_url.m_passwordEnd = m_url.m_userStart;
1664             m_url.m_hostEnd = m_url.m_userStart;
1665             m_url.m_portLength = 0;
1666             if (isWindowsDriveLetter(c)) {
1667                 appendWindowsDriveLetter(c);
1668                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1669             } else if (copyBaseWindowsDriveLetter(base)) {
1670                 appendToASCIIBuffer('/');
1671                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1672             } else
1673                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1674             state = State::Path;
1675             break;
1676         case State::FileHost:
1677             do {
1678                 LOG_STATE("FileHost");
1679                 if (isSlashQuestionOrHash(*c)) {
1680                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1681                         && isWindowsDriveLetter(authorityOrHostBegin);
1682                     if (windowsQuirk) {
1683                         syntaxViolation(authorityOrHostBegin);
1684                         appendToASCIIBuffer('/');
1685                         appendWindowsDriveLetter(authorityOrHostBegin);
1686                     }
1687                     if (windowsQuirk || authorityOrHostBegin == c) {
1688                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1689                         if (UNLIKELY(*c == '?')) {
1690                             syntaxViolation(c);
1691                             appendToASCIIBuffer("/?", 2);
1692                             ++c;
1693                             if (nonUTF8QueryEncoding) {
1694                                 queryBegin = c;
1695                                 state = State::NonUTF8Query;
1696                             } else
1697                                 state = State::UTF8Query;
1698                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1699                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1700                             break;
1701                         }
1702                         if (UNLIKELY(*c == '#')) {
1703                             syntaxViolation(c);
1704                             appendToASCIIBuffer("/#", 2);
1705                             ++c;
1706                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1707                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1708                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1709                             state = State::Fragment;
1710                             break;
1711                         }
1712                         state = State::Path;
1713                         break;
1714                     }
1715                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1716                         failure();
1717                         return;
1718                     }
1719                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1720                         syntaxViolation(c);
1721                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1722                         m_url.m_hostEnd = currentPosition(c);
1723                         m_url.m_portLength = 0;
1724                     }
1725                     
1726                     state = State::PathStart;
1727                     break;
1728                 }
1729                 if (isPercentOrNonASCII(*c))
1730                     m_hostHasPercentOrNonASCII = true;
1731                 ++c;
1732             } while (!c.atEnd());
1733             break;
1734         case State::PathStart:
1735             LOG_STATE("PathStart");
1736             if (*c != '/' && *c != '\\') {
1737                 syntaxViolation(c);
1738                 appendToASCIIBuffer('/');
1739             }
1740             m_url.m_pathAfterLastSlash = currentPosition(c);
1741             state = State::Path;
1742             break;
1743         case State::Path:
1744             LOG_STATE("Path");
1745             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1746                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1747                     syntaxViolation(c);
1748                 appendToASCIIBuffer('/');
1749                 ++c;
1750                 m_url.m_pathAfterLastSlash = currentPosition(c);
1751                 break;
1752             }
1753             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1754                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1755                     syntaxViolation(c);
1756                     consumeDoubleDotPathSegment(c);
1757                     popPath();
1758                     break;
1759                 }
1760                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1761                     syntaxViolation(c);
1762                     consumeSingleDotPathSegment(c);
1763                     break;
1764                 }
1765             }
1766             if (*c == '?') {
1767                 m_url.m_pathEnd = currentPosition(c);
1768                 appendToASCIIBuffer('?');
1769                 ++c;
1770                 if (nonUTF8QueryEncoding) {
1771                     queryBegin = c;
1772                     state = State::NonUTF8Query;
1773                 } else
1774                     state = State::UTF8Query;
1775                 break;
1776             }
1777             if (*c == '#') {
1778                 m_url.m_pathEnd = currentPosition(c);
1779                 m_url.m_queryEnd = m_url.m_pathEnd;
1780                 state = State::Fragment;
1781                 break;
1782             }
1783             utf8PercentEncode<isInDefaultEncodeSet>(c);
1784             ++c;
1785             break;
1786         case State::CannotBeABaseURLPath:
1787             LOG_STATE("CannotBeABaseURLPath");
1788             if (*c == '?') {
1789                 m_url.m_pathEnd = currentPosition(c);
1790                 appendToASCIIBuffer('?');
1791                 ++c;
1792                 if (nonUTF8QueryEncoding) {
1793                     queryBegin = c;
1794                     state = State::NonUTF8Query;
1795                 } else
1796                     state = State::UTF8Query;
1797             } else if (*c == '#') {
1798                 m_url.m_pathEnd = currentPosition(c);
1799                 m_url.m_queryEnd = m_url.m_pathEnd;
1800                 state = State::Fragment;
1801             } else if (*c == '/') {
1802                 appendToASCIIBuffer('/');
1803                 ++c;
1804                 m_url.m_pathAfterLastSlash = currentPosition(c);
1805             } else {
1806                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1807                 ++c;
1808             }
1809             break;
1810         case State::UTF8Query:
1811             LOG_STATE("UTF8Query");
1812             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1813             if (*c == '#') {
1814                 m_url.m_queryEnd = currentPosition(c);
1815                 state = State::Fragment;
1816                 break;
1817             }
1818             ASSERT(!nonUTF8QueryEncoding);
1819             utf8QueryEncode(c);
1820             ++c;
1821             break;
1822         case State::NonUTF8Query:
1823             do {
1824                 LOG_STATE("NonUTF8Query");
1825                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1826                 if (*c == '#') {
1827                     encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
1828                     m_url.m_queryEnd = currentPosition(c);
1829                     state = State::Fragment;
1830                     break;
1831                 }
1832                 appendCodePoint(queryBuffer, *c);
1833                 advance(c, queryBegin);
1834             } while (!c.atEnd());
1835             break;
1836         case State::Fragment:
1837             URL_PARSER_LOG("State Fragment");
1838             utf8PercentEncode<isInSimpleEncodeSet>(c);
1839             ++c;
1840             break;
1841         }
1842     }
1843
1844     switch (state) {
1845     case State::SchemeStart:
1846         LOG_FINAL_STATE("SchemeStart");
1847         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1848             m_url = base;
1849             m_url.removeFragmentIdentifier();
1850             return;
1851         }
1852         failure();
1853         return;
1854     case State::Scheme:
1855         LOG_FINAL_STATE("Scheme");
1856         failure();
1857         return;
1858     case State::NoScheme:
1859         LOG_FINAL_STATE("NoScheme");
1860         RELEASE_ASSERT_NOT_REACHED();
1861     case State::SpecialRelativeOrAuthority:
1862         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1863         copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1864         break;
1865     case State::PathOrAuthority:
1866         LOG_FINAL_STATE("PathOrAuthority");
1867         ASSERT(m_url.m_userStart);
1868         ASSERT(m_url.m_userStart == currentPosition(c));
1869         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1870         m_url.m_userStart--;
1871         m_url.m_userEnd = m_url.m_userStart;
1872         m_url.m_passwordEnd = m_url.m_userStart;
1873         m_url.m_hostEnd = m_url.m_userStart;
1874         m_url.m_portLength = 0;
1875         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1876         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1877         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1878         break;
1879     case State::Relative:
1880         LOG_FINAL_STATE("Relative");
1881         RELEASE_ASSERT_NOT_REACHED();
1882     case State::RelativeSlash:
1883         LOG_FINAL_STATE("RelativeSlash");
1884         copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1885         appendToASCIIBuffer('/');
1886         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
1887         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1888         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1889         break;
1890     case State::SpecialAuthoritySlashes:
1891         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1892         m_url.m_userStart = currentPosition(c);
1893         m_url.m_userEnd = m_url.m_userStart;
1894         m_url.m_passwordEnd = m_url.m_userStart;
1895         m_url.m_hostEnd = m_url.m_userStart;
1896         m_url.m_portLength = 0;
1897         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1898         m_url.m_pathEnd = m_url.m_userStart;
1899         m_url.m_queryEnd = m_url.m_userStart;
1900         break;
1901     case State::SpecialAuthorityIgnoreSlashes:
1902         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1903         failure();
1904         return;
1905     case State::AuthorityOrHost:
1906         LOG_FINAL_STATE("AuthorityOrHost");
1907         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1908         m_url.m_passwordEnd = m_url.m_userEnd;
1909         if (authorityOrHostBegin.atEnd()) {
1910             m_url.m_userEnd = m_url.m_userStart;
1911             m_url.m_passwordEnd = m_url.m_userStart;
1912             m_url.m_hostEnd = m_url.m_userStart;
1913             m_url.m_portLength = 0;
1914             m_url.m_pathEnd = m_url.m_userStart;
1915         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1916             failure();
1917             return;
1918         } else {
1919             if (m_urlIsSpecial) {
1920                 syntaxViolation(c);
1921                 appendToASCIIBuffer('/');
1922                 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1923             } else
1924                 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1925         }
1926         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1927         m_url.m_queryEnd = m_url.m_pathEnd;
1928         break;
1929     case State::Host:
1930         LOG_FINAL_STATE("Host");
1931         if (!parseHostAndPort(authorityOrHostBegin)) {
1932             failure();
1933             return;
1934         }
1935         if (m_urlIsSpecial) {
1936             syntaxViolation(c);
1937             appendToASCIIBuffer('/');
1938             m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1939         } else
1940             m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1941         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1942         m_url.m_queryEnd = m_url.m_pathEnd;
1943         break;
1944     case State::File:
1945         LOG_FINAL_STATE("File");
1946         if (base.isValid() && base.protocolIs("file")) {
1947             copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1948             break;
1949         }
1950         syntaxViolation(c);
1951         appendToASCIIBuffer("///", 3);
1952         m_url.m_userStart = currentPosition(c) - 1;
1953         m_url.m_userEnd = m_url.m_userStart;
1954         m_url.m_passwordEnd = m_url.m_userStart;
1955         m_url.m_hostEnd = m_url.m_userStart;
1956         m_url.m_portLength = 0;
1957         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1958         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1959         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1960         break;
1961     case State::FileSlash:
1962         LOG_FINAL_STATE("FileSlash");
1963         syntaxViolation(c);
1964         m_url.m_userStart = currentPosition(c) + 1;
1965         appendToASCIIBuffer("//", 2);
1966         m_url.m_userEnd = m_url.m_userStart;
1967         m_url.m_passwordEnd = m_url.m_userStart;
1968         m_url.m_hostEnd = m_url.m_userStart;
1969         m_url.m_portLength = 0;
1970         if (copyBaseWindowsDriveLetter(base)) {
1971             appendToASCIIBuffer('/');
1972             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1973         } else
1974             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1975         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1976         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1977         break;
1978     case State::FileHost:
1979         LOG_FINAL_STATE("FileHost");
1980         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1981             && isWindowsDriveLetter(authorityOrHostBegin)) {
1982             syntaxViolation(authorityOrHostBegin);
1983             appendToASCIIBuffer('/');
1984             appendWindowsDriveLetter(authorityOrHostBegin);
1985             m_url.m_pathAfterLastSlash = currentPosition(c);
1986             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1987             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1988             break;
1989         }
1990         
1991         if (authorityOrHostBegin == c) {
1992             syntaxViolation(c);
1993             appendToASCIIBuffer('/');
1994             m_url.m_userStart = currentPosition(c) - 1;
1995             m_url.m_userEnd = m_url.m_userStart;
1996             m_url.m_passwordEnd = m_url.m_userStart;
1997             m_url.m_hostEnd = m_url.m_userStart;
1998             m_url.m_portLength = 0;
1999             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
2000             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2001             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2002             break;
2003         }
2004
2005         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2006             failure();
2007             return;
2008         }
2009
2010         syntaxViolation(c);
2011         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2012             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2013             m_url.m_hostEnd = currentPosition(c);
2014             m_url.m_portLength = 0;
2015         }
2016         appendToASCIIBuffer('/');
2017         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
2018         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2019         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2020         break;
2021     case State::PathStart:
2022         LOG_FINAL_STATE("PathStart");
2023         RELEASE_ASSERT_NOT_REACHED();
2024     case State::Path:
2025         LOG_FINAL_STATE("Path");
2026         m_url.m_pathEnd = currentPosition(c);
2027         m_url.m_queryEnd = m_url.m_pathEnd;
2028         break;
2029     case State::CannotBeABaseURLPath:
2030         LOG_FINAL_STATE("CannotBeABaseURLPath");
2031         m_url.m_pathEnd = currentPosition(c);
2032         m_url.m_queryEnd = m_url.m_pathEnd;
2033         break;
2034     case State::UTF8Query:
2035         LOG_FINAL_STATE("UTF8Query");
2036         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2037         m_url.m_queryEnd = currentPosition(c);
2038         break;
2039     case State::NonUTF8Query:
2040         LOG_FINAL_STATE("NonUTF8Query");
2041         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2042         encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
2043         m_url.m_queryEnd = currentPosition(c);
2044         break;
2045     case State::Fragment:
2046         LOG_FINAL_STATE("Fragment");
2047         break;
2048     }
2049
2050     if (LIKELY(!m_didSeeSyntaxViolation)) {
2051         m_url.m_string = m_inputString;
2052         ASSERT(m_asciiBuffer.isEmpty());
2053     } else
2054         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2055     m_url.m_isValid = true;
2056     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2057 }
2058
2059 template<typename CharacterType>
2060 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2061 {
2062     if (UNLIKELY(iterator.atEnd())) {
2063         syntaxViolation(iterator);
2064         m_url.m_userEnd = currentPosition(iterator);
2065         m_url.m_passwordEnd = m_url.m_userEnd;
2066         return;
2067     }
2068     for (; !iterator.atEnd(); advance(iterator)) {
2069         if (*iterator == ':') {
2070             m_url.m_userEnd = currentPosition(iterator);
2071             auto iteratorAtColon = iterator;
2072             ++iterator;
2073             bool tabOrNewlineAfterColon = false;
2074             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2075                 tabOrNewlineAfterColon = true;
2076                 ++iterator;
2077             }
2078             if (UNLIKELY(iterator.atEnd())) {
2079                 syntaxViolation(iteratorAtColon);
2080                 m_url.m_passwordEnd = m_url.m_userEnd;
2081                 if (m_url.m_userEnd > m_url.m_userStart)
2082                     appendToASCIIBuffer('@');
2083                 return;
2084             }
2085             if (tabOrNewlineAfterColon)
2086                 syntaxViolation(iteratorAtColon);
2087             appendToASCIIBuffer(':');
2088             break;
2089         }
2090         utf8PercentEncode<WebCore::isInUserInfoEncodeSet>(iterator);
2091     }
2092     for (; !iterator.atEnd(); advance(iterator))
2093         utf8PercentEncode<WebCore::isInUserInfoEncodeSet>(iterator);
2094     m_url.m_passwordEnd = currentPosition(iterator);
2095     if (!m_url.m_userEnd)
2096         m_url.m_userEnd = m_url.m_passwordEnd;
2097     appendToASCIIBuffer('@');
2098 }
2099
2100 template<typename UnsignedIntegerType>
2101 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2102 {
2103     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2104     LChar* end = std::end(buf);
2105     LChar* p = end;
2106     do {
2107         *--p = (number % 10) + '0';
2108         number /= 10;
2109     } while (number);
2110     appendToASCIIBuffer(p, end - p);
2111 }
2112
2113 void URLParser::serializeIPv4(IPv4Address address)
2114 {
2115     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2116     appendToASCIIBuffer('.');
2117     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2118     appendToASCIIBuffer('.');
2119     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2120     appendToASCIIBuffer('.');
2121     appendNumberToASCIIBuffer<uint8_t>(address);
2122 }
2123     
// Returns how many consecutive zero pieces `address` holds starting at index `begin`.
// The result is 0 when address[begin] is non-zero or `begin` is 8 or more.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2133
2134 static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2135 {
2136     std::optional<size_t> longest;
2137     size_t longestLength = 0;
2138     for (size_t i = 0; i < 8; i++) {
2139         size_t length = zeroSequenceLength(address, i);
2140         if (length) {
2141             if (length > 1 && (!longest || longestLength < length)) {
2142                 longest = i;
2143                 longestLength = length;
2144             }
2145             i += length;
2146         }
2147     }
2148     return longest;
2149 }
2150
2151 void URLParser::serializeIPv6Piece(uint16_t piece)
2152 {
2153     bool printed = false;
2154     if (auto nibble0 = piece >> 12) {
2155         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2156         printed = true;
2157     }
2158     auto nibble1 = piece >> 8 & 0xF;
2159     if (printed || nibble1) {
2160         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2161         printed = true;
2162     }
2163     auto nibble2 = piece >> 4 & 0xF;
2164     if (printed || nibble2)
2165         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2166     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2167 }
2168
2169 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2170 {
2171     appendToASCIIBuffer('[');
2172     auto compressPointer = findLongestZeroSequence(address);
2173     for (size_t piece = 0; piece < 8; piece++) {
2174         if (compressPointer && compressPointer.value() == piece) {
2175             ASSERT(!address[piece]);
2176             if (piece)
2177                 appendToASCIIBuffer(':');
2178             else
2179                 appendToASCIIBuffer("::", 2);
2180             while (piece < 8 && !address[piece])
2181                 piece++;
2182             if (piece == 8)
2183                 break;
2184         }
2185         serializeIPv6Piece(address[piece]);
2186         if (piece < 7)
2187             appendToASCIIBuffer(':');
2188     }
2189     appendToASCIIBuffer(']');
2190 }
2191
2192 enum class URLParser::IPv4PieceParsingError {
2193     Failure,
2194     Overflow,
2195 };
2196
2197 template<typename CharacterType>
2198 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2199 {
2200     enum class State : uint8_t {
2201         UnknownBase,
2202         Decimal,
2203         OctalOrHex,
2204         Octal,
2205         Hex,
2206     };
2207     State state = State::UnknownBase;
2208     Checked<uint32_t, RecordOverflow> value = 0;
2209     if (!iterator.atEnd() && *iterator == '.')
2210         return makeUnexpected(IPv4PieceParsingError::Failure);
2211     while (!iterator.atEnd()) {
2212         if (isTabOrNewline(*iterator)) {
2213             didSeeSyntaxViolation = true;
2214             ++iterator;
2215             continue;
2216         }
2217         if (*iterator == '.') {
2218             ASSERT(!value.hasOverflowed());
2219             return value.unsafeGet();
2220         }
2221         switch (state) {
2222         case State::UnknownBase:
2223             if (UNLIKELY(*iterator == '0')) {
2224                 ++iterator;
2225                 state = State::OctalOrHex;
2226                 break;
2227             }
2228             state = State::Decimal;
2229             break;
2230         case State::OctalOrHex:
2231             didSeeSyntaxViolation = true;
2232             if (*iterator == 'x' || *iterator == 'X') {
2233                 ++iterator;
2234                 state = State::Hex;
2235                 break;
2236             }
2237             state = State::Octal;
2238             break;
2239         case State::Decimal:
2240             if (!isASCIIDigit(*iterator))
2241                 return makeUnexpected(IPv4PieceParsingError::Failure);
2242             value *= 10;
2243             value += *iterator - '0';
2244             if (UNLIKELY(value.hasOverflowed()))
2245                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2246             ++iterator;
2247             break;
2248         case State::Octal:
2249             ASSERT(didSeeSyntaxViolation);
2250             if (*iterator < '0' || *iterator > '7')
2251                 return makeUnexpected(IPv4PieceParsingError::Failure);
2252             value *= 8;
2253             value += *iterator - '0';
2254             if (UNLIKELY(value.hasOverflowed()))
2255                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2256             ++iterator;
2257             break;
2258         case State::Hex:
2259             ASSERT(didSeeSyntaxViolation);
2260             if (!isASCIIHexDigit(*iterator))
2261                 return makeUnexpected(IPv4PieceParsingError::Failure);
2262             value *= 16;
2263             value += toASCIIHexValue(*iterator);
2264             if (UNLIKELY(value.hasOverflowed()))
2265                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2266             ++iterator;
2267             break;
2268         }
2269     }
2270     ASSERT(!value.hasOverflowed());
2271     return value.unsafeGet();
2272 }
2273
2274 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2275 {
2276     RELEASE_ASSERT(exponent <= 4);
2277     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2278     return values[exponent];
2279 }
2280
2281 enum class URLParser::IPv4ParsingError {
2282     Failure,
2283     NotIPv4,
2284 };
2285
2286 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2287 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2288 {
2289     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2290     bool didSeeSyntaxViolation = false;
2291     if (!iterator.atEnd() && *iterator == '.')
2292         return makeUnexpected(IPv4ParsingError::NotIPv4);
2293     while (!iterator.atEnd()) {
2294         if (isTabOrNewline(*iterator)) {
2295             didSeeSyntaxViolation = true;
2296             ++iterator;
2297             continue;
2298         }
2299         if (items.size() >= 4)
2300             return makeUnexpected(IPv4ParsingError::NotIPv4);
2301         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2302         if (!iterator.atEnd() && *iterator == '.') {
2303             ++iterator;
2304             if (iterator.atEnd())
2305                 syntaxViolation(iteratorForSyntaxViolationPosition);
2306             else if (*iterator == '.')
2307                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2308         }
2309     }
2310     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2311         return makeUnexpected(IPv4ParsingError::NotIPv4);
2312     for (const auto& item : items) {
2313         if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2314             return makeUnexpected(IPv4ParsingError::NotIPv4);
2315     }
2316     for (const auto& item : items) {
2317         if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2318             return makeUnexpected(IPv4ParsingError::Failure);
2319     }
2320     if (items.size() > 1) {
2321         for (size_t i = 0; i < items.size() - 1; i++) {
2322             if (items[i].value() > 255)
2323                 return makeUnexpected(IPv4ParsingError::Failure);
2324         }
2325     }
2326     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2327         return makeUnexpected(IPv4ParsingError::Failure);
2328
2329     if (didSeeSyntaxViolation)
2330         syntaxViolation(iteratorForSyntaxViolationPosition);
2331     for (const auto& item : items) {
2332         if (item.value() > 255)
2333             syntaxViolation(iteratorForSyntaxViolationPosition);
2334     }
2335
2336     if (UNLIKELY(items.size() != 4))
2337         syntaxViolation(iteratorForSyntaxViolationPosition);
2338
2339     IPv4Address ipv4 = items.takeLast().value();
2340     for (size_t counter = 0; counter < items.size(); ++counter)
2341         ipv4 += items[counter].value() * pow256(3 - counter);
2342     return ipv4;
2343 }
2344
2345 template<typename CharacterType>
2346 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2347 {
2348     if (iterator.atEnd())
2349         return std::nullopt;
2350     uint32_t piece = 0;
2351     bool leadingZeros = false;
2352     size_t digitCount = 0;
2353     while (!iterator.atEnd()) {
2354         if (!isASCIIDigit(*iterator))
2355             return std::nullopt;
2356         ++digitCount;
2357         if (!piece && *iterator == '0') {
2358             if (leadingZeros)
2359                 return std::nullopt;
2360             leadingZeros = true;
2361         }
2362         if (!piece && *iterator == '0')
2363             leadingZeros = true;
2364         piece = piece * 10 + *iterator - '0';
2365         if (piece > 255)
2366             return std::nullopt;
2367         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2368         if (iterator.atEnd())
2369             break;
2370         if (*iterator == '.')
2371             break;
2372     }
2373     if (piece && leadingZeros)
2374         return std::nullopt;
2375     return piece;
2376 }
2377
2378 template<typename CharacterType>
2379 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2380 {
2381     IPv4Address address = 0;
2382     for (size_t i = 0; i < 4; ++i) {
2383         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2384             address = (address << 8) + piece.value();
2385         else
2386             return std::nullopt;
2387         if (i < 3) {
2388             if (iterator.atEnd())
2389                 return std::nullopt;
2390             if (*iterator != '.')
2391                 return std::nullopt;
2392             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2393         } else if (!iterator.atEnd())
2394             return std::nullopt;
2395     }
2396     ASSERT(iterator.atEnd());
2397     return address;
2398 }
2399
2400 template<typename CharacterType>
2401 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2402 {
2403     ASSERT(*c == '[');
2404     const auto hostBegin = c;
2405     advance(c, hostBegin);
2406     if (c.atEnd())
2407         return std::nullopt;
2408
2409     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2410     size_t piecePointer = 0;
2411     std::optional<size_t> compressPointer;
2412
2413     if (*c == ':') {
2414         advance(c, hostBegin);
2415         if (c.atEnd())
2416             return std::nullopt;
2417         if (*c != ':')
2418             return std::nullopt;
2419         advance(c, hostBegin);
2420         ++piecePointer;
2421         compressPointer = piecePointer;
2422     }
2423     
2424     while (!c.atEnd()) {
2425         if (piecePointer == 8)
2426             return std::nullopt;
2427         if (*c == ':') {
2428             if (compressPointer)
2429                 return std::nullopt;
2430             advance(c, hostBegin);
2431             ++piecePointer;
2432             compressPointer = piecePointer;
2433             continue;
2434         }
2435         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2436             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2437                 if (compressPointer && piecePointer == 5)
2438                     return std::nullopt;
2439                 syntaxViolation(hostBegin);
2440                 address[piecePointer++] = ipv4Address.value() >> 16;
2441                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2442                 c = { };
2443                 break;
2444             }
2445         }
2446         uint16_t value = 0;
2447         size_t length = 0;
2448         bool leadingZeros = false;
2449         for (; length < 4; length++) {
2450             if (c.atEnd())
2451                 break;
2452             if (!isASCIIHexDigit(*c))
2453                 break;
2454             if (isASCIIUpper(*c))
2455                 syntaxViolation(hostBegin);
2456             if (*c == '0' && !length)
2457                 leadingZeros = true;
2458             value = value * 0x10 + toASCIIHexValue(*c);
2459             advance(c, hostBegin);
2460         }
2461         
2462         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2463             syntaxViolation(hostBegin);
2464
2465         address[piecePointer++] = value;
2466         if (c.atEnd())
2467             break;
2468         if (piecePointer == 8 || *c != ':')
2469             return std::nullopt;
2470         advance(c, hostBegin);
2471     }
2472     
2473     if (!c.atEnd())
2474         return std::nullopt;
2475     
2476     if (compressPointer) {
2477         size_t swaps = piecePointer - compressPointer.value();
2478         piecePointer = 7;
2479         while (swaps)
2480             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2481     } else if (piecePointer != 8)
2482         return std::nullopt;
2483
2484     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2485     if (possibleCompressPointer)
2486         possibleCompressPointer.value()++;
2487     if (UNLIKELY(compressPointer != possibleCompressPointer))
2488         syntaxViolation(hostBegin);
2489     
2490     return address;
2491 }
2492
2493 template<typename CharacterType>
2494 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2495 {
2496     LCharBuffer output;
2497     output.reserveInitialCapacity(length);
2498     
2499     for (size_t i = 0; i < length; ++i) {
2500         uint8_t byte = input[i];
2501         if (byte != '%')
2502             output.uncheckedAppend(byte);
2503         else if (length > 2 && i < length - 2) {
2504             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2505                 syntaxViolation(iteratorForSyntaxViolationPosition);
2506                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2507                 i += 2;
2508             } else
2509                 output.uncheckedAppend(byte);
2510         } else
2511             output.uncheckedAppend(byte);
2512     }
2513     return output;
2514 }
2515     
2516 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2517 {
2518     LCharBuffer output;
2519     output.reserveInitialCapacity(length);
2520     
2521     for (size_t i = 0; i < length; ++i) {
2522         uint8_t byte = input[i];
2523         if (byte != '%')
2524             output.uncheckedAppend(byte);
2525         else if (length > 2 && i < length - 2) {
2526             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2527                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2528                 i += 2;
2529             } else
2530                 output.uncheckedAppend(byte);
2531         } else
2532             output.uncheckedAppend(byte);
2533     }
2534     return output;
2535 }
2536
2537 template<typename CharacterType> std::optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2538 {
2539     LCharBuffer ascii;
2540     if (domain.isAllASCII()) {
2541         size_t length = domain.length();
2542         if (domain.is8Bit()) {
2543             const LChar* characters = domain.characters8();
2544             ascii.reserveInitialCapacity(length);
2545             for (size_t i = 0; i < length; ++i) {
2546                 if (UNLIKELY(isASCIIUpper(characters[i])))
2547                     syntaxViolation(iteratorForSyntaxViolationPosition);
2548                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2549             }
2550         } else {
2551             const UChar* characters = domain.characters16();
2552             ascii.reserveInitialCapacity(length);
2553             for (size_t i = 0; i < length; ++i) {
2554                 if (UNLIKELY(isASCIIUpper(characters[i])))
2555                     syntaxViolation(iteratorForSyntaxViolationPosition);
2556                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2557             }
2558         }
2559         return ascii;
2560     }
2561     
2562     const size_t maxDomainLength = 64;
2563     UChar hostnameBuffer[maxDomainLength];
2564     UErrorCode error = U_ZERO_ERROR;
2565     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2566     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, maxDomainLength, &processingDetails, &error);
2567     ASSERT(numCharactersConverted <= static_cast<int32_t>(maxDomainLength));
2568
2569     if (U_SUCCESS(error) && !processingDetails.errors) {
2570         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2571             ASSERT(isASCII(hostnameBuffer[i]));
2572             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2573         }
2574         ascii.append(hostnameBuffer, numCharactersConverted);
2575         if (domain != StringView(ascii.data(), ascii.size()))
2576             syntaxViolation(iteratorForSyntaxViolationPosition);
2577         return ascii;
2578     }
2579     return std::nullopt;
2580 }
2581
2582 bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2583 {
2584     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2585         if (isForbiddenHostCodePoint(asciiDomain[i]))
2586             return true;
2587     }
2588     return false;
2589 }
2590
2591 template<typename CharacterType>
2592 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2593 {
2594     ASSERT(*iterator == ':');
2595     auto colonIterator = iterator;
2596     advance(iterator, colonIterator);
2597     uint32_t port = 0;
2598     if (UNLIKELY(iterator.atEnd())) {
2599         unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2600         RELEASE_ASSERT(portLength <= URL::maxPortLength);
2601         m_url.m_portLength = portLength;
2602         syntaxViolation(colonIterator);
2603         return true;
2604     }
2605     size_t digitCount = 0;
2606     bool leadingZeros = false;
2607     for (; !iterator.atEnd(); ++iterator) {
2608         if (UNLIKELY(isTabOrNewline(*iterator))) {
2609             syntaxViolation(colonIterator);
2610             continue;
2611         }
2612         if (isASCIIDigit(*iterator)) {
2613             if (*iterator == '0' && !digitCount)
2614                 leadingZeros = true;
2615             ++digitCount;
2616             port = port * 10 + *iterator - '0';
2617             if (port > std::numeric_limits<uint16_t>::max())
2618                 return false;
2619         } else
2620             return false;
2621     }
2622
2623     if (port && leadingZeros)
2624         syntaxViolation(colonIterator);
2625     
2626     if (!port && digitCount > 1)
2627         syntaxViolation(colonIterator);
2628
2629     ASSERT(port == static_cast<uint16_t>(port));
2630     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2631         syntaxViolation(colonIterator);
2632     else {
2633         appendToASCIIBuffer(':');
2634         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2635         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2636     }
2637
2638     unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2639     RELEASE_ASSERT(portLength <= URL::maxPortLength);
2640     m_url.m_portLength = portLength;
2641     return true;
2642 }
2643
2644 template<typename CharacterType>
2645 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2646 {
2647     if (iterator.atEnd())
2648         return false;
2649     if (*iterator == ':')
2650         return false;
2651     if (*iterator == '[') {
2652         auto ipv6End = iterator;
2653         while (!ipv6End.atEnd() && *ipv6End != ']')
2654             ++ipv6End;
2655         if (ipv6End.atEnd())
2656             return false;
2657         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2658             serializeIPv6(address.value());
2659             if (!ipv6End.atEnd()) {
2660                 advance(ipv6End);
2661                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2662                     m_url.m_hostEnd = currentPosition(ipv6End);
2663                     return parsePort(ipv6End);
2664                 }
2665                 m_url.m_hostEnd = currentPosition(ipv6End);
2666                 m_url.m_portLength = 0;
2667                 return true;
2668             }
2669             m_url.m_hostEnd = currentPosition(ipv6End);
2670             return true;
2671         }
2672         return false;
2673     }
2674
2675     if (!m_urlIsSpecial) {
2676         for (; !iterator.atEnd(); ++iterator) {
2677             if (UNLIKELY(isTabOrNewline(*iterator))) {
2678                 syntaxViolation(iterator);
2679                 continue;
2680             }
2681             if (*iterator == ':')
2682                 break;
2683             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2684                 return false;
2685             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2686         }
2687         m_url.m_hostEnd = currentPosition(iterator);
2688         if (iterator.atEnd()) {
2689             m_url.m_portLength = 0;
2690             return true;
2691         }
2692         return parsePort(iterator);
2693     }
2694     
2695     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2696         auto hostIterator = iterator;
2697         for (; !iterator.atEnd(); ++iterator) {
2698             if (isTabOrNewline(*iterator))
2699                 continue;
2700             if (*iterator == ':')
2701                 break;
2702             if (isForbiddenHostCodePoint(*iterator))
2703                 return false;
2704         }
2705         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2706         if (address) {
2707             serializeIPv4(address.value());
2708             m_url.m_hostEnd = currentPosition(iterator);
2709             if (iterator.atEnd()) {
2710                 m_url.m_portLength = 0;
2711                 return true;
2712             }
2713             return parsePort(iterator);
2714         }
2715         if (address.error() == IPv4ParsingError::Failure)
2716             return false;
2717         for (; hostIterator != iterator; ++hostIterator) {
2718             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2719                 syntaxViolation(hostIterator);
2720                 continue;
2721             }
2722             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2723                 syntaxViolation(hostIterator);
2724             appendToASCIIBuffer(toASCIILower(*hostIterator));
2725         }
2726         m_url.m_hostEnd = currentPosition(iterator);
2727         if (!hostIterator.atEnd())
2728             return parsePort(hostIterator);
2729         unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2730         RELEASE_ASSERT(portLength <= URL::maxPortLength);
2731         m_url.m_portLength = portLength;
2732         return true;
2733     }
2734     
2735     const auto hostBegin = iterator;
2736     
2737     LCharBuffer utf8Encoded;
2738     for (; !iterator.atEnd(); ++iterator) {
2739         if (UNLIKELY(isTabOrNewline(*iterator))) {
2740             syntaxViolation(hostBegin);
2741             continue;
2742         }
2743         if (*iterator == ':')
2744             break;
2745         if (UNLIKELY(!isASCII(*iterator)))
2746             syntaxViolation(hostBegin);
2747
2748         if (!U_IS_UNICODE_CHAR(*iterator))
2749             return false;
2750         uint8_t buffer[U8_MAX_LENGTH];
2751         int32_t offset = 0;
2752         U8_APPEND_UNSAFE(buffer, offset, *iterator);
2753         utf8Encoded.append(buffer, offset);
2754     }
2755     LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2756     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2757     if (domain.isNull())
2758         return false;
2759     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2760         syntaxViolation(hostBegin);
2761     auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2762     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2763         return false;
2764     LCharBuffer& asciiDomainValue = asciiDomain.value();
2765     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2766
2767     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2768     if (address) {
2769         serializeIPv4(address.value());
2770         m_url.m_hostEnd = currentPosition(iterator);
2771         if (iterator.atEnd()) {
2772             m_url.m_portLength = 0;
2773             return true;
2774         }
2775         return parsePort(iterator);
2776     }
2777     if (address.error() == IPv4ParsingError::Failure)
2778         return false;
2779
2780     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2781     m_url.m_hostEnd = currentPosition(iterator);
2782     if (!iterator.atEnd())
2783         return parsePort(iterator);
2784     m_url.m_portLength = 0;
2785     return true;
2786 }
2787
2788 std::optional<String> URLParser::formURLDecode(StringView input)
2789 {
2790     auto utf8 = input.utf8(StrictConversion);
2791     if (utf8.isNull())
2792         return std::nullopt;
2793     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2794     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2795 }
2796
2797 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2798 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2799 {
2800     URLEncodedForm output;
2801     for (StringView bytes : input.split('&')) {
2802         auto equalIndex = bytes.find('=');
2803         if (equalIndex == notFound) {
2804             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2805             if (name)
2806                 output.append({ name.value(), emptyString() });
2807         } else {
2808             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2809             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2810             if (name && value)
2811                 output.append({ name.value(), value.value() });
2812         }
2813     }
2814     return output;
2815 }
2816
2817 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2818 {
2819     auto utf8 = input.utf8(StrictConversion);
2820     const char* data = utf8.data();
2821     for (size_t i = 0; i < utf8.length(); ++i) {
2822         const char byte = data[i];
2823         if (byte == 0x20)
2824             output.append(0x2B);
2825         else if (byte == 0x2A
2826             || byte == 0x2D
2827             || byte == 0x2E
2828             || (byte >= 0x30 && byte <= 0x39)
2829             || (byte >= 0x41 && byte <= 0x5A)
2830             || byte == 0x5F
2831             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2832             output.append(byte);
2833         else
2834             percentEncodeByte(byte, output);
2835     }
2836 }
2837     
2838 String URLParser::serialize(const URLEncodedForm& tuples)
2839 {
2840     if (tuples.isEmpty())
2841         return { };
2842
2843     Vector<LChar> output;
2844     for (auto& tuple : tuples) {
2845         if (!output.isEmpty())
2846             output.append('&');
2847         serializeURLEncodedForm(tuple.key, output);
2848         output.append('=');
2849         serializeURLEncodedForm(tuple.value, output);
2850     }
2851     return String::adopt(WTFMove(output));
2852 }
2853
2854 const UIDNA& URLParser::internationalDomainNameTranscoder()
2855 {
2856     static UIDNA* encoder;
2857     static std::once_flag onceFlag;
2858     std::call_once(onceFlag, [] {
2859         UErrorCode error = U_ZERO_ERROR;
2860         // Warning: Please contact a WebKitGTK+ developer if changing these flags.
2861         // They should be synced with ephy_uri_decode() in ephy-uri-helpers.c.
2862         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2863         RELEASE_ASSERT(U_SUCCESS(error));
2864         RELEASE_ASSERT(encoder);
2865     });
2866     return *encoder;
2867 }
2868
2869 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2870 {
2871     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2872         a.m_isValid,
2873         a.m_cannotBeABaseURL,
2874         a.m_protocolIsInHTTPFamily,
2875         a.m_schemeEnd,
2876         a.m_userStart,
2877         a.m_userEnd,
2878         a.m_passwordEnd,
2879         a.m_hostEnd,
2880         a.m_hostEnd + a.m_portLength,
2881         a.m_pathAfterLastSlash,
2882         a.m_pathEnd,
2883         a.m_queryEnd,
2884         a.m_string.utf8().data(),
2885         b.m_isValid,
2886         b.m_cannotBeABaseURL,
2887         b.m_protocolIsInHTTPFamily,
2888         b.m_schemeEnd,
2889         b.m_userStart,
2890         b.m_userEnd,
2891         b.m_passwordEnd,
2892         b.m_hostEnd,
2893         b.m_hostEnd + b.m_portLength,
2894         b.m_pathAfterLastSlash,
2895         b.m_pathEnd,
2896         b.m_queryEnd,
2897         b.m_string.utf8().data());
2898
2899     return a.m_string == b.m_string
2900         && a.m_isValid == b.m_isValid
2901         && a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2902         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2903         && a.m_schemeEnd == b.m_schemeEnd
2904         && a.m_userStart == b.m_userStart
2905         && a.m_userEnd == b.m_userEnd
2906         && a.m_passwordEnd == b.m_passwordEnd
2907         && a.m_hostEnd == b.m_hostEnd
2908         && a.m_portLength == b.m_portLength
2909         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2910         && a.m_pathEnd == b.m_pathEnd
2911         && a.m_queryEnd == b.m_queryEnd;
2912 }
2913
2914 bool URLParser::internalValuesConsistent(const URL& url)
2915 {
2916     return url.m_schemeEnd <= url.m_userStart
2917         && url.m_userStart <= url.m_userEnd
2918         && url.m_userEnd <= url.m_passwordEnd
2919         && url.m_passwordEnd <= url.m_hostEnd
2920         && url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2921         && url.m_pathAfterLastSlash <= url.m_pathEnd
2922         && url.m_pathEnd <= url.m_queryEnd
2923         && url.m_queryEnd <= url.m_string.length();
2924 }
2925
2926 } // namespace WebCore