Unreviewed, rolling out r219024.
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <mutex>
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
35
36 namespace WebCore {
37
38 #define URL_PARSER_DEBUGGING 0
39
40 #if URL_PARSER_DEBUGGING
41 #define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
42 #else
43 #define URL_PARSER_LOG(...)
44 #endif
45     
46 template<typename CharacterType>
47 class CodePointIterator {
48 public:
49     ALWAYS_INLINE CodePointIterator() { }
50     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51         : m_begin(begin)
52         , m_end(end)
53     {
54     }
55     
56     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57         : CodePointIterator(begin.m_begin, end.m_begin)
58     {
59         ASSERT(end.m_begin >= begin.m_begin);
60     }
61     
62     ALWAYS_INLINE UChar32 operator*() const;
63     ALWAYS_INLINE CodePointIterator& operator++();
64
65     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66     {
67         return m_begin == other.m_begin
68             && m_end == other.m_end;
69     }
70     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71     
72     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73     {
74         m_begin = other.m_begin;
75         m_end = other.m_end;
76         return *this;
77     }
78
79     ALWAYS_INLINE bool atEnd() const
80     {
81         ASSERT(m_begin <= m_end);
82         return m_begin >= m_end;
83     }
84     
85     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86     {
87         ASSERT(m_begin >= reference);
88         return m_begin - reference;
89     }
90
91     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92     {
93         return codeUnitsSince(other.m_begin);
94     }
95     
96 private:
97     const CharacterType* m_begin { nullptr };
98     const CharacterType* m_end { nullptr };
99 };
100
101 template<>
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
103 {
104     ASSERT(!atEnd());
105     return *m_begin;
106 }
107
108 template<>
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
110 {
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     unsigned i = 0;
128     size_t length = m_end - m_begin;
129     U16_FWD_1(m_begin, i, length);
130     m_begin += i;
131     return *this;
132 }
133     
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
135 {
136     if (U_IS_BMP(codePoint)) {
137         destination.append(static_cast<UChar>(codePoint));
138         return;
139     }
140     destination.reserveCapacity(destination.size() + 2);
141     destination.uncheckedAppend(U16_LEAD(codePoint));
142     destination.uncheckedAppend(U16_TRAIL(codePoint));
143 }
144
// Bit flags naming the character classes recorded in characterClassTable.
// Each class has its own bit so that several can be OR'ed into one table entry.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};

// Class flags for each of the 256 possible 8-bit code units, indexed by code unit value.
static const uint8_t characterClassTable[256] = {
    // 0x00 - 0x1F
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x00
    UserInfo | Default | QueryPercent, // 0x01
    UserInfo | Default | QueryPercent, // 0x02
    UserInfo | Default | QueryPercent, // 0x03
    UserInfo | Default | QueryPercent, // 0x04
    UserInfo | Default | QueryPercent, // 0x05
    UserInfo | Default | QueryPercent, // 0x06
    UserInfo | Default | QueryPercent, // 0x07
    UserInfo | Default | QueryPercent, // 0x08
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x09 (tab)
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0A (line feed)
    UserInfo | Default | QueryPercent, // 0x0B
    UserInfo | Default | QueryPercent, // 0x0C
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0D (carriage return)
    UserInfo | Default | QueryPercent, // 0x0E
    UserInfo | Default | QueryPercent, // 0x0F
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F

    // 0x20 - 0x2F
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    QueryPercent, // '''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'

    // '0' - '9'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '0' - '4'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '5' - '9'

    // 0x3A - 0x40
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'

    // 'A' - 'Z'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'A' - 'F'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'G' - 'L'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'M' - 'R'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'S' - 'X'
    ValidScheme, ValidScheme, // 'Y' - 'Z'

    // 0x5B - 0x60
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'

    // 'a' - 'z'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'a' - 'f'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'g' - 'l'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'm' - 'r'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 's' - 'x'
    ValidScheme, ValidScheme, // 'y' - 'z'

    // 0x7B - 0x7F
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F

    // 0x80 - 0xFF: every entry is QueryPercent only.
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x80 - 0x87
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x88 - 0x8F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x90 - 0x97
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x98 - 0x9F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA0 - 0xA7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA8 - 0xAF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB0 - 0xB7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB8 - 0xBF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC0 - 0xC7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC8 - 0xCF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD0 - 0xD7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD8 - 0xDF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE0 - 0xE7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE8 - 0xEF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF0 - 0xF7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF8 - 0xFF
};
412
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
423 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
424
425 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
426 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
427 {
428     ++iterator;
429     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
430         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
431             syntaxViolation(iteratorForSyntaxViolationPosition);
432         ++iterator;
433     }
434 }
435
436 template<typename CharacterType>
437 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
438 {
439     if (iterator.atEnd())
440         return false;
441     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
442     if (iterator.atEnd())
443         return false;
444     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
445     return iterator.atEnd();
446 }
447
448 template<typename CharacterType>
449 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
450 {
451     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
452         return false;
453     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
454     if (iterator.atEnd())
455         return false;
456     if (*iterator == ':')
457         return true;
458     if (UNLIKELY(*iterator == '|'))
459         return true;
460     return false;
461 }
462
463 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
464 {
465     ASSERT(isASCII(codePoint));
466     if (UNLIKELY(m_didSeeSyntaxViolation))
467         m_asciiBuffer.append(codePoint);
468 }
469
470 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
471 {
472     if (UNLIKELY(m_didSeeSyntaxViolation))
473         m_asciiBuffer.append(characters, length);
474 }
475
476 template<typename CharacterType>
477 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
478 {
479     ASSERT(isWindowsDriveLetter(iterator));
480     appendToASCIIBuffer(*iterator);
481     advance(iterator);
482     ASSERT(!iterator.atEnd());
483     ASSERT(*iterator == ':' || *iterator == '|');
484     if (*iterator == '|')
485         syntaxViolation(iterator);
486     appendToASCIIBuffer(':');
487     advance(iterator);
488 }
489
490 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
491 {
492     if (base.protocolIs("file")) {
493         RELEASE_ASSERT(base.m_portEnd < base.m_string.length());
494         if (base.m_string.is8Bit()) {
495             const LChar* begin = base.m_string.characters8();
496             CodePointIterator<LChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
497             if (isWindowsDriveLetter(c)) {
498                 appendWindowsDriveLetter(c);
499                 return true;
500             }
501         } else {
502             const UChar* begin = base.m_string.characters16();
503             CodePointIterator<UChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
504             if (isWindowsDriveLetter(c)) {
505                 appendWindowsDriveLetter(c);
506                 return true;
507             }
508         }
509     }
510     return false;
511 }
512
513 template<typename CharacterType>
514 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
515 {
516     if (!isWindowsDriveLetter(iterator))
517         return true;
518     if (iterator.atEnd())
519         return false;
520     advance(iterator);
521     if (iterator.atEnd())
522         return true;
523     advance(iterator);
524     if (iterator.atEnd())
525         return true;
526     return !isSlashQuestionOrHash(*iterator);
527 }
528
529 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
530 {
531     buffer.append('%');
532     buffer.append(upperNibbleToASCIIHexDigit(byte));
533     buffer.append(lowerNibbleToASCIIHexDigit(byte));
534 }
535
536 void URLParser::percentEncodeByte(uint8_t byte)
537 {
538     ASSERT(m_didSeeSyntaxViolation);
539     appendToASCIIBuffer('%');
540     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
541     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
542 }
543
// Percent-encoded bytes EF BF BD, appended in place of code points for which
// U_IS_UNICODE_CHAR is false.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string above, not counting its terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = (sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0])) - 1;
546
547 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
548 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
549 {
550     ASSERT(!iterator.atEnd());
551     UChar32 codePoint = *iterator;
552     if (LIKELY(isASCII(codePoint))) {
553         if (UNLIKELY(isInCodeSet(codePoint))) {
554             syntaxViolation(iterator);
555             percentEncodeByte(codePoint);
556         } else
557             appendToASCIIBuffer(codePoint);
558         return;
559     }
560     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
561     syntaxViolation(iterator);
562     
563     if (!U_IS_UNICODE_CHAR(codePoint)) {
564         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
565         return;
566     }
567     
568     uint8_t buffer[U8_MAX_LENGTH];
569     int32_t offset = 0;
570     U8_APPEND_UNSAFE(buffer, offset, codePoint);
571     for (int32_t i = 0; i < offset; ++i)
572         percentEncodeByte(buffer[i]);
573 }
574
575 template<typename CharacterType>
576 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
577 {
578     ASSERT(!iterator.atEnd());
579     UChar32 codePoint = *iterator;
580     if (LIKELY(isASCII(codePoint))) {
581         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
582             syntaxViolation(iterator);
583             percentEncodeByte(codePoint);
584         } else
585             appendToASCIIBuffer(codePoint);
586         return;
587     }
588     
589     syntaxViolation(iterator);
590     
591     if (!U_IS_UNICODE_CHAR(codePoint)) {
592         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
593         return;
594     }
595
596     uint8_t buffer[U8_MAX_LENGTH];
597     int32_t offset = 0;
598     U8_APPEND_UNSAFE(buffer, offset, codePoint);
599     for (int32_t i = 0; i < offset; ++i) {
600         auto byte = buffer[i];
601         if (shouldPercentEncodeQueryByte(byte))
602             percentEncodeByte(byte);
603         else
604             appendToASCIIBuffer(byte);
605     }
606 }
607
608 template<typename CharacterType>
609 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
610 {
611     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
612     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
613     const char* data = encoded.data();
614     size_t length = encoded.length();
615     
616     if (!length == !iterator.atEnd()) {
617         syntaxViolation(iterator);
618         return;
619     }
620     
621     size_t i = 0;
622     for (; i < length; ++i) {
623         ASSERT(!iterator.atEnd());
624         uint8_t byte = data[i];
625         if (UNLIKELY(byte != *iterator)) {
626             syntaxViolation(iterator);
627             break;
628         }
629         if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
630             syntaxViolation(iterator);
631             break;
632         }
633         appendToASCIIBuffer(byte);
634         ++iterator;
635     }
636     while (!iterator.atEnd() && isTabOrNewline(*iterator))
637         ++iterator;
638     ASSERT((i == length) == iterator.atEnd());
639     for (; i < length; ++i) {
640         ASSERT(m_didSeeSyntaxViolation);
641         uint8_t byte = data[i];
642         if (shouldPercentEncodeQueryByte(byte))
643             percentEncodeByte(byte);
644         else
645             appendToASCIIBuffer(byte);
646     }
647 }
648
649 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
650 {
651     static const uint16_t ftpPort = 21;
652     static const uint16_t gopherPort = 70;
653     static const uint16_t httpPort = 80;
654     static const uint16_t httpsPort = 443;
655     static const uint16_t wsPort = 80;
656     static const uint16_t wssPort = 443;
657     
658     auto length = scheme.length();
659     if (!length)
660         return std::nullopt;
661     switch (scheme[0]) {
662     case 'w':
663         switch (length) {
664         case 2:
665             if (scheme[1] == 's')
666                 return wsPort;
667             return std::nullopt;
668         case 3:
669             if (scheme[1] == 's'
670                 && scheme[2] == 's')
671                 return wssPort;
672             return std::nullopt;
673         default:
674             return false;
675         }
676     case 'h':
677         switch (length) {
678         case 4:
679             if (scheme[1] == 't'
680                 && scheme[2] == 't'
681                 && scheme[3] == 'p')
682                 return httpPort;
683             return std::nullopt;
684         case 5:
685             if (scheme[1] == 't'
686                 && scheme[2] == 't'
687                 && scheme[3] == 'p'
688                 && scheme[4] == 's')
689                 return httpsPort;
690             return std::nullopt;
691         default:
692             return std::nullopt;
693         }
694     case 'g':
695         if (length == 6
696             && scheme[1] == 'o'
697             && scheme[2] == 'p'
698             && scheme[3] == 'h'
699             && scheme[4] == 'e'
700             && scheme[5] == 'r')
701             return gopherPort;
702         return std::nullopt;
703     case 'f':
704         if (length == 3
705             && scheme[1] == 't'
706             && scheme[2] == 'p')
707             return ftpPort;
708         return std::nullopt;
709     default:
710         return std::nullopt;
711     }
712 }
713
714 enum class Scheme {
715     WS,
716     WSS,
717     File,
718     FTP,
719     Gopher,
720     HTTP,
721     HTTPS,
722     NonSpecial
723 };
724
725 ALWAYS_INLINE static Scheme scheme(StringView scheme)
726 {
727     auto length = scheme.length();
728     if (!length)
729         return Scheme::NonSpecial;
730     switch (scheme[0]) {
731     case 'f':
732         switch (length) {
733         case 3:
734             if (scheme[1] == 't'
735                 && scheme[2] == 'p')
736                 return Scheme::FTP;
737             return Scheme::NonSpecial;
738         case 4:
739             if (scheme[1] == 'i'
740                 && scheme[2] == 'l'
741                 && scheme[3] == 'e')
742                 return Scheme::File;
743             return Scheme::NonSpecial;
744         default:
745             return Scheme::NonSpecial;
746         }
747     case 'g':
748         if (length == 6
749             && scheme[1] == 'o'
750             && scheme[2] == 'p'
751             && scheme[3] == 'h'
752             && scheme[4] == 'e'
753             && scheme[5] == 'r')
754             return Scheme::Gopher;
755         return Scheme::NonSpecial;
756     case 'h':
757         switch (length) {
758         case 4:
759             if (scheme[1] == 't'
760                 && scheme[2] == 't'
761                 && scheme[3] == 'p')
762                 return Scheme::HTTP;
763             return Scheme::NonSpecial;
764         case 5:
765             if (scheme[1] == 't'
766                 && scheme[2] == 't'
767                 && scheme[3] == 'p'
768                 && scheme[4] == 's')
769                 return Scheme::HTTPS;
770             return Scheme::NonSpecial;
771         default:
772             return Scheme::NonSpecial;
773         }
774     case 'w':
775         switch (length) {
776         case 2:
777             if (scheme[1] == 's')
778                 return Scheme::WS;
779             return Scheme::NonSpecial;
780         case 3:
781             if (scheme[1] == 's'
782                 && scheme[2] == 's')
783                 return Scheme::WSS;
784             return Scheme::NonSpecial;
785         default:
786             return Scheme::NonSpecial;
787         }
788     default:
789         return Scheme::NonSpecial;
790     }
791 }
792
793 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
794 {
795     if (scheme.isEmpty())
796         return std::nullopt;
797
798     if (!isASCIIAlpha(scheme[0]))
799         return std::nullopt;
800
801     for (size_t i = 1; i < scheme.length(); ++i) {
802         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
803             continue;
804         return std::nullopt;
805     }
806
807     return scheme.convertToASCIILowercase();
808 }
809
810 bool URLParser::isSpecialScheme(const String& schemeArg)
811 {
812     return scheme(schemeArg) != Scheme::NonSpecial;
813 }
814
815 enum class URLParser::URLPart {
816     SchemeEnd,
817     UserStart,
818     UserEnd,
819     PasswordEnd,
820     HostEnd,
821     PortEnd,
822     PathAfterLastSlash,
823     PathEnd,
824     QueryEnd,
825     FragmentEnd,
826 };
827
828 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
829 {
830     switch (part) {
831     case URLPart::FragmentEnd:
832         return url.m_fragmentEnd;
833     case URLPart::QueryEnd:
834         return url.m_queryEnd;
835     case URLPart::PathEnd:
836         return url.m_pathEnd;
837     case URLPart::PathAfterLastSlash:
838         return url.m_pathAfterLastSlash;
839     case URLPart::PortEnd:
840         return url.m_portEnd;
841     case URLPart::HostEnd:
842         return url.m_hostEnd;
843     case URLPart::PasswordEnd:
844         return url.m_passwordEnd;
845     case URLPart::UserEnd:
846         return url.m_userEnd;
847     case URLPart::UserStart:
848         return url.m_userStart;
849     case URLPart::SchemeEnd:
850         return url.m_schemeEnd;
851     }
852     ASSERT_NOT_REACHED();
853     return 0;
854 }
855
856 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
857 {
858     RELEASE_ASSERT(length <= string.length());
859     if (string.isNull())
860         return;
861     ASSERT(m_asciiBuffer.isEmpty());
862     if (string.is8Bit())
863         appendToASCIIBuffer(string.characters8(), length);
864     else {
865         const UChar* characters = string.characters16();
866         for (size_t i = 0; i < length; ++i) {
867             UChar c = characters[i];
868             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
869             appendToASCIIBuffer(c);
870         }
871     }
872 }
873
874 template<typename CharacterType>
875 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
876 {
877     syntaxViolation(iterator);
878
879     m_asciiBuffer.clear();
880     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
881     switch (part) {
882     case URLPart::FragmentEnd:
883         RELEASE_ASSERT_NOT_REACHED();
884     case URLPart::QueryEnd:
885         m_url.m_queryEnd = base.m_queryEnd;
886         FALLTHROUGH;
887     case URLPart::PathEnd:
888         m_url.m_pathEnd = base.m_pathEnd;
889         FALLTHROUGH;
890     case URLPart::PathAfterLastSlash:
891         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
892         FALLTHROUGH;
893     case URLPart::PortEnd:
894         m_url.m_portEnd = base.m_portEnd;
895         FALLTHROUGH;
896     case URLPart::HostEnd:
897         m_url.m_hostEnd = base.m_hostEnd;
898         FALLTHROUGH;
899     case URLPart::PasswordEnd:
900         m_url.m_passwordEnd = base.m_passwordEnd;
901         FALLTHROUGH;
902     case URLPart::UserEnd:
903         m_url.m_userEnd = base.m_userEnd;
904         FALLTHROUGH;
905     case URLPart::UserStart:
906         m_url.m_userStart = base.m_userStart;
907         FALLTHROUGH;
908     case URLPart::SchemeEnd:
909         m_url.m_isValid = base.m_isValid;
910         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
911         m_url.m_schemeEnd = base.m_schemeEnd;
912     }
913     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
914     case Scheme::WS:
915     case Scheme::WSS:
916         isUTF8Encoding = true;
917         m_urlIsSpecial = true;
918         return;
919     case Scheme::File:
920         m_urlIsFile = true;
921         FALLTHROUGH;
922     case Scheme::FTP:
923     case Scheme::Gopher:
924     case Scheme::HTTP:
925     case Scheme::HTTPS:
926         m_urlIsSpecial = true;
927         return;
928     case Scheme::NonSpecial:
929         m_urlIsSpecial = false;
930         isUTF8Encoding = true;
931         return;
932     }
933     ASSERT_NOT_REACHED();
934 }
935
// The two hex digits that follow '%' in the percent-encoded spelling of '.'
// ("%2e"); the second digit is compared case-insensitively by the callers.
static const char dotASCIICode[2] = { '2', 'e' };
937
938 template<typename CharacterType>
939 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
940 {
941     if (c.atEnd())
942         return false;
943     if (*c == '.') {
944         advance<CharacterType, ReportSyntaxViolation::No>(c);
945         return c.atEnd() || isSlashQuestionOrHash(*c);
946     }
947     if (*c != '%')
948         return false;
949     advance<CharacterType, ReportSyntaxViolation::No>(c);
950     if (c.atEnd() || *c != dotASCIICode[0])
951         return false;
952     advance<CharacterType, ReportSyntaxViolation::No>(c);
953     if (c.atEnd())
954         return false;
955     if (toASCIILower(*c) == dotASCIICode[1]) {
956         advance<CharacterType, ReportSyntaxViolation::No>(c);
957         return c.atEnd() || isSlashQuestionOrHash(*c);
958     }
959     return false;
960 }
961
962 template<typename CharacterType>
963 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
964 {
965     if (c.atEnd())
966         return false;
967     if (*c == '.') {
968         advance<CharacterType, ReportSyntaxViolation::No>(c);
969         return isSingleDotPathSegment(c);
970     }
971     if (*c != '%')
972         return false;
973     advance<CharacterType, ReportSyntaxViolation::No>(c);
974     if (c.atEnd() || *c != dotASCIICode[0])
975         return false;
976     advance<CharacterType, ReportSyntaxViolation::No>(c);
977     if (c.atEnd())
978         return false;
979     if (toASCIILower(*c) == dotASCIICode[1]) {
980         advance<CharacterType, ReportSyntaxViolation::No>(c);
981         return isSingleDotPathSegment(c);
982     }
983     return false;
984 }
985
986 template<typename CharacterType>
987 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
988 {
989     ASSERT(isSingleDotPathSegment(c));
990     if (*c == '.') {
991         advance(c);
992         if (!c.atEnd()) {
993             if (*c == '/' || *c == '\\')
994                 advance(c);
995             else
996                 ASSERT(*c == '?' || *c == '#');
997         }
998     } else {
999         ASSERT(*c == '%');
1000         advance(c);
1001         ASSERT(*c == dotASCIICode[0]);
1002         advance(c);
1003         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1004         advance(c);
1005         if (!c.atEnd()) {
1006             if (*c == '/' || *c == '\\')
1007                 advance(c);
1008             else
1009                 ASSERT(*c == '?' || *c == '#');
1010         }
1011     }
1012 }
1013
1014 template<typename CharacterType>
1015 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1016 {
1017     ASSERT(isDoubleDotPathSegment(c));
1018     if (*c == '.')
1019         advance(c);
1020     else {
1021         ASSERT(*c == '%');
1022         advance(c);
1023         ASSERT(*c == dotASCIICode[0]);
1024         advance(c);
1025         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1026         advance(c);
1027     }
1028     consumeSingleDotPathSegment(c);
1029 }
1030
1031 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1032 {
1033     ASSERT(m_didSeeSyntaxViolation);
1034     if (!m_urlIsFile)
1035         return true;
1036
1037     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1038     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1039     if (newPathAfterLastSlash == m_url.m_portEnd + 1 && isWindowsDriveLetter(componentToPop))
1040         return false;
1041     return true;
1042 }
1043
1044 void URLParser::popPath()
1045 {
1046     ASSERT(m_didSeeSyntaxViolation);
1047     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
1048         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1049         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1050             newPathAfterLastSlash--;
1051         while (newPathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[newPathAfterLastSlash] != '/')
1052             newPathAfterLastSlash--;
1053         newPathAfterLastSlash++;
1054         if (shouldPopPath(newPathAfterLastSlash))
1055             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1056     }
1057     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1058 }
1059
1060 template<typename CharacterType>
1061 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1062 {
1063     if (m_didSeeSyntaxViolation)
1064         return;
1065     m_didSeeSyntaxViolation = true;
1066     
1067     ASSERT(m_asciiBuffer.isEmpty());
1068     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1069     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1070     m_asciiBuffer.reserveCapacity(m_inputString.length());
1071     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1072         ASSERT(isASCII(m_inputString[i]));
1073         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1074     }
1075 }
1076
1077 void URLParser::failure()
1078 {
1079     m_url.invalidate();
1080     m_url.m_string = m_inputString;
1081 }
1082
1083 template<typename CharacterType>
1084 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1085 {
1086     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1087         return false;
1088     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1089     return true;
1090 }
1091
1092 template<typename CharacterType>
1093 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1094 {
1095     if (!checkLocalhostCodePoint(iterator, 'l'))
1096         return false;
1097     if (!checkLocalhostCodePoint(iterator, 'o'))
1098         return false;
1099     if (!checkLocalhostCodePoint(iterator, 'c'))
1100         return false;
1101     if (!checkLocalhostCodePoint(iterator, 'a'))
1102         return false;
1103     if (!checkLocalhostCodePoint(iterator, 'l'))
1104         return false;
1105     if (!checkLocalhostCodePoint(iterator, 'h'))
1106         return false;
1107     if (!checkLocalhostCodePoint(iterator, 'o'))
1108         return false;
1109     if (!checkLocalhostCodePoint(iterator, 's'))
1110         return false;
1111     if (!checkLocalhostCodePoint(iterator, 't'))
1112         return false;
1113     return iterator.atEnd();
1114 }
1115
1116 bool URLParser::isLocalhost(StringView view)
1117 {
1118     if (view.is8Bit())
1119         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1120     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1121 }
1122
1123 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1124 {
1125     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1126         ASSERT(start + length <= m_asciiBuffer.size());
1127         return StringView(m_asciiBuffer.data() + start, length);
1128     }
1129     ASSERT(start + length <= m_inputString.length());
1130     return StringView(m_inputString).substring(start, length);
1131 }
1132
1133 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1134 {
1135     if (UNLIKELY(m_didSeeSyntaxViolation))
1136         return m_asciiBuffer[position];
1137     return m_inputString[position];
1138 }
1139
1140 template<typename CharacterType>
1141 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1142 {
1143     if (UNLIKELY(m_didSeeSyntaxViolation))
1144         return m_asciiBuffer.size();
1145     
1146     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1147 }
1148
1149 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1150     : m_inputString(input)
1151 {
1152     if (input.isNull()) {
1153         if (base.isValid() && !base.m_cannotBeABaseURL) {
1154             m_url = base;
1155             m_url.removeFragmentIdentifier();
1156         }
1157         return;
1158     }
1159
1160     if (input.is8Bit()) {
1161         m_inputBegin = input.characters8();
1162         parse(input.characters8(), input.length(), base, encoding);
1163     } else {
1164         m_inputBegin = input.characters16();
1165         parse(input.characters16(), input.length(), base, encoding);
1166     }
1167
1168     ASSERT(!m_url.m_isValid
1169         || m_didSeeSyntaxViolation == (m_url.string() != input)
1170         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1171             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1172     ASSERT(internalValuesConsistent(m_url));
1173 #if !ASSERT_DISABLED
1174     if (!m_didSeeSyntaxViolation) {
1175         // Force a syntax violation at the beginning to make sure we get the same result.
1176         URLParser parser(makeString(" ", input), base, encoding);
1177         URL parsed = parser.result();
1178         if (parsed.isValid())
1179             ASSERT(allValuesEqual(parser.result(), m_url));
1180     }
1181 #endif
1182 }
1183
1184 template<typename CharacterType>
1185 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1186 {
1187     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1188     m_url = { };
1189     ASSERT(m_asciiBuffer.isEmpty());
1190     
1191     bool isUTF8Encoding = encoding == UTF8Encoding();
1192     Vector<UChar> queryBuffer;
1193
1194     unsigned endIndex = length;
1195     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1196         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1197         endIndex--;
1198     }
1199     CodePointIterator<CharacterType> c(input, input + endIndex);
1200     CodePointIterator<CharacterType> authorityOrHostBegin;
1201     CodePointIterator<CharacterType> queryBegin;
1202     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1203         syntaxViolation(c);
1204         ++c;
1205     }
1206     auto beginAfterControlAndSpace = c;
1207
1208     enum class State : uint8_t {
1209         SchemeStart,
1210         Scheme,
1211         NoScheme,
1212         SpecialRelativeOrAuthority,
1213         PathOrAuthority,
1214         Relative,
1215         RelativeSlash,
1216         SpecialAuthoritySlashes,
1217         SpecialAuthorityIgnoreSlashes,
1218         AuthorityOrHost,
1219         Host,
1220         File,
1221         FileSlash,
1222         FileHost,
1223         PathStart,
1224         Path,
1225         CannotBeABaseURLPath,
1226         UTF8Query,
1227         NonUTF8Query,
1228         Fragment,
1229     };
1230
1231 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1232 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1233
1234     State state = State::SchemeStart;
1235     while (!c.atEnd()) {
1236         if (UNLIKELY(isTabOrNewline(*c))) {
1237             syntaxViolation(c);
1238             ++c;
1239             continue;
1240         }
1241
1242         switch (state) {
1243         case State::SchemeStart:
1244             LOG_STATE("SchemeStart");
1245             if (isASCIIAlpha(*c)) {
1246                 if (UNLIKELY(isASCIIUpper(*c)))
1247                     syntaxViolation(c);
1248                 appendToASCIIBuffer(toASCIILower(*c));
1249                 advance(c);
1250                 if (c.atEnd()) {
1251                     m_asciiBuffer.clear();
1252                     state = State::NoScheme;
1253                     c = beginAfterControlAndSpace;
1254                 }
1255                 state = State::Scheme;
1256             } else
1257                 state = State::NoScheme;
1258             break;
1259         case State::Scheme:
1260             LOG_STATE("Scheme");
1261             if (isValidSchemeCharacter(*c)) {
1262                 if (UNLIKELY(isASCIIUpper(*c)))
1263                     syntaxViolation(c);
1264                 appendToASCIIBuffer(toASCIILower(*c));
1265             } else if (*c == ':') {
1266                 m_url.m_schemeEnd = currentPosition(c);
1267                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1268                 appendToASCIIBuffer(':');
1269                 switch (scheme(urlScheme)) {
1270                 case Scheme::File:
1271                     m_urlIsSpecial = true;
1272                     m_urlIsFile = true;
1273                     state = State::File;
1274                     ++c;
1275                     break;
1276                 case Scheme::WS:
1277                 case Scheme::WSS:
1278                     isUTF8Encoding = true;
1279                     m_urlIsSpecial = true;
1280                     if (base.protocolIs(urlScheme))
1281                         state = State::SpecialRelativeOrAuthority;
1282                     else
1283                         state = State::SpecialAuthoritySlashes;
1284                     ++c;
1285                     break;
1286                 case Scheme::HTTP:
1287                 case Scheme::HTTPS:
1288                     m_url.m_protocolIsInHTTPFamily = true;
1289                     FALLTHROUGH;
1290                 case Scheme::FTP:
1291                 case Scheme::Gopher:
1292                     m_urlIsSpecial = true;
1293                     if (base.protocolIs(urlScheme))
1294                         state = State::SpecialRelativeOrAuthority;
1295                     else
1296                         state = State::SpecialAuthoritySlashes;
1297                     ++c;
1298                     break;
1299                 case Scheme::NonSpecial:
1300                     isUTF8Encoding = true;
1301                     auto maybeSlash = c;
1302                     advance(maybeSlash);
1303                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1304                         appendToASCIIBuffer('/');
1305                         c = maybeSlash;
1306                         state = State::PathOrAuthority;
1307                         ASSERT(*c == '/');
1308                         ++c;
1309                         m_url.m_userStart = currentPosition(c);
1310                     } else {
1311                         ++c;
1312                         m_url.m_userStart = currentPosition(c);
1313                         m_url.m_userEnd = m_url.m_userStart;
1314                         m_url.m_passwordEnd = m_url.m_userStart;
1315                         m_url.m_hostEnd = m_url.m_userStart;
1316                         m_url.m_portEnd = m_url.m_userStart;
1317                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1318                         m_url.m_cannotBeABaseURL = true;
1319                         state = State::CannotBeABaseURLPath;
1320                     }
1321                     break;
1322                 }
1323                 break;
1324             } else {
1325                 m_asciiBuffer.clear();
1326                 state = State::NoScheme;
1327                 c = beginAfterControlAndSpace;
1328                 break;
1329             }
1330             advance(c);
1331             if (c.atEnd()) {
1332                 m_asciiBuffer.clear();
1333                 state = State::NoScheme;
1334                 c = beginAfterControlAndSpace;
1335             }
1336             break;
1337         case State::NoScheme:
1338             LOG_STATE("NoScheme");
1339             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1340                 failure();
1341                 return;
1342             }
1343             if (base.m_cannotBeABaseURL && *c == '#') {
1344                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1345                 state = State::Fragment;
1346                 appendToASCIIBuffer('#');
1347                 ++c;
1348                 break;
1349             }
1350             if (!base.protocolIs("file")) {
1351                 state = State::Relative;
1352                 break;
1353             }
1354             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1355             appendToASCIIBuffer(':');
1356             state = State::File;
1357             break;
1358         case State::SpecialRelativeOrAuthority:
1359             LOG_STATE("SpecialRelativeOrAuthority");
1360             if (*c == '/') {
1361                 appendToASCIIBuffer('/');
1362                 advance(c);
1363                 if (c.atEnd()) {
1364                     failure();
1365                     return;
1366                 }
1367                 if (*c == '/') {
1368                     appendToASCIIBuffer('/');
1369                     state = State::SpecialAuthorityIgnoreSlashes;
1370                     ++c;
1371                 } else
1372                     state = State::RelativeSlash;
1373             } else
1374                 state = State::Relative;
1375             break;
1376         case State::PathOrAuthority:
1377             LOG_STATE("PathOrAuthority");
1378             if (*c == '/') {
1379                 appendToASCIIBuffer('/');
1380                 state = State::AuthorityOrHost;
1381                 advance(c);
1382                 m_url.m_userStart = currentPosition(c);
1383                 authorityOrHostBegin = c;
1384             } else {
1385                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1386                 m_url.m_userStart = currentPosition(c) - 1;
1387                 m_url.m_userEnd = m_url.m_userStart;
1388                 m_url.m_passwordEnd = m_url.m_userStart;
1389                 m_url.m_hostEnd = m_url.m_userStart;
1390                 m_url.m_portEnd = m_url.m_userStart;
1391                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1392                 state = State::Path;
1393             }
1394             break;
1395         case State::Relative:
1396             LOG_STATE("Relative");
1397             switch (*c) {
1398             case '/':
1399             case '\\':
1400                 state = State::RelativeSlash;
1401                 ++c;
1402                 break;
1403             case '?':
1404                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1405                 appendToASCIIBuffer('?');
1406                 ++c;
1407                 if (isUTF8Encoding)
1408                     state = State::UTF8Query;
1409                 else {
1410                     queryBegin = c;
1411                     state = State::NonUTF8Query;
1412                 }
1413                 break;
1414             case '#':
1415                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1416                 appendToASCIIBuffer('#');
1417                 state = State::Fragment;
1418                 ++c;
1419                 break;
1420             default:
1421                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1422                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1423                     appendToASCIIBuffer('/');
1424                     m_url.m_pathAfterLastSlash = currentPosition(c);
1425                 }
1426                 state = State::Path;
1427                 break;
1428             }
1429             break;
1430         case State::RelativeSlash:
1431             LOG_STATE("RelativeSlash");
1432             if (*c == '/' || *c == '\\') {
1433                 ++c;
1434                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1435                 appendToASCIIBuffer("://", 3);
1436                 if (m_urlIsSpecial)
1437                     state = State::SpecialAuthorityIgnoreSlashes;
1438                 else {
1439                     m_url.m_userStart = currentPosition(c);
1440                     state = State::AuthorityOrHost;
1441                     authorityOrHostBegin = c;
1442                 }
1443             } else {
1444                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1445                 appendToASCIIBuffer('/');
1446                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1447                 state = State::Path;
1448             }
1449             break;
1450         case State::SpecialAuthoritySlashes:
1451             LOG_STATE("SpecialAuthoritySlashes");
1452             if (LIKELY(*c == '/' || *c == '\\')) {
1453                 if (UNLIKELY(*c == '\\'))
1454                     syntaxViolation(c);
1455                 appendToASCIIBuffer('/');
1456                 advance(c);
1457                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1458                     if (UNLIKELY(*c == '\\'))
1459                         syntaxViolation(c);
1460                     ++c;
1461                     appendToASCIIBuffer('/');
1462                 } else {
1463                     syntaxViolation(c);
1464                     appendToASCIIBuffer('/');
1465                 }
1466             } else {
1467                 syntaxViolation(c);
1468                 appendToASCIIBuffer("//", 2);
1469             }
1470             state = State::SpecialAuthorityIgnoreSlashes;
1471             break;
1472         case State::SpecialAuthorityIgnoreSlashes:
1473             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1474             if (*c == '/' || *c == '\\') {
1475                 syntaxViolation(c);
1476                 ++c;
1477             } else {
1478                 m_url.m_userStart = currentPosition(c);
1479                 state = State::AuthorityOrHost;
1480                 authorityOrHostBegin = c;
1481             }
1482             break;
1483         case State::AuthorityOrHost:
1484             do {
1485                 LOG_STATE("AuthorityOrHost");
1486                 if (*c == '@') {
1487                     auto lastAt = c;
1488                     auto findLastAt = c;
1489                     while (!findLastAt.atEnd()) {
1490                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1491                         if (*findLastAt == '@')
1492                             lastAt = findLastAt;
1493                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1494                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1495                             break;
1496                         ++findLastAt;
1497                     }
1498                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1499                     c = lastAt;
1500                     advance(c);
1501                     authorityOrHostBegin = c;
1502                     state = State::Host;
1503                     m_hostHasPercentOrNonASCII = false;
1504                     break;
1505                 }
1506                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1507                 if (isSlash || *c == '?' || *c == '#') {
1508                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1509                     if (iterator.atEnd()) {
1510                         if (m_urlIsSpecial)
1511                             return failure();
1512                         m_url.m_userEnd = currentPosition(c);
1513                         m_url.m_passwordEnd = m_url.m_userEnd;
1514                         m_url.m_hostEnd = m_url.m_userEnd;
1515                         m_url.m_portEnd = m_url.m_userEnd;
1516                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1517                     } else {
1518                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1519                         m_url.m_passwordEnd = m_url.m_userEnd;
1520                         if (!parseHostAndPort(iterator)) {
1521                             failure();
1522                             return;
1523                         }
1524                         if (UNLIKELY(!isSlash)) {
1525                             if (m_urlIsSpecial) {
1526                                 syntaxViolation(c);
1527                                 appendToASCIIBuffer('/');
1528                             }
1529                             m_url.m_pathAfterLastSlash = currentPosition(c);
1530                         }
1531                     }
1532                     state = State::Path;
1533                     break;
1534                 }
1535                 if (isPercentOrNonASCII(*c))
1536                     m_hostHasPercentOrNonASCII = true;
1537                 ++c;
1538             } while (!c.atEnd());
1539             break;
1540         case State::Host:
1541             do {
1542                 LOG_STATE("Host");
1543                 if (*c == '/' || *c == '?' || *c == '#') {
1544                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1545                         failure();
1546                         return;
1547                     }
1548                     if (*c == '?' || *c == '#') {
1549                         syntaxViolation(c);
1550                         appendToASCIIBuffer('/');
1551                         m_url.m_pathAfterLastSlash = currentPosition(c);
1552                     }
1553                     state = State::Path;
1554                     break;
1555                 }
1556                 if (isPercentOrNonASCII(*c))
1557                     m_hostHasPercentOrNonASCII = true;
1558                 ++c;
1559             } while (!c.atEnd());
1560             break;
1561         case State::File:
1562             LOG_STATE("File");
1563             switch (*c) {
1564             case '\\':
1565                 syntaxViolation(c);
1566                 FALLTHROUGH;
1567             case '/':
1568                 appendToASCIIBuffer('/');
1569                 state = State::FileSlash;
1570                 ++c;
1571                 break;
1572             case '?':
1573                 syntaxViolation(c);
1574                 if (base.isValid() && base.protocolIs("file")) {
1575                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1576                     appendToASCIIBuffer('?');
1577                     ++c;
1578                 } else {
1579                     appendToASCIIBuffer("///?", 4);
1580                     ++c;
1581                     m_url.m_userStart = currentPosition(c) - 2;
1582                     m_url.m_userEnd = m_url.m_userStart;
1583                     m_url.m_passwordEnd = m_url.m_userStart;
1584                     m_url.m_hostEnd = m_url.m_userStart;
1585                     m_url.m_portEnd = m_url.m_userStart;
1586                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1587                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1588                 }
1589                 if (isUTF8Encoding)
1590                     state = State::UTF8Query;
1591                 else {
1592                     queryBegin = c;
1593                     state = State::NonUTF8Query;
1594                 }
1595                 break;
1596             case '#':
1597                 syntaxViolation(c);
1598                 if (base.isValid() && base.protocolIs("file")) {
1599                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1600                     appendToASCIIBuffer('#');
1601                 } else {
1602                     appendToASCIIBuffer("///#", 4);
1603                     m_url.m_userStart = currentPosition(c) - 2;
1604                     m_url.m_userEnd = m_url.m_userStart;
1605                     m_url.m_passwordEnd = m_url.m_userStart;
1606                     m_url.m_hostEnd = m_url.m_userStart;
1607                     m_url.m_portEnd = m_url.m_userStart;
1608                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1609                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1610                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1611                 }
1612                 state = State::Fragment;
1613                 ++c;
1614                 break;
1615             default:
1616                 syntaxViolation(c);
1617                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1618                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1619                 else {
1620                     appendToASCIIBuffer("///", 3);
1621                     m_url.m_userStart = currentPosition(c) - 1;
1622                     m_url.m_userEnd = m_url.m_userStart;
1623                     m_url.m_passwordEnd = m_url.m_userStart;
1624                     m_url.m_hostEnd = m_url.m_userStart;
1625                     m_url.m_portEnd = m_url.m_userStart;
1626                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1627                     if (isWindowsDriveLetter(c))
1628                         appendWindowsDriveLetter(c);
1629                 }
1630                 state = State::Path;
1631                 break;
1632             }
1633             break;
1634         case State::FileSlash:
1635             LOG_STATE("FileSlash");
1636             if (LIKELY(*c == '/' || *c == '\\')) {
1637                 if (UNLIKELY(*c == '\\'))
1638                     syntaxViolation(c);
1639                 appendToASCIIBuffer('/');
1640                 advance(c);
1641                 m_url.m_userStart = currentPosition(c);
1642                 m_url.m_userEnd = m_url.m_userStart;
1643                 m_url.m_passwordEnd = m_url.m_userStart;
1644                 m_url.m_hostEnd = m_url.m_userStart;
1645                 m_url.m_portEnd = m_url.m_userStart;
1646                 authorityOrHostBegin = c;
1647                 state = State::FileHost;
1648                 break;
1649             }
1650             syntaxViolation(c);
1651             appendToASCIIBuffer("//", 2);
1652             m_url.m_userStart = currentPosition(c) - 1;
1653             m_url.m_userEnd = m_url.m_userStart;
1654             m_url.m_passwordEnd = m_url.m_userStart;
1655             m_url.m_hostEnd = m_url.m_userStart;
1656             m_url.m_portEnd = m_url.m_userStart;
1657             if (isWindowsDriveLetter(c)) {
1658                 appendWindowsDriveLetter(c);
1659                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1660             } else if (copyBaseWindowsDriveLetter(base)) {
1661                 appendToASCIIBuffer('/');
1662                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1663             } else
1664                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1665             state = State::Path;
1666             break;
1667         case State::FileHost:
1668             do {
1669                 LOG_STATE("FileHost");
1670                 if (isSlashQuestionOrHash(*c)) {
1671                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1672                         && isWindowsDriveLetter(authorityOrHostBegin);
1673                     if (windowsQuirk) {
1674                         syntaxViolation(authorityOrHostBegin);
1675                         appendToASCIIBuffer('/');
1676                         appendWindowsDriveLetter(authorityOrHostBegin);
1677                     }
1678                     if (windowsQuirk || authorityOrHostBegin == c) {
1679                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1680                         if (UNLIKELY(*c == '?')) {
1681                             syntaxViolation(c);
1682                             appendToASCIIBuffer("/?", 2);
1683                             ++c;
1684                             if (isUTF8Encoding)
1685                                 state = State::UTF8Query;
1686                             else {
1687                                 queryBegin = c;
1688                                 state = State::NonUTF8Query;
1689                             }
1690                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1691                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1692                             break;
1693                         }
1694                         if (UNLIKELY(*c == '#')) {
1695                             syntaxViolation(c);
1696                             appendToASCIIBuffer("/#", 2);
1697                             ++c;
1698                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1699                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1700                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1701                             state = State::Fragment;
1702                             break;
1703                         }
1704                         state = State::Path;
1705                         break;
1706                     }
1707                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1708                         failure();
1709                         return;
1710                     }
1711                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1712                         syntaxViolation(c);
1713                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1714                         m_url.m_hostEnd = currentPosition(c);
1715                         m_url.m_portEnd = m_url.m_hostEnd;
1716                     }
1717                     
1718                     state = State::PathStart;
1719                     break;
1720                 }
1721                 if (isPercentOrNonASCII(*c))
1722                     m_hostHasPercentOrNonASCII = true;
1723                 ++c;
1724             } while (!c.atEnd());
1725             break;
1726         case State::PathStart:
1727             LOG_STATE("PathStart");
1728             if (*c != '/' && *c != '\\')
1729                 ++c;
1730             state = State::Path;
1731             break;
1732         case State::Path:
1733             LOG_STATE("Path");
1734             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1735                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1736                     syntaxViolation(c);
1737                 appendToASCIIBuffer('/');
1738                 ++c;
1739                 m_url.m_pathAfterLastSlash = currentPosition(c);
1740                 break;
1741             }
1742             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1743                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1744                     syntaxViolation(c);
1745                     consumeDoubleDotPathSegment(c);
1746                     popPath();
1747                     break;
1748                 }
1749                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1750                     syntaxViolation(c);
1751                     consumeSingleDotPathSegment(c);
1752                     break;
1753                 }
1754             }
1755             if (*c == '?') {
1756                 m_url.m_pathEnd = currentPosition(c);
1757                 appendToASCIIBuffer('?');
1758                 ++c;
1759                 if (isUTF8Encoding)
1760                     state = State::UTF8Query;
1761                 else {
1762                     queryBegin = c;
1763                     state = State::NonUTF8Query;
1764                 }
1765                 break;
1766             }
1767             if (*c == '#') {
1768                 m_url.m_pathEnd = currentPosition(c);
1769                 m_url.m_queryEnd = m_url.m_pathEnd;
1770                 state = State::Fragment;
1771                 break;
1772             }
1773             utf8PercentEncode<isInDefaultEncodeSet>(c);
1774             ++c;
1775             break;
1776         case State::CannotBeABaseURLPath:
1777             LOG_STATE("CannotBeABaseURLPath");
1778             if (*c == '?') {
1779                 m_url.m_pathEnd = currentPosition(c);
1780                 appendToASCIIBuffer('?');
1781                 ++c;
1782                 if (isUTF8Encoding)
1783                     state = State::UTF8Query;
1784                 else {
1785                     queryBegin = c;
1786                     state = State::NonUTF8Query;
1787                 }
1788             } else if (*c == '#') {
1789                 m_url.m_pathEnd = currentPosition(c);
1790                 m_url.m_queryEnd = m_url.m_pathEnd;
1791                 state = State::Fragment;
1792             } else if (*c == '/') {
1793                 appendToASCIIBuffer('/');
1794                 ++c;
1795                 m_url.m_pathAfterLastSlash = currentPosition(c);
1796             } else {
1797                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1798                 ++c;
1799             }
1800             break;
1801         case State::UTF8Query:
1802             LOG_STATE("UTF8Query");
1803             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1804             if (*c == '#') {
1805                 m_url.m_queryEnd = currentPosition(c);
1806                 state = State::Fragment;
1807                 break;
1808             }
1809             if (isUTF8Encoding)
1810                 utf8QueryEncode(c);
1811             else
1812                 appendCodePoint(queryBuffer, *c);
1813             ++c;
1814             break;
1815         case State::NonUTF8Query:
1816             do {
1817                 LOG_STATE("NonUTF8Query");
1818                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1819                 if (*c == '#') {
1820                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1821                     m_url.m_queryEnd = currentPosition(c);
1822                     state = State::Fragment;
1823                     break;
1824                 }
1825                 appendCodePoint(queryBuffer, *c);
1826                 advance(c, queryBegin);
1827             } while (!c.atEnd());
1828             break;
1829         case State::Fragment:
1830             URL_PARSER_LOG("State Fragment");
1831             utf8PercentEncode<isInSimpleEncodeSet>(c);
1832             ++c;
1833             break;
1834         }
1835     }
1836
1837     switch (state) {
1838     case State::SchemeStart:
1839         LOG_FINAL_STATE("SchemeStart");
1840         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1841             m_url = base;
1842             m_url.removeFragmentIdentifier();
1843             return;
1844         }
1845         failure();
1846         return;
1847     case State::Scheme:
1848         LOG_FINAL_STATE("Scheme");
1849         failure();
1850         return;
1851     case State::NoScheme:
1852         LOG_FINAL_STATE("NoScheme");
1853         RELEASE_ASSERT_NOT_REACHED();
1854     case State::SpecialRelativeOrAuthority:
1855         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1856         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1857         m_url.m_fragmentEnd = m_url.m_queryEnd;
1858         break;
1859     case State::PathOrAuthority:
1860         LOG_FINAL_STATE("PathOrAuthority");
1861         ASSERT(m_url.m_userStart);
1862         ASSERT(m_url.m_userStart == currentPosition(c));
1863         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1864         m_url.m_userStart--;
1865         m_url.m_userEnd = m_url.m_userStart;
1866         m_url.m_passwordEnd = m_url.m_userStart;
1867         m_url.m_hostEnd = m_url.m_userStart;
1868         m_url.m_portEnd = m_url.m_userStart;
1869         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1870         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1871         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1872         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1873         break;
1874     case State::Relative:
1875         LOG_FINAL_STATE("Relative");
1876         RELEASE_ASSERT_NOT_REACHED();
1877     case State::RelativeSlash:
1878         LOG_FINAL_STATE("RelativeSlash");
1879         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1880         appendToASCIIBuffer('/');
1881         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1882         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1883         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1884         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1885         break;
1886     case State::SpecialAuthoritySlashes:
1887         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1888         m_url.m_userStart = currentPosition(c);
1889         m_url.m_userEnd = m_url.m_userStart;
1890         m_url.m_passwordEnd = m_url.m_userStart;
1891         m_url.m_hostEnd = m_url.m_userStart;
1892         m_url.m_portEnd = m_url.m_userStart;
1893         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1894         m_url.m_pathEnd = m_url.m_userStart;
1895         m_url.m_queryEnd = m_url.m_userStart;
1896         m_url.m_fragmentEnd = m_url.m_userStart;
1897         break;
1898     case State::SpecialAuthorityIgnoreSlashes:
1899         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1900         failure();
1901         return;
1902     case State::AuthorityOrHost:
1903         LOG_FINAL_STATE("AuthorityOrHost");
1904         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1905         m_url.m_passwordEnd = m_url.m_userEnd;
1906         if (authorityOrHostBegin.atEnd()) {
1907             m_url.m_userEnd = m_url.m_userStart;
1908             m_url.m_passwordEnd = m_url.m_userStart;
1909             m_url.m_hostEnd = m_url.m_userStart;
1910             m_url.m_portEnd = m_url.m_userStart;
1911             m_url.m_pathEnd = m_url.m_userStart;
1912         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1913             failure();
1914             return;
1915         } else {
1916             if (m_urlIsSpecial) {
1917                 syntaxViolation(c);
1918                 appendToASCIIBuffer('/');
1919                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1920             } else
1921                 m_url.m_pathEnd = m_url.m_portEnd;
1922         }
1923         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1924         m_url.m_queryEnd = m_url.m_pathEnd;
1925         m_url.m_fragmentEnd = m_url.m_pathEnd;
1926         break;
1927     case State::Host:
1928         LOG_FINAL_STATE("Host");
1929         if (!parseHostAndPort(authorityOrHostBegin)) {
1930             failure();
1931             return;
1932         }
1933         if (m_urlIsSpecial) {
1934             syntaxViolation(c);
1935             appendToASCIIBuffer('/');
1936             m_url.m_pathEnd = m_url.m_portEnd + 1;
1937         } else
1938             m_url.m_pathEnd = m_url.m_portEnd;
1939         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1940         m_url.m_queryEnd = m_url.m_pathEnd;
1941         m_url.m_fragmentEnd = m_url.m_pathEnd;
1942         break;
1943     case State::File:
1944         LOG_FINAL_STATE("File");
1945         if (base.isValid() && base.protocolIs("file")) {
1946             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1947             m_url.m_fragmentEnd = m_url.m_queryEnd;
1948             break;
1949         }
1950         syntaxViolation(c);
1951         appendToASCIIBuffer("///", 3);
1952         m_url.m_userStart = currentPosition(c) - 1;
1953         m_url.m_userEnd = m_url.m_userStart;
1954         m_url.m_passwordEnd = m_url.m_userStart;
1955         m_url.m_hostEnd = m_url.m_userStart;
1956         m_url.m_portEnd = m_url.m_userStart;
1957         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1958         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1959         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1960         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1961         break;
1962     case State::FileSlash:
1963         LOG_FINAL_STATE("FileSlash");
1964         syntaxViolation(c);
1965         m_url.m_userStart = currentPosition(c) + 1;
1966         appendToASCIIBuffer("//", 2);
1967         m_url.m_userEnd = m_url.m_userStart;
1968         m_url.m_passwordEnd = m_url.m_userStart;
1969         m_url.m_hostEnd = m_url.m_userStart;
1970         m_url.m_portEnd = m_url.m_userStart;
1971         if (copyBaseWindowsDriveLetter(base)) {
1972             appendToASCIIBuffer('/');
1973             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1974         } else
1975             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1976         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1977         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1978         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1979         break;
1980     case State::FileHost:
1981         LOG_FINAL_STATE("FileHost");
1982         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1983             && isWindowsDriveLetter(authorityOrHostBegin)) {
1984             syntaxViolation(authorityOrHostBegin);
1985             appendToASCIIBuffer('/');
1986             appendWindowsDriveLetter(authorityOrHostBegin);
1987             m_url.m_pathAfterLastSlash = currentPosition(c);
1988             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1989             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1990             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1991             break;
1992         }
1993         
1994         if (authorityOrHostBegin == c) {
1995             syntaxViolation(c);
1996             appendToASCIIBuffer('/');
1997             m_url.m_userStart = currentPosition(c) - 1;
1998             m_url.m_userEnd = m_url.m_userStart;
1999             m_url.m_passwordEnd = m_url.m_userStart;
2000             m_url.m_hostEnd = m_url.m_userStart;
2001             m_url.m_portEnd = m_url.m_userStart;
2002             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
2003             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2004             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2005             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
2006             break;
2007         }
2008
2009         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2010             failure();
2011             return;
2012         }
2013
2014         syntaxViolation(c);
2015         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2016             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2017             m_url.m_hostEnd = currentPosition(c);
2018             m_url.m_portEnd = m_url.m_hostEnd;
2019         }
2020         appendToASCIIBuffer('/');
2021         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
2022         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2023         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2024         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
2025         break;
2026     case State::PathStart:
2027         LOG_FINAL_STATE("PathStart");
2028         RELEASE_ASSERT_NOT_REACHED();
2029     case State::Path:
2030         LOG_FINAL_STATE("Path");
2031         m_url.m_pathEnd = currentPosition(c);
2032         m_url.m_queryEnd = m_url.m_pathEnd;
2033         m_url.m_fragmentEnd = m_url.m_pathEnd;
2034         break;
2035     case State::CannotBeABaseURLPath:
2036         LOG_FINAL_STATE("CannotBeABaseURLPath");
2037         m_url.m_pathEnd = currentPosition(c);
2038         m_url.m_queryEnd = m_url.m_pathEnd;
2039         m_url.m_fragmentEnd = m_url.m_pathEnd;
2040         break;
2041     case State::UTF8Query:
2042         LOG_FINAL_STATE("UTF8Query");
2043         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2044         m_url.m_queryEnd = currentPosition(c);
2045         m_url.m_fragmentEnd = m_url.m_queryEnd;
2046         break;
2047     case State::NonUTF8Query:
2048         LOG_FINAL_STATE("NonUTF8Query");
2049         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2050         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2051         m_url.m_queryEnd = currentPosition(c);
2052         m_url.m_fragmentEnd = m_url.m_queryEnd;
2053         break;
2054     case State::Fragment:
2055         LOG_FINAL_STATE("Fragment");
2056         m_url.m_fragmentEnd = currentPosition(c);
2057         break;
2058     }
2059
2060     if (LIKELY(!m_didSeeSyntaxViolation)) {
2061         m_url.m_string = m_inputString;
2062         ASSERT(m_asciiBuffer.isEmpty());
2063     } else
2064         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2065     m_url.m_isValid = true;
2066     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2067 }
2068
2069 template<typename CharacterType>
2070 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2071 {
2072     if (UNLIKELY(iterator.atEnd())) {
2073         syntaxViolation(iterator);
2074         m_url.m_userEnd = currentPosition(iterator);
2075         m_url.m_passwordEnd = m_url.m_userEnd;
2076         return;
2077     }
2078     for (; !iterator.atEnd(); advance(iterator)) {
2079         if (*iterator == ':') {
2080             m_url.m_userEnd = currentPosition(iterator);
2081             auto iteratorAtColon = iterator;
2082             ++iterator;
2083             bool tabOrNewlineAfterColon = false;
2084             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2085                 tabOrNewlineAfterColon = true;
2086                 ++iterator;
2087             }
2088             if (UNLIKELY(iterator.atEnd())) {
2089                 syntaxViolation(iteratorAtColon);
2090                 m_url.m_passwordEnd = m_url.m_userEnd;
2091                 if (m_url.m_userEnd > m_url.m_userStart)
2092                     appendToASCIIBuffer('@');
2093                 return;
2094             }
2095             if (tabOrNewlineAfterColon)
2096                 syntaxViolation(iteratorAtColon);
2097             appendToASCIIBuffer(':');
2098             break;
2099         }
2100         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2101     }
2102     for (; !iterator.atEnd(); advance(iterator))
2103         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2104     m_url.m_passwordEnd = currentPosition(iterator);
2105     if (!m_url.m_userEnd)
2106         m_url.m_userEnd = m_url.m_passwordEnd;
2107     appendToASCIIBuffer('@');
2108 }
2109
2110 template<typename UnsignedIntegerType>
2111 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2112 {
2113     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2114     LChar* end = std::end(buf);
2115     LChar* p = end;
2116     do {
2117         *--p = (number % 10) + '0';
2118         number /= 10;
2119     } while (number);
2120     appendToASCIIBuffer(p, end - p);
2121 }
2122
2123 void URLParser::serializeIPv4(IPv4Address address)
2124 {
2125     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2126     appendToASCIIBuffer('.');
2127     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2128     appendToASCIIBuffer('.');
2129     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2130     appendToASCIIBuffer('.');
2131     appendNumberToASCIIBuffer<uint8_t>(address);
2132 }
2133     
// Returns how many consecutive zero pieces `address` holds starting at index
// `begin` (0 when the piece at `begin` is non-zero or `begin` is past the end).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2143
2144 static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2145 {
2146     std::optional<size_t> longest;
2147     size_t longestLength = 0;
2148     for (size_t i = 0; i < 8; i++) {
2149         size_t length = zeroSequenceLength(address, i);
2150         if (length) {
2151             if (length > 1 && (!longest || longestLength < length)) {
2152                 longest = i;
2153                 longestLength = length;
2154             }
2155             i += length;
2156         }
2157     }
2158     return longest;
2159 }
2160
2161 void URLParser::serializeIPv6Piece(uint16_t piece)
2162 {
2163     bool printed = false;
2164     if (auto nibble0 = piece >> 12) {
2165         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2166         printed = true;
2167     }
2168     auto nibble1 = piece >> 8 & 0xF;
2169     if (printed || nibble1) {
2170         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2171         printed = true;
2172     }
2173     auto nibble2 = piece >> 4 & 0xF;
2174     if (printed || nibble2)
2175         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2176     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2177 }
2178
2179 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2180 {
2181     appendToASCIIBuffer('[');
2182     auto compressPointer = findLongestZeroSequence(address);
2183     for (size_t piece = 0; piece < 8; piece++) {
2184         if (compressPointer && compressPointer.value() == piece) {
2185             ASSERT(!address[piece]);
2186             if (piece)
2187                 appendToASCIIBuffer(':');
2188             else
2189                 appendToASCIIBuffer("::", 2);
2190             while (piece < 8 && !address[piece])
2191                 piece++;
2192             if (piece == 8)
2193                 break;
2194         }
2195         serializeIPv6Piece(address[piece]);
2196         if (piece < 7)
2197             appendToASCIIBuffer(':');
2198     }
2199     appendToASCIIBuffer(']');
2200 }
2201
2202 enum class URLParser::IPv4PieceParsingError {
2203     Failure,
2204     Overflow,
2205 };
2206
2207 template<typename CharacterType>
2208 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2209 {
2210     enum class State : uint8_t {
2211         UnknownBase,
2212         Decimal,
2213         OctalOrHex,
2214         Octal,
2215         Hex,
2216     };
2217     State state = State::UnknownBase;
2218     Checked<uint32_t, RecordOverflow> value = 0;
2219     if (!iterator.atEnd() && *iterator == '.')
2220         return makeUnexpected(IPv4PieceParsingError::Failure);
2221     while (!iterator.atEnd()) {
2222         if (isTabOrNewline(*iterator)) {
2223             didSeeSyntaxViolation = true;
2224             ++iterator;
2225             continue;
2226         }
2227         if (*iterator == '.') {
2228             ASSERT(!value.hasOverflowed());
2229             return value.unsafeGet();
2230         }
2231         switch (state) {
2232         case State::UnknownBase:
2233             if (UNLIKELY(*iterator == '0')) {
2234                 ++iterator;
2235                 state = State::OctalOrHex;
2236                 break;
2237             }
2238             state = State::Decimal;
2239             break;
2240         case State::OctalOrHex:
2241             didSeeSyntaxViolation = true;
2242             if (*iterator == 'x' || *iterator == 'X') {
2243                 ++iterator;
2244                 state = State::Hex;
2245                 break;
2246             }
2247             state = State::Octal;
2248             break;
2249         case State::Decimal:
2250             if (!isASCIIDigit(*iterator))
2251                 return makeUnexpected(IPv4PieceParsingError::Failure);
2252             value *= 10;
2253             value += *iterator - '0';
2254             if (UNLIKELY(value.hasOverflowed()))
2255                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2256             ++iterator;
2257             break;
2258         case State::Octal:
2259             ASSERT(didSeeSyntaxViolation);
2260             if (*iterator < '0' || *iterator > '7')
2261                 return makeUnexpected(IPv4PieceParsingError::Failure);
2262             value *= 8;
2263             value += *iterator - '0';
2264             if (UNLIKELY(value.hasOverflowed()))
2265                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2266             ++iterator;
2267             break;
2268         case State::Hex:
2269             ASSERT(didSeeSyntaxViolation);
2270             if (!isASCIIHexDigit(*iterator))
2271                 return makeUnexpected(IPv4PieceParsingError::Failure);
2272             value *= 16;
2273             value += toASCIIHexValue(*iterator);
2274             if (UNLIKELY(value.hasOverflowed()))
2275                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2276             ++iterator;
2277             break;
2278         }
2279     }
2280     ASSERT(!value.hasOverflowed());
2281     return value.unsafeGet();
2282 }
2283
2284 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2285 {
2286     RELEASE_ASSERT(exponent <= 4);
2287     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2288     return values[exponent];
2289 }
2290
2291 enum class URLParser::IPv4ParsingError {
2292     Failure,
2293     NotIPv4,
2294 };
2295
2296 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2297 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2298 {
2299     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2300     bool didSeeSyntaxViolation = false;
2301     if (!iterator.atEnd() && *iterator == '.')
2302         return makeUnexpected(IPv4ParsingError::NotIPv4);
2303     while (!iterator.atEnd()) {
2304         if (isTabOrNewline(*iterator)) {
2305             didSeeSyntaxViolation = true;
2306             ++iterator;
2307             continue;
2308         }
2309         if (items.size() >= 4)
2310             return makeUnexpected(IPv4ParsingError::NotIPv4);
2311         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2312         if (!iterator.atEnd() && *iterator == '.') {
2313             ++iterator;
2314             if (iterator.atEnd())
2315                 syntaxViolation(iteratorForSyntaxViolationPosition);
2316             else if (*iterator == '.')
2317                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2318         }
2319     }
2320     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2321         return makeUnexpected(IPv4ParsingError::NotIPv4);
2322     for (const auto& item : items) {
2323         if (!item.hasValue() && item.error() == IPv4PieceParsingError::Failure)
2324             return makeUnexpected(IPv4ParsingError::NotIPv4);
2325     }
2326     for (const auto& item : items) {
2327         if (!item.hasValue() && item.error() == IPv4PieceParsingError::Overflow)
2328             return makeUnexpected(IPv4ParsingError::Failure);
2329     }
2330     if (items.size() > 1) {
2331         for (size_t i = 0; i < items.size() - 1; i++) {
2332             if (items[i].value() > 255)
2333                 return makeUnexpected(IPv4ParsingError::Failure);
2334         }
2335     }
2336     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2337         return makeUnexpected(IPv4ParsingError::Failure);
2338
2339     if (didSeeSyntaxViolation)
2340         syntaxViolation(iteratorForSyntaxViolationPosition);
2341     for (const auto& item : items) {
2342         if (item.value() > 255)
2343             syntaxViolation(iteratorForSyntaxViolationPosition);
2344     }
2345
2346     if (UNLIKELY(items.size() != 4))
2347         syntaxViolation(iteratorForSyntaxViolationPosition);
2348
2349     IPv4Address ipv4 = items.takeLast().value();
2350     for (size_t counter = 0; counter < items.size(); ++counter)
2351         ipv4 += items[counter].value() * pow256(3 - counter);
2352     return ipv4;
2353 }
2354
2355 template<typename CharacterType>
2356 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2357 {
2358     if (iterator.atEnd())
2359         return std::nullopt;
2360     uint32_t piece = 0;
2361     bool leadingZeros = false;
2362     size_t digitCount = 0;
2363     while (!iterator.atEnd()) {
2364         if (!isASCIIDigit(*iterator))
2365             return std::nullopt;
2366         ++digitCount;
2367         if (!piece && *iterator == '0') {
2368             if (leadingZeros)
2369                 return std::nullopt;
2370             leadingZeros = true;
2371         }
2372         if (!piece && *iterator == '0')
2373             leadingZeros = true;
2374         piece = piece * 10 + *iterator - '0';
2375         if (piece > 255)
2376             return std::nullopt;
2377         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2378         if (iterator.atEnd())
2379             break;
2380         if (*iterator == '.')
2381             break;
2382     }
2383     if (piece && leadingZeros)
2384         return std::nullopt;
2385     return piece;
2386 }
2387
2388 template<typename CharacterType>
2389 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2390 {
2391     IPv4Address address = 0;
2392     for (size_t i = 0; i < 4; ++i) {
2393         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2394             address = (address << 8) + piece.value();
2395         else
2396             return std::nullopt;
2397         if (i < 3) {
2398             if (iterator.atEnd())
2399                 return std::nullopt;
2400             if (*iterator != '.')
2401                 return std::nullopt;
2402             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2403         } else if (!iterator.atEnd())
2404             return std::nullopt;
2405     }
2406     ASSERT(iterator.atEnd());
2407     return address;
2408 }
2409
2410 template<typename CharacterType>
2411 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2412 {
2413     ASSERT(*c == '[');
2414     const auto hostBegin = c;
2415     advance(c, hostBegin);
2416     if (c.atEnd())
2417         return std::nullopt;
2418
2419     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2420     size_t piecePointer = 0;
2421     std::optional<size_t> compressPointer;
2422
2423     if (*c == ':') {
2424         advance(c, hostBegin);
2425         if (c.atEnd())
2426             return std::nullopt;
2427         if (*c != ':')
2428             return std::nullopt;
2429         advance(c, hostBegin);
2430         ++piecePointer;
2431         compressPointer = piecePointer;
2432     }
2433     
2434     while (!c.atEnd()) {
2435         if (piecePointer == 8)
2436             return std::nullopt;
2437         if (*c == ':') {
2438             if (compressPointer)
2439                 return std::nullopt;
2440             advance(c, hostBegin);
2441             ++piecePointer;
2442             compressPointer = piecePointer;
2443             continue;
2444         }
2445         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2446             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2447                 if (compressPointer && piecePointer == 5)
2448                     return std::nullopt;
2449                 syntaxViolation(hostBegin);
2450                 address[piecePointer++] = ipv4Address.value() >> 16;
2451                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2452                 c = { };
2453                 break;
2454             }
2455         }
2456         uint16_t value = 0;
2457         size_t length = 0;
2458         bool leadingZeros = false;
2459         for (; length < 4; length++) {
2460             if (c.atEnd())
2461                 break;
2462             if (!isASCIIHexDigit(*c))
2463                 break;
2464             if (isASCIIUpper(*c))
2465                 syntaxViolation(hostBegin);
2466             if (*c == '0' && !length)
2467                 leadingZeros = true;
2468             value = value * 0x10 + toASCIIHexValue(*c);
2469             advance(c, hostBegin);
2470         }
2471         
2472         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2473             syntaxViolation(hostBegin);
2474
2475         address[piecePointer++] = value;
2476         if (c.atEnd())
2477             break;
2478         if (piecePointer == 8 || *c != ':')
2479             return std::nullopt;
2480         advance(c, hostBegin);
2481     }
2482     
2483     if (!c.atEnd())
2484         return std::nullopt;
2485     
2486     if (compressPointer) {
2487         size_t swaps = piecePointer - compressPointer.value();
2488         piecePointer = 7;
2489         while (swaps)
2490             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2491     } else if (piecePointer != 8)
2492         return std::nullopt;
2493
2494     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2495     if (possibleCompressPointer)
2496         possibleCompressPointer.value()++;
2497     if (UNLIKELY(compressPointer != possibleCompressPointer))
2498         syntaxViolation(hostBegin);
2499     
2500     return address;
2501 }
2502
2503 template<typename CharacterType>
2504 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2505 {
2506     Vector<LChar, defaultInlineBufferSize> output;
2507     output.reserveInitialCapacity(length);
2508     
2509     for (size_t i = 0; i < length; ++i) {
2510         uint8_t byte = input[i];
2511         if (byte != '%')
2512             output.uncheckedAppend(byte);
2513         else if (length > 2 && i < length - 2) {
2514             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2515                 syntaxViolation(iteratorForSyntaxViolationPosition);
2516                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2517                 i += 2;
2518             } else
2519                 output.uncheckedAppend(byte);
2520         } else
2521             output.uncheckedAppend(byte);
2522     }
2523     return output;
2524 }
2525     
2526 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2527 {
2528     Vector<LChar, defaultInlineBufferSize> output;
2529     output.reserveInitialCapacity(length);
2530     
2531     for (size_t i = 0; i < length; ++i) {
2532         uint8_t byte = input[i];
2533         if (byte != '%')
2534             output.uncheckedAppend(byte);
2535         else if (length > 2 && i < length - 2) {
2536             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2537                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2538                 i += 2;
2539             } else
2540                 output.uncheckedAppend(byte);
2541         } else
2542             output.uncheckedAppend(byte);
2543     }
2544     return output;
2545 }
2546
2547 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2548 {
2549     ASSERT(!string.isNull());
2550     if (string.is8Bit())
2551         return charactersAreAllASCII(string.characters8(), string.length());
2552     return charactersAreAllASCII(string.characters16(), string.length());
2553 }
2554
2555 template<typename CharacterType>
2556 std::optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2557 {
2558     Vector<LChar, defaultInlineBufferSize> ascii;
2559     if (containsOnlyASCII(domain)) {
2560         size_t length = domain.length();
2561         if (domain.is8Bit()) {
2562             const LChar* characters = domain.characters8();
2563             ascii.reserveInitialCapacity(length);
2564             for (size_t i = 0; i < length; ++i) {
2565                 if (UNLIKELY(isASCIIUpper(characters[i])))
2566                     syntaxViolation(iteratorForSyntaxViolationPosition);
2567                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2568             }
2569         } else {
2570             const UChar* characters = domain.characters16();
2571             ascii.reserveInitialCapacity(length);
2572             for (size_t i = 0; i < length; ++i) {
2573                 if (UNLIKELY(isASCIIUpper(characters[i])))
2574                     syntaxViolation(iteratorForSyntaxViolationPosition);
2575                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2576             }
2577         }
2578         return ascii;
2579     }
2580     
2581     UChar hostnameBuffer[defaultInlineBufferSize];
2582     UErrorCode error = U_ZERO_ERROR;
2583     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2584     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error);
2585     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2586
2587     if (U_SUCCESS(error) && !processingDetails.errors) {
2588         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2589             ASSERT(isASCII(hostnameBuffer[i]));
2590             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2591         }
2592         ascii.append(hostnameBuffer, numCharactersConverted);
2593         if (domain != StringView(ascii.data(), ascii.size()))
2594             syntaxViolation(iteratorForSyntaxViolationPosition);
2595         return ascii;
2596     }
2597
2598     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2599     return std::nullopt;
2600 }
2601
2602 bool URLParser::hasForbiddenHostCodePoint(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2603 {
2604     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2605         if (isForbiddenHostCodePoint(asciiDomain[i]))
2606             return true;
2607     }
2608     return false;
2609 }
2610
2611 template<typename CharacterType>
2612 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2613 {
2614     ASSERT(*iterator == ':');
2615     auto colonIterator = iterator;
2616     advance(iterator, colonIterator);
2617     uint32_t port = 0;
2618     if (UNLIKELY(iterator.atEnd())) {
2619         m_url.m_portEnd = currentPosition(colonIterator);
2620         syntaxViolation(colonIterator);
2621         return true;
2622     }
2623     size_t digitCount = 0;
2624     bool leadingZeros = false;
2625     for (; !iterator.atEnd(); ++iterator) {
2626         if (UNLIKELY(isTabOrNewline(*iterator))) {
2627             syntaxViolation(colonIterator);
2628             continue;
2629         }
2630         if (isASCIIDigit(*iterator)) {
2631             if (*iterator == '0' && !digitCount)
2632                 leadingZeros = true;
2633             ++digitCount;
2634             port = port * 10 + *iterator - '0';
2635             if (port > std::numeric_limits<uint16_t>::max())
2636                 return false;
2637         } else
2638             return false;
2639     }
2640
2641     if (port && leadingZeros)
2642         syntaxViolation(colonIterator);
2643     
2644     if (!port && digitCount > 1)
2645         syntaxViolation(colonIterator);
2646
2647     ASSERT(port == static_cast<uint16_t>(port));
2648     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2649         syntaxViolation(colonIterator);
2650     else {
2651         appendToASCIIBuffer(':');
2652         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2653         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2654     }
2655
2656     m_url.m_portEnd = currentPosition(iterator);
2657     return true;
2658 }
2659
2660 template<typename CharacterType>
2661 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2662 {
2663     if (iterator.atEnd())
2664         return false;
2665     if (*iterator == ':')
2666         return false;
2667     if (*iterator == '[') {
2668         auto ipv6End = iterator;
2669         while (!ipv6End.atEnd() && *ipv6End != ']')
2670             ++ipv6End;
2671         if (ipv6End.atEnd())
2672             return false;
2673         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2674             serializeIPv6(address.value());
2675             if (!ipv6End.atEnd()) {
2676                 advance(ipv6End);
2677                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2678                     m_url.m_hostEnd = currentPosition(ipv6End);
2679                     return parsePort(ipv6End);
2680                 }
2681                 m_url.m_hostEnd = currentPosition(ipv6End);
2682                 m_url.m_portEnd = m_url.m_hostEnd;
2683                 return true;
2684             }
2685             m_url.m_hostEnd = currentPosition(ipv6End);
2686             return true;
2687         }
2688         return false;
2689     }
2690
2691     if (!m_urlIsSpecial) {
2692         for (; !iterator.atEnd(); ++iterator) {
2693             if (UNLIKELY(isTabOrNewline(*iterator))) {
2694                 syntaxViolation(iterator);
2695                 continue;
2696             }
2697             if (*iterator == ':')
2698                 break;
2699             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2700                 return false;
2701             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2702         }
2703         m_url.m_hostEnd = currentPosition(iterator);
2704         if (iterator.atEnd()) {
2705             m_url.m_portEnd = currentPosition(iterator);
2706             return true;
2707         }
2708         return parsePort(iterator);
2709     }
2710     
2711     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2712         auto hostIterator = iterator;
2713         for (; !iterator.atEnd(); ++iterator) {
2714             if (isTabOrNewline(*iterator))
2715                 continue;
2716             if (*iterator == ':')
2717                 break;
2718             if (isForbiddenHostCodePoint(*iterator))
2719                 return false;
2720         }
2721         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2722         if (address) {
2723             serializeIPv4(address.value());
2724             m_url.m_hostEnd = currentPosition(iterator);
2725             if (iterator.atEnd()) {
2726                 m_url.m_portEnd = currentPosition(iterator);
2727                 return true;
2728             }
2729             return parsePort(iterator);
2730         }
2731         if (address.error() == IPv4ParsingError::Failure)
2732             return false;
2733         for (; hostIterator != iterator; ++hostIterator) {
2734             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2735                 syntaxViolation(hostIterator);
2736                 continue;
2737             }
2738             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2739                 syntaxViolation(hostIterator);
2740             appendToASCIIBuffer(toASCIILower(*hostIterator));
2741         }
2742         m_url.m_hostEnd = currentPosition(iterator);
2743         if (!hostIterator.atEnd())
2744             return parsePort(hostIterator);
2745         m_url.m_portEnd = currentPosition(iterator);
2746         return true;
2747     }
2748     
2749     const auto hostBegin = iterator;
2750     
2751     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2752     for (; !iterator.atEnd(); ++iterator) {
2753         if (UNLIKELY(isTabOrNewline(*iterator))) {
2754             syntaxViolation(hostBegin);
2755             continue;
2756         }
2757         if (*iterator == ':')
2758             break;
2759         if (UNLIKELY(!isASCII(*iterator)))
2760             syntaxViolation(hostBegin);
2761
2762         uint8_t buffer[U8_MAX_LENGTH];
2763         int32_t offset = 0;
2764         UBool error = false;
2765         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2766         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2767         // FIXME: Check error.
2768         utf8Encoded.append(buffer, offset);
2769     }
2770     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2771     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2772     if (domain.isNull())
2773         return false;
2774     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2775         syntaxViolation(hostBegin);
2776     auto asciiDomain = domainToASCII(domain, hostBegin);
2777     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2778         return false;
2779     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2780     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2781
2782     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2783     if (address) {
2784         serializeIPv4(address.value());
2785         m_url.m_hostEnd = currentPosition(iterator);
2786         if (iterator.atEnd()) {
2787             m_url.m_portEnd = currentPosition(iterator);
2788             return true;
2789         }
2790         return parsePort(iterator);
2791     }
2792     if (address.error() == IPv4ParsingError::Failure)
2793         return false;
2794
2795     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2796     m_url.m_hostEnd = currentPosition(iterator);
2797     if (!iterator.atEnd())
2798         return parsePort(iterator);
2799     m_url.m_portEnd = currentPosition(iterator);
2800     return true;
2801 }
2802
2803 std::optional<String> URLParser::formURLDecode(StringView input)
2804 {
2805     auto utf8 = input.utf8(StrictConversion);
2806     if (utf8.isNull())
2807         return std::nullopt;
2808     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2809     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2810 }
2811
2812 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2813 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2814 {
2815     URLEncodedForm output;
2816     for (StringView bytes : input.split('&')) {
2817         auto equalIndex = bytes.find('=');
2818         if (equalIndex == notFound) {
2819             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2820             if (name)
2821                 output.append({ name.value(), emptyString() });
2822         } else {
2823             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2824             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2825             if (name && value)
2826                 output.append({ name.value(), value.value() });
2827         }
2828     }
2829     return output;
2830 }
2831
2832 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2833 {
2834     auto utf8 = input.utf8(StrictConversion);
2835     const char* data = utf8.data();
2836     for (size_t i = 0; i < utf8.length(); ++i) {
2837         const char byte = data[i];
2838         if (byte == 0x20)
2839             output.append(0x2B);
2840         else if (byte == 0x2A
2841             || byte == 0x2D
2842             || byte == 0x2E
2843             || (byte >= 0x30 && byte <= 0x39)
2844             || (byte >= 0x41 && byte <= 0x5A)
2845             || byte == 0x5F
2846             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2847             output.append(byte);
2848         else
2849             percentEncodeByte(byte, output);
2850     }
2851 }
2852     
2853 String URLParser::serialize(const URLEncodedForm& tuples)
2854 {
2855     Vector<LChar> output;
2856     for (auto& tuple : tuples) {
2857         if (!output.isEmpty())
2858             output.append('&');
2859         serializeURLEncodedForm(tuple.key, output);
2860         output.append('=');
2861         serializeURLEncodedForm(tuple.value, output);
2862     }
2863     return String::adopt(WTFMove(output));
2864 }
2865
2866 const UIDNA& URLParser::internationalDomainNameTranscoder()
2867 {
2868     static UIDNA* encoder;
2869     static std::once_flag onceFlag;
2870     std::call_once(onceFlag, [] {
2871         UErrorCode error = U_ZERO_ERROR;
2872         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2873         RELEASE_ASSERT(U_SUCCESS(error));
2874         RELEASE_ASSERT(encoder);
2875     });
2876     return *encoder;
2877 }
2878
2879 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2880 {
2881     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2882     // but once we get rid of URL::parse its value should be tested.
2883     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2884         a.m_isValid,
2885         a.m_protocolIsInHTTPFamily,
2886         a.m_schemeEnd,
2887         a.m_userStart,
2888         a.m_userEnd,
2889         a.m_passwordEnd,
2890         a.m_hostEnd,
2891         a.m_portEnd,
2892         a.m_pathAfterLastSlash,
2893         a.m_pathEnd,
2894         a.m_queryEnd,
2895         a.m_fragmentEnd,
2896         a.m_string.utf8().data(),
2897         b.m_isValid,
2898         b.m_protocolIsInHTTPFamily,
2899         b.m_schemeEnd,
2900         b.m_userStart,
2901         b.m_userEnd,
2902         b.m_passwordEnd,
2903         b.m_hostEnd,
2904         b.m_portEnd,
2905         b.m_pathAfterLastSlash,
2906         b.m_pathEnd,
2907         b.m_queryEnd,
2908         b.m_fragmentEnd,
2909         b.m_string.utf8().data());
2910
2911     return a.m_string == b.m_string
2912         && a.m_isValid == b.m_isValid
2913         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2914         && a.m_schemeEnd == b.m_schemeEnd
2915         && a.m_userStart == b.m_userStart
2916         && a.m_userEnd == b.m_userEnd
2917         && a.m_passwordEnd == b.m_passwordEnd
2918         && a.m_hostEnd == b.m_hostEnd
2919         && a.m_portEnd == b.m_portEnd
2920         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2921         && a.m_pathEnd == b.m_pathEnd
2922         && a.m_queryEnd == b.m_queryEnd
2923         && a.m_fragmentEnd == b.m_fragmentEnd;
2924 }
2925
2926 bool URLParser::internalValuesConsistent(const URL& url)
2927 {
2928     return url.m_schemeEnd <= url.m_userStart
2929         && url.m_userStart <= url.m_userEnd
2930         && url.m_userEnd <= url.m_passwordEnd
2931         && url.m_passwordEnd <= url.m_hostEnd
2932         && url.m_hostEnd <= url.m_portEnd
2933         && url.m_portEnd <= url.m_pathAfterLastSlash
2934         && url.m_pathAfterLastSlash <= url.m_pathEnd
2935         && url.m_pathEnd <= url.m_queryEnd
2936         && url.m_queryEnd <= url.m_fragmentEnd
2937         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2938     // FIXME: Why do we even store m_fragmentEnd?
2939     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2940 }
2941
2942 } // namespace WebCore