1e3d465ae3b3fe13264073c14bfc0f9fb420fa5e
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <mutex>
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
35
36 namespace WebCore {
37
// Set to 1 to forward URL_PARSER_LOG to LOG(URLParser, ...).
#define URL_PARSER_DEBUGGING 0

// With debugging off, URL_PARSER_LOG(...) expands to nothing.
#if !URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...)
#else
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#endif
45     
46 template<typename CharacterType>
47 class CodePointIterator {
48 public:
49     ALWAYS_INLINE CodePointIterator() { }
50     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51         : m_begin(begin)
52         , m_end(end)
53     {
54     }
55     
56     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57         : CodePointIterator(begin.m_begin, end.m_begin)
58     {
59         ASSERT(end.m_begin >= begin.m_begin);
60     }
61     
62     ALWAYS_INLINE UChar32 operator*() const;
63     ALWAYS_INLINE CodePointIterator& operator++();
64
65     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66     {
67         return m_begin == other.m_begin
68             && m_end == other.m_end;
69     }
70     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71     
72     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73     {
74         m_begin = other.m_begin;
75         m_end = other.m_end;
76         return *this;
77     }
78
79     ALWAYS_INLINE bool atEnd() const
80     {
81         ASSERT(m_begin <= m_end);
82         return m_begin >= m_end;
83     }
84     
85     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86     {
87         ASSERT(m_begin >= reference);
88         return m_begin - reference;
89     }
90
91     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92     {
93         return codeUnitsSince(other.m_begin);
94     }
95     
96 private:
97     const CharacterType* m_begin { nullptr };
98     const CharacterType* m_end { nullptr };
99 };
100
101 template<>
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
103 {
104     ASSERT(!atEnd());
105     return *m_begin;
106 }
107
108 template<>
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
110 {
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     unsigned i = 0;
128     size_t length = m_end - m_begin;
129     U16_FWD_1(m_begin, i, length);
130     m_begin += i;
131     return *this;
132 }
133     
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
135 {
136     if (U_IS_BMP(codePoint)) {
137         destination.append(static_cast<UChar>(codePoint));
138         return;
139     }
140     destination.reserveCapacity(destination.size() + 2);
141     destination.uncheckedAppend(U16_LEAD(codePoint));
142     destination.uncheckedAppend(U16_TRAIL(codePoint));
143 }
144
// Bit flags stored per byte in characterClassTable; a byte may carry several.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
153
154 static const uint8_t characterClassTable[256] = {
155     UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
156     UserInfo | Default | QueryPercent, // 0x1
157     UserInfo | Default | QueryPercent, // 0x2
158     UserInfo | Default | QueryPercent, // 0x3
159     UserInfo | Default | QueryPercent, // 0x4
160     UserInfo | Default | QueryPercent, // 0x5
161     UserInfo | Default | QueryPercent, // 0x6
162     UserInfo | Default | QueryPercent, // 0x7
163     UserInfo | Default | QueryPercent, // 0x8
164     UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
165     UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
166     UserInfo | Default | QueryPercent, // 0xB
167     UserInfo | Default | QueryPercent, // 0xC
168     UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
169     UserInfo | Default | QueryPercent, // 0xE
170     UserInfo | Default | QueryPercent, // 0xF
171     UserInfo | Default | QueryPercent, // 0x10
172     UserInfo | Default | QueryPercent, // 0x11
173     UserInfo | Default | QueryPercent, // 0x12
174     UserInfo | Default | QueryPercent, // 0x13
175     UserInfo | Default | QueryPercent, // 0x14
176     UserInfo | Default | QueryPercent, // 0x15
177     UserInfo | Default | QueryPercent, // 0x16
178     UserInfo | Default | QueryPercent, // 0x17
179     UserInfo | Default | QueryPercent, // 0x18
180     UserInfo | Default | QueryPercent, // 0x19
181     UserInfo | Default | QueryPercent, // 0x1A
182     UserInfo | Default | QueryPercent, // 0x1B
183     UserInfo | Default | QueryPercent, // 0x1C
184     UserInfo | Default | QueryPercent, // 0x1D
185     UserInfo | Default | QueryPercent, // 0x1E
186     UserInfo | Default | QueryPercent, // 0x1F
187     UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
188     0, // '!'
189     UserInfo | Default | QueryPercent, // '"'
190     UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
191     0, // '$'
192     ForbiddenHost, // '%'
193     0, // '&'
194     0, // '''
195     0, // '('
196     0, // ')'
197     0, // '*'
198     ValidScheme, // '+'
199     0, // ','
200     ValidScheme, // '-'
201     ValidScheme, // '.'
202     UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
203     ValidScheme, // '0'
204     ValidScheme, // '1'
205     ValidScheme, // '2'
206     ValidScheme, // '3'
207     ValidScheme, // '4'
208     ValidScheme, // '5'
209     ValidScheme, // '6'
210     ValidScheme, // '7'
211     ValidScheme, // '8'
212     ValidScheme, // '9'
213     UserInfo | ForbiddenHost, // ':'
214     UserInfo, // ';'
215     UserInfo | Default | QueryPercent, // '<'
216     UserInfo, // '='
217     UserInfo | Default | QueryPercent, // '>'
218     UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
219     UserInfo | ForbiddenHost, // '@'
220     ValidScheme, // 'A'
221     ValidScheme, // 'B'
222     ValidScheme, // 'C'
223     ValidScheme, // 'D'
224     ValidScheme, // 'E'
225     ValidScheme, // 'F'
226     ValidScheme, // 'G'
227     ValidScheme, // 'H'
228     ValidScheme, // 'I'
229     ValidScheme, // 'J'
230     ValidScheme, // 'K'
231     ValidScheme, // 'L'
232     ValidScheme, // 'M'
233     ValidScheme, // 'N'
234     ValidScheme, // 'O'
235     ValidScheme, // 'P'
236     ValidScheme, // 'Q'
237     ValidScheme, // 'R'
238     ValidScheme, // 'S'
239     ValidScheme, // 'T'
240     ValidScheme, // 'U'
241     ValidScheme, // 'V'
242     ValidScheme, // 'W'
243     ValidScheme, // 'X'
244     ValidScheme, // 'Y'
245     ValidScheme, // 'Z'
246     UserInfo | ForbiddenHost, // '['
247     UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
248     UserInfo | ForbiddenHost, // ']'
249     UserInfo, // '^'
250     0, // '_'
251     UserInfo | Default, // '`'
252     ValidScheme, // 'a'
253     ValidScheme, // 'b'
254     ValidScheme, // 'c'
255     ValidScheme, // 'd'
256     ValidScheme, // 'e'
257     ValidScheme, // 'f'
258     ValidScheme, // 'g'
259     ValidScheme, // 'h'
260     ValidScheme, // 'i'
261     ValidScheme, // 'j'
262     ValidScheme, // 'k'
263     ValidScheme, // 'l'
264     ValidScheme, // 'm'
265     ValidScheme, // 'n'
266     ValidScheme, // 'o'
267     ValidScheme, // 'p'
268     ValidScheme, // 'q'
269     ValidScheme, // 'r'
270     ValidScheme, // 's'
271     ValidScheme, // 't'
272     ValidScheme, // 'u'
273     ValidScheme, // 'v'
274     ValidScheme, // 'w'
275     ValidScheme, // 'x'
276     ValidScheme, // 'y'
277     ValidScheme, // 'z'
278     UserInfo | Default, // '{'
279     UserInfo, // '|'
280     UserInfo | Default, // '}'
281     0, // '~'
282     QueryPercent, // 0x7F
283     QueryPercent, // 0x80
284     QueryPercent, // 0x81
285     QueryPercent, // 0x82
286     QueryPercent, // 0x83
287     QueryPercent, // 0x84
288     QueryPercent, // 0x85
289     QueryPercent, // 0x86
290     QueryPercent, // 0x87
291     QueryPercent, // 0x88
292     QueryPercent, // 0x89
293     QueryPercent, // 0x8A
294     QueryPercent, // 0x8B
295     QueryPercent, // 0x8C
296     QueryPercent, // 0x8D
297     QueryPercent, // 0x8E
298     QueryPercent, // 0x8F
299     QueryPercent, // 0x90
300     QueryPercent, // 0x91
301     QueryPercent, // 0x92
302     QueryPercent, // 0x93
303     QueryPercent, // 0x94
304     QueryPercent, // 0x95
305     QueryPercent, // 0x96
306     QueryPercent, // 0x97
307     QueryPercent, // 0x98
308     QueryPercent, // 0x99
309     QueryPercent, // 0x9A
310     QueryPercent, // 0x9B
311     QueryPercent, // 0x9C
312     QueryPercent, // 0x9D
313     QueryPercent, // 0x9E
314     QueryPercent, // 0x9F
315     QueryPercent, // 0xA0
316     QueryPercent, // 0xA1
317     QueryPercent, // 0xA2
318     QueryPercent, // 0xA3
319     QueryPercent, // 0xA4
320     QueryPercent, // 0xA5
321     QueryPercent, // 0xA6
322     QueryPercent, // 0xA7
323     QueryPercent, // 0xA8
324     QueryPercent, // 0xA9
325     QueryPercent, // 0xAA
326     QueryPercent, // 0xAB
327     QueryPercent, // 0xAC
328     QueryPercent, // 0xAD
329     QueryPercent, // 0xAE
330     QueryPercent, // 0xAF
331     QueryPercent, // 0xB0
332     QueryPercent, // 0xB1
333     QueryPercent, // 0xB2
334     QueryPercent, // 0xB3
335     QueryPercent, // 0xB4
336     QueryPercent, // 0xB5
337     QueryPercent, // 0xB6
338     QueryPercent, // 0xB7
339     QueryPercent, // 0xB8
340     QueryPercent, // 0xB9
341     QueryPercent, // 0xBA
342     QueryPercent, // 0xBB
343     QueryPercent, // 0xBC
344     QueryPercent, // 0xBD
345     QueryPercent, // 0xBE
346     QueryPercent, // 0xBF
347     QueryPercent, // 0xC0
348     QueryPercent, // 0xC1
349     QueryPercent, // 0xC2
350     QueryPercent, // 0xC3
351     QueryPercent, // 0xC4
352     QueryPercent, // 0xC5
353     QueryPercent, // 0xC6
354     QueryPercent, // 0xC7
355     QueryPercent, // 0xC8
356     QueryPercent, // 0xC9
357     QueryPercent, // 0xCA
358     QueryPercent, // 0xCB
359     QueryPercent, // 0xCC
360     QueryPercent, // 0xCD
361     QueryPercent, // 0xCE
362     QueryPercent, // 0xCF
363     QueryPercent, // 0xD0
364     QueryPercent, // 0xD1
365     QueryPercent, // 0xD2
366     QueryPercent, // 0xD3
367     QueryPercent, // 0xD4
368     QueryPercent, // 0xD5
369     QueryPercent, // 0xD6
370     QueryPercent, // 0xD7
371     QueryPercent, // 0xD8
372     QueryPercent, // 0xD9
373     QueryPercent, // 0xDA
374     QueryPercent, // 0xDB
375     QueryPercent, // 0xDC
376     QueryPercent, // 0xDD
377     QueryPercent, // 0xDE
378     QueryPercent, // 0xDF
379     QueryPercent, // 0xE0
380     QueryPercent, // 0xE1
381     QueryPercent, // 0xE2
382     QueryPercent, // 0xE3
383     QueryPercent, // 0xE4
384     QueryPercent, // 0xE5
385     QueryPercent, // 0xE6
386     QueryPercent, // 0xE7
387     QueryPercent, // 0xE8
388     QueryPercent, // 0xE9
389     QueryPercent, // 0xEA
390     QueryPercent, // 0xEB
391     QueryPercent, // 0xEC
392     QueryPercent, // 0xED
393     QueryPercent, // 0xEE
394     QueryPercent, // 0xEF
395     QueryPercent, // 0xF0
396     QueryPercent, // 0xF1
397     QueryPercent, // 0xF2
398     QueryPercent, // 0xF3
399     QueryPercent, // 0xF4
400     QueryPercent, // 0xF5
401     QueryPercent, // 0xF6
402     QueryPercent, // 0xF7
403     QueryPercent, // 0xF8
404     QueryPercent, // 0xF9
405     QueryPercent, // 0xFA
406     QueryPercent, // 0xFB
407     QueryPercent, // 0xFC
408     QueryPercent, // 0xFD
409     QueryPercent, // 0xFE
410     QueryPercent, // 0xFF
411 };
412
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
423 ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
424 {
425     if (characterClassTable[byte] & QueryPercent)
426         return true;
427     if (byte == '\'' && urlIsSpecial)
428         return true;
429     return false;
430 }
431
432 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
433 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
434 {
435     ++iterator;
436     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
437         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
438             syntaxViolation(iteratorForSyntaxViolationPosition);
439         ++iterator;
440     }
441 }
442
443 template<typename CharacterType>
444 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
445 {
446     if (iterator.atEnd())
447         return false;
448     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
449     if (iterator.atEnd())
450         return false;
451     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
452     return iterator.atEnd();
453 }
454
455 template<typename CharacterType>
456 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
457 {
458     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
459         return false;
460     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
461     if (iterator.atEnd())
462         return false;
463     if (*iterator == ':')
464         return true;
465     if (UNLIKELY(*iterator == '|'))
466         return true;
467     return false;
468 }
469
470 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
471 {
472     ASSERT(isASCII(codePoint));
473     if (UNLIKELY(m_didSeeSyntaxViolation))
474         m_asciiBuffer.append(codePoint);
475 }
476
477 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
478 {
479     if (UNLIKELY(m_didSeeSyntaxViolation))
480         m_asciiBuffer.append(characters, length);
481 }
482
483 template<typename CharacterType>
484 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
485 {
486     ASSERT(isWindowsDriveLetter(iterator));
487     appendToASCIIBuffer(*iterator);
488     advance(iterator);
489     ASSERT(!iterator.atEnd());
490     ASSERT(*iterator == ':' || *iterator == '|');
491     if (*iterator == '|')
492         syntaxViolation(iterator);
493     appendToASCIIBuffer(':');
494     advance(iterator);
495 }
496
497 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
498 {
499     if (base.protocolIs("file")) {
500         RELEASE_ASSERT(base.m_portEnd < base.m_string.length());
501         if (base.m_string.is8Bit()) {
502             const LChar* begin = base.m_string.characters8();
503             CodePointIterator<LChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
504             if (isWindowsDriveLetter(c)) {
505                 appendWindowsDriveLetter(c);
506                 return true;
507             }
508         } else {
509             const UChar* begin = base.m_string.characters16();
510             CodePointIterator<UChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
511             if (isWindowsDriveLetter(c)) {
512                 appendWindowsDriveLetter(c);
513                 return true;
514             }
515         }
516     }
517     return false;
518 }
519
520 template<typename CharacterType>
521 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
522 {
523     if (!isWindowsDriveLetter(iterator))
524         return true;
525     if (iterator.atEnd())
526         return false;
527     advance(iterator);
528     if (iterator.atEnd())
529         return true;
530     advance(iterator);
531     if (iterator.atEnd())
532         return true;
533     return !isSlashQuestionOrHash(*iterator);
534 }
535
536 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
537 {
538     buffer.append('%');
539     buffer.append(upperNibbleToASCIIHexDigit(byte));
540     buffer.append(lowerNibbleToASCIIHexDigit(byte));
541 }
542
543 void URLParser::percentEncodeByte(uint8_t byte)
544 {
545     ASSERT(m_didSeeSyntaxViolation);
546     appendToASCIIBuffer('%');
547     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
548     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
549 }
550
// Percent-encoded UTF-8 bytes of U+FFFD REPLACEMENT CHARACTER (EF BF BD).
const char replacementCharacterUTF8PercentEncoded[] = "%EF%BF%BD";
// Character count of the string above, excluding the terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
553
554 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
555 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
556 {
557     ASSERT(!iterator.atEnd());
558     UChar32 codePoint = *iterator;
559     if (LIKELY(isASCII(codePoint))) {
560         if (UNLIKELY(isInCodeSet(codePoint))) {
561             syntaxViolation(iterator);
562             percentEncodeByte(codePoint);
563         } else
564             appendToASCIIBuffer(codePoint);
565         return;
566     }
567     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
568     syntaxViolation(iterator);
569     
570     if (!U_IS_UNICODE_CHAR(codePoint)) {
571         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
572         return;
573     }
574     
575     uint8_t buffer[U8_MAX_LENGTH];
576     int32_t offset = 0;
577     U8_APPEND_UNSAFE(buffer, offset, codePoint);
578     for (int32_t i = 0; i < offset; ++i)
579         percentEncodeByte(buffer[i]);
580 }
581
582 template<typename CharacterType>
583 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
584 {
585     ASSERT(!iterator.atEnd());
586     UChar32 codePoint = *iterator;
587     if (LIKELY(isASCII(codePoint))) {
588         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
589             syntaxViolation(iterator);
590             percentEncodeByte(codePoint);
591         } else
592             appendToASCIIBuffer(codePoint);
593         return;
594     }
595     
596     syntaxViolation(iterator);
597     
598     if (!U_IS_UNICODE_CHAR(codePoint)) {
599         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
600         return;
601     }
602
603     uint8_t buffer[U8_MAX_LENGTH];
604     int32_t offset = 0;
605     U8_APPEND_UNSAFE(buffer, offset, codePoint);
606     for (int32_t i = 0; i < offset; ++i) {
607         auto byte = buffer[i];
608         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
609             percentEncodeByte(byte);
610         else
611             appendToASCIIBuffer(byte);
612     }
613 }
614
615 template<typename CharacterType>
616 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
617 {
618     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
619     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
620     const char* data = encoded.data();
621     size_t length = encoded.length();
622     
623     if (!length == !iterator.atEnd()) {
624         syntaxViolation(iterator);
625         return;
626     }
627     
628     size_t i = 0;
629     for (; i < length; ++i) {
630         ASSERT(!iterator.atEnd());
631         uint8_t byte = data[i];
632         if (UNLIKELY(byte != *iterator)) {
633             syntaxViolation(iterator);
634             break;
635         }
636         if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
637             syntaxViolation(iterator);
638             break;
639         }
640         appendToASCIIBuffer(byte);
641         ++iterator;
642     }
643     while (!iterator.atEnd() && isTabOrNewline(*iterator))
644         ++iterator;
645     ASSERT((i == length) == iterator.atEnd());
646     for (; i < length; ++i) {
647         ASSERT(m_didSeeSyntaxViolation);
648         uint8_t byte = data[i];
649         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
650             percentEncodeByte(byte);
651         else
652             appendToASCIIBuffer(byte);
653     }
654 }
655
656 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
657 {
658     static const uint16_t ftpPort = 21;
659     static const uint16_t gopherPort = 70;
660     static const uint16_t httpPort = 80;
661     static const uint16_t httpsPort = 443;
662     static const uint16_t wsPort = 80;
663     static const uint16_t wssPort = 443;
664     
665     auto length = scheme.length();
666     if (!length)
667         return std::nullopt;
668     switch (scheme[0]) {
669     case 'w':
670         switch (length) {
671         case 2:
672             if (scheme[1] == 's')
673                 return wsPort;
674             return std::nullopt;
675         case 3:
676             if (scheme[1] == 's'
677                 && scheme[2] == 's')
678                 return wssPort;
679             return std::nullopt;
680         default:
681             return false;
682         }
683     case 'h':
684         switch (length) {
685         case 4:
686             if (scheme[1] == 't'
687                 && scheme[2] == 't'
688                 && scheme[3] == 'p')
689                 return httpPort;
690             return std::nullopt;
691         case 5:
692             if (scheme[1] == 't'
693                 && scheme[2] == 't'
694                 && scheme[3] == 'p'
695                 && scheme[4] == 's')
696                 return httpsPort;
697             return std::nullopt;
698         default:
699             return std::nullopt;
700         }
701     case 'g':
702         if (length == 6
703             && scheme[1] == 'o'
704             && scheme[2] == 'p'
705             && scheme[3] == 'h'
706             && scheme[4] == 'e'
707             && scheme[5] == 'r')
708             return gopherPort;
709         return std::nullopt;
710     case 'f':
711         if (length == 3
712             && scheme[1] == 't'
713             && scheme[2] == 'p')
714             return ftpPort;
715         return std::nullopt;
716     default:
717         return std::nullopt;
718     }
719 }
720
// The special URL schemes this parser distinguishes, plus a catch-all for everything else.
enum class Scheme {
    WS = 0,
    WSS = 1,
    File = 2,
    FTP = 3,
    Gopher = 4,
    HTTP = 5,
    HTTPS = 6,
    NonSpecial = 7,
};
731
732 ALWAYS_INLINE static Scheme scheme(StringView scheme)
733 {
734     auto length = scheme.length();
735     if (!length)
736         return Scheme::NonSpecial;
737     switch (scheme[0]) {
738     case 'f':
739         switch (length) {
740         case 3:
741             if (scheme[1] == 't'
742                 && scheme[2] == 'p')
743                 return Scheme::FTP;
744             return Scheme::NonSpecial;
745         case 4:
746             if (scheme[1] == 'i'
747                 && scheme[2] == 'l'
748                 && scheme[3] == 'e')
749                 return Scheme::File;
750             return Scheme::NonSpecial;
751         default:
752             return Scheme::NonSpecial;
753         }
754     case 'g':
755         if (length == 6
756             && scheme[1] == 'o'
757             && scheme[2] == 'p'
758             && scheme[3] == 'h'
759             && scheme[4] == 'e'
760             && scheme[5] == 'r')
761             return Scheme::Gopher;
762         return Scheme::NonSpecial;
763     case 'h':
764         switch (length) {
765         case 4:
766             if (scheme[1] == 't'
767                 && scheme[2] == 't'
768                 && scheme[3] == 'p')
769                 return Scheme::HTTP;
770             return Scheme::NonSpecial;
771         case 5:
772             if (scheme[1] == 't'
773                 && scheme[2] == 't'
774                 && scheme[3] == 'p'
775                 && scheme[4] == 's')
776                 return Scheme::HTTPS;
777             return Scheme::NonSpecial;
778         default:
779             return Scheme::NonSpecial;
780         }
781     case 'w':
782         switch (length) {
783         case 2:
784             if (scheme[1] == 's')
785                 return Scheme::WS;
786             return Scheme::NonSpecial;
787         case 3:
788             if (scheme[1] == 's'
789                 && scheme[2] == 's')
790                 return Scheme::WSS;
791             return Scheme::NonSpecial;
792         default:
793             return Scheme::NonSpecial;
794         }
795     default:
796         return Scheme::NonSpecial;
797     }
798 }
799
800 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
801 {
802     if (scheme.isEmpty())
803         return std::nullopt;
804
805     if (!isASCIIAlpha(scheme[0]))
806         return std::nullopt;
807
808     for (size_t i = 1; i < scheme.length(); ++i) {
809         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
810             continue;
811         return std::nullopt;
812     }
813
814     return scheme.convertToASCIILowercase();
815 }
816
817 bool URLParser::isSpecialScheme(const String& schemeArg)
818 {
819     return scheme(schemeArg) != Scheme::NonSpecial;
820 }
821
822 enum class URLParser::URLPart {
823     SchemeEnd,
824     UserStart,
825     UserEnd,
826     PasswordEnd,
827     HostEnd,
828     PortEnd,
829     PathAfterLastSlash,
830     PathEnd,
831     QueryEnd,
832     FragmentEnd,
833 };
834
835 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
836 {
837     switch (part) {
838     case URLPart::FragmentEnd:
839         return url.m_fragmentEnd;
840     case URLPart::QueryEnd:
841         return url.m_queryEnd;
842     case URLPart::PathEnd:
843         return url.m_pathEnd;
844     case URLPart::PathAfterLastSlash:
845         return url.m_pathAfterLastSlash;
846     case URLPart::PortEnd:
847         return url.m_portEnd;
848     case URLPart::HostEnd:
849         return url.m_hostEnd;
850     case URLPart::PasswordEnd:
851         return url.m_passwordEnd;
852     case URLPart::UserEnd:
853         return url.m_userEnd;
854     case URLPart::UserStart:
855         return url.m_userStart;
856     case URLPart::SchemeEnd:
857         return url.m_schemeEnd;
858     }
859     ASSERT_NOT_REACHED();
860     return 0;
861 }
862
863 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
864 {
865     RELEASE_ASSERT(length <= string.length());
866     if (string.isNull())
867         return;
868     ASSERT(m_asciiBuffer.isEmpty());
869     if (string.is8Bit())
870         appendToASCIIBuffer(string.characters8(), length);
871     else {
872         const UChar* characters = string.characters16();
873         for (size_t i = 0; i < length; ++i) {
874             UChar c = characters[i];
875             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
876             appendToASCIIBuffer(c);
877         }
878     }
879 }
880
881 template<typename CharacterType>
882 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
883 {
884     syntaxViolation(iterator);
885
886     m_asciiBuffer.clear();
887     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
888     switch (part) {
889     case URLPart::FragmentEnd:
890         RELEASE_ASSERT_NOT_REACHED();
891     case URLPart::QueryEnd:
892         m_url.m_queryEnd = base.m_queryEnd;
893         FALLTHROUGH;
894     case URLPart::PathEnd:
895         m_url.m_pathEnd = base.m_pathEnd;
896         FALLTHROUGH;
897     case URLPart::PathAfterLastSlash:
898         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
899         FALLTHROUGH;
900     case URLPart::PortEnd:
901         m_url.m_portEnd = base.m_portEnd;
902         FALLTHROUGH;
903     case URLPart::HostEnd:
904         m_url.m_hostEnd = base.m_hostEnd;
905         FALLTHROUGH;
906     case URLPart::PasswordEnd:
907         m_url.m_passwordEnd = base.m_passwordEnd;
908         FALLTHROUGH;
909     case URLPart::UserEnd:
910         m_url.m_userEnd = base.m_userEnd;
911         FALLTHROUGH;
912     case URLPart::UserStart:
913         m_url.m_userStart = base.m_userStart;
914         FALLTHROUGH;
915     case URLPart::SchemeEnd:
916         m_url.m_isValid = base.m_isValid;
917         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
918         m_url.m_schemeEnd = base.m_schemeEnd;
919     }
920     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
921     case Scheme::WS:
922     case Scheme::WSS:
923         isUTF8Encoding = true;
924         m_urlIsSpecial = true;
925         return;
926     case Scheme::File:
927         m_urlIsFile = true;
928         FALLTHROUGH;
929     case Scheme::FTP:
930     case Scheme::Gopher:
931     case Scheme::HTTP:
932     case Scheme::HTTPS:
933         m_urlIsSpecial = true;
934         return;
935     case Scheme::NonSpecial:
936         m_urlIsSpecial = false;
937         isUTF8Encoding = true;
938         return;
939     }
940     ASSERT_NOT_REACHED();
941 }
942
// The two hex digits of a percent-encoded '.' ("%2e"). The dot-segment helpers match the
// first digit exactly and the second one after ASCII-lowercasing the input.
static const char dotASCIICode[2] = { '2', 'e' };
944
945 template<typename CharacterType>
946 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
947 {
948     if (c.atEnd())
949         return false;
950     if (*c == '.') {
951         advance<CharacterType, ReportSyntaxViolation::No>(c);
952         return c.atEnd() || isSlashQuestionOrHash(*c);
953     }
954     if (*c != '%')
955         return false;
956     advance<CharacterType, ReportSyntaxViolation::No>(c);
957     if (c.atEnd() || *c != dotASCIICode[0])
958         return false;
959     advance<CharacterType, ReportSyntaxViolation::No>(c);
960     if (c.atEnd())
961         return false;
962     if (toASCIILower(*c) == dotASCIICode[1]) {
963         advance<CharacterType, ReportSyntaxViolation::No>(c);
964         return c.atEnd() || isSlashQuestionOrHash(*c);
965     }
966     return false;
967 }
968
969 template<typename CharacterType>
970 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
971 {
972     if (c.atEnd())
973         return false;
974     if (*c == '.') {
975         advance<CharacterType, ReportSyntaxViolation::No>(c);
976         return isSingleDotPathSegment(c);
977     }
978     if (*c != '%')
979         return false;
980     advance<CharacterType, ReportSyntaxViolation::No>(c);
981     if (c.atEnd() || *c != dotASCIICode[0])
982         return false;
983     advance<CharacterType, ReportSyntaxViolation::No>(c);
984     if (c.atEnd())
985         return false;
986     if (toASCIILower(*c) == dotASCIICode[1]) {
987         advance<CharacterType, ReportSyntaxViolation::No>(c);
988         return isSingleDotPathSegment(c);
989     }
990     return false;
991 }
992
993 template<typename CharacterType>
994 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
995 {
996     ASSERT(isSingleDotPathSegment(c));
997     if (*c == '.') {
998         advance(c);
999         if (!c.atEnd()) {
1000             if (*c == '/' || *c == '\\')
1001                 advance(c);
1002             else
1003                 ASSERT(*c == '?' || *c == '#');
1004         }
1005     } else {
1006         ASSERT(*c == '%');
1007         advance(c);
1008         ASSERT(*c == dotASCIICode[0]);
1009         advance(c);
1010         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1011         advance(c);
1012         if (!c.atEnd()) {
1013             if (*c == '/' || *c == '\\')
1014                 advance(c);
1015             else
1016                 ASSERT(*c == '?' || *c == '#');
1017         }
1018     }
1019 }
1020
1021 template<typename CharacterType>
1022 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1023 {
1024     ASSERT(isDoubleDotPathSegment(c));
1025     if (*c == '.')
1026         advance(c);
1027     else {
1028         ASSERT(*c == '%');
1029         advance(c);
1030         ASSERT(*c == dotASCIICode[0]);
1031         advance(c);
1032         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1033         advance(c);
1034     }
1035     consumeSingleDotPathSegment(c);
1036 }
1037
1038 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1039 {
1040     ASSERT(m_didSeeSyntaxViolation);
1041     if (!m_urlIsFile)
1042         return true;
1043
1044     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1045     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1046     if (newPathAfterLastSlash == m_url.m_portEnd + 1 && isWindowsDriveLetter(componentToPop))
1047         return false;
1048     return true;
1049 }
1050
1051 void URLParser::popPath()
1052 {
1053     ASSERT(m_didSeeSyntaxViolation);
1054     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
1055         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1056         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1057             newPathAfterLastSlash--;
1058         while (newPathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[newPathAfterLastSlash] != '/')
1059             newPathAfterLastSlash--;
1060         newPathAfterLastSlash++;
1061         if (shouldPopPath(newPathAfterLastSlash))
1062             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1063     }
1064     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1065 }
1066
1067 template<typename CharacterType>
1068 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1069 {
1070     if (m_didSeeSyntaxViolation)
1071         return;
1072     m_didSeeSyntaxViolation = true;
1073     
1074     ASSERT(m_asciiBuffer.isEmpty());
1075     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1076     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1077     m_asciiBuffer.reserveCapacity(m_inputString.length());
1078     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1079         ASSERT(isASCII(m_inputString[i]));
1080         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1081     }
1082 }
1083
1084 void URLParser::failure()
1085 {
1086     m_url.invalidate();
1087     m_url.m_string = m_inputString;
1088 }
1089
1090 template<typename CharacterType>
1091 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1092 {
1093     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1094         return false;
1095     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1096     return true;
1097 }
1098
1099 template<typename CharacterType>
1100 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1101 {
1102     if (!checkLocalhostCodePoint(iterator, 'l'))
1103         return false;
1104     if (!checkLocalhostCodePoint(iterator, 'o'))
1105         return false;
1106     if (!checkLocalhostCodePoint(iterator, 'c'))
1107         return false;
1108     if (!checkLocalhostCodePoint(iterator, 'a'))
1109         return false;
1110     if (!checkLocalhostCodePoint(iterator, 'l'))
1111         return false;
1112     if (!checkLocalhostCodePoint(iterator, 'h'))
1113         return false;
1114     if (!checkLocalhostCodePoint(iterator, 'o'))
1115         return false;
1116     if (!checkLocalhostCodePoint(iterator, 's'))
1117         return false;
1118     if (!checkLocalhostCodePoint(iterator, 't'))
1119         return false;
1120     return iterator.atEnd();
1121 }
1122
1123 bool URLParser::isLocalhost(StringView view)
1124 {
1125     if (view.is8Bit())
1126         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1127     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1128 }
1129
1130 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1131 {
1132     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1133         ASSERT(start + length <= m_asciiBuffer.size());
1134         return StringView(m_asciiBuffer.data() + start, length);
1135     }
1136     ASSERT(start + length <= m_inputString.length());
1137     return StringView(m_inputString).substring(start, length);
1138 }
1139
1140 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1141 {
1142     if (UNLIKELY(m_didSeeSyntaxViolation))
1143         return m_asciiBuffer[position];
1144     return m_inputString[position];
1145 }
1146
1147 template<typename CharacterType>
1148 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1149 {
1150     if (UNLIKELY(m_didSeeSyntaxViolation))
1151         return m_asciiBuffer.size();
1152     
1153     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1154 }
1155
1156 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1157     : m_inputString(input)
1158 {
1159     if (input.isNull()) {
1160         if (base.isValid() && !base.m_cannotBeABaseURL) {
1161             m_url = base;
1162             m_url.removeFragmentIdentifier();
1163         }
1164         return;
1165     }
1166
1167     if (input.is8Bit()) {
1168         m_inputBegin = input.characters8();
1169         parse(input.characters8(), input.length(), base, encoding);
1170     } else {
1171         m_inputBegin = input.characters16();
1172         parse(input.characters16(), input.length(), base, encoding);
1173     }
1174
1175     ASSERT(!m_url.m_isValid
1176         || m_didSeeSyntaxViolation == (m_url.string() != input)
1177         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1178             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1179     ASSERT(internalValuesConsistent(m_url));
1180 #if !ASSERT_DISABLED
1181     if (!m_didSeeSyntaxViolation) {
1182         // Force a syntax violation at the beginning to make sure we get the same result.
1183         URLParser parser(makeString(" ", input), base, encoding);
1184         URL parsed = parser.result();
1185         if (parsed.isValid())
1186             ASSERT(allValuesEqual(parser.result(), m_url));
1187     }
1188 #endif
1189 }
1190
1191 template<typename CharacterType>
1192 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1193 {
1194     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1195     m_url = { };
1196     ASSERT(m_asciiBuffer.isEmpty());
1197     
1198     bool isUTF8Encoding = encoding == UTF8Encoding();
1199     Vector<UChar> queryBuffer;
1200
1201     unsigned endIndex = length;
1202     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1203         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1204         endIndex--;
1205     }
1206     CodePointIterator<CharacterType> c(input, input + endIndex);
1207     CodePointIterator<CharacterType> authorityOrHostBegin;
1208     CodePointIterator<CharacterType> queryBegin;
1209     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1210         syntaxViolation(c);
1211         ++c;
1212     }
1213     auto beginAfterControlAndSpace = c;
1214
1215     enum class State : uint8_t {
1216         SchemeStart,
1217         Scheme,
1218         NoScheme,
1219         SpecialRelativeOrAuthority,
1220         PathOrAuthority,
1221         Relative,
1222         RelativeSlash,
1223         SpecialAuthoritySlashes,
1224         SpecialAuthorityIgnoreSlashes,
1225         AuthorityOrHost,
1226         Host,
1227         File,
1228         FileSlash,
1229         FileHost,
1230         PathStart,
1231         Path,
1232         CannotBeABaseURLPath,
1233         UTF8Query,
1234         NonUTF8Query,
1235         Fragment,
1236     };
1237
1238 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1239 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1240
1241     State state = State::SchemeStart;
1242     while (!c.atEnd()) {
1243         if (UNLIKELY(isTabOrNewline(*c))) {
1244             syntaxViolation(c);
1245             ++c;
1246             continue;
1247         }
1248
1249         switch (state) {
1250         case State::SchemeStart:
1251             LOG_STATE("SchemeStart");
1252             if (isASCIIAlpha(*c)) {
1253                 if (UNLIKELY(isASCIIUpper(*c)))
1254                     syntaxViolation(c);
1255                 appendToASCIIBuffer(toASCIILower(*c));
1256                 advance(c);
1257                 if (c.atEnd()) {
1258                     m_asciiBuffer.clear();
1259                     state = State::NoScheme;
1260                     c = beginAfterControlAndSpace;
1261                 }
1262                 state = State::Scheme;
1263             } else
1264                 state = State::NoScheme;
1265             break;
1266         case State::Scheme:
1267             LOG_STATE("Scheme");
1268             if (isValidSchemeCharacter(*c)) {
1269                 if (UNLIKELY(isASCIIUpper(*c)))
1270                     syntaxViolation(c);
1271                 appendToASCIIBuffer(toASCIILower(*c));
1272             } else if (*c == ':') {
1273                 m_url.m_schemeEnd = currentPosition(c);
1274                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1275                 appendToASCIIBuffer(':');
1276                 switch (scheme(urlScheme)) {
1277                 case Scheme::File:
1278                     m_urlIsSpecial = true;
1279                     m_urlIsFile = true;
1280                     state = State::File;
1281                     ++c;
1282                     break;
1283                 case Scheme::WS:
1284                 case Scheme::WSS:
1285                     isUTF8Encoding = true;
1286                     m_urlIsSpecial = true;
1287                     if (base.protocolIs(urlScheme))
1288                         state = State::SpecialRelativeOrAuthority;
1289                     else
1290                         state = State::SpecialAuthoritySlashes;
1291                     ++c;
1292                     break;
1293                 case Scheme::HTTP:
1294                 case Scheme::HTTPS:
1295                     m_url.m_protocolIsInHTTPFamily = true;
1296                     FALLTHROUGH;
1297                 case Scheme::FTP:
1298                 case Scheme::Gopher:
1299                     m_urlIsSpecial = true;
1300                     if (base.protocolIs(urlScheme))
1301                         state = State::SpecialRelativeOrAuthority;
1302                     else
1303                         state = State::SpecialAuthoritySlashes;
1304                     ++c;
1305                     break;
1306                 case Scheme::NonSpecial:
1307                     isUTF8Encoding = true;
1308                     auto maybeSlash = c;
1309                     advance(maybeSlash);
1310                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1311                         appendToASCIIBuffer('/');
1312                         c = maybeSlash;
1313                         state = State::PathOrAuthority;
1314                         ASSERT(*c == '/');
1315                         ++c;
1316                         m_url.m_userStart = currentPosition(c);
1317                     } else {
1318                         ++c;
1319                         m_url.m_userStart = currentPosition(c);
1320                         m_url.m_userEnd = m_url.m_userStart;
1321                         m_url.m_passwordEnd = m_url.m_userStart;
1322                         m_url.m_hostEnd = m_url.m_userStart;
1323                         m_url.m_portEnd = m_url.m_userStart;
1324                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1325                         m_url.m_cannotBeABaseURL = true;
1326                         state = State::CannotBeABaseURLPath;
1327                     }
1328                     break;
1329                 }
1330                 break;
1331             } else {
1332                 m_asciiBuffer.clear();
1333                 state = State::NoScheme;
1334                 c = beginAfterControlAndSpace;
1335                 break;
1336             }
1337             advance(c);
1338             if (c.atEnd()) {
1339                 m_asciiBuffer.clear();
1340                 state = State::NoScheme;
1341                 c = beginAfterControlAndSpace;
1342             }
1343             break;
1344         case State::NoScheme:
1345             LOG_STATE("NoScheme");
1346             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1347                 failure();
1348                 return;
1349             }
1350             if (base.m_cannotBeABaseURL && *c == '#') {
1351                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1352                 state = State::Fragment;
1353                 appendToASCIIBuffer('#');
1354                 ++c;
1355                 break;
1356             }
1357             if (!base.protocolIs("file")) {
1358                 state = State::Relative;
1359                 break;
1360             }
1361             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1362             appendToASCIIBuffer(':');
1363             state = State::File;
1364             break;
1365         case State::SpecialRelativeOrAuthority:
1366             LOG_STATE("SpecialRelativeOrAuthority");
1367             if (*c == '/') {
1368                 appendToASCIIBuffer('/');
1369                 advance(c);
1370                 if (c.atEnd()) {
1371                     failure();
1372                     return;
1373                 }
1374                 if (*c == '/') {
1375                     appendToASCIIBuffer('/');
1376                     state = State::SpecialAuthorityIgnoreSlashes;
1377                     ++c;
1378                 } else
1379                     state = State::RelativeSlash;
1380             } else
1381                 state = State::Relative;
1382             break;
1383         case State::PathOrAuthority:
1384             LOG_STATE("PathOrAuthority");
1385             if (*c == '/') {
1386                 appendToASCIIBuffer('/');
1387                 state = State::AuthorityOrHost;
1388                 advance(c);
1389                 m_url.m_userStart = currentPosition(c);
1390                 authorityOrHostBegin = c;
1391             } else {
1392                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1393                 m_url.m_userStart = currentPosition(c) - 1;
1394                 m_url.m_userEnd = m_url.m_userStart;
1395                 m_url.m_passwordEnd = m_url.m_userStart;
1396                 m_url.m_hostEnd = m_url.m_userStart;
1397                 m_url.m_portEnd = m_url.m_userStart;
1398                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1399                 state = State::Path;
1400             }
1401             break;
1402         case State::Relative:
1403             LOG_STATE("Relative");
1404             switch (*c) {
1405             case '/':
1406             case '\\':
1407                 state = State::RelativeSlash;
1408                 ++c;
1409                 break;
1410             case '?':
1411                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1412                 appendToASCIIBuffer('?');
1413                 ++c;
1414                 if (isUTF8Encoding)
1415                     state = State::UTF8Query;
1416                 else {
1417                     queryBegin = c;
1418                     state = State::NonUTF8Query;
1419                 }
1420                 break;
1421             case '#':
1422                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1423                 appendToASCIIBuffer('#');
1424                 state = State::Fragment;
1425                 ++c;
1426                 break;
1427             default:
1428                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1429                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1430                     appendToASCIIBuffer('/');
1431                     m_url.m_pathAfterLastSlash = currentPosition(c);
1432                 }
1433                 state = State::Path;
1434                 break;
1435             }
1436             break;
1437         case State::RelativeSlash:
1438             LOG_STATE("RelativeSlash");
1439             if (*c == '/' || *c == '\\') {
1440                 ++c;
1441                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1442                 appendToASCIIBuffer("://", 3);
1443                 if (m_urlIsSpecial)
1444                     state = State::SpecialAuthorityIgnoreSlashes;
1445                 else {
1446                     m_url.m_userStart = currentPosition(c);
1447                     state = State::AuthorityOrHost;
1448                     authorityOrHostBegin = c;
1449                 }
1450             } else {
1451                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1452                 appendToASCIIBuffer('/');
1453                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1454                 state = State::Path;
1455             }
1456             break;
1457         case State::SpecialAuthoritySlashes:
1458             LOG_STATE("SpecialAuthoritySlashes");
1459             if (LIKELY(*c == '/' || *c == '\\')) {
1460                 if (UNLIKELY(*c == '\\'))
1461                     syntaxViolation(c);
1462                 appendToASCIIBuffer('/');
1463                 advance(c);
1464                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1465                     if (UNLIKELY(*c == '\\'))
1466                         syntaxViolation(c);
1467                     ++c;
1468                     appendToASCIIBuffer('/');
1469                 } else {
1470                     syntaxViolation(c);
1471                     appendToASCIIBuffer('/');
1472                 }
1473             } else {
1474                 syntaxViolation(c);
1475                 appendToASCIIBuffer("//", 2);
1476             }
1477             state = State::SpecialAuthorityIgnoreSlashes;
1478             break;
1479         case State::SpecialAuthorityIgnoreSlashes:
1480             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1481             if (*c == '/' || *c == '\\') {
1482                 syntaxViolation(c);
1483                 ++c;
1484             } else {
1485                 m_url.m_userStart = currentPosition(c);
1486                 state = State::AuthorityOrHost;
1487                 authorityOrHostBegin = c;
1488             }
1489             break;
1490         case State::AuthorityOrHost:
1491             do {
1492                 LOG_STATE("AuthorityOrHost");
1493                 if (*c == '@') {
1494                     auto lastAt = c;
1495                     auto findLastAt = c;
1496                     while (!findLastAt.atEnd()) {
1497                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1498                         if (*findLastAt == '@')
1499                             lastAt = findLastAt;
1500                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1501                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1502                             break;
1503                         ++findLastAt;
1504                     }
1505                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1506                     c = lastAt;
1507                     advance(c);
1508                     authorityOrHostBegin = c;
1509                     state = State::Host;
1510                     m_hostHasPercentOrNonASCII = false;
1511                     break;
1512                 }
1513                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1514                 if (isSlash || *c == '?' || *c == '#') {
1515                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1516                     if (iterator.atEnd()) {
1517                         if (m_urlIsSpecial)
1518                             return failure();
1519                         m_url.m_userEnd = currentPosition(c);
1520                         m_url.m_passwordEnd = m_url.m_userEnd;
1521                         m_url.m_hostEnd = m_url.m_userEnd;
1522                         m_url.m_portEnd = m_url.m_userEnd;
1523                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1524                     } else {
1525                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1526                         m_url.m_passwordEnd = m_url.m_userEnd;
1527                         if (!parseHostAndPort(iterator)) {
1528                             failure();
1529                             return;
1530                         }
1531                         if (UNLIKELY(!isSlash)) {
1532                             if (m_urlIsSpecial) {
1533                                 syntaxViolation(c);
1534                                 appendToASCIIBuffer('/');
1535                             }
1536                             m_url.m_pathAfterLastSlash = currentPosition(c);
1537                         }
1538                     }
1539                     state = State::Path;
1540                     break;
1541                 }
1542                 if (isPercentOrNonASCII(*c))
1543                     m_hostHasPercentOrNonASCII = true;
1544                 ++c;
1545             } while (!c.atEnd());
1546             break;
1547         case State::Host:
1548             do {
1549                 LOG_STATE("Host");
1550                 if (*c == '/' || *c == '?' || *c == '#') {
1551                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1552                         failure();
1553                         return;
1554                     }
1555                     if (*c == '?' || *c == '#') {
1556                         syntaxViolation(c);
1557                         appendToASCIIBuffer('/');
1558                         m_url.m_pathAfterLastSlash = currentPosition(c);
1559                     }
1560                     state = State::Path;
1561                     break;
1562                 }
1563                 if (isPercentOrNonASCII(*c))
1564                     m_hostHasPercentOrNonASCII = true;
1565                 ++c;
1566             } while (!c.atEnd());
1567             break;
1568         case State::File:
1569             LOG_STATE("File");
1570             switch (*c) {
1571             case '\\':
1572                 syntaxViolation(c);
1573                 FALLTHROUGH;
1574             case '/':
1575                 appendToASCIIBuffer('/');
1576                 state = State::FileSlash;
1577                 ++c;
1578                 break;
1579             case '?':
1580                 syntaxViolation(c);
1581                 if (base.isValid() && base.protocolIs("file")) {
1582                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1583                     appendToASCIIBuffer('?');
1584                     ++c;
1585                 } else {
1586                     appendToASCIIBuffer("///?", 4);
1587                     ++c;
1588                     m_url.m_userStart = currentPosition(c) - 2;
1589                     m_url.m_userEnd = m_url.m_userStart;
1590                     m_url.m_passwordEnd = m_url.m_userStart;
1591                     m_url.m_hostEnd = m_url.m_userStart;
1592                     m_url.m_portEnd = m_url.m_userStart;
1593                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1594                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1595                 }
1596                 if (isUTF8Encoding)
1597                     state = State::UTF8Query;
1598                 else {
1599                     queryBegin = c;
1600                     state = State::NonUTF8Query;
1601                 }
1602                 break;
1603             case '#':
1604                 syntaxViolation(c);
1605                 if (base.isValid() && base.protocolIs("file")) {
1606                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1607                     appendToASCIIBuffer('#');
1608                 } else {
1609                     appendToASCIIBuffer("///#", 4);
1610                     m_url.m_userStart = currentPosition(c) - 2;
1611                     m_url.m_userEnd = m_url.m_userStart;
1612                     m_url.m_passwordEnd = m_url.m_userStart;
1613                     m_url.m_hostEnd = m_url.m_userStart;
1614                     m_url.m_portEnd = m_url.m_userStart;
1615                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1616                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1617                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1618                 }
1619                 state = State::Fragment;
1620                 ++c;
1621                 break;
1622             default:
1623                 syntaxViolation(c);
1624                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1625                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1626                 else {
1627                     appendToASCIIBuffer("///", 3);
1628                     m_url.m_userStart = currentPosition(c) - 1;
1629                     m_url.m_userEnd = m_url.m_userStart;
1630                     m_url.m_passwordEnd = m_url.m_userStart;
1631                     m_url.m_hostEnd = m_url.m_userStart;
1632                     m_url.m_portEnd = m_url.m_userStart;
1633                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1634                     if (isWindowsDriveLetter(c))
1635                         appendWindowsDriveLetter(c);
1636                 }
1637                 state = State::Path;
1638                 break;
1639             }
1640             break;
1641         case State::FileSlash:
1642             LOG_STATE("FileSlash");
1643             if (LIKELY(*c == '/' || *c == '\\')) {
1644                 if (UNLIKELY(*c == '\\'))
1645                     syntaxViolation(c);
1646                 appendToASCIIBuffer('/');
1647                 advance(c);
1648                 m_url.m_userStart = currentPosition(c);
1649                 m_url.m_userEnd = m_url.m_userStart;
1650                 m_url.m_passwordEnd = m_url.m_userStart;
1651                 m_url.m_hostEnd = m_url.m_userStart;
1652                 m_url.m_portEnd = m_url.m_userStart;
1653                 authorityOrHostBegin = c;
1654                 state = State::FileHost;
1655                 break;
1656             }
1657             syntaxViolation(c);
1658             appendToASCIIBuffer("//", 2);
1659             m_url.m_userStart = currentPosition(c) - 1;
1660             m_url.m_userEnd = m_url.m_userStart;
1661             m_url.m_passwordEnd = m_url.m_userStart;
1662             m_url.m_hostEnd = m_url.m_userStart;
1663             m_url.m_portEnd = m_url.m_userStart;
1664             if (isWindowsDriveLetter(c)) {
1665                 appendWindowsDriveLetter(c);
1666                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1667             } else if (copyBaseWindowsDriveLetter(base)) {
1668                 appendToASCIIBuffer('/');
1669                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1670             } else
1671                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1672             state = State::Path;
1673             break;
1674         case State::FileHost:
1675             do {
1676                 LOG_STATE("FileHost");
1677                 if (isSlashQuestionOrHash(*c)) {
1678                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1679                         && isWindowsDriveLetter(authorityOrHostBegin);
1680                     if (windowsQuirk) {
1681                         syntaxViolation(authorityOrHostBegin);
1682                         appendToASCIIBuffer('/');
1683                         appendWindowsDriveLetter(authorityOrHostBegin);
1684                     }
1685                     if (windowsQuirk || authorityOrHostBegin == c) {
1686                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1687                         if (UNLIKELY(*c == '?')) {
1688                             syntaxViolation(c);
1689                             appendToASCIIBuffer("/?", 2);
1690                             ++c;
1691                             if (isUTF8Encoding)
1692                                 state = State::UTF8Query;
1693                             else {
1694                                 queryBegin = c;
1695                                 state = State::NonUTF8Query;
1696                             }
1697                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1698                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1699                             break;
1700                         }
1701                         if (UNLIKELY(*c == '#')) {
1702                             syntaxViolation(c);
1703                             appendToASCIIBuffer("/#", 2);
1704                             ++c;
1705                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1706                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1707                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1708                             state = State::Fragment;
1709                             break;
1710                         }
1711                         state = State::Path;
1712                         break;
1713                     }
1714                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1715                         failure();
1716                         return;
1717                     }
1718                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1719                         syntaxViolation(c);
1720                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1721                         m_url.m_hostEnd = currentPosition(c);
1722                         m_url.m_portEnd = m_url.m_hostEnd;
1723                     }
1724                     
1725                     state = State::PathStart;
1726                     break;
1727                 }
1728                 if (isPercentOrNonASCII(*c))
1729                     m_hostHasPercentOrNonASCII = true;
1730                 ++c;
1731             } while (!c.atEnd());
1732             break;
1733         case State::PathStart:
1734             LOG_STATE("PathStart");
1735             if (*c != '/' && *c != '\\')
1736                 ++c;
1737             state = State::Path;
1738             break;
1739         case State::Path:
1740             LOG_STATE("Path");
1741             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1742                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1743                     syntaxViolation(c);
1744                 appendToASCIIBuffer('/');
1745                 ++c;
1746                 m_url.m_pathAfterLastSlash = currentPosition(c);
1747                 break;
1748             }
1749             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1750                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1751                     syntaxViolation(c);
1752                     consumeDoubleDotPathSegment(c);
1753                     popPath();
1754                     break;
1755                 }
1756                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1757                     syntaxViolation(c);
1758                     consumeSingleDotPathSegment(c);
1759                     break;
1760                 }
1761             }
1762             if (*c == '?') {
1763                 m_url.m_pathEnd = currentPosition(c);
1764                 appendToASCIIBuffer('?');
1765                 ++c;
1766                 if (isUTF8Encoding)
1767                     state = State::UTF8Query;
1768                 else {
1769                     queryBegin = c;
1770                     state = State::NonUTF8Query;
1771                 }
1772                 break;
1773             }
1774             if (*c == '#') {
1775                 m_url.m_pathEnd = currentPosition(c);
1776                 m_url.m_queryEnd = m_url.m_pathEnd;
1777                 state = State::Fragment;
1778                 break;
1779             }
1780             utf8PercentEncode<isInDefaultEncodeSet>(c);
1781             ++c;
1782             break;
1783         case State::CannotBeABaseURLPath:
1784             LOG_STATE("CannotBeABaseURLPath");
1785             if (*c == '?') {
1786                 m_url.m_pathEnd = currentPosition(c);
1787                 appendToASCIIBuffer('?');
1788                 ++c;
1789                 if (isUTF8Encoding)
1790                     state = State::UTF8Query;
1791                 else {
1792                     queryBegin = c;
1793                     state = State::NonUTF8Query;
1794                 }
1795             } else if (*c == '#') {
1796                 m_url.m_pathEnd = currentPosition(c);
1797                 m_url.m_queryEnd = m_url.m_pathEnd;
1798                 state = State::Fragment;
1799             } else if (*c == '/') {
1800                 appendToASCIIBuffer('/');
1801                 ++c;
1802                 m_url.m_pathAfterLastSlash = currentPosition(c);
1803             } else {
1804                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1805                 ++c;
1806             }
1807             break;
1808         case State::UTF8Query:
1809             LOG_STATE("UTF8Query");
1810             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1811             if (*c == '#') {
1812                 m_url.m_queryEnd = currentPosition(c);
1813                 state = State::Fragment;
1814                 break;
1815             }
1816             if (isUTF8Encoding)
1817                 utf8QueryEncode(c);
1818             else
1819                 appendCodePoint(queryBuffer, *c);
1820             ++c;
1821             break;
1822         case State::NonUTF8Query:
1823             do {
1824                 LOG_STATE("NonUTF8Query");
1825                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1826                 if (*c == '#') {
1827                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1828                     m_url.m_queryEnd = currentPosition(c);
1829                     state = State::Fragment;
1830                     break;
1831                 }
1832                 appendCodePoint(queryBuffer, *c);
1833                 advance(c, queryBegin);
1834             } while (!c.atEnd());
1835             break;
1836         case State::Fragment:
1837             URL_PARSER_LOG("State Fragment");
1838             utf8PercentEncode<isInSimpleEncodeSet>(c);
1839             ++c;
1840             break;
1841         }
1842     }
1843
1844     switch (state) {
1845     case State::SchemeStart:
1846         LOG_FINAL_STATE("SchemeStart");
1847         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1848             m_url = base;
1849             m_url.removeFragmentIdentifier();
1850             return;
1851         }
1852         failure();
1853         return;
1854     case State::Scheme:
1855         LOG_FINAL_STATE("Scheme");
1856         failure();
1857         return;
1858     case State::NoScheme:
1859         LOG_FINAL_STATE("NoScheme");
1860         RELEASE_ASSERT_NOT_REACHED();
1861     case State::SpecialRelativeOrAuthority:
1862         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1863         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1864         m_url.m_fragmentEnd = m_url.m_queryEnd;
1865         break;
1866     case State::PathOrAuthority:
1867         LOG_FINAL_STATE("PathOrAuthority");
1868         ASSERT(m_url.m_userStart);
1869         ASSERT(m_url.m_userStart == currentPosition(c));
1870         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1871         m_url.m_userStart--;
1872         m_url.m_userEnd = m_url.m_userStart;
1873         m_url.m_passwordEnd = m_url.m_userStart;
1874         m_url.m_hostEnd = m_url.m_userStart;
1875         m_url.m_portEnd = m_url.m_userStart;
1876         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1877         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1878         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1879         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1880         break;
1881     case State::Relative:
1882         LOG_FINAL_STATE("Relative");
1883         RELEASE_ASSERT_NOT_REACHED();
1884     case State::RelativeSlash:
1885         LOG_FINAL_STATE("RelativeSlash");
1886         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1887         appendToASCIIBuffer('/');
1888         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1889         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1890         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1891         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1892         break;
1893     case State::SpecialAuthoritySlashes:
1894         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1895         m_url.m_userStart = currentPosition(c);
1896         m_url.m_userEnd = m_url.m_userStart;
1897         m_url.m_passwordEnd = m_url.m_userStart;
1898         m_url.m_hostEnd = m_url.m_userStart;
1899         m_url.m_portEnd = m_url.m_userStart;
1900         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1901         m_url.m_pathEnd = m_url.m_userStart;
1902         m_url.m_queryEnd = m_url.m_userStart;
1903         m_url.m_fragmentEnd = m_url.m_userStart;
1904         break;
1905     case State::SpecialAuthorityIgnoreSlashes:
1906         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1907         failure();
1908         return;
1909     case State::AuthorityOrHost:
1910         LOG_FINAL_STATE("AuthorityOrHost");
1911         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1912         m_url.m_passwordEnd = m_url.m_userEnd;
1913         if (authorityOrHostBegin.atEnd()) {
1914             m_url.m_userEnd = m_url.m_userStart;
1915             m_url.m_passwordEnd = m_url.m_userStart;
1916             m_url.m_hostEnd = m_url.m_userStart;
1917             m_url.m_portEnd = m_url.m_userStart;
1918             m_url.m_pathEnd = m_url.m_userStart;
1919         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1920             failure();
1921             return;
1922         } else {
1923             if (m_urlIsSpecial) {
1924                 syntaxViolation(c);
1925                 appendToASCIIBuffer('/');
1926                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1927             } else
1928                 m_url.m_pathEnd = m_url.m_portEnd;
1929         }
1930         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1931         m_url.m_queryEnd = m_url.m_pathEnd;
1932         m_url.m_fragmentEnd = m_url.m_pathEnd;
1933         break;
1934     case State::Host:
1935         LOG_FINAL_STATE("Host");
1936         if (!parseHostAndPort(authorityOrHostBegin)) {
1937             failure();
1938             return;
1939         }
1940         if (m_urlIsSpecial) {
1941             syntaxViolation(c);
1942             appendToASCIIBuffer('/');
1943             m_url.m_pathEnd = m_url.m_portEnd + 1;
1944         } else
1945             m_url.m_pathEnd = m_url.m_portEnd;
1946         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1947         m_url.m_queryEnd = m_url.m_pathEnd;
1948         m_url.m_fragmentEnd = m_url.m_pathEnd;
1949         break;
1950     case State::File:
1951         LOG_FINAL_STATE("File");
1952         if (base.isValid() && base.protocolIs("file")) {
1953             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1954             m_url.m_fragmentEnd = m_url.m_queryEnd;
1955             break;
1956         }
1957         syntaxViolation(c);
1958         appendToASCIIBuffer("///", 3);
1959         m_url.m_userStart = currentPosition(c) - 1;
1960         m_url.m_userEnd = m_url.m_userStart;
1961         m_url.m_passwordEnd = m_url.m_userStart;
1962         m_url.m_hostEnd = m_url.m_userStart;
1963         m_url.m_portEnd = m_url.m_userStart;
1964         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1965         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1966         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1967         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1968         break;
1969     case State::FileSlash:
1970         LOG_FINAL_STATE("FileSlash");
1971         syntaxViolation(c);
1972         m_url.m_userStart = currentPosition(c) + 1;
1973         appendToASCIIBuffer("//", 2);
1974         m_url.m_userEnd = m_url.m_userStart;
1975         m_url.m_passwordEnd = m_url.m_userStart;
1976         m_url.m_hostEnd = m_url.m_userStart;
1977         m_url.m_portEnd = m_url.m_userStart;
1978         if (copyBaseWindowsDriveLetter(base)) {
1979             appendToASCIIBuffer('/');
1980             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1981         } else
1982             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1983         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1984         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1985         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1986         break;
1987     case State::FileHost:
1988         LOG_FINAL_STATE("FileHost");
1989         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1990             && isWindowsDriveLetter(authorityOrHostBegin)) {
1991             syntaxViolation(authorityOrHostBegin);
1992             appendToASCIIBuffer('/');
1993             appendWindowsDriveLetter(authorityOrHostBegin);
1994             m_url.m_pathAfterLastSlash = currentPosition(c);
1995             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1996             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1997             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1998             break;
1999         }
2000         
2001         if (authorityOrHostBegin == c) {
2002             syntaxViolation(c);
2003             appendToASCIIBuffer('/');
2004             m_url.m_userStart = currentPosition(c) - 1;
2005             m_url.m_userEnd = m_url.m_userStart;
2006             m_url.m_passwordEnd = m_url.m_userStart;
2007             m_url.m_hostEnd = m_url.m_userStart;
2008             m_url.m_portEnd = m_url.m_userStart;
2009             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
2010             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2011             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2012             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
2013             break;
2014         }
2015
2016         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2017             failure();
2018             return;
2019         }
2020
2021         syntaxViolation(c);
2022         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2023             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2024             m_url.m_hostEnd = currentPosition(c);
2025             m_url.m_portEnd = m_url.m_hostEnd;
2026         }
2027         appendToASCIIBuffer('/');
2028         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
2029         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2030         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2031         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
2032         break;
2033     case State::PathStart:
2034         LOG_FINAL_STATE("PathStart");
2035         RELEASE_ASSERT_NOT_REACHED();
2036     case State::Path:
2037         LOG_FINAL_STATE("Path");
2038         m_url.m_pathEnd = currentPosition(c);
2039         m_url.m_queryEnd = m_url.m_pathEnd;
2040         m_url.m_fragmentEnd = m_url.m_pathEnd;
2041         break;
2042     case State::CannotBeABaseURLPath:
2043         LOG_FINAL_STATE("CannotBeABaseURLPath");
2044         m_url.m_pathEnd = currentPosition(c);
2045         m_url.m_queryEnd = m_url.m_pathEnd;
2046         m_url.m_fragmentEnd = m_url.m_pathEnd;
2047         break;
2048     case State::UTF8Query:
2049         LOG_FINAL_STATE("UTF8Query");
2050         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2051         m_url.m_queryEnd = currentPosition(c);
2052         m_url.m_fragmentEnd = m_url.m_queryEnd;
2053         break;
2054     case State::NonUTF8Query:
2055         LOG_FINAL_STATE("NonUTF8Query");
2056         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2057         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2058         m_url.m_queryEnd = currentPosition(c);
2059         m_url.m_fragmentEnd = m_url.m_queryEnd;
2060         break;
2061     case State::Fragment:
2062         LOG_FINAL_STATE("Fragment");
2063         m_url.m_fragmentEnd = currentPosition(c);
2064         break;
2065     }
2066
2067     if (LIKELY(!m_didSeeSyntaxViolation)) {
2068         m_url.m_string = m_inputString;
2069         ASSERT(m_asciiBuffer.isEmpty());
2070     } else
2071         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2072     m_url.m_isValid = true;
2073     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2074 }
2075
2076 template<typename CharacterType>
2077 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2078 {
2079     if (UNLIKELY(iterator.atEnd())) {
2080         syntaxViolation(iterator);
2081         m_url.m_userEnd = currentPosition(iterator);
2082         m_url.m_passwordEnd = m_url.m_userEnd;
2083         return;
2084     }
2085     for (; !iterator.atEnd(); advance(iterator)) {
2086         if (*iterator == ':') {
2087             m_url.m_userEnd = currentPosition(iterator);
2088             auto iteratorAtColon = iterator;
2089             ++iterator;
2090             bool tabOrNewlineAfterColon = false;
2091             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2092                 tabOrNewlineAfterColon = true;
2093                 ++iterator;
2094             }
2095             if (UNLIKELY(iterator.atEnd())) {
2096                 syntaxViolation(iteratorAtColon);
2097                 m_url.m_passwordEnd = m_url.m_userEnd;
2098                 if (m_url.m_userEnd > m_url.m_userStart)
2099                     appendToASCIIBuffer('@');
2100                 return;
2101             }
2102             if (tabOrNewlineAfterColon)
2103                 syntaxViolation(iteratorAtColon);
2104             appendToASCIIBuffer(':');
2105             break;
2106         }
2107         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2108     }
2109     for (; !iterator.atEnd(); advance(iterator))
2110         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2111     m_url.m_passwordEnd = currentPosition(iterator);
2112     if (!m_url.m_userEnd)
2113         m_url.m_userEnd = m_url.m_passwordEnd;
2114     appendToASCIIBuffer('@');
2115 }
2116
2117 template<typename UnsignedIntegerType>
2118 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2119 {
2120     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2121     LChar* end = std::end(buf);
2122     LChar* p = end;
2123     do {
2124         *--p = (number % 10) + '0';
2125         number /= 10;
2126     } while (number);
2127     appendToASCIIBuffer(p, end - p);
2128 }
2129
2130 void URLParser::serializeIPv4(IPv4Address address)
2131 {
2132     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2133     appendToASCIIBuffer('.');
2134     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2135     appendToASCIIBuffer('.');
2136     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2137     appendToASCIIBuffer('.');
2138     appendNumberToASCIIBuffer<uint8_t>(address);
2139 }
2140     
// Returns how many consecutive zero pieces of |address| start at index |begin|.
// Yields 0 when the piece at |begin| is non-zero or |begin| is past the last piece.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2150
2151 static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2152 {
2153     std::optional<size_t> longest;
2154     size_t longestLength = 0;
2155     for (size_t i = 0; i < 8; i++) {
2156         size_t length = zeroSequenceLength(address, i);
2157         if (length) {
2158             if (length > 1 && (!longest || longestLength < length)) {
2159                 longest = i;
2160                 longestLength = length;
2161             }
2162             i += length;
2163         }
2164     }
2165     return longest;
2166 }
2167
2168 void URLParser::serializeIPv6Piece(uint16_t piece)
2169 {
2170     bool printed = false;
2171     if (auto nibble0 = piece >> 12) {
2172         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2173         printed = true;
2174     }
2175     auto nibble1 = piece >> 8 & 0xF;
2176     if (printed || nibble1) {
2177         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2178         printed = true;
2179     }
2180     auto nibble2 = piece >> 4 & 0xF;
2181     if (printed || nibble2)
2182         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2183     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2184 }
2185
2186 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2187 {
2188     appendToASCIIBuffer('[');
2189     auto compressPointer = findLongestZeroSequence(address);
2190     for (size_t piece = 0; piece < 8; piece++) {
2191         if (compressPointer && compressPointer.value() == piece) {
2192             ASSERT(!address[piece]);
2193             if (piece)
2194                 appendToASCIIBuffer(':');
2195             else
2196                 appendToASCIIBuffer("::", 2);
2197             while (piece < 8 && !address[piece])
2198                 piece++;
2199             if (piece == 8)
2200                 break;
2201         }
2202         serializeIPv6Piece(address[piece]);
2203         if (piece < 7)
2204             appendToASCIIBuffer(':');
2205     }
2206     appendToASCIIBuffer(']');
2207 }
2208
2209 enum class URLParser::IPv4PieceParsingError {
2210     Failure,
2211     Overflow,
2212 };
2213
2214 template<typename CharacterType>
2215 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2216 {
2217     enum class State : uint8_t {
2218         UnknownBase,
2219         Decimal,
2220         OctalOrHex,
2221         Octal,
2222         Hex,
2223     };
2224     State state = State::UnknownBase;
2225     Checked<uint32_t, RecordOverflow> value = 0;
2226     if (!iterator.atEnd() && *iterator == '.')
2227         return makeUnexpected(IPv4PieceParsingError::Failure);
2228     while (!iterator.atEnd()) {
2229         if (isTabOrNewline(*iterator)) {
2230             didSeeSyntaxViolation = true;
2231             ++iterator;
2232             continue;
2233         }
2234         if (*iterator == '.') {
2235             ASSERT(!value.hasOverflowed());
2236             return value.unsafeGet();
2237         }
2238         switch (state) {
2239         case State::UnknownBase:
2240             if (UNLIKELY(*iterator == '0')) {
2241                 ++iterator;
2242                 state = State::OctalOrHex;
2243                 break;
2244             }
2245             state = State::Decimal;
2246             break;
2247         case State::OctalOrHex:
2248             didSeeSyntaxViolation = true;
2249             if (*iterator == 'x' || *iterator == 'X') {
2250                 ++iterator;
2251                 state = State::Hex;
2252                 break;
2253             }
2254             state = State::Octal;
2255             break;
2256         case State::Decimal:
2257             if (!isASCIIDigit(*iterator))
2258                 return makeUnexpected(IPv4PieceParsingError::Failure);
2259             value *= 10;
2260             value += *iterator - '0';
2261             if (UNLIKELY(value.hasOverflowed()))
2262                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2263             ++iterator;
2264             break;
2265         case State::Octal:
2266             ASSERT(didSeeSyntaxViolation);
2267             if (*iterator < '0' || *iterator > '7')
2268                 return makeUnexpected(IPv4PieceParsingError::Failure);
2269             value *= 8;
2270             value += *iterator - '0';
2271             if (UNLIKELY(value.hasOverflowed()))
2272                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2273             ++iterator;
2274             break;
2275         case State::Hex:
2276             ASSERT(didSeeSyntaxViolation);
2277             if (!isASCIIHexDigit(*iterator))
2278                 return makeUnexpected(IPv4PieceParsingError::Failure);
2279             value *= 16;
2280             value += toASCIIHexValue(*iterator);
2281             if (UNLIKELY(value.hasOverflowed()))
2282                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2283             ++iterator;
2284             break;
2285         }
2286     }
2287     ASSERT(!value.hasOverflowed());
2288     return value.unsafeGet();
2289 }
2290
2291 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2292 {
2293     RELEASE_ASSERT(exponent <= 4);
2294     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2295     return values[exponent];
2296 }
2297
2298 enum class URLParser::IPv4ParsingError {
2299     Failure,
2300     NotIPv4,
2301 };
2302
2303 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2304 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2305 {
2306     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2307     bool didSeeSyntaxViolation = false;
2308     if (!iterator.atEnd() && *iterator == '.')
2309         return makeUnexpected(IPv4ParsingError::NotIPv4);
2310     while (!iterator.atEnd()) {
2311         if (isTabOrNewline(*iterator)) {
2312             didSeeSyntaxViolation = true;
2313             ++iterator;
2314             continue;
2315         }
2316         if (items.size() >= 4)
2317             return makeUnexpected(IPv4ParsingError::NotIPv4);
2318         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2319         if (!iterator.atEnd() && *iterator == '.') {
2320             ++iterator;
2321             if (iterator.atEnd())
2322                 syntaxViolation(iteratorForSyntaxViolationPosition);
2323             else if (*iterator == '.')
2324                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2325         }
2326     }
2327     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2328         return makeUnexpected(IPv4ParsingError::NotIPv4);
2329     for (const auto& item : items) {
2330         if (!item.hasValue() && item.error() == IPv4PieceParsingError::Failure)
2331             return makeUnexpected(IPv4ParsingError::NotIPv4);
2332     }
2333     for (const auto& item : items) {
2334         if (!item.hasValue() && item.error() == IPv4PieceParsingError::Overflow)
2335             return makeUnexpected(IPv4ParsingError::Failure);
2336     }
2337     if (items.size() > 1) {
2338         for (size_t i = 0; i < items.size() - 1; i++) {
2339             if (items[i].value() > 255)
2340                 return makeUnexpected(IPv4ParsingError::Failure);
2341         }
2342     }
2343     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2344         return makeUnexpected(IPv4ParsingError::Failure);
2345
2346     if (didSeeSyntaxViolation)
2347         syntaxViolation(iteratorForSyntaxViolationPosition);
2348     for (const auto& item : items) {
2349         if (item.value() > 255)
2350             syntaxViolation(iteratorForSyntaxViolationPosition);
2351     }
2352
2353     if (UNLIKELY(items.size() != 4))
2354         syntaxViolation(iteratorForSyntaxViolationPosition);
2355
2356     IPv4Address ipv4 = items.takeLast().value();
2357     for (size_t counter = 0; counter < items.size(); ++counter)
2358         ipv4 += items[counter].value() * pow256(3 - counter);
2359     return ipv4;
2360 }
2361
2362 template<typename CharacterType>
2363 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2364 {
2365     if (iterator.atEnd())
2366         return std::nullopt;
2367     uint32_t piece = 0;
2368     bool leadingZeros = false;
2369     size_t digitCount = 0;
2370     while (!iterator.atEnd()) {
2371         if (!isASCIIDigit(*iterator))
2372             return std::nullopt;
2373         ++digitCount;
2374         if (!piece && *iterator == '0') {
2375             if (leadingZeros)
2376                 return std::nullopt;
2377             leadingZeros = true;
2378         }
2379         if (!piece && *iterator == '0')
2380             leadingZeros = true;
2381         piece = piece * 10 + *iterator - '0';
2382         if (piece > 255)
2383             return std::nullopt;
2384         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2385         if (iterator.atEnd())
2386             break;
2387         if (*iterator == '.')
2388             break;
2389     }
2390     if (piece && leadingZeros)
2391         return std::nullopt;
2392     return piece;
2393 }
2394
2395 template<typename CharacterType>
2396 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2397 {
2398     IPv4Address address = 0;
2399     for (size_t i = 0; i < 4; ++i) {
2400         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2401             address = (address << 8) + piece.value();
2402         else
2403             return std::nullopt;
2404         if (i < 3) {
2405             if (iterator.atEnd())
2406                 return std::nullopt;
2407             if (*iterator != '.')
2408                 return std::nullopt;
2409             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2410         } else if (!iterator.atEnd())
2411             return std::nullopt;
2412     }
2413     ASSERT(iterator.atEnd());
2414     return address;
2415 }
2416
2417 template<typename CharacterType>
2418 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2419 {
2420     ASSERT(*c == '[');
2421     const auto hostBegin = c;
2422     advance(c, hostBegin);
2423     if (c.atEnd())
2424         return std::nullopt;
2425
2426     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2427     size_t piecePointer = 0;
2428     std::optional<size_t> compressPointer;
2429
2430     if (*c == ':') {
2431         advance(c, hostBegin);
2432         if (c.atEnd())
2433             return std::nullopt;
2434         if (*c != ':')
2435             return std::nullopt;
2436         advance(c, hostBegin);
2437         ++piecePointer;
2438         compressPointer = piecePointer;
2439     }
2440     
2441     while (!c.atEnd()) {
2442         if (piecePointer == 8)
2443             return std::nullopt;
2444         if (*c == ':') {
2445             if (compressPointer)
2446                 return std::nullopt;
2447             advance(c, hostBegin);
2448             ++piecePointer;
2449             compressPointer = piecePointer;
2450             continue;
2451         }
2452         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2453             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2454                 if (compressPointer && piecePointer == 5)
2455                     return std::nullopt;
2456                 syntaxViolation(hostBegin);
2457                 address[piecePointer++] = ipv4Address.value() >> 16;
2458                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2459                 c = { };
2460                 break;
2461             }
2462         }
2463         uint16_t value = 0;
2464         size_t length = 0;
2465         bool leadingZeros = false;
2466         for (; length < 4; length++) {
2467             if (c.atEnd())
2468                 break;
2469             if (!isASCIIHexDigit(*c))
2470                 break;
2471             if (isASCIIUpper(*c))
2472                 syntaxViolation(hostBegin);
2473             if (*c == '0' && !length)
2474                 leadingZeros = true;
2475             value = value * 0x10 + toASCIIHexValue(*c);
2476             advance(c, hostBegin);
2477         }
2478         
2479         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2480             syntaxViolation(hostBegin);
2481
2482         address[piecePointer++] = value;
2483         if (c.atEnd())
2484             break;
2485         if (piecePointer == 8 || *c != ':')
2486             return std::nullopt;
2487         advance(c, hostBegin);
2488     }
2489     
2490     if (!c.atEnd())
2491         return std::nullopt;
2492     
2493     if (compressPointer) {
2494         size_t swaps = piecePointer - compressPointer.value();
2495         piecePointer = 7;
2496         while (swaps)
2497             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2498     } else if (piecePointer != 8)
2499         return std::nullopt;
2500
2501     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2502     if (possibleCompressPointer)
2503         possibleCompressPointer.value()++;
2504     if (UNLIKELY(compressPointer != possibleCompressPointer))
2505         syntaxViolation(hostBegin);
2506     
2507     return address;
2508 }
2509
2510 template<typename CharacterType>
2511 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2512 {
2513     Vector<LChar, defaultInlineBufferSize> output;
2514     output.reserveInitialCapacity(length);
2515     
2516     for (size_t i = 0; i < length; ++i) {
2517         uint8_t byte = input[i];
2518         if (byte != '%')
2519             output.uncheckedAppend(byte);
2520         else if (length > 2 && i < length - 2) {
2521             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2522                 syntaxViolation(iteratorForSyntaxViolationPosition);
2523                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2524                 i += 2;
2525             } else
2526                 output.uncheckedAppend(byte);
2527         } else
2528             output.uncheckedAppend(byte);
2529     }
2530     return output;
2531 }
2532     
2533 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2534 {
2535     Vector<LChar, defaultInlineBufferSize> output;
2536     output.reserveInitialCapacity(length);
2537     
2538     for (size_t i = 0; i < length; ++i) {
2539         uint8_t byte = input[i];
2540         if (byte != '%')
2541             output.uncheckedAppend(byte);
2542         else if (length > 2 && i < length - 2) {
2543             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2544                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2545                 i += 2;
2546             } else
2547                 output.uncheckedAppend(byte);
2548         } else
2549             output.uncheckedAppend(byte);
2550     }
2551     return output;
2552 }
2553
2554 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2555 {
2556     ASSERT(!string.isNull());
2557     if (string.is8Bit())
2558         return charactersAreAllASCII(string.characters8(), string.length());
2559     return charactersAreAllASCII(string.characters16(), string.length());
2560 }
2561
2562 template<typename CharacterType>
2563 std::optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2564 {
2565     Vector<LChar, defaultInlineBufferSize> ascii;
2566     if (containsOnlyASCII(domain)) {
2567         size_t length = domain.length();
2568         if (domain.is8Bit()) {
2569             const LChar* characters = domain.characters8();
2570             ascii.reserveInitialCapacity(length);
2571             for (size_t i = 0; i < length; ++i) {
2572                 if (UNLIKELY(isASCIIUpper(characters[i])))
2573                     syntaxViolation(iteratorForSyntaxViolationPosition);
2574                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2575             }
2576         } else {
2577             const UChar* characters = domain.characters16();
2578             ascii.reserveInitialCapacity(length);
2579             for (size_t i = 0; i < length; ++i) {
2580                 if (UNLIKELY(isASCIIUpper(characters[i])))
2581                     syntaxViolation(iteratorForSyntaxViolationPosition);
2582                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2583             }
2584         }
2585         return ascii;
2586     }
2587     
2588     UChar hostnameBuffer[defaultInlineBufferSize];
2589     UErrorCode error = U_ZERO_ERROR;
2590     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2591     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error);
2592     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2593
2594     if (U_SUCCESS(error) && !processingDetails.errors) {
2595         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2596             ASSERT(isASCII(hostnameBuffer[i]));
2597             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2598         }
2599         ascii.append(hostnameBuffer, numCharactersConverted);
2600         if (domain != StringView(ascii.data(), ascii.size()))
2601             syntaxViolation(iteratorForSyntaxViolationPosition);
2602         return ascii;
2603     }
2604
2605     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2606     return std::nullopt;
2607 }
2608
2609 bool URLParser::hasForbiddenHostCodePoint(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2610 {
2611     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2612         if (isForbiddenHostCodePoint(asciiDomain[i]))
2613             return true;
2614     }
2615     return false;
2616 }
2617
2618 template<typename CharacterType>
2619 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2620 {
2621     ASSERT(*iterator == ':');
2622     auto colonIterator = iterator;
2623     advance(iterator, colonIterator);
2624     uint32_t port = 0;
2625     if (UNLIKELY(iterator.atEnd())) {
2626         m_url.m_portEnd = currentPosition(colonIterator);
2627         syntaxViolation(colonIterator);
2628         return true;
2629     }
2630     size_t digitCount = 0;
2631     bool leadingZeros = false;
2632     for (; !iterator.atEnd(); ++iterator) {
2633         if (UNLIKELY(isTabOrNewline(*iterator))) {
2634             syntaxViolation(colonIterator);
2635             continue;
2636         }
2637         if (isASCIIDigit(*iterator)) {
2638             if (*iterator == '0' && !digitCount)
2639                 leadingZeros = true;
2640             ++digitCount;
2641             port = port * 10 + *iterator - '0';
2642             if (port > std::numeric_limits<uint16_t>::max())
2643                 return false;
2644         } else
2645             return false;
2646     }
2647
2648     if (port && leadingZeros)
2649         syntaxViolation(colonIterator);
2650     
2651     if (!port && digitCount > 1)
2652         syntaxViolation(colonIterator);
2653
2654     ASSERT(port == static_cast<uint16_t>(port));
2655     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2656         syntaxViolation(colonIterator);
2657     else {
2658         appendToASCIIBuffer(':');
2659         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2660         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2661     }
2662
2663     m_url.m_portEnd = currentPosition(iterator);
2664     return true;
2665 }
2666
2667 template<typename CharacterType>
2668 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2669 {
2670     if (iterator.atEnd())
2671         return false;
2672     if (*iterator == ':')
2673         return false;
2674     if (*iterator == '[') {
2675         auto ipv6End = iterator;
2676         while (!ipv6End.atEnd() && *ipv6End != ']')
2677             ++ipv6End;
2678         if (ipv6End.atEnd())
2679             return false;
2680         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2681             serializeIPv6(address.value());
2682             if (!ipv6End.atEnd()) {
2683                 advance(ipv6End);
2684                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2685                     m_url.m_hostEnd = currentPosition(ipv6End);
2686                     return parsePort(ipv6End);
2687                 }
2688                 m_url.m_hostEnd = currentPosition(ipv6End);
2689                 m_url.m_portEnd = m_url.m_hostEnd;
2690                 return true;
2691             }
2692             m_url.m_hostEnd = currentPosition(ipv6End);
2693             return true;
2694         }
2695         return false;
2696     }
2697
2698     if (!m_urlIsSpecial) {
2699         for (; !iterator.atEnd(); ++iterator) {
2700             if (UNLIKELY(isTabOrNewline(*iterator))) {
2701                 syntaxViolation(iterator);
2702                 continue;
2703             }
2704             if (*iterator == ':')
2705                 break;
2706             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2707                 return false;
2708             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2709         }
2710         m_url.m_hostEnd = currentPosition(iterator);
2711         if (iterator.atEnd()) {
2712             m_url.m_portEnd = currentPosition(iterator);
2713             return true;
2714         }
2715         return parsePort(iterator);
2716     }
2717     
2718     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2719         auto hostIterator = iterator;
2720         for (; !iterator.atEnd(); ++iterator) {
2721             if (isTabOrNewline(*iterator))
2722                 continue;
2723             if (*iterator == ':')
2724                 break;
2725             if (isForbiddenHostCodePoint(*iterator))
2726                 return false;
2727         }
2728         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2729         if (address) {
2730             serializeIPv4(address.value());
2731             m_url.m_hostEnd = currentPosition(iterator);
2732             if (iterator.atEnd()) {
2733                 m_url.m_portEnd = currentPosition(iterator);
2734                 return true;
2735             }
2736             return parsePort(iterator);
2737         }
2738         if (address.error() == IPv4ParsingError::Failure)
2739             return false;
2740         for (; hostIterator != iterator; ++hostIterator) {
2741             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2742                 syntaxViolation(hostIterator);
2743                 continue;
2744             }
2745             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2746                 syntaxViolation(hostIterator);
2747             appendToASCIIBuffer(toASCIILower(*hostIterator));
2748         }
2749         m_url.m_hostEnd = currentPosition(iterator);
2750         if (!hostIterator.atEnd())
2751             return parsePort(hostIterator);
2752         m_url.m_portEnd = currentPosition(iterator);
2753         return true;
2754     }
2755     
2756     const auto hostBegin = iterator;
2757     
2758     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2759     for (; !iterator.atEnd(); ++iterator) {
2760         if (UNLIKELY(isTabOrNewline(*iterator))) {
2761             syntaxViolation(hostBegin);
2762             continue;
2763         }
2764         if (*iterator == ':')
2765             break;
2766         if (UNLIKELY(!isASCII(*iterator)))
2767             syntaxViolation(hostBegin);
2768
2769         uint8_t buffer[U8_MAX_LENGTH];
2770         int32_t offset = 0;
2771         UBool error = false;
2772         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2773         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2774         // FIXME: Check error.
2775         utf8Encoded.append(buffer, offset);
2776     }
2777     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2778     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2779     if (domain.isNull())
2780         return false;
2781     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2782         syntaxViolation(hostBegin);
2783     auto asciiDomain = domainToASCII(domain, hostBegin);
2784     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2785         return false;
2786     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2787     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2788
2789     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2790     if (address) {
2791         serializeIPv4(address.value());
2792         m_url.m_hostEnd = currentPosition(iterator);
2793         if (iterator.atEnd()) {
2794             m_url.m_portEnd = currentPosition(iterator);
2795             return true;
2796         }
2797         return parsePort(iterator);
2798     }
2799     if (address.error() == IPv4ParsingError::Failure)
2800         return false;
2801
2802     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2803     m_url.m_hostEnd = currentPosition(iterator);
2804     if (!iterator.atEnd())
2805         return parsePort(iterator);
2806     m_url.m_portEnd = currentPosition(iterator);
2807     return true;
2808 }
2809
2810 std::optional<String> URLParser::formURLDecode(StringView input)
2811 {
2812     auto utf8 = input.utf8(StrictConversion);
2813     if (utf8.isNull())
2814         return std::nullopt;
2815     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2816     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2817 }
2818
2819 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2820 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2821 {
2822     URLEncodedForm output;
2823     for (StringView bytes : input.split('&')) {
2824         auto equalIndex = bytes.find('=');
2825         if (equalIndex == notFound) {
2826             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2827             if (name)
2828                 output.append({ name.value(), emptyString() });
2829         } else {
2830             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2831             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2832             if (name && value)
2833                 output.append({ name.value(), value.value() });
2834         }
2835     }
2836     return output;
2837 }
2838
2839 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2840 {
2841     auto utf8 = input.utf8(StrictConversion);
2842     const char* data = utf8.data();
2843     for (size_t i = 0; i < utf8.length(); ++i) {
2844         const char byte = data[i];
2845         if (byte == 0x20)
2846             output.append(0x2B);
2847         else if (byte == 0x2A
2848             || byte == 0x2D
2849             || byte == 0x2E
2850             || (byte >= 0x30 && byte <= 0x39)
2851             || (byte >= 0x41 && byte <= 0x5A)
2852             || byte == 0x5F
2853             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2854             output.append(byte);
2855         else
2856             percentEncodeByte(byte, output);
2857     }
2858 }
2859     
2860 String URLParser::serialize(const URLEncodedForm& tuples)
2861 {
2862     Vector<LChar> output;
2863     for (auto& tuple : tuples) {
2864         if (!output.isEmpty())
2865             output.append('&');
2866         serializeURLEncodedForm(tuple.key, output);
2867         output.append('=');
2868         serializeURLEncodedForm(tuple.value, output);
2869     }
2870     return String::adopt(WTFMove(output));
2871 }
2872
2873 const UIDNA& URLParser::internationalDomainNameTranscoder()
2874 {
2875     static UIDNA* encoder;
2876     static std::once_flag onceFlag;
2877     std::call_once(onceFlag, [] {
2878         UErrorCode error = U_ZERO_ERROR;
2879         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2880         RELEASE_ASSERT(U_SUCCESS(error));
2881         RELEASE_ASSERT(encoder);
2882     });
2883     return *encoder;
2884 }
2885
2886 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2887 {
2888     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2889     // but once we get rid of URL::parse its value should be tested.
2890     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2891         a.m_isValid,
2892         a.m_protocolIsInHTTPFamily,
2893         a.m_schemeEnd,
2894         a.m_userStart,
2895         a.m_userEnd,
2896         a.m_passwordEnd,
2897         a.m_hostEnd,
2898         a.m_portEnd,
2899         a.m_pathAfterLastSlash,
2900         a.m_pathEnd,
2901         a.m_queryEnd,
2902         a.m_fragmentEnd,
2903         a.m_string.utf8().data(),
2904         b.m_isValid,
2905         b.m_protocolIsInHTTPFamily,
2906         b.m_schemeEnd,
2907         b.m_userStart,
2908         b.m_userEnd,
2909         b.m_passwordEnd,
2910         b.m_hostEnd,
2911         b.m_portEnd,
2912         b.m_pathAfterLastSlash,
2913         b.m_pathEnd,
2914         b.m_queryEnd,
2915         b.m_fragmentEnd,
2916         b.m_string.utf8().data());
2917
2918     return a.m_string == b.m_string
2919         && a.m_isValid == b.m_isValid
2920         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2921         && a.m_schemeEnd == b.m_schemeEnd
2922         && a.m_userStart == b.m_userStart
2923         && a.m_userEnd == b.m_userEnd
2924         && a.m_passwordEnd == b.m_passwordEnd
2925         && a.m_hostEnd == b.m_hostEnd
2926         && a.m_portEnd == b.m_portEnd
2927         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2928         && a.m_pathEnd == b.m_pathEnd
2929         && a.m_queryEnd == b.m_queryEnd
2930         && a.m_fragmentEnd == b.m_fragmentEnd;
2931 }
2932
2933 bool URLParser::internalValuesConsistent(const URL& url)
2934 {
2935     return url.m_schemeEnd <= url.m_userStart
2936         && url.m_userStart <= url.m_userEnd
2937         && url.m_userEnd <= url.m_passwordEnd
2938         && url.m_passwordEnd <= url.m_hostEnd
2939         && url.m_hostEnd <= url.m_portEnd
2940         && url.m_portEnd <= url.m_pathAfterLastSlash
2941         && url.m_pathAfterLastSlash <= url.m_pathEnd
2942         && url.m_pathEnd <= url.m_queryEnd
2943         && url.m_queryEnd <= url.m_fragmentEnd
2944         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2945     // FIXME: Why do we even store m_fragmentEnd?
2946     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2947 }
2948
2949 } // namespace WebCore