Move URL from WebCore to WTF
[WebKit-https.git] / Source / WTF / wtf / URLParser.cpp
1 /*
2  * Copyright (C) 2016-2018 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include <wtf/URLParser.h>
28
29 #include <array>
30 #include <mutex>
31 #include <unicode/uidna.h>
32 #include <unicode/utf8.h>
33 #include <unicode/utypes.h>
34
35 namespace WTF {
36
// Compile-time switch for parser tracing. With the value 0 below,
// URL_PARSER_LOG expands to nothing; with a non-zero value it forwards its
// arguments to WTFLogAlways.
#define URL_PARSER_DEBUGGING 0

#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) WTFLogAlways(__VA_ARGS__)
#else
#define URL_PARSER_LOG(...)
#endif
44     
45 template<typename CharacterType>
46 class CodePointIterator {
47 public:
48     ALWAYS_INLINE CodePointIterator() { }
49     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50         : m_begin(begin)
51         , m_end(end)
52     {
53     }
54     
55     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56         : CodePointIterator(begin.m_begin, end.m_begin)
57     {
58         ASSERT(end.m_begin >= begin.m_begin);
59     }
60     
61     ALWAYS_INLINE UChar32 operator*() const;
62     ALWAYS_INLINE CodePointIterator& operator++();
63
64     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65     {
66         return m_begin == other.m_begin
67             && m_end == other.m_end;
68     }
69     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70     
71     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
72     {
73         m_begin = other.m_begin;
74         m_end = other.m_end;
75         return *this;
76     }
77
78     ALWAYS_INLINE bool atEnd() const
79     {
80         ASSERT(m_begin <= m_end);
81         return m_begin >= m_end;
82     }
83     
84     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
85     {
86         ASSERT(m_begin >= reference);
87         return m_begin - reference;
88     }
89
90     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
91     {
92         return codeUnitsSince(other.m_begin);
93     }
94     
95 private:
96     const CharacterType* m_begin { nullptr };
97     const CharacterType* m_end { nullptr };
98 };
99
100 template<>
101 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
102 {
103     ASSERT(!atEnd());
104     return *m_begin;
105 }
106
107 template<>
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
109 {
110     m_begin++;
111     return *this;
112 }
113
114 template<>
115 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
116 {
117     ASSERT(!atEnd());
118     UChar32 c;
119     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
120     return c;
121 }
122
123 template<>
124 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
125 {
126     unsigned i = 0;
127     size_t length = m_end - m_begin;
128     U16_FWD_1(m_begin, i, length);
129     m_begin += i;
130     return *this;
131 }
132     
133 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
134 {
135     if (U_IS_BMP(codePoint)) {
136         destination.append(static_cast<UChar>(codePoint));
137         return;
138     }
139     destination.reserveCapacity(destination.size() + 2);
140     destination.uncheckedAppend(U16_LEAD(codePoint));
141     destination.uncheckedAppend(U16_TRAIL(codePoint));
142 }
143
// Bit flags combined per byte value in characterClassTable; each flag marks
// membership of that byte in one character class.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
152
// Lookup table indexed by byte value (0x00-0xFF). Each entry is a bitwise OR
// of URLCharacterClass flags describing which character classes that byte
// belongs to; the trailing comment on each row names the byte it describes.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    0, // '\''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
411
412 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
422 ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
423 {
424     if (characterClassTable[byte] & QueryPercent)
425         return true;
426     if (byte == '\'' && urlIsSpecial)
427         return true;
428     return false;
429 }
430
// Member entry point that forwards to the file-local
// WTF::isInUserInfoEncodeSet template for a single UTF-16 code unit.
bool URLParser::isInUserInfoEncodeSet(UChar c)
{
    return WTF::isInUserInfoEncodeSet(c);
}
435
436 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
437 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
438 {
439     ++iterator;
440     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
441         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
442             syntaxViolation(iteratorForSyntaxViolationPosition);
443         ++iterator;
444     }
445 }
446
447 template<typename CharacterType>
448 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
449 {
450     if (iterator.atEnd())
451         return false;
452     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
453     if (iterator.atEnd())
454         return false;
455     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
456     return iterator.atEnd();
457 }
458
459 template<typename CharacterType>
460 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
461 {
462     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
463         return false;
464     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
465     if (iterator.atEnd())
466         return false;
467     if (*iterator == ':')
468         return true;
469     if (UNLIKELY(*iterator == '|'))
470         return true;
471     return false;
472 }
473
474 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
475 {
476     ASSERT(isASCII(codePoint));
477     if (UNLIKELY(m_didSeeSyntaxViolation))
478         m_asciiBuffer.append(codePoint);
479 }
480
481 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
482 {
483     if (UNLIKELY(m_didSeeSyntaxViolation))
484         m_asciiBuffer.append(characters, length);
485 }
486
487 template<typename CharacterType>
488 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
489 {
490     ASSERT(isWindowsDriveLetter(iterator));
491     appendToASCIIBuffer(*iterator);
492     advance(iterator);
493     ASSERT(!iterator.atEnd());
494     ASSERT(*iterator == ':' || *iterator == '|');
495     if (*iterator == '|')
496         syntaxViolation(iterator);
497     appendToASCIIBuffer(':');
498     advance(iterator);
499 }
500
501 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
502 {
503     if (base.protocolIs("file")) {
504         RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
505         if (base.m_string.is8Bit()) {
506             const LChar* begin = base.m_string.characters8();
507             CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
508             if (isWindowsDriveLetter(c)) {
509                 appendWindowsDriveLetter(c);
510                 return true;
511             }
512         } else {
513             const UChar* begin = base.m_string.characters16();
514             CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
515             if (isWindowsDriveLetter(c)) {
516                 appendWindowsDriveLetter(c);
517                 return true;
518             }
519         }
520     }
521     return false;
522 }
523
524 template<typename CharacterType>
525 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
526 {
527     if (!isWindowsDriveLetter(iterator))
528         return true;
529     if (iterator.atEnd())
530         return false;
531     advance(iterator);
532     if (iterator.atEnd())
533         return true;
534     advance(iterator);
535     if (iterator.atEnd())
536         return true;
537     return !isSlashQuestionOrHash(*iterator);
538 }
539
// Appends the three-character percent-encoding of `byte` to `buffer`: a '%'
// followed by the hex digits for the upper and lower nibble.
static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
{
    buffer.append('%');
    buffer.append(upperNibbleToASCIIHexDigit(byte));
    buffer.append(lowerNibbleToASCIIHexDigit(byte));
}
546
// Appends the percent-encoding of `byte` ('%' plus two hex digits) to the
// parser's ASCII buffer. Callers must already have recorded a syntax
// violation, which the assertion below checks.
void URLParser::percentEncodeByte(uint8_t byte)
{
    ASSERT(m_didSeeSyntaxViolation);
    appendToASCIIBuffer('%');
    appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
    appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
}
554
// Percent-encoded UTF-8 bytes EF BF BD, emitted in place of code points that
// fail U_IS_UNICODE_CHAR. The length excludes the terminating NUL.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
557
558 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
559 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
560 {
561     ASSERT(!iterator.atEnd());
562     UChar32 codePoint = *iterator;
563     if (LIKELY(isASCII(codePoint))) {
564         if (UNLIKELY(isInCodeSet(codePoint))) {
565             syntaxViolation(iterator);
566             percentEncodeByte(codePoint);
567         } else
568             appendToASCIIBuffer(codePoint);
569         return;
570     }
571     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
572     syntaxViolation(iterator);
573     
574     if (!U_IS_UNICODE_CHAR(codePoint)) {
575         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
576         return;
577     }
578     
579     uint8_t buffer[U8_MAX_LENGTH];
580     int32_t offset = 0;
581     U8_APPEND_UNSAFE(buffer, offset, codePoint);
582     for (int32_t i = 0; i < offset; ++i)
583         percentEncodeByte(buffer[i]);
584 }
585
586 template<typename CharacterType>
587 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
588 {
589     ASSERT(!iterator.atEnd());
590     UChar32 codePoint = *iterator;
591     if (LIKELY(isASCII(codePoint))) {
592         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
593             syntaxViolation(iterator);
594             percentEncodeByte(codePoint);
595         } else
596             appendToASCIIBuffer(codePoint);
597         return;
598     }
599     
600     syntaxViolation(iterator);
601     
602     if (!U_IS_UNICODE_CHAR(codePoint)) {
603         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
604         return;
605     }
606
607     uint8_t buffer[U8_MAX_LENGTH];
608     int32_t offset = 0;
609     U8_APPEND_UNSAFE(buffer, offset, codePoint);
610     for (int32_t i = 0; i < offset; ++i) {
611         auto byte = buffer[i];
612         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
613             percentEncodeByte(byte);
614         else
615             appendToASCIIBuffer(byte);
616     }
617 }
618
// Encodes the query code units in `source` with `encoding` and writes the
// result to the ASCII buffer, percent-encoding bytes as required by
// shouldPercentEncodeQueryByte. `iterator` points at the original query input
// and is walked in lockstep with the encoded bytes for as long as the two
// agree, so that a syntax violation is only recorded at the first position
// where the output differs from the input.
template<typename CharacterType>
void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator)
{
    auto encoded = encoding.encodeForURLParsing(StringView(source.data(), source.size()));
    auto* data = encoded.data();
    size_t length = encoded.size();

    // True when the encoded output is empty while input remains, or when
    // output exists while the input is already exhausted: report and stop.
    if (!length == !iterator.atEnd()) {
        syntaxViolation(iterator);
        return;
    }

    // Phase 1: copy bytes that match the input verbatim and need no encoding.
    size_t i = 0;
    for (; i < length; ++i) {
        ASSERT(!iterator.atEnd());
        uint8_t byte = data[i];
        if (UNLIKELY(byte != *iterator)) {
            syntaxViolation(iterator);
            break;
        }
        if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
            syntaxViolation(iterator);
            break;
        }
        appendToASCIIBuffer(byte);
        ++iterator;
    }
    // Step over tab/newline characters at the current input position.
    while (!iterator.atEnd() && isTabOrNewline(*iterator))
        ++iterator;
    ASSERT((i == length) == iterator.atEnd());
    // Phase 2: after a divergence, emit the remaining encoded bytes,
    // percent-encoding where needed.
    for (; i < length; ++i) {
        ASSERT(m_didSeeSyntaxViolation);
        uint8_t byte = data[i];
        if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
            percentEncodeByte(byte);
        else
            appendToASCIIBuffer(byte);
    }
}
658
659 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
660 {
661     static const uint16_t ftpPort = 21;
662     static const uint16_t gopherPort = 70;
663     static const uint16_t httpPort = 80;
664     static const uint16_t httpsPort = 443;
665     static const uint16_t wsPort = 80;
666     static const uint16_t wssPort = 443;
667     
668     auto length = scheme.length();
669     if (!length)
670         return std::nullopt;
671     switch (scheme[0]) {
672     case 'w':
673         switch (length) {
674         case 2:
675             if (scheme[1] == 's')
676                 return wsPort;
677             return std::nullopt;
678         case 3:
679             if (scheme[1] == 's'
680                 && scheme[2] == 's')
681                 return wssPort;
682             return std::nullopt;
683         default:
684             return false;
685         }
686     case 'h':
687         switch (length) {
688         case 4:
689             if (scheme[1] == 't'
690                 && scheme[2] == 't'
691                 && scheme[3] == 'p')
692                 return httpPort;
693             return std::nullopt;
694         case 5:
695             if (scheme[1] == 't'
696                 && scheme[2] == 't'
697                 && scheme[3] == 'p'
698                 && scheme[4] == 's')
699                 return httpsPort;
700             return std::nullopt;
701         default:
702             return std::nullopt;
703         }
704     case 'g':
705         if (length == 6
706             && scheme[1] == 'o'
707             && scheme[2] == 'p'
708             && scheme[3] == 'h'
709             && scheme[4] == 'e'
710             && scheme[5] == 'r')
711             return gopherPort;
712         return std::nullopt;
713     case 'f':
714         if (length == 3
715             && scheme[1] == 't'
716             && scheme[2] == 'p')
717             return ftpPort;
718         return std::nullopt;
719     default:
720         return std::nullopt;
721     }
722 }
723
// Result of classifying a scheme string with scheme() below: one enumerator
// per recognized scheme name, and NonSpecial for everything else.
enum class Scheme {
    WS,
    WSS,
    File,
    FTP,
    Gopher,
    HTTP,
    HTTPS,
    NonSpecial
};
734
735 ALWAYS_INLINE static Scheme scheme(StringView scheme)
736 {
737     auto length = scheme.length();
738     if (!length)
739         return Scheme::NonSpecial;
740     switch (scheme[0]) {
741     case 'f':
742         switch (length) {
743         case 3:
744             if (scheme[1] == 't'
745                 && scheme[2] == 'p')
746                 return Scheme::FTP;
747             return Scheme::NonSpecial;
748         case 4:
749             if (scheme[1] == 'i'
750                 && scheme[2] == 'l'
751                 && scheme[3] == 'e')
752                 return Scheme::File;
753             return Scheme::NonSpecial;
754         default:
755             return Scheme::NonSpecial;
756         }
757     case 'g':
758         if (length == 6
759             && scheme[1] == 'o'
760             && scheme[2] == 'p'
761             && scheme[3] == 'h'
762             && scheme[4] == 'e'
763             && scheme[5] == 'r')
764             return Scheme::Gopher;
765         return Scheme::NonSpecial;
766     case 'h':
767         switch (length) {
768         case 4:
769             if (scheme[1] == 't'
770                 && scheme[2] == 't'
771                 && scheme[3] == 'p')
772                 return Scheme::HTTP;
773             return Scheme::NonSpecial;
774         case 5:
775             if (scheme[1] == 't'
776                 && scheme[2] == 't'
777                 && scheme[3] == 'p'
778                 && scheme[4] == 's')
779                 return Scheme::HTTPS;
780             return Scheme::NonSpecial;
781         default:
782             return Scheme::NonSpecial;
783         }
784     case 'w':
785         switch (length) {
786         case 2:
787             if (scheme[1] == 's')
788                 return Scheme::WS;
789             return Scheme::NonSpecial;
790         case 3:
791             if (scheme[1] == 's'
792                 && scheme[2] == 's')
793                 return Scheme::WSS;
794             return Scheme::NonSpecial;
795         default:
796             return Scheme::NonSpecial;
797         }
798     default:
799         return Scheme::NonSpecial;
800     }
801 }
802
803 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
804 {
805     if (scheme.isEmpty())
806         return std::nullopt;
807
808     if (!isASCIIAlpha(scheme[0]))
809         return std::nullopt;
810
811     for (size_t i = 1; i < scheme.length(); ++i) {
812         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
813             continue;
814         return std::nullopt;
815     }
816
817     return scheme.convertToASCIILowercase();
818 }
819
// Returns true when scheme() classifies `schemeArg` as anything other than
// Scheme::NonSpecial.
bool URLParser::isSpecialScheme(const String& schemeArg)
{
    return scheme(schemeArg) != Scheme::NonSpecial;
}
824
// Names a boundary inside a parsed URL. urlLengthUntilPart maps each value to
// the corresponding offset stored on a URL, and copyURLPartsUntil uses it to
// say how much of a base URL to copy.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
};
836
837 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
838 {
839     switch (part) {
840     case URLPart::QueryEnd:
841         return url.m_queryEnd;
842     case URLPart::PathEnd:
843         return url.m_pathEnd;
844     case URLPart::PathAfterLastSlash:
845         return url.m_pathAfterLastSlash;
846     case URLPart::PortEnd:
847         return url.m_hostEnd + url.m_portLength;
848     case URLPart::HostEnd:
849         return url.m_hostEnd;
850     case URLPart::PasswordEnd:
851         return url.m_passwordEnd;
852     case URLPart::UserEnd:
853         return url.m_userEnd;
854     case URLPart::UserStart:
855         return url.m_userStart;
856     case URLPart::SchemeEnd:
857         return url.m_schemeEnd;
858     }
859     ASSERT_NOT_REACHED();
860     return 0;
861 }
862
863 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
864 {
865     RELEASE_ASSERT(length <= string.length());
866     if (string.isNull())
867         return;
868     ASSERT(m_asciiBuffer.isEmpty());
869     if (string.is8Bit())
870         appendToASCIIBuffer(string.characters8(), length);
871     else {
872         const UChar* characters = string.characters16();
873         for (size_t i = 0; i < length; ++i) {
874             UChar c = characters[i];
875             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
876             appendToASCIIBuffer(c);
877         }
878     }
879 }
880
// Seeds the URL being parsed from `base`, up to and including the boundary
// named by `part`:
//  - reports a syntax violation at `iterator`,
//  - replaces the ASCII buffer contents with the matching prefix of the base
//    string,
//  - copies the base's stored offsets for `part` and every earlier boundary,
//  - re-derives m_urlIsSpecial / m_urlIsFile from the copied scheme, and
//    clears `nonUTF8QueryEncoding` for ws, wss and non-special schemes.
template<typename CharacterType>
void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding)
{
    syntaxViolation(iterator);

    m_asciiBuffer.clear();
    copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
    // Each case deliberately falls through, so selecting a later boundary
    // also copies the offsets of all boundaries before it.
    switch (part) {
    case URLPart::QueryEnd:
        m_url.m_queryEnd = base.m_queryEnd;
        FALLTHROUGH;
    case URLPart::PathEnd:
        m_url.m_pathEnd = base.m_pathEnd;
        FALLTHROUGH;
    case URLPart::PathAfterLastSlash:
        m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
        FALLTHROUGH;
    case URLPart::PortEnd:
        m_url.m_portLength = base.m_portLength;
        FALLTHROUGH;
    case URLPart::HostEnd:
        m_url.m_hostEnd = base.m_hostEnd;
        FALLTHROUGH;
    case URLPart::PasswordEnd:
        m_url.m_passwordEnd = base.m_passwordEnd;
        FALLTHROUGH;
    case URLPart::UserEnd:
        m_url.m_userEnd = base.m_userEnd;
        FALLTHROUGH;
    case URLPart::UserStart:
        m_url.m_userStart = base.m_userStart;
        FALLTHROUGH;
    case URLPart::SchemeEnd:
        m_url.m_isValid = base.m_isValid;
        m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
        m_url.m_schemeEnd = base.m_schemeEnd;
    }
    // Classify the scheme from the freshly copied buffer contents.
    switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
    case Scheme::WS:
    case Scheme::WSS:
        nonUTF8QueryEncoding = nullptr;
        m_urlIsSpecial = true;
        return;
    case Scheme::File:
        m_urlIsFile = true;
        FALLTHROUGH;
    case Scheme::FTP:
    case Scheme::Gopher:
    case Scheme::HTTP:
    case Scheme::HTTPS:
        m_urlIsSpecial = true;
        return;
    case Scheme::NonSpecial:
        m_urlIsSpecial = false;
        nonUTF8QueryEncoding = nullptr;
        return;
    }
    ASSERT_NOT_REACHED();
}
940
// The two hex digits of '.' (0x2E) as they follow '%' in a percent-encoded dot. The dot-segment
// helpers compare the second digit after lowercasing the input, so "%2e" and "%2E" both match.
941 static const char dotASCIICode[2] = {'2', 'e'};
942
943 template<typename CharacterType>
944 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
945 {
946     if (c.atEnd())
947         return false;
948     if (*c == '.') {
949         advance<CharacterType, ReportSyntaxViolation::No>(c);
950         return c.atEnd() || isSlashQuestionOrHash(*c);
951     }
952     if (*c != '%')
953         return false;
954     advance<CharacterType, ReportSyntaxViolation::No>(c);
955     if (c.atEnd() || *c != dotASCIICode[0])
956         return false;
957     advance<CharacterType, ReportSyntaxViolation::No>(c);
958     if (c.atEnd())
959         return false;
960     if (toASCIILower(*c) == dotASCIICode[1]) {
961         advance<CharacterType, ReportSyntaxViolation::No>(c);
962         return c.atEnd() || isSlashQuestionOrHash(*c);
963     }
964     return false;
965 }
966
967 template<typename CharacterType>
968 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
969 {
970     if (c.atEnd())
971         return false;
972     if (*c == '.') {
973         advance<CharacterType, ReportSyntaxViolation::No>(c);
974         return isSingleDotPathSegment(c);
975     }
976     if (*c != '%')
977         return false;
978     advance<CharacterType, ReportSyntaxViolation::No>(c);
979     if (c.atEnd() || *c != dotASCIICode[0])
980         return false;
981     advance<CharacterType, ReportSyntaxViolation::No>(c);
982     if (c.atEnd())
983         return false;
984     if (toASCIILower(*c) == dotASCIICode[1]) {
985         advance<CharacterType, ReportSyntaxViolation::No>(c);
986         return isSingleDotPathSegment(c);
987     }
988     return false;
989 }
990
991 template<typename CharacterType>
992 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
993 {
994     ASSERT(isSingleDotPathSegment(c));
995     if (*c == '.') {
996         advance(c);
997         if (!c.atEnd()) {
998             if (*c == '/' || *c == '\\')
999                 advance(c);
1000             else
1001                 ASSERT(*c == '?' || *c == '#');
1002         }
1003     } else {
1004         ASSERT(*c == '%');
1005         advance(c);
1006         ASSERT(*c == dotASCIICode[0]);
1007         advance(c);
1008         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1009         advance(c);
1010         if (!c.atEnd()) {
1011             if (*c == '/' || *c == '\\')
1012                 advance(c);
1013             else
1014                 ASSERT(*c == '?' || *c == '#');
1015         }
1016     }
1017 }
1018
1019 template<typename CharacterType>
1020 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1021 {
1022     ASSERT(isDoubleDotPathSegment(c));
1023     if (*c == '.')
1024         advance(c);
1025     else {
1026         ASSERT(*c == '%');
1027         advance(c);
1028         ASSERT(*c == dotASCIICode[0]);
1029         advance(c);
1030         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1031         advance(c);
1032     }
1033     consumeSingleDotPathSegment(c);
1034 }
1035
1036 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1037 {
1038     ASSERT(m_didSeeSyntaxViolation);
1039     if (!m_urlIsFile)
1040         return true;
1041
1042     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1043     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1044     if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + 1 && isWindowsDriveLetter(componentToPop))
1045         return false;
1046     return true;
1047 }
1048
1049 void URLParser::popPath()
1050 {
1051     ASSERT(m_didSeeSyntaxViolation);
1052     if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + 1) {
1053         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1054         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1055             newPathAfterLastSlash--;
1056         while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer[newPathAfterLastSlash] != '/')
1057             newPathAfterLastSlash--;
1058         newPathAfterLastSlash++;
1059         if (shouldPopPath(newPathAfterLastSlash))
1060             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1061     }
1062     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1063 }
1064
1065 template<typename CharacterType>
1066 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1067 {
1068     if (m_didSeeSyntaxViolation)
1069         return;
1070     m_didSeeSyntaxViolation = true;
1071     
1072     ASSERT(m_asciiBuffer.isEmpty());
1073     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1074     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1075     m_asciiBuffer.reserveCapacity(m_inputString.length());
1076     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1077         ASSERT(isASCII(m_inputString[i]));
1078         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1079     }
1080 }
1081
// Gives up on parsing: invalidates m_url and then stores the unmodified input string as its
// string, so a failed parse still exposes the original input.
// NOTE(review): presumably invalidate() resets m_string, which is why the assignment follows
// it — confirm against URL::invalidate().
1082 void URLParser::failure()
1083 {
1084     m_url.invalidate();
1085     m_url.m_string = m_inputString;
1086 }
1087
1088 template<typename CharacterType>
1089 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1090 {
1091     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1092         return false;
1093     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1094     return true;
1095 }
1096
1097 template<typename CharacterType>
1098 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1099 {
1100     if (!checkLocalhostCodePoint(iterator, 'l'))
1101         return false;
1102     if (!checkLocalhostCodePoint(iterator, 'o'))
1103         return false;
1104     if (!checkLocalhostCodePoint(iterator, 'c'))
1105         return false;
1106     if (!checkLocalhostCodePoint(iterator, 'a'))
1107         return false;
1108     if (!checkLocalhostCodePoint(iterator, 'l'))
1109         return false;
1110     if (!checkLocalhostCodePoint(iterator, 'h'))
1111         return false;
1112     if (!checkLocalhostCodePoint(iterator, 'o'))
1113         return false;
1114     if (!checkLocalhostCodePoint(iterator, 's'))
1115         return false;
1116     if (!checkLocalhostCodePoint(iterator, 't'))
1117         return false;
1118     return iterator.atEnd();
1119 }
1120
1121 bool URLParser::isLocalhost(StringView view)
1122 {
1123     if (view.is8Bit())
1124         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1125     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1126 }
1127
1128 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1129 {
1130     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1131         ASSERT(start + length <= m_asciiBuffer.size());
1132         return StringView(m_asciiBuffer.data() + start, length);
1133     }
1134     ASSERT(start + length <= m_inputString.length());
1135     return StringView(m_inputString).substring(start, length);
1136 }
1137
1138 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1139 {
1140     if (UNLIKELY(m_didSeeSyntaxViolation))
1141         return m_asciiBuffer[position];
1142     return m_inputString[position];
1143 }
1144
1145 template<typename CharacterType>
1146 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1147 {
1148     if (UNLIKELY(m_didSeeSyntaxViolation))
1149         return m_asciiBuffer.size();
1150     
1151     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1152 }
1153
1154 URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1155     : m_inputString(input)
1156 {
1157     if (input.isNull()) {
1158         if (base.isValid() && !base.m_cannotBeABaseURL) {
1159             m_url = base;
1160             m_url.removeFragmentIdentifier();
1161         }
1162         return;
1163     }
1164
1165     if (input.is8Bit()) {
1166         m_inputBegin = input.characters8();
1167         parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding);
1168     } else {
1169         m_inputBegin = input.characters16();
1170         parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding);
1171     }
1172
1173     ASSERT(!m_url.m_isValid
1174         || m_didSeeSyntaxViolation == (m_url.string() != input)
1175         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1176             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1177     ASSERT(internalValuesConsistent(m_url));
1178 #if !ASSERT_DISABLED
1179     if (!m_didSeeSyntaxViolation) {
1180         // Force a syntax violation at the beginning to make sure we get the same result.
1181         URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding);
1182         URL parsed = parser.result();
1183         if (parsed.isValid())
1184             ASSERT(allValuesEqual(parser.result(), m_url));
1185     }
1186 #endif
1187 }
1188
1189 template<typename CharacterType>
1190 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1191 {
1192     URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1193     m_url = { };
1194     ASSERT(m_asciiBuffer.isEmpty());
1195
1196     Vector<UChar> queryBuffer;
1197
1198     unsigned endIndex = length;
1199     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1200         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1201         endIndex--;
1202     }
1203     CodePointIterator<CharacterType> c(input, input + endIndex);
1204     CodePointIterator<CharacterType> authorityOrHostBegin;
1205     CodePointIterator<CharacterType> queryBegin;
1206     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1207         syntaxViolation(c);
1208         ++c;
1209     }
1210     auto beginAfterControlAndSpace = c;
1211
1212     enum class State : uint8_t {
1213         SchemeStart,
1214         Scheme,
1215         NoScheme,
1216         SpecialRelativeOrAuthority,
1217         PathOrAuthority,
1218         Relative,
1219         RelativeSlash,
1220         SpecialAuthoritySlashes,
1221         SpecialAuthorityIgnoreSlashes,
1222         AuthorityOrHost,
1223         Host,
1224         File,
1225         FileSlash,
1226         FileHost,
1227         PathStart,
1228         Path,
1229         CannotBeABaseURLPath,
1230         UTF8Query,
1231         NonUTF8Query,
1232         Fragment,
1233     };
1234
1235 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1236 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1237
1238     State state = State::SchemeStart;
1239     while (!c.atEnd()) {
1240         if (UNLIKELY(isTabOrNewline(*c))) {
1241             syntaxViolation(c);
1242             ++c;
1243             continue;
1244         }
1245
1246         switch (state) {
1247         case State::SchemeStart:
1248             LOG_STATE("SchemeStart");
1249             if (isASCIIAlpha(*c)) {
1250                 if (UNLIKELY(isASCIIUpper(*c)))
1251                     syntaxViolation(c);
1252                 appendToASCIIBuffer(toASCIILower(*c));
1253                 advance(c);
1254                 if (c.atEnd()) {
1255                     m_asciiBuffer.clear();
1256                     state = State::NoScheme;
1257                     c = beginAfterControlAndSpace;
1258                     break;
1259                 }
1260                 state = State::Scheme;
1261             } else
1262                 state = State::NoScheme;
1263             break;
1264         case State::Scheme:
1265             LOG_STATE("Scheme");
1266             if (isValidSchemeCharacter(*c)) {
1267                 if (UNLIKELY(isASCIIUpper(*c)))
1268                     syntaxViolation(c);
1269                 appendToASCIIBuffer(toASCIILower(*c));
1270             } else if (*c == ':') {
1271                 unsigned schemeEnd = currentPosition(c);
1272                 if (schemeEnd > URL::maxSchemeLength) {
1273                     failure();
1274                     return;
1275                 }
1276                 m_url.m_schemeEnd = schemeEnd;
1277                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1278                 appendToASCIIBuffer(':');
1279                 switch (scheme(urlScheme)) {
1280                 case Scheme::File:
1281                     m_urlIsSpecial = true;
1282                     m_urlIsFile = true;
1283                     state = State::File;
1284                     ++c;
1285                     break;
1286                 case Scheme::WS:
1287                 case Scheme::WSS:
1288                     nonUTF8QueryEncoding = nullptr;
1289                     m_urlIsSpecial = true;
1290                     if (base.protocolIs(urlScheme))
1291                         state = State::SpecialRelativeOrAuthority;
1292                     else
1293                         state = State::SpecialAuthoritySlashes;
1294                     ++c;
1295                     break;
1296                 case Scheme::HTTP:
1297                 case Scheme::HTTPS:
1298                     m_url.m_protocolIsInHTTPFamily = true;
1299                     FALLTHROUGH;
1300                 case Scheme::FTP:
1301                 case Scheme::Gopher:
1302                     m_urlIsSpecial = true;
1303                     if (base.protocolIs(urlScheme))
1304                         state = State::SpecialRelativeOrAuthority;
1305                     else
1306                         state = State::SpecialAuthoritySlashes;
1307                     ++c;
1308                     break;
1309                 case Scheme::NonSpecial:
1310                     nonUTF8QueryEncoding = nullptr;
1311                     auto maybeSlash = c;
1312                     advance(maybeSlash);
1313                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1314                         appendToASCIIBuffer('/');
1315                         c = maybeSlash;
1316                         state = State::PathOrAuthority;
1317                         ASSERT(*c == '/');
1318                         ++c;
1319                         m_url.m_userStart = currentPosition(c);
1320                     } else {
1321                         ++c;
1322                         m_url.m_userStart = currentPosition(c);
1323                         m_url.m_userEnd = m_url.m_userStart;
1324                         m_url.m_passwordEnd = m_url.m_userStart;
1325                         m_url.m_hostEnd = m_url.m_userStart;
1326                         m_url.m_portLength = 0;
1327                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1328                         m_url.m_cannotBeABaseURL = true;
1329                         state = State::CannotBeABaseURLPath;
1330                     }
1331                     break;
1332                 }
1333                 break;
1334             } else {
1335                 m_asciiBuffer.clear();
1336                 state = State::NoScheme;
1337                 c = beginAfterControlAndSpace;
1338                 break;
1339             }
1340             advance(c);
1341             if (c.atEnd()) {
1342                 m_asciiBuffer.clear();
1343                 state = State::NoScheme;
1344                 c = beginAfterControlAndSpace;
1345             }
1346             break;
1347         case State::NoScheme:
1348             LOG_STATE("NoScheme");
1349             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1350                 failure();
1351                 return;
1352             }
1353             if (base.m_cannotBeABaseURL && *c == '#') {
1354                 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1355                 state = State::Fragment;
1356                 appendToASCIIBuffer('#');
1357                 ++c;
1358                 break;
1359             }
1360             if (!base.protocolIs("file")) {
1361                 state = State::Relative;
1362                 break;
1363             }
1364             copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1365             appendToASCIIBuffer(':');
1366             state = State::File;
1367             break;
1368         case State::SpecialRelativeOrAuthority:
1369             LOG_STATE("SpecialRelativeOrAuthority");
1370             if (*c == '/') {
1371                 appendToASCIIBuffer('/');
1372                 advance(c);
1373                 if (c.atEnd()) {
1374                     failure();
1375                     return;
1376                 }
1377                 if (*c == '/') {
1378                     appendToASCIIBuffer('/');
1379                     state = State::SpecialAuthorityIgnoreSlashes;
1380                     ++c;
1381                 } else
1382                     state = State::RelativeSlash;
1383             } else
1384                 state = State::Relative;
1385             break;
1386         case State::PathOrAuthority:
1387             LOG_STATE("PathOrAuthority");
1388             if (*c == '/') {
1389                 appendToASCIIBuffer('/');
1390                 state = State::AuthorityOrHost;
1391                 advance(c);
1392                 m_url.m_userStart = currentPosition(c);
1393                 authorityOrHostBegin = c;
1394             } else {
1395                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1396                 m_url.m_userStart = currentPosition(c) - 1;
1397                 m_url.m_userEnd = m_url.m_userStart;
1398                 m_url.m_passwordEnd = m_url.m_userStart;
1399                 m_url.m_hostEnd = m_url.m_userStart;
1400                 m_url.m_portLength = 0;
1401                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1402                 state = State::Path;
1403             }
1404             break;
1405         case State::Relative:
1406             LOG_STATE("Relative");
1407             switch (*c) {
1408             case '/':
1409             case '\\':
1410                 state = State::RelativeSlash;
1411                 ++c;
1412                 break;
1413             case '?':
1414                 copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1415                 appendToASCIIBuffer('?');
1416                 ++c;
1417                 if (nonUTF8QueryEncoding) {
1418                     queryBegin = c;
1419                     state = State::NonUTF8Query;
1420                 } else
1421                     state = State::UTF8Query;
1422                 break;
1423             case '#':
1424                 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1425                 appendToASCIIBuffer('#');
1426                 state = State::Fragment;
1427                 ++c;
1428                 break;
1429             default:
1430                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1431                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1432                     appendToASCIIBuffer('/');
1433                     m_url.m_pathAfterLastSlash = currentPosition(c);
1434                 }
1435                 state = State::Path;
1436                 break;
1437             }
1438             break;
1439         case State::RelativeSlash:
1440             LOG_STATE("RelativeSlash");
1441             if (*c == '/' || *c == '\\') {
1442                 ++c;
1443                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1444                 appendToASCIIBuffer("://", 3);
1445                 if (m_urlIsSpecial)
1446                     state = State::SpecialAuthorityIgnoreSlashes;
1447                 else {
1448                     m_url.m_userStart = currentPosition(c);
1449                     state = State::AuthorityOrHost;
1450                     authorityOrHostBegin = c;
1451                 }
1452             } else {
1453                 copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1454                 appendToASCIIBuffer('/');
1455                 m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + 1;
1456                 state = State::Path;
1457             }
1458             break;
1459         case State::SpecialAuthoritySlashes:
1460             LOG_STATE("SpecialAuthoritySlashes");
1461             if (LIKELY(*c == '/' || *c == '\\')) {
1462                 if (UNLIKELY(*c == '\\'))
1463                     syntaxViolation(c);
1464                 appendToASCIIBuffer('/');
1465                 advance(c);
1466                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1467                     if (UNLIKELY(*c == '\\'))
1468                         syntaxViolation(c);
1469                     ++c;
1470                     appendToASCIIBuffer('/');
1471                 } else {
1472                     syntaxViolation(c);
1473                     appendToASCIIBuffer('/');
1474                 }
1475             } else {
1476                 syntaxViolation(c);
1477                 appendToASCIIBuffer("//", 2);
1478             }
1479             state = State::SpecialAuthorityIgnoreSlashes;
1480             break;
1481         case State::SpecialAuthorityIgnoreSlashes:
1482             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1483             if (*c == '/' || *c == '\\') {
1484                 syntaxViolation(c);
1485                 ++c;
1486             } else {
1487                 m_url.m_userStart = currentPosition(c);
1488                 state = State::AuthorityOrHost;
1489                 authorityOrHostBegin = c;
1490             }
1491             break;
1492         case State::AuthorityOrHost:
1493             do {
1494                 LOG_STATE("AuthorityOrHost");
1495                 if (*c == '@') {
1496                     auto lastAt = c;
1497                     auto findLastAt = c;
1498                     while (!findLastAt.atEnd()) {
1499                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1500                         if (*findLastAt == '@')
1501                             lastAt = findLastAt;
1502                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1503                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1504                             break;
1505                         ++findLastAt;
1506                     }
1507                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1508                     c = lastAt;
1509                     advance(c);
1510                     authorityOrHostBegin = c;
1511                     state = State::Host;
1512                     m_hostHasPercentOrNonASCII = false;
1513                     break;
1514                 }
1515                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1516                 if (isSlash || *c == '?' || *c == '#') {
1517                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1518                     if (iterator.atEnd()) {
1519                         if (m_urlIsSpecial)
1520                             return failure();
1521                         m_url.m_userEnd = currentPosition(c);
1522                         m_url.m_passwordEnd = m_url.m_userEnd;
1523                         m_url.m_hostEnd = m_url.m_userEnd;
1524                         m_url.m_portLength = 0;
1525                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1526                     } else {
1527                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1528                         m_url.m_passwordEnd = m_url.m_userEnd;
1529                         if (!parseHostAndPort(iterator)) {
1530                             failure();
1531                             return;
1532                         }
1533                         if (UNLIKELY(!isSlash)) {
1534                             if (m_urlIsSpecial) {
1535                                 syntaxViolation(c);
1536                                 appendToASCIIBuffer('/');
1537                             }
1538                             m_url.m_pathAfterLastSlash = currentPosition(c);
1539                         }
1540                     }
1541                     state = State::Path;
1542                     break;
1543                 }
1544                 if (isPercentOrNonASCII(*c))
1545                     m_hostHasPercentOrNonASCII = true;
1546                 ++c;
1547             } while (!c.atEnd());
1548             break;
1549         case State::Host:
1550             do {
1551                 LOG_STATE("Host");
1552                 if (*c == '/' || *c == '?' || *c == '#') {
1553                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1554                         failure();
1555                         return;
1556                     }
1557                     if (*c == '?' || *c == '#') {
1558                         syntaxViolation(c);
1559                         appendToASCIIBuffer('/');
1560                         m_url.m_pathAfterLastSlash = currentPosition(c);
1561                     }
1562                     state = State::Path;
1563                     break;
1564                 }
1565                 if (isPercentOrNonASCII(*c))
1566                     m_hostHasPercentOrNonASCII = true;
1567                 ++c;
1568             } while (!c.atEnd());
1569             break;
1570         case State::File:
1571             LOG_STATE("File");
1572             switch (*c) {
1573             case '\\':
1574                 syntaxViolation(c);
1575                 FALLTHROUGH;
1576             case '/':
1577                 appendToASCIIBuffer('/');
1578                 state = State::FileSlash;
1579                 ++c;
1580                 break;
1581             case '?':
1582                 syntaxViolation(c);
1583                 if (base.isValid() && base.protocolIs("file")) {
1584                     copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1585                     appendToASCIIBuffer('?');
1586                     ++c;
1587                 } else {
1588                     appendToASCIIBuffer("///?", 4);
1589                     ++c;
1590                     m_url.m_userStart = currentPosition(c) - 2;
1591                     m_url.m_userEnd = m_url.m_userStart;
1592                     m_url.m_passwordEnd = m_url.m_userStart;
1593                     m_url.m_hostEnd = m_url.m_userStart;
1594                     m_url.m_portLength = 0;
1595                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1596                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1597                 }
1598                 if (nonUTF8QueryEncoding) {
1599                     queryBegin = c;
1600                     state = State::NonUTF8Query;
1601                 } else
1602                     state = State::UTF8Query;
1603                 break;
1604             case '#':
1605                 syntaxViolation(c);
1606                 if (base.isValid() && base.protocolIs("file")) {
1607                     copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1608                     appendToASCIIBuffer('#');
1609                 } else {
1610                     appendToASCIIBuffer("///#", 4);
1611                     m_url.m_userStart = currentPosition(c) - 2;
1612                     m_url.m_userEnd = m_url.m_userStart;
1613                     m_url.m_passwordEnd = m_url.m_userStart;
1614                     m_url.m_hostEnd = m_url.m_userStart;
1615                     m_url.m_portLength = 0;
1616                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1617                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1618                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1619                 }
1620                 state = State::Fragment;
1621                 ++c;
1622                 break;
1623             default:
1624                 syntaxViolation(c);
1625                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1626                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1627                 else {
1628                     appendToASCIIBuffer("///", 3);
1629                     m_url.m_userStart = currentPosition(c) - 1;
1630                     m_url.m_userEnd = m_url.m_userStart;
1631                     m_url.m_passwordEnd = m_url.m_userStart;
1632                     m_url.m_hostEnd = m_url.m_userStart;
1633                     m_url.m_portLength = 0;
1634                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1635                     if (isWindowsDriveLetter(c))
1636                         appendWindowsDriveLetter(c);
1637                 }
1638                 state = State::Path;
1639                 break;
1640             }
1641             break;
1642         case State::FileSlash:
1643             LOG_STATE("FileSlash");
1644             if (LIKELY(*c == '/' || *c == '\\')) {
1645                 if (UNLIKELY(*c == '\\'))
1646                     syntaxViolation(c);
1647                 appendToASCIIBuffer('/');
1648                 advance(c);
1649                 m_url.m_userStart = currentPosition(c);
1650                 m_url.m_userEnd = m_url.m_userStart;
1651                 m_url.m_passwordEnd = m_url.m_userStart;
1652                 m_url.m_hostEnd = m_url.m_userStart;
1653                 m_url.m_portLength = 0;
1654                 authorityOrHostBegin = c;
1655                 state = State::FileHost;
1656                 break;
1657             }
1658             syntaxViolation(c);
1659             appendToASCIIBuffer("//", 2);
1660             m_url.m_userStart = currentPosition(c) - 1;
1661             m_url.m_userEnd = m_url.m_userStart;
1662             m_url.m_passwordEnd = m_url.m_userStart;
1663             m_url.m_hostEnd = m_url.m_userStart;
1664             m_url.m_portLength = 0;
1665             if (isWindowsDriveLetter(c)) {
1666                 appendWindowsDriveLetter(c);
1667                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1668             } else if (copyBaseWindowsDriveLetter(base)) {
1669                 appendToASCIIBuffer('/');
1670                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1671             } else
1672                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1673             state = State::Path;
1674             break;
1675         case State::FileHost:
1676             do {
1677                 LOG_STATE("FileHost");
1678                 if (isSlashQuestionOrHash(*c)) {
1679                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1680                         && isWindowsDriveLetter(authorityOrHostBegin);
1681                     if (windowsQuirk) {
1682                         syntaxViolation(authorityOrHostBegin);
1683                         appendToASCIIBuffer('/');
1684                         appendWindowsDriveLetter(authorityOrHostBegin);
1685                     }
1686                     if (windowsQuirk || authorityOrHostBegin == c) {
1687                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1688                         if (UNLIKELY(*c == '?')) {
1689                             syntaxViolation(c);
1690                             appendToASCIIBuffer("/?", 2);
1691                             ++c;
1692                             if (nonUTF8QueryEncoding) {
1693                                 queryBegin = c;
1694                                 state = State::NonUTF8Query;
1695                             } else
1696                                 state = State::UTF8Query;
1697                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1698                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1699                             break;
1700                         }
1701                         if (UNLIKELY(*c == '#')) {
1702                             syntaxViolation(c);
1703                             appendToASCIIBuffer("/#", 2);
1704                             ++c;
1705                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1706                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1707                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1708                             state = State::Fragment;
1709                             break;
1710                         }
1711                         state = State::Path;
1712                         break;
1713                     }
1714                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1715                         failure();
1716                         return;
1717                     }
1718                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1719                         syntaxViolation(c);
1720                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1721                         m_url.m_hostEnd = currentPosition(c);
1722                         m_url.m_portLength = 0;
1723                     }
1724                     
1725                     state = State::PathStart;
1726                     break;
1727                 }
1728                 if (isPercentOrNonASCII(*c))
1729                     m_hostHasPercentOrNonASCII = true;
1730                 ++c;
1731             } while (!c.atEnd());
1732             break;
1733         case State::PathStart:
1734             LOG_STATE("PathStart");
1735             if (*c != '/' && *c != '\\') {
1736                 syntaxViolation(c);
1737                 appendToASCIIBuffer('/');
1738             }
1739             m_url.m_pathAfterLastSlash = currentPosition(c);
1740             state = State::Path;
1741             break;
1742         case State::Path:
1743             LOG_STATE("Path");
1744             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1745                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1746                     syntaxViolation(c);
1747                 appendToASCIIBuffer('/');
1748                 ++c;
1749                 m_url.m_pathAfterLastSlash = currentPosition(c);
1750                 break;
1751             }
1752             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1753                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1754                     syntaxViolation(c);
1755                     consumeDoubleDotPathSegment(c);
1756                     popPath();
1757                     break;
1758                 }
1759                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1760                     syntaxViolation(c);
1761                     consumeSingleDotPathSegment(c);
1762                     break;
1763                 }
1764             }
1765             if (*c == '?') {
1766                 m_url.m_pathEnd = currentPosition(c);
1767                 appendToASCIIBuffer('?');
1768                 ++c;
1769                 if (nonUTF8QueryEncoding) {
1770                     queryBegin = c;
1771                     state = State::NonUTF8Query;
1772                 } else
1773                     state = State::UTF8Query;
1774                 break;
1775             }
1776             if (*c == '#') {
1777                 m_url.m_pathEnd = currentPosition(c);
1778                 m_url.m_queryEnd = m_url.m_pathEnd;
1779                 state = State::Fragment;
1780                 break;
1781             }
1782             utf8PercentEncode<isInDefaultEncodeSet>(c);
1783             ++c;
1784             break;
1785         case State::CannotBeABaseURLPath:
1786             LOG_STATE("CannotBeABaseURLPath");
1787             if (*c == '?') {
1788                 m_url.m_pathEnd = currentPosition(c);
1789                 appendToASCIIBuffer('?');
1790                 ++c;
1791                 if (nonUTF8QueryEncoding) {
1792                     queryBegin = c;
1793                     state = State::NonUTF8Query;
1794                 } else
1795                     state = State::UTF8Query;
1796             } else if (*c == '#') {
1797                 m_url.m_pathEnd = currentPosition(c);
1798                 m_url.m_queryEnd = m_url.m_pathEnd;
1799                 state = State::Fragment;
1800             } else if (*c == '/') {
1801                 appendToASCIIBuffer('/');
1802                 ++c;
1803                 m_url.m_pathAfterLastSlash = currentPosition(c);
1804             } else {
1805                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1806                 ++c;
1807             }
1808             break;
1809         case State::UTF8Query:
1810             LOG_STATE("UTF8Query");
1811             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1812             if (*c == '#') {
1813                 m_url.m_queryEnd = currentPosition(c);
1814                 state = State::Fragment;
1815                 break;
1816             }
1817             ASSERT(!nonUTF8QueryEncoding);
1818             utf8QueryEncode(c);
1819             ++c;
1820             break;
1821         case State::NonUTF8Query:
1822             do {
1823                 LOG_STATE("NonUTF8Query");
1824                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1825                 if (*c == '#') {
1826                     encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
1827                     m_url.m_queryEnd = currentPosition(c);
1828                     state = State::Fragment;
1829                     break;
1830                 }
1831                 appendCodePoint(queryBuffer, *c);
1832                 advance(c, queryBegin);
1833             } while (!c.atEnd());
1834             break;
1835         case State::Fragment:
1836             URL_PARSER_LOG("State Fragment");
1837             utf8PercentEncode<isInSimpleEncodeSet>(c);
1838             ++c;
1839             break;
1840         }
1841     }
1842
1843     switch (state) {
1844     case State::SchemeStart:
1845         LOG_FINAL_STATE("SchemeStart");
1846         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1847             m_url = base;
1848             m_url.removeFragmentIdentifier();
1849             return;
1850         }
1851         failure();
1852         return;
1853     case State::Scheme:
1854         LOG_FINAL_STATE("Scheme");
1855         failure();
1856         return;
1857     case State::NoScheme:
1858         LOG_FINAL_STATE("NoScheme");
1859         RELEASE_ASSERT_NOT_REACHED();
1860     case State::SpecialRelativeOrAuthority:
1861         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1862         copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1863         break;
1864     case State::PathOrAuthority:
1865         LOG_FINAL_STATE("PathOrAuthority");
1866         ASSERT(m_url.m_userStart);
1867         ASSERT(m_url.m_userStart == currentPosition(c));
1868         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1869         m_url.m_userStart--;
1870         m_url.m_userEnd = m_url.m_userStart;
1871         m_url.m_passwordEnd = m_url.m_userStart;
1872         m_url.m_hostEnd = m_url.m_userStart;
1873         m_url.m_portLength = 0;
1874         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1875         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1876         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1877         break;
1878     case State::Relative:
1879         LOG_FINAL_STATE("Relative");
1880         RELEASE_ASSERT_NOT_REACHED();
1881     case State::RelativeSlash:
1882         LOG_FINAL_STATE("RelativeSlash");
1883         copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1884         appendToASCIIBuffer('/');
1885         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
1886         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1887         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1888         break;
1889     case State::SpecialAuthoritySlashes:
1890         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1891         m_url.m_userStart = currentPosition(c);
1892         m_url.m_userEnd = m_url.m_userStart;
1893         m_url.m_passwordEnd = m_url.m_userStart;
1894         m_url.m_hostEnd = m_url.m_userStart;
1895         m_url.m_portLength = 0;
1896         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1897         m_url.m_pathEnd = m_url.m_userStart;
1898         m_url.m_queryEnd = m_url.m_userStart;
1899         break;
1900     case State::SpecialAuthorityIgnoreSlashes:
1901         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1902         failure();
1903         return;
1904     case State::AuthorityOrHost:
1905         LOG_FINAL_STATE("AuthorityOrHost");
1906         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1907         m_url.m_passwordEnd = m_url.m_userEnd;
1908         if (authorityOrHostBegin.atEnd()) {
1909             m_url.m_userEnd = m_url.m_userStart;
1910             m_url.m_passwordEnd = m_url.m_userStart;
1911             m_url.m_hostEnd = m_url.m_userStart;
1912             m_url.m_portLength = 0;
1913             m_url.m_pathEnd = m_url.m_userStart;
1914         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1915             failure();
1916             return;
1917         } else {
1918             if (m_urlIsSpecial) {
1919                 syntaxViolation(c);
1920                 appendToASCIIBuffer('/');
1921                 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1922             } else
1923                 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1924         }
1925         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1926         m_url.m_queryEnd = m_url.m_pathEnd;
1927         break;
1928     case State::Host:
1929         LOG_FINAL_STATE("Host");
1930         if (!parseHostAndPort(authorityOrHostBegin)) {
1931             failure();
1932             return;
1933         }
1934         if (m_urlIsSpecial) {
1935             syntaxViolation(c);
1936             appendToASCIIBuffer('/');
1937             m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1938         } else
1939             m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1940         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1941         m_url.m_queryEnd = m_url.m_pathEnd;
1942         break;
1943     case State::File:
1944         LOG_FINAL_STATE("File");
1945         if (base.isValid() && base.protocolIs("file")) {
1946             copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1947             break;
1948         }
1949         syntaxViolation(c);
1950         appendToASCIIBuffer("///", 3);
1951         m_url.m_userStart = currentPosition(c) - 1;
1952         m_url.m_userEnd = m_url.m_userStart;
1953         m_url.m_passwordEnd = m_url.m_userStart;
1954         m_url.m_hostEnd = m_url.m_userStart;
1955         m_url.m_portLength = 0;
1956         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1957         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1958         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1959         break;
1960     case State::FileSlash:
1961         LOG_FINAL_STATE("FileSlash");
1962         syntaxViolation(c);
1963         m_url.m_userStart = currentPosition(c) + 1;
1964         appendToASCIIBuffer("//", 2);
1965         m_url.m_userEnd = m_url.m_userStart;
1966         m_url.m_passwordEnd = m_url.m_userStart;
1967         m_url.m_hostEnd = m_url.m_userStart;
1968         m_url.m_portLength = 0;
1969         if (copyBaseWindowsDriveLetter(base)) {
1970             appendToASCIIBuffer('/');
1971             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1972         } else
1973             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1974         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1975         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1976         break;
1977     case State::FileHost:
1978         LOG_FINAL_STATE("FileHost");
1979         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1980             && isWindowsDriveLetter(authorityOrHostBegin)) {
1981             syntaxViolation(authorityOrHostBegin);
1982             appendToASCIIBuffer('/');
1983             appendWindowsDriveLetter(authorityOrHostBegin);
1984             m_url.m_pathAfterLastSlash = currentPosition(c);
1985             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1986             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1987             break;
1988         }
1989         
1990         if (authorityOrHostBegin == c) {
1991             syntaxViolation(c);
1992             appendToASCIIBuffer('/');
1993             m_url.m_userStart = currentPosition(c) - 1;
1994             m_url.m_userEnd = m_url.m_userStart;
1995             m_url.m_passwordEnd = m_url.m_userStart;
1996             m_url.m_hostEnd = m_url.m_userStart;
1997             m_url.m_portLength = 0;
1998             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1999             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2000             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2001             break;
2002         }
2003
2004         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2005             failure();
2006             return;
2007         }
2008
2009         syntaxViolation(c);
2010         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2011             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2012             m_url.m_hostEnd = currentPosition(c);
2013             m_url.m_portLength = 0;
2014         }
2015         appendToASCIIBuffer('/');
2016         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
2017         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2018         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2019         break;
2020     case State::PathStart:
2021         LOG_FINAL_STATE("PathStart");
2022         RELEASE_ASSERT_NOT_REACHED();
2023     case State::Path:
2024         LOG_FINAL_STATE("Path");
2025         m_url.m_pathEnd = currentPosition(c);
2026         m_url.m_queryEnd = m_url.m_pathEnd;
2027         break;
2028     case State::CannotBeABaseURLPath:
2029         LOG_FINAL_STATE("CannotBeABaseURLPath");
2030         m_url.m_pathEnd = currentPosition(c);
2031         m_url.m_queryEnd = m_url.m_pathEnd;
2032         break;
2033     case State::UTF8Query:
2034         LOG_FINAL_STATE("UTF8Query");
2035         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2036         m_url.m_queryEnd = currentPosition(c);
2037         break;
2038     case State::NonUTF8Query:
2039         LOG_FINAL_STATE("NonUTF8Query");
2040         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2041         encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
2042         m_url.m_queryEnd = currentPosition(c);
2043         break;
2044     case State::Fragment:
2045         LOG_FINAL_STATE("Fragment");
2046         break;
2047     }
2048
2049     if (LIKELY(!m_didSeeSyntaxViolation)) {
2050         m_url.m_string = m_inputString;
2051         ASSERT(m_asciiBuffer.isEmpty());
2052     } else
2053         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2054     m_url.m_isValid = true;
2055     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2056 }
2057
2058 template<typename CharacterType>
2059 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2060 {
2061     if (UNLIKELY(iterator.atEnd())) {
2062         syntaxViolation(iterator);
2063         m_url.m_userEnd = currentPosition(iterator);
2064         m_url.m_passwordEnd = m_url.m_userEnd;
2065         return;
2066     }
2067     for (; !iterator.atEnd(); advance(iterator)) {
2068         if (*iterator == ':') {
2069             m_url.m_userEnd = currentPosition(iterator);
2070             auto iteratorAtColon = iterator;
2071             ++iterator;
2072             bool tabOrNewlineAfterColon = false;
2073             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2074                 tabOrNewlineAfterColon = true;
2075                 ++iterator;
2076             }
2077             if (UNLIKELY(iterator.atEnd())) {
2078                 syntaxViolation(iteratorAtColon);
2079                 m_url.m_passwordEnd = m_url.m_userEnd;
2080                 if (m_url.m_userEnd > m_url.m_userStart)
2081                     appendToASCIIBuffer('@');
2082                 return;
2083             }
2084             if (tabOrNewlineAfterColon)
2085                 syntaxViolation(iteratorAtColon);
2086             appendToASCIIBuffer(':');
2087             break;
2088         }
2089         utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2090     }
2091     for (; !iterator.atEnd(); advance(iterator))
2092         utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2093     m_url.m_passwordEnd = currentPosition(iterator);
2094     if (!m_url.m_userEnd)
2095         m_url.m_userEnd = m_url.m_passwordEnd;
2096     appendToASCIIBuffer('@');
2097 }
2098
2099 template<typename UnsignedIntegerType>
2100 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2101 {
2102     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2103     LChar* end = std::end(buf);
2104     LChar* p = end;
2105     do {
2106         *--p = (number % 10) + '0';
2107         number /= 10;
2108     } while (number);
2109     appendToASCIIBuffer(p, end - p);
2110 }
2111
2112 void URLParser::serializeIPv4(IPv4Address address)
2113 {
2114     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2115     appendToASCIIBuffer('.');
2116     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2117     appendToASCIIBuffer('.');
2118     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2119     appendToASCIIBuffer('.');
2120     appendNumberToASCIIBuffer<uint8_t>(address);
2121 }
2122     
// Returns how many consecutive zero pieces `address` contains starting at index `begin`
// (0 when address[begin] is non-zero or `begin` is past the last piece).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2132
2133 static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2134 {
2135     std::optional<size_t> longest;
2136     size_t longestLength = 0;
2137     for (size_t i = 0; i < 8; i++) {
2138         size_t length = zeroSequenceLength(address, i);
2139         if (length) {
2140             if (length > 1 && (!longest || longestLength < length)) {
2141                 longest = i;
2142                 longestLength = length;
2143             }
2144             i += length;
2145         }
2146     }
2147     return longest;
2148 }
2149
2150 void URLParser::serializeIPv6Piece(uint16_t piece)
2151 {
2152     bool printed = false;
2153     if (auto nibble0 = piece >> 12) {
2154         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2155         printed = true;
2156     }
2157     auto nibble1 = piece >> 8 & 0xF;
2158     if (printed || nibble1) {
2159         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2160         printed = true;
2161     }
2162     auto nibble2 = piece >> 4 & 0xF;
2163     if (printed || nibble2)
2164         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2165     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2166 }
2167
2168 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2169 {
2170     appendToASCIIBuffer('[');
2171     auto compressPointer = findLongestZeroSequence(address);
2172     for (size_t piece = 0; piece < 8; piece++) {
2173         if (compressPointer && compressPointer.value() == piece) {
2174             ASSERT(!address[piece]);
2175             if (piece)
2176                 appendToASCIIBuffer(':');
2177             else
2178                 appendToASCIIBuffer("::", 2);
2179             while (piece < 8 && !address[piece])
2180                 piece++;
2181             if (piece == 8)
2182                 break;
2183         }
2184         serializeIPv6Piece(address[piece]);
2185         if (piece < 7)
2186             appendToASCIIBuffer(':');
2187     }
2188     appendToASCIIBuffer(']');
2189 }
2190
2191 enum class URLParser::IPv4PieceParsingError {
2192     Failure,
2193     Overflow,
2194 };
2195
2196 template<typename CharacterType>
2197 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2198 {
2199     enum class State : uint8_t {
2200         UnknownBase,
2201         Decimal,
2202         OctalOrHex,
2203         Octal,
2204         Hex,
2205     };
2206     State state = State::UnknownBase;
2207     Checked<uint32_t, RecordOverflow> value = 0;
2208     if (!iterator.atEnd() && *iterator == '.')
2209         return makeUnexpected(IPv4PieceParsingError::Failure);
2210     while (!iterator.atEnd()) {
2211         if (isTabOrNewline(*iterator)) {
2212             didSeeSyntaxViolation = true;
2213             ++iterator;
2214             continue;
2215         }
2216         if (*iterator == '.') {
2217             ASSERT(!value.hasOverflowed());
2218             return value.unsafeGet();
2219         }
2220         switch (state) {
2221         case State::UnknownBase:
2222             if (UNLIKELY(*iterator == '0')) {
2223                 ++iterator;
2224                 state = State::OctalOrHex;
2225                 break;
2226             }
2227             state = State::Decimal;
2228             break;
2229         case State::OctalOrHex:
2230             didSeeSyntaxViolation = true;
2231             if (*iterator == 'x' || *iterator == 'X') {
2232                 ++iterator;
2233                 state = State::Hex;
2234                 break;
2235             }
2236             state = State::Octal;
2237             break;
2238         case State::Decimal:
2239             if (!isASCIIDigit(*iterator))
2240                 return makeUnexpected(IPv4PieceParsingError::Failure);
2241             value *= 10;
2242             value += *iterator - '0';
2243             if (UNLIKELY(value.hasOverflowed()))
2244                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2245             ++iterator;
2246             break;
2247         case State::Octal:
2248             ASSERT(didSeeSyntaxViolation);
2249             if (*iterator < '0' || *iterator > '7')
2250                 return makeUnexpected(IPv4PieceParsingError::Failure);
2251             value *= 8;
2252             value += *iterator - '0';
2253             if (UNLIKELY(value.hasOverflowed()))
2254                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2255             ++iterator;
2256             break;
2257         case State::Hex:
2258             ASSERT(didSeeSyntaxViolation);
2259             if (!isASCIIHexDigit(*iterator))
2260                 return makeUnexpected(IPv4PieceParsingError::Failure);
2261             value *= 16;
2262             value += toASCIIHexValue(*iterator);
2263             if (UNLIKELY(value.hasOverflowed()))
2264                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2265             ++iterator;
2266             break;
2267         }
2268     }
2269     ASSERT(!value.hasOverflowed());
2270     return value.unsafeGet();
2271 }
2272
2273 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2274 {
2275     RELEASE_ASSERT(exponent <= 4);
2276     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2277     return values[exponent];
2278 }
2279
2280 enum class URLParser::IPv4ParsingError {
2281     Failure,
2282     NotIPv4,
2283 };
2284
2285 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2286 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2287 {
2288     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2289     bool didSeeSyntaxViolation = false;
2290     if (!iterator.atEnd() && *iterator == '.')
2291         return makeUnexpected(IPv4ParsingError::NotIPv4);
2292     while (!iterator.atEnd()) {
2293         if (isTabOrNewline(*iterator)) {
2294             didSeeSyntaxViolation = true;
2295             ++iterator;
2296             continue;
2297         }
2298         if (items.size() >= 4)
2299             return makeUnexpected(IPv4ParsingError::NotIPv4);
2300         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2301         if (!iterator.atEnd() && *iterator == '.') {
2302             ++iterator;
2303             if (iterator.atEnd())
2304                 syntaxViolation(iteratorForSyntaxViolationPosition);
2305             else if (*iterator == '.')
2306                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2307         }
2308     }
2309     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2310         return makeUnexpected(IPv4ParsingError::NotIPv4);
2311     for (const auto& item : items) {
2312         if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2313             return makeUnexpected(IPv4ParsingError::NotIPv4);
2314     }
2315     for (const auto& item : items) {
2316         if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2317             return makeUnexpected(IPv4ParsingError::Failure);
2318     }
2319     if (items.size() > 1) {
2320         for (size_t i = 0; i < items.size() - 1; i++) {
2321             if (items[i].value() > 255)
2322                 return makeUnexpected(IPv4ParsingError::Failure);
2323         }
2324     }
2325     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2326         return makeUnexpected(IPv4ParsingError::Failure);
2327
2328     if (didSeeSyntaxViolation)
2329         syntaxViolation(iteratorForSyntaxViolationPosition);
2330     for (const auto& item : items) {
2331         if (item.value() > 255)
2332             syntaxViolation(iteratorForSyntaxViolationPosition);
2333     }
2334
2335     if (UNLIKELY(items.size() != 4))
2336         syntaxViolation(iteratorForSyntaxViolationPosition);
2337
2338     IPv4Address ipv4 = items.takeLast().value();
2339     for (size_t counter = 0; counter < items.size(); ++counter)
2340         ipv4 += items[counter].value() * pow256(3 - counter);
2341     return ipv4;
2342 }
2343
2344 template<typename CharacterType>
2345 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2346 {
2347     if (iterator.atEnd())
2348         return std::nullopt;
2349     uint32_t piece = 0;
2350     bool leadingZeros = false;
2351     size_t digitCount = 0;
2352     while (!iterator.atEnd()) {
2353         if (!isASCIIDigit(*iterator))
2354             return std::nullopt;
2355         ++digitCount;
2356         if (!piece && *iterator == '0') {
2357             if (leadingZeros)
2358                 return std::nullopt;
2359             leadingZeros = true;
2360         }
2361         if (!piece && *iterator == '0')
2362             leadingZeros = true;
2363         piece = piece * 10 + *iterator - '0';
2364         if (piece > 255)
2365             return std::nullopt;
2366         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2367         if (iterator.atEnd())
2368             break;
2369         if (*iterator == '.')
2370             break;
2371     }
2372     if (piece && leadingZeros)
2373         return std::nullopt;
2374     return piece;
2375 }
2376
2377 template<typename CharacterType>
2378 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2379 {
2380     IPv4Address address = 0;
2381     for (size_t i = 0; i < 4; ++i) {
2382         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2383             address = (address << 8) + piece.value();
2384         else
2385             return std::nullopt;
2386         if (i < 3) {
2387             if (iterator.atEnd())
2388                 return std::nullopt;
2389             if (*iterator != '.')
2390                 return std::nullopt;
2391             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2392         } else if (!iterator.atEnd())
2393             return std::nullopt;
2394     }
2395     ASSERT(iterator.atEnd());
2396     return address;
2397 }
2398
2399 template<typename CharacterType>
2400 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2401 {
2402     ASSERT(*c == '[');
2403     const auto hostBegin = c;
2404     advance(c, hostBegin);
2405     if (c.atEnd())
2406         return std::nullopt;
2407
2408     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2409     size_t piecePointer = 0;
2410     std::optional<size_t> compressPointer;
2411
2412     if (*c == ':') {
2413         advance(c, hostBegin);
2414         if (c.atEnd())
2415             return std::nullopt;
2416         if (*c != ':')
2417             return std::nullopt;
2418         advance(c, hostBegin);
2419         ++piecePointer;
2420         compressPointer = piecePointer;
2421     }
2422     
2423     while (!c.atEnd()) {
2424         if (piecePointer == 8)
2425             return std::nullopt;
2426         if (*c == ':') {
2427             if (compressPointer)
2428                 return std::nullopt;
2429             advance(c, hostBegin);
2430             ++piecePointer;
2431             compressPointer = piecePointer;
2432             continue;
2433         }
2434         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2435             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2436                 if (compressPointer && piecePointer == 5)
2437                     return std::nullopt;
2438                 syntaxViolation(hostBegin);
2439                 address[piecePointer++] = ipv4Address.value() >> 16;
2440                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2441                 c = { };
2442                 break;
2443             }
2444         }
2445         uint16_t value = 0;
2446         size_t length = 0;
2447         bool leadingZeros = false;
2448         for (; length < 4; length++) {
2449             if (c.atEnd())
2450                 break;
2451             if (!isASCIIHexDigit(*c))
2452                 break;
2453             if (isASCIIUpper(*c))
2454                 syntaxViolation(hostBegin);
2455             if (*c == '0' && !length)
2456                 leadingZeros = true;
2457             value = value * 0x10 + toASCIIHexValue(*c);
2458             advance(c, hostBegin);
2459         }
2460         
2461         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2462             syntaxViolation(hostBegin);
2463
2464         address[piecePointer++] = value;
2465         if (c.atEnd())
2466             break;
2467         if (piecePointer == 8 || *c != ':')
2468             return std::nullopt;
2469         advance(c, hostBegin);
2470     }
2471     
2472     if (!c.atEnd())
2473         return std::nullopt;
2474     
2475     if (compressPointer) {
2476         size_t swaps = piecePointer - compressPointer.value();
2477         piecePointer = 7;
2478         while (swaps)
2479             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2480     } else if (piecePointer != 8)
2481         return std::nullopt;
2482
2483     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2484     if (possibleCompressPointer)
2485         possibleCompressPointer.value()++;
2486     if (UNLIKELY(compressPointer != possibleCompressPointer))
2487         syntaxViolation(hostBegin);
2488     
2489     return address;
2490 }
2491
2492 template<typename CharacterType>
2493 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2494 {
2495     LCharBuffer output;
2496     output.reserveInitialCapacity(length);
2497     
2498     for (size_t i = 0; i < length; ++i) {
2499         uint8_t byte = input[i];
2500         if (byte != '%')
2501             output.uncheckedAppend(byte);
2502         else if (length > 2 && i < length - 2) {
2503             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2504                 syntaxViolation(iteratorForSyntaxViolationPosition);
2505                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2506                 i += 2;
2507             } else
2508                 output.uncheckedAppend(byte);
2509         } else
2510             output.uncheckedAppend(byte);
2511     }
2512     return output;
2513 }
2514     
2515 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2516 {
2517     LCharBuffer output;
2518     output.reserveInitialCapacity(length);
2519     
2520     for (size_t i = 0; i < length; ++i) {
2521         uint8_t byte = input[i];
2522         if (byte != '%')
2523             output.uncheckedAppend(byte);
2524         else if (length > 2 && i < length - 2) {
2525             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2526                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2527                 i += 2;
2528             } else
2529                 output.uncheckedAppend(byte);
2530         } else
2531             output.uncheckedAppend(byte);
2532     }
2533     return output;
2534 }
2535
2536 template<typename CharacterType> std::optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2537 {
2538     LCharBuffer ascii;
2539     if (domain.isAllASCII()) {
2540         size_t length = domain.length();
2541         if (domain.is8Bit()) {
2542             const LChar* characters = domain.characters8();
2543             ascii.reserveInitialCapacity(length);
2544             for (size_t i = 0; i < length; ++i) {
2545                 if (UNLIKELY(isASCIIUpper(characters[i])))
2546                     syntaxViolation(iteratorForSyntaxViolationPosition);
2547                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2548             }
2549         } else {
2550             const UChar* characters = domain.characters16();
2551             ascii.reserveInitialCapacity(length);
2552             for (size_t i = 0; i < length; ++i) {
2553                 if (UNLIKELY(isASCIIUpper(characters[i])))
2554                     syntaxViolation(iteratorForSyntaxViolationPosition);
2555                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2556             }
2557         }
2558         return ascii;
2559     }
2560     
2561     const size_t maxDomainLength = 64;
2562     UChar hostnameBuffer[maxDomainLength];
2563     UErrorCode error = U_ZERO_ERROR;
2564     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2565     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, maxDomainLength, &processingDetails, &error);
2566     ASSERT(numCharactersConverted <= static_cast<int32_t>(maxDomainLength));
2567
2568     if (U_SUCCESS(error) && !processingDetails.errors) {
2569         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2570             ASSERT(isASCII(hostnameBuffer[i]));
2571             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2572         }
2573         ascii.append(hostnameBuffer, numCharactersConverted);
2574         if (domain != StringView(ascii.data(), ascii.size()))
2575             syntaxViolation(iteratorForSyntaxViolationPosition);
2576         return ascii;
2577     }
2578     return std::nullopt;
2579 }
2580
2581 bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2582 {
2583     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2584         if (isForbiddenHostCodePoint(asciiDomain[i]))
2585             return true;
2586     }
2587     return false;
2588 }
2589
2590 template<typename CharacterType>
2591 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2592 {
2593     ASSERT(*iterator == ':');
2594     auto colonIterator = iterator;
2595     advance(iterator, colonIterator);
2596     uint32_t port = 0;
2597     if (UNLIKELY(iterator.atEnd())) {
2598         unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2599         RELEASE_ASSERT(portLength <= URL::maxPortLength);
2600         m_url.m_portLength = portLength;
2601         syntaxViolation(colonIterator);
2602         return true;
2603     }
2604     size_t digitCount = 0;
2605     bool leadingZeros = false;
2606     for (; !iterator.atEnd(); ++iterator) {
2607         if (UNLIKELY(isTabOrNewline(*iterator))) {
2608             syntaxViolation(colonIterator);
2609             continue;
2610         }
2611         if (isASCIIDigit(*iterator)) {
2612             if (*iterator == '0' && !digitCount)
2613                 leadingZeros = true;
2614             ++digitCount;
2615             port = port * 10 + *iterator - '0';
2616             if (port > std::numeric_limits<uint16_t>::max())
2617                 return false;
2618         } else
2619             return false;
2620     }
2621
2622     if (port && leadingZeros)
2623         syntaxViolation(colonIterator);
2624     
2625     if (!port && digitCount > 1)
2626         syntaxViolation(colonIterator);
2627
2628     ASSERT(port == static_cast<uint16_t>(port));
2629     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2630         syntaxViolation(colonIterator);
2631     else {
2632         appendToASCIIBuffer(':');
2633         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2634         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2635     }
2636
2637     unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2638     RELEASE_ASSERT(portLength <= URL::maxPortLength);
2639     m_url.m_portLength = portLength;
2640     return true;
2641 }
2642
2643 template<typename CharacterType>
2644 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2645 {
2646     if (iterator.atEnd())
2647         return false;
2648     if (*iterator == ':')
2649         return false;
2650     if (*iterator == '[') {
2651         auto ipv6End = iterator;
2652         while (!ipv6End.atEnd() && *ipv6End != ']')
2653             ++ipv6End;
2654         if (ipv6End.atEnd())
2655             return false;
2656         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2657             serializeIPv6(address.value());
2658             if (!ipv6End.atEnd()) {
2659                 advance(ipv6End);
2660                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2661                     m_url.m_hostEnd = currentPosition(ipv6End);
2662                     return parsePort(ipv6End);
2663                 }
2664                 m_url.m_hostEnd = currentPosition(ipv6End);
2665                 m_url.m_portLength = 0;
2666                 return true;
2667             }
2668             m_url.m_hostEnd = currentPosition(ipv6End);
2669             return true;
2670         }
2671         return false;
2672     }
2673
2674     if (!m_urlIsSpecial) {
2675         for (; !iterator.atEnd(); ++iterator) {
2676             if (UNLIKELY(isTabOrNewline(*iterator))) {
2677                 syntaxViolation(iterator);
2678                 continue;
2679             }
2680             if (*iterator == ':')
2681                 break;
2682             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2683                 return false;
2684             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2685         }
2686         m_url.m_hostEnd = currentPosition(iterator);
2687         if (iterator.atEnd()) {
2688             m_url.m_portLength = 0;
2689             return true;
2690         }
2691         return parsePort(iterator);
2692     }
2693     
2694     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2695         auto hostIterator = iterator;
2696         for (; !iterator.atEnd(); ++iterator) {
2697             if (isTabOrNewline(*iterator))
2698                 continue;
2699             if (*iterator == ':')
2700                 break;
2701             if (isForbiddenHostCodePoint(*iterator))
2702                 return false;
2703         }
2704         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2705         if (address) {
2706             serializeIPv4(address.value());
2707             m_url.m_hostEnd = currentPosition(iterator);
2708             if (iterator.atEnd()) {
2709                 m_url.m_portLength = 0;
2710                 return true;
2711             }
2712             return parsePort(iterator);
2713         }
2714         if (address.error() == IPv4ParsingError::Failure)
2715             return false;
2716         for (; hostIterator != iterator; ++hostIterator) {
2717             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2718                 syntaxViolation(hostIterator);
2719                 continue;
2720             }
2721             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2722                 syntaxViolation(hostIterator);
2723             appendToASCIIBuffer(toASCIILower(*hostIterator));
2724         }
2725         m_url.m_hostEnd = currentPosition(iterator);
2726         if (!hostIterator.atEnd())
2727             return parsePort(hostIterator);
2728         unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2729         RELEASE_ASSERT(portLength <= URL::maxPortLength);
2730         m_url.m_portLength = portLength;
2731         return true;
2732     }
2733     
2734     const auto hostBegin = iterator;
2735     
2736     LCharBuffer utf8Encoded;
2737     for (; !iterator.atEnd(); ++iterator) {
2738         if (UNLIKELY(isTabOrNewline(*iterator))) {
2739             syntaxViolation(hostBegin);
2740             continue;
2741         }
2742         if (*iterator == ':')
2743             break;
2744         if (UNLIKELY(!isASCII(*iterator)))
2745             syntaxViolation(hostBegin);
2746
2747         if (!U_IS_UNICODE_CHAR(*iterator))
2748             return false;
2749         uint8_t buffer[U8_MAX_LENGTH];
2750         int32_t offset = 0;
2751         U8_APPEND_UNSAFE(buffer, offset, *iterator);
2752         utf8Encoded.append(buffer, offset);
2753     }
2754     LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2755     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2756     if (domain.isNull())
2757         return false;
2758     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2759         syntaxViolation(hostBegin);
2760     auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2761     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2762         return false;
2763     LCharBuffer& asciiDomainValue = asciiDomain.value();
2764     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2765
2766     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2767     if (address) {
2768         serializeIPv4(address.value());
2769         m_url.m_hostEnd = currentPosition(iterator);
2770         if (iterator.atEnd()) {
2771             m_url.m_portLength = 0;
2772             return true;
2773         }
2774         return parsePort(iterator);
2775     }
2776     if (address.error() == IPv4ParsingError::Failure)
2777         return false;
2778
2779     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2780     m_url.m_hostEnd = currentPosition(iterator);
2781     if (!iterator.atEnd())
2782         return parsePort(iterator);
2783     m_url.m_portLength = 0;
2784     return true;
2785 }
2786
2787 std::optional<String> URLParser::formURLDecode(StringView input)
2788 {
2789     auto utf8 = input.utf8(StrictConversion);
2790     if (utf8.isNull())
2791         return std::nullopt;
2792     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2793     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2794 }
2795
2796 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2797 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2798 {
2799     URLEncodedForm output;
2800     for (StringView bytes : input.split('&')) {
2801         auto equalIndex = bytes.find('=');
2802         if (equalIndex == notFound) {
2803             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2804             if (name)
2805                 output.append({ name.value(), emptyString() });
2806         } else {
2807             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2808             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2809             if (name && value)
2810                 output.append({ name.value(), value.value() });
2811         }
2812     }
2813     return output;
2814 }
2815
2816 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2817 {
2818     auto utf8 = input.utf8(StrictConversion);
2819     const char* data = utf8.data();
2820     for (size_t i = 0; i < utf8.length(); ++i) {
2821         const char byte = data[i];
2822         if (byte == 0x20)
2823             output.append(0x2B);
2824         else if (byte == 0x2A
2825             || byte == 0x2D
2826             || byte == 0x2E
2827             || (byte >= 0x30 && byte <= 0x39)
2828             || (byte >= 0x41 && byte <= 0x5A)
2829             || byte == 0x5F
2830             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2831             output.append(byte);
2832         else
2833             percentEncodeByte(byte, output);
2834     }
2835 }
2836     
2837 String URLParser::serialize(const URLEncodedForm& tuples)
2838 {
2839     if (tuples.isEmpty())
2840         return { };
2841
2842     Vector<LChar> output;
2843     for (auto& tuple : tuples) {
2844         if (!output.isEmpty())
2845             output.append('&');
2846         serializeURLEncodedForm(tuple.key, output);
2847         output.append('=');
2848         serializeURLEncodedForm(tuple.value, output);
2849     }
2850     return String::adopt(WTFMove(output));
2851 }
2852
2853 const UIDNA& URLParser::internationalDomainNameTranscoder()
2854 {
2855     static UIDNA* encoder;
2856     static std::once_flag onceFlag;
2857     std::call_once(onceFlag, [] {
2858         UErrorCode error = U_ZERO_ERROR;
2859         // Warning: Please contact a WebKitGTK+ developer if changing these flags.
2860         // They should be synced with ephy_uri_decode() in ephy-uri-helpers.c.
2861         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2862         RELEASE_ASSERT(U_SUCCESS(error));
2863         RELEASE_ASSERT(encoder);
2864     });
2865     return *encoder;
2866 }
2867
2868 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2869 {
2870     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2871         a.m_isValid,
2872         a.m_cannotBeABaseURL,
2873         a.m_protocolIsInHTTPFamily,
2874         a.m_schemeEnd,
2875         a.m_userStart,
2876         a.m_userEnd,
2877         a.m_passwordEnd,
2878         a.m_hostEnd,
2879         a.m_hostEnd + a.m_portLength,
2880         a.m_pathAfterLastSlash,
2881         a.m_pathEnd,
2882         a.m_queryEnd,
2883         a.m_string.utf8().data(),
2884         b.m_isValid,
2885         b.m_cannotBeABaseURL,
2886         b.m_protocolIsInHTTPFamily,
2887         b.m_schemeEnd,
2888         b.m_userStart,
2889         b.m_userEnd,
2890         b.m_passwordEnd,
2891         b.m_hostEnd,
2892         b.m_hostEnd + b.m_portLength,
2893         b.m_pathAfterLastSlash,
2894         b.m_pathEnd,
2895         b.m_queryEnd,
2896         b.m_string.utf8().data());
2897
2898     return a.m_string == b.m_string
2899         && a.m_isValid == b.m_isValid
2900         && a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2901         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2902         && a.m_schemeEnd == b.m_schemeEnd
2903         && a.m_userStart == b.m_userStart
2904         && a.m_userEnd == b.m_userEnd
2905         && a.m_passwordEnd == b.m_passwordEnd
2906         && a.m_hostEnd == b.m_hostEnd
2907         && a.m_portLength == b.m_portLength
2908         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2909         && a.m_pathEnd == b.m_pathEnd
2910         && a.m_queryEnd == b.m_queryEnd;
2911 }
2912
2913 bool URLParser::internalValuesConsistent(const URL& url)
2914 {
2915     return url.m_schemeEnd <= url.m_userStart
2916         && url.m_userStart <= url.m_userEnd
2917         && url.m_userEnd <= url.m_passwordEnd
2918         && url.m_passwordEnd <= url.m_hostEnd
2919         && url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2920         && url.m_pathAfterLastSlash <= url.m_pathEnd
2921         && url.m_pathEnd <= url.m_queryEnd
2922         && url.m_queryEnd <= url.m_string.length();
2923 }
2924
2925 } // namespace WTF