Reduce size of WebCore::URL
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016-2018 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <mutex>
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
35
36 namespace WebCore {
37
// Compile-time switch for parser logging: when set to a non-zero value,
// URL_PARSER_LOG forwards its arguments to LOG(URLParser, ...); when 0 it
// expands to nothing.
#define URL_PARSER_DEBUGGING 0

#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#else
#define URL_PARSER_LOG(...)
#endif
45     
46 template<typename CharacterType>
47 class CodePointIterator {
48 public:
49     ALWAYS_INLINE CodePointIterator() { }
50     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51         : m_begin(begin)
52         , m_end(end)
53     {
54     }
55     
56     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57         : CodePointIterator(begin.m_begin, end.m_begin)
58     {
59         ASSERT(end.m_begin >= begin.m_begin);
60     }
61     
62     ALWAYS_INLINE UChar32 operator*() const;
63     ALWAYS_INLINE CodePointIterator& operator++();
64
65     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66     {
67         return m_begin == other.m_begin
68             && m_end == other.m_end;
69     }
70     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71     
72     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73     {
74         m_begin = other.m_begin;
75         m_end = other.m_end;
76         return *this;
77     }
78
79     ALWAYS_INLINE bool atEnd() const
80     {
81         ASSERT(m_begin <= m_end);
82         return m_begin >= m_end;
83     }
84     
85     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86     {
87         ASSERT(m_begin >= reference);
88         return m_begin - reference;
89     }
90
91     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92     {
93         return codeUnitsSince(other.m_begin);
94     }
95     
96 private:
97     const CharacterType* m_begin { nullptr };
98     const CharacterType* m_end { nullptr };
99 };
100
101 template<>
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
103 {
104     ASSERT(!atEnd());
105     return *m_begin;
106 }
107
108 template<>
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
110 {
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     unsigned i = 0;
128     size_t length = m_end - m_begin;
129     U16_FWD_1(m_begin, i, length);
130     m_begin += i;
131     return *this;
132 }
133     
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
135 {
136     if (U_IS_BMP(codePoint)) {
137         destination.append(static_cast<UChar>(codePoint));
138         return;
139     }
140     destination.reserveCapacity(destination.size() + 2);
141     destination.uncheckedAppend(U16_LEAD(codePoint));
142     destination.uncheckedAppend(U16_TRAIL(codePoint));
143 }
144
// Bit flags stored per byte value in characterClassTable. A single byte may carry
// several flags at once, so every enumerator occupies its own bit.
enum URLCharacterClass {
    UserInfo = 1 << 0, // Tested by isInUserInfoEncodeSet.
    Default = 1 << 1, // Tested by isInDefaultEncodeSet.
    ForbiddenHost = 1 << 2, // Tested by isForbiddenHostCodePoint.
    QueryPercent = 1 << 3, // Tested by shouldPercentEncodeQueryByte.
    SlashQuestionOrHash = 1 << 4, // Tested by isSlashQuestionOrHash.
    ValidScheme = 1 << 5, // Tested by isValidSchemeCharacter.
};
153
// Lookup table indexed by byte value (0x00-0xFF). Each entry is the bitwise OR of
// the URLCharacterClass flags that apply to that byte; 0 means no flag applies.
// The trailing comment on every row names the byte the row describes.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    0, // '\''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
412
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
423 ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
424 {
425     if (characterClassTable[byte] & QueryPercent)
426         return true;
427     if (byte == '\'' && urlIsSpecial)
428         return true;
429     return false;
430 }
431
// Member entry point that forwards to the file-local WebCore::isInUserInfoEncodeSet
// template; the explicit WebCore:: qualification avoids resolving to this member itself.
bool URLParser::isInUserInfoEncodeSet(UChar c)
{
    return WebCore::isInUserInfoEncodeSet(c);
}
436
437 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
438 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
439 {
440     ++iterator;
441     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
442         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
443             syntaxViolation(iteratorForSyntaxViolationPosition);
444         ++iterator;
445     }
446 }
447
448 template<typename CharacterType>
449 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
450 {
451     if (iterator.atEnd())
452         return false;
453     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
454     if (iterator.atEnd())
455         return false;
456     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
457     return iterator.atEnd();
458 }
459
460 template<typename CharacterType>
461 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
462 {
463     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
464         return false;
465     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
466     if (iterator.atEnd())
467         return false;
468     if (*iterator == ':')
469         return true;
470     if (UNLIKELY(*iterator == '|'))
471         return true;
472     return false;
473 }
474
475 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
476 {
477     ASSERT(isASCII(codePoint));
478     if (UNLIKELY(m_didSeeSyntaxViolation))
479         m_asciiBuffer.append(codePoint);
480 }
481
482 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
483 {
484     if (UNLIKELY(m_didSeeSyntaxViolation))
485         m_asciiBuffer.append(characters, length);
486 }
487
488 template<typename CharacterType>
489 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
490 {
491     ASSERT(isWindowsDriveLetter(iterator));
492     appendToASCIIBuffer(*iterator);
493     advance(iterator);
494     ASSERT(!iterator.atEnd());
495     ASSERT(*iterator == ':' || *iterator == '|');
496     if (*iterator == '|')
497         syntaxViolation(iterator);
498     appendToASCIIBuffer(':');
499     advance(iterator);
500 }
501
502 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
503 {
504     if (base.protocolIs("file")) {
505         RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
506         if (base.m_string.is8Bit()) {
507             const LChar* begin = base.m_string.characters8();
508             CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
509             if (isWindowsDriveLetter(c)) {
510                 appendWindowsDriveLetter(c);
511                 return true;
512             }
513         } else {
514             const UChar* begin = base.m_string.characters16();
515             CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
516             if (isWindowsDriveLetter(c)) {
517                 appendWindowsDriveLetter(c);
518                 return true;
519             }
520         }
521     }
522     return false;
523 }
524
525 template<typename CharacterType>
526 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
527 {
528     if (!isWindowsDriveLetter(iterator))
529         return true;
530     if (iterator.atEnd())
531         return false;
532     advance(iterator);
533     if (iterator.atEnd())
534         return true;
535     advance(iterator);
536     if (iterator.atEnd())
537         return true;
538     return !isSlashQuestionOrHash(*iterator);
539 }
540
541 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
542 {
543     buffer.append('%');
544     buffer.append(upperNibbleToASCIIHexDigit(byte));
545     buffer.append(lowerNibbleToASCIIHexDigit(byte));
546 }
547
548 void URLParser::percentEncodeByte(uint8_t byte)
549 {
550     ASSERT(m_didSeeSyntaxViolation);
551     appendToASCIIBuffer('%');
552     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
553     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
554 }
555
// Percent-encoded UTF-8 bytes EF BF BD (U+FFFD REPLACEMENT CHARACTER), emitted in
// place of code points that fail U_IS_UNICODE_CHAR. The array holds the nine
// visible characters plus the terminating NUL; the length constant excludes the NUL.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
558
559 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
560 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
561 {
562     ASSERT(!iterator.atEnd());
563     UChar32 codePoint = *iterator;
564     if (LIKELY(isASCII(codePoint))) {
565         if (UNLIKELY(isInCodeSet(codePoint))) {
566             syntaxViolation(iterator);
567             percentEncodeByte(codePoint);
568         } else
569             appendToASCIIBuffer(codePoint);
570         return;
571     }
572     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
573     syntaxViolation(iterator);
574     
575     if (!U_IS_UNICODE_CHAR(codePoint)) {
576         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
577         return;
578     }
579     
580     uint8_t buffer[U8_MAX_LENGTH];
581     int32_t offset = 0;
582     U8_APPEND_UNSAFE(buffer, offset, codePoint);
583     for (int32_t i = 0; i < offset; ++i)
584         percentEncodeByte(buffer[i]);
585 }
586
587 template<typename CharacterType>
588 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
589 {
590     ASSERT(!iterator.atEnd());
591     UChar32 codePoint = *iterator;
592     if (LIKELY(isASCII(codePoint))) {
593         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
594             syntaxViolation(iterator);
595             percentEncodeByte(codePoint);
596         } else
597             appendToASCIIBuffer(codePoint);
598         return;
599     }
600     
601     syntaxViolation(iterator);
602     
603     if (!U_IS_UNICODE_CHAR(codePoint)) {
604         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
605         return;
606     }
607
608     uint8_t buffer[U8_MAX_LENGTH];
609     int32_t offset = 0;
610     U8_APPEND_UNSAFE(buffer, offset, codePoint);
611     for (int32_t i = 0; i < offset; ++i) {
612         auto byte = buffer[i];
613         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
614             percentEncodeByte(byte);
615         else
616             appendToASCIIBuffer(byte);
617     }
618 }
619
620 template<typename CharacterType>
621 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
622 {
623     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
624     auto encoded = encoding.encode(StringView(source.data(), source.size()), UnencodableHandling::URLEncodedEntities);
625     auto* data = encoded.data();
626     size_t length = encoded.size();
627     
628     if (!length == !iterator.atEnd()) {
629         syntaxViolation(iterator);
630         return;
631     }
632     
633     size_t i = 0;
634     for (; i < length; ++i) {
635         ASSERT(!iterator.atEnd());
636         uint8_t byte = data[i];
637         if (UNLIKELY(byte != *iterator)) {
638             syntaxViolation(iterator);
639             break;
640         }
641         if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
642             syntaxViolation(iterator);
643             break;
644         }
645         appendToASCIIBuffer(byte);
646         ++iterator;
647     }
648     while (!iterator.atEnd() && isTabOrNewline(*iterator))
649         ++iterator;
650     ASSERT((i == length) == iterator.atEnd());
651     for (; i < length; ++i) {
652         ASSERT(m_didSeeSyntaxViolation);
653         uint8_t byte = data[i];
654         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
655             percentEncodeByte(byte);
656         else
657             appendToASCIIBuffer(byte);
658     }
659 }
660
661 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
662 {
663     static const uint16_t ftpPort = 21;
664     static const uint16_t gopherPort = 70;
665     static const uint16_t httpPort = 80;
666     static const uint16_t httpsPort = 443;
667     static const uint16_t wsPort = 80;
668     static const uint16_t wssPort = 443;
669     
670     auto length = scheme.length();
671     if (!length)
672         return std::nullopt;
673     switch (scheme[0]) {
674     case 'w':
675         switch (length) {
676         case 2:
677             if (scheme[1] == 's')
678                 return wsPort;
679             return std::nullopt;
680         case 3:
681             if (scheme[1] == 's'
682                 && scheme[2] == 's')
683                 return wssPort;
684             return std::nullopt;
685         default:
686             return false;
687         }
688     case 'h':
689         switch (length) {
690         case 4:
691             if (scheme[1] == 't'
692                 && scheme[2] == 't'
693                 && scheme[3] == 'p')
694                 return httpPort;
695             return std::nullopt;
696         case 5:
697             if (scheme[1] == 't'
698                 && scheme[2] == 't'
699                 && scheme[3] == 'p'
700                 && scheme[4] == 's')
701                 return httpsPort;
702             return std::nullopt;
703         default:
704             return std::nullopt;
705         }
706     case 'g':
707         if (length == 6
708             && scheme[1] == 'o'
709             && scheme[2] == 'p'
710             && scheme[3] == 'h'
711             && scheme[4] == 'e'
712             && scheme[5] == 'r')
713             return gopherPort;
714         return std::nullopt;
715     case 'f':
716         if (length == 3
717             && scheme[1] == 't'
718             && scheme[2] == 'p')
719             return ftpPort;
720         return std::nullopt;
721     default:
722         return std::nullopt;
723     }
724 }
725
// Result of classifying a scheme string with scheme(): one enumerator per scheme
// that function recognizes, and NonSpecial for everything else.
enum class Scheme {
    WS,
    WSS,
    File,
    FTP,
    Gopher,
    HTTP,
    HTTPS,
    NonSpecial
};
736
737 ALWAYS_INLINE static Scheme scheme(StringView scheme)
738 {
739     auto length = scheme.length();
740     if (!length)
741         return Scheme::NonSpecial;
742     switch (scheme[0]) {
743     case 'f':
744         switch (length) {
745         case 3:
746             if (scheme[1] == 't'
747                 && scheme[2] == 'p')
748                 return Scheme::FTP;
749             return Scheme::NonSpecial;
750         case 4:
751             if (scheme[1] == 'i'
752                 && scheme[2] == 'l'
753                 && scheme[3] == 'e')
754                 return Scheme::File;
755             return Scheme::NonSpecial;
756         default:
757             return Scheme::NonSpecial;
758         }
759     case 'g':
760         if (length == 6
761             && scheme[1] == 'o'
762             && scheme[2] == 'p'
763             && scheme[3] == 'h'
764             && scheme[4] == 'e'
765             && scheme[5] == 'r')
766             return Scheme::Gopher;
767         return Scheme::NonSpecial;
768     case 'h':
769         switch (length) {
770         case 4:
771             if (scheme[1] == 't'
772                 && scheme[2] == 't'
773                 && scheme[3] == 'p')
774                 return Scheme::HTTP;
775             return Scheme::NonSpecial;
776         case 5:
777             if (scheme[1] == 't'
778                 && scheme[2] == 't'
779                 && scheme[3] == 'p'
780                 && scheme[4] == 's')
781                 return Scheme::HTTPS;
782             return Scheme::NonSpecial;
783         default:
784             return Scheme::NonSpecial;
785         }
786     case 'w':
787         switch (length) {
788         case 2:
789             if (scheme[1] == 's')
790                 return Scheme::WS;
791             return Scheme::NonSpecial;
792         case 3:
793             if (scheme[1] == 's'
794                 && scheme[2] == 's')
795                 return Scheme::WSS;
796             return Scheme::NonSpecial;
797         default:
798             return Scheme::NonSpecial;
799         }
800     default:
801         return Scheme::NonSpecial;
802     }
803 }
804
805 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
806 {
807     if (scheme.isEmpty())
808         return std::nullopt;
809
810     if (!isASCIIAlpha(scheme[0]))
811         return std::nullopt;
812
813     for (size_t i = 1; i < scheme.length(); ++i) {
814         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
815             continue;
816         return std::nullopt;
817     }
818
819     return scheme.convertToASCIILowercase();
820 }
821
// True when scheme() classifies schemeArg as anything other than Scheme::NonSpecial.
// The match performed by scheme() is exact and case-sensitive.
bool URLParser::isSpecialScheme(const String& schemeArg)
{
    return scheme(schemeArg) != Scheme::NonSpecial;
}
826
// Names a boundary inside a parsed URL string. urlLengthUntilPart() maps each value
// to the corresponding offset of a URL, and copyURLPartsUntil() copies a base URL
// up to the chosen boundary.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
};
838
839 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
840 {
841     switch (part) {
842     case URLPart::QueryEnd:
843         return url.m_queryEnd;
844     case URLPart::PathEnd:
845         return url.m_pathEnd;
846     case URLPart::PathAfterLastSlash:
847         return url.m_pathAfterLastSlash;
848     case URLPart::PortEnd:
849         return url.m_hostEnd + url.m_portLength;
850     case URLPart::HostEnd:
851         return url.m_hostEnd;
852     case URLPart::PasswordEnd:
853         return url.m_passwordEnd;
854     case URLPart::UserEnd:
855         return url.m_userEnd;
856     case URLPart::UserStart:
857         return url.m_userStart;
858     case URLPart::SchemeEnd:
859         return url.m_schemeEnd;
860     }
861     ASSERT_NOT_REACHED();
862     return 0;
863 }
864
865 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
866 {
867     RELEASE_ASSERT(length <= string.length());
868     if (string.isNull())
869         return;
870     ASSERT(m_asciiBuffer.isEmpty());
871     if (string.is8Bit())
872         appendToASCIIBuffer(string.characters8(), length);
873     else {
874         const UChar* characters = string.characters16();
875         for (size_t i = 0; i < length; ++i) {
876             UChar c = characters[i];
877             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
878             appendToASCIIBuffer(c);
879         }
880     }
881 }
882
// Starts the URL being built from a prefix of base: reports a syntax violation at
// iterator, replaces the ASCII buffer with base's string up to the given part, and
// copies the matching offsets from base into m_url. It then classifies the copied
// scheme to set m_urlIsSpecial / m_urlIsFile, and sets isUTF8Encoding to true for
// ws, wss and non-special schemes (it is left untouched for the other schemes).
template<typename CharacterType>
void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
{
    syntaxViolation(iterator);

    m_asciiBuffer.clear();
    copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
    // The cases are ordered from the latest URL part to the earliest and fall
    // through on purpose, so that selecting a part also copies the offsets of
    // every part listed after it in this switch.
    switch (part) {
    case URLPart::QueryEnd:
        m_url.m_queryEnd = base.m_queryEnd;
        FALLTHROUGH;
    case URLPart::PathEnd:
        m_url.m_pathEnd = base.m_pathEnd;
        FALLTHROUGH;
    case URLPart::PathAfterLastSlash:
        m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
        FALLTHROUGH;
    case URLPart::PortEnd:
        m_url.m_portLength = base.m_portLength;
        FALLTHROUGH;
    case URLPart::HostEnd:
        m_url.m_hostEnd = base.m_hostEnd;
        FALLTHROUGH;
    case URLPart::PasswordEnd:
        m_url.m_passwordEnd = base.m_passwordEnd;
        FALLTHROUGH;
    case URLPart::UserEnd:
        m_url.m_userEnd = base.m_userEnd;
        FALLTHROUGH;
    case URLPart::UserStart:
        m_url.m_userStart = base.m_userStart;
        FALLTHROUGH;
    case URLPart::SchemeEnd:
        m_url.m_isValid = base.m_isValid;
        m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
        m_url.m_schemeEnd = base.m_schemeEnd;
    }
    // Classify the scheme that was just copied into the ASCII buffer.
    switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
    case Scheme::WS:
    case Scheme::WSS:
        isUTF8Encoding = true;
        m_urlIsSpecial = true;
        return;
    case Scheme::File:
        m_urlIsFile = true;
        FALLTHROUGH;
    case Scheme::FTP:
    case Scheme::Gopher:
    case Scheme::HTTP:
    case Scheme::HTTPS:
        m_urlIsSpecial = true;
        return;
    case Scheme::NonSpecial:
        m_urlIsSpecial = false;
        isUTF8Encoding = true;
        return;
    }
    ASSERT_NOT_REACHED();
}
942
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e").
// The second digit is stored in lowercase; the path-segment helpers lowercase
// the input character before comparing against it.
static constexpr char dotASCIICode[2] = { '2', 'e' };
944
945 template<typename CharacterType>
946 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
947 {
948     if (c.atEnd())
949         return false;
950     if (*c == '.') {
951         advance<CharacterType, ReportSyntaxViolation::No>(c);
952         return c.atEnd() || isSlashQuestionOrHash(*c);
953     }
954     if (*c != '%')
955         return false;
956     advance<CharacterType, ReportSyntaxViolation::No>(c);
957     if (c.atEnd() || *c != dotASCIICode[0])
958         return false;
959     advance<CharacterType, ReportSyntaxViolation::No>(c);
960     if (c.atEnd())
961         return false;
962     if (toASCIILower(*c) == dotASCIICode[1]) {
963         advance<CharacterType, ReportSyntaxViolation::No>(c);
964         return c.atEnd() || isSlashQuestionOrHash(*c);
965     }
966     return false;
967 }
968
969 template<typename CharacterType>
970 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
971 {
972     if (c.atEnd())
973         return false;
974     if (*c == '.') {
975         advance<CharacterType, ReportSyntaxViolation::No>(c);
976         return isSingleDotPathSegment(c);
977     }
978     if (*c != '%')
979         return false;
980     advance<CharacterType, ReportSyntaxViolation::No>(c);
981     if (c.atEnd() || *c != dotASCIICode[0])
982         return false;
983     advance<CharacterType, ReportSyntaxViolation::No>(c);
984     if (c.atEnd())
985         return false;
986     if (toASCIILower(*c) == dotASCIICode[1]) {
987         advance<CharacterType, ReportSyntaxViolation::No>(c);
988         return isSingleDotPathSegment(c);
989     }
990     return false;
991 }
992
993 template<typename CharacterType>
994 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
995 {
996     ASSERT(isSingleDotPathSegment(c));
997     if (*c == '.') {
998         advance(c);
999         if (!c.atEnd()) {
1000             if (*c == '/' || *c == '\\')
1001                 advance(c);
1002             else
1003                 ASSERT(*c == '?' || *c == '#');
1004         }
1005     } else {
1006         ASSERT(*c == '%');
1007         advance(c);
1008         ASSERT(*c == dotASCIICode[0]);
1009         advance(c);
1010         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1011         advance(c);
1012         if (!c.atEnd()) {
1013             if (*c == '/' || *c == '\\')
1014                 advance(c);
1015             else
1016                 ASSERT(*c == '?' || *c == '#');
1017         }
1018     }
1019 }
1020
1021 template<typename CharacterType>
1022 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1023 {
1024     ASSERT(isDoubleDotPathSegment(c));
1025     if (*c == '.')
1026         advance(c);
1027     else {
1028         ASSERT(*c == '%');
1029         advance(c);
1030         ASSERT(*c == dotASCIICode[0]);
1031         advance(c);
1032         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1033         advance(c);
1034     }
1035     consumeSingleDotPathSegment(c);
1036 }
1037
1038 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1039 {
1040     ASSERT(m_didSeeSyntaxViolation);
1041     if (!m_urlIsFile)
1042         return true;
1043
1044     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1045     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1046     if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + 1 && isWindowsDriveLetter(componentToPop))
1047         return false;
1048     return true;
1049 }
1050
1051 void URLParser::popPath()
1052 {
1053     ASSERT(m_didSeeSyntaxViolation);
1054     if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + 1) {
1055         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1056         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1057             newPathAfterLastSlash--;
1058         while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer[newPathAfterLastSlash] != '/')
1059             newPathAfterLastSlash--;
1060         newPathAfterLastSlash++;
1061         if (shouldPopPath(newPathAfterLastSlash))
1062             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1063     }
1064     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1065 }
1066
1067 template<typename CharacterType>
1068 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1069 {
1070     if (m_didSeeSyntaxViolation)
1071         return;
1072     m_didSeeSyntaxViolation = true;
1073     
1074     ASSERT(m_asciiBuffer.isEmpty());
1075     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1076     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1077     m_asciiBuffer.reserveCapacity(m_inputString.length());
1078     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1079         ASSERT(isASCII(m_inputString[i]));
1080         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1081     }
1082 }
1083
// Records a parse failure: invalidates m_url, then stores the original input
// string as the URL's string.
void URLParser::failure()
{
    m_url.invalidate();
    // Assigned after invalidate() -- presumably because invalidate() resets
    // m_string; confirm against URL::invalidate().
    m_url.m_string = m_inputString;
}
1089
1090 template<typename CharacterType>
1091 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1092 {
1093     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1094         return false;
1095     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1096     return true;
1097 }
1098
1099 template<typename CharacterType>
1100 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1101 {
1102     if (!checkLocalhostCodePoint(iterator, 'l'))
1103         return false;
1104     if (!checkLocalhostCodePoint(iterator, 'o'))
1105         return false;
1106     if (!checkLocalhostCodePoint(iterator, 'c'))
1107         return false;
1108     if (!checkLocalhostCodePoint(iterator, 'a'))
1109         return false;
1110     if (!checkLocalhostCodePoint(iterator, 'l'))
1111         return false;
1112     if (!checkLocalhostCodePoint(iterator, 'h'))
1113         return false;
1114     if (!checkLocalhostCodePoint(iterator, 'o'))
1115         return false;
1116     if (!checkLocalhostCodePoint(iterator, 's'))
1117         return false;
1118     if (!checkLocalhostCodePoint(iterator, 't'))
1119         return false;
1120     return iterator.atEnd();
1121 }
1122
1123 bool URLParser::isLocalhost(StringView view)
1124 {
1125     if (view.is8Bit())
1126         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1127     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1128 }
1129
1130 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1131 {
1132     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1133         ASSERT(start + length <= m_asciiBuffer.size());
1134         return StringView(m_asciiBuffer.data() + start, length);
1135     }
1136     ASSERT(start + length <= m_inputString.length());
1137     return StringView(m_inputString).substring(start, length);
1138 }
1139
1140 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1141 {
1142     if (UNLIKELY(m_didSeeSyntaxViolation))
1143         return m_asciiBuffer[position];
1144     return m_inputString[position];
1145 }
1146
1147 template<typename CharacterType>
1148 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1149 {
1150     if (UNLIKELY(m_didSeeSyntaxViolation))
1151         return m_asciiBuffer.size();
1152     
1153     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1154 }
1155
1156 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1157     : m_inputString(input)
1158 {
1159     if (input.isNull()) {
1160         if (base.isValid() && !base.m_cannotBeABaseURL) {
1161             m_url = base;
1162             m_url.removeFragmentIdentifier();
1163         }
1164         return;
1165     }
1166
1167     if (input.is8Bit()) {
1168         m_inputBegin = input.characters8();
1169         parse(input.characters8(), input.length(), base, encoding);
1170     } else {
1171         m_inputBegin = input.characters16();
1172         parse(input.characters16(), input.length(), base, encoding);
1173     }
1174
1175     ASSERT(!m_url.m_isValid
1176         || m_didSeeSyntaxViolation == (m_url.string() != input)
1177         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1178             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1179     ASSERT(internalValuesConsistent(m_url));
1180 #if !ASSERT_DISABLED
1181     if (!m_didSeeSyntaxViolation) {
1182         // Force a syntax violation at the beginning to make sure we get the same result.
1183         URLParser parser(makeString(" ", input), base, encoding);
1184         URL parsed = parser.result();
1185         if (parsed.isValid())
1186             ASSERT(allValuesEqual(parser.result(), m_url));
1187     }
1188 #endif
1189 }
1190
1191 template<typename CharacterType>
1192 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1193 {
1194     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1195     m_url = { };
1196     ASSERT(m_asciiBuffer.isEmpty());
1197     
1198     bool isUTF8Encoding = encoding == UTF8Encoding();
1199     Vector<UChar> queryBuffer;
1200
1201     unsigned endIndex = length;
1202     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1203         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1204         endIndex--;
1205     }
1206     CodePointIterator<CharacterType> c(input, input + endIndex);
1207     CodePointIterator<CharacterType> authorityOrHostBegin;
1208     CodePointIterator<CharacterType> queryBegin;
1209     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1210         syntaxViolation(c);
1211         ++c;
1212     }
1213     auto beginAfterControlAndSpace = c;
1214
1215     enum class State : uint8_t {
1216         SchemeStart,
1217         Scheme,
1218         NoScheme,
1219         SpecialRelativeOrAuthority,
1220         PathOrAuthority,
1221         Relative,
1222         RelativeSlash,
1223         SpecialAuthoritySlashes,
1224         SpecialAuthorityIgnoreSlashes,
1225         AuthorityOrHost,
1226         Host,
1227         File,
1228         FileSlash,
1229         FileHost,
1230         PathStart,
1231         Path,
1232         CannotBeABaseURLPath,
1233         UTF8Query,
1234         NonUTF8Query,
1235         Fragment,
1236     };
1237
1238 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1239 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1240
1241     State state = State::SchemeStart;
1242     while (!c.atEnd()) {
1243         if (UNLIKELY(isTabOrNewline(*c))) {
1244             syntaxViolation(c);
1245             ++c;
1246             continue;
1247         }
1248
1249         switch (state) {
1250         case State::SchemeStart:
1251             LOG_STATE("SchemeStart");
1252             if (isASCIIAlpha(*c)) {
1253                 if (UNLIKELY(isASCIIUpper(*c)))
1254                     syntaxViolation(c);
1255                 appendToASCIIBuffer(toASCIILower(*c));
1256                 advance(c);
1257                 if (c.atEnd()) {
1258                     m_asciiBuffer.clear();
1259                     state = State::NoScheme;
1260                     c = beginAfterControlAndSpace;
1261                     break;
1262                 }
1263                 state = State::Scheme;
1264             } else
1265                 state = State::NoScheme;
1266             break;
1267         case State::Scheme:
1268             LOG_STATE("Scheme");
1269             if (isValidSchemeCharacter(*c)) {
1270                 if (UNLIKELY(isASCIIUpper(*c)))
1271                     syntaxViolation(c);
1272                 appendToASCIIBuffer(toASCIILower(*c));
1273             } else if (*c == ':') {
1274                 unsigned schemeEnd = currentPosition(c);
1275                 if (schemeEnd > URL::maxSchemeLength) {
1276                     failure();
1277                     return;
1278                 }
1279                 m_url.m_schemeEnd = schemeEnd;
1280                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1281                 appendToASCIIBuffer(':');
1282                 switch (scheme(urlScheme)) {
1283                 case Scheme::File:
1284                     m_urlIsSpecial = true;
1285                     m_urlIsFile = true;
1286                     state = State::File;
1287                     ++c;
1288                     break;
1289                 case Scheme::WS:
1290                 case Scheme::WSS:
1291                     isUTF8Encoding = true;
1292                     m_urlIsSpecial = true;
1293                     if (base.protocolIs(urlScheme))
1294                         state = State::SpecialRelativeOrAuthority;
1295                     else
1296                         state = State::SpecialAuthoritySlashes;
1297                     ++c;
1298                     break;
1299                 case Scheme::HTTP:
1300                 case Scheme::HTTPS:
1301                     m_url.m_protocolIsInHTTPFamily = true;
1302                     FALLTHROUGH;
1303                 case Scheme::FTP:
1304                 case Scheme::Gopher:
1305                     m_urlIsSpecial = true;
1306                     if (base.protocolIs(urlScheme))
1307                         state = State::SpecialRelativeOrAuthority;
1308                     else
1309                         state = State::SpecialAuthoritySlashes;
1310                     ++c;
1311                     break;
1312                 case Scheme::NonSpecial:
1313                     isUTF8Encoding = true;
1314                     auto maybeSlash = c;
1315                     advance(maybeSlash);
1316                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1317                         appendToASCIIBuffer('/');
1318                         c = maybeSlash;
1319                         state = State::PathOrAuthority;
1320                         ASSERT(*c == '/');
1321                         ++c;
1322                         m_url.m_userStart = currentPosition(c);
1323                     } else {
1324                         ++c;
1325                         m_url.m_userStart = currentPosition(c);
1326                         m_url.m_userEnd = m_url.m_userStart;
1327                         m_url.m_passwordEnd = m_url.m_userStart;
1328                         m_url.m_hostEnd = m_url.m_userStart;
1329                         m_url.m_portLength = 0;
1330                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1331                         m_url.m_cannotBeABaseURL = true;
1332                         state = State::CannotBeABaseURLPath;
1333                     }
1334                     break;
1335                 }
1336                 break;
1337             } else {
1338                 m_asciiBuffer.clear();
1339                 state = State::NoScheme;
1340                 c = beginAfterControlAndSpace;
1341                 break;
1342             }
1343             advance(c);
1344             if (c.atEnd()) {
1345                 m_asciiBuffer.clear();
1346                 state = State::NoScheme;
1347                 c = beginAfterControlAndSpace;
1348             }
1349             break;
1350         case State::NoScheme:
1351             LOG_STATE("NoScheme");
1352             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1353                 failure();
1354                 return;
1355             }
1356             if (base.m_cannotBeABaseURL && *c == '#') {
1357                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1358                 state = State::Fragment;
1359                 appendToASCIIBuffer('#');
1360                 ++c;
1361                 break;
1362             }
1363             if (!base.protocolIs("file")) {
1364                 state = State::Relative;
1365                 break;
1366             }
1367             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1368             appendToASCIIBuffer(':');
1369             state = State::File;
1370             break;
1371         case State::SpecialRelativeOrAuthority:
1372             LOG_STATE("SpecialRelativeOrAuthority");
1373             if (*c == '/') {
1374                 appendToASCIIBuffer('/');
1375                 advance(c);
1376                 if (c.atEnd()) {
1377                     failure();
1378                     return;
1379                 }
1380                 if (*c == '/') {
1381                     appendToASCIIBuffer('/');
1382                     state = State::SpecialAuthorityIgnoreSlashes;
1383                     ++c;
1384                 } else
1385                     state = State::RelativeSlash;
1386             } else
1387                 state = State::Relative;
1388             break;
1389         case State::PathOrAuthority:
1390             LOG_STATE("PathOrAuthority");
1391             if (*c == '/') {
1392                 appendToASCIIBuffer('/');
1393                 state = State::AuthorityOrHost;
1394                 advance(c);
1395                 m_url.m_userStart = currentPosition(c);
1396                 authorityOrHostBegin = c;
1397             } else {
1398                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1399                 m_url.m_userStart = currentPosition(c) - 1;
1400                 m_url.m_userEnd = m_url.m_userStart;
1401                 m_url.m_passwordEnd = m_url.m_userStart;
1402                 m_url.m_hostEnd = m_url.m_userStart;
1403                 m_url.m_portLength = 0;
1404                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1405                 state = State::Path;
1406             }
1407             break;
1408         case State::Relative:
1409             LOG_STATE("Relative");
1410             switch (*c) {
1411             case '/':
1412             case '\\':
1413                 state = State::RelativeSlash;
1414                 ++c;
1415                 break;
1416             case '?':
1417                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1418                 appendToASCIIBuffer('?');
1419                 ++c;
1420                 if (isUTF8Encoding)
1421                     state = State::UTF8Query;
1422                 else {
1423                     queryBegin = c;
1424                     state = State::NonUTF8Query;
1425                 }
1426                 break;
1427             case '#':
1428                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1429                 appendToASCIIBuffer('#');
1430                 state = State::Fragment;
1431                 ++c;
1432                 break;
1433             default:
1434                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1435                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1436                     appendToASCIIBuffer('/');
1437                     m_url.m_pathAfterLastSlash = currentPosition(c);
1438                 }
1439                 state = State::Path;
1440                 break;
1441             }
1442             break;
1443         case State::RelativeSlash:
1444             LOG_STATE("RelativeSlash");
1445             if (*c == '/' || *c == '\\') {
1446                 ++c;
1447                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1448                 appendToASCIIBuffer("://", 3);
1449                 if (m_urlIsSpecial)
1450                     state = State::SpecialAuthorityIgnoreSlashes;
1451                 else {
1452                     m_url.m_userStart = currentPosition(c);
1453                     state = State::AuthorityOrHost;
1454                     authorityOrHostBegin = c;
1455                 }
1456             } else {
1457                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1458                 appendToASCIIBuffer('/');
1459                 m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + 1;
1460                 state = State::Path;
1461             }
1462             break;
1463         case State::SpecialAuthoritySlashes:
1464             LOG_STATE("SpecialAuthoritySlashes");
1465             if (LIKELY(*c == '/' || *c == '\\')) {
1466                 if (UNLIKELY(*c == '\\'))
1467                     syntaxViolation(c);
1468                 appendToASCIIBuffer('/');
1469                 advance(c);
1470                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1471                     if (UNLIKELY(*c == '\\'))
1472                         syntaxViolation(c);
1473                     ++c;
1474                     appendToASCIIBuffer('/');
1475                 } else {
1476                     syntaxViolation(c);
1477                     appendToASCIIBuffer('/');
1478                 }
1479             } else {
1480                 syntaxViolation(c);
1481                 appendToASCIIBuffer("//", 2);
1482             }
1483             state = State::SpecialAuthorityIgnoreSlashes;
1484             break;
1485         case State::SpecialAuthorityIgnoreSlashes:
1486             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1487             if (*c == '/' || *c == '\\') {
1488                 syntaxViolation(c);
1489                 ++c;
1490             } else {
1491                 m_url.m_userStart = currentPosition(c);
1492                 state = State::AuthorityOrHost;
1493                 authorityOrHostBegin = c;
1494             }
1495             break;
1496         case State::AuthorityOrHost:
1497             do {
1498                 LOG_STATE("AuthorityOrHost");
1499                 if (*c == '@') {
1500                     auto lastAt = c;
1501                     auto findLastAt = c;
1502                     while (!findLastAt.atEnd()) {
1503                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1504                         if (*findLastAt == '@')
1505                             lastAt = findLastAt;
1506                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1507                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1508                             break;
1509                         ++findLastAt;
1510                     }
1511                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1512                     c = lastAt;
1513                     advance(c);
1514                     authorityOrHostBegin = c;
1515                     state = State::Host;
1516                     m_hostHasPercentOrNonASCII = false;
1517                     break;
1518                 }
1519                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1520                 if (isSlash || *c == '?' || *c == '#') {
1521                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1522                     if (iterator.atEnd()) {
1523                         if (m_urlIsSpecial)
1524                             return failure();
1525                         m_url.m_userEnd = currentPosition(c);
1526                         m_url.m_passwordEnd = m_url.m_userEnd;
1527                         m_url.m_hostEnd = m_url.m_userEnd;
1528                         m_url.m_portLength = 0;
1529                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1530                     } else {
1531                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1532                         m_url.m_passwordEnd = m_url.m_userEnd;
1533                         if (!parseHostAndPort(iterator)) {
1534                             failure();
1535                             return;
1536                         }
1537                         if (UNLIKELY(!isSlash)) {
1538                             if (m_urlIsSpecial) {
1539                                 syntaxViolation(c);
1540                                 appendToASCIIBuffer('/');
1541                             }
1542                             m_url.m_pathAfterLastSlash = currentPosition(c);
1543                         }
1544                     }
1545                     state = State::Path;
1546                     break;
1547                 }
1548                 if (isPercentOrNonASCII(*c))
1549                     m_hostHasPercentOrNonASCII = true;
1550                 ++c;
1551             } while (!c.atEnd());
1552             break;
1553         case State::Host:
1554             do {
1555                 LOG_STATE("Host");
1556                 if (*c == '/' || *c == '?' || *c == '#') {
1557                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1558                         failure();
1559                         return;
1560                     }
1561                     if (*c == '?' || *c == '#') {
1562                         syntaxViolation(c);
1563                         appendToASCIIBuffer('/');
1564                         m_url.m_pathAfterLastSlash = currentPosition(c);
1565                     }
1566                     state = State::Path;
1567                     break;
1568                 }
1569                 if (isPercentOrNonASCII(*c))
1570                     m_hostHasPercentOrNonASCII = true;
1571                 ++c;
1572             } while (!c.atEnd());
1573             break;
1574         case State::File:
1575             LOG_STATE("File");
1576             switch (*c) {
1577             case '\\':
1578                 syntaxViolation(c);
1579                 FALLTHROUGH;
1580             case '/':
1581                 appendToASCIIBuffer('/');
1582                 state = State::FileSlash;
1583                 ++c;
1584                 break;
1585             case '?':
1586                 syntaxViolation(c);
1587                 if (base.isValid() && base.protocolIs("file")) {
1588                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1589                     appendToASCIIBuffer('?');
1590                     ++c;
1591                 } else {
1592                     appendToASCIIBuffer("///?", 4);
1593                     ++c;
1594                     m_url.m_userStart = currentPosition(c) - 2;
1595                     m_url.m_userEnd = m_url.m_userStart;
1596                     m_url.m_passwordEnd = m_url.m_userStart;
1597                     m_url.m_hostEnd = m_url.m_userStart;
1598                     m_url.m_portLength = 0;
1599                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1600                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1601                 }
1602                 if (isUTF8Encoding)
1603                     state = State::UTF8Query;
1604                 else {
1605                     queryBegin = c;
1606                     state = State::NonUTF8Query;
1607                 }
1608                 break;
1609             case '#':
1610                 syntaxViolation(c);
1611                 if (base.isValid() && base.protocolIs("file")) {
1612                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1613                     appendToASCIIBuffer('#');
1614                 } else {
1615                     appendToASCIIBuffer("///#", 4);
1616                     m_url.m_userStart = currentPosition(c) - 2;
1617                     m_url.m_userEnd = m_url.m_userStart;
1618                     m_url.m_passwordEnd = m_url.m_userStart;
1619                     m_url.m_hostEnd = m_url.m_userStart;
1620                     m_url.m_portLength = 0;
1621                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1622                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1623                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1624                 }
1625                 state = State::Fragment;
1626                 ++c;
1627                 break;
1628             default:
1629                 syntaxViolation(c);
1630                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1631                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1632                 else {
1633                     appendToASCIIBuffer("///", 3);
1634                     m_url.m_userStart = currentPosition(c) - 1;
1635                     m_url.m_userEnd = m_url.m_userStart;
1636                     m_url.m_passwordEnd = m_url.m_userStart;
1637                     m_url.m_hostEnd = m_url.m_userStart;
1638                     m_url.m_portLength = 0;
1639                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1640                     if (isWindowsDriveLetter(c))
1641                         appendWindowsDriveLetter(c);
1642                 }
1643                 state = State::Path;
1644                 break;
1645             }
1646             break;
1647         case State::FileSlash:
1648             LOG_STATE("FileSlash");
1649             if (LIKELY(*c == '/' || *c == '\\')) {
1650                 if (UNLIKELY(*c == '\\'))
1651                     syntaxViolation(c);
1652                 appendToASCIIBuffer('/');
1653                 advance(c);
1654                 m_url.m_userStart = currentPosition(c);
1655                 m_url.m_userEnd = m_url.m_userStart;
1656                 m_url.m_passwordEnd = m_url.m_userStart;
1657                 m_url.m_hostEnd = m_url.m_userStart;
1658                 m_url.m_portLength = 0;
1659                 authorityOrHostBegin = c;
1660                 state = State::FileHost;
1661                 break;
1662             }
1663             syntaxViolation(c);
1664             appendToASCIIBuffer("//", 2);
1665             m_url.m_userStart = currentPosition(c) - 1;
1666             m_url.m_userEnd = m_url.m_userStart;
1667             m_url.m_passwordEnd = m_url.m_userStart;
1668             m_url.m_hostEnd = m_url.m_userStart;
1669             m_url.m_portLength = 0;
1670             if (isWindowsDriveLetter(c)) {
1671                 appendWindowsDriveLetter(c);
1672                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1673             } else if (copyBaseWindowsDriveLetter(base)) {
1674                 appendToASCIIBuffer('/');
1675                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1676             } else
1677                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1678             state = State::Path;
1679             break;
1680         case State::FileHost:
1681             do {
1682                 LOG_STATE("FileHost");
1683                 if (isSlashQuestionOrHash(*c)) {
1684                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1685                         && isWindowsDriveLetter(authorityOrHostBegin);
1686                     if (windowsQuirk) {
1687                         syntaxViolation(authorityOrHostBegin);
1688                         appendToASCIIBuffer('/');
1689                         appendWindowsDriveLetter(authorityOrHostBegin);
1690                     }
1691                     if (windowsQuirk || authorityOrHostBegin == c) {
1692                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1693                         if (UNLIKELY(*c == '?')) {
1694                             syntaxViolation(c);
1695                             appendToASCIIBuffer("/?", 2);
1696                             ++c;
1697                             if (isUTF8Encoding)
1698                                 state = State::UTF8Query;
1699                             else {
1700                                 queryBegin = c;
1701                                 state = State::NonUTF8Query;
1702                             }
1703                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1704                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1705                             break;
1706                         }
1707                         if (UNLIKELY(*c == '#')) {
1708                             syntaxViolation(c);
1709                             appendToASCIIBuffer("/#", 2);
1710                             ++c;
1711                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1712                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1713                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1714                             state = State::Fragment;
1715                             break;
1716                         }
1717                         state = State::Path;
1718                         break;
1719                     }
1720                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1721                         failure();
1722                         return;
1723                     }
1724                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1725                         syntaxViolation(c);
1726                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1727                         m_url.m_hostEnd = currentPosition(c);
1728                         m_url.m_portLength = 0;
1729                     }
1730                     
1731                     state = State::PathStart;
1732                     break;
1733                 }
1734                 if (isPercentOrNonASCII(*c))
1735                     m_hostHasPercentOrNonASCII = true;
1736                 ++c;
1737             } while (!c.atEnd());
1738             break;
1739         case State::PathStart:
1740             LOG_STATE("PathStart");
1741             if (*c != '/' && *c != '\\') {
1742                 syntaxViolation(c);
1743                 appendToASCIIBuffer('/');
1744             }
1745             m_url.m_pathAfterLastSlash = currentPosition(c);
1746             state = State::Path;
1747             break;
1748         case State::Path:
1749             LOG_STATE("Path");
1750             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1751                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1752                     syntaxViolation(c);
1753                 appendToASCIIBuffer('/');
1754                 ++c;
1755                 m_url.m_pathAfterLastSlash = currentPosition(c);
1756                 break;
1757             }
1758             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1759                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1760                     syntaxViolation(c);
1761                     consumeDoubleDotPathSegment(c);
1762                     popPath();
1763                     break;
1764                 }
1765                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1766                     syntaxViolation(c);
1767                     consumeSingleDotPathSegment(c);
1768                     break;
1769                 }
1770             }
1771             if (*c == '?') {
1772                 m_url.m_pathEnd = currentPosition(c);
1773                 appendToASCIIBuffer('?');
1774                 ++c;
1775                 if (isUTF8Encoding)
1776                     state = State::UTF8Query;
1777                 else {
1778                     queryBegin = c;
1779                     state = State::NonUTF8Query;
1780                 }
1781                 break;
1782             }
1783             if (*c == '#') {
1784                 m_url.m_pathEnd = currentPosition(c);
1785                 m_url.m_queryEnd = m_url.m_pathEnd;
1786                 state = State::Fragment;
1787                 break;
1788             }
1789             utf8PercentEncode<isInDefaultEncodeSet>(c);
1790             ++c;
1791             break;
1792         case State::CannotBeABaseURLPath:
1793             LOG_STATE("CannotBeABaseURLPath");
1794             if (*c == '?') {
1795                 m_url.m_pathEnd = currentPosition(c);
1796                 appendToASCIIBuffer('?');
1797                 ++c;
1798                 if (isUTF8Encoding)
1799                     state = State::UTF8Query;
1800                 else {
1801                     queryBegin = c;
1802                     state = State::NonUTF8Query;
1803                 }
1804             } else if (*c == '#') {
1805                 m_url.m_pathEnd = currentPosition(c);
1806                 m_url.m_queryEnd = m_url.m_pathEnd;
1807                 state = State::Fragment;
1808             } else if (*c == '/') {
1809                 appendToASCIIBuffer('/');
1810                 ++c;
1811                 m_url.m_pathAfterLastSlash = currentPosition(c);
1812             } else {
1813                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1814                 ++c;
1815             }
1816             break;
1817         case State::UTF8Query:
1818             LOG_STATE("UTF8Query");
1819             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1820             if (*c == '#') {
1821                 m_url.m_queryEnd = currentPosition(c);
1822                 state = State::Fragment;
1823                 break;
1824             }
1825             if (isUTF8Encoding)
1826                 utf8QueryEncode(c);
1827             else
1828                 appendCodePoint(queryBuffer, *c);
1829             ++c;
1830             break;
1831         case State::NonUTF8Query:
1832             do {
1833                 LOG_STATE("NonUTF8Query");
1834                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1835                 if (*c == '#') {
1836                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1837                     m_url.m_queryEnd = currentPosition(c);
1838                     state = State::Fragment;
1839                     break;
1840                 }
1841                 appendCodePoint(queryBuffer, *c);
1842                 advance(c, queryBegin);
1843             } while (!c.atEnd());
1844             break;
1845         case State::Fragment:
1846             URL_PARSER_LOG("State Fragment");
1847             utf8PercentEncode<isInSimpleEncodeSet>(c);
1848             ++c;
1849             break;
1850         }
1851     }
1852
1853     switch (state) {
1854     case State::SchemeStart:
1855         LOG_FINAL_STATE("SchemeStart");
1856         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1857             m_url = base;
1858             m_url.removeFragmentIdentifier();
1859             return;
1860         }
1861         failure();
1862         return;
1863     case State::Scheme:
1864         LOG_FINAL_STATE("Scheme");
1865         failure();
1866         return;
1867     case State::NoScheme:
1868         LOG_FINAL_STATE("NoScheme");
1869         RELEASE_ASSERT_NOT_REACHED();
1870     case State::SpecialRelativeOrAuthority:
1871         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1872         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1873         break;
1874     case State::PathOrAuthority:
1875         LOG_FINAL_STATE("PathOrAuthority");
1876         ASSERT(m_url.m_userStart);
1877         ASSERT(m_url.m_userStart == currentPosition(c));
1878         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1879         m_url.m_userStart--;
1880         m_url.m_userEnd = m_url.m_userStart;
1881         m_url.m_passwordEnd = m_url.m_userStart;
1882         m_url.m_hostEnd = m_url.m_userStart;
1883         m_url.m_portLength = 0;
1884         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1885         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1886         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1887         break;
1888     case State::Relative:
1889         LOG_FINAL_STATE("Relative");
1890         RELEASE_ASSERT_NOT_REACHED();
1891     case State::RelativeSlash:
1892         LOG_FINAL_STATE("RelativeSlash");
1893         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1894         appendToASCIIBuffer('/');
1895         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
1896         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1897         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1898         break;
1899     case State::SpecialAuthoritySlashes:
1900         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1901         m_url.m_userStart = currentPosition(c);
1902         m_url.m_userEnd = m_url.m_userStart;
1903         m_url.m_passwordEnd = m_url.m_userStart;
1904         m_url.m_hostEnd = m_url.m_userStart;
1905         m_url.m_portLength = 0;
1906         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1907         m_url.m_pathEnd = m_url.m_userStart;
1908         m_url.m_queryEnd = m_url.m_userStart;
1909         break;
1910     case State::SpecialAuthorityIgnoreSlashes:
1911         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1912         failure();
1913         return;
1914     case State::AuthorityOrHost:
1915         LOG_FINAL_STATE("AuthorityOrHost");
1916         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1917         m_url.m_passwordEnd = m_url.m_userEnd;
1918         if (authorityOrHostBegin.atEnd()) {
1919             m_url.m_userEnd = m_url.m_userStart;
1920             m_url.m_passwordEnd = m_url.m_userStart;
1921             m_url.m_hostEnd = m_url.m_userStart;
1922             m_url.m_portLength = 0;
1923             m_url.m_pathEnd = m_url.m_userStart;
1924         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1925             failure();
1926             return;
1927         } else {
1928             if (m_urlIsSpecial) {
1929                 syntaxViolation(c);
1930                 appendToASCIIBuffer('/');
1931                 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1932             } else
1933                 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1934         }
1935         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1936         m_url.m_queryEnd = m_url.m_pathEnd;
1937         break;
1938     case State::Host:
1939         LOG_FINAL_STATE("Host");
1940         if (!parseHostAndPort(authorityOrHostBegin)) {
1941             failure();
1942             return;
1943         }
1944         if (m_urlIsSpecial) {
1945             syntaxViolation(c);
1946             appendToASCIIBuffer('/');
1947             m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1948         } else
1949             m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1950         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1951         m_url.m_queryEnd = m_url.m_pathEnd;
1952         break;
1953     case State::File:
1954         LOG_FINAL_STATE("File");
1955         if (base.isValid() && base.protocolIs("file")) {
1956             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1957             break;
1958         }
1959         syntaxViolation(c);
1960         appendToASCIIBuffer("///", 3);
1961         m_url.m_userStart = currentPosition(c) - 1;
1962         m_url.m_userEnd = m_url.m_userStart;
1963         m_url.m_passwordEnd = m_url.m_userStart;
1964         m_url.m_hostEnd = m_url.m_userStart;
1965         m_url.m_portLength = 0;
1966         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1967         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1968         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1969         break;
1970     case State::FileSlash:
1971         LOG_FINAL_STATE("FileSlash");
1972         syntaxViolation(c);
1973         m_url.m_userStart = currentPosition(c) + 1;
1974         appendToASCIIBuffer("//", 2);
1975         m_url.m_userEnd = m_url.m_userStart;
1976         m_url.m_passwordEnd = m_url.m_userStart;
1977         m_url.m_hostEnd = m_url.m_userStart;
1978         m_url.m_portLength = 0;
1979         if (copyBaseWindowsDriveLetter(base)) {
1980             appendToASCIIBuffer('/');
1981             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1982         } else
1983             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1984         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1985         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1986         break;
1987     case State::FileHost:
1988         LOG_FINAL_STATE("FileHost");
1989         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1990             && isWindowsDriveLetter(authorityOrHostBegin)) {
1991             syntaxViolation(authorityOrHostBegin);
1992             appendToASCIIBuffer('/');
1993             appendWindowsDriveLetter(authorityOrHostBegin);
1994             m_url.m_pathAfterLastSlash = currentPosition(c);
1995             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1996             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1997             break;
1998         }
1999         
2000         if (authorityOrHostBegin == c) {
2001             syntaxViolation(c);
2002             appendToASCIIBuffer('/');
2003             m_url.m_userStart = currentPosition(c) - 1;
2004             m_url.m_userEnd = m_url.m_userStart;
2005             m_url.m_passwordEnd = m_url.m_userStart;
2006             m_url.m_hostEnd = m_url.m_userStart;
2007             m_url.m_portLength = 0;
2008             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
2009             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2010             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2011             break;
2012         }
2013
2014         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2015             failure();
2016             return;
2017         }
2018
2019         syntaxViolation(c);
2020         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2021             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2022             m_url.m_hostEnd = currentPosition(c);
2023             m_url.m_portLength = 0;
2024         }
2025         appendToASCIIBuffer('/');
2026         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
2027         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2028         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2029         break;
2030     case State::PathStart:
2031         LOG_FINAL_STATE("PathStart");
2032         RELEASE_ASSERT_NOT_REACHED();
2033     case State::Path:
2034         LOG_FINAL_STATE("Path");
2035         m_url.m_pathEnd = currentPosition(c);
2036         m_url.m_queryEnd = m_url.m_pathEnd;
2037         break;
2038     case State::CannotBeABaseURLPath:
2039         LOG_FINAL_STATE("CannotBeABaseURLPath");
2040         m_url.m_pathEnd = currentPosition(c);
2041         m_url.m_queryEnd = m_url.m_pathEnd;
2042         break;
2043     case State::UTF8Query:
2044         LOG_FINAL_STATE("UTF8Query");
2045         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2046         m_url.m_queryEnd = currentPosition(c);
2047         break;
2048     case State::NonUTF8Query:
2049         LOG_FINAL_STATE("NonUTF8Query");
2050         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2051         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2052         m_url.m_queryEnd = currentPosition(c);
2053         break;
2054     case State::Fragment:
2055         LOG_FINAL_STATE("Fragment");
2056         break;
2057     }
2058
2059     if (LIKELY(!m_didSeeSyntaxViolation)) {
2060         m_url.m_string = m_inputString;
2061         ASSERT(m_asciiBuffer.isEmpty());
2062     } else
2063         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2064     m_url.m_isValid = true;
2065     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2066 }
2067
2068 template<typename CharacterType>
2069 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2070 {
2071     if (UNLIKELY(iterator.atEnd())) {
2072         syntaxViolation(iterator);
2073         m_url.m_userEnd = currentPosition(iterator);
2074         m_url.m_passwordEnd = m_url.m_userEnd;
2075         return;
2076     }
2077     for (; !iterator.atEnd(); advance(iterator)) {
2078         if (*iterator == ':') {
2079             m_url.m_userEnd = currentPosition(iterator);
2080             auto iteratorAtColon = iterator;
2081             ++iterator;
2082             bool tabOrNewlineAfterColon = false;
2083             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2084                 tabOrNewlineAfterColon = true;
2085                 ++iterator;
2086             }
2087             if (UNLIKELY(iterator.atEnd())) {
2088                 syntaxViolation(iteratorAtColon);
2089                 m_url.m_passwordEnd = m_url.m_userEnd;
2090                 if (m_url.m_userEnd > m_url.m_userStart)
2091                     appendToASCIIBuffer('@');
2092                 return;
2093             }
2094             if (tabOrNewlineAfterColon)
2095                 syntaxViolation(iteratorAtColon);
2096             appendToASCIIBuffer(':');
2097             break;
2098         }
2099         utf8PercentEncode<WebCore::isInUserInfoEncodeSet>(iterator);
2100     }
2101     for (; !iterator.atEnd(); advance(iterator))
2102         utf8PercentEncode<WebCore::isInUserInfoEncodeSet>(iterator);
2103     m_url.m_passwordEnd = currentPosition(iterator);
2104     if (!m_url.m_userEnd)
2105         m_url.m_userEnd = m_url.m_passwordEnd;
2106     appendToASCIIBuffer('@');
2107 }
2108
2109 template<typename UnsignedIntegerType>
2110 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2111 {
2112     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2113     LChar* end = std::end(buf);
2114     LChar* p = end;
2115     do {
2116         *--p = (number % 10) + '0';
2117         number /= 10;
2118     } while (number);
2119     appendToASCIIBuffer(p, end - p);
2120 }
2121
2122 void URLParser::serializeIPv4(IPv4Address address)
2123 {
2124     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2125     appendToASCIIBuffer('.');
2126     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2127     appendToASCIIBuffer('.');
2128     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2129     appendToASCIIBuffer('.');
2130     appendNumberToASCIIBuffer<uint8_t>(address);
2131 }
2132     
// Returns how many consecutive zero pieces `address` contains starting at
// index `begin` (0 when the piece at `begin` is non-zero or `begin` >= 8).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t index = begin;
    while (index < 8 && !address[index])
        ++index;
    return index - begin;
}
2142
// Returns the start index of the longest run of zero pieces in `address`,
// considering only runs of at least two pieces. When several runs share the
// maximum length the earliest one wins; std::nullopt means no such run exists.
static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
{
    std::optional<size_t> bestStart;
    // Starting at 1 means a run has to be longer than one piece to qualify.
    size_t bestLength = 1;

    size_t index = 0;
    while (index < 8) {
        if (address[index]) {
            ++index;
            continue;
        }

        // Measure the run of zeros that begins here.
        size_t runEnd = index;
        while (runEnd < 8 && !address[runEnd])
            ++runEnd;

        const size_t runLength = runEnd - index;
        if (runLength > bestLength) {
            bestStart = index;
            bestLength = runLength;
        }
        index = runEnd;
    }
    return bestStart;
}
2159
2160 void URLParser::serializeIPv6Piece(uint16_t piece)
2161 {
2162     bool printed = false;
2163     if (auto nibble0 = piece >> 12) {
2164         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2165         printed = true;
2166     }
2167     auto nibble1 = piece >> 8 & 0xF;
2168     if (printed || nibble1) {
2169         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2170         printed = true;
2171     }
2172     auto nibble2 = piece >> 4 & 0xF;
2173     if (printed || nibble2)
2174         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2175     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2176 }
2177
2178 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2179 {
2180     appendToASCIIBuffer('[');
2181     auto compressPointer = findLongestZeroSequence(address);
2182     for (size_t piece = 0; piece < 8; piece++) {
2183         if (compressPointer && compressPointer.value() == piece) {
2184             ASSERT(!address[piece]);
2185             if (piece)
2186                 appendToASCIIBuffer(':');
2187             else
2188                 appendToASCIIBuffer("::", 2);
2189             while (piece < 8 && !address[piece])
2190                 piece++;
2191             if (piece == 8)
2192                 break;
2193         }
2194         serializeIPv6Piece(address[piece]);
2195         if (piece < 7)
2196             appendToASCIIBuffer(':');
2197     }
2198     appendToASCIIBuffer(']');
2199 }
2200
// Reasons parseIPv4Piece can fail to produce a number for one dot-separated
// piece of a candidate IPv4 host.
enum class URLParser::IPv4PieceParsingError {
    // The piece starts with '.' or contains a character that is not a digit
    // in the base being parsed.
    Failure,
    // The piece is numeric but its value does not fit in 32 bits.
    Overflow,
};
2205
2206 template<typename CharacterType>
2207 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2208 {
2209     enum class State : uint8_t {
2210         UnknownBase,
2211         Decimal,
2212         OctalOrHex,
2213         Octal,
2214         Hex,
2215     };
2216     State state = State::UnknownBase;
2217     Checked<uint32_t, RecordOverflow> value = 0;
2218     if (!iterator.atEnd() && *iterator == '.')
2219         return makeUnexpected(IPv4PieceParsingError::Failure);
2220     while (!iterator.atEnd()) {
2221         if (isTabOrNewline(*iterator)) {
2222             didSeeSyntaxViolation = true;
2223             ++iterator;
2224             continue;
2225         }
2226         if (*iterator == '.') {
2227             ASSERT(!value.hasOverflowed());
2228             return value.unsafeGet();
2229         }
2230         switch (state) {
2231         case State::UnknownBase:
2232             if (UNLIKELY(*iterator == '0')) {
2233                 ++iterator;
2234                 state = State::OctalOrHex;
2235                 break;
2236             }
2237             state = State::Decimal;
2238             break;
2239         case State::OctalOrHex:
2240             didSeeSyntaxViolation = true;
2241             if (*iterator == 'x' || *iterator == 'X') {
2242                 ++iterator;
2243                 state = State::Hex;
2244                 break;
2245             }
2246             state = State::Octal;
2247             break;
2248         case State::Decimal:
2249             if (!isASCIIDigit(*iterator))
2250                 return makeUnexpected(IPv4PieceParsingError::Failure);
2251             value *= 10;
2252             value += *iterator - '0';
2253             if (UNLIKELY(value.hasOverflowed()))
2254                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2255             ++iterator;
2256             break;
2257         case State::Octal:
2258             ASSERT(didSeeSyntaxViolation);
2259             if (*iterator < '0' || *iterator > '7')
2260                 return makeUnexpected(IPv4PieceParsingError::Failure);
2261             value *= 8;
2262             value += *iterator - '0';
2263             if (UNLIKELY(value.hasOverflowed()))
2264                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2265             ++iterator;
2266             break;
2267         case State::Hex:
2268             ASSERT(didSeeSyntaxViolation);
2269             if (!isASCIIHexDigit(*iterator))
2270                 return makeUnexpected(IPv4PieceParsingError::Failure);
2271             value *= 16;
2272             value += toASCIIHexValue(*iterator);
2273             if (UNLIKELY(value.hasOverflowed()))
2274                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2275             ++iterator;
2276             break;
2277         }
2278     }
2279     ASSERT(!value.hasOverflowed());
2280     return value.unsafeGet();
2281 }
2282
2283 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2284 {
2285     RELEASE_ASSERT(exponent <= 4);
2286     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2287     return values[exponent];
2288 }
2289
// Reasons parseIPv4Host can decline to produce an IPv4 address.
enum class URLParser::IPv4ParsingError {
    // The input has the shape of an IPv4 address but a piece is out of range
    // (or overflowed while being parsed).
    Failure,
    // The input does not have the shape of an IPv4 address at all, e.g. it
    // has empty or non-numeric pieces, or more than four pieces.
    NotIPv4,
};
2294
2295 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2296 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2297 {
2298     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2299     bool didSeeSyntaxViolation = false;
2300     if (!iterator.atEnd() && *iterator == '.')
2301         return makeUnexpected(IPv4ParsingError::NotIPv4);
2302     while (!iterator.atEnd()) {
2303         if (isTabOrNewline(*iterator)) {
2304             didSeeSyntaxViolation = true;
2305             ++iterator;
2306             continue;
2307         }
2308         if (items.size() >= 4)
2309             return makeUnexpected(IPv4ParsingError::NotIPv4);
2310         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2311         if (!iterator.atEnd() && *iterator == '.') {
2312             ++iterator;
2313             if (iterator.atEnd())
2314                 syntaxViolation(iteratorForSyntaxViolationPosition);
2315             else if (*iterator == '.')
2316                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2317         }
2318     }
2319     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2320         return makeUnexpected(IPv4ParsingError::NotIPv4);
2321     for (const auto& item : items) {
2322         if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2323             return makeUnexpected(IPv4ParsingError::NotIPv4);
2324     }
2325     for (const auto& item : items) {
2326         if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2327             return makeUnexpected(IPv4ParsingError::Failure);
2328     }
2329     if (items.size() > 1) {
2330         for (size_t i = 0; i < items.size() - 1; i++) {
2331             if (items[i].value() > 255)
2332                 return makeUnexpected(IPv4ParsingError::Failure);
2333         }
2334     }
2335     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2336         return makeUnexpected(IPv4ParsingError::Failure);
2337
2338     if (didSeeSyntaxViolation)
2339         syntaxViolation(iteratorForSyntaxViolationPosition);
2340     for (const auto& item : items) {
2341         if (item.value() > 255)
2342             syntaxViolation(iteratorForSyntaxViolationPosition);
2343     }
2344
2345     if (UNLIKELY(items.size() != 4))
2346         syntaxViolation(iteratorForSyntaxViolationPosition);
2347
2348     IPv4Address ipv4 = items.takeLast().value();
2349     for (size_t counter = 0; counter < items.size(); ++counter)
2350         ipv4 += items[counter].value() * pow256(3 - counter);
2351     return ipv4;
2352 }
2353
2354 template<typename CharacterType>
2355 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2356 {
2357     if (iterator.atEnd())
2358         return std::nullopt;
2359     uint32_t piece = 0;
2360     bool leadingZeros = false;
2361     size_t digitCount = 0;
2362     while (!iterator.atEnd()) {
2363         if (!isASCIIDigit(*iterator))
2364             return std::nullopt;
2365         ++digitCount;
2366         if (!piece && *iterator == '0') {
2367             if (leadingZeros)
2368                 return std::nullopt;
2369             leadingZeros = true;
2370         }
2371         if (!piece && *iterator == '0')
2372             leadingZeros = true;
2373         piece = piece * 10 + *iterator - '0';
2374         if (piece > 255)
2375             return std::nullopt;
2376         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2377         if (iterator.atEnd())
2378             break;
2379         if (*iterator == '.')
2380             break;
2381     }
2382     if (piece && leadingZeros)
2383         return std::nullopt;
2384     return piece;
2385 }
2386
2387 template<typename CharacterType>
2388 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2389 {
2390     IPv4Address address = 0;
2391     for (size_t i = 0; i < 4; ++i) {
2392         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2393             address = (address << 8) + piece.value();
2394         else
2395             return std::nullopt;
2396         if (i < 3) {
2397             if (iterator.atEnd())
2398                 return std::nullopt;
2399             if (*iterator != '.')
2400                 return std::nullopt;
2401             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2402         } else if (!iterator.atEnd())
2403             return std::nullopt;
2404     }
2405     ASSERT(iterator.atEnd());
2406     return address;
2407 }
2408
2409 template<typename CharacterType>
2410 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2411 {
2412     ASSERT(*c == '[');
2413     const auto hostBegin = c;
2414     advance(c, hostBegin);
2415     if (c.atEnd())
2416         return std::nullopt;
2417
2418     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2419     size_t piecePointer = 0;
2420     std::optional<size_t> compressPointer;
2421
2422     if (*c == ':') {
2423         advance(c, hostBegin);
2424         if (c.atEnd())
2425             return std::nullopt;
2426         if (*c != ':')
2427             return std::nullopt;
2428         advance(c, hostBegin);
2429         ++piecePointer;
2430         compressPointer = piecePointer;
2431     }
2432     
2433     while (!c.atEnd()) {
2434         if (piecePointer == 8)
2435             return std::nullopt;
2436         if (*c == ':') {
2437             if (compressPointer)
2438                 return std::nullopt;
2439             advance(c, hostBegin);
2440             ++piecePointer;
2441             compressPointer = piecePointer;
2442             continue;
2443         }
2444         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2445             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2446                 if (compressPointer && piecePointer == 5)
2447                     return std::nullopt;
2448                 syntaxViolation(hostBegin);
2449                 address[piecePointer++] = ipv4Address.value() >> 16;
2450                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2451                 c = { };
2452                 break;
2453             }
2454         }
2455         uint16_t value = 0;
2456         size_t length = 0;
2457         bool leadingZeros = false;
2458         for (; length < 4; length++) {
2459             if (c.atEnd())
2460                 break;
2461             if (!isASCIIHexDigit(*c))
2462                 break;
2463             if (isASCIIUpper(*c))
2464                 syntaxViolation(hostBegin);
2465             if (*c == '0' && !length)
2466                 leadingZeros = true;
2467             value = value * 0x10 + toASCIIHexValue(*c);
2468             advance(c, hostBegin);
2469         }
2470         
2471         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2472             syntaxViolation(hostBegin);
2473
2474         address[piecePointer++] = value;
2475         if (c.atEnd())
2476             break;
2477         if (piecePointer == 8 || *c != ':')
2478             return std::nullopt;
2479         advance(c, hostBegin);
2480     }
2481     
2482     if (!c.atEnd())
2483         return std::nullopt;
2484     
2485     if (compressPointer) {
2486         size_t swaps = piecePointer - compressPointer.value();
2487         piecePointer = 7;
2488         while (swaps)
2489             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2490     } else if (piecePointer != 8)
2491         return std::nullopt;
2492
2493     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2494     if (possibleCompressPointer)
2495         possibleCompressPointer.value()++;
2496     if (UNLIKELY(compressPointer != possibleCompressPointer))
2497         syntaxViolation(hostBegin);
2498     
2499     return address;
2500 }
2501
2502 template<typename CharacterType>
2503 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2504 {
2505     LCharBuffer output;
2506     output.reserveInitialCapacity(length);
2507     
2508     for (size_t i = 0; i < length; ++i) {
2509         uint8_t byte = input[i];
2510         if (byte != '%')
2511             output.uncheckedAppend(byte);
2512         else if (length > 2 && i < length - 2) {
2513             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2514                 syntaxViolation(iteratorForSyntaxViolationPosition);
2515                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2516                 i += 2;
2517             } else
2518                 output.uncheckedAppend(byte);
2519         } else
2520             output.uncheckedAppend(byte);
2521     }
2522     return output;
2523 }
2524     
2525 URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2526 {
2527     LCharBuffer output;
2528     output.reserveInitialCapacity(length);
2529     
2530     for (size_t i = 0; i < length; ++i) {
2531         uint8_t byte = input[i];
2532         if (byte != '%')
2533             output.uncheckedAppend(byte);
2534         else if (length > 2 && i < length - 2) {
2535             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2536                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2537                 i += 2;
2538             } else
2539                 output.uncheckedAppend(byte);
2540         } else
2541             output.uncheckedAppend(byte);
2542     }
2543     return output;
2544 }
2545
2546 template<typename CharacterType> std::optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2547 {
2548     LCharBuffer ascii;
2549     if (domain.isAllASCII()) {
2550         size_t length = domain.length();
2551         if (domain.is8Bit()) {
2552             const LChar* characters = domain.characters8();
2553             ascii.reserveInitialCapacity(length);
2554             for (size_t i = 0; i < length; ++i) {
2555                 if (UNLIKELY(isASCIIUpper(characters[i])))
2556                     syntaxViolation(iteratorForSyntaxViolationPosition);
2557                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2558             }
2559         } else {
2560             const UChar* characters = domain.characters16();
2561             ascii.reserveInitialCapacity(length);
2562             for (size_t i = 0; i < length; ++i) {
2563                 if (UNLIKELY(isASCIIUpper(characters[i])))
2564                     syntaxViolation(iteratorForSyntaxViolationPosition);
2565                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2566             }
2567         }
2568         return ascii;
2569     }
2570     
2571     UChar hostnameBuffer[defaultInlineBufferSize];
2572     UErrorCode error = U_ZERO_ERROR;
2573     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2574     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error);
2575     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2576
2577     if (U_SUCCESS(error) && !processingDetails.errors) {
2578         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2579             ASSERT(isASCII(hostnameBuffer[i]));
2580             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2581         }
2582         ascii.append(hostnameBuffer, numCharactersConverted);
2583         if (domain != StringView(ascii.data(), ascii.size()))
2584             syntaxViolation(iteratorForSyntaxViolationPosition);
2585         return ascii;
2586     }
2587
2588     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2589     return std::nullopt;
2590 }
2591
2592 bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2593 {
2594     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2595         if (isForbiddenHostCodePoint(asciiDomain[i]))
2596             return true;
2597     }
2598     return false;
2599 }
2600
2601 template<typename CharacterType>
2602 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2603 {
2604     ASSERT(*iterator == ':');
2605     auto colonIterator = iterator;
2606     advance(iterator, colonIterator);
2607     uint32_t port = 0;
2608     if (UNLIKELY(iterator.atEnd())) {
2609         unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2610         RELEASE_ASSERT(portLength <= URL::maxPortLength);
2611         m_url.m_portLength = portLength;
2612         syntaxViolation(colonIterator);
2613         return true;
2614     }
2615     size_t digitCount = 0;
2616     bool leadingZeros = false;
2617     for (; !iterator.atEnd(); ++iterator) {
2618         if (UNLIKELY(isTabOrNewline(*iterator))) {
2619             syntaxViolation(colonIterator);
2620             continue;
2621         }
2622         if (isASCIIDigit(*iterator)) {
2623             if (*iterator == '0' && !digitCount)
2624                 leadingZeros = true;
2625             ++digitCount;
2626             port = port * 10 + *iterator - '0';
2627             if (port > std::numeric_limits<uint16_t>::max())
2628                 return false;
2629         } else
2630             return false;
2631     }
2632
2633     if (port && leadingZeros)
2634         syntaxViolation(colonIterator);
2635     
2636     if (!port && digitCount > 1)
2637         syntaxViolation(colonIterator);
2638
2639     ASSERT(port == static_cast<uint16_t>(port));
2640     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2641         syntaxViolation(colonIterator);
2642     else {
2643         appendToASCIIBuffer(':');
2644         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2645         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2646     }
2647
2648     unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2649     RELEASE_ASSERT(portLength <= URL::maxPortLength);
2650     m_url.m_portLength = portLength;
2651     return true;
2652 }
2653
2654 template<typename CharacterType>
2655 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2656 {
2657     if (iterator.atEnd())
2658         return false;
2659     if (*iterator == ':')
2660         return false;
2661     if (*iterator == '[') {
2662         auto ipv6End = iterator;
2663         while (!ipv6End.atEnd() && *ipv6End != ']')
2664             ++ipv6End;
2665         if (ipv6End.atEnd())
2666             return false;
2667         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2668             serializeIPv6(address.value());
2669             if (!ipv6End.atEnd()) {
2670                 advance(ipv6End);
2671                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2672                     m_url.m_hostEnd = currentPosition(ipv6End);
2673                     return parsePort(ipv6End);
2674                 }
2675                 m_url.m_hostEnd = currentPosition(ipv6End);
2676                 m_url.m_portLength = 0;
2677                 return true;
2678             }
2679             m_url.m_hostEnd = currentPosition(ipv6End);
2680             return true;
2681         }
2682         return false;
2683     }
2684
2685     if (!m_urlIsSpecial) {
2686         for (; !iterator.atEnd(); ++iterator) {
2687             if (UNLIKELY(isTabOrNewline(*iterator))) {
2688                 syntaxViolation(iterator);
2689                 continue;
2690             }
2691             if (*iterator == ':')
2692                 break;
2693             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2694                 return false;
2695             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2696         }
2697         m_url.m_hostEnd = currentPosition(iterator);
2698         if (iterator.atEnd()) {
2699             m_url.m_portLength = 0;
2700             return true;
2701         }
2702         return parsePort(iterator);
2703     }
2704     
2705     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2706         auto hostIterator = iterator;
2707         for (; !iterator.atEnd(); ++iterator) {
2708             if (isTabOrNewline(*iterator))
2709                 continue;
2710             if (*iterator == ':')
2711                 break;
2712             if (isForbiddenHostCodePoint(*iterator))
2713                 return false;
2714         }
2715         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2716         if (address) {
2717             serializeIPv4(address.value());
2718             m_url.m_hostEnd = currentPosition(iterator);
2719             if (iterator.atEnd()) {
2720                 m_url.m_portLength = 0;
2721                 return true;
2722             }
2723             return parsePort(iterator);
2724         }
2725         if (address.error() == IPv4ParsingError::Failure)
2726             return false;
2727         for (; hostIterator != iterator; ++hostIterator) {
2728             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2729                 syntaxViolation(hostIterator);
2730                 continue;
2731             }
2732             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2733                 syntaxViolation(hostIterator);
2734             appendToASCIIBuffer(toASCIILower(*hostIterator));
2735         }
2736         m_url.m_hostEnd = currentPosition(iterator);
2737         if (!hostIterator.atEnd())
2738             return parsePort(hostIterator);
2739         unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2740         RELEASE_ASSERT(portLength <= URL::maxPortLength);
2741         m_url.m_portLength = portLength;
2742         return true;
2743     }
2744     
2745     const auto hostBegin = iterator;
2746     
2747     LCharBuffer utf8Encoded;
2748     for (; !iterator.atEnd(); ++iterator) {
2749         if (UNLIKELY(isTabOrNewline(*iterator))) {
2750             syntaxViolation(hostBegin);
2751             continue;
2752         }
2753         if (*iterator == ':')
2754             break;
2755         if (UNLIKELY(!isASCII(*iterator)))
2756             syntaxViolation(hostBegin);
2757
2758         uint8_t buffer[U8_MAX_LENGTH];
2759         int32_t offset = 0;
2760         UBool error = false;
2761         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2762         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2763         // FIXME: Check error.
2764         utf8Encoded.append(buffer, offset);
2765     }
2766     LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2767     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2768     if (domain.isNull())
2769         return false;
2770     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2771         syntaxViolation(hostBegin);
2772     auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2773     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2774         return false;
2775     LCharBuffer& asciiDomainValue = asciiDomain.value();
2776     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2777
2778     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2779     if (address) {
2780         serializeIPv4(address.value());
2781         m_url.m_hostEnd = currentPosition(iterator);
2782         if (iterator.atEnd()) {
2783             m_url.m_portLength = 0;
2784             return true;
2785         }
2786         return parsePort(iterator);
2787     }
2788     if (address.error() == IPv4ParsingError::Failure)
2789         return false;
2790
2791     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2792     m_url.m_hostEnd = currentPosition(iterator);
2793     if (!iterator.atEnd())
2794         return parsePort(iterator);
2795     m_url.m_portLength = 0;
2796     return true;
2797 }
2798
2799 std::optional<String> URLParser::formURLDecode(StringView input)
2800 {
2801     auto utf8 = input.utf8(StrictConversion);
2802     if (utf8.isNull())
2803         return std::nullopt;
2804     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2805     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2806 }
2807
2808 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2809 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2810 {
2811     URLEncodedForm output;
2812     for (StringView bytes : input.split('&')) {
2813         auto equalIndex = bytes.find('=');
2814         if (equalIndex == notFound) {
2815             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2816             if (name)
2817                 output.append({ name.value(), emptyString() });
2818         } else {
2819             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2820             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2821             if (name && value)
2822                 output.append({ name.value(), value.value() });
2823         }
2824     }
2825     return output;
2826 }
2827
2828 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2829 {
2830     auto utf8 = input.utf8(StrictConversion);
2831     const char* data = utf8.data();
2832     for (size_t i = 0; i < utf8.length(); ++i) {
2833         const char byte = data[i];
2834         if (byte == 0x20)
2835             output.append(0x2B);
2836         else if (byte == 0x2A
2837             || byte == 0x2D
2838             || byte == 0x2E
2839             || (byte >= 0x30 && byte <= 0x39)
2840             || (byte >= 0x41 && byte <= 0x5A)
2841             || byte == 0x5F
2842             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2843             output.append(byte);
2844         else
2845             percentEncodeByte(byte, output);
2846     }
2847 }
2848     
2849 String URLParser::serialize(const URLEncodedForm& tuples)
2850 {
2851     if (tuples.isEmpty())
2852         return { };
2853
2854     Vector<LChar> output;
2855     for (auto& tuple : tuples) {
2856         if (!output.isEmpty())
2857             output.append('&');
2858         serializeURLEncodedForm(tuple.key, output);
2859         output.append('=');
2860         serializeURLEncodedForm(tuple.value, output);
2861     }
2862     return String::adopt(WTFMove(output));
2863 }
2864
2865 const UIDNA& URLParser::internationalDomainNameTranscoder()
2866 {
2867     static UIDNA* encoder;
2868     static std::once_flag onceFlag;
2869     std::call_once(onceFlag, [] {
2870         UErrorCode error = U_ZERO_ERROR;
2871         // Warning: Please contact a WebKitGTK+ developer if changing these flags.
2872         // They should be synced with ephy_uri_decode() in ephy-uri-helpers.c.
2873         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2874         RELEASE_ASSERT(U_SUCCESS(error));
2875         RELEASE_ASSERT(encoder);
2876     });
2877     return *encoder;
2878 }
2879
2880 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2881 {
2882     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2883         a.m_isValid,
2884         a.m_cannotBeABaseURL,
2885         a.m_protocolIsInHTTPFamily,
2886         a.m_schemeEnd,
2887         a.m_userStart,
2888         a.m_userEnd,
2889         a.m_passwordEnd,
2890         a.m_hostEnd,
2891         a.m_hostEnd + a.m_portLength,
2892         a.m_pathAfterLastSlash,
2893         a.m_pathEnd,
2894         a.m_queryEnd,
2895         a.m_string.utf8().data(),
2896         b.m_isValid,
2897         b.m_cannotBeABaseURL,
2898         b.m_protocolIsInHTTPFamily,
2899         b.m_schemeEnd,
2900         b.m_userStart,
2901         b.m_userEnd,
2902         b.m_passwordEnd,
2903         b.m_hostEnd,
2904         b.m_hostEnd + b.m_portLength,
2905         b.m_pathAfterLastSlash,
2906         b.m_pathEnd,
2907         b.m_queryEnd,
2908         b.m_string.utf8().data());
2909
2910     return a.m_string == b.m_string
2911         && a.m_isValid == b.m_isValid
2912         && a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2913         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2914         && a.m_schemeEnd == b.m_schemeEnd
2915         && a.m_userStart == b.m_userStart
2916         && a.m_userEnd == b.m_userEnd
2917         && a.m_passwordEnd == b.m_passwordEnd
2918         && a.m_hostEnd == b.m_hostEnd
2919         && a.m_portLength == b.m_portLength
2920         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2921         && a.m_pathEnd == b.m_pathEnd
2922         && a.m_queryEnd == b.m_queryEnd;
2923 }
2924
2925 bool URLParser::internalValuesConsistent(const URL& url)
2926 {
2927     return url.m_schemeEnd <= url.m_userStart
2928         && url.m_userStart <= url.m_userEnd
2929         && url.m_userEnd <= url.m_passwordEnd
2930         && url.m_passwordEnd <= url.m_hostEnd
2931         && url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2932         && url.m_pathAfterLastSlash <= url.m_pathEnd
2933         && url.m_pathEnd <= url.m_queryEnd
2934         && url.m_queryEnd <= url.m_string.length();
2935 }
2936
2937 } // namespace WebCore