Strings need to be in some kind of gigacage
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <mutex>
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
35
36 namespace WebCore {
37
// Compile-time switch for parser tracing. While it is 0, URL_PARSER_LOG expands to nothing;
// otherwise every URL_PARSER_LOG call is forwarded to LOG on the URLParser channel.
#define URL_PARSER_DEBUGGING 0

#if !URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...)
#else
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#endif
45     
46 template<typename CharacterType>
47 class CodePointIterator {
48 public:
49     ALWAYS_INLINE CodePointIterator() { }
50     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51         : m_begin(begin)
52         , m_end(end)
53     {
54     }
55     
56     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57         : CodePointIterator(begin.m_begin, end.m_begin)
58     {
59         ASSERT(end.m_begin >= begin.m_begin);
60     }
61     
62     ALWAYS_INLINE UChar32 operator*() const;
63     ALWAYS_INLINE CodePointIterator& operator++();
64
65     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66     {
67         return m_begin == other.m_begin
68             && m_end == other.m_end;
69     }
70     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71     
72     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73     {
74         m_begin = other.m_begin;
75         m_end = other.m_end;
76         return *this;
77     }
78
79     ALWAYS_INLINE bool atEnd() const
80     {
81         ASSERT(m_begin <= m_end);
82         return m_begin >= m_end;
83     }
84     
85     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86     {
87         ASSERT(m_begin >= reference);
88         return m_begin - reference;
89     }
90
91     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92     {
93         return codeUnitsSince(other.m_begin);
94     }
95     
96 private:
97     const CharacterType* m_begin { nullptr };
98     const CharacterType* m_end { nullptr };
99 };
100
101 template<>
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
103 {
104     ASSERT(!atEnd());
105     return *m_begin;
106 }
107
108 template<>
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
110 {
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     unsigned i = 0;
128     size_t length = m_end - m_begin;
129     U16_FWD_1(m_begin, i, length);
130     m_begin += i;
131     return *this;
132 }
133     
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
135 {
136     if (U_IS_BMP(codePoint)) {
137         destination.append(static_cast<UChar>(codePoint));
138         return;
139     }
140     destination.reserveCapacity(destination.size() + 2);
141     destination.uncheckedAppend(U16_LEAD(codePoint));
142     destination.uncheckedAppend(U16_TRAIL(codePoint));
143 }
144
// Bit flags stored per byte in characterClassTable. Each flag is tested by one of the
// character predicates in this file.
enum URLCharacterClass {
    UserInfo = 1 << 0, // Tested by isInUserInfoEncodeSet.
    Default = 1 << 1, // Tested by isInDefaultEncodeSet.
    ForbiddenHost = 1 << 2, // Tested by isForbiddenHostCodePoint.
    QueryPercent = 1 << 3, // Tested by shouldPercentEncodeQueryByte.
    SlashQuestionOrHash = 1 << 4, // Tested by isSlashQuestionOrHash.
    ValidScheme = 1 << 5, // Tested by isValidSchemeCharacter.
};
153
154 static const uint8_t characterClassTable[256] = {
155     UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
156     UserInfo | Default | QueryPercent, // 0x1
157     UserInfo | Default | QueryPercent, // 0x2
158     UserInfo | Default | QueryPercent, // 0x3
159     UserInfo | Default | QueryPercent, // 0x4
160     UserInfo | Default | QueryPercent, // 0x5
161     UserInfo | Default | QueryPercent, // 0x6
162     UserInfo | Default | QueryPercent, // 0x7
163     UserInfo | Default | QueryPercent, // 0x8
164     UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
165     UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
166     UserInfo | Default | QueryPercent, // 0xB
167     UserInfo | Default | QueryPercent, // 0xC
168     UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
169     UserInfo | Default | QueryPercent, // 0xE
170     UserInfo | Default | QueryPercent, // 0xF
171     UserInfo | Default | QueryPercent, // 0x10
172     UserInfo | Default | QueryPercent, // 0x11
173     UserInfo | Default | QueryPercent, // 0x12
174     UserInfo | Default | QueryPercent, // 0x13
175     UserInfo | Default | QueryPercent, // 0x14
176     UserInfo | Default | QueryPercent, // 0x15
177     UserInfo | Default | QueryPercent, // 0x16
178     UserInfo | Default | QueryPercent, // 0x17
179     UserInfo | Default | QueryPercent, // 0x18
180     UserInfo | Default | QueryPercent, // 0x19
181     UserInfo | Default | QueryPercent, // 0x1A
182     UserInfo | Default | QueryPercent, // 0x1B
183     UserInfo | Default | QueryPercent, // 0x1C
184     UserInfo | Default | QueryPercent, // 0x1D
185     UserInfo | Default | QueryPercent, // 0x1E
186     UserInfo | Default | QueryPercent, // 0x1F
187     UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
188     0, // '!'
189     UserInfo | Default | QueryPercent, // '"'
190     UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
191     0, // '$'
192     ForbiddenHost, // '%'
193     0, // '&'
194     0, // '''
195     0, // '('
196     0, // ')'
197     0, // '*'
198     ValidScheme, // '+'
199     0, // ','
200     ValidScheme, // '-'
201     ValidScheme, // '.'
202     UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
203     ValidScheme, // '0'
204     ValidScheme, // '1'
205     ValidScheme, // '2'
206     ValidScheme, // '3'
207     ValidScheme, // '4'
208     ValidScheme, // '5'
209     ValidScheme, // '6'
210     ValidScheme, // '7'
211     ValidScheme, // '8'
212     ValidScheme, // '9'
213     UserInfo | ForbiddenHost, // ':'
214     UserInfo, // ';'
215     UserInfo | Default | QueryPercent, // '<'
216     UserInfo, // '='
217     UserInfo | Default | QueryPercent, // '>'
218     UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
219     UserInfo | ForbiddenHost, // '@'
220     ValidScheme, // 'A'
221     ValidScheme, // 'B'
222     ValidScheme, // 'C'
223     ValidScheme, // 'D'
224     ValidScheme, // 'E'
225     ValidScheme, // 'F'
226     ValidScheme, // 'G'
227     ValidScheme, // 'H'
228     ValidScheme, // 'I'
229     ValidScheme, // 'J'
230     ValidScheme, // 'K'
231     ValidScheme, // 'L'
232     ValidScheme, // 'M'
233     ValidScheme, // 'N'
234     ValidScheme, // 'O'
235     ValidScheme, // 'P'
236     ValidScheme, // 'Q'
237     ValidScheme, // 'R'
238     ValidScheme, // 'S'
239     ValidScheme, // 'T'
240     ValidScheme, // 'U'
241     ValidScheme, // 'V'
242     ValidScheme, // 'W'
243     ValidScheme, // 'X'
244     ValidScheme, // 'Y'
245     ValidScheme, // 'Z'
246     UserInfo | ForbiddenHost, // '['
247     UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
248     UserInfo | ForbiddenHost, // ']'
249     UserInfo, // '^'
250     0, // '_'
251     UserInfo | Default, // '`'
252     ValidScheme, // 'a'
253     ValidScheme, // 'b'
254     ValidScheme, // 'c'
255     ValidScheme, // 'd'
256     ValidScheme, // 'e'
257     ValidScheme, // 'f'
258     ValidScheme, // 'g'
259     ValidScheme, // 'h'
260     ValidScheme, // 'i'
261     ValidScheme, // 'j'
262     ValidScheme, // 'k'
263     ValidScheme, // 'l'
264     ValidScheme, // 'm'
265     ValidScheme, // 'n'
266     ValidScheme, // 'o'
267     ValidScheme, // 'p'
268     ValidScheme, // 'q'
269     ValidScheme, // 'r'
270     ValidScheme, // 's'
271     ValidScheme, // 't'
272     ValidScheme, // 'u'
273     ValidScheme, // 'v'
274     ValidScheme, // 'w'
275     ValidScheme, // 'x'
276     ValidScheme, // 'y'
277     ValidScheme, // 'z'
278     UserInfo | Default, // '{'
279     UserInfo, // '|'
280     UserInfo | Default, // '}'
281     0, // '~'
282     QueryPercent, // 0x7F
283     QueryPercent, // 0x80
284     QueryPercent, // 0x81
285     QueryPercent, // 0x82
286     QueryPercent, // 0x83
287     QueryPercent, // 0x84
288     QueryPercent, // 0x85
289     QueryPercent, // 0x86
290     QueryPercent, // 0x87
291     QueryPercent, // 0x88
292     QueryPercent, // 0x89
293     QueryPercent, // 0x8A
294     QueryPercent, // 0x8B
295     QueryPercent, // 0x8C
296     QueryPercent, // 0x8D
297     QueryPercent, // 0x8E
298     QueryPercent, // 0x8F
299     QueryPercent, // 0x90
300     QueryPercent, // 0x91
301     QueryPercent, // 0x92
302     QueryPercent, // 0x93
303     QueryPercent, // 0x94
304     QueryPercent, // 0x95
305     QueryPercent, // 0x96
306     QueryPercent, // 0x97
307     QueryPercent, // 0x98
308     QueryPercent, // 0x99
309     QueryPercent, // 0x9A
310     QueryPercent, // 0x9B
311     QueryPercent, // 0x9C
312     QueryPercent, // 0x9D
313     QueryPercent, // 0x9E
314     QueryPercent, // 0x9F
315     QueryPercent, // 0xA0
316     QueryPercent, // 0xA1
317     QueryPercent, // 0xA2
318     QueryPercent, // 0xA3
319     QueryPercent, // 0xA4
320     QueryPercent, // 0xA5
321     QueryPercent, // 0xA6
322     QueryPercent, // 0xA7
323     QueryPercent, // 0xA8
324     QueryPercent, // 0xA9
325     QueryPercent, // 0xAA
326     QueryPercent, // 0xAB
327     QueryPercent, // 0xAC
328     QueryPercent, // 0xAD
329     QueryPercent, // 0xAE
330     QueryPercent, // 0xAF
331     QueryPercent, // 0xB0
332     QueryPercent, // 0xB1
333     QueryPercent, // 0xB2
334     QueryPercent, // 0xB3
335     QueryPercent, // 0xB4
336     QueryPercent, // 0xB5
337     QueryPercent, // 0xB6
338     QueryPercent, // 0xB7
339     QueryPercent, // 0xB8
340     QueryPercent, // 0xB9
341     QueryPercent, // 0xBA
342     QueryPercent, // 0xBB
343     QueryPercent, // 0xBC
344     QueryPercent, // 0xBD
345     QueryPercent, // 0xBE
346     QueryPercent, // 0xBF
347     QueryPercent, // 0xC0
348     QueryPercent, // 0xC1
349     QueryPercent, // 0xC2
350     QueryPercent, // 0xC3
351     QueryPercent, // 0xC4
352     QueryPercent, // 0xC5
353     QueryPercent, // 0xC6
354     QueryPercent, // 0xC7
355     QueryPercent, // 0xC8
356     QueryPercent, // 0xC9
357     QueryPercent, // 0xCA
358     QueryPercent, // 0xCB
359     QueryPercent, // 0xCC
360     QueryPercent, // 0xCD
361     QueryPercent, // 0xCE
362     QueryPercent, // 0xCF
363     QueryPercent, // 0xD0
364     QueryPercent, // 0xD1
365     QueryPercent, // 0xD2
366     QueryPercent, // 0xD3
367     QueryPercent, // 0xD4
368     QueryPercent, // 0xD5
369     QueryPercent, // 0xD6
370     QueryPercent, // 0xD7
371     QueryPercent, // 0xD8
372     QueryPercent, // 0xD9
373     QueryPercent, // 0xDA
374     QueryPercent, // 0xDB
375     QueryPercent, // 0xDC
376     QueryPercent, // 0xDD
377     QueryPercent, // 0xDE
378     QueryPercent, // 0xDF
379     QueryPercent, // 0xE0
380     QueryPercent, // 0xE1
381     QueryPercent, // 0xE2
382     QueryPercent, // 0xE3
383     QueryPercent, // 0xE4
384     QueryPercent, // 0xE5
385     QueryPercent, // 0xE6
386     QueryPercent, // 0xE7
387     QueryPercent, // 0xE8
388     QueryPercent, // 0xE9
389     QueryPercent, // 0xEA
390     QueryPercent, // 0xEB
391     QueryPercent, // 0xEC
392     QueryPercent, // 0xED
393     QueryPercent, // 0xEE
394     QueryPercent, // 0xEF
395     QueryPercent, // 0xF0
396     QueryPercent, // 0xF1
397     QueryPercent, // 0xF2
398     QueryPercent, // 0xF3
399     QueryPercent, // 0xF4
400     QueryPercent, // 0xF5
401     QueryPercent, // 0xF6
402     QueryPercent, // 0xF7
403     QueryPercent, // 0xF8
404     QueryPercent, // 0xF9
405     QueryPercent, // 0xFA
406     QueryPercent, // 0xFB
407     QueryPercent, // 0xFC
408     QueryPercent, // 0xFD
409     QueryPercent, // 0xFE
410     QueryPercent, // 0xFF
411 };
412
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
423 ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
424 {
425     if (characterClassTable[byte] & QueryPercent)
426         return true;
427     if (byte == '\'' && urlIsSpecial)
428         return true;
429     return false;
430 }
431
432 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
433 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
434 {
435     ++iterator;
436     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
437         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
438             syntaxViolation(iteratorForSyntaxViolationPosition);
439         ++iterator;
440     }
441 }
442
443 template<typename CharacterType>
444 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
445 {
446     if (iterator.atEnd())
447         return false;
448     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
449     if (iterator.atEnd())
450         return false;
451     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
452     return iterator.atEnd();
453 }
454
455 template<typename CharacterType>
456 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
457 {
458     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
459         return false;
460     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
461     if (iterator.atEnd())
462         return false;
463     if (*iterator == ':')
464         return true;
465     if (UNLIKELY(*iterator == '|'))
466         return true;
467     return false;
468 }
469
470 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
471 {
472     ASSERT(isASCII(codePoint));
473     if (UNLIKELY(m_didSeeSyntaxViolation))
474         m_asciiBuffer.append(codePoint);
475 }
476
477 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
478 {
479     if (UNLIKELY(m_didSeeSyntaxViolation))
480         m_asciiBuffer.append(characters, length);
481 }
482
483 template<typename CharacterType>
484 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
485 {
486     ASSERT(isWindowsDriveLetter(iterator));
487     appendToASCIIBuffer(*iterator);
488     advance(iterator);
489     ASSERT(!iterator.atEnd());
490     ASSERT(*iterator == ':' || *iterator == '|');
491     if (*iterator == '|')
492         syntaxViolation(iterator);
493     appendToASCIIBuffer(':');
494     advance(iterator);
495 }
496
497 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
498 {
499     if (base.protocolIs("file")) {
500         RELEASE_ASSERT(base.m_portEnd < base.m_string.length());
501         if (base.m_string.is8Bit()) {
502             const LChar* begin = base.m_string.characters8();
503             CodePointIterator<LChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
504             if (isWindowsDriveLetter(c)) {
505                 appendWindowsDriveLetter(c);
506                 return true;
507             }
508         } else {
509             const UChar* begin = base.m_string.characters16();
510             CodePointIterator<UChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
511             if (isWindowsDriveLetter(c)) {
512                 appendWindowsDriveLetter(c);
513                 return true;
514             }
515         }
516     }
517     return false;
518 }
519
520 template<typename CharacterType>
521 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
522 {
523     if (!isWindowsDriveLetter(iterator))
524         return true;
525     if (iterator.atEnd())
526         return false;
527     advance(iterator);
528     if (iterator.atEnd())
529         return true;
530     advance(iterator);
531     if (iterator.atEnd())
532         return true;
533     return !isSlashQuestionOrHash(*iterator);
534 }
535
536 static void percentEncodeByte(uint8_t byte, StringVector<LChar>& buffer)
537 {
538     buffer.append('%');
539     buffer.append(upperNibbleToASCIIHexDigit(byte));
540     buffer.append(lowerNibbleToASCIIHexDigit(byte));
541 }
542
543 void URLParser::percentEncodeByte(uint8_t byte)
544 {
545     ASSERT(m_didSeeSyntaxViolation);
546     appendToASCIIBuffer('%');
547     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
548     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
549 }
550
// Percent-encoded UTF-8 bytes (EF BF BD) of the replacement character, emitted in place of
// code points that U_IS_UNICODE_CHAR rejects. The length excludes the terminating NUL.
const char replacementCharacterUTF8PercentEncoded[] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
553
554 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
555 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
556 {
557     ASSERT(!iterator.atEnd());
558     UChar32 codePoint = *iterator;
559     if (LIKELY(isASCII(codePoint))) {
560         if (UNLIKELY(isInCodeSet(codePoint))) {
561             syntaxViolation(iterator);
562             percentEncodeByte(codePoint);
563         } else
564             appendToASCIIBuffer(codePoint);
565         return;
566     }
567     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
568     syntaxViolation(iterator);
569     
570     if (!U_IS_UNICODE_CHAR(codePoint)) {
571         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
572         return;
573     }
574     
575     uint8_t buffer[U8_MAX_LENGTH];
576     int32_t offset = 0;
577     U8_APPEND_UNSAFE(buffer, offset, codePoint);
578     for (int32_t i = 0; i < offset; ++i)
579         percentEncodeByte(buffer[i]);
580 }
581
582 template<typename CharacterType>
583 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
584 {
585     ASSERT(!iterator.atEnd());
586     UChar32 codePoint = *iterator;
587     if (LIKELY(isASCII(codePoint))) {
588         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
589             syntaxViolation(iterator);
590             percentEncodeByte(codePoint);
591         } else
592             appendToASCIIBuffer(codePoint);
593         return;
594     }
595     
596     syntaxViolation(iterator);
597     
598     if (!U_IS_UNICODE_CHAR(codePoint)) {
599         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
600         return;
601     }
602
603     uint8_t buffer[U8_MAX_LENGTH];
604     int32_t offset = 0;
605     U8_APPEND_UNSAFE(buffer, offset, codePoint);
606     for (int32_t i = 0; i < offset; ++i) {
607         auto byte = buffer[i];
608         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
609             percentEncodeByte(byte);
610         else
611             appendToASCIIBuffer(byte);
612     }
613 }
614
615 template<typename CharacterType>
616 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
617 {
618     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
619     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
620     const char* data = encoded.data();
621     size_t length = encoded.length();
622     
623     if (!length == !iterator.atEnd()) {
624         syntaxViolation(iterator);
625         return;
626     }
627     
628     size_t i = 0;
629     for (; i < length; ++i) {
630         ASSERT(!iterator.atEnd());
631         uint8_t byte = data[i];
632         if (UNLIKELY(byte != *iterator)) {
633             syntaxViolation(iterator);
634             break;
635         }
636         if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
637             syntaxViolation(iterator);
638             break;
639         }
640         appendToASCIIBuffer(byte);
641         ++iterator;
642     }
643     while (!iterator.atEnd() && isTabOrNewline(*iterator))
644         ++iterator;
645     ASSERT((i == length) == iterator.atEnd());
646     for (; i < length; ++i) {
647         ASSERT(m_didSeeSyntaxViolation);
648         uint8_t byte = data[i];
649         if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
650             percentEncodeByte(byte);
651         else
652             appendToASCIIBuffer(byte);
653     }
654 }
655
656 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
657 {
658     static const uint16_t ftpPort = 21;
659     static const uint16_t gopherPort = 70;
660     static const uint16_t httpPort = 80;
661     static const uint16_t httpsPort = 443;
662     static const uint16_t wsPort = 80;
663     static const uint16_t wssPort = 443;
664     
665     auto length = scheme.length();
666     if (!length)
667         return std::nullopt;
668     switch (scheme[0]) {
669     case 'w':
670         switch (length) {
671         case 2:
672             if (scheme[1] == 's')
673                 return wsPort;
674             return std::nullopt;
675         case 3:
676             if (scheme[1] == 's'
677                 && scheme[2] == 's')
678                 return wssPort;
679             return std::nullopt;
680         default:
681             return false;
682         }
683     case 'h':
684         switch (length) {
685         case 4:
686             if (scheme[1] == 't'
687                 && scheme[2] == 't'
688                 && scheme[3] == 'p')
689                 return httpPort;
690             return std::nullopt;
691         case 5:
692             if (scheme[1] == 't'
693                 && scheme[2] == 't'
694                 && scheme[3] == 'p'
695                 && scheme[4] == 's')
696                 return httpsPort;
697             return std::nullopt;
698         default:
699             return std::nullopt;
700         }
701     case 'g':
702         if (length == 6
703             && scheme[1] == 'o'
704             && scheme[2] == 'p'
705             && scheme[3] == 'h'
706             && scheme[4] == 'e'
707             && scheme[5] == 'r')
708             return gopherPort;
709         return std::nullopt;
710     case 'f':
711         if (length == 3
712             && scheme[1] == 't'
713             && scheme[2] == 'p')
714             return ftpPort;
715         return std::nullopt;
716     default:
717         return std::nullopt;
718     }
719 }
720
// Classification returned by scheme(): one enumerator per recognized scheme string, plus
// NonSpecial for everything else.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    Gopher, // "gopher"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial, // Any other scheme, including the empty string.
};
731
732 ALWAYS_INLINE static Scheme scheme(StringView scheme)
733 {
734     auto length = scheme.length();
735     if (!length)
736         return Scheme::NonSpecial;
737     switch (scheme[0]) {
738     case 'f':
739         switch (length) {
740         case 3:
741             if (scheme[1] == 't'
742                 && scheme[2] == 'p')
743                 return Scheme::FTP;
744             return Scheme::NonSpecial;
745         case 4:
746             if (scheme[1] == 'i'
747                 && scheme[2] == 'l'
748                 && scheme[3] == 'e')
749                 return Scheme::File;
750             return Scheme::NonSpecial;
751         default:
752             return Scheme::NonSpecial;
753         }
754     case 'g':
755         if (length == 6
756             && scheme[1] == 'o'
757             && scheme[2] == 'p'
758             && scheme[3] == 'h'
759             && scheme[4] == 'e'
760             && scheme[5] == 'r')
761             return Scheme::Gopher;
762         return Scheme::NonSpecial;
763     case 'h':
764         switch (length) {
765         case 4:
766             if (scheme[1] == 't'
767                 && scheme[2] == 't'
768                 && scheme[3] == 'p')
769                 return Scheme::HTTP;
770             return Scheme::NonSpecial;
771         case 5:
772             if (scheme[1] == 't'
773                 && scheme[2] == 't'
774                 && scheme[3] == 'p'
775                 && scheme[4] == 's')
776                 return Scheme::HTTPS;
777             return Scheme::NonSpecial;
778         default:
779             return Scheme::NonSpecial;
780         }
781     case 'w':
782         switch (length) {
783         case 2:
784             if (scheme[1] == 's')
785                 return Scheme::WS;
786             return Scheme::NonSpecial;
787         case 3:
788             if (scheme[1] == 's'
789                 && scheme[2] == 's')
790                 return Scheme::WSS;
791             return Scheme::NonSpecial;
792         default:
793             return Scheme::NonSpecial;
794         }
795     default:
796         return Scheme::NonSpecial;
797     }
798 }
799
800 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
801 {
802     if (scheme.isEmpty())
803         return std::nullopt;
804
805     if (!isASCIIAlpha(scheme[0]))
806         return std::nullopt;
807
808     for (size_t i = 1; i < scheme.length(); ++i) {
809         if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
810             continue;
811         return std::nullopt;
812     }
813
814     return scheme.convertToASCIILowercase();
815 }
816
817 bool URLParser::isSpecialScheme(const String& schemeArg)
818 {
819     return scheme(schemeArg) != Scheme::NonSpecial;
820 }
821
822 enum class URLParser::URLPart {
823     SchemeEnd,
824     UserStart,
825     UserEnd,
826     PasswordEnd,
827     HostEnd,
828     PortEnd,
829     PathAfterLastSlash,
830     PathEnd,
831     QueryEnd,
832 };
833
834 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
835 {
836     switch (part) {
837     case URLPart::QueryEnd:
838         return url.m_queryEnd;
839     case URLPart::PathEnd:
840         return url.m_pathEnd;
841     case URLPart::PathAfterLastSlash:
842         return url.m_pathAfterLastSlash;
843     case URLPart::PortEnd:
844         return url.m_portEnd;
845     case URLPart::HostEnd:
846         return url.m_hostEnd;
847     case URLPart::PasswordEnd:
848         return url.m_passwordEnd;
849     case URLPart::UserEnd:
850         return url.m_userEnd;
851     case URLPart::UserStart:
852         return url.m_userStart;
853     case URLPart::SchemeEnd:
854         return url.m_schemeEnd;
855     }
856     ASSERT_NOT_REACHED();
857     return 0;
858 }
859
860 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
861 {
862     RELEASE_ASSERT(length <= string.length());
863     if (string.isNull())
864         return;
865     ASSERT(m_asciiBuffer.isEmpty());
866     if (string.is8Bit())
867         appendToASCIIBuffer(string.characters8(), length);
868     else {
869         const UChar* characters = string.characters16();
870         for (size_t i = 0; i < length; ++i) {
871             UChar c = characters[i];
872             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
873             appendToASCIIBuffer(c);
874         }
875     }
876 }
877
878 template<typename CharacterType>
879 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
880 {
881     syntaxViolation(iterator);
882
883     m_asciiBuffer.clear();
884     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
885     switch (part) {
886     case URLPart::QueryEnd:
887         m_url.m_queryEnd = base.m_queryEnd;
888         FALLTHROUGH;
889     case URLPart::PathEnd:
890         m_url.m_pathEnd = base.m_pathEnd;
891         FALLTHROUGH;
892     case URLPart::PathAfterLastSlash:
893         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
894         FALLTHROUGH;
895     case URLPart::PortEnd:
896         m_url.m_portEnd = base.m_portEnd;
897         FALLTHROUGH;
898     case URLPart::HostEnd:
899         m_url.m_hostEnd = base.m_hostEnd;
900         FALLTHROUGH;
901     case URLPart::PasswordEnd:
902         m_url.m_passwordEnd = base.m_passwordEnd;
903         FALLTHROUGH;
904     case URLPart::UserEnd:
905         m_url.m_userEnd = base.m_userEnd;
906         FALLTHROUGH;
907     case URLPart::UserStart:
908         m_url.m_userStart = base.m_userStart;
909         FALLTHROUGH;
910     case URLPart::SchemeEnd:
911         m_url.m_isValid = base.m_isValid;
912         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
913         m_url.m_schemeEnd = base.m_schemeEnd;
914     }
915     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
916     case Scheme::WS:
917     case Scheme::WSS:
918         isUTF8Encoding = true;
919         m_urlIsSpecial = true;
920         return;
921     case Scheme::File:
922         m_urlIsFile = true;
923         FALLTHROUGH;
924     case Scheme::FTP:
925     case Scheme::Gopher:
926     case Scheme::HTTP:
927     case Scheme::HTTPS:
928         m_urlIsSpecial = true;
929         return;
930     case Scheme::NonSpecial:
931         m_urlIsSpecial = false;
932         isUTF8Encoding = true;
933         return;
934     }
935     ASSERT_NOT_REACHED();
936 }
937
// The two hex digits of '.' (0x2E) as they follow '%' in a percent-encoded dot.
// The second digit is stored in lowercase; the dot-segment helpers compare it
// against toASCIILower(*c).
static const char dotASCIICode[2] = {'2', 'e'};
939
940 template<typename CharacterType>
941 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
942 {
943     if (c.atEnd())
944         return false;
945     if (*c == '.') {
946         advance<CharacterType, ReportSyntaxViolation::No>(c);
947         return c.atEnd() || isSlashQuestionOrHash(*c);
948     }
949     if (*c != '%')
950         return false;
951     advance<CharacterType, ReportSyntaxViolation::No>(c);
952     if (c.atEnd() || *c != dotASCIICode[0])
953         return false;
954     advance<CharacterType, ReportSyntaxViolation::No>(c);
955     if (c.atEnd())
956         return false;
957     if (toASCIILower(*c) == dotASCIICode[1]) {
958         advance<CharacterType, ReportSyntaxViolation::No>(c);
959         return c.atEnd() || isSlashQuestionOrHash(*c);
960     }
961     return false;
962 }
963
964 template<typename CharacterType>
965 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
966 {
967     if (c.atEnd())
968         return false;
969     if (*c == '.') {
970         advance<CharacterType, ReportSyntaxViolation::No>(c);
971         return isSingleDotPathSegment(c);
972     }
973     if (*c != '%')
974         return false;
975     advance<CharacterType, ReportSyntaxViolation::No>(c);
976     if (c.atEnd() || *c != dotASCIICode[0])
977         return false;
978     advance<CharacterType, ReportSyntaxViolation::No>(c);
979     if (c.atEnd())
980         return false;
981     if (toASCIILower(*c) == dotASCIICode[1]) {
982         advance<CharacterType, ReportSyntaxViolation::No>(c);
983         return isSingleDotPathSegment(c);
984     }
985     return false;
986 }
987
988 template<typename CharacterType>
989 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
990 {
991     ASSERT(isSingleDotPathSegment(c));
992     if (*c == '.') {
993         advance(c);
994         if (!c.atEnd()) {
995             if (*c == '/' || *c == '\\')
996                 advance(c);
997             else
998                 ASSERT(*c == '?' || *c == '#');
999         }
1000     } else {
1001         ASSERT(*c == '%');
1002         advance(c);
1003         ASSERT(*c == dotASCIICode[0]);
1004         advance(c);
1005         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1006         advance(c);
1007         if (!c.atEnd()) {
1008             if (*c == '/' || *c == '\\')
1009                 advance(c);
1010             else
1011                 ASSERT(*c == '?' || *c == '#');
1012         }
1013     }
1014 }
1015
1016 template<typename CharacterType>
1017 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1018 {
1019     ASSERT(isDoubleDotPathSegment(c));
1020     if (*c == '.')
1021         advance(c);
1022     else {
1023         ASSERT(*c == '%');
1024         advance(c);
1025         ASSERT(*c == dotASCIICode[0]);
1026         advance(c);
1027         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1028         advance(c);
1029     }
1030     consumeSingleDotPathSegment(c);
1031 }
1032
1033 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1034 {
1035     ASSERT(m_didSeeSyntaxViolation);
1036     if (!m_urlIsFile)
1037         return true;
1038
1039     ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1040     CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1041     if (newPathAfterLastSlash == m_url.m_portEnd + 1 && isWindowsDriveLetter(componentToPop))
1042         return false;
1043     return true;
1044 }
1045
1046 void URLParser::popPath()
1047 {
1048     ASSERT(m_didSeeSyntaxViolation);
1049     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
1050         auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1051         if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1052             newPathAfterLastSlash--;
1053         while (newPathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[newPathAfterLastSlash] != '/')
1054             newPathAfterLastSlash--;
1055         newPathAfterLastSlash++;
1056         if (shouldPopPath(newPathAfterLastSlash))
1057             m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1058     }
1059     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1060 }
1061
1062 template<typename CharacterType>
1063 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1064 {
1065     if (m_didSeeSyntaxViolation)
1066         return;
1067     m_didSeeSyntaxViolation = true;
1068     
1069     ASSERT(m_asciiBuffer.isEmpty());
1070     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1071     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1072     m_asciiBuffer.reserveCapacity(m_inputString.length());
1073     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1074         ASSERT(isASCII(m_inputString[i]));
1075         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1076     }
1077 }
1078
// Records a parse failure: invalidates the result URL and then stores the
// original input string as the URL's string.
void URLParser::failure()
{
    m_url.invalidate();
    // NOTE(review): assigned after invalidate(); presumably invalidate() resets
    // m_string, so the order matters — confirm against URL::invalidate().
    m_url.m_string = m_inputString;
}
1084
1085 template<typename CharacterType>
1086 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1087 {
1088     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1089         return false;
1090     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1091     return true;
1092 }
1093
1094 template<typename CharacterType>
1095 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1096 {
1097     if (!checkLocalhostCodePoint(iterator, 'l'))
1098         return false;
1099     if (!checkLocalhostCodePoint(iterator, 'o'))
1100         return false;
1101     if (!checkLocalhostCodePoint(iterator, 'c'))
1102         return false;
1103     if (!checkLocalhostCodePoint(iterator, 'a'))
1104         return false;
1105     if (!checkLocalhostCodePoint(iterator, 'l'))
1106         return false;
1107     if (!checkLocalhostCodePoint(iterator, 'h'))
1108         return false;
1109     if (!checkLocalhostCodePoint(iterator, 'o'))
1110         return false;
1111     if (!checkLocalhostCodePoint(iterator, 's'))
1112         return false;
1113     if (!checkLocalhostCodePoint(iterator, 't'))
1114         return false;
1115     return iterator.atEnd();
1116 }
1117
1118 bool URLParser::isLocalhost(StringView view)
1119 {
1120     if (view.is8Bit())
1121         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1122     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1123 }
1124
1125 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1126 {
1127     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1128         ASSERT(start + length <= m_asciiBuffer.size());
1129         return StringView(m_asciiBuffer.data() + start, length);
1130     }
1131     ASSERT(start + length <= m_inputString.length());
1132     return StringView(m_inputString).substring(start, length);
1133 }
1134
1135 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1136 {
1137     if (UNLIKELY(m_didSeeSyntaxViolation))
1138         return m_asciiBuffer[position];
1139     return m_inputString[position];
1140 }
1141
1142 template<typename CharacterType>
1143 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1144 {
1145     if (UNLIKELY(m_didSeeSyntaxViolation))
1146         return m_asciiBuffer.size();
1147     
1148     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1149 }
1150
1151 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1152     : m_inputString(input)
1153 {
1154     if (input.isNull()) {
1155         if (base.isValid() && !base.m_cannotBeABaseURL) {
1156             m_url = base;
1157             m_url.removeFragmentIdentifier();
1158         }
1159         return;
1160     }
1161
1162     if (input.is8Bit()) {
1163         m_inputBegin = input.characters8();
1164         parse(input.characters8(), input.length(), base, encoding);
1165     } else {
1166         m_inputBegin = input.characters16();
1167         parse(input.characters16(), input.length(), base, encoding);
1168     }
1169
1170     ASSERT(!m_url.m_isValid
1171         || m_didSeeSyntaxViolation == (m_url.string() != input)
1172         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1173             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1174     ASSERT(internalValuesConsistent(m_url));
1175 #if !ASSERT_DISABLED
1176     if (!m_didSeeSyntaxViolation) {
1177         // Force a syntax violation at the beginning to make sure we get the same result.
1178         URLParser parser(makeString(" ", input), base, encoding);
1179         URL parsed = parser.result();
1180         if (parsed.isValid())
1181             ASSERT(allValuesEqual(parser.result(), m_url));
1182     }
1183 #endif
1184 }
1185
1186 template<typename CharacterType>
1187 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1188 {
1189     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1190     m_url = { };
1191     ASSERT(m_asciiBuffer.isEmpty());
1192     
1193     bool isUTF8Encoding = encoding == UTF8Encoding();
1194     Vector<UChar> queryBuffer;
1195
1196     unsigned endIndex = length;
1197     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1198         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1199         endIndex--;
1200     }
1201     CodePointIterator<CharacterType> c(input, input + endIndex);
1202     CodePointIterator<CharacterType> authorityOrHostBegin;
1203     CodePointIterator<CharacterType> queryBegin;
1204     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1205         syntaxViolation(c);
1206         ++c;
1207     }
1208     auto beginAfterControlAndSpace = c;
1209
1210     enum class State : uint8_t {
1211         SchemeStart,
1212         Scheme,
1213         NoScheme,
1214         SpecialRelativeOrAuthority,
1215         PathOrAuthority,
1216         Relative,
1217         RelativeSlash,
1218         SpecialAuthoritySlashes,
1219         SpecialAuthorityIgnoreSlashes,
1220         AuthorityOrHost,
1221         Host,
1222         File,
1223         FileSlash,
1224         FileHost,
1225         PathStart,
1226         Path,
1227         CannotBeABaseURLPath,
1228         UTF8Query,
1229         NonUTF8Query,
1230         Fragment,
1231     };
1232
1233 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1234 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1235
1236     State state = State::SchemeStart;
1237     while (!c.atEnd()) {
1238         if (UNLIKELY(isTabOrNewline(*c))) {
1239             syntaxViolation(c);
1240             ++c;
1241             continue;
1242         }
1243
1244         switch (state) {
1245         case State::SchemeStart:
1246             LOG_STATE("SchemeStart");
1247             if (isASCIIAlpha(*c)) {
1248                 if (UNLIKELY(isASCIIUpper(*c)))
1249                     syntaxViolation(c);
1250                 appendToASCIIBuffer(toASCIILower(*c));
1251                 advance(c);
1252                 if (c.atEnd()) {
1253                     m_asciiBuffer.clear();
1254                     state = State::NoScheme;
1255                     c = beginAfterControlAndSpace;
1256                 }
1257                 state = State::Scheme;
1258             } else
1259                 state = State::NoScheme;
1260             break;
1261         case State::Scheme:
1262             LOG_STATE("Scheme");
1263             if (isValidSchemeCharacter(*c)) {
1264                 if (UNLIKELY(isASCIIUpper(*c)))
1265                     syntaxViolation(c);
1266                 appendToASCIIBuffer(toASCIILower(*c));
1267             } else if (*c == ':') {
1268                 m_url.m_schemeEnd = currentPosition(c);
1269                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1270                 appendToASCIIBuffer(':');
1271                 switch (scheme(urlScheme)) {
1272                 case Scheme::File:
1273                     m_urlIsSpecial = true;
1274                     m_urlIsFile = true;
1275                     state = State::File;
1276                     ++c;
1277                     break;
1278                 case Scheme::WS:
1279                 case Scheme::WSS:
1280                     isUTF8Encoding = true;
1281                     m_urlIsSpecial = true;
1282                     if (base.protocolIs(urlScheme))
1283                         state = State::SpecialRelativeOrAuthority;
1284                     else
1285                         state = State::SpecialAuthoritySlashes;
1286                     ++c;
1287                     break;
1288                 case Scheme::HTTP:
1289                 case Scheme::HTTPS:
1290                     m_url.m_protocolIsInHTTPFamily = true;
1291                     FALLTHROUGH;
1292                 case Scheme::FTP:
1293                 case Scheme::Gopher:
1294                     m_urlIsSpecial = true;
1295                     if (base.protocolIs(urlScheme))
1296                         state = State::SpecialRelativeOrAuthority;
1297                     else
1298                         state = State::SpecialAuthoritySlashes;
1299                     ++c;
1300                     break;
1301                 case Scheme::NonSpecial:
1302                     isUTF8Encoding = true;
1303                     auto maybeSlash = c;
1304                     advance(maybeSlash);
1305                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1306                         appendToASCIIBuffer('/');
1307                         c = maybeSlash;
1308                         state = State::PathOrAuthority;
1309                         ASSERT(*c == '/');
1310                         ++c;
1311                         m_url.m_userStart = currentPosition(c);
1312                     } else {
1313                         ++c;
1314                         m_url.m_userStart = currentPosition(c);
1315                         m_url.m_userEnd = m_url.m_userStart;
1316                         m_url.m_passwordEnd = m_url.m_userStart;
1317                         m_url.m_hostEnd = m_url.m_userStart;
1318                         m_url.m_portEnd = m_url.m_userStart;
1319                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1320                         m_url.m_cannotBeABaseURL = true;
1321                         state = State::CannotBeABaseURLPath;
1322                     }
1323                     break;
1324                 }
1325                 break;
1326             } else {
1327                 m_asciiBuffer.clear();
1328                 state = State::NoScheme;
1329                 c = beginAfterControlAndSpace;
1330                 break;
1331             }
1332             advance(c);
1333             if (c.atEnd()) {
1334                 m_asciiBuffer.clear();
1335                 state = State::NoScheme;
1336                 c = beginAfterControlAndSpace;
1337             }
1338             break;
1339         case State::NoScheme:
1340             LOG_STATE("NoScheme");
1341             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1342                 failure();
1343                 return;
1344             }
1345             if (base.m_cannotBeABaseURL && *c == '#') {
1346                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1347                 state = State::Fragment;
1348                 appendToASCIIBuffer('#');
1349                 ++c;
1350                 break;
1351             }
1352             if (!base.protocolIs("file")) {
1353                 state = State::Relative;
1354                 break;
1355             }
1356             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1357             appendToASCIIBuffer(':');
1358             state = State::File;
1359             break;
1360         case State::SpecialRelativeOrAuthority:
1361             LOG_STATE("SpecialRelativeOrAuthority");
1362             if (*c == '/') {
1363                 appendToASCIIBuffer('/');
1364                 advance(c);
1365                 if (c.atEnd()) {
1366                     failure();
1367                     return;
1368                 }
1369                 if (*c == '/') {
1370                     appendToASCIIBuffer('/');
1371                     state = State::SpecialAuthorityIgnoreSlashes;
1372                     ++c;
1373                 } else
1374                     state = State::RelativeSlash;
1375             } else
1376                 state = State::Relative;
1377             break;
1378         case State::PathOrAuthority:
1379             LOG_STATE("PathOrAuthority");
1380             if (*c == '/') {
1381                 appendToASCIIBuffer('/');
1382                 state = State::AuthorityOrHost;
1383                 advance(c);
1384                 m_url.m_userStart = currentPosition(c);
1385                 authorityOrHostBegin = c;
1386             } else {
1387                 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1388                 m_url.m_userStart = currentPosition(c) - 1;
1389                 m_url.m_userEnd = m_url.m_userStart;
1390                 m_url.m_passwordEnd = m_url.m_userStart;
1391                 m_url.m_hostEnd = m_url.m_userStart;
1392                 m_url.m_portEnd = m_url.m_userStart;
1393                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1394                 state = State::Path;
1395             }
1396             break;
1397         case State::Relative:
1398             LOG_STATE("Relative");
1399             switch (*c) {
1400             case '/':
1401             case '\\':
1402                 state = State::RelativeSlash;
1403                 ++c;
1404                 break;
1405             case '?':
1406                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1407                 appendToASCIIBuffer('?');
1408                 ++c;
1409                 if (isUTF8Encoding)
1410                     state = State::UTF8Query;
1411                 else {
1412                     queryBegin = c;
1413                     state = State::NonUTF8Query;
1414                 }
1415                 break;
1416             case '#':
1417                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1418                 appendToASCIIBuffer('#');
1419                 state = State::Fragment;
1420                 ++c;
1421                 break;
1422             default:
1423                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1424                 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1425                     appendToASCIIBuffer('/');
1426                     m_url.m_pathAfterLastSlash = currentPosition(c);
1427                 }
1428                 state = State::Path;
1429                 break;
1430             }
1431             break;
1432         case State::RelativeSlash:
1433             LOG_STATE("RelativeSlash");
1434             if (*c == '/' || *c == '\\') {
1435                 ++c;
1436                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1437                 appendToASCIIBuffer("://", 3);
1438                 if (m_urlIsSpecial)
1439                     state = State::SpecialAuthorityIgnoreSlashes;
1440                 else {
1441                     m_url.m_userStart = currentPosition(c);
1442                     state = State::AuthorityOrHost;
1443                     authorityOrHostBegin = c;
1444                 }
1445             } else {
1446                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1447                 appendToASCIIBuffer('/');
1448                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1449                 state = State::Path;
1450             }
1451             break;
1452         case State::SpecialAuthoritySlashes:
1453             LOG_STATE("SpecialAuthoritySlashes");
1454             if (LIKELY(*c == '/' || *c == '\\')) {
1455                 if (UNLIKELY(*c == '\\'))
1456                     syntaxViolation(c);
1457                 appendToASCIIBuffer('/');
1458                 advance(c);
1459                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1460                     if (UNLIKELY(*c == '\\'))
1461                         syntaxViolation(c);
1462                     ++c;
1463                     appendToASCIIBuffer('/');
1464                 } else {
1465                     syntaxViolation(c);
1466                     appendToASCIIBuffer('/');
1467                 }
1468             } else {
1469                 syntaxViolation(c);
1470                 appendToASCIIBuffer("//", 2);
1471             }
1472             state = State::SpecialAuthorityIgnoreSlashes;
1473             break;
1474         case State::SpecialAuthorityIgnoreSlashes:
1475             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1476             if (*c == '/' || *c == '\\') {
1477                 syntaxViolation(c);
1478                 ++c;
1479             } else {
1480                 m_url.m_userStart = currentPosition(c);
1481                 state = State::AuthorityOrHost;
1482                 authorityOrHostBegin = c;
1483             }
1484             break;
1485         case State::AuthorityOrHost:
1486             do {
1487                 LOG_STATE("AuthorityOrHost");
1488                 if (*c == '@') {
1489                     auto lastAt = c;
1490                     auto findLastAt = c;
1491                     while (!findLastAt.atEnd()) {
1492                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1493                         if (*findLastAt == '@')
1494                             lastAt = findLastAt;
1495                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1496                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1497                             break;
1498                         ++findLastAt;
1499                     }
1500                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1501                     c = lastAt;
1502                     advance(c);
1503                     authorityOrHostBegin = c;
1504                     state = State::Host;
1505                     m_hostHasPercentOrNonASCII = false;
1506                     break;
1507                 }
1508                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1509                 if (isSlash || *c == '?' || *c == '#') {
1510                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1511                     if (iterator.atEnd()) {
1512                         if (m_urlIsSpecial)
1513                             return failure();
1514                         m_url.m_userEnd = currentPosition(c);
1515                         m_url.m_passwordEnd = m_url.m_userEnd;
1516                         m_url.m_hostEnd = m_url.m_userEnd;
1517                         m_url.m_portEnd = m_url.m_userEnd;
1518                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1519                     } else {
1520                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1521                         m_url.m_passwordEnd = m_url.m_userEnd;
1522                         if (!parseHostAndPort(iterator)) {
1523                             failure();
1524                             return;
1525                         }
1526                         if (UNLIKELY(!isSlash)) {
1527                             if (m_urlIsSpecial) {
1528                                 syntaxViolation(c);
1529                                 appendToASCIIBuffer('/');
1530                             }
1531                             m_url.m_pathAfterLastSlash = currentPosition(c);
1532                         }
1533                     }
1534                     state = State::Path;
1535                     break;
1536                 }
1537                 if (isPercentOrNonASCII(*c))
1538                     m_hostHasPercentOrNonASCII = true;
1539                 ++c;
1540             } while (!c.atEnd());
1541             break;
1542         case State::Host:
1543             do {
1544                 LOG_STATE("Host");
1545                 if (*c == '/' || *c == '?' || *c == '#') {
1546                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1547                         failure();
1548                         return;
1549                     }
1550                     if (*c == '?' || *c == '#') {
1551                         syntaxViolation(c);
1552                         appendToASCIIBuffer('/');
1553                         m_url.m_pathAfterLastSlash = currentPosition(c);
1554                     }
1555                     state = State::Path;
1556                     break;
1557                 }
1558                 if (isPercentOrNonASCII(*c))
1559                     m_hostHasPercentOrNonASCII = true;
1560                 ++c;
1561             } while (!c.atEnd());
1562             break;
1563         case State::File:
1564             LOG_STATE("File");
1565             switch (*c) {
1566             case '\\':
1567                 syntaxViolation(c);
1568                 FALLTHROUGH;
1569             case '/':
1570                 appendToASCIIBuffer('/');
1571                 state = State::FileSlash;
1572                 ++c;
1573                 break;
1574             case '?':
1575                 syntaxViolation(c);
1576                 if (base.isValid() && base.protocolIs("file")) {
1577                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1578                     appendToASCIIBuffer('?');
1579                     ++c;
1580                 } else {
1581                     appendToASCIIBuffer("///?", 4);
1582                     ++c;
1583                     m_url.m_userStart = currentPosition(c) - 2;
1584                     m_url.m_userEnd = m_url.m_userStart;
1585                     m_url.m_passwordEnd = m_url.m_userStart;
1586                     m_url.m_hostEnd = m_url.m_userStart;
1587                     m_url.m_portEnd = m_url.m_userStart;
1588                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1589                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1590                 }
1591                 if (isUTF8Encoding)
1592                     state = State::UTF8Query;
1593                 else {
1594                     queryBegin = c;
1595                     state = State::NonUTF8Query;
1596                 }
1597                 break;
1598             case '#':
1599                 syntaxViolation(c);
1600                 if (base.isValid() && base.protocolIs("file")) {
1601                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1602                     appendToASCIIBuffer('#');
1603                 } else {
1604                     appendToASCIIBuffer("///#", 4);
1605                     m_url.m_userStart = currentPosition(c) - 2;
1606                     m_url.m_userEnd = m_url.m_userStart;
1607                     m_url.m_passwordEnd = m_url.m_userStart;
1608                     m_url.m_hostEnd = m_url.m_userStart;
1609                     m_url.m_portEnd = m_url.m_userStart;
1610                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1611                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1612                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1613                 }
1614                 state = State::Fragment;
1615                 ++c;
1616                 break;
1617             default:
1618                 syntaxViolation(c);
1619                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1620                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1621                 else {
1622                     appendToASCIIBuffer("///", 3);
1623                     m_url.m_userStart = currentPosition(c) - 1;
1624                     m_url.m_userEnd = m_url.m_userStart;
1625                     m_url.m_passwordEnd = m_url.m_userStart;
1626                     m_url.m_hostEnd = m_url.m_userStart;
1627                     m_url.m_portEnd = m_url.m_userStart;
1628                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1629                     if (isWindowsDriveLetter(c))
1630                         appendWindowsDriveLetter(c);
1631                 }
1632                 state = State::Path;
1633                 break;
1634             }
1635             break;
1636         case State::FileSlash:
1637             LOG_STATE("FileSlash");
1638             if (LIKELY(*c == '/' || *c == '\\')) {
1639                 if (UNLIKELY(*c == '\\'))
1640                     syntaxViolation(c);
1641                 appendToASCIIBuffer('/');
1642                 advance(c);
1643                 m_url.m_userStart = currentPosition(c);
1644                 m_url.m_userEnd = m_url.m_userStart;
1645                 m_url.m_passwordEnd = m_url.m_userStart;
1646                 m_url.m_hostEnd = m_url.m_userStart;
1647                 m_url.m_portEnd = m_url.m_userStart;
1648                 authorityOrHostBegin = c;
1649                 state = State::FileHost;
1650                 break;
1651             }
1652             syntaxViolation(c);
1653             appendToASCIIBuffer("//", 2);
1654             m_url.m_userStart = currentPosition(c) - 1;
1655             m_url.m_userEnd = m_url.m_userStart;
1656             m_url.m_passwordEnd = m_url.m_userStart;
1657             m_url.m_hostEnd = m_url.m_userStart;
1658             m_url.m_portEnd = m_url.m_userStart;
1659             if (isWindowsDriveLetter(c)) {
1660                 appendWindowsDriveLetter(c);
1661                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1662             } else if (copyBaseWindowsDriveLetter(base)) {
1663                 appendToASCIIBuffer('/');
1664                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1665             } else
1666                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1667             state = State::Path;
1668             break;
1669         case State::FileHost:
1670             do {
1671                 LOG_STATE("FileHost");
1672                 if (isSlashQuestionOrHash(*c)) {
1673                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1674                         && isWindowsDriveLetter(authorityOrHostBegin);
1675                     if (windowsQuirk) {
1676                         syntaxViolation(authorityOrHostBegin);
1677                         appendToASCIIBuffer('/');
1678                         appendWindowsDriveLetter(authorityOrHostBegin);
1679                     }
1680                     if (windowsQuirk || authorityOrHostBegin == c) {
1681                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1682                         if (UNLIKELY(*c == '?')) {
1683                             syntaxViolation(c);
1684                             appendToASCIIBuffer("/?", 2);
1685                             ++c;
1686                             if (isUTF8Encoding)
1687                                 state = State::UTF8Query;
1688                             else {
1689                                 queryBegin = c;
1690                                 state = State::NonUTF8Query;
1691                             }
1692                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1693                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1694                             break;
1695                         }
1696                         if (UNLIKELY(*c == '#')) {
1697                             syntaxViolation(c);
1698                             appendToASCIIBuffer("/#", 2);
1699                             ++c;
1700                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1701                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1702                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1703                             state = State::Fragment;
1704                             break;
1705                         }
1706                         state = State::Path;
1707                         break;
1708                     }
1709                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1710                         failure();
1711                         return;
1712                     }
1713                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1714                         syntaxViolation(c);
1715                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1716                         m_url.m_hostEnd = currentPosition(c);
1717                         m_url.m_portEnd = m_url.m_hostEnd;
1718                     }
1719                     
1720                     state = State::PathStart;
1721                     break;
1722                 }
1723                 if (isPercentOrNonASCII(*c))
1724                     m_hostHasPercentOrNonASCII = true;
1725                 ++c;
1726             } while (!c.atEnd());
1727             break;
1728         case State::PathStart:
1729             LOG_STATE("PathStart");
1730             if (*c != '/' && *c != '\\')
1731                 ++c;
1732             state = State::Path;
1733             break;
1734         case State::Path:
1735             LOG_STATE("Path");
1736             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1737                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1738                     syntaxViolation(c);
1739                 appendToASCIIBuffer('/');
1740                 ++c;
1741                 m_url.m_pathAfterLastSlash = currentPosition(c);
1742                 break;
1743             }
1744             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1745                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1746                     syntaxViolation(c);
1747                     consumeDoubleDotPathSegment(c);
1748                     popPath();
1749                     break;
1750                 }
1751                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1752                     syntaxViolation(c);
1753                     consumeSingleDotPathSegment(c);
1754                     break;
1755                 }
1756             }
1757             if (*c == '?') {
1758                 m_url.m_pathEnd = currentPosition(c);
1759                 appendToASCIIBuffer('?');
1760                 ++c;
1761                 if (isUTF8Encoding)
1762                     state = State::UTF8Query;
1763                 else {
1764                     queryBegin = c;
1765                     state = State::NonUTF8Query;
1766                 }
1767                 break;
1768             }
1769             if (*c == '#') {
1770                 m_url.m_pathEnd = currentPosition(c);
1771                 m_url.m_queryEnd = m_url.m_pathEnd;
1772                 state = State::Fragment;
1773                 break;
1774             }
1775             utf8PercentEncode<isInDefaultEncodeSet>(c);
1776             ++c;
1777             break;
1778         case State::CannotBeABaseURLPath:
1779             LOG_STATE("CannotBeABaseURLPath");
1780             if (*c == '?') {
1781                 m_url.m_pathEnd = currentPosition(c);
1782                 appendToASCIIBuffer('?');
1783                 ++c;
1784                 if (isUTF8Encoding)
1785                     state = State::UTF8Query;
1786                 else {
1787                     queryBegin = c;
1788                     state = State::NonUTF8Query;
1789                 }
1790             } else if (*c == '#') {
1791                 m_url.m_pathEnd = currentPosition(c);
1792                 m_url.m_queryEnd = m_url.m_pathEnd;
1793                 state = State::Fragment;
1794             } else if (*c == '/') {
1795                 appendToASCIIBuffer('/');
1796                 ++c;
1797                 m_url.m_pathAfterLastSlash = currentPosition(c);
1798             } else {
1799                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1800                 ++c;
1801             }
1802             break;
1803         case State::UTF8Query:
1804             LOG_STATE("UTF8Query");
1805             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1806             if (*c == '#') {
1807                 m_url.m_queryEnd = currentPosition(c);
1808                 state = State::Fragment;
1809                 break;
1810             }
1811             if (isUTF8Encoding)
1812                 utf8QueryEncode(c);
1813             else
1814                 appendCodePoint(queryBuffer, *c);
1815             ++c;
1816             break;
1817         case State::NonUTF8Query:
1818             do {
1819                 LOG_STATE("NonUTF8Query");
1820                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1821                 if (*c == '#') {
1822                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1823                     m_url.m_queryEnd = currentPosition(c);
1824                     state = State::Fragment;
1825                     break;
1826                 }
1827                 appendCodePoint(queryBuffer, *c);
1828                 advance(c, queryBegin);
1829             } while (!c.atEnd());
1830             break;
1831         case State::Fragment:
1832             URL_PARSER_LOG("State Fragment");
1833             utf8PercentEncode<isInSimpleEncodeSet>(c);
1834             ++c;
1835             break;
1836         }
1837     }
1838
1839     switch (state) {
1840     case State::SchemeStart:
1841         LOG_FINAL_STATE("SchemeStart");
1842         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1843             m_url = base;
1844             m_url.removeFragmentIdentifier();
1845             return;
1846         }
1847         failure();
1848         return;
1849     case State::Scheme:
1850         LOG_FINAL_STATE("Scheme");
1851         failure();
1852         return;
1853     case State::NoScheme:
1854         LOG_FINAL_STATE("NoScheme");
1855         RELEASE_ASSERT_NOT_REACHED();
1856     case State::SpecialRelativeOrAuthority:
1857         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1858         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1859         break;
1860     case State::PathOrAuthority:
1861         LOG_FINAL_STATE("PathOrAuthority");
1862         ASSERT(m_url.m_userStart);
1863         ASSERT(m_url.m_userStart == currentPosition(c));
1864         ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1865         m_url.m_userStart--;
1866         m_url.m_userEnd = m_url.m_userStart;
1867         m_url.m_passwordEnd = m_url.m_userStart;
1868         m_url.m_hostEnd = m_url.m_userStart;
1869         m_url.m_portEnd = m_url.m_userStart;
1870         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1871         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1872         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1873         break;
1874     case State::Relative:
1875         LOG_FINAL_STATE("Relative");
1876         RELEASE_ASSERT_NOT_REACHED();
1877     case State::RelativeSlash:
1878         LOG_FINAL_STATE("RelativeSlash");
1879         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1880         appendToASCIIBuffer('/');
1881         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1882         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1883         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1884         break;
1885     case State::SpecialAuthoritySlashes:
1886         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1887         m_url.m_userStart = currentPosition(c);
1888         m_url.m_userEnd = m_url.m_userStart;
1889         m_url.m_passwordEnd = m_url.m_userStart;
1890         m_url.m_hostEnd = m_url.m_userStart;
1891         m_url.m_portEnd = m_url.m_userStart;
1892         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1893         m_url.m_pathEnd = m_url.m_userStart;
1894         m_url.m_queryEnd = m_url.m_userStart;
1895         break;
1896     case State::SpecialAuthorityIgnoreSlashes:
1897         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1898         failure();
1899         return;
1900     case State::AuthorityOrHost:
1901         LOG_FINAL_STATE("AuthorityOrHost");
1902         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1903         m_url.m_passwordEnd = m_url.m_userEnd;
1904         if (authorityOrHostBegin.atEnd()) {
1905             m_url.m_userEnd = m_url.m_userStart;
1906             m_url.m_passwordEnd = m_url.m_userStart;
1907             m_url.m_hostEnd = m_url.m_userStart;
1908             m_url.m_portEnd = m_url.m_userStart;
1909             m_url.m_pathEnd = m_url.m_userStart;
1910         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1911             failure();
1912             return;
1913         } else {
1914             if (m_urlIsSpecial) {
1915                 syntaxViolation(c);
1916                 appendToASCIIBuffer('/');
1917                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1918             } else
1919                 m_url.m_pathEnd = m_url.m_portEnd;
1920         }
1921         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1922         m_url.m_queryEnd = m_url.m_pathEnd;
1923         break;
1924     case State::Host:
1925         LOG_FINAL_STATE("Host");
1926         if (!parseHostAndPort(authorityOrHostBegin)) {
1927             failure();
1928             return;
1929         }
1930         if (m_urlIsSpecial) {
1931             syntaxViolation(c);
1932             appendToASCIIBuffer('/');
1933             m_url.m_pathEnd = m_url.m_portEnd + 1;
1934         } else
1935             m_url.m_pathEnd = m_url.m_portEnd;
1936         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1937         m_url.m_queryEnd = m_url.m_pathEnd;
1938         break;
1939     case State::File:
1940         LOG_FINAL_STATE("File");
1941         if (base.isValid() && base.protocolIs("file")) {
1942             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1943             break;
1944         }
1945         syntaxViolation(c);
1946         appendToASCIIBuffer("///", 3);
1947         m_url.m_userStart = currentPosition(c) - 1;
1948         m_url.m_userEnd = m_url.m_userStart;
1949         m_url.m_passwordEnd = m_url.m_userStart;
1950         m_url.m_hostEnd = m_url.m_userStart;
1951         m_url.m_portEnd = m_url.m_userStart;
1952         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1953         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1954         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1955         break;
1956     case State::FileSlash:
1957         LOG_FINAL_STATE("FileSlash");
1958         syntaxViolation(c);
1959         m_url.m_userStart = currentPosition(c) + 1;
1960         appendToASCIIBuffer("//", 2);
1961         m_url.m_userEnd = m_url.m_userStart;
1962         m_url.m_passwordEnd = m_url.m_userStart;
1963         m_url.m_hostEnd = m_url.m_userStart;
1964         m_url.m_portEnd = m_url.m_userStart;
1965         if (copyBaseWindowsDriveLetter(base)) {
1966             appendToASCIIBuffer('/');
1967             m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1968         } else
1969             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1970         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1971         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1972         break;
1973     case State::FileHost:
1974         LOG_FINAL_STATE("FileHost");
1975         if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1976             && isWindowsDriveLetter(authorityOrHostBegin)) {
1977             syntaxViolation(authorityOrHostBegin);
1978             appendToASCIIBuffer('/');
1979             appendWindowsDriveLetter(authorityOrHostBegin);
1980             m_url.m_pathAfterLastSlash = currentPosition(c);
1981             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1982             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1983             break;
1984         }
1985         
1986         if (authorityOrHostBegin == c) {
1987             syntaxViolation(c);
1988             appendToASCIIBuffer('/');
1989             m_url.m_userStart = currentPosition(c) - 1;
1990             m_url.m_userEnd = m_url.m_userStart;
1991             m_url.m_passwordEnd = m_url.m_userStart;
1992             m_url.m_hostEnd = m_url.m_userStart;
1993             m_url.m_portEnd = m_url.m_userStart;
1994             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1995             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1996             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1997             break;
1998         }
1999
2000         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2001             failure();
2002             return;
2003         }
2004
2005         syntaxViolation(c);
2006         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2007             m_asciiBuffer.shrink(m_url.m_passwordEnd);
2008             m_url.m_hostEnd = currentPosition(c);
2009             m_url.m_portEnd = m_url.m_hostEnd;
2010         }
2011         appendToASCIIBuffer('/');
2012         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
2013         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2014         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2015         break;
2016     case State::PathStart:
2017         LOG_FINAL_STATE("PathStart");
2018         RELEASE_ASSERT_NOT_REACHED();
2019     case State::Path:
2020         LOG_FINAL_STATE("Path");
2021         m_url.m_pathEnd = currentPosition(c);
2022         m_url.m_queryEnd = m_url.m_pathEnd;
2023         break;
2024     case State::CannotBeABaseURLPath:
2025         LOG_FINAL_STATE("CannotBeABaseURLPath");
2026         m_url.m_pathEnd = currentPosition(c);
2027         m_url.m_queryEnd = m_url.m_pathEnd;
2028         break;
2029     case State::UTF8Query:
2030         LOG_FINAL_STATE("UTF8Query");
2031         ASSERT(queryBegin == CodePointIterator<CharacterType>());
2032         m_url.m_queryEnd = currentPosition(c);
2033         break;
2034     case State::NonUTF8Query:
2035         LOG_FINAL_STATE("NonUTF8Query");
2036         ASSERT(queryBegin != CodePointIterator<CharacterType>());
2037         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2038         m_url.m_queryEnd = currentPosition(c);
2039         break;
2040     case State::Fragment:
2041         LOG_FINAL_STATE("Fragment");
2042         break;
2043     }
2044
2045     if (LIKELY(!m_didSeeSyntaxViolation)) {
2046         m_url.m_string = m_inputString;
2047         ASSERT(m_asciiBuffer.isEmpty());
2048     } else
2049         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2050     m_url.m_isValid = true;
2051     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2052 }
2053
2054 template<typename CharacterType>
2055 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2056 {
2057     if (UNLIKELY(iterator.atEnd())) {
2058         syntaxViolation(iterator);
2059         m_url.m_userEnd = currentPosition(iterator);
2060         m_url.m_passwordEnd = m_url.m_userEnd;
2061         return;
2062     }
2063     for (; !iterator.atEnd(); advance(iterator)) {
2064         if (*iterator == ':') {
2065             m_url.m_userEnd = currentPosition(iterator);
2066             auto iteratorAtColon = iterator;
2067             ++iterator;
2068             bool tabOrNewlineAfterColon = false;
2069             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2070                 tabOrNewlineAfterColon = true;
2071                 ++iterator;
2072             }
2073             if (UNLIKELY(iterator.atEnd())) {
2074                 syntaxViolation(iteratorAtColon);
2075                 m_url.m_passwordEnd = m_url.m_userEnd;
2076                 if (m_url.m_userEnd > m_url.m_userStart)
2077                     appendToASCIIBuffer('@');
2078                 return;
2079             }
2080             if (tabOrNewlineAfterColon)
2081                 syntaxViolation(iteratorAtColon);
2082             appendToASCIIBuffer(':');
2083             break;
2084         }
2085         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2086     }
2087     for (; !iterator.atEnd(); advance(iterator))
2088         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2089     m_url.m_passwordEnd = currentPosition(iterator);
2090     if (!m_url.m_userEnd)
2091         m_url.m_userEnd = m_url.m_passwordEnd;
2092     appendToASCIIBuffer('@');
2093 }
2094
2095 template<typename UnsignedIntegerType>
2096 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2097 {
2098     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2099     LChar* end = std::end(buf);
2100     LChar* p = end;
2101     do {
2102         *--p = (number % 10) + '0';
2103         number /= 10;
2104     } while (number);
2105     appendToASCIIBuffer(p, end - p);
2106 }
2107
2108 void URLParser::serializeIPv4(IPv4Address address)
2109 {
2110     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2111     appendToASCIIBuffer('.');
2112     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2113     appendToASCIIBuffer('.');
2114     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2115     appendToASCIIBuffer('.');
2116     appendNumberToASCIIBuffer<uint8_t>(address);
2117 }
2118     
// Returns how many consecutive zero pieces |address| contains starting at index |begin|.
// The result is 0 when address[begin] is non-zero or |begin| is not below 8.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2128
2129 static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2130 {
2131     std::optional<size_t> longest;
2132     size_t longestLength = 0;
2133     for (size_t i = 0; i < 8; i++) {
2134         size_t length = zeroSequenceLength(address, i);
2135         if (length) {
2136             if (length > 1 && (!longest || longestLength < length)) {
2137                 longest = i;
2138                 longestLength = length;
2139             }
2140             i += length;
2141         }
2142     }
2143     return longest;
2144 }
2145
2146 void URLParser::serializeIPv6Piece(uint16_t piece)
2147 {
2148     bool printed = false;
2149     if (auto nibble0 = piece >> 12) {
2150         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2151         printed = true;
2152     }
2153     auto nibble1 = piece >> 8 & 0xF;
2154     if (printed || nibble1) {
2155         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2156         printed = true;
2157     }
2158     auto nibble2 = piece >> 4 & 0xF;
2159     if (printed || nibble2)
2160         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2161     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2162 }
2163
2164 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2165 {
2166     appendToASCIIBuffer('[');
2167     auto compressPointer = findLongestZeroSequence(address);
2168     for (size_t piece = 0; piece < 8; piece++) {
2169         if (compressPointer && compressPointer.value() == piece) {
2170             ASSERT(!address[piece]);
2171             if (piece)
2172                 appendToASCIIBuffer(':');
2173             else
2174                 appendToASCIIBuffer("::", 2);
2175             while (piece < 8 && !address[piece])
2176                 piece++;
2177             if (piece == 8)
2178                 break;
2179         }
2180         serializeIPv6Piece(address[piece]);
2181         if (piece < 7)
2182             appendToASCIIBuffer(':');
2183     }
2184     appendToASCIIBuffer(']');
2185 }
2186
// Reasons parseIPv4Piece() can fail to produce a number for one dot-separated piece of a
// candidate IPv4 host. parseIPv4Host() reports a Failure piece as IPv4ParsingError::NotIPv4
// and an Overflow piece as IPv4ParsingError::Failure.
2187 enum class URLParser::IPv4PieceParsingError {
2188     Failure, // The piece starts with '.' or contains a character that is not a digit of its base.
2189     Overflow, // The digits were valid but the value overflowed the 32-bit accumulator.
2190 };
2191
2192 template<typename CharacterType>
2193 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2194 {
2195     enum class State : uint8_t {
2196         UnknownBase,
2197         Decimal,
2198         OctalOrHex,
2199         Octal,
2200         Hex,
2201     };
2202     State state = State::UnknownBase;
2203     Checked<uint32_t, RecordOverflow> value = 0;
2204     if (!iterator.atEnd() && *iterator == '.')
2205         return makeUnexpected(IPv4PieceParsingError::Failure);
2206     while (!iterator.atEnd()) {
2207         if (isTabOrNewline(*iterator)) {
2208             didSeeSyntaxViolation = true;
2209             ++iterator;
2210             continue;
2211         }
2212         if (*iterator == '.') {
2213             ASSERT(!value.hasOverflowed());
2214             return value.unsafeGet();
2215         }
2216         switch (state) {
2217         case State::UnknownBase:
2218             if (UNLIKELY(*iterator == '0')) {
2219                 ++iterator;
2220                 state = State::OctalOrHex;
2221                 break;
2222             }
2223             state = State::Decimal;
2224             break;
2225         case State::OctalOrHex:
2226             didSeeSyntaxViolation = true;
2227             if (*iterator == 'x' || *iterator == 'X') {
2228                 ++iterator;
2229                 state = State::Hex;
2230                 break;
2231             }
2232             state = State::Octal;
2233             break;
2234         case State::Decimal:
2235             if (!isASCIIDigit(*iterator))
2236                 return makeUnexpected(IPv4PieceParsingError::Failure);
2237             value *= 10;
2238             value += *iterator - '0';
2239             if (UNLIKELY(value.hasOverflowed()))
2240                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2241             ++iterator;
2242             break;
2243         case State::Octal:
2244             ASSERT(didSeeSyntaxViolation);
2245             if (*iterator < '0' || *iterator > '7')
2246                 return makeUnexpected(IPv4PieceParsingError::Failure);
2247             value *= 8;
2248             value += *iterator - '0';
2249             if (UNLIKELY(value.hasOverflowed()))
2250                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2251             ++iterator;
2252             break;
2253         case State::Hex:
2254             ASSERT(didSeeSyntaxViolation);
2255             if (!isASCIIHexDigit(*iterator))
2256                 return makeUnexpected(IPv4PieceParsingError::Failure);
2257             value *= 16;
2258             value += toASCIIHexValue(*iterator);
2259             if (UNLIKELY(value.hasOverflowed()))
2260                 return makeUnexpected(IPv4PieceParsingError::Overflow);
2261             ++iterator;
2262             break;
2263         }
2264     }
2265     ASSERT(!value.hasOverflowed());
2266     return value.unsafeGet();
2267 }
2268
2269 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2270 {
2271     RELEASE_ASSERT(exponent <= 4);
2272     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2273     return values[exponent];
2274 }
2275
// Reasons parseIPv4Host() can decline to produce an address.
// NOTE(review): NotIPv4 presumably lets the caller go on to treat the host as something other
// than an IPv4 address, while Failure rejects it - confirm at the call site.
2276 enum class URLParser::IPv4ParsingError {
2277     Failure, // A piece overflowed 32 bits, a non-final piece exceeds 255, or the final piece is too large for the bytes left.
2278     NotIPv4, // Leading or doubled dot, no pieces, too many pieces, or a piece that is not a number.
2279 };
2280
2281 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2282 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2283 {
2284     Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2285     bool didSeeSyntaxViolation = false;
2286     if (!iterator.atEnd() && *iterator == '.')
2287         return makeUnexpected(IPv4ParsingError::NotIPv4);
2288     while (!iterator.atEnd()) {
2289         if (isTabOrNewline(*iterator)) {
2290             didSeeSyntaxViolation = true;
2291             ++iterator;
2292             continue;
2293         }
2294         if (items.size() >= 4)
2295             return makeUnexpected(IPv4ParsingError::NotIPv4);
2296         items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2297         if (!iterator.atEnd() && *iterator == '.') {
2298             ++iterator;
2299             if (iterator.atEnd())
2300                 syntaxViolation(iteratorForSyntaxViolationPosition);
2301             else if (*iterator == '.')
2302                 return makeUnexpected(IPv4ParsingError::NotIPv4);
2303         }
2304     }
2305     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2306         return makeUnexpected(IPv4ParsingError::NotIPv4);
2307     for (const auto& item : items) {
2308         if (!item.hasValue() && item.error() == IPv4PieceParsingError::Failure)
2309             return makeUnexpected(IPv4ParsingError::NotIPv4);
2310     }
2311     for (const auto& item : items) {
2312         if (!item.hasValue() && item.error() == IPv4PieceParsingError::Overflow)
2313             return makeUnexpected(IPv4ParsingError::Failure);
2314     }
2315     if (items.size() > 1) {
2316         for (size_t i = 0; i < items.size() - 1; i++) {
2317             if (items[i].value() > 255)
2318                 return makeUnexpected(IPv4ParsingError::Failure);
2319         }
2320     }
2321     if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2322         return makeUnexpected(IPv4ParsingError::Failure);
2323
2324     if (didSeeSyntaxViolation)
2325         syntaxViolation(iteratorForSyntaxViolationPosition);
2326     for (const auto& item : items) {
2327         if (item.value() > 255)
2328             syntaxViolation(iteratorForSyntaxViolationPosition);
2329     }
2330
2331     if (UNLIKELY(items.size() != 4))
2332         syntaxViolation(iteratorForSyntaxViolationPosition);
2333
2334     IPv4Address ipv4 = items.takeLast().value();
2335     for (size_t counter = 0; counter < items.size(); ++counter)
2336         ipv4 += items[counter].value() * pow256(3 - counter);
2337     return ipv4;
2338 }
2339
2340 template<typename CharacterType>
2341 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2342 {
2343     if (iterator.atEnd())
2344         return std::nullopt;
2345     uint32_t piece = 0;
2346     bool leadingZeros = false;
2347     size_t digitCount = 0;
2348     while (!iterator.atEnd()) {
2349         if (!isASCIIDigit(*iterator))
2350             return std::nullopt;
2351         ++digitCount;
2352         if (!piece && *iterator == '0') {
2353             if (leadingZeros)
2354                 return std::nullopt;
2355             leadingZeros = true;
2356         }
2357         if (!piece && *iterator == '0')
2358             leadingZeros = true;
2359         piece = piece * 10 + *iterator - '0';
2360         if (piece > 255)
2361             return std::nullopt;
2362         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2363         if (iterator.atEnd())
2364             break;
2365         if (*iterator == '.')
2366             break;
2367     }
2368     if (piece && leadingZeros)
2369         return std::nullopt;
2370     return piece;
2371 }
2372
2373 template<typename CharacterType>
2374 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2375 {
2376     IPv4Address address = 0;
2377     for (size_t i = 0; i < 4; ++i) {
2378         if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2379             address = (address << 8) + piece.value();
2380         else
2381             return std::nullopt;
2382         if (i < 3) {
2383             if (iterator.atEnd())
2384                 return std::nullopt;
2385             if (*iterator != '.')
2386                 return std::nullopt;
2387             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2388         } else if (!iterator.atEnd())
2389             return std::nullopt;
2390     }
2391     ASSERT(iterator.atEnd());
2392     return address;
2393 }
2394
// Parses a bracketed IPv6 literal into eight 16-bit pieces.
//
// `c` must point at the opening '[' (asserted below). The loop consumes input until
// `c` is at its end, so the caller is expected to pass a range that stops before the
// closing ']'.
//
// Returns the parsed address, or std::nullopt when the text is not a valid IPv6
// literal. Spellings that are accepted but not canonical (uppercase hex digits,
// redundant leading zeros, an embedded dotted IPv4 tail, or a "::" whose position
// differs from the one derived from findLongestZeroSequence) are reported through
// syntaxViolation(hostBegin).
2395 template<typename CharacterType>
2396 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2397 {
2398     ASSERT(*c == '[');
2399     const auto hostBegin = c;
2400     advance(c, hostBegin); // Step past the opening '['.
2401     if (c.atEnd())
2402         return std::nullopt;
2403
2404     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2405     size_t piecePointer = 0; // Index of the next 16-bit piece to fill.
2406     std::optional<size_t> compressPointer; // Piece index recorded when "::" is seen.
2407
2408     if (*c == ':') { // A leading ':' is only valid as the start of "::".
2409         advance(c, hostBegin);
2410         if (c.atEnd())
2411             return std::nullopt;
2412         if (*c != ':')
2413             return std::nullopt;
2414         advance(c, hostBegin);
2415         ++piecePointer;
2416         compressPointer = piecePointer;
2417     }
2418     
2419     while (!c.atEnd()) {
2420         if (piecePointer == 8)
2421             return std::nullopt;
2422         if (*c == ':') {
2423             if (compressPointer) // Only one "::" is allowed.
2424                 return std::nullopt;
2425             advance(c, hostBegin);
2426             ++piecePointer;
2427             compressPointer = piecePointer;
2428             continue;
2429         }
2430         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) { // Positions where a dotted IPv4 tail is tried.
2431             if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2432                 if (compressPointer && piecePointer == 5)
2433                     return std::nullopt;
2434                 syntaxViolation(hostBegin);
2435                 address[piecePointer++] = ipv4Address.value() >> 16; // High 16 bits of the IPv4 value.
2436                 address[piecePointer++] = ipv4Address.value() & 0xFFFF; // Low 16 bits of the IPv4 value.
2437                 c = { }; // Replace c with a default-constructed iterator; the checks below then rely on it being at end.
2438                 break;
2439             }
2440         }
2441         uint16_t value = 0;
2442         size_t length = 0;
2443         bool leadingZeros = false;
2444         for (; length < 4; length++) { // Read up to four hex digits for this piece.
2445             if (c.atEnd())
2446                 break;
2447             if (!isASCIIHexDigit(*c))
2448                 break;
2449             if (isASCIIUpper(*c))
2450                 syntaxViolation(hostBegin);
2451             if (*c == '0' && !length)
2452                 leadingZeros = true;
2453             value = value * 0x10 + toASCIIHexValue(*c);
2454             advance(c, hostBegin);
2455         }
2456         
2457         if (UNLIKELY((value && leadingZeros) || (!value && length > 1))) // e.g. "01" or "00": valid but not canonical.
2458             syntaxViolation(hostBegin);
2459
2460         address[piecePointer++] = value;
2461         if (c.atEnd())
2462             break;
2463         if (piecePointer == 8 || *c != ':') // More input after the eighth piece, or a non-':' separator.
2464             return std::nullopt;
2465         advance(c, hostBegin);
2466     }
2467     
2468     if (!c.atEnd())
2469         return std::nullopt;
2470     
2471     if (compressPointer) {
2472         // Move the pieces parsed after "::" to the end of the address, leaving the
2473         // zero-initialized slots in the middle.
2472         size_t swaps = piecePointer - compressPointer.value();
2473         piecePointer = 7;
2474         while (swaps)
2475             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2476     } else if (piecePointer != 8) // Without "::" all eight pieces must be present.
2477         return std::nullopt;
2478
2479     std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2480     if (possibleCompressPointer)
2481         possibleCompressPointer.value()++; // Shift by one before comparing with compressPointer.
2482     if (UNLIKELY(compressPointer != possibleCompressPointer))
2483         syntaxViolation(hostBegin);
2484     
2485     return address;
2486 }
2487
2488 template<typename CharacterType>
2489 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2490 {
2491     Vector<LChar, defaultInlineBufferSize> output;
2492     output.reserveInitialCapacity(length);
2493     
2494     for (size_t i = 0; i < length; ++i) {
2495         uint8_t byte = input[i];
2496         if (byte != '%')
2497             output.uncheckedAppend(byte);
2498         else if (length > 2 && i < length - 2) {
2499             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2500                 syntaxViolation(iteratorForSyntaxViolationPosition);
2501                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2502                 i += 2;
2503             } else
2504                 output.uncheckedAppend(byte);
2505         } else
2506             output.uncheckedAppend(byte);
2507     }
2508     return output;
2509 }
2510     
2511 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2512 {
2513     Vector<LChar, defaultInlineBufferSize> output;
2514     output.reserveInitialCapacity(length);
2515     
2516     for (size_t i = 0; i < length; ++i) {
2517         uint8_t byte = input[i];
2518         if (byte != '%')
2519             output.uncheckedAppend(byte);
2520         else if (length > 2 && i < length - 2) {
2521             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2522                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2523                 i += 2;
2524             } else
2525                 output.uncheckedAppend(byte);
2526         } else
2527             output.uncheckedAppend(byte);
2528     }
2529     return output;
2530 }
2531
2532 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2533 {
2534     ASSERT(!string.isNull());
2535     if (string.is8Bit())
2536         return charactersAreAllASCII(string.characters8(), string.length());
2537     return charactersAreAllASCII(string.characters16(), string.length());
2538 }
2539
2540 template<typename CharacterType>
2541 std::optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2542 {
2543     Vector<LChar, defaultInlineBufferSize> ascii;
2544     if (containsOnlyASCII(domain)) {
2545         size_t length = domain.length();
2546         if (domain.is8Bit()) {
2547             const LChar* characters = domain.characters8();
2548             ascii.reserveInitialCapacity(length);
2549             for (size_t i = 0; i < length; ++i) {
2550                 if (UNLIKELY(isASCIIUpper(characters[i])))
2551                     syntaxViolation(iteratorForSyntaxViolationPosition);
2552                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2553             }
2554         } else {
2555             const UChar* characters = domain.characters16();
2556             ascii.reserveInitialCapacity(length);
2557             for (size_t i = 0; i < length; ++i) {
2558                 if (UNLIKELY(isASCIIUpper(characters[i])))
2559                     syntaxViolation(iteratorForSyntaxViolationPosition);
2560                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2561             }
2562         }
2563         return ascii;
2564     }
2565     
2566     UChar hostnameBuffer[defaultInlineBufferSize];
2567     UErrorCode error = U_ZERO_ERROR;
2568     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2569     int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error);
2570     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2571
2572     if (U_SUCCESS(error) && !processingDetails.errors) {
2573         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2574             ASSERT(isASCII(hostnameBuffer[i]));
2575             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2576         }
2577         ascii.append(hostnameBuffer, numCharactersConverted);
2578         if (domain != StringView(ascii.data(), ascii.size()))
2579             syntaxViolation(iteratorForSyntaxViolationPosition);
2580         return ascii;
2581     }
2582
2583     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2584     return std::nullopt;
2585 }
2586
2587 bool URLParser::hasForbiddenHostCodePoint(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2588 {
2589     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2590         if (isForbiddenHostCodePoint(asciiDomain[i]))
2591             return true;
2592     }
2593     return false;
2594 }
2595
2596 template<typename CharacterType>
2597 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2598 {
2599     ASSERT(*iterator == ':');
2600     auto colonIterator = iterator;
2601     advance(iterator, colonIterator);
2602     uint32_t port = 0;
2603     if (UNLIKELY(iterator.atEnd())) {
2604         m_url.m_portEnd = currentPosition(colonIterator);
2605         syntaxViolation(colonIterator);
2606         return true;
2607     }
2608     size_t digitCount = 0;
2609     bool leadingZeros = false;
2610     for (; !iterator.atEnd(); ++iterator) {
2611         if (UNLIKELY(isTabOrNewline(*iterator))) {
2612             syntaxViolation(colonIterator);
2613             continue;
2614         }
2615         if (isASCIIDigit(*iterator)) {
2616             if (*iterator == '0' && !digitCount)
2617                 leadingZeros = true;
2618             ++digitCount;
2619             port = port * 10 + *iterator - '0';
2620             if (port > std::numeric_limits<uint16_t>::max())
2621                 return false;
2622         } else
2623             return false;
2624     }
2625
2626     if (port && leadingZeros)
2627         syntaxViolation(colonIterator);
2628     
2629     if (!port && digitCount > 1)
2630         syntaxViolation(colonIterator);
2631
2632     ASSERT(port == static_cast<uint16_t>(port));
2633     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2634         syntaxViolation(colonIterator);
2635     else {
2636         appendToASCIIBuffer(':');
2637         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2638         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2639     }
2640
2641     m_url.m_portEnd = currentPosition(iterator);
2642     return true;
2643 }
2644
2645 template<typename CharacterType>
2646 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2647 {
2648     if (iterator.atEnd())
2649         return false;
2650     if (*iterator == ':')
2651         return false;
2652     if (*iterator == '[') {
2653         auto ipv6End = iterator;
2654         while (!ipv6End.atEnd() && *ipv6End != ']')
2655             ++ipv6End;
2656         if (ipv6End.atEnd())
2657             return false;
2658         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2659             serializeIPv6(address.value());
2660             if (!ipv6End.atEnd()) {
2661                 advance(ipv6End);
2662                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2663                     m_url.m_hostEnd = currentPosition(ipv6End);
2664                     return parsePort(ipv6End);
2665                 }
2666                 m_url.m_hostEnd = currentPosition(ipv6End);
2667                 m_url.m_portEnd = m_url.m_hostEnd;
2668                 return true;
2669             }
2670             m_url.m_hostEnd = currentPosition(ipv6End);
2671             return true;
2672         }
2673         return false;
2674     }
2675
2676     if (!m_urlIsSpecial) {
2677         for (; !iterator.atEnd(); ++iterator) {
2678             if (UNLIKELY(isTabOrNewline(*iterator))) {
2679                 syntaxViolation(iterator);
2680                 continue;
2681             }
2682             if (*iterator == ':')
2683                 break;
2684             if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2685                 return false;
2686             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2687         }
2688         m_url.m_hostEnd = currentPosition(iterator);
2689         if (iterator.atEnd()) {
2690             m_url.m_portEnd = currentPosition(iterator);
2691             return true;
2692         }
2693         return parsePort(iterator);
2694     }
2695     
2696     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2697         auto hostIterator = iterator;
2698         for (; !iterator.atEnd(); ++iterator) {
2699             if (isTabOrNewline(*iterator))
2700                 continue;
2701             if (*iterator == ':')
2702                 break;
2703             if (isForbiddenHostCodePoint(*iterator))
2704                 return false;
2705         }
2706         auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2707         if (address) {
2708             serializeIPv4(address.value());
2709             m_url.m_hostEnd = currentPosition(iterator);
2710             if (iterator.atEnd()) {
2711                 m_url.m_portEnd = currentPosition(iterator);
2712                 return true;
2713             }
2714             return parsePort(iterator);
2715         }
2716         if (address.error() == IPv4ParsingError::Failure)
2717             return false;
2718         for (; hostIterator != iterator; ++hostIterator) {
2719             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2720                 syntaxViolation(hostIterator);
2721                 continue;
2722             }
2723             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2724                 syntaxViolation(hostIterator);
2725             appendToASCIIBuffer(toASCIILower(*hostIterator));
2726         }
2727         m_url.m_hostEnd = currentPosition(iterator);
2728         if (!hostIterator.atEnd())
2729             return parsePort(hostIterator);
2730         m_url.m_portEnd = currentPosition(iterator);
2731         return true;
2732     }
2733     
2734     const auto hostBegin = iterator;
2735     
2736     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2737     for (; !iterator.atEnd(); ++iterator) {
2738         if (UNLIKELY(isTabOrNewline(*iterator))) {
2739             syntaxViolation(hostBegin);
2740             continue;
2741         }
2742         if (*iterator == ':')
2743             break;
2744         if (UNLIKELY(!isASCII(*iterator)))
2745             syntaxViolation(hostBegin);
2746
2747         uint8_t buffer[U8_MAX_LENGTH];
2748         int32_t offset = 0;
2749         UBool error = false;
2750         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2751         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2752         // FIXME: Check error.
2753         utf8Encoded.append(buffer, offset);
2754     }
2755     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2756     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2757     if (domain.isNull())
2758         return false;
2759     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2760         syntaxViolation(hostBegin);
2761     auto asciiDomain = domainToASCII(domain, hostBegin);
2762     if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2763         return false;
2764     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2765     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2766
2767     auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2768     if (address) {
2769         serializeIPv4(address.value());
2770         m_url.m_hostEnd = currentPosition(iterator);
2771         if (iterator.atEnd()) {
2772             m_url.m_portEnd = currentPosition(iterator);
2773             return true;
2774         }
2775         return parsePort(iterator);
2776     }
2777     if (address.error() == IPv4ParsingError::Failure)
2778         return false;
2779
2780     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2781     m_url.m_hostEnd = currentPosition(iterator);
2782     if (!iterator.atEnd())
2783         return parsePort(iterator);
2784     m_url.m_portEnd = currentPosition(iterator);
2785     return true;
2786 }
2787
2788 std::optional<String> URLParser::formURLDecode(StringView input)
2789 {
2790     auto utf8 = input.utf8(StrictConversion);
2791     if (utf8.isNull())
2792         return std::nullopt;
2793     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2794     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2795 }
2796
2797 // https://url.spec.whatwg.org/#concept-urlencoded-parser
2798 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2799 {
2800     URLEncodedForm output;
2801     for (StringView bytes : input.split('&')) {
2802         auto equalIndex = bytes.find('=');
2803         if (equalIndex == notFound) {
2804             auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2805             if (name)
2806                 output.append({ name.value(), emptyString() });
2807         } else {
2808             auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2809             auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2810             if (name && value)
2811                 output.append({ name.value(), value.value() });
2812         }
2813     }
2814     return output;
2815 }
2816
2817 static void serializeURLEncodedForm(const String& input, StringVector<LChar>& output)
2818 {
2819     auto utf8 = input.utf8(StrictConversion);
2820     const char* data = utf8.data();
2821     for (size_t i = 0; i < utf8.length(); ++i) {
2822         const char byte = data[i];
2823         if (byte == 0x20)
2824             output.append(0x2B);
2825         else if (byte == 0x2A
2826             || byte == 0x2D
2827             || byte == 0x2E
2828             || (byte >= 0x30 && byte <= 0x39)
2829             || (byte >= 0x41 && byte <= 0x5A)
2830             || byte == 0x5F
2831             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2832             output.append(byte);
2833         else
2834             percentEncodeByte(byte, output);
2835     }
2836 }
2837     
2838 String URLParser::serialize(const URLEncodedForm& tuples)
2839 {
2840     if (tuples.isEmpty())
2841         return { };
2842
2843     StringVector<LChar> output;
2844     for (auto& tuple : tuples) {
2845         if (!output.isEmpty())
2846             output.append('&');
2847         serializeURLEncodedForm(tuple.key, output);
2848         output.append('=');
2849         serializeURLEncodedForm(tuple.value, output);
2850     }
2851     return String::adopt(WTFMove(output));
2852 }
2853
2854 const UIDNA& URLParser::internationalDomainNameTranscoder()
2855 {
2856     static UIDNA* encoder;
2857     static std::once_flag onceFlag;
2858     std::call_once(onceFlag, [] {
2859         UErrorCode error = U_ZERO_ERROR;
2860         // Warning: Please contact a WebKitGTK+ developer if changing these flags.
2861         // They should be synced with ephy_uri_decode() in ephy-uri-helpers.c.
2862         encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2863         RELEASE_ASSERT(U_SUCCESS(error));
2864         RELEASE_ASSERT(encoder);
2865     });
2866     return *encoder;
2867 }
2868
2869 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2870 {
2871     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2872     // but once we get rid of URL::parse its value should be tested.
2873     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %s",
2874         a.m_isValid,
2875         a.m_protocolIsInHTTPFamily,
2876         a.m_schemeEnd,
2877         a.m_userStart,
2878         a.m_userEnd,
2879         a.m_passwordEnd,
2880         a.m_hostEnd,
2881         a.m_portEnd,
2882         a.m_pathAfterLastSlash,
2883         a.m_pathEnd,
2884         a.m_queryEnd,
2885         a.m_string.utf8().data(),
2886         b.m_isValid,
2887         b.m_protocolIsInHTTPFamily,
2888         b.m_schemeEnd,
2889         b.m_userStart,
2890         b.m_userEnd,
2891         b.m_passwordEnd,
2892         b.m_hostEnd,
2893         b.m_portEnd,
2894         b.m_pathAfterLastSlash,
2895         b.m_pathEnd,
2896         b.m_queryEnd,
2897         b.m_string.utf8().data());
2898
2899     return a.m_string == b.m_string
2900         && a.m_isValid == b.m_isValid
2901         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2902         && a.m_schemeEnd == b.m_schemeEnd
2903         && a.m_userStart == b.m_userStart
2904         && a.m_userEnd == b.m_userEnd
2905         && a.m_passwordEnd == b.m_passwordEnd
2906         && a.m_hostEnd == b.m_hostEnd
2907         && a.m_portEnd == b.m_portEnd
2908         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2909         && a.m_pathEnd == b.m_pathEnd
2910         && a.m_queryEnd == b.m_queryEnd;
2911 }
2912
2913 bool URLParser::internalValuesConsistent(const URL& url)
2914 {
2915     return url.m_schemeEnd <= url.m_userStart
2916         && url.m_userStart <= url.m_userEnd
2917         && url.m_userEnd <= url.m_passwordEnd
2918         && url.m_passwordEnd <= url.m_hostEnd
2919         && url.m_hostEnd <= url.m_portEnd
2920         && url.m_portEnd <= url.m_pathAfterLastSlash
2921         && url.m_pathAfterLastSlash <= url.m_pathEnd
2922         && url.m_pathEnd <= url.m_queryEnd
2923         && url.m_queryEnd <= url.m_string.length();
2924 }
2925
2926 } // namespace WebCore