7151e9ec4da1cc7d2b00ee8fb68908fe47a27849
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <unicode/uidna.h>
33 #include <unicode/utypes.h>
34
35 namespace WebCore {
36
// Set to 1 to route URL_PARSER_LOG through LOG on the URLParser channel.
#define URL_PARSER_DEBUGGING 0

// URL_PARSER_LOG expands to nothing unless URL_PARSER_DEBUGGING is non-zero.
#if !URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...)
#else
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#endif
44     
45 template<typename CharacterType>
46 class CodePointIterator {
47 public:
48     ALWAYS_INLINE CodePointIterator() { }
49     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50         : m_begin(begin)
51         , m_end(end)
52     {
53     }
54     
55     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56         : CodePointIterator(begin.m_begin, end.m_begin)
57     {
58         ASSERT(end.m_begin >= begin.m_begin);
59     }
60     
61     ALWAYS_INLINE UChar32 operator*() const;
62     ALWAYS_INLINE CodePointIterator& operator++();
63
64     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65     {
66         return m_begin == other.m_begin
67             && m_end == other.m_end;
68     }
69     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70     
71     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
72     {
73         m_begin = other.m_begin;
74         m_end = other.m_end;
75         return *this;
76     }
77
78     ALWAYS_INLINE bool atEnd() const
79     {
80         ASSERT(m_begin <= m_end);
81         return m_begin >= m_end;
82     }
83     
84     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
85     {
86         ASSERT(m_begin >= reference);
87         return m_begin - reference;
88     }
89
90     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
91     {
92         return codeUnitsSince(other.m_begin);
93     }
94     
95 private:
96     const CharacterType* m_begin { nullptr };
97     const CharacterType* m_end { nullptr };
98 };
99
100 template<>
101 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
102 {
103     ASSERT(!atEnd());
104     return *m_begin;
105 }
106
107 template<>
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
109 {
110     ASSERT(!atEnd());
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     ASSERT(!atEnd());
128     unsigned i = 0;
129     size_t length = m_end - m_begin;
130     U16_FWD_1(m_begin, i, length);
131     m_begin += i;
132     return *this;
133 }
134     
135 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
136 {
137     if (U_IS_BMP(codePoint)) {
138         destination.append(static_cast<UChar>(codePoint));
139         return;
140     }
141     destination.reserveCapacity(destination.size() + 2);
142     destination.uncheckedAppend(U16_LEAD(codePoint));
143     destination.uncheckedAppend(U16_TRAIL(codePoint));
144 }
145
// Bit flags stored per byte in characterClassTable; each flag is a distinct bit
// so several can be combined in one table entry.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
154
155 static const uint8_t characterClassTable[256] = {
156     UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
157     UserInfo | Default | QueryPercent, // 0x1
158     UserInfo | Default | QueryPercent, // 0x2
159     UserInfo | Default | QueryPercent, // 0x3
160     UserInfo | Default | QueryPercent, // 0x4
161     UserInfo | Default | QueryPercent, // 0x5
162     UserInfo | Default | QueryPercent, // 0x6
163     UserInfo | Default | QueryPercent, // 0x7
164     UserInfo | Default | QueryPercent, // 0x8
165     UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
166     UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
167     UserInfo | Default | QueryPercent, // 0xB
168     UserInfo | Default | QueryPercent, // 0xC
169     UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
170     UserInfo | Default | QueryPercent, // 0xE
171     UserInfo | Default | QueryPercent, // 0xF
172     UserInfo | Default | QueryPercent, // 0x10
173     UserInfo | Default | QueryPercent, // 0x11
174     UserInfo | Default | QueryPercent, // 0x12
175     UserInfo | Default | QueryPercent, // 0x13
176     UserInfo | Default | QueryPercent, // 0x14
177     UserInfo | Default | QueryPercent, // 0x15
178     UserInfo | Default | QueryPercent, // 0x16
179     UserInfo | Default | QueryPercent, // 0x17
180     UserInfo | Default | QueryPercent, // 0x18
181     UserInfo | Default | QueryPercent, // 0x19
182     UserInfo | Default | QueryPercent, // 0x1A
183     UserInfo | Default | QueryPercent, // 0x1B
184     UserInfo | Default | QueryPercent, // 0x1C
185     UserInfo | Default | QueryPercent, // 0x1D
186     UserInfo | Default | QueryPercent, // 0x1E
187     UserInfo | Default | QueryPercent, // 0x1F
188     UserInfo | Default | InvalidDomain | QueryPercent, // ' '
189     0, // '!'
190     UserInfo | Default | QueryPercent, // '"'
191     UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
192     0, // '$'
193     InvalidDomain, // '%'
194     0, // '&'
195     0, // '''
196     0, // '('
197     0, // ')'
198     0, // '*'
199     ValidScheme, // '+'
200     0, // ','
201     ValidScheme, // '-'
202     ValidScheme, // '.'
203     UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
204     ValidScheme, // '0'
205     ValidScheme, // '1'
206     ValidScheme, // '2'
207     ValidScheme, // '3'
208     ValidScheme, // '4'
209     ValidScheme, // '5'
210     ValidScheme, // '6'
211     ValidScheme, // '7'
212     ValidScheme, // '8'
213     ValidScheme, // '9'
214     UserInfo | InvalidDomain, // ':'
215     UserInfo, // ';'
216     UserInfo | Default | QueryPercent, // '<'
217     UserInfo, // '='
218     UserInfo | Default | QueryPercent, // '>'
219     UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
220     UserInfo | InvalidDomain, // '@'
221     ValidScheme, // 'A'
222     ValidScheme, // 'B'
223     ValidScheme, // 'C'
224     ValidScheme, // 'D'
225     ValidScheme, // 'E'
226     ValidScheme, // 'F'
227     ValidScheme, // 'G'
228     ValidScheme, // 'H'
229     ValidScheme, // 'I'
230     ValidScheme, // 'J'
231     ValidScheme, // 'K'
232     ValidScheme, // 'L'
233     ValidScheme, // 'M'
234     ValidScheme, // 'N'
235     ValidScheme, // 'O'
236     ValidScheme, // 'P'
237     ValidScheme, // 'Q'
238     ValidScheme, // 'R'
239     ValidScheme, // 'S'
240     ValidScheme, // 'T'
241     ValidScheme, // 'U'
242     ValidScheme, // 'V'
243     ValidScheme, // 'W'
244     ValidScheme, // 'X'
245     ValidScheme, // 'Y'
246     ValidScheme, // 'Z'
247     UserInfo | InvalidDomain, // '['
248     UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
249     UserInfo | InvalidDomain, // ']'
250     UserInfo, // '^'
251     0, // '_'
252     UserInfo | Default, // '`'
253     ValidScheme, // 'a'
254     ValidScheme, // 'b'
255     ValidScheme, // 'c'
256     ValidScheme, // 'd'
257     ValidScheme, // 'e'
258     ValidScheme, // 'f'
259     ValidScheme, // 'g'
260     ValidScheme, // 'h'
261     ValidScheme, // 'i'
262     ValidScheme, // 'j'
263     ValidScheme, // 'k'
264     ValidScheme, // 'l'
265     ValidScheme, // 'm'
266     ValidScheme, // 'n'
267     ValidScheme, // 'o'
268     ValidScheme, // 'p'
269     ValidScheme, // 'q'
270     ValidScheme, // 'r'
271     ValidScheme, // 's'
272     ValidScheme, // 't'
273     ValidScheme, // 'u'
274     ValidScheme, // 'v'
275     ValidScheme, // 'w'
276     ValidScheme, // 'x'
277     ValidScheme, // 'y'
278     ValidScheme, // 'z'
279     UserInfo | Default, // '{'
280     UserInfo, // '|'
281     UserInfo | Default, // '}'
282     0, // '~'
283     QueryPercent, // 0x7F
284     QueryPercent, // 0x80
285     QueryPercent, // 0x81
286     QueryPercent, // 0x82
287     QueryPercent, // 0x83
288     QueryPercent, // 0x84
289     QueryPercent, // 0x85
290     QueryPercent, // 0x86
291     QueryPercent, // 0x87
292     QueryPercent, // 0x88
293     QueryPercent, // 0x89
294     QueryPercent, // 0x8A
295     QueryPercent, // 0x8B
296     QueryPercent, // 0x8C
297     QueryPercent, // 0x8D
298     QueryPercent, // 0x8E
299     QueryPercent, // 0x8F
300     QueryPercent, // 0x90
301     QueryPercent, // 0x91
302     QueryPercent, // 0x92
303     QueryPercent, // 0x93
304     QueryPercent, // 0x94
305     QueryPercent, // 0x95
306     QueryPercent, // 0x96
307     QueryPercent, // 0x97
308     QueryPercent, // 0x98
309     QueryPercent, // 0x99
310     QueryPercent, // 0x9A
311     QueryPercent, // 0x9B
312     QueryPercent, // 0x9C
313     QueryPercent, // 0x9D
314     QueryPercent, // 0x9E
315     QueryPercent, // 0x9F
316     QueryPercent, // 0xA0
317     QueryPercent, // 0xA1
318     QueryPercent, // 0xA2
319     QueryPercent, // 0xA3
320     QueryPercent, // 0xA4
321     QueryPercent, // 0xA5
322     QueryPercent, // 0xA6
323     QueryPercent, // 0xA7
324     QueryPercent, // 0xA8
325     QueryPercent, // 0xA9
326     QueryPercent, // 0xAA
327     QueryPercent, // 0xAB
328     QueryPercent, // 0xAC
329     QueryPercent, // 0xAD
330     QueryPercent, // 0xAE
331     QueryPercent, // 0xAF
332     QueryPercent, // 0xB0
333     QueryPercent, // 0xB1
334     QueryPercent, // 0xB2
335     QueryPercent, // 0xB3
336     QueryPercent, // 0xB4
337     QueryPercent, // 0xB5
338     QueryPercent, // 0xB6
339     QueryPercent, // 0xB7
340     QueryPercent, // 0xB8
341     QueryPercent, // 0xB9
342     QueryPercent, // 0xBA
343     QueryPercent, // 0xBB
344     QueryPercent, // 0xBC
345     QueryPercent, // 0xBD
346     QueryPercent, // 0xBE
347     QueryPercent, // 0xBF
348     QueryPercent, // 0xC0
349     QueryPercent, // 0xC1
350     QueryPercent, // 0xC2
351     QueryPercent, // 0xC3
352     QueryPercent, // 0xC4
353     QueryPercent, // 0xC5
354     QueryPercent, // 0xC6
355     QueryPercent, // 0xC7
356     QueryPercent, // 0xC8
357     QueryPercent, // 0xC9
358     QueryPercent, // 0xCA
359     QueryPercent, // 0xCB
360     QueryPercent, // 0xCC
361     QueryPercent, // 0xCD
362     QueryPercent, // 0xCE
363     QueryPercent, // 0xCF
364     QueryPercent, // 0xD0
365     QueryPercent, // 0xD1
366     QueryPercent, // 0xD2
367     QueryPercent, // 0xD3
368     QueryPercent, // 0xD4
369     QueryPercent, // 0xD5
370     QueryPercent, // 0xD6
371     QueryPercent, // 0xD7
372     QueryPercent, // 0xD8
373     QueryPercent, // 0xD9
374     QueryPercent, // 0xDA
375     QueryPercent, // 0xDB
376     QueryPercent, // 0xDC
377     QueryPercent, // 0xDD
378     QueryPercent, // 0xDE
379     QueryPercent, // 0xDF
380     QueryPercent, // 0xE0
381     QueryPercent, // 0xE1
382     QueryPercent, // 0xE2
383     QueryPercent, // 0xE3
384     QueryPercent, // 0xE4
385     QueryPercent, // 0xE5
386     QueryPercent, // 0xE6
387     QueryPercent, // 0xE7
388     QueryPercent, // 0xE8
389     QueryPercent, // 0xE9
390     QueryPercent, // 0xEA
391     QueryPercent, // 0xEB
392     QueryPercent, // 0xEC
393     QueryPercent, // 0xED
394     QueryPercent, // 0xEE
395     QueryPercent, // 0xEF
396     QueryPercent, // 0xF0
397     QueryPercent, // 0xF1
398     QueryPercent, // 0xF2
399     QueryPercent, // 0xF3
400     QueryPercent, // 0xF4
401     QueryPercent, // 0xF5
402     QueryPercent, // 0xF6
403     QueryPercent, // 0xF7
404     QueryPercent, // 0xF8
405     QueryPercent, // 0xF9
406     QueryPercent, // 0xFA
407     QueryPercent, // 0xFB
408     QueryPercent, // 0xFC
409     QueryPercent, // 0xFD
410     QueryPercent, // 0xFE
411     QueryPercent, // 0xFF
412 };
413
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
423 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
424 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
425
426 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
427 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
428 {
429     ++iterator;
430     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
431         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
432             syntaxViolation(iteratorForSyntaxViolationPosition);
433         ++iterator;
434     }
435 }
436
437 template<typename CharacterType>
438 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
439 {
440     if (iterator.atEnd())
441         return false;
442     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
443     if (iterator.atEnd())
444         return false;
445     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446     return iterator.atEnd();
447 }
448
449 template<typename CharacterType>
450 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
451 {
452     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
453         return false;
454     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
455     if (iterator.atEnd())
456         return false;
457     if (*iterator == ':')
458         return true;
459     if (UNLIKELY(*iterator == '|'))
460         return true;
461     return false;
462 }
463
464 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
465 {
466     ASSERT(isASCII(codePoint));
467     if (UNLIKELY(m_didSeeSyntaxViolation))
468         m_asciiBuffer.append(codePoint);
469 }
470
471 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
472 {
473     if (UNLIKELY(m_didSeeSyntaxViolation))
474         m_asciiBuffer.append(characters, length);
475 }
476
477 template<typename CharacterType>
478 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
479 {
480     ASSERT(isWindowsDriveLetter(iterator));
481     appendToASCIIBuffer(*iterator);
482     advance(iterator);
483     ASSERT(!iterator.atEnd());
484     ASSERT(*iterator == ':' || *iterator == '|');
485     if (*iterator == '|')
486         syntaxViolation(iterator);
487     appendToASCIIBuffer(':');
488     advance(iterator);
489 }
490
491 template<typename CharacterType>
492 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
493 {
494     if (!isWindowsDriveLetter(iterator))
495         return true;
496     if (iterator.atEnd())
497         return false;
498     advance(iterator);
499     if (iterator.atEnd())
500         return true;
501     advance(iterator);
502     if (iterator.atEnd())
503         return true;
504     return !isSlashQuestionOrHash(*iterator);
505 }
506
507 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
508 {
509     buffer.append('%');
510     buffer.append(upperNibbleToASCIIHexDigit(byte));
511     buffer.append(lowerNibbleToASCIIHexDigit(byte));
512 }
513
514 void URLParser::percentEncodeByte(uint8_t byte)
515 {
516     ASSERT(m_didSeeSyntaxViolation);
517     appendToASCIIBuffer('%');
518     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
519     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
520 }
521
// Percent-encoded bytes EF BF BD, emitted in place of code points that fail U_IS_UNICODE_CHAR.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string above, excluding its terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(char) - 1;
524
525 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
526 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
527 {
528     ASSERT(!iterator.atEnd());
529     UChar32 codePoint = *iterator;
530     if (LIKELY(isASCII(codePoint))) {
531         if (UNLIKELY(isInCodeSet(codePoint))) {
532             syntaxViolation(iterator);
533             percentEncodeByte(codePoint);
534         } else
535             appendToASCIIBuffer(codePoint);
536         return;
537     }
538     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
539     syntaxViolation(iterator);
540     
541     if (!U_IS_UNICODE_CHAR(codePoint)) {
542         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
543         return;
544     }
545     
546     uint8_t buffer[U8_MAX_LENGTH];
547     int32_t offset = 0;
548     U8_APPEND_UNSAFE(buffer, offset, codePoint);
549     for (int32_t i = 0; i < offset; ++i)
550         percentEncodeByte(buffer[i]);
551 }
552
553 template<typename CharacterType>
554 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
555 {
556     ASSERT(!iterator.atEnd());
557     UChar32 codePoint = *iterator;
558     if (LIKELY(isASCII(codePoint))) {
559         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
560             syntaxViolation(iterator);
561             percentEncodeByte(codePoint);
562         } else
563             appendToASCIIBuffer(codePoint);
564         return;
565     }
566     
567     syntaxViolation(iterator);
568     
569     if (!U_IS_UNICODE_CHAR(codePoint)) {
570         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
571         return;
572     }
573
574     uint8_t buffer[U8_MAX_LENGTH];
575     int32_t offset = 0;
576     U8_APPEND_UNSAFE(buffer, offset, codePoint);
577     for (int32_t i = 0; i < offset; ++i) {
578         auto byte = buffer[i];
579         if (shouldPercentEncodeQueryByte(byte))
580             percentEncodeByte(byte);
581         else
582             appendToASCIIBuffer(byte);
583     }
584 }
585
586 template<typename CharacterType>
587 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
588 {
589     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
590     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
591     const char* data = encoded.data();
592     size_t length = encoded.length();
593     
594     if (!length == !iterator.atEnd()) {
595         syntaxViolation(iterator);
596         return;
597     }
598     
599     size_t i = 0;
600     for (; i < length; ++i) {
601         ASSERT(!iterator.atEnd());
602         uint8_t byte = data[i];
603         if (UNLIKELY(byte != *iterator)) {
604             syntaxViolation(iterator);
605             break;
606         }
607         if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
608             syntaxViolation(iterator);
609             break;
610         }
611         appendToASCIIBuffer(byte);
612         ++iterator;
613     }
614     while (!iterator.atEnd() && isTabOrNewline(*iterator))
615         ++iterator;
616     ASSERT((i == length) == iterator.atEnd());
617     for (; i < length; ++i) {
618         ASSERT(m_didSeeSyntaxViolation);
619         uint8_t byte = data[i];
620         if (shouldPercentEncodeQueryByte(byte))
621             percentEncodeByte(byte);
622         else
623             appendToASCIIBuffer(byte);
624     }
625 }
626
627 Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
628 {
629     static const uint16_t ftpPort = 21;
630     static const uint16_t gopherPort = 70;
631     static const uint16_t httpPort = 80;
632     static const uint16_t httpsPort = 443;
633     static const uint16_t wsPort = 80;
634     static const uint16_t wssPort = 443;
635     
636     auto length = scheme.length();
637     if (!length)
638         return Nullopt;
639     switch (scheme[0]) {
640     case 'w':
641         switch (length) {
642         case 2:
643             if (scheme[1] == 's')
644                 return wsPort;
645             return Nullopt;
646         case 3:
647             if (scheme[1] == 's'
648                 && scheme[2] == 's')
649                 return wssPort;
650             return Nullopt;
651         default:
652             return false;
653         }
654     case 'h':
655         switch (length) {
656         case 4:
657             if (scheme[1] == 't'
658                 && scheme[2] == 't'
659                 && scheme[3] == 'p')
660                 return httpPort;
661             return Nullopt;
662         case 5:
663             if (scheme[1] == 't'
664                 && scheme[2] == 't'
665                 && scheme[3] == 'p'
666                 && scheme[4] == 's')
667                 return httpsPort;
668             return Nullopt;
669         default:
670             return Nullopt;
671         }
672     case 'g':
673         if (length == 6
674             && scheme[1] == 'o'
675             && scheme[2] == 'p'
676             && scheme[3] == 'h'
677             && scheme[4] == 'e'
678             && scheme[5] == 'r')
679             return gopherPort;
680         return Nullopt;
681     case 'f':
682         if (length == 3
683             && scheme[1] == 't'
684             && scheme[2] == 'p')
685             return ftpPort;
686         return Nullopt;
687     default:
688         return Nullopt;
689     }
690 }
691
// Result of classifying a scheme string with scheme(): one enumerator per
// recognized scheme name, and NonSpecial for everything else.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    Gopher, // "gopher"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial // any other scheme, including the empty string
};
702
703 ALWAYS_INLINE static Scheme scheme(StringView scheme)
704 {
705     auto length = scheme.length();
706     if (!length)
707         return Scheme::NonSpecial;
708     switch (scheme[0]) {
709     case 'f':
710         switch (length) {
711         case 3:
712             if (scheme[1] == 't'
713                 && scheme[2] == 'p')
714                 return Scheme::FTP;
715             return Scheme::NonSpecial;
716         case 4:
717             if (scheme[1] == 'i'
718                 && scheme[2] == 'l'
719                 && scheme[3] == 'e')
720                 return Scheme::File;
721             return Scheme::NonSpecial;
722         default:
723             return Scheme::NonSpecial;
724         }
725     case 'g':
726         if (length == 6
727             && scheme[1] == 'o'
728             && scheme[2] == 'p'
729             && scheme[3] == 'h'
730             && scheme[4] == 'e'
731             && scheme[5] == 'r')
732             return Scheme::Gopher;
733         return Scheme::NonSpecial;
734     case 'h':
735         switch (length) {
736         case 4:
737             if (scheme[1] == 't'
738                 && scheme[2] == 't'
739                 && scheme[3] == 'p')
740                 return Scheme::HTTP;
741             return Scheme::NonSpecial;
742         case 5:
743             if (scheme[1] == 't'
744                 && scheme[2] == 't'
745                 && scheme[3] == 'p'
746                 && scheme[4] == 's')
747                 return Scheme::HTTPS;
748             return Scheme::NonSpecial;
749         default:
750             return Scheme::NonSpecial;
751         }
752     case 'w':
753         switch (length) {
754         case 2:
755             if (scheme[1] == 's')
756                 return Scheme::WS;
757             return Scheme::NonSpecial;
758         case 3:
759             if (scheme[1] == 's'
760                 && scheme[2] == 's')
761                 return Scheme::WSS;
762             return Scheme::NonSpecial;
763         default:
764             return Scheme::NonSpecial;
765         }
766     default:
767         return Scheme::NonSpecial;
768     }
769 }
770
771 enum class URLParser::URLPart {
772     SchemeEnd,
773     UserStart,
774     UserEnd,
775     PasswordEnd,
776     HostEnd,
777     PortEnd,
778     PathAfterLastSlash,
779     PathEnd,
780     QueryEnd,
781     FragmentEnd,
782 };
783
784 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
785 {
786     switch (part) {
787     case URLPart::FragmentEnd:
788         return url.m_fragmentEnd;
789     case URLPart::QueryEnd:
790         return url.m_queryEnd;
791     case URLPart::PathEnd:
792         return url.m_pathEnd;
793     case URLPart::PathAfterLastSlash:
794         return url.m_pathAfterLastSlash;
795     case URLPart::PortEnd:
796         return url.m_portEnd;
797     case URLPart::HostEnd:
798         return url.m_hostEnd;
799     case URLPart::PasswordEnd:
800         return url.m_passwordEnd;
801     case URLPart::UserEnd:
802         return url.m_userEnd;
803     case URLPart::UserStart:
804         return url.m_userStart;
805     case URLPart::SchemeEnd:
806         return url.m_schemeEnd;
807     }
808     ASSERT_NOT_REACHED();
809     return 0;
810 }
811
812 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
813 {
814     RELEASE_ASSERT(length <= string.length());
815     if (string.isNull())
816         return;
817     ASSERT(m_asciiBuffer.isEmpty());
818     if (string.is8Bit()) {
819         appendToASCIIBuffer(string.characters8(), length);
820     } else {
821         const UChar* characters = string.characters16();
822         for (size_t i = 0; i < length; ++i) {
823             UChar c = characters[i];
824             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
825             appendToASCIIBuffer(c);
826         }
827     }
828 }
829
830 template<typename CharacterType>
831 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
832 {
833     syntaxViolation(iterator);
834
835     m_asciiBuffer.clear();
836     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
837     switch (part) {
838     case URLPart::FragmentEnd:
839         RELEASE_ASSERT_NOT_REACHED();
840     case URLPart::QueryEnd:
841         m_url.m_queryEnd = base.m_queryEnd;
842         FALLTHROUGH;
843     case URLPart::PathEnd:
844         m_url.m_pathEnd = base.m_pathEnd;
845         FALLTHROUGH;
846     case URLPart::PathAfterLastSlash:
847         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
848         FALLTHROUGH;
849     case URLPart::PortEnd:
850         m_url.m_portEnd = base.m_portEnd;
851         FALLTHROUGH;
852     case URLPart::HostEnd:
853         m_url.m_hostEnd = base.m_hostEnd;
854         FALLTHROUGH;
855     case URLPart::PasswordEnd:
856         m_url.m_passwordEnd = base.m_passwordEnd;
857         FALLTHROUGH;
858     case URLPart::UserEnd:
859         m_url.m_userEnd = base.m_userEnd;
860         FALLTHROUGH;
861     case URLPart::UserStart:
862         m_url.m_userStart = base.m_userStart;
863         FALLTHROUGH;
864     case URLPart::SchemeEnd:
865         m_url.m_isValid = base.m_isValid;
866         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
867         m_url.m_schemeEnd = base.m_schemeEnd;
868     }
869     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
870     case Scheme::WS:
871     case Scheme::WSS:
872         isUTF8Encoding = true;
873         FALLTHROUGH;
874     case Scheme::File:
875     case Scheme::FTP:
876     case Scheme::Gopher:
877     case Scheme::HTTP:
878     case Scheme::HTTPS:
879         m_urlIsSpecial = true;
880         return;
881     case Scheme::NonSpecial:
882         m_urlIsSpecial = false;
883         isUTF8Encoding = true;
884         return;
885     }
886     ASSERT_NOT_REACHED();
887 }
888
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e").
// The second digit is compared after lowercasing, so "%2E" matches too.
static const char dotASCIICode[2] = {
    '2',
    'e',
};
890
891 template<typename CharacterType>
892 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
893 {
894     if (c.atEnd())
895         return false;
896     if (*c == '.') {
897         advance<CharacterType, ReportSyntaxViolation::No>(c);
898         return c.atEnd() || isSlashQuestionOrHash(*c);
899     }
900     if (*c != '%')
901         return false;
902     advance<CharacterType, ReportSyntaxViolation::No>(c);
903     if (c.atEnd() || *c != dotASCIICode[0])
904         return false;
905     advance<CharacterType, ReportSyntaxViolation::No>(c);
906     if (c.atEnd())
907         return false;
908     if (toASCIILower(*c) == dotASCIICode[1]) {
909         advance<CharacterType, ReportSyntaxViolation::No>(c);
910         return c.atEnd() || isSlashQuestionOrHash(*c);
911     }
912     return false;
913 }
914
915 template<typename CharacterType>
916 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
917 {
918     if (c.atEnd())
919         return false;
920     if (*c == '.') {
921         advance<CharacterType, ReportSyntaxViolation::No>(c);
922         return isSingleDotPathSegment(c);
923     }
924     if (*c != '%')
925         return false;
926     advance<CharacterType, ReportSyntaxViolation::No>(c);
927     if (c.atEnd() || *c != dotASCIICode[0])
928         return false;
929     advance<CharacterType, ReportSyntaxViolation::No>(c);
930     if (c.atEnd())
931         return false;
932     if (toASCIILower(*c) == dotASCIICode[1]) {
933         advance<CharacterType, ReportSyntaxViolation::No>(c);
934         return isSingleDotPathSegment(c);
935     }
936     return false;
937 }
938
939 template<typename CharacterType>
940 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
941 {
942     ASSERT(isSingleDotPathSegment(c));
943     if (*c == '.') {
944         advance(c);
945         if (!c.atEnd()) {
946             if (*c == '/' || *c == '\\')
947                 advance(c);
948             else
949                 ASSERT(*c == '?' || *c == '#');
950         }
951     } else {
952         ASSERT(*c == '%');
953         advance(c);
954         ASSERT(*c == dotASCIICode[0]);
955         advance(c);
956         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
957         advance(c);
958         if (!c.atEnd()) {
959             if (*c == '/' || *c == '\\')
960                 advance(c);
961             else
962                 ASSERT(*c == '?' || *c == '#');
963         }
964     }
965 }
966
967 template<typename CharacterType>
968 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
969 {
970     ASSERT(isDoubleDotPathSegment(c));
971     if (*c == '.')
972         advance(c);
973     else {
974         ASSERT(*c == '%');
975         advance(c);
976         ASSERT(*c == dotASCIICode[0]);
977         advance(c);
978         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
979         advance(c);
980     }
981     consumeSingleDotPathSegment(c);
982 }
983
984 void URLParser::popPath()
985 {
986     ASSERT(m_didSeeSyntaxViolation);
987     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
988         m_url.m_pathAfterLastSlash--;
989         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
990             m_url.m_pathAfterLastSlash--;
991         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
992             m_url.m_pathAfterLastSlash--;
993         m_url.m_pathAfterLastSlash++;
994     }
995     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
996 }
997
998 template<typename CharacterType>
999 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1000 {
1001     if (m_didSeeSyntaxViolation)
1002         return;
1003     m_didSeeSyntaxViolation = true;
1004     
1005     ASSERT(m_asciiBuffer.isEmpty());
1006     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1007     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1008     m_asciiBuffer.reserveCapacity(m_inputString.length());
1009     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1010         ASSERT(isASCII(m_inputString[i]));
1011         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1012     }
1013 }
1014
1015 void URLParser::failure()
1016 {
1017     m_url.invalidate();
1018     m_url.m_string = m_inputString;
1019 }
1020
1021 template<typename CharacterType>
1022 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1023 {
1024     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1025         return false;
1026     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1027     return true;
1028 }
1029
1030 template<typename CharacterType>
1031 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1032 {
1033     if (!checkLocalhostCodePoint(iterator, 'l'))
1034         return false;
1035     if (!checkLocalhostCodePoint(iterator, 'o'))
1036         return false;
1037     if (!checkLocalhostCodePoint(iterator, 'c'))
1038         return false;
1039     if (!checkLocalhostCodePoint(iterator, 'a'))
1040         return false;
1041     if (!checkLocalhostCodePoint(iterator, 'l'))
1042         return false;
1043     if (!checkLocalhostCodePoint(iterator, 'h'))
1044         return false;
1045     if (!checkLocalhostCodePoint(iterator, 'o'))
1046         return false;
1047     if (!checkLocalhostCodePoint(iterator, 's'))
1048         return false;
1049     if (!checkLocalhostCodePoint(iterator, 't'))
1050         return false;
1051     return iterator.atEnd();
1052 }
1053
1054 bool URLParser::isLocalhost(StringView view)
1055 {
1056     if (view.is8Bit())
1057         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1058     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1059 }
1060
1061 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1062 {
1063     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1064         ASSERT(start + length <= m_asciiBuffer.size());
1065         return StringView(m_asciiBuffer.data() + start, length);
1066     }
1067     ASSERT(start + length <= m_inputString.length());
1068     return StringView(m_inputString).substring(start, length);
1069 }
1070
1071 template<typename CharacterType>
1072 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1073 {
1074     if (UNLIKELY(m_didSeeSyntaxViolation))
1075         return m_asciiBuffer.size();
1076     
1077     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1078 }
1079
1080 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1081     : m_inputString(input)
1082 {
1083     if (input.isNull()) {
1084         if (base.isValid() && !base.m_cannotBeABaseURL) {
1085             m_url = base;
1086             m_url.removeFragmentIdentifier();
1087         }
1088         return;
1089     }
1090
1091     if (input.is8Bit()) {
1092         m_inputBegin = input.characters8();
1093         parse(input.characters8(), input.length(), base, encoding);
1094     } else {
1095         m_inputBegin = input.characters16();
1096         parse(input.characters16(), input.length(), base, encoding);
1097     }
1098
1099     ASSERT(!m_url.m_isValid
1100         || m_didSeeSyntaxViolation == (m_url.string() != input)
1101         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1102             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1103     ASSERT(internalValuesConsistent(m_url));
1104 #if !ASSERT_DISABLED
1105     if (!m_didSeeSyntaxViolation) {
1106         // Force a syntax violation at the beginning to make sure we get the same result.
1107         URLParser parser(makeString(" ", input), base, encoding);
1108         URL parsed = parser.result();
1109         if (parsed.isValid())
1110             ASSERT(allValuesEqual(parser.result(), m_url));
1111     }
1112 #endif
1113 }
1114
1115 template<typename CharacterType>
1116 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1117 {
1118     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1119     m_url = { };
1120     ASSERT(m_asciiBuffer.isEmpty());
1121     
1122     bool isUTF8Encoding = encoding == UTF8Encoding();
1123     Vector<UChar> queryBuffer;
1124
1125     unsigned endIndex = length;
1126     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1127         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1128         endIndex--;
1129     }
1130     CodePointIterator<CharacterType> c(input, input + endIndex);
1131     CodePointIterator<CharacterType> authorityOrHostBegin;
1132     CodePointIterator<CharacterType> queryBegin;
1133     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1134         syntaxViolation(c);
1135         ++c;
1136     }
1137     auto beginAfterControlAndSpace = c;
1138
1139     enum class State : uint8_t {
1140         SchemeStart,
1141         Scheme,
1142         NoScheme,
1143         SpecialRelativeOrAuthority,
1144         PathOrAuthority,
1145         Relative,
1146         RelativeSlash,
1147         SpecialAuthoritySlashes,
1148         SpecialAuthorityIgnoreSlashes,
1149         AuthorityOrHost,
1150         Host,
1151         File,
1152         FileSlash,
1153         FileHost,
1154         PathStart,
1155         Path,
1156         CannotBeABaseURLPath,
1157         UTF8Query,
1158         NonUTF8Query,
1159         Fragment,
1160     };
1161
1162 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1163 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1164
1165     State state = State::SchemeStart;
1166     while (!c.atEnd()) {
1167         if (UNLIKELY(isTabOrNewline(*c))) {
1168             syntaxViolation(c);
1169             ++c;
1170             continue;
1171         }
1172
1173         switch (state) {
1174         case State::SchemeStart:
1175             LOG_STATE("SchemeStart");
1176             if (isASCIIAlpha(*c)) {
1177                 if (UNLIKELY(isASCIIUpper(*c)))
1178                     syntaxViolation(c);
1179                 appendToASCIIBuffer(toASCIILower(*c));
1180                 advance(c);
1181                 if (c.atEnd()) {
1182                     m_asciiBuffer.clear();
1183                     state = State::NoScheme;
1184                     c = beginAfterControlAndSpace;
1185                 }
1186                 state = State::Scheme;
1187             } else
1188                 state = State::NoScheme;
1189             break;
1190         case State::Scheme:
1191             LOG_STATE("Scheme");
1192             if (isValidSchemeCharacter(*c)) {
1193                 if (UNLIKELY(isASCIIUpper(*c)))
1194                     syntaxViolation(c);
1195                 appendToASCIIBuffer(toASCIILower(*c));
1196             } else if (*c == ':') {
1197                 m_url.m_schemeEnd = currentPosition(c);
1198                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1199                 appendToASCIIBuffer(':');
1200                 switch (scheme(urlScheme)) {
1201                 case Scheme::File:
1202                     m_urlIsSpecial = true;
1203                     state = State::File;
1204                     ++c;
1205                     break;
1206                 case Scheme::WS:
1207                 case Scheme::WSS:
1208                     isUTF8Encoding = true;
1209                     m_urlIsSpecial = true;
1210                     if (base.protocolIs(urlScheme))
1211                         state = State::SpecialRelativeOrAuthority;
1212                     else
1213                         state = State::SpecialAuthoritySlashes;
1214                     ++c;
1215                     break;
1216                 case Scheme::HTTP:
1217                 case Scheme::HTTPS:
1218                     m_url.m_protocolIsInHTTPFamily = true;
1219                     FALLTHROUGH;
1220                 case Scheme::FTP:
1221                 case Scheme::Gopher:
1222                     m_urlIsSpecial = true;
1223                     if (base.protocolIs(urlScheme))
1224                         state = State::SpecialRelativeOrAuthority;
1225                     else
1226                         state = State::SpecialAuthoritySlashes;
1227                     ++c;
1228                     break;
1229                 case Scheme::NonSpecial:
1230                     isUTF8Encoding = true;
1231                     auto maybeSlash = c;
1232                     advance(maybeSlash);
1233                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1234                         appendToASCIIBuffer('/');
1235                         c = maybeSlash;
1236                         state = State::PathOrAuthority;
1237                         ASSERT(*c == '/');
1238                         ++c;
1239                         m_url.m_userStart = currentPosition(c);
1240                     } else {
1241                         ++c;
1242                         m_url.m_userStart = currentPosition(c);
1243                         m_url.m_userEnd = m_url.m_userStart;
1244                         m_url.m_passwordEnd = m_url.m_userStart;
1245                         m_url.m_hostEnd = m_url.m_userStart;
1246                         m_url.m_portEnd = m_url.m_userStart;
1247                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1248                         m_url.m_cannotBeABaseURL = true;
1249                         state = State::CannotBeABaseURLPath;
1250                     }
1251                     break;
1252                 }
1253                 break;
1254             } else {
1255                 m_asciiBuffer.clear();
1256                 state = State::NoScheme;
1257                 c = beginAfterControlAndSpace;
1258                 break;
1259             }
1260             advance(c);
1261             if (c.atEnd()) {
1262                 m_asciiBuffer.clear();
1263                 state = State::NoScheme;
1264                 c = beginAfterControlAndSpace;
1265             }
1266             break;
1267         case State::NoScheme:
1268             LOG_STATE("NoScheme");
1269             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1270                 failure();
1271                 return;
1272             }
1273             if (base.m_cannotBeABaseURL && *c == '#') {
1274                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1275                 state = State::Fragment;
1276                 appendToASCIIBuffer('#');
1277                 ++c;
1278                 break;
1279             }
1280             if (!base.protocolIs("file")) {
1281                 state = State::Relative;
1282                 break;
1283             }
1284             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1285             appendToASCIIBuffer(':');
1286             state = State::File;
1287             break;
1288         case State::SpecialRelativeOrAuthority:
1289             LOG_STATE("SpecialRelativeOrAuthority");
1290             if (*c == '/') {
1291                 appendToASCIIBuffer('/');
1292                 advance(c);
1293                 if (c.atEnd()) {
1294                     failure();
1295                     return;
1296                 }
1297                 if (*c == '/') {
1298                     appendToASCIIBuffer('/');
1299                     state = State::SpecialAuthorityIgnoreSlashes;
1300                     ++c;
1301                 } else
1302                     state = State::RelativeSlash;
1303             } else
1304                 state = State::Relative;
1305             break;
1306         case State::PathOrAuthority:
1307             LOG_STATE("PathOrAuthority");
1308             if (*c == '/') {
1309                 appendToASCIIBuffer('/');
1310                 state = State::AuthorityOrHost;
1311                 advance(c);
1312                 m_url.m_userStart = currentPosition(c);
1313                 authorityOrHostBegin = c;
1314             } else {
1315                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1316                 m_url.m_userStart = currentPosition(c) - 1;
1317                 m_url.m_userEnd = m_url.m_userStart;
1318                 m_url.m_passwordEnd = m_url.m_userStart;
1319                 m_url.m_hostEnd = m_url.m_userStart;
1320                 m_url.m_portEnd = m_url.m_userStart;
1321                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1322                 state = State::Path;
1323             }
1324             break;
1325         case State::Relative:
1326             LOG_STATE("Relative");
1327             switch (*c) {
1328             case '/':
1329             case '\\':
1330                 state = State::RelativeSlash;
1331                 ++c;
1332                 break;
1333             case '?':
1334                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1335                 appendToASCIIBuffer('?');
1336                 ++c;
1337                 if (isUTF8Encoding)
1338                     state = State::UTF8Query;
1339                 else {
1340                     queryBegin = c;
1341                     state = State::NonUTF8Query;
1342                 }
1343                 break;
1344             case '#':
1345                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1346                 appendToASCIIBuffer('#');
1347                 state = State::Fragment;
1348                 ++c;
1349                 break;
1350             default:
1351                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1352                 state = State::Path;
1353                 break;
1354             }
1355             break;
1356         case State::RelativeSlash:
1357             LOG_STATE("RelativeSlash");
1358             if (*c == '/' || *c == '\\') {
1359                 ++c;
1360                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1361                 appendToASCIIBuffer("://", 3);
1362                 state = State::SpecialAuthorityIgnoreSlashes;
1363             } else {
1364                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1365                 appendToASCIIBuffer('/');
1366                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1367                 state = State::Path;
1368             }
1369             break;
1370         case State::SpecialAuthoritySlashes:
1371             LOG_STATE("SpecialAuthoritySlashes");
1372             if (LIKELY(*c == '/' || *c == '\\')) {
1373                 if (UNLIKELY(*c == '\\'))
1374                     syntaxViolation(c);
1375                 appendToASCIIBuffer('/');
1376                 advance(c);
1377                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1378                     if (UNLIKELY(*c == '\\'))
1379                         syntaxViolation(c);
1380                     ++c;
1381                     appendToASCIIBuffer('/');
1382                 } else {
1383                     syntaxViolation(c);
1384                     appendToASCIIBuffer('/');
1385                 }
1386             } else {
1387                 syntaxViolation(c);
1388                 appendToASCIIBuffer("//", 2);
1389             }
1390             state = State::SpecialAuthorityIgnoreSlashes;
1391             break;
1392         case State::SpecialAuthorityIgnoreSlashes:
1393             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1394             if (*c == '/' || *c == '\\') {
1395                 syntaxViolation(c);
1396                 ++c;
1397             } else {
1398                 m_url.m_userStart = currentPosition(c);
1399                 state = State::AuthorityOrHost;
1400                 authorityOrHostBegin = c;
1401             }
1402             break;
1403         case State::AuthorityOrHost:
1404             do {
1405                 LOG_STATE("AuthorityOrHost");
1406                 if (*c == '@') {
1407                     auto lastAt = c;
1408                     auto findLastAt = c;
1409                     while (!findLastAt.atEnd()) {
1410                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1411                         if (*findLastAt == '@')
1412                             lastAt = findLastAt;
1413                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1414                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1415                             break;
1416                         ++findLastAt;
1417                     }
1418                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1419                     c = lastAt;
1420                     advance(c);
1421                     authorityOrHostBegin = c;
1422                     state = State::Host;
1423                     m_hostHasPercentOrNonASCII = false;
1424                     break;
1425                 }
1426                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1427                 if (isSlash || *c == '?' || *c == '#') {
1428                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1429                     if (iterator.atEnd()) {
1430                         m_url.m_userEnd = currentPosition(c);
1431                         m_url.m_passwordEnd = m_url.m_userEnd;
1432                         m_url.m_hostEnd = m_url.m_userEnd;
1433                         m_url.m_portEnd = m_url.m_userEnd;
1434                         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1435                     } else {
1436                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1437                         m_url.m_passwordEnd = m_url.m_userEnd;
1438                         if (!parseHostAndPort(iterator)) {
1439                             failure();
1440                             return;
1441                         }
1442                         if (UNLIKELY(!isSlash)) {
1443                             syntaxViolation(c);
1444                             appendToASCIIBuffer('/');
1445                             m_url.m_pathAfterLastSlash = currentPosition(c);
1446                         }
1447                     }
1448                     state = State::Path;
1449                     break;
1450                 }
1451                 if (isPercentOrNonASCII(*c))
1452                     m_hostHasPercentOrNonASCII = true;
1453                 ++c;
1454             } while (!c.atEnd());
1455             break;
1456         case State::Host:
1457             do {
1458                 LOG_STATE("Host");
1459                 if (*c == '/' || *c == '?' || *c == '#') {
1460                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1461                         failure();
1462                         return;
1463                     }
1464                     if (*c == '?' || *c == '#') {
1465                         syntaxViolation(c);
1466                         appendToASCIIBuffer('/');
1467                         m_url.m_pathAfterLastSlash = currentPosition(c);
1468                     }
1469                     state = State::Path;
1470                     break;
1471                 }
1472                 if (isPercentOrNonASCII(*c))
1473                     m_hostHasPercentOrNonASCII = true;
1474                 ++c;
1475             } while (!c.atEnd());
1476             break;
1477         case State::File:
1478             LOG_STATE("File");
1479             switch (*c) {
1480             case '\\':
1481                 syntaxViolation(c);
1482                 FALLTHROUGH;
1483             case '/':
1484                 appendToASCIIBuffer('/');
1485                 state = State::FileSlash;
1486                 ++c;
1487                 break;
1488             case '?':
1489                 syntaxViolation(c);
1490                 if (base.isValid() && base.protocolIs("file")) {
1491                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1492                     appendToASCIIBuffer('?');
1493                     ++c;
1494                 } else {
1495                     appendToASCIIBuffer("///?", 4);
1496                     ++c;
1497                     m_url.m_userStart = currentPosition(c) - 2;
1498                     m_url.m_userEnd = m_url.m_userStart;
1499                     m_url.m_passwordEnd = m_url.m_userStart;
1500                     m_url.m_hostEnd = m_url.m_userStart;
1501                     m_url.m_portEnd = m_url.m_userStart;
1502                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1503                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1504                 }
1505                 if (isUTF8Encoding)
1506                     state = State::UTF8Query;
1507                 else {
1508                     queryBegin = c;
1509                     state = State::NonUTF8Query;
1510                 }
1511                 break;
1512             case '#':
1513                 syntaxViolation(c);
1514                 if (base.isValid() && base.protocolIs("file")) {
1515                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1516                     appendToASCIIBuffer('#');
1517                 } else {
1518                     appendToASCIIBuffer("///#", 4);
1519                     m_url.m_userStart = currentPosition(c) - 2;
1520                     m_url.m_userEnd = m_url.m_userStart;
1521                     m_url.m_passwordEnd = m_url.m_userStart;
1522                     m_url.m_hostEnd = m_url.m_userStart;
1523                     m_url.m_portEnd = m_url.m_userStart;
1524                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1525                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1526                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1527                 }
1528                 state = State::Fragment;
1529                 ++c;
1530                 break;
1531             default:
1532                 syntaxViolation(c);
1533                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1534                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1535                 else {
1536                     appendToASCIIBuffer("///", 3);
1537                     m_url.m_userStart = currentPosition(c) - 1;
1538                     m_url.m_userEnd = m_url.m_userStart;
1539                     m_url.m_passwordEnd = m_url.m_userStart;
1540                     m_url.m_hostEnd = m_url.m_userStart;
1541                     m_url.m_portEnd = m_url.m_userStart;
1542                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1543                     if (isWindowsDriveLetter(c))
1544                         appendWindowsDriveLetter(c);
1545                 }
1546                 state = State::Path;
1547                 break;
1548             }
1549             break;
1550         case State::FileSlash:
1551             LOG_STATE("FileSlash");
1552             if (LIKELY(*c == '/' || *c == '\\')) {
1553                 if (UNLIKELY(*c == '\\'))
1554                     syntaxViolation(c);
1555                 appendToASCIIBuffer('/');
1556                 advance(c);
1557                 m_url.m_userStart = currentPosition(c);
1558                 m_url.m_userEnd = m_url.m_userStart;
1559                 m_url.m_passwordEnd = m_url.m_userStart;
1560                 m_url.m_hostEnd = m_url.m_userStart;
1561                 m_url.m_portEnd = m_url.m_userStart;
1562                 authorityOrHostBegin = c;
1563                 state = State::FileHost;
1564                 break;
1565             }
1566             if (base.isValid() && base.protocolIs("file")) {
1567                 // FIXME: This String copy is unnecessary.
1568                 String basePath = base.path();
1569                 if (basePath.length() >= 2) {
1570                     bool windowsQuirk = basePath.is8Bit()
1571                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1572                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1573                     if (windowsQuirk) {
1574                         appendToASCIIBuffer(basePath[0]);
1575                         appendToASCIIBuffer(basePath[1]);
1576                     }
1577                 }
1578             }
1579             syntaxViolation(c);
1580             appendToASCIIBuffer("//", 2);
1581             m_url.m_userStart = currentPosition(c) - 1;
1582             m_url.m_userEnd = m_url.m_userStart;
1583             m_url.m_passwordEnd = m_url.m_userStart;
1584             m_url.m_hostEnd = m_url.m_userStart;
1585             m_url.m_portEnd = m_url.m_userStart;
1586             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1587             if (isWindowsDriveLetter(c))
1588                 appendWindowsDriveLetter(c);
1589             state = State::Path;
1590             break;
1591         case State::FileHost:
1592             do {
1593                 LOG_STATE("FileHost");
1594                 if (isSlashQuestionOrHash(*c)) {
1595                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1596                         && isWindowsDriveLetter(authorityOrHostBegin);
1597                     if (windowsQuirk) {
1598                         syntaxViolation(authorityOrHostBegin);
1599                         appendToASCIIBuffer('/');
1600                         appendWindowsDriveLetter(authorityOrHostBegin);
1601                     }
1602                     if (windowsQuirk || authorityOrHostBegin == c) {
1603                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1604                         if (UNLIKELY(*c == '?')) {
1605                             syntaxViolation(c);
1606                             appendToASCIIBuffer("/?", 2);
1607                             ++c;
1608                             if (isUTF8Encoding)
1609                                 state = State::UTF8Query;
1610                             else {
1611                                 queryBegin = c;
1612                                 state = State::NonUTF8Query;
1613                             }
1614                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1615                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1616                             break;
1617                         }
1618                         if (UNLIKELY(*c == '#')) {
1619                             syntaxViolation(c);
1620                             appendToASCIIBuffer("/#", 2);
1621                             ++c;
1622                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1623                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1624                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1625                             state = State::Fragment;
1626                             break;
1627                         }
1628                         state = State::Path;
1629                         break;
1630                     }
1631                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1632                         failure();
1633                         return;
1634                     }
1635                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1636                         syntaxViolation(c);
1637                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1638                         m_url.m_hostEnd = currentPosition(c);
1639                         m_url.m_portEnd = m_url.m_hostEnd;
1640                     }
1641                     
1642                     state = State::PathStart;
1643                     break;
1644                 }
1645                 if (isPercentOrNonASCII(*c))
1646                     m_hostHasPercentOrNonASCII = true;
1647                 ++c;
1648             } while (!c.atEnd());
1649             break;
1650         case State::PathStart:
1651             LOG_STATE("PathStart");
1652             if (*c != '/' && *c != '\\')
1653                 ++c;
1654             state = State::Path;
1655             break;
1656         case State::Path:
1657             LOG_STATE("Path");
1658             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1659                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1660                     syntaxViolation(c);
1661                 appendToASCIIBuffer('/');
1662                 ++c;
1663                 m_url.m_pathAfterLastSlash = currentPosition(c);
1664                 break;
1665             }
1666             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1667                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1668                     syntaxViolation(c);
1669                     consumeDoubleDotPathSegment(c);
1670                     popPath();
1671                     break;
1672                 }
1673                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1674                     syntaxViolation(c);
1675                     consumeSingleDotPathSegment(c);
1676                     break;
1677                 }
1678             }
1679             if (*c == '?') {
1680                 m_url.m_pathEnd = currentPosition(c);
1681                 appendToASCIIBuffer('?');
1682                 ++c;
1683                 if (isUTF8Encoding)
1684                     state = State::UTF8Query;
1685                 else {
1686                     queryBegin = c;
1687                     state = State::NonUTF8Query;
1688                 }
1689                 break;
1690             }
1691             if (*c == '#') {
1692                 m_url.m_pathEnd = currentPosition(c);
1693                 m_url.m_queryEnd = m_url.m_pathEnd;
1694                 state = State::Fragment;
1695                 break;
1696             }
1697             utf8PercentEncode<isInDefaultEncodeSet>(c);
1698             ++c;
1699             break;
1700         case State::CannotBeABaseURLPath:
1701             LOG_STATE("CannotBeABaseURLPath");
1702             if (*c == '?') {
1703                 m_url.m_pathEnd = currentPosition(c);
1704                 appendToASCIIBuffer('?');
1705                 ++c;
1706                 if (isUTF8Encoding)
1707                     state = State::UTF8Query;
1708                 else {
1709                     queryBegin = c;
1710                     state = State::NonUTF8Query;
1711                 }
1712             } else if (*c == '#') {
1713                 m_url.m_pathEnd = currentPosition(c);
1714                 m_url.m_queryEnd = m_url.m_pathEnd;
1715                 state = State::Fragment;
1716             } else if (*c == '/') {
1717                 appendToASCIIBuffer('/');
1718                 ++c;
1719                 m_url.m_pathAfterLastSlash = currentPosition(c);
1720             } else {
1721                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1722                 ++c;
1723             }
1724             break;
1725         case State::UTF8Query:
1726             LOG_STATE("UTF8Query");
1727             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1728             if (*c == '#') {
1729                 m_url.m_queryEnd = currentPosition(c);
1730                 state = State::Fragment;
1731                 break;
1732             }
1733             if (isUTF8Encoding)
1734                 utf8QueryEncode(c);
1735             else
1736                 appendCodePoint(queryBuffer, *c);
1737             ++c;
1738             break;
1739         case State::NonUTF8Query:
1740             do {
1741                 LOG_STATE("NonUTF8Query");
1742                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1743                 if (*c == '#') {
1744                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1745                     m_url.m_queryEnd = currentPosition(c);
1746                     state = State::Fragment;
1747                     break;
1748                 }
1749                 appendCodePoint(queryBuffer, *c);
1750                 advance(c, queryBegin);
1751             } while (!c.atEnd());
1752             break;
1753         case State::Fragment:
1754             URL_PARSER_LOG("State Fragment");
1755             utf8PercentEncode<isInSimpleEncodeSet>(c);
1756             ++c;
1757             break;
1758         }
1759     }
1760
1761     switch (state) {
1762     case State::SchemeStart:
1763         LOG_FINAL_STATE("SchemeStart");
1764         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1765             m_url = base;
1766             m_url.removeFragmentIdentifier();
1767             return;
1768         }
1769         failure();
1770         return;
1771     case State::Scheme:
1772         LOG_FINAL_STATE("Scheme");
1773         failure();
1774         return;
1775     case State::NoScheme:
1776         LOG_FINAL_STATE("NoScheme");
1777         RELEASE_ASSERT_NOT_REACHED();
1778     case State::SpecialRelativeOrAuthority:
1779         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1780         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1781         m_url.m_fragmentEnd = m_url.m_queryEnd;
1782         break;
1783     case State::PathOrAuthority:
1784         LOG_FINAL_STATE("PathOrAuthority");
1785         ASSERT(m_url.m_userStart);
1786         ASSERT(m_url.m_userStart == currentPosition(c));
1787         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1788         m_url.m_userStart--;
1789         m_url.m_userEnd = m_url.m_userStart;
1790         m_url.m_passwordEnd = m_url.m_userStart;
1791         m_url.m_hostEnd = m_url.m_userStart;
1792         m_url.m_portEnd = m_url.m_userStart;
1793         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1794         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1795         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1796         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1797         break;
1798     case State::Relative:
1799         LOG_FINAL_STATE("Relative");
1800         RELEASE_ASSERT_NOT_REACHED();
1801     case State::RelativeSlash:
1802         LOG_FINAL_STATE("RelativeSlash");
1803         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1804         appendToASCIIBuffer('/');
1805         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1806         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1807         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1808         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1809         break;
1810     case State::SpecialAuthoritySlashes:
1811         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1812         m_url.m_userStart = currentPosition(c);
1813         m_url.m_userEnd = m_url.m_userStart;
1814         m_url.m_passwordEnd = m_url.m_userStart;
1815         m_url.m_hostEnd = m_url.m_userStart;
1816         m_url.m_portEnd = m_url.m_userStart;
1817         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1818         m_url.m_pathEnd = m_url.m_userStart;
1819         m_url.m_queryEnd = m_url.m_userStart;
1820         m_url.m_fragmentEnd = m_url.m_userStart;
1821         break;
1822     case State::SpecialAuthorityIgnoreSlashes:
1823         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1824         failure();
1825         return;
1826         break;
1827     case State::AuthorityOrHost:
1828         LOG_FINAL_STATE("AuthorityOrHost");
1829         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1830         m_url.m_passwordEnd = m_url.m_userEnd;
1831         if (authorityOrHostBegin.atEnd()) {
1832             m_url.m_userEnd = m_url.m_userStart;
1833             m_url.m_passwordEnd = m_url.m_userStart;
1834             m_url.m_hostEnd = m_url.m_userStart;
1835             m_url.m_portEnd = m_url.m_userStart;
1836             m_url.m_pathEnd = m_url.m_userStart;
1837         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1838             failure();
1839             return;
1840         } else {
1841             if (m_urlIsSpecial) {
1842                 syntaxViolation(c);
1843                 appendToASCIIBuffer('/');
1844                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1845             } else
1846                 m_url.m_pathEnd = m_url.m_portEnd;
1847         }
1848         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1849         m_url.m_queryEnd = m_url.m_pathEnd;
1850         m_url.m_fragmentEnd = m_url.m_pathEnd;
1851         break;
1852     case State::Host:
1853         LOG_FINAL_STATE("Host");
1854         if (!parseHostAndPort(authorityOrHostBegin)) {
1855             failure();
1856             return;
1857         }
1858         if (m_urlIsSpecial) {
1859             syntaxViolation(c);
1860             appendToASCIIBuffer('/');
1861             m_url.m_pathEnd = m_url.m_portEnd + 1;
1862         } else
1863             m_url.m_pathEnd = m_url.m_portEnd;
1864         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1865         m_url.m_queryEnd = m_url.m_pathEnd;
1866         m_url.m_fragmentEnd = m_url.m_pathEnd;
1867         break;
1868     case State::File:
1869         LOG_FINAL_STATE("File");
1870         if (base.isValid() && base.protocolIs("file")) {
1871             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1872             appendToASCIIBuffer(':');
1873         }
1874         syntaxViolation(c);
1875         appendToASCIIBuffer("///", 3);
1876         m_url.m_userStart = currentPosition(c) - 1;
1877         m_url.m_userEnd = m_url.m_userStart;
1878         m_url.m_passwordEnd = m_url.m_userStart;
1879         m_url.m_hostEnd = m_url.m_userStart;
1880         m_url.m_portEnd = m_url.m_userStart;
1881         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1882         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1883         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1884         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1885         break;
1886     case State::FileSlash:
1887         LOG_FINAL_STATE("FileSlash");
1888         syntaxViolation(c);
1889         m_url.m_userStart = currentPosition(c) + 1;
1890         appendToASCIIBuffer("//", 2);
1891         m_url.m_userEnd = m_url.m_userStart;
1892         m_url.m_passwordEnd = m_url.m_userStart;
1893         m_url.m_hostEnd = m_url.m_userStart;
1894         m_url.m_portEnd = m_url.m_userStart;
1895         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1896         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1897         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1898         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1899         break;
1900     case State::FileHost:
1901         LOG_FINAL_STATE("FileHost");
1902         if (authorityOrHostBegin == c) {
1903             syntaxViolation(c);
1904             appendToASCIIBuffer('/');
1905             m_url.m_userStart = currentPosition(c) - 1;
1906             m_url.m_userEnd = m_url.m_userStart;
1907             m_url.m_passwordEnd = m_url.m_userStart;
1908             m_url.m_hostEnd = m_url.m_userStart;
1909             m_url.m_portEnd = m_url.m_userStart;
1910             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1911             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1912             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1913             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1914             break;
1915         }
1916
1917         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1918             failure();
1919             return;
1920         }
1921
1922         syntaxViolation(c);
1923         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1924             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1925             m_url.m_hostEnd = currentPosition(c);
1926             m_url.m_portEnd = m_url.m_hostEnd;
1927         }
1928         appendToASCIIBuffer('/');
1929         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
1930         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1931         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1932         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1933         break;
1934     case State::PathStart:
1935         LOG_FINAL_STATE("PathStart");
1936         RELEASE_ASSERT_NOT_REACHED();
1937     case State::Path:
1938         LOG_FINAL_STATE("Path");
1939         m_url.m_pathEnd = currentPosition(c);
1940         m_url.m_queryEnd = m_url.m_pathEnd;
1941         m_url.m_fragmentEnd = m_url.m_pathEnd;
1942         break;
1943     case State::CannotBeABaseURLPath:
1944         LOG_FINAL_STATE("CannotBeABaseURLPath");
1945         m_url.m_pathEnd = currentPosition(c);
1946         m_url.m_queryEnd = m_url.m_pathEnd;
1947         m_url.m_fragmentEnd = m_url.m_pathEnd;
1948         break;
1949     case State::UTF8Query:
1950         LOG_FINAL_STATE("UTF8Query");
1951         ASSERT(queryBegin == CodePointIterator<CharacterType>());
1952         m_url.m_queryEnd = currentPosition(c);
1953         m_url.m_fragmentEnd = m_url.m_queryEnd;
1954         break;
1955     case State::NonUTF8Query:
1956         LOG_FINAL_STATE("NonUTF8Query");
1957         ASSERT(queryBegin != CodePointIterator<CharacterType>());
1958         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1959         m_url.m_queryEnd = currentPosition(c);
1960         m_url.m_fragmentEnd = m_url.m_queryEnd;
1961         break;
1962     case State::Fragment:
1963         LOG_FINAL_STATE("Fragment");
1964         m_url.m_fragmentEnd = currentPosition(c);
1965         break;
1966     }
1967
1968     if (LIKELY(!m_didSeeSyntaxViolation)) {
1969         m_url.m_string = m_inputString;
1970         ASSERT(m_asciiBuffer.isEmpty());
1971     } else
1972         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1973     m_url.m_isValid = true;
1974     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
1975 }
1976
// Processes the code points in `iterator` as a "user[:password]" credentials section:
// each code point is written through utf8PercentEncode with the userinfo encode set,
// m_url.m_userEnd / m_url.m_passwordEnd are recorded, and a trailing '@' is appended.
// NOTE(review): presumably the caller passes the range that precedes '@' in the input — confirm.
1977 template<typename CharacterType>
1978 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1979 {
         // Empty credentials: flag a syntax violation and record empty user and
         // password ranges at the current position; no '@' is appended.
1980     if (UNLIKELY(iterator.atEnd())) {
1981         syntaxViolation(iterator);
1982         m_url.m_userEnd = currentPosition(iterator);
1983         m_url.m_passwordEnd = m_url.m_userEnd;
1984         return;
1985     }
         // First pass: encode the user name up to the first ':'.
1986     for (; !iterator.atEnd(); advance(iterator)) {
1987         if (*iterator == ':') {
1988             m_url.m_userEnd = currentPosition(iterator);
                 // Remember the colon so a violation can be reported at its position.
1989             auto iteratorAtColon = iterator;
1990             ++iterator;
1991             bool tabOrNewlineAfterColon = false;
1992             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
1993                 tabOrNewlineAfterColon = true;
1994                 ++iterator;
1995             }
                 // Nothing follows the ':' (ignoring tabs/newlines): the ':' itself is not
                 // appended, the password is empty, and '@' is appended only when the
                 // user name is non-empty.
1996             if (UNLIKELY(iterator.atEnd())) {
1997                 syntaxViolation(iteratorAtColon);
1998                 m_url.m_passwordEnd = m_url.m_userEnd;
1999                 if (m_url.m_userEnd > m_url.m_userStart)
2000                     appendToASCIIBuffer('@');
2001                 return;
2002             }
2003             if (tabOrNewlineAfterColon)
2004                 syntaxViolation(iteratorAtColon);
2005             appendToASCIIBuffer(':');
                 // `break` skips the loop's advance(); iterator already sits on the first
                 // password code point.
2006             break;
2007         }
2008         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2009     }
         // Second pass: encode the password. Any further ':' is handed to
         // utf8PercentEncode rather than being treated as a separator.
2010     for (; !iterator.atEnd(); advance(iterator))
2011         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2012     m_url.m_passwordEnd = currentPosition(iterator);
         // A zero m_userEnd is treated as "no ':' was seen", so the user name spans
         // everything that was encoded.
         // NOTE(review): assumes m_userEnd is zero on entry — confirm against caller.
2013     if (!m_url.m_userEnd)
2014         m_url.m_userEnd = m_url.m_passwordEnd;
2015     appendToASCIIBuffer('@');
2016 }
2017
2018 template<typename UnsignedIntegerType>
2019 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2020 {
2021     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2022     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
2023     LChar* p = end;
2024     do {
2025         *--p = (number % 10) + '0';
2026         number /= 10;
2027     } while (number);
2028     appendToASCIIBuffer(p, end - p);
2029 }
2030
2031 void URLParser::serializeIPv4(IPv4Address address)
2032 {
2033     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2034     appendToASCIIBuffer('.');
2035     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2036     appendToASCIIBuffer('.');
2037     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2038     appendToASCIIBuffer('.');
2039     appendNumberToASCIIBuffer<uint8_t>(address);
2040 }
2041     
// Returns how many consecutive zero pieces `address` contains starting at
// index `begin` (0 when address[begin] is non-zero or begin is 8).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2051
2052 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2053 {
2054     Optional<size_t> longest;
2055     size_t longestLength = 0;
2056     for (size_t i = 0; i < 8; i++) {
2057         size_t length = zeroSequenceLength(address, i);
2058         if (length) {
2059             if (length > 1 && (!longest || longestLength < length)) {
2060                 longest = i;
2061                 longestLength = length;
2062             }
2063             i += length;
2064         }
2065     }
2066     return longest;
2067 }
2068
2069 void URLParser::serializeIPv6Piece(uint16_t piece)
2070 {
2071     bool printed = false;
2072     if (auto nibble0 = piece >> 12) {
2073         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2074         printed = true;
2075     }
2076     auto nibble1 = piece >> 8 & 0xF;
2077     if (printed || nibble1) {
2078         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2079         printed = true;
2080     }
2081     auto nibble2 = piece >> 4 & 0xF;
2082     if (printed || nibble2)
2083         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2084     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2085 }
2086
2087 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2088 {
2089     appendToASCIIBuffer('[');
2090     auto compressPointer = findLongestZeroSequence(address);
2091     for (size_t piece = 0; piece < 8; piece++) {
2092         if (compressPointer && compressPointer.value() == piece) {
2093             ASSERT(!address[piece]);
2094             if (piece)
2095                 appendToASCIIBuffer(':');
2096             else
2097                 appendToASCIIBuffer("::", 2);
2098             while (piece < 8 && !address[piece])
2099                 piece++;
2100             if (piece == 8)
2101                 break;
2102         }
2103         serializeIPv6Piece(address[piece]);
2104         if (piece < 7)
2105             appendToASCIIBuffer(':');
2106     }
2107     appendToASCIIBuffer(']');
2108 }
2109
// Parses one piece of an IPv4 host starting at `iterator` and stops at the next '.'
// (leaving the iterator on it) or at the end of input.
// Returns the piece's value, or Nullopt when the piece starts with '.', contains a
// character that is not a digit of the detected base, or overflows 32 bits.
// `didSeeSyntaxViolation` is set when a tab/newline is skipped or when a leading '0'
// is followed by further characters of the piece (octal or hexadecimal notation).
2110 template<typename CharacterType>
2111 Optional<uint32_t> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2112 {
2113     enum class State : uint8_t {
2114         UnknownBase,
2115         Decimal,
2116         OctalOrHex,
2117         Octal,
2118         Hex,
2119     };
2120     State state = State::UnknownBase;
         // Overflow is recorded by the Checked wrapper and tested after each digit.
2121     Checked<uint32_t, RecordOverflow> value = 0;
         // An empty piece (input starting with '.') is rejected.
2122     if (!iterator.atEnd() && *iterator == '.')
2123         return Nullopt;
2124     while (!iterator.atEnd()) {
2125         if (isTabOrNewline(*iterator)) {
2126             didSeeSyntaxViolation = true;
2127             ++iterator;
2128             continue;
2129         }
2130         if (*iterator == '.') {
2131             ASSERT(!value.hasOverflowed());
2132             return value.unsafeGet();
2133         }
2134         switch (state) {
2135         case State::UnknownBase:
                 // A leading '0' is consumed and defers the base decision. Any other
                 // character switches to Decimal without being consumed, so it is
                 // re-examined on the next loop iteration.
2136             if (UNLIKELY(*iterator == '0')) {
2137                 ++iterator;
2138                 state = State::OctalOrHex;
2139                 break;
2140             }
2141             state = State::Decimal;
2142             break;
2143         case State::OctalOrHex:
                 // Only reached when something follows the leading '0' within the piece;
                 // a lone "0" returns 0 without setting the violation flag.
2144             didSeeSyntaxViolation = true;
2145             if (*iterator == 'x' || *iterator == 'X') {
2146                 ++iterator;
2147                 state = State::Hex;
2148                 break;
2149             }
                 // Not consumed: the character is validated as an octal digit next iteration.
2150             state = State::Octal;
2151             break;
2152         case State::Decimal:
2153             if (*iterator < '0' || *iterator > '9')
2154                 return Nullopt;
2155             value *= 10;
2156             value += *iterator - '0';
2157             if (UNLIKELY(value.hasOverflowed()))
2158                 return Nullopt;
2159             ++iterator;
2160             break;
2161         case State::Octal:
2162             ASSERT(didSeeSyntaxViolation);
2163             if (*iterator < '0' || *iterator > '7')
2164                 return Nullopt;
2165             value *= 8;
2166             value += *iterator - '0';
2167             if (UNLIKELY(value.hasOverflowed()))
2168                 return Nullopt;
2169             ++iterator;
2170             break;
2171         case State::Hex:
2172             ASSERT(didSeeSyntaxViolation);
2173             if (!isASCIIHexDigit(*iterator))
2174                 return Nullopt;
2175             value *= 16;
2176             value += toASCIIHexValue(*iterator);
2177             if (UNLIKELY(value.hasOverflowed()))
2178                 return Nullopt;
2179             ++iterator;
2180             break;
2181         }
2182     }
         // End of input: whatever has been accumulated is the piece's value
         // (0 for "0" and for a bare "0x").
2183     ASSERT(!value.hasOverflowed());
2184     return value.unsafeGet();
2185 }
2186
2187 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2188 {
2189     RELEASE_ASSERT(exponent <= 4);
2190     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2191     return values[exponent];
2192 }
2193
2194 template<typename CharacterType>
2195 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2196 {
2197     auto hostBegin = iterator;
2198
2199     Vector<uint32_t, 4> items;
2200     items.reserveInitialCapacity(4);
2201     bool didSeeSyntaxViolation = false;
2202     while (!iterator.atEnd()) {
2203         if (isTabOrNewline(*iterator)) {
2204             didSeeSyntaxViolation = true;
2205             ++iterator;
2206             continue;
2207         }
2208         if (items.size() >= 4)
2209             return Nullopt;
2210         if (auto item = parseIPv4Piece(iterator, didSeeSyntaxViolation))
2211             items.append(item.value());
2212         else
2213             return Nullopt;
2214         if (!iterator.atEnd()) {
2215             if (items.size() >= 4)
2216                 return Nullopt;
2217             if (*iterator == '.')
2218                 ++iterator;
2219             else
2220                 return Nullopt;
2221         }
2222     }
2223     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2224         return Nullopt;
2225     if (items.size() > 1) {
2226         for (size_t i = 0; i < items.size() - 1; i++) {
2227             if (items[i] > 255)
2228                 return Nullopt;
2229         }
2230     }
2231     if (items[items.size() - 1] >= pow256(5 - items.size()))
2232         return Nullopt;
2233
2234     if (didSeeSyntaxViolation)
2235         syntaxViolation(hostBegin);
2236     for (auto item : items) {
2237         if (item > 255)
2238             syntaxViolation(hostBegin);
2239     }
2240
2241     if (UNLIKELY(items.size() != 4))
2242         syntaxViolation(hostBegin);
2243
2244     IPv4Address ipv4 = items.takeLast();
2245     for (size_t counter = 0; counter < items.size(); ++counter)
2246         ipv4 += items[counter] * pow256(3 - counter);
2247     return ipv4;
2248 }
2249
2250 template<typename CharacterType>
2251 Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2252 {
2253     if (iterator.atEnd())
2254         return Nullopt;
2255     uint32_t piece = 0;
2256     bool leadingZeros = false;
2257     size_t digitCount = 0;
2258     while (!iterator.atEnd()) {
2259         if (!isASCIIDigit(*iterator))
2260             return Nullopt;
2261         ++digitCount;
2262         if (!piece && *iterator == '0') {
2263             if (leadingZeros)
2264                 return Nullopt;
2265             leadingZeros = true;
2266         }
2267         if (!piece && *iterator == '0')
2268             leadingZeros = true;
2269         piece = piece * 10 + *iterator - '0';
2270         if (piece > 255)
2271             return Nullopt;
2272         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2273         if (iterator.atEnd())
2274             break;
2275         if (*iterator == '.')
2276             break;
2277     }
2278     if (piece && leadingZeros)
2279         return Nullopt;
2280     return piece;
2281 }
2282
2283 template<typename CharacterType>
2284 Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2285 {
2286     IPv4Address address = 0;
2287     for (size_t i = 0; i < 4; ++i) {
2288         if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2289             address = (address << 8) + piece.value();
2290         else
2291             return Nullopt;
2292         if (i < 3) {
2293             if (iterator.atEnd())
2294                 return Nullopt;
2295             if (*iterator != '.')
2296                 return Nullopt;
2297             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2298         } else if (!iterator.atEnd())
2299             return Nullopt;
2300     }
2301     ASSERT(iterator.atEnd());
2302     return address;
2303 }
2304
// Parses an IPv6 literal whose first code point is '[' (asserted) into eight 16-bit pieces.
// Returns Nullopt when the text is not a valid address. A syntax violation is reported at
// the start of the host when the address is valid but not written in the form the
// serializer would produce (uppercase hex digits, redundant leading zeros, an embedded
// IPv4 address, or "::" placed somewhere other than the longest run of zero pieces).
// NOTE(review): every remaining character must be a hex digit, ':' or part of an embedded
// IPv4 address, so the range presumably ends before the closing ']' — confirm against caller.
2305 template<typename CharacterType>
2306 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2307 {
2308     ASSERT(*c == '[');
2309     auto hostBegin = c;
2310     advance(c, hostBegin);
2311     if (c.atEnd())
2312         return Nullopt;
2313
2314     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
         // piecePointer: index of the next piece to fill.
         // compressPointer: value of piecePointer right after "::" was consumed, if seen.
2315     size_t piecePointer = 0;
2316     Optional<size_t> compressPointer;
2317
         // A leading ':' is only valid as the first half of "::".
2318     if (*c == ':') {
2319         advance(c, hostBegin);
2320         if (c.atEnd())
2321             return Nullopt;
2322         if (*c != ':')
2323             return Nullopt;
2324         advance(c, hostBegin);
2325         ++piecePointer;
2326         compressPointer = piecePointer;
2327     }
2328     
2329     while (!c.atEnd()) {
2330         if (piecePointer == 8)
2331             return Nullopt;
             // A ':' at the start of a piece is the second colon of "::"; only one
             // compression is allowed.
2332         if (*c == ':') {
2333             if (compressPointer)
2334                 return Nullopt;
2335             advance(c, hostBegin);
2336             ++piecePointer;
2337             compressPointer = piecePointer;
2338             continue;
2339         }
             // Try to read the remaining input as a dotted-decimal IPv4 address. The
             // iterator is passed by value, so a failed attempt leaves c untouched and
             // parsing falls through to the hexadecimal piece below.
2340         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2341             if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2342                 if (compressPointer && piecePointer == 5)
2343                     return Nullopt;
2344                 syntaxViolation(hostBegin);
2345                 address[piecePointer++] = ipv4Address.value() >> 16;
2346                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
                     // The IPv4 address consumed the rest of the input; an empty iterator
                     // makes the atEnd() check after the loop pass.
2347                 c = { };
2348                 break;
2349             }
2350         }
             // Read up to four hexadecimal digits for this piece.
2351         uint16_t value = 0;
2352         size_t length = 0;
2353         bool leadingZeros = false;
2354         for (; length < 4; length++) {
2355             if (c.atEnd())
2356                 break;
2357             if (!isASCIIHexDigit(*c))
2358                 break;
2359             if (isASCIIUpper(*c))
2360                 syntaxViolation(hostBegin);
2361             if (*c == '0' && !length)
2362                 leadingZeros = true;
2363             value = value * 0x10 + toASCIIHexValue(*c);
2364             advance(c, hostBegin);
2365         }
2366         
             // Redundant leading zeros ("01", "00") are accepted but are a syntax violation.
2367         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2368             syntaxViolation(hostBegin);
2369
2370         address[piecePointer++] = value;
2371         if (c.atEnd())
2372             break;
             // More input must be a ':' separator, and only if there is room for another piece.
2373         if (piecePointer == 8 || *c != ':')
2374             return Nullopt;
2375         advance(c, hostBegin);
2376     }
2377     
2378     if (!c.atEnd())
2379         return Nullopt;
2380     
         // With "::", move the pieces parsed after it to the end of the address, working
         // backwards from index 7; the swapped-in values are the zeros of the compressed run.
         // Without "::", all eight pieces must have been written explicitly.
2381     if (compressPointer) {
2382         size_t swaps = piecePointer - compressPointer.value();
2383         piecePointer = 7;
2384         while (swaps)
2385             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2386     } else if (piecePointer != 8)
2387         return Nullopt;
2388
         // compressPointer was recorded after piecePointer was incremented, so the index
         // returned by findLongestZeroSequence is shifted by one before comparing. A
         // mismatch means "::" was not placed where the serializer would put it.
2389     Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2390     if (possibleCompressPointer)
2391         possibleCompressPointer.value()++;
2392     if (UNLIKELY(compressPointer != possibleCompressPointer))
2393         syntaxViolation(hostBegin);
2394     
2395     return address;
2396 }
2397
2398 template<typename CharacterType>
2399 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2400 {
2401     Vector<LChar, defaultInlineBufferSize> output;
2402     output.reserveInitialCapacity(length);
2403     
2404     for (size_t i = 0; i < length; ++i) {
2405         uint8_t byte = input[i];
2406         if (byte != '%')
2407             output.uncheckedAppend(byte);
2408         else if (length > 2 && i < length - 2) {
2409             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2410                 syntaxViolation(iteratorForSyntaxViolationPosition);
2411                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2412                 i += 2;
2413             } else
2414                 output.uncheckedAppend(byte);
2415         } else
2416             output.uncheckedAppend(byte);
2417     }
2418     return output;
2419 }
2420     
2421 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2422 {
2423     Vector<LChar, defaultInlineBufferSize> output;
2424     output.reserveInitialCapacity(length);
2425     
2426     for (size_t i = 0; i < length; ++i) {
2427         uint8_t byte = input[i];
2428         if (byte != '%')
2429             output.uncheckedAppend(byte);
2430         else if (length > 2 && i < length - 2) {
2431             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2432                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2433                 i += 2;
2434             } else
2435                 output.uncheckedAppend(byte);
2436         } else
2437             output.uncheckedAppend(byte);
2438     }
2439     return output;
2440 }
2441
2442 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2443 {
2444     if (string.is8Bit())
2445         return charactersAreAllASCII(string.characters8(), string.length());
2446     return charactersAreAllASCII(string.characters16(), string.length());
2447 }
2448
2449 template<typename CharacterType>
2450 Optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2451 {
2452     Vector<LChar, defaultInlineBufferSize> ascii;
2453     if (containsOnlyASCII(domain)) {
2454         size_t length = domain.length();
2455         if (domain.is8Bit()) {
2456             const LChar* characters = domain.characters8();
2457             ascii.reserveInitialCapacity(length);
2458             for (size_t i = 0; i < length; ++i) {
2459                 if (UNLIKELY(isASCIIUpper(characters[i])))
2460                     syntaxViolation(iteratorForSyntaxViolationPosition);
2461                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2462             }
2463         } else {
2464             const UChar* characters = domain.characters16();
2465             ascii.reserveInitialCapacity(length);
2466             for (size_t i = 0; i < length; ++i) {
2467                 if (UNLIKELY(isASCIIUpper(characters[i])))
2468                     syntaxViolation(iteratorForSyntaxViolationPosition);
2469                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2470             }
2471         }
2472         return ascii;
2473     }
2474     
2475     UChar hostnameBuffer[defaultInlineBufferSize];
2476     UErrorCode error = U_ZERO_ERROR;
2477
2478 #if COMPILER(GCC) || COMPILER(CLANG)
2479 #pragma GCC diagnostic push
2480 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2481 #endif
2482     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2483     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2484 #if COMPILER(GCC) || COMPILER(CLANG)
2485 #pragma GCC diagnostic pop
2486 #endif
2487     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2488
2489     if (error == U_ZERO_ERROR) {
2490         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2491             ASSERT(isASCII(hostnameBuffer[i]));
2492             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2493         }
2494         ascii.append(hostnameBuffer, numCharactersConverted);
2495         if (domain != StringView(ascii.data(), ascii.size()))
2496             syntaxViolation(iteratorForSyntaxViolationPosition);
2497         return ascii;
2498     }
2499
2500     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2501     return Nullopt;
2502 }
2503
2504 bool URLParser::hasInvalidDomainCharacter(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2505 {
2506     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2507         if (isInvalidDomainCharacter(asciiDomain[i]))
2508             return true;
2509     }
2510     return false;
2511 }
2512
2513 template<typename CharacterType>
2514 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2515 {
2516     ASSERT(*iterator == ':');
2517     auto colonIterator = iterator;
2518     advance(iterator, colonIterator);
2519     uint32_t port = 0;
2520     if (UNLIKELY(iterator.atEnd())) {
2521         m_url.m_portEnd = currentPosition(colonIterator);
2522         syntaxViolation(colonIterator);
2523         return true;
2524     }
2525     size_t digitCount = 0;
2526     bool leadingZeros = false;
2527     for (; !iterator.atEnd(); ++iterator) {
2528         if (UNLIKELY(isTabOrNewline(*iterator))) {
2529             syntaxViolation(colonIterator);
2530             continue;
2531         }
2532         if (isASCIIDigit(*iterator)) {
2533             if (*iterator == '0' && !digitCount)
2534                 leadingZeros = true;
2535             ++digitCount;
2536             port = port * 10 + *iterator - '0';
2537             if (port > std::numeric_limits<uint16_t>::max())
2538                 return false;
2539         } else
2540             return false;
2541     }
2542
2543     if (port && leadingZeros)
2544         syntaxViolation(colonIterator);
2545     
2546     if (!port && digitCount > 1)
2547         syntaxViolation(colonIterator);
2548
2549     ASSERT(port == static_cast<uint16_t>(port));
2550     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2551         syntaxViolation(colonIterator);
2552     else {
2553         appendToASCIIBuffer(':');
2554         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2555         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2556     }
2557
2558     m_url.m_portEnd = currentPosition(iterator);
2559     return true;
2560 }
2561
2562 template<typename CharacterType>
2563 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2564 {
2565     if (iterator.atEnd())
2566         return false;
2567     if (*iterator == ':')
2568         return false;
2569     if (*iterator == '[') {
2570         auto ipv6End = iterator;
2571         while (!ipv6End.atEnd() && *ipv6End != ']')
2572             ++ipv6End;
2573         if (ipv6End.atEnd())
2574             return false;
2575         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2576             serializeIPv6(address.value());
2577             if (!ipv6End.atEnd()) {
2578                 advance(ipv6End);
2579                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2580                     m_url.m_hostEnd = currentPosition(ipv6End);
2581                     return parsePort(ipv6End);
2582                 }
2583                 m_url.m_hostEnd = currentPosition(ipv6End);
2584                 m_url.m_portEnd = m_url.m_hostEnd;
2585                 return true;
2586             }
2587             m_url.m_hostEnd = currentPosition(ipv6End);
2588             return true;
2589         }
2590         return false;
2591     }
2592
2593     if (!m_urlIsSpecial) {
2594         for (; !iterator.atEnd(); ++iterator) {
2595             if (UNLIKELY(isTabOrNewline(*iterator))) {
2596                 syntaxViolation(iterator);
2597                 continue;
2598             }
2599             if (*iterator == ':')
2600                 break;
2601             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2602         }
2603         m_url.m_hostEnd = currentPosition(iterator);
2604         if (iterator.atEnd()) {
2605             m_url.m_portEnd = currentPosition(iterator);
2606             return true;
2607         }
2608         return parsePort(iterator);
2609     }
2610     
2611     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2612         auto hostIterator = iterator;
2613         for (; !iterator.atEnd(); ++iterator) {
2614             if (isTabOrNewline(*iterator))
2615                 continue;
2616             if (*iterator == ':')
2617                 break;
2618             if (isInvalidDomainCharacter(*iterator))
2619                 return false;
2620         }
2621         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2622             serializeIPv4(address.value());
2623             m_url.m_hostEnd = currentPosition(iterator);
2624             if (iterator.atEnd()) {
2625                 m_url.m_portEnd = currentPosition(iterator);
2626                 return true;
2627             }
2628             return parsePort(iterator);
2629         }
2630         for (; hostIterator != iterator; ++hostIterator) {
2631             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2632                 syntaxViolation(hostIterator);
2633                 continue;
2634             }
2635             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2636                 syntaxViolation(hostIterator);
2637             appendToASCIIBuffer(toASCIILower(*hostIterator));
2638         }
2639         m_url.m_hostEnd = currentPosition(iterator);
2640         if (!hostIterator.atEnd())
2641             return parsePort(hostIterator);
2642         m_url.m_portEnd = currentPosition(iterator);
2643         return true;
2644     }
2645     
2646     auto hostBegin = iterator;
2647     
2648     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2649     for (; !iterator.atEnd(); ++iterator) {
2650         if (UNLIKELY(isTabOrNewline(*iterator))) {
2651             syntaxViolation(hostBegin);
2652             continue;
2653         }
2654         if (*iterator == ':')
2655             break;
2656         if (UNLIKELY(!isASCII(*iterator)))
2657             syntaxViolation(hostBegin);
2658
2659         uint8_t buffer[U8_MAX_LENGTH];
2660         int32_t offset = 0;
2661         UBool error = false;
2662         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2663         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2664         // FIXME: Check error.
2665         utf8Encoded.append(buffer, offset);
2666     }
2667     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2668     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2669     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2670         syntaxViolation(hostBegin);
2671     auto asciiDomain = domainToASCII(domain, hostBegin);
2672     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2673         return false;
2674     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2675     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2676
2677     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2678         serializeIPv4(address.value());
2679         m_url.m_hostEnd = currentPosition(iterator);
2680         if (iterator.atEnd()) {
2681             m_url.m_portEnd = currentPosition(iterator);
2682             return true;
2683         }
2684         return parsePort(iterator);
2685     }
2686
2687     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2688     m_url.m_hostEnd = currentPosition(iterator);
2689     if (!iterator.atEnd())
2690         return parsePort(iterator);
2691     m_url.m_portEnd = currentPosition(iterator);
2692     return true;
2693 }
2694
2695 Optional<String> URLParser::formURLDecode(StringView input)
2696 {
2697     auto utf8 = input.utf8(StrictConversion);
2698     if (utf8.isNull())
2699         return Nullopt;
2700     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2701     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2702 }
2703
2704 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2705 {
2706     Vector<StringView> sequences = input.split('&');
2707
2708     URLEncodedForm output;
2709     for (auto& bytes : sequences) {
2710         auto valueStart = bytes.find('=');
2711         if (valueStart == notFound) {
2712             if (auto name = formURLDecode(bytes))
2713                 output.append({name.value().replace('+', 0x20), emptyString()});
2714         } else {
2715             auto name = formURLDecode(bytes.substring(0, valueStart));
2716             auto value = formURLDecode(bytes.substring(valueStart + 1));
2717             if (name && value)
2718                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2719         }
2720     }
2721     return output;
2722 }
2723
2724 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2725 {
2726     auto utf8 = input.utf8(StrictConversion);
2727     const char* data = utf8.data();
2728     for (size_t i = 0; i < utf8.length(); ++i) {
2729         const char byte = data[i];
2730         if (byte == 0x20)
2731             output.append(0x2B);
2732         else if (byte == 0x2A
2733             || byte == 0x2D
2734             || byte == 0x2E
2735             || (byte >= 0x30 && byte <= 0x39)
2736             || (byte >= 0x41 && byte <= 0x5A)
2737             || byte == 0x5F
2738             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2739             output.append(byte);
2740         else
2741             percentEncodeByte(byte, output);
2742     }
2743 }
2744     
2745 String URLParser::serialize(const URLEncodedForm& tuples)
2746 {
2747     Vector<LChar> output;
2748     for (auto& tuple : tuples) {
2749         if (!output.isEmpty())
2750             output.append('&');
2751         serializeURLEncodedForm(tuple.first, output);
2752         output.append('=');
2753         serializeURLEncodedForm(tuple.second, output);
2754     }
2755     return String::adopt(WTFMove(output));
2756 }
2757
2758 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2759 {
2760     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2761     // but once we get rid of URL::parse its value should be tested.
2762     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2763         a.m_isValid,
2764         a.m_protocolIsInHTTPFamily,
2765         a.m_schemeEnd,
2766         a.m_userStart,
2767         a.m_userEnd,
2768         a.m_passwordEnd,
2769         a.m_hostEnd,
2770         a.m_portEnd,
2771         a.m_pathAfterLastSlash,
2772         a.m_pathEnd,
2773         a.m_queryEnd,
2774         a.m_fragmentEnd,
2775         a.m_string.utf8().data(),
2776         b.m_isValid,
2777         b.m_protocolIsInHTTPFamily,
2778         b.m_schemeEnd,
2779         b.m_userStart,
2780         b.m_userEnd,
2781         b.m_passwordEnd,
2782         b.m_hostEnd,
2783         b.m_portEnd,
2784         b.m_pathAfterLastSlash,
2785         b.m_pathEnd,
2786         b.m_queryEnd,
2787         b.m_fragmentEnd,
2788         b.m_string.utf8().data());
2789
2790     return a.m_string == b.m_string
2791         && a.m_isValid == b.m_isValid
2792         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2793         && a.m_schemeEnd == b.m_schemeEnd
2794         && a.m_userStart == b.m_userStart
2795         && a.m_userEnd == b.m_userEnd
2796         && a.m_passwordEnd == b.m_passwordEnd
2797         && a.m_hostEnd == b.m_hostEnd
2798         && a.m_portEnd == b.m_portEnd
2799         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2800         && a.m_pathEnd == b.m_pathEnd
2801         && a.m_queryEnd == b.m_queryEnd
2802         && a.m_fragmentEnd == b.m_fragmentEnd;
2803 }
2804
2805 bool URLParser::internalValuesConsistent(const URL& url)
2806 {
2807     return url.m_schemeEnd <= url.m_userStart
2808         && url.m_userStart <= url.m_userEnd
2809         && url.m_userEnd <= url.m_passwordEnd
2810         && url.m_passwordEnd <= url.m_hostEnd
2811         && url.m_hostEnd <= url.m_portEnd
2812         && url.m_portEnd <= url.m_pathAfterLastSlash
2813         && url.m_pathAfterLastSlash <= url.m_pathEnd
2814         && url.m_pathEnd <= url.m_queryEnd
2815         && url.m_queryEnd <= url.m_fragmentEnd
2816         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2817     // FIXME: Why do we even store m_fragmentEnd?
2818     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2819 }
2820
2821 static bool urlParserEnabled = true;
2822
2823 void URLParser::setEnabled(bool enabled)
2824 {
2825     urlParserEnabled = enabled;
2826 }
2827
2828 bool URLParser::enabled()
2829 {
2830     return urlParserEnabled;
2831 }
2832
2833 } // namespace WebCore