Move isDefaultPortForProtocol from URLParser.cpp back to URL.cpp
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <unicode/uidna.h>
33 #include <unicode/utypes.h>
34
35 namespace WebCore {
36
37 #define URL_PARSER_DEBUGGING 0
38     
39 #if URL_PARSER_DEBUGGING
40 #define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
41 #else
42 #define URL_PARSER_LOG(...)
43 #endif
44     
45 template<typename CharacterType>
46 class CodePointIterator {
47 public:
48     ALWAYS_INLINE CodePointIterator() { }
49     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50         : m_begin(begin)
51         , m_end(end)
52     {
53     }
54     
55     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56         : CodePointIterator(begin.m_begin, end.m_begin)
57     {
58         ASSERT(end.m_begin >= begin.m_begin);
59     }
60     
61     ALWAYS_INLINE UChar32 operator*() const;
62     ALWAYS_INLINE CodePointIterator& operator++();
63
64     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65     {
66         return m_begin == other.m_begin
67             && m_end == other.m_end;
68     }
69     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70     
71     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
72     {
73         m_begin = other.m_begin;
74         m_end = other.m_end;
75         return *this;
76     }
77
78     ALWAYS_INLINE bool atEnd() const
79     {
80         ASSERT(m_begin <= m_end);
81         return m_begin >= m_end;
82     }
83     
84     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
85     {
86         ASSERT(m_begin >= reference);
87         return m_begin - reference;
88     }
89
90     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
91     {
92         return codeUnitsSince(other.m_begin);
93     }
94     
95 private:
96     const CharacterType* m_begin { nullptr };
97     const CharacterType* m_end { nullptr };
98 };
99
100 template<>
101 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
102 {
103     ASSERT(!atEnd());
104     return *m_begin;
105 }
106
107 template<>
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
109 {
110     ASSERT(!atEnd());
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     ASSERT(!atEnd());
128     unsigned i = 0;
129     size_t length = m_end - m_begin;
130     U16_FWD_1(m_begin, i, length);
131     m_begin += i;
132     return *this;
133 }
134     
135 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
136 {
137     if (U_IS_BMP(codePoint)) {
138         destination.append(static_cast<UChar>(codePoint));
139         return;
140     }
141     destination.reserveCapacity(destination.size() + 2);
142     destination.uncheckedAppend(U16_LEAD(codePoint));
143     destination.uncheckedAppend(U16_TRAIL(codePoint));
144 }
145
// Bit flags that are OR'ed together in the entries of characterClassTable.
// Each flag occupies its own bit so a single table byte can carry any combination.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
154
// Lookup table indexed by an 8-bit character value. Each entry is a bitmask of
// URLCharacterClass flags; 0 means the character belongs to none of the classes.
// The trailing comment on each row names the character (or its hex value) that
// the row describes, so rows must stay in ascending order with no gaps.
155 static const uint8_t characterClassTable[256] = {
156     UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
157     UserInfo | Default | QueryPercent, // 0x1
158     UserInfo | Default | QueryPercent, // 0x2
159     UserInfo | Default | QueryPercent, // 0x3
160     UserInfo | Default | QueryPercent, // 0x4
161     UserInfo | Default | QueryPercent, // 0x5
162     UserInfo | Default | QueryPercent, // 0x6
163     UserInfo | Default | QueryPercent, // 0x7
164     UserInfo | Default | QueryPercent, // 0x8
165     UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
166     UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
167     UserInfo | Default | QueryPercent, // 0xB
168     UserInfo | Default | QueryPercent, // 0xC
169     UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
170     UserInfo | Default | QueryPercent, // 0xE
171     UserInfo | Default | QueryPercent, // 0xF
172     UserInfo | Default | QueryPercent, // 0x10
173     UserInfo | Default | QueryPercent, // 0x11
174     UserInfo | Default | QueryPercent, // 0x12
175     UserInfo | Default | QueryPercent, // 0x13
176     UserInfo | Default | QueryPercent, // 0x14
177     UserInfo | Default | QueryPercent, // 0x15
178     UserInfo | Default | QueryPercent, // 0x16
179     UserInfo | Default | QueryPercent, // 0x17
180     UserInfo | Default | QueryPercent, // 0x18
181     UserInfo | Default | QueryPercent, // 0x19
182     UserInfo | Default | QueryPercent, // 0x1A
183     UserInfo | Default | QueryPercent, // 0x1B
184     UserInfo | Default | QueryPercent, // 0x1C
185     UserInfo | Default | QueryPercent, // 0x1D
186     UserInfo | Default | QueryPercent, // 0x1E
187     UserInfo | Default | QueryPercent, // 0x1F
188     UserInfo | Default | InvalidDomain | QueryPercent, // ' '
189     0, // '!'
190     UserInfo | Default | QueryPercent, // '"'
191     UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
192     0, // '$'
193     InvalidDomain, // '%'
194     0, // '&'
195     0, // '''
196     0, // '('
197     0, // ')'
198     0, // '*'
199     ValidScheme, // '+'
200     0, // ','
201     ValidScheme, // '-'
202     ValidScheme, // '.'
203     UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
204     ValidScheme, // '0'
205     ValidScheme, // '1'
206     ValidScheme, // '2'
207     ValidScheme, // '3'
208     ValidScheme, // '4'
209     ValidScheme, // '5'
210     ValidScheme, // '6'
211     ValidScheme, // '7'
212     ValidScheme, // '8'
213     ValidScheme, // '9'
214     UserInfo | InvalidDomain, // ':'
215     UserInfo, // ';'
216     UserInfo | Default | QueryPercent, // '<'
217     UserInfo, // '='
218     UserInfo | Default | QueryPercent, // '>'
219     UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
220     UserInfo | InvalidDomain, // '@'
221     ValidScheme, // 'A'
222     ValidScheme, // 'B'
223     ValidScheme, // 'C'
224     ValidScheme, // 'D'
225     ValidScheme, // 'E'
226     ValidScheme, // 'F'
227     ValidScheme, // 'G'
228     ValidScheme, // 'H'
229     ValidScheme, // 'I'
230     ValidScheme, // 'J'
231     ValidScheme, // 'K'
232     ValidScheme, // 'L'
233     ValidScheme, // 'M'
234     ValidScheme, // 'N'
235     ValidScheme, // 'O'
236     ValidScheme, // 'P'
237     ValidScheme, // 'Q'
238     ValidScheme, // 'R'
239     ValidScheme, // 'S'
240     ValidScheme, // 'T'
241     ValidScheme, // 'U'
242     ValidScheme, // 'V'
243     ValidScheme, // 'W'
244     ValidScheme, // 'X'
245     ValidScheme, // 'Y'
246     ValidScheme, // 'Z'
247     UserInfo | InvalidDomain, // '['
248     UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
249     UserInfo | InvalidDomain, // ']'
250     UserInfo, // '^'
251     0, // '_'
252     UserInfo | Default, // '`'
253     ValidScheme, // 'a'
254     ValidScheme, // 'b'
255     ValidScheme, // 'c'
256     ValidScheme, // 'd'
257     ValidScheme, // 'e'
258     ValidScheme, // 'f'
259     ValidScheme, // 'g'
260     ValidScheme, // 'h'
261     ValidScheme, // 'i'
262     ValidScheme, // 'j'
263     ValidScheme, // 'k'
264     ValidScheme, // 'l'
265     ValidScheme, // 'm'
266     ValidScheme, // 'n'
267     ValidScheme, // 'o'
268     ValidScheme, // 'p'
269     ValidScheme, // 'q'
270     ValidScheme, // 'r'
271     ValidScheme, // 's'
272     ValidScheme, // 't'
273     ValidScheme, // 'u'
274     ValidScheme, // 'v'
275     ValidScheme, // 'w'
276     ValidScheme, // 'x'
277     ValidScheme, // 'y'
278     ValidScheme, // 'z'
279     UserInfo | Default, // '{'
280     UserInfo, // '|'
281     UserInfo | Default, // '}'
282     0, // '~'
283     QueryPercent, // 0x7F
284     QueryPercent, // 0x80
285     QueryPercent, // 0x81
286     QueryPercent, // 0x82
287     QueryPercent, // 0x83
288     QueryPercent, // 0x84
289     QueryPercent, // 0x85
290     QueryPercent, // 0x86
291     QueryPercent, // 0x87
292     QueryPercent, // 0x88
293     QueryPercent, // 0x89
294     QueryPercent, // 0x8A
295     QueryPercent, // 0x8B
296     QueryPercent, // 0x8C
297     QueryPercent, // 0x8D
298     QueryPercent, // 0x8E
299     QueryPercent, // 0x8F
300     QueryPercent, // 0x90
301     QueryPercent, // 0x91
302     QueryPercent, // 0x92
303     QueryPercent, // 0x93
304     QueryPercent, // 0x94
305     QueryPercent, // 0x95
306     QueryPercent, // 0x96
307     QueryPercent, // 0x97
308     QueryPercent, // 0x98
309     QueryPercent, // 0x99
310     QueryPercent, // 0x9A
311     QueryPercent, // 0x9B
312     QueryPercent, // 0x9C
313     QueryPercent, // 0x9D
314     QueryPercent, // 0x9E
315     QueryPercent, // 0x9F
316     QueryPercent, // 0xA0
317     QueryPercent, // 0xA1
318     QueryPercent, // 0xA2
319     QueryPercent, // 0xA3
320     QueryPercent, // 0xA4
321     QueryPercent, // 0xA5
322     QueryPercent, // 0xA6
323     QueryPercent, // 0xA7
324     QueryPercent, // 0xA8
325     QueryPercent, // 0xA9
326     QueryPercent, // 0xAA
327     QueryPercent, // 0xAB
328     QueryPercent, // 0xAC
329     QueryPercent, // 0xAD
330     QueryPercent, // 0xAE
331     QueryPercent, // 0xAF
332     QueryPercent, // 0xB0
333     QueryPercent, // 0xB1
334     QueryPercent, // 0xB2
335     QueryPercent, // 0xB3
336     QueryPercent, // 0xB4
337     QueryPercent, // 0xB5
338     QueryPercent, // 0xB6
339     QueryPercent, // 0xB7
340     QueryPercent, // 0xB8
341     QueryPercent, // 0xB9
342     QueryPercent, // 0xBA
343     QueryPercent, // 0xBB
344     QueryPercent, // 0xBC
345     QueryPercent, // 0xBD
346     QueryPercent, // 0xBE
347     QueryPercent, // 0xBF
348     QueryPercent, // 0xC0
349     QueryPercent, // 0xC1
350     QueryPercent, // 0xC2
351     QueryPercent, // 0xC3
352     QueryPercent, // 0xC4
353     QueryPercent, // 0xC5
354     QueryPercent, // 0xC6
355     QueryPercent, // 0xC7
356     QueryPercent, // 0xC8
357     QueryPercent, // 0xC9
358     QueryPercent, // 0xCA
359     QueryPercent, // 0xCB
360     QueryPercent, // 0xCC
361     QueryPercent, // 0xCD
362     QueryPercent, // 0xCE
363     QueryPercent, // 0xCF
364     QueryPercent, // 0xD0
365     QueryPercent, // 0xD1
366     QueryPercent, // 0xD2
367     QueryPercent, // 0xD3
368     QueryPercent, // 0xD4
369     QueryPercent, // 0xD5
370     QueryPercent, // 0xD6
371     QueryPercent, // 0xD7
372     QueryPercent, // 0xD8
373     QueryPercent, // 0xD9
374     QueryPercent, // 0xDA
375     QueryPercent, // 0xDB
376     QueryPercent, // 0xDC
377     QueryPercent, // 0xDD
378     QueryPercent, // 0xDE
379     QueryPercent, // 0xDF
380     QueryPercent, // 0xE0
381     QueryPercent, // 0xE1
382     QueryPercent, // 0xE2
383     QueryPercent, // 0xE3
384     QueryPercent, // 0xE4
385     QueryPercent, // 0xE5
386     QueryPercent, // 0xE6
387     QueryPercent, // 0xE7
388     QueryPercent, // 0xE8
389     QueryPercent, // 0xE9
390     QueryPercent, // 0xEA
391     QueryPercent, // 0xEB
392     QueryPercent, // 0xEC
393     QueryPercent, // 0xED
394     QueryPercent, // 0xEE
395     QueryPercent, // 0xEF
396     QueryPercent, // 0xF0
397     QueryPercent, // 0xF1
398     QueryPercent, // 0xF2
399     QueryPercent, // 0xF3
400     QueryPercent, // 0xF4
401     QueryPercent, // 0xF5
402     QueryPercent, // 0xF6
403     QueryPercent, // 0xF7
404     QueryPercent, // 0xF8
405     QueryPercent, // 0xF9
406     QueryPercent, // 0xFA
407     QueryPercent, // 0xFB
408     QueryPercent, // 0xFC
409     QueryPercent, // 0xFD
410     QueryPercent, // 0xFE
411     QueryPercent, // 0xFF
412 };
413
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
423 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
424 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
425
426 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
427 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
428 {
429     ++iterator;
430     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
431         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
432             syntaxViolation(iteratorForSyntaxViolationPosition);
433         ++iterator;
434     }
435 }
436
437 template<typename CharacterType>
438 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
439 {
440     if (iterator.atEnd())
441         return false;
442     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
443     if (iterator.atEnd())
444         return false;
445     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446     return iterator.atEnd();
447 }
448
449 template<typename CharacterType>
450 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
451 {
452     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
453         return false;
454     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
455     if (iterator.atEnd())
456         return false;
457     if (*iterator == ':')
458         return true;
459     if (UNLIKELY(*iterator == '|'))
460         return true;
461     return false;
462 }
463
464 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
465 {
466     ASSERT(isASCII(codePoint));
467     if (UNLIKELY(m_didSeeSyntaxViolation))
468         m_asciiBuffer.append(codePoint);
469 }
470
471 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
472 {
473     if (UNLIKELY(m_didSeeSyntaxViolation))
474         m_asciiBuffer.append(characters, length);
475 }
476
477 template<typename CharacterType>
478 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
479 {
480     ASSERT(isWindowsDriveLetter(iterator));
481     appendToASCIIBuffer(*iterator);
482     advance(iterator);
483     ASSERT(!iterator.atEnd());
484     ASSERT(*iterator == ':' || *iterator == '|');
485     if (*iterator == '|')
486         syntaxViolation(iterator);
487     appendToASCIIBuffer(':');
488     advance(iterator);
489 }
490
491 template<typename CharacterType>
492 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
493 {
494     if (!isWindowsDriveLetter(iterator))
495         return true;
496     if (iterator.atEnd())
497         return false;
498     advance(iterator);
499     if (iterator.atEnd())
500         return true;
501     advance(iterator);
502     if (iterator.atEnd())
503         return true;
504     return !isSlashQuestionOrHash(*iterator);
505 }
506
507 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
508 {
509     buffer.append('%');
510     buffer.append(upperNibbleToASCIIHexDigit(byte));
511     buffer.append(lowerNibbleToASCIIHexDigit(byte));
512 }
513
514 void URLParser::percentEncodeByte(uint8_t byte)
515 {
516     ASSERT(m_didSeeSyntaxViolation);
517     appendToASCIIBuffer('%');
518     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
519     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
520 }
521
522 const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
523 const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
524
525 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
526 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
527 {
528     ASSERT(!iterator.atEnd());
529     UChar32 codePoint = *iterator;
530     if (LIKELY(isASCII(codePoint))) {
531         if (UNLIKELY(isInCodeSet(codePoint))) {
532             syntaxViolation(iterator);
533             percentEncodeByte(codePoint);
534         } else
535             appendToASCIIBuffer(codePoint);
536         return;
537     }
538     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
539     syntaxViolation(iterator);
540     
541     if (!U_IS_UNICODE_CHAR(codePoint)) {
542         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
543         return;
544     }
545     
546     uint8_t buffer[U8_MAX_LENGTH];
547     int32_t offset = 0;
548     U8_APPEND_UNSAFE(buffer, offset, codePoint);
549     for (int32_t i = 0; i < offset; ++i)
550         percentEncodeByte(buffer[i]);
551 }
552
553 template<typename CharacterType>
554 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
555 {
556     ASSERT(!iterator.atEnd());
557     UChar32 codePoint = *iterator;
558     if (LIKELY(isASCII(codePoint))) {
559         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
560             syntaxViolation(iterator);
561             percentEncodeByte(codePoint);
562         } else
563             appendToASCIIBuffer(codePoint);
564         return;
565     }
566     
567     syntaxViolation(iterator);
568     
569     if (!U_IS_UNICODE_CHAR(codePoint)) {
570         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
571         return;
572     }
573
574     uint8_t buffer[U8_MAX_LENGTH];
575     int32_t offset = 0;
576     U8_APPEND_UNSAFE(buffer, offset, codePoint);
577     for (int32_t i = 0; i < offset; ++i) {
578         auto byte = buffer[i];
579         if (shouldPercentEncodeQueryByte(byte))
580             percentEncodeByte(byte);
581         else
582             appendToASCIIBuffer(byte);
583     }
584 }
585
586 template<typename CharacterType>
587 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
588 {
589     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
590     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
591     const char* data = encoded.data();
592     size_t length = encoded.length();
593     
594     if (!length == !iterator.atEnd()) {
595         syntaxViolation(iterator);
596         return;
597     }
598     
599     size_t i = 0;
600     for (; i < length; ++i) {
601         ASSERT(!iterator.atEnd());
602         uint8_t byte = data[i];
603         if (UNLIKELY(byte != *iterator)) {
604             syntaxViolation(iterator);
605             break;
606         }
607         if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
608             syntaxViolation(iterator);
609             break;
610         }
611         appendToASCIIBuffer(byte);
612         ++iterator;
613     }
614     while (!iterator.atEnd() && isTabOrNewline(*iterator))
615         ++iterator;
616     ASSERT((i == length) == iterator.atEnd());
617     for (; i < length; ++i) {
618         ASSERT(m_didSeeSyntaxViolation);
619         uint8_t byte = data[i];
620         if (shouldPercentEncodeQueryByte(byte))
621             percentEncodeByte(byte);
622         else
623             appendToASCIIBuffer(byte);
624     }
625 }
626
627 Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
628 {
629     static const uint16_t ftpPort = 21;
630     static const uint16_t gopherPort = 70;
631     static const uint16_t httpPort = 80;
632     static const uint16_t httpsPort = 443;
633     static const uint16_t wsPort = 80;
634     static const uint16_t wssPort = 443;
635     
636     auto length = scheme.length();
637     if (!length)
638         return Nullopt;
639     switch (scheme[0]) {
640     case 'w':
641         switch (length) {
642         case 2:
643             if (scheme[1] == 's')
644                 return wsPort;
645             return Nullopt;
646         case 3:
647             if (scheme[1] == 's'
648                 && scheme[2] == 's')
649                 return wssPort;
650             return Nullopt;
651         default:
652             return false;
653         }
654     case 'h':
655         switch (length) {
656         case 4:
657             if (scheme[1] == 't'
658                 && scheme[2] == 't'
659                 && scheme[3] == 'p')
660                 return httpPort;
661             return Nullopt;
662         case 5:
663             if (scheme[1] == 't'
664                 && scheme[2] == 't'
665                 && scheme[3] == 'p'
666                 && scheme[4] == 's')
667                 return httpsPort;
668             return Nullopt;
669         default:
670             return Nullopt;
671         }
672     case 'g':
673         if (length == 6
674             && scheme[1] == 'o'
675             && scheme[2] == 'p'
676             && scheme[3] == 'h'
677             && scheme[4] == 'e'
678             && scheme[5] == 'r')
679             return gopherPort;
680         return Nullopt;
681     case 'f':
682         if (length == 3
683             && scheme[1] == 't'
684             && scheme[2] == 'p')
685             return ftpPort;
686         return Nullopt;
687     default:
688         return Nullopt;
689     }
690 }
691
692 enum class Scheme {
693     WS,
694     WSS,
695     File,
696     FTP,
697     Gopher,
698     HTTP,
699     HTTPS,
700     NonSpecial
701 };
702
703 ALWAYS_INLINE static Scheme scheme(StringView scheme)
704 {
705     auto length = scheme.length();
706     if (!length)
707         return Scheme::NonSpecial;
708     switch (scheme[0]) {
709     case 'f':
710         switch (length) {
711         case 3:
712             if (scheme[1] == 't'
713                 && scheme[2] == 'p')
714                 return Scheme::FTP;
715             return Scheme::NonSpecial;
716         case 4:
717             if (scheme[1] == 'i'
718                 && scheme[2] == 'l'
719                 && scheme[3] == 'e')
720                 return Scheme::File;
721             return Scheme::NonSpecial;
722         default:
723             return Scheme::NonSpecial;
724         }
725     case 'g':
726         if (length == 6
727             && scheme[1] == 'o'
728             && scheme[2] == 'p'
729             && scheme[3] == 'h'
730             && scheme[4] == 'e'
731             && scheme[5] == 'r')
732             return Scheme::Gopher;
733         return Scheme::NonSpecial;
734     case 'h':
735         switch (length) {
736         case 4:
737             if (scheme[1] == 't'
738                 && scheme[2] == 't'
739                 && scheme[3] == 'p')
740                 return Scheme::HTTP;
741             return Scheme::NonSpecial;
742         case 5:
743             if (scheme[1] == 't'
744                 && scheme[2] == 't'
745                 && scheme[3] == 'p'
746                 && scheme[4] == 's')
747                 return Scheme::HTTPS;
748             return Scheme::NonSpecial;
749         default:
750             return Scheme::NonSpecial;
751         }
752     case 'w':
753         switch (length) {
754         case 2:
755             if (scheme[1] == 's')
756                 return Scheme::WS;
757             return Scheme::NonSpecial;
758         case 3:
759             if (scheme[1] == 's'
760                 && scheme[2] == 's')
761                 return Scheme::WSS;
762             return Scheme::NonSpecial;
763         default:
764             return Scheme::NonSpecial;
765         }
766     default:
767         return Scheme::NonSpecial;
768     }
769 }
770
771 enum class URLParser::URLPart {
772     SchemeEnd,
773     UserStart,
774     UserEnd,
775     PasswordEnd,
776     HostEnd,
777     PortEnd,
778     PathAfterLastSlash,
779     PathEnd,
780     QueryEnd,
781     FragmentEnd,
782 };
783
784 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
785 {
786     switch (part) {
787     case URLPart::FragmentEnd:
788         return url.m_fragmentEnd;
789     case URLPart::QueryEnd:
790         return url.m_queryEnd;
791     case URLPart::PathEnd:
792         return url.m_pathEnd;
793     case URLPart::PathAfterLastSlash:
794         return url.m_pathAfterLastSlash;
795     case URLPart::PortEnd:
796         return url.m_portEnd;
797     case URLPart::HostEnd:
798         return url.m_hostEnd;
799     case URLPart::PasswordEnd:
800         return url.m_passwordEnd;
801     case URLPart::UserEnd:
802         return url.m_userEnd;
803     case URLPart::UserStart:
804         return url.m_userStart;
805     case URLPart::SchemeEnd:
806         return url.m_schemeEnd;
807     }
808     ASSERT_NOT_REACHED();
809     return 0;
810 }
811
812 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
813 {
814     RELEASE_ASSERT(length <= string.length());
815     if (string.isNull())
816         return;
817     ASSERT(m_asciiBuffer.isEmpty());
818     if (string.is8Bit()) {
819         appendToASCIIBuffer(string.characters8(), length);
820     } else {
821         const UChar* characters = string.characters16();
822         for (size_t i = 0; i < length; ++i) {
823             UChar c = characters[i];
824             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
825             appendToASCIIBuffer(c);
826         }
827     }
828 }
829
830 template<typename CharacterType>
831 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
832 {
833     syntaxViolation(iterator);
834
835     m_asciiBuffer.clear();
836     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
837     switch (part) {
838     case URLPart::FragmentEnd:
839         RELEASE_ASSERT_NOT_REACHED();
840     case URLPart::QueryEnd:
841         m_url.m_queryEnd = base.m_queryEnd;
842         FALLTHROUGH;
843     case URLPart::PathEnd:
844         m_url.m_pathEnd = base.m_pathEnd;
845         FALLTHROUGH;
846     case URLPart::PathAfterLastSlash:
847         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
848         FALLTHROUGH;
849     case URLPart::PortEnd:
850         m_url.m_portEnd = base.m_portEnd;
851         FALLTHROUGH;
852     case URLPart::HostEnd:
853         m_url.m_hostEnd = base.m_hostEnd;
854         FALLTHROUGH;
855     case URLPart::PasswordEnd:
856         m_url.m_passwordEnd = base.m_passwordEnd;
857         FALLTHROUGH;
858     case URLPart::UserEnd:
859         m_url.m_userEnd = base.m_userEnd;
860         FALLTHROUGH;
861     case URLPart::UserStart:
862         m_url.m_userStart = base.m_userStart;
863         FALLTHROUGH;
864     case URLPart::SchemeEnd:
865         m_url.m_isValid = base.m_isValid;
866         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
867         m_url.m_schemeEnd = base.m_schemeEnd;
868     }
869     switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
870     case Scheme::WS:
871     case Scheme::WSS:
872         isUTF8Encoding = true;
873         FALLTHROUGH;
874     case Scheme::File:
875     case Scheme::FTP:
876     case Scheme::Gopher:
877     case Scheme::HTTP:
878     case Scheme::HTTPS:
879         m_urlIsSpecial = true;
880         return;
881     case Scheme::NonSpecial:
882         m_urlIsSpecial = false;
883         isUTF8Encoding = true;
884         return;
885     }
886     ASSERT_NOT_REACHED();
887 }
888
889 static const char dotASCIICode[2] = {'2', 'e'};
890
891 template<typename CharacterType>
892 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
893 {
894     if (c.atEnd())
895         return false;
896     if (*c == '.') {
897         advance<CharacterType, ReportSyntaxViolation::No>(c);
898         return c.atEnd() || isSlashQuestionOrHash(*c);
899     }
900     if (*c != '%')
901         return false;
902     advance<CharacterType, ReportSyntaxViolation::No>(c);
903     if (c.atEnd() || *c != dotASCIICode[0])
904         return false;
905     advance<CharacterType, ReportSyntaxViolation::No>(c);
906     if (c.atEnd())
907         return false;
908     if (toASCIILower(*c) == dotASCIICode[1]) {
909         advance<CharacterType, ReportSyntaxViolation::No>(c);
910         return c.atEnd() || isSlashQuestionOrHash(*c);
911     }
912     return false;
913 }
914
915 template<typename CharacterType>
916 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
917 {
918     if (c.atEnd())
919         return false;
920     if (*c == '.') {
921         advance<CharacterType, ReportSyntaxViolation::No>(c);
922         return isSingleDotPathSegment(c);
923     }
924     if (*c != '%')
925         return false;
926     advance<CharacterType, ReportSyntaxViolation::No>(c);
927     if (c.atEnd() || *c != dotASCIICode[0])
928         return false;
929     advance<CharacterType, ReportSyntaxViolation::No>(c);
930     if (c.atEnd())
931         return false;
932     if (toASCIILower(*c) == dotASCIICode[1]) {
933         advance<CharacterType, ReportSyntaxViolation::No>(c);
934         return isSingleDotPathSegment(c);
935     }
936     return false;
937 }
938
939 template<typename CharacterType>
940 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
941 {
942     ASSERT(isSingleDotPathSegment(c));
943     if (*c == '.') {
944         advance(c);
945         if (!c.atEnd()) {
946             if (*c == '/' || *c == '\\')
947                 advance(c);
948             else
949                 ASSERT(*c == '?' || *c == '#');
950         }
951     } else {
952         ASSERT(*c == '%');
953         advance(c);
954         ASSERT(*c == dotASCIICode[0]);
955         advance(c);
956         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
957         advance(c);
958         if (!c.atEnd()) {
959             if (*c == '/' || *c == '\\')
960                 advance(c);
961             else
962                 ASSERT(*c == '?' || *c == '#');
963         }
964     }
965 }
966
967 template<typename CharacterType>
968 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
969 {
970     ASSERT(isDoubleDotPathSegment(c));
971     if (*c == '.')
972         advance(c);
973     else {
974         ASSERT(*c == '%');
975         advance(c);
976         ASSERT(*c == dotASCIICode[0]);
977         advance(c);
978         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
979         advance(c);
980     }
981     consumeSingleDotPathSegment(c);
982 }
983
984 void URLParser::popPath()
985 {
986     ASSERT(m_didSeeSyntaxViolation);
987     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
988         m_url.m_pathAfterLastSlash--;
989         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
990             m_url.m_pathAfterLastSlash--;
991         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
992             m_url.m_pathAfterLastSlash--;
993         m_url.m_pathAfterLastSlash++;
994     }
995     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
996 }
997
998 template<typename CharacterType>
999 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1000 {
1001     if (m_didSeeSyntaxViolation)
1002         return;
1003     m_didSeeSyntaxViolation = true;
1004     
1005     ASSERT(m_asciiBuffer.isEmpty());
1006     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1007     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1008     m_asciiBuffer.reserveCapacity(m_inputString.length());
1009     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1010         ASSERT(isASCII(m_inputString[i]));
1011         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1012     }
1013 }
1014
1015 void URLParser::failure()
1016 {
1017     m_url.invalidate();
1018     m_url.m_string = m_inputString;
1019 }
1020
1021 template<typename CharacterType>
1022 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1023 {
1024     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1025         return false;
1026     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1027     return true;
1028 }
1029
1030 template<typename CharacterType>
1031 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1032 {
1033     if (!checkLocalhostCodePoint(iterator, 'l'))
1034         return false;
1035     if (!checkLocalhostCodePoint(iterator, 'o'))
1036         return false;
1037     if (!checkLocalhostCodePoint(iterator, 'c'))
1038         return false;
1039     if (!checkLocalhostCodePoint(iterator, 'a'))
1040         return false;
1041     if (!checkLocalhostCodePoint(iterator, 'l'))
1042         return false;
1043     if (!checkLocalhostCodePoint(iterator, 'h'))
1044         return false;
1045     if (!checkLocalhostCodePoint(iterator, 'o'))
1046         return false;
1047     if (!checkLocalhostCodePoint(iterator, 's'))
1048         return false;
1049     if (!checkLocalhostCodePoint(iterator, 't'))
1050         return false;
1051     return iterator.atEnd();
1052 }
1053
1054 bool URLParser::isLocalhost(StringView view)
1055 {
1056     if (view.is8Bit())
1057         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1058     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1059 }
1060
1061 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1062 {
1063     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1064         ASSERT(start + length <= m_asciiBuffer.size());
1065         return StringView(m_asciiBuffer.data() + start, length);
1066     }
1067     ASSERT(start + length <= m_inputString.length());
1068     return StringView(m_inputString).substring(start, length);
1069 }
1070
1071 template<typename CharacterType>
1072 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1073 {
1074     if (UNLIKELY(m_didSeeSyntaxViolation))
1075         return m_asciiBuffer.size();
1076     
1077     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1078 }
1079
1080 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1081     : m_inputString(input)
1082 {
1083     if (input.isNull()) {
1084         if (base.isValid() && !base.m_cannotBeABaseURL) {
1085             m_url = base;
1086             m_url.removeFragmentIdentifier();
1087         }
1088         return;
1089     }
1090
1091     if (input.is8Bit()) {
1092         m_inputBegin = input.characters8();
1093         parse(input.characters8(), input.length(), base, encoding);
1094     } else {
1095         m_inputBegin = input.characters16();
1096         parse(input.characters16(), input.length(), base, encoding);
1097     }
1098
1099     ASSERT(!m_url.m_isValid
1100         || m_didSeeSyntaxViolation == (m_url.string() != input)
1101         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1102             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1103     ASSERT(internalValuesConsistent(m_url));
1104 #if !ASSERT_DISABLED
1105     if (!m_didSeeSyntaxViolation) {
1106         // Force a syntax violation at the beginning to make sure we get the same result.
1107         URLParser parser(makeString(" ", input), base, encoding);
1108         URL parsed = parser.result();
1109         if (parsed.isValid())
1110             ASSERT(allValuesEqual(parser.result(), m_url));
1111     }
1112 #endif
1113 }
1114
1115 template<typename CharacterType>
1116 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1117 {
1118     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1119     m_url = { };
1120     ASSERT(m_asciiBuffer.isEmpty());
1121     
1122     bool isUTF8Encoding = encoding == UTF8Encoding();
1123     Vector<UChar> queryBuffer;
1124
1125     unsigned endIndex = length;
1126     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1127         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1128         endIndex--;
1129     }
1130     CodePointIterator<CharacterType> c(input, input + endIndex);
1131     CodePointIterator<CharacterType> authorityOrHostBegin;
1132     CodePointIterator<CharacterType> queryBegin;
1133     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1134         syntaxViolation(c);
1135         ++c;
1136     }
1137     auto beginAfterControlAndSpace = c;
1138
1139     enum class State : uint8_t {
1140         SchemeStart,
1141         Scheme,
1142         NoScheme,
1143         SpecialRelativeOrAuthority,
1144         PathOrAuthority,
1145         Relative,
1146         RelativeSlash,
1147         SpecialAuthoritySlashes,
1148         SpecialAuthorityIgnoreSlashes,
1149         AuthorityOrHost,
1150         Host,
1151         File,
1152         FileSlash,
1153         FileHost,
1154         PathStart,
1155         Path,
1156         CannotBeABaseURLPath,
1157         UTF8Query,
1158         NonUTF8Query,
1159         Fragment,
1160     };
1161
1162 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1163 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1164
1165     State state = State::SchemeStart;
1166     while (!c.atEnd()) {
1167         if (UNLIKELY(isTabOrNewline(*c))) {
1168             syntaxViolation(c);
1169             ++c;
1170             continue;
1171         }
1172
1173         switch (state) {
1174         case State::SchemeStart:
1175             LOG_STATE("SchemeStart");
1176             if (isASCIIAlpha(*c)) {
1177                 if (UNLIKELY(isASCIIUpper(*c)))
1178                     syntaxViolation(c);
1179                 appendToASCIIBuffer(toASCIILower(*c));
1180                 advance(c);
1181                 if (c.atEnd()) {
1182                     m_asciiBuffer.clear();
1183                     state = State::NoScheme;
1184                     c = beginAfterControlAndSpace;
1185                 }
1186                 state = State::Scheme;
1187             } else
1188                 state = State::NoScheme;
1189             break;
1190         case State::Scheme:
1191             LOG_STATE("Scheme");
1192             if (isValidSchemeCharacter(*c)) {
1193                 if (UNLIKELY(isASCIIUpper(*c)))
1194                     syntaxViolation(c);
1195                 appendToASCIIBuffer(toASCIILower(*c));
1196             } else if (*c == ':') {
1197                 m_url.m_schemeEnd = currentPosition(c);
1198                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1199                 appendToASCIIBuffer(':');
1200                 switch (scheme(urlScheme)) {
1201                 case Scheme::File:
1202                     m_urlIsSpecial = true;
1203                     state = State::File;
1204                     ++c;
1205                     break;
1206                 case Scheme::WS:
1207                 case Scheme::WSS:
1208                     isUTF8Encoding = true;
1209                     m_urlIsSpecial = true;
1210                     if (base.protocolIs(urlScheme))
1211                         state = State::SpecialRelativeOrAuthority;
1212                     else
1213                         state = State::SpecialAuthoritySlashes;
1214                     ++c;
1215                     break;
1216                 case Scheme::HTTP:
1217                 case Scheme::HTTPS:
1218                     m_url.m_protocolIsInHTTPFamily = true;
1219                     FALLTHROUGH;
1220                 case Scheme::FTP:
1221                 case Scheme::Gopher:
1222                     m_urlIsSpecial = true;
1223                     if (base.protocolIs(urlScheme))
1224                         state = State::SpecialRelativeOrAuthority;
1225                     else
1226                         state = State::SpecialAuthoritySlashes;
1227                     ++c;
1228                     break;
1229                 case Scheme::NonSpecial:
1230                     isUTF8Encoding = true;
1231                     auto maybeSlash = c;
1232                     advance(maybeSlash);
1233                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1234                         appendToASCIIBuffer('/');
1235                         c = maybeSlash;
1236                         state = State::PathOrAuthority;
1237                         ASSERT(*c == '/');
1238                         ++c;
1239                         m_url.m_userStart = currentPosition(c);
1240                     } else {
1241                         ++c;
1242                         m_url.m_userStart = currentPosition(c);
1243                         m_url.m_userEnd = m_url.m_userStart;
1244                         m_url.m_passwordEnd = m_url.m_userStart;
1245                         m_url.m_hostEnd = m_url.m_userStart;
1246                         m_url.m_portEnd = m_url.m_userStart;
1247                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1248                         m_url.m_cannotBeABaseURL = true;
1249                         state = State::CannotBeABaseURLPath;
1250                     }
1251                     break;
1252                 }
1253                 break;
1254             } else {
1255                 m_asciiBuffer.clear();
1256                 state = State::NoScheme;
1257                 c = beginAfterControlAndSpace;
1258                 break;
1259             }
1260             advance(c);
1261             if (c.atEnd()) {
1262                 m_asciiBuffer.clear();
1263                 state = State::NoScheme;
1264                 c = beginAfterControlAndSpace;
1265             }
1266             break;
1267         case State::NoScheme:
1268             LOG_STATE("NoScheme");
1269             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1270                 failure();
1271                 return;
1272             }
1273             if (base.m_cannotBeABaseURL && *c == '#') {
1274                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1275                 state = State::Fragment;
1276                 appendToASCIIBuffer('#');
1277                 ++c;
1278                 break;
1279             }
1280             if (!base.protocolIs("file")) {
1281                 state = State::Relative;
1282                 break;
1283             }
1284             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1285             appendToASCIIBuffer(':');
1286             state = State::File;
1287             break;
1288         case State::SpecialRelativeOrAuthority:
1289             LOG_STATE("SpecialRelativeOrAuthority");
1290             if (*c == '/') {
1291                 appendToASCIIBuffer('/');
1292                 advance(c);
1293                 if (c.atEnd()) {
1294                     failure();
1295                     return;
1296                 }
1297                 if (*c == '/') {
1298                     appendToASCIIBuffer('/');
1299                     state = State::SpecialAuthorityIgnoreSlashes;
1300                     ++c;
1301                 } else
1302                     state = State::RelativeSlash;
1303             } else
1304                 state = State::Relative;
1305             break;
1306         case State::PathOrAuthority:
1307             LOG_STATE("PathOrAuthority");
1308             if (*c == '/') {
1309                 appendToASCIIBuffer('/');
1310                 state = State::AuthorityOrHost;
1311                 advance(c);
1312                 m_url.m_userStart = currentPosition(c);
1313                 authorityOrHostBegin = c;
1314             } else {
1315                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1316                 m_url.m_userStart = currentPosition(c) - 1;
1317                 m_url.m_userEnd = m_url.m_userStart;
1318                 m_url.m_passwordEnd = m_url.m_userStart;
1319                 m_url.m_hostEnd = m_url.m_userStart;
1320                 m_url.m_portEnd = m_url.m_userStart;
1321                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1322                 state = State::Path;
1323             }
1324             break;
1325         case State::Relative:
1326             LOG_STATE("Relative");
1327             switch (*c) {
1328             case '/':
1329             case '\\':
1330                 state = State::RelativeSlash;
1331                 ++c;
1332                 break;
1333             case '?':
1334                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1335                 appendToASCIIBuffer('?');
1336                 ++c;
1337                 if (isUTF8Encoding)
1338                     state = State::UTF8Query;
1339                 else {
1340                     queryBegin = c;
1341                     state = State::NonUTF8Query;
1342                 }
1343                 break;
1344             case '#':
1345                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1346                 appendToASCIIBuffer('#');
1347                 state = State::Fragment;
1348                 ++c;
1349                 break;
1350             default:
1351                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1352                 state = State::Path;
1353                 break;
1354             }
1355             break;
1356         case State::RelativeSlash:
1357             LOG_STATE("RelativeSlash");
1358             if (*c == '/' || *c == '\\') {
1359                 ++c;
1360                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1361                 appendToASCIIBuffer("://", 3);
1362                 state = State::SpecialAuthorityIgnoreSlashes;
1363             } else {
1364                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1365                 appendToASCIIBuffer('/');
1366                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1367                 state = State::Path;
1368             }
1369             break;
1370         case State::SpecialAuthoritySlashes:
1371             LOG_STATE("SpecialAuthoritySlashes");
1372             if (LIKELY(*c == '/' || *c == '\\')) {
1373                 if (UNLIKELY(*c == '\\'))
1374                     syntaxViolation(c);
1375                 appendToASCIIBuffer('/');
1376                 advance(c);
1377                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1378                     if (UNLIKELY(*c == '\\'))
1379                         syntaxViolation(c);
1380                     ++c;
1381                     appendToASCIIBuffer('/');
1382                 } else {
1383                     syntaxViolation(c);
1384                     appendToASCIIBuffer('/');
1385                 }
1386             } else {
1387                 syntaxViolation(c);
1388                 appendToASCIIBuffer("//", 2);
1389             }
1390             state = State::SpecialAuthorityIgnoreSlashes;
1391             break;
1392         case State::SpecialAuthorityIgnoreSlashes:
1393             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1394             if (*c == '/' || *c == '\\') {
1395                 syntaxViolation(c);
1396                 ++c;
1397             } else {
1398                 m_url.m_userStart = currentPosition(c);
1399                 state = State::AuthorityOrHost;
1400                 authorityOrHostBegin = c;
1401             }
1402             break;
1403         case State::AuthorityOrHost:
1404             do {
1405                 LOG_STATE("AuthorityOrHost");
1406                 if (*c == '@') {
1407                     auto lastAt = c;
1408                     auto findLastAt = c;
1409                     while (!findLastAt.atEnd()) {
1410                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1411                         if (*findLastAt == '@')
1412                             lastAt = findLastAt;
1413                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1414                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1415                             break;
1416                         ++findLastAt;
1417                     }
1418                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1419                     c = lastAt;
1420                     advance(c);
1421                     authorityOrHostBegin = c;
1422                     state = State::Host;
1423                     m_hostHasPercentOrNonASCII = false;
1424                     break;
1425                 }
1426                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1427                 if (isSlash || *c == '?' || *c == '#') {
1428                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1429                     if (iterator.atEnd()) {
1430                         size_t position = currentPosition(c);
1431                         ASSERT(m_url.m_userStart == position);
1432                         RELEASE_ASSERT(position >= 2);
1433                         position -= 2;
1434                         ASSERT(parsedDataView(position, 2) == "//");
1435                         m_url.m_userStart = position;
1436                         m_url.m_userEnd = position;
1437                         m_url.m_passwordEnd = position;
1438                         m_url.m_hostEnd = position;
1439                         m_url.m_portEnd = position;
1440                         m_url.m_pathAfterLastSlash = position + 2;
1441                     } else {
1442                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1443                         m_url.m_passwordEnd = m_url.m_userEnd;
1444                         if (!parseHostAndPort(iterator)) {
1445                             failure();
1446                             return;
1447                         }
1448                         if (UNLIKELY(!isSlash)) {
1449                             syntaxViolation(c);
1450                             appendToASCIIBuffer('/');
1451                             m_url.m_pathAfterLastSlash = currentPosition(c);
1452                         }
1453                     }
1454                     state = State::Path;
1455                     break;
1456                 }
1457                 if (isPercentOrNonASCII(*c))
1458                     m_hostHasPercentOrNonASCII = true;
1459                 ++c;
1460             } while (!c.atEnd());
1461             break;
1462         case State::Host:
1463             do {
1464                 LOG_STATE("Host");
1465                 if (*c == '/' || *c == '?' || *c == '#') {
1466                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1467                         failure();
1468                         return;
1469                     }
1470                     if (*c == '?' || *c == '#') {
1471                         syntaxViolation(c);
1472                         appendToASCIIBuffer('/');
1473                         m_url.m_pathAfterLastSlash = currentPosition(c);
1474                     }
1475                     state = State::Path;
1476                     break;
1477                 }
1478                 if (isPercentOrNonASCII(*c))
1479                     m_hostHasPercentOrNonASCII = true;
1480                 ++c;
1481             } while (!c.atEnd());
1482             break;
1483         case State::File:
1484             LOG_STATE("File");
1485             switch (*c) {
1486             case '\\':
1487                 syntaxViolation(c);
1488                 FALLTHROUGH;
1489             case '/':
1490                 appendToASCIIBuffer('/');
1491                 state = State::FileSlash;
1492                 ++c;
1493                 break;
1494             case '?':
1495                 syntaxViolation(c);
1496                 if (base.isValid() && base.protocolIs("file")) {
1497                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1498                     appendToASCIIBuffer('?');
1499                     ++c;
1500                 } else {
1501                     appendToASCIIBuffer("///?", 4);
1502                     ++c;
1503                     m_url.m_userStart = currentPosition(c) - 2;
1504                     m_url.m_userEnd = m_url.m_userStart;
1505                     m_url.m_passwordEnd = m_url.m_userStart;
1506                     m_url.m_hostEnd = m_url.m_userStart;
1507                     m_url.m_portEnd = m_url.m_userStart;
1508                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1509                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1510                 }
1511                 if (isUTF8Encoding)
1512                     state = State::UTF8Query;
1513                 else {
1514                     queryBegin = c;
1515                     state = State::NonUTF8Query;
1516                 }
1517                 break;
1518             case '#':
1519                 syntaxViolation(c);
1520                 if (base.isValid() && base.protocolIs("file")) {
1521                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1522                     appendToASCIIBuffer('#');
1523                 } else {
1524                     appendToASCIIBuffer("///#", 4);
1525                     m_url.m_userStart = currentPosition(c) - 2;
1526                     m_url.m_userEnd = m_url.m_userStart;
1527                     m_url.m_passwordEnd = m_url.m_userStart;
1528                     m_url.m_hostEnd = m_url.m_userStart;
1529                     m_url.m_portEnd = m_url.m_userStart;
1530                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1531                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1532                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1533                 }
1534                 state = State::Fragment;
1535                 ++c;
1536                 break;
1537             default:
1538                 syntaxViolation(c);
1539                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1540                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1541                 else {
1542                     appendToASCIIBuffer("///", 3);
1543                     m_url.m_userStart = currentPosition(c) - 1;
1544                     m_url.m_userEnd = m_url.m_userStart;
1545                     m_url.m_passwordEnd = m_url.m_userStart;
1546                     m_url.m_hostEnd = m_url.m_userStart;
1547                     m_url.m_portEnd = m_url.m_userStart;
1548                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1549                     if (isWindowsDriveLetter(c))
1550                         appendWindowsDriveLetter(c);
1551                 }
1552                 state = State::Path;
1553                 break;
1554             }
1555             break;
1556         case State::FileSlash:
1557             LOG_STATE("FileSlash");
1558             if (LIKELY(*c == '/' || *c == '\\')) {
1559                 if (UNLIKELY(*c == '\\'))
1560                     syntaxViolation(c);
1561                 appendToASCIIBuffer('/');
1562                 advance(c);
1563                 m_url.m_userStart = currentPosition(c);
1564                 m_url.m_userEnd = m_url.m_userStart;
1565                 m_url.m_passwordEnd = m_url.m_userStart;
1566                 m_url.m_hostEnd = m_url.m_userStart;
1567                 m_url.m_portEnd = m_url.m_userStart;
1568                 authorityOrHostBegin = c;
1569                 state = State::FileHost;
1570                 break;
1571             }
1572             if (base.isValid() && base.protocolIs("file")) {
1573                 // FIXME: This String copy is unnecessary.
1574                 String basePath = base.path();
1575                 if (basePath.length() >= 2) {
1576                     bool windowsQuirk = basePath.is8Bit()
1577                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1578                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1579                     if (windowsQuirk) {
1580                         appendToASCIIBuffer(basePath[0]);
1581                         appendToASCIIBuffer(basePath[1]);
1582                     }
1583                 }
1584             }
1585             syntaxViolation(c);
1586             appendToASCIIBuffer("//", 2);
1587             m_url.m_userStart = currentPosition(c) - 1;
1588             m_url.m_userEnd = m_url.m_userStart;
1589             m_url.m_passwordEnd = m_url.m_userStart;
1590             m_url.m_hostEnd = m_url.m_userStart;
1591             m_url.m_portEnd = m_url.m_userStart;
1592             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1593             if (isWindowsDriveLetter(c))
1594                 appendWindowsDriveLetter(c);
1595             state = State::Path;
1596             break;
1597         case State::FileHost:
1598             do {
1599                 LOG_STATE("FileHost");
1600                 if (isSlashQuestionOrHash(*c)) {
1601                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1602                         && isWindowsDriveLetter(authorityOrHostBegin);
1603                     if (windowsQuirk) {
1604                         syntaxViolation(authorityOrHostBegin);
1605                         appendToASCIIBuffer('/');
1606                         appendWindowsDriveLetter(authorityOrHostBegin);
1607                     }
1608                     if (windowsQuirk || authorityOrHostBegin == c) {
1609                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1610                         if (UNLIKELY(*c == '?')) {
1611                             syntaxViolation(c);
1612                             appendToASCIIBuffer("/?", 2);
1613                             ++c;
1614                             if (isUTF8Encoding)
1615                                 state = State::UTF8Query;
1616                             else {
1617                                 queryBegin = c;
1618                                 state = State::NonUTF8Query;
1619                             }
1620                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1621                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1622                             break;
1623                         }
1624                         if (UNLIKELY(*c == '#')) {
1625                             syntaxViolation(c);
1626                             appendToASCIIBuffer("/#", 2);
1627                             ++c;
1628                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1629                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1630                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1631                             state = State::Fragment;
1632                             break;
1633                         }
1634                         state = State::Path;
1635                         break;
1636                     }
1637                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1638                         failure();
1639                         return;
1640                     }
1641                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1642                         syntaxViolation(c);
1643                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1644                         m_url.m_hostEnd = currentPosition(c);
1645                         m_url.m_portEnd = m_url.m_hostEnd;
1646                     }
1647                     
1648                     state = State::PathStart;
1649                     break;
1650                 }
1651                 if (isPercentOrNonASCII(*c))
1652                     m_hostHasPercentOrNonASCII = true;
1653                 ++c;
1654             } while (!c.atEnd());
1655             break;
1656         case State::PathStart:
1657             LOG_STATE("PathStart");
1658             if (*c != '/' && *c != '\\')
1659                 ++c;
1660             state = State::Path;
1661             break;
1662         case State::Path:
1663             LOG_STATE("Path");
1664             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1665                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1666                     syntaxViolation(c);
1667                 appendToASCIIBuffer('/');
1668                 ++c;
1669                 m_url.m_pathAfterLastSlash = currentPosition(c);
1670                 break;
1671             }
1672             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1673                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1674                     syntaxViolation(c);
1675                     consumeDoubleDotPathSegment(c);
1676                     popPath();
1677                     break;
1678                 }
1679                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1680                     syntaxViolation(c);
1681                     consumeSingleDotPathSegment(c);
1682                     break;
1683                 }
1684             }
1685             if (*c == '?') {
1686                 m_url.m_pathEnd = currentPosition(c);
1687                 appendToASCIIBuffer('?');
1688                 ++c;
1689                 if (isUTF8Encoding)
1690                     state = State::UTF8Query;
1691                 else {
1692                     queryBegin = c;
1693                     state = State::NonUTF8Query;
1694                 }
1695                 break;
1696             }
1697             if (*c == '#') {
1698                 m_url.m_pathEnd = currentPosition(c);
1699                 m_url.m_queryEnd = m_url.m_pathEnd;
1700                 state = State::Fragment;
1701                 break;
1702             }
1703             utf8PercentEncode<isInDefaultEncodeSet>(c);
1704             ++c;
1705             break;
1706         case State::CannotBeABaseURLPath:
1707             LOG_STATE("CannotBeABaseURLPath");
1708             if (*c == '?') {
1709                 m_url.m_pathEnd = currentPosition(c);
1710                 appendToASCIIBuffer('?');
1711                 ++c;
1712                 if (isUTF8Encoding)
1713                     state = State::UTF8Query;
1714                 else {
1715                     queryBegin = c;
1716                     state = State::NonUTF8Query;
1717                 }
1718             } else if (*c == '#') {
1719                 m_url.m_pathEnd = currentPosition(c);
1720                 m_url.m_queryEnd = m_url.m_pathEnd;
1721                 state = State::Fragment;
1722             } else if (*c == '/') {
1723                 appendToASCIIBuffer('/');
1724                 ++c;
1725                 m_url.m_pathAfterLastSlash = currentPosition(c);
1726             } else {
1727                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1728                 ++c;
1729             }
1730             break;
1731         case State::UTF8Query:
1732             LOG_STATE("UTF8Query");
1733             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1734             if (*c == '#') {
1735                 m_url.m_queryEnd = currentPosition(c);
1736                 state = State::Fragment;
1737                 break;
1738             }
1739             if (isUTF8Encoding)
1740                 utf8QueryEncode(c);
1741             else
1742                 appendCodePoint(queryBuffer, *c);
1743             ++c;
1744             break;
1745         case State::NonUTF8Query:
1746             do {
1747                 LOG_STATE("NonUTF8Query");
1748                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1749                 if (*c == '#') {
1750                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1751                     m_url.m_queryEnd = currentPosition(c);
1752                     state = State::Fragment;
1753                     break;
1754                 }
1755                 appendCodePoint(queryBuffer, *c);
1756                 advance(c, queryBegin);
1757             } while (!c.atEnd());
1758             break;
1759         case State::Fragment:
1760             URL_PARSER_LOG("State Fragment");
1761             utf8PercentEncode<isInSimpleEncodeSet>(c);
1762             ++c;
1763             break;
1764         }
1765     }
1766
1767     switch (state) {
1768     case State::SchemeStart:
1769         LOG_FINAL_STATE("SchemeStart");
1770         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1771             m_url = base;
1772             m_url.removeFragmentIdentifier();
1773             return;
1774         }
1775         failure();
1776         return;
1777     case State::Scheme:
1778         LOG_FINAL_STATE("Scheme");
1779         failure();
1780         return;
1781     case State::NoScheme:
1782         LOG_FINAL_STATE("NoScheme");
1783         RELEASE_ASSERT_NOT_REACHED();
1784     case State::SpecialRelativeOrAuthority:
1785         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1786         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1787         m_url.m_fragmentEnd = m_url.m_queryEnd;
1788         break;
1789     case State::PathOrAuthority:
1790         LOG_FINAL_STATE("PathOrAuthority");
1791         ASSERT(m_url.m_userStart);
1792         ASSERT(m_url.m_userStart == currentPosition(c));
1793         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1794         m_url.m_userStart--;
1795         m_url.m_userEnd = m_url.m_userStart;
1796         m_url.m_passwordEnd = m_url.m_userStart;
1797         m_url.m_hostEnd = m_url.m_userStart;
1798         m_url.m_portEnd = m_url.m_userStart;
1799         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1800         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1801         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1802         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1803         break;
1804     case State::Relative:
1805         LOG_FINAL_STATE("Relative");
1806         RELEASE_ASSERT_NOT_REACHED();
1807     case State::RelativeSlash:
1808         LOG_FINAL_STATE("RelativeSlash");
1809         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1810         appendToASCIIBuffer('/');
1811         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1812         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1813         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1814         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1815         break;
1816     case State::SpecialAuthoritySlashes:
1817         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1818         m_url.m_userStart = currentPosition(c);
1819         m_url.m_userEnd = m_url.m_userStart;
1820         m_url.m_passwordEnd = m_url.m_userStart;
1821         m_url.m_hostEnd = m_url.m_userStart;
1822         m_url.m_portEnd = m_url.m_userStart;
1823         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1824         m_url.m_pathEnd = m_url.m_userStart;
1825         m_url.m_queryEnd = m_url.m_userStart;
1826         m_url.m_fragmentEnd = m_url.m_userStart;
1827         break;
1828     case State::SpecialAuthorityIgnoreSlashes:
1829         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1830         failure();
1831         return;
1832         break;
1833     case State::AuthorityOrHost:
1834         LOG_FINAL_STATE("AuthorityOrHost");
1835         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1836         m_url.m_passwordEnd = m_url.m_userEnd;
1837         if (authorityOrHostBegin.atEnd()) {
1838             RELEASE_ASSERT(m_url.m_userStart >= 2);
1839             ASSERT(parsedDataView(m_url.m_userStart - 2, 2) == "//");
1840             m_url.m_userStart -= 2;
1841             m_url.m_userEnd = m_url.m_userStart;
1842             m_url.m_passwordEnd = m_url.m_userStart;
1843             m_url.m_hostEnd = m_url.m_userStart;
1844             m_url.m_portEnd = m_url.m_userStart;
1845             m_url.m_pathEnd = m_url.m_userStart + 2;
1846         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1847             failure();
1848             return;
1849         } else {
1850             if (m_urlIsSpecial) {
1851                 syntaxViolation(c);
1852                 appendToASCIIBuffer('/');
1853                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1854             } else
1855                 m_url.m_pathEnd = m_url.m_portEnd;
1856         }
1857         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1858         m_url.m_queryEnd = m_url.m_pathEnd;
1859         m_url.m_fragmentEnd = m_url.m_pathEnd;
1860         break;
1861     case State::Host:
1862         LOG_FINAL_STATE("Host");
1863         if (!parseHostAndPort(authorityOrHostBegin)) {
1864             failure();
1865             return;
1866         }
1867         if (m_urlIsSpecial) {
1868             syntaxViolation(c);
1869             appendToASCIIBuffer('/');
1870             m_url.m_pathEnd = m_url.m_portEnd + 1;
1871         } else
1872             m_url.m_pathEnd = m_url.m_portEnd;
1873         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1874         m_url.m_queryEnd = m_url.m_pathEnd;
1875         m_url.m_fragmentEnd = m_url.m_pathEnd;
1876         break;
1877     case State::File:
1878         LOG_FINAL_STATE("File");
1879         if (base.isValid() && base.protocolIs("file")) {
1880             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1881             appendToASCIIBuffer(':');
1882         }
1883         syntaxViolation(c);
1884         appendToASCIIBuffer("///", 3);
1885         m_url.m_userStart = currentPosition(c) - 1;
1886         m_url.m_userEnd = m_url.m_userStart;
1887         m_url.m_passwordEnd = m_url.m_userStart;
1888         m_url.m_hostEnd = m_url.m_userStart;
1889         m_url.m_portEnd = m_url.m_userStart;
1890         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1891         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1892         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1893         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1894         break;
1895     case State::FileSlash:
1896         LOG_FINAL_STATE("FileSlash");
1897         syntaxViolation(c);
1898         m_url.m_userStart = currentPosition(c) + 1;
1899         appendToASCIIBuffer("//", 2);
1900         m_url.m_userEnd = m_url.m_userStart;
1901         m_url.m_passwordEnd = m_url.m_userStart;
1902         m_url.m_hostEnd = m_url.m_userStart;
1903         m_url.m_portEnd = m_url.m_userStart;
1904         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1905         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1906         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1907         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1908         break;
1909     case State::FileHost:
1910         LOG_FINAL_STATE("FileHost");
1911         if (authorityOrHostBegin == c) {
1912             syntaxViolation(c);
1913             appendToASCIIBuffer('/');
1914             m_url.m_userStart = currentPosition(c) - 1;
1915             m_url.m_userEnd = m_url.m_userStart;
1916             m_url.m_passwordEnd = m_url.m_userStart;
1917             m_url.m_hostEnd = m_url.m_userStart;
1918             m_url.m_portEnd = m_url.m_userStart;
1919             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1920             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1921             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1922             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1923             break;
1924         }
1925
1926         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1927             failure();
1928             return;
1929         }
1930
1931         syntaxViolation(c);
1932         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1933             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1934             m_url.m_hostEnd = currentPosition(c);
1935             m_url.m_portEnd = m_url.m_hostEnd;
1936         }
1937         appendToASCIIBuffer('/');
1938         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
1939         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1940         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1941         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1942         break;
1943     case State::PathStart:
1944         LOG_FINAL_STATE("PathStart");
1945         RELEASE_ASSERT_NOT_REACHED();
1946     case State::Path:
1947         LOG_FINAL_STATE("Path");
1948         m_url.m_pathEnd = currentPosition(c);
1949         m_url.m_queryEnd = m_url.m_pathEnd;
1950         m_url.m_fragmentEnd = m_url.m_pathEnd;
1951         break;
1952     case State::CannotBeABaseURLPath:
1953         LOG_FINAL_STATE("CannotBeABaseURLPath");
1954         m_url.m_pathEnd = currentPosition(c);
1955         m_url.m_queryEnd = m_url.m_pathEnd;
1956         m_url.m_fragmentEnd = m_url.m_pathEnd;
1957         break;
1958     case State::UTF8Query:
1959         LOG_FINAL_STATE("UTF8Query");
1960         ASSERT(queryBegin == CodePointIterator<CharacterType>());
1961         m_url.m_queryEnd = currentPosition(c);
1962         m_url.m_fragmentEnd = m_url.m_queryEnd;
1963         break;
1964     case State::NonUTF8Query:
1965         LOG_FINAL_STATE("NonUTF8Query");
1966         ASSERT(queryBegin != CodePointIterator<CharacterType>());
1967         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1968         m_url.m_queryEnd = currentPosition(c);
1969         m_url.m_fragmentEnd = m_url.m_queryEnd;
1970         break;
1971     case State::Fragment:
1972         LOG_FINAL_STATE("Fragment");
1973         m_url.m_fragmentEnd = currentPosition(c);
1974         break;
1975     }
1976
1977     if (LIKELY(!m_didSeeSyntaxViolation)) {
1978         m_url.m_string = m_inputString;
1979         ASSERT(m_asciiBuffer.isEmpty());
1980     } else
1981         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1982     m_url.m_isValid = true;
1983     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
1984 }
1985
1986 template<typename CharacterType>
1987 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1988 {
1989     if (UNLIKELY(iterator.atEnd())) {
1990         syntaxViolation(iterator);
1991         m_url.m_userEnd = currentPosition(iterator);
1992         m_url.m_passwordEnd = m_url.m_userEnd;
1993         return;
1994     }
1995     for (; !iterator.atEnd(); advance(iterator)) {
1996         if (*iterator == ':') {
1997             m_url.m_userEnd = currentPosition(iterator);
1998             auto iteratorAtColon = iterator;
1999             ++iterator;
2000             bool tabOrNewlineAfterColon = false;
2001             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2002                 tabOrNewlineAfterColon = true;
2003                 ++iterator;
2004             }
2005             if (UNLIKELY(iterator.atEnd())) {
2006                 syntaxViolation(iteratorAtColon);
2007                 m_url.m_passwordEnd = m_url.m_userEnd;
2008                 if (m_url.m_userEnd > m_url.m_userStart)
2009                     appendToASCIIBuffer('@');
2010                 return;
2011             }
2012             if (tabOrNewlineAfterColon)
2013                 syntaxViolation(iteratorAtColon);
2014             appendToASCIIBuffer(':');
2015             break;
2016         }
2017         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2018     }
2019     for (; !iterator.atEnd(); advance(iterator))
2020         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2021     m_url.m_passwordEnd = currentPosition(iterator);
2022     if (!m_url.m_userEnd)
2023         m_url.m_userEnd = m_url.m_passwordEnd;
2024     appendToASCIIBuffer('@');
2025 }
2026
2027 template<typename UnsignedIntegerType>
2028 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2029 {
2030     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2031     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
2032     LChar* p = end;
2033     do {
2034         *--p = (number % 10) + '0';
2035         number /= 10;
2036     } while (number);
2037     appendToASCIIBuffer(p, end - p);
2038 }
2039
2040 void URLParser::serializeIPv4(IPv4Address address)
2041 {
2042     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2043     appendToASCIIBuffer('.');
2044     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2045     appendToASCIIBuffer('.');
2046     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2047     appendToASCIIBuffer('.');
2048     appendNumberToASCIIBuffer<uint8_t>(address);
2049 }
2050     
// Returns how many consecutive zero pieces |address| holds starting at index |begin|
// (0 when address[begin] is non-zero or |begin| is not below 8).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2060
2061 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2062 {
2063     Optional<size_t> longest;
2064     size_t longestLength = 0;
2065     for (size_t i = 0; i < 8; i++) {
2066         size_t length = zeroSequenceLength(address, i);
2067         if (length) {
2068             if (length > 1 && (!longest || longestLength < length)) {
2069                 longest = i;
2070                 longestLength = length;
2071             }
2072             i += length;
2073         }
2074     }
2075     return longest;
2076 }
2077
2078 void URLParser::serializeIPv6Piece(uint16_t piece)
2079 {
2080     bool printed = false;
2081     if (auto nibble0 = piece >> 12) {
2082         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2083         printed = true;
2084     }
2085     auto nibble1 = piece >> 8 & 0xF;
2086     if (printed || nibble1) {
2087         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2088         printed = true;
2089     }
2090     auto nibble2 = piece >> 4 & 0xF;
2091     if (printed || nibble2)
2092         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2093     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2094 }
2095
2096 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2097 {
2098     appendToASCIIBuffer('[');
2099     auto compressPointer = findLongestZeroSequence(address);
2100     for (size_t piece = 0; piece < 8; piece++) {
2101         if (compressPointer && compressPointer.value() == piece) {
2102             ASSERT(!address[piece]);
2103             if (piece)
2104                 appendToASCIIBuffer(':');
2105             else
2106                 appendToASCIIBuffer("::", 2);
2107             while (piece < 8 && !address[piece])
2108                 piece++;
2109             if (piece == 8)
2110                 break;
2111         }
2112         serializeIPv6Piece(address[piece]);
2113         if (piece < 7)
2114             appendToASCIIBuffer(':');
2115     }
2116     appendToASCIIBuffer(']');
2117 }
2118
2119 template<typename CharacterType>
2120 Optional<uint32_t> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2121 {
2122     enum class State : uint8_t {
2123         UnknownBase,
2124         Decimal,
2125         OctalOrHex,
2126         Octal,
2127         Hex,
2128     };
2129     State state = State::UnknownBase;
2130     Checked<uint32_t, RecordOverflow> value = 0;
2131     if (!iterator.atEnd() && *iterator == '.')
2132         return Nullopt;
2133     while (!iterator.atEnd()) {
2134         if (isTabOrNewline(*iterator)) {
2135             didSeeSyntaxViolation = true;
2136             ++iterator;
2137             continue;
2138         }
2139         if (*iterator == '.') {
2140             ASSERT(!value.hasOverflowed());
2141             return value.unsafeGet();
2142         }
2143         switch (state) {
2144         case State::UnknownBase:
2145             if (UNLIKELY(*iterator == '0')) {
2146                 ++iterator;
2147                 state = State::OctalOrHex;
2148                 break;
2149             }
2150             state = State::Decimal;
2151             break;
2152         case State::OctalOrHex:
2153             didSeeSyntaxViolation = true;
2154             if (*iterator == 'x' || *iterator == 'X') {
2155                 ++iterator;
2156                 state = State::Hex;
2157                 break;
2158             }
2159             state = State::Octal;
2160             break;
2161         case State::Decimal:
2162             if (*iterator < '0' || *iterator > '9')
2163                 return Nullopt;
2164             value *= 10;
2165             value += *iterator - '0';
2166             if (UNLIKELY(value.hasOverflowed()))
2167                 return Nullopt;
2168             ++iterator;
2169             break;
2170         case State::Octal:
2171             ASSERT(didSeeSyntaxViolation);
2172             if (*iterator < '0' || *iterator > '7')
2173                 return Nullopt;
2174             value *= 8;
2175             value += *iterator - '0';
2176             if (UNLIKELY(value.hasOverflowed()))
2177                 return Nullopt;
2178             ++iterator;
2179             break;
2180         case State::Hex:
2181             ASSERT(didSeeSyntaxViolation);
2182             if (!isASCIIHexDigit(*iterator))
2183                 return Nullopt;
2184             value *= 16;
2185             value += toASCIIHexValue(*iterator);
2186             if (UNLIKELY(value.hasOverflowed()))
2187                 return Nullopt;
2188             ++iterator;
2189             break;
2190         }
2191     }
2192     ASSERT(!value.hasOverflowed());
2193     return value.unsafeGet();
2194 }
2195
2196 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2197 {
2198     RELEASE_ASSERT(exponent <= 4);
2199     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2200     return values[exponent];
2201 }
2202
2203 template<typename CharacterType>
2204 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2205 {
2206     auto hostBegin = iterator;
2207
2208     Vector<uint32_t, 4> items;
2209     items.reserveInitialCapacity(4);
2210     bool didSeeSyntaxViolation = false;
2211     while (!iterator.atEnd()) {
2212         if (isTabOrNewline(*iterator)) {
2213             didSeeSyntaxViolation = true;
2214             ++iterator;
2215             continue;
2216         }
2217         if (items.size() >= 4)
2218             return Nullopt;
2219         if (auto item = parseIPv4Piece(iterator, didSeeSyntaxViolation))
2220             items.append(item.value());
2221         else
2222             return Nullopt;
2223         if (!iterator.atEnd()) {
2224             if (items.size() >= 4)
2225                 return Nullopt;
2226             if (*iterator == '.')
2227                 ++iterator;
2228             else
2229                 return Nullopt;
2230         }
2231     }
2232     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2233         return Nullopt;
2234     if (items.size() > 1) {
2235         for (size_t i = 0; i < items.size() - 1; i++) {
2236             if (items[i] > 255)
2237                 return Nullopt;
2238         }
2239     }
2240     if (items[items.size() - 1] >= pow256(5 - items.size()))
2241         return Nullopt;
2242
2243     if (didSeeSyntaxViolation)
2244         syntaxViolation(hostBegin);
2245     for (auto item : items) {
2246         if (item > 255)
2247             syntaxViolation(hostBegin);
2248     }
2249
2250     if (UNLIKELY(items.size() != 4))
2251         syntaxViolation(hostBegin);
2252
2253     IPv4Address ipv4 = items.takeLast();
2254     for (size_t counter = 0; counter < items.size(); ++counter)
2255         ipv4 += items[counter] * pow256(3 - counter);
2256     return ipv4;
2257 }
2258
2259 template<typename CharacterType>
2260 Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2261 {
2262     if (iterator.atEnd())
2263         return Nullopt;
2264     uint32_t piece = 0;
2265     bool leadingZeros = false;
2266     size_t digitCount = 0;
2267     while (!iterator.atEnd()) {
2268         if (!isASCIIDigit(*iterator))
2269             return Nullopt;
2270         ++digitCount;
2271         if (!piece && *iterator == '0') {
2272             if (leadingZeros)
2273                 return Nullopt;
2274             leadingZeros = true;
2275         }
2276         if (!piece && *iterator == '0')
2277             leadingZeros = true;
2278         piece = piece * 10 + *iterator - '0';
2279         if (piece > 255)
2280             return Nullopt;
2281         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2282         if (iterator.atEnd())
2283             break;
2284         if (*iterator == '.')
2285             break;
2286     }
2287     if (piece && leadingZeros)
2288         return Nullopt;
2289     return piece;
2290 }
2291
2292 template<typename CharacterType>
2293 Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2294 {
2295     IPv4Address address = 0;
2296     for (size_t i = 0; i < 4; ++i) {
2297         if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2298             address = (address << 8) + piece.value();
2299         else
2300             return Nullopt;
2301         if (i < 3) {
2302             if (iterator.atEnd())
2303                 return Nullopt;
2304             if (*iterator != '.')
2305                 return Nullopt;
2306             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2307         } else if (!iterator.atEnd())
2308             return Nullopt;
2309     }
2310     ASSERT(iterator.atEnd());
2311     return address;
2312 }
2313
2314 template<typename CharacterType>
2315 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2316 {
2317     ASSERT(*c == '[');
2318     auto hostBegin = c;
2319     advance(c, hostBegin);
2320     if (c.atEnd())
2321         return Nullopt;
2322
2323     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2324     size_t piecePointer = 0;
2325     Optional<size_t> compressPointer;
2326
2327     if (*c == ':') {
2328         advance(c, hostBegin);
2329         if (c.atEnd())
2330             return Nullopt;
2331         if (*c != ':')
2332             return Nullopt;
2333         advance(c, hostBegin);
2334         ++piecePointer;
2335         compressPointer = piecePointer;
2336     }
2337     
2338     while (!c.atEnd()) {
2339         if (piecePointer == 8)
2340             return Nullopt;
2341         if (*c == ':') {
2342             if (compressPointer)
2343                 return Nullopt;
2344             advance(c, hostBegin);
2345             ++piecePointer;
2346             compressPointer = piecePointer;
2347             continue;
2348         }
2349         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2350             if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2351                 if (compressPointer && piecePointer == 5)
2352                     return Nullopt;
2353                 syntaxViolation(hostBegin);
2354                 address[piecePointer++] = ipv4Address.value() >> 16;
2355                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2356                 c = { };
2357                 break;
2358             }
2359         }
2360         uint16_t value = 0;
2361         size_t length = 0;
2362         bool leadingZeros = false;
2363         for (; length < 4; length++) {
2364             if (c.atEnd())
2365                 break;
2366             if (!isASCIIHexDigit(*c))
2367                 break;
2368             if (isASCIIUpper(*c))
2369                 syntaxViolation(hostBegin);
2370             if (*c == '0' && !length)
2371                 leadingZeros = true;
2372             value = value * 0x10 + toASCIIHexValue(*c);
2373             advance(c, hostBegin);
2374         }
2375         
2376         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2377             syntaxViolation(hostBegin);
2378
2379         address[piecePointer++] = value;
2380         if (c.atEnd())
2381             break;
2382         if (piecePointer == 8 || *c != ':')
2383             return Nullopt;
2384         advance(c, hostBegin);
2385     }
2386     
2387     if (!c.atEnd())
2388         return Nullopt;
2389     
2390     if (compressPointer) {
2391         size_t swaps = piecePointer - compressPointer.value();
2392         piecePointer = 7;
2393         while (swaps)
2394             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2395     } else if (piecePointer != 8)
2396         return Nullopt;
2397
2398     Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2399     if (possibleCompressPointer)
2400         possibleCompressPointer.value()++;
2401     if (UNLIKELY(compressPointer != possibleCompressPointer))
2402         syntaxViolation(hostBegin);
2403     
2404     return address;
2405 }
2406
2407 template<typename CharacterType>
2408 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2409 {
2410     Vector<LChar, defaultInlineBufferSize> output;
2411     output.reserveInitialCapacity(length);
2412     
2413     for (size_t i = 0; i < length; ++i) {
2414         uint8_t byte = input[i];
2415         if (byte != '%')
2416             output.uncheckedAppend(byte);
2417         else if (length > 2 && i < length - 2) {
2418             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2419                 syntaxViolation(iteratorForSyntaxViolationPosition);
2420                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2421                 i += 2;
2422             } else
2423                 output.uncheckedAppend(byte);
2424         } else
2425             output.uncheckedAppend(byte);
2426     }
2427     return output;
2428 }
2429     
2430 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2431 {
2432     Vector<LChar, defaultInlineBufferSize> output;
2433     output.reserveInitialCapacity(length);
2434     
2435     for (size_t i = 0; i < length; ++i) {
2436         uint8_t byte = input[i];
2437         if (byte != '%')
2438             output.uncheckedAppend(byte);
2439         else if (length > 2 && i < length - 2) {
2440             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2441                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2442                 i += 2;
2443             } else
2444                 output.uncheckedAppend(byte);
2445         } else
2446             output.uncheckedAppend(byte);
2447     }
2448     return output;
2449 }
2450
2451 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2452 {
2453     if (string.is8Bit())
2454         return charactersAreAllASCII(string.characters8(), string.length());
2455     return charactersAreAllASCII(string.characters16(), string.length());
2456 }
2457
2458 template<typename CharacterType>
2459 Optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2460 {
2461     Vector<LChar, defaultInlineBufferSize> ascii;
2462     if (containsOnlyASCII(domain)) {
2463         size_t length = domain.length();
2464         if (domain.is8Bit()) {
2465             const LChar* characters = domain.characters8();
2466             ascii.reserveInitialCapacity(length);
2467             for (size_t i = 0; i < length; ++i) {
2468                 if (UNLIKELY(isASCIIUpper(characters[i])))
2469                     syntaxViolation(iteratorForSyntaxViolationPosition);
2470                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2471             }
2472         } else {
2473             const UChar* characters = domain.characters16();
2474             ascii.reserveInitialCapacity(length);
2475             for (size_t i = 0; i < length; ++i) {
2476                 if (UNLIKELY(isASCIIUpper(characters[i])))
2477                     syntaxViolation(iteratorForSyntaxViolationPosition);
2478                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2479             }
2480         }
2481         return ascii;
2482     }
2483     
2484     UChar hostnameBuffer[defaultInlineBufferSize];
2485     UErrorCode error = U_ZERO_ERROR;
2486
2487 #if COMPILER(GCC) || COMPILER(CLANG)
2488 #pragma GCC diagnostic push
2489 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2490 #endif
2491     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2492     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2493 #if COMPILER(GCC) || COMPILER(CLANG)
2494 #pragma GCC diagnostic pop
2495 #endif
2496     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2497
2498     if (error == U_ZERO_ERROR) {
2499         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2500             ASSERT(isASCII(hostnameBuffer[i]));
2501             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2502         }
2503         ascii.append(hostnameBuffer, numCharactersConverted);
2504         if (domain != StringView(ascii.data(), ascii.size()))
2505             syntaxViolation(iteratorForSyntaxViolationPosition);
2506         return ascii;
2507     }
2508
2509     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2510     return Nullopt;
2511 }
2512
2513 bool URLParser::hasInvalidDomainCharacter(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2514 {
2515     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2516         if (isInvalidDomainCharacter(asciiDomain[i]))
2517             return true;
2518     }
2519     return false;
2520 }
2521
2522 template<typename CharacterType>
2523 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2524 {
2525     ASSERT(*iterator == ':');
2526     auto colonIterator = iterator;
2527     advance(iterator, colonIterator);
2528     uint32_t port = 0;
2529     if (UNLIKELY(iterator.atEnd())) {
2530         m_url.m_portEnd = currentPosition(colonIterator);
2531         syntaxViolation(colonIterator);
2532         return true;
2533     }
2534     size_t digitCount = 0;
2535     bool leadingZeros = false;
2536     for (; !iterator.atEnd(); ++iterator) {
2537         if (UNLIKELY(isTabOrNewline(*iterator))) {
2538             syntaxViolation(colonIterator);
2539             continue;
2540         }
2541         if (isASCIIDigit(*iterator)) {
2542             if (*iterator == '0' && !digitCount)
2543                 leadingZeros = true;
2544             ++digitCount;
2545             port = port * 10 + *iterator - '0';
2546             if (port > std::numeric_limits<uint16_t>::max())
2547                 return false;
2548         } else
2549             return false;
2550     }
2551
2552     if (port && leadingZeros)
2553         syntaxViolation(colonIterator);
2554     
2555     if (!port && digitCount > 1)
2556         syntaxViolation(colonIterator);
2557
2558     ASSERT(port == static_cast<uint16_t>(port));
2559     if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2560         syntaxViolation(colonIterator);
2561     else {
2562         appendToASCIIBuffer(':');
2563         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2564         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2565     }
2566
2567     m_url.m_portEnd = currentPosition(iterator);
2568     return true;
2569 }
2570
2571 template<typename CharacterType>
2572 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2573 {
2574     if (iterator.atEnd())
2575         return false;
2576     if (*iterator == ':')
2577         return false;
2578     if (*iterator == '[') {
2579         auto ipv6End = iterator;
2580         while (!ipv6End.atEnd() && *ipv6End != ']')
2581             ++ipv6End;
2582         if (ipv6End.atEnd())
2583             return false;
2584         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2585             serializeIPv6(address.value());
2586             if (!ipv6End.atEnd()) {
2587                 advance(ipv6End);
2588                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2589                     m_url.m_hostEnd = currentPosition(ipv6End);
2590                     return parsePort(ipv6End);
2591                 }
2592                 m_url.m_hostEnd = currentPosition(ipv6End);
2593                 m_url.m_portEnd = m_url.m_hostEnd;
2594                 return true;
2595             }
2596             m_url.m_hostEnd = currentPosition(ipv6End);
2597             return true;
2598         }
2599         return false;
2600     }
2601
2602     if (!m_urlIsSpecial) {
2603         for (; !iterator.atEnd(); ++iterator) {
2604             if (UNLIKELY(isTabOrNewline(*iterator))) {
2605                 syntaxViolation(iterator);
2606                 continue;
2607             }
2608             if (*iterator == ':')
2609                 break;
2610             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2611         }
2612         m_url.m_hostEnd = currentPosition(iterator);
2613         if (iterator.atEnd()) {
2614             m_url.m_portEnd = currentPosition(iterator);
2615             return true;
2616         }
2617         return parsePort(iterator);
2618     }
2619     
2620     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2621         auto hostIterator = iterator;
2622         for (; !iterator.atEnd(); ++iterator) {
2623             if (isTabOrNewline(*iterator))
2624                 continue;
2625             if (*iterator == ':')
2626                 break;
2627             if (isInvalidDomainCharacter(*iterator))
2628                 return false;
2629         }
2630         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2631             serializeIPv4(address.value());
2632             m_url.m_hostEnd = currentPosition(iterator);
2633             if (iterator.atEnd()) {
2634                 m_url.m_portEnd = currentPosition(iterator);
2635                 return true;
2636             }
2637             return parsePort(iterator);
2638         }
2639         for (; hostIterator != iterator; ++hostIterator) {
2640             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2641                 syntaxViolation(hostIterator);
2642                 continue;
2643             }
2644             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2645                 syntaxViolation(hostIterator);
2646             appendToASCIIBuffer(toASCIILower(*hostIterator));
2647         }
2648         m_url.m_hostEnd = currentPosition(iterator);
2649         if (!hostIterator.atEnd())
2650             return parsePort(hostIterator);
2651         m_url.m_portEnd = currentPosition(iterator);
2652         return true;
2653     }
2654     
2655     auto hostBegin = iterator;
2656     
2657     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2658     for (; !iterator.atEnd(); ++iterator) {
2659         if (UNLIKELY(isTabOrNewline(*iterator))) {
2660             syntaxViolation(hostBegin);
2661             continue;
2662         }
2663         if (*iterator == ':')
2664             break;
2665         if (UNLIKELY(!isASCII(*iterator)))
2666             syntaxViolation(hostBegin);
2667
2668         uint8_t buffer[U8_MAX_LENGTH];
2669         int32_t offset = 0;
2670         UBool error = false;
2671         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2672         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2673         // FIXME: Check error.
2674         utf8Encoded.append(buffer, offset);
2675     }
2676     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2677     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2678     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2679         syntaxViolation(hostBegin);
2680     auto asciiDomain = domainToASCII(domain, hostBegin);
2681     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2682         return false;
2683     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2684     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2685
2686     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2687         serializeIPv4(address.value());
2688         m_url.m_hostEnd = currentPosition(iterator);
2689         if (iterator.atEnd()) {
2690             m_url.m_portEnd = currentPosition(iterator);
2691             return true;
2692         }
2693         return parsePort(iterator);
2694     }
2695
2696     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2697     m_url.m_hostEnd = currentPosition(iterator);
2698     if (!iterator.atEnd())
2699         return parsePort(iterator);
2700     m_url.m_portEnd = currentPosition(iterator);
2701     return true;
2702 }
2703
2704 Optional<String> URLParser::formURLDecode(StringView input)
2705 {
2706     auto utf8 = input.utf8(StrictConversion);
2707     if (utf8.isNull())
2708         return Nullopt;
2709     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2710     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2711 }
2712
2713 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2714 {
2715     Vector<StringView> sequences = input.split('&');
2716
2717     URLEncodedForm output;
2718     for (auto& bytes : sequences) {
2719         auto valueStart = bytes.find('=');
2720         if (valueStart == notFound) {
2721             if (auto name = formURLDecode(bytes))
2722                 output.append({name.value().replace('+', 0x20), emptyString()});
2723         } else {
2724             auto name = formURLDecode(bytes.substring(0, valueStart));
2725             auto value = formURLDecode(bytes.substring(valueStart + 1));
2726             if (name && value)
2727                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2728         }
2729     }
2730     return output;
2731 }
2732
2733 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2734 {
2735     auto utf8 = input.utf8(StrictConversion);
2736     const char* data = utf8.data();
2737     for (size_t i = 0; i < utf8.length(); ++i) {
2738         const char byte = data[i];
2739         if (byte == 0x20)
2740             output.append(0x2B);
2741         else if (byte == 0x2A
2742             || byte == 0x2D
2743             || byte == 0x2E
2744             || (byte >= 0x30 && byte <= 0x39)
2745             || (byte >= 0x41 && byte <= 0x5A)
2746             || byte == 0x5F
2747             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2748             output.append(byte);
2749         else
2750             percentEncodeByte(byte, output);
2751     }
2752 }
2753     
2754 String URLParser::serialize(const URLEncodedForm& tuples)
2755 {
2756     Vector<LChar> output;
2757     for (auto& tuple : tuples) {
2758         if (!output.isEmpty())
2759             output.append('&');
2760         serializeURLEncodedForm(tuple.first, output);
2761         output.append('=');
2762         serializeURLEncodedForm(tuple.second, output);
2763     }
2764     return String::adopt(WTFMove(output));
2765 }
2766
2767 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2768 {
2769     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2770     // but once we get rid of URL::parse its value should be tested.
2771     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2772         a.m_isValid,
2773         a.m_protocolIsInHTTPFamily,
2774         a.m_schemeEnd,
2775         a.m_userStart,
2776         a.m_userEnd,
2777         a.m_passwordEnd,
2778         a.m_hostEnd,
2779         a.m_portEnd,
2780         a.m_pathAfterLastSlash,
2781         a.m_pathEnd,
2782         a.m_queryEnd,
2783         a.m_fragmentEnd,
2784         a.m_string.utf8().data(),
2785         b.m_isValid,
2786         b.m_protocolIsInHTTPFamily,
2787         b.m_schemeEnd,
2788         b.m_userStart,
2789         b.m_userEnd,
2790         b.m_passwordEnd,
2791         b.m_hostEnd,
2792         b.m_portEnd,
2793         b.m_pathAfterLastSlash,
2794         b.m_pathEnd,
2795         b.m_queryEnd,
2796         b.m_fragmentEnd,
2797         b.m_string.utf8().data());
2798
2799     return a.m_string == b.m_string
2800         && a.m_isValid == b.m_isValid
2801         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2802         && a.m_schemeEnd == b.m_schemeEnd
2803         && a.m_userStart == b.m_userStart
2804         && a.m_userEnd == b.m_userEnd
2805         && a.m_passwordEnd == b.m_passwordEnd
2806         && a.m_hostEnd == b.m_hostEnd
2807         && a.m_portEnd == b.m_portEnd
2808         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2809         && a.m_pathEnd == b.m_pathEnd
2810         && a.m_queryEnd == b.m_queryEnd
2811         && a.m_fragmentEnd == b.m_fragmentEnd;
2812 }
2813
2814 bool URLParser::internalValuesConsistent(const URL& url)
2815 {
2816     return url.m_schemeEnd <= url.m_userStart
2817         && url.m_userStart <= url.m_userEnd
2818         && url.m_userEnd <= url.m_passwordEnd
2819         && url.m_passwordEnd <= url.m_hostEnd
2820         && url.m_hostEnd <= url.m_portEnd
2821         && url.m_portEnd <= url.m_pathAfterLastSlash
2822         && url.m_pathAfterLastSlash <= url.m_pathEnd
2823         && url.m_pathEnd <= url.m_queryEnd
2824         && url.m_queryEnd <= url.m_fragmentEnd
2825         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2826     // FIXME: Why do we even store m_fragmentEnd?
2827     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2828 }
2829
2830 static bool urlParserEnabled = true;
2831
2832 void URLParser::setEnabled(bool enabled)
2833 {
2834     urlParserEnabled = enabled;
2835 }
2836
2837 bool URLParser::enabled()
2838 {
2839     return urlParserEnabled;
2840 }
2841
2842 } // namespace WebCore