4bdd9784f0128e71a9275f8ee0109ecedc293b92
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <unicode/uidna.h>
33 #include <unicode/utypes.h>
34
35 namespace WebCore {
36
// Compile-time switch for parser tracing. When set to a non-zero value,
// URL_PARSER_LOG(...) forwards its arguments to LOG(URLParser, ...).
#define URL_PARSER_DEBUGGING 0
    
#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#else
// Tracing disabled: URL_PARSER_LOG expands to nothing.
#define URL_PARSER_LOG(...)
#endif
44     
45 template<typename CharacterType>
46 class CodePointIterator {
47 public:
48     ALWAYS_INLINE CodePointIterator() { }
49     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50         : m_begin(begin)
51         , m_end(end)
52     {
53     }
54     
55     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56         : CodePointIterator(begin.m_begin, end.m_begin)
57     {
58         ASSERT(end.m_begin >= begin.m_begin);
59     }
60     
61     ALWAYS_INLINE UChar32 operator*() const;
62     ALWAYS_INLINE CodePointIterator& operator++();
63
64     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65     {
66         return m_begin == other.m_begin
67             && m_end == other.m_end;
68     }
69     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70     
71     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
72     {
73         m_begin = other.m_begin;
74         m_end = other.m_end;
75         return *this;
76     }
77
78     ALWAYS_INLINE bool atEnd() const
79     {
80         ASSERT(m_begin <= m_end);
81         return m_begin >= m_end;
82     }
83     
84     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
85     {
86         ASSERT(m_begin >= reference);
87         return m_begin - reference;
88     }
89
90     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
91     {
92         return codeUnitsSince(other.m_begin);
93     }
94     
95 private:
96     const CharacterType* m_begin { nullptr };
97     const CharacterType* m_end { nullptr };
98 };
99
100 template<>
101 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
102 {
103     ASSERT(!atEnd());
104     return *m_begin;
105 }
106
107 template<>
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
109 {
110     ASSERT(!atEnd());
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     ASSERT(!atEnd());
128     unsigned i = 0;
129     size_t length = m_end - m_begin;
130     U16_FWD_1(m_begin, i, length);
131     m_begin += i;
132     return *this;
133 }
134     
135 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
136 {
137     if (U_IS_BMP(codePoint)) {
138         destination.append(static_cast<UChar>(codePoint));
139         return;
140     }
141     destination.reserveCapacity(destination.size() + 2);
142     destination.uncheckedAppend(U16_LEAD(codePoint));
143     destination.uncheckedAppend(U16_TRAIL(codePoint));
144 }
145
// Bit flags stored per byte in characterClassTable. Each flag backs one of the
// classification predicates defined after the table.
enum URLCharacterClass {
    UserInfo = 1 << 0, // Tested by isInUserInfoEncodeSet.
    Default = 1 << 1, // Tested by isInDefaultEncodeSet.
    InvalidDomain = 1 << 2, // Tested by isInvalidDomainCharacter.
    QueryPercent = 1 << 3, // Tested by shouldPercentEncodeQueryByte.
    SlashQuestionOrHash = 1 << 4, // Tested by isSlashQuestionOrHash.
    ValidScheme = 1 << 5, // Tested by isValidSchemeCharacter.
};
154
// Lookup table of URLCharacterClass bit flags, indexed by byte value (0x00-0xFF).
// Each entry is the OR of every class the byte belongs to; the trailing comment on
// each row names the byte. The classification predicates that follow the table read
// it with a single index plus a mask.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, // '&'
    0, // '''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
413
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
423 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
424 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
425
426 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
427 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
428 {
429     ++iterator;
430     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
431         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
432             syntaxViolation(iteratorForSyntaxViolationPosition);
433         ++iterator;
434     }
435 }
436
437 template<typename CharacterType>
438 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
439 {
440     if (iterator.atEnd())
441         return false;
442     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
443     if (iterator.atEnd())
444         return false;
445     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446     return iterator.atEnd();
447 }
448
449 template<typename CharacterType>
450 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
451 {
452     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
453         return false;
454     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
455     if (iterator.atEnd())
456         return false;
457     if (*iterator == ':')
458         return true;
459     if (UNLIKELY(*iterator == '|'))
460         return true;
461     return false;
462 }
463
464 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
465 {
466     ASSERT(isASCII(codePoint));
467     if (UNLIKELY(m_didSeeSyntaxViolation))
468         m_asciiBuffer.append(codePoint);
469 }
470
471 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
472 {
473     if (UNLIKELY(m_didSeeSyntaxViolation))
474         m_asciiBuffer.append(characters, length);
475 }
476
477 template<typename CharacterType>
478 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
479 {
480     ASSERT(isWindowsDriveLetter(iterator));
481     appendToASCIIBuffer(*iterator);
482     advance(iterator);
483     ASSERT(!iterator.atEnd());
484     ASSERT(*iterator == ':' || *iterator == '|');
485     if (*iterator == '|')
486         syntaxViolation(iterator);
487     appendToASCIIBuffer(':');
488     advance(iterator);
489 }
490
491 template<typename CharacterType>
492 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
493 {
494     if (!isWindowsDriveLetter(iterator))
495         return true;
496     if (iterator.atEnd())
497         return false;
498     advance(iterator);
499     if (iterator.atEnd())
500         return true;
501     advance(iterator);
502     if (iterator.atEnd())
503         return true;
504     return !isSlashQuestionOrHash(*iterator);
505 }
506
507 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
508 {
509     buffer.append('%');
510     buffer.append(upperNibbleToASCIIHexDigit(byte));
511     buffer.append(lowerNibbleToASCIIHexDigit(byte));
512 }
513
514 void URLParser::percentEncodeByte(uint8_t byte)
515 {
516     ASSERT(m_didSeeSyntaxViolation);
517     appendToASCIIBuffer('%');
518     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
519     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
520 }
521
// Percent-encoded form of the bytes EF BF BD (the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER).
// utf8PercentEncode and utf8QueryEncode emit it for code points that fail U_IS_UNICODE_CHAR.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string above, not counting the terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
524
525 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
526 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
527 {
528     ASSERT(!iterator.atEnd());
529     UChar32 codePoint = *iterator;
530     if (LIKELY(isASCII(codePoint))) {
531         if (UNLIKELY(isInCodeSet(codePoint))) {
532             syntaxViolation(iterator);
533             percentEncodeByte(codePoint);
534         } else
535             appendToASCIIBuffer(codePoint);
536         return;
537     }
538     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
539     syntaxViolation(iterator);
540     
541     if (!U_IS_UNICODE_CHAR(codePoint)) {
542         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
543         return;
544     }
545     
546     uint8_t buffer[U8_MAX_LENGTH];
547     int32_t offset = 0;
548     U8_APPEND_UNSAFE(buffer, offset, codePoint);
549     for (int32_t i = 0; i < offset; ++i)
550         percentEncodeByte(buffer[i]);
551 }
552
553 template<typename CharacterType>
554 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
555 {
556     ASSERT(!iterator.atEnd());
557     UChar32 codePoint = *iterator;
558     if (LIKELY(isASCII(codePoint))) {
559         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
560             syntaxViolation(iterator);
561             percentEncodeByte(codePoint);
562         } else
563             appendToASCIIBuffer(codePoint);
564         return;
565     }
566     
567     syntaxViolation(iterator);
568     
569     if (!U_IS_UNICODE_CHAR(codePoint)) {
570         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
571         return;
572     }
573
574     uint8_t buffer[U8_MAX_LENGTH];
575     int32_t offset = 0;
576     U8_APPEND_UNSAFE(buffer, offset, codePoint);
577     for (int32_t i = 0; i < offset; ++i) {
578         auto byte = buffer[i];
579         if (shouldPercentEncodeQueryByte(byte))
580             percentEncodeByte(byte);
581         else
582             appendToASCIIBuffer(byte);
583     }
584 }
585
// Encodes |source| with |encoding| and writes the result as the query, comparing the
// encoded bytes against the input at |iterator| as it goes.
// As long as each encoded byte equals the corresponding input character and needs no
// percent-encoding, no syntax violation is recorded. At the first difference, or the first
// byte that must be percent-encoded, a syntax violation is recorded and the remaining
// encoded bytes are written out (percent-encoded where required).
template<typename CharacterType>
void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
{
    // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
    CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
    const char* data = encoded.data();
    size_t length = encoded.length();
    
    // Exactly one of "encoded output is empty" and "input is exhausted" holds:
    // the two cannot match, so record a violation and stop.
    if (!length == !iterator.atEnd()) {
        syntaxViolation(iterator);
        return;
    }
    
    // Phase 1: walk encoded bytes and input in lockstep while they agree.
    size_t i = 0;
    for (; i < length; ++i) {
        ASSERT(!iterator.atEnd());
        uint8_t byte = data[i];
        if (UNLIKELY(byte != *iterator)) {
            syntaxViolation(iterator);
            break;
        }
        if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
            syntaxViolation(iterator);
            break;
        }
        appendToASCIIBuffer(byte);
        ++iterator;
    }
    // Skip any tabs or newlines at the current input position.
    while (!iterator.atEnd() && isTabOrNewline(*iterator))
        ++iterator;
    ASSERT((i == length) == iterator.atEnd());
    // Phase 2: only runs after a break above, i.e. once a violation has been recorded.
    // Emit the rest of the encoded bytes starting from the one that caused the break.
    for (; i < length; ++i) {
        ASSERT(m_didSeeSyntaxViolation);
        uint8_t byte = data[i];
        if (shouldPercentEncodeQueryByte(byte))
            percentEncodeByte(byte);
        else
            appendToASCIIBuffer(byte);
    }
}
626
627 Optional<uint16_t> defaultPortForProtocol(StringView scheme)
628 {
629     static const uint16_t ftpPort = 21;
630     static const uint16_t gopherPort = 70;
631     static const uint16_t httpPort = 80;
632     static const uint16_t httpsPort = 443;
633     static const uint16_t wsPort = 80;
634     static const uint16_t wssPort = 443;
635     
636     auto length = scheme.length();
637     if (!length)
638         return Nullopt;
639     switch (scheme[0]) {
640     case 'w':
641         switch (length) {
642         case 2:
643             if (scheme[1] == 's')
644                 return wsPort;
645             return Nullopt;
646         case 3:
647             if (scheme[1] == 's'
648                 && scheme[2] == 's')
649                 return wssPort;
650             return Nullopt;
651         default:
652             return false;
653         }
654     case 'h':
655         switch (length) {
656         case 4:
657             if (scheme[1] == 't'
658                 && scheme[2] == 't'
659                 && scheme[3] == 'p')
660                 return httpPort;
661             return Nullopt;
662         case 5:
663             if (scheme[1] == 't'
664                 && scheme[2] == 't'
665                 && scheme[3] == 'p'
666                 && scheme[4] == 's')
667                 return httpsPort;
668             return Nullopt;
669         default:
670             return Nullopt;
671         }
672     case 'g':
673         if (length == 6
674             && scheme[1] == 'o'
675             && scheme[2] == 'p'
676             && scheme[3] == 'h'
677             && scheme[4] == 'e'
678             && scheme[5] == 'r')
679             return gopherPort;
680         return Nullopt;
681     case 'f':
682         if (length == 3
683             && scheme[1] == 't'
684             && scheme[2] == 'p')
685             return ftpPort;
686         return Nullopt;
687     default:
688         return Nullopt;
689     }
690 }
691
692 bool isDefaultPortForProtocol(uint16_t port, StringView protocol)
693 {
694     return defaultPortForProtocol(protocol) == port;
695 }
696
// Result of classifying a scheme string with scheme(). Every value except NonSpecial
// corresponds to one exactly-matched lowercase scheme name; copyURLPartsUntil treats
// all of those as "special" URLs.
enum class Scheme {
    WS,
    WSS,
    File,
    FTP,
    Gopher,
    HTTP,
    HTTPS,
    NonSpecial // Anything scheme() does not recognize, including the empty string.
};
707
708 ALWAYS_INLINE static Scheme scheme(StringView scheme)
709 {
710     auto length = scheme.length();
711     if (!length)
712         return Scheme::NonSpecial;
713     switch (scheme[0]) {
714     case 'f':
715         switch (length) {
716         case 3:
717             if (scheme[1] == 't'
718                 && scheme[2] == 'p')
719                 return Scheme::FTP;
720             return Scheme::NonSpecial;
721         case 4:
722             if (scheme[1] == 'i'
723                 && scheme[2] == 'l'
724                 && scheme[3] == 'e')
725                 return Scheme::File;
726             return Scheme::NonSpecial;
727         default:
728             return Scheme::NonSpecial;
729         }
730     case 'g':
731         if (length == 6
732             && scheme[1] == 'o'
733             && scheme[2] == 'p'
734             && scheme[3] == 'h'
735             && scheme[4] == 'e'
736             && scheme[5] == 'r')
737             return Scheme::Gopher;
738         return Scheme::NonSpecial;
739     case 'h':
740         switch (length) {
741         case 4:
742             if (scheme[1] == 't'
743                 && scheme[2] == 't'
744                 && scheme[3] == 'p')
745                 return Scheme::HTTP;
746             return Scheme::NonSpecial;
747         case 5:
748             if (scheme[1] == 't'
749                 && scheme[2] == 't'
750                 && scheme[3] == 'p'
751                 && scheme[4] == 's')
752                 return Scheme::HTTPS;
753             return Scheme::NonSpecial;
754         default:
755             return Scheme::NonSpecial;
756         }
757     case 'w':
758         switch (length) {
759         case 2:
760             if (scheme[1] == 's')
761                 return Scheme::WS;
762             return Scheme::NonSpecial;
763         case 3:
764             if (scheme[1] == 's'
765                 && scheme[2] == 's')
766                 return Scheme::WSS;
767             return Scheme::NonSpecial;
768         default:
769             return Scheme::NonSpecial;
770         }
771     default:
772         return Scheme::NonSpecial;
773     }
774 }
775
// Names a component boundary of a parsed URL. urlLengthUntilPart() maps each value to
// the matching offset member of URL, and copyURLPartsUntil() copies a base URL up to
// the named boundary.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
    FragmentEnd,
};
788
789 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
790 {
791     switch (part) {
792     case URLPart::FragmentEnd:
793         return url.m_fragmentEnd;
794     case URLPart::QueryEnd:
795         return url.m_queryEnd;
796     case URLPart::PathEnd:
797         return url.m_pathEnd;
798     case URLPart::PathAfterLastSlash:
799         return url.m_pathAfterLastSlash;
800     case URLPart::PortEnd:
801         return url.m_portEnd;
802     case URLPart::HostEnd:
803         return url.m_hostEnd;
804     case URLPart::PasswordEnd:
805         return url.m_passwordEnd;
806     case URLPart::UserEnd:
807         return url.m_userEnd;
808     case URLPart::UserStart:
809         return url.m_userStart;
810     case URLPart::SchemeEnd:
811         return url.m_schemeEnd;
812     }
813     ASSERT_NOT_REACHED();
814     return 0;
815 }
816
817 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
818 {
819     RELEASE_ASSERT(length <= string.length());
820     if (string.isNull())
821         return;
822     ASSERT(m_asciiBuffer.isEmpty());
823     if (string.is8Bit()) {
824         appendToASCIIBuffer(string.characters8(), length);
825     } else {
826         const UChar* characters = string.characters16();
827         for (size_t i = 0; i < length; ++i) {
828             UChar c = characters[i];
829             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
830             appendToASCIIBuffer(c);
831         }
832     }
833 }
834
// Seeds the URL being built from |base|, up to and including the boundary |part|:
// - records a syntax violation at |iterator|;
// - replaces the ASCII buffer with the first urlLengthUntilPart(base, part) characters of base's string;
// - copies base's component offsets for |part| and every earlier boundary;
// - sets m_urlIsSpecial from the copied scheme.
// |isUTF8Encoding| is set to true for ws, wss and non-special schemes and left untouched otherwise.
// |part| must not be URLPart::FragmentEnd.
template<typename CharacterType>
void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
{
    syntaxViolation(iterator);

    m_asciiBuffer.clear();
    copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
    // Cases run from the latest boundary to the earliest; each falls through so that
    // selecting a boundary also copies every offset that precedes it.
    switch (part) {
    case URLPart::FragmentEnd:
        RELEASE_ASSERT_NOT_REACHED();
    case URLPart::QueryEnd:
        m_url.m_queryEnd = base.m_queryEnd;
        FALLTHROUGH;
    case URLPart::PathEnd:
        m_url.m_pathEnd = base.m_pathEnd;
        FALLTHROUGH;
    case URLPart::PathAfterLastSlash:
        m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
        FALLTHROUGH;
    case URLPart::PortEnd:
        m_url.m_portEnd = base.m_portEnd;
        FALLTHROUGH;
    case URLPart::HostEnd:
        m_url.m_hostEnd = base.m_hostEnd;
        FALLTHROUGH;
    case URLPart::PasswordEnd:
        m_url.m_passwordEnd = base.m_passwordEnd;
        FALLTHROUGH;
    case URLPart::UserEnd:
        m_url.m_userEnd = base.m_userEnd;
        FALLTHROUGH;
    case URLPart::UserStart:
        m_url.m_userStart = base.m_userStart;
        FALLTHROUGH;
    case URLPart::SchemeEnd:
        m_url.m_isValid = base.m_isValid;
        m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
        m_url.m_schemeEnd = base.m_schemeEnd;
    }
    // Classify the scheme that was just copied into the ASCII buffer.
    switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
    case Scheme::WS:
    case Scheme::WSS:
        isUTF8Encoding = true;
        FALLTHROUGH;
    case Scheme::File:
    case Scheme::FTP:
    case Scheme::Gopher:
    case Scheme::HTTP:
    case Scheme::HTTPS:
        m_urlIsSpecial = true;
        return;
    case Scheme::NonSpecial:
        m_urlIsSpecial = false;
        isUTF8Encoding = true;
        return;
    }
    ASSERT_NOT_REACHED();
}
893
// The two hex digits of '.' (0x2E) as they appear after '%' in a percent-encoded dot.
// The dot-segment helpers compare the second digit after toASCIILower, so "%2e" and "%2E" both match.
static const char dotASCIICode[2] = {'2', 'e'};
895
896 template<typename CharacterType>
897 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
898 {
899     if (c.atEnd())
900         return false;
901     if (*c == '.') {
902         advance<CharacterType, ReportSyntaxViolation::No>(c);
903         return c.atEnd() || isSlashQuestionOrHash(*c);
904     }
905     if (*c != '%')
906         return false;
907     advance<CharacterType, ReportSyntaxViolation::No>(c);
908     if (c.atEnd() || *c != dotASCIICode[0])
909         return false;
910     advance<CharacterType, ReportSyntaxViolation::No>(c);
911     if (c.atEnd())
912         return false;
913     if (toASCIILower(*c) == dotASCIICode[1]) {
914         advance<CharacterType, ReportSyntaxViolation::No>(c);
915         return c.atEnd() || isSlashQuestionOrHash(*c);
916     }
917     return false;
918 }
919
920 template<typename CharacterType>
921 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
922 {
923     if (c.atEnd())
924         return false;
925     if (*c == '.') {
926         advance<CharacterType, ReportSyntaxViolation::No>(c);
927         return isSingleDotPathSegment(c);
928     }
929     if (*c != '%')
930         return false;
931     advance<CharacterType, ReportSyntaxViolation::No>(c);
932     if (c.atEnd() || *c != dotASCIICode[0])
933         return false;
934     advance<CharacterType, ReportSyntaxViolation::No>(c);
935     if (c.atEnd())
936         return false;
937     if (toASCIILower(*c) == dotASCIICode[1]) {
938         advance<CharacterType, ReportSyntaxViolation::No>(c);
939         return isSingleDotPathSegment(c);
940     }
941     return false;
942 }
943
944 template<typename CharacterType>
945 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
946 {
947     ASSERT(isSingleDotPathSegment(c));
948     if (*c == '.') {
949         advance(c);
950         if (!c.atEnd()) {
951             if (*c == '/' || *c == '\\')
952                 advance(c);
953             else
954                 ASSERT(*c == '?' || *c == '#');
955         }
956     } else {
957         ASSERT(*c == '%');
958         advance(c);
959         ASSERT(*c == dotASCIICode[0]);
960         advance(c);
961         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
962         advance(c);
963         if (!c.atEnd()) {
964             if (*c == '/' || *c == '\\')
965                 advance(c);
966             else
967                 ASSERT(*c == '?' || *c == '#');
968         }
969     }
970 }
971
972 template<typename CharacterType>
973 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
974 {
975     ASSERT(isDoubleDotPathSegment(c));
976     if (*c == '.')
977         advance(c);
978     else {
979         ASSERT(*c == '%');
980         advance(c);
981         ASSERT(*c == dotASCIICode[0]);
982         advance(c);
983         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
984         advance(c);
985     }
986     consumeSingleDotPathSegment(c);
987 }
988
989 void URLParser::popPath()
990 {
991     ASSERT(m_didSeeSyntaxViolation);
992     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
993         m_url.m_pathAfterLastSlash--;
994         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
995             m_url.m_pathAfterLastSlash--;
996         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
997             m_url.m_pathAfterLastSlash--;
998         m_url.m_pathAfterLastSlash++;
999     }
1000     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1001 }
1002
1003 template<typename CharacterType>
1004 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1005 {
1006     if (m_didSeeSyntaxViolation)
1007         return;
1008     m_didSeeSyntaxViolation = true;
1009     
1010     ASSERT(m_asciiBuffer.isEmpty());
1011     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1012     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1013     m_asciiBuffer.reserveCapacity(m_inputString.length());
1014     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1015         ASSERT(isASCII(m_inputString[i]));
1016         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1017     }
1018 }
1019
1020 void URLParser::failure()
1021 {
1022     m_url.invalidate();
1023     m_url.m_string = m_inputString;
1024 }
1025
1026 template<typename CharacterType>
1027 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1028 {
1029     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1030         return false;
1031     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1032     return true;
1033 }
1034
1035 template<typename CharacterType>
1036 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1037 {
1038     if (!checkLocalhostCodePoint(iterator, 'l'))
1039         return false;
1040     if (!checkLocalhostCodePoint(iterator, 'o'))
1041         return false;
1042     if (!checkLocalhostCodePoint(iterator, 'c'))
1043         return false;
1044     if (!checkLocalhostCodePoint(iterator, 'a'))
1045         return false;
1046     if (!checkLocalhostCodePoint(iterator, 'l'))
1047         return false;
1048     if (!checkLocalhostCodePoint(iterator, 'h'))
1049         return false;
1050     if (!checkLocalhostCodePoint(iterator, 'o'))
1051         return false;
1052     if (!checkLocalhostCodePoint(iterator, 's'))
1053         return false;
1054     if (!checkLocalhostCodePoint(iterator, 't'))
1055         return false;
1056     return iterator.atEnd();
1057 }
1058
1059 bool URLParser::isLocalhost(StringView view)
1060 {
1061     if (view.is8Bit())
1062         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1063     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1064 }
1065
1066 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1067 {
1068     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1069         ASSERT(start + length <= m_asciiBuffer.size());
1070         return StringView(m_asciiBuffer.data() + start, length);
1071     }
1072     ASSERT(start + length <= m_inputString.length());
1073     return StringView(m_inputString).substring(start, length);
1074 }
1075
1076 template<typename CharacterType>
1077 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1078 {
1079     if (UNLIKELY(m_didSeeSyntaxViolation))
1080         return m_asciiBuffer.size();
1081     
1082     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1083 }
1084
1085 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1086     : m_inputString(input)
1087 {
1088     if (input.isNull()) {
1089         if (base.isValid() && !base.m_cannotBeABaseURL) {
1090             m_url = base;
1091             m_url.removeFragmentIdentifier();
1092         }
1093         return;
1094     }
1095
1096     if (input.is8Bit()) {
1097         m_inputBegin = input.characters8();
1098         parse(input.characters8(), input.length(), base, encoding);
1099     } else {
1100         m_inputBegin = input.characters16();
1101         parse(input.characters16(), input.length(), base, encoding);
1102     }
1103
1104     ASSERT(!m_url.m_isValid
1105         || m_didSeeSyntaxViolation == (m_url.string() != input)
1106         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1107             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1108     ASSERT(internalValuesConsistent(m_url));
1109 #if !ASSERT_DISABLED
1110     if (!m_didSeeSyntaxViolation) {
1111         // Force a syntax violation at the beginning to make sure we get the same result.
1112         URLParser parser(makeString(" ", input), base, encoding);
1113         URL parsed = parser.result();
1114         if (parsed.isValid())
1115             ASSERT(allValuesEqual(parser.result(), m_url));
1116     }
1117 #endif
1118 }
1119
1120 template<typename CharacterType>
1121 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1122 {
1123     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1124     m_url = { };
1125     ASSERT(m_asciiBuffer.isEmpty());
1126     
1127     bool isUTF8Encoding = encoding == UTF8Encoding();
1128     Vector<UChar> queryBuffer;
1129
1130     unsigned endIndex = length;
1131     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1132         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1133         endIndex--;
1134     }
1135     CodePointIterator<CharacterType> c(input, input + endIndex);
1136     CodePointIterator<CharacterType> authorityOrHostBegin;
1137     CodePointIterator<CharacterType> queryBegin;
1138     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1139         syntaxViolation(c);
1140         ++c;
1141     }
1142     auto beginAfterControlAndSpace = c;
1143
1144     enum class State : uint8_t {
1145         SchemeStart,
1146         Scheme,
1147         NoScheme,
1148         SpecialRelativeOrAuthority,
1149         PathOrAuthority,
1150         Relative,
1151         RelativeSlash,
1152         SpecialAuthoritySlashes,
1153         SpecialAuthorityIgnoreSlashes,
1154         AuthorityOrHost,
1155         Host,
1156         File,
1157         FileSlash,
1158         FileHost,
1159         PathStart,
1160         Path,
1161         CannotBeABaseURLPath,
1162         UTF8Query,
1163         NonUTF8Query,
1164         Fragment,
1165     };
1166
1167 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1168 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1169
1170     State state = State::SchemeStart;
1171     while (!c.atEnd()) {
1172         if (UNLIKELY(isTabOrNewline(*c))) {
1173             syntaxViolation(c);
1174             ++c;
1175             continue;
1176         }
1177
1178         switch (state) {
1179         case State::SchemeStart:
1180             LOG_STATE("SchemeStart");
1181             if (isASCIIAlpha(*c)) {
1182                 if (UNLIKELY(isASCIIUpper(*c)))
1183                     syntaxViolation(c);
1184                 appendToASCIIBuffer(toASCIILower(*c));
1185                 advance(c);
1186                 if (c.atEnd()) {
1187                     m_asciiBuffer.clear();
1188                     state = State::NoScheme;
1189                     c = beginAfterControlAndSpace;
1190                 }
1191                 state = State::Scheme;
1192             } else
1193                 state = State::NoScheme;
1194             break;
1195         case State::Scheme:
1196             LOG_STATE("Scheme");
1197             if (isValidSchemeCharacter(*c)) {
1198                 if (UNLIKELY(isASCIIUpper(*c)))
1199                     syntaxViolation(c);
1200                 appendToASCIIBuffer(toASCIILower(*c));
1201             } else if (*c == ':') {
1202                 m_url.m_schemeEnd = currentPosition(c);
1203                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1204                 appendToASCIIBuffer(':');
1205                 switch (scheme(urlScheme)) {
1206                 case Scheme::File:
1207                     m_urlIsSpecial = true;
1208                     state = State::File;
1209                     ++c;
1210                     break;
1211                 case Scheme::WS:
1212                 case Scheme::WSS:
1213                     isUTF8Encoding = true;
1214                     m_urlIsSpecial = true;
1215                     if (base.protocolIs(urlScheme))
1216                         state = State::SpecialRelativeOrAuthority;
1217                     else
1218                         state = State::SpecialAuthoritySlashes;
1219                     ++c;
1220                     break;
1221                 case Scheme::HTTP:
1222                 case Scheme::HTTPS:
1223                     m_url.m_protocolIsInHTTPFamily = true;
1224                     FALLTHROUGH;
1225                 case Scheme::FTP:
1226                 case Scheme::Gopher:
1227                     m_urlIsSpecial = true;
1228                     if (base.protocolIs(urlScheme))
1229                         state = State::SpecialRelativeOrAuthority;
1230                     else
1231                         state = State::SpecialAuthoritySlashes;
1232                     ++c;
1233                     break;
1234                 case Scheme::NonSpecial:
1235                     isUTF8Encoding = true;
1236                     auto maybeSlash = c;
1237                     advance(maybeSlash);
1238                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1239                         appendToASCIIBuffer('/');
1240                         c = maybeSlash;
1241                         state = State::PathOrAuthority;
1242                         ASSERT(*c == '/');
1243                         ++c;
1244                         m_url.m_userStart = currentPosition(c);
1245                     } else {
1246                         ++c;
1247                         m_url.m_userStart = currentPosition(c);
1248                         m_url.m_userEnd = m_url.m_userStart;
1249                         m_url.m_passwordEnd = m_url.m_userStart;
1250                         m_url.m_hostEnd = m_url.m_userStart;
1251                         m_url.m_portEnd = m_url.m_userStart;
1252                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1253                         m_url.m_cannotBeABaseURL = true;
1254                         state = State::CannotBeABaseURLPath;
1255                     }
1256                     break;
1257                 }
1258                 break;
1259             } else {
1260                 m_asciiBuffer.clear();
1261                 state = State::NoScheme;
1262                 c = beginAfterControlAndSpace;
1263                 break;
1264             }
1265             advance(c);
1266             if (c.atEnd()) {
1267                 m_asciiBuffer.clear();
1268                 state = State::NoScheme;
1269                 c = beginAfterControlAndSpace;
1270             }
1271             break;
1272         case State::NoScheme:
1273             LOG_STATE("NoScheme");
1274             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1275                 failure();
1276                 return;
1277             }
1278             if (base.m_cannotBeABaseURL && *c == '#') {
1279                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1280                 state = State::Fragment;
1281                 appendToASCIIBuffer('#');
1282                 ++c;
1283                 break;
1284             }
1285             if (!base.protocolIs("file")) {
1286                 state = State::Relative;
1287                 break;
1288             }
1289             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1290             appendToASCIIBuffer(':');
1291             state = State::File;
1292             break;
1293         case State::SpecialRelativeOrAuthority:
1294             LOG_STATE("SpecialRelativeOrAuthority");
1295             if (*c == '/') {
1296                 appendToASCIIBuffer('/');
1297                 advance(c);
1298                 if (c.atEnd()) {
1299                     failure();
1300                     return;
1301                 }
1302                 if (*c == '/') {
1303                     appendToASCIIBuffer('/');
1304                     state = State::SpecialAuthorityIgnoreSlashes;
1305                     ++c;
1306                 } else
1307                     state = State::RelativeSlash;
1308             } else
1309                 state = State::Relative;
1310             break;
1311         case State::PathOrAuthority:
1312             LOG_STATE("PathOrAuthority");
1313             if (*c == '/') {
1314                 appendToASCIIBuffer('/');
1315                 state = State::AuthorityOrHost;
1316                 advance(c);
1317                 m_url.m_userStart = currentPosition(c);
1318                 authorityOrHostBegin = c;
1319             } else {
1320                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1321                 m_url.m_userStart = currentPosition(c) - 1;
1322                 m_url.m_userEnd = m_url.m_userStart;
1323                 m_url.m_passwordEnd = m_url.m_userStart;
1324                 m_url.m_hostEnd = m_url.m_userStart;
1325                 m_url.m_portEnd = m_url.m_userStart;
1326                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1327                 state = State::Path;
1328             }
1329             break;
1330         case State::Relative:
1331             LOG_STATE("Relative");
1332             switch (*c) {
1333             case '/':
1334             case '\\':
1335                 state = State::RelativeSlash;
1336                 ++c;
1337                 break;
1338             case '?':
1339                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1340                 appendToASCIIBuffer('?');
1341                 ++c;
1342                 if (isUTF8Encoding)
1343                     state = State::UTF8Query;
1344                 else {
1345                     queryBegin = c;
1346                     state = State::NonUTF8Query;
1347                 }
1348                 break;
1349             case '#':
1350                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1351                 appendToASCIIBuffer('#');
1352                 state = State::Fragment;
1353                 ++c;
1354                 break;
1355             default:
1356                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1357                 state = State::Path;
1358                 break;
1359             }
1360             break;
1361         case State::RelativeSlash:
1362             LOG_STATE("RelativeSlash");
1363             if (*c == '/' || *c == '\\') {
1364                 ++c;
1365                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1366                 appendToASCIIBuffer("://", 3);
1367                 state = State::SpecialAuthorityIgnoreSlashes;
1368             } else {
1369                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1370                 appendToASCIIBuffer('/');
1371                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1372                 state = State::Path;
1373             }
1374             break;
1375         case State::SpecialAuthoritySlashes:
1376             LOG_STATE("SpecialAuthoritySlashes");
1377             if (LIKELY(*c == '/' || *c == '\\')) {
1378                 if (UNLIKELY(*c == '\\'))
1379                     syntaxViolation(c);
1380                 appendToASCIIBuffer('/');
1381                 advance(c);
1382                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1383                     if (UNLIKELY(*c == '\\'))
1384                         syntaxViolation(c);
1385                     ++c;
1386                     appendToASCIIBuffer('/');
1387                 } else {
1388                     syntaxViolation(c);
1389                     appendToASCIIBuffer('/');
1390                 }
1391             } else {
1392                 syntaxViolation(c);
1393                 appendToASCIIBuffer("//", 2);
1394             }
1395             state = State::SpecialAuthorityIgnoreSlashes;
1396             break;
1397         case State::SpecialAuthorityIgnoreSlashes:
1398             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1399             if (*c == '/' || *c == '\\') {
1400                 syntaxViolation(c);
1401                 ++c;
1402             } else {
1403                 m_url.m_userStart = currentPosition(c);
1404                 state = State::AuthorityOrHost;
1405                 authorityOrHostBegin = c;
1406             }
1407             break;
1408         case State::AuthorityOrHost:
1409             do {
1410                 LOG_STATE("AuthorityOrHost");
1411                 if (*c == '@') {
1412                     auto lastAt = c;
1413                     auto findLastAt = c;
1414                     while (!findLastAt.atEnd()) {
1415                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1416                         if (*findLastAt == '@')
1417                             lastAt = findLastAt;
1418                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1419                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1420                             break;
1421                         ++findLastAt;
1422                     }
1423                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1424                     c = lastAt;
1425                     advance(c);
1426                     authorityOrHostBegin = c;
1427                     state = State::Host;
1428                     m_hostHasPercentOrNonASCII = false;
1429                     break;
1430                 }
1431                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1432                 if (isSlash || *c == '?' || *c == '#') {
1433                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1434                     if (iterator.atEnd()) {
1435                         size_t position = currentPosition(c);
1436                         ASSERT(m_url.m_userStart == position);
1437                         RELEASE_ASSERT(position >= 2);
1438                         position -= 2;
1439                         ASSERT(parsedDataView(position, 2) == "//");
1440                         m_url.m_userStart = position;
1441                         m_url.m_userEnd = position;
1442                         m_url.m_passwordEnd = position;
1443                         m_url.m_hostEnd = position;
1444                         m_url.m_portEnd = position;
1445                         m_url.m_pathAfterLastSlash = position + 2;
1446                     } else {
1447                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1448                         m_url.m_passwordEnd = m_url.m_userEnd;
1449                         if (!parseHostAndPort(iterator)) {
1450                             failure();
1451                             return;
1452                         }
1453                         if (UNLIKELY(!isSlash)) {
1454                             syntaxViolation(c);
1455                             appendToASCIIBuffer('/');
1456                             m_url.m_pathAfterLastSlash = currentPosition(c);
1457                         }
1458                     }
1459                     state = State::Path;
1460                     break;
1461                 }
1462                 if (isPercentOrNonASCII(*c))
1463                     m_hostHasPercentOrNonASCII = true;
1464                 ++c;
1465             } while (!c.atEnd());
1466             break;
1467         case State::Host:
1468             do {
1469                 LOG_STATE("Host");
1470                 if (*c == '/' || *c == '?' || *c == '#') {
1471                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1472                         failure();
1473                         return;
1474                     }
1475                     if (*c == '?' || *c == '#') {
1476                         syntaxViolation(c);
1477                         appendToASCIIBuffer('/');
1478                         m_url.m_pathAfterLastSlash = currentPosition(c);
1479                     }
1480                     state = State::Path;
1481                     break;
1482                 }
1483                 if (isPercentOrNonASCII(*c))
1484                     m_hostHasPercentOrNonASCII = true;
1485                 ++c;
1486             } while (!c.atEnd());
1487             break;
1488         case State::File:
1489             LOG_STATE("File");
1490             switch (*c) {
1491             case '\\':
1492                 syntaxViolation(c);
1493                 FALLTHROUGH;
1494             case '/':
1495                 appendToASCIIBuffer('/');
1496                 state = State::FileSlash;
1497                 ++c;
1498                 break;
1499             case '?':
1500                 syntaxViolation(c);
1501                 if (base.isValid() && base.protocolIs("file")) {
1502                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1503                     appendToASCIIBuffer('?');
1504                     ++c;
1505                 } else {
1506                     appendToASCIIBuffer("///?", 4);
1507                     ++c;
1508                     m_url.m_userStart = currentPosition(c) - 2;
1509                     m_url.m_userEnd = m_url.m_userStart;
1510                     m_url.m_passwordEnd = m_url.m_userStart;
1511                     m_url.m_hostEnd = m_url.m_userStart;
1512                     m_url.m_portEnd = m_url.m_userStart;
1513                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1514                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1515                 }
1516                 if (isUTF8Encoding)
1517                     state = State::UTF8Query;
1518                 else {
1519                     queryBegin = c;
1520                     state = State::NonUTF8Query;
1521                 }
1522                 break;
1523             case '#':
1524                 syntaxViolation(c);
1525                 if (base.isValid() && base.protocolIs("file")) {
1526                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1527                     appendToASCIIBuffer('#');
1528                 } else {
1529                     appendToASCIIBuffer("///#", 4);
1530                     m_url.m_userStart = currentPosition(c) - 2;
1531                     m_url.m_userEnd = m_url.m_userStart;
1532                     m_url.m_passwordEnd = m_url.m_userStart;
1533                     m_url.m_hostEnd = m_url.m_userStart;
1534                     m_url.m_portEnd = m_url.m_userStart;
1535                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1536                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1537                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1538                 }
1539                 state = State::Fragment;
1540                 ++c;
1541                 break;
1542             default:
1543                 syntaxViolation(c);
1544                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1545                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1546                 else {
1547                     appendToASCIIBuffer("///", 3);
1548                     m_url.m_userStart = currentPosition(c) - 1;
1549                     m_url.m_userEnd = m_url.m_userStart;
1550                     m_url.m_passwordEnd = m_url.m_userStart;
1551                     m_url.m_hostEnd = m_url.m_userStart;
1552                     m_url.m_portEnd = m_url.m_userStart;
1553                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1554                     if (isWindowsDriveLetter(c))
1555                         appendWindowsDriveLetter(c);
1556                 }
1557                 state = State::Path;
1558                 break;
1559             }
1560             break;
1561         case State::FileSlash:
1562             LOG_STATE("FileSlash");
1563             if (LIKELY(*c == '/' || *c == '\\')) {
1564                 if (UNLIKELY(*c == '\\'))
1565                     syntaxViolation(c);
1566                 appendToASCIIBuffer('/');
1567                 advance(c);
1568                 m_url.m_userStart = currentPosition(c);
1569                 m_url.m_userEnd = m_url.m_userStart;
1570                 m_url.m_passwordEnd = m_url.m_userStart;
1571                 m_url.m_hostEnd = m_url.m_userStart;
1572                 m_url.m_portEnd = m_url.m_userStart;
1573                 authorityOrHostBegin = c;
1574                 state = State::FileHost;
1575                 break;
1576             }
1577             if (base.isValid() && base.protocolIs("file")) {
1578                 // FIXME: This String copy is unnecessary.
1579                 String basePath = base.path();
1580                 if (basePath.length() >= 2) {
1581                     bool windowsQuirk = basePath.is8Bit()
1582                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1583                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1584                     if (windowsQuirk) {
1585                         appendToASCIIBuffer(basePath[0]);
1586                         appendToASCIIBuffer(basePath[1]);
1587                     }
1588                 }
1589             }
1590             syntaxViolation(c);
1591             appendToASCIIBuffer("//", 2);
1592             m_url.m_userStart = currentPosition(c) - 1;
1593             m_url.m_userEnd = m_url.m_userStart;
1594             m_url.m_passwordEnd = m_url.m_userStart;
1595             m_url.m_hostEnd = m_url.m_userStart;
1596             m_url.m_portEnd = m_url.m_userStart;
1597             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1598             if (isWindowsDriveLetter(c))
1599                 appendWindowsDriveLetter(c);
1600             state = State::Path;
1601             break;
1602         case State::FileHost:
1603             do {
1604                 LOG_STATE("FileHost");
1605                 if (isSlashQuestionOrHash(*c)) {
1606                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1607                         && isWindowsDriveLetter(authorityOrHostBegin);
1608                     if (windowsQuirk) {
1609                         syntaxViolation(authorityOrHostBegin);
1610                         appendToASCIIBuffer('/');
1611                         appendWindowsDriveLetter(authorityOrHostBegin);
1612                     }
1613                     if (windowsQuirk || authorityOrHostBegin == c) {
1614                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1615                         if (UNLIKELY(*c == '?')) {
1616                             syntaxViolation(c);
1617                             appendToASCIIBuffer("/?", 2);
1618                             ++c;
1619                             if (isUTF8Encoding)
1620                                 state = State::UTF8Query;
1621                             else {
1622                                 queryBegin = c;
1623                                 state = State::NonUTF8Query;
1624                             }
1625                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1626                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1627                             break;
1628                         }
1629                         if (UNLIKELY(*c == '#')) {
1630                             syntaxViolation(c);
1631                             appendToASCIIBuffer("/#", 2);
1632                             ++c;
1633                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1634                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1635                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1636                             state = State::Fragment;
1637                             break;
1638                         }
1639                         state = State::Path;
1640                         break;
1641                     }
1642                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1643                         failure();
1644                         return;
1645                     }
1646                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1647                         syntaxViolation(c);
1648                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1649                         m_url.m_hostEnd = currentPosition(c);
1650                         m_url.m_portEnd = m_url.m_hostEnd;
1651                     }
1652                     
1653                     state = State::PathStart;
1654                     break;
1655                 }
1656                 if (isPercentOrNonASCII(*c))
1657                     m_hostHasPercentOrNonASCII = true;
1658                 ++c;
1659             } while (!c.atEnd());
1660             break;
1661         case State::PathStart:
1662             LOG_STATE("PathStart");
1663             if (*c != '/' && *c != '\\')
1664                 ++c;
1665             state = State::Path;
1666             break;
1667         case State::Path:
1668             LOG_STATE("Path");
1669             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1670                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1671                     syntaxViolation(c);
1672                 appendToASCIIBuffer('/');
1673                 ++c;
1674                 m_url.m_pathAfterLastSlash = currentPosition(c);
1675                 break;
1676             }
1677             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1678                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1679                     syntaxViolation(c);
1680                     consumeDoubleDotPathSegment(c);
1681                     popPath();
1682                     break;
1683                 }
1684                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1685                     syntaxViolation(c);
1686                     consumeSingleDotPathSegment(c);
1687                     break;
1688                 }
1689             }
1690             if (*c == '?') {
1691                 m_url.m_pathEnd = currentPosition(c);
1692                 appendToASCIIBuffer('?');
1693                 ++c;
1694                 if (isUTF8Encoding)
1695                     state = State::UTF8Query;
1696                 else {
1697                     queryBegin = c;
1698                     state = State::NonUTF8Query;
1699                 }
1700                 break;
1701             }
1702             if (*c == '#') {
1703                 m_url.m_pathEnd = currentPosition(c);
1704                 m_url.m_queryEnd = m_url.m_pathEnd;
1705                 state = State::Fragment;
1706                 break;
1707             }
1708             utf8PercentEncode<isInDefaultEncodeSet>(c);
1709             ++c;
1710             break;
1711         case State::CannotBeABaseURLPath:
1712             LOG_STATE("CannotBeABaseURLPath");
1713             if (*c == '?') {
1714                 m_url.m_pathEnd = currentPosition(c);
1715                 appendToASCIIBuffer('?');
1716                 ++c;
1717                 if (isUTF8Encoding)
1718                     state = State::UTF8Query;
1719                 else {
1720                     queryBegin = c;
1721                     state = State::NonUTF8Query;
1722                 }
1723             } else if (*c == '#') {
1724                 m_url.m_pathEnd = currentPosition(c);
1725                 m_url.m_queryEnd = m_url.m_pathEnd;
1726                 state = State::Fragment;
1727             } else if (*c == '/') {
1728                 appendToASCIIBuffer('/');
1729                 ++c;
1730                 m_url.m_pathAfterLastSlash = currentPosition(c);
1731             } else {
1732                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1733                 ++c;
1734             }
1735             break;
1736         case State::UTF8Query:
1737             LOG_STATE("UTF8Query");
1738             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1739             if (*c == '#') {
1740                 m_url.m_queryEnd = currentPosition(c);
1741                 state = State::Fragment;
1742                 break;
1743             }
1744             if (isUTF8Encoding)
1745                 utf8QueryEncode(c);
1746             else
1747                 appendCodePoint(queryBuffer, *c);
1748             ++c;
1749             break;
1750         case State::NonUTF8Query:
1751             do {
1752                 LOG_STATE("NonUTF8Query");
1753                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1754                 if (*c == '#') {
1755                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1756                     m_url.m_queryEnd = currentPosition(c);
1757                     state = State::Fragment;
1758                     break;
1759                 }
1760                 appendCodePoint(queryBuffer, *c);
1761                 advance(c, queryBegin);
1762             } while (!c.atEnd());
1763             break;
1764         case State::Fragment:
1765             URL_PARSER_LOG("State Fragment");
1766             utf8PercentEncode<isInSimpleEncodeSet>(c);
1767             ++c;
1768             break;
1769         }
1770     }
1771
1772     switch (state) {
1773     case State::SchemeStart:
1774         LOG_FINAL_STATE("SchemeStart");
1775         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1776             m_url = base;
1777             m_url.removeFragmentIdentifier();
1778             return;
1779         }
1780         failure();
1781         return;
1782     case State::Scheme:
1783         LOG_FINAL_STATE("Scheme");
1784         failure();
1785         return;
1786     case State::NoScheme:
1787         LOG_FINAL_STATE("NoScheme");
1788         RELEASE_ASSERT_NOT_REACHED();
1789     case State::SpecialRelativeOrAuthority:
1790         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1791         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1792         m_url.m_fragmentEnd = m_url.m_queryEnd;
1793         break;
1794     case State::PathOrAuthority:
1795         LOG_FINAL_STATE("PathOrAuthority");
1796         ASSERT(m_url.m_userStart);
1797         ASSERT(m_url.m_userStart == currentPosition(c));
1798         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1799         m_url.m_userStart--;
1800         m_url.m_userEnd = m_url.m_userStart;
1801         m_url.m_passwordEnd = m_url.m_userStart;
1802         m_url.m_hostEnd = m_url.m_userStart;
1803         m_url.m_portEnd = m_url.m_userStart;
1804         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1805         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1806         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1807         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1808         break;
1809     case State::Relative:
1810         LOG_FINAL_STATE("Relative");
1811         RELEASE_ASSERT_NOT_REACHED();
1812     case State::RelativeSlash:
1813         LOG_FINAL_STATE("RelativeSlash");
1814         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1815         appendToASCIIBuffer('/');
1816         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1817         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1818         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1819         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1820         break;
1821     case State::SpecialAuthoritySlashes:
1822         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1823         m_url.m_userStart = currentPosition(c);
1824         m_url.m_userEnd = m_url.m_userStart;
1825         m_url.m_passwordEnd = m_url.m_userStart;
1826         m_url.m_hostEnd = m_url.m_userStart;
1827         m_url.m_portEnd = m_url.m_userStart;
1828         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1829         m_url.m_pathEnd = m_url.m_userStart;
1830         m_url.m_queryEnd = m_url.m_userStart;
1831         m_url.m_fragmentEnd = m_url.m_userStart;
1832         break;
1833     case State::SpecialAuthorityIgnoreSlashes:
1834         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1835         failure();
1836         return;
1837         break;
1838     case State::AuthorityOrHost:
1839         LOG_FINAL_STATE("AuthorityOrHost");
1840         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1841         m_url.m_passwordEnd = m_url.m_userEnd;
1842         if (authorityOrHostBegin.atEnd()) {
1843             RELEASE_ASSERT(m_url.m_userStart >= 2);
1844             ASSERT(parsedDataView(m_url.m_userStart - 2, 2) == "//");
1845             m_url.m_userStart -= 2;
1846             m_url.m_userEnd = m_url.m_userStart;
1847             m_url.m_passwordEnd = m_url.m_userStart;
1848             m_url.m_hostEnd = m_url.m_userStart;
1849             m_url.m_portEnd = m_url.m_userStart;
1850             m_url.m_pathEnd = m_url.m_userStart + 2;
1851         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1852             failure();
1853             return;
1854         } else {
1855             if (m_urlIsSpecial) {
1856                 syntaxViolation(c);
1857                 appendToASCIIBuffer('/');
1858                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1859             } else
1860                 m_url.m_pathEnd = m_url.m_portEnd;
1861         }
1862         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1863         m_url.m_queryEnd = m_url.m_pathEnd;
1864         m_url.m_fragmentEnd = m_url.m_pathEnd;
1865         break;
1866     case State::Host:
1867         LOG_FINAL_STATE("Host");
1868         if (!parseHostAndPort(authorityOrHostBegin)) {
1869             failure();
1870             return;
1871         }
1872         if (m_urlIsSpecial) {
1873             syntaxViolation(c);
1874             appendToASCIIBuffer('/');
1875             m_url.m_pathEnd = m_url.m_portEnd + 1;
1876         } else
1877             m_url.m_pathEnd = m_url.m_portEnd;
1878         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1879         m_url.m_queryEnd = m_url.m_pathEnd;
1880         m_url.m_fragmentEnd = m_url.m_pathEnd;
1881         break;
1882     case State::File:
1883         LOG_FINAL_STATE("File");
1884         if (base.isValid() && base.protocolIs("file")) {
1885             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1886             appendToASCIIBuffer(':');
1887         }
1888         syntaxViolation(c);
1889         appendToASCIIBuffer("///", 3);
1890         m_url.m_userStart = currentPosition(c) - 1;
1891         m_url.m_userEnd = m_url.m_userStart;
1892         m_url.m_passwordEnd = m_url.m_userStart;
1893         m_url.m_hostEnd = m_url.m_userStart;
1894         m_url.m_portEnd = m_url.m_userStart;
1895         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1896         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1897         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1898         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1899         break;
1900     case State::FileSlash:
1901         LOG_FINAL_STATE("FileSlash");
1902         syntaxViolation(c);
1903         m_url.m_userStart = currentPosition(c) + 1;
1904         appendToASCIIBuffer("//", 2);
1905         m_url.m_userEnd = m_url.m_userStart;
1906         m_url.m_passwordEnd = m_url.m_userStart;
1907         m_url.m_hostEnd = m_url.m_userStart;
1908         m_url.m_portEnd = m_url.m_userStart;
1909         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1910         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1911         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1912         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1913         break;
1914     case State::FileHost:
1915         LOG_FINAL_STATE("FileHost");
1916         if (authorityOrHostBegin == c) {
1917             syntaxViolation(c);
1918             appendToASCIIBuffer('/');
1919             m_url.m_userStart = currentPosition(c) - 1;
1920             m_url.m_userEnd = m_url.m_userStart;
1921             m_url.m_passwordEnd = m_url.m_userStart;
1922             m_url.m_hostEnd = m_url.m_userStart;
1923             m_url.m_portEnd = m_url.m_userStart;
1924             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1925             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1926             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1927             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1928             break;
1929         }
1930
1931         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1932             failure();
1933             return;
1934         }
1935
1936         syntaxViolation(c);
1937         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1938             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1939             m_url.m_hostEnd = currentPosition(c);
1940             m_url.m_portEnd = m_url.m_hostEnd;
1941         }
1942         appendToASCIIBuffer('/');
1943         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
1944         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1945         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1946         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1947         break;
1948     case State::PathStart:
1949         LOG_FINAL_STATE("PathStart");
1950         RELEASE_ASSERT_NOT_REACHED();
1951     case State::Path:
1952         LOG_FINAL_STATE("Path");
1953         m_url.m_pathEnd = currentPosition(c);
1954         m_url.m_queryEnd = m_url.m_pathEnd;
1955         m_url.m_fragmentEnd = m_url.m_pathEnd;
1956         break;
1957     case State::CannotBeABaseURLPath:
1958         LOG_FINAL_STATE("CannotBeABaseURLPath");
1959         m_url.m_pathEnd = currentPosition(c);
1960         m_url.m_queryEnd = m_url.m_pathEnd;
1961         m_url.m_fragmentEnd = m_url.m_pathEnd;
1962         break;
1963     case State::UTF8Query:
1964         LOG_FINAL_STATE("UTF8Query");
1965         ASSERT(queryBegin == CodePointIterator<CharacterType>());
1966         m_url.m_queryEnd = currentPosition(c);
1967         m_url.m_fragmentEnd = m_url.m_queryEnd;
1968         break;
1969     case State::NonUTF8Query:
1970         LOG_FINAL_STATE("NonUTF8Query");
1971         ASSERT(queryBegin != CodePointIterator<CharacterType>());
1972         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1973         m_url.m_queryEnd = currentPosition(c);
1974         m_url.m_fragmentEnd = m_url.m_queryEnd;
1975         break;
1976     case State::Fragment:
1977         LOG_FINAL_STATE("Fragment");
1978         m_url.m_fragmentEnd = currentPosition(c);
1979         break;
1980     }
1981
1982     if (LIKELY(!m_didSeeSyntaxViolation)) {
1983         m_url.m_string = m_inputString;
1984         ASSERT(m_asciiBuffer.isEmpty());
1985     } else
1986         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1987     m_url.m_isValid = true;
1988     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
1989 }
1990
1991 template<typename CharacterType>
1992 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1993 {
1994     if (UNLIKELY(iterator.atEnd())) {
1995         syntaxViolation(iterator);
1996         m_url.m_userEnd = currentPosition(iterator);
1997         m_url.m_passwordEnd = m_url.m_userEnd;
1998         return;
1999     }
2000     for (; !iterator.atEnd(); advance(iterator)) {
2001         if (*iterator == ':') {
2002             m_url.m_userEnd = currentPosition(iterator);
2003             auto iteratorAtColon = iterator;
2004             ++iterator;
2005             bool tabOrNewlineAfterColon = false;
2006             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2007                 tabOrNewlineAfterColon = true;
2008                 ++iterator;
2009             }
2010             if (UNLIKELY(iterator.atEnd())) {
2011                 syntaxViolation(iteratorAtColon);
2012                 m_url.m_passwordEnd = m_url.m_userEnd;
2013                 if (m_url.m_userEnd > m_url.m_userStart)
2014                     appendToASCIIBuffer('@');
2015                 return;
2016             }
2017             if (tabOrNewlineAfterColon)
2018                 syntaxViolation(iteratorAtColon);
2019             appendToASCIIBuffer(':');
2020             break;
2021         }
2022         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2023     }
2024     for (; !iterator.atEnd(); advance(iterator))
2025         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2026     m_url.m_passwordEnd = currentPosition(iterator);
2027     if (!m_url.m_userEnd)
2028         m_url.m_userEnd = m_url.m_passwordEnd;
2029     appendToASCIIBuffer('@');
2030 }
2031
2032 template<typename UnsignedIntegerType>
2033 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2034 {
2035     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2036     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
2037     LChar* p = end;
2038     do {
2039         *--p = (number % 10) + '0';
2040         number /= 10;
2041     } while (number);
2042     appendToASCIIBuffer(p, end - p);
2043 }
2044
2045 void URLParser::serializeIPv4(IPv4Address address)
2046 {
2047     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2048     appendToASCIIBuffer('.');
2049     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2050     appendToASCIIBuffer('.');
2051     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2052     appendToASCIIBuffer('.');
2053     appendNumberToASCIIBuffer<uint8_t>(address);
2054 }
2055     
// Returns how many consecutive zero pieces of |address| start at index |begin|.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2065
2066 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2067 {
2068     Optional<size_t> longest;
2069     size_t longestLength = 0;
2070     for (size_t i = 0; i < 8; i++) {
2071         size_t length = zeroSequenceLength(address, i);
2072         if (length) {
2073             if (length > 1 && (!longest || longestLength < length)) {
2074                 longest = i;
2075                 longestLength = length;
2076             }
2077             i += length;
2078         }
2079     }
2080     return longest;
2081 }
2082
2083 void URLParser::serializeIPv6Piece(uint16_t piece)
2084 {
2085     bool printed = false;
2086     if (auto nibble0 = piece >> 12) {
2087         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2088         printed = true;
2089     }
2090     auto nibble1 = piece >> 8 & 0xF;
2091     if (printed || nibble1) {
2092         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2093         printed = true;
2094     }
2095     auto nibble2 = piece >> 4 & 0xF;
2096     if (printed || nibble2)
2097         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2098     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2099 }
2100
2101 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2102 {
2103     appendToASCIIBuffer('[');
2104     auto compressPointer = findLongestZeroSequence(address);
2105     for (size_t piece = 0; piece < 8; piece++) {
2106         if (compressPointer && compressPointer.value() == piece) {
2107             ASSERT(!address[piece]);
2108             if (piece)
2109                 appendToASCIIBuffer(':');
2110             else
2111                 appendToASCIIBuffer("::", 2);
2112             while (piece < 8 && !address[piece])
2113                 piece++;
2114             if (piece == 8)
2115                 break;
2116         }
2117         serializeIPv6Piece(address[piece]);
2118         if (piece < 7)
2119             appendToASCIIBuffer(':');
2120     }
2121     appendToASCIIBuffer(']');
2122 }
2123
2124 template<typename CharacterType>
2125 Optional<uint32_t> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2126 {
2127     enum class State : uint8_t {
2128         UnknownBase,
2129         Decimal,
2130         OctalOrHex,
2131         Octal,
2132         Hex,
2133     };
2134     State state = State::UnknownBase;
2135     Checked<uint32_t, RecordOverflow> value = 0;
2136     if (!iterator.atEnd() && *iterator == '.')
2137         return Nullopt;
2138     while (!iterator.atEnd()) {
2139         if (isTabOrNewline(*iterator)) {
2140             didSeeSyntaxViolation = true;
2141             ++iterator;
2142             continue;
2143         }
2144         if (*iterator == '.') {
2145             ASSERT(!value.hasOverflowed());
2146             return value.unsafeGet();
2147         }
2148         switch (state) {
2149         case State::UnknownBase:
2150             if (UNLIKELY(*iterator == '0')) {
2151                 ++iterator;
2152                 state = State::OctalOrHex;
2153                 break;
2154             }
2155             state = State::Decimal;
2156             break;
2157         case State::OctalOrHex:
2158             didSeeSyntaxViolation = true;
2159             if (*iterator == 'x' || *iterator == 'X') {
2160                 ++iterator;
2161                 state = State::Hex;
2162                 break;
2163             }
2164             state = State::Octal;
2165             break;
2166         case State::Decimal:
2167             if (*iterator < '0' || *iterator > '9')
2168                 return Nullopt;
2169             value *= 10;
2170             value += *iterator - '0';
2171             if (UNLIKELY(value.hasOverflowed()))
2172                 return Nullopt;
2173             ++iterator;
2174             break;
2175         case State::Octal:
2176             ASSERT(didSeeSyntaxViolation);
2177             if (*iterator < '0' || *iterator > '7')
2178                 return Nullopt;
2179             value *= 8;
2180             value += *iterator - '0';
2181             if (UNLIKELY(value.hasOverflowed()))
2182                 return Nullopt;
2183             ++iterator;
2184             break;
2185         case State::Hex:
2186             ASSERT(didSeeSyntaxViolation);
2187             if (!isASCIIHexDigit(*iterator))
2188                 return Nullopt;
2189             value *= 16;
2190             value += toASCIIHexValue(*iterator);
2191             if (UNLIKELY(value.hasOverflowed()))
2192                 return Nullopt;
2193             ++iterator;
2194             break;
2195         }
2196     }
2197     ASSERT(!value.hasOverflowed());
2198     return value.unsafeGet();
2199 }
2200
2201 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2202 {
2203     RELEASE_ASSERT(exponent <= 4);
2204     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2205     return values[exponent];
2206 }
2207
2208 template<typename CharacterType>
2209 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2210 {
2211     auto hostBegin = iterator;
2212
2213     Vector<uint32_t, 4> items;
2214     items.reserveInitialCapacity(4);
2215     bool didSeeSyntaxViolation = false;
2216     while (!iterator.atEnd()) {
2217         if (isTabOrNewline(*iterator)) {
2218             didSeeSyntaxViolation = true;
2219             ++iterator;
2220             continue;
2221         }
2222         if (items.size() >= 4)
2223             return Nullopt;
2224         if (auto item = parseIPv4Piece(iterator, didSeeSyntaxViolation))
2225             items.append(item.value());
2226         else
2227             return Nullopt;
2228         if (!iterator.atEnd()) {
2229             if (items.size() >= 4)
2230                 return Nullopt;
2231             if (*iterator == '.')
2232                 ++iterator;
2233             else
2234                 return Nullopt;
2235         }
2236     }
2237     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2238         return Nullopt;
2239     if (items.size() > 1) {
2240         for (size_t i = 0; i < items.size() - 1; i++) {
2241             if (items[i] > 255)
2242                 return Nullopt;
2243         }
2244     }
2245     if (items[items.size() - 1] >= pow256(5 - items.size()))
2246         return Nullopt;
2247
2248     if (didSeeSyntaxViolation)
2249         syntaxViolation(hostBegin);
2250     for (auto item : items) {
2251         if (item > 255)
2252             syntaxViolation(hostBegin);
2253     }
2254
2255     if (UNLIKELY(items.size() != 4))
2256         syntaxViolation(hostBegin);
2257
2258     IPv4Address ipv4 = items.takeLast();
2259     for (size_t counter = 0; counter < items.size(); ++counter)
2260         ipv4 += items[counter] * pow256(3 - counter);
2261     return ipv4;
2262 }
2263
2264 template<typename CharacterType>
2265 Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2266 {
2267     if (iterator.atEnd())
2268         return Nullopt;
2269     uint32_t piece = 0;
2270     bool leadingZeros = false;
2271     size_t digitCount = 0;
2272     while (!iterator.atEnd()) {
2273         if (!isASCIIDigit(*iterator))
2274             return Nullopt;
2275         ++digitCount;
2276         if (!piece && *iterator == '0') {
2277             if (leadingZeros)
2278                 return Nullopt;
2279             leadingZeros = true;
2280         }
2281         if (!piece && *iterator == '0')
2282             leadingZeros = true;
2283         piece = piece * 10 + *iterator - '0';
2284         if (piece > 255)
2285             return Nullopt;
2286         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2287         if (iterator.atEnd())
2288             break;
2289         if (*iterator == '.')
2290             break;
2291     }
2292     if (piece && leadingZeros)
2293         return Nullopt;
2294     return piece;
2295 }
2296
2297 template<typename CharacterType>
2298 Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2299 {
2300     IPv4Address address = 0;
2301     for (size_t i = 0; i < 4; ++i) {
2302         if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2303             address = (address << 8) + piece.value();
2304         else
2305             return Nullopt;
2306         if (i < 3) {
2307             if (iterator.atEnd())
2308                 return Nullopt;
2309             if (*iterator != '.')
2310                 return Nullopt;
2311             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2312         } else if (!iterator.atEnd())
2313             return Nullopt;
2314     }
2315     ASSERT(iterator.atEnd());
2316     return address;
2317 }
2318
2319 template<typename CharacterType>
2320 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2321 {
2322     ASSERT(*c == '[');
2323     auto hostBegin = c;
2324     advance(c, hostBegin);
2325     if (c.atEnd())
2326         return Nullopt;
2327
2328     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2329     size_t piecePointer = 0;
2330     Optional<size_t> compressPointer;
2331
2332     if (*c == ':') {
2333         advance(c, hostBegin);
2334         if (c.atEnd())
2335             return Nullopt;
2336         if (*c != ':')
2337             return Nullopt;
2338         advance(c, hostBegin);
2339         ++piecePointer;
2340         compressPointer = piecePointer;
2341     }
2342     
2343     while (!c.atEnd()) {
2344         if (piecePointer == 8)
2345             return Nullopt;
2346         if (*c == ':') {
2347             if (compressPointer)
2348                 return Nullopt;
2349             advance(c, hostBegin);
2350             ++piecePointer;
2351             compressPointer = piecePointer;
2352             continue;
2353         }
2354         if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2355             if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2356                 if (compressPointer && piecePointer == 5)
2357                     return Nullopt;
2358                 syntaxViolation(hostBegin);
2359                 address[piecePointer++] = ipv4Address.value() >> 16;
2360                 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2361                 c = { };
2362                 break;
2363             }
2364         }
2365         uint16_t value = 0;
2366         size_t length = 0;
2367         bool leadingZeros = false;
2368         for (; length < 4; length++) {
2369             if (c.atEnd())
2370                 break;
2371             if (!isASCIIHexDigit(*c))
2372                 break;
2373             if (isASCIIUpper(*c))
2374                 syntaxViolation(hostBegin);
2375             if (*c == '0' && !length)
2376                 leadingZeros = true;
2377             value = value * 0x10 + toASCIIHexValue(*c);
2378             advance(c, hostBegin);
2379         }
2380         
2381         if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2382             syntaxViolation(hostBegin);
2383
2384         address[piecePointer++] = value;
2385         if (c.atEnd())
2386             break;
2387         if (piecePointer == 8 || *c != ':')
2388             return Nullopt;
2389         advance(c, hostBegin);
2390     }
2391     
2392     if (!c.atEnd())
2393         return Nullopt;
2394     
2395     if (compressPointer) {
2396         size_t swaps = piecePointer - compressPointer.value();
2397         piecePointer = 7;
2398         while (swaps)
2399             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2400     } else if (piecePointer != 8)
2401         return Nullopt;
2402
2403     Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2404     if (possibleCompressPointer)
2405         possibleCompressPointer.value()++;
2406     if (UNLIKELY(compressPointer != possibleCompressPointer))
2407         syntaxViolation(hostBegin);
2408     
2409     return address;
2410 }
2411
2412 template<typename CharacterType>
2413 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2414 {
2415     Vector<LChar, defaultInlineBufferSize> output;
2416     output.reserveInitialCapacity(length);
2417     
2418     for (size_t i = 0; i < length; ++i) {
2419         uint8_t byte = input[i];
2420         if (byte != '%')
2421             output.uncheckedAppend(byte);
2422         else if (length > 2 && i < length - 2) {
2423             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2424                 syntaxViolation(iteratorForSyntaxViolationPosition);
2425                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2426                 i += 2;
2427             } else
2428                 output.uncheckedAppend(byte);
2429         } else
2430             output.uncheckedAppend(byte);
2431     }
2432     return output;
2433 }
2434     
2435 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2436 {
2437     Vector<LChar, defaultInlineBufferSize> output;
2438     output.reserveInitialCapacity(length);
2439     
2440     for (size_t i = 0; i < length; ++i) {
2441         uint8_t byte = input[i];
2442         if (byte != '%')
2443             output.uncheckedAppend(byte);
2444         else if (length > 2 && i < length - 2) {
2445             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2446                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2447                 i += 2;
2448             } else
2449                 output.uncheckedAppend(byte);
2450         } else
2451             output.uncheckedAppend(byte);
2452     }
2453     return output;
2454 }
2455
2456 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2457 {
2458     if (string.is8Bit())
2459         return charactersAreAllASCII(string.characters8(), string.length());
2460     return charactersAreAllASCII(string.characters16(), string.length());
2461 }
2462
2463 template<typename CharacterType>
2464 Optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2465 {
2466     Vector<LChar, defaultInlineBufferSize> ascii;
2467     if (containsOnlyASCII(domain)) {
2468         size_t length = domain.length();
2469         if (domain.is8Bit()) {
2470             const LChar* characters = domain.characters8();
2471             ascii.reserveInitialCapacity(length);
2472             for (size_t i = 0; i < length; ++i) {
2473                 if (UNLIKELY(isASCIIUpper(characters[i])))
2474                     syntaxViolation(iteratorForSyntaxViolationPosition);
2475                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2476             }
2477         } else {
2478             const UChar* characters = domain.characters16();
2479             ascii.reserveInitialCapacity(length);
2480             for (size_t i = 0; i < length; ++i) {
2481                 if (UNLIKELY(isASCIIUpper(characters[i])))
2482                     syntaxViolation(iteratorForSyntaxViolationPosition);
2483                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2484             }
2485         }
2486         return ascii;
2487     }
2488     
2489     UChar hostnameBuffer[defaultInlineBufferSize];
2490     UErrorCode error = U_ZERO_ERROR;
2491
2492 #if COMPILER(GCC) || COMPILER(CLANG)
2493 #pragma GCC diagnostic push
2494 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2495 #endif
2496     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2497     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2498 #if COMPILER(GCC) || COMPILER(CLANG)
2499 #pragma GCC diagnostic pop
2500 #endif
2501     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2502
2503     if (error == U_ZERO_ERROR) {
2504         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2505             ASSERT(isASCII(hostnameBuffer[i]));
2506             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2507         }
2508         ascii.append(hostnameBuffer, numCharactersConverted);
2509         if (domain != StringView(ascii.data(), ascii.size()))
2510             syntaxViolation(iteratorForSyntaxViolationPosition);
2511         return ascii;
2512     }
2513
2514     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2515     return Nullopt;
2516 }
2517
2518 bool URLParser::hasInvalidDomainCharacter(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2519 {
2520     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2521         if (isInvalidDomainCharacter(asciiDomain[i]))
2522             return true;
2523     }
2524     return false;
2525 }
2526
2527 template<typename CharacterType>
2528 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2529 {
2530     ASSERT(*iterator == ':');
2531     auto colonIterator = iterator;
2532     advance(iterator, colonIterator);
2533     uint32_t port = 0;
2534     if (UNLIKELY(iterator.atEnd())) {
2535         m_url.m_portEnd = currentPosition(colonIterator);
2536         syntaxViolation(colonIterator);
2537         return true;
2538     }
2539     size_t digitCount = 0;
2540     bool leadingZeros = false;
2541     for (; !iterator.atEnd(); ++iterator) {
2542         if (UNLIKELY(isTabOrNewline(*iterator))) {
2543             syntaxViolation(colonIterator);
2544             continue;
2545         }
2546         if (isASCIIDigit(*iterator)) {
2547             if (*iterator == '0' && !digitCount)
2548                 leadingZeros = true;
2549             ++digitCount;
2550             port = port * 10 + *iterator - '0';
2551             if (port > std::numeric_limits<uint16_t>::max())
2552                 return false;
2553         } else
2554             return false;
2555     }
2556
2557     if (port && leadingZeros)
2558         syntaxViolation(colonIterator);
2559     
2560     if (!port && digitCount > 1)
2561         syntaxViolation(colonIterator);
2562
2563     if (UNLIKELY(isDefaultPortForProtocol(port, parsedDataView(0, m_url.m_schemeEnd))))
2564         syntaxViolation(colonIterator);
2565     else {
2566         appendToASCIIBuffer(':');
2567         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2568         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2569     }
2570
2571     m_url.m_portEnd = currentPosition(iterator);
2572     return true;
2573 }
2574
2575 template<typename CharacterType>
2576 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2577 {
2578     if (iterator.atEnd())
2579         return false;
2580     if (*iterator == ':')
2581         return false;
2582     if (*iterator == '[') {
2583         auto ipv6End = iterator;
2584         while (!ipv6End.atEnd() && *ipv6End != ']')
2585             ++ipv6End;
2586         if (ipv6End.atEnd())
2587             return false;
2588         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2589             serializeIPv6(address.value());
2590             if (!ipv6End.atEnd()) {
2591                 advance(ipv6End);
2592                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2593                     m_url.m_hostEnd = currentPosition(ipv6End);
2594                     return parsePort(ipv6End);
2595                 }
2596                 m_url.m_hostEnd = currentPosition(ipv6End);
2597                 m_url.m_portEnd = m_url.m_hostEnd;
2598                 return true;
2599             }
2600             m_url.m_hostEnd = currentPosition(ipv6End);
2601             return true;
2602         }
2603         return false;
2604     }
2605
2606     if (!m_urlIsSpecial) {
2607         for (; !iterator.atEnd(); ++iterator) {
2608             if (UNLIKELY(isTabOrNewline(*iterator))) {
2609                 syntaxViolation(iterator);
2610                 continue;
2611             }
2612             if (*iterator == ':')
2613                 break;
2614             utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2615         }
2616         m_url.m_hostEnd = currentPosition(iterator);
2617         if (iterator.atEnd()) {
2618             m_url.m_portEnd = currentPosition(iterator);
2619             return true;
2620         }
2621         return parsePort(iterator);
2622     }
2623     
2624     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2625         auto hostIterator = iterator;
2626         for (; !iterator.atEnd(); ++iterator) {
2627             if (isTabOrNewline(*iterator))
2628                 continue;
2629             if (*iterator == ':')
2630                 break;
2631             if (isInvalidDomainCharacter(*iterator))
2632                 return false;
2633         }
2634         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2635             serializeIPv4(address.value());
2636             m_url.m_hostEnd = currentPosition(iterator);
2637             if (iterator.atEnd()) {
2638                 m_url.m_portEnd = currentPosition(iterator);
2639                 return true;
2640             }
2641             return parsePort(iterator);
2642         }
2643         for (; hostIterator != iterator; ++hostIterator) {
2644             if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2645                 syntaxViolation(hostIterator);
2646                 continue;
2647             }
2648             if (UNLIKELY(isASCIIUpper(*hostIterator)))
2649                 syntaxViolation(hostIterator);
2650             appendToASCIIBuffer(toASCIILower(*hostIterator));
2651         }
2652         m_url.m_hostEnd = currentPosition(iterator);
2653         if (!hostIterator.atEnd())
2654             return parsePort(hostIterator);
2655         m_url.m_portEnd = currentPosition(iterator);
2656         return true;
2657     }
2658     
2659     auto hostBegin = iterator;
2660     
2661     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2662     for (; !iterator.atEnd(); ++iterator) {
2663         if (UNLIKELY(isTabOrNewline(*iterator))) {
2664             syntaxViolation(hostBegin);
2665             continue;
2666         }
2667         if (*iterator == ':')
2668             break;
2669         if (UNLIKELY(!isASCII(*iterator)))
2670             syntaxViolation(hostBegin);
2671
2672         uint8_t buffer[U8_MAX_LENGTH];
2673         int32_t offset = 0;
2674         UBool error = false;
2675         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2676         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2677         // FIXME: Check error.
2678         utf8Encoded.append(buffer, offset);
2679     }
2680     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2681     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2682     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2683         syntaxViolation(hostBegin);
2684     auto asciiDomain = domainToASCII(domain, hostBegin);
2685     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2686         return false;
2687     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2688     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2689
2690     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2691         serializeIPv4(address.value());
2692         m_url.m_hostEnd = currentPosition(iterator);
2693         if (iterator.atEnd()) {
2694             m_url.m_portEnd = currentPosition(iterator);
2695             return true;
2696         }
2697         return parsePort(iterator);
2698     }
2699
2700     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2701     m_url.m_hostEnd = currentPosition(iterator);
2702     if (!iterator.atEnd())
2703         return parsePort(iterator);
2704     m_url.m_portEnd = currentPosition(iterator);
2705     return true;
2706 }
2707
2708 Optional<String> URLParser::formURLDecode(StringView input)
2709 {
2710     auto utf8 = input.utf8(StrictConversion);
2711     if (utf8.isNull())
2712         return Nullopt;
2713     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2714     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2715 }
2716
2717 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2718 {
2719     Vector<StringView> sequences = input.split('&');
2720
2721     URLEncodedForm output;
2722     for (auto& bytes : sequences) {
2723         auto valueStart = bytes.find('=');
2724         if (valueStart == notFound) {
2725             if (auto name = formURLDecode(bytes))
2726                 output.append({name.value().replace('+', 0x20), emptyString()});
2727         } else {
2728             auto name = formURLDecode(bytes.substring(0, valueStart));
2729             auto value = formURLDecode(bytes.substring(valueStart + 1));
2730             if (name && value)
2731                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2732         }
2733     }
2734     return output;
2735 }
2736
2737 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2738 {
2739     auto utf8 = input.utf8(StrictConversion);
2740     const char* data = utf8.data();
2741     for (size_t i = 0; i < utf8.length(); ++i) {
2742         const char byte = data[i];
2743         if (byte == 0x20)
2744             output.append(0x2B);
2745         else if (byte == 0x2A
2746             || byte == 0x2D
2747             || byte == 0x2E
2748             || (byte >= 0x30 && byte <= 0x39)
2749             || (byte >= 0x41 && byte <= 0x5A)
2750             || byte == 0x5F
2751             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2752             output.append(byte);
2753         else
2754             percentEncodeByte(byte, output);
2755     }
2756 }
2757     
2758 String URLParser::serialize(const URLEncodedForm& tuples)
2759 {
2760     Vector<LChar> output;
2761     for (auto& tuple : tuples) {
2762         if (!output.isEmpty())
2763             output.append('&');
2764         serializeURLEncodedForm(tuple.first, output);
2765         output.append('=');
2766         serializeURLEncodedForm(tuple.second, output);
2767     }
2768     return String::adopt(WTFMove(output));
2769 }
2770
2771 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2772 {
2773     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2774     // but once we get rid of URL::parse its value should be tested.
2775     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2776         a.m_isValid,
2777         a.m_protocolIsInHTTPFamily,
2778         a.m_schemeEnd,
2779         a.m_userStart,
2780         a.m_userEnd,
2781         a.m_passwordEnd,
2782         a.m_hostEnd,
2783         a.m_portEnd,
2784         a.m_pathAfterLastSlash,
2785         a.m_pathEnd,
2786         a.m_queryEnd,
2787         a.m_fragmentEnd,
2788         a.m_string.utf8().data(),
2789         b.m_isValid,
2790         b.m_protocolIsInHTTPFamily,
2791         b.m_schemeEnd,
2792         b.m_userStart,
2793         b.m_userEnd,
2794         b.m_passwordEnd,
2795         b.m_hostEnd,
2796         b.m_portEnd,
2797         b.m_pathAfterLastSlash,
2798         b.m_pathEnd,
2799         b.m_queryEnd,
2800         b.m_fragmentEnd,
2801         b.m_string.utf8().data());
2802
2803     return a.m_string == b.m_string
2804         && a.m_isValid == b.m_isValid
2805         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2806         && a.m_schemeEnd == b.m_schemeEnd
2807         && a.m_userStart == b.m_userStart
2808         && a.m_userEnd == b.m_userEnd
2809         && a.m_passwordEnd == b.m_passwordEnd
2810         && a.m_hostEnd == b.m_hostEnd
2811         && a.m_portEnd == b.m_portEnd
2812         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2813         && a.m_pathEnd == b.m_pathEnd
2814         && a.m_queryEnd == b.m_queryEnd
2815         && a.m_fragmentEnd == b.m_fragmentEnd;
2816 }
2817
2818 bool URLParser::internalValuesConsistent(const URL& url)
2819 {
2820     return url.m_schemeEnd <= url.m_userStart
2821         && url.m_userStart <= url.m_userEnd
2822         && url.m_userEnd <= url.m_passwordEnd
2823         && url.m_passwordEnd <= url.m_hostEnd
2824         && url.m_hostEnd <= url.m_portEnd
2825         && url.m_portEnd <= url.m_pathAfterLastSlash
2826         && url.m_pathAfterLastSlash <= url.m_pathEnd
2827         && url.m_pathEnd <= url.m_queryEnd
2828         && url.m_queryEnd <= url.m_fragmentEnd
2829         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2830     // FIXME: Why do we even store m_fragmentEnd?
2831     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2832 }
2833
2834 static bool urlParserEnabled = true;
2835
2836 void URLParser::setEnabled(bool enabled)
2837 {
2838     urlParserEnabled = enabled;
2839 }
2840
2841 bool URLParser::enabled()
2842 {
2843     return urlParserEnabled;
2844 }
2845
2846 } // namespace WebCore