Update ANGLE
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include "RuntimeApplicationChecks.h"
31 #include <array>
32 #include <unicode/uidna.h>
33 #include <unicode/utypes.h>
34
35 namespace WebCore {
36
// Compile-time switch for parser tracing. When URL_PARSER_DEBUGGING is non-zero,
// URL_PARSER_LOG forwards its arguments to LOG on the URLParser channel; otherwise
// URL_PARSER_LOG expands to nothing.
#define URL_PARSER_DEBUGGING 0
    
#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
#else
#define URL_PARSER_LOG(...)
#endif
44     
45 template<typename CharacterType>
46 class CodePointIterator {
47 public:
48     ALWAYS_INLINE CodePointIterator() { }
49     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50         : m_begin(begin)
51         , m_end(end)
52     {
53     }
54     
55     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56         : CodePointIterator(begin.m_begin, end.m_begin)
57     {
58         ASSERT(end.m_begin >= begin.m_begin);
59     }
60     
61     ALWAYS_INLINE UChar32 operator*() const;
62     ALWAYS_INLINE CodePointIterator& operator++();
63
64     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65     {
66         return m_begin == other.m_begin
67             && m_end == other.m_end;
68     }
69     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70     
71     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
72     {
73         m_begin = other.m_begin;
74         m_end = other.m_end;
75         return *this;
76     }
77
78     ALWAYS_INLINE bool atEnd() const
79     {
80         ASSERT(m_begin <= m_end);
81         return m_begin >= m_end;
82     }
83     
84     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
85     {
86         ASSERT(m_begin >= reference);
87         return m_begin - reference;
88     }
89
90     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
91     {
92         return codeUnitsSince(other.m_begin);
93     }
94     
95 private:
96     const CharacterType* m_begin { nullptr };
97     const CharacterType* m_end { nullptr };
98 };
99
// LChar specialization: returns the code unit at the current position as the code point.
// Must not be called when atEnd().
template<>
ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
{
    ASSERT(!atEnd());
    return *m_begin;
}
106
107 template<>
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
109 {
110     ASSERT(!atEnd());
111     m_begin++;
112     return *this;
113 }
114
115 template<>
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
117 {
118     ASSERT(!atEnd());
119     UChar32 c;
120     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
121     return c;
122 }
123
124 template<>
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
126 {
127     ASSERT(!atEnd());
128     unsigned i = 0;
129     size_t length = m_end - m_begin;
130     U16_FWD_1(m_begin, i, length);
131     m_begin += i;
132     return *this;
133 }
134     
135 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
136 {
137     if (U_IS_BMP(codePoint)) {
138         destination.append(static_cast<UChar>(codePoint));
139         return;
140     }
141     destination.reserveCapacity(destination.size() + 2);
142     destination.uncheckedAppend(U16_LEAD(codePoint));
143     destination.uncheckedAppend(U16_TRAIL(codePoint));
144 }
145
// Bit flags stored per byte value in characterClassTable. Each flag is tested by one of
// the character predicates in this file.
enum URLCharacterClass {
    UserInfo = 0x1, // Tested by isInUserInfoEncodeSet.
    Default = 0x2, // Tested by isInDefaultEncodeSet.
    InvalidDomain = 0x4, // Tested by isInvalidDomainCharacter.
    QueryPercent = 0x8, // Tested by shouldPercentEncodeQueryByte.
    SlashQuestionOrHash = 0x10, // Tested by isSlashQuestionOrHash.
    ValidScheme = 0x20, // Tested by isValidSchemeCharacter.
};
154
// Lookup table indexed by byte value (0x00-0xFF). Each entry is the bitwise OR of the
// URLCharacterClass flags that apply to that byte; the trailing comment on every row
// names the byte the row describes.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, // '&'
    0, // '''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
413
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
415 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
416 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
417 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
418 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
419 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
420 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
421 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
422 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
423 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
424 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
425
426 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
427 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
428 {
429     ++iterator;
430     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
431         if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
432             syntaxViolation(iteratorForSyntaxViolationPosition);
433         ++iterator;
434     }
435 }
436
437 template<typename CharacterType>
438 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
439 {
440     if (iterator.atEnd())
441         return false;
442     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
443     if (iterator.atEnd())
444         return false;
445     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446     return iterator.atEnd();
447 }
448
449 template<typename CharacterType>
450 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
451 {
452     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
453         return false;
454     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
455     if (iterator.atEnd())
456         return false;
457     if (*iterator == ':')
458         return true;
459     if (UNLIKELY(*iterator == '|'))
460         return true;
461     return false;
462 }
463
464 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
465 {
466     ASSERT(isASCII(codePoint));
467     if (UNLIKELY(m_didSeeSyntaxViolation))
468         m_asciiBuffer.append(codePoint);
469 }
470
471 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
472 {
473     if (UNLIKELY(m_didSeeSyntaxViolation))
474         m_asciiBuffer.append(characters, length);
475 }
476
477 template<typename CharacterType>
478 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
479 {
480     ASSERT(isWindowsDriveLetter(iterator));
481     appendToASCIIBuffer(*iterator);
482     advance(iterator);
483     ASSERT(!iterator.atEnd());
484     ASSERT(*iterator == ':' || *iterator == '|');
485     if (*iterator == '|')
486         syntaxViolation(iterator);
487     appendToASCIIBuffer(':');
488     advance(iterator);
489 }
490
491 template<typename CharacterType>
492 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
493 {
494     if (!isWindowsDriveLetter(iterator))
495         return true;
496     if (iterator.atEnd())
497         return false;
498     advance(iterator);
499     if (iterator.atEnd())
500         return true;
501     advance(iterator);
502     if (iterator.atEnd())
503         return true;
504     return !isSlashQuestionOrHash(*iterator);
505 }
506
507 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
508 {
509     buffer.append('%');
510     buffer.append(upperNibbleToASCIIHexDigit(byte));
511     buffer.append(lowerNibbleToASCIIHexDigit(byte));
512 }
513
514 void URLParser::percentEncodeByte(uint8_t byte)
515 {
516     ASSERT(m_didSeeSyntaxViolation);
517     appendToASCIIBuffer('%');
518     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
519     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
520 }
521
// Percent-encoded UTF-8 bytes EF BF BD, appended in place of code points for which
// U_IS_UNICODE_CHAR is false. The length excludes the terminating NUL.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
524
525 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
526 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
527 {
528     ASSERT(!iterator.atEnd());
529     UChar32 codePoint = *iterator;
530     if (LIKELY(isASCII(codePoint))) {
531         if (UNLIKELY(isInCodeSet(codePoint))) {
532             syntaxViolation(iterator);
533             percentEncodeByte(codePoint);
534         } else
535             appendToASCIIBuffer(codePoint);
536         return;
537     }
538     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
539     syntaxViolation(iterator);
540     
541     if (!U_IS_UNICODE_CHAR(codePoint)) {
542         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
543         return;
544     }
545     
546     uint8_t buffer[U8_MAX_LENGTH];
547     int32_t offset = 0;
548     U8_APPEND_UNSAFE(buffer, offset, codePoint);
549     for (int32_t i = 0; i < offset; ++i)
550         percentEncodeByte(buffer[i]);
551 }
552
553 template<typename CharacterType>
554 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
555 {
556     ASSERT(!iterator.atEnd());
557     UChar32 codePoint = *iterator;
558     if (LIKELY(isASCII(codePoint))) {
559         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
560             syntaxViolation(iterator);
561             percentEncodeByte(codePoint);
562         } else
563             appendToASCIIBuffer(codePoint);
564         return;
565     }
566     
567     syntaxViolation(iterator);
568     
569     if (!U_IS_UNICODE_CHAR(codePoint)) {
570         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
571         return;
572     }
573
574     uint8_t buffer[U8_MAX_LENGTH];
575     int32_t offset = 0;
576     U8_APPEND_UNSAFE(buffer, offset, codePoint);
577     for (int32_t i = 0; i < offset; ++i) {
578         auto byte = buffer[i];
579         if (shouldPercentEncodeQueryByte(byte))
580             percentEncodeByte(byte);
581         else
582             appendToASCIIBuffer(byte);
583     }
584 }
585
// Encodes |source| with |encoding| and reconciles the encoded bytes with the input that
// remains at |iterator|.
//
// Phase 1 walks the encoded bytes and the input in lockstep. While each byte equals the
// input character and needs no percent-encoding, it is handed to appendToASCIIBuffer.
// The first byte that differs from the input or needs percent-encoding records a syntax
// violation and ends the phase.
// Phase 2 emits whatever encoded bytes are left, percent-encoding those flagged by
// shouldPercentEncodeQueryByte.
template<typename CharacterType>
void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
{
    // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
    CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
    const char* data = encoded.data();
    size_t length = encoded.length();
    
    // True when exactly one side is empty: no encoded bytes but input remains, or encoded
    // bytes but no input. Either way the two cannot match, so record a violation and stop.
    if (!length == !iterator.atEnd()) {
        syntaxViolation(iterator);
        return;
    }
    
    size_t i = 0;
    for (; i < length; ++i) {
        ASSERT(!iterator.atEnd());
        uint8_t byte = data[i];
        if (UNLIKELY(byte != *iterator)) {
            syntaxViolation(iterator);
            break;
        }
        if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
            syntaxViolation(iterator);
            break;
        }
        appendToASCIIBuffer(byte);
        ++iterator;
    }
    // Skip tabs and newlines at the current input position.
    while (!iterator.atEnd() && isTabOrNewline(*iterator))
        ++iterator;
    ASSERT((i == length) == iterator.atEnd());
    // |i| is deliberately not reset: this loop resumes where the first one stopped.
    for (; i < length; ++i) {
        ASSERT(m_didSeeSyntaxViolation);
        uint8_t byte = data[i];
        if (shouldPercentEncodeQueryByte(byte))
            percentEncodeByte(byte);
        else
            appendToASCIIBuffer(byte);
    }
}
626
627 Optional<uint16_t> defaultPortForProtocol(StringView scheme)
628 {
629     static const uint16_t ftpPort = 21;
630     static const uint16_t gopherPort = 70;
631     static const uint16_t httpPort = 80;
632     static const uint16_t httpsPort = 443;
633     static const uint16_t wsPort = 80;
634     static const uint16_t wssPort = 443;
635     
636     auto length = scheme.length();
637     if (!length)
638         return Nullopt;
639     switch (scheme[0]) {
640     case 'w':
641         switch (length) {
642         case 2:
643             if (scheme[1] == 's')
644                 return wsPort;
645             return Nullopt;
646         case 3:
647             if (scheme[1] == 's'
648                 && scheme[2] == 's')
649                 return wssPort;
650             return Nullopt;
651         default:
652             return false;
653         }
654     case 'h':
655         switch (length) {
656         case 4:
657             if (scheme[1] == 't'
658                 && scheme[2] == 't'
659                 && scheme[3] == 'p')
660                 return httpPort;
661             return Nullopt;
662         case 5:
663             if (scheme[1] == 't'
664                 && scheme[2] == 't'
665                 && scheme[3] == 'p'
666                 && scheme[4] == 's')
667                 return httpsPort;
668             return Nullopt;
669         default:
670             return Nullopt;
671         }
672     case 'g':
673         if (length == 6
674             && scheme[1] == 'o'
675             && scheme[2] == 'p'
676             && scheme[3] == 'h'
677             && scheme[4] == 'e'
678             && scheme[5] == 'r')
679             return gopherPort;
680         return Nullopt;
681     case 'f':
682         if (length == 3
683             && scheme[1] == 't'
684             && scheme[2] == 'p')
685             return ftpPort;
686         return Nullopt;
687     default:
688         return Nullopt;
689     }
690 }
691
692 bool isDefaultPortForProtocol(uint16_t port, StringView protocol)
693 {
694     return defaultPortForProtocol(protocol) == port;
695 }
696
// Result of classifying a scheme string with scheme(). Each named enumerator corresponds
// to one scheme spelled out in that function; every other input maps to NonSpecial.
enum class Scheme {
    WS,
    WSS,
    File,
    FTP,
    Gopher,
    HTTP,
    HTTPS,
    NonSpecial
};
707
708 ALWAYS_INLINE static Scheme scheme(StringView scheme)
709 {
710     auto length = scheme.length();
711     if (!length)
712         return Scheme::NonSpecial;
713     switch (scheme[0]) {
714     case 'f':
715         switch (length) {
716         case 3:
717             if (scheme[1] == 't'
718                 && scheme[2] == 'p')
719                 return Scheme::FTP;
720             return Scheme::NonSpecial;
721         case 4:
722             if (scheme[1] == 'i'
723                 && scheme[2] == 'l'
724                 && scheme[3] == 'e')
725                 return Scheme::File;
726             return Scheme::NonSpecial;
727         default:
728             return Scheme::NonSpecial;
729         }
730     case 'g':
731         if (length == 6
732             && scheme[1] == 'o'
733             && scheme[2] == 'p'
734             && scheme[3] == 'h'
735             && scheme[4] == 'e'
736             && scheme[5] == 'r')
737             return Scheme::Gopher;
738         return Scheme::NonSpecial;
739     case 'h':
740         switch (length) {
741         case 4:
742             if (scheme[1] == 't'
743                 && scheme[2] == 't'
744                 && scheme[3] == 'p')
745                 return Scheme::HTTP;
746             return Scheme::NonSpecial;
747         case 5:
748             if (scheme[1] == 't'
749                 && scheme[2] == 't'
750                 && scheme[3] == 'p'
751                 && scheme[4] == 's')
752                 return Scheme::HTTPS;
753             return Scheme::NonSpecial;
754         default:
755             return Scheme::NonSpecial;
756         }
757     case 'w':
758         switch (length) {
759         case 2:
760             if (scheme[1] == 's')
761                 return Scheme::WS;
762             return Scheme::NonSpecial;
763         case 3:
764             if (scheme[1] == 's'
765                 && scheme[2] == 's')
766                 return Scheme::WSS;
767             return Scheme::NonSpecial;
768         default:
769             return Scheme::NonSpecial;
770         }
771     default:
772         return Scheme::NonSpecial;
773     }
774 }
775
// Names a boundary inside a URL's string. urlLengthUntilPart() maps each enumerator to the
// matching offset member of URL, and copyURLPartsUntil() copies a base URL up to one of them.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
    FragmentEnd,
};
788
789 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
790 {
791     switch (part) {
792     case URLPart::FragmentEnd:
793         return url.m_fragmentEnd;
794     case URLPart::QueryEnd:
795         return url.m_queryEnd;
796     case URLPart::PathEnd:
797         return url.m_pathEnd;
798     case URLPart::PathAfterLastSlash:
799         return url.m_pathAfterLastSlash;
800     case URLPart::PortEnd:
801         return url.m_portEnd;
802     case URLPart::HostEnd:
803         return url.m_hostEnd;
804     case URLPart::PasswordEnd:
805         return url.m_passwordEnd;
806     case URLPart::UserEnd:
807         return url.m_userEnd;
808     case URLPart::UserStart:
809         return url.m_userStart;
810     case URLPart::SchemeEnd:
811         return url.m_schemeEnd;
812     }
813     ASSERT_NOT_REACHED();
814     return 0;
815 }
816
817 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
818 {
819     RELEASE_ASSERT(length <= string.length());
820     if (string.isNull())
821         return;
822     ASSERT(m_asciiBuffer.isEmpty());
823     if (string.is8Bit()) {
824         appendToASCIIBuffer(string.characters8(), length);
825     } else {
826         const UChar* characters = string.characters16();
827         for (size_t i = 0; i < length; ++i) {
828             UChar c = characters[i];
829             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
830             appendToASCIIBuffer(c);
831         }
832     }
833 }
834
// Seeds the URL being parsed with a prefix of |base|.
// - Records a syntax violation at |iterator|, then replaces the contents of m_asciiBuffer
//   with the characters of |base| up to the boundary named by |part|.
// - Copies the offsets of every component up to and including |part| from |base| into m_url,
//   together with m_isValid and m_protocolIsInHTTPFamily.
// - Sets m_urlIsSpecial from the copied scheme. |isUTF8Encoding| is set to true for ws, wss
//   and non-special schemes and is left unchanged for the other special schemes.
// |part| must not be URLPart::FragmentEnd.
template<typename CharacterType>
void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
{
    syntaxViolation(iterator);

    // Discard whatever syntaxViolation() copied from the input; the buffer restarts from |base|.
    m_asciiBuffer.clear();
    copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
    // Each case falls through to the next so that entering at |part| copies that offset and
    // every offset that precedes it in the URL.
    switch (part) {
    case URLPart::FragmentEnd:
        RELEASE_ASSERT_NOT_REACHED();
    case URLPart::QueryEnd:
        m_url.m_queryEnd = base.m_queryEnd;
        FALLTHROUGH;
    case URLPart::PathEnd:
        m_url.m_pathEnd = base.m_pathEnd;
        FALLTHROUGH;
    case URLPart::PathAfterLastSlash:
        m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
        FALLTHROUGH;
    case URLPart::PortEnd:
        m_url.m_portEnd = base.m_portEnd;
        FALLTHROUGH;
    case URLPart::HostEnd:
        m_url.m_hostEnd = base.m_hostEnd;
        FALLTHROUGH;
    case URLPart::PasswordEnd:
        m_url.m_passwordEnd = base.m_passwordEnd;
        FALLTHROUGH;
    case URLPart::UserEnd:
        m_url.m_userEnd = base.m_userEnd;
        FALLTHROUGH;
    case URLPart::UserStart:
        m_url.m_userStart = base.m_userStart;
        FALLTHROUGH;
    case URLPart::SchemeEnd:
        m_url.m_isValid = base.m_isValid;
        m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
        m_url.m_schemeEnd = base.m_schemeEnd;
    }
    // Classify the scheme that was just copied into the buffer.
    switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
    case Scheme::WS:
    case Scheme::WSS:
        isUTF8Encoding = true;
        FALLTHROUGH;
    case Scheme::File:
    case Scheme::FTP:
    case Scheme::Gopher:
    case Scheme::HTTP:
    case Scheme::HTTPS:
        m_urlIsSpecial = true;
        return;
    case Scheme::NonSpecial:
        m_urlIsSpecial = false;
        isUTF8Encoding = true;
        return;
    }
    ASSERT_NOT_REACHED();
}
893
894 template<typename CharacterType>
895 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
896 {
897     if (c.atEnd())
898         return false;
899     if (*c == '.') {
900         advance<CharacterType, ReportSyntaxViolation::No>(c);
901         return c.atEnd() || isSlashQuestionOrHash(*c);
902     }
903     return false;
904 }
905
906 template<typename CharacterType>
907 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
908 {
909     if (c.atEnd())
910         return false;
911     if (*c == '.') {
912         advance<CharacterType, ReportSyntaxViolation::No>(c);
913         return isSingleDotPathSegment(c);
914     }
915     return false;
916 }
917
918 template<typename CharacterType>
919 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
920 {
921     ASSERT(isSingleDotPathSegment(c));
922     advance(c);
923     if (!c.atEnd()) {
924         if (*c == '/' || *c == '\\')
925             advance(c);
926         else
927             ASSERT(*c == '?' || *c == '#');
928     }
929 }
930
931 template<typename CharacterType>
932 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
933 {
934     ASSERT(isDoubleDotPathSegment(c));
935     advance(c);
936     consumeSingleDotPathSegment(c);
937 }
938
939 void URLParser::popPath()
940 {
941     ASSERT(m_didSeeSyntaxViolation);
942     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
943         m_url.m_pathAfterLastSlash--;
944         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
945             m_url.m_pathAfterLastSlash--;
946         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
947             m_url.m_pathAfterLastSlash--;
948         m_url.m_pathAfterLastSlash++;
949     }
950     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
951 }
952
953 template<typename CharacterType>
954 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
955 {
956     if (m_didSeeSyntaxViolation)
957         return;
958     m_didSeeSyntaxViolation = true;
959     
960     ASSERT(m_asciiBuffer.isEmpty());
961     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
962     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
963     m_asciiBuffer.reserveCapacity(m_inputString.length());
964     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
965         ASSERT(isASCII(m_inputString[i]));
966         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
967     }
968 }
969
970 void URLParser::failure()
971 {
972     m_url.invalidate();
973     m_url.m_string = m_inputString;
974 }
975
976 template<typename CharacterType>
977 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
978 {
979     if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
980         return false;
981     advance<CharacterType, ReportSyntaxViolation::No>(iterator);
982     return true;
983 }
984
985 template<typename CharacterType>
986 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
987 {
988     if (!checkLocalhostCodePoint(iterator, 'l'))
989         return false;
990     if (!checkLocalhostCodePoint(iterator, 'o'))
991         return false;
992     if (!checkLocalhostCodePoint(iterator, 'c'))
993         return false;
994     if (!checkLocalhostCodePoint(iterator, 'a'))
995         return false;
996     if (!checkLocalhostCodePoint(iterator, 'l'))
997         return false;
998     if (!checkLocalhostCodePoint(iterator, 'h'))
999         return false;
1000     if (!checkLocalhostCodePoint(iterator, 'o'))
1001         return false;
1002     if (!checkLocalhostCodePoint(iterator, 's'))
1003         return false;
1004     if (!checkLocalhostCodePoint(iterator, 't'))
1005         return false;
1006     return iterator.atEnd();
1007 }
1008
1009 bool URLParser::isLocalhost(StringView view)
1010 {
1011     if (view.is8Bit())
1012         return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1013     return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1014 }
1015
1016 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1017 {
1018     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1019         ASSERT(start + length <= m_asciiBuffer.size());
1020         return StringView(m_asciiBuffer.data() + start, length);
1021     }
1022     ASSERT(start + length <= m_inputString.length());
1023     return StringView(m_inputString).substring(start, length);
1024 }
1025
1026 template<typename CharacterType>
1027 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1028 {
1029     if (UNLIKELY(m_didSeeSyntaxViolation))
1030         return m_asciiBuffer.size();
1031     
1032     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1033 }
1034
1035 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1036     : m_inputString(input)
1037 {
1038     if (input.isNull()) {
1039         if (base.isValid() && !base.m_cannotBeABaseURL) {
1040             m_url = base;
1041             m_url.removeFragmentIdentifier();
1042         }
1043         return;
1044     }
1045
1046     if (input.is8Bit()) {
1047         m_inputBegin = input.characters8();
1048         parse(input.characters8(), input.length(), base, encoding);
1049     } else {
1050         m_inputBegin = input.characters16();
1051         parse(input.characters16(), input.length(), base, encoding);
1052     }
1053
1054     ASSERT(!m_url.m_isValid
1055         || m_didSeeSyntaxViolation == (m_url.string() != input)
1056         || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1057             && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1058     ASSERT(internalValuesConsistent(m_url));
1059 #if !ASSERT_DISABLED
1060     if (!m_didSeeSyntaxViolation) {
1061         // Force a syntax violation at the beginning to make sure we get the same result.
1062         URLParser parser(makeString(" ", input), base, encoding);
1063         URL parsed = parser.result();
1064         if (parsed.isValid())
1065             ASSERT(allValuesEqual(parser.result(), m_url));
1066     }
1067 #endif
1068 }
1069
1070 template<typename CharacterType>
1071 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1072 {
1073     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1074     m_url = { };
1075     ASSERT(m_asciiBuffer.isEmpty());
1076     
1077     bool isUTF8Encoding = encoding == UTF8Encoding();
1078     Vector<UChar> queryBuffer;
1079
1080     unsigned endIndex = length;
1081     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1082         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1083         endIndex--;
1084     }
1085     CodePointIterator<CharacterType> c(input, input + endIndex);
1086     CodePointIterator<CharacterType> authorityOrHostBegin;
1087     CodePointIterator<CharacterType> queryBegin;
1088     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1089         syntaxViolation(c);
1090         ++c;
1091     }
1092     auto beginAfterControlAndSpace = c;
1093
1094     enum class State : uint8_t {
1095         SchemeStart,
1096         Scheme,
1097         NoScheme,
1098         SpecialRelativeOrAuthority,
1099         PathOrAuthority,
1100         Relative,
1101         RelativeSlash,
1102         SpecialAuthoritySlashes,
1103         SpecialAuthorityIgnoreSlashes,
1104         AuthorityOrHost,
1105         Host,
1106         File,
1107         FileSlash,
1108         FileHost,
1109         PathStart,
1110         Path,
1111         CannotBeABaseURLPath,
1112         UTF8Query,
1113         NonUTF8Query,
1114         Fragment,
1115     };
1116
1117 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1118 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1119
1120     State state = State::SchemeStart;
1121     while (!c.atEnd()) {
1122         if (UNLIKELY(isTabOrNewline(*c))) {
1123             syntaxViolation(c);
1124             ++c;
1125             continue;
1126         }
1127
1128         switch (state) {
1129         case State::SchemeStart:
1130             LOG_STATE("SchemeStart");
1131             if (isASCIIAlpha(*c)) {
1132                 if (UNLIKELY(isASCIIUpper(*c)))
1133                     syntaxViolation(c);
1134                 appendToASCIIBuffer(toASCIILower(*c));
1135                 advance(c);
1136                 if (c.atEnd()) {
1137                     m_asciiBuffer.clear();
1138                     state = State::NoScheme;
1139                     c = beginAfterControlAndSpace;
1140                 }
1141                 state = State::Scheme;
1142             } else
1143                 state = State::NoScheme;
1144             break;
1145         case State::Scheme:
1146             LOG_STATE("Scheme");
1147             if (isValidSchemeCharacter(*c)) {
1148                 if (UNLIKELY(isASCIIUpper(*c)))
1149                     syntaxViolation(c);
1150                 appendToASCIIBuffer(toASCIILower(*c));
1151             } else if (*c == ':') {
1152                 m_url.m_schemeEnd = currentPosition(c);
1153                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1154                 appendToASCIIBuffer(':');
1155                 switch (scheme(urlScheme)) {
1156                 case Scheme::File:
1157                     m_urlIsSpecial = true;
1158                     state = State::File;
1159                     ++c;
1160                     break;
1161                 case Scheme::WS:
1162                 case Scheme::WSS:
1163                     isUTF8Encoding = true;
1164                     m_urlIsSpecial = true;
1165                     if (base.protocolIs(urlScheme))
1166                         state = State::SpecialRelativeOrAuthority;
1167                     else
1168                         state = State::SpecialAuthoritySlashes;
1169                     ++c;
1170                     break;
1171                 case Scheme::HTTP:
1172                 case Scheme::HTTPS:
1173                     m_url.m_protocolIsInHTTPFamily = true;
1174                     FALLTHROUGH;
1175                 case Scheme::FTP:
1176                 case Scheme::Gopher:
1177                     m_urlIsSpecial = true;
1178                     if (base.protocolIs(urlScheme))
1179                         state = State::SpecialRelativeOrAuthority;
1180                     else
1181                         state = State::SpecialAuthoritySlashes;
1182                     ++c;
1183                     break;
1184                 case Scheme::NonSpecial:
1185                     isUTF8Encoding = true;
1186                     auto maybeSlash = c;
1187                     advance(maybeSlash);
1188                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1189                         appendToASCIIBuffer('/');
1190                         c = maybeSlash;
1191                         state = State::PathOrAuthority;
1192                         ASSERT(*c == '/');
1193                         ++c;
1194                         m_url.m_userStart = currentPosition(c);
1195                     } else {
1196                         ++c;
1197                         m_url.m_userStart = currentPosition(c);
1198                         m_url.m_userEnd = m_url.m_userStart;
1199                         m_url.m_passwordEnd = m_url.m_userStart;
1200                         m_url.m_hostEnd = m_url.m_userStart;
1201                         m_url.m_portEnd = m_url.m_userStart;
1202                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1203                         m_url.m_cannotBeABaseURL = true;
1204                         state = State::CannotBeABaseURLPath;
1205                     }
1206                     break;
1207                 }
1208                 break;
1209             } else {
1210                 m_asciiBuffer.clear();
1211                 state = State::NoScheme;
1212                 c = beginAfterControlAndSpace;
1213                 break;
1214             }
1215             advance(c);
1216             if (c.atEnd()) {
1217                 m_asciiBuffer.clear();
1218                 state = State::NoScheme;
1219                 c = beginAfterControlAndSpace;
1220             }
1221             break;
1222         case State::NoScheme:
1223             LOG_STATE("NoScheme");
1224             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1225                 failure();
1226                 return;
1227             }
1228             if (base.m_cannotBeABaseURL && *c == '#') {
1229                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1230                 state = State::Fragment;
1231                 appendToASCIIBuffer('#');
1232                 ++c;
1233                 break;
1234             }
1235             if (!base.protocolIs("file")) {
1236                 state = State::Relative;
1237                 break;
1238             }
1239             copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1240             appendToASCIIBuffer(':');
1241             state = State::File;
1242             break;
1243         case State::SpecialRelativeOrAuthority:
1244             LOG_STATE("SpecialRelativeOrAuthority");
1245             if (*c == '/') {
1246                 appendToASCIIBuffer('/');
1247                 advance(c);
1248                 if (c.atEnd()) {
1249                     failure();
1250                     return;
1251                 }
1252                 if (*c == '/') {
1253                     appendToASCIIBuffer('/');
1254                     state = State::SpecialAuthorityIgnoreSlashes;
1255                     ++c;
1256                 } else
1257                     state = State::RelativeSlash;
1258             } else
1259                 state = State::Relative;
1260             break;
1261         case State::PathOrAuthority:
1262             LOG_STATE("PathOrAuthority");
1263             if (*c == '/') {
1264                 appendToASCIIBuffer('/');
1265                 state = State::AuthorityOrHost;
1266                 advance(c);
1267                 m_url.m_userStart = currentPosition(c);
1268                 authorityOrHostBegin = c;
1269             } else {
1270                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1271                 m_url.m_userStart = currentPosition(c) - 1;
1272                 m_url.m_userEnd = m_url.m_userStart;
1273                 m_url.m_passwordEnd = m_url.m_userStart;
1274                 m_url.m_hostEnd = m_url.m_userStart;
1275                 m_url.m_portEnd = m_url.m_userStart;
1276                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1277                 state = State::Path;
1278             }
1279             break;
1280         case State::Relative:
1281             LOG_STATE("Relative");
1282             switch (*c) {
1283             case '/':
1284             case '\\':
1285                 state = State::RelativeSlash;
1286                 ++c;
1287                 break;
1288             case '?':
1289                 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1290                 appendToASCIIBuffer('?');
1291                 ++c;
1292                 if (isUTF8Encoding)
1293                     state = State::UTF8Query;
1294                 else {
1295                     queryBegin = c;
1296                     state = State::NonUTF8Query;
1297                 }
1298                 break;
1299             case '#':
1300                 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1301                 appendToASCIIBuffer('#');
1302                 state = State::Fragment;
1303                 ++c;
1304                 break;
1305             default:
1306                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1307                 state = State::Path;
1308                 break;
1309             }
1310             break;
1311         case State::RelativeSlash:
1312             LOG_STATE("RelativeSlash");
1313             if (*c == '/' || *c == '\\') {
1314                 ++c;
1315                 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1316                 appendToASCIIBuffer("://", 3);
1317                 state = State::SpecialAuthorityIgnoreSlashes;
1318             } else {
1319                 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1320                 appendToASCIIBuffer('/');
1321                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1322                 state = State::Path;
1323             }
1324             break;
1325         case State::SpecialAuthoritySlashes:
1326             LOG_STATE("SpecialAuthoritySlashes");
1327             if (LIKELY(*c == '/' || *c == '\\')) {
1328                 if (UNLIKELY(*c == '\\'))
1329                     syntaxViolation(c);
1330                 appendToASCIIBuffer('/');
1331                 advance(c);
1332                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1333                     if (UNLIKELY(*c == '\\'))
1334                         syntaxViolation(c);
1335                     ++c;
1336                     appendToASCIIBuffer('/');
1337                 } else {
1338                     syntaxViolation(c);
1339                     appendToASCIIBuffer('/');
1340                 }
1341             } else {
1342                 syntaxViolation(c);
1343                 appendToASCIIBuffer("//", 2);
1344             }
1345             state = State::SpecialAuthorityIgnoreSlashes;
1346             break;
1347         case State::SpecialAuthorityIgnoreSlashes:
1348             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1349             if (*c == '/' || *c == '\\') {
1350                 syntaxViolation(c);
1351                 ++c;
1352             } else {
1353                 m_url.m_userStart = currentPosition(c);
1354                 state = State::AuthorityOrHost;
1355                 authorityOrHostBegin = c;
1356             }
1357             break;
1358         case State::AuthorityOrHost:
1359             do {
1360                 LOG_STATE("AuthorityOrHost");
1361                 if (*c == '@') {
1362                     auto lastAt = c;
1363                     auto findLastAt = c;
1364                     while (!findLastAt.atEnd()) {
1365                         URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1366                         if (*findLastAt == '@')
1367                             lastAt = findLastAt;
1368                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1369                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1370                             break;
1371                         ++findLastAt;
1372                     }
1373                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1374                     c = lastAt;
1375                     advance(c);
1376                     authorityOrHostBegin = c;
1377                     state = State::Host;
1378                     m_hostHasPercentOrNonASCII = false;
1379                     break;
1380                 }
1381                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1382                 if (isSlash || *c == '?' || *c == '#') {
1383                     auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1384                     if (iterator.atEnd()) {
1385                         size_t position = currentPosition(c);
1386                         ASSERT(m_url.m_userStart == position);
1387                         RELEASE_ASSERT(position >= 2);
1388                         position -= 2;
1389                         ASSERT(parsedDataView(position, 2) == "//");
1390                         m_url.m_userStart = position;
1391                         m_url.m_userEnd = position;
1392                         m_url.m_passwordEnd = position;
1393                         m_url.m_hostEnd = position;
1394                         m_url.m_portEnd = position;
1395                         m_url.m_pathAfterLastSlash = position + 2;
1396                     } else {
1397                         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1398                         m_url.m_passwordEnd = m_url.m_userEnd;
1399                         if (!parseHostAndPort(iterator)) {
1400                             failure();
1401                             return;
1402                         }
1403                         if (UNLIKELY(!isSlash)) {
1404                             syntaxViolation(c);
1405                             appendToASCIIBuffer('/');
1406                             m_url.m_pathAfterLastSlash = currentPosition(c);
1407                         }
1408                     }
1409                     state = State::Path;
1410                     break;
1411                 }
1412                 if (isPercentOrNonASCII(*c))
1413                     m_hostHasPercentOrNonASCII = true;
1414                 ++c;
1415             } while (!c.atEnd());
1416             break;
1417         case State::Host:
1418             do {
1419                 LOG_STATE("Host");
1420                 if (*c == '/' || *c == '?' || *c == '#') {
1421                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1422                         failure();
1423                         return;
1424                     }
1425                     if (*c == '?' || *c == '#') {
1426                         syntaxViolation(c);
1427                         appendToASCIIBuffer('/');
1428                         m_url.m_pathAfterLastSlash = currentPosition(c);
1429                     }
1430                     state = State::Path;
1431                     break;
1432                 }
1433                 if (isPercentOrNonASCII(*c))
1434                     m_hostHasPercentOrNonASCII = true;
1435                 ++c;
1436             } while (!c.atEnd());
1437             break;
1438         case State::File:
1439             LOG_STATE("File");
1440             switch (*c) {
1441             case '\\':
1442                 syntaxViolation(c);
1443                 FALLTHROUGH;
1444             case '/':
1445                 appendToASCIIBuffer('/');
1446                 state = State::FileSlash;
1447                 ++c;
1448                 break;
1449             case '?':
1450                 syntaxViolation(c);
1451                 if (base.isValid() && base.protocolIs("file")) {
1452                     copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1453                     appendToASCIIBuffer('?');
1454                     ++c;
1455                 } else {
1456                     appendToASCIIBuffer("///?", 4);
1457                     ++c;
1458                     m_url.m_userStart = currentPosition(c) - 2;
1459                     m_url.m_userEnd = m_url.m_userStart;
1460                     m_url.m_passwordEnd = m_url.m_userStart;
1461                     m_url.m_hostEnd = m_url.m_userStart;
1462                     m_url.m_portEnd = m_url.m_userStart;
1463                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1464                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1465                 }
1466                 if (isUTF8Encoding)
1467                     state = State::UTF8Query;
1468                 else {
1469                     queryBegin = c;
1470                     state = State::NonUTF8Query;
1471                 }
1472                 break;
1473             case '#':
1474                 syntaxViolation(c);
1475                 if (base.isValid() && base.protocolIs("file")) {
1476                     copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1477                     appendToASCIIBuffer('#');
1478                 } else {
1479                     appendToASCIIBuffer("///#", 4);
1480                     m_url.m_userStart = currentPosition(c) - 2;
1481                     m_url.m_userEnd = m_url.m_userStart;
1482                     m_url.m_passwordEnd = m_url.m_userStart;
1483                     m_url.m_hostEnd = m_url.m_userStart;
1484                     m_url.m_portEnd = m_url.m_userStart;
1485                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1486                     m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1487                     m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1488                 }
1489                 state = State::Fragment;
1490                 ++c;
1491                 break;
1492             default:
1493                 syntaxViolation(c);
1494                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1495                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1496                 else {
1497                     appendToASCIIBuffer("///", 3);
1498                     m_url.m_userStart = currentPosition(c) - 1;
1499                     m_url.m_userEnd = m_url.m_userStart;
1500                     m_url.m_passwordEnd = m_url.m_userStart;
1501                     m_url.m_hostEnd = m_url.m_userStart;
1502                     m_url.m_portEnd = m_url.m_userStart;
1503                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1504                     if (isWindowsDriveLetter(c))
1505                         appendWindowsDriveLetter(c);
1506                 }
1507                 state = State::Path;
1508                 break;
1509             }
1510             break;
1511         case State::FileSlash:
1512             LOG_STATE("FileSlash");
1513             if (LIKELY(*c == '/' || *c == '\\')) {
1514                 if (UNLIKELY(*c == '\\'))
1515                     syntaxViolation(c);
1516                 appendToASCIIBuffer('/');
1517                 advance(c);
1518                 m_url.m_userStart = currentPosition(c);
1519                 m_url.m_userEnd = m_url.m_userStart;
1520                 m_url.m_passwordEnd = m_url.m_userStart;
1521                 m_url.m_hostEnd = m_url.m_userStart;
1522                 m_url.m_portEnd = m_url.m_userStart;
1523                 authorityOrHostBegin = c;
1524                 state = State::FileHost;
1525                 break;
1526             }
1527             if (base.isValid() && base.protocolIs("file")) {
1528                 // FIXME: This String copy is unnecessary.
1529                 String basePath = base.path();
1530                 if (basePath.length() >= 2) {
1531                     bool windowsQuirk = basePath.is8Bit()
1532                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1533                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1534                     if (windowsQuirk) {
1535                         appendToASCIIBuffer(basePath[0]);
1536                         appendToASCIIBuffer(basePath[1]);
1537                     }
1538                 }
1539             }
1540             syntaxViolation(c);
1541             appendToASCIIBuffer("//", 2);
1542             m_url.m_userStart = currentPosition(c) - 1;
1543             m_url.m_userEnd = m_url.m_userStart;
1544             m_url.m_passwordEnd = m_url.m_userStart;
1545             m_url.m_hostEnd = m_url.m_userStart;
1546             m_url.m_portEnd = m_url.m_userStart;
1547             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1548             if (isWindowsDriveLetter(c))
1549                 appendWindowsDriveLetter(c);
1550             state = State::Path;
1551             break;
1552         case State::FileHost:
1553             do {
1554                 LOG_STATE("FileHost");
1555                 if (isSlashQuestionOrHash(*c)) {
1556                     bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1557                         && isWindowsDriveLetter(authorityOrHostBegin);
1558                     if (windowsQuirk) {
1559                         syntaxViolation(authorityOrHostBegin);
1560                         appendToASCIIBuffer('/');
1561                         appendWindowsDriveLetter(authorityOrHostBegin);
1562                     }
1563                     if (windowsQuirk || authorityOrHostBegin == c) {
1564                         ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1565                         if (UNLIKELY(*c == '?')) {
1566                             syntaxViolation(c);
1567                             appendToASCIIBuffer("/?", 2);
1568                             ++c;
1569                             if (isUTF8Encoding)
1570                                 state = State::UTF8Query;
1571                             else {
1572                                 queryBegin = c;
1573                                 state = State::NonUTF8Query;
1574                             }
1575                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1576                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1577                             break;
1578                         }
1579                         if (UNLIKELY(*c == '#')) {
1580                             syntaxViolation(c);
1581                             appendToASCIIBuffer("/#", 2);
1582                             ++c;
1583                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1584                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1585                             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1586                             state = State::Fragment;
1587                             break;
1588                         }
1589                         state = State::Path;
1590                         break;
1591                     }
1592                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1593                         failure();
1594                         return;
1595                     }
1596                     if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1597                         syntaxViolation(c);
1598                         m_asciiBuffer.shrink(m_url.m_passwordEnd);
1599                         m_url.m_hostEnd = currentPosition(c);
1600                         m_url.m_portEnd = m_url.m_hostEnd;
1601                     }
1602                     
1603                     state = State::PathStart;
1604                     break;
1605                 }
1606                 if (isPercentOrNonASCII(*c))
1607                     m_hostHasPercentOrNonASCII = true;
1608                 ++c;
1609             } while (!c.atEnd());
1610             break;
1611         case State::PathStart:
1612             LOG_STATE("PathStart");
1613             if (*c != '/' && *c != '\\')
1614                 ++c;
1615             state = State::Path;
1616             break;
1617         case State::Path:
1618             LOG_STATE("Path");
1619             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1620                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1621                     syntaxViolation(c);
1622                 appendToASCIIBuffer('/');
1623                 ++c;
1624                 m_url.m_pathAfterLastSlash = currentPosition(c);
1625                 break;
1626             }
1627             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1628                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1629                     syntaxViolation(c);
1630                     consumeDoubleDotPathSegment(c);
1631                     popPath();
1632                     break;
1633                 }
1634                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1635                     syntaxViolation(c);
1636                     consumeSingleDotPathSegment(c);
1637                     break;
1638                 }
1639             }
1640             if (*c == '?') {
1641                 m_url.m_pathEnd = currentPosition(c);
1642                 appendToASCIIBuffer('?');
1643                 ++c;
1644                 if (isUTF8Encoding)
1645                     state = State::UTF8Query;
1646                 else {
1647                     queryBegin = c;
1648                     state = State::NonUTF8Query;
1649                 }
1650                 break;
1651             }
1652             if (*c == '#') {
1653                 m_url.m_pathEnd = currentPosition(c);
1654                 m_url.m_queryEnd = m_url.m_pathEnd;
1655                 state = State::Fragment;
1656                 break;
1657             }
1658             utf8PercentEncode<isInDefaultEncodeSet>(c);
1659             ++c;
1660             break;
1661         case State::CannotBeABaseURLPath:
1662             LOG_STATE("CannotBeABaseURLPath");
1663             if (*c == '?') {
1664                 m_url.m_pathEnd = currentPosition(c);
1665                 appendToASCIIBuffer('?');
1666                 ++c;
1667                 if (isUTF8Encoding)
1668                     state = State::UTF8Query;
1669                 else {
1670                     queryBegin = c;
1671                     state = State::NonUTF8Query;
1672                 }
1673             } else if (*c == '#') {
1674                 m_url.m_pathEnd = currentPosition(c);
1675                 m_url.m_queryEnd = m_url.m_pathEnd;
1676                 state = State::Fragment;
1677             } else if (*c == '/') {
1678                 appendToASCIIBuffer('/');
1679                 ++c;
1680                 m_url.m_pathAfterLastSlash = currentPosition(c);
1681             } else {
1682                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1683                 ++c;
1684             }
1685             break;
1686         case State::UTF8Query:
1687             LOG_STATE("UTF8Query");
1688             ASSERT(queryBegin == CodePointIterator<CharacterType>());
1689             if (*c == '#') {
1690                 m_url.m_queryEnd = currentPosition(c);
1691                 state = State::Fragment;
1692                 break;
1693             }
1694             if (isUTF8Encoding)
1695                 utf8QueryEncode(c);
1696             else
1697                 appendCodePoint(queryBuffer, *c);
1698             ++c;
1699             break;
1700         case State::NonUTF8Query:
1701             do {
1702                 LOG_STATE("NonUTF8Query");
1703                 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1704                 if (*c == '#') {
1705                     encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1706                     m_url.m_queryEnd = currentPosition(c);
1707                     state = State::Fragment;
1708                     break;
1709                 }
1710                 appendCodePoint(queryBuffer, *c);
1711                 advance(c, queryBegin);
1712             } while (!c.atEnd());
1713             break;
1714         case State::Fragment:
1715             URL_PARSER_LOG("State Fragment");
1716             utf8PercentEncode<isInSimpleEncodeSet>(c);
1717             ++c;
1718             break;
1719         }
1720     }
1721
1722     switch (state) {
1723     case State::SchemeStart:
1724         LOG_FINAL_STATE("SchemeStart");
1725         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1726             m_url = base;
1727             m_url.removeFragmentIdentifier();
1728             return;
1729         }
1730         failure();
1731         return;
1732     case State::Scheme:
1733         LOG_FINAL_STATE("Scheme");
1734         failure();
1735         return;
1736     case State::NoScheme:
1737         LOG_FINAL_STATE("NoScheme");
1738         RELEASE_ASSERT_NOT_REACHED();
1739     case State::SpecialRelativeOrAuthority:
1740         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1741         copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1742         m_url.m_fragmentEnd = m_url.m_queryEnd;
1743         break;
1744     case State::PathOrAuthority:
1745         LOG_FINAL_STATE("PathOrAuthority");
1746         ASSERT(m_url.m_userStart);
1747         ASSERT(m_url.m_userStart == currentPosition(c));
1748         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1749         m_url.m_userStart--;
1750         m_url.m_userEnd = m_url.m_userStart;
1751         m_url.m_passwordEnd = m_url.m_userStart;
1752         m_url.m_hostEnd = m_url.m_userStart;
1753         m_url.m_portEnd = m_url.m_userStart;
1754         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1755         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1756         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1757         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1758         break;
1759     case State::Relative:
1760         LOG_FINAL_STATE("Relative");
1761         RELEASE_ASSERT_NOT_REACHED();
1762     case State::RelativeSlash:
1763         LOG_FINAL_STATE("RelativeSlash");
1764         copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1765         appendToASCIIBuffer('/');
1766         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1767         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1768         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1769         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1770         break;
1771     case State::SpecialAuthoritySlashes:
1772         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1773         m_url.m_userStart = currentPosition(c);
1774         m_url.m_userEnd = m_url.m_userStart;
1775         m_url.m_passwordEnd = m_url.m_userStart;
1776         m_url.m_hostEnd = m_url.m_userStart;
1777         m_url.m_portEnd = m_url.m_userStart;
1778         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1779         m_url.m_pathEnd = m_url.m_userStart;
1780         m_url.m_queryEnd = m_url.m_userStart;
1781         m_url.m_fragmentEnd = m_url.m_userStart;
1782         break;
1783     case State::SpecialAuthorityIgnoreSlashes:
1784         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1785         failure();
1786         return;
1787         break;
1788     case State::AuthorityOrHost:
1789         LOG_FINAL_STATE("AuthorityOrHost");
1790         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1791         m_url.m_passwordEnd = m_url.m_userEnd;
1792         if (authorityOrHostBegin.atEnd()) {
1793             RELEASE_ASSERT(m_url.m_userStart >= 2);
1794             ASSERT(parsedDataView(m_url.m_userStart - 2, 2) == "//");
1795             m_url.m_userStart -= 2;
1796             m_url.m_userEnd = m_url.m_userStart;
1797             m_url.m_passwordEnd = m_url.m_userStart;
1798             m_url.m_hostEnd = m_url.m_userStart;
1799             m_url.m_portEnd = m_url.m_userStart;
1800             m_url.m_pathEnd = m_url.m_userStart + 2;
1801         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1802             failure();
1803             return;
1804         } else {
1805             if (m_urlIsSpecial) {
1806                 syntaxViolation(c);
1807                 appendToASCIIBuffer('/');
1808                 m_url.m_pathEnd = m_url.m_portEnd + 1;
1809             } else
1810                 m_url.m_pathEnd = m_url.m_portEnd;
1811         }
1812         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1813         m_url.m_queryEnd = m_url.m_pathEnd;
1814         m_url.m_fragmentEnd = m_url.m_pathEnd;
1815         break;
1816     case State::Host:
1817         LOG_FINAL_STATE("Host");
1818         if (!parseHostAndPort(authorityOrHostBegin)) {
1819             failure();
1820             return;
1821         }
1822         if (m_urlIsSpecial) {
1823             syntaxViolation(c);
1824             appendToASCIIBuffer('/');
1825             m_url.m_pathEnd = m_url.m_portEnd + 1;
1826         } else
1827             m_url.m_pathEnd = m_url.m_portEnd;
1828         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1829         m_url.m_queryEnd = m_url.m_pathEnd;
1830         m_url.m_fragmentEnd = m_url.m_pathEnd;
1831         break;
1832     case State::File:
1833         LOG_FINAL_STATE("File");
1834         if (base.isValid() && base.protocolIs("file")) {
1835             copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1836             appendToASCIIBuffer(':');
1837         }
1838         syntaxViolation(c);
1839         appendToASCIIBuffer("///", 3);
1840         m_url.m_userStart = currentPosition(c) - 1;
1841         m_url.m_userEnd = m_url.m_userStart;
1842         m_url.m_passwordEnd = m_url.m_userStart;
1843         m_url.m_hostEnd = m_url.m_userStart;
1844         m_url.m_portEnd = m_url.m_userStart;
1845         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1846         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1847         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1848         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1849         break;
1850     case State::FileSlash:
1851         LOG_FINAL_STATE("FileSlash");
1852         syntaxViolation(c);
1853         m_url.m_userStart = currentPosition(c) + 1;
1854         appendToASCIIBuffer("//", 2);
1855         m_url.m_userEnd = m_url.m_userStart;
1856         m_url.m_passwordEnd = m_url.m_userStart;
1857         m_url.m_hostEnd = m_url.m_userStart;
1858         m_url.m_portEnd = m_url.m_userStart;
1859         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1860         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1861         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1862         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1863         break;
1864     case State::FileHost:
1865         LOG_FINAL_STATE("FileHost");
1866         if (authorityOrHostBegin == c) {
1867             syntaxViolation(c);
1868             appendToASCIIBuffer('/');
1869             m_url.m_userStart = currentPosition(c) - 1;
1870             m_url.m_userEnd = m_url.m_userStart;
1871             m_url.m_passwordEnd = m_url.m_userStart;
1872             m_url.m_hostEnd = m_url.m_userStart;
1873             m_url.m_portEnd = m_url.m_userStart;
1874             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1875             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1876             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1877             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1878             break;
1879         }
1880
1881         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1882             failure();
1883             return;
1884         }
1885
1886         syntaxViolation(c);
1887         if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1888             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1889             m_url.m_hostEnd = currentPosition(c);
1890             m_url.m_portEnd = m_url.m_hostEnd;
1891         }
1892         appendToASCIIBuffer('/');
1893         m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
1894         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1895         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1896         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1897         break;
1898     case State::PathStart:
1899         LOG_FINAL_STATE("PathStart");
1900         RELEASE_ASSERT_NOT_REACHED();
1901     case State::Path:
1902         LOG_FINAL_STATE("Path");
1903         m_url.m_pathEnd = currentPosition(c);
1904         m_url.m_queryEnd = m_url.m_pathEnd;
1905         m_url.m_fragmentEnd = m_url.m_pathEnd;
1906         break;
1907     case State::CannotBeABaseURLPath:
1908         LOG_FINAL_STATE("CannotBeABaseURLPath");
1909         m_url.m_pathEnd = currentPosition(c);
1910         m_url.m_queryEnd = m_url.m_pathEnd;
1911         m_url.m_fragmentEnd = m_url.m_pathEnd;
1912         break;
1913     case State::UTF8Query:
1914         LOG_FINAL_STATE("UTF8Query");
1915         ASSERT(queryBegin == CodePointIterator<CharacterType>());
1916         m_url.m_queryEnd = currentPosition(c);
1917         m_url.m_fragmentEnd = m_url.m_queryEnd;
1918         break;
1919     case State::NonUTF8Query:
1920         LOG_FINAL_STATE("NonUTF8Query");
1921         ASSERT(queryBegin != CodePointIterator<CharacterType>());
1922         encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1923         m_url.m_queryEnd = currentPosition(c);
1924         m_url.m_fragmentEnd = m_url.m_queryEnd;
1925         break;
1926     case State::Fragment:
1927         LOG_FINAL_STATE("Fragment");
1928         m_url.m_fragmentEnd = currentPosition(c);
1929         break;
1930     }
1931
1932     if (LIKELY(!m_didSeeSyntaxViolation)) {
1933         m_url.m_string = m_inputString;
1934         ASSERT(m_asciiBuffer.isEmpty());
1935     } else
1936         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1937     m_url.m_isValid = true;
1938     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
1939 }
1940
1941 template<typename CharacterType>
1942 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1943 {
1944     if (UNLIKELY(iterator.atEnd())) {
1945         syntaxViolation(iterator);
1946         m_url.m_userEnd = currentPosition(iterator);
1947         m_url.m_passwordEnd = m_url.m_userEnd;
1948         return;
1949     }
1950     for (; !iterator.atEnd(); advance(iterator)) {
1951         if (*iterator == ':') {
1952             m_url.m_userEnd = currentPosition(iterator);
1953             auto iteratorAtColon = iterator;
1954             ++iterator;
1955             bool tabOrNewlineAfterColon = false;
1956             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
1957                 tabOrNewlineAfterColon = true;
1958                 ++iterator;
1959             }
1960             if (UNLIKELY(iterator.atEnd())) {
1961                 syntaxViolation(iteratorAtColon);
1962                 m_url.m_passwordEnd = m_url.m_userEnd;
1963                 if (m_url.m_userEnd > m_url.m_userStart)
1964                     appendToASCIIBuffer('@');
1965                 return;
1966             }
1967             if (tabOrNewlineAfterColon)
1968                 syntaxViolation(iteratorAtColon);
1969             appendToASCIIBuffer(':');
1970             break;
1971         }
1972         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
1973     }
1974     for (; !iterator.atEnd(); advance(iterator))
1975         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
1976     m_url.m_passwordEnd = currentPosition(iterator);
1977     if (!m_url.m_userEnd)
1978         m_url.m_userEnd = m_url.m_passwordEnd;
1979     appendToASCIIBuffer('@');
1980 }
1981
1982 template<typename UnsignedIntegerType>
1983 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
1984 {
1985     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1986     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1987     LChar* p = end;
1988     do {
1989         *--p = (number % 10) + '0';
1990         number /= 10;
1991     } while (number);
1992     appendToASCIIBuffer(p, end - p);
1993 }
1994
1995 void URLParser::serializeIPv4(IPv4Address address)
1996 {
1997     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
1998     appendToASCIIBuffer('.');
1999     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2000     appendToASCIIBuffer('.');
2001     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2002     appendToASCIIBuffer('.');
2003     appendNumberToASCIIBuffer<uint8_t>(address);
2004 }
2005     
// Returns how many consecutive zero pieces `address` contains starting at index `begin`.
// Yields 0 when the piece at `begin` is nonzero or `begin` is not below 8.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2015
2016 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2017 {
2018     Optional<size_t> longest;
2019     size_t longestLength = 0;
2020     for (size_t i = 0; i < 8; i++) {
2021         size_t length = zeroSequenceLength(address, i);
2022         if (length) {
2023             if (length > 1 && (!longest || longestLength < length)) {
2024                 longest = i;
2025                 longestLength = length;
2026             }
2027             i += length;
2028         }
2029     }
2030     return longest;
2031 }
2032
2033 void URLParser::serializeIPv6Piece(uint16_t piece)
2034 {
2035     bool printed = false;
2036     if (auto nibble0 = piece >> 12) {
2037         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2038         printed = true;
2039     }
2040     auto nibble1 = piece >> 8 & 0xF;
2041     if (printed || nibble1) {
2042         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2043         printed = true;
2044     }
2045     auto nibble2 = piece >> 4 & 0xF;
2046     if (printed || nibble2)
2047         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2048     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2049 }
2050
2051 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2052 {
2053     appendToASCIIBuffer('[');
2054     auto compressPointer = findLongestZeroSequence(address);
2055     for (size_t piece = 0; piece < 8; piece++) {
2056         if (compressPointer && compressPointer.value() == piece) {
2057             ASSERT(!address[piece]);
2058             if (piece)
2059                 appendToASCIIBuffer(':');
2060             else
2061                 appendToASCIIBuffer("::", 2);
2062             while (piece < 8 && !address[piece])
2063                 piece++;
2064             if (piece == 8)
2065                 break;
2066         }
2067         serializeIPv6Piece(address[piece]);
2068         if (piece < 7)
2069             appendToASCIIBuffer(':');
2070     }
2071     appendToASCIIBuffer(']');
2072 }
2073
2074 template<typename CharacterType>
2075 Optional<uint32_t> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2076 {
2077     enum class State : uint8_t {
2078         UnknownBase,
2079         Decimal,
2080         OctalOrHex,
2081         Octal,
2082         Hex,
2083     };
2084     State state = State::UnknownBase;
2085     Checked<uint32_t, RecordOverflow> value = 0;
2086     if (!iterator.atEnd() && *iterator == '.')
2087         return Nullopt;
2088     while (!iterator.atEnd()) {
2089         if (isTabOrNewline(*iterator)) {
2090             didSeeSyntaxViolation = true;
2091             ++iterator;
2092             continue;
2093         }
2094         if (*iterator == '.') {
2095             ASSERT(!value.hasOverflowed());
2096             return value.unsafeGet();
2097         }
2098         switch (state) {
2099         case State::UnknownBase:
2100             if (UNLIKELY(*iterator == '0')) {
2101                 ++iterator;
2102                 state = State::OctalOrHex;
2103                 break;
2104             }
2105             state = State::Decimal;
2106             break;
2107         case State::OctalOrHex:
2108             didSeeSyntaxViolation = true;
2109             if (*iterator == 'x' || *iterator == 'X') {
2110                 ++iterator;
2111                 state = State::Hex;
2112                 break;
2113             }
2114             state = State::Octal;
2115             break;
2116         case State::Decimal:
2117             if (*iterator < '0' || *iterator > '9')
2118                 return Nullopt;
2119             value *= 10;
2120             value += *iterator - '0';
2121             if (UNLIKELY(value.hasOverflowed()))
2122                 return Nullopt;
2123             ++iterator;
2124             break;
2125         case State::Octal:
2126             ASSERT(didSeeSyntaxViolation);
2127             if (*iterator < '0' || *iterator > '7')
2128                 return Nullopt;
2129             value *= 8;
2130             value += *iterator - '0';
2131             if (UNLIKELY(value.hasOverflowed()))
2132                 return Nullopt;
2133             ++iterator;
2134             break;
2135         case State::Hex:
2136             ASSERT(didSeeSyntaxViolation);
2137             if (!isASCIIHexDigit(*iterator))
2138                 return Nullopt;
2139             value *= 16;
2140             value += toASCIIHexValue(*iterator);
2141             if (UNLIKELY(value.hasOverflowed()))
2142                 return Nullopt;
2143             ++iterator;
2144             break;
2145         }
2146     }
2147     ASSERT(!value.hasOverflowed());
2148     return value.unsafeGet();
2149 }
2150
2151 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2152 {
2153     RELEASE_ASSERT(exponent <= 4);
2154     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2155     return values[exponent];
2156 }
2157
2158 template<typename CharacterType>
2159 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2160 {
2161     auto hostBegin = iterator;
2162
2163     Vector<uint32_t, 4> items;
2164     items.reserveInitialCapacity(4);
2165     bool didSeeSyntaxViolation = false;
2166     while (!iterator.atEnd()) {
2167         if (isTabOrNewline(*iterator)) {
2168             didSeeSyntaxViolation = true;
2169             ++iterator;
2170             continue;
2171         }
2172         if (items.size() >= 4)
2173             return Nullopt;
2174         if (auto item = parseIPv4Piece(iterator, didSeeSyntaxViolation))
2175             items.append(item.value());
2176         else
2177             return Nullopt;
2178         if (!iterator.atEnd()) {
2179             if (items.size() >= 4)
2180                 return Nullopt;
2181             if (*iterator == '.')
2182                 ++iterator;
2183             else
2184                 return Nullopt;
2185         }
2186     }
2187     if (!iterator.atEnd() || !items.size() || items.size() > 4)
2188         return Nullopt;
2189     if (items.size() > 1) {
2190         for (size_t i = 0; i < items.size() - 1; i++) {
2191             if (items[i] > 255)
2192                 return Nullopt;
2193         }
2194     }
2195     if (items[items.size() - 1] >= pow256(5 - items.size()))
2196         return Nullopt;
2197
2198     if (didSeeSyntaxViolation)
2199         syntaxViolation(hostBegin);
2200     for (auto item : items) {
2201         if (item > 255)
2202             syntaxViolation(hostBegin);
2203     }
2204
2205     if (UNLIKELY(items.size() != 4))
2206         syntaxViolation(hostBegin);
2207
2208     IPv4Address ipv4 = items.takeLast();
2209     for (size_t counter = 0; counter < items.size(); ++counter)
2210         ipv4 += items[counter] * pow256(3 - counter);
2211     return ipv4;
2212 }
2213
2214 template<typename CharacterType>
2215 Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2216 {
2217     if (iterator.atEnd())
2218         return Nullopt;
2219     uint32_t piece = 0;
2220     bool leadingZeros = false;
2221     size_t digitCount = 0;
2222     while (!iterator.atEnd()) {
2223         if (!isASCIIDigit(*iterator))
2224             return Nullopt;
2225         ++digitCount;
2226         if (!piece && *iterator == '0') {
2227             if (leadingZeros)
2228                 return Nullopt;
2229             leadingZeros = true;
2230         }
2231         if (!piece && *iterator == '0')
2232             leadingZeros = true;
2233         piece = piece * 10 + *iterator - '0';
2234         if (piece > 255)
2235             return Nullopt;
2236         advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2237         if (iterator.atEnd())
2238             break;
2239         if (*iterator == '.')
2240             break;
2241     }
2242     if (piece && leadingZeros)
2243         return Nullopt;
2244     return piece;
2245 }
2246
2247 template<typename CharacterType>
2248 Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2249 {
2250     IPv4Address address = 0;
2251     for (size_t i = 0; i < 4; ++i) {
2252         if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2253             address = (address << 8) + piece.value();
2254         else
2255             return Nullopt;
2256         if (i < 3) {
2257             if (iterator.atEnd())
2258                 return Nullopt;
2259             if (*iterator != '.')
2260                 return Nullopt;
2261             advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2262         } else if (!iterator.atEnd())
2263             return Nullopt;
2264     }
2265     ASSERT(iterator.atEnd());
2266     return address;
2267 }
2268
// Parses an IPv6 literal from the range `c`, whose first code point must be '[' (asserted).
// Returns the eight 16-bit pieces of the address, or Nullopt when the text is rejected.
// The range has to be fully consumed by the address itself; presumably the caller passes a
// range that ends before the closing ']' (TODO confirm against the call site).
// Inputs that are accepted but contain uppercase hex digits, superfluous leading zeros, an
// embedded dotted-decimal IPv4 address, or a "::" whose position differs from the one
// findLongestZeroSequence selects are reported through syntaxViolation(hostBegin).
template<typename CharacterType>
Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
{
    ASSERT(*c == '[');
    auto hostBegin = c;
    advance(c, hostBegin);
    if (c.atEnd())
        return Nullopt;

    IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
    // Index of the next piece to fill in.
    size_t piecePointer = 0;
    // Value of piecePointer right after the "::" was consumed; empty when no "::" was seen.
    Optional<size_t> compressPointer;

    // A literal may only start with ':' as part of a leading "::".
    if (*c == ':') {
        advance(c, hostBegin);
        if (c.atEnd())
            return Nullopt;
        if (*c != ':')
            return Nullopt;
        advance(c, hostBegin);
        ++piecePointer;
        compressPointer = piecePointer;
    }
    
    while (!c.atEnd()) {
        if (piecePointer == 8)
            return Nullopt;
        // A ':' at the start of a piece is the second colon of "::"; only one is allowed.
        if (*c == ':') {
            if (compressPointer)
                return Nullopt;
            advance(c, hostBegin);
            ++piecePointer;
            compressPointer = piecePointer;
            continue;
        }
        // At these positions the rest of the input may be a dotted-decimal IPv4 address that
        // supplies the final two pieces. If it does not parse as one, fall through and read
        // a hexadecimal piece instead.
        if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
            if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
                if (compressPointer && piecePointer == 5)
                    return Nullopt;
                syntaxViolation(hostBegin);
                address[piecePointer++] = ipv4Address.value() >> 16;
                address[piecePointer++] = ipv4Address.value() & 0xFFFF;
                // parseIPv4AddressInsideIPv6 took its iterator by value, so `c` was not
                // advanced; reset it to a default-constructed iterator before leaving the loop.
                c = { };
                break;
            }
        }
        // Read up to four hexadecimal digits into one piece.
        uint16_t value = 0;
        size_t length = 0;
        bool leadingZeros = false;
        for (; length < 4; length++) {
            if (c.atEnd())
                break;
            if (!isASCIIHexDigit(*c))
                break;
            if (isASCIIUpper(*c))
                syntaxViolation(hostBegin);
            if (*c == '0' && !length)
                leadingZeros = true;
            value = value * 0x10 + toASCIIHexValue(*c);
            advance(c, hostBegin);
        }
        
        // "0123" and "00" are accepted but flagged as syntax violations.
        if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
            syntaxViolation(hostBegin);

        address[piecePointer++] = value;
        if (c.atEnd())
            break;
        if (piecePointer == 8 || *c != ':')
            return Nullopt;
        // NOTE(review): when this ':' is the last code point of the range it is consumed and
        // the loop simply ends, without a failure or a syntax violation being reported here.
        // Confirm this is intended against the URL Standard's IPv6 parser.
        advance(c, hostBegin);
    }
    
    if (!c.atEnd())
        return Nullopt;
    
    if (compressPointer) {
        // Move the pieces that were parsed after the "::" to the end of the address, last
        // piece first, leaving the zero-initialized slots in the middle.
        size_t swaps = piecePointer - compressPointer.value();
        piecePointer = 7;
        while (swaps)
            std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
    } else if (piecePointer != 8)
        return Nullopt;

    // Compare where the input placed its "::" with where findLongestZeroSequence places it
    // (shifted by one to match compressPointer's convention); a mismatch, including one side
    // being absent, is a syntax violation.
    Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
    if (possibleCompressPointer)
        possibleCompressPointer.value()++;
    if (UNLIKELY(compressPointer != possibleCompressPointer))
        syntaxViolation(hostBegin);
    
    return address;
}
2361
2362 template<typename CharacterType>
2363 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2364 {
2365     Vector<LChar, defaultInlineBufferSize> output;
2366     output.reserveInitialCapacity(length);
2367     
2368     for (size_t i = 0; i < length; ++i) {
2369         uint8_t byte = input[i];
2370         if (byte != '%')
2371             output.uncheckedAppend(byte);
2372         else if (length > 2 && i < length - 2) {
2373             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2374                 syntaxViolation(iteratorForSyntaxViolationPosition);
2375                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2376                 i += 2;
2377             } else
2378                 output.uncheckedAppend(byte);
2379         } else
2380             output.uncheckedAppend(byte);
2381     }
2382     return output;
2383 }
2384     
2385 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2386 {
2387     Vector<LChar, defaultInlineBufferSize> output;
2388     output.reserveInitialCapacity(length);
2389     
2390     for (size_t i = 0; i < length; ++i) {
2391         uint8_t byte = input[i];
2392         if (byte != '%')
2393             output.uncheckedAppend(byte);
2394         else if (length > 2 && i < length - 2) {
2395             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2396                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2397                 i += 2;
2398             } else
2399                 output.uncheckedAppend(byte);
2400         } else
2401             output.uncheckedAppend(byte);
2402     }
2403     return output;
2404 }
2405
2406 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2407 {
2408     if (string.is8Bit())
2409         return charactersAreAllASCII(string.characters8(), string.length());
2410     return charactersAreAllASCII(string.characters16(), string.length());
2411 }
2412
2413 template<typename CharacterType>
2414 Optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2415 {
2416     Vector<LChar, defaultInlineBufferSize> ascii;
2417     if (containsOnlyASCII(domain)) {
2418         size_t length = domain.length();
2419         if (domain.is8Bit()) {
2420             const LChar* characters = domain.characters8();
2421             ascii.reserveInitialCapacity(length);
2422             if (m_urlIsSpecial) {
2423                 for (size_t i = 0; i < length; ++i) {
2424                     if (UNLIKELY(isASCIIUpper(characters[i])))
2425                         syntaxViolation(iteratorForSyntaxViolationPosition);
2426                     ascii.uncheckedAppend(toASCIILower(characters[i]));
2427                 }
2428             } else {
2429                 for (size_t i = 0; i < length; ++i)
2430                     ascii.uncheckedAppend(characters[i]);
2431             }
2432         } else {
2433             const UChar* characters = domain.characters16();
2434             ascii.reserveInitialCapacity(length);
2435             if (m_urlIsSpecial) {
2436                 for (size_t i = 0; i < length; ++i) {
2437                     if (UNLIKELY(isASCIIUpper(characters[i])))
2438                         syntaxViolation(iteratorForSyntaxViolationPosition);
2439                     ascii.uncheckedAppend(toASCIILower(characters[i]));
2440                 }
2441             } else {
2442                 for (size_t i = 0; i < length; ++i)
2443                     ascii.uncheckedAppend(characters[i]);
2444             }
2445         }
2446         return ascii;
2447     }
2448     
2449     UChar hostnameBuffer[defaultInlineBufferSize];
2450     UErrorCode error = U_ZERO_ERROR;
2451
2452 #if COMPILER(GCC) || COMPILER(CLANG)
2453 #pragma GCC diagnostic push
2454 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2455 #endif
2456     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2457     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2458 #if COMPILER(GCC) || COMPILER(CLANG)
2459 #pragma GCC diagnostic pop
2460 #endif
2461     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2462
2463     if (error == U_ZERO_ERROR) {
2464         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2465             ASSERT(isASCII(hostnameBuffer[i]));
2466             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2467         }
2468         ascii.append(hostnameBuffer, numCharactersConverted);
2469         if (domain != StringView(ascii.data(), ascii.size()))
2470             syntaxViolation(iteratorForSyntaxViolationPosition);
2471         return ascii;
2472     }
2473
2474     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2475     return Nullopt;
2476 }
2477
2478 bool URLParser::hasInvalidDomainCharacter(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2479 {
2480     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2481         if (isInvalidDomainCharacter(asciiDomain[i]))
2482             return true;
2483     }
2484     return false;
2485 }
2486
2487 template<typename CharacterType>
2488 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2489 {
2490     ASSERT(*iterator == ':');
2491     auto colonIterator = iterator;
2492     advance(iterator, colonIterator);
2493     uint32_t port = 0;
2494     if (UNLIKELY(iterator.atEnd())) {
2495         m_url.m_portEnd = currentPosition(colonIterator);
2496         syntaxViolation(colonIterator);
2497         return true;
2498     }
2499     size_t digitCount = 0;
2500     bool leadingZeros = false;
2501     for (; !iterator.atEnd(); ++iterator) {
2502         if (UNLIKELY(isTabOrNewline(*iterator))) {
2503             syntaxViolation(colonIterator);
2504             continue;
2505         }
2506         if (isASCIIDigit(*iterator)) {
2507             if (*iterator == '0' && !digitCount)
2508                 leadingZeros = true;
2509             ++digitCount;
2510             port = port * 10 + *iterator - '0';
2511             if (port > std::numeric_limits<uint16_t>::max())
2512                 return false;
2513         } else
2514             return false;
2515     }
2516
2517     if (port && leadingZeros)
2518         syntaxViolation(colonIterator);
2519     
2520     if (!port && digitCount > 1)
2521         syntaxViolation(colonIterator);
2522
2523     if (UNLIKELY(isDefaultPortForProtocol(port, parsedDataView(0, m_url.m_schemeEnd))))
2524         syntaxViolation(colonIterator);
2525     else {
2526         appendToASCIIBuffer(':');
2527         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2528         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2529     }
2530
2531     m_url.m_portEnd = currentPosition(iterator);
2532     return true;
2533 }
2534
2535 template<typename CharacterType>
2536 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2537 {
2538     if (iterator.atEnd())
2539         return false;
2540     if (*iterator == ':')
2541         return false;
2542     if (*iterator == '[') {
2543         auto ipv6End = iterator;
2544         while (!ipv6End.atEnd() && *ipv6End != ']')
2545             ++ipv6End;
2546         if (ipv6End.atEnd())
2547             return false;
2548         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2549             serializeIPv6(address.value());
2550             if (!ipv6End.atEnd()) {
2551                 advance(ipv6End);
2552                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2553                     m_url.m_hostEnd = currentPosition(ipv6End);
2554                     return parsePort(ipv6End);
2555                 }
2556                 m_url.m_hostEnd = currentPosition(ipv6End);
2557                 m_url.m_portEnd = m_url.m_hostEnd;
2558                 return true;
2559             }
2560             m_url.m_hostEnd = currentPosition(ipv6End);
2561             return true;
2562         }
2563     }
2564
2565     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2566         auto hostIterator = iterator;
2567         for (; !iterator.atEnd(); ++iterator) {
2568             if (isTabOrNewline(*iterator))
2569                 continue;
2570             if (*iterator == ':')
2571                 break;
2572             if (isInvalidDomainCharacter(*iterator))
2573                 return false;
2574         }
2575         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2576             serializeIPv4(address.value());
2577             m_url.m_hostEnd = currentPosition(iterator);
2578             if (iterator.atEnd()) {
2579                 m_url.m_portEnd = currentPosition(iterator);
2580                 return true;
2581             }
2582             return parsePort(iterator);
2583         }
2584         for (; hostIterator != iterator; ++hostIterator) {
2585             if (LIKELY(!isTabOrNewline(*hostIterator))) {
2586                 if (m_urlIsSpecial) {
2587                     if (UNLIKELY(isASCIIUpper(*hostIterator)))
2588                         syntaxViolation(hostIterator);
2589                     appendToASCIIBuffer(toASCIILower(*hostIterator));
2590                 } else
2591                     appendToASCIIBuffer(*hostIterator);
2592             } else
2593                 syntaxViolation(hostIterator);
2594         }
2595         m_url.m_hostEnd = currentPosition(iterator);
2596         if (!hostIterator.atEnd())
2597             return parsePort(hostIterator);
2598         m_url.m_portEnd = currentPosition(iterator);
2599         return true;
2600     }
2601     
2602     auto hostBegin = iterator;
2603     
2604     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2605     for (; !iterator.atEnd(); ++iterator) {
2606         if (UNLIKELY(isTabOrNewline(*iterator))) {
2607             syntaxViolation(hostBegin);
2608             continue;
2609         }
2610         if (*iterator == ':')
2611             break;
2612         if (UNLIKELY(!isASCII(*iterator)))
2613             syntaxViolation(hostBegin);
2614
2615         uint8_t buffer[U8_MAX_LENGTH];
2616         int32_t offset = 0;
2617         UBool error = false;
2618         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2619         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2620         // FIXME: Check error.
2621         utf8Encoded.append(buffer, offset);
2622     }
2623     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2624     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2625     if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2626         syntaxViolation(hostBegin);
2627     auto asciiDomain = domainToASCII(domain, hostBegin);
2628     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2629         return false;
2630     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2631     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2632
2633     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2634         serializeIPv4(address.value());
2635         m_url.m_hostEnd = currentPosition(iterator);
2636         if (iterator.atEnd()) {
2637             m_url.m_portEnd = currentPosition(iterator);
2638             return true;
2639         }
2640         return parsePort(iterator);
2641     }
2642
2643     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2644     m_url.m_hostEnd = currentPosition(iterator);
2645     if (!iterator.atEnd())
2646         return parsePort(iterator);
2647     m_url.m_portEnd = currentPosition(iterator);
2648     return true;
2649 }
2650
2651 Optional<String> URLParser::formURLDecode(StringView input)
2652 {
2653     auto utf8 = input.utf8(StrictConversion);
2654     if (utf8.isNull())
2655         return Nullopt;
2656     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2657     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2658 }
2659
2660 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2661 {
2662     Vector<StringView> sequences = input.split('&');
2663
2664     URLEncodedForm output;
2665     for (auto& bytes : sequences) {
2666         auto valueStart = bytes.find('=');
2667         if (valueStart == notFound) {
2668             if (auto name = formURLDecode(bytes))
2669                 output.append({name.value().replace('+', 0x20), emptyString()});
2670         } else {
2671             auto name = formURLDecode(bytes.substring(0, valueStart));
2672             auto value = formURLDecode(bytes.substring(valueStart + 1));
2673             if (name && value)
2674                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2675         }
2676     }
2677     return output;
2678 }
2679
2680 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2681 {
2682     auto utf8 = input.utf8(StrictConversion);
2683     const char* data = utf8.data();
2684     for (size_t i = 0; i < utf8.length(); ++i) {
2685         const char byte = data[i];
2686         if (byte == 0x20)
2687             output.append(0x2B);
2688         else if (byte == 0x2A
2689             || byte == 0x2D
2690             || byte == 0x2E
2691             || (byte >= 0x30 && byte <= 0x39)
2692             || (byte >= 0x41 && byte <= 0x5A)
2693             || byte == 0x5F
2694             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2695             output.append(byte);
2696         else
2697             percentEncodeByte(byte, output);
2698     }
2699 }
2700     
2701 String URLParser::serialize(const URLEncodedForm& tuples)
2702 {
2703     Vector<LChar> output;
2704     for (auto& tuple : tuples) {
2705         if (!output.isEmpty())
2706             output.append('&');
2707         serializeURLEncodedForm(tuple.first, output);
2708         output.append('=');
2709         serializeURLEncodedForm(tuple.second, output);
2710     }
2711     return String::adopt(WTFMove(output));
2712 }
2713
2714 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2715 {
2716     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2717     // but once we get rid of URL::parse its value should be tested.
2718     URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2719         a.m_isValid,
2720         a.m_protocolIsInHTTPFamily,
2721         a.m_schemeEnd,
2722         a.m_userStart,
2723         a.m_userEnd,
2724         a.m_passwordEnd,
2725         a.m_hostEnd,
2726         a.m_portEnd,
2727         a.m_pathAfterLastSlash,
2728         a.m_pathEnd,
2729         a.m_queryEnd,
2730         a.m_fragmentEnd,
2731         a.m_string.utf8().data(),
2732         b.m_isValid,
2733         b.m_protocolIsInHTTPFamily,
2734         b.m_schemeEnd,
2735         b.m_userStart,
2736         b.m_userEnd,
2737         b.m_passwordEnd,
2738         b.m_hostEnd,
2739         b.m_portEnd,
2740         b.m_pathAfterLastSlash,
2741         b.m_pathEnd,
2742         b.m_queryEnd,
2743         b.m_fragmentEnd,
2744         b.m_string.utf8().data());
2745
2746     return a.m_string == b.m_string
2747         && a.m_isValid == b.m_isValid
2748         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2749         && a.m_schemeEnd == b.m_schemeEnd
2750         && a.m_userStart == b.m_userStart
2751         && a.m_userEnd == b.m_userEnd
2752         && a.m_passwordEnd == b.m_passwordEnd
2753         && a.m_hostEnd == b.m_hostEnd
2754         && a.m_portEnd == b.m_portEnd
2755         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2756         && a.m_pathEnd == b.m_pathEnd
2757         && a.m_queryEnd == b.m_queryEnd
2758         && a.m_fragmentEnd == b.m_fragmentEnd;
2759 }
2760
2761 bool URLParser::internalValuesConsistent(const URL& url)
2762 {
2763     return url.m_schemeEnd <= url.m_userStart
2764         && url.m_userStart <= url.m_userEnd
2765         && url.m_userEnd <= url.m_passwordEnd
2766         && url.m_passwordEnd <= url.m_hostEnd
2767         && url.m_hostEnd <= url.m_portEnd
2768         && url.m_portEnd <= url.m_pathAfterLastSlash
2769         && url.m_pathAfterLastSlash <= url.m_pathEnd
2770         && url.m_pathEnd <= url.m_queryEnd
2771         && url.m_queryEnd <= url.m_fragmentEnd
2772         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2773     // FIXME: Why do we even store m_fragmentEnd?
2774     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2775 }
2776
// Tri-state flag recording whether URLParser is switched on.
enum class URLParserEnabled {
    Undetermined, // No decision has been made yet.
    Yes,
    No,
};

// File-local switch state; starts out undecided.
static URLParserEnabled urlParserEnabled { URLParserEnabled::Undetermined };
2784
2785 void URLParser::setEnabled(bool enabled)
2786 {
2787     urlParserEnabled = enabled ? URLParserEnabled::Yes : URLParserEnabled::No;
2788 }
2789
2790 bool URLParser::enabled()
2791 {
2792     if (urlParserEnabled == URLParserEnabled::Undetermined) {
2793 #if PLATFORM(MAC)
2794         urlParserEnabled = MacApplication::isSafari() ? URLParserEnabled::Yes : URLParserEnabled::No;
2795 #elif PLATFORM(IOS)
2796         urlParserEnabled = IOSApplication::isMobileSafari() ? URLParserEnabled::Yes : URLParserEnabled::No;
2797 #else
2798         urlParserEnabled = URLParserEnabled::Yes;
2799 #endif
2800     }
2801     return urlParserEnabled == URLParserEnabled::Yes;
2802 }
2803
2804 } // namespace WebCore