URLParser should ignore tabs in authority
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include <array>
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
33
34 namespace WebCore {
35
36 template<typename CharacterType>
37 class CodePointIterator {
38 public:
39     ALWAYS_INLINE CodePointIterator() { }
40     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
41         : m_begin(begin)
42         , m_end(end)
43     {
44     }
45     
46     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
47         : CodePointIterator(begin.m_begin, end.m_begin)
48     {
49         ASSERT(end.m_begin >= begin.m_begin);
50     }
51     
52     ALWAYS_INLINE UChar32 operator*() const;
53     ALWAYS_INLINE CodePointIterator& operator++();
54
55     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
56     {
57         return m_begin == other.m_begin
58             && m_end == other.m_end;
59     }
60     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
61     
62     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
63     {
64         m_begin = other.m_begin;
65         m_end = other.m_end;
66         return *this;
67     }
68
69     ALWAYS_INLINE bool atEnd() const
70     {
71         ASSERT(m_begin <= m_end);
72         return m_begin >= m_end;
73     }
74     
75     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
76     {
77         ASSERT(m_begin >= reference);
78         return m_begin - reference;
79     }
80
81     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
82     {
83         return codeUnitsSince(other.m_begin);
84     }
85     
86 private:
87     const CharacterType* m_begin { nullptr };
88     const CharacterType* m_end { nullptr };
89 };
90
91 template<>
92 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
93 {
94     ASSERT(!atEnd());
95     return *m_begin;
96 }
97
98 template<>
99 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
100 {
101     ASSERT(!atEnd());
102     m_begin++;
103     return *this;
104 }
105
106 template<>
107 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
108 {
109     ASSERT(!atEnd());
110     UChar32 c;
111     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
112     return c;
113 }
114
115 template<>
116 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
117 {
118     ASSERT(!atEnd());
119     unsigned i = 0;
120     size_t length = m_end - m_begin;
121     U16_FWD_1(m_begin, i, length);
122     m_begin += i;
123     return *this;
124 }
125     
126 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
127 {
128     if (U_IS_BMP(codePoint)) {
129         destination.append(static_cast<UChar>(codePoint));
130         return;
131     }
132     destination.reserveCapacity(destination.size() + 2);
133     destination.uncheckedAppend(U16_LEAD(codePoint));
134     destination.uncheckedAppend(U16_TRAIL(codePoint));
135 }
136
// Bit flags that can be OR-ed together to describe which character classes a
// byte value belongs to. Each flag occupies its own bit.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    Scheme = 1 << 5,
};
145
// Lookup table with one entry per byte value (0x00-0xFF). Each entry is the
// bitwise OR of the URLCharacterClass flags that apply to that byte; an entry
// of 0 means the byte belongs to none of the classes. The trailing comment on
// each row names the byte the row describes. The character predicates in this
// file test individual flags by indexing this table with the character value.
static const uint8_t characterClassTable[256] = {
    // C0 control characters (0x00-0x1F).
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    // Printable ASCII (0x20-0x7E).
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, // '&'
    0, // '''
    0, // '('
    0, // ')'
    0, // '*'
    Scheme, // '+'
    0, // ','
    Scheme, // '-'
    Scheme, // '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
    Scheme, // '0'
    Scheme, // '1'
    Scheme, // '2'
    Scheme, // '3'
    Scheme, // '4'
    Scheme, // '5'
    Scheme, // '6'
    Scheme, // '7'
    Scheme, // '8'
    Scheme, // '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'
    Scheme, // 'A'
    Scheme, // 'B'
    Scheme, // 'C'
    Scheme, // 'D'
    Scheme, // 'E'
    Scheme, // 'F'
    Scheme, // 'G'
    Scheme, // 'H'
    Scheme, // 'I'
    Scheme, // 'J'
    Scheme, // 'K'
    Scheme, // 'L'
    Scheme, // 'M'
    Scheme, // 'N'
    Scheme, // 'O'
    Scheme, // 'P'
    Scheme, // 'Q'
    Scheme, // 'R'
    Scheme, // 'S'
    Scheme, // 'T'
    Scheme, // 'U'
    Scheme, // 'V'
    Scheme, // 'W'
    Scheme, // 'X'
    Scheme, // 'Y'
    Scheme, // 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    Scheme, // 'a'
    Scheme, // 'b'
    Scheme, // 'c'
    Scheme, // 'd'
    Scheme, // 'e'
    Scheme, // 'f'
    Scheme, // 'g'
    Scheme, // 'h'
    Scheme, // 'i'
    Scheme, // 'j'
    Scheme, // 'k'
    Scheme, // 'l'
    Scheme, // 'm'
    Scheme, // 'n'
    Scheme, // 'o'
    Scheme, // 'p'
    Scheme, // 'q'
    Scheme, // 'r'
    Scheme, // 's'
    Scheme, // 't'
    Scheme, // 'u'
    Scheme, // 'v'
    Scheme, // 'w'
    Scheme, // 'x'
    Scheme, // 'y'
    Scheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    // DEL and all non-ASCII byte values (0x7F-0xFF) carry only QueryPercent.
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
404
405 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
406 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
407 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
408 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
409 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
410 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
411 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
412 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
413 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & Scheme; }
415 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
416
417 template<typename CharacterType>
418 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
419 {
420     ++iterator;
421     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
422         syntaxViolation(iteratorForSyntaxViolationPosition);
423         ++iterator;
424     }
425 }
426
427 template<typename CharacterType>
428 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
429 {
430     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
431         return false;
432     advance(iterator);
433     if (iterator.atEnd())
434         return false;
435     if (*iterator == ':')
436         return true;
437     if (UNLIKELY(*iterator == '|')) {
438         syntaxViolation(iterator);
439         return true;
440     }
441     return false;
442 }
443
444 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
445 {
446     ASSERT(m_unicodeFragmentBuffer.isEmpty());
447     ASSERT(isASCII(codePoint));
448     if (UNLIKELY(m_didSeeSyntaxViolation))
449         m_asciiBuffer.append(codePoint);
450 }
451
452 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
453 {
454     ASSERT(m_unicodeFragmentBuffer.isEmpty());
455     if (UNLIKELY(m_didSeeSyntaxViolation))
456         m_asciiBuffer.append(characters, length);
457 }
458
459 template<typename CharacterType>
460 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
461 {
462     ASSERT(isWindowsDriveLetter(iterator));
463     appendToASCIIBuffer(*iterator);
464     advance(iterator);
465     ASSERT(!iterator.atEnd());
466     ASSERT(*iterator == ':' || *iterator == '|');
467     appendToASCIIBuffer(':');
468     advance(iterator);
469 }
470
471 template<typename CharacterType>
472 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
473 {
474     if (!isWindowsDriveLetter(iterator))
475         return true;
476     if (iterator.atEnd())
477         return false;
478     advance(iterator);
479     if (iterator.atEnd())
480         return true;
481     advance(iterator);
482     if (iterator.atEnd())
483         return true;
484     return !isSlashQuestionOrHash(*iterator);
485 }
486
487 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
488 {
489     buffer.append('%');
490     buffer.append(upperNibbleToASCIIHexDigit(byte));
491     buffer.append(lowerNibbleToASCIIHexDigit(byte));
492 }
493
494 void URLParser::percentEncodeByte(uint8_t byte)
495 {
496     appendToASCIIBuffer('%');
497     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
498     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
499 }
500
// Percent-encoded form of the bytes EF BF BD (the UTF-8 encoding of U+FFFD),
// stored as a NUL-terminated string of nine characters.
const char replacementCharacterUTF8PercentEncoded[] = "%EF%BF%BD";
// Length of the string above, not counting the terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
503
504 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
505 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
506 {
507     ASSERT(!iterator.atEnd());
508     UChar32 codePoint = *iterator;
509     if (LIKELY(isASCII(codePoint))) {
510         if (UNLIKELY(isInCodeSet(codePoint))) {
511             syntaxViolation(iterator);
512             percentEncodeByte(codePoint);
513         } else
514             appendToASCIIBuffer(codePoint);
515         return;
516     }
517     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
518     syntaxViolation(iterator);
519     
520     if (!U_IS_UNICODE_CHAR(codePoint)) {
521         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
522         return;
523     }
524     
525     uint8_t buffer[U8_MAX_LENGTH];
526     int32_t offset = 0;
527     U8_APPEND_UNSAFE(buffer, offset, codePoint);
528     for (int32_t i = 0; i < offset; ++i)
529         percentEncodeByte(buffer[i]);
530 }
531
532 template<typename CharacterType>
533 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
534 {
535     ASSERT(!iterator.atEnd());
536     UChar32 codePoint = *iterator;
537     if (LIKELY(isASCII(codePoint))) {
538         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
539             syntaxViolation(iterator);
540             percentEncodeByte(codePoint);
541         } else
542             appendToASCIIBuffer(codePoint);
543         return;
544     }
545     
546     syntaxViolation(iterator);
547     
548     if (!U_IS_UNICODE_CHAR(codePoint)) {
549         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
550         return;
551     }
552
553     uint8_t buffer[U8_MAX_LENGTH];
554     int32_t offset = 0;
555     U8_APPEND_UNSAFE(buffer, offset, codePoint);
556     for (int32_t i = 0; i < offset; ++i) {
557         auto byte = buffer[i];
558         if (shouldPercentEncodeQueryByte(byte))
559             percentEncodeByte(byte);
560         else
561             appendToASCIIBuffer(byte);
562     }
563 }
564     
565 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding)
566 {
567     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
568     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
569     const char* data = encoded.data();
570     size_t length = encoded.length();
571     for (size_t i = 0; i < length; ++i) {
572         uint8_t byte = data[i];
573         if (shouldPercentEncodeQueryByte(byte))
574             percentEncodeByte(byte);
575         else
576             appendToASCIIBuffer(byte);
577     }
578 }
579
580 ALWAYS_INLINE static bool isDefaultPort(StringView scheme, uint16_t port)
581 {
582     static const uint16_t ftpPort = 21;
583     static const uint16_t gopherPort = 70;
584     static const uint16_t httpPort = 80;
585     static const uint16_t httpsPort = 443;
586     static const uint16_t wsPort = 80;
587     static const uint16_t wssPort = 443;
588     
589     auto length = scheme.length();
590     if (!length)
591         return false;
592     switch (scheme[0]) {
593     case 'w':
594         switch (length) {
595         case 2:
596             return scheme[1] == 's'
597                 && port == wsPort;
598         case 3:
599             return scheme[1] == 's'
600                 && scheme[2] == 's'
601                 && port == wssPort;
602         default:
603             return false;
604         }
605     case 'h':
606         switch (length) {
607         case 4:
608             return scheme[1] == 't'
609                 && scheme[2] == 't'
610                 && scheme[3] == 'p'
611                 && port == httpPort;
612         case 5:
613             return scheme[1] == 't'
614                 && scheme[2] == 't'
615                 && scheme[3] == 'p'
616                 && scheme[4] == 's'
617                 && port == httpsPort;
618         default:
619             return false;
620         }
621     case 'g':
622         return length == 6
623             && scheme[1] == 'o'
624             && scheme[2] == 'p'
625             && scheme[3] == 'h'
626             && scheme[4] == 'e'
627             && scheme[5] == 'r'
628             && port == gopherPort;
629     case 'f':
630         return length == 3
631             && scheme[1] == 't'
632             && scheme[2] == 'p'
633             && port == ftpPort;
634         return false;
635     default:
636         return false;
637     }
638 }
639
640 ALWAYS_INLINE static bool isSpecialScheme(StringView scheme)
641 {
642     auto length = scheme.length();
643     if (!length)
644         return false;
645     switch (scheme[0]) {
646     case 'f':
647         switch (length) {
648         case 3:
649             return scheme[1] == 't'
650                 && scheme[2] == 'p';
651         case 4:
652             return scheme[1] == 'i'
653                 && scheme[2] == 'l'
654                 && scheme[3] == 'e';
655         default:
656             return false;
657         }
658     case 'g':
659         return length == 6
660             && scheme[1] == 'o'
661             && scheme[2] == 'p'
662             && scheme[3] == 'h'
663             && scheme[4] == 'e'
664             && scheme[5] == 'r';
665     case 'h':
666         switch (length) {
667         case 4:
668             return scheme[1] == 't'
669                 && scheme[2] == 't'
670                 && scheme[3] == 'p';
671         case 5:
672             return scheme[1] == 't'
673                 && scheme[2] == 't'
674                 && scheme[3] == 'p'
675                 && scheme[4] == 's';
676         default:
677             return false;
678         }
679     case 'w':
680         switch (length) {
681         case 2:
682             return scheme[1] == 's';
683         case 3:
684             return scheme[1] == 's'
685                 && scheme[2] == 's';
686         default:
687             return false;
688         }
689     default:
690         return false;
691     }
692 }
693
// Names the component boundaries of a parsed URL. Each enumerator selects the
// URL offset member of the matching name (for example SchemeEnd selects
// URL::m_schemeEnd); urlLengthUntilPart() performs that mapping.
enum class URLParser::URLPart {
    SchemeEnd, // URL::m_schemeEnd
    UserStart, // URL::m_userStart
    UserEnd, // URL::m_userEnd
    PasswordEnd, // URL::m_passwordEnd
    HostEnd, // URL::m_hostEnd
    PortEnd, // URL::m_portEnd
    PathAfterLastSlash, // URL::m_pathAfterLastSlash
    PathEnd, // URL::m_pathEnd
    QueryEnd, // URL::m_queryEnd
    FragmentEnd, // URL::m_fragmentEnd
};
706
707 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
708 {
709     switch (part) {
710     case URLPart::FragmentEnd:
711         return url.m_fragmentEnd;
712     case URLPart::QueryEnd:
713         return url.m_queryEnd;
714     case URLPart::PathEnd:
715         return url.m_pathEnd;
716     case URLPart::PathAfterLastSlash:
717         return url.m_pathAfterLastSlash;
718     case URLPart::PortEnd:
719         return url.m_portEnd;
720     case URLPart::HostEnd:
721         return url.m_hostEnd;
722     case URLPart::PasswordEnd:
723         return url.m_passwordEnd;
724     case URLPart::UserEnd:
725         return url.m_userEnd;
726     case URLPart::UserStart:
727         return url.m_userStart;
728     case URLPart::SchemeEnd:
729         return url.m_schemeEnd;
730     }
731     ASSERT_NOT_REACHED();
732     return 0;
733 }
734
735 void URLParser::copyASCIIStringUntil(const String& string, size_t lengthIf8Bit, size_t lengthIf16Bit)
736 {
737     if (string.isNull()) {
738         ASSERT(!lengthIf8Bit);
739         ASSERT(!lengthIf16Bit);
740         return;
741     }
742     ASSERT(m_asciiBuffer.isEmpty());
743     if (string.is8Bit()) {
744         RELEASE_ASSERT(lengthIf8Bit <= string.length());
745         appendToASCIIBuffer(string.characters8(), lengthIf8Bit);
746     } else {
747         RELEASE_ASSERT(lengthIf16Bit <= string.length());
748         const UChar* characters = string.characters16();
749         for (size_t i = 0; i < lengthIf16Bit; ++i) {
750             UChar c = characters[i];
751             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
752             appendToASCIIBuffer(c);
753         }
754     }
755 }
756
757 template<typename CharacterType>
758 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator)
759 {
760     syntaxViolation(iterator);
761
762     m_asciiBuffer.clear();
763     m_unicodeFragmentBuffer.clear();
764     if (part == URLPart::FragmentEnd) {
765         copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, URLPart::FragmentEnd), urlLengthUntilPart(base, URLPart::QueryEnd));
766         if (!base.m_string.is8Bit()) {
767             const String& fragment = base.m_string;
768             bool seenUnicode = false;
769             for (size_t i = base.m_queryEnd; i < base.m_fragmentEnd; ++i) {
770                 if (!seenUnicode && !isASCII(fragment[i]))
771                     seenUnicode = true;
772                 if (seenUnicode)
773                     m_unicodeFragmentBuffer.uncheckedAppend(fragment[i]);
774                 else
775                     m_asciiBuffer.uncheckedAppend(fragment[i]);
776             }
777         }
778     } else {
779         size_t length = urlLengthUntilPart(base, part);
780         copyASCIIStringUntil(base.m_string, length, length);
781     }
782     switch (part) {
783     case URLPart::FragmentEnd:
784         m_url.m_fragmentEnd = base.m_fragmentEnd;
785         FALLTHROUGH;
786     case URLPart::QueryEnd:
787         m_url.m_queryEnd = base.m_queryEnd;
788         FALLTHROUGH;
789     case URLPart::PathEnd:
790         m_url.m_pathEnd = base.m_pathEnd;
791         FALLTHROUGH;
792     case URLPart::PathAfterLastSlash:
793         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
794         FALLTHROUGH;
795     case URLPart::PortEnd:
796         m_url.m_portEnd = base.m_portEnd;
797         FALLTHROUGH;
798     case URLPart::HostEnd:
799         m_url.m_hostEnd = base.m_hostEnd;
800         FALLTHROUGH;
801     case URLPart::PasswordEnd:
802         m_url.m_passwordEnd = base.m_passwordEnd;
803         FALLTHROUGH;
804     case URLPart::UserEnd:
805         m_url.m_userEnd = base.m_userEnd;
806         FALLTHROUGH;
807     case URLPart::UserStart:
808         m_url.m_userStart = base.m_userStart;
809         FALLTHROUGH;
810     case URLPart::SchemeEnd:
811         m_url.m_isValid = base.m_isValid;
812         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
813         m_url.m_schemeEnd = base.m_schemeEnd;
814     }
815     m_urlIsSpecial = isSpecialScheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd));
816 }
817
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e").
// Callers lower-case the second input digit before comparing it with 'e'.
// Both the characters and the pointer itself are constant.
static const char* const dotASCIICode = "2e";
819
820 template<typename CharacterType>
821 ALWAYS_INLINE static bool isPercentEncodedDot(CodePointIterator<CharacterType> c)
822 {
823     if (c.atEnd())
824         return false;
825     if (*c != '%')
826         return false;
827     ++c;
828     if (c.atEnd())
829         return false;
830     if (*c != dotASCIICode[0])
831         return false;
832     ++c;
833     if (c.atEnd())
834         return false;
835     return toASCIILower(*c) == dotASCIICode[1];
836 }
837
838 template<typename CharacterType>
839 ALWAYS_INLINE static bool isSingleDotPathSegment(CodePointIterator<CharacterType> c)
840 {
841     if (c.atEnd())
842         return false;
843     if (*c == '.') {
844         ++c;
845         return c.atEnd() || isSlashQuestionOrHash(*c);
846     }
847     if (*c != '%')
848         return false;
849     ++c;
850     if (c.atEnd() || *c != dotASCIICode[0])
851         return false;
852     ++c;
853     if (c.atEnd())
854         return false;
855     if (toASCIILower(*c) == dotASCIICode[1]) {
856         ++c;
857         return c.atEnd() || isSlashQuestionOrHash(*c);
858     }
859     return false;
860 }
861
862 template<typename CharacterType>
863 ALWAYS_INLINE static bool isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
864 {
865     if (c.atEnd())
866         return false;
867     if (*c == '.') {
868         ++c;
869         return isSingleDotPathSegment(c);
870     }
871     if (*c != '%')
872         return false;
873     ++c;
874     if (c.atEnd() || *c != dotASCIICode[0])
875         return false;
876     ++c;
877     if (c.atEnd())
878         return false;
879     if (toASCIILower(*c) == dotASCIICode[1]) {
880         ++c;
881         return isSingleDotPathSegment(c);
882     }
883     return false;
884 }
885
886 template<typename CharacterType>
887 static void consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
888 {
889     ASSERT(isSingleDotPathSegment(c));
890     if (*c == '.') {
891         ++c;
892         if (!c.atEnd()) {
893             if (*c == '/' || *c == '\\')
894                 ++c;
895             else
896                 ASSERT(*c == '?' || *c == '#');
897         }
898     } else {
899         ASSERT(*c == '%');
900         ++c;
901         ASSERT(*c == dotASCIICode[0]);
902         ++c;
903         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
904         ++c;
905         if (!c.atEnd()) {
906             if (*c == '/' || *c == '\\')
907                 ++c;
908             else
909                 ASSERT(*c == '?' || *c == '#');
910         }
911     }
912 }
913
914 template<typename CharacterType>
915 static void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
916 {
917     ASSERT(isDoubleDotPathSegment(c));
918     if (*c == '.')
919         ++c;
920     else {
921         ASSERT(*c == '%');
922         ++c;
923         ASSERT(*c == dotASCIICode[0]);
924         ++c;
925         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
926         ++c;
927     }
928     consumeSingleDotPathSegment(c);
929 }
930
931 void URLParser::popPath()
932 {
933     ASSERT(m_didSeeSyntaxViolation);
934     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
935         m_url.m_pathAfterLastSlash--;
936         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
937             m_url.m_pathAfterLastSlash--;
938         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
939             m_url.m_pathAfterLastSlash--;
940         m_url.m_pathAfterLastSlash++;
941     }
942     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
943 }
944
945 template<typename CharacterType>
946 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
947 {
948     if (m_didSeeSyntaxViolation)
949         return;
950     m_didSeeSyntaxViolation = true;
951     
952     ASSERT(m_asciiBuffer.isEmpty());
953     ASSERT(m_unicodeFragmentBuffer.isEmpty());
954     ASSERT_WITH_MESSAGE(!m_url.m_queryEnd, "syntaxViolation should not be used in the fragment, which might contain non-ASCII code points when serialized");
955     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
956     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
957     m_asciiBuffer.reserveCapacity(m_inputString.length());
958     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
959         ASSERT(isASCII(m_inputString[i]));
960         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
961     }
962 }
963
964 template<typename CharacterType>
965 void URLParser::fragmentSyntaxViolation(const CodePointIterator<CharacterType>& iterator)
966 {
967     if (m_didSeeSyntaxViolation)
968         return;
969     m_didSeeSyntaxViolation = true;
970     m_didSeeUnicodeFragmentCodePoint = true;
971
972     ASSERT(m_asciiBuffer.isEmpty());
973     ASSERT(m_unicodeFragmentBuffer.isEmpty());
974     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
975     size_t asciiCodeUnitsToCopy = m_url.m_queryEnd;
976     size_t unicodeCodeUnitsToCopy = codeUnitsToCopy - asciiCodeUnitsToCopy;
977     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
978     m_asciiBuffer.reserveCapacity(asciiCodeUnitsToCopy);
979     for (size_t i = 0; i < asciiCodeUnitsToCopy; ++i) {
980         ASSERT(isASCII(m_inputString[i]));
981         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
982     }
983     m_unicodeFragmentBuffer.reserveCapacity(m_inputString.length() - asciiCodeUnitsToCopy);
984     for (size_t i = asciiCodeUnitsToCopy; i < asciiCodeUnitsToCopy + unicodeCodeUnitsToCopy; ++i)
985         m_unicodeFragmentBuffer.uncheckedAppend(m_inputString[i]);
986 }
987
988 void URLParser::failure()
989 {
990     m_url.invalidate();
991     m_url.m_string = m_inputString;
992 }
993
994 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
995 {
996     if (UNLIKELY(m_didSeeSyntaxViolation)) {
997         ASSERT(start + length <= m_asciiBuffer.size());
998         return StringView(m_asciiBuffer.data() + start, length);
999     }
1000     ASSERT(start + length <= m_inputString.length());
1001     return StringView(m_inputString).substring(start, length);
1002 }
1003
1004 template<typename CharacterType>
1005 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1006 {
1007     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1008         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1009         return m_asciiBuffer.size();
1010     }
1011     
1012     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1013 }
1014
1015 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1016     : m_inputString(input)
1017 {
1018     if (input.isNull()) {
1019         if (base.isValid() && !base.m_cannotBeABaseURL)
1020             m_url = base;
1021         return;
1022     }
1023
1024     if (input.is8Bit()) {
1025         m_inputBegin = input.characters8();
1026         parse(input.characters8(), input.length(), base, encoding);
1027     } else {
1028         m_inputBegin = input.characters16();
1029         parse(input.characters16(), input.length(), base, encoding);
1030     }
1031     ASSERT(!m_url.m_isValid
1032         || m_didSeeSyntaxViolation == (m_url.string() != input)
1033         || (input.isEmpty() && m_url.m_string == base.m_string));
1034 }
1035
1036 template<typename CharacterType>
1037 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1038 {
1039     LOG(URLParser, "Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1040     m_url = { };
1041     ASSERT(m_asciiBuffer.isEmpty());
1042     ASSERT(m_unicodeFragmentBuffer.isEmpty());
1043     
1044     bool isUTF8Encoding = encoding == UTF8Encoding();
1045     Vector<UChar> queryBuffer;
1046
1047     unsigned endIndex = length;
1048     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1049         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1050         endIndex--;
1051     }
1052     CodePointIterator<CharacterType> c(input, input + endIndex);
1053     CodePointIterator<CharacterType> authorityOrHostBegin;
1054     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1055         syntaxViolation(c);
1056         ++c;
1057     }
1058     auto beginAfterControlAndSpace = c;
1059
1060     enum class State : uint8_t {
1061         SchemeStart,
1062         Scheme,
1063         NoScheme,
1064         SpecialRelativeOrAuthority,
1065         PathOrAuthority,
1066         Relative,
1067         RelativeSlash,
1068         SpecialAuthoritySlashes,
1069         SpecialAuthorityIgnoreSlashes,
1070         AuthorityOrHost,
1071         Host,
1072         File,
1073         FileSlash,
1074         FileHost,
1075         PathStart,
1076         Path,
1077         CannotBeABaseURLPath,
1078         Query,
1079         Fragment,
1080     };
1081
1082 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1083 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
1084
1085     State state = State::SchemeStart;
1086     while (!c.atEnd()) {
1087         if (UNLIKELY(isTabOrNewline(*c))) {
1088             syntaxViolation(c);
1089             ++c;
1090             continue;
1091         }
1092
1093         switch (state) {
1094         case State::SchemeStart:
1095             LOG_STATE("SchemeStart");
1096             if (isASCIIAlpha(*c)) {
1097                 if (UNLIKELY(isASCIIUpper(*c)))
1098                     syntaxViolation(c);
1099                 appendToASCIIBuffer(toASCIILower(*c));
1100                 advance(c);
1101                 if (c.atEnd()) {
1102                     m_asciiBuffer.clear();
1103                     state = State::NoScheme;
1104                     c = beginAfterControlAndSpace;
1105                 }
1106                 state = State::Scheme;
1107             } else
1108                 state = State::NoScheme;
1109             break;
1110         case State::Scheme:
1111             LOG_STATE("Scheme");
1112             if (isValidSchemeCharacter(*c)) {
1113                 if (UNLIKELY(isASCIIUpper(*c)))
1114                     syntaxViolation(c);
1115                 appendToASCIIBuffer(toASCIILower(*c));
1116             } else if (*c == ':') {
1117                 m_url.m_schemeEnd = currentPosition(c);
1118                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1119                 m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
1120                 appendToASCIIBuffer(':');
1121                 if (urlScheme == "file") {
1122                     m_urlIsSpecial = true;
1123                     state = State::File;
1124                     ++c;
1125                     break;
1126                 }
1127                 if (isSpecialScheme(urlScheme)) {
1128                     m_urlIsSpecial = true;
1129                     if (base.protocolIs(urlScheme))
1130                         state = State::SpecialRelativeOrAuthority;
1131                     else
1132                         state = State::SpecialAuthoritySlashes;
1133                     ++c;
1134                 } else {
1135                     auto maybeSlash = c;
1136                     advance(maybeSlash);
1137                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1138                         appendToASCIIBuffer('/');
1139                         c = maybeSlash;
1140                         state = State::PathOrAuthority;
1141                         ASSERT(*c == '/');
1142                         ++c;
1143                         m_url.m_userStart = currentPosition(c);
1144                     } else {
1145                         ++c;
1146                         m_url.m_userStart = currentPosition(c);
1147                         m_url.m_userEnd = m_url.m_userStart;
1148                         m_url.m_passwordEnd = m_url.m_userStart;
1149                         m_url.m_hostEnd = m_url.m_userStart;
1150                         m_url.m_portEnd = m_url.m_userStart;
1151                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1152                         m_url.m_cannotBeABaseURL = true;
1153                         state = State::CannotBeABaseURLPath;
1154                     }
1155                 }
1156                 break;
1157             } else {
1158                 m_asciiBuffer.clear();
1159                 state = State::NoScheme;
1160                 c = beginAfterControlAndSpace;
1161                 break;
1162             }
1163             advance(c);
1164             if (c.atEnd()) {
1165                 m_asciiBuffer.clear();
1166                 state = State::NoScheme;
1167                 c = beginAfterControlAndSpace;
1168             }
1169             break;
1170         case State::NoScheme:
1171             LOG_STATE("NoScheme");
1172             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1173                 failure();
1174                 return;
1175             }
1176             if (base.m_cannotBeABaseURL && *c == '#') {
1177                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
1178                 state = State::Fragment;
1179                 appendToASCIIBuffer('#');
1180                 ++c;
1181                 break;
1182             }
1183             if (!base.protocolIs("file")) {
1184                 state = State::Relative;
1185                 break;
1186             }
1187             copyURLPartsUntil(base, URLPart::SchemeEnd, c);
1188             appendToASCIIBuffer(':');
1189             state = State::File;
1190             break;
1191         case State::SpecialRelativeOrAuthority:
1192             LOG_STATE("SpecialRelativeOrAuthority");
1193             if (*c == '/') {
1194                 appendToASCIIBuffer('/');
1195                 advance(c);
1196                 if (c.atEnd()) {
1197                     failure();
1198                     return;
1199                 }
1200                 if (*c == '/') {
1201                     appendToASCIIBuffer('/');
1202                     state = State::SpecialAuthorityIgnoreSlashes;
1203                     ++c;
1204                 } else
1205                     state = State::RelativeSlash;
1206             } else
1207                 state = State::Relative;
1208             break;
1209         case State::PathOrAuthority:
1210             LOG_STATE("PathOrAuthority");
1211             if (*c == '/') {
1212                 appendToASCIIBuffer('/');
1213                 state = State::AuthorityOrHost;
1214                 ++c;
1215                 m_url.m_userStart = currentPosition(c);
1216                 authorityOrHostBegin = c;
1217             } else {
1218                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1219                 m_url.m_userStart = currentPosition(c) - 1;
1220                 m_url.m_userEnd = m_url.m_userStart;
1221                 m_url.m_passwordEnd = m_url.m_userStart;
1222                 m_url.m_hostEnd = m_url.m_userStart;
1223                 m_url.m_portEnd = m_url.m_userStart;
1224                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1225                 state = State::Path;
1226             }
1227             break;
1228         case State::Relative:
1229             LOG_STATE("Relative");
1230             switch (*c) {
1231             case '/':
1232             case '\\':
1233                 state = State::RelativeSlash;
1234                 ++c;
1235                 break;
1236             case '?':
1237                 copyURLPartsUntil(base, URLPart::PathEnd, c);
1238                 appendToASCIIBuffer('?');
1239                 state = State::Query;
1240                 ++c;
1241                 break;
1242             case '#':
1243                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
1244                 appendToASCIIBuffer('#');
1245                 state = State::Fragment;
1246                 ++c;
1247                 break;
1248             default:
1249                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c);
1250                 state = State::Path;
1251                 break;
1252             }
1253             break;
1254         case State::RelativeSlash:
1255             LOG_STATE("RelativeSlash");
1256             if (*c == '/' || *c == '\\') {
1257                 ++c;
1258                 copyURLPartsUntil(base, URLPart::SchemeEnd, c);
1259                 appendToASCIIBuffer("://", 3);
1260                 state = State::SpecialAuthorityIgnoreSlashes;
1261             } else {
1262                 copyURLPartsUntil(base, URLPart::PortEnd, c);
1263                 appendToASCIIBuffer('/');
1264                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1265                 state = State::Path;
1266             }
1267             break;
1268         case State::SpecialAuthoritySlashes:
1269             LOG_STATE("SpecialAuthoritySlashes");
1270             if (LIKELY(*c == '/' || *c == '\\')) {
1271                 if (UNLIKELY(*c == '\\'))
1272                     syntaxViolation(c);
1273                 appendToASCIIBuffer('/');
1274                 advance(c);
1275                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1276                     if (UNLIKELY(*c == '\\'))
1277                         syntaxViolation(c);
1278                     ++c;
1279                     appendToASCIIBuffer('/');
1280                 } else {
1281                     syntaxViolation(c);
1282                     appendToASCIIBuffer('/');
1283                 }
1284             } else {
1285                 syntaxViolation(c);
1286                 appendToASCIIBuffer("//", 2);
1287             }
1288             state = State::SpecialAuthorityIgnoreSlashes;
1289             break;
1290         case State::SpecialAuthorityIgnoreSlashes:
1291             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1292             if (*c == '/' || *c == '\\') {
1293                 syntaxViolation(c);
1294                 ++c;
1295             } else {
1296                 m_url.m_userStart = currentPosition(c);
1297                 state = State::AuthorityOrHost;
1298                 authorityOrHostBegin = c;
1299             }
1300             break;
1301         case State::AuthorityOrHost:
1302             do {
1303                 LOG_STATE("AuthorityOrHost");
1304                 if (*c == '@') {
1305                     auto lastAt = c;
1306                     auto findLastAt = c;
1307                     while (!findLastAt.atEnd()) {
1308                         if (*findLastAt == '@')
1309                             lastAt = findLastAt;
1310                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1311                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1312                             break;
1313                         ++findLastAt;
1314                     }
1315                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1316                     c = lastAt;
1317                     advance(c);
1318                     authorityOrHostBegin = c;
1319                     state = State::Host;
1320                     m_hostHasPercentOrNonASCII = false;
1321                     break;
1322                 }
1323                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1324                 if (isSlash || *c == '?' || *c == '#') {
1325                     m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1326                     m_url.m_passwordEnd = m_url.m_userEnd;
1327                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1328                         failure();
1329                         return;
1330                     }
1331                     if (UNLIKELY(!isSlash)) {
1332                         syntaxViolation(c);
1333                         appendToASCIIBuffer('/');
1334                         m_url.m_pathAfterLastSlash = currentPosition(c);
1335                     }
1336                     state = State::Path;
1337                     break;
1338                 }
1339                 if (isPercentOrNonASCII(*c))
1340                     m_hostHasPercentOrNonASCII = true;
1341                 ++c;
1342             } while (!c.atEnd());
1343             break;
1344         case State::Host:
1345             LOG_STATE("Host");
1346             if (*c == '/' || *c == '?' || *c == '#') {
1347                 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1348                     failure();
1349                     return;
1350                 }
1351                 if (*c == '?' || *c == '#') {
1352                     syntaxViolation(c);
1353                     appendToASCIIBuffer('/');
1354                     m_url.m_pathAfterLastSlash = currentPosition(c);
1355                 }
1356                 state = State::Path;
1357                 break;
1358             }
1359             if (isPercentOrNonASCII(*c))
1360                 m_hostHasPercentOrNonASCII = true;
1361             ++c;
1362             break;
1363         case State::File:
1364             LOG_STATE("File");
1365             switch (*c) {
1366             case '\\':
1367                 syntaxViolation(c);
1368                 FALLTHROUGH;
1369             case '/':
1370                 appendToASCIIBuffer('/');
1371                 state = State::FileSlash;
1372                 ++c;
1373                 break;
1374             case '?':
1375                 syntaxViolation(c);
1376                 if (base.isValid() && base.protocolIs("file"))
1377                     copyURLPartsUntil(base, URLPart::PathEnd, c);
1378                 appendToASCIIBuffer("///?", 4);
1379                 m_url.m_userStart = currentPosition(c) - 2;
1380                 m_url.m_userEnd = m_url.m_userStart;
1381                 m_url.m_passwordEnd = m_url.m_userStart;
1382                 m_url.m_hostEnd = m_url.m_userStart;
1383                 m_url.m_portEnd = m_url.m_userStart;
1384                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1385                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1386                 state = State::Query;
1387                 ++c;
1388                 break;
1389             case '#':
1390                 syntaxViolation(c);
1391                 if (base.isValid() && base.protocolIs("file"))
1392                     copyURLPartsUntil(base, URLPart::QueryEnd, c);
1393                 appendToASCIIBuffer("///#", 4);
1394                 m_url.m_userStart = currentPosition(c) - 2;
1395                 m_url.m_userEnd = m_url.m_userStart;
1396                 m_url.m_passwordEnd = m_url.m_userStart;
1397                 m_url.m_hostEnd = m_url.m_userStart;
1398                 m_url.m_portEnd = m_url.m_userStart;
1399                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1400                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1401                 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1402                 state = State::Fragment;
1403                 ++c;
1404                 break;
1405             default:
1406                 syntaxViolation(c);
1407                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1408                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c);
1409                 else {
1410                     appendToASCIIBuffer("///", 3);
1411                     m_url.m_userStart = currentPosition(c) - 1;
1412                     m_url.m_userEnd = m_url.m_userStart;
1413                     m_url.m_passwordEnd = m_url.m_userStart;
1414                     m_url.m_hostEnd = m_url.m_userStart;
1415                     m_url.m_portEnd = m_url.m_userStart;
1416                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1417                     if (isWindowsDriveLetter(c))
1418                         appendWindowsDriveLetter(c);
1419                 }
1420                 state = State::Path;
1421                 break;
1422             }
1423             break;
1424         case State::FileSlash:
1425             LOG_STATE("FileSlash");
1426             if (LIKELY(*c == '/' || *c == '\\')) {
1427                 if (UNLIKELY(*c == '\\'))
1428                     syntaxViolation(c);
1429                 ++c;
1430                 appendToASCIIBuffer('/');
1431                 m_url.m_userStart = currentPosition(c);
1432                 m_url.m_userEnd = m_url.m_userStart;
1433                 m_url.m_passwordEnd = m_url.m_userStart;
1434                 m_url.m_hostEnd = m_url.m_userStart;
1435                 m_url.m_portEnd = m_url.m_userStart;
1436                 authorityOrHostBegin = c;
1437                 state = State::FileHost;
1438                 break;
1439             }
1440             if (base.isValid() && base.protocolIs("file")) {
1441                 // FIXME: This String copy is unnecessary.
1442                 String basePath = base.path();
1443                 if (basePath.length() >= 2) {
1444                     bool windowsQuirk = basePath.is8Bit()
1445                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1446                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1447                     if (windowsQuirk) {
1448                         appendToASCIIBuffer(basePath[0]);
1449                         appendToASCIIBuffer(basePath[1]);
1450                     }
1451                 }
1452             }
1453             syntaxViolation(c);
1454             appendToASCIIBuffer("//", 2);
1455             m_url.m_userStart = currentPosition(c) - 1;
1456             m_url.m_userEnd = m_url.m_userStart;
1457             m_url.m_passwordEnd = m_url.m_userStart;
1458             m_url.m_hostEnd = m_url.m_userStart;
1459             m_url.m_portEnd = m_url.m_userStart;
1460             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1461             if (isWindowsDriveLetter(c))
1462                 appendWindowsDriveLetter(c);
1463             state = State::Path;
1464             break;
1465         case State::FileHost:
1466             LOG_STATE("FileHost");
1467             if (isSlashQuestionOrHash(*c)) {
1468                 bool windowsQuirk = c.codeUnitsSince(authorityOrHostBegin) == 2 && isWindowsDriveLetter(authorityOrHostBegin);
1469                 if (windowsQuirk) {
1470                     syntaxViolation(authorityOrHostBegin);
1471                     appendToASCIIBuffer('/');
1472                     appendWindowsDriveLetter(authorityOrHostBegin);
1473                 }
1474                 if (windowsQuirk || authorityOrHostBegin == c) {
1475                     ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1476                     if (UNLIKELY(*c == '?')) {
1477                         syntaxViolation(c);
1478                         appendToASCIIBuffer("/?", 2);
1479                         ++c;
1480                         m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1481                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1482                         state = State::Query;
1483                         break;
1484                     }
1485                     if (UNLIKELY(*c == '#')) {
1486                         syntaxViolation(c);
1487                         appendToASCIIBuffer("/#", 2);
1488                         ++c;
1489                         m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1490                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1491                         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1492                         state = State::Fragment;
1493                         break;
1494                     }
1495                     state = State::Path;
1496                     break;
1497                 }
1498                 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1499                     failure();
1500                     return;
1501                 }
1502                 if (UNLIKELY(equalLettersIgnoringASCIICase(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd), "localhost"))) {
1503                     syntaxViolation(c);
1504                     m_asciiBuffer.shrink(m_url.m_passwordEnd);
1505                     m_url.m_hostEnd = currentPosition(c);
1506                     m_url.m_portEnd = m_url.m_hostEnd;
1507                 }
1508                 
1509                 state = State::PathStart;
1510                 break;
1511             }
1512             if (isPercentOrNonASCII(*c))
1513                 m_hostHasPercentOrNonASCII = true;
1514             ++c;
1515             break;
1516         case State::PathStart:
1517             LOG_STATE("PathStart");
1518             if (*c != '/' && *c != '\\')
1519                 ++c;
1520             state = State::Path;
1521             break;
1522         case State::Path:
1523             LOG_STATE("Path");
1524             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1525                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1526                     syntaxViolation(c);
1527                 appendToASCIIBuffer('/');
1528                 ++c;
1529                 m_url.m_pathAfterLastSlash = currentPosition(c);
1530                 break;
1531             }
1532             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1533                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1534                     syntaxViolation(c);
1535                     consumeDoubleDotPathSegment(c);
1536                     popPath();
1537                     break;
1538                 }
1539                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1540                     syntaxViolation(c);
1541                     consumeSingleDotPathSegment(c);
1542                     break;
1543                 }
1544             }
1545             if (*c == '?') {
1546                 m_url.m_pathEnd = currentPosition(c);
1547                 state = State::Query;
1548                 break;
1549             }
1550             if (*c == '#') {
1551                 m_url.m_pathEnd = currentPosition(c);
1552                 m_url.m_queryEnd = m_url.m_pathEnd;
1553                 state = State::Fragment;
1554                 break;
1555             }
1556             if (isPercentEncodedDot(c)) {
1557                 if (UNLIKELY(*c != '.'))
1558                     syntaxViolation(c);
1559                 appendToASCIIBuffer('.');
1560                 ASSERT(*c == '%');
1561                 ++c;
1562                 ASSERT(*c == dotASCIICode[0]);
1563                 ++c;
1564                 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1565                 ++c;
1566                 break;
1567             }
1568             utf8PercentEncode<isInDefaultEncodeSet>(c);
1569             ++c;
1570             break;
1571         case State::CannotBeABaseURLPath:
1572             LOG_STATE("CannotBeABaseURLPath");
1573             if (*c == '?') {
1574                 m_url.m_pathEnd = currentPosition(c);
1575                 state = State::Query;
1576             } else if (*c == '#') {
1577                 m_url.m_pathEnd = currentPosition(c);
1578                 m_url.m_queryEnd = m_url.m_pathEnd;
1579                 state = State::Fragment;
1580             } else if (*c == '/') {
1581                 appendToASCIIBuffer('/');
1582                 ++c;
1583                 m_url.m_pathAfterLastSlash = currentPosition(c);
1584             } else {
1585                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1586                 ++c;
1587             }
1588             break;
1589         case State::Query:
1590             LOG_STATE("Query");
1591             if (*c == '#') {
1592                 if (!isUTF8Encoding)
1593                     encodeQuery(queryBuffer, encoding);
1594                 m_url.m_queryEnd = currentPosition(c);
1595                 state = State::Fragment;
1596                 break;
1597             }
1598             if (isUTF8Encoding)
1599                 utf8QueryEncode(c);
1600             else
1601                 appendCodePoint(queryBuffer, *c);
1602             ++c;
1603             break;
1604         case State::Fragment:
1605             do {
1606                 LOG(URLParser, "State Fragment");
1607                 if (!m_didSeeUnicodeFragmentCodePoint && isASCII(*c))
1608                     appendToASCIIBuffer(*c);
1609                 else {
1610                     m_didSeeUnicodeFragmentCodePoint = true;
1611                     if (UNLIKELY(m_didSeeSyntaxViolation))
1612                         appendCodePoint(m_unicodeFragmentBuffer, *c);
1613                     else {
1614                         ASSERT(m_asciiBuffer.isEmpty());
1615                         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1616                     }
1617                 }
1618                 ++c;
1619                 while (UNLIKELY(!c.atEnd() && isTabOrNewline(*c))) {
1620                     fragmentSyntaxViolation(c);
1621                     ++c;
1622                 }
1623             } while (!c.atEnd());
1624             break;
1625         }
1626     }
1627
1628     switch (state) {
1629     case State::SchemeStart:
1630         LOG_FINAL_STATE("SchemeStart");
1631         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1632             m_url = base;
1633             return;
1634         }
1635         failure();
1636         return;
1637     case State::Scheme:
1638         LOG_FINAL_STATE("Scheme");
1639         failure();
1640         return;
1641     case State::NoScheme:
1642         LOG_FINAL_STATE("NoScheme");
1643         RELEASE_ASSERT_NOT_REACHED();
1644     case State::SpecialRelativeOrAuthority:
1645         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1646         copyURLPartsUntil(base, URLPart::QueryEnd, c);
1647         m_url.m_fragmentEnd = m_url.m_queryEnd;
1648         break;
1649     case State::PathOrAuthority:
1650         LOG_FINAL_STATE("PathOrAuthority");
1651         ASSERT(m_url.m_userStart);
1652         ASSERT(m_url.m_userStart == currentPosition(c));
1653         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1654         m_url.m_userStart--;
1655         m_url.m_userEnd = m_url.m_userStart;
1656         m_url.m_passwordEnd = m_url.m_userStart;
1657         m_url.m_hostEnd = m_url.m_userStart;
1658         m_url.m_portEnd = m_url.m_userStart;
1659         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1660         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1661         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1662         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1663         break;
1664     case State::Relative:
1665         LOG_FINAL_STATE("Relative");
1666         copyURLPartsUntil(base, URLPart::FragmentEnd, c);
1667         break;
1668     case State::RelativeSlash:
1669         LOG_FINAL_STATE("RelativeSlash");
1670         copyURLPartsUntil(base, URLPart::PortEnd, c);
1671         appendToASCIIBuffer('/');
1672         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1673         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1674         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1675         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1676         break;
1677     case State::SpecialAuthoritySlashes:
1678         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1679         m_url.m_userStart = currentPosition(c);
1680         m_url.m_userEnd = m_url.m_userStart;
1681         m_url.m_passwordEnd = m_url.m_userStart;
1682         m_url.m_hostEnd = m_url.m_userStart;
1683         m_url.m_portEnd = m_url.m_userStart;
1684         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1685         m_url.m_pathEnd = m_url.m_userStart;
1686         m_url.m_queryEnd = m_url.m_userStart;
1687         m_url.m_fragmentEnd = m_url.m_userStart;
1688         break;
1689     case State::SpecialAuthorityIgnoreSlashes:
1690         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1691         failure();
1692         return;
1693         break;
1694     case State::AuthorityOrHost:
1695         LOG_FINAL_STATE("AuthorityOrHost");
1696         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1697         m_url.m_passwordEnd = m_url.m_userEnd;
1698         if (authorityOrHostBegin.atEnd()) {
1699             m_url.m_hostEnd = m_url.m_userEnd;
1700             m_url.m_portEnd = m_url.m_userEnd;
1701         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1702             failure();
1703             return;
1704         }
1705         syntaxViolation(c);
1706         appendToASCIIBuffer('/');
1707         m_url.m_pathEnd = m_url.m_portEnd + 1;
1708         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1709         m_url.m_queryEnd = m_url.m_pathEnd;
1710         m_url.m_fragmentEnd = m_url.m_pathEnd;
1711         break;
1712     case State::Host:
1713         LOG_FINAL_STATE("Host");
1714         if (!parseHostAndPort(authorityOrHostBegin)) {
1715             failure();
1716             return;
1717         }
1718         syntaxViolation(c);
1719         appendToASCIIBuffer('/');
1720         m_url.m_pathEnd = m_url.m_portEnd + 1;
1721         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1722         m_url.m_queryEnd = m_url.m_pathEnd;
1723         m_url.m_fragmentEnd = m_url.m_pathEnd;
1724         break;
1725     case State::File:
1726         LOG_FINAL_STATE("File");
1727         if (base.isValid() && base.protocolIs("file")) {
1728             copyURLPartsUntil(base, URLPart::QueryEnd, c);
1729             appendToASCIIBuffer(':');
1730         }
1731         syntaxViolation(c);
1732         appendToASCIIBuffer("///", 3);
1733         m_url.m_userStart = currentPosition(c) - 1;
1734         m_url.m_userEnd = m_url.m_userStart;
1735         m_url.m_passwordEnd = m_url.m_userStart;
1736         m_url.m_hostEnd = m_url.m_userStart;
1737         m_url.m_portEnd = m_url.m_userStart;
1738         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1739         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1740         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1741         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1742         break;
1743     case State::FileSlash:
1744         LOG_FINAL_STATE("FileSlash");
1745         syntaxViolation(c);
1746         m_url.m_userStart = currentPosition(c) + 1;
1747         appendToASCIIBuffer("//", 2);
1748         m_url.m_userEnd = m_url.m_userStart;
1749         m_url.m_passwordEnd = m_url.m_userStart;
1750         m_url.m_hostEnd = m_url.m_userStart;
1751         m_url.m_portEnd = m_url.m_userStart;
1752         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1753         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1754         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1755         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1756         break;
1757     case State::FileHost:
1758         LOG_FINAL_STATE("FileHost");
1759         if (authorityOrHostBegin == c) {
1760             syntaxViolation(c);
1761             appendToASCIIBuffer('/');
1762             m_url.m_userStart = currentPosition(c) - 1;
1763             m_url.m_userEnd = m_url.m_userStart;
1764             m_url.m_passwordEnd = m_url.m_userStart;
1765             m_url.m_hostEnd = m_url.m_userStart;
1766             m_url.m_portEnd = m_url.m_userStart;
1767             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1768             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1769             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1770             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1771             break;
1772         }
1773
1774         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1775             failure();
1776             return;
1777         }
1778
1779         syntaxViolation(c);
1780         if (equalLettersIgnoringASCIICase(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd), "localhost")) {
1781             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1782             m_url.m_hostEnd = currentPosition(c);
1783             m_url.m_portEnd = m_url.m_hostEnd;
1784         }
1785         appendToASCIIBuffer('/');
1786         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1787         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1788         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1789         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1790         break;
1791     case State::PathStart:
1792         LOG_FINAL_STATE("PathStart");
1793         RELEASE_ASSERT_NOT_REACHED();
1794     case State::Path:
1795         LOG_FINAL_STATE("Path");
1796         m_url.m_pathEnd = currentPosition(c);
1797         m_url.m_queryEnd = m_url.m_pathEnd;
1798         m_url.m_fragmentEnd = m_url.m_pathEnd;
1799         break;
1800     case State::CannotBeABaseURLPath:
1801         LOG_FINAL_STATE("CannotBeABaseURLPath");
1802         m_url.m_pathEnd = currentPosition(c);
1803         m_url.m_queryEnd = m_url.m_pathEnd;
1804         m_url.m_fragmentEnd = m_url.m_pathEnd;
1805         break;
1806     case State::Query:
1807         LOG_FINAL_STATE("Query");
1808         if (!isUTF8Encoding)
1809             encodeQuery(queryBuffer, encoding);
1810         m_url.m_queryEnd = currentPosition(c);
1811         m_url.m_fragmentEnd = m_url.m_queryEnd;
1812         break;
1813     case State::Fragment:
1814         {
1815             LOG_FINAL_STATE("Fragment");
1816             size_t length = m_didSeeSyntaxViolation ? m_asciiBuffer.size() + m_unicodeFragmentBuffer.size() : c.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1817             m_url.m_fragmentEnd = length;
1818             break;
1819         }
1820     }
1821
1822     if (LIKELY(!m_didSeeSyntaxViolation)) {
1823         m_url.m_string = m_inputString;
1824         ASSERT(m_asciiBuffer.isEmpty());
1825         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1826     } else if (!m_didSeeUnicodeFragmentCodePoint) {
1827         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1828         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1829     } else {
1830         Vector<UChar> buffer;
1831         buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
1832         buffer.appendVector(m_asciiBuffer);
1833         buffer.appendVector(m_unicodeFragmentBuffer);
1834         m_url.m_string = String::adopt(WTFMove(buffer));
1835     }
1836     m_url.m_isValid = true;
1837     LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
1838     ASSERT(internalValuesConsistent(m_url));
1839 }
1840
1841 template<typename CharacterType>
1842 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1843 {
1844     if (UNLIKELY(iterator.atEnd())) {
1845         syntaxViolation(iterator);
1846         m_url.m_userEnd = currentPosition(iterator);
1847         m_url.m_passwordEnd = m_url.m_userEnd;
1848         return;
1849     }
1850     for (; !iterator.atEnd(); advance(iterator)) {
1851         if (*iterator == ':') {
1852             m_url.m_userEnd = currentPosition(iterator);
1853             auto iteratorAtColon = iterator;
1854             ++iterator;
1855             bool tabOrNewlineAfterColon = false;
1856             while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
1857                 tabOrNewlineAfterColon = true;
1858                 ++iterator;
1859             }
1860             if (UNLIKELY(iterator.atEnd())) {
1861                 syntaxViolation(iteratorAtColon);
1862                 m_url.m_passwordEnd = m_url.m_userEnd;
1863                 if (m_url.m_userEnd > m_url.m_userStart)
1864                     appendToASCIIBuffer('@');
1865                 return;
1866             }
1867             if (tabOrNewlineAfterColon)
1868                 syntaxViolation(iteratorAtColon);
1869             appendToASCIIBuffer(':');
1870             break;
1871         }
1872         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
1873     }
1874     for (; !iterator.atEnd(); advance(iterator))
1875         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
1876     m_url.m_passwordEnd = currentPosition(iterator);
1877     if (!m_url.m_userEnd)
1878         m_url.m_userEnd = m_url.m_passwordEnd;
1879     appendToASCIIBuffer('@');
1880 }
1881
1882 template<typename UnsignedIntegerType>
1883 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
1884 {
1885     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1886     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1887     LChar* p = end;
1888     do {
1889         *--p = (number % 10) + '0';
1890         number /= 10;
1891     } while (number);
1892     appendToASCIIBuffer(p, end - p);
1893 }
1894
1895 void URLParser::serializeIPv4(IPv4Address address)
1896 {
1897     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
1898     appendToASCIIBuffer('.');
1899     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
1900     appendToASCIIBuffer('.');
1901     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
1902     appendToASCIIBuffer('.');
1903     appendNumberToASCIIBuffer<uint8_t>(address);
1904 }
1905     
// Returns how many consecutive zero pieces |address| holds starting at index |begin|.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
1915
1916 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
1917 {
1918     Optional<size_t> longest;
1919     size_t longestLength = 0;
1920     for (size_t i = 0; i < 8; i++) {
1921         size_t length = zeroSequenceLength(address, i);
1922         if (length) {
1923             if (length > 1 && (!longest || longestLength < length)) {
1924                 longest = i;
1925                 longestLength = length;
1926             }
1927             i += length;
1928         }
1929     }
1930     return longest;
1931 }
1932
1933 void URLParser::serializeIPv6Piece(uint16_t piece)
1934 {
1935     bool printed = false;
1936     if (auto nibble0 = piece >> 12) {
1937         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
1938         printed = true;
1939     }
1940     auto nibble1 = piece >> 8 & 0xF;
1941     if (printed || nibble1) {
1942         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
1943         printed = true;
1944     }
1945     auto nibble2 = piece >> 4 & 0xF;
1946     if (printed || nibble2)
1947         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
1948     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
1949 }
1950
1951 void URLParser::serializeIPv6(URLParser::IPv6Address address)
1952 {
1953     appendToASCIIBuffer('[');
1954     auto compressPointer = findLongestZeroSequence(address);
1955     for (size_t piece = 0; piece < 8; piece++) {
1956         if (compressPointer && compressPointer.value() == piece) {
1957             ASSERT(!address[piece]);
1958             if (piece)
1959                 appendToASCIIBuffer(':');
1960             else
1961                 appendToASCIIBuffer("::", 2);
1962             while (piece < 8 && !address[piece])
1963                 piece++;
1964             if (piece == 8)
1965                 break;
1966         }
1967         serializeIPv6Piece(address[piece]);
1968         if (piece < 7)
1969             appendToASCIIBuffer(':');
1970     }
1971     appendToASCIIBuffer(']');
1972 }
1973
1974 template<typename CharacterType>
1975 Optional<uint32_t> URLParser::parseIPv4Number(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
1976 {
1977     // FIXME: Check for overflow.
1978     enum class State : uint8_t {
1979         UnknownBase,
1980         Decimal,
1981         OctalOrHex,
1982         Octal,
1983         Hex,
1984     };
1985     State state = State::UnknownBase;
1986     uint32_t value = 0;
1987     while (!iterator.atEnd()) {
1988         if (*iterator == '.') {
1989             ++iterator;
1990             return value;
1991         }
1992         switch (state) {
1993         case State::UnknownBase:
1994             if (UNLIKELY(*iterator == '0')) {
1995                 ++iterator;
1996                 state = State::OctalOrHex;
1997                 break;
1998             }
1999             state = State::Decimal;
2000             break;
2001         case State::OctalOrHex:
2002             syntaxViolation(iteratorForSyntaxViolationPosition);
2003             if (*iterator == 'x' || *iterator == 'X') {
2004                 ++iterator;
2005                 state = State::Hex;
2006                 break;
2007             }
2008             state = State::Octal;
2009             break;
2010         case State::Decimal:
2011             if (*iterator < '0' || *iterator > '9')
2012                 return Nullopt;
2013             value *= 10;
2014             value += *iterator - '0';
2015             ++iterator;
2016             break;
2017         case State::Octal:
2018             ASSERT(m_didSeeSyntaxViolation);
2019             if (*iterator < '0' || *iterator > '7')
2020                 return Nullopt;
2021             value *= 8;
2022             value += *iterator - '0';
2023             ++iterator;
2024             break;
2025         case State::Hex:
2026             ASSERT(m_didSeeSyntaxViolation);
2027             if (!isASCIIHexDigit(*iterator))
2028                 return Nullopt;
2029             value *= 16;
2030             value += toASCIIHexValue(*iterator);
2031             ++iterator;
2032             break;
2033         }
2034     }
2035     return value;
2036 }
2037
2038 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2039 {
2040     RELEASE_ASSERT(exponent <= 4);
2041     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2042     return values[exponent];
2043 }
2044
2045 template<typename CharacterType>
2046 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2047 {
2048     auto hostBegin = iterator;
2049
2050     Vector<uint32_t, 4> items;
2051     items.reserveInitialCapacity(4);
2052     while (!iterator.atEnd()) {
2053         if (items.size() >= 4)
2054             return Nullopt;
2055         if (auto item = parseIPv4Number(iterator, hostBegin))
2056             items.append(item.value());
2057         else
2058             return Nullopt;
2059     }
2060     if (!items.size() || items.size() > 4)
2061         return Nullopt;
2062     if (items.size() > 2) {
2063         for (size_t i = 0; i < items.size() - 2; i++) {
2064             if (items[i] > 255)
2065                 return Nullopt;
2066         }
2067     }
2068     if (items[items.size() - 1] >= pow256(5 - items.size()))
2069         return Nullopt;
2070     for (auto item : items) {
2071         if (item > 255)
2072             return Nullopt;
2073     }
2074
2075     if (UNLIKELY(items.size() != 4))
2076         syntaxViolation(hostBegin);
2077
2078     IPv4Address ipv4 = items.takeLast();
2079     for (size_t counter = 0; counter < items.size(); ++counter)
2080         ipv4 += items[counter] * pow256(3 - counter);
2081     return ipv4;
2082 }
2083     
2084 template<typename CharacterType>
2085 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2086 {
2087     ASSERT(*c == '[');
2088     auto hostBegin = c;
2089     advance(c, hostBegin);
2090     if (c.atEnd())
2091         return Nullopt;
2092
2093     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2094     size_t piecePointer = 0;
2095     Optional<size_t> compressPointer;
2096
2097     if (*c == ':') {
2098         advance(c, hostBegin);
2099         if (c.atEnd())
2100             return Nullopt;
2101         if (*c != ':')
2102             return Nullopt;
2103         advance(c, hostBegin);
2104         ++piecePointer;
2105         compressPointer = piecePointer;
2106     }
2107     
2108     while (!c.atEnd()) {
2109         if (piecePointer == 8)
2110             return Nullopt;
2111         if (*c == ':') {
2112             if (compressPointer)
2113                 return Nullopt;
2114             advance(c, hostBegin);
2115             ++piecePointer;
2116             compressPointer = piecePointer;
2117             continue;
2118         }
2119         uint16_t value = 0;
2120         size_t length = 0;
2121         for (; length < 4; length++) {
2122             if (c.atEnd())
2123                 break;
2124             if (!isASCIIHexDigit(*c))
2125                 break;
2126             if (isASCIIUpper(*c))
2127                 syntaxViolation(hostBegin);
2128             value = value * 0x10 + toASCIIHexValue(*c);
2129             advance(c, hostBegin);
2130         }
2131         if (UNLIKELY(length > 1 && !value))
2132             syntaxViolation(hostBegin);
2133
2134         address[piecePointer++] = value;
2135         if (c.atEnd())
2136             break;
2137         if (*c != ':')
2138             return Nullopt;
2139         advance(c, hostBegin);
2140     }
2141     
2142     if (!c.atEnd()) {
2143         if (piecePointer > 6)
2144             return Nullopt;
2145         size_t dotsSeen = 0;
2146         while (!c.atEnd()) {
2147             Optional<uint16_t> value;
2148             if (!isASCIIDigit(*c))
2149                 return Nullopt;
2150             while (isASCIIDigit(*c)) {
2151                 auto number = *c - '0';
2152                 if (!value)
2153                     value = number;
2154                 else if (!value.value())
2155                     return Nullopt;
2156                 else
2157                     value = value.value() * 10 + number;
2158                 advance(c, hostBegin);
2159                 if (c.atEnd())
2160                     return Nullopt;
2161                 if (value.value() > 255)
2162                     return Nullopt;
2163             }
2164             if (dotsSeen < 3 && *c != '.')
2165                 return Nullopt;
2166             address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
2167             if (dotsSeen == 1 || dotsSeen == 3)
2168                 piecePointer++;
2169             if (!c.atEnd())
2170                 advance(c, hostBegin);
2171             if (dotsSeen == 3 && !c.atEnd())
2172                 return Nullopt;
2173             dotsSeen++;
2174         }
2175     }
2176     if (compressPointer) {
2177         size_t swaps = piecePointer - compressPointer.value();
2178         piecePointer = 7;
2179         while (swaps)
2180             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2181     } else if (piecePointer != 8)
2182         return Nullopt;
2183
2184     Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2185     if (possibleCompressPointer)
2186         possibleCompressPointer.value()++;
2187     if (UNLIKELY(compressPointer != possibleCompressPointer))
2188         syntaxViolation(hostBegin);
2189     
2190     return address;
2191 }
2192
2193 const size_t defaultInlineBufferSize = 2048;
2194
2195 static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
2196 {
2197     Vector<LChar, defaultInlineBufferSize> output;
2198     output.reserveInitialCapacity(length);
2199     
2200     for (size_t i = 0; i < length; ++i) {
2201         uint8_t byte = input[i];
2202         if (byte != '%')
2203             output.uncheckedAppend(byte);
2204         else if (i < length - 2) {
2205             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2206                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2207                 i += 2;
2208             } else
2209                 output.uncheckedAppend(byte);
2210         } else
2211             output.uncheckedAppend(byte);
2212     }
2213     return output;
2214 }
2215
2216 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2217 {
2218     if (string.is8Bit())
2219         return charactersAreAllASCII(string.characters8(), string.length());
2220     return charactersAreAllASCII(string.characters16(), string.length());
2221 }
2222
2223 static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
2224 {
2225     Vector<LChar, defaultInlineBufferSize> ascii;
2226     if (containsOnlyASCII(domain)) {
2227         size_t length = domain.length();
2228         if (domain.is8Bit()) {
2229             const LChar* characters = domain.characters8();
2230             ascii.reserveInitialCapacity(length);
2231             for (size_t i = 0; i < length; ++i)
2232                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2233         } else {
2234             const UChar* characters = domain.characters16();
2235             ascii.reserveInitialCapacity(length);
2236             for (size_t i = 0; i < length; ++i)
2237                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2238         }
2239         return ascii;
2240     }
2241     
2242     UChar hostnameBuffer[defaultInlineBufferSize];
2243     UErrorCode error = U_ZERO_ERROR;
2244
2245 #if COMPILER(GCC) || COMPILER(CLANG)
2246 #pragma GCC diagnostic push
2247 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2248 #endif
2249     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2250     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2251 #if COMPILER(GCC) || COMPILER(CLANG)
2252 #pragma GCC diagnostic pop
2253 #endif
2254     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2255
2256     if (error == U_ZERO_ERROR) {
2257         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2258             ASSERT(isASCII(hostnameBuffer[i]));
2259             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2260         }
2261         ascii.append(hostnameBuffer, numCharactersConverted);
2262         return ascii;
2263     }
2264
2265     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2266     return Nullopt;
2267 }
2268
2269 static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
2270 {
2271     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2272         if (isInvalidDomainCharacter(asciiDomain[i]))
2273             return true;
2274     }
2275     return false;
2276 }
2277
2278 template<typename CharacterType>
2279 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2280 {
2281     ASSERT(*iterator == ':');
2282     auto colonIterator = iterator;
2283     advance(iterator, colonIterator);
2284     uint32_t port = 0;
2285     if (UNLIKELY(iterator.atEnd())) {
2286         m_url.m_portEnd = currentPosition(colonIterator);
2287         syntaxViolation(colonIterator);
2288         return true;
2289     }
2290     for (; !iterator.atEnd(); ++iterator) {
2291         if (UNLIKELY(isTabOrNewline(*iterator))) {
2292             syntaxViolation(colonIterator);
2293             continue;
2294         }
2295         if (isASCIIDigit(*iterator)) {
2296             port = port * 10 + *iterator - '0';
2297             if (port > std::numeric_limits<uint16_t>::max())
2298                 return false;
2299         } else
2300             return false;
2301     }
2302
2303     if (UNLIKELY(isDefaultPort(parsedDataView(0, m_url.m_schemeEnd), port)))
2304         syntaxViolation(colonIterator);
2305     else {
2306         appendToASCIIBuffer(':');
2307         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2308         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2309     }
2310
2311     m_url.m_portEnd = currentPosition(iterator);
2312     return true;
2313 }
2314
2315 template<typename CharacterType>
2316 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2317 {
2318     if (iterator.atEnd())
2319         return false;
2320     if (*iterator == '[') {
2321         auto ipv6End = iterator;
2322         while (!ipv6End.atEnd() && *ipv6End != ']')
2323             ++ipv6End;
2324         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2325             serializeIPv6(address.value());
2326             if (!ipv6End.atEnd()) {
2327                 advance(ipv6End);
2328                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2329                     m_url.m_hostEnd = currentPosition(ipv6End);
2330                     return parsePort(ipv6End);
2331                 }
2332                 m_url.m_hostEnd = currentPosition(ipv6End);
2333                 m_url.m_portEnd = m_url.m_hostEnd;
2334                 return true;
2335             }
2336             m_url.m_hostEnd = currentPosition(ipv6End);
2337             return true;
2338         }
2339     }
2340
2341     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2342         auto hostIterator = iterator;
2343         for (; !iterator.atEnd(); ++iterator) {
2344             if (isTabOrNewline(*iterator))
2345                 continue;
2346             if (*iterator == ':')
2347                 break;
2348             if (isInvalidDomainCharacter(*iterator))
2349                 return false;
2350         }
2351         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2352             serializeIPv4(address.value());
2353             m_url.m_hostEnd = currentPosition(iterator);
2354             if (iterator.atEnd()) {
2355                 m_url.m_portEnd = currentPosition(iterator);
2356                 return true;
2357             }
2358             return parsePort(iterator);
2359         }
2360         for (; hostIterator != iterator; ++hostIterator) {
2361             if (LIKELY(!isTabOrNewline(*hostIterator))) {
2362                 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2363                     syntaxViolation(hostIterator);
2364                 appendToASCIIBuffer(toASCIILower(*hostIterator));
2365             } else
2366                 syntaxViolation(hostIterator);
2367         }
2368         m_url.m_hostEnd = currentPosition(iterator);
2369         if (!hostIterator.atEnd())
2370             return parsePort(hostIterator);
2371         m_url.m_portEnd = currentPosition(iterator);
2372         return true;
2373     }
2374     
2375     syntaxViolation(iterator);
2376     
2377     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2378     for (; !iterator.atEnd(); ++iterator) {
2379         if (isTabOrNewline(*iterator))
2380             continue;
2381         if (*iterator == ':')
2382             break;
2383         uint8_t buffer[U8_MAX_LENGTH];
2384         int32_t offset = 0;
2385         UBool error = false;
2386         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2387         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2388         // FIXME: Check error.
2389         utf8Encoded.append(buffer, offset);
2390     }
2391     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
2392     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2393     auto asciiDomain = domainToASCII(domain);
2394     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2395         return false;
2396     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2397     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2398
2399     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2400         serializeIPv4(address.value());
2401         m_url.m_hostEnd = currentPosition(iterator);
2402         if (iterator.atEnd()) {
2403             m_url.m_portEnd = currentPosition(iterator);
2404             return true;
2405         }
2406         return parsePort(iterator);
2407     }
2408
2409     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2410     m_url.m_hostEnd = currentPosition(iterator);
2411     if (!iterator.atEnd())
2412         return parsePort(iterator);
2413     m_url.m_portEnd = currentPosition(iterator);
2414     return true;
2415 }
2416
2417 static Optional<String> formURLDecode(StringView input)
2418 {
2419     auto utf8 = input.utf8(StrictConversion);
2420     if (utf8.isNull())
2421         return Nullopt;
2422     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2423     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2424 }
2425
2426 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2427 {
2428     Vector<StringView> sequences = input.split('&');
2429
2430     URLEncodedForm output;
2431     for (auto& bytes : sequences) {
2432         auto valueStart = bytes.find('=');
2433         if (valueStart == notFound) {
2434             if (auto name = formURLDecode(bytes))
2435                 output.append({name.value().replace('+', 0x20), emptyString()});
2436         } else {
2437             auto name = formURLDecode(bytes.substring(0, valueStart));
2438             auto value = formURLDecode(bytes.substring(valueStart + 1));
2439             if (name && value)
2440                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2441         }
2442     }
2443     return output;
2444 }
2445
2446 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2447 {
2448     auto utf8 = input.utf8(StrictConversion);
2449     const char* data = utf8.data();
2450     for (size_t i = 0; i < utf8.length(); ++i) {
2451         const char byte = data[i];
2452         if (byte == 0x20)
2453             output.append(0x2B);
2454         else if (byte == 0x2A
2455             || byte == 0x2D
2456             || byte == 0x2E
2457             || (byte >= 0x30 && byte <= 0x39)
2458             || (byte >= 0x41 && byte <= 0x5A)
2459             || byte == 0x5F
2460             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2461             output.append(byte);
2462         else
2463             percentEncodeByte(byte, output);
2464     }
2465 }
2466     
2467 String URLParser::serialize(const URLEncodedForm& tuples)
2468 {
2469     Vector<LChar> output;
2470     for (auto& tuple : tuples) {
2471         if (!output.isEmpty())
2472             output.append('&');
2473         serializeURLEncodedForm(tuple.first, output);
2474         output.append('=');
2475         serializeURLEncodedForm(tuple.second, output);
2476     }
2477     return String::adopt(WTFMove(output));
2478 }
2479
2480 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2481 {
2482     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2483     // but once we get rid of URL::parse its value should be tested.
2484     LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2485         a.m_isValid,
2486         a.m_protocolIsInHTTPFamily,
2487         a.m_schemeEnd,
2488         a.m_userStart,
2489         a.m_userEnd,
2490         a.m_passwordEnd,
2491         a.m_hostEnd,
2492         a.m_portEnd,
2493         a.m_pathAfterLastSlash,
2494         a.m_pathEnd,
2495         a.m_queryEnd,
2496         a.m_fragmentEnd,
2497         a.m_string.utf8().data(),
2498         b.m_isValid,
2499         b.m_protocolIsInHTTPFamily,
2500         b.m_schemeEnd,
2501         b.m_userStart,
2502         b.m_userEnd,
2503         b.m_passwordEnd,
2504         b.m_hostEnd,
2505         b.m_portEnd,
2506         b.m_pathAfterLastSlash,
2507         b.m_pathEnd,
2508         b.m_queryEnd,
2509         b.m_fragmentEnd,
2510         b.m_string.utf8().data());
2511
2512     return a.m_string == b.m_string
2513         && a.m_isValid == b.m_isValid
2514         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2515         && a.m_schemeEnd == b.m_schemeEnd
2516         && a.m_userStart == b.m_userStart
2517         && a.m_userEnd == b.m_userEnd
2518         && a.m_passwordEnd == b.m_passwordEnd
2519         && a.m_hostEnd == b.m_hostEnd
2520         && a.m_portEnd == b.m_portEnd
2521         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2522         && a.m_pathEnd == b.m_pathEnd
2523         && a.m_queryEnd == b.m_queryEnd
2524         && a.m_fragmentEnd == b.m_fragmentEnd;
2525 }
2526
2527 bool URLParser::internalValuesConsistent(const URL& url)
2528 {
2529     return url.m_schemeEnd <= url.m_userStart
2530         && url.m_userStart <= url.m_userEnd
2531         && url.m_userEnd <= url.m_passwordEnd
2532         && url.m_passwordEnd <= url.m_hostEnd
2533         && url.m_hostEnd <= url.m_portEnd
2534         && url.m_portEnd <= url.m_pathAfterLastSlash
2535         && url.m_pathAfterLastSlash <= url.m_pathEnd
2536         && url.m_pathEnd <= url.m_queryEnd
2537         && url.m_queryEnd <= url.m_fragmentEnd
2538         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2539     // FIXME: Why do we even store m_fragmentEnd?
2540     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2541 }
2542
// File-local switch written by URLParser::setEnabled and read by URLParser::enabled; starts off.
static bool urlParserEnabled { false };
2544
2545 void URLParser::setEnabled(bool enabled)
2546 {
2547     urlParserEnabled = enabled;
2548 }
2549
2550 bool URLParser::enabled()
2551 {
2552     return urlParserEnabled;
2553 }
2554
2555 } // namespace WebCore