5a00d7de204854e07c7ebcd1e4d2a4c42e4e11ff
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include <array>
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
33
34 namespace WebCore {
35
36 template<typename CharacterType>
37 class CodePointIterator {
38 public:
39     ALWAYS_INLINE CodePointIterator() { }
40     ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
41         : m_begin(begin)
42         , m_end(end)
43     {
44     }
45     
46     ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
47         : CodePointIterator(begin.m_begin, end.m_begin)
48     {
49         ASSERT(end.m_begin >= begin.m_begin);
50     }
51     
52     ALWAYS_INLINE UChar32 operator*() const;
53     ALWAYS_INLINE CodePointIterator& operator++();
54
55     ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
56     {
57         return m_begin == other.m_begin
58             && m_end == other.m_end;
59     }
60     ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
61     
62     ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
63     {
64         m_begin = other.m_begin;
65         m_end = other.m_end;
66         return *this;
67     }
68
69     ALWAYS_INLINE bool atEnd() const
70     {
71         ASSERT(m_begin <= m_end);
72         return m_begin >= m_end;
73     }
74     
75     ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
76     {
77         ASSERT(m_begin >= reference);
78         return m_begin - reference;
79     }
80
81     ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
82     {
83         return codeUnitsSince(other.m_begin);
84     }
85     
86 private:
87     const CharacterType* m_begin { nullptr };
88     const CharacterType* m_end { nullptr };
89 };
90
91 template<>
92 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
93 {
94     ASSERT(!atEnd());
95     return *m_begin;
96 }
97
98 template<>
99 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
100 {
101     ASSERT(!atEnd());
102     m_begin++;
103     return *this;
104 }
105
106 template<>
107 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
108 {
109     ASSERT(!atEnd());
110     UChar32 c;
111     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
112     return c;
113 }
114
115 template<>
116 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
117 {
118     ASSERT(!atEnd());
119     unsigned i = 0;
120     size_t length = m_end - m_begin;
121     U16_FWD_1(m_begin, i, length);
122     m_begin += i;
123     return *this;
124 }
125     
126 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
127 {
128     if (U_IS_BMP(codePoint)) {
129         destination.append(static_cast<UChar>(codePoint));
130         return;
131     }
132     destination.reserveCapacity(destination.size() + 2);
133     destination.uncheckedAppend(U16_LEAD(codePoint));
134     destination.uncheckedAppend(U16_TRAIL(codePoint));
135 }
136
// Bit flags stored per byte in characterClassTable. Each flag is tested by one of the
// predicates defined after the table.
enum URLCharacterClass {
    UserInfo = 1 << 0, // tested by isInUserInfoEncodeSet
    Default = 1 << 1, // tested by isInDefaultEncodeSet
    InvalidDomain = 1 << 2, // tested by isInvalidDomainCharacter
    QueryPercent = 1 << 3, // tested by shouldPercentEncodeQueryByte
    SlashQuestionOrHash = 1 << 4, // tested by isSlashQuestionOrHash
    Scheme = 1 << 5, // tested by isValidSchemeCharacter
};
145
// Per-byte classification table: entry N is the bitwise OR of the URLCharacterClass flags
// that apply to byte value N (the trailing comment on each row names that byte).
// Entries for 0x7F and above carry only QueryPercent; the predicates that consult the
// other flags range-check their argument before indexing into this table.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, // '&'
    0, // '''
    0, // '('
    0, // ')'
    0, // '*'
    Scheme, // '+'
    0, // ','
    Scheme, // '-'
    Scheme, // '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
    Scheme, // '0'
    Scheme, // '1'
    Scheme, // '2'
    Scheme, // '3'
    Scheme, // '4'
    Scheme, // '5'
    Scheme, // '6'
    Scheme, // '7'
    Scheme, // '8'
    Scheme, // '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'
    Scheme, // 'A'
    Scheme, // 'B'
    Scheme, // 'C'
    Scheme, // 'D'
    Scheme, // 'E'
    Scheme, // 'F'
    Scheme, // 'G'
    Scheme, // 'H'
    Scheme, // 'I'
    Scheme, // 'J'
    Scheme, // 'K'
    Scheme, // 'L'
    Scheme, // 'M'
    Scheme, // 'N'
    Scheme, // 'O'
    Scheme, // 'P'
    Scheme, // 'Q'
    Scheme, // 'R'
    Scheme, // 'S'
    Scheme, // 'T'
    Scheme, // 'U'
    Scheme, // 'V'
    Scheme, // 'W'
    Scheme, // 'X'
    Scheme, // 'Y'
    Scheme, // 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    Scheme, // 'a'
    Scheme, // 'b'
    Scheme, // 'c'
    Scheme, // 'd'
    Scheme, // 'e'
    Scheme, // 'f'
    Scheme, // 'g'
    Scheme, // 'h'
    Scheme, // 'i'
    Scheme, // 'j'
    Scheme, // 'k'
    Scheme, // 'l'
    Scheme, // 'm'
    Scheme, // 'n'
    Scheme, // 'o'
    Scheme, // 'p'
    Scheme, // 'q'
    Scheme, // 'r'
    Scheme, // 's'
    Scheme, // 't'
    Scheme, // 'u'
    Scheme, // 'v'
    Scheme, // 'w'
    Scheme, // 'x'
    Scheme, // 'y'
    Scheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
404
405 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
406 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
407 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
408 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
409 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
410 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
411 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
412 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
413 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
414 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & Scheme; }
415 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
416
417 template<typename CharacterType>
418 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
419 {
420     ++iterator;
421     while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
422         syntaxViolation(iteratorForSyntaxViolationPosition);
423         ++iterator;
424     }
425 }
426
427 template<typename CharacterType>
428 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
429 {
430     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
431         return false;
432     advance(iterator);
433     if (iterator.atEnd())
434         return false;
435     if (*iterator == ':')
436         return true;
437     if (UNLIKELY(*iterator == '|')) {
438         syntaxViolation(iterator);
439         return true;
440     }
441     return false;
442 }
443
444 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
445 {
446     ASSERT(m_unicodeFragmentBuffer.isEmpty());
447     ASSERT(isASCII(codePoint));
448     if (UNLIKELY(m_didSeeSyntaxViolation))
449         m_asciiBuffer.append(codePoint);
450 }
451
452 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
453 {
454     ASSERT(m_unicodeFragmentBuffer.isEmpty());
455     if (UNLIKELY(m_didSeeSyntaxViolation))
456         m_asciiBuffer.append(characters, length);
457 }
458
459 template<typename CharacterType>
460 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
461 {
462     ASSERT(isWindowsDriveLetter(iterator));
463     appendToASCIIBuffer(*iterator);
464     advance(iterator);
465     ASSERT(!iterator.atEnd());
466     ASSERT(*iterator == ':' || *iterator == '|');
467     appendToASCIIBuffer(':');
468     advance(iterator);
469 }
470
471 template<typename CharacterType>
472 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
473 {
474     if (!isWindowsDriveLetter(iterator))
475         return true;
476     if (iterator.atEnd())
477         return false;
478     advance(iterator);
479     if (iterator.atEnd())
480         return true;
481     advance(iterator);
482     if (iterator.atEnd())
483         return true;
484     return !isSlashQuestionOrHash(*iterator);
485 }
486
487 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
488 {
489     buffer.append('%');
490     buffer.append(upperNibbleToASCIIHexDigit(byte));
491     buffer.append(lowerNibbleToASCIIHexDigit(byte));
492 }
493
494 void URLParser::percentEncodeByte(uint8_t byte)
495 {
496     appendToASCIIBuffer('%');
497     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
498     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
499 }
500
// Percent-encoded UTF-8 bytes (EF BF BD) written in place of a code point that is not a
// Unicode character. The length excludes the terminating null.
const char replacementCharacterUTF8PercentEncoded[] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
503
504 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
505 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
506 {
507     ASSERT(!iterator.atEnd());
508     UChar32 codePoint = *iterator;
509     if (LIKELY(isASCII(codePoint))) {
510         if (UNLIKELY(isInCodeSet(codePoint))) {
511             syntaxViolation(iterator);
512             percentEncodeByte(codePoint);
513         } else
514             appendToASCIIBuffer(codePoint);
515         return;
516     }
517     ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
518     syntaxViolation(iterator);
519     
520     if (!U_IS_UNICODE_CHAR(codePoint)) {
521         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
522         return;
523     }
524     
525     uint8_t buffer[U8_MAX_LENGTH];
526     int32_t offset = 0;
527     U8_APPEND_UNSAFE(buffer, offset, codePoint);
528     for (int32_t i = 0; i < offset; ++i)
529         percentEncodeByte(buffer[i]);
530 }
531
532 template<typename CharacterType>
533 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
534 {
535     ASSERT(!iterator.atEnd());
536     UChar32 codePoint = *iterator;
537     if (LIKELY(isASCII(codePoint))) {
538         if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
539             syntaxViolation(iterator);
540             percentEncodeByte(codePoint);
541         } else
542             appendToASCIIBuffer(codePoint);
543         return;
544     }
545     
546     syntaxViolation(iterator);
547     
548     if (!U_IS_UNICODE_CHAR(codePoint)) {
549         appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
550         return;
551     }
552
553     uint8_t buffer[U8_MAX_LENGTH];
554     int32_t offset = 0;
555     U8_APPEND_UNSAFE(buffer, offset, codePoint);
556     for (int32_t i = 0; i < offset; ++i) {
557         auto byte = buffer[i];
558         if (shouldPercentEncodeQueryByte(byte))
559             percentEncodeByte(byte);
560         else
561             appendToASCIIBuffer(byte);
562     }
563 }
564     
565 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding)
566 {
567     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
568     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
569     const char* data = encoded.data();
570     size_t length = encoded.length();
571     for (size_t i = 0; i < length; ++i) {
572         uint8_t byte = data[i];
573         if (shouldPercentEncodeQueryByte(byte))
574             percentEncodeByte(byte);
575         else
576             appendToASCIIBuffer(byte);
577     }
578 }
579
580 ALWAYS_INLINE static bool isDefaultPort(StringView scheme, uint16_t port)
581 {
582     static const uint16_t ftpPort = 21;
583     static const uint16_t gopherPort = 70;
584     static const uint16_t httpPort = 80;
585     static const uint16_t httpsPort = 443;
586     static const uint16_t wsPort = 80;
587     static const uint16_t wssPort = 443;
588     
589     auto length = scheme.length();
590     if (!length)
591         return false;
592     switch (scheme[0]) {
593     case 'w':
594         switch (length) {
595         case 2:
596             return scheme[1] == 's'
597                 && port == wsPort;
598         case 3:
599             return scheme[1] == 's'
600                 && scheme[2] == 's'
601                 && port == wssPort;
602         default:
603             return false;
604         }
605     case 'h':
606         switch (length) {
607         case 4:
608             return scheme[1] == 't'
609                 && scheme[2] == 't'
610                 && scheme[3] == 'p'
611                 && port == httpPort;
612         case 5:
613             return scheme[1] == 't'
614                 && scheme[2] == 't'
615                 && scheme[3] == 'p'
616                 && scheme[4] == 's'
617                 && port == httpsPort;
618         default:
619             return false;
620         }
621     case 'g':
622         return length == 6
623             && scheme[1] == 'o'
624             && scheme[2] == 'p'
625             && scheme[3] == 'h'
626             && scheme[4] == 'e'
627             && scheme[5] == 'r'
628             && port == gopherPort;
629     case 'f':
630         return length == 3
631             && scheme[1] == 't'
632             && scheme[2] == 'p'
633             && port == ftpPort;
634         return false;
635     default:
636         return false;
637     }
638 }
639
640 ALWAYS_INLINE static bool isSpecialScheme(StringView scheme)
641 {
642     auto length = scheme.length();
643     if (!length)
644         return false;
645     switch (scheme[0]) {
646     case 'f':
647         switch (length) {
648         case 3:
649             return scheme[1] == 't'
650                 && scheme[2] == 'p';
651         case 4:
652             return scheme[1] == 'i'
653                 && scheme[2] == 'l'
654                 && scheme[3] == 'e';
655         default:
656             return false;
657         }
658     case 'g':
659         return length == 6
660             && scheme[1] == 'o'
661             && scheme[2] == 'p'
662             && scheme[3] == 'h'
663             && scheme[4] == 'e'
664             && scheme[5] == 'r';
665     case 'h':
666         switch (length) {
667         case 4:
668             return scheme[1] == 't'
669                 && scheme[2] == 't'
670                 && scheme[3] == 'p';
671         case 5:
672             return scheme[1] == 't'
673                 && scheme[2] == 't'
674                 && scheme[3] == 'p'
675                 && scheme[4] == 's';
676         default:
677             return false;
678         }
679     case 'w':
680         switch (length) {
681         case 2:
682             return scheme[1] == 's';
683         case 3:
684             return scheme[1] == 's'
685                 && scheme[2] == 's';
686         default:
687             return false;
688         }
689     default:
690         return false;
691     }
692 }
693
// Names the component boundaries of a URL. urlLengthUntilPart() maps each value to the
// matching offset member of a URL (e.g. SchemeEnd -> m_schemeEnd), and
// copyURLPartsUntil() copies a base URL up to the named boundary.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
    FragmentEnd,
};
706
707 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
708 {
709     switch (part) {
710     case URLPart::FragmentEnd:
711         return url.m_fragmentEnd;
712     case URLPart::QueryEnd:
713         return url.m_queryEnd;
714     case URLPart::PathEnd:
715         return url.m_pathEnd;
716     case URLPart::PathAfterLastSlash:
717         return url.m_pathAfterLastSlash;
718     case URLPart::PortEnd:
719         return url.m_portEnd;
720     case URLPart::HostEnd:
721         return url.m_hostEnd;
722     case URLPart::PasswordEnd:
723         return url.m_passwordEnd;
724     case URLPart::UserEnd:
725         return url.m_userEnd;
726     case URLPart::UserStart:
727         return url.m_userStart;
728     case URLPart::SchemeEnd:
729         return url.m_schemeEnd;
730     }
731     ASSERT_NOT_REACHED();
732     return 0;
733 }
734
735 void URLParser::copyASCIIStringUntil(const String& string, size_t lengthIf8Bit, size_t lengthIf16Bit)
736 {
737     if (string.isNull()) {
738         ASSERT(!lengthIf8Bit);
739         ASSERT(!lengthIf16Bit);
740         return;
741     }
742     ASSERT(m_asciiBuffer.isEmpty());
743     if (string.is8Bit()) {
744         RELEASE_ASSERT(lengthIf8Bit <= string.length());
745         appendToASCIIBuffer(string.characters8(), lengthIf8Bit);
746     } else {
747         RELEASE_ASSERT(lengthIf16Bit <= string.length());
748         const UChar* characters = string.characters16();
749         for (size_t i = 0; i < lengthIf16Bit; ++i) {
750             UChar c = characters[i];
751             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
752             appendToASCIIBuffer(c);
753         }
754     }
755 }
756
757 template<typename CharacterType>
758 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator)
759 {
760     syntaxViolation(iterator);
761
762     m_asciiBuffer.clear();
763     m_unicodeFragmentBuffer.clear();
764     if (part == URLPart::FragmentEnd) {
765         copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, URLPart::FragmentEnd), urlLengthUntilPart(base, URLPart::QueryEnd));
766         if (!base.m_string.is8Bit()) {
767             const String& fragment = base.m_string;
768             bool seenUnicode = false;
769             for (size_t i = base.m_queryEnd; i < base.m_fragmentEnd; ++i) {
770                 if (!seenUnicode && !isASCII(fragment[i]))
771                     seenUnicode = true;
772                 if (seenUnicode)
773                     m_unicodeFragmentBuffer.uncheckedAppend(fragment[i]);
774                 else
775                     m_asciiBuffer.uncheckedAppend(fragment[i]);
776             }
777         }
778     } else {
779         size_t length = urlLengthUntilPart(base, part);
780         copyASCIIStringUntil(base.m_string, length, length);
781     }
782     switch (part) {
783     case URLPart::FragmentEnd:
784         m_url.m_fragmentEnd = base.m_fragmentEnd;
785         FALLTHROUGH;
786     case URLPart::QueryEnd:
787         m_url.m_queryEnd = base.m_queryEnd;
788         FALLTHROUGH;
789     case URLPart::PathEnd:
790         m_url.m_pathEnd = base.m_pathEnd;
791         FALLTHROUGH;
792     case URLPart::PathAfterLastSlash:
793         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
794         FALLTHROUGH;
795     case URLPart::PortEnd:
796         m_url.m_portEnd = base.m_portEnd;
797         FALLTHROUGH;
798     case URLPart::HostEnd:
799         m_url.m_hostEnd = base.m_hostEnd;
800         FALLTHROUGH;
801     case URLPart::PasswordEnd:
802         m_url.m_passwordEnd = base.m_passwordEnd;
803         FALLTHROUGH;
804     case URLPart::UserEnd:
805         m_url.m_userEnd = base.m_userEnd;
806         FALLTHROUGH;
807     case URLPart::UserStart:
808         m_url.m_userStart = base.m_userStart;
809         FALLTHROUGH;
810     case URLPart::SchemeEnd:
811         m_url.m_isValid = base.m_isValid;
812         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
813         m_url.m_schemeEnd = base.m_schemeEnd;
814     }
815     m_urlIsSpecial = isSpecialScheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd));
816 }
817
// The two hex digits of a percent-encoded '.' ("%2e"). The second digit is compared
// case-insensitively by the callers. The pointer itself is const as well, since it is
// only ever read.
static const char* const dotASCIICode = "2e";
819
820 template<typename CharacterType>
821 ALWAYS_INLINE static bool isPercentEncodedDot(CodePointIterator<CharacterType> c)
822 {
823     if (c.atEnd())
824         return false;
825     if (*c != '%')
826         return false;
827     ++c;
828     if (c.atEnd())
829         return false;
830     if (*c != dotASCIICode[0])
831         return false;
832     ++c;
833     if (c.atEnd())
834         return false;
835     return toASCIILower(*c) == dotASCIICode[1];
836 }
837
838 template<typename CharacterType>
839 ALWAYS_INLINE static bool isSingleDotPathSegment(CodePointIterator<CharacterType> c)
840 {
841     if (c.atEnd())
842         return false;
843     if (*c == '.') {
844         ++c;
845         return c.atEnd() || isSlashQuestionOrHash(*c);
846     }
847     if (*c != '%')
848         return false;
849     ++c;
850     if (c.atEnd() || *c != dotASCIICode[0])
851         return false;
852     ++c;
853     if (c.atEnd())
854         return false;
855     if (toASCIILower(*c) == dotASCIICode[1]) {
856         ++c;
857         return c.atEnd() || isSlashQuestionOrHash(*c);
858     }
859     return false;
860 }
861
862 template<typename CharacterType>
863 ALWAYS_INLINE static bool isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
864 {
865     if (c.atEnd())
866         return false;
867     if (*c == '.') {
868         ++c;
869         return isSingleDotPathSegment(c);
870     }
871     if (*c != '%')
872         return false;
873     ++c;
874     if (c.atEnd() || *c != dotASCIICode[0])
875         return false;
876     ++c;
877     if (c.atEnd())
878         return false;
879     if (toASCIILower(*c) == dotASCIICode[1]) {
880         ++c;
881         return isSingleDotPathSegment(c);
882     }
883     return false;
884 }
885
886 template<typename CharacterType>
887 static void consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
888 {
889     ASSERT(isSingleDotPathSegment(c));
890     if (*c == '.') {
891         ++c;
892         if (!c.atEnd()) {
893             if (*c == '/' || *c == '\\')
894                 ++c;
895             else
896                 ASSERT(*c == '?' || *c == '#');
897         }
898     } else {
899         ASSERT(*c == '%');
900         ++c;
901         ASSERT(*c == dotASCIICode[0]);
902         ++c;
903         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
904         ++c;
905         if (!c.atEnd()) {
906             if (*c == '/' || *c == '\\')
907                 ++c;
908             else
909                 ASSERT(*c == '?' || *c == '#');
910         }
911     }
912 }
913
914 template<typename CharacterType>
915 static void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
916 {
917     ASSERT(isDoubleDotPathSegment(c));
918     if (*c == '.')
919         ++c;
920     else {
921         ASSERT(*c == '%');
922         ++c;
923         ASSERT(*c == dotASCIICode[0]);
924         ++c;
925         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
926         ++c;
927     }
928     consumeSingleDotPathSegment(c);
929 }
930
931 void URLParser::popPath()
932 {
933     ASSERT(m_didSeeSyntaxViolation);
934     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
935         m_url.m_pathAfterLastSlash--;
936         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
937             m_url.m_pathAfterLastSlash--;
938         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
939             m_url.m_pathAfterLastSlash--;
940         m_url.m_pathAfterLastSlash++;
941     }
942     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
943 }
944
945 template<typename CharacterType>
946 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
947 {
948     if (m_didSeeSyntaxViolation)
949         return;
950     m_didSeeSyntaxViolation = true;
951     
952     ASSERT(m_asciiBuffer.isEmpty());
953     ASSERT(m_unicodeFragmentBuffer.isEmpty());
954     ASSERT_WITH_MESSAGE(!m_url.m_queryEnd, "syntaxViolation should not be used in the fragment, which might contain non-ASCII code points when serialized");
955     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
956     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
957     m_asciiBuffer.reserveCapacity(m_inputString.length());
958     for (size_t i = 0; i < codeUnitsToCopy; ++i) {
959         ASSERT(isASCII(m_inputString[i]));
960         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
961     }
962 }
963
964 template<typename CharacterType>
965 void URLParser::fragmentSyntaxViolation(const CodePointIterator<CharacterType>& iterator)
966 {
967     if (m_didSeeSyntaxViolation)
968         return;
969     m_didSeeSyntaxViolation = true;
970     m_didSeeUnicodeFragmentCodePoint = true;
971
972     ASSERT(m_asciiBuffer.isEmpty());
973     ASSERT(m_unicodeFragmentBuffer.isEmpty());
974     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
975     size_t asciiCodeUnitsToCopy = m_url.m_queryEnd;
976     size_t unicodeCodeUnitsToCopy = codeUnitsToCopy - asciiCodeUnitsToCopy;
977     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
978     m_asciiBuffer.reserveCapacity(asciiCodeUnitsToCopy);
979     for (size_t i = 0; i < asciiCodeUnitsToCopy; ++i) {
980         ASSERT(isASCII(m_inputString[i]));
981         m_asciiBuffer.uncheckedAppend(m_inputString[i]);
982     }
983     m_unicodeFragmentBuffer.reserveCapacity(m_inputString.length() - asciiCodeUnitsToCopy);
984     for (size_t i = asciiCodeUnitsToCopy; i < asciiCodeUnitsToCopy + unicodeCodeUnitsToCopy; ++i)
985         m_unicodeFragmentBuffer.uncheckedAppend(m_inputString[i]);
986 }
987
// Records a failed parse: invalidates m_url, then stores the original input
// string as the URL's string.
void URLParser::failure()
{
    m_url.invalidate();
    // NOTE(review): presumably invalidate() resets m_string, which is why the
    // input is assigned afterwards — confirm against URL::invalidate().
    m_url.m_string = m_inputString;
}
993
994 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
995 {
996     if (UNLIKELY(m_didSeeSyntaxViolation)) {
997         ASSERT(start + length <= m_asciiBuffer.size());
998         return StringView(m_asciiBuffer.data() + start, length);
999     }
1000     ASSERT(start + length <= m_inputString.length());
1001     return StringView(m_inputString).substring(start, length);
1002 }
1003
1004 template<typename CharacterType>
1005 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1006 {
1007     if (UNLIKELY(m_didSeeSyntaxViolation)) {
1008         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1009         return m_asciiBuffer.size();
1010     }
1011     
1012     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1013 }
1014
1015 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1016     : m_inputString(input)
1017 {
1018     if (input.isNull()) {
1019         if (base.isValid() && !base.m_cannotBeABaseURL)
1020             m_url = base;
1021         return;
1022     }
1023
1024     if (input.is8Bit()) {
1025         m_inputBegin = input.characters8();
1026         parse(input.characters8(), input.length(), base, encoding);
1027     } else {
1028         m_inputBegin = input.characters16();
1029         parse(input.characters16(), input.length(), base, encoding);
1030     }
1031     ASSERT(!m_url.m_isValid
1032         || m_didSeeSyntaxViolation == (m_url.string() != input)
1033         || (input.isEmpty() && m_url.m_string == base.m_string));
1034 }
1035
1036 template<typename CharacterType>
1037 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1038 {
1039     LOG(URLParser, "Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1040     m_url = { };
1041     ASSERT(m_asciiBuffer.isEmpty());
1042     ASSERT(m_unicodeFragmentBuffer.isEmpty());
1043     
1044     bool isUTF8Encoding = encoding == UTF8Encoding();
1045     Vector<UChar> queryBuffer;
1046
1047     unsigned endIndex = length;
1048     while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1049         syntaxViolation(CodePointIterator<CharacterType>(input, input));
1050         endIndex--;
1051     }
1052     CodePointIterator<CharacterType> c(input, input + endIndex);
1053     CodePointIterator<CharacterType> authorityOrHostBegin;
1054     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1055         syntaxViolation(c);
1056         ++c;
1057     }
1058     auto beginAfterControlAndSpace = c;
1059
1060     enum class State : uint8_t {
1061         SchemeStart,
1062         Scheme,
1063         NoScheme,
1064         SpecialRelativeOrAuthority,
1065         PathOrAuthority,
1066         Relative,
1067         RelativeSlash,
1068         SpecialAuthoritySlashes,
1069         SpecialAuthorityIgnoreSlashes,
1070         AuthorityOrHost,
1071         Host,
1072         File,
1073         FileSlash,
1074         FileHost,
1075         PathStart,
1076         Path,
1077         CannotBeABaseURLPath,
1078         Query,
1079         Fragment,
1080     };
1081
1082 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1083 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
1084
1085     State state = State::SchemeStart;
1086     while (!c.atEnd()) {
1087         if (UNLIKELY(isTabOrNewline(*c))) {
1088             syntaxViolation(c);
1089             ++c;
1090             continue;
1091         }
1092
1093         switch (state) {
1094         case State::SchemeStart:
1095             LOG_STATE("SchemeStart");
1096             if (isASCIIAlpha(*c)) {
1097                 if (UNLIKELY(isASCIIUpper(*c)))
1098                     syntaxViolation(c);
1099                 appendToASCIIBuffer(toASCIILower(*c));
1100                 advance(c);
1101                 if (c.atEnd()) {
1102                     m_asciiBuffer.clear();
1103                     state = State::NoScheme;
1104                     c = beginAfterControlAndSpace;
1105                 }
1106                 state = State::Scheme;
1107             } else
1108                 state = State::NoScheme;
1109             break;
1110         case State::Scheme:
1111             LOG_STATE("Scheme");
1112             if (isValidSchemeCharacter(*c)) {
1113                 if (UNLIKELY(isASCIIUpper(*c)))
1114                     syntaxViolation(c);
1115                 appendToASCIIBuffer(toASCIILower(*c));
1116             } else if (*c == ':') {
1117                 m_url.m_schemeEnd = currentPosition(c);
1118                 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1119                 m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
1120                 appendToASCIIBuffer(':');
1121                 if (urlScheme == "file") {
1122                     m_urlIsSpecial = true;
1123                     state = State::File;
1124                     ++c;
1125                     break;
1126                 }
1127                 if (isSpecialScheme(urlScheme)) {
1128                     m_urlIsSpecial = true;
1129                     if (base.protocolIs(urlScheme))
1130                         state = State::SpecialRelativeOrAuthority;
1131                     else
1132                         state = State::SpecialAuthoritySlashes;
1133                     ++c;
1134                 } else {
1135                     auto maybeSlash = c;
1136                     advance(maybeSlash);
1137                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1138                         appendToASCIIBuffer('/');
1139                         c = maybeSlash;
1140                         state = State::PathOrAuthority;
1141                         ASSERT(*c == '/');
1142                         ++c;
1143                         m_url.m_userStart = currentPosition(c);
1144                     } else {
1145                         ++c;
1146                         m_url.m_userStart = currentPosition(c);
1147                         m_url.m_userEnd = m_url.m_userStart;
1148                         m_url.m_passwordEnd = m_url.m_userStart;
1149                         m_url.m_hostEnd = m_url.m_userStart;
1150                         m_url.m_portEnd = m_url.m_userStart;
1151                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1152                         m_url.m_cannotBeABaseURL = true;
1153                         state = State::CannotBeABaseURLPath;
1154                     }
1155                 }
1156                 break;
1157             } else {
1158                 m_asciiBuffer.clear();
1159                 state = State::NoScheme;
1160                 c = beginAfterControlAndSpace;
1161                 break;
1162             }
1163             advance(c);
1164             if (c.atEnd()) {
1165                 m_asciiBuffer.clear();
1166                 state = State::NoScheme;
1167                 c = beginAfterControlAndSpace;
1168             }
1169             break;
1170         case State::NoScheme:
1171             LOG_STATE("NoScheme");
1172             if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1173                 failure();
1174                 return;
1175             }
1176             if (base.m_cannotBeABaseURL && *c == '#') {
1177                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
1178                 state = State::Fragment;
1179                 appendToASCIIBuffer('#');
1180                 ++c;
1181                 break;
1182             }
1183             if (!base.protocolIs("file")) {
1184                 state = State::Relative;
1185                 break;
1186             }
1187             copyURLPartsUntil(base, URLPart::SchemeEnd, c);
1188             appendToASCIIBuffer(':');
1189             state = State::File;
1190             break;
1191         case State::SpecialRelativeOrAuthority:
1192             LOG_STATE("SpecialRelativeOrAuthority");
1193             if (*c == '/') {
1194                 appendToASCIIBuffer('/');
1195                 advance(c);
1196                 if (c.atEnd()) {
1197                     failure();
1198                     return;
1199                 }
1200                 if (*c == '/') {
1201                     appendToASCIIBuffer('/');
1202                     state = State::SpecialAuthorityIgnoreSlashes;
1203                     ++c;
1204                 } else
1205                     state = State::RelativeSlash;
1206             } else
1207                 state = State::Relative;
1208             break;
1209         case State::PathOrAuthority:
1210             LOG_STATE("PathOrAuthority");
1211             if (*c == '/') {
1212                 appendToASCIIBuffer('/');
1213                 state = State::AuthorityOrHost;
1214                 ++c;
1215                 m_url.m_userStart = currentPosition(c);
1216                 authorityOrHostBegin = c;
1217             } else {
1218                 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1219                 m_url.m_userStart = currentPosition(c) - 1;
1220                 m_url.m_userEnd = m_url.m_userStart;
1221                 m_url.m_passwordEnd = m_url.m_userStart;
1222                 m_url.m_hostEnd = m_url.m_userStart;
1223                 m_url.m_portEnd = m_url.m_userStart;
1224                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1225                 state = State::Path;
1226             }
1227             break;
1228         case State::Relative:
1229             LOG_STATE("Relative");
1230             switch (*c) {
1231             case '/':
1232             case '\\':
1233                 state = State::RelativeSlash;
1234                 ++c;
1235                 break;
1236             case '?':
1237                 copyURLPartsUntil(base, URLPart::PathEnd, c);
1238                 appendToASCIIBuffer('?');
1239                 state = State::Query;
1240                 ++c;
1241                 break;
1242             case '#':
1243                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
1244                 appendToASCIIBuffer('#');
1245                 state = State::Fragment;
1246                 ++c;
1247                 break;
1248             default:
1249                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c);
1250                 state = State::Path;
1251                 break;
1252             }
1253             break;
1254         case State::RelativeSlash:
1255             LOG_STATE("RelativeSlash");
1256             if (*c == '/' || *c == '\\') {
1257                 ++c;
1258                 copyURLPartsUntil(base, URLPart::SchemeEnd, c);
1259                 appendToASCIIBuffer("://", 3);
1260                 state = State::SpecialAuthorityIgnoreSlashes;
1261             } else {
1262                 copyURLPartsUntil(base, URLPart::PortEnd, c);
1263                 appendToASCIIBuffer('/');
1264                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1265                 state = State::Path;
1266             }
1267             break;
1268         case State::SpecialAuthoritySlashes:
1269             LOG_STATE("SpecialAuthoritySlashes");
1270             if (LIKELY(*c == '/' || *c == '\\')) {
1271                 if (UNLIKELY(*c == '\\'))
1272                     syntaxViolation(c);
1273                 appendToASCIIBuffer('/');
1274                 advance(c);
1275                 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1276                     if (UNLIKELY(*c == '\\'))
1277                         syntaxViolation(c);
1278                     ++c;
1279                     appendToASCIIBuffer('/');
1280                 } else {
1281                     syntaxViolation(c);
1282                     appendToASCIIBuffer('/');
1283                 }
1284             } else {
1285                 syntaxViolation(c);
1286                 appendToASCIIBuffer("//", 2);
1287             }
1288             state = State::SpecialAuthorityIgnoreSlashes;
1289             break;
1290         case State::SpecialAuthorityIgnoreSlashes:
1291             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1292             if (*c == '/' || *c == '\\') {
1293                 syntaxViolation(c);
1294                 ++c;
1295             } else {
1296                 m_url.m_userStart = currentPosition(c);
1297                 state = State::AuthorityOrHost;
1298                 authorityOrHostBegin = c;
1299             }
1300             break;
1301         case State::AuthorityOrHost:
1302             do {
1303                 LOG_STATE("AuthorityOrHost");
1304                 if (*c == '@') {
1305                     auto lastAt = c;
1306                     auto findLastAt = c;
1307                     while (!findLastAt.atEnd()) {
1308                         if (*findLastAt == '@')
1309                             lastAt = findLastAt;
1310                         bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1311                         if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1312                             break;
1313                         ++findLastAt;
1314                     }
1315                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1316                     c = lastAt;
1317                     advance(c);
1318                     authorityOrHostBegin = c;
1319                     state = State::Host;
1320                     m_hostHasPercentOrNonASCII = false;
1321                     break;
1322                 }
1323                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1324                 if (isSlash || *c == '?' || *c == '#') {
1325                     m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1326                     m_url.m_passwordEnd = m_url.m_userEnd;
1327                     if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1328                         failure();
1329                         return;
1330                     }
1331                     if (UNLIKELY(!isSlash)) {
1332                         syntaxViolation(c);
1333                         appendToASCIIBuffer('/');
1334                         m_url.m_pathAfterLastSlash = currentPosition(c);
1335                     }
1336                     state = State::Path;
1337                     break;
1338                 }
1339                 if (isPercentOrNonASCII(*c))
1340                     m_hostHasPercentOrNonASCII = true;
1341                 ++c;
1342             } while (!c.atEnd());
1343             break;
1344         case State::Host:
1345             LOG_STATE("Host");
1346             if (*c == '/' || *c == '?' || *c == '#') {
1347                 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1348                     failure();
1349                     return;
1350                 }
1351                 if (*c == '?' || *c == '#') {
1352                     syntaxViolation(c);
1353                     appendToASCIIBuffer('/');
1354                     m_url.m_pathAfterLastSlash = currentPosition(c);
1355                 }
1356                 state = State::Path;
1357                 break;
1358             }
1359             if (isPercentOrNonASCII(*c))
1360                 m_hostHasPercentOrNonASCII = true;
1361             ++c;
1362             break;
1363         case State::File:
1364             LOG_STATE("File");
1365             switch (*c) {
1366             case '\\':
1367                 syntaxViolation(c);
1368                 FALLTHROUGH;
1369             case '/':
1370                 appendToASCIIBuffer('/');
1371                 state = State::FileSlash;
1372                 ++c;
1373                 break;
1374             case '?':
1375                 syntaxViolation(c);
1376                 if (base.isValid() && base.protocolIs("file"))
1377                     copyURLPartsUntil(base, URLPart::PathEnd, c);
1378                 appendToASCIIBuffer("///?", 4);
1379                 m_url.m_userStart = currentPosition(c) - 2;
1380                 m_url.m_userEnd = m_url.m_userStart;
1381                 m_url.m_passwordEnd = m_url.m_userStart;
1382                 m_url.m_hostEnd = m_url.m_userStart;
1383                 m_url.m_portEnd = m_url.m_userStart;
1384                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1385                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1386                 state = State::Query;
1387                 ++c;
1388                 break;
1389             case '#':
1390                 syntaxViolation(c);
1391                 if (base.isValid() && base.protocolIs("file"))
1392                     copyURLPartsUntil(base, URLPart::QueryEnd, c);
1393                 appendToASCIIBuffer("///#", 4);
1394                 m_url.m_userStart = currentPosition(c) - 2;
1395                 m_url.m_userEnd = m_url.m_userStart;
1396                 m_url.m_passwordEnd = m_url.m_userStart;
1397                 m_url.m_hostEnd = m_url.m_userStart;
1398                 m_url.m_portEnd = m_url.m_userStart;
1399                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1400                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1401                 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1402                 state = State::Fragment;
1403                 ++c;
1404                 break;
1405             default:
1406                 syntaxViolation(c);
1407                 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1408                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c);
1409                 else {
1410                     appendToASCIIBuffer("///", 3);
1411                     m_url.m_userStart = currentPosition(c) - 1;
1412                     m_url.m_userEnd = m_url.m_userStart;
1413                     m_url.m_passwordEnd = m_url.m_userStart;
1414                     m_url.m_hostEnd = m_url.m_userStart;
1415                     m_url.m_portEnd = m_url.m_userStart;
1416                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1417                     if (isWindowsDriveLetter(c))
1418                         appendWindowsDriveLetter(c);
1419                 }
1420                 state = State::Path;
1421                 break;
1422             }
1423             break;
1424         case State::FileSlash:
1425             LOG_STATE("FileSlash");
1426             if (LIKELY(*c == '/' || *c == '\\')) {
1427                 if (UNLIKELY(*c == '\\'))
1428                     syntaxViolation(c);
1429                 ++c;
1430                 appendToASCIIBuffer('/');
1431                 m_url.m_userStart = currentPosition(c);
1432                 m_url.m_userEnd = m_url.m_userStart;
1433                 m_url.m_passwordEnd = m_url.m_userStart;
1434                 m_url.m_hostEnd = m_url.m_userStart;
1435                 m_url.m_portEnd = m_url.m_userStart;
1436                 authorityOrHostBegin = c;
1437                 state = State::FileHost;
1438                 break;
1439             }
1440             if (base.isValid() && base.protocolIs("file")) {
1441                 // FIXME: This String copy is unnecessary.
1442                 String basePath = base.path();
1443                 if (basePath.length() >= 2) {
1444                     bool windowsQuirk = basePath.is8Bit()
1445                         ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1446                         : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1447                     if (windowsQuirk) {
1448                         appendToASCIIBuffer(basePath[0]);
1449                         appendToASCIIBuffer(basePath[1]);
1450                     }
1451                 }
1452             }
1453             syntaxViolation(c);
1454             appendToASCIIBuffer("//", 2);
1455             m_url.m_userStart = currentPosition(c) - 1;
1456             m_url.m_userEnd = m_url.m_userStart;
1457             m_url.m_passwordEnd = m_url.m_userStart;
1458             m_url.m_hostEnd = m_url.m_userStart;
1459             m_url.m_portEnd = m_url.m_userStart;
1460             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1461             if (isWindowsDriveLetter(c))
1462                 appendWindowsDriveLetter(c);
1463             state = State::Path;
1464             break;
1465         case State::FileHost:
1466             LOG_STATE("FileHost");
1467             if (isSlashQuestionOrHash(*c)) {
1468                 bool windowsQuirk = c.codeUnitsSince(authorityOrHostBegin) == 2 && isWindowsDriveLetter(authorityOrHostBegin);
1469                 if (windowsQuirk) {
1470                     syntaxViolation(authorityOrHostBegin);
1471                     appendToASCIIBuffer('/');
1472                     appendWindowsDriveLetter(authorityOrHostBegin);
1473                 }
1474                 if (windowsQuirk || authorityOrHostBegin == c) {
1475                     ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1476                     if (UNLIKELY(*c == '?')) {
1477                         syntaxViolation(c);
1478                         appendToASCIIBuffer("/?", 2);
1479                         ++c;
1480                         m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1481                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1482                         state = State::Query;
1483                         break;
1484                     }
1485                     if (UNLIKELY(*c == '#')) {
1486                         syntaxViolation(c);
1487                         appendToASCIIBuffer("/#", 2);
1488                         ++c;
1489                         m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1490                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1491                         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1492                         state = State::Fragment;
1493                         break;
1494                     }
1495                     state = State::Path;
1496                     break;
1497                 }
1498                 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1499                     failure();
1500                     return;
1501                 }
1502                 if (UNLIKELY(equalLettersIgnoringASCIICase(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd), "localhost"))) {
1503                     syntaxViolation(c);
1504                     m_asciiBuffer.shrink(m_url.m_passwordEnd);
1505                     m_url.m_hostEnd = currentPosition(c);
1506                     m_url.m_portEnd = m_url.m_hostEnd;
1507                 }
1508                 
1509                 state = State::PathStart;
1510                 break;
1511             }
1512             if (isPercentOrNonASCII(*c))
1513                 m_hostHasPercentOrNonASCII = true;
1514             ++c;
1515             break;
1516         case State::PathStart:
1517             LOG_STATE("PathStart");
1518             if (*c != '/' && *c != '\\')
1519                 ++c;
1520             state = State::Path;
1521             break;
1522         case State::Path:
1523             LOG_STATE("Path");
1524             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1525                 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1526                     syntaxViolation(c);
1527                 appendToASCIIBuffer('/');
1528                 ++c;
1529                 m_url.m_pathAfterLastSlash = currentPosition(c);
1530                 break;
1531             }
1532             if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1533                 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1534                     syntaxViolation(c);
1535                     consumeDoubleDotPathSegment(c);
1536                     popPath();
1537                     break;
1538                 }
1539                 if (UNLIKELY(isSingleDotPathSegment(c))) {
1540                     syntaxViolation(c);
1541                     consumeSingleDotPathSegment(c);
1542                     break;
1543                 }
1544             }
1545             if (*c == '?') {
1546                 m_url.m_pathEnd = currentPosition(c);
1547                 state = State::Query;
1548                 break;
1549             }
1550             if (*c == '#') {
1551                 m_url.m_pathEnd = currentPosition(c);
1552                 m_url.m_queryEnd = m_url.m_pathEnd;
1553                 state = State::Fragment;
1554                 break;
1555             }
1556             if (isPercentEncodedDot(c)) {
1557                 if (UNLIKELY(*c != '.'))
1558                     syntaxViolation(c);
1559                 appendToASCIIBuffer('.');
1560                 ASSERT(*c == '%');
1561                 ++c;
1562                 ASSERT(*c == dotASCIICode[0]);
1563                 ++c;
1564                 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1565                 ++c;
1566                 break;
1567             }
1568             utf8PercentEncode<isInDefaultEncodeSet>(c);
1569             ++c;
1570             break;
1571         case State::CannotBeABaseURLPath:
1572             LOG_STATE("CannotBeABaseURLPath");
1573             if (*c == '?') {
1574                 m_url.m_pathEnd = currentPosition(c);
1575                 state = State::Query;
1576             } else if (*c == '#') {
1577                 m_url.m_pathEnd = currentPosition(c);
1578                 m_url.m_queryEnd = m_url.m_pathEnd;
1579                 state = State::Fragment;
1580             } else if (*c == '/') {
1581                 appendToASCIIBuffer('/');
1582                 ++c;
1583                 m_url.m_pathAfterLastSlash = currentPosition(c);
1584             } else {
1585                 utf8PercentEncode<isInSimpleEncodeSet>(c);
1586                 ++c;
1587             }
1588             break;
1589         case State::Query:
1590             LOG_STATE("Query");
1591             if (*c == '#') {
1592                 if (!isUTF8Encoding)
1593                     encodeQuery(queryBuffer, encoding);
1594                 m_url.m_queryEnd = currentPosition(c);
1595                 state = State::Fragment;
1596                 break;
1597             }
1598             if (isUTF8Encoding)
1599                 utf8QueryEncode(c);
1600             else
1601                 appendCodePoint(queryBuffer, *c);
1602             ++c;
1603             break;
1604         case State::Fragment:
1605             do {
1606                 LOG(URLParser, "State Fragment");
1607                 if (!m_didSeeUnicodeFragmentCodePoint && isASCII(*c))
1608                     appendToASCIIBuffer(*c);
1609                 else {
1610                     m_didSeeUnicodeFragmentCodePoint = true;
1611                     if (UNLIKELY(m_didSeeSyntaxViolation))
1612                         appendCodePoint(m_unicodeFragmentBuffer, *c);
1613                     else {
1614                         ASSERT(m_asciiBuffer.isEmpty());
1615                         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1616                     }
1617                 }
1618                 ++c;
1619                 while (UNLIKELY(!c.atEnd() && isTabOrNewline(*c))) {
1620                     fragmentSyntaxViolation(c);
1621                     ++c;
1622                 }
1623             } while (!c.atEnd());
1624             break;
1625         }
1626     }
1627
1628     switch (state) {
1629     case State::SchemeStart:
1630         LOG_FINAL_STATE("SchemeStart");
1631         if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1632             m_url = base;
1633             return;
1634         }
1635         failure();
1636         return;
1637     case State::Scheme:
1638         LOG_FINAL_STATE("Scheme");
1639         failure();
1640         return;
1641     case State::NoScheme:
1642         LOG_FINAL_STATE("NoScheme");
1643         RELEASE_ASSERT_NOT_REACHED();
1644     case State::SpecialRelativeOrAuthority:
1645         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1646         copyURLPartsUntil(base, URLPart::QueryEnd, c);
1647         m_url.m_fragmentEnd = m_url.m_queryEnd;
1648         break;
1649     case State::PathOrAuthority:
1650         LOG_FINAL_STATE("PathOrAuthority");
1651         ASSERT(m_url.m_userStart);
1652         ASSERT(m_url.m_userStart == currentPosition(c));
1653         ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1654         m_url.m_userStart--;
1655         m_url.m_userEnd = m_url.m_userStart;
1656         m_url.m_passwordEnd = m_url.m_userStart;
1657         m_url.m_hostEnd = m_url.m_userStart;
1658         m_url.m_portEnd = m_url.m_userStart;
1659         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1660         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1661         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1662         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1663         break;
1664     case State::Relative:
1665         LOG_FINAL_STATE("Relative");
1666         copyURLPartsUntil(base, URLPart::FragmentEnd, c);
1667         break;
1668     case State::RelativeSlash:
1669         LOG_FINAL_STATE("RelativeSlash");
1670         copyURLPartsUntil(base, URLPart::PortEnd, c);
1671         appendToASCIIBuffer('/');
1672         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1673         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1674         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1675         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1676         break;
1677     case State::SpecialAuthoritySlashes:
1678         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1679         m_url.m_userStart = currentPosition(c);
1680         m_url.m_userEnd = m_url.m_userStart;
1681         m_url.m_passwordEnd = m_url.m_userStart;
1682         m_url.m_hostEnd = m_url.m_userStart;
1683         m_url.m_portEnd = m_url.m_userStart;
1684         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1685         m_url.m_pathEnd = m_url.m_userStart;
1686         m_url.m_queryEnd = m_url.m_userStart;
1687         m_url.m_fragmentEnd = m_url.m_userStart;
1688         break;
1689     case State::SpecialAuthorityIgnoreSlashes:
1690         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1691         failure();
1692         return;
1693         break;
1694     case State::AuthorityOrHost:
1695         LOG_FINAL_STATE("AuthorityOrHost");
1696         m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1697         m_url.m_passwordEnd = m_url.m_userEnd;
1698         if (authorityOrHostBegin.atEnd()) {
1699             m_url.m_hostEnd = m_url.m_userEnd;
1700             m_url.m_portEnd = m_url.m_userEnd;
1701         } else if (!parseHostAndPort(authorityOrHostBegin)) {
1702             failure();
1703             return;
1704         }
1705         syntaxViolation(c);
1706         appendToASCIIBuffer('/');
1707         m_url.m_pathEnd = m_url.m_portEnd + 1;
1708         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1709         m_url.m_queryEnd = m_url.m_pathEnd;
1710         m_url.m_fragmentEnd = m_url.m_pathEnd;
1711         break;
1712     case State::Host:
1713         LOG_FINAL_STATE("Host");
1714         if (!parseHostAndPort(authorityOrHostBegin)) {
1715             failure();
1716             return;
1717         }
1718         syntaxViolation(c);
1719         appendToASCIIBuffer('/');
1720         m_url.m_pathEnd = m_url.m_portEnd + 1;
1721         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1722         m_url.m_queryEnd = m_url.m_pathEnd;
1723         m_url.m_fragmentEnd = m_url.m_pathEnd;
1724         break;
1725     case State::File:
1726         LOG_FINAL_STATE("File");
1727         if (base.isValid() && base.protocolIs("file")) {
1728             copyURLPartsUntil(base, URLPart::QueryEnd, c);
1729             appendToASCIIBuffer(':');
1730         }
1731         syntaxViolation(c);
1732         appendToASCIIBuffer("///", 3);
1733         m_url.m_userStart = currentPosition(c) - 1;
1734         m_url.m_userEnd = m_url.m_userStart;
1735         m_url.m_passwordEnd = m_url.m_userStart;
1736         m_url.m_hostEnd = m_url.m_userStart;
1737         m_url.m_portEnd = m_url.m_userStart;
1738         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1739         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1740         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1741         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1742         break;
1743     case State::FileSlash:
1744         LOG_FINAL_STATE("FileSlash");
1745         syntaxViolation(c);
1746         m_url.m_userStart = currentPosition(c) + 1;
1747         appendToASCIIBuffer("//", 2);
1748         m_url.m_userEnd = m_url.m_userStart;
1749         m_url.m_passwordEnd = m_url.m_userStart;
1750         m_url.m_hostEnd = m_url.m_userStart;
1751         m_url.m_portEnd = m_url.m_userStart;
1752         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1753         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1754         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1755         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1756         break;
1757     case State::FileHost:
1758         LOG_FINAL_STATE("FileHost");
1759         if (authorityOrHostBegin == c) {
1760             syntaxViolation(c);
1761             appendToASCIIBuffer('/');
1762             m_url.m_userStart = currentPosition(c) - 1;
1763             m_url.m_userEnd = m_url.m_userStart;
1764             m_url.m_passwordEnd = m_url.m_userStart;
1765             m_url.m_hostEnd = m_url.m_userStart;
1766             m_url.m_portEnd = m_url.m_userStart;
1767             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1768             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1769             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1770             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1771             break;
1772         }
1773
1774         if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1775             failure();
1776             return;
1777         }
1778
1779         syntaxViolation(c);
1780         if (equalLettersIgnoringASCIICase(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd), "localhost")) {
1781             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1782             m_url.m_hostEnd = currentPosition(c);
1783             m_url.m_portEnd = m_url.m_hostEnd;
1784         }
1785         appendToASCIIBuffer('/');
1786         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1787         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1788         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1789         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1790         break;
1791     case State::PathStart:
1792         LOG_FINAL_STATE("PathStart");
1793         RELEASE_ASSERT_NOT_REACHED();
1794     case State::Path:
1795         LOG_FINAL_STATE("Path");
1796         m_url.m_pathEnd = currentPosition(c);
1797         m_url.m_queryEnd = m_url.m_pathEnd;
1798         m_url.m_fragmentEnd = m_url.m_pathEnd;
1799         break;
1800     case State::CannotBeABaseURLPath:
1801         LOG_FINAL_STATE("CannotBeABaseURLPath");
1802         m_url.m_pathEnd = currentPosition(c);
1803         m_url.m_queryEnd = m_url.m_pathEnd;
1804         m_url.m_fragmentEnd = m_url.m_pathEnd;
1805         break;
1806     case State::Query:
1807         LOG_FINAL_STATE("Query");
1808         if (!isUTF8Encoding)
1809             encodeQuery(queryBuffer, encoding);
1810         m_url.m_queryEnd = currentPosition(c);
1811         m_url.m_fragmentEnd = m_url.m_queryEnd;
1812         break;
1813     case State::Fragment:
1814         {
1815             LOG_FINAL_STATE("Fragment");
1816             size_t length = m_didSeeSyntaxViolation ? m_asciiBuffer.size() + m_unicodeFragmentBuffer.size() : c.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1817             m_url.m_fragmentEnd = length;
1818             break;
1819         }
1820     }
1821
1822     if (LIKELY(!m_didSeeSyntaxViolation)) {
1823         m_url.m_string = m_inputString;
1824         ASSERT(m_asciiBuffer.isEmpty());
1825         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1826     } else if (!m_didSeeUnicodeFragmentCodePoint) {
1827         ASSERT(m_unicodeFragmentBuffer.isEmpty());
1828         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1829     } else {
1830         Vector<UChar> buffer;
1831         buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
1832         buffer.appendVector(m_asciiBuffer);
1833         buffer.appendVector(m_unicodeFragmentBuffer);
1834         m_url.m_string = String::adopt(WTFMove(buffer));
1835     }
1836     m_url.m_isValid = true;
1837     LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
1838     ASSERT(internalValuesConsistent(m_url));
1839 }
1840
1841 template<typename CharacterType>
1842 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1843 {
1844     if (UNLIKELY(iterator.atEnd())) {
1845         syntaxViolation(iterator);
1846         m_url.m_userEnd = currentPosition(iterator);
1847         m_url.m_passwordEnd = m_url.m_userEnd;
1848         return;
1849     }
1850     auto authorityOrHostBegin = iterator;
1851     for (; !iterator.atEnd(); advance(iterator)) {
1852         if (*iterator == ':') {
1853             m_url.m_userEnd = currentPosition(iterator);
1854             auto iteratorAtColon = iterator;
1855             advance(iterator, authorityOrHostBegin);
1856             if (UNLIKELY(iterator.atEnd())) {
1857                 syntaxViolation(iteratorAtColon);
1858                 m_url.m_passwordEnd = m_url.m_userEnd;
1859                 if (m_url.m_userEnd > m_url.m_userStart)
1860                     appendToASCIIBuffer('@');
1861                 return;
1862             }
1863             appendToASCIIBuffer(':');
1864             break;
1865         }
1866         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
1867     }
1868     for (; !iterator.atEnd(); advance(iterator))
1869         utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
1870     m_url.m_passwordEnd = currentPosition(iterator);
1871     if (!m_url.m_userEnd)
1872         m_url.m_userEnd = m_url.m_passwordEnd;
1873     appendToASCIIBuffer('@');
1874 }
1875
1876 template<typename UnsignedIntegerType>
1877 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
1878 {
1879     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1880     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1881     LChar* p = end;
1882     do {
1883         *--p = (number % 10) + '0';
1884         number /= 10;
1885     } while (number);
1886     appendToASCIIBuffer(p, end - p);
1887 }
1888
1889 void URLParser::serializeIPv4(IPv4Address address)
1890 {
1891     appendNumberToASCIIBuffer<uint8_t>(address >> 24);
1892     appendToASCIIBuffer('.');
1893     appendNumberToASCIIBuffer<uint8_t>(address >> 16);
1894     appendToASCIIBuffer('.');
1895     appendNumberToASCIIBuffer<uint8_t>(address >> 8);
1896     appendToASCIIBuffer('.');
1897     appendNumberToASCIIBuffer<uint8_t>(address);
1898 }
1899     
// Returns how many consecutive zero pieces |address| contains starting at
// index |begin| (0 when address[begin] is non-zero or |begin| is past the end).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
1909
1910 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
1911 {
1912     Optional<size_t> longest;
1913     size_t longestLength = 0;
1914     for (size_t i = 0; i < 8; i++) {
1915         size_t length = zeroSequenceLength(address, i);
1916         if (length) {
1917             if (length > 1 && (!longest || longestLength < length)) {
1918                 longest = i;
1919                 longestLength = length;
1920             }
1921             i += length;
1922         }
1923     }
1924     return longest;
1925 }
1926
1927 void URLParser::serializeIPv6Piece(uint16_t piece)
1928 {
1929     bool printed = false;
1930     if (auto nibble0 = piece >> 12) {
1931         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
1932         printed = true;
1933     }
1934     auto nibble1 = piece >> 8 & 0xF;
1935     if (printed || nibble1) {
1936         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
1937         printed = true;
1938     }
1939     auto nibble2 = piece >> 4 & 0xF;
1940     if (printed || nibble2)
1941         appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
1942     appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
1943 }
1944
1945 void URLParser::serializeIPv6(URLParser::IPv6Address address)
1946 {
1947     appendToASCIIBuffer('[');
1948     auto compressPointer = findLongestZeroSequence(address);
1949     for (size_t piece = 0; piece < 8; piece++) {
1950         if (compressPointer && compressPointer.value() == piece) {
1951             ASSERT(!address[piece]);
1952             if (piece)
1953                 appendToASCIIBuffer(':');
1954             else
1955                 appendToASCIIBuffer("::", 2);
1956             while (piece < 8 && !address[piece])
1957                 piece++;
1958             if (piece == 8)
1959                 break;
1960         }
1961         serializeIPv6Piece(address[piece]);
1962         if (piece < 7)
1963             appendToASCIIBuffer(':');
1964     }
1965     appendToASCIIBuffer(']');
1966 }
1967
1968 template<typename CharacterType>
1969 Optional<uint32_t> URLParser::parseIPv4Number(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
1970 {
1971     // FIXME: Check for overflow.
1972     enum class State : uint8_t {
1973         UnknownBase,
1974         Decimal,
1975         OctalOrHex,
1976         Octal,
1977         Hex,
1978     };
1979     State state = State::UnknownBase;
1980     uint32_t value = 0;
1981     while (!iterator.atEnd()) {
1982         if (*iterator == '.') {
1983             ++iterator;
1984             return value;
1985         }
1986         switch (state) {
1987         case State::UnknownBase:
1988             if (UNLIKELY(*iterator == '0')) {
1989                 ++iterator;
1990                 state = State::OctalOrHex;
1991                 break;
1992             }
1993             state = State::Decimal;
1994             break;
1995         case State::OctalOrHex:
1996             syntaxViolation(iteratorForSyntaxViolationPosition);
1997             if (*iterator == 'x' || *iterator == 'X') {
1998                 ++iterator;
1999                 state = State::Hex;
2000                 break;
2001             }
2002             state = State::Octal;
2003             break;
2004         case State::Decimal:
2005             if (*iterator < '0' || *iterator > '9')
2006                 return Nullopt;
2007             value *= 10;
2008             value += *iterator - '0';
2009             ++iterator;
2010             break;
2011         case State::Octal:
2012             ASSERT(m_didSeeSyntaxViolation);
2013             if (*iterator < '0' || *iterator > '7')
2014                 return Nullopt;
2015             value *= 8;
2016             value += *iterator - '0';
2017             ++iterator;
2018             break;
2019         case State::Hex:
2020             ASSERT(m_didSeeSyntaxViolation);
2021             if (!isASCIIHexDigit(*iterator))
2022                 return Nullopt;
2023             value *= 16;
2024             value += toASCIIHexValue(*iterator);
2025             ++iterator;
2026             break;
2027         }
2028     }
2029     return value;
2030 }
2031
2032 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2033 {
2034     RELEASE_ASSERT(exponent <= 4);
2035     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2036     return values[exponent];
2037 }
2038
2039 template<typename CharacterType>
2040 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
2041 {
2042     auto hostBegin = iterator;
2043
2044     Vector<uint32_t, 4> items;
2045     items.reserveInitialCapacity(4);
2046     while (!iterator.atEnd()) {
2047         if (items.size() >= 4)
2048             return Nullopt;
2049         if (auto item = parseIPv4Number(iterator, hostBegin))
2050             items.append(item.value());
2051         else
2052             return Nullopt;
2053     }
2054     if (!items.size() || items.size() > 4)
2055         return Nullopt;
2056     if (items.size() > 2) {
2057         for (size_t i = 0; i < items.size() - 2; i++) {
2058             if (items[i] > 255)
2059                 return Nullopt;
2060         }
2061     }
2062     if (items[items.size() - 1] >= pow256(5 - items.size()))
2063         return Nullopt;
2064     for (auto item : items) {
2065         if (item > 255)
2066             return Nullopt;
2067     }
2068
2069     if (UNLIKELY(items.size() != 4))
2070         syntaxViolation(hostBegin);
2071
2072     IPv4Address ipv4 = items.takeLast();
2073     for (size_t counter = 0; counter < items.size(); ++counter)
2074         ipv4 += items[counter] * pow256(3 - counter);
2075     return ipv4;
2076 }
2077     
2078 template<typename CharacterType>
2079 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2080 {
2081     ASSERT(*c == '[');
2082     auto hostBegin = c;
2083     advance(c, hostBegin);
2084     if (c.atEnd())
2085         return Nullopt;
2086
2087     IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2088     size_t piecePointer = 0;
2089     Optional<size_t> compressPointer;
2090
2091     if (*c == ':') {
2092         advance(c, hostBegin);
2093         if (c.atEnd())
2094             return Nullopt;
2095         if (*c != ':')
2096             return Nullopt;
2097         advance(c, hostBegin);
2098         ++piecePointer;
2099         compressPointer = piecePointer;
2100     }
2101     
2102     while (!c.atEnd()) {
2103         if (piecePointer == 8)
2104             return Nullopt;
2105         if (*c == ':') {
2106             if (compressPointer)
2107                 return Nullopt;
2108             advance(c, hostBegin);
2109             ++piecePointer;
2110             compressPointer = piecePointer;
2111             continue;
2112         }
2113         uint16_t value = 0;
2114         size_t length = 0;
2115         for (; length < 4; length++) {
2116             if (c.atEnd())
2117                 break;
2118             if (!isASCIIHexDigit(*c))
2119                 break;
2120             if (isASCIIUpper(*c))
2121                 syntaxViolation(hostBegin);
2122             value = value * 0x10 + toASCIIHexValue(*c);
2123             advance(c, hostBegin);
2124         }
2125         if (UNLIKELY(length > 1 && !value))
2126             syntaxViolation(hostBegin);
2127
2128         address[piecePointer++] = value;
2129         if (c.atEnd())
2130             break;
2131         if (*c != ':')
2132             return Nullopt;
2133         advance(c, hostBegin);
2134     }
2135     
2136     if (!c.atEnd()) {
2137         if (piecePointer > 6)
2138             return Nullopt;
2139         size_t dotsSeen = 0;
2140         while (!c.atEnd()) {
2141             Optional<uint16_t> value;
2142             if (!isASCIIDigit(*c))
2143                 return Nullopt;
2144             while (isASCIIDigit(*c)) {
2145                 auto number = *c - '0';
2146                 if (!value)
2147                     value = number;
2148                 else if (!value.value())
2149                     return Nullopt;
2150                 else
2151                     value = value.value() * 10 + number;
2152                 advance(c, hostBegin);
2153                 if (c.atEnd())
2154                     return Nullopt;
2155                 if (value.value() > 255)
2156                     return Nullopt;
2157             }
2158             if (dotsSeen < 3 && *c != '.')
2159                 return Nullopt;
2160             address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
2161             if (dotsSeen == 1 || dotsSeen == 3)
2162                 piecePointer++;
2163             if (!c.atEnd())
2164                 advance(c, hostBegin);
2165             if (dotsSeen == 3 && !c.atEnd())
2166                 return Nullopt;
2167             dotsSeen++;
2168         }
2169     }
2170     if (compressPointer) {
2171         size_t swaps = piecePointer - compressPointer.value();
2172         piecePointer = 7;
2173         while (swaps)
2174             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2175     } else if (piecePointer != 8)
2176         return Nullopt;
2177
2178     Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2179     if (possibleCompressPointer)
2180         possibleCompressPointer.value()++;
2181     if (UNLIKELY(compressPointer != possibleCompressPointer))
2182         syntaxViolation(hostBegin);
2183     
2184     return address;
2185 }
2186
2187 const size_t defaultInlineBufferSize = 2048;
2188
2189 static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
2190 {
2191     Vector<LChar, defaultInlineBufferSize> output;
2192     output.reserveInitialCapacity(length);
2193     
2194     for (size_t i = 0; i < length; ++i) {
2195         uint8_t byte = input[i];
2196         if (byte != '%')
2197             output.uncheckedAppend(byte);
2198         else if (i < length - 2) {
2199             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2200                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2201                 i += 2;
2202             } else
2203                 output.uncheckedAppend(byte);
2204         } else
2205             output.uncheckedAppend(byte);
2206     }
2207     return output;
2208 }
2209
2210 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2211 {
2212     if (string.is8Bit())
2213         return charactersAreAllASCII(string.characters8(), string.length());
2214     return charactersAreAllASCII(string.characters16(), string.length());
2215 }
2216
2217 static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
2218 {
2219     Vector<LChar, defaultInlineBufferSize> ascii;
2220     if (containsOnlyASCII(domain)) {
2221         size_t length = domain.length();
2222         if (domain.is8Bit()) {
2223             const LChar* characters = domain.characters8();
2224             ascii.reserveInitialCapacity(length);
2225             for (size_t i = 0; i < length; ++i)
2226                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2227         } else {
2228             const UChar* characters = domain.characters16();
2229             ascii.reserveInitialCapacity(length);
2230             for (size_t i = 0; i < length; ++i)
2231                 ascii.uncheckedAppend(toASCIILower(characters[i]));
2232         }
2233         return ascii;
2234     }
2235     
2236     UChar hostnameBuffer[defaultInlineBufferSize];
2237     UErrorCode error = U_ZERO_ERROR;
2238
2239 #if COMPILER(GCC) || COMPILER(CLANG)
2240 #pragma GCC diagnostic push
2241 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2242 #endif
2243     // FIXME: This should use uidna_openUTS46 / uidna_close instead
2244     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2245 #if COMPILER(GCC) || COMPILER(CLANG)
2246 #pragma GCC diagnostic pop
2247 #endif
2248     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2249
2250     if (error == U_ZERO_ERROR) {
2251         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2252             ASSERT(isASCII(hostnameBuffer[i]));
2253             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2254         }
2255         ascii.append(hostnameBuffer, numCharactersConverted);
2256         return ascii;
2257     }
2258
2259     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2260     return Nullopt;
2261 }
2262
2263 static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
2264 {
2265     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2266         if (isInvalidDomainCharacter(asciiDomain[i]))
2267             return true;
2268     }
2269     return false;
2270 }
2271
2272 template<typename CharacterType>
2273 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2274 {
2275     ASSERT(*iterator == ':');
2276     auto colonIterator = iterator;
2277     advance(iterator, colonIterator);
2278     uint32_t port = 0;
2279     if (UNLIKELY(iterator.atEnd())) {
2280         m_url.m_portEnd = currentPosition(colonIterator);
2281         syntaxViolation(colonIterator);
2282         return true;
2283     }
2284     for (; !iterator.atEnd(); ++iterator) {
2285         if (UNLIKELY(isTabOrNewline(*iterator))) {
2286             syntaxViolation(colonIterator);
2287             continue;
2288         }
2289         if (isASCIIDigit(*iterator)) {
2290             port = port * 10 + *iterator - '0';
2291             if (port > std::numeric_limits<uint16_t>::max())
2292                 return false;
2293         } else
2294             return false;
2295     }
2296
2297     if (UNLIKELY(isDefaultPort(parsedDataView(0, m_url.m_schemeEnd), port)))
2298         syntaxViolation(colonIterator);
2299     else {
2300         appendToASCIIBuffer(':');
2301         ASSERT(port <= std::numeric_limits<uint16_t>::max());
2302         appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2303     }
2304
2305     m_url.m_portEnd = currentPosition(iterator);
2306     return true;
2307 }
2308
2309 template<typename CharacterType>
2310 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2311 {
2312     if (iterator.atEnd())
2313         return false;
2314     if (*iterator == '[') {
2315         auto ipv6End = iterator;
2316         while (!ipv6End.atEnd() && *ipv6End != ']')
2317             ++ipv6End;
2318         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2319             serializeIPv6(address.value());
2320             if (!ipv6End.atEnd()) {
2321                 advance(ipv6End);
2322                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2323                     m_url.m_hostEnd = currentPosition(ipv6End);
2324                     return parsePort(ipv6End);
2325                 }
2326                 m_url.m_hostEnd = currentPosition(ipv6End);
2327                 m_url.m_portEnd = m_url.m_hostEnd;
2328                 return true;
2329             }
2330             m_url.m_hostEnd = currentPosition(ipv6End);
2331             return true;
2332         }
2333     }
2334
2335     if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2336         auto hostIterator = iterator;
2337         for (; !iterator.atEnd(); ++iterator) {
2338             if (isTabOrNewline(*iterator))
2339                 continue;
2340             if (*iterator == ':')
2341                 break;
2342             if (isInvalidDomainCharacter(*iterator))
2343                 return false;
2344         }
2345         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2346             serializeIPv4(address.value());
2347             m_url.m_hostEnd = currentPosition(iterator);
2348             if (iterator.atEnd()) {
2349                 m_url.m_portEnd = currentPosition(iterator);
2350                 return true;
2351             }
2352             return parsePort(iterator);
2353         }
2354         for (; hostIterator != iterator; ++hostIterator) {
2355             if (LIKELY(!isTabOrNewline(*hostIterator))) {
2356                 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2357                     syntaxViolation(hostIterator);
2358                 appendToASCIIBuffer(toASCIILower(*hostIterator));
2359             } else
2360                 syntaxViolation(hostIterator);
2361         }
2362         m_url.m_hostEnd = currentPosition(iterator);
2363         if (!hostIterator.atEnd())
2364             return parsePort(hostIterator);
2365         m_url.m_portEnd = currentPosition(iterator);
2366         return true;
2367     }
2368     
2369     syntaxViolation(iterator);
2370     
2371     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2372     for (; !iterator.atEnd(); ++iterator) {
2373         if (isTabOrNewline(*iterator))
2374             continue;
2375         if (*iterator == ':')
2376             break;
2377         uint8_t buffer[U8_MAX_LENGTH];
2378         int32_t offset = 0;
2379         UBool error = false;
2380         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2381         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2382         // FIXME: Check error.
2383         utf8Encoded.append(buffer, offset);
2384     }
2385     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
2386     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2387     auto asciiDomain = domainToASCII(domain);
2388     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2389         return false;
2390     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2391     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2392
2393     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2394         serializeIPv4(address.value());
2395         m_url.m_hostEnd = currentPosition(iterator);
2396         if (iterator.atEnd()) {
2397             m_url.m_portEnd = currentPosition(iterator);
2398             return true;
2399         }
2400         return parsePort(iterator);
2401     }
2402
2403     appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2404     m_url.m_hostEnd = currentPosition(iterator);
2405     if (!iterator.atEnd())
2406         return parsePort(iterator);
2407     m_url.m_portEnd = currentPosition(iterator);
2408     return true;
2409 }
2410
2411 static Optional<String> formURLDecode(StringView input)
2412 {
2413     auto utf8 = input.utf8(StrictConversion);
2414     if (utf8.isNull())
2415         return Nullopt;
2416     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2417     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2418 }
2419
2420 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2421 {
2422     Vector<StringView> sequences = input.split('&');
2423
2424     URLEncodedForm output;
2425     for (auto& bytes : sequences) {
2426         auto valueStart = bytes.find('=');
2427         if (valueStart == notFound) {
2428             if (auto name = formURLDecode(bytes))
2429                 output.append({name.value().replace('+', 0x20), emptyString()});
2430         } else {
2431             auto name = formURLDecode(bytes.substring(0, valueStart));
2432             auto value = formURLDecode(bytes.substring(valueStart + 1));
2433             if (name && value)
2434                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2435         }
2436     }
2437     return output;
2438 }
2439
2440 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2441 {
2442     auto utf8 = input.utf8(StrictConversion);
2443     const char* data = utf8.data();
2444     for (size_t i = 0; i < utf8.length(); ++i) {
2445         const char byte = data[i];
2446         if (byte == 0x20)
2447             output.append(0x2B);
2448         else if (byte == 0x2A
2449             || byte == 0x2D
2450             || byte == 0x2E
2451             || (byte >= 0x30 && byte <= 0x39)
2452             || (byte >= 0x41 && byte <= 0x5A)
2453             || byte == 0x5F
2454             || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2455             output.append(byte);
2456         else
2457             percentEncodeByte(byte, output);
2458     }
2459 }
2460     
2461 String URLParser::serialize(const URLEncodedForm& tuples)
2462 {
2463     Vector<LChar> output;
2464     for (auto& tuple : tuples) {
2465         if (!output.isEmpty())
2466             output.append('&');
2467         serializeURLEncodedForm(tuple.first, output);
2468         output.append('=');
2469         serializeURLEncodedForm(tuple.second, output);
2470     }
2471     return String::adopt(WTFMove(output));
2472 }
2473
2474 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2475 {
2476     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2477     // but once we get rid of URL::parse its value should be tested.
2478     LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2479         a.m_isValid,
2480         a.m_protocolIsInHTTPFamily,
2481         a.m_schemeEnd,
2482         a.m_userStart,
2483         a.m_userEnd,
2484         a.m_passwordEnd,
2485         a.m_hostEnd,
2486         a.m_portEnd,
2487         a.m_pathAfterLastSlash,
2488         a.m_pathEnd,
2489         a.m_queryEnd,
2490         a.m_fragmentEnd,
2491         a.m_string.utf8().data(),
2492         b.m_isValid,
2493         b.m_protocolIsInHTTPFamily,
2494         b.m_schemeEnd,
2495         b.m_userStart,
2496         b.m_userEnd,
2497         b.m_passwordEnd,
2498         b.m_hostEnd,
2499         b.m_portEnd,
2500         b.m_pathAfterLastSlash,
2501         b.m_pathEnd,
2502         b.m_queryEnd,
2503         b.m_fragmentEnd,
2504         b.m_string.utf8().data());
2505
2506     return a.m_string == b.m_string
2507         && a.m_isValid == b.m_isValid
2508         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2509         && a.m_schemeEnd == b.m_schemeEnd
2510         && a.m_userStart == b.m_userStart
2511         && a.m_userEnd == b.m_userEnd
2512         && a.m_passwordEnd == b.m_passwordEnd
2513         && a.m_hostEnd == b.m_hostEnd
2514         && a.m_portEnd == b.m_portEnd
2515         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2516         && a.m_pathEnd == b.m_pathEnd
2517         && a.m_queryEnd == b.m_queryEnd
2518         && a.m_fragmentEnd == b.m_fragmentEnd;
2519 }
2520
2521 bool URLParser::internalValuesConsistent(const URL& url)
2522 {
2523     return url.m_schemeEnd <= url.m_userStart
2524         && url.m_userStart <= url.m_userEnd
2525         && url.m_userEnd <= url.m_passwordEnd
2526         && url.m_passwordEnd <= url.m_hostEnd
2527         && url.m_hostEnd <= url.m_portEnd
2528         && url.m_portEnd <= url.m_pathAfterLastSlash
2529         && url.m_pathAfterLastSlash <= url.m_pathEnd
2530         && url.m_pathEnd <= url.m_queryEnd
2531         && url.m_queryEnd <= url.m_fragmentEnd
2532         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2533     // FIXME: Why do we even store m_fragmentEnd?
2534     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2535 }
2536
2537 static bool urlParserEnabled = false;
2538
2539 void URLParser::setEnabled(bool enabled)
2540 {
2541     urlParserEnabled = enabled;
2542 }
2543
2544 bool URLParser::enabled()
2545 {
2546     return urlParserEnabled;
2547 }
2548
2549 } // namespace WebCore