eb0f79bbb9e6ef087359c71f312262ff4d3f52a0
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include <array>
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
33 #include <wtf/HashMap.h>
34 #include <wtf/NeverDestroyed.h>
35 #include <wtf/text/StringBuilder.h>
36 #include <wtf/text/StringHash.h>
37
38 namespace WebCore {
39
40 template<typename CharacterType>
41 class CodePointIterator {
42 public:
43     CodePointIterator() { }
44     CodePointIterator(const CharacterType* begin, const CharacterType* end)
45         : m_begin(begin)
46         , m_end(end)
47     {
48     }
49     
50     CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
51         : CodePointIterator(begin.m_begin, end.m_begin)
52     {
53         ASSERT(end.m_begin >= begin.m_begin);
54     }
55     
56     UChar32 operator*() const;
57     CodePointIterator& operator++();
58
59     bool operator==(const CodePointIterator& other) const
60     {
61         return m_begin == other.m_begin
62             && m_end == other.m_end;
63     }
64     bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
65     
66     CodePointIterator& operator=(const CodePointIterator& other)
67     {
68         m_begin = other.m_begin;
69         m_end = other.m_end;
70         return *this;
71     }
72
73     bool atEnd() const
74     {
75         ASSERT(m_begin <= m_end);
76         return m_begin >= m_end;
77     }
78     
79 private:
80     const CharacterType* m_begin { nullptr };
81     const CharacterType* m_end { nullptr };
82 };
83
84 template<>
85 UChar32 CodePointIterator<LChar>::operator*() const
86 {
87     ASSERT(!atEnd());
88     return *m_begin;
89 }
90
91 template<>
92 auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
93 {
94     ASSERT(!atEnd());
95     m_begin++;
96     return *this;
97 }
98
99 template<>
100 UChar32 CodePointIterator<UChar>::operator*() const
101 {
102     ASSERT(!atEnd());
103     UChar32 c;
104     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
105     return c;
106 }
107
108 template<>
109 auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
110 {
111     ASSERT(!atEnd());
112     unsigned i = 0;
113     size_t length = m_end - m_begin;
114     U16_FWD_1(m_begin, i, length);
115     m_begin += i;
116     return *this;
117 }
118
// Bit flags that are OR-ed together in each characterClassTable entry. Every flag
// occupies its own bit so a table entry can belong to several classes at once.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
};
126
// Lookup table of URLCharacterClass flags with one entry per byte value (0x00-0xFF).
// The trailing comment on each row names the byte that the row describes.
// Rows from 0x7F upward carry only QueryPercent: the Default and UserInfo predicates
// test "character > 0x7E" themselves before they consult this table.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, // '&'
    0, // '''
    0, // '('
    0, // ')'
    0, // '*'
    0, // '+'
    0, // ','
    0, // '-'
    0, // '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
    0, // '0'
    0, // '1'
    0, // '2'
    0, // '3'
    0, // '4'
    0, // '5'
    0, // '6'
    0, // '7'
    0, // '8'
    0, // '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'
    0, // 'A'
    0, // 'B'
    0, // 'C'
    0, // 'D'
    0, // 'E'
    0, // 'F'
    0, // 'G'
    0, // 'H'
    0, // 'I'
    0, // 'J'
    0, // 'K'
    0, // 'L'
    0, // 'M'
    0, // 'N'
    0, // 'O'
    0, // 'P'
    0, // 'Q'
    0, // 'R'
    0, // 'S'
    0, // 'T'
    0, // 'U'
    0, // 'V'
    0, // 'W'
    0, // 'X'
    0, // 'Y'
    0, // 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    0, // 'a'
    0, // 'b'
    0, // 'c'
    0, // 'd'
    0, // 'e'
    0, // 'f'
    0, // 'g'
    0, // 'h'
    0, // 'i'
    0, // 'j'
    0, // 'k'
    0, // 'l'
    0, // 'm'
    0, // 'n'
    0, // 'o'
    0, // 'p'
    0, // 'q'
    0, // 'r'
    0, // 's'
    0, // 't'
    0, // 'u'
    0, // 'v'
    0, // 'w'
    0, // 'x'
    0, // 'y'
    0, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
385
// True for values up to and including 0x1F.
template<typename CharacterType>
inline static bool isC0Control(CharacterType character)
{
    return character <= 0x1F;
}

// True for values up to and including 0x20 (space).
template<typename CharacterType>
inline static bool isC0ControlOrSpace(CharacterType character)
{
    return character <= 0x20;
}

// True for tab (0x9), line feed (0xA) and carriage return (0xD) only.
template<typename CharacterType>
inline static bool isTabOrNewline(CharacterType character)
{
    return character == 0x9 || character == 0xA || character == 0xD;
}

// The simple encode set: C0 controls plus everything above 0x7E.
template<typename CharacterType>
inline static bool isInSimpleEncodeSet(CharacterType character)
{
    return isC0Control(character) || character > 0x7E;
}
390 template<typename CharacterType> inline static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
391 template<typename CharacterType> inline static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
392 template<typename CharacterType> inline static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
393 template<typename CharacterType> inline static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
394 template<typename CharacterType> inline static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
395 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
396
397 template<bool serialized, typename CharacterType>
398 void incrementIteratorSkippingTabAndNewLine(CodePointIterator<CharacterType>& iterator)
399 {
400     ++iterator;
401     while (!serialized && !iterator.atEnd() && isTabOrNewline(*iterator))
402         ++iterator;
403 }
404
405 template<bool serialized, typename CharacterType>
406 inline static bool isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
407 {
408     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
409         return false;
410     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
411     if (iterator.atEnd())
412         return false;
413     return *iterator == ':' || *iterator == '|';
414 }
415
416 inline static bool isWindowsDriveLetter(const Vector<LChar>& buffer, size_t index)
417 {
418     if (buffer.size() < index + 2)
419         return false;
420     return isASCIIAlpha(buffer[index]) && (buffer[index + 1] == ':' || buffer[index + 1] == '|');
421 }
422
423 template<bool serialized, typename CharacterType>
424 inline static void checkWindowsDriveLetter(CodePointIterator<CharacterType>& iterator, Vector<LChar>& asciiBuffer)
425 {
426     if (isWindowsDriveLetter<serialized>(iterator)) {
427         asciiBuffer.reserveCapacity(asciiBuffer.size() + 2);
428         asciiBuffer.uncheckedAppend(*iterator);
429         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
430         ASSERT(!iterator.atEnd());
431         ASSERT(*iterator == ':' || *iterator == '|');
432         asciiBuffer.uncheckedAppend(':');
433         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
434     }
435 }
436
437 template<bool serialized, typename CharacterType>
438 inline static bool shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
439 {
440     if (!isWindowsDriveLetter<serialized>(iterator))
441         return true;
442     if (iterator.atEnd())
443         return false;
444     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
445     if (iterator.atEnd())
446         return true;
447     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
448     if (iterator.atEnd())
449         return true;
450     return !isSlashQuestionOrHash(*iterator);
451 }
452
453 inline static void percentEncode(uint8_t byte, Vector<LChar>& buffer)
454 {
455     buffer.append('%');
456     buffer.append(upperNibbleToASCIIHexDigit(byte));
457     buffer.append(lowerNibbleToASCIIHexDigit(byte));
458 }
459
460 template<bool serialized>
461 inline static void utf8PercentEncode(UChar32 codePoint, Vector<LChar>& destination, bool(*isInCodeSet)(UChar32))
462 {
463     if (serialized) {
464         ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
465         ASSERT_WITH_SECURITY_IMPLICATION(!isInCodeSet(codePoint));
466         destination.append(codePoint);
467     } else {
468         if (isInCodeSet(codePoint)) {
469             uint8_t buffer[U8_MAX_LENGTH];
470             int32_t offset = 0;
471             UBool error = false;
472             U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
473             // FIXME: Check error.
474             for (int32_t i = 0; i < offset; ++i)
475                 percentEncode(buffer[i], destination);
476         } else {
477             ASSERT_WITH_MESSAGE(isASCII(codePoint), "isInCodeSet should always return true for non-ASCII characters");
478             destination.append(codePoint);
479         }
480     }
481 }
482
483 template<bool serialized>
484 inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& destination)
485 {
486     if (serialized) {
487         ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
488         ASSERT_WITH_SECURITY_IMPLICATION(!shouldPercentEncodeQueryByte(codePoint));
489         destination.append(codePoint);
490     } else {
491         uint8_t buffer[U8_MAX_LENGTH];
492         int32_t offset = 0;
493         UBool error = false;
494         U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
495         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
496         // FIXME: Check error.
497         for (int32_t i = 0; i < offset; ++i) {
498             auto byte = buffer[i];
499             if (shouldPercentEncodeQueryByte(byte))
500                 percentEncode(byte, destination);
501             else
502                 destination.append(byte);
503         }
504     }
505 }
506     
507 inline static void encodeQuery(const StringBuilder& source, Vector<LChar>& destination, const TextEncoding& encoding)
508 {
509     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
510     CString encoded = encoding.encode(source.toStringPreserveCapacity(), URLEncodedEntitiesForUnencodables);
511     const char* data = encoded.data();
512     size_t length = encoded.length();
513     for (size_t i = 0; i < length; ++i) {
514         uint8_t byte = data[i];
515         if (shouldPercentEncodeQueryByte(byte))
516             percentEncode(byte, destination);
517         else
518             destination.append(byte);
519     }
520 }
521
522 inline static bool isDefaultPort(StringView scheme, uint16_t port)
523 {
524     static const uint16_t ftpPort = 21;
525     static const uint16_t gopherPort = 70;
526     static const uint16_t httpPort = 80;
527     static const uint16_t httpsPort = 443;
528     static const uint16_t wsPort = 80;
529     static const uint16_t wssPort = 443;
530     
531     auto length = scheme.length();
532     if (!length)
533         return false;
534     switch (scheme[0]) {
535     case 'w':
536         switch (length) {
537         case 2:
538             return scheme[1] == 's'
539                 && port == wsPort;
540         case 3:
541             return scheme[1] == 's'
542                 && scheme[2] == 's'
543                 && port == wssPort;
544         default:
545             return false;
546         }
547     case 'h':
548         switch (length) {
549         case 4:
550             return scheme[1] == 't'
551                 && scheme[2] == 't'
552                 && scheme[3] == 'p'
553                 && port == httpPort;
554         case 5:
555             return scheme[1] == 't'
556                 && scheme[2] == 't'
557                 && scheme[3] == 'p'
558                 && scheme[4] == 's'
559                 && port == httpsPort;
560         default:
561             return false;
562         }
563     case 'g':
564         return length == 6
565             && scheme[1] == 'o'
566             && scheme[2] == 'p'
567             && scheme[3] == 'h'
568             && scheme[4] == 'e'
569             && scheme[5] == 'r'
570             && port == gopherPort;
571     case 'f':
572         return length == 3
573             && scheme[1] == 't'
574             && scheme[2] == 'p'
575             && port == ftpPort;
576         return false;
577     default:
578         return false;
579     }
580 }
581
582 inline static bool isSpecialScheme(StringView scheme)
583 {
584     auto length = scheme.length();
585     if (!length)
586         return false;
587     switch (scheme[0]) {
588     case 'f':
589         switch (length) {
590         case 3:
591             return scheme[1] == 't'
592                 && scheme[2] == 'p';
593         case 4:
594             return scheme[1] == 'i'
595                 && scheme[2] == 'l'
596                 && scheme[3] == 'e';
597         default:
598             return false;
599         }
600     case 'g':
601         return length == 6
602             && scheme[1] == 'o'
603             && scheme[2] == 'p'
604             && scheme[3] == 'h'
605             && scheme[4] == 'e'
606             && scheme[5] == 'r';
607     case 'h':
608         switch (length) {
609         case 4:
610             return scheme[1] == 't'
611                 && scheme[2] == 't'
612                 && scheme[3] == 'p';
613         case 5:
614             return scheme[1] == 't'
615                 && scheme[2] == 't'
616                 && scheme[3] == 'p'
617                 && scheme[4] == 's';
618         default:
619             return false;
620         }
621     case 'w':
622         switch (length) {
623         case 2:
624             return scheme[1] == 's';
625         case 3:
626             return scheme[1] == 's'
627                 && scheme[2] == 's';
628         default:
629             return false;
630         }
631     default:
632         return false;
633     }
634 }
635
636 enum class URLParser::URLPart {
637     SchemeEnd,
638     UserStart,
639     UserEnd,
640     PasswordEnd,
641     HostEnd,
642     PortEnd,
643     PathAfterLastSlash,
644     PathEnd,
645     QueryEnd,
646     FragmentEnd,
647 };
648
649 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
650 {
651     switch (part) {
652     case URLPart::FragmentEnd:
653         return url.m_fragmentEnd;
654     case URLPart::QueryEnd:
655         return url.m_queryEnd;
656     case URLPart::PathEnd:
657         return url.m_pathEnd;
658     case URLPart::PathAfterLastSlash:
659         return url.m_pathAfterLastSlash;
660     case URLPart::PortEnd:
661         return url.m_portEnd;
662     case URLPart::HostEnd:
663         return url.m_hostEnd;
664     case URLPart::PasswordEnd:
665         return url.m_passwordEnd;
666     case URLPart::UserEnd:
667         return url.m_userEnd;
668     case URLPart::UserStart:
669         return url.m_userStart;
670     case URLPart::SchemeEnd:
671         return url.m_schemeEnd;
672     }
673     ASSERT_NOT_REACHED();
674     return 0;
675 }
676
677 inline static void copyASCIIStringUntil(Vector<LChar>& destination, const String& string, size_t lengthIf8Bit, size_t lengthIf16Bit)
678 {
679     ASSERT(destination.isEmpty());
680     if (string.is8Bit()) {
681         RELEASE_ASSERT(lengthIf8Bit <= string.length());
682         destination.append(string.characters8(), lengthIf8Bit);
683     } else {
684         RELEASE_ASSERT(lengthIf16Bit <= string.length());
685         destination.reserveCapacity(lengthIf16Bit);
686         const UChar* characters = string.characters16();
687         for (size_t i = 0; i < lengthIf16Bit; ++i) {
688             UChar c = characters[i];
689             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
690             destination.uncheckedAppend(c);
691         }
692     }
693 }
694
695 void URLParser::copyURLPartsUntil(const URL& base, URLPart part)
696 {
697     m_asciiBuffer.clear();
698     m_unicodeFragmentBuffer.clear();
699     if (part == URLPart::FragmentEnd) {
700         copyASCIIStringUntil(m_asciiBuffer, base.m_string, urlLengthUntilPart(base, URLPart::FragmentEnd), urlLengthUntilPart(base, URLPart::QueryEnd));
701         if (!base.m_string.is8Bit()) {
702             const String& fragment = base.m_string;
703             bool seenUnicode = false;
704             for (size_t i = base.m_queryEnd; i < base.m_fragmentEnd; ++i) {
705                 if (!seenUnicode && !isASCII(fragment[i]))
706                     seenUnicode = true;
707                 if (seenUnicode)
708                     m_unicodeFragmentBuffer.uncheckedAppend(fragment[i]);
709                 else
710                     m_asciiBuffer.uncheckedAppend(fragment[i]);
711             }
712         }
713     } else {
714         size_t length = urlLengthUntilPart(base, part);
715         copyASCIIStringUntil(m_asciiBuffer, base.m_string, length, length);
716     }
717     switch (part) {
718     case URLPart::FragmentEnd:
719         m_url.m_fragmentEnd = base.m_fragmentEnd;
720         FALLTHROUGH;
721     case URLPart::QueryEnd:
722         m_url.m_queryEnd = base.m_queryEnd;
723         FALLTHROUGH;
724     case URLPart::PathEnd:
725         m_url.m_pathEnd = base.m_pathEnd;
726         FALLTHROUGH;
727     case URLPart::PathAfterLastSlash:
728         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
729         FALLTHROUGH;
730     case URLPart::PortEnd:
731         m_url.m_portEnd = base.m_portEnd;
732         FALLTHROUGH;
733     case URLPart::HostEnd:
734         m_url.m_hostEnd = base.m_hostEnd;
735         FALLTHROUGH;
736     case URLPart::PasswordEnd:
737         m_url.m_passwordEnd = base.m_passwordEnd;
738         FALLTHROUGH;
739     case URLPart::UserEnd:
740         m_url.m_userEnd = base.m_userEnd;
741         FALLTHROUGH;
742     case URLPart::UserStart:
743         m_url.m_userStart = base.m_userStart;
744         FALLTHROUGH;
745     case URLPart::SchemeEnd:
746         m_url.m_isValid = base.m_isValid;
747         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
748         m_url.m_schemeEnd = base.m_schemeEnd;
749     }
750     m_urlIsSpecial = isSpecialScheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd));
751 }
752
753 static const char* dotASCIICode = "2e";
754
755 template<typename CharacterType>
756 inline static bool isPercentEncodedDot(CodePointIterator<CharacterType> c)
757 {
758     if (c.atEnd())
759         return false;
760     if (*c != '%')
761         return false;
762     ++c;
763     if (c.atEnd())
764         return false;
765     if (*c != dotASCIICode[0])
766         return false;
767     ++c;
768     if (c.atEnd())
769         return false;
770     return toASCIILower(*c) == dotASCIICode[1];
771 }
772
773 template<typename CharacterType>
774 inline static bool isSingleDotPathSegment(CodePointIterator<CharacterType> c)
775 {
776     if (c.atEnd())
777         return false;
778     if (*c == '.') {
779         ++c;
780         return c.atEnd() || isSlashQuestionOrHash(*c);
781     }
782     if (*c != '%')
783         return false;
784     ++c;
785     if (c.atEnd() || *c != dotASCIICode[0])
786         return false;
787     ++c;
788     if (c.atEnd())
789         return false;
790     if (toASCIILower(*c) == dotASCIICode[1]) {
791         ++c;
792         return c.atEnd() || isSlashQuestionOrHash(*c);
793     }
794     return false;
795 }
796
797 template<typename CharacterType>
798 inline static bool isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
799 {
800     if (c.atEnd())
801         return false;
802     if (*c == '.') {
803         ++c;
804         return isSingleDotPathSegment(c);
805     }
806     if (*c != '%')
807         return false;
808     ++c;
809     if (c.atEnd() || *c != dotASCIICode[0])
810         return false;
811     ++c;
812     if (c.atEnd())
813         return false;
814     if (toASCIILower(*c) == dotASCIICode[1]) {
815         ++c;
816         return isSingleDotPathSegment(c);
817     }
818     return false;
819 }
820
821 template<typename CharacterType>
822 inline static void consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
823 {
824     ASSERT(isSingleDotPathSegment(c));
825     if (*c == '.') {
826         ++c;
827         if (!c.atEnd()) {
828             if (*c == '/' || *c == '\\')
829                 ++c;
830             else
831                 ASSERT(*c == '?' || *c == '#');
832         }
833     } else {
834         ASSERT(*c == '%');
835         ++c;
836         ASSERT(*c == dotASCIICode[0]);
837         ++c;
838         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
839         ++c;
840         if (!c.atEnd()) {
841             if (*c == '/' || *c == '\\')
842                 ++c;
843             else
844                 ASSERT(*c == '?' || *c == '#');
845         }
846     }
847 }
848
849 template<typename CharacterType>
850 inline static void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
851 {
852     ASSERT(isDoubleDotPathSegment(c));
853     if (*c == '.')
854         ++c;
855     else {
856         ASSERT(*c == '%');
857         ++c;
858         ASSERT(*c == dotASCIICode[0]);
859         ++c;
860         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
861         ++c;
862     }
863     consumeSingleDotPathSegment(c);
864 }
865
866 void URLParser::popPath()
867 {
868     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
869         m_url.m_pathAfterLastSlash--;
870         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
871             m_url.m_pathAfterLastSlash--;
872         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
873             m_url.m_pathAfterLastSlash--;
874         m_url.m_pathAfterLastSlash++;
875     }
876     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
877 }
878
879 template<typename CharacterType>
880 URL URLParser::failure(const CharacterType* input, unsigned length)
881 {
882     URL url;
883     url.m_isValid = false;
884     url.m_protocolIsInHTTPFamily = false;
885     url.m_cannotBeABaseURL = false;
886     url.m_schemeEnd = 0;
887     url.m_userStart = 0;
888     url.m_userEnd = 0;
889     url.m_passwordEnd = 0;
890     url.m_hostEnd = 0;
891     url.m_portEnd = 0;
892     url.m_pathAfterLastSlash = 0;
893     url.m_pathEnd = 0;
894     url.m_queryEnd = 0;
895     url.m_fragmentEnd = 0;
896     url.m_string = String(input, length);
897     return url;
898 }
899
900 URL URLParser::parse(const String& input, const URL& base, const TextEncoding& encoding)
901 {
902     const bool serialized = false;
903     if (input.is8Bit())
904         return parse<serialized>(input.characters8(), input.length(), base, encoding);
905     return parse<serialized>(input.characters16(), input.length(), base, encoding);
906 }
907
908 URL URLParser::parseSerializedURL(const String& input)
909 {
910     const bool serialized = true;
911     if (input.is8Bit())
912         return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
913     return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
914 }
915     
// Runs the URL parsing state machine over `input` (`length` code units) and
// returns the resulting URL, or failure(input, length) when no valid URL can
// be produced.
//
// - `serialized`: when false, tab and newline code points are skipped wherever
//   they occur in the input.
// - `base`: consulted when the input has no scheme of its own, or names the
//   same special scheme as the base; parts of it are copied into the result
//   with copyURLPartsUntil().
// - `encoding`: only affects the query. For anything other than UTF-8 the raw
//   query is collected in a StringBuilder and converted by encodeQuery().
//
// The output text is accumulated in m_asciiBuffer (plus m_unicodeFragmentBuffer
// for fragment text from the first non-ASCII code point on), while the
// component boundaries are recorded as offsets in m_url.
916 template<bool serialized, typename CharacterType>
917 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
918 {
919     LOG(URLParser, "Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
920     m_url = { };
921     m_asciiBuffer.clear();
922     m_unicodeFragmentBuffer.clear();
923     m_asciiBuffer.reserveCapacity(length);
924     
925     bool isUTF8Encoding = encoding == UTF8Encoding();
926     StringBuilder queryBuffer;
927
    // Trim trailing C0 control/space characters, then skip the leading ones.
    // beginAfterControlAndSpace remembers the first significant code point so
    // that a failed scheme attempt can restart from there.
928     unsigned endIndex = length;
929     while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
930         endIndex--;
931     CodePointIterator<CharacterType> c(input, input + endIndex);
932     CodePointIterator<CharacterType> authorityOrHostBegin;
933     while (!c.atEnd() && isC0ControlOrSpace(*c))
934         ++c;
935     auto beginAfterControlAndSpace = c;
936
937     enum class State : uint8_t {
938         SchemeStart,
939         Scheme,
940         NoScheme,
941         SpecialRelativeOrAuthority,
942         PathOrAuthority,
943         Relative,
944         RelativeSlash,
945         SpecialAuthoritySlashes,
946         SpecialAuthorityIgnoreSlashes,
947         AuthorityOrHost,
948         Host,
949         File,
950         FileSlash,
951         FileHost,
952         PathStart,
953         Path,
954         CannotBeABaseURLPath,
955         Query,
956         Fragment,
957     };
958
959 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, asciiBuffer size %zu", x, *c, m_asciiBuffer.size())
960 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
961
    // Main loop: each iteration looks at the current code point and either
    // consumes it (++c) or only switches state so that the next iteration
    // re-examines the same code point in the new state.
962     State state = State::SchemeStart;
963     while (!c.atEnd()) {
        // Tabs and newlines are dropped unless the input is already serialized.
964         if (!serialized && isTabOrNewline(*c)) {
965             ++c;
966             continue;
967         }
968
969         switch (state) {
970         case State::SchemeStart:
971             LOG_STATE("SchemeStart");
972             if (isASCIIAlpha(*c)) {
973                 m_asciiBuffer.uncheckedAppend(toASCIILower(*c));
974                 ++c;
975                 state = State::Scheme;
976             } else
977                 state = State::NoScheme;
978             break;
979         case State::Scheme:
980             LOG_STATE("Scheme");
981             if (isASCIIAlphanumeric(*c) || *c == '+' || *c == '-' || *c == '.')
982                 m_asciiBuffer.append(toASCIILower(*c));
983             else if (*c == ':') {
984                 m_url.m_schemeEnd = m_asciiBuffer.size();
985                 StringView urlScheme = StringView(m_asciiBuffer.data(), m_url.m_schemeEnd);
986                 m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
987                 if (urlScheme == "file") {
988                     m_urlIsSpecial = true;
989                     state = State::File;
990                     m_asciiBuffer.append(':');
991                     ++c;
992                     break;
993                 }
994                 m_asciiBuffer.append(':');
995                 if (isSpecialScheme(urlScheme)) {
996                     m_urlIsSpecial = true;
997                     if (base.protocolIs(m_asciiBuffer.data(), m_asciiBuffer.size() - 1))
998                         state = State::SpecialRelativeOrAuthority;
999                     else
1000                         state = State::SpecialAuthoritySlashes;
1001                 } else {
                    // Non-special scheme: peek at the code point after ':' to
                    // decide between "scheme:/..." and a cannot-be-a-base URL.
1002                     auto maybeSlash = c;
1003                     incrementIteratorSkippingTabAndNewLine<serialized>(maybeSlash);
1004                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1005                         m_asciiBuffer.append('/');
1006                         m_url.m_userStart = m_asciiBuffer.size();
1007                         state = State::PathOrAuthority;
1008                         c = maybeSlash;
1009                         ASSERT(*c == '/');
1010                     } else {
1011                         m_url.m_userStart = m_asciiBuffer.size();
1012                         m_url.m_userEnd = m_url.m_userStart;
1013                         m_url.m_passwordEnd = m_url.m_userStart;
1014                         m_url.m_hostEnd = m_url.m_userStart;
1015                         m_url.m_portEnd = m_url.m_userStart;
1016                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1017                         m_url.m_cannotBeABaseURL = true;
1018                         state = State::CannotBeABaseURLPath;
1019                     }
1020                 }
1021                 ++c;
1022                 break;
1023             } else {
                // Not a scheme character: discard what was buffered and re-read
                // the input from its first significant code point as NoScheme.
1024                 m_asciiBuffer.clear();
1025                 state = State::NoScheme;
1026                 c = beginAfterControlAndSpace;
1027                 break;
1028             }
            // Only reached after appending a scheme character. If the input
            // ends before a ':' is seen there is no scheme, so restart as
            // NoScheme from the beginning.
1029             incrementIteratorSkippingTabAndNewLine<serialized>(c);
1030             if (c.atEnd()) {
1031                 m_asciiBuffer.clear();
1032                 state = State::NoScheme;
1033                 c = beginAfterControlAndSpace;
1034             }
1035             break;
1036         case State::NoScheme:
1037             LOG_STATE("NoScheme");
            // Fail when there is no base, or when the base cannot be a base
            // URL and the input is anything other than a fragment.
1038             if (base.isNull() || (base.m_cannotBeABaseURL && *c != '#'))
1039                 return failure(input, length);
1040             if (base.m_cannotBeABaseURL && *c == '#') {
1041                 copyURLPartsUntil(base, URLPart::QueryEnd);
1042                 state = State::Fragment;
1043                 m_asciiBuffer.append('#');
1044                 ++c;
1045                 break;
1046             }
1047             if (!base.protocolIs("file")) {
1048                 state = State::Relative;
1049                 break;
1050             }
1051             copyURLPartsUntil(base, URLPart::SchemeEnd);
1052             m_asciiBuffer.append(':');
1053             state = State::File;
1054             break;
1055         case State::SpecialRelativeOrAuthority:
1056             LOG_STATE("SpecialRelativeOrAuthority");
1057             if (*c == '/') {
1058                 m_asciiBuffer.append('/');
1059                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1060                 if (c.atEnd())
1061                     return failure(input, length);
1062                 if (*c == '/') {
1063                     m_asciiBuffer.append('/');
1064                     state = State::SpecialAuthorityIgnoreSlashes;
1065                     ++c;
1066                 } else
1067                     state = State::RelativeSlash;
1068             } else
1069                 state = State::Relative;
1070             break;
1071         case State::PathOrAuthority:
1072             LOG_STATE("PathOrAuthority");
1073             if (*c == '/') {
1074                 m_asciiBuffer.append('/');
1075                 m_url.m_userStart = m_asciiBuffer.size();
1076                 state = State::AuthorityOrHost;
1077                 ++c;
1078                 authorityOrHostBegin = c;
1079             } else {
                // The single '/' already in the buffer is the start of the
                // path; all authority offsets collapse onto it.
1080                 ASSERT(m_asciiBuffer.last() == '/');
1081                 m_url.m_userStart = m_asciiBuffer.size() - 1;
1082                 m_url.m_userEnd = m_url.m_userStart;
1083                 m_url.m_passwordEnd = m_url.m_userStart;
1084                 m_url.m_hostEnd = m_url.m_userStart;
1085                 m_url.m_portEnd = m_url.m_userStart;
1086                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1087                 state = State::Path;
1088             }
1089             break;
1090         case State::Relative:
1091             LOG_STATE("Relative");
            // The amount copied from the base depends on which component the
            // relative input starts with.
1092             switch (*c) {
1093             case '/':
1094             case '\\':
1095                 state = State::RelativeSlash;
1096                 ++c;
1097                 break;
1098             case '?':
1099                 copyURLPartsUntil(base, URLPart::PathEnd);
1100                 m_asciiBuffer.append('?');
1101                 state = State::Query;
1102                 ++c;
1103                 break;
1104             case '#':
1105                 copyURLPartsUntil(base, URLPart::QueryEnd);
1106                 m_asciiBuffer.append('#');
1107                 state = State::Fragment;
1108                 ++c;
1109                 break;
1110             default:
1111                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1112                 state = State::Path;
1113                 break;
1114             }
1115             break;
1116         case State::RelativeSlash:
1117             LOG_STATE("RelativeSlash");
1118             if (*c == '/' || *c == '\\') {
1119                 ++c;
1120                 copyURLPartsUntil(base, URLPart::SchemeEnd);
1121                 m_asciiBuffer.append("://", 3);
1122                 state = State::SpecialAuthorityIgnoreSlashes;
1123             } else {
1124                 copyURLPartsUntil(base, URLPart::PortEnd);
1125                 m_asciiBuffer.append('/');
1126                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1127                 state = State::Path;
1128             }
1129             break;
1130         case State::SpecialAuthoritySlashes:
1131             LOG_STATE("SpecialAuthoritySlashes");
            // "//" is always written; up to two slashes or backslashes from
            // the input are consumed in exchange.
1132             m_asciiBuffer.append("//", 2);
1133             if (*c == '/' || *c == '\\') {
1134                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1135                 if (!c.atEnd() && (*c == '/' || *c == '\\'))
1136                     ++c;
1137             }
1138             state = State::SpecialAuthorityIgnoreSlashes;
1139             break;
1140         case State::SpecialAuthorityIgnoreSlashes:
1141             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1142             if (*c == '/' || *c == '\\') {
1143                 m_asciiBuffer.append('/');
1144                 ++c;
1145             }
1146             m_url.m_userStart = m_asciiBuffer.size();
1147             state = State::AuthorityOrHost;
1148             authorityOrHostBegin = c;
1149             break;
1150         case State::AuthorityOrHost:
1151             LOG_STATE("AuthorityOrHost");
1152             {
1153                 if (*c == '@') {
                    // Scan to the end of the input for the last '@'. Everything
                    // from authorityOrHostBegin up to it is handed to
                    // parseAuthority(); host parsing resumes right after it.
1154                     auto lastAt = c;
1155                     auto findLastAt = c;
1156                     while (!findLastAt.atEnd()) {
1157                         if (*findLastAt == '@')
1158                             lastAt = findLastAt;
1159                         ++findLastAt;
1160                     }
1161                     parseAuthority<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1162                     c = lastAt;
1163                     incrementIteratorSkippingTabAndNewLine<serialized>(c);
1164                     authorityOrHostBegin = c;
1165                     state = State::Host;
1166                     m_hostHasPercentOrNonASCII = false;
1167                     break;
1168                 }
1169                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1170                 if (isSlash || *c == '?' || *c == '#') {
                    // No '@' was seen before the authority ended, so there is
                    // no userinfo and the collected range is host[:port].
1171                     m_url.m_userEnd = m_asciiBuffer.size();
1172                     m_url.m_passwordEnd = m_url.m_userEnd;
1173                     if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1174                         return failure(input, length);
1175                     if (!isSlash) {
1176                         m_asciiBuffer.append('/');
1177                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1178                     }
1179                     state = State::Path;
1180                     break;
1181                 }
1182                 if (isPercentOrNonASCII(*c))
1183                     m_hostHasPercentOrNonASCII = true;
1184                 ++c;
1185             }
1186             break;
1187         case State::Host:
1188             LOG_STATE("Host");
1189             if (*c == '/' || *c == '?' || *c == '#') {
1190                 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1191                     return failure(input, length);
1192                 state = State::Path;
1193                 break;
1194             }
1195             if (isPercentOrNonASCII(*c))
1196                 m_hostHasPercentOrNonASCII = true;
1197             ++c;
1198             break;
1199         case State::File:
1200             LOG_STATE("File");
1201             switch (*c) {
1202             case '/':
1203             case '\\':
1204                 m_asciiBuffer.append('/');
1205                 state = State::FileSlash;
1206                 ++c;
1207                 break;
1208             case '?':
1209                 if (!base.isNull() && base.protocolIs("file"))
1210                     copyURLPartsUntil(base, URLPart::PathEnd);
1211                 m_asciiBuffer.append("///?", 4);
1212                 m_url.m_userStart = m_asciiBuffer.size() - 2;
1213                 m_url.m_userEnd = m_url.m_userStart;
1214                 m_url.m_passwordEnd = m_url.m_userStart;
1215                 m_url.m_hostEnd = m_url.m_userStart;
1216                 m_url.m_portEnd = m_url.m_userStart;
1217                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1218                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1219                 state = State::Query;
1220                 ++c;
1221                 break;
1222             case '#':
1223                 if (!base.isNull() && base.protocolIs("file"))
1224                     copyURLPartsUntil(base, URLPart::QueryEnd);
1225                 m_asciiBuffer.append("///#", 4);
1226                 m_url.m_userStart = m_asciiBuffer.size() - 2;
1227                 m_url.m_userEnd = m_url.m_userStart;
1228                 m_url.m_passwordEnd = m_url.m_userStart;
1229                 m_url.m_hostEnd = m_url.m_userStart;
1230                 m_url.m_portEnd = m_url.m_userStart;
1231                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1232                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1233                 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1234                 state = State::Fragment;
1235                 ++c;
1236                 break;
1237             default:
1238                 if (!base.isNull() && base.protocolIs("file") && shouldCopyFileURL<serialized>(c))
1239                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1240                 else {
1241                     m_asciiBuffer.append("///", 3);
1242                     m_url.m_userStart = m_asciiBuffer.size() - 1;
1243                     m_url.m_userEnd = m_url.m_userStart;
1244                     m_url.m_passwordEnd = m_url.m_userStart;
1245                     m_url.m_hostEnd = m_url.m_userStart;
1246                     m_url.m_portEnd = m_url.m_userStart;
1247                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1248                     checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1249                 }
1250                 state = State::Path;
1251                 break;
1252             }
1253             break;
1254         case State::FileSlash:
1255             LOG_STATE("FileSlash");
1256             if (*c == '/' || *c == '\\') {
1257                 ++c;
1258                 m_asciiBuffer.append('/');
1259                 m_url.m_userStart = m_asciiBuffer.size();
1260                 m_url.m_userEnd = m_url.m_userStart;
1261                 m_url.m_passwordEnd = m_url.m_userStart;
1262                 m_url.m_hostEnd = m_url.m_userStart;
1263                 m_url.m_portEnd = m_url.m_userStart;
1264                 authorityOrHostBegin = c;
1265                 state = State::FileHost;
1266                 break;
1267             }
            // When the file base's path passes isWindowsDriveLetter(), its
            // first two characters are appended before the path.
1268             if (!base.isNull() && base.protocolIs("file")) {
1269                 // FIXME: This String copy is unnecessary.
1270                 String basePath = base.path();
1271                 if (basePath.length() >= 2) {
1272                     bool windowsQuirk = basePath.is8Bit()
1273                         ? isWindowsDriveLetter<serialized>(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1274                         : isWindowsDriveLetter<serialized>(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1275                     if (windowsQuirk) {
1276                         m_asciiBuffer.append(basePath[0]);
1277                         m_asciiBuffer.append(basePath[1]);
1278                     }
1279                 }
1280             }
1281             m_asciiBuffer.append("//", 2);
1282             m_url.m_userStart = m_asciiBuffer.size() - 1;
1283             m_url.m_userEnd = m_url.m_userStart;
1284             m_url.m_passwordEnd = m_url.m_userStart;
1285             m_url.m_hostEnd = m_url.m_userStart;
1286             m_url.m_portEnd = m_url.m_userStart;
1287             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1288             checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1289             state = State::Path;
1290             break;
1291         case State::FileHost:
1292             LOG_STATE("FileHost");
1293             if (isSlashQuestionOrHash(*c)) {
1294                 if (isWindowsDriveLetter(m_asciiBuffer, m_url.m_portEnd + 1)) {
1295                     state = State::Path;
1296                     break;
1297                 }
                // Empty host: nothing was consumed since the "//".
1298                 if (authorityOrHostBegin == c) {
1299                     ASSERT(m_asciiBuffer[m_asciiBuffer.size() - 1] == '/');
1300                     if (*c == '?') {
1301                         m_asciiBuffer.append("/?", 2);
1302                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1303                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1304                         state = State::Query;
1305                         ++c;
1306                         break;
1307                     }
1308                     if (*c == '#') {
1309                         m_asciiBuffer.append("/#", 2);
1310                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1311                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1312                         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1313                         state = State::Fragment;
1314                         ++c;
1315                         break;
1316                     }
1317                     state = State::Path;
1318                     break;
1319                 }
1320                 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1321                     return failure(input, length);
1322                 
                // If the text appended after m_passwordEnd is exactly
                // "localhost", remove it again so the file URL ends up with an
                // empty host.
1323                 if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost")  {
1324                     m_asciiBuffer.shrink(m_url.m_passwordEnd);
1325                     m_url.m_hostEnd = m_asciiBuffer.size();
1326                     m_url.m_portEnd = m_url.m_hostEnd;
1327                 }
1328                 
1329                 state = State::PathStart;
1330                 break;
1331             }
1332             if (isPercentOrNonASCII(*c))
1333                 m_hostHasPercentOrNonASCII = true;
1334             ++c;
1335             break;
1336         case State::PathStart:
1337             LOG_STATE("PathStart");
1338             if (*c != '/' && *c != '\\')
1339                 ++c;
1340             state = State::Path;
1341             break;
1342         case State::Path:
1343             LOG_STATE("Path");
1344             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1345                 m_asciiBuffer.append('/');
1346                 m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1347                 ++c;
1348                 break;
1349             }
            // Dot segments are only looked for at the start of a segment,
            // i.e. when the last character written is '/'.
1350             if (m_asciiBuffer.size() && m_asciiBuffer[m_asciiBuffer.size() - 1] == '/') {
1351                 if (isDoubleDotPathSegment(c)) {
1352                     consumeDoubleDotPathSegment(c);
1353                     popPath();
1354                     break;
1355                 }
1356                 if (m_asciiBuffer[m_asciiBuffer.size() - 1] == '/' && isSingleDotPathSegment(c)) {
1357                     consumeSingleDotPathSegment(c);
1358                     break;
1359                 }
1360             }
            // '?' and '#' end the path but are not consumed here; the Query
            // or Fragment state sees the same code point on the next iteration.
1361             if (*c == '?') {
1362                 m_url.m_pathEnd = m_asciiBuffer.size();
1363                 state = State::Query;
1364                 break;
1365             }
1366             if (*c == '#') {
1367                 m_url.m_pathEnd = m_asciiBuffer.size();
1368                 m_url.m_queryEnd = m_url.m_pathEnd;
1369                 state = State::Fragment;
1370                 break;
1371             }
            // A percent-encoded dot is written as a literal '.', and all
            // three input code points are consumed.
1372             if (isPercentEncodedDot(c)) {
1373                 m_asciiBuffer.append('.');
1374                 ASSERT(*c == '%');
1375                 ++c;
1376                 ASSERT(*c == dotASCIICode[0]);
1377                 ++c;
1378                 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1379                 ++c;
1380                 break;
1381             }
1382             utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInDefaultEncodeSet);
1383             ++c;
1384             break;
1385         case State::CannotBeABaseURLPath:
1386             LOG_STATE("CannotBeABaseURLPath");
1387             if (*c == '?') {
1388                 m_url.m_pathEnd = m_asciiBuffer.size();
1389                 state = State::Query;
1390             } else if (*c == '#') {
1391                 m_url.m_pathEnd = m_asciiBuffer.size();
1392                 m_url.m_queryEnd = m_url.m_pathEnd;
1393                 state = State::Fragment;
1394             } else {
1395                 utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInSimpleEncodeSet);
1396                 ++c;
1397             }
1398             break;
1399         case State::Query:
1400             LOG_STATE("Query");
            // With UTF-8 each code point is encoded straight into the output.
            // Otherwise the raw query is collected in queryBuffer and
            // converted by encodeQuery() once the query ends.
1401             if (*c == '#') {
1402                 if (!isUTF8Encoding)
1403                     encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1404                 m_url.m_queryEnd = m_asciiBuffer.size();
1405                 state = State::Fragment;
1406                 break;
1407             }
1408             if (isUTF8Encoding)
1409                 utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
1410             else
1411                 queryBuffer.append(*c);
1412             ++c;
1413             break;
1414         case State::Fragment:
1415             LOG_STATE("Fragment");
            // ASCII code points go to m_asciiBuffer until the first non-ASCII
            // one; from then on everything goes to m_unicodeFragmentBuffer.
1416             if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
1417                 m_asciiBuffer.append(*c);
1418             else
1419                 m_unicodeFragmentBuffer.append(*c);
1420             ++c;
1421             break;
1422         }
1423     }
1424
    // The input is exhausted. Fill in the offsets that were not reached, and
    // any missing text, according to the state the machine stopped in.
1425     switch (state) {
1426     case State::SchemeStart:
1427         LOG_FINAL_STATE("SchemeStart");
        // Nothing but ignorable characters: the result is the base itself
        // when there is one.
1428         if (!m_asciiBuffer.size() && !base.isNull())
1429             return base;
1430         return failure(input, length);
1431     case State::Scheme:
1432         LOG_FINAL_STATE("Scheme");
1433         return failure(input, length);
1434     case State::NoScheme:
1435         LOG_FINAL_STATE("NoScheme");
1436         RELEASE_ASSERT_NOT_REACHED();
1437     case State::SpecialRelativeOrAuthority:
1438         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1439         copyURLPartsUntil(base, URLPart::QueryEnd);
1440         m_url.m_fragmentEnd = m_url.m_queryEnd;
1441         break;
1442     case State::PathOrAuthority:
1443         LOG_FINAL_STATE("PathOrAuthority");
1444         m_url.m_userEnd = m_asciiBuffer.size();
1445         m_url.m_passwordEnd = m_url.m_userEnd;
1446         m_url.m_hostEnd = m_url.m_userEnd;
1447         m_url.m_portEnd = m_url.m_userEnd;
1448         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1449         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1450         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1451         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1452         break;
1453     case State::Relative:
1454         LOG_FINAL_STATE("Relative");
1455         copyURLPartsUntil(base, URLPart::FragmentEnd);
1456         break;
1457     case State::RelativeSlash:
1458         LOG_FINAL_STATE("RelativeSlash");
1459         copyURLPartsUntil(base, URLPart::PortEnd);
1460         m_asciiBuffer.append('/');
1461         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1462         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1463         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1464         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1465         break;
1466     case State::SpecialAuthoritySlashes:
1467         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1468         m_url.m_userStart = m_asciiBuffer.size();
1469         m_url.m_userEnd = m_url.m_userStart;
1470         m_url.m_passwordEnd = m_url.m_userStart;
1471         m_url.m_hostEnd = m_url.m_userStart;
1472         m_url.m_portEnd = m_url.m_userStart;
1473         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1474         m_url.m_pathEnd = m_url.m_userStart;
1475         m_url.m_queryEnd = m_url.m_userStart;
1476         m_url.m_fragmentEnd = m_url.m_userStart;
1477         break;
1478     case State::SpecialAuthorityIgnoreSlashes:
1479         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1480         return failure(input, length);
1481         break;
1482     case State::AuthorityOrHost:
1483         LOG_FINAL_STATE("AuthorityOrHost");
1484         m_url.m_userEnd = m_asciiBuffer.size();
1485         m_url.m_passwordEnd = m_url.m_userEnd;
1486         if (authorityOrHostBegin.atEnd()) {
1487             m_url.m_hostEnd = m_url.m_userEnd;
1488             m_url.m_portEnd = m_url.m_userEnd;
1489         } else if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1490             return failure(input, length);
        // No path was given; a single '/' is written as the path.
1491         m_asciiBuffer.append('/');
1492         m_url.m_pathEnd = m_url.m_portEnd + 1;
1493         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1494         m_url.m_queryEnd = m_url.m_pathEnd;
1495         m_url.m_fragmentEnd = m_url.m_pathEnd;
1496         break;
1497     case State::Host:
1498         LOG_FINAL_STATE("Host");
1499         if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1500             return failure(input, length);
1501         m_asciiBuffer.append('/');
1502         m_url.m_pathEnd = m_url.m_portEnd + 1;
1503         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1504         m_url.m_queryEnd = m_url.m_pathEnd;
1505         m_url.m_fragmentEnd = m_url.m_pathEnd;
1506         break;
1507     case State::File:
1508         LOG_FINAL_STATE("File");
        // NOTE(review): with a file base this copies the base through its
        // query and then still appends ':' and "///"; that looks suspicious.
        // Confirm against copyURLPartsUntil() and the expected serialization.
1509         if (!base.isNull() && base.protocolIs("file")) {
1510             copyURLPartsUntil(base, URLPart::QueryEnd);
1511             m_asciiBuffer.append(':');
1512         }
1513         m_asciiBuffer.append("///", 3);
1514         m_url.m_userStart = m_asciiBuffer.size() - 1;
1515         m_url.m_userEnd = m_url.m_userStart;
1516         m_url.m_passwordEnd = m_url.m_userStart;
1517         m_url.m_hostEnd = m_url.m_userStart;
1518         m_url.m_portEnd = m_url.m_userStart;
1519         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1520         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1521         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1522         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1523         break;
1524     case State::FileSlash:
1525         LOG_FINAL_STATE("FileSlash");
1526         m_asciiBuffer.append("//", 2);
1527         m_url.m_userStart = m_asciiBuffer.size() - 1;
1528         m_url.m_userEnd = m_url.m_userStart;
1529         m_url.m_passwordEnd = m_url.m_userStart;
1530         m_url.m_hostEnd = m_url.m_userStart;
1531         m_url.m_portEnd = m_url.m_userStart;
1532         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1533         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1534         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1535         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1536         break;
1537     case State::FileHost:
1538         LOG_FINAL_STATE("FileHost");
        // Empty host: only the path '/' is added.
1539         if (authorityOrHostBegin == c) {
1540             m_asciiBuffer.append('/');
1541             m_url.m_userStart = m_asciiBuffer.size() - 1;
1542             m_url.m_userEnd = m_url.m_userStart;
1543             m_url.m_passwordEnd = m_url.m_userStart;
1544             m_url.m_hostEnd = m_url.m_userStart;
1545             m_url.m_portEnd = m_url.m_userStart;
1546             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1547             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1548             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1549             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1550             break;
1551         }
1552
1553         if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1554             return failure(input, length);
1555
        // Same "localhost" removal as in the in-loop FileHost state.
1556         if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost")  {
1557             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1558             m_url.m_hostEnd = m_asciiBuffer.size();
1559             m_url.m_portEnd = m_url.m_hostEnd;
1560         }
1561         m_asciiBuffer.append('/');
1562         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1563         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1564         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1565         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1566         break;
1567     case State::PathStart:
1568         LOG_FINAL_STATE("PathStart");
1569         RELEASE_ASSERT_NOT_REACHED();
1570     case State::Path:
1571         LOG_FINAL_STATE("Path");
1572         m_url.m_pathEnd = m_asciiBuffer.size();
1573         m_url.m_queryEnd = m_url.m_pathEnd;
1574         m_url.m_fragmentEnd = m_url.m_pathEnd;
1575         break;
1576     case State::CannotBeABaseURLPath:
1577         LOG_FINAL_STATE("CannotBeABaseURLPath");
1578         m_url.m_pathEnd = m_asciiBuffer.size();
1579         m_url.m_queryEnd = m_url.m_pathEnd;
1580         m_url.m_fragmentEnd = m_url.m_pathEnd;
1581         break;
1582     case State::Query:
1583         LOG_FINAL_STATE("Query");
1584         if (!isUTF8Encoding)
1585             encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1586         m_url.m_queryEnd = m_asciiBuffer.size();
1587         m_url.m_fragmentEnd = m_url.m_queryEnd;
1588         break;
1589     case State::Fragment:
1590         LOG_FINAL_STATE("Fragment");
1591         m_url.m_fragmentEnd = m_asciiBuffer.size() + m_unicodeFragmentBuffer.size();
1592         break;
1593     }
1594
    // Build the final string. If no fragment text went to
    // m_unicodeFragmentBuffer the ASCII buffer is adopted directly; otherwise
    // the two buffers are concatenated.
1595     if (m_unicodeFragmentBuffer.isEmpty()) {
1596         // FIXME: String::adopt should require a WTFMove.
1597         m_url.m_string = String::adopt(m_asciiBuffer);
1598     } else {
1599         StringBuilder builder;
1600         builder.reserveCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
1601         builder.append(m_asciiBuffer.data(), m_asciiBuffer.size());
1602         for (size_t i = 0; i < m_unicodeFragmentBuffer.size(); ++i)
1603             builder.append(m_unicodeFragmentBuffer[i]);
1604         m_url.m_string = builder.toString();
1605     }
1606     m_url.m_isValid = true;
1607     LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
1608     ASSERT(internalValuesConsistent(m_url));
1609     return m_url;
1610 }
1611
1612 template<bool serialized, typename CharacterType>
1613 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1614 {
1615     if (iterator.atEnd()) {
1616         m_url.m_userEnd = m_asciiBuffer.size();
1617         m_url.m_passwordEnd = m_url.m_userEnd;
1618         return;
1619     }
1620     for (; !iterator.atEnd(); ++iterator) {
1621         if (*iterator == ':') {
1622             ++iterator;
1623             m_url.m_userEnd = m_asciiBuffer.size();
1624             if (iterator.atEnd()) {
1625                 m_url.m_passwordEnd = m_url.m_userEnd;
1626                 if (m_url.m_userEnd > m_url.m_userStart)
1627                     m_asciiBuffer.append('@');
1628                 return;
1629             }
1630             m_asciiBuffer.append(':');
1631             break;
1632         }
1633         utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1634     }
1635     for (; !iterator.atEnd(); ++iterator)
1636         utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1637     m_url.m_passwordEnd = m_asciiBuffer.size();
1638     if (!m_url.m_userEnd)
1639         m_url.m_userEnd = m_url.m_passwordEnd;
1640     m_asciiBuffer.append('@');
1641 }
1642
1643 template<typename UnsignedIntegerType>
1644 void append(Vector<LChar>& destination, UnsignedIntegerType number)
1645 {
1646     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1647     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1648     LChar* p = end;
1649     do {
1650         *--p = (number % 10) + '0';
1651         number /= 10;
1652     } while (number);
1653     destination.append(p, end - p);
1654 }
1655
1656 inline static void serializeIPv4(uint32_t address, Vector<LChar>& buffer)
1657 {
1658     append<uint8_t>(buffer, address >> 24);
1659     buffer.append('.');
1660     append<uint8_t>(buffer, address >> 16);
1661     buffer.append('.');
1662     append<uint8_t>(buffer, address >> 8);
1663     buffer.append('.');
1664     append<uint8_t>(buffer, address);
1665 }
1666     
// Returns how many consecutive zero pieces address holds starting at index begin.
inline static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
1676
1677 inline static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
1678 {
1679     Optional<size_t> longest;
1680     size_t longestLength = 0;
1681     for (size_t i = 0; i < 8; i++) {
1682         size_t length = zeroSequenceLength(address, i);
1683         if (length) {
1684             if (length > 1 && (!longest || longestLength < length)) {
1685                 longest = i;
1686                 longestLength = length;
1687             }
1688             i += length;
1689         }
1690     }
1691     return longest;
1692 }
1693     
1694 inline static void serializeIPv6Piece(uint16_t piece, Vector<LChar>& buffer)
1695 {
1696     bool printed = false;
1697     if (auto nibble0 = piece >> 12) {
1698         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
1699         printed = true;
1700     }
1701     auto nibble1 = piece >> 8 & 0xF;
1702     if (printed || nibble1) {
1703         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
1704         printed = true;
1705     }
1706     auto nibble2 = piece >> 4 & 0xF;
1707     if (printed || nibble2)
1708         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
1709     buffer.append(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
1710 }
1711
1712 inline static void serializeIPv6(std::array<uint16_t, 8> address, Vector<LChar>& buffer)
1713 {
1714     buffer.append('[');
1715     auto compressPointer = findLongestZeroSequence(address);
1716     for (size_t piece = 0; piece < 8; piece++) {
1717         if (compressPointer && compressPointer.value() == piece) {
1718             ASSERT(!address[piece]);
1719             if (piece)
1720                 buffer.append(':');
1721             else
1722                 buffer.append("::", 2);
1723             while (piece < 8 && !address[piece])
1724                 piece++;
1725             if (piece == 8)
1726                 break;
1727         }
1728         serializeIPv6Piece(address[piece], buffer);
1729         if (piece < 7)
1730             buffer.append(':');
1731     }
1732     buffer.append(']');
1733 }
1734
1735 template<typename CharacterType>
1736 inline static Optional<uint32_t> parseIPv4Number(CodePointIterator<CharacterType>& iterator)
1737 {
1738     // FIXME: Check for overflow.
1739     enum class State : uint8_t {
1740         UnknownBase,
1741         Decimal,
1742         OctalOrHex,
1743         Octal,
1744         Hex,
1745     };
1746     State state = State::UnknownBase;
1747     uint32_t value = 0;
1748     while (!iterator.atEnd()) {
1749         if (*iterator == '.') {
1750             ++iterator;
1751             return value;
1752         }
1753         switch (state) {
1754         case State::UnknownBase:
1755             if (*iterator == '0') {
1756                 ++iterator;
1757                 state = State::OctalOrHex;
1758                 break;
1759             }
1760             state = State::Decimal;
1761             break;
1762         case State::OctalOrHex:
1763             if (*iterator == 'x' || *iterator == 'X') {
1764                 ++iterator;
1765                 state = State::Hex;
1766                 break;
1767             }
1768             state = State::Octal;
1769             break;
1770         case State::Decimal:
1771             if (*iterator < '0' || *iterator > '9')
1772                 return Nullopt;
1773             value *= 10;
1774             value += *iterator - '0';
1775             ++iterator;
1776             break;
1777         case State::Octal:
1778             if (*iterator < '0' || *iterator > '7')
1779                 return Nullopt;
1780             value *= 8;
1781             value += *iterator - '0';
1782             ++iterator;
1783             break;
1784         case State::Hex:
1785             if (!isASCIIHexDigit(*iterator))
1786                 return Nullopt;
1787             value *= 16;
1788             value += toASCIIHexValue(*iterator);
1789             ++iterator;
1790             break;
1791         }
1792     }
1793     return value;
1794 }
1795
1796 inline static uint64_t pow256(size_t exponent)
1797 {
1798     RELEASE_ASSERT(exponent <= 4);
1799     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
1800     return values[exponent];
1801 }
1802
1803 template<typename CharacterType>
1804 inline static Optional<uint32_t> parseIPv4Host(CodePointIterator<CharacterType> iterator)
1805 {
1806     Vector<uint32_t, 4> items;
1807     items.reserveInitialCapacity(4);
1808     while (!iterator.atEnd()) {
1809         if (items.size() >= 4)
1810             return Nullopt;
1811         if (auto item = parseIPv4Number(iterator))
1812             items.append(item.value());
1813         else
1814             return Nullopt;
1815     }
1816     if (!items.size() || items.size() > 4)
1817         return Nullopt;
1818     if (items.size() > 2) {
1819         for (size_t i = 0; i < items.size() - 2; i++) {
1820             if (items[i] > 255)
1821                 return Nullopt;
1822         }
1823     }
1824     if (items[items.size() - 1] >= pow256(5 - items.size()))
1825         return Nullopt;
1826     for (auto item : items) {
1827         if (item > 255)
1828             return Nullopt;
1829     }
1830     uint32_t ipv4 = items.takeLast();
1831     for (size_t counter = 0; counter < items.size(); ++counter)
1832         ipv4 += items[counter] * pow256(3 - counter);
1833     return ipv4;
1834 }
1835     
// Parses the text spanned by c as an IPv6 address and returns its eight
// 16-bit pieces, or Nullopt when the text is rejected. The caller in this file
// passes the characters between '[' and ']'.
template<typename CharacterType>
inline static Optional<std::array<uint16_t, 8>> parseIPv6Host(CodePointIterator<CharacterType> c)
{
    if (c.atEnd())
        return Nullopt;

    std::array<uint16_t, 8> address = {{0, 0, 0, 0, 0, 0, 0, 0}};
    // Index of the next piece of address to fill.
    size_t piecePointer = 0;
    // Piece index at which a "::" was seen, if any.
    Optional<size_t> compressPointer;

    // A leading ':' is only accepted as the first half of "::".
    if (*c == ':') {
        ++c;
        if (c.atEnd())
            return Nullopt;
        if (*c != ':')
            return Nullopt;
        ++c;
        ++piecePointer;
        compressPointer = piecePointer;
    }
    
    while (!c.atEnd()) {
        if (piecePointer == 8)
            return Nullopt;
        if (*c == ':') {
            // Second ':' of a "::"; only one compression is allowed.
            if (compressPointer)
                return Nullopt;
            ++c;
            ++piecePointer;
            compressPointer = piecePointer;
            continue;
        }
        // Accumulate up to four hexadecimal digits into one piece.
        uint16_t value = 0;
        for (size_t length = 0; length < 4; length++) {
            if (c.atEnd())
                break;
            if (!isASCIIHexDigit(*c))
                break;
            value = value * 0x10 + toASCIIHexValue(*c);
            ++c;
        }
        address[piecePointer++] = value;
        if (c.atEnd())
            break;
        // Anything other than ':' after a piece (including '.') is rejected here.
        if (*c != ':')
            return Nullopt;
        ++c;
    }
    
    // NOTE(review): the loop above is only left with c.atEnd() true (its
    // condition, or the break after a piece), so this embedded-IPv4 branch
    // looks unreachable as written - confirm whether a jump on '.' is missing.
    if (!c.atEnd()) {
        if (piecePointer > 6)
            return Nullopt;
        size_t dotsSeen = 0;
        while (!c.atEnd()) {
            Optional<uint16_t> value;
            if (!isASCIIDigit(*c))
                return Nullopt;
            while (isASCIIDigit(*c)) {
                auto number = *c - '0';
                if (!value)
                    value = number;
                else if (!value.value())
                    // A further digit after a leading zero is rejected.
                    return Nullopt;
                else
                    value = value.value() * 10 + number;
                ++c;
                if (c.atEnd())
                    return Nullopt;
                if (value.value() > 255)
                    return Nullopt;
            }
            if (dotsSeen < 3 && *c != '.')
                return Nullopt;
            // Two decimal octets are packed into each 16-bit piece.
            address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
            if (dotsSeen == 1 || dotsSeen == 3)
                piecePointer++;
            if (!c.atEnd())
                ++c;
            if (dotsSeen == 3 && !c.atEnd())
                return Nullopt;
            dotsSeen++;
        }
    }
    if (compressPointer) {
        // Move the pieces parsed after the "::" to the end of the address,
        // leaving the zeros of the compressed run in between.
        size_t swaps = piecePointer - compressPointer.value();
        piecePointer = 7;
        while (swaps)
            std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
    } else if (piecePointer != 8)
        // Without "::" all eight pieces must have been given.
        return Nullopt;
    return address;
}
1928
1929 // FIXME: This should return a CString.
1930 inline static String percentDecode(const LChar* input, size_t length)
1931 {
1932     StringBuilder output;
1933     
1934     for (size_t i = 0; i < length; ++i) {
1935         uint8_t byte = input[i];
1936         if (byte != '%')
1937             output.append(byte);
1938         else if (i < length - 2) {
1939             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
1940                 output.append(toASCIIHexValue(input[i + 1], input[i + 2]));
1941                 i += 2;
1942             } else
1943                 output.append(byte);
1944         } else
1945             output.append(byte);
1946     }
1947     return output.toStringPreserveCapacity();
1948 }
1949
1950 inline static bool containsOnlyASCII(const String& string)
1951 {
1952     if (string.is8Bit())
1953         return charactersAreAllASCII(string.characters8(), string.length());
1954     return charactersAreAllASCII(string.characters16(), string.length());
1955 }
1956
1957 inline static Optional<String> domainToASCII(const String& domain)
1958 {
1959     const unsigned hostnameBufferLength = 2048;
1960
1961     if (containsOnlyASCII(domain)) {
1962         if (domain.is8Bit())
1963             return domain.convertToASCIILowercase();
1964         Vector<LChar, hostnameBufferLength> buffer;
1965         size_t length = domain.length();
1966         buffer.reserveInitialCapacity(length);
1967         for (size_t i = 0; i < length; ++i)
1968             buffer.append(toASCIILower(domain[i]));
1969         return String(buffer.data(), length);
1970     }
1971     
1972     UChar hostnameBuffer[hostnameBufferLength];
1973     UErrorCode error = U_ZERO_ERROR;
1974
1975 #if COMPILER(GCC) || COMPILER(CLANG)
1976 #pragma GCC diagnostic push
1977 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1978 #endif
1979     // FIXME: This should use uidna_openUTS46 / uidna_close instead
1980     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
1981 #if COMPILER(GCC) || COMPILER(CLANG)
1982 #pragma GCC diagnostic pop
1983 #endif
1984
1985     if (error == U_ZERO_ERROR) {
1986         LChar buffer[hostnameBufferLength];
1987         for (int32_t i = 0; i < numCharactersConverted; ++i) {
1988             ASSERT(isASCII(hostnameBuffer[i]));
1989             buffer[i] = hostnameBuffer[i];
1990         }
1991         return String(buffer, numCharactersConverted);
1992     }
1993
1994     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
1995     return Nullopt;
1996 }
1997
1998 inline static bool hasInvalidDomainCharacter(const String& asciiDomain)
1999 {
2000     RELEASE_ASSERT(asciiDomain.is8Bit());
2001     const LChar* characters = asciiDomain.characters8();
2002     for (size_t i = 0; i < asciiDomain.length(); ++i) {
2003         if (isInvalidDomainCharacter(characters[i]))
2004             return true;
2005     }
2006     return false;
2007 }
2008
2009 template<bool serialized, typename CharacterType>
2010 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2011 {
2012     uint32_t port = 0;
2013     if (iterator.atEnd()) {
2014         m_url.m_portEnd = m_asciiBuffer.size();
2015         return true;
2016     }
2017     m_asciiBuffer.append(':');
2018     for (; !iterator.atEnd(); ++iterator) {
2019         if (!serialized && isTabOrNewline(*iterator))
2020             continue;
2021         if (isASCIIDigit(*iterator)) {
2022             port = port * 10 + *iterator - '0';
2023             if (port > std::numeric_limits<uint16_t>::max())
2024                 return false;
2025         } else
2026             return false;
2027     }
2028
2029     if (isDefaultPort(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd), port)) {
2030         ASSERT(m_asciiBuffer.last() == ':');
2031         m_asciiBuffer.shrink(m_asciiBuffer.size() - 1);
2032     } else
2033         append<uint16_t>(m_asciiBuffer, static_cast<uint16_t>(port));
2034
2035     m_url.m_portEnd = m_asciiBuffer.size();
2036     return true;
2037 }
2038
2039 template<bool serialized, typename CharacterType>
2040 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2041 {
2042     if (iterator.atEnd())
2043         return false;
2044     if (*iterator == '[') {
2045         ++iterator;
2046         auto ipv6End = iterator;
2047         while (!ipv6End.atEnd() && *ipv6End != ']')
2048             ++ipv6End;
2049         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2050             serializeIPv6(address.value(), m_asciiBuffer);
2051             m_url.m_hostEnd = m_asciiBuffer.size();
2052             if (!ipv6End.atEnd()) {
2053                 ++ipv6End;
2054                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2055                     ++ipv6End;
2056                     return parsePort<serialized>(ipv6End);
2057                 }
2058                 m_url.m_portEnd = m_asciiBuffer.size();
2059                 return true;
2060             }
2061             return true;
2062         }
2063     }
2064     
2065     if (!m_hostHasPercentOrNonASCII) {
2066         auto hostIterator = iterator;
2067         for (; !iterator.atEnd(); ++iterator) {
2068             if (!serialized && isTabOrNewline(*iterator))
2069                 continue;
2070             if (*iterator == ':')
2071                 break;
2072             if (isInvalidDomainCharacter(*iterator))
2073                 return false;
2074         }
2075         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2076             serializeIPv4(address.value(), m_asciiBuffer);
2077             m_url.m_hostEnd = m_asciiBuffer.size();
2078             if (iterator.atEnd()) {
2079                 m_url.m_portEnd = m_asciiBuffer.size();
2080                 return true;
2081             }
2082             ++iterator;
2083             return parsePort<serialized>(iterator);
2084         }
2085         for (; hostIterator != iterator; ++hostIterator) {
2086             if (serialized || !isTabOrNewline(*hostIterator))
2087                 m_asciiBuffer.append(toASCIILower(*hostIterator));
2088         }
2089         m_url.m_hostEnd = m_asciiBuffer.size();
2090         if (!hostIterator.atEnd()) {
2091             ASSERT(*hostIterator == ':');
2092             incrementIteratorSkippingTabAndNewLine<serialized>(hostIterator);
2093             return parsePort<serialized>(hostIterator);
2094         }
2095         m_url.m_portEnd = m_asciiBuffer.size();
2096         return true;
2097     }
2098
2099     // FIXME: We probably don't need to make so many buffers and String copies.
2100     StringBuilder utf8Encoded;
2101     for (; !iterator.atEnd(); ++iterator) {
2102         if (!serialized && isTabOrNewline(*iterator))
2103             continue;
2104         if (*iterator == ':')
2105             break;
2106         uint8_t buffer[U8_MAX_LENGTH];
2107         int32_t offset = 0;
2108         UBool error = false;
2109         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2110         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2111         // FIXME: Check error.
2112         utf8Encoded.append(buffer, offset);
2113     }
2114     RELEASE_ASSERT(utf8Encoded.is8Bit());
2115     String percentDecoded = percentDecode(utf8Encoded.characters8(), utf8Encoded.length());
2116     RELEASE_ASSERT(percentDecoded.is8Bit());
2117     String domain = String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
2118     auto asciiDomain = domainToASCII(domain);
2119     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2120         return false;
2121     String& asciiDomainValue = asciiDomain.value();
2122     RELEASE_ASSERT(asciiDomainValue.is8Bit());
2123     const LChar* asciiDomainCharacters = asciiDomainValue.characters8();
2124
2125     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainCharacters, asciiDomainCharacters + asciiDomainValue.length()))) {
2126         serializeIPv4(address.value(), m_asciiBuffer);
2127         m_url.m_hostEnd = m_asciiBuffer.size();
2128         if (iterator.atEnd()) {
2129             m_url.m_portEnd = m_asciiBuffer.size();
2130             return true;
2131         }
2132         ++iterator;
2133         return parsePort<serialized>(iterator);
2134     }
2135
2136     m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.length());
2137     m_url.m_hostEnd = m_asciiBuffer.size();
2138     if (!iterator.atEnd()) {
2139         ASSERT(*iterator == ':');
2140         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
2141         return parsePort<serialized>(iterator);
2142     }
2143     m_url.m_portEnd = m_asciiBuffer.size();
2144     return true;
2145 }
2146
2147 inline static Optional<String> formURLDecode(StringView input)
2148 {
2149     auto utf8 = input.utf8(StrictConversion);
2150     if (utf8.isNull())
2151         return Nullopt;
2152     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2153     RELEASE_ASSERT(percentDecoded.is8Bit());
2154     return String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
2155 }
2156
2157 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2158 {
2159     Vector<StringView> sequences = input.split('&');
2160
2161     URLEncodedForm output;
2162     for (auto& bytes : sequences) {
2163         auto valueStart = bytes.find('=');
2164         if (valueStart == notFound) {
2165             if (auto name = formURLDecode(bytes))
2166                 output.append({name.value().replace('+', 0x20), emptyString()});
2167         } else {
2168             auto name = formURLDecode(bytes.substring(0, valueStart));
2169             auto value = formURLDecode(bytes.substring(valueStart + 1));
2170             if (name && value)
2171                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2172         }
2173     }
2174     return output;
2175 }
2176
2177 inline static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2178 {
2179     auto utf8 = input.utf8(StrictConversion);
2180     const char* data = utf8.data();
2181     for (size_t i = 0; i < utf8.length(); ++i) {
2182         const char byte = data[i];
2183         if (byte == 0x20)
2184             output.append(0x2B);
2185         else if (byte == 0x2A
2186             || byte == 0x2D
2187             || byte == 0x2E
2188             || (byte >= 0x30 && byte <= 0x39)
2189             || (byte >= 0x41 && byte <= 0x5A)
2190             || byte == 0x5F
2191             || (byte >= 0x61 && byte <= 0x7A))
2192             output.append(byte);
2193         else
2194             percentEncode(byte, output);
2195     }
2196 }
2197     
2198 String URLParser::serialize(const URLEncodedForm& tuples)
2199 {
2200     Vector<LChar> output;
2201     for (auto& tuple : tuples) {
2202         if (!output.isEmpty())
2203             output.append('&');
2204         serializeURLEncodedForm(tuple.first, output);
2205         output.append('=');
2206         serializeURLEncodedForm(tuple.second, output);
2207     }
2208     return String::adopt(output);
2209 }
2210
2211 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2212 {
2213     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2214     // but once we get rid of URL::parse its value should be tested.
2215     LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2216         a.m_isValid,
2217         a.m_protocolIsInHTTPFamily,
2218         a.m_schemeEnd,
2219         a.m_userStart,
2220         a.m_userEnd,
2221         a.m_passwordEnd,
2222         a.m_hostEnd,
2223         a.m_portEnd,
2224         a.m_pathAfterLastSlash,
2225         a.m_pathEnd,
2226         a.m_queryEnd,
2227         a.m_fragmentEnd,
2228         a.m_string.utf8().data(),
2229         b.m_isValid,
2230         b.m_protocolIsInHTTPFamily,
2231         b.m_schemeEnd,
2232         b.m_userStart,
2233         b.m_userEnd,
2234         b.m_passwordEnd,
2235         b.m_hostEnd,
2236         b.m_portEnd,
2237         b.m_pathAfterLastSlash,
2238         b.m_pathEnd,
2239         b.m_queryEnd,
2240         b.m_fragmentEnd,
2241         b.m_string.utf8().data());
2242
2243     return a.m_string == b.m_string
2244         && a.m_isValid == b.m_isValid
2245         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2246         && a.m_schemeEnd == b.m_schemeEnd
2247         && a.m_userStart == b.m_userStart
2248         && a.m_userEnd == b.m_userEnd
2249         && a.m_passwordEnd == b.m_passwordEnd
2250         && a.m_hostEnd == b.m_hostEnd
2251         && a.m_portEnd == b.m_portEnd
2252         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2253         && a.m_pathEnd == b.m_pathEnd
2254         && a.m_queryEnd == b.m_queryEnd
2255         && a.m_fragmentEnd == b.m_fragmentEnd;
2256 }
2257
2258 bool URLParser::internalValuesConsistent(const URL& url)
2259 {    
2260     return url.m_schemeEnd <= url.m_userStart
2261         && url.m_userStart <= url.m_userEnd
2262         && url.m_userEnd <= url.m_passwordEnd
2263         && url.m_passwordEnd <= url.m_hostEnd
2264         && url.m_hostEnd <= url.m_hostEnd
2265         && url.m_portEnd <= url.m_pathAfterLastSlash
2266         && url.m_pathAfterLastSlash <= url.m_pathEnd
2267         && url.m_pathEnd <= url.m_queryEnd
2268         && url.m_queryEnd <= url.m_fragmentEnd
2269         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2270     // FIXME: Why do we even store m_fragmentEnd?
2271     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2272 }
2273
// File-local flag backing URLParser::setEnabled() / URLParser::enabled();
// it starts out false.
static bool urlParserEnabled = false;

// Stores enabled in the flag that enabled() reports.
void URLParser::setEnabled(bool enabled)
{
    urlParserEnabled = enabled;
}

// Returns the value most recently passed to setEnabled(), or false if it was
// never called.
bool URLParser::enabled()
{
    return urlParserEnabled;
}
2285
2286 } // namespace WebCore