Reduce allocations in URLParser
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
1 /*
2  * Copyright (C) 2016 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLParser.h"
28
29 #include "Logging.h"
30 #include <array>
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
33
34 namespace WebCore {
35
36 template<typename CharacterType>
37 class CodePointIterator {
38 public:
39     CodePointIterator() { }
40     CodePointIterator(const CharacterType* begin, const CharacterType* end)
41         : m_begin(begin)
42         , m_end(end)
43     {
44     }
45     
46     CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
47         : CodePointIterator(begin.m_begin, end.m_begin)
48     {
49         ASSERT(end.m_begin >= begin.m_begin);
50     }
51     
52     UChar32 operator*() const;
53     CodePointIterator& operator++();
54
55     bool operator==(const CodePointIterator& other) const
56     {
57         return m_begin == other.m_begin
58             && m_end == other.m_end;
59     }
60     bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
61     
62     CodePointIterator& operator=(const CodePointIterator& other)
63     {
64         m_begin = other.m_begin;
65         m_end = other.m_end;
66         return *this;
67     }
68
69     bool atEnd() const
70     {
71         ASSERT(m_begin <= m_end);
72         return m_begin >= m_end;
73     }
74     
75 private:
76     const CharacterType* m_begin { nullptr };
77     const CharacterType* m_end { nullptr };
78 };
79
80 template<>
81 UChar32 CodePointIterator<LChar>::operator*() const
82 {
83     ASSERT(!atEnd());
84     return *m_begin;
85 }
86
87 template<>
88 auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
89 {
90     ASSERT(!atEnd());
91     m_begin++;
92     return *this;
93 }
94
95 template<>
96 UChar32 CodePointIterator<UChar>::operator*() const
97 {
98     ASSERT(!atEnd());
99     UChar32 c;
100     U16_GET(m_begin, 0, 0, m_end - m_begin, c);
101     return c;
102 }
103
104 template<>
105 auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
106 {
107     ASSERT(!atEnd());
108     unsigned i = 0;
109     size_t length = m_end - m_begin;
110     U16_FWD_1(m_begin, i, length);
111     m_begin += i;
112     return *this;
113 }
114     
115 static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
116 {
117     if (U_IS_BMP(codePoint)) {
118         destination.append(static_cast<UChar>(codePoint));
119         return;
120     }
121     destination.reserveCapacity(destination.size() + 2);
122     destination.uncheckedAppend(U16_LEAD(codePoint));
123     destination.uncheckedAppend(U16_TRAIL(codePoint));
124 }
125
// Bit flags stored in characterClassTable. Each flag is tested by one of the
// predicates defined after the table:
//   UserInfo            - isInUserInfoEncodeSet
//   Default             - isInDefaultEncodeSet
//   InvalidDomain       - isInvalidDomainCharacter
//   QueryPercent        - shouldPercentEncodeQueryByte
//   SlashQuestionOrHash - isSlashQuestionOrHash
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    InvalidDomain = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
};

// One entry per byte value; the entry is the OR of every URLCharacterClass flag
// that applies to that byte. Runs of identical entries are grouped on shared
// lines, with the covered range given in the comment.
static const uint8_t characterClassTable[256] = {
    // 0x00
    UserInfo | Default | InvalidDomain | QueryPercent,
    // 0x01 - 0x08
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    // 0x09, 0x0A
    UserInfo | Default | InvalidDomain | QueryPercent,
    UserInfo | Default | InvalidDomain | QueryPercent,
    // 0x0B, 0x0C
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    // 0x0D
    UserInfo | Default | InvalidDomain | QueryPercent,
    // 0x0E - 0x1F
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent,
    // 0x20 - 0x25
    UserInfo | Default | InvalidDomain | QueryPercent, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
    0, // '$'
    InvalidDomain, // '%'
    0, 0, 0, 0, 0, 0, 0, 0, 0, // '&' through '.'
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '0' through '9'
    UserInfo | InvalidDomain, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent, // '>'
    UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
    UserInfo | InvalidDomain, // '@'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 'A' through 'M'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 'N' through 'Z'
    UserInfo | InvalidDomain, // '['
    UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
    UserInfo | InvalidDomain, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 'a' through 'm'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 'n' through 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent, // 0x7F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x80 - 0x87
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x88 - 0x8F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x90 - 0x97
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x98 - 0x9F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA0 - 0xA7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA8 - 0xAF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB0 - 0xB7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB8 - 0xBF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC0 - 0xC7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC8 - 0xCF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD0 - 0xD7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD8 - 0xDF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE0 - 0xE7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE8 - 0xEF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF0 - 0xF7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF8 - 0xFF
};
392
// True for values up to and including 0x1F.
template<typename CharacterType>
inline static bool isC0Control(CharacterType character)
{
    return character <= 0x1F;
}

// True for values up to and including 0x20 (space).
template<typename CharacterType>
inline static bool isC0ControlOrSpace(CharacterType character)
{
    return character <= 0x20;
}

// True for exactly 0x9, 0xA and 0xD (0xB and 0xC are not included).
template<typename CharacterType>
inline static bool isTabOrNewline(CharacterType character)
{
    return character == 0x9 || character == 0xA || character == 0xD;
}

// True for C0 controls and for everything above 0x7E.
template<typename CharacterType>
inline static bool isInSimpleEncodeSet(CharacterType character)
{
    return isC0Control(character) || character > 0x7E;
}
397 template<typename CharacterType> inline static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
398 template<typename CharacterType> inline static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
399 template<typename CharacterType> inline static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
400 template<typename CharacterType> inline static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
401 template<typename CharacterType> inline static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
402 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
403
404 template<bool serialized, typename CharacterType>
405 void incrementIteratorSkippingTabAndNewLine(CodePointIterator<CharacterType>& iterator)
406 {
407     ++iterator;
408     while (!serialized && !iterator.atEnd() && isTabOrNewline(*iterator))
409         ++iterator;
410 }
411
412 template<bool serialized, typename CharacterType>
413 inline static bool isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
414 {
415     if (iterator.atEnd() || !isASCIIAlpha(*iterator))
416         return false;
417     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
418     if (iterator.atEnd())
419         return false;
420     return *iterator == ':' || *iterator == '|';
421 }
422
423 inline static bool isWindowsDriveLetter(const Vector<LChar>& buffer, size_t index)
424 {
425     if (buffer.size() < index + 2)
426         return false;
427     return isASCIIAlpha(buffer[index]) && (buffer[index + 1] == ':' || buffer[index + 1] == '|');
428 }
429
430 template<bool serialized, typename CharacterType>
431 inline static void checkWindowsDriveLetter(CodePointIterator<CharacterType>& iterator, Vector<LChar>& asciiBuffer)
432 {
433     if (isWindowsDriveLetter<serialized>(iterator)) {
434         asciiBuffer.reserveCapacity(asciiBuffer.size() + 2);
435         asciiBuffer.uncheckedAppend(*iterator);
436         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
437         ASSERT(!iterator.atEnd());
438         ASSERT(*iterator == ':' || *iterator == '|');
439         asciiBuffer.uncheckedAppend(':');
440         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
441     }
442 }
443
444 template<bool serialized, typename CharacterType>
445 inline static bool shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
446 {
447     if (!isWindowsDriveLetter<serialized>(iterator))
448         return true;
449     if (iterator.atEnd())
450         return false;
451     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
452     if (iterator.atEnd())
453         return true;
454     incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
455     if (iterator.atEnd())
456         return true;
457     return !isSlashQuestionOrHash(*iterator);
458 }
459
460 inline static void percentEncode(uint8_t byte, Vector<LChar>& buffer)
461 {
462     buffer.append('%');
463     buffer.append(upperNibbleToASCIIHexDigit(byte));
464     buffer.append(lowerNibbleToASCIIHexDigit(byte));
465 }
466
467 template<bool serialized>
468 inline static void utf8PercentEncode(UChar32 codePoint, Vector<LChar>& destination, bool(*isInCodeSet)(UChar32))
469 {
470     if (serialized) {
471         ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
472         ASSERT_WITH_SECURITY_IMPLICATION(!isInCodeSet(codePoint));
473         destination.append(codePoint);
474     } else {
475         if (isInCodeSet(codePoint)) {
476             uint8_t buffer[U8_MAX_LENGTH];
477             int32_t offset = 0;
478             UBool error = false;
479             U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
480             // FIXME: Check error.
481             for (int32_t i = 0; i < offset; ++i)
482                 percentEncode(buffer[i], destination);
483         } else {
484             ASSERT_WITH_MESSAGE(isASCII(codePoint), "isInCodeSet should always return true for non-ASCII characters");
485             destination.append(codePoint);
486         }
487     }
488 }
489
490 template<bool serialized>
491 inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& destination)
492 {
493     if (serialized) {
494         ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
495         ASSERT_WITH_SECURITY_IMPLICATION(!shouldPercentEncodeQueryByte(codePoint));
496         destination.append(codePoint);
497     } else {
498         uint8_t buffer[U8_MAX_LENGTH];
499         int32_t offset = 0;
500         UBool error = false;
501         U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
502         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
503         // FIXME: Check error.
504         for (int32_t i = 0; i < offset; ++i) {
505             auto byte = buffer[i];
506             if (shouldPercentEncodeQueryByte(byte))
507                 percentEncode(byte, destination);
508             else
509                 destination.append(byte);
510         }
511     }
512 }
513     
514 inline static void encodeQuery(const Vector<UChar>& source, Vector<LChar>& destination, const TextEncoding& encoding)
515 {
516     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
517     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
518     const char* data = encoded.data();
519     size_t length = encoded.length();
520     for (size_t i = 0; i < length; ++i) {
521         uint8_t byte = data[i];
522         if (shouldPercentEncodeQueryByte(byte))
523             percentEncode(byte, destination);
524         else
525             destination.append(byte);
526     }
527 }
528
529 inline static bool isDefaultPort(StringView scheme, uint16_t port)
530 {
531     static const uint16_t ftpPort = 21;
532     static const uint16_t gopherPort = 70;
533     static const uint16_t httpPort = 80;
534     static const uint16_t httpsPort = 443;
535     static const uint16_t wsPort = 80;
536     static const uint16_t wssPort = 443;
537     
538     auto length = scheme.length();
539     if (!length)
540         return false;
541     switch (scheme[0]) {
542     case 'w':
543         switch (length) {
544         case 2:
545             return scheme[1] == 's'
546                 && port == wsPort;
547         case 3:
548             return scheme[1] == 's'
549                 && scheme[2] == 's'
550                 && port == wssPort;
551         default:
552             return false;
553         }
554     case 'h':
555         switch (length) {
556         case 4:
557             return scheme[1] == 't'
558                 && scheme[2] == 't'
559                 && scheme[3] == 'p'
560                 && port == httpPort;
561         case 5:
562             return scheme[1] == 't'
563                 && scheme[2] == 't'
564                 && scheme[3] == 'p'
565                 && scheme[4] == 's'
566                 && port == httpsPort;
567         default:
568             return false;
569         }
570     case 'g':
571         return length == 6
572             && scheme[1] == 'o'
573             && scheme[2] == 'p'
574             && scheme[3] == 'h'
575             && scheme[4] == 'e'
576             && scheme[5] == 'r'
577             && port == gopherPort;
578     case 'f':
579         return length == 3
580             && scheme[1] == 't'
581             && scheme[2] == 'p'
582             && port == ftpPort;
583         return false;
584     default:
585         return false;
586     }
587 }
588
589 inline static bool isSpecialScheme(StringView scheme)
590 {
591     auto length = scheme.length();
592     if (!length)
593         return false;
594     switch (scheme[0]) {
595     case 'f':
596         switch (length) {
597         case 3:
598             return scheme[1] == 't'
599                 && scheme[2] == 'p';
600         case 4:
601             return scheme[1] == 'i'
602                 && scheme[2] == 'l'
603                 && scheme[3] == 'e';
604         default:
605             return false;
606         }
607     case 'g':
608         return length == 6
609             && scheme[1] == 'o'
610             && scheme[2] == 'p'
611             && scheme[3] == 'h'
612             && scheme[4] == 'e'
613             && scheme[5] == 'r';
614     case 'h':
615         switch (length) {
616         case 4:
617             return scheme[1] == 't'
618                 && scheme[2] == 't'
619                 && scheme[3] == 'p';
620         case 5:
621             return scheme[1] == 't'
622                 && scheme[2] == 't'
623                 && scheme[3] == 'p'
624                 && scheme[4] == 's';
625         default:
626             return false;
627         }
628     case 'w':
629         switch (length) {
630         case 2:
631             return scheme[1] == 's';
632         case 3:
633             return scheme[1] == 's'
634                 && scheme[2] == 's';
635         default:
636             return false;
637         }
638     default:
639         return false;
640     }
641 }
642
643 enum class URLParser::URLPart {
644     SchemeEnd,
645     UserStart,
646     UserEnd,
647     PasswordEnd,
648     HostEnd,
649     PortEnd,
650     PathAfterLastSlash,
651     PathEnd,
652     QueryEnd,
653     FragmentEnd,
654 };
655
656 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
657 {
658     switch (part) {
659     case URLPart::FragmentEnd:
660         return url.m_fragmentEnd;
661     case URLPart::QueryEnd:
662         return url.m_queryEnd;
663     case URLPart::PathEnd:
664         return url.m_pathEnd;
665     case URLPart::PathAfterLastSlash:
666         return url.m_pathAfterLastSlash;
667     case URLPart::PortEnd:
668         return url.m_portEnd;
669     case URLPart::HostEnd:
670         return url.m_hostEnd;
671     case URLPart::PasswordEnd:
672         return url.m_passwordEnd;
673     case URLPart::UserEnd:
674         return url.m_userEnd;
675     case URLPart::UserStart:
676         return url.m_userStart;
677     case URLPart::SchemeEnd:
678         return url.m_schemeEnd;
679     }
680     ASSERT_NOT_REACHED();
681     return 0;
682 }
683
684 inline static void copyASCIIStringUntil(Vector<LChar>& destination, const String& string, size_t lengthIf8Bit, size_t lengthIf16Bit)
685 {
686     ASSERT(destination.isEmpty());
687     if (string.is8Bit()) {
688         RELEASE_ASSERT(lengthIf8Bit <= string.length());
689         destination.append(string.characters8(), lengthIf8Bit);
690     } else {
691         RELEASE_ASSERT(lengthIf16Bit <= string.length());
692         destination.reserveCapacity(lengthIf16Bit);
693         const UChar* characters = string.characters16();
694         for (size_t i = 0; i < lengthIf16Bit; ++i) {
695             UChar c = characters[i];
696             ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
697             destination.uncheckedAppend(c);
698         }
699     }
700 }
701
702 void URLParser::copyURLPartsUntil(const URL& base, URLPart part)
703 {
704     m_asciiBuffer.clear();
705     m_unicodeFragmentBuffer.clear();
706     if (part == URLPart::FragmentEnd) {
707         copyASCIIStringUntil(m_asciiBuffer, base.m_string, urlLengthUntilPart(base, URLPart::FragmentEnd), urlLengthUntilPart(base, URLPart::QueryEnd));
708         if (!base.m_string.is8Bit()) {
709             const String& fragment = base.m_string;
710             bool seenUnicode = false;
711             for (size_t i = base.m_queryEnd; i < base.m_fragmentEnd; ++i) {
712                 if (!seenUnicode && !isASCII(fragment[i]))
713                     seenUnicode = true;
714                 if (seenUnicode)
715                     m_unicodeFragmentBuffer.uncheckedAppend(fragment[i]);
716                 else
717                     m_asciiBuffer.uncheckedAppend(fragment[i]);
718             }
719         }
720     } else {
721         size_t length = urlLengthUntilPart(base, part);
722         copyASCIIStringUntil(m_asciiBuffer, base.m_string, length, length);
723     }
724     switch (part) {
725     case URLPart::FragmentEnd:
726         m_url.m_fragmentEnd = base.m_fragmentEnd;
727         FALLTHROUGH;
728     case URLPart::QueryEnd:
729         m_url.m_queryEnd = base.m_queryEnd;
730         FALLTHROUGH;
731     case URLPart::PathEnd:
732         m_url.m_pathEnd = base.m_pathEnd;
733         FALLTHROUGH;
734     case URLPart::PathAfterLastSlash:
735         m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
736         FALLTHROUGH;
737     case URLPart::PortEnd:
738         m_url.m_portEnd = base.m_portEnd;
739         FALLTHROUGH;
740     case URLPart::HostEnd:
741         m_url.m_hostEnd = base.m_hostEnd;
742         FALLTHROUGH;
743     case URLPart::PasswordEnd:
744         m_url.m_passwordEnd = base.m_passwordEnd;
745         FALLTHROUGH;
746     case URLPart::UserEnd:
747         m_url.m_userEnd = base.m_userEnd;
748         FALLTHROUGH;
749     case URLPart::UserStart:
750         m_url.m_userStart = base.m_userStart;
751         FALLTHROUGH;
752     case URLPart::SchemeEnd:
753         m_url.m_isValid = base.m_isValid;
754         m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
755         m_url.m_schemeEnd = base.m_schemeEnd;
756     }
757     m_urlIsSpecial = isSpecialScheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd));
758 }
759
// The two hex digits of a percent-encoded '.' ("%2e"); the second digit is
// compared case-insensitively by the dot-segment helpers below. The pointer
// itself is const so it cannot be re-seated by accident.
static const char* const dotASCIICode = "2e";
761
762 template<typename CharacterType>
763 inline static bool isPercentEncodedDot(CodePointIterator<CharacterType> c)
764 {
765     if (c.atEnd())
766         return false;
767     if (*c != '%')
768         return false;
769     ++c;
770     if (c.atEnd())
771         return false;
772     if (*c != dotASCIICode[0])
773         return false;
774     ++c;
775     if (c.atEnd())
776         return false;
777     return toASCIILower(*c) == dotASCIICode[1];
778 }
779
780 template<typename CharacterType>
781 inline static bool isSingleDotPathSegment(CodePointIterator<CharacterType> c)
782 {
783     if (c.atEnd())
784         return false;
785     if (*c == '.') {
786         ++c;
787         return c.atEnd() || isSlashQuestionOrHash(*c);
788     }
789     if (*c != '%')
790         return false;
791     ++c;
792     if (c.atEnd() || *c != dotASCIICode[0])
793         return false;
794     ++c;
795     if (c.atEnd())
796         return false;
797     if (toASCIILower(*c) == dotASCIICode[1]) {
798         ++c;
799         return c.atEnd() || isSlashQuestionOrHash(*c);
800     }
801     return false;
802 }
803
804 template<typename CharacterType>
805 inline static bool isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
806 {
807     if (c.atEnd())
808         return false;
809     if (*c == '.') {
810         ++c;
811         return isSingleDotPathSegment(c);
812     }
813     if (*c != '%')
814         return false;
815     ++c;
816     if (c.atEnd() || *c != dotASCIICode[0])
817         return false;
818     ++c;
819     if (c.atEnd())
820         return false;
821     if (toASCIILower(*c) == dotASCIICode[1]) {
822         ++c;
823         return isSingleDotPathSegment(c);
824     }
825     return false;
826 }
827
828 template<typename CharacterType>
829 inline static void consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
830 {
831     ASSERT(isSingleDotPathSegment(c));
832     if (*c == '.') {
833         ++c;
834         if (!c.atEnd()) {
835             if (*c == '/' || *c == '\\')
836                 ++c;
837             else
838                 ASSERT(*c == '?' || *c == '#');
839         }
840     } else {
841         ASSERT(*c == '%');
842         ++c;
843         ASSERT(*c == dotASCIICode[0]);
844         ++c;
845         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
846         ++c;
847         if (!c.atEnd()) {
848             if (*c == '/' || *c == '\\')
849                 ++c;
850             else
851                 ASSERT(*c == '?' || *c == '#');
852         }
853     }
854 }
855
856 template<typename CharacterType>
857 inline static void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
858 {
859     ASSERT(isDoubleDotPathSegment(c));
860     if (*c == '.')
861         ++c;
862     else {
863         ASSERT(*c == '%');
864         ++c;
865         ASSERT(*c == dotASCIICode[0]);
866         ++c;
867         ASSERT(toASCIILower(*c) == dotASCIICode[1]);
868         ++c;
869     }
870     consumeSingleDotPathSegment(c);
871 }
872
873 void URLParser::popPath()
874 {
875     if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
876         m_url.m_pathAfterLastSlash--;
877         if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
878             m_url.m_pathAfterLastSlash--;
879         while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
880             m_url.m_pathAfterLastSlash--;
881         m_url.m_pathAfterLastSlash++;
882     }
883     m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
884 }
885
886 template<typename CharacterType>
887 URL URLParser::failure(const CharacterType* input, unsigned length)
888 {
889     URL url;
890     url.m_isValid = false;
891     url.m_protocolIsInHTTPFamily = false;
892     url.m_cannotBeABaseURL = false;
893     url.m_schemeEnd = 0;
894     url.m_userStart = 0;
895     url.m_userEnd = 0;
896     url.m_passwordEnd = 0;
897     url.m_hostEnd = 0;
898     url.m_portEnd = 0;
899     url.m_pathAfterLastSlash = 0;
900     url.m_pathEnd = 0;
901     url.m_queryEnd = 0;
902     url.m_fragmentEnd = 0;
903     url.m_string = String(input, length);
904     return url;
905 }
906
907 URL URLParser::parse(const String& input, const URL& base, const TextEncoding& encoding)
908 {
909     const bool serialized = false;
910     if (input.is8Bit())
911         return parse<serialized>(input.characters8(), input.length(), base, encoding);
912     return parse<serialized>(input.characters16(), input.length(), base, encoding);
913 }
914
915 URL URLParser::parseSerializedURL(const String& input)
916 {
917     const bool serialized = true;
918     if (input.is8Bit())
919         return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
920     return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
921 }
922
923 template<bool serialized, typename CharacterType>
924 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
925 {
926     LOG(URLParser, "Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
927     m_url = { };
928     m_asciiBuffer.clear();
929     m_unicodeFragmentBuffer.clear();
930     m_asciiBuffer.reserveCapacity(length);
931     
932     bool isUTF8Encoding = encoding == UTF8Encoding();
933     Vector<UChar> queryBuffer;
934
935     unsigned endIndex = length;
936     while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
937         endIndex--;
938     CodePointIterator<CharacterType> c(input, input + endIndex);
939     CodePointIterator<CharacterType> authorityOrHostBegin;
940     while (!c.atEnd() && isC0ControlOrSpace(*c))
941         ++c;
942     auto beginAfterControlAndSpace = c;
943
944     enum class State : uint8_t {
945         SchemeStart,
946         Scheme,
947         NoScheme,
948         SpecialRelativeOrAuthority,
949         PathOrAuthority,
950         Relative,
951         RelativeSlash,
952         SpecialAuthoritySlashes,
953         SpecialAuthorityIgnoreSlashes,
954         AuthorityOrHost,
955         Host,
956         File,
957         FileSlash,
958         FileHost,
959         PathStart,
960         Path,
961         CannotBeABaseURLPath,
962         Query,
963         Fragment,
964     };
965
966 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, asciiBuffer size %zu", x, *c, m_asciiBuffer.size())
967 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
968
969     State state = State::SchemeStart;
970     while (!c.atEnd()) {
971         if (!serialized && isTabOrNewline(*c)) {
972             ++c;
973             continue;
974         }
975
976         switch (state) {
977         case State::SchemeStart:
978             LOG_STATE("SchemeStart");
979             if (isASCIIAlpha(*c)) {
980                 m_asciiBuffer.uncheckedAppend(toASCIILower(*c));
981                 ++c;
982                 state = State::Scheme;
983             } else
984                 state = State::NoScheme;
985             break;
986         case State::Scheme:
987             LOG_STATE("Scheme");
988             if (isASCIIAlphanumeric(*c) || *c == '+' || *c == '-' || *c == '.')
989                 m_asciiBuffer.append(toASCIILower(*c));
990             else if (*c == ':') {
991                 m_url.m_schemeEnd = m_asciiBuffer.size();
992                 StringView urlScheme = StringView(m_asciiBuffer.data(), m_url.m_schemeEnd);
993                 m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
994                 if (urlScheme == "file") {
995                     m_urlIsSpecial = true;
996                     state = State::File;
997                     m_asciiBuffer.append(':');
998                     ++c;
999                     break;
1000                 }
1001                 m_asciiBuffer.append(':');
1002                 if (isSpecialScheme(urlScheme)) {
1003                     m_urlIsSpecial = true;
1004                     if (base.protocolIs(m_asciiBuffer.data(), m_asciiBuffer.size() - 1))
1005                         state = State::SpecialRelativeOrAuthority;
1006                     else
1007                         state = State::SpecialAuthoritySlashes;
1008                 } else {
1009                     auto maybeSlash = c;
1010                     incrementIteratorSkippingTabAndNewLine<serialized>(maybeSlash);
1011                     if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1012                         m_asciiBuffer.append('/');
1013                         m_url.m_userStart = m_asciiBuffer.size();
1014                         state = State::PathOrAuthority;
1015                         c = maybeSlash;
1016                         ASSERT(*c == '/');
1017                     } else {
1018                         m_url.m_userStart = m_asciiBuffer.size();
1019                         m_url.m_userEnd = m_url.m_userStart;
1020                         m_url.m_passwordEnd = m_url.m_userStart;
1021                         m_url.m_hostEnd = m_url.m_userStart;
1022                         m_url.m_portEnd = m_url.m_userStart;
1023                         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1024                         m_url.m_cannotBeABaseURL = true;
1025                         state = State::CannotBeABaseURLPath;
1026                     }
1027                 }
1028                 ++c;
1029                 break;
1030             } else {
1031                 m_asciiBuffer.clear();
1032                 state = State::NoScheme;
1033                 c = beginAfterControlAndSpace;
1034                 break;
1035             }
1036             incrementIteratorSkippingTabAndNewLine<serialized>(c);
1037             if (c.atEnd()) {
1038                 m_asciiBuffer.clear();
1039                 state = State::NoScheme;
1040                 c = beginAfterControlAndSpace;
1041             }
1042             break;
1043         case State::NoScheme:
1044             LOG_STATE("NoScheme");
1045             if (base.isNull() || (base.m_cannotBeABaseURL && *c != '#'))
1046                 return failure(input, length);
1047             if (base.m_cannotBeABaseURL && *c == '#') {
1048                 copyURLPartsUntil(base, URLPart::QueryEnd);
1049                 state = State::Fragment;
1050                 m_asciiBuffer.append('#');
1051                 ++c;
1052                 break;
1053             }
1054             if (!base.protocolIs("file")) {
1055                 state = State::Relative;
1056                 break;
1057             }
1058             copyURLPartsUntil(base, URLPart::SchemeEnd);
1059             m_asciiBuffer.append(':');
1060             state = State::File;
1061             break;
1062         case State::SpecialRelativeOrAuthority:
1063             LOG_STATE("SpecialRelativeOrAuthority");
1064             if (*c == '/') {
1065                 m_asciiBuffer.append('/');
1066                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1067                 if (c.atEnd())
1068                     return failure(input, length);
1069                 if (*c == '/') {
1070                     m_asciiBuffer.append('/');
1071                     state = State::SpecialAuthorityIgnoreSlashes;
1072                     ++c;
1073                 } else
1074                     state = State::RelativeSlash;
1075             } else
1076                 state = State::Relative;
1077             break;
1078         case State::PathOrAuthority:
1079             LOG_STATE("PathOrAuthority");
1080             if (*c == '/') {
1081                 m_asciiBuffer.append('/');
1082                 m_url.m_userStart = m_asciiBuffer.size();
1083                 state = State::AuthorityOrHost;
1084                 ++c;
1085                 authorityOrHostBegin = c;
1086             } else {
1087                 ASSERT(m_asciiBuffer.last() == '/');
1088                 m_url.m_userStart = m_asciiBuffer.size() - 1;
1089                 m_url.m_userEnd = m_url.m_userStart;
1090                 m_url.m_passwordEnd = m_url.m_userStart;
1091                 m_url.m_hostEnd = m_url.m_userStart;
1092                 m_url.m_portEnd = m_url.m_userStart;
1093                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1094                 state = State::Path;
1095             }
1096             break;
1097         case State::Relative:
1098             LOG_STATE("Relative");
1099             switch (*c) {
1100             case '/':
1101             case '\\':
1102                 state = State::RelativeSlash;
1103                 ++c;
1104                 break;
1105             case '?':
1106                 copyURLPartsUntil(base, URLPart::PathEnd);
1107                 m_asciiBuffer.append('?');
1108                 state = State::Query;
1109                 ++c;
1110                 break;
1111             case '#':
1112                 copyURLPartsUntil(base, URLPart::QueryEnd);
1113                 m_asciiBuffer.append('#');
1114                 state = State::Fragment;
1115                 ++c;
1116                 break;
1117             default:
1118                 copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1119                 state = State::Path;
1120                 break;
1121             }
1122             break;
1123         case State::RelativeSlash:
1124             LOG_STATE("RelativeSlash");
1125             if (*c == '/' || *c == '\\') {
1126                 ++c;
1127                 copyURLPartsUntil(base, URLPart::SchemeEnd);
1128                 m_asciiBuffer.append("://", 3);
1129                 state = State::SpecialAuthorityIgnoreSlashes;
1130             } else {
1131                 copyURLPartsUntil(base, URLPart::PortEnd);
1132                 m_asciiBuffer.append('/');
1133                 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1134                 state = State::Path;
1135             }
1136             break;
1137         case State::SpecialAuthoritySlashes:
1138             LOG_STATE("SpecialAuthoritySlashes");
1139             m_asciiBuffer.append("//", 2);
1140             if (*c == '/' || *c == '\\') {
1141                 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1142                 if (!c.atEnd() && (*c == '/' || *c == '\\'))
1143                     ++c;
1144             }
1145             state = State::SpecialAuthorityIgnoreSlashes;
1146             break;
1147         case State::SpecialAuthorityIgnoreSlashes:
1148             LOG_STATE("SpecialAuthorityIgnoreSlashes");
1149             if (*c == '/' || *c == '\\') {
1150                 m_asciiBuffer.append('/');
1151                 ++c;
1152             }
1153             m_url.m_userStart = m_asciiBuffer.size();
1154             state = State::AuthorityOrHost;
1155             authorityOrHostBegin = c;
1156             break;
1157         case State::AuthorityOrHost:
1158             LOG_STATE("AuthorityOrHost");
1159             {
1160                 if (*c == '@') {
1161                     auto lastAt = c;
1162                     auto findLastAt = c;
1163                     while (!findLastAt.atEnd()) {
1164                         if (*findLastAt == '@')
1165                             lastAt = findLastAt;
1166                         ++findLastAt;
1167                     }
1168                     parseAuthority<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1169                     c = lastAt;
1170                     incrementIteratorSkippingTabAndNewLine<serialized>(c);
1171                     authorityOrHostBegin = c;
1172                     state = State::Host;
1173                     m_hostHasPercentOrNonASCII = false;
1174                     break;
1175                 }
1176                 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1177                 if (isSlash || *c == '?' || *c == '#') {
1178                     m_url.m_userEnd = m_asciiBuffer.size();
1179                     m_url.m_passwordEnd = m_url.m_userEnd;
1180                     if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1181                         return failure(input, length);
1182                     if (!isSlash) {
1183                         m_asciiBuffer.append('/');
1184                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1185                     }
1186                     state = State::Path;
1187                     break;
1188                 }
1189                 if (isPercentOrNonASCII(*c))
1190                     m_hostHasPercentOrNonASCII = true;
1191                 ++c;
1192             }
1193             break;
1194         case State::Host:
1195             LOG_STATE("Host");
1196             if (*c == '/' || *c == '?' || *c == '#') {
1197                 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1198                     return failure(input, length);
1199                 state = State::Path;
1200                 break;
1201             }
1202             if (isPercentOrNonASCII(*c))
1203                 m_hostHasPercentOrNonASCII = true;
1204             ++c;
1205             break;
1206         case State::File:
1207             LOG_STATE("File");
1208             switch (*c) {
1209             case '/':
1210             case '\\':
1211                 m_asciiBuffer.append('/');
1212                 state = State::FileSlash;
1213                 ++c;
1214                 break;
1215             case '?':
1216                 if (!base.isNull() && base.protocolIs("file"))
1217                     copyURLPartsUntil(base, URLPart::PathEnd);
1218                 m_asciiBuffer.append("///?", 4);
1219                 m_url.m_userStart = m_asciiBuffer.size() - 2;
1220                 m_url.m_userEnd = m_url.m_userStart;
1221                 m_url.m_passwordEnd = m_url.m_userStart;
1222                 m_url.m_hostEnd = m_url.m_userStart;
1223                 m_url.m_portEnd = m_url.m_userStart;
1224                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1225                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1226                 state = State::Query;
1227                 ++c;
1228                 break;
1229             case '#':
1230                 if (!base.isNull() && base.protocolIs("file"))
1231                     copyURLPartsUntil(base, URLPart::QueryEnd);
1232                 m_asciiBuffer.append("///#", 4);
1233                 m_url.m_userStart = m_asciiBuffer.size() - 2;
1234                 m_url.m_userEnd = m_url.m_userStart;
1235                 m_url.m_passwordEnd = m_url.m_userStart;
1236                 m_url.m_hostEnd = m_url.m_userStart;
1237                 m_url.m_portEnd = m_url.m_userStart;
1238                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1239                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1240                 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1241                 state = State::Fragment;
1242                 ++c;
1243                 break;
1244             default:
1245                 if (!base.isNull() && base.protocolIs("file") && shouldCopyFileURL<serialized>(c))
1246                     copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1247                 else {
1248                     m_asciiBuffer.append("///", 3);
1249                     m_url.m_userStart = m_asciiBuffer.size() - 1;
1250                     m_url.m_userEnd = m_url.m_userStart;
1251                     m_url.m_passwordEnd = m_url.m_userStart;
1252                     m_url.m_hostEnd = m_url.m_userStart;
1253                     m_url.m_portEnd = m_url.m_userStart;
1254                     m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1255                     checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1256                 }
1257                 state = State::Path;
1258                 break;
1259             }
1260             break;
1261         case State::FileSlash:
1262             LOG_STATE("FileSlash");
1263             if (*c == '/' || *c == '\\') {
1264                 ++c;
1265                 m_asciiBuffer.append('/');
1266                 m_url.m_userStart = m_asciiBuffer.size();
1267                 m_url.m_userEnd = m_url.m_userStart;
1268                 m_url.m_passwordEnd = m_url.m_userStart;
1269                 m_url.m_hostEnd = m_url.m_userStart;
1270                 m_url.m_portEnd = m_url.m_userStart;
1271                 authorityOrHostBegin = c;
1272                 state = State::FileHost;
1273                 break;
1274             }
1275             if (!base.isNull() && base.protocolIs("file")) {
1276                 // FIXME: This String copy is unnecessary.
1277                 String basePath = base.path();
1278                 if (basePath.length() >= 2) {
1279                     bool windowsQuirk = basePath.is8Bit()
1280                         ? isWindowsDriveLetter<serialized>(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1281                         : isWindowsDriveLetter<serialized>(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1282                     if (windowsQuirk) {
1283                         m_asciiBuffer.append(basePath[0]);
1284                         m_asciiBuffer.append(basePath[1]);
1285                     }
1286                 }
1287             }
1288             m_asciiBuffer.append("//", 2);
1289             m_url.m_userStart = m_asciiBuffer.size() - 1;
1290             m_url.m_userEnd = m_url.m_userStart;
1291             m_url.m_passwordEnd = m_url.m_userStart;
1292             m_url.m_hostEnd = m_url.m_userStart;
1293             m_url.m_portEnd = m_url.m_userStart;
1294             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1295             checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1296             state = State::Path;
1297             break;
1298         case State::FileHost:
1299             LOG_STATE("FileHost");
1300             if (isSlashQuestionOrHash(*c)) {
1301                 if (isWindowsDriveLetter(m_asciiBuffer, m_url.m_portEnd + 1)) {
1302                     state = State::Path;
1303                     break;
1304                 }
1305                 if (authorityOrHostBegin == c) {
1306                     ASSERT(m_asciiBuffer[m_asciiBuffer.size() - 1] == '/');
1307                     if (*c == '?') {
1308                         m_asciiBuffer.append("/?", 2);
1309                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1310                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1311                         state = State::Query;
1312                         ++c;
1313                         break;
1314                     }
1315                     if (*c == '#') {
1316                         m_asciiBuffer.append("/#", 2);
1317                         m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1318                         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1319                         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1320                         state = State::Fragment;
1321                         ++c;
1322                         break;
1323                     }
1324                     state = State::Path;
1325                     break;
1326                 }
1327                 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1328                     return failure(input, length);
1329                 
1330                 if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost")  {
1331                     m_asciiBuffer.shrink(m_url.m_passwordEnd);
1332                     m_url.m_hostEnd = m_asciiBuffer.size();
1333                     m_url.m_portEnd = m_url.m_hostEnd;
1334                 }
1335                 
1336                 state = State::PathStart;
1337                 break;
1338             }
1339             if (isPercentOrNonASCII(*c))
1340                 m_hostHasPercentOrNonASCII = true;
1341             ++c;
1342             break;
1343         case State::PathStart:
1344             LOG_STATE("PathStart");
1345             if (*c != '/' && *c != '\\')
1346                 ++c;
1347             state = State::Path;
1348             break;
1349         case State::Path:
1350             LOG_STATE("Path");
1351             if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1352                 m_asciiBuffer.append('/');
1353                 m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1354                 ++c;
1355                 break;
1356             }
1357             if (m_asciiBuffer.size() && m_asciiBuffer[m_asciiBuffer.size() - 1] == '/') {
1358                 if (isDoubleDotPathSegment(c)) {
1359                     consumeDoubleDotPathSegment(c);
1360                     popPath();
1361                     break;
1362                 }
1363                 if (m_asciiBuffer[m_asciiBuffer.size() - 1] == '/' && isSingleDotPathSegment(c)) {
1364                     consumeSingleDotPathSegment(c);
1365                     break;
1366                 }
1367             }
1368             if (*c == '?') {
1369                 m_url.m_pathEnd = m_asciiBuffer.size();
1370                 state = State::Query;
1371                 break;
1372             }
1373             if (*c == '#') {
1374                 m_url.m_pathEnd = m_asciiBuffer.size();
1375                 m_url.m_queryEnd = m_url.m_pathEnd;
1376                 state = State::Fragment;
1377                 break;
1378             }
1379             if (isPercentEncodedDot(c)) {
1380                 m_asciiBuffer.append('.');
1381                 ASSERT(*c == '%');
1382                 ++c;
1383                 ASSERT(*c == dotASCIICode[0]);
1384                 ++c;
1385                 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1386                 ++c;
1387                 break;
1388             }
1389             utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInDefaultEncodeSet);
1390             ++c;
1391             break;
1392         case State::CannotBeABaseURLPath:
1393             LOG_STATE("CannotBeABaseURLPath");
1394             if (*c == '?') {
1395                 m_url.m_pathEnd = m_asciiBuffer.size();
1396                 state = State::Query;
1397             } else if (*c == '#') {
1398                 m_url.m_pathEnd = m_asciiBuffer.size();
1399                 m_url.m_queryEnd = m_url.m_pathEnd;
1400                 state = State::Fragment;
1401             } else {
1402                 utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInSimpleEncodeSet);
1403                 ++c;
1404             }
1405             break;
1406         case State::Query:
1407             LOG_STATE("Query");
1408             if (*c == '#') {
1409                 if (!isUTF8Encoding)
1410                     encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1411                 m_url.m_queryEnd = m_asciiBuffer.size();
1412                 state = State::Fragment;
1413                 break;
1414             }
1415             if (isUTF8Encoding)
1416                 utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
1417             else
1418                 appendCodePoint(queryBuffer, *c);
1419             ++c;
1420             break;
1421         case State::Fragment:
1422             LOG_STATE("Fragment");
1423             if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
1424                 m_asciiBuffer.append(*c);
1425             else
1426                 appendCodePoint(m_unicodeFragmentBuffer, *c);
1427             ++c;
1428             break;
1429         }
1430     }
1431
1432     switch (state) {
1433     case State::SchemeStart:
1434         LOG_FINAL_STATE("SchemeStart");
1435         if (!m_asciiBuffer.size() && !base.isNull())
1436             return base;
1437         return failure(input, length);
1438     case State::Scheme:
1439         LOG_FINAL_STATE("Scheme");
1440         return failure(input, length);
1441     case State::NoScheme:
1442         LOG_FINAL_STATE("NoScheme");
1443         RELEASE_ASSERT_NOT_REACHED();
1444     case State::SpecialRelativeOrAuthority:
1445         LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1446         copyURLPartsUntil(base, URLPart::QueryEnd);
1447         m_url.m_fragmentEnd = m_url.m_queryEnd;
1448         break;
1449     case State::PathOrAuthority:
1450         LOG_FINAL_STATE("PathOrAuthority");
1451         m_url.m_userEnd = m_asciiBuffer.size();
1452         m_url.m_passwordEnd = m_url.m_userEnd;
1453         m_url.m_hostEnd = m_url.m_userEnd;
1454         m_url.m_portEnd = m_url.m_userEnd;
1455         m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1456         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1457         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1458         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1459         break;
1460     case State::Relative:
1461         LOG_FINAL_STATE("Relative");
1462         copyURLPartsUntil(base, URLPart::FragmentEnd);
1463         break;
1464     case State::RelativeSlash:
1465         LOG_FINAL_STATE("RelativeSlash");
1466         copyURLPartsUntil(base, URLPart::PortEnd);
1467         m_asciiBuffer.append('/');
1468         m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1469         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1470         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1471         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1472         break;
1473     case State::SpecialAuthoritySlashes:
1474         LOG_FINAL_STATE("SpecialAuthoritySlashes");
1475         m_url.m_userStart = m_asciiBuffer.size();
1476         m_url.m_userEnd = m_url.m_userStart;
1477         m_url.m_passwordEnd = m_url.m_userStart;
1478         m_url.m_hostEnd = m_url.m_userStart;
1479         m_url.m_portEnd = m_url.m_userStart;
1480         m_url.m_pathAfterLastSlash = m_url.m_userStart;
1481         m_url.m_pathEnd = m_url.m_userStart;
1482         m_url.m_queryEnd = m_url.m_userStart;
1483         m_url.m_fragmentEnd = m_url.m_userStart;
1484         break;
1485     case State::SpecialAuthorityIgnoreSlashes:
1486         LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1487         return failure(input, length);
1488         break;
1489     case State::AuthorityOrHost:
1490         LOG_FINAL_STATE("AuthorityOrHost");
1491         m_url.m_userEnd = m_asciiBuffer.size();
1492         m_url.m_passwordEnd = m_url.m_userEnd;
1493         if (authorityOrHostBegin.atEnd()) {
1494             m_url.m_hostEnd = m_url.m_userEnd;
1495             m_url.m_portEnd = m_url.m_userEnd;
1496         } else if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1497             return failure(input, length);
1498         m_asciiBuffer.append('/');
1499         m_url.m_pathEnd = m_url.m_portEnd + 1;
1500         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1501         m_url.m_queryEnd = m_url.m_pathEnd;
1502         m_url.m_fragmentEnd = m_url.m_pathEnd;
1503         break;
1504     case State::Host:
1505         LOG_FINAL_STATE("Host");
1506         if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1507             return failure(input, length);
1508         m_asciiBuffer.append('/');
1509         m_url.m_pathEnd = m_url.m_portEnd + 1;
1510         m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1511         m_url.m_queryEnd = m_url.m_pathEnd;
1512         m_url.m_fragmentEnd = m_url.m_pathEnd;
1513         break;
1514     case State::File:
1515         LOG_FINAL_STATE("File");
1516         if (!base.isNull() && base.protocolIs("file")) {
1517             copyURLPartsUntil(base, URLPart::QueryEnd);
1518             m_asciiBuffer.append(':');
1519         }
1520         m_asciiBuffer.append("///", 3);
1521         m_url.m_userStart = m_asciiBuffer.size() - 1;
1522         m_url.m_userEnd = m_url.m_userStart;
1523         m_url.m_passwordEnd = m_url.m_userStart;
1524         m_url.m_hostEnd = m_url.m_userStart;
1525         m_url.m_portEnd = m_url.m_userStart;
1526         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1527         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1528         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1529         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1530         break;
1531     case State::FileSlash:
1532         LOG_FINAL_STATE("FileSlash");
1533         m_asciiBuffer.append("//", 2);
1534         m_url.m_userStart = m_asciiBuffer.size() - 1;
1535         m_url.m_userEnd = m_url.m_userStart;
1536         m_url.m_passwordEnd = m_url.m_userStart;
1537         m_url.m_hostEnd = m_url.m_userStart;
1538         m_url.m_portEnd = m_url.m_userStart;
1539         m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1540         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1541         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1542         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1543         break;
1544     case State::FileHost:
1545         LOG_FINAL_STATE("FileHost");
1546         if (authorityOrHostBegin == c) {
1547             m_asciiBuffer.append('/');
1548             m_url.m_userStart = m_asciiBuffer.size() - 1;
1549             m_url.m_userEnd = m_url.m_userStart;
1550             m_url.m_passwordEnd = m_url.m_userStart;
1551             m_url.m_hostEnd = m_url.m_userStart;
1552             m_url.m_portEnd = m_url.m_userStart;
1553             m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1554             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1555             m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1556             m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1557             break;
1558         }
1559
1560         if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1561             return failure(input, length);
1562
1563         if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost")  {
1564             m_asciiBuffer.shrink(m_url.m_passwordEnd);
1565             m_url.m_hostEnd = m_asciiBuffer.size();
1566             m_url.m_portEnd = m_url.m_hostEnd;
1567         }
1568         m_asciiBuffer.append('/');
1569         m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1570         m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1571         m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1572         m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1573         break;
1574     case State::PathStart:
1575         LOG_FINAL_STATE("PathStart");
1576         RELEASE_ASSERT_NOT_REACHED();
1577     case State::Path:
1578         LOG_FINAL_STATE("Path");
1579         m_url.m_pathEnd = m_asciiBuffer.size();
1580         m_url.m_queryEnd = m_url.m_pathEnd;
1581         m_url.m_fragmentEnd = m_url.m_pathEnd;
1582         break;
1583     case State::CannotBeABaseURLPath:
1584         LOG_FINAL_STATE("CannotBeABaseURLPath");
1585         m_url.m_pathEnd = m_asciiBuffer.size();
1586         m_url.m_queryEnd = m_url.m_pathEnd;
1587         m_url.m_fragmentEnd = m_url.m_pathEnd;
1588         break;
1589     case State::Query:
1590         LOG_FINAL_STATE("Query");
1591         if (!isUTF8Encoding)
1592             encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1593         m_url.m_queryEnd = m_asciiBuffer.size();
1594         m_url.m_fragmentEnd = m_url.m_queryEnd;
1595         break;
1596     case State::Fragment:
1597         LOG_FINAL_STATE("Fragment");
1598         m_url.m_fragmentEnd = m_asciiBuffer.size() + m_unicodeFragmentBuffer.size();
1599         break;
1600     }
1601
1602     if (m_unicodeFragmentBuffer.isEmpty()) {
1603         // FIXME: String::adopt should require a WTFMove.
1604         m_url.m_string = String::adopt(m_asciiBuffer);
1605     } else {
1606         StringBuilder builder;
1607         builder.reserveCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
1608         builder.append(m_asciiBuffer.data(), m_asciiBuffer.size());
1609         for (size_t i = 0; i < m_unicodeFragmentBuffer.size(); ++i)
1610             builder.append(m_unicodeFragmentBuffer[i]);
1611         m_url.m_string = builder.toString();
1612     }
1613     m_url.m_isValid = true;
1614     LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
1615     ASSERT(internalValuesConsistent(m_url));
1616     return m_url;
1617 }
1618
1619 template<bool serialized, typename CharacterType>
1620 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1621 {
1622     if (iterator.atEnd()) {
1623         m_url.m_userEnd = m_asciiBuffer.size();
1624         m_url.m_passwordEnd = m_url.m_userEnd;
1625         return;
1626     }
1627     for (; !iterator.atEnd(); ++iterator) {
1628         if (*iterator == ':') {
1629             ++iterator;
1630             m_url.m_userEnd = m_asciiBuffer.size();
1631             if (iterator.atEnd()) {
1632                 m_url.m_passwordEnd = m_url.m_userEnd;
1633                 if (m_url.m_userEnd > m_url.m_userStart)
1634                     m_asciiBuffer.append('@');
1635                 return;
1636             }
1637             m_asciiBuffer.append(':');
1638             break;
1639         }
1640         utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1641     }
1642     for (; !iterator.atEnd(); ++iterator)
1643         utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1644     m_url.m_passwordEnd = m_asciiBuffer.size();
1645     if (!m_url.m_userEnd)
1646         m_url.m_userEnd = m_url.m_passwordEnd;
1647     m_asciiBuffer.append('@');
1648 }
1649
1650 template<typename UnsignedIntegerType>
1651 void append(Vector<LChar>& destination, UnsignedIntegerType number)
1652 {
1653     LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1654     LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1655     LChar* p = end;
1656     do {
1657         *--p = (number % 10) + '0';
1658         number /= 10;
1659     } while (number);
1660     destination.append(p, end - p);
1661 }
1662
1663 inline static void serializeIPv4(uint32_t address, Vector<LChar>& buffer)
1664 {
1665     append<uint8_t>(buffer, address >> 24);
1666     buffer.append('.');
1667     append<uint8_t>(buffer, address >> 16);
1668     buffer.append('.');
1669     append<uint8_t>(buffer, address >> 8);
1670     buffer.append('.');
1671     append<uint8_t>(buffer, address);
1672 }
1673     
// Returns how many consecutive zero pieces |address| contains starting at
// index |begin|. Returns 0 when the piece at |begin| is non-zero or |begin|
// is past the last piece.
inline static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t count = 0;
    while (begin + count < 8 && !address[begin + count])
        ++count;
    return count;
}
1683
1684 inline static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
1685 {
1686     Optional<size_t> longest;
1687     size_t longestLength = 0;
1688     for (size_t i = 0; i < 8; i++) {
1689         size_t length = zeroSequenceLength(address, i);
1690         if (length) {
1691             if (length > 1 && (!longest || longestLength < length)) {
1692                 longest = i;
1693                 longestLength = length;
1694             }
1695             i += length;
1696         }
1697     }
1698     return longest;
1699 }
1700     
1701 inline static void serializeIPv6Piece(uint16_t piece, Vector<LChar>& buffer)
1702 {
1703     bool printed = false;
1704     if (auto nibble0 = piece >> 12) {
1705         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
1706         printed = true;
1707     }
1708     auto nibble1 = piece >> 8 & 0xF;
1709     if (printed || nibble1) {
1710         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
1711         printed = true;
1712     }
1713     auto nibble2 = piece >> 4 & 0xF;
1714     if (printed || nibble2)
1715         buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
1716     buffer.append(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
1717 }
1718
1719 inline static void serializeIPv6(std::array<uint16_t, 8> address, Vector<LChar>& buffer)
1720 {
1721     buffer.append('[');
1722     auto compressPointer = findLongestZeroSequence(address);
1723     for (size_t piece = 0; piece < 8; piece++) {
1724         if (compressPointer && compressPointer.value() == piece) {
1725             ASSERT(!address[piece]);
1726             if (piece)
1727                 buffer.append(':');
1728             else
1729                 buffer.append("::", 2);
1730             while (piece < 8 && !address[piece])
1731                 piece++;
1732             if (piece == 8)
1733                 break;
1734         }
1735         serializeIPv6Piece(address[piece], buffer);
1736         if (piece < 7)
1737             buffer.append(':');
1738     }
1739     buffer.append(']');
1740 }
1741
1742 template<typename CharacterType>
1743 inline static Optional<uint32_t> parseIPv4Number(CodePointIterator<CharacterType>& iterator)
1744 {
1745     // FIXME: Check for overflow.
1746     enum class State : uint8_t {
1747         UnknownBase,
1748         Decimal,
1749         OctalOrHex,
1750         Octal,
1751         Hex,
1752     };
1753     State state = State::UnknownBase;
1754     uint32_t value = 0;
1755     while (!iterator.atEnd()) {
1756         if (*iterator == '.') {
1757             ++iterator;
1758             return value;
1759         }
1760         switch (state) {
1761         case State::UnknownBase:
1762             if (*iterator == '0') {
1763                 ++iterator;
1764                 state = State::OctalOrHex;
1765                 break;
1766             }
1767             state = State::Decimal;
1768             break;
1769         case State::OctalOrHex:
1770             if (*iterator == 'x' || *iterator == 'X') {
1771                 ++iterator;
1772                 state = State::Hex;
1773                 break;
1774             }
1775             state = State::Octal;
1776             break;
1777         case State::Decimal:
1778             if (*iterator < '0' || *iterator > '9')
1779                 return Nullopt;
1780             value *= 10;
1781             value += *iterator - '0';
1782             ++iterator;
1783             break;
1784         case State::Octal:
1785             if (*iterator < '0' || *iterator > '7')
1786                 return Nullopt;
1787             value *= 8;
1788             value += *iterator - '0';
1789             ++iterator;
1790             break;
1791         case State::Hex:
1792             if (!isASCIIHexDigit(*iterator))
1793                 return Nullopt;
1794             value *= 16;
1795             value += toASCIIHexValue(*iterator);
1796             ++iterator;
1797             break;
1798         }
1799     }
1800     return value;
1801 }
1802
1803 inline static uint64_t pow256(size_t exponent)
1804 {
1805     RELEASE_ASSERT(exponent <= 4);
1806     uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
1807     return values[exponent];
1808 }
1809
1810 template<typename CharacterType>
1811 inline static Optional<uint32_t> parseIPv4Host(CodePointIterator<CharacterType> iterator)
1812 {
1813     Vector<uint32_t, 4> items;
1814     items.reserveInitialCapacity(4);
1815     while (!iterator.atEnd()) {
1816         if (items.size() >= 4)
1817             return Nullopt;
1818         if (auto item = parseIPv4Number(iterator))
1819             items.append(item.value());
1820         else
1821             return Nullopt;
1822     }
1823     if (!items.size() || items.size() > 4)
1824         return Nullopt;
1825     if (items.size() > 2) {
1826         for (size_t i = 0; i < items.size() - 2; i++) {
1827             if (items[i] > 255)
1828                 return Nullopt;
1829         }
1830     }
1831     if (items[items.size() - 1] >= pow256(5 - items.size()))
1832         return Nullopt;
1833     for (auto item : items) {
1834         if (item > 255)
1835             return Nullopt;
1836     }
1837     uint32_t ipv4 = items.takeLast();
1838     for (size_t counter = 0; counter < items.size(); ++counter)
1839         ipv4 += items[counter] * pow256(3 - counter);
1840     return ipv4;
1841 }
1842     
1843 template<typename CharacterType>
1844 inline static Optional<std::array<uint16_t, 8>> parseIPv6Host(CodePointIterator<CharacterType> c)
1845 {
1846     if (c.atEnd())
1847         return Nullopt;
1848
1849     std::array<uint16_t, 8> address = {{0, 0, 0, 0, 0, 0, 0, 0}};
1850     size_t piecePointer = 0;
1851     Optional<size_t> compressPointer;
1852
1853     if (*c == ':') {
1854         ++c;
1855         if (c.atEnd())
1856             return Nullopt;
1857         if (*c != ':')
1858             return Nullopt;
1859         ++c;
1860         ++piecePointer;
1861         compressPointer = piecePointer;
1862     }
1863     
1864     while (!c.atEnd()) {
1865         if (piecePointer == 8)
1866             return Nullopt;
1867         if (*c == ':') {
1868             if (compressPointer)
1869                 return Nullopt;
1870             ++c;
1871             ++piecePointer;
1872             compressPointer = piecePointer;
1873             continue;
1874         }
1875         uint16_t value = 0;
1876         for (size_t length = 0; length < 4; length++) {
1877             if (c.atEnd())
1878                 break;
1879             if (!isASCIIHexDigit(*c))
1880                 break;
1881             value = value * 0x10 + toASCIIHexValue(*c);
1882             ++c;
1883         }
1884         address[piecePointer++] = value;
1885         if (c.atEnd())
1886             break;
1887         if (*c != ':')
1888             return Nullopt;
1889         ++c;
1890     }
1891     
1892     if (!c.atEnd()) {
1893         if (piecePointer > 6)
1894             return Nullopt;
1895         size_t dotsSeen = 0;
1896         while (!c.atEnd()) {
1897             Optional<uint16_t> value;
1898             if (!isASCIIDigit(*c))
1899                 return Nullopt;
1900             while (isASCIIDigit(*c)) {
1901                 auto number = *c - '0';
1902                 if (!value)
1903                     value = number;
1904                 else if (!value.value())
1905                     return Nullopt;
1906                 else
1907                     value = value.value() * 10 + number;
1908                 ++c;
1909                 if (c.atEnd())
1910                     return Nullopt;
1911                 if (value.value() > 255)
1912                     return Nullopt;
1913             }
1914             if (dotsSeen < 3 && *c != '.')
1915                 return Nullopt;
1916             address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
1917             if (dotsSeen == 1 || dotsSeen == 3)
1918                 piecePointer++;
1919             if (!c.atEnd())
1920                 ++c;
1921             if (dotsSeen == 3 && !c.atEnd())
1922                 return Nullopt;
1923             dotsSeen++;
1924         }
1925     }
1926     if (compressPointer) {
1927         size_t swaps = piecePointer - compressPointer.value();
1928         piecePointer = 7;
1929         while (swaps)
1930             std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
1931     } else if (piecePointer != 8)
1932         return Nullopt;
1933     return address;
1934 }
1935
// Inline capacity of the temporary Vectors used while decoding a host, and the
// size of the stack buffer handed to ICU in domainToASCII.
const size_t defaultInlineBufferSize { 2048 };
1937
1938 inline static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
1939 {
1940     Vector<LChar, defaultInlineBufferSize> output;
1941     output.reserveInitialCapacity(length);
1942     
1943     for (size_t i = 0; i < length; ++i) {
1944         uint8_t byte = input[i];
1945         if (byte != '%')
1946             output.uncheckedAppend(byte);
1947         else if (i < length - 2) {
1948             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
1949                 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
1950                 i += 2;
1951             } else
1952                 output.uncheckedAppend(byte);
1953         } else
1954             output.uncheckedAppend(byte);
1955     }
1956     return output;
1957 }
1958
1959 inline static bool containsOnlyASCII(const String& string)
1960 {
1961     if (string.is8Bit())
1962         return charactersAreAllASCII(string.characters8(), string.length());
1963     return charactersAreAllASCII(string.characters16(), string.length());
1964 }
1965
1966 inline static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
1967 {
1968     Vector<LChar, defaultInlineBufferSize> ascii;
1969     if (containsOnlyASCII(domain)) {
1970         size_t length = domain.length();
1971         if (domain.is8Bit()) {
1972             const LChar* characters = domain.characters8();
1973             ascii.reserveInitialCapacity(length);
1974             for (size_t i = 0; i < length; ++i)
1975                 ascii.uncheckedAppend(toASCIILower(characters[i]));
1976         } else {
1977             const UChar* characters = domain.characters16();
1978             ascii.reserveInitialCapacity(length);
1979             for (size_t i = 0; i < length; ++i)
1980                 ascii.uncheckedAppend(toASCIILower(characters[i]));
1981         }
1982         return ascii;
1983     }
1984     
1985     UChar hostnameBuffer[defaultInlineBufferSize];
1986     UErrorCode error = U_ZERO_ERROR;
1987
1988 #if COMPILER(GCC) || COMPILER(CLANG)
1989 #pragma GCC diagnostic push
1990 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1991 #endif
1992     // FIXME: This should use uidna_openUTS46 / uidna_close instead
1993     int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
1994 #if COMPILER(GCC) || COMPILER(CLANG)
1995 #pragma GCC diagnostic pop
1996 #endif
1997     ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
1998
1999     if (error == U_ZERO_ERROR) {
2000         for (int32_t i = 0; i < numCharactersConverted; ++i) {
2001             ASSERT(isASCII(hostnameBuffer[i]));
2002             ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2003         }
2004         ascii.append(hostnameBuffer, numCharactersConverted);
2005         return ascii;
2006     }
2007
2008     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2009     return Nullopt;
2010 }
2011
2012 inline static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
2013 {
2014     for (size_t i = 0; i < asciiDomain.size(); ++i) {
2015         if (isInvalidDomainCharacter(asciiDomain[i]))
2016             return true;
2017     }
2018     return false;
2019 }
2020
2021 template<bool serialized, typename CharacterType>
2022 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2023 {
2024     uint32_t port = 0;
2025     if (iterator.atEnd()) {
2026         m_url.m_portEnd = m_asciiBuffer.size();
2027         return true;
2028     }
2029     m_asciiBuffer.append(':');
2030     for (; !iterator.atEnd(); ++iterator) {
2031         if (!serialized && isTabOrNewline(*iterator))
2032             continue;
2033         if (isASCIIDigit(*iterator)) {
2034             port = port * 10 + *iterator - '0';
2035             if (port > std::numeric_limits<uint16_t>::max())
2036                 return false;
2037         } else
2038             return false;
2039     }
2040
2041     if (isDefaultPort(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd), port)) {
2042         ASSERT(m_asciiBuffer.last() == ':');
2043         m_asciiBuffer.shrink(m_asciiBuffer.size() - 1);
2044     } else
2045         append<uint16_t>(m_asciiBuffer, static_cast<uint16_t>(port));
2046
2047     m_url.m_portEnd = m_asciiBuffer.size();
2048     return true;
2049 }
2050
2051 template<bool serialized, typename CharacterType>
2052 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2053 {
2054     if (iterator.atEnd())
2055         return false;
2056     if (*iterator == '[') {
2057         ++iterator;
2058         auto ipv6End = iterator;
2059         while (!ipv6End.atEnd() && *ipv6End != ']')
2060             ++ipv6End;
2061         if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2062             serializeIPv6(address.value(), m_asciiBuffer);
2063             m_url.m_hostEnd = m_asciiBuffer.size();
2064             if (!ipv6End.atEnd()) {
2065                 ++ipv6End;
2066                 if (!ipv6End.atEnd() && *ipv6End == ':') {
2067                     ++ipv6End;
2068                     return parsePort<serialized>(ipv6End);
2069                 }
2070                 m_url.m_portEnd = m_asciiBuffer.size();
2071                 return true;
2072             }
2073             return true;
2074         }
2075     }
2076     
2077     if (!m_hostHasPercentOrNonASCII) {
2078         auto hostIterator = iterator;
2079         for (; !iterator.atEnd(); ++iterator) {
2080             if (!serialized && isTabOrNewline(*iterator))
2081                 continue;
2082             if (*iterator == ':')
2083                 break;
2084             if (isInvalidDomainCharacter(*iterator))
2085                 return false;
2086         }
2087         if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2088             serializeIPv4(address.value(), m_asciiBuffer);
2089             m_url.m_hostEnd = m_asciiBuffer.size();
2090             if (iterator.atEnd()) {
2091                 m_url.m_portEnd = m_asciiBuffer.size();
2092                 return true;
2093             }
2094             ++iterator;
2095             return parsePort<serialized>(iterator);
2096         }
2097         for (; hostIterator != iterator; ++hostIterator) {
2098             if (serialized || !isTabOrNewline(*hostIterator))
2099                 m_asciiBuffer.append(toASCIILower(*hostIterator));
2100         }
2101         m_url.m_hostEnd = m_asciiBuffer.size();
2102         if (!hostIterator.atEnd()) {
2103             ASSERT(*hostIterator == ':');
2104             incrementIteratorSkippingTabAndNewLine<serialized>(hostIterator);
2105             return parsePort<serialized>(hostIterator);
2106         }
2107         m_url.m_portEnd = m_asciiBuffer.size();
2108         return true;
2109     }
2110     
2111     Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2112     for (; !iterator.atEnd(); ++iterator) {
2113         if (!serialized && isTabOrNewline(*iterator))
2114             continue;
2115         if (*iterator == ':')
2116             break;
2117         uint8_t buffer[U8_MAX_LENGTH];
2118         int32_t offset = 0;
2119         UBool error = false;
2120         U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2121         ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2122         // FIXME: Check error.
2123         utf8Encoded.append(buffer, offset);
2124     }
2125     Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
2126     String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2127     auto asciiDomain = domainToASCII(domain);
2128     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2129         return false;
2130     Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2131     const LChar* asciiDomainCharacters = asciiDomainValue.data();
2132
2133     if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2134         serializeIPv4(address.value(), m_asciiBuffer);
2135         m_url.m_hostEnd = m_asciiBuffer.size();
2136         if (iterator.atEnd()) {
2137             m_url.m_portEnd = m_asciiBuffer.size();
2138             return true;
2139         }
2140         ++iterator;
2141         return parsePort<serialized>(iterator);
2142     }
2143
2144     m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size());
2145     m_url.m_hostEnd = m_asciiBuffer.size();
2146     if (!iterator.atEnd()) {
2147         ASSERT(*iterator == ':');
2148         incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
2149         return parsePort<serialized>(iterator);
2150     }
2151     m_url.m_portEnd = m_asciiBuffer.size();
2152     return true;
2153 }
2154
2155 inline static Optional<String> formURLDecode(StringView input)
2156 {
2157     auto utf8 = input.utf8(StrictConversion);
2158     if (utf8.isNull())
2159         return Nullopt;
2160     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2161     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2162 }
2163
2164 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2165 {
2166     Vector<StringView> sequences = input.split('&');
2167
2168     URLEncodedForm output;
2169     for (auto& bytes : sequences) {
2170         auto valueStart = bytes.find('=');
2171         if (valueStart == notFound) {
2172             if (auto name = formURLDecode(bytes))
2173                 output.append({name.value().replace('+', 0x20), emptyString()});
2174         } else {
2175             auto name = formURLDecode(bytes.substring(0, valueStart));
2176             auto value = formURLDecode(bytes.substring(valueStart + 1));
2177             if (name && value)
2178                 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2179         }
2180     }
2181     return output;
2182 }
2183
2184 inline static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2185 {
2186     auto utf8 = input.utf8(StrictConversion);
2187     const char* data = utf8.data();
2188     for (size_t i = 0; i < utf8.length(); ++i) {
2189         const char byte = data[i];
2190         if (byte == 0x20)
2191             output.append(0x2B);
2192         else if (byte == 0x2A
2193             || byte == 0x2D
2194             || byte == 0x2E
2195             || (byte >= 0x30 && byte <= 0x39)
2196             || (byte >= 0x41 && byte <= 0x5A)
2197             || byte == 0x5F
2198             || (byte >= 0x61 && byte <= 0x7A))
2199             output.append(byte);
2200         else
2201             percentEncode(byte, output);
2202     }
2203 }
2204     
2205 String URLParser::serialize(const URLEncodedForm& tuples)
2206 {
2207     Vector<LChar> output;
2208     for (auto& tuple : tuples) {
2209         if (!output.isEmpty())
2210             output.append('&');
2211         serializeURLEncodedForm(tuple.first, output);
2212         output.append('=');
2213         serializeURLEncodedForm(tuple.second, output);
2214     }
2215     return String::adopt(output);
2216 }
2217
2218 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2219 {
2220     // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2221     // but once we get rid of URL::parse its value should be tested.
2222     LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2223         a.m_isValid,
2224         a.m_protocolIsInHTTPFamily,
2225         a.m_schemeEnd,
2226         a.m_userStart,
2227         a.m_userEnd,
2228         a.m_passwordEnd,
2229         a.m_hostEnd,
2230         a.m_portEnd,
2231         a.m_pathAfterLastSlash,
2232         a.m_pathEnd,
2233         a.m_queryEnd,
2234         a.m_fragmentEnd,
2235         a.m_string.utf8().data(),
2236         b.m_isValid,
2237         b.m_protocolIsInHTTPFamily,
2238         b.m_schemeEnd,
2239         b.m_userStart,
2240         b.m_userEnd,
2241         b.m_passwordEnd,
2242         b.m_hostEnd,
2243         b.m_portEnd,
2244         b.m_pathAfterLastSlash,
2245         b.m_pathEnd,
2246         b.m_queryEnd,
2247         b.m_fragmentEnd,
2248         b.m_string.utf8().data());
2249
2250     return a.m_string == b.m_string
2251         && a.m_isValid == b.m_isValid
2252         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2253         && a.m_schemeEnd == b.m_schemeEnd
2254         && a.m_userStart == b.m_userStart
2255         && a.m_userEnd == b.m_userEnd
2256         && a.m_passwordEnd == b.m_passwordEnd
2257         && a.m_hostEnd == b.m_hostEnd
2258         && a.m_portEnd == b.m_portEnd
2259         && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2260         && a.m_pathEnd == b.m_pathEnd
2261         && a.m_queryEnd == b.m_queryEnd
2262         && a.m_fragmentEnd == b.m_fragmentEnd;
2263 }
2264
2265 bool URLParser::internalValuesConsistent(const URL& url)
2266 {    
2267     return url.m_schemeEnd <= url.m_userStart
2268         && url.m_userStart <= url.m_userEnd
2269         && url.m_userEnd <= url.m_passwordEnd
2270         && url.m_passwordEnd <= url.m_hostEnd
2271         && url.m_hostEnd <= url.m_hostEnd
2272         && url.m_portEnd <= url.m_pathAfterLastSlash
2273         && url.m_pathAfterLastSlash <= url.m_pathEnd
2274         && url.m_pathEnd <= url.m_queryEnd
2275         && url.m_queryEnd <= url.m_fragmentEnd
2276         && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2277     // FIXME: Why do we even store m_fragmentEnd?
2278     // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2279 }
2280
2281 static bool urlParserEnabled = false;
2282
2283 void URLParser::setEnabled(bool enabled)
2284 {
2285     urlParserEnabled = enabled;
2286 }
2287
2288 bool URLParser::enabled()
2289 {
2290     return urlParserEnabled;
2291 }
2292
2293 } // namespace WebCore