2 * Copyright (C) 2016 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLParser.h"
30 #include "RuntimeApplicationChecks.h"
32 #include <unicode/uidna.h>
33 #include <unicode/utypes.h>
// Compile-time switch for parser tracing. When non-zero, URL_PARSER_LOG
// forwards its arguments to LOG(URLParser, ...).
37 #define URL_PARSER_DEBUGGING 0
39 #if URL_PARSER_DEBUGGING
40 #define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
// Alternative definition that expands to nothing (presumably the #else branch;
// the directive itself is not visible in this excerpt).
42 #define URL_PARSER_LOG(...)
// Iterator over a [begin, end) run of CharacterType code units that yields
// code points. operator* and operator++ are only declared in the class; their
// LChar and UChar definitions appear after it.
45 template<typename CharacterType>
46 class CodePointIterator {
// Default state: both pointers keep their nullptr member initializers.
48 ALWAYS_INLINE CodePointIterator() { }
49 ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
// Sub-range constructor: spans from begin's current position to end's current
// position, and asserts that the two are not reversed.
55 ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56 : CodePointIterator(begin.m_begin, end.m_begin)
58 ASSERT(end.m_begin >= begin.m_begin);
61 ALWAYS_INLINE UChar32 operator*() const;
62 ALWAYS_INLINE CodePointIterator& operator++();
// Two iterators compare equal only when both the current position and the
// range end match.
64 ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66 return m_begin == other.m_begin
67 && m_end == other.m_end;
69 ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71 ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73 m_begin = other.m_begin;
// True once the current position has reached the end of the range.
78 ALWAYS_INLINE bool atEnd() const
80 ASSERT(m_begin <= m_end);
81 return m_begin >= m_end;
// Distance, in code units, from `reference` to the current position.
// `reference` must not lie past the current position.
84 ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86 ASSERT(m_begin >= reference);
87 return m_begin - reference;
// Same distance, measured from another iterator's current position.
90 ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92 return codeUnitsSince(other.m_begin);
// m_begin is the current position; m_end is the exclusive end of the range.
96 const CharacterType* m_begin { nullptr };
97 const CharacterType* m_end { nullptr };
// Dereference for 8-bit (LChar) input. NOTE(review): the body is not visible in this excerpt.
101 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
// Pre-increment for 8-bit (LChar) input. NOTE(review): the body is not visible in this excerpt.
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
// Dereference for 16-bit (UChar) input.
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
// Decodes the code point at the current position into c with ICU's U16_GET,
// bounded by the number of remaining code units (m_end - m_begin).
120 U16_GET(m_begin, 0, 0, m_end - m_begin, c);
// Pre-increment for 16-bit (UChar) input.
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
// `length` is the number of code units left in the range.
129 size_t length = m_end - m_begin;
// U16_FWD_1 moves the index i forward by one code point without passing
// `length`. NOTE(review): the declaration of i and the update of m_begin are
// not visible in this excerpt.
130 U16_FWD_1(m_begin, i, length);
// Appends codePoint to destination as UTF-16 code units.
135 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
// A BMP code point fits in a single code unit.
137 if (U_IS_BMP(codePoint)) {
138 destination.append(static_cast<UChar>(codePoint));
// Surrogate-pair path: capacity for two more units is reserved first, which is
// what allows the two uncheckedAppend calls that follow.
// NOTE(review): the lines separating this path from the BMP branch are not visible here.
141 destination.reserveCapacity(destination.size() + 2);
142 destination.uncheckedAppend(U16_LEAD(codePoint));
143 destination.uncheckedAppend(U16_TRAIL(codePoint));
// Bit flags that are OR-ed together in the entries of characterClassTable and
// tested with `&` by the character predicates in this file.
// NOTE(review): only one enumerator is visible in this excerpt.
146 enum URLCharacterClass {
151 SlashQuestionOrHash = 0x10,
// Classification table indexed by an 8-bit value. Each entry is an OR of
// URLCharacterClass flags; the trailing comment on each entry names the byte
// it describes. Not every entry is visible in this excerpt.
155 static const uint8_t characterClassTable[256] = {
// C0 controls. NUL, tab (0x9), LF (0xA) and CR (0xD) additionally carry InvalidDomain.
156 UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
157 UserInfo | Default | QueryPercent, // 0x1
158 UserInfo | Default | QueryPercent, // 0x2
159 UserInfo | Default | QueryPercent, // 0x3
160 UserInfo | Default | QueryPercent, // 0x4
161 UserInfo | Default | QueryPercent, // 0x5
162 UserInfo | Default | QueryPercent, // 0x6
163 UserInfo | Default | QueryPercent, // 0x7
164 UserInfo | Default | QueryPercent, // 0x8
165 UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
166 UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
167 UserInfo | Default | QueryPercent, // 0xB
168 UserInfo | Default | QueryPercent, // 0xC
169 UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
170 UserInfo | Default | QueryPercent, // 0xE
171 UserInfo | Default | QueryPercent, // 0xF
172 UserInfo | Default | QueryPercent, // 0x10
173 UserInfo | Default | QueryPercent, // 0x11
174 UserInfo | Default | QueryPercent, // 0x12
175 UserInfo | Default | QueryPercent, // 0x13
176 UserInfo | Default | QueryPercent, // 0x14
177 UserInfo | Default | QueryPercent, // 0x15
178 UserInfo | Default | QueryPercent, // 0x16
179 UserInfo | Default | QueryPercent, // 0x17
180 UserInfo | Default | QueryPercent, // 0x18
181 UserInfo | Default | QueryPercent, // 0x19
182 UserInfo | Default | QueryPercent, // 0x1A
183 UserInfo | Default | QueryPercent, // 0x1B
184 UserInfo | Default | QueryPercent, // 0x1C
185 UserInfo | Default | QueryPercent, // 0x1D
186 UserInfo | Default | QueryPercent, // 0x1E
187 UserInfo | Default | QueryPercent, // 0x1F
// Printable ASCII entries (only the ones visible in this excerpt).
188 UserInfo | Default | InvalidDomain | QueryPercent, // ' '
190 UserInfo | Default | QueryPercent, // '"'
191 UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
193 InvalidDomain, // '%'
203 UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
214 UserInfo | InvalidDomain, // ':'
216 UserInfo | Default | QueryPercent, // '<'
218 UserInfo | Default | QueryPercent, // '>'
219 UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
220 UserInfo | InvalidDomain, // '@'
247 UserInfo | InvalidDomain, // '['
248 UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
249 UserInfo | InvalidDomain, // ']'
252 UserInfo | Default, // '`'
279 UserInfo | Default, // '{'
281 UserInfo | Default, // '}'
// DEL and every byte with the high bit set carry only QueryPercent.
283 QueryPercent, // 0x7F
284 QueryPercent, // 0x80
285 QueryPercent, // 0x81
286 QueryPercent, // 0x82
287 QueryPercent, // 0x83
288 QueryPercent, // 0x84
289 QueryPercent, // 0x85
290 QueryPercent, // 0x86
291 QueryPercent, // 0x87
292 QueryPercent, // 0x88
293 QueryPercent, // 0x89
294 QueryPercent, // 0x8A
295 QueryPercent, // 0x8B
296 QueryPercent, // 0x8C
297 QueryPercent, // 0x8D
298 QueryPercent, // 0x8E
299 QueryPercent, // 0x8F
300 QueryPercent, // 0x90
301 QueryPercent, // 0x91
302 QueryPercent, // 0x92
303 QueryPercent, // 0x93
304 QueryPercent, // 0x94
305 QueryPercent, // 0x95
306 QueryPercent, // 0x96
307 QueryPercent, // 0x97
308 QueryPercent, // 0x98
309 QueryPercent, // 0x99
310 QueryPercent, // 0x9A
311 QueryPercent, // 0x9B
312 QueryPercent, // 0x9C
313 QueryPercent, // 0x9D
314 QueryPercent, // 0x9E
315 QueryPercent, // 0x9F
316 QueryPercent, // 0xA0
317 QueryPercent, // 0xA1
318 QueryPercent, // 0xA2
319 QueryPercent, // 0xA3
320 QueryPercent, // 0xA4
321 QueryPercent, // 0xA5
322 QueryPercent, // 0xA6
323 QueryPercent, // 0xA7
324 QueryPercent, // 0xA8
325 QueryPercent, // 0xA9
326 QueryPercent, // 0xAA
327 QueryPercent, // 0xAB
328 QueryPercent, // 0xAC
329 QueryPercent, // 0xAD
330 QueryPercent, // 0xAE
331 QueryPercent, // 0xAF
332 QueryPercent, // 0xB0
333 QueryPercent, // 0xB1
334 QueryPercent, // 0xB2
335 QueryPercent, // 0xB3
336 QueryPercent, // 0xB4
337 QueryPercent, // 0xB5
338 QueryPercent, // 0xB6
339 QueryPercent, // 0xB7
340 QueryPercent, // 0xB8
341 QueryPercent, // 0xB9
342 QueryPercent, // 0xBA
343 QueryPercent, // 0xBB
344 QueryPercent, // 0xBC
345 QueryPercent, // 0xBD
346 QueryPercent, // 0xBE
347 QueryPercent, // 0xBF
348 QueryPercent, // 0xC0
349 QueryPercent, // 0xC1
350 QueryPercent, // 0xC2
351 QueryPercent, // 0xC3
352 QueryPercent, // 0xC4
353 QueryPercent, // 0xC5
354 QueryPercent, // 0xC6
355 QueryPercent, // 0xC7
356 QueryPercent, // 0xC8
357 QueryPercent, // 0xC9
358 QueryPercent, // 0xCA
359 QueryPercent, // 0xCB
360 QueryPercent, // 0xCC
361 QueryPercent, // 0xCD
362 QueryPercent, // 0xCE
363 QueryPercent, // 0xCF
364 QueryPercent, // 0xD0
365 QueryPercent, // 0xD1
366 QueryPercent, // 0xD2
367 QueryPercent, // 0xD3
368 QueryPercent, // 0xD4
369 QueryPercent, // 0xD5
370 QueryPercent, // 0xD6
371 QueryPercent, // 0xD7
372 QueryPercent, // 0xD8
373 QueryPercent, // 0xD9
374 QueryPercent, // 0xDA
375 QueryPercent, // 0xDB
376 QueryPercent, // 0xDC
377 QueryPercent, // 0xDD
378 QueryPercent, // 0xDE
379 QueryPercent, // 0xDF
380 QueryPercent, // 0xE0
381 QueryPercent, // 0xE1
382 QueryPercent, // 0xE2
383 QueryPercent, // 0xE3
384 QueryPercent, // 0xE4
385 QueryPercent, // 0xE5
386 QueryPercent, // 0xE6
387 QueryPercent, // 0xE7
388 QueryPercent, // 0xE8
389 QueryPercent, // 0xE9
390 QueryPercent, // 0xEA
391 QueryPercent, // 0xEB
392 QueryPercent, // 0xEC
393 QueryPercent, // 0xED
394 QueryPercent, // 0xEE
395 QueryPercent, // 0xEF
396 QueryPercent, // 0xF0
397 QueryPercent, // 0xF1
398 QueryPercent, // 0xF2
399 QueryPercent, // 0xF3
400 QueryPercent, // 0xF4
401 QueryPercent, // 0xF5
402 QueryPercent, // 0xF6
403 QueryPercent, // 0xF7
404 QueryPercent, // 0xF8
405 QueryPercent, // 0xF9
406 QueryPercent, // 0xFA
407 QueryPercent, // 0xFB
408 QueryPercent, // 0xFC
409 QueryPercent, // 0xFD
410 QueryPercent, // 0xFE
411 QueryPercent, // 0xFF
// True for values up to and including 0x1F.
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
// True for values up to and including 0x20 (space).
415 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
// True for 0x9, 0xA and 0xD only: the 0x9..0xD range with 0xB and 0xC excluded.
416 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
// True for anything above 0x7E and for C0 controls.
417 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
// True for anything above 0x7E, otherwise decided by the Default flag in
// characterClassTable. The range test short-circuits before the table lookup.
418 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
// True for anything above 0x7E, otherwise decided by the UserInfo flag in
// characterClassTable. The range test short-circuits before the table lookup.
419 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
// True when the character is at most ']' and its table entry carries
// InvalidDomain; anything above ']' is rejected without consulting the table.
420 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
// True for '%' and for any character that isASCII() rejects.
421 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
// True when the character is at most '\\' and its table entry carries
// SlashQuestionOrHash; anything above '\\' is rejected without consulting the table.
422 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
// True when the character is at most 'z' and its table entry carries
// ValidScheme; anything above 'z' is rejected without consulting the table.
423 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
// True when the byte's table entry carries QueryPercent. The uint8_t parameter
// keeps the index within the 256-entry table.
424 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
// Loops while the iterator is positioned on a tab/newline character. When the
// reportSyntaxViolation template argument is Yes, each such character is
// reported via syntaxViolation() at iteratorForSyntaxViolationPosition.
// NOTE(review): the statements that move the iterator are not visible in this excerpt.
426 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
427 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
430 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
431 if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
432 syntaxViolation(iteratorForSyntaxViolationPosition);
// Look-ahead on a by-value copy of the iterator: calls advance() twice without
// reporting syntax violations and finally returns whether the copy is at the end.
// NOTE(review): the values returned by the two early atEnd() checks are not
// visible in this excerpt.
437 template<typename CharacterType>
438 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
440 if (iterator.atEnd())
442 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
443 if (iterator.atEnd())
445 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446 return iterator.atEnd();
// Look-ahead on a by-value copy of the iterator: checks for an ASCII letter
// and then, after one non-reporting advance(), for ':' or '|'.
// NOTE(review): the return statements are not visible in this excerpt.
449 template<typename CharacterType>
450 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
452 if (iterator.atEnd() || !isASCIIAlpha(*iterator))
454 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
455 if (iterator.atEnd())
457 if (*iterator == ':')
// '|' is marked as the unlikely alternative to ':'.
459 if (UNLIKELY(*iterator == '|'))
// Appends one ASCII code point to m_asciiBuffer, but only once a syntax
// violation has been seen; before that nothing is written.
464 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
466 ASSERT(isASCII(codePoint));
467 if (UNLIKELY(m_didSeeSyntaxViolation))
468 m_asciiBuffer.append(codePoint);
// Appends `length` characters to m_asciiBuffer, but only once a syntax
// violation has been seen; before that nothing is written.
471 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
473 if (UNLIKELY(m_didSeeSyntaxViolation))
474 m_asciiBuffer.append(characters, length);
// Emits a Windows drive letter plus ':' to the ASCII buffer. The caller must
// have positioned the iterator on a drive letter (asserted).
477 template<typename CharacterType>
478 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
480 ASSERT(isWindowsDriveLetter(iterator));
481 appendToASCIIBuffer(*iterator);
// NOTE(review): the step that moves the iterator from the letter to the
// separator is not visible in this excerpt.
483 ASSERT(!iterator.atEnd());
484 ASSERT(*iterator == ':' || *iterator == '|');
// A '|' separator is reported as a syntax violation, and ':' is what gets
// written in either case.
485 if (*iterator == '|')
486 syntaxViolation(iterator);
487 appendToASCIIBuffer(':');
// Look-ahead on a by-value copy of the iterator. It tests for a Windows drive
// letter first; the final result is true when the character reached at the
// end of the scan is not '/', '\\', '?' or '#'.
// NOTE(review): the early returns and the iterator movement between the
// atEnd() checks are not visible in this excerpt.
491 template<typename CharacterType>
492 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
494 if (!isWindowsDriveLetter(iterator))
496 if (iterator.atEnd())
499 if (iterator.atEnd())
502 if (iterator.atEnd())
504 return !isSlashQuestionOrHash(*iterator);
// Free-function variant that writes to a caller-supplied buffer: appends the
// upper-nibble and then the lower-nibble hex digit of byte.
// NOTE(review): any line that writes the leading '%' is not visible in this excerpt.
507 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
510 buffer.append(upperNibbleToASCIIHexDigit(byte));
511 buffer.append(lowerNibbleToASCIIHexDigit(byte));
// Member variant: writes '%' followed by the two hex digits of byte to the
// ASCII buffer. It asserts that a syntax violation has already been recorded,
// which is the condition under which appendToASCIIBuffer actually writes.
514 void URLParser::percentEncodeByte(uint8_t byte)
516 ASSERT(m_didSeeSyntaxViolation);
517 appendToASCIIBuffer('%');
518 appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
519 appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
// Percent-encoded form of the bytes EF BF BD (U+FFFD REPLACEMENT CHARACTER in
// UTF-8), emitted in place of code points that fail U_IS_UNICODE_CHAR.
522 const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string without its terminating NUL.
523 const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
// Emits the code point at `iterator` to the ASCII buffer, percent-encoding it
// when the isInCodeSet template predicate says so. The iterator must not be
// at the end (asserted).
525 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
526 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
528 ASSERT(!iterator.atEnd());
529 UChar32 codePoint = *iterator;
// ASCII path: characters in the code set are a syntax violation and are
// percent-encoded; all other ASCII characters are appended as they are.
530 if (LIKELY(isASCII(codePoint))) {
531 if (UNLIKELY(isInCodeSet(codePoint))) {
532 syntaxViolation(iterator);
533 percentEncodeByte(codePoint);
535 appendToASCIIBuffer(codePoint);
// Non-ASCII path: always a syntax violation.
538 ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
539 syntaxViolation(iterator);
// Code points rejected by U_IS_UNICODE_CHAR are written as the percent-encoded
// replacement character.
541 if (!U_IS_UNICODE_CHAR(codePoint)) {
542 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
// Otherwise the code point is written into `buffer` as UTF-8 and every
// resulting byte is percent-encoded.
546 uint8_t buffer[U8_MAX_LENGTH];
548 U8_APPEND_UNSAFE(buffer, offset, codePoint);
549 for (int32_t i = 0; i < offset; ++i)
550 percentEncodeByte(buffer[i]);
// Emits the code point at `iterator` to the ASCII buffer using the query
// rules: bytes flagged by shouldPercentEncodeQueryByte are percent-encoded.
// The iterator must not be at the end (asserted).
553 template<typename CharacterType>
554 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
556 ASSERT(!iterator.atEnd());
557 UChar32 codePoint = *iterator;
// ASCII path: flagged characters are a syntax violation and are
// percent-encoded; all other ASCII characters are appended as they are.
558 if (LIKELY(isASCII(codePoint))) {
559 if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
560 syntaxViolation(iterator);
561 percentEncodeByte(codePoint);
563 appendToASCIIBuffer(codePoint);
// Non-ASCII path: always a syntax violation.
567 syntaxViolation(iterator);
// Code points rejected by U_IS_UNICODE_CHAR are written as the percent-encoded
// replacement character.
569 if (!U_IS_UNICODE_CHAR(codePoint)) {
570 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
// Otherwise the code point is written into `buffer` as UTF-8 and each byte is
// checked individually against the query table.
574 uint8_t buffer[U8_MAX_LENGTH];
576 U8_APPEND_UNSAFE(buffer, offset, codePoint);
577 for (int32_t i = 0; i < offset; ++i) {
578 auto byte = buffer[i];
579 if (shouldPercentEncodeQueryByte(byte))
580 percentEncodeByte(byte);
582 appendToASCIIBuffer(byte);
// Encodes the buffered query text in `source` with `encoding` and writes the
// result to the ASCII buffer. `iterator` walks the original input in parallel
// so that any difference from the encoded bytes can be reported as a syntax
// violation.
586 template<typename CharacterType>
587 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
589 // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
590 CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
591 const char* data = encoded.data();
592 size_t length = encoded.length();
// True when exactly one of the two is empty: the encoded output or the
// remaining input.
594 if (!length == !iterator.atEnd()) {
595 syntaxViolation(iterator);
// First pass: walks the encoded bytes while they match the input. A mismatch,
// or a byte that needs percent-encoding, raises a syntax violation.
// NOTE(review): the declaration of i, the loop exits and the iterator
// increments are not visible in this excerpt.
600 for (; i < length; ++i) {
601 ASSERT(!iterator.atEnd());
602 uint8_t byte = data[i];
603 if (UNLIKELY(byte != *iterator)) {
604 syntaxViolation(iterator);
607 if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
608 syntaxViolation(iterator);
611 appendToASCIIBuffer(byte);
614 while (!iterator.atEnd() && isTabOrNewline(*iterator))
// The encoded bytes and the input must run out together.
616 ASSERT((i == length) == iterator.atEnd());
// Second pass: writes the bytes left over after the first pass, percent-encoding
// the flagged ones. A syntax violation is already recorded by now (asserted).
617 for (; i < length; ++i) {
618 ASSERT(m_didSeeSyntaxViolation);
619 uint8_t byte = data[i];
620 if (shouldPercentEncodeQueryByte(byte))
621 percentEncodeByte(byte);
623 appendToASCIIBuffer(byte);
// Returns the default port for a scheme as an Optional<uint16_t>.
// NOTE(review): most of the dispatch logic is not visible in this excerpt;
// only the port constants and two fragments of it are.
627 Optional<uint16_t> defaultPortForProtocol(StringView scheme)
// Port numbers for the schemes named by each constant.
629 static const uint16_t ftpPort = 21;
630 static const uint16_t gopherPort = 70;
631 static const uint16_t httpPort = 80;
632 static const uint16_t httpsPort = 443;
633 static const uint16_t wsPort = 80;
634 static const uint16_t wssPort = 443;
636 auto length = scheme.length();
643 if (scheme[1] == 's')
// True when `port` equals the default port that defaultPortForProtocol()
// reports for `protocol`.
692 bool isDefaultPortForProtocol(uint16_t port, StringView protocol)
694 return defaultPortForProtocol(protocol) == port;
// Classifies a scheme string as a Scheme enumerator, with Scheme::NonSpecial
// as the result on the many non-matching paths.
// NOTE(review): the character comparisons that select each result are mostly
// not visible in this excerpt; the returns visible here are Gopher, HTTPS and
// NonSpecial.
708 ALWAYS_INLINE static Scheme scheme(StringView scheme)
710 auto length = scheme.length();
712 return Scheme::NonSpecial;
720 return Scheme::NonSpecial;
726 return Scheme::NonSpecial;
728 return Scheme::NonSpecial;
737 return Scheme::Gopher;
738 return Scheme::NonSpecial;
746 return Scheme::NonSpecial;
752 return Scheme::HTTPS;
753 return Scheme::NonSpecial;
755 return Scheme::NonSpecial;
760 if (scheme[1] == 's')
762 return Scheme::NonSpecial;
767 return Scheme::NonSpecial;
769 return Scheme::NonSpecial;
772 return Scheme::NonSpecial;
// Names a boundary inside a parsed URL; urlLengthUntilPart() and
// copyURLPartsUntil() switch over these values.
// NOTE(review): the enumerator list is not visible in this excerpt.
776 enum class URLParser::URLPart {
// Maps a URLPart to the matching offset member of `url`, i.e. the length of
// the URL string up to that part.
789 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
792 case URLPart::FragmentEnd:
793 return url.m_fragmentEnd;
794 case URLPart::QueryEnd:
795 return url.m_queryEnd;
796 case URLPart::PathEnd:
797 return url.m_pathEnd;
798 case URLPart::PathAfterLastSlash:
799 return url.m_pathAfterLastSlash;
800 case URLPart::PortEnd:
801 return url.m_portEnd;
802 case URLPart::HostEnd:
803 return url.m_hostEnd;
804 case URLPart::PasswordEnd:
805 return url.m_passwordEnd;
806 case URLPart::UserEnd:
807 return url.m_userEnd;
808 case URLPart::UserStart:
809 return url.m_userStart;
810 case URLPart::SchemeEnd:
811 return url.m_schemeEnd;
// Every enumerator handled above returns, so this point should be unreachable.
813 ASSERT_NOT_REACHED();
// Copies the first `length` characters of `string` into the (empty) ASCII
// buffer. `length` is bounds-checked with a release assertion.
817 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
819 RELEASE_ASSERT(length <= string.length());
822 ASSERT(m_asciiBuffer.isEmpty());
// 8-bit strings are appended in one call.
823 if (string.is8Bit()) {
824 appendToASCIIBuffer(string.characters8(), length);
// 16-bit strings are appended one code unit at a time, each asserted to be ASCII.
826 const UChar* characters = string.characters16();
827 for (size_t i = 0; i < length; ++i) {
828 UChar c = characters[i];
829 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
830 appendToASCIIBuffer(c);
// Seeds the URL being built from `base`: records a syntax violation, replaces
// the ASCII buffer with base's string up to `part`, copies the matching offset
// members into m_url, and re-derives m_urlIsSpecial / isUTF8Encoding from the
// copied scheme.
835 template<typename CharacterType>
836 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
// The violation is recorded first so that the buffer writes below take effect.
838 syntaxViolation(iterator);
840 m_asciiBuffer.clear();
841 copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
// Cases run from the latest URL part to the earliest.
// NOTE(review): the lines between the cases are not visible in this excerpt —
// confirm whether each case falls through to the next.
843 case URLPart::FragmentEnd:
844 RELEASE_ASSERT_NOT_REACHED();
845 case URLPart::QueryEnd:
846 m_url.m_queryEnd = base.m_queryEnd;
848 case URLPart::PathEnd:
849 m_url.m_pathEnd = base.m_pathEnd;
851 case URLPart::PathAfterLastSlash:
852 m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
854 case URLPart::PortEnd:
855 m_url.m_portEnd = base.m_portEnd;
857 case URLPart::HostEnd:
858 m_url.m_hostEnd = base.m_hostEnd;
860 case URLPart::PasswordEnd:
861 m_url.m_passwordEnd = base.m_passwordEnd;
863 case URLPart::UserEnd:
864 m_url.m_userEnd = base.m_userEnd;
866 case URLPart::UserStart:
867 m_url.m_userStart = base.m_userStart;
869 case URLPart::SchemeEnd:
870 m_url.m_isValid = base.m_isValid;
871 m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
872 m_url.m_schemeEnd = base.m_schemeEnd;
// Classifies the scheme that was just copied into the buffer.
// NOTE(review): most case labels of this switch are not visible in this excerpt.
874 switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
877 isUTF8Encoding = true;
884 m_urlIsSpecial = true;
// Non-special schemes are marked as not special and force UTF-8 encoding.
886 case Scheme::NonSpecial:
887 m_urlIsSpecial = false;
888 isUTF8Encoding = true;
891 ASSERT_NOT_REACHED();
// The two hex digits of '.' (0x2E). The dot-segment helpers match the first
// exactly and the second case-insensitively.
894 static const char dotASCIICode[2] = {'2', 'e'};
// Look-ahead on a by-value copy of the iterator: reports whether the input
// holds a single-dot path segment, i.e. a dot form followed by the end of
// input or by '/', '\\', '?' or '#'.
// NOTE(review): the leading tests that choose between the two forms below are
// not visible in this excerpt.
896 template<typename CharacterType>
897 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
// First form: one advance, then the terminator check.
902 advance<CharacterType, ReportSyntaxViolation::No>(c);
903 return c.atEnd() || isSlashQuestionOrHash(*c);
// Second form: the next characters must be '2' and then 'e' in either case.
907 advance<CharacterType, ReportSyntaxViolation::No>(c);
908 if (c.atEnd() || *c != dotASCIICode[0])
910 advance<CharacterType, ReportSyntaxViolation::No>(c);
913 if (toASCIILower(*c) == dotASCIICode[1]) {
914 advance<CharacterType, ReportSyntaxViolation::No>(c);
915 return c.atEnd() || isSlashQuestionOrHash(*c);
// Look-ahead on a by-value copy of the iterator: consumes one dot form and
// then defers to isSingleDotPathSegment() for the remainder.
// NOTE(review): the leading tests that choose between the two forms below are
// not visible in this excerpt.
920 template<typename CharacterType>
921 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
// First form: one advance, then the single-dot check.
926 advance<CharacterType, ReportSyntaxViolation::No>(c);
927 return isSingleDotPathSegment(c);
// Second form: the next characters must be '2' and then 'e' in either case.
931 advance<CharacterType, ReportSyntaxViolation::No>(c);
932 if (c.atEnd() || *c != dotASCIICode[0])
934 advance<CharacterType, ReportSyntaxViolation::No>(c);
937 if (toASCIILower(*c) == dotASCIICode[1]) {
938 advance<CharacterType, ReportSyntaxViolation::No>(c);
939 return isSingleDotPathSegment(c);
// Consumes a single-dot path segment from the caller's iterator. The caller
// must have checked isSingleDotPathSegment() (asserted).
// NOTE(review): the statements that move the iterator are not visible in this
// excerpt; only the checks on what it points at are.
944 template<typename CharacterType>
945 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
947 ASSERT(isSingleDotPathSegment(c));
// A segment that is not followed by a slash must be followed by '?' or '#'.
951 if (*c == '/' || *c == '\\')
954 ASSERT(*c == '?' || *c == '#');
// Same checks for the form spelled with the '2' and 'e' hex digits.
959 ASSERT(*c == dotASCIICode[0]);
961 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
964 if (*c == '/' || *c == '\\')
967 ASSERT(*c == '?' || *c == '#');
// Consumes a double-dot path segment from the caller's iterator: handles the
// first dot form and then delegates the rest to consumeSingleDotPathSegment().
// The caller must have checked isDoubleDotPathSegment() (asserted).
// NOTE(review): the statements that move the iterator are not visible in this excerpt.
972 template<typename CharacterType>
973 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
975 ASSERT(isDoubleDotPathSegment(c));
981 ASSERT(*c == dotASCIICode[0]);
983 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
986 consumeSingleDotPathSegment(c);
// Drops the last path segment from the ASCII buffer by moving
// m_pathAfterLastSlash back to just after the previous '/' and truncating the
// buffer there. A syntax violation must already be recorded (asserted).
989 void URLParser::popPath()
991 ASSERT(m_didSeeSyntaxViolation);
// Only pops when the path holds more than the '/' that follows the port.
992 if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
993 m_url.m_pathAfterLastSlash--;
// Steps over the slash that ends the segment being removed.
994 if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
995 m_url.m_pathAfterLastSlash--;
// Walks back to the previous '/', never going below m_portEnd.
996 while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
997 m_url.m_pathAfterLastSlash--;
// Leaves the index one past that '/'.
998 m_url.m_pathAfterLastSlash++;
1000 m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
// Records the first syntax violation. On that first call it sets
// m_didSeeSyntaxViolation and copies the input consumed so far (everything
// before `iterator`) into the ASCII buffer. Later calls take the early-exit
// check at the top.
1003 template<typename CharacterType>
1004 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1006 if (m_didSeeSyntaxViolation)
1008 m_didSeeSyntaxViolation = true;
1010 ASSERT(m_asciiBuffer.isEmpty());
// Number of code units between the start of the input and the iterator.
1011 size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1012 RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
// Capacity for the whole input is reserved up front; the release assertion
// above keeps the unchecked appends below within it.
1013 m_asciiBuffer.reserveCapacity(m_inputString.length());
1014 for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1015 ASSERT(isASCII(m_inputString[i]));
1016 m_asciiBuffer.uncheckedAppend(m_inputString[i]);
// Failure path: the result URL's string is set to the unmodified input string.
// NOTE(review): other statements of this function are not visible in this excerpt.
1020 void URLParser::failure()
1023 m_url.m_string = m_inputString;
// Case-insensitively compares the character at `iterator` with `codePoint`
// and, after the comparison, advances the caller's iterator without reporting
// syntax violations.
// NOTE(review): the return statements are not visible in this excerpt.
1026 template<typename CharacterType>
1027 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1029 if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1031 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
// Look-ahead on a by-value copy of the iterator: checks the letters
// l-o-c-a-l-h-o-s-t in order through checkLocalhostCodePoint(), then requires
// that nothing follows them.
// NOTE(review): the early returns taken on a failed check are not visible in this excerpt.
1035 template<typename CharacterType>
1036 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1038 if (!checkLocalhostCodePoint(iterator, 'l'))
1040 if (!checkLocalhostCodePoint(iterator, 'o'))
1042 if (!checkLocalhostCodePoint(iterator, 'c'))
1044 if (!checkLocalhostCodePoint(iterator, 'a'))
1046 if (!checkLocalhostCodePoint(iterator, 'l'))
1048 if (!checkLocalhostCodePoint(iterator, 'h'))
1050 if (!checkLocalhostCodePoint(iterator, 'o'))
1052 if (!checkLocalhostCodePoint(iterator, 's'))
1054 if (!checkLocalhostCodePoint(iterator, 't'))
1056 return iterator.atEnd();
// Runs isAtLocalhost() over the whole view, with an LChar iterator over
// characters8() or a UChar iterator over characters16().
// NOTE(review): the condition that selects between the two returns is not
// visible in this excerpt.
1059 bool URLParser::isLocalhost(StringView view)
1062 return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1063 return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
// Returns a view of `length` characters of already-parsed output starting at
// `start`. The view is taken from the ASCII buffer once a syntax violation has
// been seen, and from the input string otherwise.
1066 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1068 if (UNLIKELY(m_didSeeSyntaxViolation)) {
1069 ASSERT(start + length <= m_asciiBuffer.size());
1070 return StringView(m_asciiBuffer.data() + start, length);
1072 ASSERT(start + length <= m_inputString.length());
1073 return StringView(m_inputString).substring(start, length);
// Current output offset: the size of the ASCII buffer once a syntax violation
// has been seen, otherwise the iterator's distance in code units from the
// start of the input.
1076 template<typename CharacterType>
1077 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1079 if (UNLIKELY(m_didSeeSyntaxViolation))
1080 return m_asciiBuffer.size();
1082 return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
// Parses `input` against `base` with the given text encoding; the result is
// held in m_url.
1085 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1086 : m_inputString(input)
// Null input: when the base is valid and can act as a base URL, the result has
// its fragment identifier removed.
// NOTE(review): the assignment that precedes removeFragmentIdentifier() and
// the exit from this branch are not visible in this excerpt.
1088 if (input.isNull()) {
1089 if (base.isValid() && !base.m_cannotBeABaseURL) {
1091 m_url.removeFragmentIdentifier();
// Dispatches to the 8-bit or 16-bit parse() and remembers where the input
// begins, so that later positions can be measured from it.
1096 if (input.is8Bit()) {
1097 m_inputBegin = input.characters8();
1098 parse(input.characters8(), input.length(), base, encoding);
1100 m_inputBegin = input.characters16();
1101 parse(input.characters16(), input.length(), base, encoding);
// Consistency check for valid results: a syntax violation was seen exactly
// when the output differs from the input. The exception is input made only of
// C0 controls/spaces whose result is the base string up to its query end.
1104 ASSERT(!m_url.m_isValid
1105 || m_didSeeSyntaxViolation == (m_url.string() != input)
1106 || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1107 && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1108 ASSERT(internalValuesConsistent(m_url));
// Assertion-enabled builds only: re-parse with a leading space and check that
// the two results agree.
1109 #if !ASSERT_DISABLED
1110 if (!m_didSeeSyntaxViolation) {
1111 // Force a syntax violation at the beginning to make sure we get the same result.
1112 URLParser parser(makeString(" ", input), base, encoding);
1113 URL parsed = parser.result();
1114 if (parsed.isValid())
1115 ASSERT(allValuesEqual(parser.result(), m_url));
1120 template<typename CharacterType>
1121 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1123 URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1125 ASSERT(m_asciiBuffer.isEmpty());
1127 bool isUTF8Encoding = encoding == UTF8Encoding();
1128 Vector<UChar> queryBuffer;
1130 unsigned endIndex = length;
1131 while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1132 syntaxViolation(CodePointIterator<CharacterType>(input, input));
1135 CodePointIterator<CharacterType> c(input, input + endIndex);
1136 CodePointIterator<CharacterType> authorityOrHostBegin;
1137 CodePointIterator<CharacterType> queryBegin;
1138 while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1142 auto beginAfterControlAndSpace = c;
1144 enum class State : uint8_t {
1148 SpecialRelativeOrAuthority,
1152 SpecialAuthoritySlashes,
1153 SpecialAuthorityIgnoreSlashes,
1161 CannotBeABaseURLPath,
1167 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1168 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1170 State state = State::SchemeStart;
1171 while (!c.atEnd()) {
1172 if (UNLIKELY(isTabOrNewline(*c))) {
1179 case State::SchemeStart:
1180 LOG_STATE("SchemeStart");
1181 if (isASCIIAlpha(*c)) {
1182 if (UNLIKELY(isASCIIUpper(*c)))
1184 appendToASCIIBuffer(toASCIILower(*c));
1187 m_asciiBuffer.clear();
1188 state = State::NoScheme;
1189 c = beginAfterControlAndSpace;
1191 state = State::Scheme;
1193 state = State::NoScheme;
1196 LOG_STATE("Scheme");
1197 if (isValidSchemeCharacter(*c)) {
1198 if (UNLIKELY(isASCIIUpper(*c)))
1200 appendToASCIIBuffer(toASCIILower(*c));
1201 } else if (*c == ':') {
1202 m_url.m_schemeEnd = currentPosition(c);
1203 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1204 appendToASCIIBuffer(':');
1205 switch (scheme(urlScheme)) {
1207 m_urlIsSpecial = true;
1208 state = State::File;
1213 isUTF8Encoding = true;
1214 m_urlIsSpecial = true;
1215 if (base.protocolIs(urlScheme))
1216 state = State::SpecialRelativeOrAuthority;
1218 state = State::SpecialAuthoritySlashes;
1223 m_url.m_protocolIsInHTTPFamily = true;
1226 case Scheme::Gopher:
1227 m_urlIsSpecial = true;
1228 if (base.protocolIs(urlScheme))
1229 state = State::SpecialRelativeOrAuthority;
1231 state = State::SpecialAuthoritySlashes;
1234 case Scheme::NonSpecial:
1235 isUTF8Encoding = true;
1236 auto maybeSlash = c;
1237 advance(maybeSlash);
1238 if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1239 appendToASCIIBuffer('/');
1241 state = State::PathOrAuthority;
1244 m_url.m_userStart = currentPosition(c);
1247 m_url.m_userStart = currentPosition(c);
1248 m_url.m_userEnd = m_url.m_userStart;
1249 m_url.m_passwordEnd = m_url.m_userStart;
1250 m_url.m_hostEnd = m_url.m_userStart;
1251 m_url.m_portEnd = m_url.m_userStart;
1252 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1253 m_url.m_cannotBeABaseURL = true;
1254 state = State::CannotBeABaseURLPath;
1260 m_asciiBuffer.clear();
1261 state = State::NoScheme;
1262 c = beginAfterControlAndSpace;
1267 m_asciiBuffer.clear();
1268 state = State::NoScheme;
1269 c = beginAfterControlAndSpace;
1272 case State::NoScheme:
1273 LOG_STATE("NoScheme");
1274 if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1278 if (base.m_cannotBeABaseURL && *c == '#') {
1279 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1280 state = State::Fragment;
1281 appendToASCIIBuffer('#');
1285 if (!base.protocolIs("file")) {
1286 state = State::Relative;
1289 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1290 appendToASCIIBuffer(':');
1291 state = State::File;
1293 case State::SpecialRelativeOrAuthority:
1294 LOG_STATE("SpecialRelativeOrAuthority");
1296 appendToASCIIBuffer('/');
1303 appendToASCIIBuffer('/');
1304 state = State::SpecialAuthorityIgnoreSlashes;
1307 state = State::RelativeSlash;
1309 state = State::Relative;
1311 case State::PathOrAuthority:
1312 LOG_STATE("PathOrAuthority");
1314 appendToASCIIBuffer('/');
1315 state = State::AuthorityOrHost;
1317 m_url.m_userStart = currentPosition(c);
1318 authorityOrHostBegin = c;
1320 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1321 m_url.m_userStart = currentPosition(c) - 1;
1322 m_url.m_userEnd = m_url.m_userStart;
1323 m_url.m_passwordEnd = m_url.m_userStart;
1324 m_url.m_hostEnd = m_url.m_userStart;
1325 m_url.m_portEnd = m_url.m_userStart;
1326 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1327 state = State::Path;
1330 case State::Relative:
1331 LOG_STATE("Relative");
1335 state = State::RelativeSlash;
1339 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1340 appendToASCIIBuffer('?');
1343 state = State::UTF8Query;
1346 state = State::NonUTF8Query;
1350 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1351 appendToASCIIBuffer('#');
1352 state = State::Fragment;
1356 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1357 state = State::Path;
1361 case State::RelativeSlash:
1362 LOG_STATE("RelativeSlash");
1363 if (*c == '/' || *c == '\\') {
1365 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1366 appendToASCIIBuffer("://", 3);
1367 state = State::SpecialAuthorityIgnoreSlashes;
1369 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1370 appendToASCIIBuffer('/');
1371 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1372 state = State::Path;
1375 case State::SpecialAuthoritySlashes:
1376 LOG_STATE("SpecialAuthoritySlashes");
1377 if (LIKELY(*c == '/' || *c == '\\')) {
1378 if (UNLIKELY(*c == '\\'))
1380 appendToASCIIBuffer('/');
1382 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1383 if (UNLIKELY(*c == '\\'))
1386 appendToASCIIBuffer('/');
1389 appendToASCIIBuffer('/');
1393 appendToASCIIBuffer("//", 2);
1395 state = State::SpecialAuthorityIgnoreSlashes;
1397 case State::SpecialAuthorityIgnoreSlashes:
1398 LOG_STATE("SpecialAuthorityIgnoreSlashes");
1399 if (*c == '/' || *c == '\\') {
1403 m_url.m_userStart = currentPosition(c);
1404 state = State::AuthorityOrHost;
1405 authorityOrHostBegin = c;
1408 case State::AuthorityOrHost:
1410 LOG_STATE("AuthorityOrHost");
1413 auto findLastAt = c;
1414 while (!findLastAt.atEnd()) {
1415 URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1416 if (*findLastAt == '@')
1417 lastAt = findLastAt;
1418 bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1419 if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1423 parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1426 authorityOrHostBegin = c;
1427 state = State::Host;
1428 m_hostHasPercentOrNonASCII = false;
1431 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1432 if (isSlash || *c == '?' || *c == '#') {
1433 auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1434 if (iterator.atEnd()) {
1435 size_t position = currentPosition(c);
1436 ASSERT(m_url.m_userStart == position);
1437 RELEASE_ASSERT(position >= 2);
1439 ASSERT(parsedDataView(position, 2) == "//");
1440 m_url.m_userStart = position;
1441 m_url.m_userEnd = position;
1442 m_url.m_passwordEnd = position;
1443 m_url.m_hostEnd = position;
1444 m_url.m_portEnd = position;
1445 m_url.m_pathAfterLastSlash = position + 2;
1447 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1448 m_url.m_passwordEnd = m_url.m_userEnd;
1449 if (!parseHostAndPort(iterator)) {
1453 if (UNLIKELY(!isSlash)) {
1455 appendToASCIIBuffer('/');
1456 m_url.m_pathAfterLastSlash = currentPosition(c);
1459 state = State::Path;
1462 if (isPercentOrNonASCII(*c))
1463 m_hostHasPercentOrNonASCII = true;
1465 } while (!c.atEnd());
1470 if (*c == '/' || *c == '?' || *c == '#') {
1471 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1475 if (*c == '?' || *c == '#') {
1477 appendToASCIIBuffer('/');
1478 m_url.m_pathAfterLastSlash = currentPosition(c);
1480 state = State::Path;
1483 if (isPercentOrNonASCII(*c))
1484 m_hostHasPercentOrNonASCII = true;
1486 } while (!c.atEnd());
1495 appendToASCIIBuffer('/');
1496 state = State::FileSlash;
1501 if (base.isValid() && base.protocolIs("file")) {
1502 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1503 appendToASCIIBuffer('?');
1506 appendToASCIIBuffer("///?", 4);
1508 m_url.m_userStart = currentPosition(c) - 2;
1509 m_url.m_userEnd = m_url.m_userStart;
1510 m_url.m_passwordEnd = m_url.m_userStart;
1511 m_url.m_hostEnd = m_url.m_userStart;
1512 m_url.m_portEnd = m_url.m_userStart;
1513 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1514 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1517 state = State::UTF8Query;
1520 state = State::NonUTF8Query;
1525 if (base.isValid() && base.protocolIs("file")) {
1526 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1527 appendToASCIIBuffer('#');
1529 appendToASCIIBuffer("///#", 4);
1530 m_url.m_userStart = currentPosition(c) - 2;
1531 m_url.m_userEnd = m_url.m_userStart;
1532 m_url.m_passwordEnd = m_url.m_userStart;
1533 m_url.m_hostEnd = m_url.m_userStart;
1534 m_url.m_portEnd = m_url.m_userStart;
1535 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1536 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1537 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1539 state = State::Fragment;
1544 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1545 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1547 appendToASCIIBuffer("///", 3);
1548 m_url.m_userStart = currentPosition(c) - 1;
1549 m_url.m_userEnd = m_url.m_userStart;
1550 m_url.m_passwordEnd = m_url.m_userStart;
1551 m_url.m_hostEnd = m_url.m_userStart;
1552 m_url.m_portEnd = m_url.m_userStart;
1553 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1554 if (isWindowsDriveLetter(c))
1555 appendWindowsDriveLetter(c);
1557 state = State::Path;
1561 case State::FileSlash:
1562 LOG_STATE("FileSlash");
1563 if (LIKELY(*c == '/' || *c == '\\')) {
1564 if (UNLIKELY(*c == '\\'))
1566 appendToASCIIBuffer('/');
1568 m_url.m_userStart = currentPosition(c);
1569 m_url.m_userEnd = m_url.m_userStart;
1570 m_url.m_passwordEnd = m_url.m_userStart;
1571 m_url.m_hostEnd = m_url.m_userStart;
1572 m_url.m_portEnd = m_url.m_userStart;
1573 authorityOrHostBegin = c;
1574 state = State::FileHost;
1577 if (base.isValid() && base.protocolIs("file")) {
1578 // FIXME: This String copy is unnecessary.
1579 String basePath = base.path();
1580 if (basePath.length() >= 2) {
1581 bool windowsQuirk = basePath.is8Bit()
1582 ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1583 : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1585 appendToASCIIBuffer(basePath[0]);
1586 appendToASCIIBuffer(basePath[1]);
1591 appendToASCIIBuffer("//", 2);
1592 m_url.m_userStart = currentPosition(c) - 1;
1593 m_url.m_userEnd = m_url.m_userStart;
1594 m_url.m_passwordEnd = m_url.m_userStart;
1595 m_url.m_hostEnd = m_url.m_userStart;
1596 m_url.m_portEnd = m_url.m_userStart;
1597 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1598 if (isWindowsDriveLetter(c))
1599 appendWindowsDriveLetter(c);
1600 state = State::Path;
1602 case State::FileHost:
1604 LOG_STATE("FileHost");
1605 if (isSlashQuestionOrHash(*c)) {
1606 bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1607 && isWindowsDriveLetter(authorityOrHostBegin);
1609 syntaxViolation(authorityOrHostBegin);
1610 appendToASCIIBuffer('/');
1611 appendWindowsDriveLetter(authorityOrHostBegin);
1613 if (windowsQuirk || authorityOrHostBegin == c) {
1614 ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1615 if (UNLIKELY(*c == '?')) {
1617 appendToASCIIBuffer("/?", 2);
1620 state = State::UTF8Query;
1623 state = State::NonUTF8Query;
1625 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1626 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1629 if (UNLIKELY(*c == '#')) {
1631 appendToASCIIBuffer("/#", 2);
1633 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1634 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1635 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1636 state = State::Fragment;
1639 state = State::Path;
1642 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1646 if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1648 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1649 m_url.m_hostEnd = currentPosition(c);
1650 m_url.m_portEnd = m_url.m_hostEnd;
1653 state = State::PathStart;
1656 if (isPercentOrNonASCII(*c))
1657 m_hostHasPercentOrNonASCII = true;
1659 } while (!c.atEnd());
1661 case State::PathStart:
1662 LOG_STATE("PathStart");
1663 if (*c != '/' && *c != '\\')
1665 state = State::Path;
1669 if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1670 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1672 appendToASCIIBuffer('/');
1674 m_url.m_pathAfterLastSlash = currentPosition(c);
1677 if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1678 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1680 consumeDoubleDotPathSegment(c);
1684 if (UNLIKELY(isSingleDotPathSegment(c))) {
1686 consumeSingleDotPathSegment(c);
1691 m_url.m_pathEnd = currentPosition(c);
1692 appendToASCIIBuffer('?');
1695 state = State::UTF8Query;
1698 state = State::NonUTF8Query;
1703 m_url.m_pathEnd = currentPosition(c);
1704 m_url.m_queryEnd = m_url.m_pathEnd;
1705 state = State::Fragment;
1708 utf8PercentEncode<isInDefaultEncodeSet>(c);
1711 case State::CannotBeABaseURLPath:
1712 LOG_STATE("CannotBeABaseURLPath");
1714 m_url.m_pathEnd = currentPosition(c);
1715 appendToASCIIBuffer('?');
1718 state = State::UTF8Query;
1721 state = State::NonUTF8Query;
1723 } else if (*c == '#') {
1724 m_url.m_pathEnd = currentPosition(c);
1725 m_url.m_queryEnd = m_url.m_pathEnd;
1726 state = State::Fragment;
1727 } else if (*c == '/') {
1728 appendToASCIIBuffer('/');
1730 m_url.m_pathAfterLastSlash = currentPosition(c);
1732 utf8PercentEncode<isInSimpleEncodeSet>(c);
1736 case State::UTF8Query:
1737 LOG_STATE("UTF8Query");
1738 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1740 m_url.m_queryEnd = currentPosition(c);
1741 state = State::Fragment;
1747 appendCodePoint(queryBuffer, *c);
1750 case State::NonUTF8Query:
1752 LOG_STATE("NonUTF8Query");
1753 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1755 encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1756 m_url.m_queryEnd = currentPosition(c);
1757 state = State::Fragment;
1760 appendCodePoint(queryBuffer, *c);
1761 advance(c, queryBegin);
1762 } while (!c.atEnd());
1764 case State::Fragment:
1765 URL_PARSER_LOG("State Fragment");
1766 utf8PercentEncode<isInSimpleEncodeSet>(c);
1773 case State::SchemeStart:
1774 LOG_FINAL_STATE("SchemeStart");
1775 if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1777 m_url.removeFragmentIdentifier();
1783 LOG_FINAL_STATE("Scheme");
1786 case State::NoScheme:
1787 LOG_FINAL_STATE("NoScheme");
1788 RELEASE_ASSERT_NOT_REACHED();
1789 case State::SpecialRelativeOrAuthority:
1790 LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1791 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1792 m_url.m_fragmentEnd = m_url.m_queryEnd;
1794 case State::PathOrAuthority:
1795 LOG_FINAL_STATE("PathOrAuthority");
1796 ASSERT(m_url.m_userStart);
1797 ASSERT(m_url.m_userStart == currentPosition(c));
1798 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1799 m_url.m_userStart--;
1800 m_url.m_userEnd = m_url.m_userStart;
1801 m_url.m_passwordEnd = m_url.m_userStart;
1802 m_url.m_hostEnd = m_url.m_userStart;
1803 m_url.m_portEnd = m_url.m_userStart;
1804 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1805 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1806 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1807 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1809 case State::Relative:
1810 LOG_FINAL_STATE("Relative");
1811 RELEASE_ASSERT_NOT_REACHED();
1812 case State::RelativeSlash:
1813 LOG_FINAL_STATE("RelativeSlash");
1814 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1815 appendToASCIIBuffer('/');
1816 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1817 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1818 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1819 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1821 case State::SpecialAuthoritySlashes:
1822 LOG_FINAL_STATE("SpecialAuthoritySlashes");
1823 m_url.m_userStart = currentPosition(c);
1824 m_url.m_userEnd = m_url.m_userStart;
1825 m_url.m_passwordEnd = m_url.m_userStart;
1826 m_url.m_hostEnd = m_url.m_userStart;
1827 m_url.m_portEnd = m_url.m_userStart;
1828 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1829 m_url.m_pathEnd = m_url.m_userStart;
1830 m_url.m_queryEnd = m_url.m_userStart;
1831 m_url.m_fragmentEnd = m_url.m_userStart;
1833 case State::SpecialAuthorityIgnoreSlashes:
1834 LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1838 case State::AuthorityOrHost:
1839 LOG_FINAL_STATE("AuthorityOrHost");
1840 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1841 m_url.m_passwordEnd = m_url.m_userEnd;
1842 if (authorityOrHostBegin.atEnd()) {
1843 RELEASE_ASSERT(m_url.m_userStart >= 2);
1844 ASSERT(parsedDataView(m_url.m_userStart - 2, 2) == "//");
1845 m_url.m_userStart -= 2;
1846 m_url.m_userEnd = m_url.m_userStart;
1847 m_url.m_passwordEnd = m_url.m_userStart;
1848 m_url.m_hostEnd = m_url.m_userStart;
1849 m_url.m_portEnd = m_url.m_userStart;
1850 m_url.m_pathEnd = m_url.m_userStart + 2;
1851 } else if (!parseHostAndPort(authorityOrHostBegin)) {
1855 if (m_urlIsSpecial) {
1857 appendToASCIIBuffer('/');
1858 m_url.m_pathEnd = m_url.m_portEnd + 1;
1860 m_url.m_pathEnd = m_url.m_portEnd;
1862 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1863 m_url.m_queryEnd = m_url.m_pathEnd;
1864 m_url.m_fragmentEnd = m_url.m_pathEnd;
1867 LOG_FINAL_STATE("Host");
1868 if (!parseHostAndPort(authorityOrHostBegin)) {
1872 if (m_urlIsSpecial) {
1874 appendToASCIIBuffer('/');
1875 m_url.m_pathEnd = m_url.m_portEnd + 1;
1877 m_url.m_pathEnd = m_url.m_portEnd;
1878 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1879 m_url.m_queryEnd = m_url.m_pathEnd;
1880 m_url.m_fragmentEnd = m_url.m_pathEnd;
1883 LOG_FINAL_STATE("File");
1884 if (base.isValid() && base.protocolIs("file")) {
1885 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1886 appendToASCIIBuffer(':');
1889 appendToASCIIBuffer("///", 3);
1890 m_url.m_userStart = currentPosition(c) - 1;
1891 m_url.m_userEnd = m_url.m_userStart;
1892 m_url.m_passwordEnd = m_url.m_userStart;
1893 m_url.m_hostEnd = m_url.m_userStart;
1894 m_url.m_portEnd = m_url.m_userStart;
1895 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1896 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1897 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1898 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1900 case State::FileSlash:
1901 LOG_FINAL_STATE("FileSlash");
1903 m_url.m_userStart = currentPosition(c) + 1;
1904 appendToASCIIBuffer("//", 2);
1905 m_url.m_userEnd = m_url.m_userStart;
1906 m_url.m_passwordEnd = m_url.m_userStart;
1907 m_url.m_hostEnd = m_url.m_userStart;
1908 m_url.m_portEnd = m_url.m_userStart;
1909 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1910 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1911 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1912 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1914 case State::FileHost:
1915 LOG_FINAL_STATE("FileHost");
1916 if (authorityOrHostBegin == c) {
1918 appendToASCIIBuffer('/');
1919 m_url.m_userStart = currentPosition(c) - 1;
1920 m_url.m_userEnd = m_url.m_userStart;
1921 m_url.m_passwordEnd = m_url.m_userStart;
1922 m_url.m_hostEnd = m_url.m_userStart;
1923 m_url.m_portEnd = m_url.m_userStart;
1924 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1925 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1926 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1927 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1931 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1937 if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1938 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1939 m_url.m_hostEnd = currentPosition(c);
1940 m_url.m_portEnd = m_url.m_hostEnd;
1942 appendToASCIIBuffer('/');
1943 m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
1944 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1945 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1946 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1948 case State::PathStart:
1949 LOG_FINAL_STATE("PathStart");
1950 RELEASE_ASSERT_NOT_REACHED();
1952 LOG_FINAL_STATE("Path");
1953 m_url.m_pathEnd = currentPosition(c);
1954 m_url.m_queryEnd = m_url.m_pathEnd;
1955 m_url.m_fragmentEnd = m_url.m_pathEnd;
1957 case State::CannotBeABaseURLPath:
1958 LOG_FINAL_STATE("CannotBeABaseURLPath");
1959 m_url.m_pathEnd = currentPosition(c);
1960 m_url.m_queryEnd = m_url.m_pathEnd;
1961 m_url.m_fragmentEnd = m_url.m_pathEnd;
1963 case State::UTF8Query:
1964 LOG_FINAL_STATE("UTF8Query");
1965 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1966 m_url.m_queryEnd = currentPosition(c);
1967 m_url.m_fragmentEnd = m_url.m_queryEnd;
1969 case State::NonUTF8Query:
1970 LOG_FINAL_STATE("NonUTF8Query");
1971 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1972 encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1973 m_url.m_queryEnd = currentPosition(c);
1974 m_url.m_fragmentEnd = m_url.m_queryEnd;
1976 case State::Fragment:
1977 LOG_FINAL_STATE("Fragment");
1978 m_url.m_fragmentEnd = currentPosition(c);
1982 if (LIKELY(!m_didSeeSyntaxViolation)) {
1983 m_url.m_string = m_inputString;
1984 ASSERT(m_asciiBuffer.isEmpty());
1986 m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1987 m_url.m_isValid = true;
1988 URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
// Parses the userinfo range given by `iterator` (user name, optionally
// followed by ':' and a password). Records m_url.m_userEnd and
// m_url.m_passwordEnd, and appends the percent-encoded credentials plus the
// '@' separator to the ASCII buffer.
1991 template<typename CharacterType>
1992 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
// Empty userinfo: flag a syntax violation and make both the user and the
// password end at the current position.
1994 if (UNLIKELY(iterator.atEnd())) {
1995 syntaxViolation(iterator);
1996 m_url.m_userEnd = currentPosition(iterator);
1997 m_url.m_passwordEnd = m_url.m_userEnd;
2000 for (; !iterator.atEnd(); advance(iterator)) {
// A ':' marks the end of the user name.
2001 if (*iterator == ':') {
2002 m_url.m_userEnd = currentPosition(iterator);
// Keep the colon's position so later syntax violations are reported there.
2003 auto iteratorAtColon = iterator;
// Note whether tabs/newlines directly follow the colon.
// NOTE(review): the statement that steps past them is not visible here.
2005 bool tabOrNewlineAfterColon = false;
2006 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2007 tabOrNewlineAfterColon = true;
// Nothing follows the colon: the password is empty, and '@' is appended only
// when the user name is non-empty.
2010 if (UNLIKELY(iterator.atEnd())) {
2011 syntaxViolation(iteratorAtColon);
2012 m_url.m_passwordEnd = m_url.m_userEnd;
2013 if (m_url.m_userEnd > m_url.m_userStart)
2014 appendToASCIIBuffer('@');
2017 if (tabOrNewlineAfterColon)
2018 syntaxViolation(iteratorAtColon);
2019 appendToASCIIBuffer(':');
// User-name code points are percent-encoded with the userinfo encode set.
2022 utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
// The remaining code points are encoded with the same userinfo encode set.
2024 for (; !iterator.atEnd(); advance(iterator))
2025 utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2026 m_url.m_passwordEnd = currentPosition(iterator);
// If no user end was recorded, the user and password ends coincide.
2027 if (!m_url.m_userEnd)
2028 m_url.m_userEnd = m_url.m_passwordEnd;
2029 appendToASCIIBuffer('@');
// Appends the decimal digits of `number` to the ASCII buffer.
2032 template<typename UnsignedIntegerType>
2033 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
// Scratch space: three characters per byte of the integer type, plus one.
2035 LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2036 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
// Digits are written backwards through `p`, least significant first.
// NOTE(review): the declaration of `p` and the enclosing loop are not visible here.
2039 *--p = (number % 10) + '0';
// Emit the digits that were written, i.e. the range [p, end).
2042 appendToASCIIBuffer(p, end - p);
// Appends `address` to the ASCII buffer as four '.'-separated decimal
// numbers, most significant byte first.
2045 void URLParser::serializeIPv4(IPv4Address address)
// The explicit uint8_t template argument narrows each shifted value to its
// low byte.
2047 appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2048 appendToASCIIBuffer('.');
2049 appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2050 appendToASCIIBuffer('.');
2051 appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2052 appendToASCIIBuffer('.');
2053 appendNumberToASCIIBuffer<uint8_t>(address);
// Helper called by findLongestZeroSequence for each index of `address`.
// NOTE(review): most of the body is not visible here; presumably it returns
// the number of consecutive zero pieces starting at `begin` — confirm.
2056 static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
// The scan never goes past the eighth (last) piece.
2059 for (; end < 8; end++) {
// Looks for the longest zero sequence among the eight 16-bit pieces of
// `address`, using zeroSequenceLength at every index.
// NOTE(review): the assignment to `longest` and the return statement are not
// visible here; presumably the result is the start index of that sequence.
2066 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2068 Optional<size_t> longest;
2069 size_t longestLength = 0;
2070 for (size_t i = 0; i < 8; i++) {
2071 size_t length = zeroSequenceLength(address, i);
// Only sequences longer than one piece qualify, and a later sequence must be
// strictly longer to replace the current candidate.
2073 if (length > 1 && (!longest || longestLength < length)) {
2075 longestLength = length;
// Appends one 16-bit IPv6 piece to the ASCII buffer as lowercase hex digits.
// Higher nibbles are written only when non-zero or when `printed` is set;
// the lowest nibble is always written.
2083 void URLParser::serializeIPv6Piece(uint16_t piece)
// NOTE(review): the statements that set `printed` to true are not visible here.
2085 bool printed = false;
2086 if (auto nibble0 = piece >> 12) {
2087 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2090 auto nibble1 = piece >> 8 & 0xF;
2091 if (printed || nibble1) {
2092 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2095 auto nibble2 = piece >> 4 & 0xF;
2096 if (printed || nibble2)
2097 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
// Unconditional, so a zero piece still produces a digit.
2098 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
// Appends `address` to the ASCII buffer in bracketed form, '[' ... ']', with
// pieces written by serializeIPv6Piece and separated by ':'.
2101 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2103 appendToASCIIBuffer('[');
// Index reported by findLongestZeroSequence, if any; used as the compression point.
2104 auto compressPointer = findLongestZeroSequence(address);
2105 for (size_t piece = 0; piece < 8; piece++) {
2106 if (compressPointer && compressPointer.value() == piece) {
2107 ASSERT(!address[piece]);
// NOTE(review): the condition choosing between ':' and "::" is not visible here.
2109 appendToASCIIBuffer(':');
2111 appendToASCIIBuffer("::", 2);
// Walk over the zero pieces covered by the compression.
2112 while (piece < 8 && !address[piece])
2117 serializeIPv6Piece(address[piece]);
2119 appendToASCIIBuffer(':');
2121 appendToASCIIBuffer(']');
// Parses one component of an IPv4 host starting at `iterator` and returns its
// numeric value. `didSeeSyntaxViolation` is set to true when a tab/newline is
// encountered or when the OctalOrHex state is reached.
// NOTE(review): the failure returns are not visible here; presumably an empty
// Optional is returned on an invalid digit or on overflow.
2124 template<typename CharacterType>
2125 Optional<uint32_t> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
// Tracks which numeric base the piece is being read in.
2127 enum class State : uint8_t {
2134 State state = State::UnknownBase;
// Checked arithmetic records overflow so it can be tested with hasOverflowed().
2135 Checked<uint32_t, RecordOverflow> value = 0;
2136 if (!iterator.atEnd() && *iterator == '.')
2138 while (!iterator.atEnd()) {
2139 if (isTabOrNewline(*iterator)) {
2140 didSeeSyntaxViolation = true;
// A '.' ends this piece; the accumulated value is returned.
2144 if (*iterator == '.') {
2145 ASSERT(!value.hasOverflowed());
2146 return value.unsafeGet();
2149 case State::UnknownBase:
// A leading '0' means the base is still to be decided; anything else is
// treated as decimal.
2150 if (UNLIKELY(*iterator == '0')) {
2152 state = State::OctalOrHex;
2155 state = State::Decimal;
2157 case State::OctalOrHex:
2158 didSeeSyntaxViolation = true;
2159 if (*iterator == 'x' || *iterator == 'X') {
2164 state = State::Octal;
2166 case State::Decimal:
2167 if (*iterator < '0' || *iterator > '9')
// NOTE(review): the scaling of `value` by the base is not visible in any of
// the three digit branches.
2170 value += *iterator - '0';
2171 if (UNLIKELY(value.hasOverflowed()))
2176 ASSERT(didSeeSyntaxViolation);
// Digits '0' through '7' only.
2177 if (*iterator < '0' || *iterator > '7')
2180 value += *iterator - '0';
2181 if (UNLIKELY(value.hasOverflowed()))
2186 ASSERT(didSeeSyntaxViolation);
2187 if (!isASCIIHexDigit(*iterator))
2190 value += toASCIIHexValue(*iterator);
2191 if (UNLIKELY(value.hasOverflowed()))
// Input ended without a trailing '.'; return what was accumulated.
2197 ASSERT(!value.hasOverflowed());
2198 return value.unsafeGet();
// Returns 256 raised to `exponent` by table lookup, for exponents 0 through 4.
2201 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
// Release-mode check: the lookup table below has exactly five entries.
2203 RELEASE_ASSERT(exponent <= 4);
// The last entry uses 256ull so that 256^4 is computed in 64-bit arithmetic.
2204 uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2205 return values[exponent];
// Tries to interpret the range given by `iterator` as an IPv4 address made of
// one to four numeric items, and combines them into a single IPv4Address.
// NOTE(review): the failure returns and the final return are not visible
// here; presumably an empty Optional is returned when the host is not IPv4.
2208 template<typename CharacterType>
2209 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
// Remembered so syntax violations are reported at the start of the host.
2211 auto hostBegin = iterator;
2213 Vector<uint32_t, 4> items;
2214 items.reserveInitialCapacity(4);
2215 bool didSeeSyntaxViolation = false;
2216 while (!iterator.atEnd()) {
2217 if (isTabOrNewline(*iterator)) {
2218 didSeeSyntaxViolation = true;
// Checked before parsing another item, so at most four are collected.
2222 if (items.size() >= 4)
2224 if (auto item = parseIPv4Piece(iterator, didSeeSyntaxViolation))
2225 items.append(item.value());
2228 if (!iterator.atEnd()) {
2229 if (items.size() >= 4)
2231 if (*iterator == '.')
// Checks for leftover input, no items at all, or too many items.
2237 if (!iterator.atEnd() || !items.size() || items.size() > 4)
// All items except the last are checked in this loop.
// NOTE(review): the check itself is not visible here.
2239 if (items.size() > 1) {
2240 for (size_t i = 0; i < items.size() - 1; i++) {
// The last item stands for all bytes not covered by earlier items, so its
// limit is 256^(5 - item count).
2245 if (items[items.size() - 1] >= pow256(5 - items.size()))
2248 if (didSeeSyntaxViolation)
2249 syntaxViolation(hostBegin);
2250 for (auto item : items) {
2252 syntaxViolation(hostBegin);
// Fewer than four items is reported as a syntax violation.
2255 if (UNLIKELY(items.size() != 4))
2256 syntaxViolation(hostBegin);
// The last item supplies the low-order part; each earlier item is scaled by
// 256^(3 - index).
2258 IPv4Address ipv4 = items.takeLast();
2259 for (size_t counter = 0; counter < items.size(); ++counter)
2260 ipv4 += items[counter] * pow256(3 - counter);
// Parses one decimal component of an IPv4 address embedded in an IPv6 host,
// advancing `iterator` without reporting syntax violations.
// NOTE(review): the declaration of `piece`, the range checks and the return
// statements are not visible here.
2264 template<typename CharacterType>
2265 Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2267 if (iterator.atEnd())
2270 bool leadingZeros = false;
2271 size_t digitCount = 0;
2272 while (!iterator.atEnd()) {
2273 if (!isASCIIDigit(*iterator))
// A '0' seen while the accumulated value is still zero is a leading zero.
2276 if (!piece && *iterator == '0') {
2279 leadingZeros = true;
2281 if (!piece && *iterator == '0')
2282 leadingZeros = true;
// Accumulate the next decimal digit.
2283 piece = piece * 10 + *iterator - '0';
2286 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2287 if (iterator.atEnd())
2289 if (*iterator == '.')
// Detects a non-zero value that was written with leading zeros.
2292 if (piece && leadingZeros)
// Parses four '.'-separated pieces via parseIPv4PieceInsideIPv6 and packs them
// into one IPv4Address, first piece in the most significant byte.
// NOTE(review): the failure returns and the final return are not visible here.
2297 template<typename CharacterType>
2298 Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2300 IPv4Address address = 0;
2301 for (size_t i = 0; i < 4; ++i) {
2302 if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
// Shift the previous pieces up one byte and add the new one.
2303 address = (address << 8) + piece.value();
2307 if (iterator.atEnd())
// A '.' is expected here before the iterator moves on to the next piece.
2309 if (*iterator != '.')
2311 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2312 } else if (!iterator.atEnd())
// By this point the whole range has been consumed.
2315 ASSERT(iterator.atEnd());
// Parses the range `c` as an IPv6 address into eight 16-bit pieces, with
// support for a compression point and for an embedded IPv4 address.
// NOTE(review): many statements (the declaration of `hostBegin`, the ':'
// checks, the failure returns and the final return) are not visible here.
2319 template<typename CharacterType>
2320 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2324 advance(c, hostBegin);
2328 IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
// Index of the next piece to fill.
2329 size_t piecePointer = 0;
// Piece index at which a compression was seen, if any.
2330 Optional<size_t> compressPointer;
2333 advance(c, hostBegin);
2338 advance(c, hostBegin);
2340 compressPointer = piecePointer;
2343 while (!c.atEnd()) {
2344 if (piecePointer == 8)
2347 if (compressPointer)
2349 advance(c, hostBegin);
2351 compressPointer = piecePointer;
// An embedded IPv4 address is tried when six pieces are filled, or when a
// compression was seen and fewer than six pieces are filled.
2354 if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2355 if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2356 if (compressPointer && piecePointer == 5)
2358 syntaxViolation(hostBegin);
// The 32-bit IPv4 value fills two pieces: high 16 bits, then low 16 bits.
2359 address[piecePointer++] = ipv4Address.value() >> 16;
2360 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2367 bool leadingZeros = false;
// Reads up to four hex digits for one piece.
2368 for (; length < 4; length++) {
2371 if (!isASCIIHexDigit(*c))
// Uppercase hex digits are reported as a syntax violation.
2373 if (isASCIIUpper(*c))
2374 syntaxViolation(hostBegin);
2375 if (*c == '0' && !length)
2376 leadingZeros = true;
2377 value = value * 0x10 + toASCIIHexValue(*c);
2378 advance(c, hostBegin);
// A non-zero value with a leading zero, or a zero written with more than one
// digit, is reported as a syntax violation.
2381 if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2382 syntaxViolation(hostBegin);
2384 address[piecePointer++] = value;
2387 if (piecePointer == 8 || *c != ':')
2389 advance(c, hostBegin);
// With a compression point, the pieces parsed after it are swapped towards
// the end of the address.
// NOTE(review): the loop around the swap and the reset of piecePointer are
// not visible here.
2395 if (compressPointer) {
2396 size_t swaps = piecePointer - compressPointer.value();
2399 std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2400 } else if (piecePointer != 8)
// Compare the compression point from the input with the one derived from
// findLongestZeroSequence; a mismatch is a syntax violation.
2403 Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2404 if (possibleCompressPointer)
2405 possibleCompressPointer.value()++;
2406 if (UNLIKELY(compressPointer != possibleCompressPointer))
2407 syntaxViolation(hostBegin);
// Percent-decodes `length` bytes of `input` into a new vector. A syntax
// violation is reported at `iteratorForSyntaxViolationPosition` for each
// escape sequence that is decoded.
// NOTE(review): the '%' test, the index skip after a decoded escape and the
// return statement are not visible here.
2412 template<typename CharacterType>
2413 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2415 Vector<LChar, defaultInlineBufferSize> output;
// Capacity for `length` bytes is reserved up front; the uncheckedAppend calls
// below rely on it.
2416 output.reserveInitialCapacity(length);
2418 for (size_t i = 0; i < length; ++i) {
2419 uint8_t byte = input[i];
2421 output.uncheckedAppend(byte);
// Two more bytes must follow; `length > 2` keeps the unsigned `length - 2`
// from wrapping around.
2422 else if (length > 2 && i < length - 2) {
2423 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2424 syntaxViolation(iteratorForSyntaxViolationPosition);
// The two hex digits become one decoded byte.
2425 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
// When the sequence is not a valid escape, the byte is copied unchanged.
2428 output.uncheckedAppend(byte);
2430 output.uncheckedAppend(byte);
// Percent-decodes `length` bytes of `input` into a new vector. Unlike the
// overload taking an iterator, this one makes no syntaxViolation call.
// NOTE(review): the '%' test, the index skip after a decoded escape and the
// return statement are not visible here.
2435 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2437 Vector<LChar, defaultInlineBufferSize> output;
// Capacity for `length` bytes is reserved up front; the uncheckedAppend calls
// below rely on it.
2438 output.reserveInitialCapacity(length);
2440 for (size_t i = 0; i < length; ++i) {
2441 uint8_t byte = input[i];
2443 output.uncheckedAppend(byte);
// Two more bytes must follow; `length > 2` keeps the unsigned `length - 2`
// from wrapping around.
2444 else if (length > 2 && i < length - 2) {
2445 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
// The two hex digits become one decoded byte.
2446 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
// When the sequence is not a valid escape, the byte is copied unchanged.
2449 output.uncheckedAppend(byte);
2451 output.uncheckedAppend(byte);
// Returns whether every character of `string` is ASCII, picking the 8-bit or
// 16-bit character buffer according to how the string is stored.
2456 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2458 if (string.is8Bit())
2459 return charactersAreAllASCII(string.characters8(), string.length());
2460 return charactersAreAllASCII(string.characters16(), string.length());
// Converts `domain` to a lowercase ASCII form. All-ASCII domains are
// lowercased directly; other domains go through ICU's uidna_IDNToASCII.
// Syntax violations are reported at `iteratorForSyntaxViolationPosition`.
// NOTE(review): the return statements are not visible here; presumably an
// empty Optional is returned when the ICU conversion fails.
2463 template<typename CharacterType>
2464 Optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2466 Vector<LChar, defaultInlineBufferSize> ascii;
// All-ASCII input: lowercase each character, without calling ICU.
2467 if (containsOnlyASCII(domain)) {
2468 size_t length = domain.length();
2469 if (domain.is8Bit()) {
2470 const LChar* characters = domain.characters8();
2471 ascii.reserveInitialCapacity(length);
2472 for (size_t i = 0; i < length; ++i) {
// Uppercase input is reported because the output is lowercased.
2473 if (UNLIKELY(isASCIIUpper(characters[i])))
2474 syntaxViolation(iteratorForSyntaxViolationPosition);
2475 ascii.uncheckedAppend(toASCIILower(characters[i]));
// Same handling for a 16-bit string.
2478 const UChar* characters = domain.characters16();
2479 ascii.reserveInitialCapacity(length);
2480 for (size_t i = 0; i < length; ++i) {
2481 if (UNLIKELY(isASCIIUpper(characters[i])))
2482 syntaxViolation(iteratorForSyntaxViolationPosition);
2483 ascii.uncheckedAppend(toASCIILower(characters[i]));
// Non-ASCII input: convert with ICU into a fixed-size buffer.
2489 UChar hostnameBuffer[defaultInlineBufferSize];
2490 UErrorCode error = U_ZERO_ERROR;
// Deprecation warnings are silenced around the uidna_IDNToASCII call.
2492 #if COMPILER(GCC) || COMPILER(CLANG)
2493 #pragma GCC diagnostic push
2494 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2496 // FIXME: This should use uidna_openUTS46 / uidna_close instead
2497 int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2498 #if COMPILER(GCC) || COMPILER(CLANG)
2499 #pragma GCC diagnostic pop
2501 ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
2503 if (error == U_ZERO_ERROR) {
// Debug check that the converted characters are ASCII and not uppercase.
2504 for (int32_t i = 0; i < numCharactersConverted; ++i) {
2505 ASSERT(isASCII(hostnameBuffer[i]));
2506 ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2508 ascii.append(hostnameBuffer, numCharactersConverted);
// A result that differs from the input is reported as a syntax violation.
2509 if (domain != StringView(ascii.data(), ascii.size()))
2510 syntaxViolation(iteratorForSyntaxViolationPosition);
2514 // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
// Scans every character of `asciiDomain` with isInvalidDomainCharacter.
// NOTE(review): the return statements are not visible here; presumably true
// is returned on the first invalid character and false otherwise.
2518 bool URLParser::hasInvalidDomainCharacter(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2520 for (size_t i = 0; i < asciiDomain.size(); ++i) {
2521 if (isInvalidDomainCharacter(asciiDomain[i]))
// Parses a port. `iterator` must point at the ':' that introduces it. Records
// m_url.m_portEnd and appends ':' followed by the port number to the ASCII
// buffer.
// NOTE(review): the declaration of `port`, the return statements and several
// branch boundaries are not visible here; the bool presumably reports success.
2527 template<typename CharacterType>
2528 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2530 ASSERT(*iterator == ':');
// Keep the colon's position so syntax violations are reported there.
2531 auto colonIterator = iterator;
2532 advance(iterator, colonIterator);
// Nothing after the ':': the port ends at the colon itself, and the empty
// port is reported as a syntax violation.
2534 if (UNLIKELY(iterator.atEnd())) {
2535 m_url.m_portEnd = currentPosition(colonIterator);
2536 syntaxViolation(colonIterator);
2539 size_t digitCount = 0;
2540 bool leadingZeros = false;
2541 for (; !iterator.atEnd(); ++iterator) {
// Tabs and newlines inside the port are reported as a syntax violation.
2542 if (UNLIKELY(isTabOrNewline(*iterator))) {
2543 syntaxViolation(colonIterator);
2546 if (isASCIIDigit(*iterator)) {
// A '0' as the very first digit counts as a leading zero.
2547 if (*iterator == '0' && !digitCount)
2548 leadingZeros = true;
2550 port = port * 10 + *iterator - '0';
// Checks the running value against the largest 16-bit port number.
2551 if (port > std::numeric_limits<uint16_t>::max())
// Non-zero port written with leading zeros.
2557 if (port && leadingZeros)
2558 syntaxViolation(colonIterator);
// Port zero written with more than one digit.
2560 if (!port && digitCount > 1)
2561 syntaxViolation(colonIterator);
// The scheme's default port is reported as a syntax violation.
// NOTE(review): presumably the append below is skipped in that case — confirm.
2563 if (UNLIKELY(isDefaultPortForProtocol(port, parsedDataView(0, m_url.m_schemeEnd))))
2564 syntaxViolation(colonIterator);
2566 appendToASCIIBuffer(':');
2567 ASSERT(port <= std::numeric_limits<uint16_t>::max());
2568 appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2571 m_url.m_portEnd = currentPosition(iterator);
2575 template<typename CharacterType>
2576 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2578 if (iterator.atEnd())
2580 if (*iterator == ':')
2582 if (*iterator == '[') {
2583 auto ipv6End = iterator;
2584 while (!ipv6End.atEnd() && *ipv6End != ']')
2586 if (ipv6End.atEnd())
2588 if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2589 serializeIPv6(address.value());
2590 if (!ipv6End.atEnd()) {
2592 if (!ipv6End.atEnd() && *ipv6End == ':') {
2593 m_url.m_hostEnd = currentPosition(ipv6End);
2594 return parsePort(ipv6End);
2596 m_url.m_hostEnd = currentPosition(ipv6End);
2597 m_url.m_portEnd = m_url.m_hostEnd;
2600 m_url.m_hostEnd = currentPosition(ipv6End);
2606 if (!m_urlIsSpecial) {
2607 for (; !iterator.atEnd(); ++iterator) {
2608 if (UNLIKELY(isTabOrNewline(*iterator))) {
2609 syntaxViolation(iterator);
2612 if (*iterator == ':')
2614 utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2616 m_url.m_hostEnd = currentPosition(iterator);
2617 if (iterator.atEnd()) {
2618 m_url.m_portEnd = currentPosition(iterator);
2621 return parsePort(iterator);
2624 if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2625 auto hostIterator = iterator;
2626 for (; !iterator.atEnd(); ++iterator) {
2627 if (isTabOrNewline(*iterator))
2629 if (*iterator == ':')
2631 if (isInvalidDomainCharacter(*iterator))
2634 if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2635 serializeIPv4(address.value());
2636 m_url.m_hostEnd = currentPosition(iterator);
2637 if (iterator.atEnd()) {
2638 m_url.m_portEnd = currentPosition(iterator);
2641 return parsePort(iterator);
2643 for (; hostIterator != iterator; ++hostIterator) {
2644 if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2645 syntaxViolation(hostIterator);
2648 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2649 syntaxViolation(hostIterator);
2650 appendToASCIIBuffer(toASCIILower(*hostIterator));
2652 m_url.m_hostEnd = currentPosition(iterator);
2653 if (!hostIterator.atEnd())
2654 return parsePort(hostIterator);
2655 m_url.m_portEnd = currentPosition(iterator);
2659 auto hostBegin = iterator;
2661 Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2662 for (; !iterator.atEnd(); ++iterator) {
2663 if (UNLIKELY(isTabOrNewline(*iterator))) {
2664 syntaxViolation(hostBegin);
2667 if (*iterator == ':')
2669 if (UNLIKELY(!isASCII(*iterator)))
2670 syntaxViolation(hostBegin);
2672 uint8_t buffer[U8_MAX_LENGTH];
2674 UBool error = false;
2675 U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2676 ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2677 // FIXME: Check error.
2678 utf8Encoded.append(buffer, offset);
2680 Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2681 String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2682 if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2683 syntaxViolation(hostBegin);
2684 auto asciiDomain = domainToASCII(domain, hostBegin);
2685 if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2687 Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2688 const LChar* asciiDomainCharacters = asciiDomainValue.data();
2690 if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2691 serializeIPv4(address.value());
2692 m_url.m_hostEnd = currentPosition(iterator);
2693 if (iterator.atEnd()) {
2694 m_url.m_portEnd = currentPosition(iterator);
2697 return parsePort(iterator);
2700 appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2701 m_url.m_hostEnd = currentPosition(iterator);
2702 if (!iterator.atEnd())
2703 return parsePort(iterator);
2704 m_url.m_portEnd = currentPosition(iterator);
2708 Optional<String> URLParser::formURLDecode(StringView input)
2710 auto utf8 = input.utf8(StrictConversion);
2713 auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2714 return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2717 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2719 Vector<StringView> sequences = input.split('&');
2721 URLEncodedForm output;
2722 for (auto& bytes : sequences) {
2723 auto valueStart = bytes.find('=');
2724 if (valueStart == notFound) {
2725 if (auto name = formURLDecode(bytes))
2726 output.append({name.value().replace('+', 0x20), emptyString()});
2728 auto name = formURLDecode(bytes.substring(0, valueStart));
2729 auto value = formURLDecode(bytes.substring(valueStart + 1));
2731 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2737 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2739 auto utf8 = input.utf8(StrictConversion);
2740 const char* data = utf8.data();
2741 for (size_t i = 0; i < utf8.length(); ++i) {
2742 const char byte = data[i];
2744 output.append(0x2B);
2745 else if (byte == 0x2A
2748 || (byte >= 0x30 && byte <= 0x39)
2749 || (byte >= 0x41 && byte <= 0x5A)
2751 || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2752 output.append(byte);
2754 percentEncodeByte(byte, output);
2758 String URLParser::serialize(const URLEncodedForm& tuples)
2760 Vector<LChar> output;
2761 for (auto& tuple : tuples) {
2762 if (!output.isEmpty())
2764 serializeURLEncodedForm(tuple.first, output);
2766 serializeURLEncodedForm(tuple.second, output);
2768 return String::adopt(WTFMove(output));
2771 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2773 // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2774 // but once we get rid of URL::parse its value should be tested.
2775 URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2777 a.m_protocolIsInHTTPFamily,
2784 a.m_pathAfterLastSlash,
2788 a.m_string.utf8().data(),
2790 b.m_protocolIsInHTTPFamily,
2797 b.m_pathAfterLastSlash,
2801 b.m_string.utf8().data());
2803 return a.m_string == b.m_string
2804 && a.m_isValid == b.m_isValid
2805 && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2806 && a.m_schemeEnd == b.m_schemeEnd
2807 && a.m_userStart == b.m_userStart
2808 && a.m_userEnd == b.m_userEnd
2809 && a.m_passwordEnd == b.m_passwordEnd
2810 && a.m_hostEnd == b.m_hostEnd
2811 && a.m_portEnd == b.m_portEnd
2812 && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2813 && a.m_pathEnd == b.m_pathEnd
2814 && a.m_queryEnd == b.m_queryEnd
2815 && a.m_fragmentEnd == b.m_fragmentEnd;
2818 bool URLParser::internalValuesConsistent(const URL& url)
2820 return url.m_schemeEnd <= url.m_userStart
2821 && url.m_userStart <= url.m_userEnd
2822 && url.m_userEnd <= url.m_passwordEnd
2823 && url.m_passwordEnd <= url.m_hostEnd
2824 && url.m_hostEnd <= url.m_portEnd
2825 && url.m_portEnd <= url.m_pathAfterLastSlash
2826 && url.m_pathAfterLastSlash <= url.m_pathEnd
2827 && url.m_pathEnd <= url.m_queryEnd
2828 && url.m_queryEnd <= url.m_fragmentEnd
2829 && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2830 // FIXME: Why do we even store m_fragmentEnd?
2831 // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2834 static bool urlParserEnabled = true;
2836 void URLParser::setEnabled(bool enabled)
2838 urlParserEnabled = enabled;
2841 bool URLParser::enabled()
2843 return urlParserEnabled;
2846 } // namespace WebCore