2 * Copyright (C) 2016 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLParser.h"
30 #include "RuntimeApplicationChecks.h"
32 #include <unicode/uidna.h>
33 #include <unicode/utypes.h>
37 #define URL_PARSER_DEBUGGING 0
39 #if URL_PARSER_DEBUGGING
40 #define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
42 #define URL_PARSER_LOG(...)
// Forward iterator over the Unicode code points stored in a [begin, end) range
// of CharacterType code units. operator* and operator++ are only declared in
// the class; their per-character-type definitions follow it.
45 template<typename CharacterType>
46 class CodePointIterator {
// Default construction leaves both pointers null (see the member initializers below).
48 ALWAYS_INLINE CodePointIterator() { }
49 ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
// Builds the sub-range that starts at begin's current position and stops at
// end's current position; end must not be behind begin.
55 ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56 : CodePointIterator(begin.m_begin, end.m_begin)
58 ASSERT(end.m_begin >= begin.m_begin);
// Read the current code point / step to the next one.
61 ALWAYS_INLINE UChar32 operator*() const;
62 ALWAYS_INLINE CodePointIterator& operator++();
// Two iterators compare equal only when both the position and the range end match.
64 ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66 return m_begin == other.m_begin
67 && m_end == other.m_end;
69 ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71 ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
73 m_begin = other.m_begin;
// True once the current position has reached the end of the range.
78 ALWAYS_INLINE bool atEnd() const
80 ASSERT(m_begin <= m_end);
81 return m_begin >= m_end;
// Number of code units between reference and the current position.
// reference must not be ahead of the current position.
84 ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
86 ASSERT(m_begin >= reference);
87 return m_begin - reference;
// Same measurement, taken from another iterator's current position.
90 ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
92 return codeUnitsSince(other.m_begin);
// m_begin is the current position; m_end is one past the last code unit.
96 const CharacterType* m_begin { nullptr };
97 const CharacterType* m_end { nullptr };
101 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
108 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
// UTF-16 dereference: U16_GET decodes the code point at index 0 of m_begin,
// looking at no more than the m_end - m_begin code units that remain.
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
120 U16_GET(m_begin, 0, 0, m_end - m_begin, c);
// UTF-16 increment: U16_FWD_1 moves the index i forward by one code point
// without stepping past the remaining length of the range.
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
129 size_t length = m_end - m_begin;
130 U16_FWD_1(m_begin, i, length);
// Appends codePoint to destination as UTF-16: a single code unit for BMP code
// points, a lead/trail surrogate pair for the others.
135 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
137 if (U_IS_BMP(codePoint)) {
138 destination.append(static_cast<UChar>(codePoint));
// Room for both surrogates is reserved up front so the unchecked appends cannot overflow.
141 destination.reserveCapacity(destination.size() + 2);
142 destination.uncheckedAppend(U16_LEAD(codePoint));
143 destination.uncheckedAppend(U16_TRAIL(codePoint));
// Bit flags naming the URL character classes a byte can belong to. The
// entries of characterClassTable are built by OR-ing these flags together.
146 enum URLCharacterClass {
151 SlashQuestionOrHash = 0x10,
// Lookup table with one entry per byte value (0x00-0xFF). Each entry is the
// OR of the URLCharacterClass flags that apply to that byte; the trailing
// comment on each row names the byte. The classification predicates below
// index this table directly with the character value.
155 static const uint8_t characterClassTable[256] = {
156 UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
157 UserInfo | Default | QueryPercent, // 0x1
158 UserInfo | Default | QueryPercent, // 0x2
159 UserInfo | Default | QueryPercent, // 0x3
160 UserInfo | Default | QueryPercent, // 0x4
161 UserInfo | Default | QueryPercent, // 0x5
162 UserInfo | Default | QueryPercent, // 0x6
163 UserInfo | Default | QueryPercent, // 0x7
164 UserInfo | Default | QueryPercent, // 0x8
165 UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
166 UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
167 UserInfo | Default | QueryPercent, // 0xB
168 UserInfo | Default | QueryPercent, // 0xC
169 UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
170 UserInfo | Default | QueryPercent, // 0xE
171 UserInfo | Default | QueryPercent, // 0xF
172 UserInfo | Default | QueryPercent, // 0x10
173 UserInfo | Default | QueryPercent, // 0x11
174 UserInfo | Default | QueryPercent, // 0x12
175 UserInfo | Default | QueryPercent, // 0x13
176 UserInfo | Default | QueryPercent, // 0x14
177 UserInfo | Default | QueryPercent, // 0x15
178 UserInfo | Default | QueryPercent, // 0x16
179 UserInfo | Default | QueryPercent, // 0x17
180 UserInfo | Default | QueryPercent, // 0x18
181 UserInfo | Default | QueryPercent, // 0x19
182 UserInfo | Default | QueryPercent, // 0x1A
183 UserInfo | Default | QueryPercent, // 0x1B
184 UserInfo | Default | QueryPercent, // 0x1C
185 UserInfo | Default | QueryPercent, // 0x1D
186 UserInfo | Default | QueryPercent, // 0x1E
187 UserInfo | Default | QueryPercent, // 0x1F
188 UserInfo | Default | InvalidDomain | QueryPercent, // ' '
190 UserInfo | Default | QueryPercent, // '"'
191 UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
193 InvalidDomain, // '%'
203 UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
214 UserInfo | InvalidDomain, // ':'
216 UserInfo | Default | QueryPercent, // '<'
218 UserInfo | Default | QueryPercent, // '>'
219 UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
220 UserInfo | InvalidDomain, // '@'
247 UserInfo | InvalidDomain, // '['
248 UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
249 UserInfo | InvalidDomain, // ']'
252 UserInfo | Default, // '`'
279 UserInfo | Default, // '{'
281 UserInfo | Default, // '}'
283 QueryPercent, // 0x7F
284 QueryPercent, // 0x80
285 QueryPercent, // 0x81
286 QueryPercent, // 0x82
287 QueryPercent, // 0x83
288 QueryPercent, // 0x84
289 QueryPercent, // 0x85
290 QueryPercent, // 0x86
291 QueryPercent, // 0x87
292 QueryPercent, // 0x88
293 QueryPercent, // 0x89
294 QueryPercent, // 0x8A
295 QueryPercent, // 0x8B
296 QueryPercent, // 0x8C
297 QueryPercent, // 0x8D
298 QueryPercent, // 0x8E
299 QueryPercent, // 0x8F
300 QueryPercent, // 0x90
301 QueryPercent, // 0x91
302 QueryPercent, // 0x92
303 QueryPercent, // 0x93
304 QueryPercent, // 0x94
305 QueryPercent, // 0x95
306 QueryPercent, // 0x96
307 QueryPercent, // 0x97
308 QueryPercent, // 0x98
309 QueryPercent, // 0x99
310 QueryPercent, // 0x9A
311 QueryPercent, // 0x9B
312 QueryPercent, // 0x9C
313 QueryPercent, // 0x9D
314 QueryPercent, // 0x9E
315 QueryPercent, // 0x9F
316 QueryPercent, // 0xA0
317 QueryPercent, // 0xA1
318 QueryPercent, // 0xA2
319 QueryPercent, // 0xA3
320 QueryPercent, // 0xA4
321 QueryPercent, // 0xA5
322 QueryPercent, // 0xA6
323 QueryPercent, // 0xA7
324 QueryPercent, // 0xA8
325 QueryPercent, // 0xA9
326 QueryPercent, // 0xAA
327 QueryPercent, // 0xAB
328 QueryPercent, // 0xAC
329 QueryPercent, // 0xAD
330 QueryPercent, // 0xAE
331 QueryPercent, // 0xAF
332 QueryPercent, // 0xB0
333 QueryPercent, // 0xB1
334 QueryPercent, // 0xB2
335 QueryPercent, // 0xB3
336 QueryPercent, // 0xB4
337 QueryPercent, // 0xB5
338 QueryPercent, // 0xB6
339 QueryPercent, // 0xB7
340 QueryPercent, // 0xB8
341 QueryPercent, // 0xB9
342 QueryPercent, // 0xBA
343 QueryPercent, // 0xBB
344 QueryPercent, // 0xBC
345 QueryPercent, // 0xBD
346 QueryPercent, // 0xBE
347 QueryPercent, // 0xBF
348 QueryPercent, // 0xC0
349 QueryPercent, // 0xC1
350 QueryPercent, // 0xC2
351 QueryPercent, // 0xC3
352 QueryPercent, // 0xC4
353 QueryPercent, // 0xC5
354 QueryPercent, // 0xC6
355 QueryPercent, // 0xC7
356 QueryPercent, // 0xC8
357 QueryPercent, // 0xC9
358 QueryPercent, // 0xCA
359 QueryPercent, // 0xCB
360 QueryPercent, // 0xCC
361 QueryPercent, // 0xCD
362 QueryPercent, // 0xCE
363 QueryPercent, // 0xCF
364 QueryPercent, // 0xD0
365 QueryPercent, // 0xD1
366 QueryPercent, // 0xD2
367 QueryPercent, // 0xD3
368 QueryPercent, // 0xD4
369 QueryPercent, // 0xD5
370 QueryPercent, // 0xD6
371 QueryPercent, // 0xD7
372 QueryPercent, // 0xD8
373 QueryPercent, // 0xD9
374 QueryPercent, // 0xDA
375 QueryPercent, // 0xDB
376 QueryPercent, // 0xDC
377 QueryPercent, // 0xDD
378 QueryPercent, // 0xDE
379 QueryPercent, // 0xDF
380 QueryPercent, // 0xE0
381 QueryPercent, // 0xE1
382 QueryPercent, // 0xE2
383 QueryPercent, // 0xE3
384 QueryPercent, // 0xE4
385 QueryPercent, // 0xE5
386 QueryPercent, // 0xE6
387 QueryPercent, // 0xE7
388 QueryPercent, // 0xE8
389 QueryPercent, // 0xE9
390 QueryPercent, // 0xEA
391 QueryPercent, // 0xEB
392 QueryPercent, // 0xEC
393 QueryPercent, // 0xED
394 QueryPercent, // 0xEE
395 QueryPercent, // 0xEF
396 QueryPercent, // 0xF0
397 QueryPercent, // 0xF1
398 QueryPercent, // 0xF2
399 QueryPercent, // 0xF3
400 QueryPercent, // 0xF4
401 QueryPercent, // 0xF5
402 QueryPercent, // 0xF6
403 QueryPercent, // 0xF7
404 QueryPercent, // 0xF8
405 QueryPercent, // 0xF9
406 QueryPercent, // 0xFA
407 QueryPercent, // 0xFB
408 QueryPercent, // 0xFC
409 QueryPercent, // 0xFD
410 QueryPercent, // 0xFE
411 QueryPercent, // 0xFF
// Character classification helpers. The table-backed ones guard the index with
// a range comparison so that characterClassTable is only read for small values.

// True for values up to and including 0x1F.
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
// True for values up to and including 0x20 (space).
415 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
// True only for 0x9 (tab), 0xA (line feed) and 0xD (carriage return).
416 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
// True for anything above 0x7E and for C0 controls.
417 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
// True for anything above 0x7E and for bytes flagged Default in the table.
418 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
// True for anything above 0x7E and for bytes flagged UserInfo in the table.
419 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
// True for bytes flagged InvalidDomain; nothing above ']' is considered.
420 template<typename CharacterType> ALWAYS_INLINE static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
// True for '%' and for every non-ASCII character.
421 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
// True for bytes flagged SlashQuestionOrHash; nothing above '\\' is considered.
422 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
// True for bytes flagged ValidScheme; nothing above 'z' is considered.
423 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
// True for bytes flagged QueryPercent. Takes a uint8_t, so every value is a valid table index.
424 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
// Skips over tab and newline code points at the iterator's position. When the
// reportSyntaxViolation template argument is Yes, each skipped character is
// reported through syntaxViolation() at iteratorForSyntaxViolationPosition.
426 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
427 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
430 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
431 if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
432 syntaxViolation(iteratorForSyntaxViolationPosition);
// Looks ahead on a copy of the iterator (taken by value), so the caller's
// position is untouched. Both advance() calls run without reporting syntax
// violations; the final result is whether the end was reached after the second.
437 template<typename CharacterType>
438 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
440 if (iterator.atEnd())
442 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
443 if (iterator.atEnd())
445 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446 return iterator.atEnd();
// Inspects (on a copy of the iterator) whether the input starts with an ASCII
// letter followed by ':' or '|'. The '|' form is treated as the unlikely case.
449 template<typename CharacterType>
450 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
452 if (iterator.atEnd() || !isASCIIAlpha(*iterator))
454 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
455 if (iterator.atEnd())
457 if (*iterator == ':')
459 if (UNLIKELY(*iterator == '|'))
// Appends one ASCII code point to m_asciiBuffer, but only once a syntax
// violation has been seen; before that the buffer is left untouched.
464 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
466 ASSERT(isASCII(codePoint));
467 if (UNLIKELY(m_didSeeSyntaxViolation))
468 m_asciiBuffer.append(codePoint);
// Appends length characters to m_asciiBuffer, subject to the same
// "only after a syntax violation" rule as the single-code-point overload.
471 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
473 if (UNLIKELY(m_didSeeSyntaxViolation))
474 m_asciiBuffer.append(characters, length);
// Writes the drive letter at iterator followed by ':' to the ASCII buffer.
// The caller must already have established isWindowsDriveLetter(iterator).
477 template<typename CharacterType>
478 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
480 ASSERT(isWindowsDriveLetter(iterator));
481 appendToASCIIBuffer(*iterator);
483 ASSERT(!iterator.atEnd());
484 ASSERT(*iterator == ':' || *iterator == '|');
// A '|' separator is accepted but counts as a syntax violation; the output always uses ':'.
485 if (*iterator == '|')
486 syntaxViolation(iterator);
487 appendToASCIIBuffer(':');
// Look-ahead on a copy of the iterator that starts with the Windows drive
// letter test. The last visible check yields true when the code point reached
// after stepping past the drive letter is not '/', '\\', '?' or '#'.
491 template<typename CharacterType>
492 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
494 if (!isWindowsDriveLetter(iterator))
496 if (iterator.atEnd())
499 if (iterator.atEnd())
502 if (iterator.atEnd())
504 return !isSlashQuestionOrHash(*iterator);
// File-local helper that writes byte's upper and lower nibble as ASCII hex
// digits to an arbitrary buffer (as opposed to the parser's own ASCII buffer).
507 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
510 buffer.append(upperNibbleToASCIIHexDigit(byte));
511 buffer.append(lowerNibbleToASCIIHexDigit(byte));
// Appends "%XX" for byte to the parser's ASCII buffer. Percent-encoding changes
// the text, so a syntax violation must already have been recorded (asserted).
514 void URLParser::percentEncodeByte(uint8_t byte)
516 ASSERT(m_didSeeSyntaxViolation);
517 appendToASCIIBuffer('%');
518 appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
519 appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
// Percent-encoded UTF-8 bytes (EF BF BD) of U+FFFD REPLACEMENT CHARACTER,
// emitted in place of code points that fail U_IS_UNICODE_CHAR.
522 const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string above without its terminating NUL.
523 const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
// Writes the code point at iterator to the ASCII buffer.
//  - ASCII code points accepted by isInCodeSet are percent-encoded and
//    reported as a syntax violation; the plain append handles ASCII output.
//  - Non-ASCII code points are always a syntax violation and are written as
//    the percent-encoded bytes of their UTF-8 form. isInCodeSet is expected
//    to accept all of them (asserted).
//  - Code points that fail U_IS_UNICODE_CHAR are replaced by the
//    percent-encoded replacement character.
// iterator must not be at the end.
525 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
526 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
528 ASSERT(!iterator.atEnd());
529 UChar32 codePoint = *iterator;
530 if (LIKELY(isASCII(codePoint))) {
531 if (UNLIKELY(isInCodeSet(codePoint))) {
532 syntaxViolation(iterator);
533 percentEncodeByte(codePoint);
535 appendToASCIIBuffer(codePoint);
538 ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
539 syntaxViolation(iterator);
541 if (!U_IS_UNICODE_CHAR(codePoint)) {
542 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
// Encode the code point as UTF-8 into a scratch buffer, then escape every byte.
546 uint8_t buffer[U8_MAX_LENGTH];
548 U8_APPEND_UNSAFE(buffer, offset, codePoint);
549 for (int32_t i = 0; i < offset; ++i)
550 percentEncodeByte(buffer[i]);
// Query-component counterpart of utf8PercentEncode: the decision whether a
// byte needs escaping comes from shouldPercentEncodeQueryByte rather than from
// a template-supplied predicate. Non-ASCII code points are always reported as
// a syntax violation, and code points that fail U_IS_UNICODE_CHAR are written
// as the percent-encoded replacement character. iterator must not be at the end.
553 template<typename CharacterType>
554 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
556 ASSERT(!iterator.atEnd());
557 UChar32 codePoint = *iterator;
558 if (LIKELY(isASCII(codePoint))) {
559 if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
560 syntaxViolation(iterator);
561 percentEncodeByte(codePoint);
563 appendToASCIIBuffer(codePoint);
567 syntaxViolation(iterator);
569 if (!U_IS_UNICODE_CHAR(codePoint)) {
570 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
// Encode the code point as UTF-8 into a scratch buffer and emit it byte by byte.
574 uint8_t buffer[U8_MAX_LENGTH];
576 U8_APPEND_UNSAFE(buffer, offset, codePoint);
577 for (int32_t i = 0; i < offset; ++i) {
578 auto byte = buffer[i];
579 if (shouldPercentEncodeQueryByte(byte))
580 percentEncodeByte(byte);
582 appendToASCIIBuffer(byte);
// Encodes the buffered query code units in source with the given text
// encoding and writes the result to the ASCII buffer. iterator walks the
// query text of the original input in parallel, so that the first place where
// the encoded bytes differ from the input (or need escaping) can be reported
// as a syntax violation at the right position.
586 template<typename CharacterType>
587 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
589 // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
590 CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
591 const char* data = encoded.data();
592 size_t length = encoded.length();
// True when exactly one of "encoded output" and "remaining input" is empty,
// i.e. the two already disagree before any byte is compared.
594 if (!length == !iterator.atEnd()) {
595 syntaxViolation(iterator);
// First pass: compare encoded bytes against the input in lockstep, reporting
// a violation on a mismatch or on a byte that requires percent-encoding.
600 for (; i < length; ++i) {
601 ASSERT(!iterator.atEnd());
602 uint8_t byte = data[i];
603 if (UNLIKELY(byte != *iterator)) {
604 syntaxViolation(iterator);
607 if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
608 syntaxViolation(iterator);
611 appendToASCIIBuffer(byte);
614 while (!iterator.atEnd() && isTabOrNewline(*iterator))
616 ASSERT((i == length) == iterator.atEnd());
// Second pass: emit whatever encoded bytes are left, escaping where needed.
// Reaching this loop body implies a syntax violation was recorded above.
617 for (; i < length; ++i) {
618 ASSERT(m_didSeeSyntaxViolation);
619 uint8_t byte = data[i];
620 if (shouldPercentEncodeQueryByte(byte))
621 percentEncodeByte(byte);
623 appendToASCIIBuffer(byte);
// Looks up a port number for scheme and returns it as an Optional<uint16_t>.
// NOTE(review): presumably an empty Optional is returned for schemes that have
// no default port - confirm against the full function body.
627 Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
// Port numbers for the known schemes; ws/wss share the ports of http/https.
629 static const uint16_t ftpPort = 21;
630 static const uint16_t gopherPort = 70;
631 static const uint16_t httpPort = 80;
632 static const uint16_t httpsPort = 443;
633 static const uint16_t wsPort = 80;
634 static const uint16_t wssPort = 443;
636 auto length = scheme.length();
643 if (scheme[1] == 's')
// Maps a scheme string to a Scheme enumerator, working from the string's
// length and individual characters. Scheme::NonSpecial is the result on the
// many paths where the string does not match a known scheme.
// NOTE(review): the exact set of recognized schemes is not visible here - confirm.
703 ALWAYS_INLINE static Scheme scheme(StringView scheme)
705 auto length = scheme.length();
707 return Scheme::NonSpecial;
715 return Scheme::NonSpecial;
721 return Scheme::NonSpecial;
723 return Scheme::NonSpecial;
732 return Scheme::Gopher;
733 return Scheme::NonSpecial;
741 return Scheme::NonSpecial;
747 return Scheme::HTTPS;
748 return Scheme::NonSpecial;
750 return Scheme::NonSpecial;
755 if (scheme[1] == 's')
757 return Scheme::NonSpecial;
762 return Scheme::NonSpecial;
764 return Scheme::NonSpecial;
767 return Scheme::NonSpecial;
771 enum class URLParser::URLPart {
// Returns the offset stored in url for the given part, i.e. the length of the
// URL string up to that part. Each URLPart maps to the URL member of the same
// name; an unhandled value is a programming error (ASSERT_NOT_REACHED).
784 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
787 case URLPart::FragmentEnd:
788 return url.m_fragmentEnd;
789 case URLPart::QueryEnd:
790 return url.m_queryEnd;
791 case URLPart::PathEnd:
792 return url.m_pathEnd;
793 case URLPart::PathAfterLastSlash:
794 return url.m_pathAfterLastSlash;
795 case URLPart::PortEnd:
796 return url.m_portEnd;
797 case URLPart::HostEnd:
798 return url.m_hostEnd;
799 case URLPart::PasswordEnd:
800 return url.m_passwordEnd;
801 case URLPart::UserEnd:
802 return url.m_userEnd;
803 case URLPart::UserStart:
804 return url.m_userStart;
805 case URLPart::SchemeEnd:
806 return url.m_schemeEnd;
808 ASSERT_NOT_REACHED();
// Copies the first length characters of string into the (empty) ASCII buffer.
// Asking for more characters than the string holds is a release-mode crash.
812 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
814 RELEASE_ASSERT(length <= string.length());
817 ASSERT(m_asciiBuffer.isEmpty());
// 8-bit strings can be appended in one call; 16-bit strings are copied one
// code unit at a time, each of which is required to be ASCII.
818 if (string.is8Bit()) {
819 appendToASCIIBuffer(string.characters8(), length);
821 const UChar* characters = string.characters16();
822 for (size_t i = 0; i < length; ++i) {
823 UChar c = characters[i];
824 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
825 appendToASCIIBuffer(c);
// Seeds the URL being built from base: records a syntax violation at
// iterator, resets the ASCII buffer to the text of base up to the given part,
// and copies the matching offsets from base into m_url. Afterwards the scheme
// now at the start of the buffer decides m_urlIsSpecial and, for some schemes,
// forces isUTF8Encoding to true.
830 template<typename CharacterType>
831 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
// Recording the violation first switches output to the ASCII buffer.
833 syntaxViolation(iterator);
835 m_asciiBuffer.clear();
836 copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
// Copying up to FragmentEnd is never requested; it is a hard failure.
838 case URLPart::FragmentEnd:
839 RELEASE_ASSERT_NOT_REACHED();
840 case URLPart::QueryEnd:
841 m_url.m_queryEnd = base.m_queryEnd;
843 case URLPart::PathEnd:
844 m_url.m_pathEnd = base.m_pathEnd;
846 case URLPart::PathAfterLastSlash:
847 m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
849 case URLPart::PortEnd:
850 m_url.m_portEnd = base.m_portEnd;
852 case URLPart::HostEnd:
853 m_url.m_hostEnd = base.m_hostEnd;
855 case URLPart::PasswordEnd:
856 m_url.m_passwordEnd = base.m_passwordEnd;
858 case URLPart::UserEnd:
859 m_url.m_userEnd = base.m_userEnd;
861 case URLPart::UserStart:
862 m_url.m_userStart = base.m_userStart;
864 case URLPart::SchemeEnd:
865 m_url.m_isValid = base.m_isValid;
866 m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
867 m_url.m_schemeEnd = base.m_schemeEnd;
// Re-classify the scheme that was just copied into the buffer.
869 switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
872 isUTF8Encoding = true;
879 m_urlIsSpecial = true;
881 case Scheme::NonSpecial:
882 m_urlIsSpecial = false;
883 isUTF8Encoding = true;
886 ASSERT_NOT_REACHED();
// The two hex digits of '.' (0x2E), used to recognize a percent-encoded dot.
// The second digit is compared after toASCIILower, so 'E' matches as well.
889 static const char dotASCIICode[2] = {'2', 'e'};
// Look-ahead (c is a copy) for a path segment that consists of one dot. The
// dot must be followed by the end of input or by a slash, question mark or
// hash. The second half matches the hex digits of a percent-encoded dot, the
// second digit case-insensitively. Tabs and newlines are skipped silently.
891 template<typename CharacterType>
892 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
897 advance<CharacterType, ReportSyntaxViolation::No>(c);
898 return c.atEnd() || isSlashQuestionOrHash(*c);
902 advance<CharacterType, ReportSyntaxViolation::No>(c);
903 if (c.atEnd() || *c != dotASCIICode[0])
905 advance<CharacterType, ReportSyntaxViolation::No>(c);
908 if (toASCIILower(*c) == dotASCIICode[1]) {
909 advance<CharacterType, ReportSyntaxViolation::No>(c);
910 return c.atEnd() || isSlashQuestionOrHash(*c);
// Look-ahead (c is a copy) for a path segment that consists of two dots. It
// consumes one dot, literal or percent-encoded, and then defers to
// isSingleDotPathSegment for the second dot and the segment terminator.
915 template<typename CharacterType>
916 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
921 advance<CharacterType, ReportSyntaxViolation::No>(c);
922 return isSingleDotPathSegment(c);
926 advance<CharacterType, ReportSyntaxViolation::No>(c);
927 if (c.atEnd() || *c != dotASCIICode[0])
929 advance<CharacterType, ReportSyntaxViolation::No>(c);
932 if (toASCIILower(*c) == dotASCIICode[1]) {
933 advance<CharacterType, ReportSyntaxViolation::No>(c);
934 return isSingleDotPathSegment(c);
// Moves c (passed by reference) past a single-dot path segment that the
// caller has already verified with isSingleDotPathSegment. Both the literal
// and the percent-encoded spelling are handled; after the dot, the next code
// point is either a slash/backslash or, as asserted, '?' or '#'.
939 template<typename CharacterType>
940 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
942 ASSERT(isSingleDotPathSegment(c));
946 if (*c == '/' || *c == '\\')
949 ASSERT(*c == '?' || *c == '#');
954 ASSERT(*c == dotASCIICode[0]);
956 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
959 if (*c == '/' || *c == '\\')
962 ASSERT(*c == '?' || *c == '#');
// Moves c (passed by reference) past a double-dot path segment that the
// caller has already verified with isDoubleDotPathSegment. After the first
// dot the remainder is handed to consumeSingleDotPathSegment.
967 template<typename CharacterType>
968 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
970 ASSERT(isDoubleDotPathSegment(c));
976 ASSERT(*c == dotASCIICode[0]);
978 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
981 consumeSingleDotPathSegment(c);
// Drops the last path segment from the ASCII buffer. m_pathAfterLastSlash is
// walked back to just after the previous '/', never going before m_portEnd,
// and the buffer is truncated there. Only valid once output goes through the
// ASCII buffer, i.e. after a syntax violation (asserted).
984 void URLParser::popPath()
986 ASSERT(m_didSeeSyntaxViolation);
// Nothing to pop unless there is path content beyond the leading slash.
987 if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
988 m_url.m_pathAfterLastSlash--;
// Step over the slash that terminates the segment being removed.
989 if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
990 m_url.m_pathAfterLastSlash--;
// Scan backwards to the slash that starts the segment being removed.
991 while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
992 m_url.m_pathAfterLastSlash--;
// Land just after that slash.
993 m_url.m_pathAfterLastSlash++;
995 m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
// Records that the input is not already in canonical form. On the first call
// the flag is set and every input code unit before iterator is copied into
// m_asciiBuffer, which the append helpers write to from then on. The copied
// prefix is expected to be pure ASCII (asserted).
998 template<typename CharacterType>
999 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
// Guard for calls made after a violation has already been recorded.
1001 if (m_didSeeSyntaxViolation)
1003 m_didSeeSyntaxViolation = true;
1005 ASSERT(m_asciiBuffer.isEmpty());
// Offset of iterator from the start of the input, in code units.
1006 size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1007 RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
// Reserving the full input length up front is what makes uncheckedAppend safe below.
1008 m_asciiBuffer.reserveCapacity(m_inputString.length());
1009 for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1010 ASSERT(isASCII(m_inputString[i]));
1011 m_asciiBuffer.uncheckedAppend(m_inputString[i]);
// Handles a parse failure; the resulting URL keeps the original input text as its string.
1015 void URLParser::failure()
1018 m_url.m_string = m_inputString;
// Tests whether the code point at iterator equals codePoint, ignoring ASCII
// case, and moves the iterator (passed by reference) past it, skipping tabs
// and newlines without reporting them. codePoint is expected in lower case.
1021 template<typename CharacterType>
1022 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1024 if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1026 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
// Checks the input one letter at a time against "localhost", ignoring ASCII
// case. The final result additionally requires that nothing follows the word.
// Works on a copy of the iterator.
1030 template<typename CharacterType>
1031 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1033 if (!checkLocalhostCodePoint(iterator, 'l'))
1035 if (!checkLocalhostCodePoint(iterator, 'o'))
1037 if (!checkLocalhostCodePoint(iterator, 'c'))
1039 if (!checkLocalhostCodePoint(iterator, 'a'))
1041 if (!checkLocalhostCodePoint(iterator, 'l'))
1043 if (!checkLocalhostCodePoint(iterator, 'h'))
1045 if (!checkLocalhostCodePoint(iterator, 'o'))
1047 if (!checkLocalhostCodePoint(iterator, 's'))
1049 if (!checkLocalhostCodePoint(iterator, 't'))
1051 return iterator.atEnd();
// StringView entry point for the localhost test: wraps the view's 8-bit or
// 16-bit characters in a CodePointIterator and defers to isAtLocalhost.
1054 bool URLParser::isLocalhost(StringView view)
1057 return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1058 return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
// Returns a view of length characters of already-parsed output starting at
// start. After a syntax violation the output lives in m_asciiBuffer;
// otherwise the output is identical to the input so far and is read from
// m_inputString. The requested range must lie within the chosen source.
1061 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1063 if (UNLIKELY(m_didSeeSyntaxViolation)) {
1064 ASSERT(start + length <= m_asciiBuffer.size());
1065 return StringView(m_asciiBuffer.data() + start, length);
1067 ASSERT(start + length <= m_inputString.length());
1068 return StringView(m_inputString).substring(start, length);
// Current length of the parsed output. After a syntax violation this is the
// size of m_asciiBuffer; before that it is the iterator's offset from the
// start of the input, in code units.
1071 template<typename CharacterType>
1072 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1074 if (UNLIKELY(m_didSeeSyntaxViolation))
1075 return m_asciiBuffer.size();
1077 return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
// Parses input relative to base using the given query text encoding. The
// constructor does all the work; the outcome is left in m_url.
1080 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1081 : m_inputString(input)
// A null input is handled separately: with a usable base (valid and able to
// act as a base URL) the result has its fragment identifier removed.
1083 if (input.isNull()) {
1084 if (base.isValid() && !base.m_cannotBeABaseURL) {
1086 m_url.removeFragmentIdentifier();
// Dispatch to the parse() instantiation that matches the string's character
// width, remembering where the input starts for later offset calculations.
1091 if (input.is8Bit()) {
1092 m_inputBegin = input.characters8();
1093 parse(input.characters8(), input.length(), base, encoding);
1095 m_inputBegin = input.characters16();
1096 parse(input.characters16(), input.length(), base, encoding);
// Sanity check for valid results: a syntax violation was recorded exactly
// when the output differs from the input, except for the whitespace-only
// input case that resolves to the base URL up to its query end.
1099 ASSERT(!m_url.m_isValid
1100 || m_didSeeSyntaxViolation == (m_url.string() != input)
1101 || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1102 && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1103 ASSERT(internalValuesConsistent(m_url));
// Debug-only self-test: re-parse with a leading space so that the
// syntax-violation path is taken, and compare the two results.
1104 #if !ASSERT_DISABLED
1105 if (!m_didSeeSyntaxViolation) {
1106 // Force a syntax violation at the beginning to make sure we get the same result.
1107 URLParser parser(makeString(" ", input), base, encoding);
1108 URL parsed = parser.result();
1109 if (parsed.isValid())
1110 ASSERT(allValuesEqual(parser.result(), m_url));
1115 template<typename CharacterType>
1116 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1118 URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1120 ASSERT(m_asciiBuffer.isEmpty());
1122 bool isUTF8Encoding = encoding == UTF8Encoding();
1123 Vector<UChar> queryBuffer;
1125 unsigned endIndex = length;
1126 while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1127 syntaxViolation(CodePointIterator<CharacterType>(input, input));
1130 CodePointIterator<CharacterType> c(input, input + endIndex);
1131 CodePointIterator<CharacterType> authorityOrHostBegin;
1132 CodePointIterator<CharacterType> queryBegin;
1133 while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1137 auto beginAfterControlAndSpace = c;
1139 enum class State : uint8_t {
1143 SpecialRelativeOrAuthority,
1147 SpecialAuthoritySlashes,
1148 SpecialAuthorityIgnoreSlashes,
1156 CannotBeABaseURLPath,
1162 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1163 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1165 State state = State::SchemeStart;
1166 while (!c.atEnd()) {
1167 if (UNLIKELY(isTabOrNewline(*c))) {
1174 case State::SchemeStart:
1175 LOG_STATE("SchemeStart");
1176 if (isASCIIAlpha(*c)) {
1177 if (UNLIKELY(isASCIIUpper(*c)))
1179 appendToASCIIBuffer(toASCIILower(*c));
1182 m_asciiBuffer.clear();
1183 state = State::NoScheme;
1184 c = beginAfterControlAndSpace;
1186 state = State::Scheme;
1188 state = State::NoScheme;
1191 LOG_STATE("Scheme");
1192 if (isValidSchemeCharacter(*c)) {
1193 if (UNLIKELY(isASCIIUpper(*c)))
1195 appendToASCIIBuffer(toASCIILower(*c));
1196 } else if (*c == ':') {
1197 m_url.m_schemeEnd = currentPosition(c);
1198 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1199 appendToASCIIBuffer(':');
1200 switch (scheme(urlScheme)) {
1202 m_urlIsSpecial = true;
1203 state = State::File;
1208 isUTF8Encoding = true;
1209 m_urlIsSpecial = true;
1210 if (base.protocolIs(urlScheme))
1211 state = State::SpecialRelativeOrAuthority;
1213 state = State::SpecialAuthoritySlashes;
1218 m_url.m_protocolIsInHTTPFamily = true;
1221 case Scheme::Gopher:
1222 m_urlIsSpecial = true;
1223 if (base.protocolIs(urlScheme))
1224 state = State::SpecialRelativeOrAuthority;
1226 state = State::SpecialAuthoritySlashes;
1229 case Scheme::NonSpecial:
1230 isUTF8Encoding = true;
1231 auto maybeSlash = c;
1232 advance(maybeSlash);
1233 if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1234 appendToASCIIBuffer('/');
1236 state = State::PathOrAuthority;
1239 m_url.m_userStart = currentPosition(c);
1242 m_url.m_userStart = currentPosition(c);
1243 m_url.m_userEnd = m_url.m_userStart;
1244 m_url.m_passwordEnd = m_url.m_userStart;
1245 m_url.m_hostEnd = m_url.m_userStart;
1246 m_url.m_portEnd = m_url.m_userStart;
1247 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1248 m_url.m_cannotBeABaseURL = true;
1249 state = State::CannotBeABaseURLPath;
1255 m_asciiBuffer.clear();
1256 state = State::NoScheme;
1257 c = beginAfterControlAndSpace;
1262 m_asciiBuffer.clear();
1263 state = State::NoScheme;
1264 c = beginAfterControlAndSpace;
1267 case State::NoScheme:
1268 LOG_STATE("NoScheme");
1269 if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1273 if (base.m_cannotBeABaseURL && *c == '#') {
1274 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1275 state = State::Fragment;
1276 appendToASCIIBuffer('#');
1280 if (!base.protocolIs("file")) {
1281 state = State::Relative;
1284 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1285 appendToASCIIBuffer(':');
1286 state = State::File;
1288 case State::SpecialRelativeOrAuthority:
1289 LOG_STATE("SpecialRelativeOrAuthority");
1291 appendToASCIIBuffer('/');
1298 appendToASCIIBuffer('/');
1299 state = State::SpecialAuthorityIgnoreSlashes;
1302 state = State::RelativeSlash;
1304 state = State::Relative;
1306 case State::PathOrAuthority:
1307 LOG_STATE("PathOrAuthority");
1309 appendToASCIIBuffer('/');
1310 state = State::AuthorityOrHost;
1312 m_url.m_userStart = currentPosition(c);
1313 authorityOrHostBegin = c;
1315 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1316 m_url.m_userStart = currentPosition(c) - 1;
1317 m_url.m_userEnd = m_url.m_userStart;
1318 m_url.m_passwordEnd = m_url.m_userStart;
1319 m_url.m_hostEnd = m_url.m_userStart;
1320 m_url.m_portEnd = m_url.m_userStart;
1321 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1322 state = State::Path;
1325 case State::Relative:
1326 LOG_STATE("Relative");
1330 state = State::RelativeSlash;
1334 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1335 appendToASCIIBuffer('?');
1338 state = State::UTF8Query;
1341 state = State::NonUTF8Query;
1345 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1346 appendToASCIIBuffer('#');
1347 state = State::Fragment;
1351 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1352 state = State::Path;
1356 case State::RelativeSlash:
1357 LOG_STATE("RelativeSlash");
1358 if (*c == '/' || *c == '\\') {
1360 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1361 appendToASCIIBuffer("://", 3);
1362 state = State::SpecialAuthorityIgnoreSlashes;
1364 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1365 appendToASCIIBuffer('/');
1366 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1367 state = State::Path;
1370 case State::SpecialAuthoritySlashes:
1371 LOG_STATE("SpecialAuthoritySlashes");
1372 if (LIKELY(*c == '/' || *c == '\\')) {
1373 if (UNLIKELY(*c == '\\'))
1375 appendToASCIIBuffer('/');
1377 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1378 if (UNLIKELY(*c == '\\'))
1381 appendToASCIIBuffer('/');
1384 appendToASCIIBuffer('/');
1388 appendToASCIIBuffer("//", 2);
1390 state = State::SpecialAuthorityIgnoreSlashes;
1392 case State::SpecialAuthorityIgnoreSlashes:
1393 LOG_STATE("SpecialAuthorityIgnoreSlashes");
1394 if (*c == '/' || *c == '\\') {
1398 m_url.m_userStart = currentPosition(c);
1399 state = State::AuthorityOrHost;
1400 authorityOrHostBegin = c;
1403 case State::AuthorityOrHost:
1405 LOG_STATE("AuthorityOrHost");
1408 auto findLastAt = c;
1409 while (!findLastAt.atEnd()) {
1410 URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1411 if (*findLastAt == '@')
1412 lastAt = findLastAt;
1413 bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1414 if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1418 parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1421 authorityOrHostBegin = c;
1422 state = State::Host;
1423 m_hostHasPercentOrNonASCII = false;
1426 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1427 if (isSlash || *c == '?' || *c == '#') {
1428 auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1429 if (iterator.atEnd()) {
1430 m_url.m_userEnd = currentPosition(c);
1431 m_url.m_passwordEnd = m_url.m_userEnd;
1432 m_url.m_hostEnd = m_url.m_userEnd;
1433 m_url.m_portEnd = m_url.m_userEnd;
1434 m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1436 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1437 m_url.m_passwordEnd = m_url.m_userEnd;
1438 if (!parseHostAndPort(iterator)) {
1442 if (UNLIKELY(!isSlash)) {
1444 appendToASCIIBuffer('/');
1445 m_url.m_pathAfterLastSlash = currentPosition(c);
1448 state = State::Path;
1451 if (isPercentOrNonASCII(*c))
1452 m_hostHasPercentOrNonASCII = true;
1454 } while (!c.atEnd());
1459 if (*c == '/' || *c == '?' || *c == '#') {
1460 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1464 if (*c == '?' || *c == '#') {
1466 appendToASCIIBuffer('/');
1467 m_url.m_pathAfterLastSlash = currentPosition(c);
1469 state = State::Path;
1472 if (isPercentOrNonASCII(*c))
1473 m_hostHasPercentOrNonASCII = true;
1475 } while (!c.atEnd());
1484 appendToASCIIBuffer('/');
1485 state = State::FileSlash;
1490 if (base.isValid() && base.protocolIs("file")) {
1491 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1492 appendToASCIIBuffer('?');
1495 appendToASCIIBuffer("///?", 4);
1497 m_url.m_userStart = currentPosition(c) - 2;
1498 m_url.m_userEnd = m_url.m_userStart;
1499 m_url.m_passwordEnd = m_url.m_userStart;
1500 m_url.m_hostEnd = m_url.m_userStart;
1501 m_url.m_portEnd = m_url.m_userStart;
1502 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1503 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1506 state = State::UTF8Query;
1509 state = State::NonUTF8Query;
1514 if (base.isValid() && base.protocolIs("file")) {
1515 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1516 appendToASCIIBuffer('#');
1518 appendToASCIIBuffer("///#", 4);
1519 m_url.m_userStart = currentPosition(c) - 2;
1520 m_url.m_userEnd = m_url.m_userStart;
1521 m_url.m_passwordEnd = m_url.m_userStart;
1522 m_url.m_hostEnd = m_url.m_userStart;
1523 m_url.m_portEnd = m_url.m_userStart;
1524 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1525 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1526 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1528 state = State::Fragment;
1533 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1534 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1536 appendToASCIIBuffer("///", 3);
1537 m_url.m_userStart = currentPosition(c) - 1;
1538 m_url.m_userEnd = m_url.m_userStart;
1539 m_url.m_passwordEnd = m_url.m_userStart;
1540 m_url.m_hostEnd = m_url.m_userStart;
1541 m_url.m_portEnd = m_url.m_userStart;
1542 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1543 if (isWindowsDriveLetter(c))
1544 appendWindowsDriveLetter(c);
1546 state = State::Path;
1550 case State::FileSlash:
1551 LOG_STATE("FileSlash");
1552 if (LIKELY(*c == '/' || *c == '\\')) {
1553 if (UNLIKELY(*c == '\\'))
1555 appendToASCIIBuffer('/');
1557 m_url.m_userStart = currentPosition(c);
1558 m_url.m_userEnd = m_url.m_userStart;
1559 m_url.m_passwordEnd = m_url.m_userStart;
1560 m_url.m_hostEnd = m_url.m_userStart;
1561 m_url.m_portEnd = m_url.m_userStart;
1562 authorityOrHostBegin = c;
1563 state = State::FileHost;
1566 if (base.isValid() && base.protocolIs("file")) {
1567 // FIXME: This String copy is unnecessary.
1568 String basePath = base.path();
1569 if (basePath.length() >= 2) {
1570 bool windowsQuirk = basePath.is8Bit()
1571 ? isWindowsDriveLetter(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1572 : isWindowsDriveLetter(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1574 appendToASCIIBuffer(basePath[0]);
1575 appendToASCIIBuffer(basePath[1]);
1580 appendToASCIIBuffer("//", 2);
1581 m_url.m_userStart = currentPosition(c) - 1;
1582 m_url.m_userEnd = m_url.m_userStart;
1583 m_url.m_passwordEnd = m_url.m_userStart;
1584 m_url.m_hostEnd = m_url.m_userStart;
1585 m_url.m_portEnd = m_url.m_userStart;
1586 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1587 if (isWindowsDriveLetter(c))
1588 appendWindowsDriveLetter(c);
1589 state = State::Path;
1591 case State::FileHost:
1593 LOG_STATE("FileHost");
1594 if (isSlashQuestionOrHash(*c)) {
1595 bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1596 && isWindowsDriveLetter(authorityOrHostBegin);
1598 syntaxViolation(authorityOrHostBegin);
1599 appendToASCIIBuffer('/');
1600 appendWindowsDriveLetter(authorityOrHostBegin);
1602 if (windowsQuirk || authorityOrHostBegin == c) {
1603 ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1, 1) == "/");
1604 if (UNLIKELY(*c == '?')) {
1606 appendToASCIIBuffer("/?", 2);
1609 state = State::UTF8Query;
1612 state = State::NonUTF8Query;
1614 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1615 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1618 if (UNLIKELY(*c == '#')) {
1620 appendToASCIIBuffer("/#", 2);
1622 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1623 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1624 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1625 state = State::Fragment;
1628 state = State::Path;
1631 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1635 if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1637 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1638 m_url.m_hostEnd = currentPosition(c);
1639 m_url.m_portEnd = m_url.m_hostEnd;
1642 state = State::PathStart;
1645 if (isPercentOrNonASCII(*c))
1646 m_hostHasPercentOrNonASCII = true;
1648 } while (!c.atEnd());
1650 case State::PathStart:
1651 LOG_STATE("PathStart");
1652 if (*c != '/' && *c != '\\')
1654 state = State::Path;
1658 if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1659 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1661 appendToASCIIBuffer('/');
1663 m_url.m_pathAfterLastSlash = currentPosition(c);
1666 if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1, 1) == "/")) {
1667 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1669 consumeDoubleDotPathSegment(c);
1673 if (UNLIKELY(isSingleDotPathSegment(c))) {
1675 consumeSingleDotPathSegment(c);
1680 m_url.m_pathEnd = currentPosition(c);
1681 appendToASCIIBuffer('?');
1684 state = State::UTF8Query;
1687 state = State::NonUTF8Query;
1692 m_url.m_pathEnd = currentPosition(c);
1693 m_url.m_queryEnd = m_url.m_pathEnd;
1694 state = State::Fragment;
1697 utf8PercentEncode<isInDefaultEncodeSet>(c);
1700 case State::CannotBeABaseURLPath:
1701 LOG_STATE("CannotBeABaseURLPath");
1703 m_url.m_pathEnd = currentPosition(c);
1704 appendToASCIIBuffer('?');
1707 state = State::UTF8Query;
1710 state = State::NonUTF8Query;
1712 } else if (*c == '#') {
1713 m_url.m_pathEnd = currentPosition(c);
1714 m_url.m_queryEnd = m_url.m_pathEnd;
1715 state = State::Fragment;
1716 } else if (*c == '/') {
1717 appendToASCIIBuffer('/');
1719 m_url.m_pathAfterLastSlash = currentPosition(c);
1721 utf8PercentEncode<isInSimpleEncodeSet>(c);
1725 case State::UTF8Query:
1726 LOG_STATE("UTF8Query");
1727 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1729 m_url.m_queryEnd = currentPosition(c);
1730 state = State::Fragment;
1736 appendCodePoint(queryBuffer, *c);
1739 case State::NonUTF8Query:
1741 LOG_STATE("NonUTF8Query");
1742 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1744 encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1745 m_url.m_queryEnd = currentPosition(c);
1746 state = State::Fragment;
1749 appendCodePoint(queryBuffer, *c);
1750 advance(c, queryBegin);
1751 } while (!c.atEnd());
1753 case State::Fragment:
1754 URL_PARSER_LOG("State Fragment");
1755 utf8PercentEncode<isInSimpleEncodeSet>(c);
1762 case State::SchemeStart:
1763 LOG_FINAL_STATE("SchemeStart");
1764 if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1766 m_url.removeFragmentIdentifier();
1772 LOG_FINAL_STATE("Scheme");
1775 case State::NoScheme:
1776 LOG_FINAL_STATE("NoScheme");
1777 RELEASE_ASSERT_NOT_REACHED();
1778 case State::SpecialRelativeOrAuthority:
1779 LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1780 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1781 m_url.m_fragmentEnd = m_url.m_queryEnd;
1783 case State::PathOrAuthority:
1784 LOG_FINAL_STATE("PathOrAuthority");
1785 ASSERT(m_url.m_userStart);
1786 ASSERT(m_url.m_userStart == currentPosition(c));
1787 ASSERT(parsedDataView(currentPosition(c) - 1, 1) == "/");
1788 m_url.m_userStart--;
1789 m_url.m_userEnd = m_url.m_userStart;
1790 m_url.m_passwordEnd = m_url.m_userStart;
1791 m_url.m_hostEnd = m_url.m_userStart;
1792 m_url.m_portEnd = m_url.m_userStart;
1793 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1794 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1795 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1796 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1798 case State::Relative:
1799 LOG_FINAL_STATE("Relative");
1800 RELEASE_ASSERT_NOT_REACHED();
1801 case State::RelativeSlash:
1802 LOG_FINAL_STATE("RelativeSlash");
1803 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1804 appendToASCIIBuffer('/');
1805 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1806 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1807 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1808 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1810 case State::SpecialAuthoritySlashes:
1811 LOG_FINAL_STATE("SpecialAuthoritySlashes");
1812 m_url.m_userStart = currentPosition(c);
1813 m_url.m_userEnd = m_url.m_userStart;
1814 m_url.m_passwordEnd = m_url.m_userStart;
1815 m_url.m_hostEnd = m_url.m_userStart;
1816 m_url.m_portEnd = m_url.m_userStart;
1817 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1818 m_url.m_pathEnd = m_url.m_userStart;
1819 m_url.m_queryEnd = m_url.m_userStart;
1820 m_url.m_fragmentEnd = m_url.m_userStart;
1822 case State::SpecialAuthorityIgnoreSlashes:
1823 LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1827 case State::AuthorityOrHost:
1828 LOG_FINAL_STATE("AuthorityOrHost");
1829 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1830 m_url.m_passwordEnd = m_url.m_userEnd;
1831 if (authorityOrHostBegin.atEnd()) {
1832 m_url.m_userEnd = m_url.m_userStart;
1833 m_url.m_passwordEnd = m_url.m_userStart;
1834 m_url.m_hostEnd = m_url.m_userStart;
1835 m_url.m_portEnd = m_url.m_userStart;
1836 m_url.m_pathEnd = m_url.m_userStart;
1837 } else if (!parseHostAndPort(authorityOrHostBegin)) {
1841 if (m_urlIsSpecial) {
1843 appendToASCIIBuffer('/');
1844 m_url.m_pathEnd = m_url.m_portEnd + 1;
1846 m_url.m_pathEnd = m_url.m_portEnd;
1848 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1849 m_url.m_queryEnd = m_url.m_pathEnd;
1850 m_url.m_fragmentEnd = m_url.m_pathEnd;
1853 LOG_FINAL_STATE("Host");
1854 if (!parseHostAndPort(authorityOrHostBegin)) {
1858 if (m_urlIsSpecial) {
1860 appendToASCIIBuffer('/');
1861 m_url.m_pathEnd = m_url.m_portEnd + 1;
1863 m_url.m_pathEnd = m_url.m_portEnd;
1864 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1865 m_url.m_queryEnd = m_url.m_pathEnd;
1866 m_url.m_fragmentEnd = m_url.m_pathEnd;
1869 LOG_FINAL_STATE("File");
1870 if (base.isValid() && base.protocolIs("file")) {
1871 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1872 appendToASCIIBuffer(':');
1875 appendToASCIIBuffer("///", 3);
1876 m_url.m_userStart = currentPosition(c) - 1;
1877 m_url.m_userEnd = m_url.m_userStart;
1878 m_url.m_passwordEnd = m_url.m_userStart;
1879 m_url.m_hostEnd = m_url.m_userStart;
1880 m_url.m_portEnd = m_url.m_userStart;
1881 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1882 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1883 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1884 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1886 case State::FileSlash:
1887 LOG_FINAL_STATE("FileSlash");
1889 m_url.m_userStart = currentPosition(c) + 1;
1890 appendToASCIIBuffer("//", 2);
1891 m_url.m_userEnd = m_url.m_userStart;
1892 m_url.m_passwordEnd = m_url.m_userStart;
1893 m_url.m_hostEnd = m_url.m_userStart;
1894 m_url.m_portEnd = m_url.m_userStart;
1895 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1896 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1897 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1898 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1900 case State::FileHost:
1901 LOG_FINAL_STATE("FileHost");
1902 if (authorityOrHostBegin == c) {
1904 appendToASCIIBuffer('/');
1905 m_url.m_userStart = currentPosition(c) - 1;
1906 m_url.m_userEnd = m_url.m_userStart;
1907 m_url.m_passwordEnd = m_url.m_userStart;
1908 m_url.m_hostEnd = m_url.m_userStart;
1909 m_url.m_portEnd = m_url.m_userStart;
1910 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1911 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1912 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1913 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1917 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1923 if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1924 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1925 m_url.m_hostEnd = currentPosition(c);
1926 m_url.m_portEnd = m_url.m_hostEnd;
1928 appendToASCIIBuffer('/');
1929 m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
1930 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1931 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1932 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1934 case State::PathStart:
1935 LOG_FINAL_STATE("PathStart");
1936 RELEASE_ASSERT_NOT_REACHED();
1938 LOG_FINAL_STATE("Path");
1939 m_url.m_pathEnd = currentPosition(c);
1940 m_url.m_queryEnd = m_url.m_pathEnd;
1941 m_url.m_fragmentEnd = m_url.m_pathEnd;
1943 case State::CannotBeABaseURLPath:
1944 LOG_FINAL_STATE("CannotBeABaseURLPath");
1945 m_url.m_pathEnd = currentPosition(c);
1946 m_url.m_queryEnd = m_url.m_pathEnd;
1947 m_url.m_fragmentEnd = m_url.m_pathEnd;
1949 case State::UTF8Query:
1950 LOG_FINAL_STATE("UTF8Query");
1951 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1952 m_url.m_queryEnd = currentPosition(c);
1953 m_url.m_fragmentEnd = m_url.m_queryEnd;
1955 case State::NonUTF8Query:
1956 LOG_FINAL_STATE("NonUTF8Query");
1957 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1958 encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1959 m_url.m_queryEnd = currentPosition(c);
1960 m_url.m_fragmentEnd = m_url.m_queryEnd;
1962 case State::Fragment:
1963 LOG_FINAL_STATE("Fragment");
1964 m_url.m_fragmentEnd = currentPosition(c);
1968 if (LIKELY(!m_didSeeSyntaxViolation)) {
1969 m_url.m_string = m_inputString;
1970 ASSERT(m_asciiBuffer.isEmpty());
1972 m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
1973 m_url.m_isValid = true;
1974 URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
// Parses the userinfo part of an authority. The AuthorityOrHost state passes in
// the range that ends at the last '@'; this routine records m_userEnd and
// m_passwordEnd and emits the percent-encoded credentials followed by '@'.
// NOTE(review): the embedded listing numbers skip lines (e.g. 1984-1985,
// 1994-1995, 2001-2002), so braces and any early returns are not shown here.
1977 template<typename CharacterType>
1978 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
// Empty userinfo: flagged as a syntax violation, and both end offsets collapse
// to the current position.
1980 if (UNLIKELY(iterator.atEnd())) {
1981 syntaxViolation(iterator);
1982 m_url.m_userEnd = currentPosition(iterator);
1983 m_url.m_passwordEnd = m_url.m_userEnd;
// First pass: everything up to the first ':' is the user name.
1986 for (; !iterator.atEnd(); advance(iterator)) {
1987 if (*iterator == ':') {
1988 m_url.m_userEnd = currentPosition(iterator);
// Remember where the colon was so violations can be reported at that position.
1989 auto iteratorAtColon = iterator;
// Tabs/newlines directly after the colon are tracked so they can be reported
// as a syntax violation below.
1991 bool tabOrNewlineAfterColon = false;
1992 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
1993 tabOrNewlineAfterColon = true;
// Nothing follows the colon: the password is empty, so its end equals the user
// end; '@' is only written when the user name itself is non-empty.
1996 if (UNLIKELY(iterator.atEnd())) {
1997 syntaxViolation(iteratorAtColon);
1998 m_url.m_passwordEnd = m_url.m_userEnd;
1999 if (m_url.m_userEnd > m_url.m_userStart)
2000 appendToASCIIBuffer('@');
2003 if (tabOrNewlineAfterColon)
2004 syntaxViolation(iteratorAtColon);
2005 appendToASCIIBuffer(':');
// User-name code points are percent-encoded with the userinfo encode set.
2008 utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
// Second pass: whatever remains is percent-encoded with the same encode set.
2010 for (; !iterator.atEnd(); advance(iterator))
2011 utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2012 m_url.m_passwordEnd = currentPosition(iterator);
// presumably m_userEnd is still zero when no ':' was met, in which case the
// user name spans the whole range -- TODO confirm against the elided lines.
2013 if (!m_url.m_userEnd)
2014 m_url.m_userEnd = m_url.m_passwordEnd;
2015 appendToASCIIBuffer('@');
// Appends the decimal text of |number| to the output buffer.
// NOTE(review): the listing omits lines 2023-2024 and 2026-2027, so the
// declaration of |p| and the loop around the digit store are not shown here.
2018 template<typename UnsignedIntegerType>
2019 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
// Scratch space: three characters per byte of the integer type, plus one.
2021 LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2022 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
// Digits are stored back to front: the pointer is pre-decremented and the
// lowest remaining decimal digit is written.
2025 *--p = (number % 10) + '0';
// Emit the characters between |p| and |end|.
2028 appendToASCIIBuffer(p, end - p);
2031 void URLParser::serializeIPv4(IPv4Address address)
2033 appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2034 appendToASCIIBuffer('.');
2035 appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2036 appendToASCIIBuffer('.');
2037 appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2038 appendToASCIIBuffer('.');
2039 appendNumberToASCIIBuffer<uint8_t>(address);
// Helper used by findLongestZeroSequence() to obtain a length for the position
// |begin| within the eight 16-bit pieces of |address|.
// NOTE(review): only the signature and the loop header appear in this listing
// (lines 2043-2044 and 2046-2050 are missing); presumably it counts consecutive
// zero pieces starting at |begin| -- confirm against the full source.
2042 static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
// Scans forward and never goes past the eighth piece.
2045 for (; end < 8; end++) {
// Finds the best run of pieces in |address| as measured by
// zeroSequenceLength(), returning an empty Optional when no run qualifies.
// NOTE(review): the lines that store into |longest| and return it (2058, 2060,
// 2062-2067) are missing from this listing.
2052 static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2054 Optional<size_t> longest;
2055 size_t longestLength = 0;
2056 for (size_t i = 0; i < 8; i++) {
2057 size_t length = zeroSequenceLength(address, i);
// A run must be longer than one piece to count, and the strict '<' means an
// earlier run is kept when a later one merely ties it.
2059 if (length > 1 && (!longest || longestLength < length)) {
2061 longestLength = length;
// Appends one 16-bit IPv6 piece as lowercase hexadecimal digits, nibble by
// nibble from the most significant down.
// NOTE(review): lines 2074-2075 and 2079-2080 are missing from this listing;
// presumably they set |printed| once a digit has been emitted -- confirm.
2069 void URLParser::serializeIPv6Piece(uint16_t piece)
// Tracks whether a more significant digit has already been written.
2071 bool printed = false;
// The top nibble is written only when it is non-zero.
2072 if (auto nibble0 = piece >> 12) {
2073 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2076 auto nibble1 = piece >> 8 & 0xF;
// The middle nibbles are written when non-zero or when |printed| is set.
2077 if (printed || nibble1) {
2078 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2081 auto nibble2 = piece >> 4 & 0xF;
2082 if (printed || nibble2)
2083 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
// The lowest nibble is always written, so a zero piece yields "0".
2084 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
// Appends |address| to the output buffer as a bracketed IPv6 literal, with the
// run chosen by findLongestZeroSequence() written in compressed form.
// NOTE(review): several condition lines (2094, 2096, 2099-2102, 2104) are
// missing from this listing, so the exact branch selection is not shown.
2087 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2089 appendToASCIIBuffer('[');
2090 auto compressPointer = findLongestZeroSequence(address);
2091 for (size_t piece = 0; piece < 8; piece++) {
// Reached the start of the run to compress; that piece must be zero.
2092 if (compressPointer && compressPointer.value() == piece) {
2093 ASSERT(!address[piece]);
// Either ':' or "::" is written here; the selecting condition is on a line
// that this listing does not show.
2095 appendToASCIIBuffer(':');
2097 appendToASCIIBuffer("::", 2);
// Skips the remaining zero pieces, never running past the eighth piece.
2098 while (piece < 8 && !address[piece])
2103 serializeIPv6Piece(address[piece]);
2105 appendToASCIIBuffer(':');
2107 appendToASCIIBuffer(']');
// Parses one dot-separated component of an IPv4 host and returns its numeric
// value. |iterator| is taken by reference, and |didSeeSyntaxViolation| is set
// when the text uses a non-canonical form (tab/newline, octal or hex prefix).
// NOTE(review): many lines are missing from this listing (the enum body, the
// switch header, the multiplications, the failure returns and the advance
// calls), so only the visible steps are described.
2110 template<typename CharacterType>
2111 Optional<uint32_t> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
// Radix-detection states; the enumerators themselves are on elided lines.
2113 enum class State : uint8_t {
2120 State state = State::UnknownBase;
// Accumulator whose overflow is queried with hasOverflowed() below.
2121 Checked<uint32_t, RecordOverflow> value = 0;
// A leading '.' is special-cased; the action taken is on an elided line.
2122 if (!iterator.atEnd() && *iterator == '.')
2124 while (!iterator.atEnd()) {
2125 if (isTabOrNewline(*iterator)) {
2126 didSeeSyntaxViolation = true;
// A '.' terminates the piece and yields the value accumulated so far.
2130 if (*iterator == '.') {
2131 ASSERT(!value.hasOverflowed());
2132 return value.unsafeGet();
// First character: a leading '0' defers the radix decision, otherwise the
// visible code selects the decimal state.
2135 case State::UnknownBase:
2136 if (UNLIKELY(*iterator == '0')) {
2138 state = State::OctalOrHex;
2141 state = State::Decimal;
// Getting here means a leading zero was seen, which is recorded as a syntax
// violation. An 'x'/'X' is tested first; the visible alternative is octal.
2143 case State::OctalOrHex:
2144 didSeeSyntaxViolation = true;
2145 if (*iterator == 'x' || *iterator == 'X') {
2150 state = State::Octal;
2152 case State::Decimal:
2153 if (*iterator < '0' || *iterator > '9')
2156 value += *iterator - '0';
2157 if (UNLIKELY(value.hasOverflowed()))
// Digit range '0'..'7' -- presumably the Octal case, whose label is elided.
2162 ASSERT(didSeeSyntaxViolation);
2163 if (*iterator < '0' || *iterator > '7')
2166 value += *iterator - '0';
2167 if (UNLIKELY(value.hasOverflowed()))
// Hex digits -- presumably the Hex case, whose label is elided.
2172 ASSERT(didSeeSyntaxViolation);
2173 if (!isASCIIHexDigit(*iterator))
2176 value += toASCIIHexValue(*iterator);
2177 if (UNLIKELY(value.hasOverflowed()))
// Input ended without a trailing '.': return the accumulated value.
2183 ASSERT(!value.hasOverflowed());
2184 return value.unsafeGet();
2187 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2189 RELEASE_ASSERT(exponent <= 4);
2190 uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2191 return values[exponent];
// Tries to interpret the host range as an IPv4 address made of up to four
// numeric components and combines them into one IPv4Address value.
// NOTE(review): the failure returns, the advance calls and the final return are
// on lines missing from this listing; only the visible steps are described.
2194 template<typename CharacterType>
2195 Optional<URLParser::IPv4Address> URLParser::parseIPv4Host(CodePointIterator<CharacterType> iterator)
// Start of the host, kept as the position for syntax-violation reports.
2197 auto hostBegin = iterator;
2199 Vector<uint32_t, 4> items;
2200 items.reserveInitialCapacity(4);
2201 bool didSeeSyntaxViolation = false;
// Collect the components one at a time.
2202 while (!iterator.atEnd()) {
2203 if (isTabOrNewline(*iterator)) {
2204 didSeeSyntaxViolation = true;
2208 if (items.size() >= 4)
2210 if (auto item = parseIPv4Piece(iterator, didSeeSyntaxViolation))
2211 items.append(item.value());
2214 if (!iterator.atEnd()) {
2215 if (items.size() >= 4)
2217 if (*iterator == '.')
// Checked after the loop: unconsumed input, no components, or more than four.
2223 if (!iterator.atEnd() || !items.size() || items.size() > 4)
// Range checks on the components before the last one (loop body elided).
2225 if (items.size() > 1) {
2226 for (size_t i = 0; i < items.size() - 1; i++) {
// The last component is compared against 256^(5 - count): with fewer than four
// components it may cover all of the remaining bytes.
2231 if (items[items.size() - 1] >= pow256(5 - items.size()))
2234 if (didSeeSyntaxViolation)
2235 syntaxViolation(hostBegin);
2236 for (auto item : items) {
2238 syntaxViolation(hostBegin);
// Anything other than exactly four components is reported as a violation.
2241 if (UNLIKELY(items.size() != 4))
2242 syntaxViolation(hostBegin);
// The last component supplies the low-order part; each earlier component is
// scaled by 256^(3 - index) and added in.
2244 IPv4Address ipv4 = items.takeLast();
2245 for (size_t counter = 0; counter < items.size(); ++counter)
2246 ipv4 += items[counter] * pow256(3 - counter);
// Parses one decimal component of an IPv4 address embedded in an IPv6 literal.
// |iterator| is taken by reference and stops at the end of input or at a '.'.
// NOTE(review): the declaration of |piece|, the failure returns, the range
// check and the final return are on lines missing from this listing.
2250 template<typename CharacterType>
2251 Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2253 if (iterator.atEnd())
2256 bool leadingZeros = false;
2257 size_t digitCount = 0;
2258 while (!iterator.atEnd()) {
// Only ASCII digits are accepted inside a component.
2259 if (!isASCIIDigit(*iterator))
// A '0' seen while the accumulated value is still zero is a leading zero.
2262 if (!piece && *iterator == '0') {
2265 leadingZeros = true;
// NOTE(review): this repeats the test just above and looks redundant.
2267 if (!piece && *iterator == '0')
2268 leadingZeros = true;
2269 piece = piece * 10 + *iterator - '0';
// Advance without reporting a syntax violation for this position.
2272 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2273 if (iterator.atEnd())
2275 if (*iterator == '.')
// A non-zero value written with a leading zero is tested here; the outcome is
// on an elided line.
2278 if (piece && leadingZeros)
// Parses the four-component IPv4 tail of an IPv6 literal and packs the
// components into one IPv4Address, first component in the highest byte.
// NOTE(review): the failure returns and the final return are on lines missing
// from this listing.
2283 template<typename CharacterType>
2284 Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2286 IPv4Address address = 0;
2287 for (size_t i = 0; i < 4; ++i) {
// Shift what has been collected up by one byte and add the new component.
2288 if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2289 address = (address << 8) + piece.value();
// End-of-input and '.' separator checks between components; the conditions
// that guard them are partly on elided lines.
2293 if (iterator.atEnd())
2295 if (*iterator != '.')
2297 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2298 } else if (!iterator.atEnd())
// By this point the whole range has been consumed.
2301 ASSERT(iterator.atEnd());
// Parses the text of an IPv6 literal into eight 16-bit pieces, handling "::"
// compression and an embedded IPv4 tail, and reports non-canonical spellings
// through syntaxViolation().
// NOTE(review): a large share of this function (declarations of |hostBegin|,
// |value| and |length|, most conditions, all returns) is on lines missing from
// this listing; the comments below cover only what is visible.
2305 template<typename CharacterType>
2306 Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2310 advance(c, hostBegin);
2314 IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
// Index of the next piece to fill.
2315 size_t piecePointer = 0;
// Piece index at which "::" occurred, if it occurred at all.
2316 Optional<size_t> compressPointer;
2319 advance(c, hostBegin);
2324 advance(c, hostBegin);
2326 compressPointer = piecePointer;
2329 while (!c.atEnd()) {
2330 if (piecePointer == 8)
// A compression marker that was already recorded is tested here.
2333 if (compressPointer)
2335 advance(c, hostBegin);
2337 compressPointer = piecePointer;
// Positions where an embedded IPv4 address is attempted: at piece 6, or
// earlier than that once a "::" has been recorded.
2340 if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2341 if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2342 if (compressPointer && piecePointer == 5)
2344 syntaxViolation(hostBegin);
// The 32-bit IPv4 value fills two pieces: high half first, then low half.
2345 address[piecePointer++] = ipv4Address.value() >> 16;
2346 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
// Hexadecimal piece: at most four digits are read.
2353 bool leadingZeros = false;
2354 for (; length < 4; length++) {
2357 if (!isASCIIHexDigit(*c))
// Uppercase hex digits are accepted but reported as a syntax violation.
2359 if (isASCIIUpper(*c))
2360 syntaxViolation(hostBegin);
2361 if (*c == '0' && !length)
2362 leadingZeros = true;
2363 value = value * 0x10 + toASCIIHexValue(*c);
2364 advance(c, hostBegin);
// A non-zero value with a leading zero, or zero written with several digits,
// is reported as a syntax violation.
2367 if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2368 syntaxViolation(hostBegin);
2370 address[piecePointer++] = value;
2373 if (piecePointer == 8 || *c != ':')
2375 advance(c, hostBegin);
// With a "::" present, the pieces parsed after it are moved toward the end of
// the array by swapping.
2381 if (compressPointer) {
2382 size_t swaps = piecePointer - compressPointer.value();
2385 std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2386 } else if (piecePointer != 8)
// Compare where the input placed "::" with the run findLongestZeroSequence()
// selects; a mismatch is reported as a syntax violation.
2389 Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
// presumably the +1 aligns this index with how compressPointer was recorded
// during parsing -- TODO confirm against the elided lines.
2390 if (possibleCompressPointer)
2391 possibleCompressPointer.value()++;
2392 if (UNLIKELY(compressPointer != possibleCompressPointer))
2393 syntaxViolation(hostBegin);
// Decodes "%XY" escapes in |input| (|length| bytes) and returns the resulting
// bytes. Each decoded escape is reported as a syntax violation at
// |iteratorForSyntaxViolationPosition|.
// NOTE(review): the branch condition on line 2406 and the return are missing
// from this listing; presumably line 2406 tests for a byte other than '%'.
2398 template<typename CharacterType>
2399 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2401 Vector<LChar, defaultInlineBufferSize> output;
// Capacity for |length| bytes is reserved before the unchecked appends below.
2402 output.reserveInitialCapacity(length);
2404 for (size_t i = 0; i < length; ++i) {
2405 uint8_t byte = input[i];
2407 output.uncheckedAppend(byte);
// Two more bytes must follow for an escape; the "length > 2" test keeps the
// unsigned subtraction "length - 2" from wrapping on short inputs.
2408 else if (length > 2 && i < length - 2) {
2409 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2410 syntaxViolation(iteratorForSyntaxViolationPosition);
2411 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
// Not a valid escape: the byte is copied through unchanged.
2414 output.uncheckedAppend(byte);
2416 output.uncheckedAppend(byte);
// Decodes "%XY" escapes in |input| (|length| bytes) and returns the resulting
// bytes. This overload takes no position argument and makes no
// syntaxViolation() call in the visible lines.
// NOTE(review): the branch condition on line 2428 and the return are missing
// from this listing; presumably line 2428 tests for a byte other than '%'.
2421 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2423 Vector<LChar, defaultInlineBufferSize> output;
// Capacity for |length| bytes is reserved before the unchecked appends below.
2424 output.reserveInitialCapacity(length);
2426 for (size_t i = 0; i < length; ++i) {
2427 uint8_t byte = input[i];
2429 output.uncheckedAppend(byte);
// Two more bytes must follow for an escape; the "length > 2" test keeps the
// unsigned subtraction "length - 2" from wrapping on short inputs.
2430 else if (length > 2 && i < length - 2) {
2431 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2432 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
// Not a valid escape: the byte is copied through unchanged.
2435 output.uncheckedAppend(byte);
2437 output.uncheckedAppend(byte);
2442 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2444 if (string.is8Bit())
2445 return charactersAreAllASCII(string.characters8(), string.length());
2446 return charactersAreAllASCII(string.characters16(), string.length());
// Converts |domain| to its ASCII form. An all-ASCII domain is lowercased
// directly; anything else goes through ICU's IDNA conversion. A spelling that
// differs from the result is reported at |iteratorForSyntaxViolationPosition|.
// NOTE(review): the return statements and the #endif lines are missing from
// this listing.
2449 template<typename CharacterType>
2450 Optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2452 Vector<LChar, defaultInlineBufferSize> ascii;
// ASCII-only input: lowercase each character, reporting any uppercase letter
// as a syntax violation.
2453 if (containsOnlyASCII(domain)) {
2454 size_t length = domain.length();
2455 if (domain.is8Bit()) {
2456 const LChar* characters = domain.characters8();
2457 ascii.reserveInitialCapacity(length);
2458 for (size_t i = 0; i < length; ++i) {
2459 if (UNLIKELY(isASCIIUpper(characters[i])))
2460 syntaxViolation(iteratorForSyntaxViolationPosition);
2461 ascii.uncheckedAppend(toASCIILower(characters[i]));
// Same handling for a string stored as 16-bit characters.
2464 const UChar* characters = domain.characters16();
2465 ascii.reserveInitialCapacity(length);
2466 for (size_t i = 0; i < length; ++i) {
2467 if (UNLIKELY(isASCIIUpper(characters[i])))
2468 syntaxViolation(iteratorForSyntaxViolationPosition);
2469 ascii.uncheckedAppend(toASCIILower(characters[i]));
// Non-ASCII input: convert through ICU into a fixed-size stack buffer.
2475 UChar hostnameBuffer[defaultInlineBufferSize];
2476 UErrorCode error = U_ZERO_ERROR;
// The deprecation warning for uidna_IDNToASCII is silenced around the call.
2478 #if COMPILER(GCC) || COMPILER(CLANG)
2479 #pragma GCC diagnostic push
2480 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2482 // FIXME: This should use uidna_openUTS46 / uidna_close instead
2483 int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
2484 #if COMPILER(GCC) || COMPILER(CLANG)
2485 #pragma GCC diagnostic pop
2487 ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
// The result is used only when ICU reported no error at all.
2489 if (error == U_ZERO_ERROR) {
2490 for (int32_t i = 0; i < numCharactersConverted; ++i) {
2491 ASSERT(isASCII(hostnameBuffer[i]));
2492 ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2494 ascii.append(hostnameBuffer, numCharactersConverted);
// A converted form that differs from the input is reported as a violation.
2495 if (domain != StringView(ascii.data(), ascii.size()))
2496 syntaxViolation(iteratorForSyntaxViolationPosition);
2500 // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
// Scans |asciiDomain| and tests each character with isInvalidDomainCharacter().
// NOTE(review): the return statements (lines 2508-2511) are missing from this
// listing; presumably it returns true on the first match and false otherwise.
2504 bool URLParser::hasInvalidDomainCharacter(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2506 for (size_t i = 0; i < asciiDomain.size(); ++i) {
2507 if (isInvalidDomainCharacter(asciiDomain[i]))
// Parses the ":port" suffix of a host. |iterator| must point at the ':' on
// entry. The port is written to the output buffer and m_portEnd is set.
// NOTE(review): the declaration of |port|, the return statements and some
// branch lines are missing from this listing.
2513 template<typename CharacterType>
2514 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2516 ASSERT(*iterator == ':');
// Keep the colon position; every syntax violation below is reported there.
2517 auto colonIterator = iterator;
2518 advance(iterator, colonIterator);
// A ':' with nothing after it: the port ends at the colon position and the
// stray colon is reported as a syntax violation.
2520 if (UNLIKELY(iterator.atEnd())) {
2521 m_url.m_portEnd = currentPosition(colonIterator);
2522 syntaxViolation(colonIterator);
2525 size_t digitCount = 0;
2526 bool leadingZeros = false;
2527 for (; !iterator.atEnd(); ++iterator) {
// Tabs/newlines inside the port are reported as a syntax violation.
2528 if (UNLIKELY(isTabOrNewline(*iterator))) {
2529 syntaxViolation(colonIterator);
2532 if (isASCIIDigit(*iterator)) {
// A '0' as the very first digit counts as a leading zero.
2533 if (*iterator == '0' && !digitCount)
2534 leadingZeros = true;
2536 port = port * 10 + *iterator - '0';
// Values above the 16-bit maximum are tested here; the outcome is elided.
2537 if (port > std::numeric_limits<uint16_t>::max())
// Non-canonical spellings: a non-zero port with a leading zero, or zero
// written with more than one digit.
2543 if (port && leadingZeros)
2544 syntaxViolation(colonIterator);
2546 if (!port && digitCount > 1)
2547 syntaxViolation(colonIterator);
2549 ASSERT(port == static_cast<uint16_t>(port));
// A port equal to the scheme's default is reported as a syntax violation.
2550 if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2551 syntaxViolation(colonIterator);
// presumably an elided "else" makes this the non-default-port path -- confirm.
2553 appendToASCIIBuffer(':');
2554 ASSERT(port <= std::numeric_limits<uint16_t>::max());
2555 appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2558 m_url.m_portEnd = currentPosition(iterator);
// Parses the host that starts at `iterator` and, on the paths where a port may follow, hands the
// remaining input to parsePort(). The end of the host is recorded in m_url.m_hostEnd; on the
// visible paths where no port is parsed, m_url.m_portEnd is set to the same position (the IPv6
// path at 2587 shows only the m_hostEnd store).
// Returns parsePort()'s result on the paths that reach it.
// NOTE(review): the embedded line numbers skip (e.g. 2565 -> 2567), so the statements guarded by
// several conditions below, and the remaining return statements, are not visible in this listing.
2562 template<typename CharacterType>
2563 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
// Empty input and a leading ':' are special-cased first (their consequences are on omitted lines).
2565 if (iterator.atEnd())
2567 if (*iterator == ':')
// Bracketed host: look for the closing ']' and parse the range from `iterator` to `ipv6End`
// with parseIPv6Host().
2569 if (*iterator == '[') {
2570 auto ipv6End = iterator;
// Loop runs until ']' or end of input; its body is on an omitted line (presumably advances ipv6End).
2571 while (!ipv6End.atEnd() && *ipv6End != ']')
2573 if (ipv6End.atEnd())
2575 if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2576 serializeIPv6(address.value());
2577 if (!ipv6End.atEnd()) {
// A ':' at this position means a port follows; parsePort() takes over from there.
2579 if (!ipv6End.atEnd() && *ipv6End == ':') {
2580 m_url.m_hostEnd = currentPosition(ipv6End);
2581 return parsePort(ipv6End);
// No port: the port end coincides with the host end.
2583 m_url.m_hostEnd = currentPosition(ipv6End);
2584 m_url.m_portEnd = m_url.m_hostEnd;
2587 m_url.m_hostEnd = currentPosition(ipv6End);
// Path taken when m_urlIsSpecial is false: code points are passed one by one through
// utf8PercentEncode<isInSimpleEncodeSet>; tabs and newlines are reported via syntaxViolation().
2593 if (!m_urlIsSpecial) {
2594 for (; !iterator.atEnd(); ++iterator) {
2595 if (UNLIKELY(isTabOrNewline(*iterator))) {
2596 syntaxViolation(iterator);
// The action taken on ':' is on an omitted line (presumably ends the host scan).
2599 if (*iterator == ':')
2601 utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2603 m_url.m_hostEnd = currentPosition(iterator);
// Input exhausted: there is no port, so the port end equals the current position.
2604 if (iterator.atEnd()) {
2605 m_url.m_portEnd = currentPosition(iterator);
2608 return parsePort(iterator);
// Path taken when m_hostHasPercentOrNonASCII is false.
2611 if (LIKELY(!m_hostHasPercentOrNonASCII)) {
// hostIterator keeps the host start while `iterator` scans forward.
2612 auto hostIterator = iterator;
2613 for (; !iterator.atEnd(); ++iterator) {
// The consequences of these three checks are on omitted lines.
2614 if (isTabOrNewline(*iterator))
2616 if (*iterator == ':')
2618 if (isInvalidDomainCharacter(*iterator))
// First try to interpret the scanned range as an IPv4 address.
2621 if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2622 serializeIPv4(address.value());
2623 m_url.m_hostEnd = currentPosition(iterator);
2624 if (iterator.atEnd()) {
2625 m_url.m_portEnd = currentPosition(iterator);
2628 return parsePort(iterator);
// Not IPv4: walk the host again from its start up to `iterator`, appending each character
// lowercased. Tabs/newlines and uppercase letters are reported via syntaxViolation().
2630 for (; hostIterator != iterator; ++hostIterator) {
2631 if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2632 syntaxViolation(hostIterator);
2635 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2636 syntaxViolation(hostIterator);
2637 appendToASCIIBuffer(toASCIILower(*hostIterator));
2639 m_url.m_hostEnd = currentPosition(iterator);
// The loop above ends with hostIterator equal to iterator, so this tests for remaining input.
2640 if (!hostIterator.atEnd())
2641 return parsePort(hostIterator);
2642 m_url.m_portEnd = currentPosition(iterator);
// Remaining path: gather the host's code points as UTF-8 bytes, percent-decode them, and run
// the result through domainToASCII(). Syntax violations on this path are reported at hostBegin.
2646 auto hostBegin = iterator;
2648 Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2649 for (; !iterator.atEnd(); ++iterator) {
2650 if (UNLIKELY(isTabOrNewline(*iterator))) {
2651 syntaxViolation(hostBegin);
2654 if (*iterator == ':')
2656 if (UNLIKELY(!isASCII(*iterator)))
2657 syntaxViolation(hostBegin);
// Encode the current code point as UTF-8 into a scratch buffer. `offset` is declared on an
// omitted line; after U8_APPEND it is the byte count passed to append() below.
2659 uint8_t buffer[U8_MAX_LENGTH];
2661 UBool error = false;
2662 U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2663 ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2664 // FIXME: Check error.
2665 utf8Encoded.append(buffer, offset);
2667 Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2668 String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
// Report a syntax violation when the String built from the bytes differs from the raw
// percent-decoded bytes.
2669 if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2670 syntaxViolation(hostBegin);
2671 auto asciiDomain = domainToASCII(domain, hostBegin);
// The consequence of a failed conversion or an invalid domain character is on an omitted line.
2672 if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2674 Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2675 const LChar* asciiDomainCharacters = asciiDomainValue.data();
// The converted domain may itself be an IPv4 address; try that before appending it verbatim.
2677 if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2678 serializeIPv4(address.value());
2679 m_url.m_hostEnd = currentPosition(iterator);
2680 if (iterator.atEnd()) {
2681 m_url.m_portEnd = currentPosition(iterator);
2684 return parsePort(iterator);
2687 appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2688 m_url.m_hostEnd = currentPosition(iterator);
2689 if (!iterator.atEnd())
2690 return parsePort(iterator);
2691 m_url.m_portEnd = currentPosition(iterator);
// Decodes one form-urlencoded component: converts `input` to UTF-8 using StrictConversion,
// percent-decodes those bytes, and builds the returned String from the result with
// String::fromUTF8(). '+' characters are not touched here.
// NOTE(review): lines 2698-2699 are absent from this listing; presumably they handle a failed
// UTF-8 conversion, since callers test the returned Optional — confirm against the full file.
2695 Optional<String> URLParser::formURLDecode(StringView input)
2697 auto utf8 = input.utf8(StrictConversion);
2700 auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2701 return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2704 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2706 Vector<StringView> sequences = input.split('&');
2708 URLEncodedForm output;
2709 for (auto& bytes : sequences) {
2710 auto valueStart = bytes.find('=');
2711 if (valueStart == notFound) {
2712 if (auto name = formURLDecode(bytes))
2713 output.append({name.value().replace('+', 0x20), emptyString()});
2715 auto name = formURLDecode(bytes.substring(0, valueStart));
2716 auto value = formURLDecode(bytes.substring(valueStart + 1));
2718 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
// Appends the form-urlencoded serialization of `input` to `output`, working byte by byte on the
// string's UTF-8 (StrictConversion) encoding. One branch emits 0x2B ('+'), bytes in the
// pass-through set are appended unchanged, and every other byte goes through percentEncodeByte().
// NOTE(review): the condition lines 2730, 2733-2734 and 2737 are absent from this listing, so the
// test that selects the '+' branch (presumably a space byte) and part of the pass-through set are
// not visible — confirm against the full file.
2724 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2726 auto utf8 = input.utf8(StrictConversion);
2727 const char* data = utf8.data();
2728 for (size_t i = 0; i < utf8.length(); ++i) {
2729 const char byte = data[i];
2731 output.append(0x2B);
// Visible pass-through bytes: 0x2A ('*'), digits 0x30-0x39, 'A'-'Z' 0x41-0x5A, 'a'-'z' 0x61-0x7A.
// All of these are below 0x80, so bytes of multi-byte UTF-8 sequences match none of them.
2732 else if (byte == 0x2A
2735 || (byte >= 0x30 && byte <= 0x39)
2736 || (byte >= 0x41 && byte <= 0x5A)
2738 || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2739 output.append(byte);
2741 percentEncodeByte(byte, output);
// Serializes a list of name/value tuples into a single String. For each tuple, tuple.first and
// then tuple.second are run through serializeURLEncodedForm() into one shared byte vector, which
// the returned String adopts.
// NOTE(review): lines 2750 and 2752 are absent from this listing; presumably they append the
// '&' separator between tuples and the '=' between name and value — confirm against the full file.
2745 String URLParser::serialize(const URLEncodedForm& tuples)
2747 Vector<LChar> output;
2748 for (auto& tuple : tuples) {
// True for every tuple after the first one that produced output.
2749 if (!output.isEmpty())
2751 serializeURLEncodedForm(tuple.first, output);
2753 serializeURLEncodedForm(tuple.second, output);
2755 return String::adopt(WTFMove(output));
// Returns true when `a` and `b` have equal m_string, m_isValid and m_protocolIsInHTTPFamily and
// equal values for every component offset compared in the return expression below.
2758 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2760 // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2761 // but once we get rid of URL::parse its value should be tested.
// Debug dump of both URLs. URL_PARSER_LOG expands to nothing unless URL_PARSER_DEBUGGING is
// non-zero (it is defined as 0 near the top of this file).
// NOTE(review): most argument lines of this call are absent from this listing.
2762 URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2764 a.m_protocolIsInHTTPFamily,
2771 a.m_pathAfterLastSlash,
2775 a.m_string.utf8().data(),
2777 b.m_protocolIsInHTTPFamily,
2784 b.m_pathAfterLastSlash,
2788 b.m_string.utf8().data());
// Field-by-field comparison; the string comparison comes first.
2790 return a.m_string == b.m_string
2791 && a.m_isValid == b.m_isValid
2792 && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2793 && a.m_schemeEnd == b.m_schemeEnd
2794 && a.m_userStart == b.m_userStart
2795 && a.m_userEnd == b.m_userEnd
2796 && a.m_passwordEnd == b.m_passwordEnd
2797 && a.m_hostEnd == b.m_hostEnd
2798 && a.m_portEnd == b.m_portEnd
2799 && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2800 && a.m_pathEnd == b.m_pathEnd
2801 && a.m_queryEnd == b.m_queryEnd
2802 && a.m_fragmentEnd == b.m_fragmentEnd;
2805 bool URLParser::internalValuesConsistent(const URL& url)
2807 return url.m_schemeEnd <= url.m_userStart
2808 && url.m_userStart <= url.m_userEnd
2809 && url.m_userEnd <= url.m_passwordEnd
2810 && url.m_passwordEnd <= url.m_hostEnd
2811 && url.m_hostEnd <= url.m_portEnd
2812 && url.m_portEnd <= url.m_pathAfterLastSlash
2813 && url.m_pathAfterLastSlash <= url.m_pathEnd
2814 && url.m_pathEnd <= url.m_queryEnd
2815 && url.m_queryEnd <= url.m_fragmentEnd
2816 && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2817 // FIXME: Why do we even store m_fragmentEnd?
2818 // It should be able to be deduced from m_isValid and m_string.length() to save memory.
// File-local flag written by URLParser::setEnabled() and read by URLParser::enabled(); starts out true.
2821 static bool urlParserEnabled = true;
// Stores `enabled` in the file-local urlParserEnabled flag.
2823 void URLParser::setEnabled(bool enabled)
2825 urlParserEnabled = enabled;
// Returns the current value of the file-local urlParserEnabled flag.
2828 bool URLParser::enabled()
2830 return urlParserEnabled;
2833 } // namespace WebCore