2 * Copyright (C) 2016 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLParser.h"
30 #include "RuntimeApplicationChecks.h"
33 #include <unicode/uidna.h>
34 #include <unicode/utypes.h>
// Compile-time switch for verbose parser logging; 0 disables it.
38 #define URL_PARSER_DEBUGGING 0
40 #if URL_PARSER_DEBUGGING
// Logging enabled: forward the arguments to LOG on the URLParser channel.
41 #define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__)
// This definition expands to nothing. Presumably it sits in the #else branch;
// the #else/#endif lines are not visible in this excerpt.
43 #define URL_PARSER_LOG(...)
// Iterator over the code points stored in a run of CharacterType code units.
// operator* and operator++ are only declared here; they are defined separately
// for each character type.
46 template<typename CharacterType>
47 class CodePointIterator {
// Default construction leaves both pointers null (see the member initializers).
49 ALWAYS_INLINE CodePointIterator() { }
// Constructs from raw pointers (the constructor body is not visible in this excerpt).
50 ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
// Constructs the sub-range that starts at `begin`'s current position and stops
// at `end`'s current position; `end` must not be before `begin`.
56 ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57 : CodePointIterator(begin.m_begin, end.m_begin)
59 ASSERT(end.m_begin >= begin.m_begin);
// Returns the code point at the current position.
62 ALWAYS_INLINE UChar32 operator*() const;
// Moves to the next code point.
63 ALWAYS_INLINE CodePointIterator& operator++();
// Two iterators are equal only when both the current position and the end match.
65 ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
67 return m_begin == other.m_begin
68 && m_end == other.m_end;
70 ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
// Copy assignment (only the m_begin copy is visible in this excerpt).
72 ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other)
74 m_begin = other.m_begin;
// True once the current position has reached the end. The position is asserted
// never to be past the end.
79 ALWAYS_INLINE bool atEnd() const
81 ASSERT(m_begin <= m_end);
82 return m_begin >= m_end;
// Number of code units between `reference` and the current position;
// `reference` must not be after the current position.
85 ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
87 ASSERT(m_begin >= reference);
88 return m_begin - reference;
// Same as above, measured from another iterator's current position.
91 ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
93 return codeUnitsSince(other.m_begin);
// Current position.
97 const CharacterType* m_begin { nullptr };
// End of the range; atEnd() is true when m_begin reaches it.
98 const CharacterType* m_end { nullptr };
// Dereference for LChar input; the body is not visible in this excerpt.
102 ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
// Increment for LChar input; the body is not visible in this excerpt.
109 ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
// Dereference for UChar input: ICU's U16_GET reads the code point at offset 0
// of the remaining (m_end - m_begin) code units into c. The declaration of c
// and the return statement are not visible in this excerpt.
116 ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
120 U16_GET(m_begin, 0, 0, m_end - m_begin, c);
// Increment for UChar input: ICU's U16_FWD_1 moves the index i past one code
// point, bounded by the number of remaining code units. The declaration of i
// and the statements that apply it are not visible in this excerpt.
125 ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
128 size_t length = m_end - m_begin;
129 U16_FWD_1(m_begin, i, length);
// Appends `codePoint` to `destination` as UTF-16: a BMP code point becomes a
// single code unit, anything else a lead/trail surrogate pair.
134 ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
136 if (U_IS_BMP(codePoint)) {
137 destination.append(static_cast<UChar>(codePoint));
// Reserve room for both surrogates first so the two unchecked appends are safe.
140 destination.reserveCapacity(destination.size() + 2);
141 destination.uncheckedAppend(U16_LEAD(codePoint));
142 destination.uncheckedAppend(U16_TRAIL(codePoint));
// Flags that are OR-ed together in the entries of characterClassTable.
// Only one enumerator is visible in this excerpt; the others used by the table
// and its helpers are presumably declared on the lines not shown.
145 enum URLCharacterClass {
150 SlashQuestionOrHash = 0x10,
// Lookup table with one entry per 8-bit character value. Each entry is the
// combination of URLCharacterClass flags for that character; the trailing
// comment on each row names the character. The predicate helpers that follow
// test individual flags. Only a subset of the 256 rows is visible in this excerpt.
154 static const uint8_t characterClassTable[256] = {
155 UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
156 UserInfo | Default | QueryPercent, // 0x1
157 UserInfo | Default | QueryPercent, // 0x2
158 UserInfo | Default | QueryPercent, // 0x3
159 UserInfo | Default | QueryPercent, // 0x4
160 UserInfo | Default | QueryPercent, // 0x5
161 UserInfo | Default | QueryPercent, // 0x6
162 UserInfo | Default | QueryPercent, // 0x7
163 UserInfo | Default | QueryPercent, // 0x8
164 UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
165 UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
166 UserInfo | Default | QueryPercent, // 0xB
167 UserInfo | Default | QueryPercent, // 0xC
168 UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
169 UserInfo | Default | QueryPercent, // 0xE
170 UserInfo | Default | QueryPercent, // 0xF
171 UserInfo | Default | QueryPercent, // 0x10
172 UserInfo | Default | QueryPercent, // 0x11
173 UserInfo | Default | QueryPercent, // 0x12
174 UserInfo | Default | QueryPercent, // 0x13
175 UserInfo | Default | QueryPercent, // 0x14
176 UserInfo | Default | QueryPercent, // 0x15
177 UserInfo | Default | QueryPercent, // 0x16
178 UserInfo | Default | QueryPercent, // 0x17
179 UserInfo | Default | QueryPercent, // 0x18
180 UserInfo | Default | QueryPercent, // 0x19
181 UserInfo | Default | QueryPercent, // 0x1A
182 UserInfo | Default | QueryPercent, // 0x1B
183 UserInfo | Default | QueryPercent, // 0x1C
184 UserInfo | Default | QueryPercent, // 0x1D
185 UserInfo | Default | QueryPercent, // 0x1E
186 UserInfo | Default | QueryPercent, // 0x1F
187 UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
189 UserInfo | Default | QueryPercent, // '"'
190 UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
192 ForbiddenHost, // '%'
202 UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
213 UserInfo | ForbiddenHost, // ':'
215 UserInfo | Default | QueryPercent, // '<'
217 UserInfo | Default | QueryPercent, // '>'
218 UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
219 UserInfo | ForbiddenHost, // '@'
246 UserInfo | ForbiddenHost, // '['
247 UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
248 UserInfo | ForbiddenHost, // ']'
251 UserInfo | Default, // '`'
278 UserInfo | Default, // '{'
280 UserInfo | Default, // '}'
282 QueryPercent, // 0x7F
283 QueryPercent, // 0x80
284 QueryPercent, // 0x81
285 QueryPercent, // 0x82
286 QueryPercent, // 0x83
287 QueryPercent, // 0x84
288 QueryPercent, // 0x85
289 QueryPercent, // 0x86
290 QueryPercent, // 0x87
291 QueryPercent, // 0x88
292 QueryPercent, // 0x89
293 QueryPercent, // 0x8A
294 QueryPercent, // 0x8B
295 QueryPercent, // 0x8C
296 QueryPercent, // 0x8D
297 QueryPercent, // 0x8E
298 QueryPercent, // 0x8F
299 QueryPercent, // 0x90
300 QueryPercent, // 0x91
301 QueryPercent, // 0x92
302 QueryPercent, // 0x93
303 QueryPercent, // 0x94
304 QueryPercent, // 0x95
305 QueryPercent, // 0x96
306 QueryPercent, // 0x97
307 QueryPercent, // 0x98
308 QueryPercent, // 0x99
309 QueryPercent, // 0x9A
310 QueryPercent, // 0x9B
311 QueryPercent, // 0x9C
312 QueryPercent, // 0x9D
313 QueryPercent, // 0x9E
314 QueryPercent, // 0x9F
315 QueryPercent, // 0xA0
316 QueryPercent, // 0xA1
317 QueryPercent, // 0xA2
318 QueryPercent, // 0xA3
319 QueryPercent, // 0xA4
320 QueryPercent, // 0xA5
321 QueryPercent, // 0xA6
322 QueryPercent, // 0xA7
323 QueryPercent, // 0xA8
324 QueryPercent, // 0xA9
325 QueryPercent, // 0xAA
326 QueryPercent, // 0xAB
327 QueryPercent, // 0xAC
328 QueryPercent, // 0xAD
329 QueryPercent, // 0xAE
330 QueryPercent, // 0xAF
331 QueryPercent, // 0xB0
332 QueryPercent, // 0xB1
333 QueryPercent, // 0xB2
334 QueryPercent, // 0xB3
335 QueryPercent, // 0xB4
336 QueryPercent, // 0xB5
337 QueryPercent, // 0xB6
338 QueryPercent, // 0xB7
339 QueryPercent, // 0xB8
340 QueryPercent, // 0xB9
341 QueryPercent, // 0xBA
342 QueryPercent, // 0xBB
343 QueryPercent, // 0xBC
344 QueryPercent, // 0xBD
345 QueryPercent, // 0xBE
346 QueryPercent, // 0xBF
347 QueryPercent, // 0xC0
348 QueryPercent, // 0xC1
349 QueryPercent, // 0xC2
350 QueryPercent, // 0xC3
351 QueryPercent, // 0xC4
352 QueryPercent, // 0xC5
353 QueryPercent, // 0xC6
354 QueryPercent, // 0xC7
355 QueryPercent, // 0xC8
356 QueryPercent, // 0xC9
357 QueryPercent, // 0xCA
358 QueryPercent, // 0xCB
359 QueryPercent, // 0xCC
360 QueryPercent, // 0xCD
361 QueryPercent, // 0xCE
362 QueryPercent, // 0xCF
363 QueryPercent, // 0xD0
364 QueryPercent, // 0xD1
365 QueryPercent, // 0xD2
366 QueryPercent, // 0xD3
367 QueryPercent, // 0xD4
368 QueryPercent, // 0xD5
369 QueryPercent, // 0xD6
370 QueryPercent, // 0xD7
371 QueryPercent, // 0xD8
372 QueryPercent, // 0xD9
373 QueryPercent, // 0xDA
374 QueryPercent, // 0xDB
375 QueryPercent, // 0xDC
376 QueryPercent, // 0xDD
377 QueryPercent, // 0xDE
378 QueryPercent, // 0xDF
379 QueryPercent, // 0xE0
380 QueryPercent, // 0xE1
381 QueryPercent, // 0xE2
382 QueryPercent, // 0xE3
383 QueryPercent, // 0xE4
384 QueryPercent, // 0xE5
385 QueryPercent, // 0xE6
386 QueryPercent, // 0xE7
387 QueryPercent, // 0xE8
388 QueryPercent, // 0xE9
389 QueryPercent, // 0xEA
390 QueryPercent, // 0xEB
391 QueryPercent, // 0xEC
392 QueryPercent, // 0xED
393 QueryPercent, // 0xEE
394 QueryPercent, // 0xEF
395 QueryPercent, // 0xF0
396 QueryPercent, // 0xF1
397 QueryPercent, // 0xF2
398 QueryPercent, // 0xF3
399 QueryPercent, // 0xF4
400 QueryPercent, // 0xF5
401 QueryPercent, // 0xF6
402 QueryPercent, // 0xF7
403 QueryPercent, // 0xF8
404 QueryPercent, // 0xF9
405 QueryPercent, // 0xFA
406 QueryPercent, // 0xFB
407 QueryPercent, // 0xFC
408 QueryPercent, // 0xFD
409 QueryPercent, // 0xFE
410 QueryPercent, // 0xFF
// True for character values up to and including 0x1F.
413 template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
// True for character values up to and including 0x20 (the space character).
414 template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
// True for values in the range 0x9..0xD other than 0xB and 0xC, i.e. tab (0x9),
// line feed (0xA) and carriage return (0xD).
415 template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
// True for values above 0x7E and for values accepted by isC0Control.
416 template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
// True for values above 0x7E, or when the table entry has the Default flag.
// The range test runs first, so the table is only indexed with values <= 0x7E.
417 template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
// True for values above 0x7E, or when the table entry has the UserInfo flag.
// The range test runs first, so the table is only indexed with values <= 0x7E.
418 template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
// True for '%' and for any character that isASCII() rejects.
419 template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
// True when the table entry has the SlashQuestionOrHash flag. The
// `<= '\\'` test runs first, so larger values never index the table.
420 template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
// True when the table entry has the ValidScheme flag. The `<= 'z'` test runs
// first, so larger values never index the table.
421 template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
// True when the table entry has the ForbiddenHost flag. The `<= ']'` test runs
// first, so larger values never index the table.
422 template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
// True when the table entry for `byte` has the QueryPercent flag. Any uint8_t
// value is a valid index into the 256-entry table.
423 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
// Steps `iterator` forward, looping while it is not at the end and points at a
// tab or newline. When the reportSyntaxViolation template argument is Yes, each
// such character is reported through syntaxViolation() at
// iteratorForSyntaxViolationPosition.
// NOTE(review): the statements that actually increment the iterator are not
// visible in this excerpt.
425 template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
426 ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
429 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
430 if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
431 syntaxViolation(iteratorForSyntaxViolationPosition);
// Works on a copy of `iterator` (it is passed by value): calls advance() up to
// twice without reporting violations and finally returns whether the copy is
// at the end. The values returned by the two early atEnd() checks are not
// visible in this excerpt.
436 template<typename CharacterType>
437 bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
439 if (iterator.atEnd())
441 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
442 if (iterator.atEnd())
444 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
445 return iterator.atEnd();
// Tests for a Windows drive letter at `iterator`: an ASCII letter, then (after
// an advance() that does not report violations) a ':' or '|'. The iterator is
// passed by value, so the caller's position is not changed. The return
// statements are not visible in this excerpt.
448 template<typename CharacterType>
449 ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
451 if (iterator.atEnd() || !isASCIIAlpha(*iterator))
453 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
454 if (iterator.atEnd())
456 if (*iterator == ':')
458 if (UNLIKELY(*iterator == '|'))
// Appends one ASCII code point to m_asciiBuffer, but only once a syntax
// violation has been recorded; before that nothing is written.
463 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
465 ASSERT(isASCII(codePoint));
466 if (UNLIKELY(m_didSeeSyntaxViolation))
467 m_asciiBuffer.append(codePoint);
// Appends `length` characters to m_asciiBuffer, but only once a syntax
// violation has been recorded; before that nothing is written.
470 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
472 if (UNLIKELY(m_didSeeSyntaxViolation))
473 m_asciiBuffer.append(characters, length);
// Appends the drive letter at `iterator`, followed by ':', to the ASCII buffer.
// Precondition (asserted): isWindowsDriveLetter(iterator).
// NOTE(review): the statements that move the iterator are not visible in this excerpt.
476 template<typename CharacterType>
477 void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
479 ASSERT(isWindowsDriveLetter(iterator));
480 appendToASCIIBuffer(*iterator);
482 ASSERT(!iterator.atEnd());
483 ASSERT(*iterator == ':' || *iterator == '|');
// A '|' separator is reported as a syntax violation; ':' is written in either case.
484 if (*iterator == '|')
485 syntaxViolation(iterator);
486 appendToASCIIBuffer(':');
// When `base` uses the "file" protocol, looks for a Windows drive letter in
// base's string starting one code unit after m_portEnd and, if one is found,
// appends it via appendWindowsDriveLetter(). 8-bit and 16-bit base strings are
// handled by separate branches. The return statements are not visible in this
// excerpt.
490 bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
492 if (base.protocolIs("file")) {
// Checked in release builds as well, ahead of the `m_portEnd + 1` offsets used below.
493 RELEASE_ASSERT(base.m_portEnd < base.m_string.length());
494 if (base.m_string.is8Bit()) {
495 const LChar* begin = base.m_string.characters8();
496 CodePointIterator<LChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
497 if (isWindowsDriveLetter(c)) {
498 appendWindowsDriveLetter(c);
502 const UChar* begin = base.m_string.characters16();
503 CodePointIterator<UChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length());
504 if (isWindowsDriveLetter(c)) {
505 appendWindowsDriveLetter(c);
// Operates on a copy of `iterator`. Takes an early exit when the input is not
// a Windows drive letter or when the end is reached at one of three successive
// checks; otherwise returns whether the code point finally reached is NOT a
// slash, question mark or hash.
// NOTE(review): the early-return values and the statements that move the
// iterator between the checks are not visible in this excerpt.
513 template<typename CharacterType>
514 bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
516 if (!isWindowsDriveLetter(iterator))
518 if (iterator.atEnd())
521 if (iterator.atEnd())
524 if (iterator.atEnd())
526 return !isSlashQuestionOrHash(*iterator);
// Appends the two hex digits of `byte` to `buffer`, upper nibble first.
// NOTE(review): a line is missing from this excerpt before the two appends;
// presumably it appends the leading '%' — confirm against the full file.
529 static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
532 buffer.append(upperNibbleToASCIIHexDigit(byte));
533 buffer.append(lowerNibbleToASCIIHexDigit(byte));
// Appends '%' followed by the two hex digits of `byte` (upper nibble first) to
// the ASCII buffer. Asserts that a syntax violation has already been recorded,
// which is the state in which appendToASCIIBuffer() actually writes.
536 void URLParser::percentEncodeByte(uint8_t byte)
538 ASSERT(m_didSeeSyntaxViolation);
539 appendToASCIIBuffer('%');
540 appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
541 appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
// Percent-encoded form of the bytes EF BF BD, the UTF-8 encoding of U+FFFD
// (the Unicode replacement character).
544 const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the string above without its terminating NUL.
545 const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
// Appends the code point at `iterator` to the ASCII buffer.
// - ASCII code points accepted by the isInCodeSet template argument are
//   percent-encoded and reported as a syntax violation; the remaining visible
//   ASCII path appends the code point unchanged.
// - Non-ASCII code points are always reported as a syntax violation and
//   written as percent-encoded UTF-8 bytes.
// Precondition (asserted): the iterator is not at the end.
547 template<bool(*isInCodeSet)(UChar32), typename CharacterType>
548 ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
550 ASSERT(!iterator.atEnd());
551 UChar32 codePoint = *iterator;
552 if (LIKELY(isASCII(codePoint))) {
553 if (UNLIKELY(isInCodeSet(codePoint))) {
554 syntaxViolation(iterator);
555 percentEncodeByte(codePoint);
557 appendToASCIIBuffer(codePoint);
560 ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
561 syntaxViolation(iterator);
// Code points rejected by U_IS_UNICODE_CHAR are written as the percent-encoded
// replacement character instead.
563 if (!U_IS_UNICODE_CHAR(codePoint)) {
564 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
// Convert to UTF-8 with ICU's U8_APPEND_UNSAFE, then percent-encode every byte
// written (the declaration of `offset` is not visible in this excerpt).
568 uint8_t buffer[U8_MAX_LENGTH];
570 U8_APPEND_UNSAFE(buffer, offset, codePoint);
571 for (int32_t i = 0; i < offset; ++i)
572 percentEncodeByte(buffer[i]);
// Query-component counterpart of utf8PercentEncode(): appends the code point at
// `iterator` to the ASCII buffer, using shouldPercentEncodeQueryByte() to
// decide which bytes are percent-encoded. Non-ASCII code points are reported
// as a syntax violation and converted to UTF-8 first.
// Precondition (asserted): the iterator is not at the end.
575 template<typename CharacterType>
576 ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
578 ASSERT(!iterator.atEnd());
579 UChar32 codePoint = *iterator;
580 if (LIKELY(isASCII(codePoint))) {
581 if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) {
582 syntaxViolation(iterator);
583 percentEncodeByte(codePoint);
585 appendToASCIIBuffer(codePoint);
589 syntaxViolation(iterator);
// Code points rejected by U_IS_UNICODE_CHAR are written as the percent-encoded
// replacement character instead.
591 if (!U_IS_UNICODE_CHAR(codePoint)) {
592 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
// Convert to UTF-8 with ICU's U8_APPEND_UNSAFE and handle each byte on its own
// (the declaration of `offset` is not visible in this excerpt).
596 uint8_t buffer[U8_MAX_LENGTH];
598 U8_APPEND_UNSAFE(buffer, offset, codePoint);
599 for (int32_t i = 0; i < offset; ++i) {
600 auto byte = buffer[i];
601 if (shouldPercentEncodeQueryByte(byte))
602 percentEncodeByte(byte);
604 appendToASCIIBuffer(byte);
// Encodes the query text in `source` with `encoding` and appends the encoded
// bytes to the ASCII buffer, percent-encoding those for which
// shouldPercentEncodeQueryByte() is true. `iterator` points into the original
// input and is compared with the encoded bytes so that the first difference
// can be reported as a syntax violation.
// NOTE(review): several lines are missing from this excerpt, including the
// statements that move `iterator`, the declaration of `i`, and the
// break/return statements.
608 template<typename CharacterType>
609 void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
611 // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
612 CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
613 const char* data = encoded.data();
614 size_t length = encoded.length();
// Taken when (length == 0) equals (!iterator.atEnd()): the encoded output is
// empty while input remains, or the output is non-empty while the iterator is
// already at the end.
616 if (!length == !iterator.atEnd()) {
617 syntaxViolation(iterator);
// First pass: walk the encoded bytes alongside the input. A byte that differs
// from the input, or that needs percent-encoding, is reported as a syntax violation.
622 for (; i < length; ++i) {
623 ASSERT(!iterator.atEnd());
624 uint8_t byte = data[i];
625 if (UNLIKELY(byte != *iterator)) {
626 syntaxViolation(iterator);
629 if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
630 syntaxViolation(iterator);
633 appendToASCIIBuffer(byte);
// Loops while the input is at a tab or newline (the loop body is not visible in
// this excerpt).
636 while (!iterator.atEnd() && isTabOrNewline(*iterator))
638 ASSERT((i == length) == iterator.atEnd());
// Second pass: handles whatever bytes remain; a syntax violation is asserted to
// have been recorded by now.
639 for (; i < length; ++i) {
640 ASSERT(m_didSeeSyntaxViolation);
641 uint8_t byte = data[i];
642 if (shouldPercentEncodeQueryByte(byte))
643 percentEncodeByte(byte);
645 appendToASCIIBuffer(byte);
// Maps `scheme` to a default port number, returned as std::optional<uint16_t>.
// The constants below list the ports known to this function.
// NOTE(review): most of the dispatch on the scheme's length and characters,
// and the value returned for unknown schemes, are not visible in this excerpt.
649 std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
651 static const uint16_t ftpPort = 21;
652 static const uint16_t gopherPort = 70;
653 static const uint16_t httpPort = 80;
654 static const uint16_t httpsPort = 443;
655 static const uint16_t wsPort = 80;
656 static const uint16_t wssPort = 443;
658 auto length = scheme.length();
665 if (scheme[1] == 's')
// Classifies `scheme` as one of the Scheme enumerators. Every path that does
// not recognise the text returns Scheme::NonSpecial.
// NOTE(review): the comparisons that select each result are mostly not visible
// in this excerpt.
725 ALWAYS_INLINE static Scheme scheme(StringView scheme)
727 auto length = scheme.length();
729 return Scheme::NonSpecial;
737 return Scheme::NonSpecial;
743 return Scheme::NonSpecial;
745 return Scheme::NonSpecial;
754 return Scheme::Gopher;
755 return Scheme::NonSpecial;
763 return Scheme::NonSpecial;
769 return Scheme::HTTPS;
770 return Scheme::NonSpecial;
772 return Scheme::NonSpecial;
777 if (scheme[1] == 's')
779 return Scheme::NonSpecial;
784 return Scheme::NonSpecial;
786 return Scheme::NonSpecial;
789 return Scheme::NonSpecial;
// Checks `scheme` and, when it passes, returns it converted to ASCII lowercase.
// The visible checks are: not empty, first character an ASCII letter, and each
// later character tested against ASCII alphanumerics, '+', '-' and '.'.
// NOTE(review): the statements executed when a check fails (presumably
// returning std::nullopt) are not visible in this excerpt.
793 std::optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
795 if (scheme.isEmpty())
798 if (!isASCIIAlpha(scheme[0]))
801 for (size_t i = 1; i < scheme.length(); ++i) {
802 if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
807 return scheme.convertToASCIILowercase();
// True when scheme() classifies `schemeArg` as anything other than Scheme::NonSpecial.
810 bool URLParser::isSpecialScheme(const String& schemeArg)
812 return scheme(schemeArg) != Scheme::NonSpecial;
// Names a boundary inside a parsed URL. The enumerators are not visible in this
// excerpt; urlLengthUntilPart() shows the values that are handled.
815 enum class URLParser::URLPart {
// Returns the offset stored in `url` for the requested `part`, i.e. the length
// of url's string up to that boundary. Falling out of the switch is treated as
// a programming error (ASSERT_NOT_REACHED).
828 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
831 case URLPart::FragmentEnd:
832 return url.m_fragmentEnd;
833 case URLPart::QueryEnd:
834 return url.m_queryEnd;
835 case URLPart::PathEnd:
836 return url.m_pathEnd;
837 case URLPart::PathAfterLastSlash:
838 return url.m_pathAfterLastSlash;
839 case URLPart::PortEnd:
840 return url.m_portEnd;
841 case URLPart::HostEnd:
842 return url.m_hostEnd;
843 case URLPart::PasswordEnd:
844 return url.m_passwordEnd;
845 case URLPart::UserEnd:
846 return url.m_userEnd;
847 case URLPart::UserStart:
848 return url.m_userStart;
849 case URLPart::SchemeEnd:
850 return url.m_schemeEnd;
852 ASSERT_NOT_REACHED();
// Appends the first `length` code units of `string` to the ASCII buffer, which
// is asserted to be empty beforehand. `length` must not exceed the string's
// length; this is checked in release builds too.
// NOTE(review): the condition that chooses between the 8-bit and 16-bit paths
// is not visible in this excerpt.
856 void URLParser::copyASCIIStringUntil(const String& string, size_t length)
858 RELEASE_ASSERT(length <= string.length());
861 ASSERT(m_asciiBuffer.isEmpty());
// 8-bit storage: append the characters in one call.
863 appendToASCIIBuffer(string.characters8(), length);
// 16-bit storage: append one code unit at a time; each is asserted to be ASCII.
865 const UChar* characters = string.characters16();
866 for (size_t i = 0; i < length; ++i) {
867 UChar c = characters[i];
868 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
869 appendToASCIIBuffer(c);
// Starts the output from `base`: records a syntax violation at `iterator`,
// replaces the ASCII buffer contents with base's string up to `part`, and
// copies the matching offsets from `base` into m_url. Finally it classifies the
// copied scheme and sets m_urlIsSpecial and `isUTF8Encoding` from the result.
// URLPart::FragmentEnd is not a valid argument (RELEASE_ASSERT_NOT_REACHED).
// NOTE(review): the lines between the case labels are not visible in this
// excerpt; presumably each case falls through to the next so that every
// earlier offset is copied as well — confirm against the full file.
874 template<typename CharacterType>
875 void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding)
877 syntaxViolation(iterator);
// Discard whatever syntaxViolation() copied from the input; the output now
// begins with text taken from `base`.
879 m_asciiBuffer.clear();
880 copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
882 case URLPart::FragmentEnd:
883 RELEASE_ASSERT_NOT_REACHED();
884 case URLPart::QueryEnd:
885 m_url.m_queryEnd = base.m_queryEnd;
887 case URLPart::PathEnd:
888 m_url.m_pathEnd = base.m_pathEnd;
890 case URLPart::PathAfterLastSlash:
891 m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
893 case URLPart::PortEnd:
894 m_url.m_portEnd = base.m_portEnd;
896 case URLPart::HostEnd:
897 m_url.m_hostEnd = base.m_hostEnd;
899 case URLPart::PasswordEnd:
900 m_url.m_passwordEnd = base.m_passwordEnd;
902 case URLPart::UserEnd:
903 m_url.m_userEnd = base.m_userEnd;
905 case URLPart::UserStart:
906 m_url.m_userStart = base.m_userStart;
908 case URLPart::SchemeEnd:
909 m_url.m_isValid = base.m_isValid;
910 m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
911 m_url.m_schemeEnd = base.m_schemeEnd;
// Classify the scheme text that was just copied into the ASCII buffer.
913 switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
916 isUTF8Encoding = true;
917 m_urlIsSpecial = true;
926 m_urlIsSpecial = true;
928 case Scheme::NonSpecial:
929 m_urlIsSpecial = false;
930 isUTF8Encoding = true;
933 ASSERT_NOT_REACHED();
// The two hexadecimal digits of '.' (0x2E), with the letter in lowercase. The
// path-segment helpers compare the second digit through toASCIILower().
936 static const char dotASCIICode[2] = {'2', 'e'};
// Tests whether the input at `c` is a single-dot path segment, i.e. one that
// is followed by the end of input or by a slash, question mark or hash. One
// visible path matches a spelling that uses the dotASCIICode digits. Works on
// a copy of the iterator and skips tabs/newlines without reporting them.
// NOTE(review): the conditions that select each path and the remaining return
// statements are not visible in this excerpt.
938 template<typename CharacterType>
939 ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
944 advance<CharacterType, ReportSyntaxViolation::No>(c);
945 return c.atEnd() || isSlashQuestionOrHash(*c);
949 advance<CharacterType, ReportSyntaxViolation::No>(c);
950 if (c.atEnd() || *c != dotASCIICode[0])
952 advance<CharacterType, ReportSyntaxViolation::No>(c);
955 if (toASCIILower(*c) == dotASCIICode[1]) {
956 advance<CharacterType, ReportSyntaxViolation::No>(c);
957 return c.atEnd() || isSlashQuestionOrHash(*c);
// Tests whether the input at `c` is a double-dot path segment: after matching
// one dot (on one visible path via the dotASCIICode digits) the remainder is
// checked with isSingleDotPathSegment(). Works on a copy of the iterator and
// skips tabs/newlines without reporting them.
// NOTE(review): the conditions that select each path and the remaining return
// statements are not visible in this excerpt.
962 template<typename CharacterType>
963 ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
968 advance<CharacterType, ReportSyntaxViolation::No>(c);
969 return isSingleDotPathSegment(c);
973 advance<CharacterType, ReportSyntaxViolation::No>(c);
974 if (c.atEnd() || *c != dotASCIICode[0])
976 advance<CharacterType, ReportSyntaxViolation::No>(c);
979 if (toASCIILower(*c) == dotASCIICode[1]) {
980 advance<CharacterType, ReportSyntaxViolation::No>(c);
981 return isSingleDotPathSegment(c);
// Consumes a single-dot path segment at `c`.
// Precondition (asserted): isSingleDotPathSegment(c).
// After the dot, a '/' or '\\' is handled by one branch; otherwise the
// character is asserted to be '?' or '#'.
// NOTE(review): the statements that move the iterator and the bodies of the
// branches are not visible in this excerpt.
986 template<typename CharacterType>
987 void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
989 ASSERT(isSingleDotPathSegment(c));
993 if (*c == '/' || *c == '\\')
996 ASSERT(*c == '?' || *c == '#');
// Path for the spelling that uses the dotASCIICode digits.
1001 ASSERT(*c == dotASCIICode[0]);
1003 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1006 if (*c == '/' || *c == '\\')
1009 ASSERT(*c == '?' || *c == '#');
// Consumes a double-dot path segment at `c`: deals with the first dot and then
// hands the rest to consumeSingleDotPathSegment().
// Precondition (asserted): isDoubleDotPathSegment(c).
// NOTE(review): the statements that move the iterator are not visible in this excerpt.
1014 template<typename CharacterType>
1015 void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1017 ASSERT(isDoubleDotPathSegment(c));
1023 ASSERT(*c == dotASCIICode[0]);
1025 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1028 consumeSingleDotPathSegment(c);
// Decides whether popPath() may move m_pathAfterLastSlash back to
// `newPathAfterLastSlash`. The component that would be removed is inspected in
// the ASCII buffer; it is special-cased when it starts immediately after the
// port end and is a Windows drive letter.
// NOTE(review): the return statements are not visible in this excerpt.
1031 bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1033 ASSERT(m_didSeeSyntaxViolation);
1037 ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
// Iterator over the buffer range [newPathAfterLastSlash, m_pathAfterLastSlash).
1038 CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1039 if (newPathAfterLastSlash == m_url.m_portEnd + 1 && isWindowsDriveLetter(componentToPop))
// Removes the last path segment from the ASCII buffer. Only acts on the path
// offsets when m_pathAfterLastSlash is beyond the first position after the
// port end. Asserts that a syntax violation has been recorded.
1044 void URLParser::popPath()
1046 ASSERT(m_didSeeSyntaxViolation);
1047 if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
// Start at the last character before m_pathAfterLastSlash and step over it if
// it is a '/'.
1048 auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
1049 if (m_asciiBuffer[newPathAfterLastSlash] == '/')
1050 newPathAfterLastSlash--;
// Walk back to the previous '/', never going below the port end.
1051 while (newPathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[newPathAfterLastSlash] != '/')
1052 newPathAfterLastSlash--;
// Point just past the position where the walk stopped.
1053 newPathAfterLastSlash++;
1054 if (shouldPopPath(newPathAfterLastSlash))
1055 m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
// Resize the buffer to end at m_pathAfterLastSlash.
1057 m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
// Records that the input deviates from its canonical form. The first call sets
// m_didSeeSyntaxViolation and copies the input consumed so far (everything
// before `iterator`) into the ASCII buffer; from then on output is built in
// that buffer. The statement under the first check is not visible in this
// excerpt; presumably it returns early when a violation was already recorded.
1060 template<typename CharacterType>
1061 void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1063 if (m_didSeeSyntaxViolation)
1065 m_didSeeSyntaxViolation = true;
1067 ASSERT(m_asciiBuffer.isEmpty());
// Number of input code units that lie before the iterator's position.
1068 size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1069 RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
// Capacity for the whole input is reserved before the unchecked appends, and
// codeUnitsToCopy was just checked not to exceed the input length.
1070 m_asciiBuffer.reserveCapacity(m_inputString.length());
1071 for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1072 ASSERT(isASCII(m_inputString[i]));
1073 m_asciiBuffer.uncheckedAppend(m_inputString[i]);
// Called when parsing fails: the result's string is set to the unmodified
// input. Other statements of this function are not visible in this excerpt.
1077 void URLParser::failure()
1080 m_url.m_string = m_inputString;
// Compares the code point at `iterator`, lower-cased, with `codePoint`. On a
// match the iterator is advanced without reporting violations. The return
// statements are not visible in this excerpt.
1083 template<typename CharacterType>
1084 bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1086 if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1088 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
// Matches the letters of "localhost" one by one with checkLocalhostCodePoint()
// (which lower-cases the input) and finally requires the iterator to be at the
// end. The statements taken when a letter does not match are not visible in
// this excerpt.
1092 template<typename CharacterType>
1093 bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1095 if (!checkLocalhostCodePoint(iterator, 'l'))
1097 if (!checkLocalhostCodePoint(iterator, 'o'))
1099 if (!checkLocalhostCodePoint(iterator, 'c'))
1101 if (!checkLocalhostCodePoint(iterator, 'a'))
1103 if (!checkLocalhostCodePoint(iterator, 'l'))
1105 if (!checkLocalhostCodePoint(iterator, 'h'))
1107 if (!checkLocalhostCodePoint(iterator, 'o'))
1109 if (!checkLocalhostCodePoint(iterator, 's'))
1111 if (!checkLocalhostCodePoint(iterator, 't'))
1113 return iterator.atEnd();
// Runs isAtLocalhost() over the whole of `view`, using the iterator type that
// matches the view's character width. The condition that selects the 8-bit
// path is not visible in this excerpt.
1116 bool URLParser::isLocalhost(StringView view)
1119 return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1120 return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
// Returns a view of `length` characters of the output parsed so far, starting
// at `start`. After a syntax violation the output lives in m_asciiBuffer;
// before that it is identical to the input string.
1123 ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1125 if (UNLIKELY(m_didSeeSyntaxViolation)) {
1126 ASSERT(start + length <= m_asciiBuffer.size());
1127 return StringView(m_asciiBuffer.data() + start, length);
1129 ASSERT(start + length <= m_inputString.length());
1130 return StringView(m_inputString).substring(start, length);
// Returns the single character at `position` of the output parsed so far:
// from m_asciiBuffer after a syntax violation, otherwise from the input string.
1133 ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1135 if (UNLIKELY(m_didSeeSyntaxViolation))
1136 return m_asciiBuffer[position];
1137 return m_inputString[position];
// Returns the current length of the output: the size of m_asciiBuffer after a
// syntax violation, otherwise the number of input code units that lie before
// `iterator`.
1140 template<typename CharacterType>
1141 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1143 if (UNLIKELY(m_didSeeSyntaxViolation))
1144 return m_asciiBuffer.size();
1146 return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
// Parses `input` against `base` using `encoding`; the result is kept in m_url.
// A null input is handled separately when `base` is valid and can serve as a
// base URL. Otherwise parse() is invoked on the 8-bit or 16-bit characters of
// the input. In builds with assertions enabled the result is cross-checked.
// NOTE(review): several statements of the null-input path are not visible in
// this excerpt.
1149 URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding)
1150 : m_inputString(input)
1152 if (input.isNull()) {
1153 if (base.isValid() && !base.m_cannotBeABaseURL) {
1155 m_url.removeFragmentIdentifier();
// Remember where the input starts; later positions are computed relative to it.
1160 if (input.is8Bit()) {
1161 m_inputBegin = input.characters8();
1162 parse(input.characters8(), input.length(), base, encoding);
1164 m_inputBegin = input.characters16();
1165 parse(input.characters16(), input.length(), base, encoding);
// For a valid result, a syntax violation must have been seen exactly when the
// output differs from the input, except when the input consists solely of C0
// controls/spaces and the result equals base's string up to its query end.
1168 ASSERT(!m_url.m_isValid
1169 || m_didSeeSyntaxViolation == (m_url.string() != input)
1170 || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1171 && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1172 ASSERT(internalValuesConsistent(m_url));
1173 #if !ASSERT_DISABLED
1174 if (!m_didSeeSyntaxViolation) {
1175 // Force a syntax violation at the beginning to make sure we get the same result.
// A leading space is prepended to the input for the second parse.
1176 URLParser parser(makeString(" ", input), base, encoding);
1177 URL parsed = parser.result();
1178 if (parsed.isValid())
1179 ASSERT(allValuesEqual(parser.result(), m_url));
1184 template<typename CharacterType>
1185 void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
1187 URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
1189 ASSERT(m_asciiBuffer.isEmpty());
1191 bool isUTF8Encoding = encoding == UTF8Encoding();
1192 Vector<UChar> queryBuffer;
1194 unsigned endIndex = length;
1195 while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1196 syntaxViolation(CodePointIterator<CharacterType>(input, input));
1199 CodePointIterator<CharacterType> c(input, input + endIndex);
1200 CodePointIterator<CharacterType> authorityOrHostBegin;
1201 CodePointIterator<CharacterType> queryBegin;
1202 while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1206 auto beginAfterControlAndSpace = c;
1208 enum class State : uint8_t {
1212 SpecialRelativeOrAuthority,
1216 SpecialAuthoritySlashes,
1217 SpecialAuthorityIgnoreSlashes,
1225 CannotBeABaseURLPath,
1231 #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1232 #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1234 State state = State::SchemeStart;
1235 while (!c.atEnd()) {
1236 if (UNLIKELY(isTabOrNewline(*c))) {
1243 case State::SchemeStart:
1244 LOG_STATE("SchemeStart");
1245 if (isASCIIAlpha(*c)) {
1246 if (UNLIKELY(isASCIIUpper(*c)))
1248 appendToASCIIBuffer(toASCIILower(*c));
1251 m_asciiBuffer.clear();
1252 state = State::NoScheme;
1253 c = beginAfterControlAndSpace;
1255 state = State::Scheme;
1257 state = State::NoScheme;
1260 LOG_STATE("Scheme");
1261 if (isValidSchemeCharacter(*c)) {
1262 if (UNLIKELY(isASCIIUpper(*c)))
1264 appendToASCIIBuffer(toASCIILower(*c));
1265 } else if (*c == ':') {
1266 m_url.m_schemeEnd = currentPosition(c);
1267 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1268 appendToASCIIBuffer(':');
1269 switch (scheme(urlScheme)) {
1271 m_urlIsSpecial = true;
1273 state = State::File;
1278 isUTF8Encoding = true;
1279 m_urlIsSpecial = true;
1280 if (base.protocolIs(urlScheme))
1281 state = State::SpecialRelativeOrAuthority;
1283 state = State::SpecialAuthoritySlashes;
1288 m_url.m_protocolIsInHTTPFamily = true;
1291 case Scheme::Gopher:
1292 m_urlIsSpecial = true;
1293 if (base.protocolIs(urlScheme))
1294 state = State::SpecialRelativeOrAuthority;
1296 state = State::SpecialAuthoritySlashes;
1299 case Scheme::NonSpecial:
1300 isUTF8Encoding = true;
1301 auto maybeSlash = c;
1302 advance(maybeSlash);
1303 if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1304 appendToASCIIBuffer('/');
1306 state = State::PathOrAuthority;
1309 m_url.m_userStart = currentPosition(c);
1312 m_url.m_userStart = currentPosition(c);
1313 m_url.m_userEnd = m_url.m_userStart;
1314 m_url.m_passwordEnd = m_url.m_userStart;
1315 m_url.m_hostEnd = m_url.m_userStart;
1316 m_url.m_portEnd = m_url.m_userStart;
1317 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1318 m_url.m_cannotBeABaseURL = true;
1319 state = State::CannotBeABaseURLPath;
1325 m_asciiBuffer.clear();
1326 state = State::NoScheme;
1327 c = beginAfterControlAndSpace;
1332 m_asciiBuffer.clear();
1333 state = State::NoScheme;
1334 c = beginAfterControlAndSpace;
1337 case State::NoScheme:
1338 LOG_STATE("NoScheme");
1339 if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1343 if (base.m_cannotBeABaseURL && *c == '#') {
1344 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1345 state = State::Fragment;
1346 appendToASCIIBuffer('#');
1350 if (!base.protocolIs("file")) {
1351 state = State::Relative;
1354 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1355 appendToASCIIBuffer(':');
1356 state = State::File;
1358 case State::SpecialRelativeOrAuthority:
1359 LOG_STATE("SpecialRelativeOrAuthority");
1361 appendToASCIIBuffer('/');
1368 appendToASCIIBuffer('/');
1369 state = State::SpecialAuthorityIgnoreSlashes;
1372 state = State::RelativeSlash;
1374 state = State::Relative;
1376 case State::PathOrAuthority:
1377 LOG_STATE("PathOrAuthority");
1379 appendToASCIIBuffer('/');
1380 state = State::AuthorityOrHost;
1382 m_url.m_userStart = currentPosition(c);
1383 authorityOrHostBegin = c;
1385 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1386 m_url.m_userStart = currentPosition(c) - 1;
1387 m_url.m_userEnd = m_url.m_userStart;
1388 m_url.m_passwordEnd = m_url.m_userStart;
1389 m_url.m_hostEnd = m_url.m_userStart;
1390 m_url.m_portEnd = m_url.m_userStart;
1391 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1392 state = State::Path;
1395 case State::Relative:
1396 LOG_STATE("Relative");
1400 state = State::RelativeSlash;
1404 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1405 appendToASCIIBuffer('?');
1408 state = State::UTF8Query;
1411 state = State::NonUTF8Query;
1415 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1416 appendToASCIIBuffer('#');
1417 state = State::Fragment;
1421 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1422 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1423 appendToASCIIBuffer('/');
1424 m_url.m_pathAfterLastSlash = currentPosition(c);
1426 state = State::Path;
1430 case State::RelativeSlash:
1431 LOG_STATE("RelativeSlash");
1432 if (*c == '/' || *c == '\\') {
1434 copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding);
1435 appendToASCIIBuffer("://", 3);
1437 state = State::SpecialAuthorityIgnoreSlashes;
1439 m_url.m_userStart = currentPosition(c);
1440 state = State::AuthorityOrHost;
1441 authorityOrHostBegin = c;
1444 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1445 appendToASCIIBuffer('/');
1446 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1447 state = State::Path;
1450 case State::SpecialAuthoritySlashes:
1451 LOG_STATE("SpecialAuthoritySlashes");
1452 if (LIKELY(*c == '/' || *c == '\\')) {
1453 if (UNLIKELY(*c == '\\'))
1455 appendToASCIIBuffer('/');
1457 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1458 if (UNLIKELY(*c == '\\'))
1461 appendToASCIIBuffer('/');
1464 appendToASCIIBuffer('/');
1468 appendToASCIIBuffer("//", 2);
1470 state = State::SpecialAuthorityIgnoreSlashes;
1472 case State::SpecialAuthorityIgnoreSlashes:
1473 LOG_STATE("SpecialAuthorityIgnoreSlashes");
1474 if (*c == '/' || *c == '\\') {
1478 m_url.m_userStart = currentPosition(c);
1479 state = State::AuthorityOrHost;
1480 authorityOrHostBegin = c;
1483 case State::AuthorityOrHost:
1485 LOG_STATE("AuthorityOrHost");
1488 auto findLastAt = c;
1489 while (!findLastAt.atEnd()) {
1490 URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1491 if (*findLastAt == '@')
1492 lastAt = findLastAt;
1493 bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1494 if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1498 parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1501 authorityOrHostBegin = c;
1502 state = State::Host;
1503 m_hostHasPercentOrNonASCII = false;
1506 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1507 if (isSlash || *c == '?' || *c == '#') {
1508 auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1509 if (iterator.atEnd()) {
1512 m_url.m_userEnd = currentPosition(c);
1513 m_url.m_passwordEnd = m_url.m_userEnd;
1514 m_url.m_hostEnd = m_url.m_userEnd;
1515 m_url.m_portEnd = m_url.m_userEnd;
1516 m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1518 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1519 m_url.m_passwordEnd = m_url.m_userEnd;
1520 if (!parseHostAndPort(iterator)) {
1524 if (UNLIKELY(!isSlash)) {
1525 if (m_urlIsSpecial) {
1527 appendToASCIIBuffer('/');
1529 m_url.m_pathAfterLastSlash = currentPosition(c);
1532 state = State::Path;
1535 if (isPercentOrNonASCII(*c))
1536 m_hostHasPercentOrNonASCII = true;
1538 } while (!c.atEnd());
1543 if (*c == '/' || *c == '?' || *c == '#') {
1544 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1548 if (*c == '?' || *c == '#') {
1550 appendToASCIIBuffer('/');
1551 m_url.m_pathAfterLastSlash = currentPosition(c);
1553 state = State::Path;
1556 if (isPercentOrNonASCII(*c))
1557 m_hostHasPercentOrNonASCII = true;
1559 } while (!c.atEnd());
1568 appendToASCIIBuffer('/');
1569 state = State::FileSlash;
1574 if (base.isValid() && base.protocolIs("file")) {
1575 copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding);
1576 appendToASCIIBuffer('?');
1579 appendToASCIIBuffer("///?", 4);
1581 m_url.m_userStart = currentPosition(c) - 2;
1582 m_url.m_userEnd = m_url.m_userStart;
1583 m_url.m_passwordEnd = m_url.m_userStart;
1584 m_url.m_hostEnd = m_url.m_userStart;
1585 m_url.m_portEnd = m_url.m_userStart;
1586 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1587 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1590 state = State::UTF8Query;
1593 state = State::NonUTF8Query;
1598 if (base.isValid() && base.protocolIs("file")) {
1599 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1600 appendToASCIIBuffer('#');
1602 appendToASCIIBuffer("///#", 4);
1603 m_url.m_userStart = currentPosition(c) - 2;
1604 m_url.m_userEnd = m_url.m_userStart;
1605 m_url.m_passwordEnd = m_url.m_userStart;
1606 m_url.m_hostEnd = m_url.m_userStart;
1607 m_url.m_portEnd = m_url.m_userStart;
1608 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1609 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1610 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1612 state = State::Fragment;
1617 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1618 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding);
1620 appendToASCIIBuffer("///", 3);
1621 m_url.m_userStart = currentPosition(c) - 1;
1622 m_url.m_userEnd = m_url.m_userStart;
1623 m_url.m_passwordEnd = m_url.m_userStart;
1624 m_url.m_hostEnd = m_url.m_userStart;
1625 m_url.m_portEnd = m_url.m_userStart;
1626 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1627 if (isWindowsDriveLetter(c))
1628 appendWindowsDriveLetter(c);
1630 state = State::Path;
1634 case State::FileSlash:
1635 LOG_STATE("FileSlash");
1636 if (LIKELY(*c == '/' || *c == '\\')) {
1637 if (UNLIKELY(*c == '\\'))
1639 appendToASCIIBuffer('/');
1641 m_url.m_userStart = currentPosition(c);
1642 m_url.m_userEnd = m_url.m_userStart;
1643 m_url.m_passwordEnd = m_url.m_userStart;
1644 m_url.m_hostEnd = m_url.m_userStart;
1645 m_url.m_portEnd = m_url.m_userStart;
1646 authorityOrHostBegin = c;
1647 state = State::FileHost;
1651 appendToASCIIBuffer("//", 2);
1652 m_url.m_userStart = currentPosition(c) - 1;
1653 m_url.m_userEnd = m_url.m_userStart;
1654 m_url.m_passwordEnd = m_url.m_userStart;
1655 m_url.m_hostEnd = m_url.m_userStart;
1656 m_url.m_portEnd = m_url.m_userStart;
1657 if (isWindowsDriveLetter(c)) {
1658 appendWindowsDriveLetter(c);
1659 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1660 } else if (copyBaseWindowsDriveLetter(base)) {
1661 appendToASCIIBuffer('/');
1662 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1664 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1665 state = State::Path;
1667 case State::FileHost:
1669 LOG_STATE("FileHost");
1670 if (isSlashQuestionOrHash(*c)) {
1671 bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1672 && isWindowsDriveLetter(authorityOrHostBegin);
1674 syntaxViolation(authorityOrHostBegin);
1675 appendToASCIIBuffer('/');
1676 appendWindowsDriveLetter(authorityOrHostBegin);
1678 if (windowsQuirk || authorityOrHostBegin == c) {
1679 ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1680 if (UNLIKELY(*c == '?')) {
1682 appendToASCIIBuffer("/?", 2);
1685 state = State::UTF8Query;
1688 state = State::NonUTF8Query;
1690 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1691 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1694 if (UNLIKELY(*c == '#')) {
1696 appendToASCIIBuffer("/#", 2);
1698 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1699 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1700 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1701 state = State::Fragment;
1704 state = State::Path;
1707 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1711 if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1713 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1714 m_url.m_hostEnd = currentPosition(c);
1715 m_url.m_portEnd = m_url.m_hostEnd;
1718 state = State::PathStart;
1721 if (isPercentOrNonASCII(*c))
1722 m_hostHasPercentOrNonASCII = true;
1724 } while (!c.atEnd());
1726 case State::PathStart:
1727 LOG_STATE("PathStart");
1728 if (*c != '/' && *c != '\\')
1730 state = State::Path;
1734 if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1735 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1737 appendToASCIIBuffer('/');
1739 m_url.m_pathAfterLastSlash = currentPosition(c);
1742 if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1743 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1745 consumeDoubleDotPathSegment(c);
1749 if (UNLIKELY(isSingleDotPathSegment(c))) {
1751 consumeSingleDotPathSegment(c);
1756 m_url.m_pathEnd = currentPosition(c);
1757 appendToASCIIBuffer('?');
1760 state = State::UTF8Query;
1763 state = State::NonUTF8Query;
1768 m_url.m_pathEnd = currentPosition(c);
1769 m_url.m_queryEnd = m_url.m_pathEnd;
1770 state = State::Fragment;
1773 utf8PercentEncode<isInDefaultEncodeSet>(c);
1776 case State::CannotBeABaseURLPath:
1777 LOG_STATE("CannotBeABaseURLPath");
1779 m_url.m_pathEnd = currentPosition(c);
1780 appendToASCIIBuffer('?');
1783 state = State::UTF8Query;
1786 state = State::NonUTF8Query;
1788 } else if (*c == '#') {
1789 m_url.m_pathEnd = currentPosition(c);
1790 m_url.m_queryEnd = m_url.m_pathEnd;
1791 state = State::Fragment;
1792 } else if (*c == '/') {
1793 appendToASCIIBuffer('/');
1795 m_url.m_pathAfterLastSlash = currentPosition(c);
1797 utf8PercentEncode<isInSimpleEncodeSet>(c);
1801 case State::UTF8Query:
1802 LOG_STATE("UTF8Query");
1803 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1805 m_url.m_queryEnd = currentPosition(c);
1806 state = State::Fragment;
1812 appendCodePoint(queryBuffer, *c);
1815 case State::NonUTF8Query:
1817 LOG_STATE("NonUTF8Query");
1818 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1820 encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
1821 m_url.m_queryEnd = currentPosition(c);
1822 state = State::Fragment;
1825 appendCodePoint(queryBuffer, *c);
1826 advance(c, queryBegin);
1827 } while (!c.atEnd());
1829 case State::Fragment:
1830 URL_PARSER_LOG("State Fragment");
1831 utf8PercentEncode<isInSimpleEncodeSet>(c);
1838 case State::SchemeStart:
1839 LOG_FINAL_STATE("SchemeStart");
1840 if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1842 m_url.removeFragmentIdentifier();
1848 LOG_FINAL_STATE("Scheme");
1851 case State::NoScheme:
1852 LOG_FINAL_STATE("NoScheme");
1853 RELEASE_ASSERT_NOT_REACHED();
1854 case State::SpecialRelativeOrAuthority:
1855 LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1856 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1857 m_url.m_fragmentEnd = m_url.m_queryEnd;
1859 case State::PathOrAuthority:
1860 LOG_FINAL_STATE("PathOrAuthority");
1861 ASSERT(m_url.m_userStart);
1862 ASSERT(m_url.m_userStart == currentPosition(c));
1863 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1864 m_url.m_userStart--;
1865 m_url.m_userEnd = m_url.m_userStart;
1866 m_url.m_passwordEnd = m_url.m_userStart;
1867 m_url.m_hostEnd = m_url.m_userStart;
1868 m_url.m_portEnd = m_url.m_userStart;
1869 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1870 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1871 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1872 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1874 case State::Relative:
1875 LOG_FINAL_STATE("Relative");
1876 RELEASE_ASSERT_NOT_REACHED();
1877 case State::RelativeSlash:
1878 LOG_FINAL_STATE("RelativeSlash");
1879 copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding);
1880 appendToASCIIBuffer('/');
1881 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1882 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1883 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1884 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1886 case State::SpecialAuthoritySlashes:
1887 LOG_FINAL_STATE("SpecialAuthoritySlashes");
1888 m_url.m_userStart = currentPosition(c);
1889 m_url.m_userEnd = m_url.m_userStart;
1890 m_url.m_passwordEnd = m_url.m_userStart;
1891 m_url.m_hostEnd = m_url.m_userStart;
1892 m_url.m_portEnd = m_url.m_userStart;
1893 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1894 m_url.m_pathEnd = m_url.m_userStart;
1895 m_url.m_queryEnd = m_url.m_userStart;
1896 m_url.m_fragmentEnd = m_url.m_userStart;
1898 case State::SpecialAuthorityIgnoreSlashes:
1899 LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1902 case State::AuthorityOrHost:
1903 LOG_FINAL_STATE("AuthorityOrHost");
1904 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1905 m_url.m_passwordEnd = m_url.m_userEnd;
1906 if (authorityOrHostBegin.atEnd()) {
1907 m_url.m_userEnd = m_url.m_userStart;
1908 m_url.m_passwordEnd = m_url.m_userStart;
1909 m_url.m_hostEnd = m_url.m_userStart;
1910 m_url.m_portEnd = m_url.m_userStart;
1911 m_url.m_pathEnd = m_url.m_userStart;
1912 } else if (!parseHostAndPort(authorityOrHostBegin)) {
1916 if (m_urlIsSpecial) {
1918 appendToASCIIBuffer('/');
1919 m_url.m_pathEnd = m_url.m_portEnd + 1;
1921 m_url.m_pathEnd = m_url.m_portEnd;
1923 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1924 m_url.m_queryEnd = m_url.m_pathEnd;
1925 m_url.m_fragmentEnd = m_url.m_pathEnd;
1928 LOG_FINAL_STATE("Host");
1929 if (!parseHostAndPort(authorityOrHostBegin)) {
1933 if (m_urlIsSpecial) {
1935 appendToASCIIBuffer('/');
1936 m_url.m_pathEnd = m_url.m_portEnd + 1;
1938 m_url.m_pathEnd = m_url.m_portEnd;
1939 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1940 m_url.m_queryEnd = m_url.m_pathEnd;
1941 m_url.m_fragmentEnd = m_url.m_pathEnd;
1944 LOG_FINAL_STATE("File");
1945 if (base.isValid() && base.protocolIs("file")) {
1946 copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding);
1947 m_url.m_fragmentEnd = m_url.m_queryEnd;
1951 appendToASCIIBuffer("///", 3);
1952 m_url.m_userStart = currentPosition(c) - 1;
1953 m_url.m_userEnd = m_url.m_userStart;
1954 m_url.m_passwordEnd = m_url.m_userStart;
1955 m_url.m_hostEnd = m_url.m_userStart;
1956 m_url.m_portEnd = m_url.m_userStart;
1957 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1958 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1959 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1960 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1962 case State::FileSlash:
1963 LOG_FINAL_STATE("FileSlash");
1965 m_url.m_userStart = currentPosition(c) + 1;
1966 appendToASCIIBuffer("//", 2);
1967 m_url.m_userEnd = m_url.m_userStart;
1968 m_url.m_passwordEnd = m_url.m_userStart;
1969 m_url.m_hostEnd = m_url.m_userStart;
1970 m_url.m_portEnd = m_url.m_userStart;
1971 if (copyBaseWindowsDriveLetter(base)) {
1972 appendToASCIIBuffer('/');
1973 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1975 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1976 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1977 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1978 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1980 case State::FileHost:
1981 LOG_FINAL_STATE("FileHost");
1982 if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1983 && isWindowsDriveLetter(authorityOrHostBegin)) {
1984 syntaxViolation(authorityOrHostBegin);
1985 appendToASCIIBuffer('/');
1986 appendWindowsDriveLetter(authorityOrHostBegin);
1987 m_url.m_pathAfterLastSlash = currentPosition(c);
1988 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1989 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1990 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1994 if (authorityOrHostBegin == c) {
1996 appendToASCIIBuffer('/');
1997 m_url.m_userStart = currentPosition(c) - 1;
1998 m_url.m_userEnd = m_url.m_userStart;
1999 m_url.m_passwordEnd = m_url.m_userStart;
2000 m_url.m_hostEnd = m_url.m_userStart;
2001 m_url.m_portEnd = m_url.m_userStart;
2002 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
2003 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2004 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2005 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
2009 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
2015 if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2016 m_asciiBuffer.shrink(m_url.m_passwordEnd);
2017 m_url.m_hostEnd = currentPosition(c);
2018 m_url.m_portEnd = m_url.m_hostEnd;
2020 appendToASCIIBuffer('/');
2021 m_url.m_pathAfterLastSlash = m_url.m_portEnd + 1;
2022 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2023 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2024 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
2026 case State::PathStart:
2027 LOG_FINAL_STATE("PathStart");
2028 RELEASE_ASSERT_NOT_REACHED();
2030 LOG_FINAL_STATE("Path");
2031 m_url.m_pathEnd = currentPosition(c);
2032 m_url.m_queryEnd = m_url.m_pathEnd;
2033 m_url.m_fragmentEnd = m_url.m_pathEnd;
2035 case State::CannotBeABaseURLPath:
2036 LOG_FINAL_STATE("CannotBeABaseURLPath");
2037 m_url.m_pathEnd = currentPosition(c);
2038 m_url.m_queryEnd = m_url.m_pathEnd;
2039 m_url.m_fragmentEnd = m_url.m_pathEnd;
2041 case State::UTF8Query:
2042 LOG_FINAL_STATE("UTF8Query");
2043 ASSERT(queryBegin == CodePointIterator<CharacterType>());
2044 m_url.m_queryEnd = currentPosition(c);
2045 m_url.m_fragmentEnd = m_url.m_queryEnd;
2047 case State::NonUTF8Query:
2048 LOG_FINAL_STATE("NonUTF8Query");
2049 ASSERT(queryBegin != CodePointIterator<CharacterType>());
2050 encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
2051 m_url.m_queryEnd = currentPosition(c);
2052 m_url.m_fragmentEnd = m_url.m_queryEnd;
2054 case State::Fragment:
2055 LOG_FINAL_STATE("Fragment");
2056 m_url.m_fragmentEnd = currentPosition(c);
2060 if (LIKELY(!m_didSeeSyntaxViolation)) {
2061 m_url.m_string = m_inputString;
2062 ASSERT(m_asciiBuffer.isEmpty());
2064 m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2065 m_url.m_isValid = true;
2066 URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
// Consumes the userinfo part of an authority (user, optionally followed by ':' and a
// password) from |iterator|, runs every code point through
// utf8PercentEncode<isInUserInfoEncodeSet>, and records m_url.m_userEnd and
// m_url.m_passwordEnd as positions in the output.
// NOTE(review): this view of the function is missing several short lines (braces and,
// presumably, the early returns / break / iterator increments) - confirm the exact
// control flow against the full file.
2069 template<typename CharacterType>
2070 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
// Empty userinfo: reported as a syntax violation; user and password both end at the
// current output position.
2072 if (UNLIKELY(iterator.atEnd())) {
2073 syntaxViolation(iterator);
2074 m_url.m_userEnd = currentPosition(iterator);
2075 m_url.m_passwordEnd = m_url.m_userEnd;
// User name: scan until a ':' is found.
2078 for (; !iterator.atEnd(); advance(iterator)) {
2079 if (*iterator == ':') {
2080 m_url.m_userEnd = currentPosition(iterator);
// Remember where the colon was so violations found after it are reported there.
2081 auto iteratorAtColon = iterator;
2083 bool tabOrNewlineAfterColon = false;
// Tabs/newlines directly after the colon are noted here and reported below.
2084 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2085 tabOrNewlineAfterColon = true;
// Nothing follows the colon: the password is empty, so no ':' is written, and '@' is
// written only when the user name is non-empty.
2088 if (UNLIKELY(iterator.atEnd())) {
2089 syntaxViolation(iteratorAtColon);
2090 m_url.m_passwordEnd = m_url.m_userEnd;
2091 if (m_url.m_userEnd > m_url.m_userStart)
2092 appendToASCIIBuffer('@');
2095 if (tabOrNewlineAfterColon)
2096 syntaxViolation(iteratorAtColon);
2097 appendToASCIIBuffer(':');
2100 utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
// Password: everything that remains after the colon.
2102 for (; !iterator.atEnd(); advance(iterator))
2103 utf8PercentEncode<isInUserInfoEncodeSet>(iterator);
2104 m_url.m_passwordEnd = currentPosition(iterator);
// m_userEnd still being zero means no ':' was seen, so the user name spans the whole userinfo.
2105 if (!m_url.m_userEnd)
2106 m_url.m_userEnd = m_url.m_passwordEnd;
2107 appendToASCIIBuffer('@');
// Appends decimal digits of |number| to the ASCII buffer. Digits are produced from the
// least significant end and stored backwards from the end of a local buffer, so the
// range [p, end) handed to appendToASCIIBuffer is already in reading order.
// NOTE(review): the declaration of |p| and the loop that divides |number| are not
// visible in this view - confirm against the full file.
2110 template<typename UnsignedIntegerType>
2111 void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
// Three characters per byte of the integer type, plus one spare.
2113 LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2114 LChar* end = std::end(buf);
// Store the current lowest decimal digit just before the digits written so far.
2117 *--p = (number % 10) + '0';
2120 appendToASCIIBuffer(p, end - p);
2123 void URLParser::serializeIPv4(IPv4Address address)
2125 appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2126 appendToASCIIBuffer('.');
2127 appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2128 appendToASCIIBuffer('.');
2129 appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2130 appendToASCIIBuffer('.');
2131 appendNumberToASCIIBuffer<uint8_t>(address);
// Helper used by findLongestZeroSequence; scans the eight 16-bit pieces of |address|.
// NOTE(review): only the signature and the loop header are visible in this view.
// Presumably this returns how many consecutive zero pieces start at index |begin| -
// confirm against the full file.
2134 static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
// |end| walks forward over the pieces and never goes past index 8.
2137 for (; end < 8; end++) {
// Looks for the best run of zero pieces in |address| and returns it through |longest|,
// which stays empty when no run qualifies.
// NOTE(review): the assignment to |longest|, any skipping of |i| past a run, and the
// final return are not visible in this view - confirm against the full file.
2144 static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2146 std::optional<size_t> longest;
2147 size_t longestLength = 0;
2148 for (size_t i = 0; i < 8; i++) {
2149 size_t length = zeroSequenceLength(address, i);
// A run qualifies only if it is longer than one piece; it replaces the current
// candidate only when strictly longer, so an earlier run wins a tie.
2151 if (length > 1 && (!longest || longestLength < length)) {
2153 longestLength = length;
2161 void URLParser::serializeIPv6Piece(uint16_t piece)
2163 bool printed = false;
2164 if (auto nibble0 = piece >> 12) {
2165 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2168 auto nibble1 = piece >> 8 & 0xF;
2169 if (printed || nibble1) {
2170 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2173 auto nibble2 = piece >> 4 & 0xF;
2174 if (printed || nibble2)
2175 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2176 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
// Appends |address| to the ASCII buffer wrapped in '[' and ']'. Pieces are written with
// serializeIPv6Piece; the index returned by findLongestZeroSequence (if any) marks
// where a run of zero pieces is handled specially instead of being written piece by piece.
// NOTE(review): the conditions guarding the ':' and "::" appends, the body of the
// skipping loop, and the braces are not visible in this view - confirm against the full file.
2179 void URLParser::serializeIPv6(URLParser::IPv6Address address)
2181 appendToASCIIBuffer('[');
2182 auto compressPointer = findLongestZeroSequence(address);
2183 for (size_t piece = 0; piece < 8; piece++) {
2184 if (compressPointer && compressPointer.value() == piece) {
// The compression point is expected to sit on a zero piece.
2185 ASSERT(!address[piece]);
2187 appendToASCIIBuffer(':');
2189 appendToASCIIBuffer("::", 2);
// Loop over the zero pieces of the run (presumably advancing |piece| past them).
2190 while (piece < 8 && !address[piece])
2195 serializeIPv6Piece(address[piece]);
2197 appendToASCIIBuffer(':');
2199 appendToASCIIBuffer(']');
// Error type carried by the Expected<> that parseIPv4Piece returns. The enumerators are
// not visible in this view; the code in this file refers to Failure and Overflow.
2202 enum class URLParser::IPv4PieceParsingError {
// Parses one numeric piece of a possible IPv4 host, stopping at '.' or at the end of
// input, and returns its value. A leading '0' selects the octal-or-hex path, which is
// always recorded through |didSeeSyntaxViolation|. Returns IPv4PieceParsingError::Failure
// for a character that is not valid in the current base and
// IPv4PieceParsingError::Overflow when the checked 32-bit accumulator overflows.
// NOTE(review): several short lines (iterator increments, the multiplications by the
// base, two case labels, breaks and braces) are missing from this view - confirm
// against the full file.
2207 template<typename CharacterType>
2208 Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2210 enum class State : uint8_t {
2217 State state = State::UnknownBase;
// Overflow is recorded in |value| rather than trapping, and is checked after each digit.
2218 Checked<uint32_t, RecordOverflow> value = 0;
// A piece may not start with '.', i.e. it may not be empty.
2219 if (!iterator.atEnd() && *iterator == '.')
2220 return makeUnexpected(IPv4PieceParsingError::Failure);
2221 while (!iterator.atEnd()) {
// Tabs and newlines inside the piece count as a syntax violation.
2222 if (isTabOrNewline(*iterator)) {
2223 didSeeSyntaxViolation = true;
// '.' terminates the piece; the iterator is left on the '.' for the caller.
2227 if (*iterator == '.') {
2228 ASSERT(!value.hasOverflowed());
2229 return value.unsafeGet();
// First character: '0' defers the base decision, anything else is treated as decimal.
2232 case State::UnknownBase:
2233 if (UNLIKELY(*iterator == '0')) {
2235 state = State::OctalOrHex;
2238 state = State::Decimal;
// After a leading '0': 'x' or 'X' is checked for here, otherwise the piece is octal.
// Either way it is a syntax violation.
2240 case State::OctalOrHex:
2241 didSeeSyntaxViolation = true;
2242 if (*iterator == 'x' || *iterator == 'X') {
2247 state = State::Octal;
2249 case State::Decimal:
2250 if (!isASCIIDigit(*iterator))
2251 return makeUnexpected(IPv4PieceParsingError::Failure);
2253 value += *iterator - '0';
2254 if (UNLIKELY(value.hasOverflowed()))
2255 return makeUnexpected(IPv4PieceParsingError::Overflow);
// Digits restricted to '0'..'7' (the case label is not visible here; presumably State::Octal).
2259 ASSERT(didSeeSyntaxViolation);
2260 if (*iterator < '0' || *iterator > '7')
2261 return makeUnexpected(IPv4PieceParsingError::Failure);
2263 value += *iterator - '0';
2264 if (UNLIKELY(value.hasOverflowed()))
2265 return makeUnexpected(IPv4PieceParsingError::Overflow);
// Hexadecimal digits (the case label is not visible here; presumably the hex state).
2269 ASSERT(didSeeSyntaxViolation);
2270 if (!isASCIIHexDigit(*iterator))
2271 return makeUnexpected(IPv4PieceParsingError::Failure);
2273 value += toASCIIHexValue(*iterator);
2274 if (UNLIKELY(value.hasOverflowed()))
2275 return makeUnexpected(IPv4PieceParsingError::Overflow);
// End of input reached without a terminating '.'.
2280 ASSERT(!value.hasOverflowed());
2281 return value.unsafeGet();
2284 ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2286 RELEASE_ASSERT(exponent <= 4);
2287 uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2288 return values[exponent];
// Error type carried by the Expected<> that parseIPv4Host returns. The enumerators are
// not visible in this view; the code in this file refers to Failure and NotIPv4.
2291 enum class URLParser::IPv4ParsingError {
// Tries to interpret the host in |iterator| as an IPv4 address made of one to four
// dot-separated numeric pieces (each parsed by parseIPv4Piece).
// Returns IPv4ParsingError::NotIPv4 when the text does not have the shape of an IPv4
// address, and IPv4ParsingError::Failure when it does but a piece is out of range.
// Syntax violations are reported at |iteratorForSyntaxViolationPosition|.
// NOTE(review): short lines (iterator increments, continue/break, braces and the final
// return) are missing from this view - confirm against the full file.
2296 template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2297 Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
// Each entry is either a piece value or the error that piece produced; errors are
// examined only after the overall shape has been checked.
2299 Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2300 bool didSeeSyntaxViolation = false;
// A host starting with '.' is not an IPv4 address.
2301 if (!iterator.atEnd() && *iterator == '.')
2302 return makeUnexpected(IPv4ParsingError::NotIPv4);
2303 while (!iterator.atEnd()) {
2304 if (isTabOrNewline(*iterator)) {
2305 didSeeSyntaxViolation = true;
// More input after four pieces: not IPv4.
2309 if (items.size() >= 4)
2310 return makeUnexpected(IPv4ParsingError::NotIPv4);
2311 items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
// After a piece's '.': end of input is a syntax violation, another '.' is not IPv4.
2312 if (!iterator.atEnd() && *iterator == '.') {
2314 if (iterator.atEnd())
2315 syntaxViolation(iteratorForSyntaxViolationPosition);
2316 else if (*iterator == '.')
2317 return makeUnexpected(IPv4ParsingError::NotIPv4);
2320 if (!iterator.atEnd() || !items.size() || items.size() > 4)
2321 return makeUnexpected(IPv4ParsingError::NotIPv4);
// Any piece with a Failure error means the host is not IPv4 at all; this is checked
// for every piece before Overflow errors are considered.
2322 for (const auto& item : items) {
2323 if (!item.hasValue() && item.error() == IPv4PieceParsingError::Failure)
2324 return makeUnexpected(IPv4ParsingError::NotIPv4);
// All pieces were numeric, so an overflowing piece makes this an invalid IPv4 address.
2326 for (const auto& item : items) {
2327 if (!item.hasValue() && item.error() == IPv4PieceParsingError::Overflow)
2328 return makeUnexpected(IPv4ParsingError::Failure);
// Every piece except the last must fit in one byte.
2330 if (items.size() > 1) {
2331 for (size_t i = 0; i < items.size() - 1; i++) {
2332 if (items[i].value() > 255)
2333 return makeUnexpected(IPv4ParsingError::Failure);
// The last piece must be below 256^(5 - number of pieces).
2336 if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2337 return makeUnexpected(IPv4ParsingError::Failure);
2339 if (didSeeSyntaxViolation)
2340 syntaxViolation(iteratorForSyntaxViolationPosition);
// A piece above 255 or a piece count other than four is accepted but reported as a
// syntax violation.
2341 for (const auto& item : items) {
2342 if (item.value() > 255)
2343 syntaxViolation(iteratorForSyntaxViolationPosition);
2346 if (UNLIKELY(items.size() != 4))
2347 syntaxViolation(iteratorForSyntaxViolationPosition);
// The last piece is taken as-is; each remaining piece at index |counter| is scaled by
// 256^(3 - counter).
2349 IPv4Address ipv4 = items.takeLast().value();
2350 for (size_t counter = 0; counter < items.size(); ++counter)
2351 ipv4 += items[counter].value() * pow256(3 - counter);
// Parses one decimal piece of an IPv4 address embedded in an IPv6 host. Returns
// std::nullopt for empty input, for a non-digit character, and for a non-zero piece
// written with a leading zero. The iterator is advanced without reporting syntax violations.
// NOTE(review): the declaration of |piece|, the digit-count handling, the range check
// before one of the nullopt returns, and the final return are not visible in this
// view - confirm against the full file.
2355 template<typename CharacterType>
2356 std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2358 if (iterator.atEnd())
2359 return std::nullopt;
2361 bool leadingZeros = false;
2362 size_t digitCount = 0;
2363 while (!iterator.atEnd()) {
2364 if (!isASCIIDigit(*iterator))
2365 return std::nullopt;
// A '0' seen while the accumulated value is still zero is a leading zero.
2367 if (!piece && *iterator == '0') {
2369 return std::nullopt;
2370 leadingZeros = true;
// NOTE(review): this repeats the condition tested just above; it looks redundant.
2372 if (!piece && *iterator == '0')
2373 leadingZeros = true;
2374 piece = piece * 10 + *iterator - '0';
2376 return std::nullopt;
2377 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
// The piece ends at end of input or at '.'.
2378 if (iterator.atEnd())
2380 if (*iterator == '.')
// A non-zero value with a leading zero (e.g. "01") is rejected.
2383 if (piece && leadingZeros)
2384 return std::nullopt;
// Parses a four-piece dotted-decimal IPv4 address that forms the tail of an IPv6 host.
// Pieces are accumulated most significant first, eight bits each. Returns std::nullopt
// if any piece fails to parse, if a piece is not followed by '.' where one is
// required, or if input remains after the pieces.
// NOTE(review): the else branch, the condition separating the '.' handling from the
// final-piece handling, and the final return are not visible in this view - confirm
// against the full file.
2388 template<typename CharacterType>
2389 std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2391 IPv4Address address = 0;
2392 for (size_t i = 0; i < 4; ++i) {
2393 if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2394 address = (address << 8) + piece.value();
2396 return std::nullopt;
// A '.' must follow the piece; it is consumed without reporting a syntax violation.
2398 if (iterator.atEnd())
2399 return std::nullopt;
2400 if (*iterator != '.')
2401 return std::nullopt;
2402 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
// Otherwise nothing may follow the piece.
2403 } else if (!iterator.atEnd())
2404 return std::nullopt;
2406 ASSERT(iterator.atEnd());
// Parses the text of an IPv6 host from |c| into eight 16-bit pieces. Supports a single
// compressed run of zero pieces (tracked by |compressPointer|) and a trailing embedded
// IPv4 address. Returns std::nullopt on malformed input. Forms that are accepted but
// not canonical (uppercase hex digits, leading zeros in a piece, an embedded IPv4
// tail, compression at a position other than the one findLongestZeroSequence selects)
// are reported through syntaxViolation(hostBegin).
// NOTE(review): many short lines are missing from this view (the conditions that test
// for ':' and end of input, declarations of |value| and |length|, breaks, continues,
// braces and the final return) - confirm the exact control flow against the full file.
2410 template<typename CharacterType>
2411 std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
// |hostBegin| is the position passed to advance() and syntaxViolation() throughout.
2414 const auto hostBegin = c;
2415 advance(c, hostBegin);
2417 return std::nullopt;
2419 IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
// Index of the next piece to fill.
2420 size_t piecePointer = 0;
// Piece index at which a compressed zero run was seen, if any.
2421 std::optional<size_t> compressPointer;
2424 advance(c, hostBegin);
2426 return std::nullopt;
2428 return std::nullopt;
2429 advance(c, hostBegin);
2431 compressPointer = piecePointer;
2434 while (!c.atEnd()) {
// More than eight pieces is an error.
2435 if (piecePointer == 8)
2436 return std::nullopt;
// A second compression marker is an error.
2438 if (compressPointer)
2439 return std::nullopt;
2440 advance(c, hostBegin);
2442 compressPointer = piecePointer;
// An embedded IPv4 address is attempted only where two pieces can still hold it:
// exactly at piece 6, or before piece 6 when a compression was seen.
2445 if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2446 if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2447 if (compressPointer && piecePointer == 5)
2448 return std::nullopt;
2449 syntaxViolation(hostBegin);
// The 32-bit IPv4 value fills two pieces: high half first, then low half.
2450 address[piecePointer++] = ipv4Address.value() >> 16;
2451 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
// Read up to four hexadecimal digits for one piece.
2458 bool leadingZeros = false;
2459 for (; length < 4; length++) {
2462 if (!isASCIIHexDigit(*c))
2464 if (isASCIIUpper(*c))
2465 syntaxViolation(hostBegin);
2466 if (*c == '0' && !length)
2467 leadingZeros = true;
2468 value = value * 0x10 + toASCIIHexValue(*c);
2469 advance(c, hostBegin);
// A non-zero piece with a leading zero, or a zero piece written with more than one
// digit, is accepted but reported.
2472 if (UNLIKELY((value && leadingZeros) || (!value && length > 1)))
2473 syntaxViolation(hostBegin);
2475 address[piecePointer++] = value;
// After a piece, anything other than ':' (or a ':' after the eighth piece) is an error.
2478 if (piecePointer == 8 || *c != ':')
2479 return std::nullopt;
2480 advance(c, hostBegin);
2484 return std::nullopt;
// With compression, the pieces parsed after the compression point are swapped towards
// the end of the array, starting from its last slots; without it, exactly eight pieces
// are required.
2486 if (compressPointer) {
2487 size_t swaps = piecePointer - compressPointer.value();
2490 std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2491 } else if (piecePointer != 8)
2492 return std::nullopt;
// Compare the compression actually used with the run findLongestZeroSequence selects
// (its index incremented by one before comparing); a mismatch is a syntax violation.
2494 std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2495 if (possibleCompressPointer)
2496 possibleCompressPointer.value()++;
2497 if (UNLIKELY(compressPointer != possibleCompressPointer))
2498 syntaxViolation(hostBegin);
// Decodes percent-encoded triplets in input[0, length) and returns the resulting bytes.
// A triplet is decoded only when two hexadecimal digits follow within the input; other
// bytes are copied through unchanged. Every decoded triplet is reported as a syntax
// violation at |iteratorForSyntaxViolationPosition|.
// NOTE(review): the test that selects the first branch (presumably a check for '%'),
// the index adjustment after a decoded triplet, the else keywords, braces and the
// final return are not visible in this view - confirm against the full file.
2503 template<typename CharacterType>
2504 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2506 Vector<LChar, defaultInlineBufferSize> output;
// Capacity for |length| bytes is reserved up front; the loop body then uses
// uncheckedAppend throughout.
2507 output.reserveInitialCapacity(length);
2509 for (size_t i = 0; i < length; ++i) {
2510 uint8_t byte = input[i];
2512 output.uncheckedAppend(byte);
// "length > 2" keeps the unsigned "length - 2" from wrapping; together the two tests
// guarantee that input[i + 1] and input[i + 2] exist.
2513 else if (length > 2 && i < length - 2) {
2514 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2515 syntaxViolation(iteratorForSyntaxViolationPosition);
2516 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
// Not followed by two hex digits, or too close to the end: keep the byte as-is.
2519 output.uncheckedAppend(byte);
2521 output.uncheckedAppend(byte);
// Decodes percent-encoded triplets in input[0, length) and returns the resulting bytes,
// without reporting syntax violations. A triplet is decoded only when two hexadecimal
// digits follow within the input; other bytes are copied through unchanged.
// NOTE(review): the test that selects the first branch (presumably a check for '%'),
// the index adjustment after a decoded triplet, the else keywords, braces and the
// final return are not visible in this view - confirm against the full file.
2526 Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length)
2528 Vector<LChar, defaultInlineBufferSize> output;
// Capacity for |length| bytes is reserved up front; the loop body then uses
// uncheckedAppend throughout.
2529 output.reserveInitialCapacity(length);
2531 for (size_t i = 0; i < length; ++i) {
2532 uint8_t byte = input[i];
2534 output.uncheckedAppend(byte);
// "length > 2" keeps the unsigned "length - 2" from wrapping; together the two tests
// guarantee that input[i + 1] and input[i + 2] exist.
2535 else if (length > 2 && i < length - 2) {
2536 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2537 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
// Not followed by two hex digits, or too close to the end: keep the byte as-is.
2540 output.uncheckedAppend(byte);
2542 output.uncheckedAppend(byte);
2547 ALWAYS_INLINE static bool containsOnlyASCII(const String& string)
2549 ASSERT(!string.isNull());
2550 if (string.is8Bit())
2551 return charactersAreAllASCII(string.characters8(), string.length());
2552 return charactersAreAllASCII(string.characters16(), string.length());
// Converts |domain| to its ASCII form. An all-ASCII domain is lowercased directly;
// anything else goes through ICU's uidna_nameToASCII. Returns std::nullopt when the
// ICU conversion reports an error. Uppercase input and any difference between the
// input and the converted result are reported as syntax violations at
// |iteratorForSyntaxViolationPosition|.
// NOTE(review): short lines (else keywords, braces and the returns of |ascii|) are
// missing from this view - confirm against the full file.
2555 template<typename CharacterType>
2556 std::optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2558 Vector<LChar, defaultInlineBufferSize> ascii;
// Fast path: pure ASCII needs lowercasing only, done separately for 8-bit and 16-bit storage.
2559 if (containsOnlyASCII(domain)) {
2560 size_t length = domain.length();
2561 if (domain.is8Bit()) {
2562 const LChar* characters = domain.characters8();
2563 ascii.reserveInitialCapacity(length);
2564 for (size_t i = 0; i < length; ++i) {
2565 if (UNLIKELY(isASCIIUpper(characters[i])))
2566 syntaxViolation(iteratorForSyntaxViolationPosition);
2567 ascii.uncheckedAppend(toASCIILower(characters[i]));
2570 const UChar* characters = domain.characters16();
2571 ascii.reserveInitialCapacity(length);
2572 for (size_t i = 0; i < length; ++i) {
2573 if (UNLIKELY(isASCIIUpper(characters[i])))
2574 syntaxViolation(iteratorForSyntaxViolationPosition);
2575 ascii.uncheckedAppend(toASCIILower(characters[i]));
// Slow path: ICU writes into a fixed-size local buffer of defaultInlineBufferSize UChars.
2581 UChar hostnameBuffer[defaultInlineBufferSize];
2582 UErrorCode error = U_ZERO_ERROR;
2583 UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2584 int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error);
// NOTE(review): this assertion runs before |error| is inspected. If ICU returns the
// required length on U_BUFFER_OVERFLOW_ERROR, it could fire in debug builds for
// results longer than the buffer - verify against the ICU documentation.
2585 ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
// Success requires both a successful UErrorCode and no IDNA processing errors.
2587 if (U_SUCCESS(error) && !processingDetails.errors) {
2588 for (int32_t i = 0; i < numCharactersConverted; ++i) {
2589 ASSERT(isASCII(hostnameBuffer[i]));
2590 ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2592 ascii.append(hostnameBuffer, numCharactersConverted);
// Any change made by the conversion means the input was not already canonical.
2593 if (domain != StringView(ascii.data(), ascii.size()))
2594 syntaxViolation(iteratorForSyntaxViolationPosition);
2598 // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2599 return std::nullopt;
// Tests each byte of |asciiDomain| with isForbiddenHostCodePoint().
// NOTE(review): the return statements are not visible in this listing (embedded lines
// 2606-2608 are absent); presumably true on the first match and false otherwise - confirm.
2602 bool URLParser::hasForbiddenHostCodePoint(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain)
2604 for (size_t i = 0; i < asciiDomain.size(); ++i) {
2605 if (isForbiddenHostCodePoint(asciiDomain[i]))
// Parses the port that follows a host. |iterator| must point at the ':' separator on entry
// (asserted) and is advanced as input is consumed. Sets m_url.m_portEnd and reports
// non-canonical spellings through syntaxViolation(), always anchored at the ':'.
// NOTE(review): the embedded line numbers jump in places, so some statements - including the
// declaration of |port| and the return statements - are not visible in this listing.
2611 template<typename CharacterType>
2612 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2614 ASSERT(*iterator == ':');
// Keep a copy positioned on the ':' so every violation can be reported at the same place.
2615 auto colonIterator = iterator;
2616 advance(iterator, colonIterator);
// Nothing follows the ':': the port ends at the colon position and the empty port is reported.
2618 if (UNLIKELY(iterator.atEnd())) {
2619 m_url.m_portEnd = currentPosition(colonIterator);
2620 syntaxViolation(colonIterator);
2623 size_t digitCount = 0;
2624 bool leadingZeros = false;
2625 for (; !iterator.atEnd(); ++iterator) {
// Tabs and newlines inside the port are reported as violations.
2626 if (UNLIKELY(isTabOrNewline(*iterator))) {
2627 syntaxViolation(colonIterator);
2630 if (isASCIIDigit(*iterator)) {
// Remember a '0' seen while |digitCount| is still zero (|digitCount| is presumably
// incremented on a line not shown).
2631 if (*iterator == '0' && !digitCount)
2632 leadingZeros = true;
// Accumulate the decimal value one digit at a time.
2634 port = port * 10 + *iterator - '0';
// Values that do not fit in 16 bits are detected here; the handling is not shown
// (presumably the parse fails).
2635 if (port > std::numeric_limits<uint16_t>::max())
// A non-zero port written with a leading zero is non-canonical.
2641 if (port && leadingZeros)
2642 syntaxViolation(colonIterator);
// So is a zero port spelled with more than one digit.
2644 if (!port && digitCount > 1)
2645 syntaxViolation(colonIterator);
2647 ASSERT(port == static_cast<uint16_t>(port));
// A port equal to the default for the already-parsed scheme is reported as well.
2648 if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2649 syntaxViolation(colonIterator);
// Serialize ':' followed by the decimal port.
// NOTE(review): embedded line 2650 is not shown; presumably an else, in which case a
// default port would not be serialized - confirm.
2651 appendToASCIIBuffer(':');
2652 ASSERT(port <= std::numeric_limits<uint16_t>::max());
2653 appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2656 m_url.m_portEnd = currentPosition(iterator);
// Parses the host and optional port covered by |iterator|, appending the serialized host to
// the output buffer and recording m_url.m_hostEnd and m_url.m_portEnd.
// Four paths are visible below: a bracketed IPv6 literal, the host of a non-special URL,
// a fast path taken when m_hostHasPercentOrNonASCII is false, and a slow path that
// UTF-8 encodes, percent-decodes and converts the domain with domainToASCII().
// NOTE(review): the embedded line numbers jump in many places, so most return and break
// statements (and some braces and conditions) are not visible in this listing.
2660 template<typename CharacterType>
2661 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
// Early checks for empty input and for input that begins with ':'; their outcomes are not shown.
2663 if (iterator.atEnd())
2665 if (*iterator == ':')
// --- Bracketed IPv6 literal ---
2667 if (*iterator == '[') {
// Scan forward to the closing ']' (the increment inside the loop is not shown).
2668 auto ipv6End = iterator;
2669 while (!ipv6End.atEnd() && *ipv6End != ']')
2671 if (ipv6End.atEnd())
// Parse the range up to the ']' and, on success, append the serialized address.
2673 if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2674 serializeIPv6(address.value());
2675 if (!ipv6End.atEnd()) {
// A ':' at |ipv6End| at this point introduces a port.
2677 if (!ipv6End.atEnd() && *ipv6End == ':') {
2678 m_url.m_hostEnd = currentPosition(ipv6End);
2679 return parsePort(ipv6End);
// No port: the port range is recorded as empty, ending where the host ends.
2681 m_url.m_hostEnd = currentPosition(ipv6End);
2682 m_url.m_portEnd = m_url.m_hostEnd;
2685 m_url.m_hostEnd = currentPosition(ipv6End);
// --- Host of a non-special URL: percent-encoded through the simple encode set, no IDNA ---
2691 if (!m_urlIsSpecial) {
2692 for (; !iterator.atEnd(); ++iterator) {
2693 if (UNLIKELY(isTabOrNewline(*iterator))) {
2694 syntaxViolation(iterator);
// ':' is tested for here; presumably it ends the host (the statement is not shown).
2697 if (*iterator == ':')
// Forbidden host code points are checked, with '%' exempted (the outcome is not shown).
2699 if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2701 utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2703 m_url.m_hostEnd = currentPosition(iterator);
2704 if (iterator.atEnd()) {
2705 m_url.m_portEnd = currentPosition(iterator);
2708 return parsePort(iterator);
// --- Fast path: taken when m_hostHasPercentOrNonASCII is false ---
2711 if (LIKELY(!m_hostHasPercentOrNonASCII)) {
// Remember where the host starts, then scan it; the loop tests for tab/newline, ':' and
// forbidden host code points (the statements those tests guard are not shown).
2712 auto hostIterator = iterator;
2713 for (; !iterator.atEnd(); ++iterator) {
2714 if (isTabOrNewline(*iterator))
2716 if (*iterator == ':')
2718 if (isForbiddenHostCodePoint(*iterator))
// Try the scanned range as an IPv4 address first.
2721 auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2723 serializeIPv4(address.value());
2724 m_url.m_hostEnd = currentPosition(iterator);
2725 if (iterator.atEnd()) {
2726 m_url.m_portEnd = currentPosition(iterator);
2729 return parsePort(iterator);
// IPv4ParsingError::Failure is singled out here (the statement it guards is not shown).
2731 if (address.error() == IPv4ParsingError::Failure)
// Otherwise copy the host lower-cased, reporting tabs/newlines and upper-case letters.
2733 for (; hostIterator != iterator; ++hostIterator) {
2734 if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2735 syntaxViolation(hostIterator);
2738 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2739 syntaxViolation(hostIterator);
2740 appendToASCIIBuffer(toASCIILower(*hostIterator));
2742 m_url.m_hostEnd = currentPosition(iterator);
// The loop above stops when |hostIterator| equals |iterator|, so not being at the end
// here means input remains after the host; it is handed to parsePort().
2743 if (!hostIterator.atEnd())
2744 return parsePort(hostIterator);
2745 m_url.m_portEnd = currentPosition(iterator);
// --- Slow path: the remaining case, where m_hostHasPercentOrNonASCII is true ---
// All violations on this path are reported at the start of the host.
2749 const auto hostBegin = iterator;
// Step 1: UTF-8 encode the host code points into |utf8Encoded|.
2751 Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2752 for (; !iterator.atEnd(); ++iterator) {
2753 if (UNLIKELY(isTabOrNewline(*iterator))) {
2754 syntaxViolation(hostBegin);
2757 if (*iterator == ':')
2759 if (UNLIKELY(!isASCII(*iterator)))
2760 syntaxViolation(hostBegin);
// |offset| (declared on a line not shown) receives the number of bytes U8_APPEND wrote.
2762 uint8_t buffer[U8_MAX_LENGTH];
2764 UBool error = false;
2765 U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2766 ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2767 // FIXME: Check error.
2768 utf8Encoded.append(buffer, offset);
// Step 2: percent-decode the bytes and reinterpret the result as UTF-8.
2770 Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2771 String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
// A null string from fromUTF8() is checked for here (the outcome is not shown).
2772 if (domain.isNull())
// Report when the decoded string does not compare equal to the percent-decoded bytes.
2774 if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2775 syntaxViolation(hostBegin);
// Step 3: convert to ASCII and check for forbidden host code points (the outcome is not shown).
2776 auto asciiDomain = domainToASCII(domain, hostBegin);
2777 if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2779 Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2780 const LChar* asciiDomainCharacters = asciiDomainValue.data();
// Step 4: try the ASCII domain as an IPv4 address, as on the fast path.
2782 auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2784 serializeIPv4(address.value());
2785 m_url.m_hostEnd = currentPosition(iterator);
2786 if (iterator.atEnd()) {
2787 m_url.m_portEnd = currentPosition(iterator);
2790 return parsePort(iterator);
2792 if (address.error() == IPv4ParsingError::Failure)
// Otherwise append the ASCII domain itself, then handle an optional port.
2795 appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2796 m_url.m_hostEnd = currentPosition(iterator);
2797 if (!iterator.atEnd())
2798 return parsePort(iterator);
2799 m_url.m_portEnd = currentPosition(iterator);
// Percent-decodes |input|: the view is converted to UTF-8 with StrictConversion, the bytes
// are percent-decoded, and the result is turned back into a String with String::fromUTF8().
// NOTE(review): the condition guarding the std::nullopt return is not visible in this
// listing (embedded line 2806); presumably it detects a failed UTF-8 conversion - confirm.
2803 std::optional<String> URLParser::formURLDecode(StringView input)
2805 auto utf8 = input.utf8(StrictConversion);
2807 return std::nullopt;
// The cast only adapts the pointer type: percentDecode() takes LChar bytes.
2808 auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2809 return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2812 // https://url.spec.whatwg.org/#concept-urlencoded-parser
// Splits |input| on '&' and decodes each piece into a name/value tuple appended to |output|.
// In every piece '+' is replaced by a space (0x20) before formURLDecode() is applied.
// NOTE(review): the checks on the decoded optionals and the final return are not visible in
// this listing (embedded lines 2820, 2825 and 2829 are absent).
2813 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2815 URLEncodedForm output;
2816 for (StringView bytes : input.split('&')) {
2817 auto equalIndex = bytes.find('=');
// No '=' in the piece: the whole piece is the name and the value is the empty string.
2818 if (equalIndex == notFound) {
2819 auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2821 output.append({ name.value(), emptyString() });
// Otherwise the name is the text before the first '=' and the value is everything after it.
2823 auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2824 auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2826 output.append({ name.value(), value.value() });
// Appends a form-urlencoded rendering of |input| to |output|, working on the bytes of its
// UTF-8 encoding (StrictConversion).
// NOTE(review): some condition lines are not visible in this listing (embedded lines 2838,
// 2841, 2842, 2845 and 2848 are absent), so the set of bytes copied verbatim is incomplete here.
2832 static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2834 auto utf8 = input.utf8(StrictConversion);
2835 const char* data = utf8.data();
2836 for (size_t i = 0; i < utf8.length(); ++i) {
2837 const char byte = data[i];
// 0x2B is '+'; the condition selecting this branch is not shown (presumably a space byte).
2839 output.append(0x2B);
// Bytes copied through unchanged: '*' (0x2A), ASCII digits, upper-case and lower-case letters.
2840 else if (byte == 0x2A
2843 || (byte >= 0x30 && byte <= 0x39)
2844 || (byte >= 0x41 && byte <= 0x5A)
2846 || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2847 output.append(byte);
// Any other byte is written through percentEncodeByte().
2849 percentEncodeByte(byte, output);
// Serializes |tuples| into a single String by passing each tuple's key and value through
// serializeURLEncodedForm().
// NOTE(review): the statements at embedded lines 2858 and 2860 are not visible in this
// listing; presumably they append the '&' and '=' separators - confirm.
2853 String URLParser::serialize(const URLEncodedForm& tuples)
2855 Vector<LChar> output;
2856 for (auto& tuple : tuples) {
// Something is emitted only when |output| already has content, i.e. not before the first tuple.
2857 if (!output.isEmpty())
2859 serializeURLEncodedForm(tuple.key, output);
2861 serializeURLEncodedForm(tuple.value, output);
// Hand the buffer over to the String instead of copying it.
2863 return String::adopt(WTFMove(output));
// Accessor for the UIDNA instance used by domainToASCII(). The instance is created by
// uidna_openUTS46() inside a std::call_once block, so it is opened a single time.
// NOTE(review): the end of the lambda and the return statement are not visible in this
// listing (embedded lines 2875-2876 are absent).
2866 const UIDNA& URLParser::internationalDomainNameTranscoder()
2868 static UIDNA* encoder;
2869 static std::once_flag onceFlag;
2870 std::call_once(onceFlag, [] {
2871 UErrorCode error = U_ZERO_ERROR;
// Options requested: BiDi and CONTEXTJ checks, with nontransitional processing for both the
// ToUnicode and ToASCII directions.
2872 encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
// Failure to open the transcoder is treated as fatal, in release builds too.
2873 RELEASE_ASSERT(U_SUCCESS(error));
2874 RELEASE_ASSERT(encoder);
// Field-by-field comparison of two URLs: true only when the serialized string, the validity
// and HTTP-family flags, and every stored component offset are equal.
// NOTE(review): several arguments of the logging call are not visible in this listing
// (the embedded line numbers jump, e.g. 2885 -> 2892).
2879 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2881 // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2882 // but once we get rid of URL::parse its value should be tested.
// Dumps the fields of |a| and then |b|; URL_PARSER_LOG expands to nothing unless
// URL_PARSER_DEBUGGING is enabled.
2883 URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2885 a.m_protocolIsInHTTPFamily,
2892 a.m_pathAfterLastSlash,
2896 a.m_string.utf8().data(),
2898 b.m_protocolIsInHTTPFamily,
2905 b.m_pathAfterLastSlash,
2909 b.m_string.utf8().data());
// The actual comparison; && short-circuits at the first differing field.
2911 return a.m_string == b.m_string
2912 && a.m_isValid == b.m_isValid
2913 && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2914 && a.m_schemeEnd == b.m_schemeEnd
2915 && a.m_userStart == b.m_userStart
2916 && a.m_userEnd == b.m_userEnd
2917 && a.m_passwordEnd == b.m_passwordEnd
2918 && a.m_hostEnd == b.m_hostEnd
2919 && a.m_portEnd == b.m_portEnd
2920 && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2921 && a.m_pathEnd == b.m_pathEnd
2922 && a.m_queryEnd == b.m_queryEnd
2923 && a.m_fragmentEnd == b.m_fragmentEnd;
2926 bool URLParser::internalValuesConsistent(const URL& url)
2928 return url.m_schemeEnd <= url.m_userStart
2929 && url.m_userStart <= url.m_userEnd
2930 && url.m_userEnd <= url.m_passwordEnd
2931 && url.m_passwordEnd <= url.m_hostEnd
2932 && url.m_hostEnd <= url.m_portEnd
2933 && url.m_portEnd <= url.m_pathAfterLastSlash
2934 && url.m_pathAfterLastSlash <= url.m_pathEnd
2935 && url.m_pathEnd <= url.m_queryEnd
2936 && url.m_queryEnd <= url.m_fragmentEnd
2937 && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2938 // FIXME: Why do we even store m_fragmentEnd?
2939 // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2942 } // namespace WebCore