2 * Copyright (C) 2016 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLParser.h"
31 #include <unicode/uidna.h>
32 #include <unicode/utypes.h>
// Iterator over a [m_begin, m_end) range of CharacterType code units that yields
// UChar32 code points through operator*. operator* and operator++ are only declared
// in the class; they are specialized per character type after the class.
36 template<typename CharacterType>
37 class CodePointIterator {
39 CodePointIterator() { }
40 CodePointIterator(const CharacterType* begin, const CharacterType* end)
// Sub-range constructor: spans from begin's current position up to end's current position.
46 CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
47 : CodePointIterator(begin.m_begin, end.m_begin)
49 ASSERT(end.m_begin >= begin.m_begin);
52 UChar32 operator*() const;
53 CodePointIterator& operator++();
// Two iterators compare equal only when both the position and the end pointer match.
55 bool operator==(const CodePointIterator& other) const
57 return m_begin == other.m_begin
58 && m_end == other.m_end;
60 bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
62 CodePointIterator& operator=(const CodePointIterator& other)
64 m_begin = other.m_begin;
// End-of-range test: the position must never pass m_end, and the range is exhausted
// once the position reaches it.
71 ASSERT(m_begin <= m_end);
72 return m_begin >= m_end;
// m_begin is the position that is read and advanced; m_end bounds the range.
76 const CharacterType* m_begin { nullptr };
77 const CharacterType* m_end { nullptr };
// LChar specialization of the dereference operator.
// NOTE(review): presumably returns the code unit at m_begin unchanged — confirm against the body.
81 UChar32 CodePointIterator<LChar>::operator*() const
// LChar specialization of pre-increment.
// NOTE(review): presumably advances m_begin by exactly one code unit — confirm against the body.
88 auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
// UChar specialization of the dereference operator: reads the code point at m_begin
// with ICU's U16_GET, passing the remaining length (m_end - m_begin) as the bound.
96 UChar32 CodePointIterator<UChar>::operator*() const
100 U16_GET(m_begin, 0, 0, m_end - m_begin, c);
// UChar specialization of pre-increment: uses ICU's U16_FWD_1 to step the offset i
// over one code point, bounded by the number of code units left in the range.
105 auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
109 size_t length = m_end - m_begin;
110 U16_FWD_1(m_begin, i, length);
// Appends codePoint to destination as UTF-16: a single UChar when it is in the BMP,
// otherwise a lead/trail surrogate pair.
115 static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
117 if (U_IS_BMP(codePoint)) {
118 destination.append(static_cast<UChar>(codePoint));
// Reserve room for both surrogates up front so the two unchecked appends cannot overflow.
121 destination.reserveCapacity(destination.size() + 2);
122 destination.uncheckedAppend(U16_LEAD(codePoint));
123 destination.uncheckedAppend(U16_TRAIL(codePoint));
// Bit flags describing which character classes a byte belongs to. The flags are
// OR-ed together in the entries of characterClassTable and tested with '&'.
126 enum URLCharacterClass {
131 SlashQuestionOrHash = 0x10,
// Lookup table of URLCharacterClass flags with one entry per byte value (256 entries).
// Each entry ORs together the classes that byte belongs to; the trailing comment on
// every row names the byte the row describes.
134 static const uint8_t characterClassTable[256] = {
135 UserInfo | Default | InvalidDomain | QueryPercent, // 0x0
136 UserInfo | Default | QueryPercent, // 0x1
137 UserInfo | Default | QueryPercent, // 0x2
138 UserInfo | Default | QueryPercent, // 0x3
139 UserInfo | Default | QueryPercent, // 0x4
140 UserInfo | Default | QueryPercent, // 0x5
141 UserInfo | Default | QueryPercent, // 0x6
142 UserInfo | Default | QueryPercent, // 0x7
143 UserInfo | Default | QueryPercent, // 0x8
144 UserInfo | Default | InvalidDomain | QueryPercent, // 0x9
145 UserInfo | Default | InvalidDomain | QueryPercent, // 0xA
146 UserInfo | Default | QueryPercent, // 0xB
147 UserInfo | Default | QueryPercent, // 0xC
148 UserInfo | Default | InvalidDomain | QueryPercent, // 0xD
149 UserInfo | Default | QueryPercent, // 0xE
150 UserInfo | Default | QueryPercent, // 0xF
151 UserInfo | Default | QueryPercent, // 0x10
152 UserInfo | Default | QueryPercent, // 0x11
153 UserInfo | Default | QueryPercent, // 0x12
154 UserInfo | Default | QueryPercent, // 0x13
155 UserInfo | Default | QueryPercent, // 0x14
156 UserInfo | Default | QueryPercent, // 0x15
157 UserInfo | Default | QueryPercent, // 0x16
158 UserInfo | Default | QueryPercent, // 0x17
159 UserInfo | Default | QueryPercent, // 0x18
160 UserInfo | Default | QueryPercent, // 0x19
161 UserInfo | Default | QueryPercent, // 0x1A
162 UserInfo | Default | QueryPercent, // 0x1B
163 UserInfo | Default | QueryPercent, // 0x1C
164 UserInfo | Default | QueryPercent, // 0x1D
165 UserInfo | Default | QueryPercent, // 0x1E
166 UserInfo | Default | QueryPercent, // 0x1F
167 UserInfo | Default | InvalidDomain | QueryPercent, // ' '
169 UserInfo | Default | QueryPercent, // '"'
170 UserInfo | Default | InvalidDomain | QueryPercent | SlashQuestionOrHash, // '#'
172 InvalidDomain, // '%'
182 UserInfo | InvalidDomain | SlashQuestionOrHash, // '/'
193 UserInfo | InvalidDomain, // ':'
195 UserInfo | Default | QueryPercent, // '<'
197 UserInfo | Default | QueryPercent, // '>'
198 UserInfo | Default | InvalidDomain | SlashQuestionOrHash, // '?'
199 UserInfo | InvalidDomain, // '@'
226 UserInfo | InvalidDomain, // '['
227 UserInfo | InvalidDomain | SlashQuestionOrHash, // '\\'
228 UserInfo | InvalidDomain, // ']'
231 UserInfo | Default, // '`'
258 UserInfo | Default, // '{'
260 UserInfo | Default, // '}'
262 QueryPercent, // 0x7F
263 QueryPercent, // 0x80
264 QueryPercent, // 0x81
265 QueryPercent, // 0x82
266 QueryPercent, // 0x83
267 QueryPercent, // 0x84
268 QueryPercent, // 0x85
269 QueryPercent, // 0x86
270 QueryPercent, // 0x87
271 QueryPercent, // 0x88
272 QueryPercent, // 0x89
273 QueryPercent, // 0x8A
274 QueryPercent, // 0x8B
275 QueryPercent, // 0x8C
276 QueryPercent, // 0x8D
277 QueryPercent, // 0x8E
278 QueryPercent, // 0x8F
279 QueryPercent, // 0x90
280 QueryPercent, // 0x91
281 QueryPercent, // 0x92
282 QueryPercent, // 0x93
283 QueryPercent, // 0x94
284 QueryPercent, // 0x95
285 QueryPercent, // 0x96
286 QueryPercent, // 0x97
287 QueryPercent, // 0x98
288 QueryPercent, // 0x99
289 QueryPercent, // 0x9A
290 QueryPercent, // 0x9B
291 QueryPercent, // 0x9C
292 QueryPercent, // 0x9D
293 QueryPercent, // 0x9E
294 QueryPercent, // 0x9F
295 QueryPercent, // 0xA0
296 QueryPercent, // 0xA1
297 QueryPercent, // 0xA2
298 QueryPercent, // 0xA3
299 QueryPercent, // 0xA4
300 QueryPercent, // 0xA5
301 QueryPercent, // 0xA6
302 QueryPercent, // 0xA7
303 QueryPercent, // 0xA8
304 QueryPercent, // 0xA9
305 QueryPercent, // 0xAA
306 QueryPercent, // 0xAB
307 QueryPercent, // 0xAC
308 QueryPercent, // 0xAD
309 QueryPercent, // 0xAE
310 QueryPercent, // 0xAF
311 QueryPercent, // 0xB0
312 QueryPercent, // 0xB1
313 QueryPercent, // 0xB2
314 QueryPercent, // 0xB3
315 QueryPercent, // 0xB4
316 QueryPercent, // 0xB5
317 QueryPercent, // 0xB6
318 QueryPercent, // 0xB7
319 QueryPercent, // 0xB8
320 QueryPercent, // 0xB9
321 QueryPercent, // 0xBA
322 QueryPercent, // 0xBB
323 QueryPercent, // 0xBC
324 QueryPercent, // 0xBD
325 QueryPercent, // 0xBE
326 QueryPercent, // 0xBF
327 QueryPercent, // 0xC0
328 QueryPercent, // 0xC1
329 QueryPercent, // 0xC2
330 QueryPercent, // 0xC3
331 QueryPercent, // 0xC4
332 QueryPercent, // 0xC5
333 QueryPercent, // 0xC6
334 QueryPercent, // 0xC7
335 QueryPercent, // 0xC8
336 QueryPercent, // 0xC9
337 QueryPercent, // 0xCA
338 QueryPercent, // 0xCB
339 QueryPercent, // 0xCC
340 QueryPercent, // 0xCD
341 QueryPercent, // 0xCE
342 QueryPercent, // 0xCF
343 QueryPercent, // 0xD0
344 QueryPercent, // 0xD1
345 QueryPercent, // 0xD2
346 QueryPercent, // 0xD3
347 QueryPercent, // 0xD4
348 QueryPercent, // 0xD5
349 QueryPercent, // 0xD6
350 QueryPercent, // 0xD7
351 QueryPercent, // 0xD8
352 QueryPercent, // 0xD9
353 QueryPercent, // 0xDA
354 QueryPercent, // 0xDB
355 QueryPercent, // 0xDC
356 QueryPercent, // 0xDD
357 QueryPercent, // 0xDE
358 QueryPercent, // 0xDF
359 QueryPercent, // 0xE0
360 QueryPercent, // 0xE1
361 QueryPercent, // 0xE2
362 QueryPercent, // 0xE3
363 QueryPercent, // 0xE4
364 QueryPercent, // 0xE5
365 QueryPercent, // 0xE6
366 QueryPercent, // 0xE7
367 QueryPercent, // 0xE8
368 QueryPercent, // 0xE9
369 QueryPercent, // 0xEA
370 QueryPercent, // 0xEB
371 QueryPercent, // 0xEC
372 QueryPercent, // 0xED
373 QueryPercent, // 0xEE
374 QueryPercent, // 0xEF
375 QueryPercent, // 0xF0
376 QueryPercent, // 0xF1
377 QueryPercent, // 0xF2
378 QueryPercent, // 0xF3
379 QueryPercent, // 0xF4
380 QueryPercent, // 0xF5
381 QueryPercent, // 0xF6
382 QueryPercent, // 0xF7
383 QueryPercent, // 0xF8
384 QueryPercent, // 0xF9
385 QueryPercent, // 0xFA
386 QueryPercent, // 0xFB
387 QueryPercent, // 0xFC
388 QueryPercent, // 0xFD
389 QueryPercent, // 0xFE
390 QueryPercent, // 0xFF
// True when the character lies in the C0 control range, i.e. is at most 0x1F.
template<typename CharacterType>
inline static bool isC0Control(CharacterType character)
{
    return !(character > 0x1F);
}
// True when the character is a C0 control or the space character, i.e. is at most 0x20.
template<typename CharacterType>
inline static bool isC0ControlOrSpace(CharacterType character)
{
    return !(character > 0x20);
}
// True for tab (0x9), line feed (0xA) and carriage return (0xD); 0xB and 0xC are excluded.
template<typename CharacterType>
inline static bool isTabOrNewline(CharacterType character)
{
    if (character < 0x9 || character > 0xD)
        return false;
    return !(character == 0xB || character == 0xC);
}
// Simple encode set membership: C0 controls and everything above 0x7E.
template<typename CharacterType>
inline static bool isInSimpleEncodeSet(CharacterType character)
{
    if (character > 0x7E)
        return true;
    return isC0Control(character);
}
397 template<typename CharacterType> inline static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
398 template<typename CharacterType> inline static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
399 template<typename CharacterType> inline static bool isInvalidDomainCharacter(CharacterType character) { return character <= ']' && characterClassTable[character] & InvalidDomain; }
// True when the character is '%' or is not ASCII.
template<typename CharacterType>
inline static bool isPercentOrNonASCII(CharacterType character)
{
    if (!isASCII(character))
        return true;
    return character == '%';
}
401 template<typename CharacterType> inline static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
402 static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; }
// Advances the iterator to the next code point of interest. When `serialized` is false,
// the loop keeps going while the iterator has not reached the end and rests on a tab or
// newline; when `serialized` is true the loop condition is never satisfied.
404 template<bool serialized, typename CharacterType>
405 void incrementIteratorSkippingTabAndNewLine(CodePointIterator<CharacterType>& iterator)
408 while (!serialized && !iterator.atEnd() && isTabOrNewline(*iterator))
// Tests whether the input at `iterator` begins with a Windows drive letter: an ASCII
// letter followed (after optional tab/newline skipping) by ':' or '|'. The iterator is
// taken by value, so the caller's position is not moved.
412 template<bool serialized, typename CharacterType>
413 inline static bool isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
415 if (iterator.atEnd() || !isASCIIAlpha(*iterator))
417 incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
418 if (iterator.atEnd())
420 return *iterator == ':' || *iterator == '|';
// Buffer overload: tests whether buffer[index] is an ASCII letter and buffer[index + 1]
// is ':' or '|'. The size check guards against reading past the end of the buffer.
423 inline static bool isWindowsDriveLetter(const Vector<LChar>& buffer, size_t index)
425 if (buffer.size() < index + 2)
427 return isASCIIAlpha(buffer[index]) && (buffer[index + 1] == ':' || buffer[index + 1] == '|');
// If the input at `iterator` starts with a Windows drive letter, copies the letter to
// asciiBuffer followed by ':' and advances the iterator past both code points.
430 template<bool serialized, typename CharacterType>
431 inline static void checkWindowsDriveLetter(CodePointIterator<CharacterType>& iterator, Vector<LChar>& asciiBuffer)
433 if (isWindowsDriveLetter<serialized>(iterator)) {
// Reserve both slots up front so the unchecked appends below are safe.
434 asciiBuffer.reserveCapacity(asciiBuffer.size() + 2);
435 asciiBuffer.uncheckedAppend(*iterator);
436 incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
437 ASSERT(!iterator.atEnd());
438 ASSERT(*iterator == ':' || *iterator == '|');
// A ':' is always written, so a '|' separator in the input is normalized to ':'.
439 asciiBuffer.uncheckedAppend(':');
440 incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
// Decides whether the base file URL should be copied when parsing a relative file URL.
// The check looks for a Windows drive letter at `iterator`, steps over two code points,
// and finally tests whether the following code point is '/', '\\', '?' or '#'.
// NOTE(review): the values returned by the early exits are on lines not visible here — confirm.
444 template<bool serialized, typename CharacterType>
445 inline static bool shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
447 if (!isWindowsDriveLetter<serialized>(iterator))
449 if (iterator.atEnd())
451 incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
452 if (iterator.atEnd())
454 incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
455 if (iterator.atEnd())
457 return !isSlashQuestionOrHash(*iterator);
460 inline static void percentEncode(uint8_t byte, Vector<LChar>& buffer)
463 buffer.append(upperNibbleToASCIIHexDigit(byte));
464 buffer.append(lowerNibbleToASCIIHexDigit(byte));
// Appends `codePoint` to `destination`, percent-encoding it when `isInCodeSet` says so.
// isInCodeSet is the encode-set predicate chosen by the caller.
467 template<bool serialized>
468 inline static void utf8PercentEncode(UChar32 codePoint, Vector<LChar>& destination, bool(*isInCodeSet)(UChar32))
// First path: the code point is asserted to be ASCII and outside the encode set, then
// appended verbatim. NOTE(review): presumably the `serialized` fast path — confirm.
471 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
472 ASSERT_WITH_SECURITY_IMPLICATION(!isInCodeSet(codePoint));
473 destination.append(codePoint);
// Otherwise: code points in the encode set are converted to UTF-8 with U8_APPEND and
// every resulting byte is percent-encoded.
475 if (isInCodeSet(codePoint)) {
476 uint8_t buffer[U8_MAX_LENGTH];
479 U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
480 // FIXME: Check error.
481 for (int32_t i = 0; i < offset; ++i)
482 percentEncode(buffer[i], destination);
// Code points outside the encode set must be ASCII and are appended unchanged.
484 ASSERT_WITH_MESSAGE(isASCII(codePoint), "isInCodeSet should always return true for non-ASCII characters");
485 destination.append(codePoint);
// Appends `codePoint` to `destination` using the query percent-encoding rules: the
// decision to encode is made per UTF-8 byte via shouldPercentEncodeQueryByte.
490 template<bool serialized>
491 inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& destination)
// First path: the code point is asserted to be ASCII and not in need of encoding, then
// appended verbatim. NOTE(review): presumably the `serialized` fast path — confirm.
494 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
495 ASSERT_WITH_SECURITY_IMPLICATION(!shouldPercentEncodeQueryByte(codePoint));
496 destination.append(codePoint);
// Otherwise: convert to UTF-8, then percent-encode or copy each byte individually.
498 uint8_t buffer[U8_MAX_LENGTH];
501 U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
502 ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
503 // FIXME: Check error.
504 for (int32_t i = 0; i < offset; ++i) {
505 auto byte = buffer[i];
506 if (shouldPercentEncodeQueryByte(byte))
507 percentEncode(byte, destination);
509 destination.append(byte);
// Encodes the UTF-16 query in `source` with `encoding`, then appends the resulting bytes
// to `destination`, percent-encoding each byte that shouldPercentEncodeQueryByte flags.
// Unencodable characters are handled by the encoder via URLEncodedEntitiesForUnencodables.
514 inline static void encodeQuery(const Vector<UChar>& source, Vector<LChar>& destination, const TextEncoding& encoding)
516 // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
517 CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
518 const char* data = encoded.data();
519 size_t length = encoded.length();
520 for (size_t i = 0; i < length; ++i) {
// Convert to uint8_t so the byte indexes the class table as an unsigned value.
521 uint8_t byte = data[i];
522 if (shouldPercentEncodeQueryByte(byte))
523 percentEncode(byte, destination);
525 destination.append(byte);
// Reports whether `port` is the default port of `scheme`, for the schemes whose
// defaults are listed below (ftp, gopher, http, https, ws, wss).
// NOTE(review): the visible comparisons inspect individual scheme characters; the
// dispatch on scheme length / first character is presumably on lines not shown — confirm.
529 inline static bool isDefaultPort(StringView scheme, uint16_t port)
531 static const uint16_t ftpPort = 21;
532 static const uint16_t gopherPort = 70;
533 static const uint16_t httpPort = 80;
534 static const uint16_t httpsPort = 443;
535 static const uint16_t wsPort = 80;
536 static const uint16_t wssPort = 443;
538 auto length = scheme.length();
545 return scheme[1] == 's'
548 return scheme[1] == 's'
557 return scheme[1] == 't'
562 return scheme[1] == 't'
566 && port == httpsPort;
577 && port == gopherPort;
// Reports whether `scheme` is one of the URL "special" schemes. The result is used to
// set m_urlIsSpecial in the parser.
// NOTE(review): the visible comparisons inspect individual scheme characters; the
// dispatch on scheme length / first character is presumably on lines not shown — confirm.
589 inline static bool isSpecialScheme(StringView scheme)
591 auto length = scheme.length();
598 return scheme[1] == 't'
601 return scheme[1] == 'i'
617 return scheme[1] == 't'
621 return scheme[1] == 't'
631 return scheme[1] == 's';
633 return scheme[1] == 's'
// Names the boundaries inside a parsed URL string (SchemeEnd, UserStart, ... FragmentEnd).
// Used by urlLengthUntilPart and copyURLPartsUntil to select how much of a base URL to reuse.
643 enum class URLParser::URLPart {
// Returns the offset stored in `url` for the boundary named by `part`, i.e. the length
// of the URL string up to that part.
656 size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
659 case URLPart::FragmentEnd:
660 return url.m_fragmentEnd;
661 case URLPart::QueryEnd:
662 return url.m_queryEnd;
663 case URLPart::PathEnd:
664 return url.m_pathEnd;
665 case URLPart::PathAfterLastSlash:
666 return url.m_pathAfterLastSlash;
667 case URLPart::PortEnd:
668 return url.m_portEnd;
669 case URLPart::HostEnd:
670 return url.m_hostEnd;
671 case URLPart::PasswordEnd:
672 return url.m_passwordEnd;
673 case URLPart::UserEnd:
674 return url.m_userEnd;
675 case URLPart::UserStart:
676 return url.m_userStart;
677 case URLPart::SchemeEnd:
678 return url.m_schemeEnd;
// Every URLPart value is handled above, so control should never get here.
680 ASSERT_NOT_REACHED();
// Copies a prefix of `string` into the (empty) `destination`. The prefix length depends
// on the string's storage: `lengthIf8Bit` characters for 8-bit strings, `lengthIf16Bit`
// for 16-bit strings. Both lengths are release-asserted to be within the string.
684 inline static void copyASCIIStringUntil(Vector<LChar>& destination, const String& string, size_t lengthIf8Bit, size_t lengthIf16Bit)
686 ASSERT(destination.isEmpty());
687 if (string.is8Bit()) {
688 RELEASE_ASSERT(lengthIf8Bit <= string.length());
689 destination.append(string.characters8(), lengthIf8Bit);
691 RELEASE_ASSERT(lengthIf16Bit <= string.length());
// Capacity is reserved once so the per-character unchecked appends are safe.
692 destination.reserveCapacity(lengthIf16Bit);
693 const UChar* characters = string.characters16();
694 for (size_t i = 0; i < lengthIf16Bit; ++i) {
695 UChar c = characters[i];
// Each 16-bit character is narrowed to LChar, so it is asserted to be ASCII first.
696 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
697 destination.uncheckedAppend(c);
// Seeds the parser state from `base`: copies the base URL string up to `part` into
// m_asciiBuffer, copies the matching offsets into m_url, and recomputes m_urlIsSpecial.
702 void URLParser::copyURLPartsUntil(const URL& base, URLPart part)
704 m_asciiBuffer.clear();
705 m_unicodeFragmentBuffer.clear();
706 if (part == URLPart::FragmentEnd) {
// 8-bit strings are copied through the fragment end; 16-bit strings only up to the query
// end, because their fragment is handled character by character below.
707 copyASCIIStringUntil(m_asciiBuffer, base.m_string, urlLengthUntilPart(base, URLPart::FragmentEnd), urlLengthUntilPart(base, URLPart::QueryEnd));
708 if (!base.m_string.is8Bit()) {
709 const String& fragment = base.m_string;
710 bool seenUnicode = false;
// Walk the fragment; characters go either to m_unicodeFragmentBuffer or to m_asciiBuffer.
// NOTE(review): presumably everything from the first non-ASCII character onwards goes to
// the unicode buffer — confirm against the full loop body.
711 for (size_t i = base.m_queryEnd; i < base.m_fragmentEnd; ++i) {
712 if (!seenUnicode && !isASCII(fragment[i]))
715 m_unicodeFragmentBuffer.uncheckedAppend(fragment[i]);
717 m_asciiBuffer.uncheckedAppend(fragment[i]);
// Any other part: the same prefix length applies to 8-bit and 16-bit strings.
721 size_t length = urlLengthUntilPart(base, part);
722 copyASCIIStringUntil(m_asciiBuffer, base.m_string, length, length);
// Copy offsets from `base`. The cases are ordered from the last URL part to the first.
// NOTE(review): presumably each case falls through to the next so that all earlier
// offsets are copied as well — confirm.
725 case URLPart::FragmentEnd:
726 m_url.m_fragmentEnd = base.m_fragmentEnd;
728 case URLPart::QueryEnd:
729 m_url.m_queryEnd = base.m_queryEnd;
731 case URLPart::PathEnd:
732 m_url.m_pathEnd = base.m_pathEnd;
734 case URLPart::PathAfterLastSlash:
735 m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
737 case URLPart::PortEnd:
738 m_url.m_portEnd = base.m_portEnd;
740 case URLPart::HostEnd:
741 m_url.m_hostEnd = base.m_hostEnd;
743 case URLPart::PasswordEnd:
744 m_url.m_passwordEnd = base.m_passwordEnd;
746 case URLPart::UserEnd:
747 m_url.m_userEnd = base.m_userEnd;
749 case URLPart::UserStart:
750 m_url.m_userStart = base.m_userStart;
752 case URLPart::SchemeEnd:
753 m_url.m_isValid = base.m_isValid;
754 m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
755 m_url.m_schemeEnd = base.m_schemeEnd;
// The scheme occupies the first m_schemeEnd characters of the freshly copied buffer.
757 m_urlIsSpecial = isSpecialScheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd));
// The two hex digits of '.' (0x2E), with the letter in lower case. The path helpers compare
// input against dotASCIICode[0] directly and against dotASCIICode[1] after toASCIILower.
760 static const char* dotASCIICode = "2e";
// Tests whether the input at `c` is a percent-encoded dot: the visible checks match the
// hex digits '2' and then 'e' or 'E'. The iterator is taken by value.
// NOTE(review): the check for the leading '%' is presumably on lines not shown — confirm.
762 template<typename CharacterType>
763 inline static bool isPercentEncodedDot(CodePointIterator<CharacterType> c)
772 if (*c != dotASCIICode[0])
777 return toASCIILower(*c) == dotASCIICode[1];
// Tests whether the input at `c` is a single-dot path segment: a dot (two forms are
// handled, the second being the hex-digit form matched via dotASCIICode) that is followed
// by the end of input or by '/', '\\', '?' or '#'. The iterator is taken by value.
780 template<typename CharacterType>
781 inline static bool isSingleDotPathSegment(CodePointIterator<CharacterType> c)
787 return c.atEnd() || isSlashQuestionOrHash(*c);
792 if (c.atEnd() || *c != dotASCIICode[0])
797 if (toASCIILower(*c) == dotASCIICode[1]) {
799 return c.atEnd() || isSlashQuestionOrHash(*c);
// Tests whether the input at `c` is a double-dot path segment: after matching a first
// dot (two forms are handled, the second via dotASCIICode), the remainder must be a
// single-dot path segment. The iterator is taken by value.
804 template<typename CharacterType>
805 inline static bool isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
811 return isSingleDotPathSegment(c);
816 if (c.atEnd() || *c != dotASCIICode[0])
821 if (toASCIILower(*c) == dotASCIICode[1]) {
823 return isSingleDotPathSegment(c);
// Advances `c` (by reference) over a single-dot path segment. The caller must have
// verified the segment with isSingleDotPathSegment, as the opening assertion requires.
// After the dot, '/' and '\\' are handled in one branch; otherwise the next code point
// is asserted to be '?' or '#'.
828 template<typename CharacterType>
829 inline static void consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
831 ASSERT(isSingleDotPathSegment(c));
835 if (*c == '/' || *c == '\\')
838 ASSERT(*c == '?' || *c == '#');
// Second form of the dot: the hex digits from dotASCIICode are asserted one by one.
843 ASSERT(*c == dotASCIICode[0]);
845 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
848 if (*c == '/' || *c == '\\')
851 ASSERT(*c == '?' || *c == '#');
// Advances `c` (by reference) over a double-dot path segment. The caller must have
// verified the segment with isDoubleDotPathSegment, as the opening assertion requires.
// Once the first dot is consumed, the rest is delegated to consumeSingleDotPathSegment.
856 template<typename CharacterType>
857 inline static void consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
859 ASSERT(isDoubleDotPathSegment(c));
865 ASSERT(*c == dotASCIICode[0]);
867 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
870 consumeSingleDotPathSegment(c);
// Removes the last path segment from m_asciiBuffer by moving m_pathAfterLastSlash back to
// just after the previous '/' and truncating the buffer there.
873 void URLParser::popPath()
// Only pop when the path holds more than its leading slash (which sits at m_portEnd).
875 if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) {
876 m_url.m_pathAfterLastSlash--;
// Step over the slash that terminates the segment being removed.
877 if (m_asciiBuffer[m_url.m_pathAfterLastSlash] == '/')
878 m_url.m_pathAfterLastSlash--;
// Scan backwards to the previous slash, never going before the start of the path.
879 while (m_url.m_pathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[m_url.m_pathAfterLastSlash] != '/')
880 m_url.m_pathAfterLastSlash--;
// Move from the slash itself to the position just after it.
881 m_url.m_pathAfterLastSlash++;
883 m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
// Builds the URL reported for unparsable input: flagged invalid, with the visible part
// offsets reset to 0 and m_string holding the original input unchanged.
886 template<typename CharacterType>
887 URL URLParser::failure(const CharacterType* input, unsigned length)
890 url.m_isValid = false;
891 url.m_protocolIsInHTTPFamily = false;
892 url.m_cannotBeABaseURL = false;
896 url.m_passwordEnd = 0;
899 url.m_pathAfterLastSlash = 0;
902 url.m_fragmentEnd = 0;
903 url.m_string = String(input, length);
// Entry point for parsing `input` against `base` with the given query `encoding`.
// Runs the templated parser with serialized == false, passing either the 8-bit or the
// 16-bit character buffer of `input`.
// NOTE(review): the selection between the two calls is presumably on input.is8Bit() — confirm.
907 URL URLParser::parse(const String& input, const URL& base, const TextEncoding& encoding)
909 const bool serialized = false;
911 return parse<serialized>(input.characters8(), input.length(), base, encoding);
912 return parse<serialized>(input.characters16(), input.length(), base, encoding);
// Entry point for re-parsing an already serialized URL string. Runs the templated parser
// with serialized == true, an empty base URL and UTF-8 as the encoding, passing either
// the 8-bit or the 16-bit character buffer of `input`.
// NOTE(review): the selection between the two calls is presumably on input.is8Bit() — confirm.
915 URL URLParser::parseSerializedURL(const String& input)
917 const bool serialized = true;
919 return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
920 return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
923 template<bool serialized, typename CharacterType>
924 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
926 LOG(URLParser, "Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
928 m_asciiBuffer.clear();
929 m_unicodeFragmentBuffer.clear();
930 m_asciiBuffer.reserveCapacity(length);
932 bool isUTF8Encoding = encoding == UTF8Encoding();
933 Vector<UChar> queryBuffer;
935 unsigned endIndex = length;
936 while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
938 CodePointIterator<CharacterType> c(input, input + endIndex);
939 CodePointIterator<CharacterType> authorityOrHostBegin;
940 while (!c.atEnd() && isC0ControlOrSpace(*c))
942 auto beginAfterControlAndSpace = c;
944 enum class State : uint8_t {
948 SpecialRelativeOrAuthority,
952 SpecialAuthoritySlashes,
953 SpecialAuthorityIgnoreSlashes,
961 CannotBeABaseURLPath,
966 #define LOG_STATE(x) LOG(URLParser, "State %s, code point %c, asciiBuffer size %zu", x, *c, m_asciiBuffer.size())
967 #define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
969 State state = State::SchemeStart;
971 if (!serialized && isTabOrNewline(*c)) {
977 case State::SchemeStart:
978 LOG_STATE("SchemeStart");
979 if (isASCIIAlpha(*c)) {
980 m_asciiBuffer.uncheckedAppend(toASCIILower(*c));
982 state = State::Scheme;
984 state = State::NoScheme;
988 if (isASCIIAlphanumeric(*c) || *c == '+' || *c == '-' || *c == '.')
989 m_asciiBuffer.append(toASCIILower(*c));
990 else if (*c == ':') {
991 m_url.m_schemeEnd = m_asciiBuffer.size();
992 StringView urlScheme = StringView(m_asciiBuffer.data(), m_url.m_schemeEnd);
993 m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
994 if (urlScheme == "file") {
995 m_urlIsSpecial = true;
997 m_asciiBuffer.append(':');
1001 m_asciiBuffer.append(':');
1002 if (isSpecialScheme(urlScheme)) {
1003 m_urlIsSpecial = true;
1004 if (base.protocolIs(m_asciiBuffer.data(), m_asciiBuffer.size() - 1))
1005 state = State::SpecialRelativeOrAuthority;
1007 state = State::SpecialAuthoritySlashes;
1009 auto maybeSlash = c;
1010 incrementIteratorSkippingTabAndNewLine<serialized>(maybeSlash);
1011 if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1012 m_asciiBuffer.append('/');
1013 m_url.m_userStart = m_asciiBuffer.size();
1014 state = State::PathOrAuthority;
1018 m_url.m_userStart = m_asciiBuffer.size();
1019 m_url.m_userEnd = m_url.m_userStart;
1020 m_url.m_passwordEnd = m_url.m_userStart;
1021 m_url.m_hostEnd = m_url.m_userStart;
1022 m_url.m_portEnd = m_url.m_userStart;
1023 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1024 m_url.m_cannotBeABaseURL = true;
1025 state = State::CannotBeABaseURLPath;
1031 m_asciiBuffer.clear();
1032 state = State::NoScheme;
1033 c = beginAfterControlAndSpace;
1036 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1038 m_asciiBuffer.clear();
1039 state = State::NoScheme;
1040 c = beginAfterControlAndSpace;
1043 case State::NoScheme:
1044 LOG_STATE("NoScheme");
1045 if (base.isNull() || (base.m_cannotBeABaseURL && *c != '#'))
1046 return failure(input, length);
1047 if (base.m_cannotBeABaseURL && *c == '#') {
1048 copyURLPartsUntil(base, URLPart::QueryEnd);
1049 state = State::Fragment;
1050 m_asciiBuffer.append('#');
1054 if (!base.protocolIs("file")) {
1055 state = State::Relative;
1058 copyURLPartsUntil(base, URLPart::SchemeEnd);
1059 m_asciiBuffer.append(':');
1060 state = State::File;
1062 case State::SpecialRelativeOrAuthority:
1063 LOG_STATE("SpecialRelativeOrAuthority");
1065 m_asciiBuffer.append('/');
1066 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1068 return failure(input, length);
1070 m_asciiBuffer.append('/');
1071 state = State::SpecialAuthorityIgnoreSlashes;
1074 state = State::RelativeSlash;
1076 state = State::Relative;
1078 case State::PathOrAuthority:
1079 LOG_STATE("PathOrAuthority");
1081 m_asciiBuffer.append('/');
1082 m_url.m_userStart = m_asciiBuffer.size();
1083 state = State::AuthorityOrHost;
1085 authorityOrHostBegin = c;
1087 ASSERT(m_asciiBuffer.last() == '/');
1088 m_url.m_userStart = m_asciiBuffer.size() - 1;
1089 m_url.m_userEnd = m_url.m_userStart;
1090 m_url.m_passwordEnd = m_url.m_userStart;
1091 m_url.m_hostEnd = m_url.m_userStart;
1092 m_url.m_portEnd = m_url.m_userStart;
1093 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1094 state = State::Path;
1097 case State::Relative:
1098 LOG_STATE("Relative");
1102 state = State::RelativeSlash;
1106 copyURLPartsUntil(base, URLPart::PathEnd);
1107 m_asciiBuffer.append('?');
1108 state = State::Query;
1112 copyURLPartsUntil(base, URLPart::QueryEnd);
1113 m_asciiBuffer.append('#');
1114 state = State::Fragment;
1118 copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1119 state = State::Path;
1123 case State::RelativeSlash:
1124 LOG_STATE("RelativeSlash");
1125 if (*c == '/' || *c == '\\') {
1127 copyURLPartsUntil(base, URLPart::SchemeEnd);
1128 m_asciiBuffer.append("://", 3);
1129 state = State::SpecialAuthorityIgnoreSlashes;
1131 copyURLPartsUntil(base, URLPart::PortEnd);
1132 m_asciiBuffer.append('/');
1133 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1134 state = State::Path;
1137 case State::SpecialAuthoritySlashes:
1138 LOG_STATE("SpecialAuthoritySlashes");
1139 m_asciiBuffer.append("//", 2);
1140 if (*c == '/' || *c == '\\') {
1141 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1142 if (!c.atEnd() && (*c == '/' || *c == '\\'))
1145 state = State::SpecialAuthorityIgnoreSlashes;
1147 case State::SpecialAuthorityIgnoreSlashes:
1148 LOG_STATE("SpecialAuthorityIgnoreSlashes");
1149 if (*c == '/' || *c == '\\') {
1150 m_asciiBuffer.append('/');
1153 m_url.m_userStart = m_asciiBuffer.size();
1154 state = State::AuthorityOrHost;
1155 authorityOrHostBegin = c;
1157 case State::AuthorityOrHost:
1158 LOG_STATE("AuthorityOrHost");
1162 auto findLastAt = c;
1163 while (!findLastAt.atEnd()) {
1164 if (*findLastAt == '@')
1165 lastAt = findLastAt;
1168 parseAuthority<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1170 incrementIteratorSkippingTabAndNewLine<serialized>(c);
1171 authorityOrHostBegin = c;
1172 state = State::Host;
1173 m_hostHasPercentOrNonASCII = false;
1176 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1177 if (isSlash || *c == '?' || *c == '#') {
1178 m_url.m_userEnd = m_asciiBuffer.size();
1179 m_url.m_passwordEnd = m_url.m_userEnd;
1180 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1181 return failure(input, length);
1183 m_asciiBuffer.append('/');
1184 m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1186 state = State::Path;
1189 if (isPercentOrNonASCII(*c))
1190 m_hostHasPercentOrNonASCII = true;
1196 if (*c == '/' || *c == '?' || *c == '#') {
1197 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1198 return failure(input, length);
1199 state = State::Path;
1202 if (isPercentOrNonASCII(*c))
1203 m_hostHasPercentOrNonASCII = true;
1211 m_asciiBuffer.append('/');
1212 state = State::FileSlash;
1216 if (!base.isNull() && base.protocolIs("file"))
1217 copyURLPartsUntil(base, URLPart::PathEnd);
1218 m_asciiBuffer.append("///?", 4);
1219 m_url.m_userStart = m_asciiBuffer.size() - 2;
1220 m_url.m_userEnd = m_url.m_userStart;
1221 m_url.m_passwordEnd = m_url.m_userStart;
1222 m_url.m_hostEnd = m_url.m_userStart;
1223 m_url.m_portEnd = m_url.m_userStart;
1224 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1225 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1226 state = State::Query;
1230 if (!base.isNull() && base.protocolIs("file"))
1231 copyURLPartsUntil(base, URLPart::QueryEnd);
1232 m_asciiBuffer.append("///#", 4);
1233 m_url.m_userStart = m_asciiBuffer.size() - 2;
1234 m_url.m_userEnd = m_url.m_userStart;
1235 m_url.m_passwordEnd = m_url.m_userStart;
1236 m_url.m_hostEnd = m_url.m_userStart;
1237 m_url.m_portEnd = m_url.m_userStart;
1238 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1239 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1240 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1241 state = State::Fragment;
1245 if (!base.isNull() && base.protocolIs("file") && shouldCopyFileURL<serialized>(c))
1246 copyURLPartsUntil(base, URLPart::PathAfterLastSlash);
1248 m_asciiBuffer.append("///", 3);
1249 m_url.m_userStart = m_asciiBuffer.size() - 1;
1250 m_url.m_userEnd = m_url.m_userStart;
1251 m_url.m_passwordEnd = m_url.m_userStart;
1252 m_url.m_hostEnd = m_url.m_userStart;
1253 m_url.m_portEnd = m_url.m_userStart;
1254 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1255 checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1257 state = State::Path;
1261 case State::FileSlash:
1262 LOG_STATE("FileSlash");
1263 if (*c == '/' || *c == '\\') {
1265 m_asciiBuffer.append('/');
1266 m_url.m_userStart = m_asciiBuffer.size();
1267 m_url.m_userEnd = m_url.m_userStart;
1268 m_url.m_passwordEnd = m_url.m_userStart;
1269 m_url.m_hostEnd = m_url.m_userStart;
1270 m_url.m_portEnd = m_url.m_userStart;
1271 authorityOrHostBegin = c;
1272 state = State::FileHost;
1275 if (!base.isNull() && base.protocolIs("file")) {
1276 // FIXME: This String copy is unnecessary.
1277 String basePath = base.path();
1278 if (basePath.length() >= 2) {
1279 bool windowsQuirk = basePath.is8Bit()
1280 ? isWindowsDriveLetter<serialized>(CodePointIterator<LChar>(basePath.characters8(), basePath.characters8() + basePath.length()))
1281 : isWindowsDriveLetter<serialized>(CodePointIterator<UChar>(basePath.characters16(), basePath.characters16() + basePath.length()));
1283 m_asciiBuffer.append(basePath[0]);
1284 m_asciiBuffer.append(basePath[1]);
1288 m_asciiBuffer.append("//", 2);
1289 m_url.m_userStart = m_asciiBuffer.size() - 1;
1290 m_url.m_userEnd = m_url.m_userStart;
1291 m_url.m_passwordEnd = m_url.m_userStart;
1292 m_url.m_hostEnd = m_url.m_userStart;
1293 m_url.m_portEnd = m_url.m_userStart;
1294 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1295 checkWindowsDriveLetter<serialized>(c, m_asciiBuffer);
1296 state = State::Path;
1298 case State::FileHost:
1299 LOG_STATE("FileHost");
1300 if (isSlashQuestionOrHash(*c)) {
1301 if (isWindowsDriveLetter(m_asciiBuffer, m_url.m_portEnd + 1)) {
1302 state = State::Path;
1305 if (authorityOrHostBegin == c) {
1306 ASSERT(m_asciiBuffer[m_asciiBuffer.size() - 1] == '/');
1308 m_asciiBuffer.append("/?", 2);
1309 m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1310 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1311 state = State::Query;
1316 m_asciiBuffer.append("/#", 2);
1317 m_url.m_pathAfterLastSlash = m_asciiBuffer.size() - 1;
1318 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1319 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1320 state = State::Fragment;
1324 state = State::Path;
1327 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1328 return failure(input, length);
1330 if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost") {
1331 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1332 m_url.m_hostEnd = m_asciiBuffer.size();
1333 m_url.m_portEnd = m_url.m_hostEnd;
1336 state = State::PathStart;
1339 if (isPercentOrNonASCII(*c))
1340 m_hostHasPercentOrNonASCII = true;
1343 case State::PathStart:
1344 LOG_STATE("PathStart");
1345 if (*c != '/' && *c != '\\')
1347 state = State::Path;
1351 if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1352 m_asciiBuffer.append('/');
1353 m_url.m_pathAfterLastSlash = m_asciiBuffer.size();
1357 if (m_asciiBuffer.size() && m_asciiBuffer[m_asciiBuffer.size() - 1] == '/') {
1358 if (isDoubleDotPathSegment(c)) {
1359 consumeDoubleDotPathSegment(c);
1363 if (m_asciiBuffer[m_asciiBuffer.size() - 1] == '/' && isSingleDotPathSegment(c)) {
1364 consumeSingleDotPathSegment(c);
1369 m_url.m_pathEnd = m_asciiBuffer.size();
1370 state = State::Query;
1374 m_url.m_pathEnd = m_asciiBuffer.size();
1375 m_url.m_queryEnd = m_url.m_pathEnd;
1376 state = State::Fragment;
1379 if (isPercentEncodedDot(c)) {
1380 m_asciiBuffer.append('.');
1383 ASSERT(*c == dotASCIICode[0]);
1385 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1389 utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInDefaultEncodeSet);
1392 case State::CannotBeABaseURLPath:
1393 LOG_STATE("CannotBeABaseURLPath");
1395 m_url.m_pathEnd = m_asciiBuffer.size();
1396 state = State::Query;
1397 } else if (*c == '#') {
1398 m_url.m_pathEnd = m_asciiBuffer.size();
1399 m_url.m_queryEnd = m_url.m_pathEnd;
1400 state = State::Fragment;
1402 utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInSimpleEncodeSet);
1409 if (!isUTF8Encoding)
1410 encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1411 m_url.m_queryEnd = m_asciiBuffer.size();
1412 state = State::Fragment;
1416 utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
1418 appendCodePoint(queryBuffer, *c);
1421 case State::Fragment:
1422 LOG_STATE("Fragment");
1423 if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
1424 m_asciiBuffer.append(*c);
1426 appendCodePoint(m_unicodeFragmentBuffer, *c);
1433 case State::SchemeStart:
1434 LOG_FINAL_STATE("SchemeStart");
1435 if (!m_asciiBuffer.size() && !base.isNull())
1437 return failure(input, length);
1439 LOG_FINAL_STATE("Scheme");
1440 return failure(input, length);
1441 case State::NoScheme:
1442 LOG_FINAL_STATE("NoScheme");
1443 RELEASE_ASSERT_NOT_REACHED();
1444 case State::SpecialRelativeOrAuthority:
1445 LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1446 copyURLPartsUntil(base, URLPart::QueryEnd);
1447 m_url.m_fragmentEnd = m_url.m_queryEnd;
1449 case State::PathOrAuthority:
1450 LOG_FINAL_STATE("PathOrAuthority");
1451 m_url.m_userEnd = m_asciiBuffer.size();
1452 m_url.m_passwordEnd = m_url.m_userEnd;
1453 m_url.m_hostEnd = m_url.m_userEnd;
1454 m_url.m_portEnd = m_url.m_userEnd;
1455 m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1456 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1457 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1458 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1460 case State::Relative:
1461 LOG_FINAL_STATE("Relative");
1462 copyURLPartsUntil(base, URLPart::FragmentEnd);
1464 case State::RelativeSlash:
1465 LOG_FINAL_STATE("RelativeSlash");
1466 copyURLPartsUntil(base, URLPart::PortEnd);
1467 m_asciiBuffer.append('/');
1468 m_url.m_pathAfterLastSlash = base.m_portEnd + 1;
1469 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1470 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1471 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1473 case State::SpecialAuthoritySlashes:
1474 LOG_FINAL_STATE("SpecialAuthoritySlashes");
1475 m_url.m_userStart = m_asciiBuffer.size();
1476 m_url.m_userEnd = m_url.m_userStart;
1477 m_url.m_passwordEnd = m_url.m_userStart;
1478 m_url.m_hostEnd = m_url.m_userStart;
1479 m_url.m_portEnd = m_url.m_userStart;
1480 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1481 m_url.m_pathEnd = m_url.m_userStart;
1482 m_url.m_queryEnd = m_url.m_userStart;
1483 m_url.m_fragmentEnd = m_url.m_userStart;
1485 case State::SpecialAuthorityIgnoreSlashes:
1486 LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1487 return failure(input, length);
1489 case State::AuthorityOrHost:
1490 LOG_FINAL_STATE("AuthorityOrHost");
1491 m_url.m_userEnd = m_asciiBuffer.size();
1492 m_url.m_passwordEnd = m_url.m_userEnd;
1493 if (authorityOrHostBegin.atEnd()) {
1494 m_url.m_hostEnd = m_url.m_userEnd;
1495 m_url.m_portEnd = m_url.m_userEnd;
1496 } else if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1497 return failure(input, length);
1498 m_asciiBuffer.append('/');
1499 m_url.m_pathEnd = m_url.m_portEnd + 1;
1500 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1501 m_url.m_queryEnd = m_url.m_pathEnd;
1502 m_url.m_fragmentEnd = m_url.m_pathEnd;
1505 LOG_FINAL_STATE("Host");
1506 if (!parseHostAndPort<serialized>(authorityOrHostBegin))
1507 return failure(input, length);
1508 m_asciiBuffer.append('/');
1509 m_url.m_pathEnd = m_url.m_portEnd + 1;
1510 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1511 m_url.m_queryEnd = m_url.m_pathEnd;
1512 m_url.m_fragmentEnd = m_url.m_pathEnd;
1515 LOG_FINAL_STATE("File");
1516 if (!base.isNull() && base.protocolIs("file")) {
1517 copyURLPartsUntil(base, URLPart::QueryEnd);
1518 m_asciiBuffer.append(':');
1520 m_asciiBuffer.append("///", 3);
1521 m_url.m_userStart = m_asciiBuffer.size() - 1;
1522 m_url.m_userEnd = m_url.m_userStart;
1523 m_url.m_passwordEnd = m_url.m_userStart;
1524 m_url.m_hostEnd = m_url.m_userStart;
1525 m_url.m_portEnd = m_url.m_userStart;
1526 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1527 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1528 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1529 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1531 case State::FileSlash:
1532 LOG_FINAL_STATE("FileSlash");
1533 m_asciiBuffer.append("//", 2);
1534 m_url.m_userStart = m_asciiBuffer.size() - 1;
1535 m_url.m_userEnd = m_url.m_userStart;
1536 m_url.m_passwordEnd = m_url.m_userStart;
1537 m_url.m_hostEnd = m_url.m_userStart;
1538 m_url.m_portEnd = m_url.m_userStart;
1539 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1540 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1541 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1542 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1544 case State::FileHost:
1545 LOG_FINAL_STATE("FileHost");
1546 if (authorityOrHostBegin == c) {
1547 m_asciiBuffer.append('/');
1548 m_url.m_userStart = m_asciiBuffer.size() - 1;
1549 m_url.m_userEnd = m_url.m_userStart;
1550 m_url.m_passwordEnd = m_url.m_userStart;
1551 m_url.m_hostEnd = m_url.m_userStart;
1552 m_url.m_portEnd = m_url.m_userStart;
1553 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1554 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1555 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1556 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1560 if (!parseHostAndPort<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
1561 return failure(input, length);
1563 if (StringView(m_asciiBuffer.data() + m_url.m_passwordEnd, m_asciiBuffer.size() - m_url.m_passwordEnd) == "localhost") {
1564 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1565 m_url.m_hostEnd = m_asciiBuffer.size();
1566 m_url.m_portEnd = m_url.m_hostEnd;
1568 m_asciiBuffer.append('/');
1569 m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1;
1570 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1571 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1572 m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash;
1574 case State::PathStart:
1575 LOG_FINAL_STATE("PathStart");
1576 RELEASE_ASSERT_NOT_REACHED();
1578 LOG_FINAL_STATE("Path");
1579 m_url.m_pathEnd = m_asciiBuffer.size();
1580 m_url.m_queryEnd = m_url.m_pathEnd;
1581 m_url.m_fragmentEnd = m_url.m_pathEnd;
1583 case State::CannotBeABaseURLPath:
1584 LOG_FINAL_STATE("CannotBeABaseURLPath");
1585 m_url.m_pathEnd = m_asciiBuffer.size();
1586 m_url.m_queryEnd = m_url.m_pathEnd;
1587 m_url.m_fragmentEnd = m_url.m_pathEnd;
1590 LOG_FINAL_STATE("Query");
1591 if (!isUTF8Encoding)
1592 encodeQuery(queryBuffer, m_asciiBuffer, encoding);
1593 m_url.m_queryEnd = m_asciiBuffer.size();
1594 m_url.m_fragmentEnd = m_url.m_queryEnd;
1596 case State::Fragment:
1597 LOG_FINAL_STATE("Fragment");
1598 m_url.m_fragmentEnd = m_asciiBuffer.size() + m_unicodeFragmentBuffer.size();
1602 if (m_unicodeFragmentBuffer.isEmpty()) {
1603 // FIXME: String::adopt should require a WTFMove.
1604 m_url.m_string = String::adopt(m_asciiBuffer);
1606 StringBuilder builder;
1607 builder.reserveCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
1608 builder.append(m_asciiBuffer.data(), m_asciiBuffer.size());
1609 for (size_t i = 0; i < m_unicodeFragmentBuffer.size(); ++i)
1610 builder.append(m_unicodeFragmentBuffer[i]);
1611 m_url.m_string = builder.toString();
1613 m_url.m_isValid = true;
1614 LOG(URLParser, "Parsed URL <%s>", m_url.m_string.utf8().data());
1615 ASSERT(internalValuesConsistent(m_url));
1619 template<bool serialized, typename CharacterType>
1620 void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
1622 if (iterator.atEnd()) {
1623 m_url.m_userEnd = m_asciiBuffer.size();
1624 m_url.m_passwordEnd = m_url.m_userEnd;
1627 for (; !iterator.atEnd(); ++iterator) {
1628 if (*iterator == ':') {
1630 m_url.m_userEnd = m_asciiBuffer.size();
1631 if (iterator.atEnd()) {
1632 m_url.m_passwordEnd = m_url.m_userEnd;
1633 if (m_url.m_userEnd > m_url.m_userStart)
1634 m_asciiBuffer.append('@');
1637 m_asciiBuffer.append(':');
1640 utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1642 for (; !iterator.atEnd(); ++iterator)
1643 utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
1644 m_url.m_passwordEnd = m_asciiBuffer.size();
1645 if (!m_url.m_userEnd)
1646 m_url.m_userEnd = m_url.m_passwordEnd;
1647 m_asciiBuffer.append('@');
1650 template<typename UnsignedIntegerType>
1651 void append(Vector<LChar>& destination, UnsignedIntegerType number)
1653 LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
1654 LChar* end = buf + WTF_ARRAY_LENGTH(buf);
1657 *--p = (number % 10) + '0';
1660 destination.append(p, end - p);
1663 inline static void serializeIPv4(uint32_t address, Vector<LChar>& buffer)
1665 append<uint8_t>(buffer, address >> 24);
1667 append<uint8_t>(buffer, address >> 16);
1669 append<uint8_t>(buffer, address >> 8);
1671 append<uint8_t>(buffer, address);
1674 inline static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
1677 for (; end < 8; end++) {
1684 inline static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
1686 Optional<size_t> longest;
1687 size_t longestLength = 0;
1688 for (size_t i = 0; i < 8; i++) {
1689 size_t length = zeroSequenceLength(address, i);
1691 if (length > 1 && (!longest || longestLength < length)) {
1693 longestLength = length;
1701 inline static void serializeIPv6Piece(uint16_t piece, Vector<LChar>& buffer)
1703 bool printed = false;
1704 if (auto nibble0 = piece >> 12) {
1705 buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
1708 auto nibble1 = piece >> 8 & 0xF;
1709 if (printed || nibble1) {
1710 buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
1713 auto nibble2 = piece >> 4 & 0xF;
1714 if (printed || nibble2)
1715 buffer.append(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
1716 buffer.append(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
1719 inline static void serializeIPv6(std::array<uint16_t, 8> address, Vector<LChar>& buffer)
1722 auto compressPointer = findLongestZeroSequence(address);
1723 for (size_t piece = 0; piece < 8; piece++) {
1724 if (compressPointer && compressPointer.value() == piece) {
1725 ASSERT(!address[piece]);
1729 buffer.append("::", 2);
1730 while (piece < 8 && !address[piece])
1735 serializeIPv6Piece(address[piece], buffer);
1742 template<typename CharacterType>
1743 inline static Optional<uint32_t> parseIPv4Number(CodePointIterator<CharacterType>& iterator)
1745 // FIXME: Check for overflow.
1746 enum class State : uint8_t {
1753 State state = State::UnknownBase;
1755 while (!iterator.atEnd()) {
1756 if (*iterator == '.') {
1761 case State::UnknownBase:
1762 if (*iterator == '0') {
1764 state = State::OctalOrHex;
1767 state = State::Decimal;
1769 case State::OctalOrHex:
1770 if (*iterator == 'x' || *iterator == 'X') {
1775 state = State::Octal;
1777 case State::Decimal:
1778 if (*iterator < '0' || *iterator > '9')
1781 value += *iterator - '0';
1785 if (*iterator < '0' || *iterator > '7')
1788 value += *iterator - '0';
1792 if (!isASCIIHexDigit(*iterator))
1795 value += toASCIIHexValue(*iterator);
1803 inline static uint64_t pow256(size_t exponent)
1805 RELEASE_ASSERT(exponent <= 4);
1806 uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
1807 return values[exponent];
1810 template<typename CharacterType>
1811 inline static Optional<uint32_t> parseIPv4Host(CodePointIterator<CharacterType> iterator)
1813 Vector<uint32_t, 4> items;
1814 items.reserveInitialCapacity(4);
1815 while (!iterator.atEnd()) {
1816 if (items.size() >= 4)
1818 if (auto item = parseIPv4Number(iterator))
1819 items.append(item.value());
1823 if (!items.size() || items.size() > 4)
1825 if (items.size() > 2) {
1826 for (size_t i = 0; i < items.size() - 2; i++) {
1831 if (items[items.size() - 1] >= pow256(5 - items.size()))
1833 for (auto item : items) {
1837 uint32_t ipv4 = items.takeLast();
1838 for (size_t counter = 0; counter < items.size(); ++counter)
1839 ipv4 += items[counter] * pow256(3 - counter);
1843 template<typename CharacterType>
1844 inline static Optional<std::array<uint16_t, 8>> parseIPv6Host(CodePointIterator<CharacterType> c)
1849 std::array<uint16_t, 8> address = {{0, 0, 0, 0, 0, 0, 0, 0}};
1850 size_t piecePointer = 0;
1851 Optional<size_t> compressPointer;
1861 compressPointer = piecePointer;
1864 while (!c.atEnd()) {
1865 if (piecePointer == 8)
1868 if (compressPointer)
1872 compressPointer = piecePointer;
1876 for (size_t length = 0; length < 4; length++) {
1879 if (!isASCIIHexDigit(*c))
1881 value = value * 0x10 + toASCIIHexValue(*c);
1884 address[piecePointer++] = value;
1893 if (piecePointer > 6)
1895 size_t dotsSeen = 0;
1896 while (!c.atEnd()) {
1897 Optional<uint16_t> value;
1898 if (!isASCIIDigit(*c))
1900 while (isASCIIDigit(*c)) {
1901 auto number = *c - '0';
1904 else if (!value.value())
1907 value = value.value() * 10 + number;
1911 if (value.value() > 255)
1914 if (dotsSeen < 3 && *c != '.')
1916 address[piecePointer] = address[piecePointer] * 0x100 + value.valueOr(0);
1917 if (dotsSeen == 1 || dotsSeen == 3)
1921 if (dotsSeen == 3 && !c.atEnd())
1926 if (compressPointer) {
1927 size_t swaps = piecePointer - compressPointer.value();
1930 std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
1931 } else if (piecePointer != 8)
1936 const size_t defaultInlineBufferSize = 2048;
1938 inline static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
1940 Vector<LChar, defaultInlineBufferSize> output;
1941 output.reserveInitialCapacity(length);
1943 for (size_t i = 0; i < length; ++i) {
1944 uint8_t byte = input[i];
1946 output.uncheckedAppend(byte);
1947 else if (i < length - 2) {
1948 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
1949 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
1952 output.uncheckedAppend(byte);
1954 output.uncheckedAppend(byte);
1959 inline static bool containsOnlyASCII(const String& string)
1961 if (string.is8Bit())
1962 return charactersAreAllASCII(string.characters8(), string.length());
1963 return charactersAreAllASCII(string.characters16(), string.length());
1966 inline static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
1968 Vector<LChar, defaultInlineBufferSize> ascii;
1969 if (containsOnlyASCII(domain)) {
1970 size_t length = domain.length();
1971 if (domain.is8Bit()) {
1972 const LChar* characters = domain.characters8();
1973 ascii.reserveInitialCapacity(length);
1974 for (size_t i = 0; i < length; ++i)
1975 ascii.uncheckedAppend(toASCIILower(characters[i]));
1977 const UChar* characters = domain.characters16();
1978 ascii.reserveInitialCapacity(length);
1979 for (size_t i = 0; i < length; ++i)
1980 ascii.uncheckedAppend(toASCIILower(characters[i]));
1985 UChar hostnameBuffer[defaultInlineBufferSize];
1986 UErrorCode error = U_ZERO_ERROR;
1988 #if COMPILER(GCC) || COMPILER(CLANG)
1989 #pragma GCC diagnostic push
1990 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1992 // FIXME: This should use uidna_openUTS46 / uidna_close instead
1993 int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
1994 #if COMPILER(GCC) || COMPILER(CLANG)
1995 #pragma GCC diagnostic pop
1997 ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
1999 if (error == U_ZERO_ERROR) {
2000 for (int32_t i = 0; i < numCharactersConverted; ++i) {
2001 ASSERT(isASCII(hostnameBuffer[i]));
2002 ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2004 ascii.append(hostnameBuffer, numCharactersConverted);
2008 // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
2012 inline static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
2014 for (size_t i = 0; i < asciiDomain.size(); ++i) {
2015 if (isInvalidDomainCharacter(asciiDomain[i]))
2021 template<bool serialized, typename CharacterType>
2022 bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2025 if (iterator.atEnd()) {
2026 m_url.m_portEnd = m_asciiBuffer.size();
2029 m_asciiBuffer.append(':');
2030 for (; !iterator.atEnd(); ++iterator) {
2031 if (!serialized && isTabOrNewline(*iterator))
2033 if (isASCIIDigit(*iterator)) {
2034 port = port * 10 + *iterator - '0';
2035 if (port > std::numeric_limits<uint16_t>::max())
2041 if (isDefaultPort(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd), port)) {
2042 ASSERT(m_asciiBuffer.last() == ':');
2043 m_asciiBuffer.shrink(m_asciiBuffer.size() - 1);
2045 append<uint16_t>(m_asciiBuffer, static_cast<uint16_t>(port));
2047 m_url.m_portEnd = m_asciiBuffer.size();
2051 template<bool serialized, typename CharacterType>
2052 bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2054 if (iterator.atEnd())
2056 if (*iterator == '[') {
2058 auto ipv6End = iterator;
2059 while (!ipv6End.atEnd() && *ipv6End != ']')
2061 if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2062 serializeIPv6(address.value(), m_asciiBuffer);
2063 m_url.m_hostEnd = m_asciiBuffer.size();
2064 if (!ipv6End.atEnd()) {
2066 if (!ipv6End.atEnd() && *ipv6End == ':') {
2068 return parsePort<serialized>(ipv6End);
2070 m_url.m_portEnd = m_asciiBuffer.size();
2077 if (!m_hostHasPercentOrNonASCII) {
2078 auto hostIterator = iterator;
2079 for (; !iterator.atEnd(); ++iterator) {
2080 if (!serialized && isTabOrNewline(*iterator))
2082 if (*iterator == ':')
2084 if (isInvalidDomainCharacter(*iterator))
2087 if (auto address = parseIPv4Host(CodePointIterator<CharacterType>(hostIterator, iterator))) {
2088 serializeIPv4(address.value(), m_asciiBuffer);
2089 m_url.m_hostEnd = m_asciiBuffer.size();
2090 if (iterator.atEnd()) {
2091 m_url.m_portEnd = m_asciiBuffer.size();
2095 return parsePort<serialized>(iterator);
2097 for (; hostIterator != iterator; ++hostIterator) {
2098 if (serialized || !isTabOrNewline(*hostIterator))
2099 m_asciiBuffer.append(toASCIILower(*hostIterator));
2101 m_url.m_hostEnd = m_asciiBuffer.size();
2102 if (!hostIterator.atEnd()) {
2103 ASSERT(*hostIterator == ':');
2104 incrementIteratorSkippingTabAndNewLine<serialized>(hostIterator);
2105 return parsePort<serialized>(hostIterator);
2107 m_url.m_portEnd = m_asciiBuffer.size();
2111 Vector<LChar, defaultInlineBufferSize> utf8Encoded;
2112 for (; !iterator.atEnd(); ++iterator) {
2113 if (!serialized && isTabOrNewline(*iterator))
2115 if (*iterator == ':')
2117 uint8_t buffer[U8_MAX_LENGTH];
2119 UBool error = false;
2120 U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error);
2121 ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
2122 // FIXME: Check error.
2123 utf8Encoded.append(buffer, offset);
2125 Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
2126 String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2127 auto asciiDomain = domainToASCII(domain);
2128 if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
2130 Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
2131 const LChar* asciiDomainCharacters = asciiDomainValue.data();
2133 if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
2134 serializeIPv4(address.value(), m_asciiBuffer);
2135 m_url.m_hostEnd = m_asciiBuffer.size();
2136 if (iterator.atEnd()) {
2137 m_url.m_portEnd = m_asciiBuffer.size();
2141 return parsePort<serialized>(iterator);
2144 m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size());
2145 m_url.m_hostEnd = m_asciiBuffer.size();
2146 if (!iterator.atEnd()) {
2147 ASSERT(*iterator == ':');
2148 incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
2149 return parsePort<serialized>(iterator);
2151 m_url.m_portEnd = m_asciiBuffer.size();
2155 inline static Optional<String> formURLDecode(StringView input)
2157 auto utf8 = input.utf8(StrictConversion);
2160 auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2161 return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2164 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2166 Vector<StringView> sequences = input.split('&');
2168 URLEncodedForm output;
2169 for (auto& bytes : sequences) {
2170 auto valueStart = bytes.find('=');
2171 if (valueStart == notFound) {
2172 if (auto name = formURLDecode(bytes))
2173 output.append({name.value().replace('+', 0x20), emptyString()});
2175 auto name = formURLDecode(bytes.substring(0, valueStart));
2176 auto value = formURLDecode(bytes.substring(valueStart + 1));
2178 output.append(std::make_pair(name.value().replace('+', 0x20), value.value().replace('+', 0x20)));
2184 inline static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2186 auto utf8 = input.utf8(StrictConversion);
2187 const char* data = utf8.data();
2188 for (size_t i = 0; i < utf8.length(); ++i) {
2189 const char byte = data[i];
2191 output.append(0x2B);
2192 else if (byte == 0x2A
2195 || (byte >= 0x30 && byte <= 0x39)
2196 || (byte >= 0x41 && byte <= 0x5A)
2198 || (byte >= 0x61 && byte <= 0x7A))
2199 output.append(byte);
2201 percentEncode(byte, output);
2205 String URLParser::serialize(const URLEncodedForm& tuples)
2207 Vector<LChar> output;
2208 for (auto& tuple : tuples) {
2209 if (!output.isEmpty())
2211 serializeURLEncodedForm(tuple.first, output);
2213 serializeURLEncodedForm(tuple.second, output);
2215 return String::adopt(output);
2218 bool URLParser::allValuesEqual(const URL& a, const URL& b)
2220 // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it,
2221 // but once we get rid of URL::parse its value should be tested.
2222 LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2224 a.m_protocolIsInHTTPFamily,
2231 a.m_pathAfterLastSlash,
2235 a.m_string.utf8().data(),
2237 b.m_protocolIsInHTTPFamily,
2244 b.m_pathAfterLastSlash,
2248 b.m_string.utf8().data());
2250 return a.m_string == b.m_string
2251 && a.m_isValid == b.m_isValid
2252 && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2253 && a.m_schemeEnd == b.m_schemeEnd
2254 && a.m_userStart == b.m_userStart
2255 && a.m_userEnd == b.m_userEnd
2256 && a.m_passwordEnd == b.m_passwordEnd
2257 && a.m_hostEnd == b.m_hostEnd
2258 && a.m_portEnd == b.m_portEnd
2259 && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2260 && a.m_pathEnd == b.m_pathEnd
2261 && a.m_queryEnd == b.m_queryEnd
2262 && a.m_fragmentEnd == b.m_fragmentEnd;
2265 bool URLParser::internalValuesConsistent(const URL& url)
2267 return url.m_schemeEnd <= url.m_userStart
2268 && url.m_userStart <= url.m_userEnd
2269 && url.m_userEnd <= url.m_passwordEnd
2270 && url.m_passwordEnd <= url.m_hostEnd
2271 && url.m_hostEnd <= url.m_hostEnd
2272 && url.m_portEnd <= url.m_pathAfterLastSlash
2273 && url.m_pathAfterLastSlash <= url.m_pathEnd
2274 && url.m_pathEnd <= url.m_queryEnd
2275 && url.m_queryEnd <= url.m_fragmentEnd
2276 && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd);
2277 // FIXME: Why do we even store m_fragmentEnd?
2278 // It should be able to be deduced from m_isValid and m_string.length() to save memory.
2281 static bool urlParserEnabled = false;
2283 void URLParser::setEnabled(bool enabled)
2285 urlParserEnabled = enabled;
2288 bool URLParser::enabled()
2290 return urlParserEnabled;
2293 } // namespace WebCore