Rename StringBuilder::append(UChar32) to StringBuilder::appendCharacter(UChar32)...
[WebKit-https.git] / Source / WebCore / html / parser / HTMLEntityParser.cpp
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26  */
27
28 #include "config.h"
29 #include "HTMLEntityParser.h"
30
31 #include "CharacterReferenceParserInlines.h"
32 #include "HTMLEntitySearch.h"
33 #include "HTMLEntityTable.h"
34 #include <wtf/text/StringBuilder.h>
35 #include <wtf/unicode/CharacterNames.h>
36
37 namespace WebCore {
38
39 static const UChar windowsLatin1ExtensionArray[32] = {
40     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
41     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
42     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
43     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
44 };
45
46 class HTMLEntityParser {
47 public:
48     static UChar32 legalEntityFor(UChar32 value)
49     {
50         if (value <= 0 || value > UCHAR_MAX_VALUE || U_IS_SURROGATE(value))
51             return replacementCharacter;
52         if ((value & ~0x1F) != 0x80)
53             return value;
54         return windowsLatin1ExtensionArray[value - 0x80];
55     }
56
57     static bool acceptMalformed() { return true; }
58
59     static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
60     {
61         StringBuilder consumedCharacters;
62         HTMLEntitySearch entitySearch;
63         while (!source.isEmpty()) {
64             cc = source.currentCharacter();
65             entitySearch.advance(cc);
66             if (!entitySearch.isEntityPrefix())
67                 break;
68             consumedCharacters.append(cc);
69             source.advancePastNonNewline();
70         }
71         notEnoughCharacters = source.isEmpty();
72         if (notEnoughCharacters) {
73             // We can't an entity because there might be a longer entity
74             // that we could match if we had more data.
75             unconsumeCharacters(source, consumedCharacters);
76             return false;
77         }
78         if (!entitySearch.mostRecentMatch()) {
79             unconsumeCharacters(source, consumedCharacters);
80             return false;
81         }
82         if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
83             // We've consumed too many characters. We need to walk the
84             // source back to the point at which we had consumed an
85             // actual entity.
86             unconsumeCharacters(source, consumedCharacters);
87             consumedCharacters.clear();
88             const int length = entitySearch.mostRecentMatch()->length;
89             const LChar* reference = entitySearch.mostRecentMatch()->entity;
90             for (int i = 0; i < length; ++i) {
91                 cc = source.currentCharacter();
92                 ASSERT_UNUSED(reference, cc == *reference++);
93                 consumedCharacters.append(cc);
94                 source.advancePastNonNewline();
95                 ASSERT(!source.isEmpty());
96             }
97             cc = source.currentCharacter();
98         }
99         if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
100             || !additionalAllowedCharacter
101             || !(isASCIIAlphanumeric(cc) || cc == '=')) {
102             decodedEntity.appendCharacter(entitySearch.mostRecentMatch()->firstValue);
103             if (entitySearch.mostRecentMatch()->secondValue)
104                 decodedEntity.appendCharacter(entitySearch.mostRecentMatch()->secondValue);
105             return true;
106         }
107         unconsumeCharacters(source, consumedCharacters);
108         return false;
109     }
110 };
111
112 bool consumeHTMLEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
113 {
114     return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
115 }
116
117 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
118 {
119     if (U_IS_BMP(value)) {
120         UChar character = static_cast<UChar>(value);
121         ASSERT(character == value);
122         result[0] = character;
123         return 1;
124     }
125
126     result[0] = U16_LEAD(value);
127     result[1] = U16_TRAIL(value);
128     return 2;
129 }
130
131 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
132 {
133     HTMLEntitySearch search;
134     while (*name) {
135         search.advance(*name++);
136         if (!search.isEntityPrefix())
137             return 0;
138     }
139     search.advance(';');
140     if (!search.isEntityPrefix())
141         return 0;
142
143     size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
144     if (!search.mostRecentMatch()->secondValue)
145         return numberOfCodePoints;
146     return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
147 }
148
149 } // namespace WebCore