eeb4392dccb483a8ca83c1fec9ed51dc2bd71a34
[WebKit-https.git] / Source / WTF / wtf / text / icu / TextBreakIteratorICU.h
1 /*
2  * Copyright (C) 2017 Apple Inc. All rights reserved.
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Library General Public
6  * License as published by the Free Software Foundation; either
7  * version 2 of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Library General Public License for more details.
13  *
14  * You should have received a copy of the GNU Library General Public License
15  * along with this library; see the file COPYING.LIB.  If not, write to
16  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17  * Boston, MA 02110-1301, USA.
18  *
19  */
20
21 #pragma once
22
23 #include <unicode/ubrk.h>
24 #include <wtf/Optional.h>
25 #include <wtf/text/icu/UTextProviderLatin1.h>
26
27 #define USE_ICU_CURSOR_ITERATOR (PLATFORM(MAC) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
28
29 namespace WTF {
30
31 #if USE_ICU_CURSOR_ITERATOR
32 static String cursorRules()
33 {
34     return ASCIILiteral(
35         // This rule set is based on character-break iterator rules of ICU 57
36         // <http://source.icu-project.org/repos/icu/icu/tags/release-57-1/source/data/brkitr/>.
37         // The major differences from the original ones are listed below:
38         // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
39         // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
40         // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
41         // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
42         // * Added rules for regional indicator symbols.
43         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
44         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
45         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
46         "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
47         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
48         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
49         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
50         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
51         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
52         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
53         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
54         "$Hin0    = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
55         "$HinV    = \\u094D;" // Devanagari Sign Virama
56         "$Hin1    = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
57         "$Ben0    = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
58         "$BenV    = \\u09CD;" // Bengali Sign Virama
59         "$Ben1    = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
60         "$Pan0    = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
61         "$PanV    = \\u0A4D;" // Gurmukhi Sign Virama
62         "$Pan1    = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
63         "$Guj0    = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
64         "$GujV    = \\u0ACD;" // Gujarati Sign Virama
65         "$Guj1    = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
66         "$Ori0    = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
67         "$OriV    = \\u0B4D;" // Oriya Sign Virama
68         "$Ori1    = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
69         "$Tel0    = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
70         "$TelV    = \\u0C4D;" // Telugu Sign Virama
71         "$Tel1    = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha
72         "$Kan0    = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
73         "$KanV    = \\u0CCD;" // Kannada Sign Virama
74         "$Kan1    = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha
75         "$Mal0    = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
76         "$MalV    = \\u0D4D;" // Malayalam Sign Virama
77         "$Mal1    = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha
78         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
79         "$ZWJ     = \\u200D;" // Zero width joiner
80         "$EmojiVar = [\\uFE0F];" // Emoji-style variation selector
81         "$EmojiForSeqs = [\\u2640 \\u2642 \\u26F9 \\u2764 \\U0001F308 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA-\\U0001F3CC \\U0001F3F3 \\U0001F441 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F46F \\U0001F471 \\U0001F473 \\U0001F477 \\U0001F481-\\U0001F482 \\U0001F486-\\U0001F487 \\U0001F48B \\U0001F575 \\U0001F5E8 \\U0001F645-\\U0001F647 \\U0001F64B \\U0001F64D-\\U0001F64E \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\u2695-\\u2696 \\u2708 \\U0001F33E \\U0001F373 \\U0001F393 \\U0001F3A4 \\U0001F3A8 \\U0001F3EB \\U0001F3ED \\U0001F4BB-\\U0001F4BC \\U0001F527 \\U0001F52C \\U0001F680 \\U0001F692 \\U0001F926 \\U0001F937-\\U0001F939 \\U0001F93C-\\U0001F93E];" // Emoji that participate in ZWJ sequences
82         "$EmojiForMods = [\\u261D \\u26F9 \\u270A-\\u270D \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA \\U0001F3CB \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F575 \\U0001F590 \\U0001F595 \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0 \\U0001F918 \\U0001F3C2 \\U0001F3C7 \\U0001F3CC \\U0001F574 \\U0001F57A \\U0001F6CC \\U0001F919-\\U0001F91E \\U0001F926 \\U0001F930 \\U0001F933-\\U0001F939 \\U0001F93C-\\U0001F93E] ;" // Emoji that take Fitzpatrick modifiers
83         "$EmojiMods = [\\U0001F3FB-\\U0001F3FF];" // Fitzpatrick modifiers
84         "!!chain;"
85         "!!RINoChain;"
86         "!!forward;"
87         "$CR $LF;"
88         "$L ($L | $V | $LV | $LVT);"
89         "($LV | $V) ($V | $T);"
90         "($LVT | $T) $T;"
91         "$RI $RI $Extend* / $RI;"
92         "$RI $RI $Extend*;"
93         "[^$Control $CR $LF] $Extend;"
94         "[^$Control $CR $LF] $SpacingMark;"
95         "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
96         "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
97         "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
98         "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
99         "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
100         "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
101         "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
102         "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
103         "$ZWJ $EmojiForSeqs;" // Don't break in emoji ZWJ sequences
104         "$EmojiForMods $EmojiVar? $EmojiMods;" // Don't break between relevant emoji (possibly with variation selector) and Fitzpatrick modifier
105         "!!reverse;"
106         "$LF $CR;"
107         "($L | $V | $LV | $LVT) $L;"
108         "($V | $T) ($LV | $V);"
109         "$T ($LVT | $T);"
110         "$Extend* $RI $RI / $Extend* $RI $RI;"
111         "$Extend* $RI $RI;"
112         "$Extend      [^$Control $CR $LF];"
113         "$SpacingMark [^$Control $CR $LF];"
114         "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
115         "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
116         "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
117         "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
118         "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward)
119         "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
120         "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
121         "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
122         "$EmojiForSeqs $ZWJ;" // Don't break in emoji ZWJ sequences
123         "$EmojiMods $EmojiVar? $EmojiForMods;" // Don't break between relevant emoji (possibly with variation selector) and Fitzpatrick modifier
124         "!!safe_reverse;"
125         "$RI $RI+;"
126         "[$EmojiVar $EmojiMods]+ $EmojiForMods;"
127         "!!safe_forward;"
128         "$RI $RI+;"
129         "$EmojiForMods [$EmojiVar $EmojiMods]+;"
130     );
131 }
132 #endif
133
134 class TextBreakIteratorICU {
135 public:
136     enum class Mode {
137         Line,
138         Character,
139 #if USE_ICU_CURSOR_ITERATOR
140         Cursor,
141 #endif
142     };
143
144     void set8BitText(const LChar* buffer, unsigned length)
145     {
146         UTextWithBuffer textLocal;
147         textLocal.text = UTEXT_INITIALIZER;
148         textLocal.text.extraSize = sizeof(textLocal.buffer);
149         textLocal.text.pExtra = textLocal.buffer;
150
151         UErrorCode status = U_ZERO_ERROR;
152         UText* text = openLatin1UTextProvider(&textLocal, buffer, length, &status);
153         ASSERT(U_SUCCESS(status));
154         ASSERT(text);
155
156         ubrk_setUText(m_iterator, text, &status);
157         ASSERT(U_SUCCESS(status));
158
159         utext_close(text);
160     }
161
162     TextBreakIteratorICU(StringView string, Mode mode, const char *locale)
163     {
164         UBreakIteratorType type;
165         switch (mode) {
166         case Mode::Line:
167             type = UBRK_LINE;
168             break;
169         case Mode::Character:
170             type = UBRK_CHARACTER;
171             break;
172 #if USE_ICU_CURSOR_ITERATOR
173         case Mode::Cursor:
174             type = UBRK_CHARACTER;
175             break;
176 #endif
177         default:
178             ASSERT_NOT_REACHED();
179             type = UBRK_CHARACTER;
180             break;
181         }
182
183         bool requiresSet8BitText = string.is8Bit();
184
185         const UChar *text = requiresSet8BitText ? nullptr : string.characters16();
186         int32_t textLength = requiresSet8BitText ? 0 : string.length();
187
188         // FIXME: Handle weak / normal / strict line breaking.
189         UErrorCode status = U_ZERO_ERROR;
190 #if USE_ICU_CURSOR_ITERATOR
191         if (mode == Mode::Cursor) {
192             static NeverDestroyed<String> cursorRules = WTF::cursorRules();
193             static NeverDestroyed<StringView::UpconvertedCharacters> upconvertedRules = StringView(cursorRules).upconvertedCharacters();
194             UParseError parseError;
195             m_iterator = ubrk_openRules(upconvertedRules.get(), cursorRules.get().length(), text, textLength, &parseError, &status);
196         } else
197 #endif
198             m_iterator = ubrk_open(type, locale, text, textLength, &status);
199         ASSERT(U_SUCCESS(status));
200
201         if (requiresSet8BitText)
202             set8BitText(string.characters8(), string.length());
203     }
204
205     TextBreakIteratorICU() = delete;
206     TextBreakIteratorICU(const TextBreakIteratorICU&) = delete;
207
208     TextBreakIteratorICU(TextBreakIteratorICU&& other)
209         : m_iterator(other.m_iterator)
210     {
211         other.m_iterator = nullptr;
212     }
213
214     TextBreakIteratorICU& operator=(const TextBreakIteratorICU&) = delete;
215
216     TextBreakIteratorICU& operator=(TextBreakIteratorICU&& other)
217     {
218         if (m_iterator)
219             ubrk_close(m_iterator);
220         m_iterator = other.m_iterator;
221         other.m_iterator = nullptr;
222         return *this;
223     }
224
225     ~TextBreakIteratorICU()
226     {
227         if (m_iterator)
228             ubrk_close(m_iterator);
229     }
230
231     void setText(StringView string)
232     {
233         if (string.is8Bit()) {
234             set8BitText(string.characters8(), string.length());
235             return;
236         }
237         UErrorCode status = U_ZERO_ERROR;
238         ubrk_setText(m_iterator, string.characters16(), string.length(), &status);
239         ASSERT(U_SUCCESS(status));
240     }
241
242     std::optional<unsigned> preceding(unsigned location) const
243     {
244         auto result = ubrk_preceding(m_iterator, location);
245         if (result == UBRK_DONE)
246             return { };
247         return result;
248     }
249
250     std::optional<unsigned> following(unsigned location) const
251     {
252         auto result = ubrk_following(m_iterator, location);
253         if (result == UBRK_DONE)
254             return { };
255         return result;
256     }
257
258     bool isBoundary(unsigned location) const
259     {
260         return ubrk_isBoundary(m_iterator, location);
261     }
262
263 private:
264     UBreakIterator* m_iterator;
265 };
266
267 }