Quick follow-up to previous patch.
[WebKit-https.git] / Source / WTF / wtf / text / TextBreakIterator.cpp
1 /*
2  * (C) 1999 Lars Knoll (knoll@kde.org)
3  * Copyright (C) 2004-2016 Apple Inc. All rights reserved.
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public License
17  * along with this library; see the file COPYING.LIB.  If not, write to
18  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  * Boston, MA 02110-1301, USA.
20  */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "LineBreakIteratorPoolICU.h"
26 #include "TextBreakIteratorInternalICU.h"
27 #include "UTextProviderLatin1.h"
28 #include "UTextProviderUTF16.h"
29 #include <atomic>
30 #include <mutex>
31 #include <unicode/ubrk.h>
32 #include <wtf/text/StringBuilder.h>
33
34 // FIXME: This needs a better name
// Evaluates to true on iOS, and on Mac when __MAC_OS_X_VERSION_MIN_REQUIRED is at least 101100.
// It selects the larger emoji sets and the alternative regional-indicator rules in the rule strings of this file.
35 #define ADDITIONAL_EMOJI_SUPPORT (PLATFORM(IOS) || (PLATFORM(MAC) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 101100))
36
37 namespace WTF {
38
39 // Iterator initialization
40
41 static TextBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
42 {
43     UErrorCode openStatus = U_ZERO_ERROR;
44     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, locale, 0, 0, &openStatus));
45     ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
46     return iterator;
47 }
48
49 #if !PLATFORM(IOS)
50
51 static TextBreakIterator* initializeIteratorWithRules(const char* breakRules)
52 {
53     UParseError parseStatus;
54     UErrorCode openStatus = U_ZERO_ERROR;
55     unsigned length = strlen(breakRules);
56     auto upconvertedCharacters = StringView(reinterpret_cast<const LChar*>(breakRules), length).upconvertedCharacters();
57     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(upconvertedCharacters, length, 0, 0, &parseStatus, &openStatus));
58     ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
59     return iterator;
60 }
61
62 #endif
63
64
65 // Iterator text setting
66
67 static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string)
68 {
69     if (string.is8Bit()) {
70         UTextWithBuffer textLocal;
71         textLocal.text = UTEXT_INITIALIZER;
72         textLocal.text.extraSize = sizeof(textLocal.buffer);
73         textLocal.text.pExtra = textLocal.buffer;
74
75         UErrorCode openStatus = U_ZERO_ERROR;
76         UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus);
77         if (U_FAILURE(openStatus)) {
78             LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus);
79             return nullptr;
80         }
81
82         UErrorCode setTextStatus = U_ZERO_ERROR;
83         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
84         if (U_FAILURE(setTextStatus)) {
85             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
86             return nullptr;
87         }
88
89         utext_close(text);
90     } else {
91         UErrorCode setTextStatus = U_ZERO_ERROR;
92         ubrk_setText(reinterpret_cast<UBreakIterator*>(&iterator), string.characters16(), string.length(), &setTextStatus);
93         if (U_FAILURE(setTextStatus))
94             return nullptr;
95     }
96
97     return &iterator;
98 }
99
100 static TextBreakIterator* setContextAwareTextForIterator(TextBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength)
101 {
102     if (string.is8Bit()) {
103         UTextWithBuffer textLocal;
104         textLocal.text = UTEXT_INITIALIZER;
105         textLocal.text.extraSize = sizeof(textLocal.buffer);
106         textLocal.text.pExtra = textLocal.buffer;
107
108         UErrorCode openStatus = U_ZERO_ERROR;
109         UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus);
110         if (U_FAILURE(openStatus)) {
111             LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus);
112             return nullptr;
113         }
114
115         UErrorCode setTextStatus = U_ZERO_ERROR;
116         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
117         if (U_FAILURE(setTextStatus)) {
118             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
119             return nullptr;
120         }
121
122         utext_close(text);
123     } else {
124         UText textLocal = UTEXT_INITIALIZER;
125
126         UErrorCode openStatus = U_ZERO_ERROR;
127         UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus);
128         if (U_FAILURE(openStatus)) {
129             LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus);
130             return 0;
131         }
132
133         UErrorCode setTextStatus = U_ZERO_ERROR;
134         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
135         if (U_FAILURE(setTextStatus)) {
136             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
137             return nullptr;
138         }
139
140         utext_close(text);
141     }
142
143     return &iterator;
144 }
145
146
147 // Static iterators
148
149 TextBreakIterator* wordBreakIterator(StringView string)
150 {
151     static TextBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD);
152     if (!staticWordBreakIterator)
153         return nullptr;
154
155     return setTextForIterator(*staticWordBreakIterator, string);
156 }
157
158 TextBreakIterator* sentenceBreakIterator(StringView string)
159 {
160     static TextBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE);
161     if (!staticSentenceBreakIterator)
162         return nullptr;
163
164     return setTextForIterator(*staticSentenceBreakIterator, string);
165 }
166
// Returns the shared cursor-movement iterator retargeted at |string|, or nullptr if the iterator
// could not be created or the text could not be set. The iterator is created once, on the first call.
// Outside iOS it is built from the custom rule string below; on iOS ICU's "th" character iterator is used.
167 TextBreakIterator* cursorMovementIterator(StringView string)
168 {
169 #if !PLATFORM(IOS)
170     // This rule set is based on character-break iterator rules of ICU 57
171     // <http://source.icu-project.org/repos/icu/icu/tags/release-57-1/source/data/brkitr/>.
172     // The major differences from the original ones are listed below:
173     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
174     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
175     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
176     // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks.
177     // * Added rules for regional indicator symbols.
178     static const char* kRules =
179         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
180         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
181         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
182         "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
183         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
184         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
185         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
186         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
187         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
188         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
189         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
190         "$Hin0    = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
191         "$HinV    = \\u094D;" // Devanagari Sign Virama
192         "$Hin1    = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
193         "$Ben0    = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
194         "$BenV    = \\u09CD;" // Bengali Sign Virama
195         "$Ben1    = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
196         "$Pan0    = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
197         "$PanV    = \\u0A4D;" // Gurmukhi Sign Virama
198         "$Pan1    = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
199         "$Guj0    = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
200         "$GujV    = \\u0ACD;" // Gujarati Sign Virama
201         "$Guj1    = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
202         "$Ori0    = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
203         "$OriV    = \\u0B4D;" // Oriya Sign Virama
204         "$Ori1    = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
205         "$Tel0    = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
206         "$TelV    = \\u0C4D;" // Telugu Sign Virama
207         "$Tel1    = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha. NOTE(review): starts at U+0C14 while every other script's "1" range starts at xx15 - confirm whether U+0C15 was intended.
208         "$Kan0    = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
209         "$KanV    = \\u0CCD;" // Kannada Sign Virama
210         "$Kan1    = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha
211         "$Mal0    = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
212         "$MalV    = \\u0D4D;" // Malayalam Sign Virama
213         "$Mal1    = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha
214         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
215         "$ZWJ     = \\u200D;" // Zero width joiner
216         "$EmojiVar = [\\uFE0F];" // Emoji-style variation selector
217 #if ADDITIONAL_EMOJI_SUPPORT
218         "$EmojiForSeqs = [\\u2640 \\u2642 \\u26F9 \\u2764 \\U0001F308 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA-\\U0001F3CC \\U0001F3F3 \\U0001F441 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F46F \\U0001F471 \\U0001F473 \\U0001F477 \\U0001F481-\\U0001F482 \\U0001F486-\\U0001F487 \\U0001F48B \\U0001F575 \\U0001F5E8 \\U0001F645-\\U0001F647 \\U0001F64B \\U0001F64D-\\U0001F64E \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\u2695-\\u2696 \\u2708 \\U0001F33E \\U0001F373 \\U0001F393 \\U0001F3A4 \\U0001F3A8 \\U0001F3EB \\U0001F3ED \\U0001F4BB-\\U0001F4BC \\U0001F527 \\U0001F52C \\U0001F680 \\U0001F692 \\U0001F926 \\U0001F937-\\U0001F939 \\U0001F93C-\\U0001F93E];" // Emoji that participate in ZWJ sequences
219         "$EmojiForMods = [\\u261D \\u26F9 \\u270A-\\u270D \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA \\U0001F3CB \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F575 \\U0001F590 \\U0001F595 \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0 \\U0001F918 \\U0001F3C2 \\U0001F3C7 \\U0001F3CC \\U0001F574 \\U0001F57A \\U0001F6CC \\U0001F919-\\U0001F91E \\U0001F926 \\U0001F930 \\U0001F933-\\U0001F939 \\U0001F93C-\\U0001F93E] ;" // Emoji that take Fitzpatrick modifiers
220 #else
221         "$EmojiForSeqs = [\\u2764 \\U0001F466-\\U0001F469 \\U0001F48B];" // Emoji that participate in ZWJ sequences
222         "$EmojiForMods = [\\u261D \\u270A-\\u270C \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3C7 \\U0001F3CA \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0] ;" // Emoji that take Fitzpatrick modifiers
223 #endif
224         "$EmojiMods = [\\U0001F3FB-\\U0001F3FF];" // Fitzpatrick modifiers
225         "!!chain;"
226 #if ADDITIONAL_EMOJI_SUPPORT
227         "!!RINoChain;"
228 #endif
229         "!!forward;"
230         "$CR $LF;"
231         "$L ($L | $V | $LV | $LVT);"
232         "($LV | $V) ($V | $T);"
233         "($LVT | $T) $T;"
234 #if ADDITIONAL_EMOJI_SUPPORT
235         "$RI $RI $Extend* / $RI;"
236         "$RI $RI $Extend*;"
237         "[^$Control $CR $LF] $Extend;"
238         "[^$Control $CR $LF] $SpacingMark;"
239 #else
240         "[^$Control $CR $LF] $Extend;"
241         "[^$Control $CR $LF] $SpacingMark;"
242         "$RI $RI / $RI;"
243         "$RI $RI;"
244 #endif
245         "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
246         "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
247         "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
248         "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
249         "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
250         "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
251         "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
252         "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
253         "$ZWJ $EmojiForSeqs;" // Don't break in emoji ZWJ sequences
254         "$EmojiForMods $EmojiVar? $EmojiMods;" // Don't break between relevant emoji (possibly with variation selector) and Fitzpatrick modifier
255         "!!reverse;"
256         "$LF $CR;"
257         "($L | $V | $LV | $LVT) $L;"
258         "($V | $T) ($LV | $V);"
259         "$T ($LVT | $T);"
260 #if ADDITIONAL_EMOJI_SUPPORT
261         "$Extend* $RI $RI / $Extend* $RI $RI;"
262         "$Extend* $RI $RI;"
263         "$Extend      [^$Control $CR $LF];"
264         "$SpacingMark [^$Control $CR $LF];"
265 #else
266         "$Extend      [^$Control $CR $LF];"
267         "$SpacingMark [^$Control $CR $LF];"
268         "$RI $RI / $RI $RI;"
269         "$RI $RI;"
270 #endif
271         "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
272         "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
273         "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
274         "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
275         "$Ori1 $OriV $Ori0;" // Oriya Virama (backward)
276         "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
277         "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
278         "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
279         "$EmojiForSeqs $ZWJ;" // Don't break in emoji ZWJ sequences
280         "$EmojiMods $EmojiVar? $EmojiForMods;" // Don't break between relevant emoji (possibly with variation selector) and Fitzpatrick modifier
281 #if ADDITIONAL_EMOJI_SUPPORT
282         "!!safe_reverse;"
283         "$RI $RI+;"
284         "[$EmojiVar $EmojiMods]+ $EmojiForMods;"
285         "!!safe_forward;"
286         "$RI $RI+;"
287         "$EmojiForMods [$EmojiVar $EmojiMods]+;";
288 #else
289         "[$EmojiVar $EmojiMods]+ $EmojiForMods;"
290         "$EmojiForMods [$EmojiVar $EmojiMods]+;"
291         "!!safe_reverse;"
292         "!!safe_forward;";
293 #endif
294     static TextBreakIterator* staticCursorMovementIterator = initializeIteratorWithRules(kRules);
295 #else // PLATFORM(IOS)
296     // Use the special Thai character break iterator for all locales
297     static TextBreakIterator* staticCursorMovementIterator = initializeIterator(UBRK_CHARACTER, "th");
298 #endif // !PLATFORM(IOS)
299
300     if (!staticCursorMovementIterator)
301         return nullptr;
302
303     return setTextForIterator(*staticCursorMovementIterator, string);
304 }
305
306 TextBreakIterator* acquireLineBreakIterator(StringView string, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode mode, bool isCJK)
307 {
308     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode, isCJK);
309     if (!iterator)
310         return nullptr;
311
312     return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength);
313 }
314
315 void releaseLineBreakIterator(TextBreakIterator* iterator)
316 {
317     ASSERT_ARG(iterator, iterator);
318
319     LineBreakIteratorPool::sharedPool().put(iterator);
320 }
321
// Option directives placed at the very start of every rule string assembled by mapLineIteratorModeToRules().
322 static const char* uax14Prologue =
323     "!!chain;"
324     "!!LBCMNoChain;"
325     "!!lookAheadHardBreak;";
326
// Defines $CJ, the set the mode-specific assignment strings add to $ID or $NS.
// ICU 49 and later can name the set through the LineBreak property; for older ICU the characters
// are enumerated explicitly because :LineBreak=Conditional_Japanese_Starter: is not supported there.
// The version test accepts both the old "4.x" numbering and the "49+" numbering. The previous test,
// (major >= 4) && (minor >= 9), was false for releases such as 49.1 or 50.1.
static const char* uax14AssignmentsBefore =
    "$CJ = ["
#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM >= 9))
    ":LineBreak=Conditional_Japanese_Starter:"
#else
    "\\u3041\\u3043\\u3045\\u3047\\u3049\\u3063\\u3083\\u3085\\u3087\\u308E\\u3095\\u3096\\u30A1\\u30A3\\u30A5\\u30A7"
    "\\u30A9\\u30C3\\u30E3\\u30E5\\u30E7\\u30EE\\u30F5\\u30F6\\u30FC"
    "\\u31F0\\u31F1\\u31F2\\u31F3\\u31F4\\u31F5\\u31F6\\u31F7\\u31F8\\u31F9\\u31FA\\u31FB\\u31FC\\u31FD\\u31FE\\u31FF"
    "\\uFF67\\uFF68\\uFF69\\uFF6A\\uFF6B\\uFF6C\\uFF6D\\uFF6E\\uFF6F\\uFF70"
#endif
    "];";
339
// Class adjustments used for LineBreakIteratorMode::Loose when isCJK is true.
// uax14AssignmentsAfter removes each $XX_SUB set from the matching class and adds $ID_ADD / $NS_ADD to $ID / $NS.
340 static const char* uax14AssignmentsCustomLooseCJK =
341     "$BA_SUB = [\\u2010\\u2013];"
342     "$EX_SUB = [\\u0021\\u003F\\uFF01\\uFF1F];"
343     "$ID_SUB = '';"
344     "$IN_SUB = [\\u2025\\u2026];"
345     "$IS_SUB = [\\u003A\\u003B];"
346     "$NS_SUB = [\\u203C\\u2047\\u2048\\u2049\\u3005\\u301C\\u303B\\u309D\\u309E\\u30A0\\u30FB\\u30FD\\u30FE\\uFF1A\\uFF1B\\uFF65];"
347     "$PO_SUB = [\\u0025\\u00A2\\u00B0\\u2030\\u2032\\u2033\\u2103\\uFF05\\uFFE0];"
348     "$PR_SUB = [\\u0024\\u00A3\\u00A5\\u20AC\\u2116\\uFF04\\uFFE1\\uFFE5];"
349     "$ID_ADD = [$CJ $BA_SUB $EX_SUB $IN_SUB $IS_SUB $NS_SUB $PO_SUB $PR_SUB];"
350     "$NS_ADD = '';";
351
// Class adjustments used for LineBreakIteratorMode::Loose when isCJK is false.
// Only $IN_SUB and $NS_SUB are non-empty here; they and $CJ are added to $ID through $ID_ADD.
352 static const char* uax14AssignmentsCustomLooseNonCJK =
353     "$BA_SUB = '';"
354     "$EX_SUB = '';"
355     "$ID_SUB = '';"
356     "$IN_SUB = [\\u2025\\u2026];"
357     "$IS_SUB = '';"
358     "$NS_SUB = [\\u3005\\u303B\\u309D\\u309E\\u30FD\\u30FE];"
359     "$PO_SUB = '';"
360     "$PR_SUB = '';"
361     "$ID_ADD = [$CJ $IN_SUB $NS_SUB];"
362     "$NS_ADD = '';";
363
// Class adjustments used for LineBreakIteratorMode::Normal when isCJK is true
// (and, through the uax14AssignmentsCustomDefaultCJK alias, for Default mode with CJK).
// Only $BA_SUB and $NS_SUB are non-empty here; they and $CJ are added to $ID through $ID_ADD.
364 static const char* uax14AssignmentsCustomNormalCJK =
365     "$BA_SUB = [\\u2010\\u2013];"
366     "$EX_SUB = '';"
367     "$IN_SUB = '';"
368     "$ID_SUB = '';"
369     "$IS_SUB = '';"
370     "$NS_SUB = [\\u301C\\u30A0];"
371     "$PO_SUB = '';"
372     "$PR_SUB = '';"
373     "$ID_ADD = [$CJ $BA_SUB $NS_SUB];"
374     "$NS_ADD = '';";
375
// Class adjustments used for LineBreakIteratorMode::Normal when isCJK is false.
// Every $XX_SUB set is empty; only $CJ is added to $ID through $ID_ADD.
376 static const char* uax14AssignmentsCustomNormalNonCJK =
377     "$BA_SUB = '';"
378     "$EX_SUB = '';"
379     "$ID_SUB = '';"
380     "$IN_SUB = '';"
381     "$IS_SUB = '';"
382     "$NS_SUB = '';"
383     "$PO_SUB = '';"
384     "$PR_SUB = '';"
385     "$ID_ADD = [$CJ];"
386     "$NS_ADD = '';";
387
// Class adjustments used for LineBreakIteratorMode::Strict (CJK and, via alias, non-CJK).
// Every $XX_SUB set is empty and $CJ is added to $NS (through $NS_ADD) rather than to $ID.
388 static const char* uax14AssignmentsCustomStrictCJK =
389     "$BA_SUB = '';"
390     "$EX_SUB = '';"
391     "$ID_SUB = '';"
392     "$IN_SUB = '';"
393     "$IS_SUB = '';"
394     "$NS_SUB = '';"
395     "$PO_SUB = '';"
396     "$PR_SUB = '';"
397     "$ID_ADD = '';"
398     "$NS_ADD = [$CJ];";
399
// Aliases for mode/script combinations that reuse another combination's assignments:
// Strict non-CJK uses the Strict CJK string, Default CJK uses the Normal CJK string,
// and Default non-CJK uses the Strict non-CJK alias (and therefore the Strict CJK string).
400 #define uax14AssignmentsCustomStrictNonCJK      uax14AssignmentsCustomStrictCJK
401 #define uax14AssignmentsCustomDefaultCJK        uax14AssignmentsCustomNormalCJK
402 #define uax14AssignmentsCustomDefaultNonCJK     uax14AssignmentsCustomStrictNonCJK
403
// Variable definitions shared by every line-break rule set; appended after the mode-specific
// assignment string by mapLineIteratorModeToRules().
// - One variable per line-break class. $BA, $EX, $ID, $IN, $IS, $NS, $PO and $PR are adjusted by the
//   $XX_SUB / $ID_ADD / $NS_ADD sets that the mode-specific string defined earlier.
// - Emoji helper sets, whose contents depend on ADDITIONAL_EMOJI_SUPPORT.
// - "$XXcm" shorthands: the class followed by any number of $CM characters.
// The $HL version test accepts both the old "4.x" numbering and the "49+" numbering. The previous
// test, (major >= 4) && (minor >= 9), was false for releases such as 49.1 or 50.1.
static const char* uax14AssignmentsAfter =
    "$AI = [:LineBreak = Ambiguous:];"
    "$AL = [:LineBreak = Alphabetic:];"
    "$BA = [[:LineBreak = Break_After:] - $BA_SUB];"
    "$BB = [:LineBreak = Break_Before:];"
    "$BK = [:LineBreak = Mandatory_Break:];"
    "$B2 = [:LineBreak = Break_Both:];"
    "$CB = [:LineBreak = Contingent_Break:];"
    "$CL = [:LineBreak = Close_Punctuation:];"
    "$CM = [:LineBreak = Combining_Mark:];"
    "$CP = [:LineBreak = Close_Parenthesis:];"
    "$CR = [:LineBreak = Carriage_Return:];"
    "$EX = [[:LineBreak = Exclamation:] - $EX_SUB];"
    "$GL = [:LineBreak = Glue:];"
#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM >= 9))
    "$HL = [:LineBreak = Hebrew_Letter:];"
#else
    "$HL = [[:Hebrew:] & [:Letter:]];"
#endif
    "$HY = [:LineBreak = Hyphen:];"
    "$H2 = [:LineBreak = H2:];"
    "$H3 = [:LineBreak = H3:];"
    "$ID = [[[[:LineBreak = Ideographic:] - $CJ] $ID_ADD] - $ID_SUB];"
    "$IN = [[:LineBreak = Inseparable:] - $IN_SUB];"
    "$IS = [[:LineBreak = Infix_Numeric:] - $IS_SUB];"
    "$JL = [:LineBreak = JL:];"
    "$JV = [:LineBreak = JV:];"
    "$JT = [:LineBreak = JT:];"
    "$LF = [:LineBreak = Line_Feed:];"
    "$NL = [:LineBreak = Next_Line:];"
    "$NS = [[[[:LineBreak = Nonstarter:] - $CJ] $NS_ADD] - $NS_SUB];"
    "$NU = [:LineBreak = Numeric:];"
    "$OP = [:LineBreak = Open_Punctuation:];"
    "$PO = [[:LineBreak = Postfix_Numeric:] - $PO_SUB];"
    "$PR = [[:LineBreak = Prefix_Numeric:] - $PR_SUB];"
    "$QU = [:LineBreak = Quotation:];"
    "$RI = [\\U0001F1E6-\\U0001F1FF];"
    "$SA = [:LineBreak = Complex_Context:];"
    "$SG = [:LineBreak = Surrogate:];"
    "$SP = [:LineBreak = Space:];"
    "$SY = [:LineBreak = Break_Symbols:];"
    "$WJ = [:LineBreak = Word_Joiner:];"
    "$XX = [:LineBreak = Unknown:];"
    "$ZW = [:LineBreak = ZWSpace:];"
    "$ZWJ = \\u200D;"
    "$EmojiVar = \\uFE0F;"
#if ADDITIONAL_EMOJI_SUPPORT
    "$EmojiForSeqs = [\\u2640 \\u2642 \\u26F9 \\u2764 \\U0001F308 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA-\\U0001F3CC \\U0001F3F3 \\U0001F441 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F46F \\U0001F471 \\U0001F473 \\U0001F477 \\U0001F481-\\U0001F482 \\U0001F486-\\U0001F487 \\U0001F48B \\U0001F575 \\U0001F5E8 \\U0001F645-\\U0001F647 \\U0001F64B \\U0001F64D-\\U0001F64E \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\u2695-\\u2696 \\u2708 \\U0001F33E \\U0001F373 \\U0001F393 \\U0001F3A4 \\U0001F3A8 \\U0001F3EB \\U0001F3ED \\U0001F4BB-\\U0001F4BC \\U0001F527 \\U0001F52C \\U0001F680 \\U0001F692 \\U0001F926 \\U0001F937-\\U0001F939 \\U0001F93C-\\U0001F93E];" // Emoji that participate in ZWJ sequences
    "$EmojiForMods = [\\u261D \\u26F9 \\u270A-\\u270D \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA \\U0001F3CB \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F575 \\U0001F590 \\U0001F595 \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0 \\U0001F918 \\U0001F3C2 \\U0001F3C7 \\U0001F3CC \\U0001F574 \\U0001F57A \\U0001F6CC \\U0001F919-\\U0001F91E \\U0001F926 \\U0001F930 \\U0001F933-\\U0001F939 \\U0001F93C-\\U0001F93E] ;" // Emoji that take Fitzpatrick modifiers
#else
    "$EmojiForSeqs = [\\u2764 \\U0001F466-\\U0001F469 \\U0001F48B];"
    "$EmojiForMods = [\\u261D \\u270A-\\u270C \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3C7 \\U0001F3CA \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0] ;" // Emoji that take Fitzpatrick modifiers
#endif
    "$EmojiMods = [\\U0001F3FB-\\U0001F3FF];"
    "$dictionary = [:LineBreak = Complex_Context:];"
    "$ALPlus = [$AL $AI $SA $SG $XX];"
    "$ALcm = $ALPlus $CM*;"
    "$BAcm = $BA $CM*;"
    "$BBcm = $BB $CM*;"
    "$B2cm = $B2 $CM*;"
    "$CLcm = $CL $CM*;"
    "$CPcm = $CP $CM*;"
    "$EXcm = $EX $CM*;"
    "$GLcm = $GL $CM*;"
    "$HLcm = $HL $CM*;"
    "$HYcm = $HY $CM*;"
    "$H2cm = $H2 $CM*;"
    "$H3cm = $H3 $CM*;"
    "$IDcm = $ID $CM*;"
    "$INcm = $IN $CM*;"
    "$IScm = $IS $CM*;"
    "$JLcm = $JL $CM*;"
    "$JVcm = $JV $CM*;"
    "$JTcm = $JT $CM*;"
    "$NScm = $NS $CM*;"
    "$NUcm = $NU $CM*;"
    "$OPcm = $OP $CM*;"
    "$POcm = $PO $CM*;"
    "$PRcm = $PR $CM*;"
    "$QUcm = $QU $CM*;"
    "$RIcm = $RI $CM*;"
    "$SYcm = $SY $CM*;"
    "$WJcm = $WJ $CM*;";
487
// Forward ("!!forward;") section of the line-break rule set, appended after uax14AssignmentsAfter
// by mapLineIteratorModeToRules(). It first defines helper sets ($CAN_CM, $CANT_CM, $AL_FOLLOW*,
// $LBnnBreaks / $LBnnNonBreaks) and then lists the forward rules; uax14Reverse also uses those sets.
// The rule after the "$CM* $GLcm" rule previously read "$CM+ GLcm;" - without the '$' it named the
// literal characters "GLcm" instead of the $GLcm variable - and now reads "$CM+ $GLcm;".
static const char* uax14Forward =
    "!!forward;"
    "$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM];"
    "$CANT_CM = [$SP $BK $CR $LF $NL $ZW $CM];"
    "$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];"
    "$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];"
    "$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];"
    "$LB4Breaks = [$BK $CR $LF $NL];"
    "$LB4NonBreaks = [^$BK $CR $LF $NL];"
    "$LB8Breaks = [$LB4Breaks $ZW];"
    "$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];"
    "$LB18NonBreaks = [$LB8NonBreaks - [$SP]];"
    "$LB18Breaks = [$LB8Breaks $SP];"
    "$LB20NonBreaks = [$LB18NonBreaks - $CB];"
    "$ALPlus $CM+;"
    "$BA $CM+;"
    "$BB $CM+;"
    "$B2 $CM+;"
    "$CL $CM+;"
    "$CP $CM+;"
    "$EX $CM+;"
    "$GL $CM+;"
    "$HL $CM+;"
    "$HY $CM+;"
    "$H2 $CM+;"
    "$H3 $CM+;"
    "$ID $CM+;"
    "$IN $CM+;"
    "$IS $CM+;"
    "$JL $CM+;"
    "$JV $CM+;"
    "$JT $CM+;"
    "$NS $CM+;"
    "$NU $CM+;"
    "$OP $CM+;"
    "$PO $CM+;"
    "$PR $CM+;"
    "$QU $CM+;"
    "$RI $CM+;"
    "$SY $CM+;"
    "$WJ $CM+;"
    "$CR $LF {100};"
    "$LB4NonBreaks? $LB4Breaks {100};"
    "$CAN_CM $CM* $LB4Breaks {100};"
    "$CM+ $LB4Breaks {100};"
    "$LB4NonBreaks [$SP $ZW];"
    "$CAN_CM $CM* [$SP $ZW];"
    "$CM+ [$SP $ZW];"
    "$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;"
    "$CAN_CM $CM+;"
    "$CM+;"
    "$CAN_CM $CM* $WJcm;"
    "$LB8NonBreaks $WJcm;"
    "$CM+ $WJcm;"
    "$WJcm $CANT_CM;"
    "$WJcm $CAN_CM $CM*;"
    "$GLcm $CAN_CM $CM*;"
    "$GLcm $CANT_CM;"
    "[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;"
    "$CM+ $GLcm;"
    "$LB8NonBreaks $CL;"
    "$CAN_CM $CM* $CL;"
    "$CM+ $CL;"
    "$LB8NonBreaks $CP;"
    "$CAN_CM $CM* $CP;"
    "$CM+ $CP;"
    "$LB8NonBreaks $EX;"
    "$CAN_CM $CM* $EX;"
    "$CM+ $EX;"
    "$LB8NonBreaks $IS;"
    "$CAN_CM $CM* $IS;"
    "$CM+ $IS;"
    "$LB8NonBreaks $SY;"
    "$CAN_CM $CM* $SY;"
    "$CM+ $SY;"
    "$OPcm $SP* $CAN_CM $CM*;"
    "$OPcm $SP* $CANT_CM;"
    "$OPcm $SP+ $CM+ $AL_FOLLOW?;"
    "$QUcm $SP* $OPcm;"
    "($CLcm | $CPcm) $SP* $NScm;"
    "$B2cm $SP* $B2cm;"
    "$LB18NonBreaks $CM* $QUcm;"
    "$CM+ $QUcm;"
    "$QUcm .?;"
    "$QUcm $LB18NonBreaks $CM*;"
    "$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); "
    "$BBcm [^$CB];"
    "$BBcm $LB20NonBreaks $CM*;"
    "$HLcm ($HYcm | $BAcm) [^$CB]?;"
    "$SYcm $HLcm;"
    "($ALcm | $HLcm) $INcm;"
    "$CM+ $INcm;"
    "$EXcm $INcm;"
    "$IDcm $INcm;"
    "$INcm $INcm;"
    "$NUcm $INcm;"
    "$IDcm $POcm;"
    "$ALcm $NUcm;"
    "$HLcm $NUcm;"
    "$CM+ $NUcm;"
    "$NUcm $ALcm;"
    "$NUcm $HLcm;"
    "$PRcm $IDcm;"
    "$PRcm ($ALcm | $HLcm);"
    "$POcm ($ALcm | $HLcm);"
    "($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;"
    "$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);"
    "($JVcm | $H2cm) ($JVcm | $JTcm);"
    "($JTcm | $H3cm) $JTcm;"
    "($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;"
    "($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;"
    "$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);"
    "($ALcm | $HLcm) ($ALcm | $HLcm);"
    "$CM+ ($ALcm | $HLcm);"
    "$IScm ($ALcm | $HLcm);"
    "($ALcm | $HLcm | $NUcm) $OPcm;"
    "$CM+ $OPcm;"
    "$CPcm ($ALcm | $HLcm | $NUcm);"
#if ADDITIONAL_EMOJI_SUPPORT
    "$RIcm $RIcm;"
#endif
    "$EmojiForMods $EmojiVar? $EmojiMods;";
610
// Reverse ("!!reverse;") section of the line-break rule set, appended after uax14Forward by
// mapLineIteratorModeToRules(). It relies on variables defined in uax14AssignmentsAfter and on the
// helper sets defined at the top of uax14Forward ($CAN_CM, $CANT_CM, $AL_FOLLOW*, $LBnn*).
// The two regional-indicator rules are only included when ADDITIONAL_EMOJI_SUPPORT is set.
611 static const char* uax14Reverse =
612     "!!reverse;"
613     "$CM+ $ALPlus;"
614     "$CM+ $BA;"
615     "$CM+ $BB;"
616     "$CM+ $B2;"
617     "$CM+ $CL;"
618     "$CM+ $CP;"
619     "$CM+ $EX;"
620     "$CM+ $GL;"
621     "$CM+ $HL;"
622     "$CM+ $HY;"
623     "$CM+ $H2;"
624     "$CM+ $H3;"
625     "$CM+ $ID;"
626     "$CM+ $IN;"
627     "$CM+ $IS;"
628     "$CM+ $JL;"
629     "$CM+ $JV;"
630     "$CM+ $JT;"
631     "$CM+ $NS;"
632     "$CM+ $NU;"
633     "$CM+ $OP;"
634     "$CM+ $PO;"
635     "$CM+ $PR;"
636     "$CM+ $QU;"
637 #if ADDITIONAL_EMOJI_SUPPORT
638     "$CM+ $RI;"
639 #endif
640     "$CM+ $SY;"
641     "$CM+ $WJ;"
642     "$CM+;"
643     "$AL_FOLLOW $CM+ / ([$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));"
644     "[$PR] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];"
645     "$LB4Breaks [$LB4NonBreaks-$CM];"
646     "$LB4Breaks $CM+ $CAN_CM;"
647     "$LF $CR;"
648     "[$SP $ZW] [$LB4NonBreaks-$CM];"
649     "[$SP $ZW] $CM+ $CAN_CM;"
650     "$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;"
651     "$CM+ $CAN_CM;"
652     "$CM* $WJ $CM* $CAN_CM;"
653     "$CM* $WJ [$LB8NonBreaks-$CM];"
654     "$CANT_CM $CM* $WJ;"
655     "$CM* $CAN_CM $CM* $WJ;"
656     "$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];"
657     "$CANT_CM $CM* $GL;"
658     "$CM* $CAN_CM $CM* $GL;"
659     "$CL $CM+ $CAN_CM;"
660     "$CP $CM+ $CAN_CM;"
661     "$EX $CM+ $CAN_CM;"
662     "$IS $CM+ $CAN_CM;"
663     "$SY $CM+ $CAN_CM;"
664     "$CL [$LB8NonBreaks-$CM];"
665     "$CP [$LB8NonBreaks-$CM];"
666     "$EX [$LB8NonBreaks-$CM];"
667     "$IS [$LB8NonBreaks-$CM];"
668     "$SY [$LB8NonBreaks-$CM];"
669     "[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; "
670     "$CM* $CAN_CM $SP* $CM* $OP;"
671     "$CANT_CM $SP* $CM* $OP;"
672     "$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP;"
673     "$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;"
674     "$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;"
675     "$SY $CM $SP+ $OP;"
676     "$CM* $OP $SP* $CM* $QU;"
677     "$CM* $NS $SP* $CM* ($CL | $CP);"
678     "$CM* $B2 $SP* $CM* $B2;"
679     "$CM* $QU $CM* $CAN_CM;"
680     "$CM* $QU $LB18NonBreaks;"
681     "$CM* $CAN_CM $CM* $QU;"
682     "$CANT_CM $CM* $QU;"
683     "$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];"
684     "$CM* [$LB20NonBreaks-$CM] $CM* $BB;"
685     "[^$CB] $CM* $BB;"
686     "[^$CB] $CM* ($HY | $BA) $CM* $HL;"
687     "$CM* $HL $CM* $SY;"
688     "$CM* $IN $CM* ($ALPlus | $HL);"
689     "$CM* $IN $CM* $EX;"
690     "$CM* $IN $CM* $ID;"
691     "$CM* $IN $CM* $IN;"
692     "$CM* $IN $CM* $NU;"
693     "$CM* $PO $CM* $ID;"
694     "$CM* $NU $CM* ($ALPlus | $HL);"
695     "$CM* ($ALPlus | $HL) $CM* $NU;"
696     "$CM* $ID $CM* $PR;"
697     "$CM* ($ALPlus | $HL) $CM* $PR;"
698     "$CM* ($ALPlus | $HL) $CM* $PO;"
699     "($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;"
700     "$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;"
701     "$CM* ($JT | $JV) $CM* ($H2 | $JV);"
702     "$CM* $JT $CM* ($H3 | $JT);"
703     "$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);"
704     "$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);"
705     "$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;"
706     "$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);"
707     "$CM* ($ALPlus | $HL) $CM* $IS;"
708     "$CM* $OP $CM* ($ALPlus | $HL | $NU);"
709     "$CM* ($ALPlus | $HL | $NU) $CM* $CP;"
710 #if ADDITIONAL_EMOJI_SUPPORT
711     "$CM* $RI $CM* $RI;"
712 #endif
713     "$EmojiMods $EmojiVar? $EmojiForMods;";
714
// "!!safe_forward;" section of the line-break rule set, appended after uax14Reverse by
// mapLineIteratorModeToRules(). Uses $dictionary and the class variables from uax14AssignmentsAfter.
715 static const char* uax14SafeForward =
716     "!!safe_forward;"
717     "[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];"
718     "$dictionary $dictionary;";
719
// ICU break-rule source for the "!!safe_reverse" section of the custom line
// break rule set. This is the last fragment mapLineIteratorModeToRules()
// appends when it assembles the full rule string.
720 static const char* uax14SafeReverse =
721     "!!safe_reverse;"
722     "$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];"
723     "$CM+ $SP / .;"
724     "$SP+ $CM* $OP;"
725     "$SP+ $CM* $QU;"
726     "$SP+ $CM* ($CL | $CP);"
727     "$SP+ $CM* $B2;"
728     "$CM* ($HY | $BA) $CM* $HL;"
729     "($CM* ($IS | $SY))+ $CM* $NU;"
730     "($CL | $CP) $CM* ($NU | $IS | $SY);"
731     "$dictionary $dictionary;";
732
733 static String mapLineIteratorModeToRules(LineBreakIteratorMode mode, bool isCJK)
734 {
735     StringBuilder rulesBuilder;
736     rulesBuilder.append(uax14Prologue);
737     rulesBuilder.append(uax14AssignmentsBefore);
738     switch (mode) {
739     case LineBreakIteratorMode::Default:
740         rulesBuilder.append(isCJK ? uax14AssignmentsCustomDefaultCJK : uax14AssignmentsCustomDefaultNonCJK);
741         break;
742     case LineBreakIteratorMode::Loose:
743         rulesBuilder.append(isCJK ? uax14AssignmentsCustomLooseCJK : uax14AssignmentsCustomLooseNonCJK);
744         break;
745     case LineBreakIteratorMode::Normal:
746         rulesBuilder.append(isCJK ? uax14AssignmentsCustomNormalCJK : uax14AssignmentsCustomNormalNonCJK);
747         break;
748     case LineBreakIteratorMode::Strict:
749         rulesBuilder.append(isCJK ? uax14AssignmentsCustomStrictCJK : uax14AssignmentsCustomStrictNonCJK);
750         break;
751     }
752     rulesBuilder.append(uax14AssignmentsAfter);
753     rulesBuilder.append(uax14Forward);
754     rulesBuilder.append(uax14Reverse);
755     rulesBuilder.append(uax14SafeForward);
756     rulesBuilder.append(uax14SafeReverse);
757     return rulesBuilder.toString();
758 }
759
760 // Recognize BCP47 compliant primary language values of 'zh', 'ja', 'ko'
761 // (in any combination of case), optionally followed by subtags. Don't
762 // recognize 3-letter variants 'chi'/'zho', 'jpn', or 'kor' since BCP47
763 // requires use of shortest language tag.
764 bool isCJKLocale(const AtomicString& locale)
765 {
766     size_t length = locale.length();
767     if (length < 2)
768         return false;
769     auto c1 = locale[0];
770     auto c2 = locale[1];
771     auto c3 = length == 2 ? 0 : locale[2];
772     if (!c3 || c3 == '-' || c3 == '_' || c3 == '@') {
773         if (c1 == 'z' || c1 == 'Z')
774             return c2 == 'h' || c2 == 'H';
775         if (c1 == 'j' || c1 == 'J')
776             return c2 == 'a' || c2 == 'A';
777         if (c1 == 'k' || c1 == 'K')
778             return c2 == 'o' || c2 == 'O';
779     }
780     return false;
781 }
782
783 TextBreakIterator* openLineBreakIterator(const AtomicString& locale, LineBreakIteratorMode mode, bool isCJK)
784 {
785     UBreakIterator* ubrkIter;
786     UErrorCode openStatus = U_ZERO_ERROR;
787     bool localeIsEmpty = locale.isEmpty();
788     if (mode == LineBreakIteratorMode::Default)
789         ubrkIter = ubrk_open(UBRK_LINE, localeIsEmpty ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus);
790     else {
791         UParseError parseStatus;
792         auto rules = mapLineIteratorModeToRules(mode, isCJK);
793         ubrkIter = ubrk_openRules(StringView(rules).upconvertedCharacters(), rules.length(), 0, 0, &parseStatus, &openStatus);
794     }
795     // locale comes from a web page and it can be invalid, leading ICU
796     // to fail, in which case we fall back to the default locale.
797     if (!localeIsEmpty && U_FAILURE(openStatus)) {
798         openStatus = U_ZERO_ERROR;
799         ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus);
800     }
801
802     if (U_FAILURE(openStatus)) {
803         LOG_ERROR("ubrk_open failed with status %d", openStatus);
804         return nullptr;
805     }
806
807     return reinterpret_cast<TextBreakIterator*>(ubrkIter);
808 }
809
810 void closeLineBreakIterator(TextBreakIterator*& iterator)
811 {
812     UBreakIterator* ubrkIter = reinterpret_cast<UBreakIterator*>(iterator);
813     ASSERT(ubrkIter);
814     ubrk_close(ubrkIter);
815     iterator = nullptr;
816 }
817
818 static std::atomic<TextBreakIterator*> nonSharedCharacterBreakIterator = ATOMIC_VAR_INIT(nullptr);
819
820 static inline TextBreakIterator* getNonSharedCharacterBreakIterator()
821 {
822     if (auto *res = nonSharedCharacterBreakIterator.exchange(nullptr, std::memory_order_acquire))
823         return res;
824     return initializeIterator(UBRK_CHARACTER);
825 }
826
827 static inline void cacheNonSharedCharacterBreakIterator(TextBreakIterator* cacheMe)
828 {
829     if (auto *old = nonSharedCharacterBreakIterator.exchange(cacheMe, std::memory_order_release))
830         ubrk_close(reinterpret_cast<UBreakIterator*>(old));
831 }
832
833 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string)
834 {
835     if ((m_iterator = getNonSharedCharacterBreakIterator()))
836         m_iterator = setTextForIterator(*m_iterator, string);
837 }
838
839 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
840 {
841     if (m_iterator)
842         cacheNonSharedCharacterBreakIterator(m_iterator);
843 }
844
845 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(NonSharedCharacterBreakIterator&& other)
846     : m_iterator(nullptr)
847 {
848     std::swap(m_iterator, other.m_iterator);
849 }
850
851
852 // Iterator implementation.
853
854 int textBreakFirst(TextBreakIterator* iterator)
855 {
856     return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
857 }
858
859 int textBreakLast(TextBreakIterator* iterator)
860 {
861     return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
862 }
863
864 int textBreakNext(TextBreakIterator* iterator)
865 {
866     return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
867 }
868
869 int textBreakPrevious(TextBreakIterator* iterator)
870 {
871     return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
872 }
873
874 int textBreakPreceding(TextBreakIterator* iterator, int pos)
875 {
876     return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
877 }
878
879 int textBreakFollowing(TextBreakIterator* iterator, int pos)
880 {
881     return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
882 }
883
884 int textBreakCurrent(TextBreakIterator* iterator)
885 {
886     return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
887 }
888
889 bool isTextBreak(TextBreakIterator* iterator, int position)
890 {
891     return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
892 }
893
894 bool isWordTextBreak(TextBreakIterator* iterator)
895 {
896     int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator));
897     return ruleStatus != UBRK_WORD_NONE;
898 }
899
900 unsigned numGraphemeClusters(StringView string)
901 {
902     unsigned stringLength = string.length();
903     
904     if (!stringLength)
905         return 0;
906
907     // The only Latin-1 Extended Grapheme Cluster is CRLF.
908     if (string.is8Bit()) {
909         auto* characters = string.characters8();
910         unsigned numCRLF = 0;
911         for (unsigned i = 1; i < stringLength; ++i)
912             numCRLF += characters[i - 1] == '\r' && characters[i] == '\n';
913         return stringLength - numCRLF;
914     }
915
916     NonSharedCharacterBreakIterator iterator { string };
917     if (!iterator) {
918         ASSERT_NOT_REACHED();
919         return stringLength;
920     }
921
922     unsigned numGraphemeClusters = 0;
923     while (textBreakNext(iterator) != TextBreakDone)
924         ++numGraphemeClusters;
925     return numGraphemeClusters;
926 }
927
928 unsigned numCharactersInGraphemeClusters(StringView string, unsigned numGraphemeClusters)
929 {
930     unsigned stringLength = string.length();
931
932     if (stringLength <= numGraphemeClusters)
933         return stringLength;
934
935     // The only Latin-1 Extended Grapheme Cluster is CRLF.
936     if (string.is8Bit()) {
937         auto* characters = string.characters8();
938         unsigned i, j;
939         for (i = 0, j = 0; i < numGraphemeClusters && j + 1 < stringLength; ++i, ++j)
940             j += characters[j] == '\r' && characters[j + 1] == '\n';
941         return j + (i < numGraphemeClusters && j < stringLength);
942     }
943
944     NonSharedCharacterBreakIterator iterator { string };
945     if (!iterator) {
946         ASSERT_NOT_REACHED();
947         return stringLength;
948     }
949
950     for (unsigned i = 0; i < numGraphemeClusters; ++i) {
951         if (textBreakNext(iterator) == TextBreakDone)
952             return stringLength;
953     }
954     return textBreakCurrent(iterator);
955 }
956
957 } // namespace WTF