Add some new emoji with modifiers and new sequence.
[WebKit-https.git] / Source / WebCore / platform / text / TextBreakIterator.cpp
1 /*
2  * (C) 1999 Lars Knoll (knoll@kde.org)
3  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2015 Apple Inc. All rights reserved.
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public License
17  * along with this library; see the file COPYING.LIB.  If not, write to
18  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  * Boston, MA 02110-1301, USA.
20  */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "LineBreakIteratorPoolICU.h"
26 #include "UTextProviderLatin1.h"
27 #include "UTextProviderUTF16.h"
28 #include <mutex>
29 #include <wtf/Atomics.h>
30 #include <wtf/text/StringView.h>
31
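// FIXME: This needs a better name. The macro is true when building for iOS 9.0 or later, or for
// OS X 10.11 or later; it selects the larger emoji sets and the alternate regional-indicator
// rules in the break-iterator rule strings in this file.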
33 #define ADDITIONAL_EMOJI_SUPPORT ((PLATFORM(IOS) && __IPHONE_OS_VERSION_MIN_REQUIRED >= 90000) || (PLATFORM(MAC) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 101100))
34
35 namespace WebCore {
36
37 // Iterator initialization
38
39 static TextBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
40 {
41     UErrorCode openStatus = U_ZERO_ERROR;
42     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, locale, 0, 0, &openStatus));
43     ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
44     return iterator;
45 }
46
47 #if !PLATFORM(IOS)
48
49 static TextBreakIterator* initializeIteratorWithRules(const char* breakRules)
50 {
51     UParseError parseStatus;
52     UErrorCode openStatus = U_ZERO_ERROR;
53     unsigned length = strlen(breakRules);
54     auto upconvertedCharacters = StringView(reinterpret_cast<const LChar*>(breakRules), length).upconvertedCharacters();
55     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(upconvertedCharacters, length, 0, 0, &parseStatus, &openStatus));
56     ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
57     return iterator;
58 }
59
60 #endif
61
62
63 // Iterator text setting
64
65 static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string)
66 {
67     if (string.is8Bit()) {
68         UTextWithBuffer textLocal;
69         textLocal.text = UTEXT_INITIALIZER;
70         textLocal.text.extraSize = sizeof(textLocal.buffer);
71         textLocal.text.pExtra = textLocal.buffer;
72
73         UErrorCode openStatus = U_ZERO_ERROR;
74         UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus);
75         if (U_FAILURE(openStatus)) {
76             LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus);
77             return nullptr;
78         }
79
80         UErrorCode setTextStatus = U_ZERO_ERROR;
81         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
82         if (U_FAILURE(setTextStatus)) {
83             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
84             return nullptr;
85         }
86
87         utext_close(text);
88     } else {
89         UErrorCode setTextStatus = U_ZERO_ERROR;
90         ubrk_setText(reinterpret_cast<UBreakIterator*>(&iterator), string.characters16(), string.length(), &setTextStatus);
91         if (U_FAILURE(setTextStatus))
92             return nullptr;
93     }
94
95     return &iterator;
96 }
97
98 static TextBreakIterator* setContextAwareTextForIterator(TextBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength)
99 {
100     if (string.is8Bit()) {
101         UTextWithBuffer textLocal;
102         textLocal.text = UTEXT_INITIALIZER;
103         textLocal.text.extraSize = sizeof(textLocal.buffer);
104         textLocal.text.pExtra = textLocal.buffer;
105
106         UErrorCode openStatus = U_ZERO_ERROR;
107         UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus);
108         if (U_FAILURE(openStatus)) {
109             LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus);
110             return nullptr;
111         }
112
113         UErrorCode setTextStatus = U_ZERO_ERROR;
114         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
115         if (U_FAILURE(setTextStatus)) {
116             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
117             return nullptr;
118         }
119
120         utext_close(text);
121     } else {
122         UText textLocal = UTEXT_INITIALIZER;
123
124         UErrorCode openStatus = U_ZERO_ERROR;
125         UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus);
126         if (U_FAILURE(openStatus)) {
127             LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus);
128             return 0;
129         }
130
131         UErrorCode setTextStatus = U_ZERO_ERROR;
132         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
133         if (U_FAILURE(setTextStatus)) {
134             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
135             return nullptr;
136         }
137
138         utext_close(text);
139     }
140
141     return &iterator;
142 }
143
144
145 // Static iterators
146
147 TextBreakIterator* wordBreakIterator(StringView string)
148 {
149     static TextBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD);
150     if (!staticWordBreakIterator)
151         return nullptr;
152
153     return setTextForIterator(*staticWordBreakIterator, string);
154 }
155
156 TextBreakIterator* sentenceBreakIterator(StringView string)
157 {
158     static TextBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE);
159     if (!staticSentenceBreakIterator)
160         return nullptr;
161
162     return setTextForIterator(*staticSentenceBreakIterator, string);
163 }
164
165 TextBreakIterator* cursorMovementIterator(StringView string)
166 {
167 #if !PLATFORM(IOS)
168     // This rule set is based on character-break iterator rules of ICU 4.0
169     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
170     // The major differences from the original ones are listed below:
171     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
172     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
173     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
174     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
175     // * Added rules for regional indicator symbols.
176     static const char* kRules =
177         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
178         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
179         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
180         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
181         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
182         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
183         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
184         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
185         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
186         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
187         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
188         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
189         "$HinV    = \\u094D;"              // Devanagari Sign Virama
190         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
191         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
192         "$BenV    = \\u09CD;"              // Bengali Sign Virama
193         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
194         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
195         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
196         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
197         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
198         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
199         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
200         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
201         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
202         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
203         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
204         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
205         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
206         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
207         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
208         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
209         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
210         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
211         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
212         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
213         "$ZWJ     = \\u200D;"               // Zero width joiner
214         "$EmojiVar = [\\uFE0F];"            // Emoji-style variation selector
215 #if ADDITIONAL_EMOJI_SUPPORT
216         "$EmojiForSeqs = [\\u2764 \\U0001F441 \\U0001F466-\\U0001F469 \\U0001F48B \\U0001F5E8];" // Emoji that participate in ZWJ sequences
217         "$EmojiForMods = [\\u261D \\u270A-\\u270D \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3C7 \\U0001F3CA \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F590 \\U0001F595 \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0 \\U0001F918] ;" // Emoji that take Fitzpatrick modifiers
218 #else
219         "$EmojiForSeqs = [\\u2764 \\U0001F466-\\U0001F469 \\U0001F48B];" // Emoji that participate in ZWJ sequences
220         "$EmojiForMods = [\\u261D \\u270A-\\u270C \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3C7 \\U0001F3CA \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0] ;" // Emoji that take Fitzpatrick modifiers
221 #endif
222         "$EmojiMods = [\\U0001F3FB-\\U0001F3FF];" // Fitzpatrick modifiers
223         "!!chain;"
224 #if ADDITIONAL_EMOJI_SUPPORT
225         "!!RINoChain;"
226 #endif
227         "!!forward;"
228         "$CR $LF;"
229         "$L ($L | $V | $LV | $LVT);"
230         "($LV | $V) ($V | $T);"
231         "($LVT | $T) $T;"
232 #if ADDITIONAL_EMOJI_SUPPORT
233         "$RI $RI $Extend* / $RI;"
234         "$RI $RI $Extend*;"
235         "[^$Control $CR $LF] $Extend;"
236         "[^$Control $CR $LF] $SpacingMark;"
237 #else
238         "[^$Control $CR $LF] $Extend;"
239         "[^$Control $CR $LF] $SpacingMark;"
240         "$RI $RI / $RI;"
241         "$RI $RI;"
242 #endif
243         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
244         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
245         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
246         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
247         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
248         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
249         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
250         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
251         "$ZWJ $EmojiForSeqs;"              // Don't break in emoji ZWJ sequences
252         "$EmojiForMods $EmojiVar? $EmojiMods;" // Don't break between relevant emoji (possibly with variation selector) and Fitzpatrick modifier
253         "!!reverse;"
254         "$LF $CR;"
255         "($L | $V | $LV | $LVT) $L;"
256         "($V | $T) ($LV | $V);"
257         "$T ($LVT | $T);"
258 #if ADDITIONAL_EMOJI_SUPPORT
259         "$Extend* $RI $RI / $Extend* $RI $RI;"
260         "$Extend* $RI $RI;"
261         "$Extend      [^$Control $CR $LF];"
262         "$SpacingMark [^$Control $CR $LF];"
263 #else
264         "$Extend      [^$Control $CR $LF];"
265         "$SpacingMark [^$Control $CR $LF];"
266         "$RI $RI / $RI $RI;"
267         "$RI $RI;"
268 #endif
269         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
270         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
271         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
272         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
273         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
274         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
275         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
276         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
277         "$EmojiForSeqs $ZWJ;"              // Don't break in emoji ZWJ sequences
278         "$EmojiMods $EmojiVar? $EmojiForMods;" // Don't break between relevant emoji (possibly with variation selector) and Fitzpatrick modifier
279 #if ADDITIONAL_EMOJI_SUPPORT
280         "!!safe_reverse;"
281         "$RI $RI+;"
282         "[$EmojiVar $EmojiMods]+ $EmojiForMods;"
283         "!!safe_forward;"
284         "$RI $RI+;"
285         "$EmojiForMods [$EmojiVar $EmojiMods]+;";
286 #else
287         "[$EmojiVar $EmojiMods]+ $EmojiForMods;"
288         "$EmojiForMods [$EmojiVar $EmojiMods]+;"
289         "!!safe_reverse;"
290         "!!safe_forward;";
291 #endif
292     static TextBreakIterator* staticCursorMovementIterator = initializeIteratorWithRules(kRules);
293 #else // PLATFORM(IOS)
294     // Use the special Thai character break iterator for all locales
295     static TextBreakIterator* staticCursorMovementIterator = initializeIterator(UBRK_CHARACTER, "th");
296 #endif // !PLATFORM(IOS)
297
298     if (!staticCursorMovementIterator)
299         return nullptr;
300
301     return setTextForIterator(*staticCursorMovementIterator, string);
302 }
303
304 TextBreakIterator* acquireLineBreakIterator(StringView string, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode mode, bool isCJK)
305 {
306     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode, isCJK);
307     if (!iterator)
308         return nullptr;
309
310     return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength);
311 }
312
313 void releaseLineBreakIterator(TextBreakIterator* iterator)
314 {
315     ASSERT_ARG(iterator, iterator);
316
317     LineBreakIteratorPool::sharedPool().put(iterator);
318 }
319
// Rule-builder options that open every custom line-break rule set; this is the first fragment
// appended by mapLineIteratorModeToRules().
static const char* uax14Prologue =
    "!!chain;"
    "!!LBCMNoChain;"
    "!!lookAheadHardBreak;";
324
// Defines $CJ (Conditional Japanese Starter), which the mode-specific assignment tables and
// uax14AssignmentsAfter refer to.
// The property form is used when the ICU version is 4.9 or later (which includes 49, 50, ...);
// older versions get the explicit enumeration because ICU versions prior to 49 don't support
// :LineBreak=Conditional_Japanese_Starter:.
static const char* uax14AssignmentsBefore =
    "$CJ = ["
    // Compare the version as a (major, minor) pair. Testing the minor number on its own would
    // reject releases such as 50.1, whose minor number is below 9.
#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM >= 9))
    ":LineBreak=Conditional_Japanese_Starter:"
#else
    "\\u3041\\u3043\\u3045\\u3047\\u3049\\u3063\\u3083\\u3085\\u3087\\u308E\\u3095\\u3096\\u30A1\\u30A3\\u30A5\\u30A7"
    "\\u30A9\\u30C3\\u30E3\\u30E5\\u30E7\\u30EE\\u30F5\\u30F6\\u30FC"
    "\\u31F0\\u31F1\\u31F2\\u31F3\\u31F4\\u31F5\\u31F6\\u31F7\\u31F8\\u31F9\\u31FA\\u31FB\\u31FC\\u31FD\\u31FE\\u31FF"
    "\\uFF67\\uFF68\\uFF69\\uFF6A\\uFF6B\\uFF6C\\uFF6D\\uFF6E\\uFF6F\\uFF70"
#endif
    "];";
337
// Mode-specific sets for the "loose" mode with a CJK locale (see mapLineIteratorModeToRules()).
// uax14AssignmentsAfter subtracts each $XX_SUB set from class XX and adds each $XX_ADD set to
// class XX. Here $ID_ADD is $CJ plus every non-empty _SUB set, so those characters join $ID.
static const char* uax14AssignmentsCustomLooseCJK =
    "$BA_SUB = [\\u2010\\u2013];"
    "$EX_SUB = [\\u0021\\u003F\\uFF01\\uFF1F];"
    "$ID_SUB = '';"
    "$IN_SUB = [\\u2025\\u2026];"
    "$IS_SUB = [\\u003A\\u003B];"
    "$NS_SUB = [\\u203C\\u2047\\u2048\\u2049\\u3005\\u301C\\u303B\\u309D\\u309E\\u30A0\\u30FB\\u30FD\\u30FE\\uFF1A\\uFF1B\\uFF65];"
    "$PO_SUB = [\\u0025\\u00A2\\u00B0\\u2030\\u2032\\u2033\\u2103\\uFF05\\uFFE0];"
    "$PR_SUB = [\\u0024\\u00A3\\u00A5\\u20AC\\u2116\\uFF04\\uFFE1\\uFFE5];"
    // Everything carved out above, plus $CJ, is added to $ID.
    "$ID_ADD = [$CJ $BA_SUB $EX_SUB $IN_SUB $IS_SUB $NS_SUB $PO_SUB $PR_SUB];"
    "$NS_ADD = '';";
349
// Mode-specific sets for the "loose" mode with a non-CJK locale (see
// mapLineIteratorModeToRules()). Only $IN_SUB and $NS_SUB are non-empty; together with $CJ
// they make up $ID_ADD, which uax14AssignmentsAfter adds to $ID.
static const char* uax14AssignmentsCustomLooseNonCJK =
    "$BA_SUB = '';"
    "$EX_SUB = '';"
    "$ID_SUB = '';"
    "$IN_SUB = [\\u2025\\u2026];"
    "$IS_SUB = '';"
    "$NS_SUB = [\\u3005\\u303B\\u309D\\u309E\\u30FD\\u30FE];"
    "$PO_SUB = '';"
    "$PR_SUB = '';"
    "$ID_ADD = [$CJ $IN_SUB $NS_SUB];"
    "$NS_ADD = '';";
361
// Mode-specific sets for the "normal" mode with a CJK locale; also used for the default UAX14
// mode with a CJK locale through the uax14AssignmentsCustomDefaultCJK alias.
// $BA_SUB and $NS_SUB are the only non-empty _SUB sets; together with $CJ they make up $ID_ADD.
static const char* uax14AssignmentsCustomNormalCJK =
    "$BA_SUB = [\\u2010\\u2013];"
    "$EX_SUB = '';"
    "$IN_SUB = '';"
    "$ID_SUB = '';"
    "$IS_SUB = '';"
    "$NS_SUB = [\\u301C\\u30A0];"
    "$PO_SUB = '';"
    "$PR_SUB = '';"
    "$ID_ADD = [$CJ $BA_SUB $NS_SUB];"
    "$NS_ADD = '';";
373
// Mode-specific sets for the "normal" mode with a non-CJK locale (see
// mapLineIteratorModeToRules()). No characters are subtracted from any class; $CJ alone is
// added to $ID through $ID_ADD.
static const char* uax14AssignmentsCustomNormalNonCJK =
    "$BA_SUB = '';"
    "$EX_SUB = '';"
    "$ID_SUB = '';"
    "$IN_SUB = '';"
    "$IS_SUB = '';"
    "$NS_SUB = '';"
    "$PO_SUB = '';"
    "$PR_SUB = '';"
    "$ID_ADD = [$CJ];"
    "$NS_ADD = '';";
385
// Mode-specific sets for the "strict" mode. No characters are subtracted from any class, and
// $CJ is added to $NS (through $NS_ADD) rather than to $ID.
static const char* uax14AssignmentsCustomStrictCJK =
    "$BA_SUB = '';"
    "$EX_SUB = '';"
    "$ID_SUB = '';"
    "$IN_SUB = '';"
    "$IS_SUB = '';"
    "$NS_SUB = '';"
    "$PO_SUB = '';"
    "$PR_SUB = '';"
    "$ID_ADD = '';"
    "$NS_ADD = [$CJ];";

// Aliases for the tables that have no dedicated definition:
// strict/non-CJK shares the strict/CJK table, default/CJK uses the normal/CJK table, and
// default/non-CJK uses the strict table.
#define uax14AssignmentsCustomStrictNonCJK      uax14AssignmentsCustomStrictCJK
#define uax14AssignmentsCustomDefaultCJK        uax14AssignmentsCustomNormalCJK
#define uax14AssignmentsCustomDefaultNonCJK     uax14AssignmentsCustomStrictNonCJK
401
// Character-class definitions shared by every line-break mode. This fragment is appended after
// the mode-specific table, so it can refer to $CJ and to the $XX_SUB / $XX_ADD sets:
// each $XX_SUB set is subtracted from class XX, and $ID_ADD / $NS_ADD are added to $ID / $NS.
// The trailing "$XXcm" definitions are a class followed by any number of combining marks.
static const char* uax14AssignmentsAfter =
    "$AI = [:LineBreak = Ambiguous:];"
    "$AL = [:LineBreak = Alphabetic:];"
    "$BA = [[:LineBreak = Break_After:] - $BA_SUB];"
    "$BB = [:LineBreak = Break_Before:];"
    "$BK = [:LineBreak = Mandatory_Break:];"
    "$B2 = [:LineBreak = Break_Both:];"
    "$CB = [:LineBreak = Contingent_Break:];"
    "$CL = [:LineBreak = Close_Punctuation:];"
    "$CM = [:LineBreak = Combining_Mark:];"
    "$CP = [:LineBreak = Close_Parenthesis:];"
    "$CR = [:LineBreak = Carriage_Return:];"
    "$EX = [[:LineBreak = Exclamation:] - $EX_SUB];"
    "$GL = [:LineBreak = Glue:];"
    // Compare the ICU version as a (major, minor) pair. Testing the minor number on its own
    // would reject releases such as 50.1, whose minor number is below 9.
#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM >= 9))
    "$HL = [:LineBreak = Hebrew_Letter:];"
#else
    "$HL = [[:Hebrew:] & [:Letter:]];"
#endif
    "$HY = [:LineBreak = Hyphen:];"
    "$H2 = [:LineBreak = H2:];"
    "$H3 = [:LineBreak = H3:];"
    "$ID = [[[[:LineBreak = Ideographic:] - $CJ] $ID_ADD] - $ID_SUB];"
    "$IN = [[:LineBreak = Inseparable:] - $IN_SUB];"
    "$IS = [[:LineBreak = Infix_Numeric:] - $IS_SUB];"
    "$JL = [:LineBreak = JL:];"
    "$JV = [:LineBreak = JV:];"
    "$JT = [:LineBreak = JT:];"
    "$LF = [:LineBreak = Line_Feed:];"
    "$NL = [:LineBreak = Next_Line:];"
    "$NS = [[[[:LineBreak = Nonstarter:] - $CJ] $NS_ADD] - $NS_SUB];"
    "$NU = [:LineBreak = Numeric:];"
    "$OP = [:LineBreak = Open_Punctuation:];"
    "$PO = [[:LineBreak = Postfix_Numeric:] - $PO_SUB];"
    "$PR = [[:LineBreak = Prefix_Numeric:] - $PR_SUB];"
    "$QU = [:LineBreak = Quotation:];"
    "$RI = [\\U0001F1E6-\\U0001F1FF];"
    "$SA = [:LineBreak = Complex_Context:];"
    "$SG = [:LineBreak = Surrogate:];"
    "$SP = [:LineBreak = Space:];"
    "$SY = [:LineBreak = Break_Symbols:];"
    "$WJ = [:LineBreak = Word_Joiner:];"
    "$XX = [:LineBreak = Unknown:];"
    "$ZW = [:LineBreak = ZWSpace:];"
    "$ZWJ = \\u200D;"
    "$EmojiVar = \\uFE0F;"
#if ADDITIONAL_EMOJI_SUPPORT
    "$EmojiForSeqs = [\\u2764 \\U0001F441 \\U0001F466-\\U0001F469 \\U0001F48B \\U0001F5E8];"
    "$EmojiForMods = [\\u261D \\u270A-\\u270D \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3C7 \\U0001F3CA \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F590 \\U0001F595 \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0 \\U0001F918] ;" // Emoji that take Fitzpatrick modifiers
#else
    "$EmojiForSeqs = [\\u2764 \\U0001F466-\\U0001F469 \\U0001F48B];"
    "$EmojiForMods = [\\u261D \\u270A-\\u270C \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3C7 \\U0001F3CA \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0] ;" // Emoji that take Fitzpatrick modifiers
#endif
    "$EmojiMods = [\\U0001F3FB-\\U0001F3FF];"
    "$dictionary = [:LineBreak = Complex_Context:];"
    "$ALPlus = [$AL $AI $SA $SG $XX];"
    // Each class followed by zero or more combining marks.
    "$ALcm = $ALPlus $CM*;"
    "$BAcm = $BA $CM*;"
    "$BBcm = $BB $CM*;"
    "$B2cm = $B2 $CM*;"
    "$CLcm = $CL $CM*;"
    "$CPcm = $CP $CM*;"
    "$EXcm = $EX $CM*;"
    "$GLcm = $GL $CM*;"
    "$HLcm = $HL $CM*;"
    "$HYcm = $HY $CM*;"
    "$H2cm = $H2 $CM*;"
    "$H3cm = $H3 $CM*;"
    "$IDcm = $ID $CM*;"
    "$INcm = $IN $CM*;"
    "$IScm = $IS $CM*;"
    "$JLcm = $JL $CM*;"
    "$JVcm = $JV $CM*;"
    "$JTcm = $JT $CM*;"
    "$NScm = $NS $CM*;"
    "$NUcm = $NU $CM*;"
    "$OPcm = $OP $CM*;"
    "$POcm = $PO $CM*;"
    "$PRcm = $PR $CM*;"
    "$QUcm = $QU $CM*;"
    // Built from $RI like every other $XXcm definition, so that the "$RIcm $RIcm" rule in
    // uax14Forward pairs regional indicators rather than quotation marks.
    "$RIcm = $RI $CM*;"
    "$SYcm = $SY $CM*;"
    "$WJcm = $WJ $CM*;";
485
// Forward rules of the custom line-break rule set. They use the character classes and "$XXcm"
// definitions from uax14AssignmentsAfter. Each rule describes a sequence inside which no line
// break is reported; rules tagged {100} cover the hard-break classes ($BK $CR $LF $NL).
static const char* uax14Forward =
    "!!forward;"
    // Helper sets used by the rules below.
    "$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM];"
    "$CANT_CM = [$SP $BK $CR $LF $NL $ZW $CM];"
    "$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];"
    "$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];"
    "$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];"
    "$LB4Breaks = [$BK $CR $LF $NL];"
    "$LB4NonBreaks = [^$BK $CR $LF $NL];"
    "$LB8Breaks = [$LB4Breaks $ZW];"
    "$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];"
    "$LB18NonBreaks = [$LB8NonBreaks - [$SP]];"
    "$LB18Breaks = [$LB8Breaks $SP];"
    "$LB20NonBreaks = [$LB18NonBreaks - $CB];"
    // Keep combining marks attached to the preceding base character.
    "$ALPlus $CM+;"
    "$BA $CM+;"
    "$BB $CM+;"
    "$B2 $CM+;"
    "$CL $CM+;"
    "$CP $CM+;"
    "$EX $CM+;"
    "$GL $CM+;"
    "$HL $CM+;"
    "$HY $CM+;"
    "$H2 $CM+;"
    "$H3 $CM+;"
    "$ID $CM+;"
    "$IN $CM+;"
    "$IS $CM+;"
    "$JL $CM+;"
    "$JV $CM+;"
    "$JT $CM+;"
    "$NS $CM+;"
    "$NU $CM+;"
    "$OP $CM+;"
    "$PO $CM+;"
    "$PR $CM+;"
    "$QU $CM+;"
    "$SY $CM+;"
    "$WJ $CM+;"
    // Hard breaks.
    "$CR $LF {100};"
    "$LB4NonBreaks? $LB4Breaks {100};"
    "$CAN_CM $CM* $LB4Breaks {100};"
    "$CM+ $LB4Breaks {100};"
    // Spaces and zero-width spaces.
    "$LB4NonBreaks [$SP $ZW];"
    "$CAN_CM $CM* [$SP $ZW];"
    "$CM+ [$SP $ZW];"
    // Emoji ZWJ sequences.
    "$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;"
    "$CAN_CM $CM+;"
    "$CM+;"
    // Word joiner.
    "$CAN_CM $CM* $WJcm;"
    "$LB8NonBreaks $WJcm;"
    "$CM+ $WJcm;"
    "$WJcm $CANT_CM;"
    "$WJcm $CAN_CM $CM*;"
    // Glue characters.
    "$GLcm $CAN_CM $CM*;"
    "$GLcm $CANT_CM;"
    "[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;"
    // References the $GLcm class; without the '$' the rule would match the literal text "GLcm".
    "$CM+ $GLcm;"
    // Closing punctuation, exclamation, infix separators and symbols.
    "$LB8NonBreaks $CL;"
    "$CAN_CM $CM* $CL;"
    "$CM+ $CL;"
    "$LB8NonBreaks $CP;"
    "$CAN_CM $CM* $CP;"
    "$CM+ $CP;"
    "$LB8NonBreaks $EX;"
    "$CAN_CM $CM* $EX;"
    "$CM+ $EX;"
    "$LB8NonBreaks $IS;"
    "$CAN_CM $CM* $IS;"
    "$CM+ $IS;"
    "$LB8NonBreaks $SY;"
    "$CAN_CM $CM* $SY;"
    "$CM+ $SY;"
    // Opening punctuation and quotation marks.
    "$OPcm $SP* $CAN_CM $CM*;"
    "$OPcm $SP* $CANT_CM;"
    "$OPcm $SP+ $CM+ $AL_FOLLOW?;"
    "$QUcm $SP* $OPcm;"
    "($CLcm | $CPcm) $SP* $NScm;"
    "$B2cm $SP* $B2cm;"
    "$LB18NonBreaks $CM* $QUcm;"
    "$CM+ $QUcm;"
    "$QUcm .?;"
    "$QUcm $LB18NonBreaks $CM*;"
    // Break-after, hyphen, non-starter and break-before classes.
    "$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); "
    "$BBcm [^$CB];"
    "$BBcm $LB20NonBreaks $CM*;"
    "$HLcm ($HYcm | $BAcm) [^$CB]?;"
    // Inseparable characters.
    "($ALcm | $HLcm) $INcm;"
    "$CM+ $INcm;"
    "$IDcm $INcm;"
    "$INcm $INcm;"
    "$NUcm $INcm;"
    // Letters, digits and numeric prefix/postfix characters.
    "$IDcm $POcm;"
    "$ALcm $NUcm;"
    "$HLcm $NUcm;"
    "$CM+ $NUcm;"
    "$NUcm $ALcm;"
    "$NUcm $HLcm;"
    "$PRcm $IDcm;"
    "$PRcm ($ALcm | $HLcm);"
    "$POcm ($ALcm | $HLcm);"
    "($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;"
    // Hangul syllable blocks.
    "$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);"
    "($JVcm | $H2cm) ($JVcm | $JTcm);"
    "($JTcm | $H3cm) $JTcm;"
    "($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;"
    "($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;"
    "$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);"
    // Alphabetic runs and their neighbours.
    "($ALcm | $HLcm) ($ALcm | $HLcm);"
    "$CM+ ($ALcm | $HLcm);"
    "$IScm ($ALcm | $HLcm);"
    "($ALcm | $HLcm | $NUcm) $OPcm;"
    "$CM+ $OPcm;"
    "$CPcm ($ALcm | $HLcm | $NUcm);"
#if ADDITIONAL_EMOJI_SUPPORT
    // Regional indicator pairs.
    "$RIcm $RIcm;"
#endif
    // Emoji followed by a Fitzpatrick modifier (possibly with a variation selector between).
    "$EmojiForMods $EmojiVar? $EmojiMods;";
605
// Reverse rules of the custom line-break rule set: the counterparts of uax14Forward, written
// with their operands in reverse text order. They rely on the helper sets defined at the top
// of uax14Forward ($CAN_CM, $CANT_CM, $AL_FOLLOW*, $LB*NonBreaks, ...).
static const char* uax14Reverse =
    "!!reverse;"
    // Combining marks followed (in reverse order) by their base character.
    "$CM+ $ALPlus;"
    "$CM+ $BA;"
    "$CM+ $BB;"
    "$CM+ $B2;"
    "$CM+ $CL;"
    "$CM+ $CP;"
    "$CM+ $EX;"
    "$CM+ $GL;"
    "$CM+ $HL;"
    "$CM+ $HY;"
    "$CM+ $H2;"
    "$CM+ $H3;"
    "$CM+ $ID;"
    "$CM+ $IN;"
    "$CM+ $IS;"
    "$CM+ $JL;"
    "$CM+ $JV;"
    "$CM+ $JT;"
    "$CM+ $NS;"
    "$CM+ $NU;"
    "$CM+ $OP;"
    "$CM+ $PO;"
    "$CM+ $PR;"
    "$CM+ $QU;"
#if ADDITIONAL_EMOJI_SUPPORT
    "$CM+ $RI;"
#endif
    "$CM+ $SY;"
    "$CM+ $WJ;"
    "$CM+;"
    // Look-ahead rules for combining marks that follow spaces or break characters.
    "$AL_FOLLOW $CM+ / ([$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));"
    "[$PR] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];"
    // Hard breaks, spaces and zero-width spaces.
    "$LB4Breaks [$LB4NonBreaks-$CM];"
    "$LB4Breaks $CM+ $CAN_CM;"
    "$LF $CR;"
    "[$SP $ZW] [$LB4NonBreaks-$CM];"
    "[$SP $ZW] $CM+ $CAN_CM;"
    // Emoji ZWJ sequences.
    "$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;"
    "$CM+ $CAN_CM;"
    // Word joiner and glue.
    "$CM* $WJ $CM* $CAN_CM;"
    "$CM* $WJ [$LB8NonBreaks-$CM];"
    "$CANT_CM $CM* $WJ;"
    "$CM* $CAN_CM $CM* $WJ;"
    "$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];"
    "$CANT_CM $CM* $GL;"
    "$CM* $CAN_CM $CM* $GL;"
    // Closing punctuation, exclamation, infix separators and symbols.
    "$CL $CM+ $CAN_CM;"
    "$CP $CM+ $CAN_CM;"
    "$EX $CM+ $CAN_CM;"
    "$IS $CM+ $CAN_CM;"
    "$SY $CM+ $CAN_CM;"
    "$CL [$LB8NonBreaks-$CM];"
    "$CP [$LB8NonBreaks-$CM];"
    "$EX [$LB8NonBreaks-$CM];"
    "$IS [$LB8NonBreaks-$CM];"
    "$SY [$LB8NonBreaks-$CM];"
    // Opening punctuation and quotation marks.
    "[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; "
    "$CM* $CAN_CM $SP* $CM* $OP;"
    "$CANT_CM $SP* $CM* $OP;"
    "$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP;"
    "$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;"
    "$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;"
    "$SY $CM $SP+ $OP;"
    "$CM* $OP $SP* $CM* $QU;"
    "$CM* $NS $SP* $CM* ($CL | $CP);"
    "$CM* $B2 $SP* $CM* $B2;"
    "$CM* $QU $CM* $CAN_CM;"
    "$CM* $QU $LB18NonBreaks;"
    "$CM* $CAN_CM $CM* $QU;"
    "$CANT_CM $CM* $QU;"
    // Break-after, hyphen, non-starter and break-before classes.
    "$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];"
    "$CM* [$LB20NonBreaks-$CM] $CM* $BB;"
    "[^$CB] $CM* $BB;"
    "[^$CB] $CM* ($HY | $BA) $CM* $HL;"
    // Inseparable characters, letters, digits and numeric prefix/postfix characters.
    "$CM* $IN $CM* ($ALPlus | $HL);"
    "$CM* $IN $CM* $ID;"
    "$CM* $IN $CM* $IN;"
    "$CM* $IN $CM* $NU;"
    "$CM* $PO $CM* $ID;"
    "$CM* $NU $CM* ($ALPlus | $HL);"
    "$CM* ($ALPlus | $HL) $CM* $NU;"
    "$CM* $ID $CM* $PR;"
    "$CM* ($ALPlus | $HL) $CM* $PR;"
    "$CM* ($ALPlus | $HL) $CM* $PO;"
    "($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;"
    // Hangul syllable blocks.
    "$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;"
    "$CM* ($JT | $JV) $CM* ($H2 | $JV);"
    "$CM* $JT $CM* ($H3 | $JT);"
    "$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);"
    "$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);"
    "$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;"
    // Alphabetic runs and their neighbours.
    "$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);"
    "$CM* ($ALPlus | $HL) $CM* $IS;"
    "$CM* $OP $CM* ($ALPlus | $HL | $NU);"
    "$CM* ($ALPlus | $HL | $NU) $CM* $CP;"
#if ADDITIONAL_EMOJI_SUPPORT
    // Regional indicator pairs.
    "$CM* $RI $CM* $RI;"
#endif
    // Fitzpatrick modifier preceded by its emoji (possibly with a variation selector between).
    "$EmojiMods $EmojiVar? $EmojiForMods;";
707
// "!!safe_forward" section of the custom line-break rule set; appended after uax14Reverse by
// mapLineIteratorModeToRules().
static const char* uax14SafeForward =
    "!!safe_forward;"
    "[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];"
    // Adjacent dictionary (Complex_Context) characters.
    "$dictionary $dictionary;";
712
// "!!safe_reverse" section of the custom line-break rule set; the last fragment appended by
// mapLineIteratorModeToRules().
static const char* uax14SafeReverse =
    "!!safe_reverse;"
    // Combining marks and spaces.
    "$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];"
    "$CM+ $SP / .;"
    "$SP+ $CM* $OP;"
    "$SP+ $CM* $QU;"
    "$SP+ $CM* ($CL | $CP);"
    "$SP+ $CM* $B2;"
    // Hebrew letter with hyphen/break-after, and numeric sequences.
    "$CM* ($HY | $BA) $CM* $HL;"
    "($CM* ($IS | $SY))+ $CM* $NU;"
    "($CL | $CP) $CM* ($NU | $IS | $SY);"
    // Adjacent dictionary (Complex_Context) characters.
    "$dictionary $dictionary;";
725
726 static String mapLineIteratorModeToRules(LineBreakIteratorMode mode, bool isCJK)
727 {
728     StringBuilder rulesBuilder;
729     rulesBuilder.append(uax14Prologue);
730     rulesBuilder.append(uax14AssignmentsBefore);
731     switch (mode) {
732     case LineBreakIteratorModeUAX14:
733         rulesBuilder.append(isCJK ? uax14AssignmentsCustomDefaultCJK : uax14AssignmentsCustomDefaultNonCJK);
734         break;
735     case LineBreakIteratorModeUAX14Loose:
736         rulesBuilder.append(isCJK ? uax14AssignmentsCustomLooseCJK : uax14AssignmentsCustomLooseNonCJK);
737         break;
738     case LineBreakIteratorModeUAX14Normal:
739         rulesBuilder.append(isCJK ? uax14AssignmentsCustomNormalCJK : uax14AssignmentsCustomNormalNonCJK);
740         break;
741     case LineBreakIteratorModeUAX14Strict:
742         rulesBuilder.append(isCJK ? uax14AssignmentsCustomStrictCJK : uax14AssignmentsCustomStrictNonCJK);
743         break;
744     }
745     rulesBuilder.append(uax14AssignmentsAfter);
746     rulesBuilder.append(uax14Forward);
747     rulesBuilder.append(uax14Reverse);
748     rulesBuilder.append(uax14SafeForward);
749     rulesBuilder.append(uax14SafeReverse);
750     return rulesBuilder.toString();
751 }
752
753 // Recognize BCP47 compliant primary language values of 'zh', 'ja', 'ko'
754 // (in any combination of case), optionally followed by subtags. Don't
755 // recognize 3-letter variants 'chi'/'zho', 'jpn', or 'kor' since BCP47
756 // requires use of shortest language tag.
757 bool isCJKLocale(const AtomicString& locale)
758 {
759     size_t length = locale.length();
760     if (length < 2)
761         return false;
762     auto c1 = locale[0];
763     auto c2 = locale[1];
764     auto c3 = length == 2 ? 0 : locale[2];
765     if (!c3 || c3 == '-' || c3 == '_' || c3 == '@') {
766         if (c1 == 'z' || c1 == 'Z')
767             return c2 == 'h' || c2 == 'H';
768         if (c1 == 'j' || c1 == 'J')
769             return c2 == 'a' || c2 == 'A';
770         if (c1 == 'k' || c1 == 'K')
771             return c2 == 'o' || c2 == 'O';
772     }
773     return false;
774 }
775
776 TextBreakIterator* openLineBreakIterator(const AtomicString& locale, LineBreakIteratorMode mode, bool isCJK)
777 {
778     UBreakIterator* ubrkIter;
779     UErrorCode openStatus = U_ZERO_ERROR;
780     bool localeIsEmpty = locale.isEmpty();
781     if (mode == LineBreakIteratorModeUAX14)
782         ubrkIter = ubrk_open(UBRK_LINE, localeIsEmpty ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus);
783     else {
784         UParseError parseStatus;
785         auto rules = mapLineIteratorModeToRules(mode, isCJK);
786         ubrkIter = ubrk_openRules(StringView(rules).upconvertedCharacters(), rules.length(), 0, 0, &parseStatus, &openStatus);
787     }
788     // locale comes from a web page and it can be invalid, leading ICU
789     // to fail, in which case we fall back to the default locale.
790     if (!localeIsEmpty && U_FAILURE(openStatus)) {
791         openStatus = U_ZERO_ERROR;
792         ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus);
793     }
794
795     if (U_FAILURE(openStatus)) {
796         LOG_ERROR("ubrk_open failed with status %d", openStatus);
797         return nullptr;
798     }
799
800     return reinterpret_cast<TextBreakIterator*>(ubrkIter);
801 }
802
803 void closeLineBreakIterator(TextBreakIterator*& iterator)
804 {
805     UBreakIterator* ubrkIter = reinterpret_cast<UBreakIterator*>(iterator);
806     ASSERT(ubrkIter);
807     ubrk_close(ubrkIter);
808     iterator = nullptr;
809 }
810
811 static TextBreakIterator* nonSharedCharacterBreakIterator;
812
813 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
814 {
815 #if ENABLE(COMPARE_AND_SWAP)
816     return WTF::weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), expected, newValue);
817 #else
818     static StaticLock nonSharedCharacterBreakIteratorMutex;
819     std::lock_guard<StaticLock> locker(nonSharedCharacterBreakIteratorMutex);
820     if (nonSharedCharacterBreakIterator != expected)
821         return false;
822     nonSharedCharacterBreakIterator = newValue;
823     return true;
824 #endif
825 }
826
827 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string)
828 {
829     m_iterator = nonSharedCharacterBreakIterator;
830
831     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
832     if (!createdIterator)
833         m_iterator = initializeIterator(UBRK_CHARACTER);
834     if (!m_iterator)
835         return;
836
837     m_iterator = setTextForIterator(*m_iterator, string);
838 }
839
840 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
841 {
842     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
843         ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator));
844 }
845
846
// Iterator implementation.
848
849 int textBreakFirst(TextBreakIterator* iterator)
850 {
851     return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
852 }
853
854 int textBreakLast(TextBreakIterator* iterator)
855 {
856     return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
857 }
858
859 int textBreakNext(TextBreakIterator* iterator)
860 {
861     return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
862 }
863
864 int textBreakPrevious(TextBreakIterator* iterator)
865 {
866     return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
867 }
868
869 int textBreakPreceding(TextBreakIterator* iterator, int pos)
870 {
871     return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
872 }
873
874 int textBreakFollowing(TextBreakIterator* iterator, int pos)
875 {
876     return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
877 }
878
879 int textBreakCurrent(TextBreakIterator* iterator)
880 {
881     return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
882 }
883
884 bool isTextBreak(TextBreakIterator* iterator, int position)
885 {
886     return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
887 }
888
889 bool isWordTextBreak(TextBreakIterator* iterator)
890 {
891     int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator));
892     return ruleStatus != UBRK_WORD_NONE;
893 }
894
895 unsigned numGraphemeClusters(const String& s)
896 {
897     unsigned stringLength = s.length();
898     
899     if (!stringLength)
900         return 0;
901
902     // The only Latin-1 Extended Grapheme Cluster is CR LF
903     if (s.is8Bit() && !s.contains('\r'))
904         return stringLength;
905
906     NonSharedCharacterBreakIterator it(s);
907     if (!it)
908         return stringLength;
909
910     unsigned num = 0;
911     while (textBreakNext(it) != TextBreakDone)
912         ++num;
913     return num;
914 }
915
916 unsigned numCharactersInGraphemeClusters(const StringView& s, unsigned numGraphemeClusters)
917 {
918     unsigned stringLength = s.length();
919
920     if (!stringLength)
921         return 0;
922
923     // The only Latin-1 Extended Grapheme Cluster is CR LF
924     if (s.is8Bit() && !s.contains('\r'))
925         return std::min(stringLength, numGraphemeClusters);
926
927     NonSharedCharacterBreakIterator it(s);
928     if (!it)
929         return std::min(stringLength, numGraphemeClusters);
930
931     for (unsigned i = 0; i < numGraphemeClusters; ++i) {
932         if (textBreakNext(it) == TextBreakDone)
933             return stringLength;
934     }
935     return textBreakCurrent(it);
936 }
937
938 } // namespace WebCore