478608343f4e70033492f3c0859e17cd277e1549
[WebKit-https.git] / Source / WebCore / platform / text / TextBreakIterator.cpp
1 /*
2  * (C) 1999 Lars Knoll (knoll@kde.org)
3  * Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public License
17  * along with this library; see the file COPYING.LIB.  If not, write to
18  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  * Boston, MA 02110-1301, USA.
20  */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "LineBreakIteratorPoolICU.h"
26 #include "UTextProviderLatin1.h"
27 #include "UTextProviderUTF16.h"
28 #include <mutex>
29 #include <wtf/Atomics.h>
30 #include <wtf/text/StringView.h>
31
32 namespace WebCore {
33
34 // Iterator initialization
35
36 static TextBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
37 {
38     UErrorCode openStatus = U_ZERO_ERROR;
39     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, locale, 0, 0, &openStatus));
40     ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
41     return iterator;
42 }
43
44 #if !PLATFORM(IOS)
45
46 static TextBreakIterator* initializeIteratorWithRules(const char* breakRules)
47 {
48     UParseError parseStatus;
49     UErrorCode openStatus = U_ZERO_ERROR;
50     unsigned length = strlen(breakRules);
51     auto upconvertedCharacters = StringView(reinterpret_cast<const LChar*>(breakRules), length).upconvertedCharacters();
52     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(upconvertedCharacters, length, 0, 0, &parseStatus, &openStatus));
53     ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
54     return iterator;
55 }
56
57 #endif
58
59
60 // Iterator text setting
61
62 static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string)
63 {
64     if (string.is8Bit()) {
65         UTextWithBuffer textLocal;
66         textLocal.text = UTEXT_INITIALIZER;
67         textLocal.text.extraSize = sizeof(textLocal.buffer);
68         textLocal.text.pExtra = textLocal.buffer;
69
70         UErrorCode openStatus = U_ZERO_ERROR;
71         UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus);
72         if (U_FAILURE(openStatus)) {
73             LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus);
74             return nullptr;
75         }
76
77         UErrorCode setTextStatus = U_ZERO_ERROR;
78         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
79         if (U_FAILURE(setTextStatus)) {
80             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
81             return nullptr;
82         }
83
84         utext_close(text);
85     } else {
86         UErrorCode setTextStatus = U_ZERO_ERROR;
87         ubrk_setText(reinterpret_cast<UBreakIterator*>(&iterator), string.characters16(), string.length(), &setTextStatus);
88         if (U_FAILURE(setTextStatus))
89             return nullptr;
90     }
91
92     return &iterator;
93 }
94
95 static TextBreakIterator* setContextAwareTextForIterator(TextBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength)
96 {
97     if (string.is8Bit()) {
98         UTextWithBuffer textLocal;
99         textLocal.text = UTEXT_INITIALIZER;
100         textLocal.text.extraSize = sizeof(textLocal.buffer);
101         textLocal.text.pExtra = textLocal.buffer;
102
103         UErrorCode openStatus = U_ZERO_ERROR;
104         UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus);
105         if (U_FAILURE(openStatus)) {
106             LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus);
107             return nullptr;
108         }
109
110         UErrorCode setTextStatus = U_ZERO_ERROR;
111         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
112         if (U_FAILURE(setTextStatus)) {
113             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
114             return nullptr;
115         }
116
117         utext_close(text);
118     } else {
119         UText textLocal = UTEXT_INITIALIZER;
120
121         UErrorCode openStatus = U_ZERO_ERROR;
122         UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus);
123         if (U_FAILURE(openStatus)) {
124             LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus);
125             return 0;
126         }
127
128         UErrorCode setTextStatus = U_ZERO_ERROR;
129         ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
130         if (U_FAILURE(setTextStatus)) {
131             LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
132             return nullptr;
133         }
134
135         utext_close(text);
136     }
137
138     return &iterator;
139 }
140
141
142 // Static iterators
143
144 TextBreakIterator* wordBreakIterator(StringView string)
145 {
146     static TextBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD);
147     if (!staticWordBreakIterator)
148         return nullptr;
149
150     return setTextForIterator(*staticWordBreakIterator, string);
151 }
152
153 TextBreakIterator* sentenceBreakIterator(StringView string)
154 {
155     static TextBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE);
156     if (!staticSentenceBreakIterator)
157         return nullptr;
158
159     return setTextForIterator(*staticSentenceBreakIterator, string);
160 }
161
162 TextBreakIterator* cursorMovementIterator(StringView string)
163 {
164 #if !PLATFORM(IOS)
165     // This rule set is based on character-break iterator rules of ICU 4.0
166     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
167     // The major differences from the original ones are listed below:
168     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
169     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
170     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
171     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
172     // * Added rules for regional indicator symbols.
173     static const char* kRules =
174         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
175         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
176         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
177         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
178         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
179         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
180         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
181         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
182         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
183         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
184         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
185         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
186         "$HinV    = \\u094D;"              // Devanagari Sign Virama
187         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
188         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
189         "$BenV    = \\u09CD;"              // Bengali Sign Virama
190         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
191         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
192         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
193         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
194         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
195         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
196         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
197         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
198         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
199         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
200         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
201         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
202         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
203         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
204         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
205         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
206         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
207         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
208         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
209         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
210         "!!chain;"
211         "!!forward;"
212         "$CR $LF;"
213         "$L ($L | $V | $LV | $LVT);"
214         "($LV | $V) ($V | $T);"
215         "($LVT | $T) $T;"
216         "[^$Control $CR $LF] $Extend;"
217         "[^$Control $CR $LF] $SpacingMark;"
218         "$RI $RI / $RI;"
219         "$RI $RI;"
220         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
221         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
222         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
223         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
224         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
225         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
226         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
227         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
228         "!!reverse;"
229         "$LF $CR;"
230         "($L | $V | $LV | $LVT) $L;"
231         "($V | $T) ($LV | $V);"
232         "$T ($LVT | $T);"
233         "$Extend      [^$Control $CR $LF];"
234         "$SpacingMark [^$Control $CR $LF];"
235         "$RI $RI / $RI $RI;"
236         "$RI $RI;"
237         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
238         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
239         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
240         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
241         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
242         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
243         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
244         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
245         "!!safe_reverse;"
246         "!!safe_forward;";
247     static TextBreakIterator* staticCursorMovementIterator = initializeIteratorWithRules(kRules);
248 #else // PLATFORM(IOS)
249     // Use the special Thai character break iterator for all locales
250     static TextBreakIterator* staticCursorMovementIterator = initializeIterator(UBRK_CHARACTER, "th");
251 #endif // !PLATFORM(IOS)
252
253     if (!staticCursorMovementIterator)
254         return nullptr;
255
256     return setTextForIterator(*staticCursorMovementIterator, string);
257 }
258
259 TextBreakIterator* acquireLineBreakIterator(StringView string, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
260 {
261     TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(LineBreakIteratorPool::sharedPool().take(locale));
262     if (!iterator)
263         return nullptr;
264
265     return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength);
266 }
267
268 void releaseLineBreakIterator(TextBreakIterator* iterator)
269 {
270     ASSERT_ARG(iterator, iterator);
271
272     LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator));
273 }
274
275 static TextBreakIterator* nonSharedCharacterBreakIterator;
276
277 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
278 {
279 #if ENABLE(COMPARE_AND_SWAP)
280     return WTF::weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), expected, newValue);
281 #else
282     DEPRECATED_DEFINE_STATIC_LOCAL(std::mutex, nonSharedCharacterBreakIteratorMutex, ());
283     std::lock_guard<std::mutex> locker(nonSharedCharacterBreakIteratorMutex);
284     if (nonSharedCharacterBreakIterator != expected)
285         return false;
286     nonSharedCharacterBreakIterator = newValue;
287     return true;
288 #endif
289 }
290
291 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string)
292 {
293     m_iterator = nonSharedCharacterBreakIterator;
294
295     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
296     if (!createdIterator)
297         m_iterator = initializeIterator(UBRK_CHARACTER);
298     if (!m_iterator)
299         return;
300
301     m_iterator = setTextForIterator(*m_iterator, string);
302 }
303
304 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
305 {
306     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
307         ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator));
308 }
309
310
// Iterator implementation.
312
313 int textBreakFirst(TextBreakIterator* iterator)
314 {
315     return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
316 }
317
318 int textBreakLast(TextBreakIterator* iterator)
319 {
320     return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
321 }
322
323 int textBreakNext(TextBreakIterator* iterator)
324 {
325     return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
326 }
327
328 int textBreakPrevious(TextBreakIterator* iterator)
329 {
330     return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
331 }
332
333 int textBreakPreceding(TextBreakIterator* iterator, int pos)
334 {
335     return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
336 }
337
338 int textBreakFollowing(TextBreakIterator* iterator, int pos)
339 {
340     return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
341 }
342
343 int textBreakCurrent(TextBreakIterator* iterator)
344 {
345     return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
346 }
347
348 bool isTextBreak(TextBreakIterator* iterator, int position)
349 {
350     return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
351 }
352
353 bool isWordTextBreak(TextBreakIterator* iterator)
354 {
355     int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator));
356     return ruleStatus != UBRK_WORD_NONE;
357 }
358
359 unsigned numGraphemeClusters(const String& s)
360 {
361     unsigned stringLength = s.length();
362     
363     if (!stringLength)
364         return 0;
365
366     // The only Latin-1 Extended Grapheme Cluster is CR LF
367     if (s.is8Bit() && !s.contains('\r'))
368         return stringLength;
369
370     NonSharedCharacterBreakIterator it(s);
371     if (!it)
372         return stringLength;
373
374     unsigned num = 0;
375     while (textBreakNext(it) != TextBreakDone)
376         ++num;
377     return num;
378 }
379
380 unsigned numCharactersInGraphemeClusters(const String& s, unsigned numGraphemeClusters)
381 {
382     unsigned stringLength = s.length();
383
384     if (!stringLength)
385         return 0;
386
387     // The only Latin-1 Extended Grapheme Cluster is CR LF
388     if (s.is8Bit() && !s.contains('\r'))
389         return std::min(stringLength, numGraphemeClusters);
390
391     NonSharedCharacterBreakIterator it(s);
392     if (!it)
393         return std::min(stringLength, numGraphemeClusters);
394
395     for (unsigned i = 0; i < numGraphemeClusters; ++i) {
396         if (textBreakNext(it) == TextBreakDone)
397             return stringLength;
398     }
399     return textBreakCurrent(it);
400 }
401
402 } // namespace WebCore