Make line breaking obey the -webkit-locale property
[WebKit-https.git] / Source / WebCore / platform / text / TextBreakIteratorICU.cpp
1 /*
2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3  * Copyright (C) 2007 Apple Inc. All rights reserved.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public License
16  * along with this library; see the file COPYING.LIB.  If not, write to
17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  *
20  */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "PlatformString.h"
26 #include "TextBreakIteratorInternalICU.h"
27 #include <unicode/ubrk.h>
28 #include <wtf/Assertions.h>
29 #include <wtf/HashMap.h>
30 #include <wtf/text/CString.h>
31
32 using namespace std;
33
34 namespace WebCore {
35
36 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
37     UBreakIteratorType type, const UChar* string, int length)
38 {
39     if (!string)
40         return 0;
41
42     if (!createdIterator) {
43         UErrorCode openStatus = U_ZERO_ERROR;
44         iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
45         createdIterator = true;
46         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
47     }
48     if (!iterator)
49         return 0;
50
51     UErrorCode setTextStatus = U_ZERO_ERROR;
52     ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
53     if (U_FAILURE(setTextStatus))
54         return 0;
55
56     return iterator;
57 }
58
59 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
60 {
61     static bool createdCharacterBreakIterator = false;
62     static TextBreakIterator* staticCharacterBreakIterator;
63     return setUpIterator(createdCharacterBreakIterator,
64         staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
65 }
66
67 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
68 {
69     static bool createdWordBreakIterator = false;
70     static TextBreakIterator* staticWordBreakIterator;
71     return setUpIterator(createdWordBreakIterator,
72         staticWordBreakIterator, UBRK_WORD, string, length);
73 }
74
75 class LineBreakIteratorPool {
76 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
77 public:
78     static LineBreakIteratorPool& sharedPool()
79     {
80         ASSERT(isMainThread());
81         DEFINE_STATIC_LOCAL(LineBreakIteratorPool, pool, ());
82         return pool;
83     }
84
85     UBreakIterator* take(const AtomicString& locale)
86     {
87         UBreakIterator* iterator = 0;
88         for (size_t i = 0; i < m_pool.size(); ++i) {
89             if (m_pool[i].first == locale) {
90                 iterator = m_pool[i].second;
91                 m_pool.remove(i);
92                 break;
93             }
94         }
95
96         if (!iterator) {
97             UErrorCode openStatus = U_ZERO_ERROR;
98             iterator = ubrk_open(UBRK_LINE, locale.isEmpty() ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus);
99             if (U_FAILURE(openStatus)) {
100                 LOG_ERROR("ubrk_open failed with status %d", openStatus);
101                 return 0;
102             }
103         }
104
105         ASSERT(!m_vendedIterators.contains(iterator));
106         m_vendedIterators.set(iterator, locale);
107         return iterator;
108     }
109
110     void put(UBreakIterator* iterator)
111     {
112         ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
113
114         if (m_pool.size() == capacity) {
115             ubrk_close(m_pool[0].second);
116             m_pool.remove(0);
117         }
118
119         m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
120     }
121
122 private:
123     LineBreakIteratorPool() { }
124
125     static const size_t capacity = 4;
126
127     typedef pair<AtomicString, UBreakIterator*> Entry;
128     typedef Vector<Entry, capacity> Pool;
129     Pool m_pool;
130     HashMap<UBreakIterator*, AtomicString> m_vendedIterators;
131 };
132
133 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale)
134 {
135     UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
136
137     UErrorCode setTextStatus = U_ZERO_ERROR;
138     ubrk_setText(iterator, string, length, &setTextStatus);
139     if (U_FAILURE(setTextStatus)) {
140         LOG_ERROR("ubrk_setText failed with status %d", setTextStatus);
141         return 0;
142     }
143
144     return reinterpret_cast<TextBreakIterator*>(iterator);
145 }
146
147 void releaseLineBreakIterator(TextBreakIterator* iterator)
148 {
149     ASSERT_ARG(iterator, iterator);
150
151     LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator));
152 }
153
154 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
155 {
156     static bool createdSentenceBreakIterator = false;
157     static TextBreakIterator* staticSentenceBreakIterator;
158     return setUpIterator(createdSentenceBreakIterator,
159         staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
160 }
161
162 int textBreakFirst(TextBreakIterator* iterator)
163 {
164     return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
165 }
166
167 int textBreakLast(TextBreakIterator* iterator)
168 {
169     return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
170 }
171
172 int textBreakNext(TextBreakIterator* iterator)
173 {
174     return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
175 }
176
177 int textBreakPrevious(TextBreakIterator* iterator)
178 {
179     return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
180 }
181
182 int textBreakPreceding(TextBreakIterator* iterator, int pos)
183 {
184     return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
185 }
186
187 int textBreakFollowing(TextBreakIterator* iterator, int pos)
188 {
189     return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
190 }
191
192 int textBreakCurrent(TextBreakIterator* iterator)
193 {
194     return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
195 }
196
197 bool isTextBreak(TextBreakIterator* iterator, int position)
198 {
199     return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
200 }
201
202 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
203     const char* breakRules, const UChar* string, int length)
204 {
205     if (!string)
206         return 0;
207
208     if (!createdIterator) {
209         UParseError parseStatus;
210         UErrorCode openStatus = U_ZERO_ERROR;
211         String rules(breakRules);
212         iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
213         createdIterator = true;
214         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
215     }
216     if (!iterator)
217         return 0;
218
219     UErrorCode setTextStatus = U_ZERO_ERROR;
220     ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
221     if (U_FAILURE(setTextStatus))
222         return 0;
223
224     return iterator;
225 }
226
227 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
228 {
229     // This rule set is based on character-break iterator rules of ICU 4.0
230     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
231     // The major differences from the original ones are listed below:
232     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
233     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
234     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
235     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
236     static const char* kRules =
237         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
238         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
239         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
240         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
241         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
242         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
243         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
244         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
245         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
246         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
247         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
248         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
249         "$HinV    = \\u094D;"              // Devanagari Sign Virama
250         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
251         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
252         "$BenV    = \\u09CD;"              // Bengali Sign Virama
253         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
254         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
255         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
256         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
257         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
258         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
259         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
260         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
261         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
262         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
263         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
264         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
265         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
266         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
267         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
268         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
269         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
270         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
271         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
272         "!!chain;"
273         "!!forward;"
274         "$CR $LF;"
275         "$L ($L | $V | $LV | $LVT);"
276         "($LV | $V) ($V | $T);"
277         "($LVT | $T) $T;"
278         "[^$Control $CR $LF] $Extend;"
279         "[^$Control $CR $LF] $SpacingMark;"
280         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
281         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
282         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
283         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
284         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
285         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
286         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
287         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
288         "!!reverse;"
289         "$LF $CR;"
290         "($L | $V | $LV | $LVT) $L;"
291         "($V | $T) ($LV | $V);"
292         "$T ($LVT | $T);"
293         "$Extend      [^$Control $CR $LF];"
294         "$SpacingMark [^$Control $CR $LF];"
295         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
296         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
297         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
298         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
299         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
300         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
301         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
302         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
303         "!!safe_reverse;"
304         "!!safe_forward;";
305     static bool createdCursorMovementIterator = false;
306     static TextBreakIterator* staticCursorMovementIterator;
307     return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
308 }
309
310 }