56aa118ab1ad54a038ec8258b77331e8d325ca0f
[WebKit-https.git] / Source / WebCore / platform / text / TextBreakIteratorICU.cpp
1 /*
2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3  * Copyright (C) 2007 Apple Inc. All rights reserved.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public License
16  * along with this library; see the file COPYING.LIB.  If not, write to
17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  *
20  */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "PlatformString.h"
26 #include "TextBreakIteratorInternalICU.h"
27 #include <unicode/ubrk.h>
28 #include <wtf/Assertions.h>
29
30 using namespace std;
31
32 namespace WebCore {
33
34 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
35     UBreakIteratorType type, const UChar* string, int length)
36 {
37     if (!string)
38         return 0;
39
40     if (!createdIterator) {
41         UErrorCode openStatus = U_ZERO_ERROR;
42         iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
43         createdIterator = true;
44         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
45     }
46     if (!iterator)
47         return 0;
48
49     UErrorCode setTextStatus = U_ZERO_ERROR;
50     ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
51     if (U_FAILURE(setTextStatus))
52         return 0;
53
54     return iterator;
55 }
56
57 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
58 {
59     static bool createdCharacterBreakIterator = false;
60     static TextBreakIterator* staticCharacterBreakIterator;
61     return setUpIterator(createdCharacterBreakIterator,
62         staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
63 }
64
65 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
66 {
67     static bool createdWordBreakIterator = false;
68     static TextBreakIterator* staticWordBreakIterator;
69     return setUpIterator(createdWordBreakIterator,
70         staticWordBreakIterator, UBRK_WORD, string, length);
71 }
72
73 static bool createdLineBreakIterator = false;
74 static TextBreakIterator* staticLineBreakIterator;
75
76 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
77 {
78     TextBreakIterator* lineBreakIterator = 0;
79     if (!createdLineBreakIterator || staticLineBreakIterator) {
80         setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
81         swap(staticLineBreakIterator, lineBreakIterator);
82     }
83
84     if (!lineBreakIterator) {
85         bool createdNewLineBreakIterator = false;
86         setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
87     }
88
89     return lineBreakIterator;
90 }
91
92 void releaseLineBreakIterator(TextBreakIterator* iterator)
93 {
94     ASSERT(createdLineBreakIterator);
95     ASSERT(iterator);
96
97     if (!staticLineBreakIterator)
98         staticLineBreakIterator = iterator;
99     else
100         ubrk_close(reinterpret_cast<UBreakIterator*>(iterator));
101 }
102
103 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
104 {
105     static bool createdSentenceBreakIterator = false;
106     static TextBreakIterator* staticSentenceBreakIterator;
107     return setUpIterator(createdSentenceBreakIterator,
108         staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
109 }
110
111 int textBreakFirst(TextBreakIterator* iterator)
112 {
113     return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
114 }
115
116 int textBreakLast(TextBreakIterator* iterator)
117 {
118     return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
119 }
120
121 int textBreakNext(TextBreakIterator* iterator)
122 {
123     return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
124 }
125
126 int textBreakPrevious(TextBreakIterator* iterator)
127 {
128     return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
129 }
130
131 int textBreakPreceding(TextBreakIterator* iterator, int pos)
132 {
133     return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
134 }
135
136 int textBreakFollowing(TextBreakIterator* iterator, int pos)
137 {
138     return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
139 }
140
141 int textBreakCurrent(TextBreakIterator* iterator)
142 {
143     return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
144 }
145
146 bool isTextBreak(TextBreakIterator* iterator, int position)
147 {
148     return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
149 }
150
151 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
152     const char* breakRules, const UChar* string, int length)
153 {
154     if (!string)
155         return 0;
156
157     if (!createdIterator) {
158         UParseError parseStatus;
159         UErrorCode openStatus = U_ZERO_ERROR;
160         String rules(breakRules);
161         iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
162         createdIterator = true;
163         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
164     }
165     if (!iterator)
166         return 0;
167
168     UErrorCode setTextStatus = U_ZERO_ERROR;
169     ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
170     if (U_FAILURE(setTextStatus))
171         return 0;
172
173     return iterator;
174 }
175
176 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
177 {
178     // This rule set is based on character-break iterator rules of ICU 4.0
179     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
180     // The major differences from the original ones are listed below:
181     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
182     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
183     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
184     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
185     static const char* kRules =
186         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
187         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
188         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
189         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
190         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
191         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
192         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
193         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
194         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
195         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
196         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
197         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
198         "$HinV    = \\u094D;"              // Devanagari Sign Virama
199         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
200         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
201         "$BenV    = \\u09CD;"              // Bengali Sign Virama
202         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
203         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
204         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
205         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
206         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
207         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
208         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
209         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
210         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
211         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
212         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
213         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
214         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
215         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
216         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
217         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
218         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
219         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
220         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
221         "!!chain;"
222         "!!forward;"
223         "$CR $LF;"
224         "$L ($L | $V | $LV | $LVT);"
225         "($LV | $V) ($V | $T);"
226         "($LVT | $T) $T;"
227         "[^$Control $CR $LF] $Extend;"
228         "[^$Control $CR $LF] $SpacingMark;"
229         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
230         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
231         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
232         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
233         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
234         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
235         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
236         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
237         "!!reverse;"
238         "$LF $CR;"
239         "($L | $V | $LV | $LVT) $L;"
240         "($V | $T) ($LV | $V);"
241         "$T ($LVT | $T);"
242         "$Extend      [^$Control $CR $LF];"
243         "$SpacingMark [^$Control $CR $LF];"
244         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
245         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
246         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
247         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
248         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
249         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
250         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
251         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
252         "!!safe_reverse;"
253         "!!safe_forward;";
254     static bool createdCursorMovementIterator = false;
255     static TextBreakIterator* staticCursorMovementIterator;
256     return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
257 }
258
259 }