1 // -*- c-basic-offset: 2 -*-
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5 * Copyright (C) 2004 Apple Computer, Inc.
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
24 #ifndef _KJS_USTRING_H_
25 #define _KJS_USTRING_H_
28 #include <sys/types.h>
29 #ifndef KWQ_UNSIGNED_TYPES_DEFINED
30 #define KWQ_UNSIGNED_TYPES_DEFINED
31 typedef unsigned char uchar;
32 typedef unsigned long ulong;
54 * @short Unicode character.
56 * UChar represents a 16 bit Unicode character. Its internal data
57 * representation is compatible with XChar2b and QChar. It is therefore
58 * possible to exchange data with X and Qt with shallow copies.
62 * Construct a character with uninitialized value.
66 * Construct a character with the value denoted by the arguments.
67 * @param h higher byte
// Two-byte form: h becomes the upper 8 bits and l the lower 8 bits of the value.
70 UChar(unsigned char h , unsigned char l);
72 * Construct a character with the given value.
73 * @param u 16 bit Unicode value
// 8 bit overload: u is stored as the character value unchanged.
76 UChar(unsigned char u);
// 16 bit overload: u is stored as the character value unchanged.
77 UChar(unsigned short u);
// Conversion from a writable character reference; copies c.unicode().
78 UChar(const UCharReference &c);
80 * @return The higher byte of the character.
82 unsigned char high() const { return uc >> 8; }
84 * @return The lower byte of the character.
86 unsigned char low() const { return uc; }
88 * @return the 16 bit Unicode value of the character
// Returns the stored code unit uc as an unsigned short.
90 unsigned short unicode() const { return uc; }
93 * @return The character converted to lower case.
// Defined out of line; returns the result by value.
95 UChar toLower() const;
97 * @return The character converted to upper case.
// Defined out of line; returns the result by value.
99 UChar toUpper() const;
104 inline UChar::UChar() { }
105 inline UChar::UChar(unsigned char h , unsigned char l) : uc(h << 8 | l) { }
106 inline UChar::UChar(char u) : uc((unsigned char)u) { }
107 inline UChar::UChar(unsigned char u) : uc(u) { }
108 inline UChar::UChar(unsigned short u) : uc(u) { }
111 * @short Dynamic reference to a string character.
113 * UCharReference is the dynamic counterpart of @ref UChar. It's used when
114 * characters retrieved via index from a @ref UString are used in an
115 * assignment expression (and therefore can't be treated as being const):
117 * UString s("hello world");
121 * If that sounds confusing, your best bet is to simply forget about the
122 * existence of this class and treat it as being identical to @ref UChar.
// Proxy that identifies one character of a UString by string pointer and offset.
124 class UCharReference {
// UString is a friend so that it can call the constructor below.
125 friend class UString;
// Records the target string and the character offset; does nothing else.
126 UCharReference(UString *s, unsigned int off) : str(s), offset(off) { }
129 * Set the referenced character to c.
// Defined out of line; returns a UCharReference reference so assignments can be chained.
131 UCharReference& operator=(UChar c);
133 * Same as the operator above, except that it takes a plain char argument.
// Wraps c in a UChar and forwards to operator=(UChar).
135 UCharReference& operator=(char c) { return operator=(UChar(c)); }
137 * @return Unicode value.
// All accessors below read through ref(), which is declared elsewhere in this class.
139 unsigned short unicode() const { return ref().uc; }
141 * @return Lower byte.
// The conversion to unsigned char keeps only the low 8 bits of uc.
143 unsigned char low() const { return ref().uc; }
145 * @return Higher byte.
// Shifts uc right by 8 before the conversion to unsigned char.
147 unsigned char high() const { return ref().uc >> 8; }
149 * @return Character converted to lower case.
// Forwards to UChar::toLower() on the referenced character.
151 UChar toLower() const { return ref().toLower(); }
153 * @return Character converted to upper case.
// Forwards to UChar::toUpper() on the referenced character.
155 UChar toUpper() const { return ref().toUpper(); }
157 // not implemented, can only be constructed from UString
165 inline UChar::UChar(const UCharReference &c) : uc(c.unicode()) { }
168 * @short 8 bit char based string class
// Default constructor: null data pointer and zero length.
172 CString() : data(0), length(0) { }
// Construction from a C string (optionally with an explicit length) and copy
// construction; all three are defined out of line.
173 CString(const char *c);
174 CString(const char *c, int len);
175 CString(const CString &);
// Append and assignment are defined out of line; each returns a CString reference.
179 CString &append(const CString &);
180 CString &operator=(const char *c);
181 CString &operator=(const CString &);
// Forwards to append().
182 CString &operator+=(const CString &c) { return append(c); }
// Returns the stored length member.
184 int size() const { return length; }
// Returns the stored data pointer unchanged; it is 0 for a default-constructed CString.
185 const char *c_str() const { return data; }
192 * @short Unicode string class
// Free function and classes granted access to UString's non-public members.
195 friend bool operator==(const UString&, const UString&);
196 friend class UCharReference;
197 friend class Identifier;
198 friend class PropertyMap;
199 friend class PropertyMapHashTableEntry;
// NOTE(review): the declarations from here on appear to belong to the nested
// representation type (Rep) rather than to UString itself — confirm against
// the full header.
205 friend class UString;
206 friend bool operator==(const UString&, const UString&);
// Factory functions returning a Rep pointer; defined out of line.
// The first takes a character buffer and a length, the second a base Rep plus
// an offset and a length within it.
208 static Rep *create(UChar *d, int l);
209 static Rep *create(Rep *base, int offset, int length);
212 UChar *data() const { return baseString ? (baseString->buf + baseString->preCapacity + offset) : (buf + preCapacity + offset); }
213 int size() const { return len; }
215 unsigned hash() const { if (_hash == 0) _hash = computeHash(data(), len); return _hash; }
// Hash helpers, defined out of line. hash() calls the (const UChar *, int)
// overload; the second overload takes a C string with no explicit length.
216 static unsigned computeHash(const UChar *, int length);
217 static unsigned computeHash(const char *);
220 void deref() { if (--rc == 0) destroy(); }
// Cached hash value; mutable so that the const hash() member can fill it in.
226 mutable unsigned _hash;
// When non-null, data() reads the characters from this Rep's buffer instead of our own.
228 UString::Rep *baseString;
230 // potentially shared data
243 * Constructs a null string.
247 * Constructs a string from the single character c.
// explicit, so a single char is never converted to a UString implicitly.
249 explicit UString(char c);
251 * Constructs a string from a classic zero-terminated char string.
// Not explicit: a const char * converts to a UString implicitly.
253 UString(const char *c);
255 * Constructs a string from an array of Unicode characters of the specified
// Takes a pointer to const characters plus a length; defined out of line.
258 UString(const UChar *c, int length);
260 * If copy is false the string data will be adopted.
261 * That means that the data will NOT be copied and the pointer will
262 * be deleted when the UString object is modified or destroyed.
263 * If copy is true, a deep copy of the data is made instead.
// copy has no default value: every caller states whether the buffer is copied or adopted.
265 UString(UChar *c, int length, bool copy);
267 * Copy constructor. Makes a shallow copy only.
// Passes s.rep to attach(); no character data is touched in this body.
269 UString(const UString &s) { attach(s.rep); }
271 * Convenience declaration only ! You'll be on your own to write the
272 * implementation for a construction from QString.
274 * Note: feel free to contact me if you want to see a dummy header for
275 * your favourite FooString class here !
// Declaration only; not explicit, so a QString converts implicitly.
277 UString(const QString &);
279 * Convenience declaration only ! See @ref UString(const QString&).
// Declaration only; not explicit, so a DOM::DOMString converts implicitly.
281 UString(const DOM::DOMString &);
283 * Concatenation constructor. Makes operator+ more efficient.
// The free operator+ for two UStrings in this header returns UString(s1, s2).
285 UString(const UString &, const UString &);
287 * Destructor. If this handle was the only one holding a reference to the
288 * string the data will be freed.
// All cleanup is delegated to release().
290 ~UString() { release(); }
293 * Constructs a string from an int.
// Static factory overloads, one per numeric type; all defined out of line.
295 static UString from(int i);
297 * Constructs a string from an unsigned int.
299 static UString from(unsigned int u);
301 * Constructs a string from a long.
303 static UString from(long l);
305 * Constructs a string from a double.
307 static UString from(double d);
// A (position, length) pair; the constructor only stores both values.
311 Range(int pos, int len) : position(pos), length(len) {}
// Takes rangeCount ranges and separatorCount separators and returns a new string.
// NOTE(review): how ranges and separators are interleaved is not stated here —
// check the definition before relying on a particular order.
317 UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const;
320 * Append another string.
// Overloads for another UString, a C string, and a single 16 bit code unit;
// all defined out of line. Each returns a UString reference.
322 UString &append(const UString &);
323 UString &append(const char *);
324 UString &append(unsigned short);
325 UString &append(char c) { return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); }
326 UString &append(UChar c) { return append(c.uc); }
329 * @return The string converted to the 8-bit string type @ref CString().
// Defined out of line; the CString is returned by value.
331 CString cstring() const;
333 * Convert the Unicode string to plain ASCII chars chopping off any higher
334 * bytes. This method should only be used for *debugging* purposes as it
335 * is neither Unicode safe nor free from side effects. In order not to
336 * waste any memory the char buffer is static and *shared* by all UString
342 * Convert the string to UTF-8, assuming it is UTF-16 encoded.
343 * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
344 * strings that are invalid because they have characters in the range
345 * U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
346 * be otherwise valid.
// Defined out of line; the UTF-8 bytes are returned by value as a CString.
348 CString UTF8String() const;
351 * @see UString(const QString&).
// Conversions to the external string types; declarations only, each returning by value.
353 DOM::DOMString string() const;
355 * @see UString(const QString&).
357 QString qstring() const;
359 * @see UString(const QString&).
361 QConstString qconststring() const;
364 * Assignment operator.
// Assignment from a C string or another UString; both defined out of line.
366 UString &operator=(const char *c);
367 UString &operator=(const UString &);
369 * Appends the specified string.
// Both forward to the append() overload with the matching argument type.
371 UString &operator+=(const UString &s) { return append(s); }
372 UString &operator+=(const char *s) { return append(s); }
375 * @return A pointer to the internal Unicode data.
// Forwards to rep->data(); the pointer is returned as const UChar *.
377 const UChar* data() const { return rep->data(); }
379 * @return True if null.
381 bool isNull() const { return (rep == &Rep::null); }
383 * @return True if null or zero length.
385 bool isEmpty() const { return (!rep->len); }
387 * Use this if you want to make sure that this string is a plain ASCII
388 * string. For example, if you don't want to lose any information when
389 * using @ref cstring() or @ref ascii().
391 * @return True if the string doesn't contain any non-ASCII characters.
395 * @return The length of the string.
// Forwards to the representation's size().
397 int size() const { return rep->size(); }
399 * Const character at specified position.
// Const overload: returns the character by value.
401 UChar operator[](int pos) const;
403 * Writable reference to character at specified position.
// Non-const overload: returns a UCharReference proxy instead of a UChar.
405 UCharReference operator[](int pos);
408 * Attempts a conversion to a number. Apart from floating point numbers,
409 * the algorithm will recognize hexadecimal representations (as
410 * indicated by a 0x or 0X prefix) and +/- Infinity.
411 * Returns NaN if the conversion failed.
412 * @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number.
413 * @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0.
// Three separate overloads rather than default arguments; all defined out of line.
// NOTE(review): the flag values used by the shorter overloads are not stated
// here — check the definitions.
415 double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const;
416 double toDouble(bool tolerateTrailingJunk) const;
417 double toDouble() const;
419 * Attempts a conversion to an unsigned long integer. ok will be set
420 * according to the success.
421 * @param tolerateEmptyString if false, toULong will return false for *ok for an empty string.
// In the two-argument form ok has no default; in the one-argument form it defaults to 0.
423 unsigned long toULong(bool *ok, bool tolerateEmptyString) const;
424 unsigned long toULong(bool *ok = 0) const;
// 32 bit variants; ok defaults to 0 in both.
// NOTE(review): the difference between the plain and the "strict" form is not
// stated here — check the definitions.
426 uint32_t toUInt32(bool *ok = 0) const;
427 uint32_t toStrictUInt32(bool *ok = 0) const;
430 * Attempts a conversion to an array index. The "ok" boolean will be set
431 * to true if it is a valid array index according to the rule from
432 * ECMA 15.4 about what an array index is. It must exactly match the string
433 * form of an unsigned integer, and be less than 2^32 - 1.
// ok defaults to 0, so the argument can be omitted.
435 unsigned toArrayIndex(bool *ok = 0) const;
438 * @return Position of first occurrence of f starting at position pos.
439 * -1 if the search was not successful.
// Forward search for a substring or a single character; pos defaults to 0.
441 int find(const UString &f, int pos = 0) const;
442 int find(UChar, int pos = 0) const;
444 * @return Position of first occurrence of f searching backwards from
446 * -1 if the search was not successful.
// Backward search; unlike find(), pos has no default and must be supplied.
448 int rfind(const UString &f, int pos) const;
449 int rfind(UChar, int pos) const;
451 * @return The sub string starting at position pos and length len.
// Defaults are pos = 0 and len = -1.
// NOTE(review): the meaning of len = -1 is not stated here — presumably "up
// to the end of the string"; confirm in the definition.
453 UString substr(int pos = 0, int len = -1) const;
455 * Static instance of a null string.
// Returns a const reference rather than a copy.
457 static const UString &null();
460 * Clear statically allocated resources.
// Static; defined out of line.
462 static void globalClear();
// Wraps an existing representation by passing it to attach().
465 UString(Rep *r) { attach(r); }
// Capacity bookkeeping helpers; all defined out of line.
// The first three are const queries, the last two are non-const.
469 int expandedSize(int size, int otherSize) const;
470 int usedCapacity() const;
471 int usedPreCapacity() const;
472 void expandCapacity(int requiredLength);
473 void expandPreCapacity(int requiredPreCap);
// Two UChars are equal when their code units are equal.
478 inline bool operator==(const UChar &c1, const UChar &c2) {
479 return (c1.uc == c2.uc);
// UString equality; defined out of line.
481 bool operator==(const UString& s1, const UString& s2);
// Negation of the UString/UString operator==.
482 inline bool operator!=(const UString& s1, const UString& s2) {
483 return !KJS::operator==(s1, s2);
// Ordering, and equality against a C string; both defined out of line.
485 bool operator<(const UString& s1, const UString& s2);
486 bool operator==(const UString& s1, const char *s2);
// Negation of the UString/C-string operator==.
487 inline bool operator!=(const UString& s1, const char *s2) {
488 return !KJS::operator==(s1, s2);
// Argument-swapped form so that a C string may appear on the left-hand side.
490 inline bool operator==(const char *s1, const UString& s2) {
491 return operator==(s2, s1);
// Negation of the C-string/UString operator==.
493 inline bool operator!=(const char *s1, const UString& s2) {
494 return !KJS::operator==(s1, s2);
// CString equality; defined out of line.
496 bool operator==(const CString& s1, const CString& s2);
// Concatenation, built on the two-string UString constructor.
497 inline UString operator+(const UString& s1, const UString& s2) {
498 return UString(s1, s2);
// Compares two strings and returns an int; defined out of line.
// NOTE(review): the sign convention of the result is not stated here —
// presumably negative / zero / positive; confirm in the definition.
501 int compare(const UString &, const UString &);
503 // Given a first byte, gives the length of the UTF-8 sequence it begins.
504 // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
505 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
506 int UTF8SequenceLength(char);
508 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
509 // Only allows Unicode characters (U-00000000 to U-0010FFFF).
510 // Returns -1 if the sequence is not valid (including presence of extra bytes).
511 int decodeUTF8Sequence(const char *);