summary refs log tree commit diff stats
path: root/JavaScriptCore/runtime/UString.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'JavaScriptCore/runtime/UString.cpp')
-rw-r--r-- JavaScriptCore/runtime/UString.cpp 516
1 files changed, 126 insertions, 390 deletions
diff --git a/JavaScriptCore/runtime/UString.cpp b/JavaScriptCore/runtime/UString.cpp
index c442500..ac3acfd 100644
--- a/JavaScriptCore/runtime/UString.cpp
+++ b/JavaScriptCore/runtime/UString.cpp
@@ -54,36 +54,40 @@ namespace JSC {
extern const double NaN;
extern const double Inf;
-// The null string is immutable, except for refCount.
-UString* UString::s_nullUString;
-
COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
-void initializeUString()
+// Construct a string from 'length' UTF-16 code units; a null 'characters' pointer yields a string whose m_impl is 0.
+UString::UString(const UChar* characters, unsigned length)
+ : m_impl(characters ? StringImpl::create(characters, length) : 0)
{
- // UStringImpl::empty() does not construct its static string in a threadsafe fashion,
- // so ensure it has been initialized from here.
- UStringImpl::empty();
-
- UString::s_nullUString = new UString;
}
-UString::UString(const char* c)
- : m_rep(Rep::create(c))
+// Construct a string with UTF-16 data, from a null-terminated source.
+// A null 'characters' pointer returns early without creating a StringImpl.
+UString::UString(const UChar* characters)
{
+ if (!characters)
+ return;
+
+ // Count code units up to (not including) the terminating zero. The counter is
+ // unsigned to match the unsigned lengths used by the other constructors here.
+ unsigned length = 0;
+ while (characters[length] != UChar(0))
+ ++length;
+
+ m_impl = StringImpl::create(characters, length);
}
-UString::UString(const char* c, unsigned length)
- : m_rep(Rep::create(c, length))
+// Construct a string with 'length' characters of latin1 data; a null 'characters' pointer yields a string whose m_impl is 0.
+UString::UString(const char* characters, unsigned length)
+ : m_impl(characters ? StringImpl::create(characters, length) : 0)
{
}
-UString::UString(const UChar* c, unsigned length)
- : m_rep(Rep::create(c, length))
+// Construct a string with latin1 data, from a null-terminated source; a null 'characters' pointer yields a string whose m_impl is 0.
+UString::UString(const char* characters)
+ : m_impl(characters ? StringImpl::create(characters) : 0)
{
}
-UString UString::from(int i)
+UString UString::number(int i)
{
UChar buf[1 + sizeof(i) * 3];
UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -112,7 +116,7 @@ UString UString::from(int i)
return UString(p, static_cast<unsigned>(end - p));
}
-UString UString::from(long long i)
+UString UString::number(long long i)
{
UChar buf[1 + sizeof(i) * 3];
UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -145,7 +149,7 @@ UString UString::from(long long i)
return UString(p, static_cast<unsigned>(end - p));
}
-UString UString::from(unsigned u)
+UString UString::number(unsigned u)
{
UChar buf[sizeof(u) * 3];
UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -163,7 +167,7 @@ UString UString::from(unsigned u)
return UString(p, static_cast<unsigned>(end - p));
}
-UString UString::from(long l)
+UString UString::number(long l)
{
UChar buf[1 + sizeof(l) * 3];
UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -192,7 +196,7 @@ UString UString::from(long l)
return UString(p, end - p);
}
-UString UString::from(double d)
+UString UString::number(double d)
{
DtoaBuffer buffer;
unsigned length;
@@ -200,359 +204,17 @@ UString UString::from(double d)
return UString(buffer, length);
}
-char* UString::ascii() const
-{
- static char* asciiBuffer = 0;
-
- unsigned length = size();
- unsigned neededSize = length + 1;
- delete[] asciiBuffer;
- asciiBuffer = new char[neededSize];
-
- const UChar* p = data();
- char* q = asciiBuffer;
- const UChar* limit = p + length;
- while (p != limit) {
- *q = static_cast<char>(p[0]);
- ++p;
- ++q;
- }
- *q = '\0';
-
- return asciiBuffer;
-}
-
-bool UString::is8Bit() const
-{
- const UChar* u = data();
- const UChar* limit = u + size();
- while (u < limit) {
- if (u[0] > 0xFF)
- return false;
- ++u;
- }
-
- return true;
-}
-
-UChar UString::operator[](unsigned pos) const
+UString UString::substringSharingImpl(unsigned offset, unsigned length) const
{
- if (pos >= size())
- return '\0';
- return data()[pos];
-}
-
-static inline bool isInfinity(double number)
-{
- return number == Inf || number == -Inf;
-}
-
-static bool isInfinity(const UChar* data, const UChar* end)
-{
- return data + 7 < end
- && data[0] == 'I'
- && data[1] == 'n'
- && data[2] == 'f'
- && data[3] == 'i'
- && data[4] == 'n'
- && data[5] == 'i'
- && data[6] == 't'
- && data[7] == 'y';
-}
-
-double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
-{
- unsigned size = this->size();
-
- if (size == 1) {
- UChar c = data()[0];
- if (isASCIIDigit(c))
- return c - '0';
- if (isStrWhiteSpace(c) && tolerateEmptyString)
- return 0;
- return NaN;
- }
-
- // FIXME: If tolerateTrailingJunk is true, then we want to tolerate junk
- // after the number, even if it contains invalid UTF-16 sequences. So we
- // shouldn't use the UTF8String function, which returns null when it
- // encounters invalid UTF-16. Further, we have no need to convert the
- // non-ASCII characters to UTF-8, so the UTF8String does quite a bit of
- // unnecessary work.
-
- // FIXME: The space skipping code below skips only ASCII spaces, but callers
- // need to skip all StrWhiteSpace. The isStrWhiteSpace function does the
- // right thing but requires UChar, not char, for its argument.
-
- const UChar* data = this->data();
- const UChar* end = data + size;
-
- // Skip leading white space.
- for (; data < end; ++data) {
- if (!isStrWhiteSpace(*data))
- break;
- }
-
- // Empty string.
- if (data == end)
- return tolerateEmptyString ? 0.0 : NaN;
-
- double number;
-
- if (data[0] == '0' && data + 2 < end && (data[1] | 0x20) == 'x' && isASCIIHexDigit(data[2])) {
- // Hex number.
- data += 2;
- const UChar* firstDigitPosition = data;
- number = 0;
- while (true) {
- number = number * 16 + toASCIIHexValue(*data);
- ++data;
- if (data == end)
- break;
- if (!isASCIIHexDigit(*data))
- break;
- }
- if (number >= mantissaOverflowLowerBound)
- number = parseIntOverflow(firstDigitPosition, data - firstDigitPosition, 16);
- } else {
- // Decimal number.
-
- // Put into a null-terminated byte buffer.
- Vector<char, 32> byteBuffer;
- for (const UChar* characters = data; characters < end; ++characters) {
- UChar character = *characters;
- byteBuffer.append(isASCII(character) ? character : 0);
- }
- byteBuffer.append(0);
-
- char* byteBufferEnd;
- number = WTF::strtod(byteBuffer.data(), &byteBufferEnd);
- const UChar* pastNumber = data + (byteBufferEnd - byteBuffer.data());
-
- if ((number || pastNumber != data) && !isInfinity(number))
- data = pastNumber;
- else {
- // We used strtod() to do the conversion. However, strtod() handles
- // infinite values slightly differently than JavaScript in that it
- // converts the string "inf" with any capitalization to infinity,
- // whereas the ECMA spec requires that it be converted to NaN.
-
- double signedInfinity = Inf;
- if (data < end) {
- if (*data == '+')
- data++;
- else if (*data == '-') {
- signedInfinity = -Inf;
- data++;
- }
- }
- if (isInfinity(data, end)) {
- number = signedInfinity;
- data += 8;
- } else if (isInfinity(number) && data < end && (*data | 0x20) != 'i')
- data = pastNumber;
- else
- return NaN;
- }
- }
-
- // Look for trailing junk.
- if (!tolerateTrailingJunk) {
- // Allow trailing white space.
- for (; data < end; ++data) {
- if (!isStrWhiteSpace(*data))
- break;
- }
- if (data != end)
- return NaN;
- }
-
- return number;
-}
-
-double UString::toDouble(bool tolerateTrailingJunk) const
-{
- return toDouble(tolerateTrailingJunk, true);
-}
-
-double UString::toDouble() const
-{
- return toDouble(false, true);
-}
-
-uint32_t UString::toUInt32(bool* ok) const
-{
- double d = toDouble();
- bool b = true;
-
- if (d != static_cast<uint32_t>(d)) {
- b = false;
- d = 0;
- }
-
- if (ok)
- *ok = b;
-
- return static_cast<uint32_t>(d);
-}
-
-uint32_t UString::toUInt32(bool* ok, bool tolerateEmptyString) const
-{
- double d = toDouble(false, tolerateEmptyString);
- bool b = true;
-
- if (d != static_cast<uint32_t>(d)) {
- b = false;
- d = 0;
- }
+ // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
- if (ok)
- *ok = b;
+ unsigned stringLength = this->length();
+ offset = min(offset, stringLength);
+ length = min(length, stringLength - offset);
- return static_cast<uint32_t>(d);
-}
-
-uint32_t UString::toStrictUInt32(bool* ok) const
-{
- if (ok)
- *ok = false;
-
- // Empty string is not OK.
- unsigned len = m_rep->length();
- if (len == 0)
- return 0;
- const UChar* p = m_rep->characters();
- unsigned short c = p[0];
-
- // If the first digit is 0, only 0 itself is OK.
- if (c == '0') {
- if (len == 1 && ok)
- *ok = true;
- return 0;
- }
-
- // Convert to UInt32, checking for overflow.
- uint32_t i = 0;
- while (1) {
- // Process character, turning it into a digit.
- if (c < '0' || c > '9')
- return 0;
- const unsigned d = c - '0';
-
- // Multiply by 10, checking for overflow out of 32 bits.
- if (i > 0xFFFFFFFFU / 10)
- return 0;
- i *= 10;
-
- // Add in the digit, checking for overflow out of 32 bits.
- const unsigned max = 0xFFFFFFFFU - d;
- if (i > max)
- return 0;
- i += d;
-
- // Handle end of string.
- if (--len == 0) {
- if (ok)
- *ok = true;
- return i;
- }
-
- // Get next character.
- c = *(++p);
- }
-}
-
-unsigned UString::find(const UString& f, unsigned pos) const
-{
- unsigned fsz = f.size();
-
- if (fsz == 1) {
- UChar ch = f[0];
- const UChar* end = data() + size();
- for (const UChar* c = data() + pos; c < end; c++) {
- if (*c == ch)
- return static_cast<unsigned>(c - data());
- }
- return NotFound;
- }
-
- unsigned sz = size();
- if (sz < fsz)
- return NotFound;
- if (fsz == 0)
- return pos;
- const UChar* end = data() + sz - fsz;
- unsigned fsizeminusone = (fsz - 1) * sizeof(UChar);
- const UChar* fdata = f.data();
- unsigned short fchar = fdata[0];
- ++fdata;
- for (const UChar* c = data() + pos; c <= end; c++) {
- if (c[0] == fchar && !memcmp(c + 1, fdata, fsizeminusone))
- return static_cast<unsigned>(c - data());
- }
-
- return NotFound;
-}
-
-unsigned UString::find(UChar ch, unsigned pos) const
-{
- const UChar* end = data() + size();
- for (const UChar* c = data() + pos; c < end; c++) {
- if (*c == ch)
- return static_cast<unsigned>(c - data());
- }
-
- return NotFound;
-}
-
-unsigned UString::rfind(const UString& f, unsigned pos) const
-{
- unsigned sz = size();
- unsigned fsz = f.size();
- if (sz < fsz)
- return NotFound;
- if (pos > sz - fsz)
- pos = sz - fsz;
- if (fsz == 0)
- return pos;
- unsigned fsizeminusone = (fsz - 1) * sizeof(UChar);
- const UChar* fdata = f.data();
- for (const UChar* c = data() + pos; c >= data(); c--) {
- if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
- return static_cast<unsigned>(c - data());
- }
-
- return NotFound;
-}
-
-unsigned UString::rfind(UChar ch, unsigned pos) const
-{
- if (isEmpty())
- return NotFound;
- if (pos + 1 >= size())
- pos = size() - 1;
- for (const UChar* c = data() + pos; c >= data(); c--) {
- if (*c == ch)
- return static_cast<unsigned>(c - data());
- }
-
- return NotFound;
-}
-
-UString UString::substr(unsigned pos, unsigned len) const
-{
- unsigned s = size();
-
- if (pos >= s)
- pos = s;
- unsigned limit = s - pos;
- if (len > limit)
- len = limit;
-
- if (pos == 0 && len == s)
+ if (!offset && length == stringLength)
return *this;
-
- return UString(Rep::create(m_rep, pos, len));
+ return UString(StringImpl::create(m_impl, offset, length));
}
bool operator==(const UString& s1, const char *s2)
@@ -560,8 +222,8 @@ bool operator==(const UString& s1, const char *s2)
if (s2 == 0)
return s1.isEmpty();
- const UChar* u = s1.data();
- const UChar* uend = u + s1.size();
+ const UChar* u = s1.characters();
+ const UChar* uend = u + s1.length();
while (u != uend && *s2) {
if (u[0] != (unsigned char)*s2)
return false;
@@ -574,11 +236,11 @@ bool operator==(const UString& s1, const char *s2)
bool operator<(const UString& s1, const UString& s2)
{
- const unsigned l1 = s1.size();
- const unsigned l2 = s2.size();
+ const unsigned l1 = s1.length();
+ const unsigned l2 = s2.length();
const unsigned lmin = l1 < l2 ? l1 : l2;
- const UChar* c1 = s1.data();
- const UChar* c2 = s2.data();
+ const UChar* c1 = s1.characters();
+ const UChar* c2 = s2.characters();
unsigned l = 0;
while (l < lmin && *c1 == *c2) {
c1++;
@@ -593,11 +255,11 @@ bool operator<(const UString& s1, const UString& s2)
bool operator>(const UString& s1, const UString& s2)
{
- const unsigned l1 = s1.size();
- const unsigned l2 = s2.size();
+ const unsigned l1 = s1.length();
+ const unsigned l2 = s2.length();
const unsigned lmin = l1 < l2 ? l1 : l2;
- const UChar* c1 = s1.data();
- const UChar* c2 = s2.data();
+ const UChar* c1 = s1.characters();
+ const UChar* c2 = s2.characters();
unsigned l = 0;
while (l < lmin && *c1 == *c2) {
c1++;
@@ -610,20 +272,94 @@ bool operator>(const UString& s1, const UString& s2)
return (l1 > l2);
}
-CString UString::UTF8String(bool strict) const
+CString UString::ascii() const
{
- // Allocate a buffer big enough to hold all the characters.
- const unsigned length = size();
- Vector<char, 1024> buffer(length * 3);
-
- // Convert to runs of 8-bit characters.
- char* p = buffer.data();
- const UChar* d = reinterpret_cast<const UChar*>(&data()[0]);
- ConversionResult result = convertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
- if (result != conversionOK)
+ // Printable 7-bit ASCII conversion - the null character and characters 0x20..0x7e
+ // are preserved, every other character is converted to '?'.
+
+ unsigned length = this->length();
+ const UChar* characters = this->characters();
+
+ char* characterBuffer;
+ CString result = CString::newUninitialized(length, characterBuffer); // presumably points characterBuffer at result's storage - confirm against CString
+
+ for (unsigned i = 0; i < length; ++i) {
+ UChar ch = characters[i];
+ characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch; // parses as (ch && (...)) ? '?' : ch, so a zero character passes through unchanged
+ }
+
+ return result;
+}
+
+CString UString::latin1() const
+{
+ // Basic Latin1 (ISO) encoding - characters 0..0xff are stored as a single
+ // byte each, characters above 0xff are converted to '?'.
+
+ unsigned length = this->length();
+ const UChar* characters = this->characters();
+
+ char* characterBuffer;
+ CString result = CString::newUninitialized(length, characterBuffer); // presumably points characterBuffer at result's storage - confirm against CString
+
+ for (unsigned i = 0; i < length; ++i) {
+ UChar ch = characters[i];
+ characterBuffer[i] = ch > 0xff ? '?' : ch; // one output byte per input character
+ }
+
+ return result;
+}
+
+// Helper to write a three-byte UTF-8 sequence for 'ch' to the buffer and advance 'buffer' past it; the caller must check that room is available.
+static inline void putUTF8Triple(char*& buffer, UChar ch)
+{
+ ASSERT(ch >= 0x0800);
+ *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); // lead byte: 1110xxxx, top 4 bits of ch
+ *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); // continuation byte: 10xxxxxx, middle 6 bits
+ *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); // continuation byte: 10xxxxxx, low 6 bits
+}
+
+// Returns this string converted to UTF-8 using convertUTF16ToUTF8().
+// A default-constructed CString is returned when the conversion reports
+// sourceIllegal, when a strict conversion stops on an unconverted character,
+// or when the string is too long for the worst-case buffer size to be computed.
+CString UString::utf8(bool strict) const
{
+ unsigned length = this->length();
+ const UChar* characters = this->characters();
+
+ // The worst-case buffer below is (length * 3) bytes; refuse lengths for
+ // which that product would wrap around in unsigned arithmetic.
+ if (length > static_cast<unsigned>(-1) / 3)
+ return CString();
+
+ // Allocate a buffer big enough to hold all the characters
+ // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
+ // Optimization ideas, if we find this function is hot:
+ // * We could speculatively create a CStringBuffer to contain 'length'
+ // characters, and resize if necessary (i.e. if the buffer contains
+ // non-ascii characters). (Alternatively, scan the buffer first for
+ // ascii characters, so we know this will be sufficient).
+ // * We could allocate a CStringBuffer with an appropriate size to
+ // have a good chance of being able to write the string into the
+ // buffer without reallocing (say, 1.5 x length).
+ Vector<char, 1024> bufferVector(length * 3);
+
+ // 'characters' and 'buffer' are passed by address, so after the call they
+ // point just past the last input consumed and the last byte written.
+ char* buffer = bufferVector.data();
+ ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
+ ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
+
+ // Only produced from strict conversion.
+ if (result == sourceIllegal)
return CString();
- return CString(buffer.data(), p - buffer.data());
+ // Check for an unconverted high surrogate.
+ if (result == sourceExhausted) {
+ if (strict)
+ return CString();
+ // This should be one unpaired high surrogate. Treat it the same
+ // way as an unpaired high surrogate would have been handled in
+ // the middle of a string with non-strict conversion - which is
+ // to say, simply encode it to UTF-8.
+ ASSERT((characters + 1) == (this->characters() + length));
+ ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
+ // There should be room left, since one UChar hasn't been converted.
+ // Compare against the end of the vector: 'buffer' has already advanced,
+ // so measuring from 'buffer' itself would make this check vacuous.
+ ASSERT((buffer + 3) <= (bufferVector.data() + bufferVector.size()));
+ putUTF8Triple(buffer, *characters);
+ }
+
+ return CString(bufferVector.data(), buffer - bufferVector.data());
}
} // namespace JSC