diff options
Diffstat (limited to 'JavaScriptCore/runtime/UString.cpp')
-rw-r--r-- | JavaScriptCore/runtime/UString.cpp | 516 |
1 files changed, 126 insertions, 390 deletions
diff --git a/JavaScriptCore/runtime/UString.cpp b/JavaScriptCore/runtime/UString.cpp index c442500..ac3acfd 100644 --- a/JavaScriptCore/runtime/UString.cpp +++ b/JavaScriptCore/runtime/UString.cpp @@ -54,36 +54,40 @@ namespace JSC { extern const double NaN; extern const double Inf; -// The null string is immutable, except for refCount. -UString* UString::s_nullUString; - COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small); -void initializeUString() +// Construct a string with UTF-16 data. +UString::UString(const UChar* characters, unsigned length) + : m_impl(characters ? StringImpl::create(characters, length) : 0) { - // UStringImpl::empty() does not construct its static string in a threadsafe fashion, - // so ensure it has been initialized from here. - UStringImpl::empty(); - - UString::s_nullUString = new UString; } -UString::UString(const char* c) - : m_rep(Rep::create(c)) +// Construct a string with UTF-16 data, from a null-terminated source. +UString::UString(const UChar* characters) { + if (!characters) + return; + + int length = 0; + while (characters[length] != UChar(0)) + ++length; + + m_impl = StringImpl::create(characters, length); } -UString::UString(const char* c, unsigned length) - : m_rep(Rep::create(c, length)) +// Construct a string with latin1 data. +UString::UString(const char* characters, unsigned length) + : m_impl(characters ? StringImpl::create(characters, length) : 0) { } -UString::UString(const UChar* c, unsigned length) - : m_rep(Rep::create(c, length)) +// Construct a string with latin1 data, from a null-terminated source. +UString::UString(const char* characters) + : m_impl(characters ? 
StringImpl::create(characters) : 0) { } -UString UString::from(int i) +UString UString::number(int i) { UChar buf[1 + sizeof(i) * 3]; UChar* end = buf + sizeof(buf) / sizeof(UChar); @@ -112,7 +116,7 @@ UString UString::from(int i) return UString(p, static_cast<unsigned>(end - p)); } -UString UString::from(long long i) +UString UString::number(long long i) { UChar buf[1 + sizeof(i) * 3]; UChar* end = buf + sizeof(buf) / sizeof(UChar); @@ -145,7 +149,7 @@ UString UString::from(long long i) return UString(p, static_cast<unsigned>(end - p)); } -UString UString::from(unsigned u) +UString UString::number(unsigned u) { UChar buf[sizeof(u) * 3]; UChar* end = buf + sizeof(buf) / sizeof(UChar); @@ -163,7 +167,7 @@ UString UString::from(unsigned u) return UString(p, static_cast<unsigned>(end - p)); } -UString UString::from(long l) +UString UString::number(long l) { UChar buf[1 + sizeof(l) * 3]; UChar* end = buf + sizeof(buf) / sizeof(UChar); @@ -192,7 +196,7 @@ UString UString::from(long l) return UString(p, end - p); } -UString UString::from(double d) +UString UString::number(double d) { DtoaBuffer buffer; unsigned length; @@ -200,359 +204,17 @@ UString UString::from(double d) return UString(buffer, length); } -char* UString::ascii() const -{ - static char* asciiBuffer = 0; - - unsigned length = size(); - unsigned neededSize = length + 1; - delete[] asciiBuffer; - asciiBuffer = new char[neededSize]; - - const UChar* p = data(); - char* q = asciiBuffer; - const UChar* limit = p + length; - while (p != limit) { - *q = static_cast<char>(p[0]); - ++p; - ++q; - } - *q = '\0'; - - return asciiBuffer; -} - -bool UString::is8Bit() const -{ - const UChar* u = data(); - const UChar* limit = u + size(); - while (u < limit) { - if (u[0] > 0xFF) - return false; - ++u; - } - - return true; -} - -UChar UString::operator[](unsigned pos) const +UString UString::substringSharingImpl(unsigned offset, unsigned length) const { - if (pos >= size()) - return '\0'; - return data()[pos]; -} - 
-static inline bool isInfinity(double number) -{ - return number == Inf || number == -Inf; -} - -static bool isInfinity(const UChar* data, const UChar* end) -{ - return data + 7 < end - && data[0] == 'I' - && data[1] == 'n' - && data[2] == 'f' - && data[3] == 'i' - && data[4] == 'n' - && data[5] == 'i' - && data[6] == 't' - && data[7] == 'y'; -} - -double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const -{ - unsigned size = this->size(); - - if (size == 1) { - UChar c = data()[0]; - if (isASCIIDigit(c)) - return c - '0'; - if (isStrWhiteSpace(c) && tolerateEmptyString) - return 0; - return NaN; - } - - // FIXME: If tolerateTrailingJunk is true, then we want to tolerate junk - // after the number, even if it contains invalid UTF-16 sequences. So we - // shouldn't use the UTF8String function, which returns null when it - // encounters invalid UTF-16. Further, we have no need to convert the - // non-ASCII characters to UTF-8, so the UTF8String does quite a bit of - // unnecessary work. - - // FIXME: The space skipping code below skips only ASCII spaces, but callers - // need to skip all StrWhiteSpace. The isStrWhiteSpace function does the - // right thing but requires UChar, not char, for its argument. - - const UChar* data = this->data(); - const UChar* end = data + size; - - // Skip leading white space. - for (; data < end; ++data) { - if (!isStrWhiteSpace(*data)) - break; - } - - // Empty string. - if (data == end) - return tolerateEmptyString ? 0.0 : NaN; - - double number; - - if (data[0] == '0' && data + 2 < end && (data[1] | 0x20) == 'x' && isASCIIHexDigit(data[2])) { - // Hex number. 
- data += 2; - const UChar* firstDigitPosition = data; - number = 0; - while (true) { - number = number * 16 + toASCIIHexValue(*data); - ++data; - if (data == end) - break; - if (!isASCIIHexDigit(*data)) - break; - } - if (number >= mantissaOverflowLowerBound) - number = parseIntOverflow(firstDigitPosition, data - firstDigitPosition, 16); - } else { - // Decimal number. - - // Put into a null-terminated byte buffer. - Vector<char, 32> byteBuffer; - for (const UChar* characters = data; characters < end; ++characters) { - UChar character = *characters; - byteBuffer.append(isASCII(character) ? character : 0); - } - byteBuffer.append(0); - - char* byteBufferEnd; - number = WTF::strtod(byteBuffer.data(), &byteBufferEnd); - const UChar* pastNumber = data + (byteBufferEnd - byteBuffer.data()); - - if ((number || pastNumber != data) && !isInfinity(number)) - data = pastNumber; - else { - // We used strtod() to do the conversion. However, strtod() handles - // infinite values slightly differently than JavaScript in that it - // converts the string "inf" with any capitalization to infinity, - // whereas the ECMA spec requires that it be converted to NaN. - - double signedInfinity = Inf; - if (data < end) { - if (*data == '+') - data++; - else if (*data == '-') { - signedInfinity = -Inf; - data++; - } - } - if (isInfinity(data, end)) { - number = signedInfinity; - data += 8; - } else if (isInfinity(number) && data < end && (*data | 0x20) != 'i') - data = pastNumber; - else - return NaN; - } - } - - // Look for trailing junk. - if (!tolerateTrailingJunk) { - // Allow trailing white space. 
- for (; data < end; ++data) { - if (!isStrWhiteSpace(*data)) - break; - } - if (data != end) - return NaN; - } - - return number; -} - -double UString::toDouble(bool tolerateTrailingJunk) const -{ - return toDouble(tolerateTrailingJunk, true); -} - -double UString::toDouble() const -{ - return toDouble(false, true); -} - -uint32_t UString::toUInt32(bool* ok) const -{ - double d = toDouble(); - bool b = true; - - if (d != static_cast<uint32_t>(d)) { - b = false; - d = 0; - } - - if (ok) - *ok = b; - - return static_cast<uint32_t>(d); -} - -uint32_t UString::toUInt32(bool* ok, bool tolerateEmptyString) const -{ - double d = toDouble(false, tolerateEmptyString); - bool b = true; - - if (d != static_cast<uint32_t>(d)) { - b = false; - d = 0; - } + // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar). - if (ok) - *ok = b; + unsigned stringLength = this->length(); + offset = min(offset, stringLength); + length = min(length, stringLength - offset); - return static_cast<uint32_t>(d); -} - -uint32_t UString::toStrictUInt32(bool* ok) const -{ - if (ok) - *ok = false; - - // Empty string is not OK. - unsigned len = m_rep->length(); - if (len == 0) - return 0; - const UChar* p = m_rep->characters(); - unsigned short c = p[0]; - - // If the first digit is 0, only 0 itself is OK. - if (c == '0') { - if (len == 1 && ok) - *ok = true; - return 0; - } - - // Convert to UInt32, checking for overflow. - uint32_t i = 0; - while (1) { - // Process character, turning it into a digit. - if (c < '0' || c > '9') - return 0; - const unsigned d = c - '0'; - - // Multiply by 10, checking for overflow out of 32 bits. - if (i > 0xFFFFFFFFU / 10) - return 0; - i *= 10; - - // Add in the digit, checking for overflow out of 32 bits. - const unsigned max = 0xFFFFFFFFU - d; - if (i > max) - return 0; - i += d; - - // Handle end of string. - if (--len == 0) { - if (ok) - *ok = true; - return i; - } - - // Get next character. 
- c = *(++p); - } -} - -unsigned UString::find(const UString& f, unsigned pos) const -{ - unsigned fsz = f.size(); - - if (fsz == 1) { - UChar ch = f[0]; - const UChar* end = data() + size(); - for (const UChar* c = data() + pos; c < end; c++) { - if (*c == ch) - return static_cast<unsigned>(c - data()); - } - return NotFound; - } - - unsigned sz = size(); - if (sz < fsz) - return NotFound; - if (fsz == 0) - return pos; - const UChar* end = data() + sz - fsz; - unsigned fsizeminusone = (fsz - 1) * sizeof(UChar); - const UChar* fdata = f.data(); - unsigned short fchar = fdata[0]; - ++fdata; - for (const UChar* c = data() + pos; c <= end; c++) { - if (c[0] == fchar && !memcmp(c + 1, fdata, fsizeminusone)) - return static_cast<unsigned>(c - data()); - } - - return NotFound; -} - -unsigned UString::find(UChar ch, unsigned pos) const -{ - const UChar* end = data() + size(); - for (const UChar* c = data() + pos; c < end; c++) { - if (*c == ch) - return static_cast<unsigned>(c - data()); - } - - return NotFound; -} - -unsigned UString::rfind(const UString& f, unsigned pos) const -{ - unsigned sz = size(); - unsigned fsz = f.size(); - if (sz < fsz) - return NotFound; - if (pos > sz - fsz) - pos = sz - fsz; - if (fsz == 0) - return pos; - unsigned fsizeminusone = (fsz - 1) * sizeof(UChar); - const UChar* fdata = f.data(); - for (const UChar* c = data() + pos; c >= data(); c--) { - if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone)) - return static_cast<unsigned>(c - data()); - } - - return NotFound; -} - -unsigned UString::rfind(UChar ch, unsigned pos) const -{ - if (isEmpty()) - return NotFound; - if (pos + 1 >= size()) - pos = size() - 1; - for (const UChar* c = data() + pos; c >= data(); c--) { - if (*c == ch) - return static_cast<unsigned>(c - data()); - } - - return NotFound; -} - -UString UString::substr(unsigned pos, unsigned len) const -{ - unsigned s = size(); - - if (pos >= s) - pos = s; - unsigned limit = s - pos; - if (len > limit) - len = limit; - - 
if (pos == 0 && len == s) + if (!offset && length == stringLength) return *this; - - return UString(Rep::create(m_rep, pos, len)); + return UString(StringImpl::create(m_impl, offset, length)); } bool operator==(const UString& s1, const char *s2) @@ -560,8 +222,8 @@ bool operator==(const UString& s1, const char *s2) if (s2 == 0) return s1.isEmpty(); - const UChar* u = s1.data(); - const UChar* uend = u + s1.size(); + const UChar* u = s1.characters(); + const UChar* uend = u + s1.length(); while (u != uend && *s2) { if (u[0] != (unsigned char)*s2) return false; @@ -574,11 +236,11 @@ bool operator==(const UString& s1, const char *s2) bool operator<(const UString& s1, const UString& s2) { - const unsigned l1 = s1.size(); - const unsigned l2 = s2.size(); + const unsigned l1 = s1.length(); + const unsigned l2 = s2.length(); const unsigned lmin = l1 < l2 ? l1 : l2; - const UChar* c1 = s1.data(); - const UChar* c2 = s2.data(); + const UChar* c1 = s1.characters(); + const UChar* c2 = s2.characters(); unsigned l = 0; while (l < lmin && *c1 == *c2) { c1++; @@ -593,11 +255,11 @@ bool operator<(const UString& s1, const UString& s2) bool operator>(const UString& s1, const UString& s2) { - const unsigned l1 = s1.size(); - const unsigned l2 = s2.size(); + const unsigned l1 = s1.length(); + const unsigned l2 = s2.length(); const unsigned lmin = l1 < l2 ? l1 : l2; - const UChar* c1 = s1.data(); - const UChar* c2 = s2.data(); + const UChar* c1 = s1.characters(); + const UChar* c2 = s2.characters(); unsigned l = 0; while (l < lmin && *c1 == *c2) { c1++; @@ -610,20 +272,94 @@ bool operator>(const UString& s1, const UString& s2) return (l1 > l2); } -CString UString::UTF8String(bool strict) const +CString UString::ascii() const { - // Allocate a buffer big enough to hold all the characters. - const unsigned length = size(); - Vector<char, 1024> buffer(length * 3); - - // Convert to runs of 8-bit characters. 
- char* p = buffer.data(); - const UChar* d = reinterpret_cast<const UChar*>(&data()[0]); - ConversionResult result = convertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict); - if (result != conversionOK) + // Printable ASCII encoding - NUL and characters 0x20..0x7e are + // preserved, characters outside of this range are converted to '?'. + + unsigned length = this->length(); + const UChar* characters = this->characters(); + + char* characterBuffer; + CString result = CString::newUninitialized(length, characterBuffer); + + for (unsigned i = 0; i < length; ++i) { + UChar ch = characters[i]; + characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch; + } + + return result; +} + +CString UString::latin1() const +{ + // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are + // preserved, characters outside of this range are converted to '?'. + + unsigned length = this->length(); + const UChar* characters = this->characters(); + + char* characterBuffer; + CString result = CString::newUninitialized(length, characterBuffer); + + for (unsigned i = 0; i < length; ++i) { + UChar ch = characters[i]; + characterBuffer[i] = ch > 0xff ? '?' : ch; + } + + return result; +} + +// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. +static inline void putUTF8Triple(char*& buffer, UChar ch) +{ + ASSERT(ch >= 0x0800); + *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); + *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); + *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); +} + +CString UString::utf8(bool strict) const +{ + unsigned length = this->length(); + const UChar* characters = this->characters(); + + // Allocate a buffer big enough to hold all the characters + // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
+ // Optimization ideas, if we find this function is hot: + // * We could speculatively create a CStringBuffer to contain 'length' + // characters, and resize if necessary (i.e. if the buffer contains + // non-ascii characters). (Alternatively, scan the buffer first for + // ascii characters, so we know this will be sufficient). + // * We could allocate a CStringBuffer with an appropriate size to + // have a good chance of being able to write the string into the + // buffer without reallocing (say, 1.5 x length). + Vector<char, 1024> bufferVector(length * 3); + + char* buffer = bufferVector.data(); + ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); + ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion + + // Only produced from strict conversion. + if (result == sourceIllegal) return CString(); - return CString(buffer.data(), p - buffer.data()); + // Check for an unconverted high surrogate. + if (result == sourceExhausted) { + if (strict) + return CString(); + // This should be one unpaired high surrogate. Treat it the same + // way as an unpaired high surrogate would have been handled in + // the middle of a string with non-strict conversion - which is + // to say, simply encode it to UTF-8. + ASSERT((characters + 1) == (this->characters() + length)); + ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); + // There should be room left, since one UChar hasn't been converted. + ASSERT((buffer + 3) <= (buffer + bufferVector.size())); + putUTF8Triple(buffer, *characters); + } + + return CString(bufferVector.data(), buffer - bufferVector.data()); } } // namespace JSC |