1 files changed, 126 insertions, 390 deletions
diff --git a/JavaScriptCore/runtime/UString.cpp b/JavaScriptCore/runtime/UString.cpp
index c442500..ac3acfd 100644
--- a/JavaScriptCore/runtime/UString.cpp
+++ b/JavaScriptCore/runtime/UString.cpp
@@ -54,36 +54,40 @@ namespace JSC {
 extern const double NaN;
 extern const double Inf;
 
-// The null string is immutable, except for refCount.
-UString* UString::s_nullUString;
-
 COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
 
-void initializeUString()
+// Construct a string from `length` UTF-16 code units; a null `characters` pointer leaves m_impl null.
+UString::UString(const UChar* characters, unsigned length)
+    : m_impl(characters ? StringImpl::create(characters, length) : 0)
 {
-    // UStringImpl::empty() does not construct its static string in a threadsafe fashion,
-    // so ensure it has been initialized from here.
-    UStringImpl::empty();
-
-    UString::s_nullUString = new UString;
 }
 
-UString::UString(const char* c)
-    : m_rep(Rep::create(c))
+// Construct a string from a null-terminated UTF-16 source; a null pointer leaves m_impl default-constructed.
+UString::UString(const UChar* characters)
 {
+    if (!characters)
+        return;
+
+    unsigned length = 0; // Unsigned, like the lengths taken by the other constructors; avoids signed overflow.
+    while (characters[length] != UChar(0))
+        ++length;
+
+    m_impl = StringImpl::create(characters, length);
 }
 
-UString::UString(const char* c, unsigned length)
-    : m_rep(Rep::create(c, length))
+// Construct a string from `length` latin1 characters; a null `characters` pointer leaves m_impl null.
+UString::UString(const char* characters, unsigned length)
+    : m_impl(characters ? StringImpl::create(characters, length) : 0)
 {
 }
 
-UString::UString(const UChar* c, unsigned length)
-    : m_rep(Rep::create(c, length))
+// Construct a string from a null-terminated latin1 source; a null `characters` pointer leaves m_impl null.
+UString::UString(const char* characters)
+    : m_impl(characters ? StringImpl::create(characters) : 0)
 {
 }
 
-UString UString::from(int i)
+UString UString::number(int i)
 {
     UChar buf[1 + sizeof(i) * 3];
     UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -112,7 +116,7 @@ UString UString::from(int i)
     return UString(p, static_cast<unsigned>(end - p));
 }
 
-UString UString::from(long long i)
+UString UString::number(long long i)
 {
     UChar buf[1 + sizeof(i) * 3];
     UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -145,7 +149,7 @@ UString UString::from(long long i)
     return UString(p, static_cast<unsigned>(end - p));
 }
 
-UString UString::from(unsigned u)
+UString UString::number(unsigned u)
 {
     UChar buf[sizeof(u) * 3];
     UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -163,7 +167,7 @@ UString UString::from(unsigned u)
     return UString(p, static_cast<unsigned>(end - p));
 }
 
-UString UString::from(long l)
+UString UString::number(long l)
 {
     UChar buf[1 + sizeof(l) * 3];
     UChar* end = buf + sizeof(buf) / sizeof(UChar);
@@ -192,7 +196,7 @@ UString UString::from(long l)
     return UString(p, end - p);
 }
 
-UString UString::from(double d)
+UString UString::number(double d)
 {
     DtoaBuffer buffer;
     unsigned length;
@@ -200,359 +204,17 @@ UString UString::from(double d)
     return UString(buffer, length);
 }
 
-char* UString::ascii() const
-{
-    static char* asciiBuffer = 0;
-
-    unsigned length = size();
-    unsigned neededSize = length + 1;
-    delete[] asciiBuffer;
-    asciiBuffer = new char[neededSize];
-
-    const UChar* p = data();
-    char* q = asciiBuffer;
-    const UChar* limit = p + length;
-    while (p != limit) {
-        *q = static_cast<char>(p[0]);
-        ++p;
-        ++q;
-    }
-    *q = '\0';
-
-    return asciiBuffer;
-}
-
-bool UString::is8Bit() const
-{
-    const UChar* u = data();
-    const UChar* limit = u + size();
-    while (u < limit) {
-        if (u[0] > 0xFF)
-            return false;
-        ++u;
-    }
-
-    return true;
-}
-
-UChar UString::operator[](unsigned pos) const
+UString UString::substringSharingImpl(unsigned offset, unsigned length) const
 {
-    if (pos >= size())
-        return '\0';
-    return data()[pos];
-}
-
-static inline bool isInfinity(double number)
-{
-    return number == Inf || number == -Inf;
-}
-
-static bool isInfinity(const UChar* data, const UChar* end)
-{
-    return data + 7 < end
-        && data[0] == 'I'
-        && data[1] == 'n'
-        && data[2] == 'f'
-        && data[3] == 'i'
-        && data[4] == 'n'
-        && data[5] == 'i'
-        && data[6] == 't'
-        && data[7] == 'y';
-}
-
-double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
-{
-    unsigned size = this->size();
-
-    if (size == 1) {
-        UChar c = data()[0];
-        if (isASCIIDigit(c))
-            return c - '0';
-        if (isStrWhiteSpace(c) && tolerateEmptyString)
-            return 0;
-        return NaN;
-    }
-
-    // FIXME: If tolerateTrailingJunk is true, then we want to tolerate junk 
-    // after the number, even if it contains invalid UTF-16 sequences. So we
-    // shouldn't use the UTF8String function, which returns null when it
-    // encounters invalid UTF-16. Further, we have no need to convert the
-    // non-ASCII characters to UTF-8, so the UTF8String does quite a bit of
-    // unnecessary work.
-
-    // FIXME: The space skipping code below skips only ASCII spaces, but callers
-    // need to skip all StrWhiteSpace. The isStrWhiteSpace function does the
-    // right thing but requires UChar, not char, for its argument.
-
-    const UChar* data = this->data();
-    const UChar* end = data + size;
-
-    // Skip leading white space.
-    for (; data < end; ++data) {
-        if (!isStrWhiteSpace(*data))
-            break;
-    }
-
-    // Empty string.
-    if (data == end)
-        return tolerateEmptyString ? 0.0 : NaN;
-
-    double number;
-
-    if (data[0] == '0' && data + 2 < end && (data[1] | 0x20) == 'x' && isASCIIHexDigit(data[2])) {
-        // Hex number.
-        data += 2;
-        const UChar* firstDigitPosition = data;
-        number = 0;
-        while (true) {
-            number = number * 16 + toASCIIHexValue(*data);
-            ++data;
-            if (data == end)
-                break;
-            if (!isASCIIHexDigit(*data))
-                break;
-        }
-        if (number >= mantissaOverflowLowerBound)
-            number = parseIntOverflow(firstDigitPosition, data - firstDigitPosition, 16);
-    } else {
-        // Decimal number.
-
-        // Put into a null-terminated byte buffer.
-        Vector<char, 32> byteBuffer;
-        for (const UChar* characters = data; characters < end; ++characters) {
-            UChar character = *characters;
-            byteBuffer.append(isASCII(character) ? character : 0);
-        }
-        byteBuffer.append(0);
-
-        char* byteBufferEnd;
-        number = WTF::strtod(byteBuffer.data(), &byteBufferEnd);
-        const UChar* pastNumber = data + (byteBufferEnd - byteBuffer.data());
-
-        if ((number || pastNumber != data) && !isInfinity(number))
-            data = pastNumber;
-        else {
-            // We used strtod() to do the conversion. However, strtod() handles
-            // infinite values slightly differently than JavaScript in that it
-            // converts the string "inf" with any capitalization to infinity,
-            // whereas the ECMA spec requires that it be converted to NaN.
-
-            double signedInfinity = Inf;
-            if (data < end) {
-                if (*data == '+')
-                    data++;
-                else if (*data == '-') {
-                    signedInfinity = -Inf;
-                    data++;
-                }
-            }
-            if (isInfinity(data, end)) {
-                number = signedInfinity;
-                data += 8;
-            } else if (isInfinity(number) && data < end && (*data | 0x20) != 'i')
-                data = pastNumber;
-            else
-                return NaN;
-        }
-    }
-
-    // Look for trailing junk.
-    if (!tolerateTrailingJunk) {
-        // Allow trailing white space.
-        for (; data < end; ++data) {
-            if (!isStrWhiteSpace(*data))
-                break;
-        }
-        if (data != end)
-            return NaN;
-    }
-
-    return number;
-}
-
-double UString::toDouble(bool tolerateTrailingJunk) const
-{
-    return toDouble(tolerateTrailingJunk, true);
-}
-
-double UString::toDouble() const
-{
-    return toDouble(false, true);
-}
-
-uint32_t UString::toUInt32(bool* ok) const
-{
-    double d = toDouble();
-    bool b = true;
-
-    if (d != static_cast<uint32_t>(d)) {
-        b = false;
-        d = 0;
-    }
-
-    if (ok)
-        *ok = b;
-
-    return static_cast<uint32_t>(d);
-}
-
-uint32_t UString::toUInt32(bool* ok, bool tolerateEmptyString) const
-{
-    double d = toDouble(false, tolerateEmptyString);
-    bool b = true;
-
-    if (d != static_cast<uint32_t>(d)) {
-        b = false;
-        d = 0;
-    }
+    // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
 
-    if (ok)
-        *ok = b;
+    unsigned stringLength = this->length();
+    offset = min(offset, stringLength);
+    length = min(length, stringLength - offset);
 
-    return static_cast<uint32_t>(d);
-}
-
-uint32_t UString::toStrictUInt32(bool* ok) const
-{
-    if (ok)
-        *ok = false;
-
-    // Empty string is not OK.
-    unsigned len = m_rep->length();
-    if (len == 0)
-        return 0;
-    const UChar* p = m_rep->characters();
-    unsigned short c = p[0];
-
-    // If the first digit is 0, only 0 itself is OK.
-    if (c == '0') {
-        if (len == 1 && ok)
-            *ok = true;
-        return 0;
-    }
-
-    // Convert to UInt32, checking for overflow.
-    uint32_t i = 0;
-    while (1) {
-        // Process character, turning it into a digit.
-        if (c < '0' || c > '9')
-            return 0;
-        const unsigned d = c - '0';
-
-        // Multiply by 10, checking for overflow out of 32 bits.
-        if (i > 0xFFFFFFFFU / 10)
-            return 0;
-        i *= 10;
-
-        // Add in the digit, checking for overflow out of 32 bits.
-        const unsigned max = 0xFFFFFFFFU - d;
-        if (i > max)
-            return 0;
-        i += d;
-
-        // Handle end of string.
-        if (--len == 0) {
-            if (ok)
-                *ok = true;
-            return i;
-        }
-
-        // Get next character.
-        c = *(++p);
-    }
-}
-
-unsigned UString::find(const UString& f, unsigned pos) const
-{
-    unsigned fsz = f.size();
-
-    if (fsz == 1) {
-        UChar ch = f[0];
-        const UChar* end = data() + size();
-        for (const UChar* c = data() + pos; c < end; c++) {
-            if (*c == ch)
-                return static_cast<unsigned>(c - data());
-        }
-        return NotFound;
-    }
-
-    unsigned sz = size();
-    if (sz < fsz)
-        return NotFound;
-    if (fsz == 0)
-        return pos;
-    const UChar* end = data() + sz - fsz;
-    unsigned fsizeminusone = (fsz - 1) * sizeof(UChar);
-    const UChar* fdata = f.data();
-    unsigned short fchar = fdata[0];
-    ++fdata;
-    for (const UChar* c = data() + pos; c <= end; c++) {
-        if (c[0] == fchar && !memcmp(c + 1, fdata, fsizeminusone))
-            return static_cast<unsigned>(c - data());
-    }
-
-    return NotFound;
-}
-
-unsigned UString::find(UChar ch, unsigned pos) const
-{
-    const UChar* end = data() + size();
-    for (const UChar* c = data() + pos; c < end; c++) {
-        if (*c == ch)
-            return static_cast<unsigned>(c - data());
-    }
-
-    return NotFound;
-}
-
-unsigned UString::rfind(const UString& f, unsigned pos) const
-{
-    unsigned sz = size();
-    unsigned fsz = f.size();
-    if (sz < fsz)
-        return NotFound;
-    if (pos > sz - fsz)
-        pos = sz - fsz;
-    if (fsz == 0)
-        return pos;
-    unsigned fsizeminusone = (fsz - 1) * sizeof(UChar);
-    const UChar* fdata = f.data();
-    for (const UChar* c = data() + pos; c >= data(); c--) {
-        if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
-            return static_cast<unsigned>(c - data());
-    }
-
-    return NotFound;
-}
-
-unsigned UString::rfind(UChar ch, unsigned pos) const
-{
-    if (isEmpty())
-        return NotFound;
-    if (pos + 1 >= size())
-        pos = size() - 1;
-    for (const UChar* c = data() + pos; c >= data(); c--) {
-        if (*c == ch)
-            return static_cast<unsigned>(c - data());
-    }
-
-    return NotFound;
-}
-
-UString UString::substr(unsigned pos, unsigned len) const
-{
-    unsigned s = size();
-
-    if (pos >= s)
-        pos = s;
-    unsigned limit = s - pos;
-    if (len > limit)
-        len = limit;
-
-    if (pos == 0 && len == s)
+    if (!offset && length == stringLength)
         return *this;
-
-    return UString(Rep::create(m_rep, pos, len));
+    return UString(StringImpl::create(m_impl, offset, length));
 }
 
 bool operator==(const UString& s1, const char *s2)
@@ -560,8 +222,8 @@ bool operator==(const UString& s1, const char *s2)
     if (s2 == 0)
         return s1.isEmpty();
 
-    const UChar* u = s1.data();
-    const UChar* uend = u + s1.size();
+    const UChar* u = s1.characters();
+    const UChar* uend = u + s1.length();
     while (u != uend && *s2) {
         if (u[0] != (unsigned char)*s2)
             return false;
@@ -574,11 +236,11 @@ bool operator==(const UString& s1, const char *s2)
 
 bool operator<(const UString& s1, const UString& s2)
 {
-    const unsigned l1 = s1.size();
-    const unsigned l2 = s2.size();
+    const unsigned l1 = s1.length();
+    const unsigned l2 = s2.length();
     const unsigned lmin = l1 < l2 ? l1 : l2;
-    const UChar* c1 = s1.data();
-    const UChar* c2 = s2.data();
+    const UChar* c1 = s1.characters();
+    const UChar* c2 = s2.characters();
     unsigned l = 0;
     while (l < lmin && *c1 == *c2) {
         c1++;
@@ -593,11 +255,11 @@ bool operator<(const UString& s1, const UString& s2)
 
 bool operator>(const UString& s1, const UString& s2)
 {
-    const unsigned l1 = s1.size();
-    const unsigned l2 = s2.size();
+    const unsigned l1 = s1.length();
+    const unsigned l2 = s2.length();
     const unsigned lmin = l1 < l2 ? l1 : l2;
-    const UChar* c1 = s1.data();
-    const UChar* c2 = s2.data();
+    const UChar* c1 = s1.characters();
+    const UChar* c2 = s2.characters();
     unsigned l = 0;
     while (l < lmin && *c1 == *c2) {
         c1++;
@@ -610,20 +272,94 @@ bool operator>(const UString& s1, const UString& s2)
     return (l1 > l2);
 }
 
-CString UString::UTF8String(bool strict) const
+CString UString::ascii() const
 {
-    // Allocate a buffer big enough to hold all the characters.
-    const unsigned length = size();
-    Vector<char, 1024> buffer(length * 3);
-
-    // Convert to runs of 8-bit characters.
-    char* p = buffer.data();
-    const UChar* d = reinterpret_cast<const UChar*>(&data()[0]);
-    ConversionResult result = convertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
-    if (result != conversionOK)
+    // Printable 7-bit ASCII conversion - null characters and the printable range
+    // 0x20..0x7e are preserved, every other character is converted to '?'.
+
+    unsigned length = this->length();
+    const UChar* characters = this->characters();
+
+    char* characterBuffer;
+    CString result = CString::newUninitialized(length, characterBuffer);
+
+    for (unsigned i = 0; i < length; ++i) {
+        UChar ch = characters[i];
+        characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
+    }
+
+    return result;
+}
+
+CString UString::latin1() const
+{
+    // Latin-1 (ISO-8859-1) conversion: code units up to 0xff are copied through
+    // unchanged, while every code unit above that range is replaced by '?'.
+
+    const UChar* source = this->characters();
+    const unsigned sourceLength = this->length();
+
+    char* destination;
+    CString converted = CString::newUninitialized(sourceLength, destination);
+
+    for (unsigned index = 0; index < sourceLength; ++index) {
+        const UChar codeUnit = source[index];
+        destination[index] = codeUnit <= 0xff ? codeUnit : '?';
+    }
+
+    return converted;
+}
+
+// Encodes ch as a three-byte UTF-8 sequence at buffer and advances buffer past it; the caller guarantees the room.
+static inline void putUTF8Triple(char*& buffer, UChar ch)
+{
+    ASSERT(ch >= 0x0800);
+    *buffer++ = static_cast<char>(0xE0 | ((ch >> 12) & 0x0F));
+    *buffer++ = static_cast<char>(0x80 | ((ch >> 6) & 0x3F));
+    *buffer++ = static_cast<char>(0x80 | (ch & 0x3F));
+}
+
+CString UString::utf8(bool strict) const
+{
+    unsigned length = this->length();
+    const UChar* characters = this->characters();
+
+    // Allocate a buffer big enough to hold all the characters
+    // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
+    // Optimization ideas, if we find this function is hot:
+    //  * We could speculatively create a CStringBuffer to contain 'length'
+    //    characters, and resize if necessary (i.e. if the buffer contains
+    //    non-ascii characters). (Alternatively, scan the buffer first for
+    //    ascii characters, so we know this will be sufficient).
+    //  * We could allocate a CStringBuffer with an appropriate size to
+    //    have a good chance of being able to write the string into the
+    //    buffer without reallocing (say, 1.5 x length).
+    Vector<char, 1024> bufferVector(length * 3);
+
+    char* buffer = bufferVector.data();
+    ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
+    ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
+
+    // Only produced from strict conversion; reported to the caller as a default-constructed CString.
+    if (result == sourceIllegal)
         return CString();
 
-    return CString(buffer.data(), p - buffer.data());
+    // Check for an unconverted high surrogate.
+    if (result == sourceExhausted) {
+        if (strict)
+            return CString();
+        // This should be one unpaired high surrogate. Treat it the same
+        // way as an unpaired high surrogate would have been handled in
+        // the middle of a string with non-strict conversion - which is
+        // to say, simply encode it to UTF-8.
+        ASSERT((characters + 1) == (this->characters() + length));
+        ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
+        // One UChar hasn't been converted, so 3 bytes must remain before the real end of the storage (buffer has advanced).
+        ASSERT((buffer + 3) <= (bufferVector.data() + bufferVector.size()));
+        putUTF8Triple(buffer, *characters);
+    }
 
+    return CString(bufferVector.data(), buffer - bufferVector.data());
 }
 
 } // namespace JSC