diff options
| author | Kenny Root <kroot@google.com> | 2010-11-09 14:37:23 -0800 | 
|---|---|---|
| committer | Alex Ray <aray@google.com> | 2013-07-30 13:56:55 -0700 | 
| commit | ba0165bef09729a33ab8e0ca329342be05e0d859 (patch) | |
| tree | 2f961146c3c8c625a01902207485fed0d7a2cdcd /libs/utils | |
| parent | 3a91fca00c6b3db62b4dc0da95ba30671caf3283 (diff) | |
| download | system_core-ba0165bef09729a33ab8e0ca329342be05e0d859.zip system_core-ba0165bef09729a33ab8e0ca329342be05e0d859.tar.gz system_core-ba0165bef09729a33ab8e0ca329342be05e0d859.tar.bz2 | |
Split UTF functions from String8/16
Split out all the UTF-8/16/32 handling code from String8/16 to its own
file to allow better reuse of code.
Change-Id: If9ce63920edc75472c38da4adce0d13cda9ad2f7
Diffstat (limited to 'libs/utils')
| -rw-r--r-- | libs/utils/Android.mk | 1 | ||||
| -rw-r--r-- | libs/utils/ResourceTypes.cpp | 99 | ||||
| -rw-r--r-- | libs/utils/String16.cpp | 253 | ||||
| -rw-r--r-- | libs/utils/String8.cpp | 391 | ||||
| -rw-r--r-- | libs/utils/Unicode.cpp | 575 | ||||
| -rw-r--r-- | libs/utils/tests/Android.mk | 3 | ||||
| -rw-r--r-- | libs/utils/tests/Unicode_test.cpp | 115 | 
7 files changed, 814 insertions, 623 deletions
| diff --git a/libs/utils/Android.mk b/libs/utils/Android.mk index eb75ed8..05a9674 100644 --- a/libs/utils/Android.mk +++ b/libs/utils/Android.mk @@ -41,6 +41,7 @@ commonSources:= \  	TextOutput.cpp \  	Threads.cpp \  	Timers.cpp \ +	Unicode.cpp \  	VectorImpl.cpp \  	ZipFileCRO.cpp \  	ZipFileRO.cpp \ diff --git a/libs/utils/ResourceTypes.cpp b/libs/utils/ResourceTypes.cpp index f287298..bbf5093 100644 --- a/libs/utils/ResourceTypes.cpp +++ b/libs/utils/ResourceTypes.cpp @@ -444,15 +444,51 @@ void ResStringPool::uninit()      }  } -#define DECODE_LENGTH(str, chrsz, len) \ -    len = *(str); \ -    if (*(str)&(1<<(chrsz*8-1))) { \ -        (str)++; \ -        len = (((len)&((1<<(chrsz*8-1))-1))<<(chrsz*8)) + *(str); \ -    } \ -    (str)++; - -const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const +/** + * Strings in UTF-16 format have length indicated by a length encoded in the + * stored data. It is either 1 or 2 characters of length data. This allows a + * maximum length of 0x7FFFFFF (2147483647 bytes), but if you're storing that + * much data in a string, you're abusing them. + * + * If the high bit is set, then there are two characters or 4 bytes of length + * data encoded. In that case, drop the high bit of the first character and + * add it together with the next character. + */ +static inline size_t +decodeLength(const char16_t** str) +{ +    size_t len = **str; +    if ((len & 0x8000) != 0) { +        (*str)++; +        len = ((len & 0x7FFF) << 16) | **str; +    } +    (*str)++; +    return len; +} + +/** + * Strings in UTF-8 format have length indicated by a length encoded in the + * stored data. It is either 1 or 2 characters of length data. This allows a + * maximum length of 0x7FFF (32767 bytes), but you should consider storing + * text in another way if you're using that much data in a single string. + * + * If the high bit is set, then there are two characters or 2 bytes of length + * data encoded. In that case, drop the high bit of the first character and + * add it together with the next character. + */ +static inline size_t +decodeLength(const uint8_t** str) +{ +    size_t len = **str; +    if ((len & 0x80) != 0) { +        (*str)++; +        len = ((len & 0x7F) << 8) | **str; +    } +    (*str)++; +    return len; +} + +const uint16_t* ResStringPool::stringAt(size_t idx, size_t* u16len) const  {      if (mError == NO_ERROR && idx < mHeader->stringCount) {          const bool isUTF8 = (mHeader->flags&ResStringPool_header::UTF8_FLAG) != 0; @@ -461,37 +497,51 @@ const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const              if (!isUTF8) {                  const char16_t* strings = (char16_t*)mStrings;                  const char16_t* str = strings+off; -                DECODE_LENGTH(str, sizeof(char16_t), *outLen) -                if ((uint32_t)(str+*outLen-strings) < mStringPoolSize) { + +                *u16len = decodeLength(&str); +                if ((uint32_t)(str+*u16len-strings) < mStringPoolSize) {                      return str;                  } else {                      LOGW("Bad string block: string #%d extends to %d, past end at %d\n", -                            (int)idx, (int)(str+*outLen-strings), (int)mStringPoolSize); +                            (int)idx, (int)(str+*u16len-strings), (int)mStringPoolSize);                  }              } else {                  const uint8_t* strings = (uint8_t*)mStrings; -                const uint8_t* str = strings+off; -                DECODE_LENGTH(str, sizeof(uint8_t), *outLen) -                size_t encLen; -                DECODE_LENGTH(str, sizeof(uint8_t), encLen) -                if ((uint32_t)(str+encLen-strings) < mStringPoolSize) { +                const uint8_t* u8str = strings+off; + +                *u16len = decodeLength(&u8str); +                size_t u8len = decodeLength(&u8str); + +                // encLen must be less than 0x7FFF due to encoding. +                if ((uint32_t)(u8str+u8len-strings) < mStringPoolSize) {                      AutoMutex lock(mDecodeLock); +                      if (mCache[idx] != NULL) {                          return mCache[idx];                      } -                    char16_t *u16str = (char16_t *)calloc(*outLen+1, sizeof(char16_t)); + +                    ssize_t actualLen = utf8_to_utf16_length(u8str, u8len); +                    if (actualLen < 0 || (size_t)actualLen != *u16len) { +                        LOGW("Bad string block: string #%lld decoded length is not correct " +                                "%lld vs %llu\n", +                                (long long)idx, (long long)actualLen, (long long)*u16len); +                        return NULL; +                    } + +                    char16_t *u16str = (char16_t *)calloc(*u16len+1, sizeof(char16_t));                      if (!u16str) {                          LOGW("No memory when trying to allocate decode cache for string #%d\n",                                  (int)idx);                          return NULL;                      } -                    const unsigned char *u8src = reinterpret_cast<const unsigned char *>(str); -                    utf8_to_utf16(u8src, encLen, u16str, *outLen); + +                    utf8_to_utf16(u8str, u8len, u16str);                      mCache[idx] = u16str;                      return u16str;                  } else { -                    LOGW("Bad string block: string #%d extends to %d, past end at %d\n", -                            (int)idx, (int)(str+encLen-strings), (int)mStringPoolSize); +                    LOGW("Bad string block: string #%lld extends to %lld, past end at %lld\n", +                            (long long)idx, (long long)(u8str+u8len-strings), +                            (long long)mStringPoolSize);                  }              }          } else { @@ -512,9 +562,8 @@ const char* ResStringPool::string8At(size_t idx, size_t* outLen) const              if (isUTF8) {                  const uint8_t* strings = (uint8_t*)mStrings;                  const uint8_t* str = strings+off; -                DECODE_LENGTH(str, sizeof(uint8_t), *outLen) -                size_t encLen; -                DECODE_LENGTH(str, sizeof(uint8_t), encLen) +                *outLen = decodeLength(&str); +                size_t encLen = decodeLength(&str);                  if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {                      return (const char*)str;                  } else { diff --git a/libs/utils/String16.cpp b/libs/utils/String16.cpp index eab7b2b..4ce1664 100644 --- a/libs/utils/String16.cpp +++ b/libs/utils/String16.cpp @@ -18,228 +18,17 @@  #include <utils/Debug.h>  #include <utils/Log.h> +#include <utils/Unicode.h>  #include <utils/String8.h>  #include <utils/TextOutput.h>  #include <utils/threads.h>  #include <private/utils/Static.h> -#ifdef HAVE_WINSOCK -# undef  nhtol -# undef  htonl -# undef  nhtos -# undef  htons - -# ifdef HAVE_LITTLE_ENDIAN -#  define ntohl(x)    ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) -#  define htonl(x)    ntohl(x) -#  define ntohs(x)    ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) -#  define htons(x)    ntohs(x) -# else -#  define ntohl(x)    (x) -#  define htonl(x)    (x) -#  define ntohs(x)    (x) -#  define htons(x)    (x) -# endif -#else -# include <netinet/in.h> -#endif -  #include <memory.h>  #include <stdio.h>  #include <ctype.h> -// --------------------------------------------------------------------------- - -int strcmp16(const char16_t *s1, const char16_t *s2) -{ -  char16_t ch; -  int d = 0; - -  while ( 1 ) { -    d = (int)(ch = *s1++) - (int)*s2++; -    if ( d || !ch ) -      break; -  } - -  return d; -} - -int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) -{ -  char16_t ch; -  int d = 0; - -  while ( n-- ) { -    d = (int)(ch = *s1++) - (int)*s2++; -    if ( d || !ch ) -      break; -  } - -  return d; -} - -char16_t *strcpy16(char16_t *dst, const char16_t *src) -{ -  char16_t *q = dst; -  const char16_t *p = src; -  char16_t ch; - -  do { -    *q++ = ch = *p++; -  } while ( ch ); - -  return dst; -} - -size_t strlen16(const char16_t *s) -{ -  const char16_t *ss = s; -  while ( *ss ) -    ss++; -  return ss-s; -} - - -char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) -{ -  char16_t *q = dst; -  const char16_t *p = src; -  char ch; - -  while (n) { -    n--; -    *q++ = ch = *p++; -    if ( !ch ) -      break; -  } - -  *q = 0; - -  return dst; -} - -size_t strnlen16(const char16_t *s, size_t maxlen) -{ -  const char16_t *ss = s; - -  /* Important: the maxlen test must precede the reference through ss; -     since the byte beyond the maximum may segfault */ -  while ((maxlen > 0) && *ss) { -    ss++; -    maxlen--; -  } -  return ss-s; -} - -int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) -{ -    const char16_t* e1 = s1+n1; -    const char16_t* e2 = s2+n2; - -    while (s1 < e1 && s2 < e2) { -        const int d = (int)*s1++ - (int)*s2++; -        if (d) { -            return d; -        } -    } - -    return n1 < n2 -        ? (0 - (int)*s2) -        : (n1 > n2 -           ? ((int)*s1 - 0) -           : 0); -} - -int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) -{ -    const char16_t* e1 = s1H+n1; -    const char16_t* e2 = s2N+n2; - -    while (s1H < e1 && s2N < e2) { -        const char16_t c2 = ntohs(*s2N); -        const int d = (int)*s1H++ - (int)c2; -        s2N++; -        if (d) { -            return d; -        } -    } - -    return n1 < n2 -        ? (0 - (int)ntohs(*s2N)) -        : (n1 > n2 -           ? ((int)*s1H - 0) -           : 0); -} - -static inline size_t -utf8_char_len(uint8_t ch) -{ -    return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; -} - -#define UTF8_SHIFT_AND_MASK(unicode, byte)  (unicode)<<=6; (unicode) |= (0x3f & (byte)); - -static inline uint32_t -utf8_to_utf32(const uint8_t *src, size_t length) -{ -    uint32_t unicode; - -    switch (length) -    { -        case 1: -            return src[0]; -        case 2: -            unicode = src[0] & 0x1f; -            UTF8_SHIFT_AND_MASK(unicode, src[1]) -            return unicode; -        case 3: -            unicode = src[0] & 0x0f; -            UTF8_SHIFT_AND_MASK(unicode, src[1]) -            UTF8_SHIFT_AND_MASK(unicode, src[2]) -            return unicode; -        case 4: -            unicode = src[0] & 0x07; -            UTF8_SHIFT_AND_MASK(unicode, src[1]) -            UTF8_SHIFT_AND_MASK(unicode, src[2]) -            UTF8_SHIFT_AND_MASK(unicode, src[3]) -            return unicode; -        default: -            return 0xffff; -    } -     -    //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); -} - -void -utf8_to_utf16(const uint8_t *src, size_t srcLen, -        char16_t* dst, const size_t dstLen) -{ -    const uint8_t* const end = src + srcLen; -    const char16_t* const dstEnd = dst + dstLen; -    while (src < end && dst < dstEnd) { -        size_t len = utf8_char_len(*src); -        uint32_t codepoint = utf8_to_utf32((const uint8_t*)src, len); - -        // Convert the UTF32 codepoint to one or more UTF16 codepoints -        if (codepoint <= 0xFFFF) { -            // Single UTF16 character -            *dst++ = (char16_t) codepoint; -        } else { -            // Multiple UTF16 characters with surrogates -            codepoint = codepoint - 0x10000; -            *dst++ = (char16_t) ((codepoint >> 10) + 0xD800); -            *dst++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); -        } - -        src += len; -    } -    if (dst < dstEnd) { -        *dst = 0; -    } -} - -// ---------------------------------------------------------------------------  namespace android { @@ -270,37 +59,33 @@ void terminate_string16()  // --------------------------------------------------------------------------- -static char16_t* allocFromUTF8(const char* in, size_t len) +static char16_t* allocFromUTF8(const char* u8str, size_t u8len)  { -    if (len == 0) return getEmptyString(); -     -    size_t chars = 0; -    const char* end = in+len; -    const char* p = in; -     -    while (p < end) { -        chars++; -        int utf8len = utf8_char_len(*p); -        uint32_t codepoint = utf8_to_utf32((const uint8_t*)p, utf8len); -        if (codepoint > 0xFFFF) chars++; // this will be a surrogate pair in utf16 -        p += utf8len; +    if (u8len == 0) return getEmptyString(); + +    const uint8_t* u8cur = (const uint8_t*) u8str; + +    const ssize_t u16len = utf8_to_utf16_length(u8cur, u8len); +    if (u16len < 0) { +        return getEmptyString();      } -     -    size_t bufSize = (chars+1)*sizeof(char16_t); -    SharedBuffer* buf = SharedBuffer::alloc(bufSize); + +    const uint8_t* const u8end = u8cur + u8len; + +    SharedBuffer* buf = SharedBuffer::alloc(sizeof(char16_t)*(u16len+1));      if (buf) { -        p = in; -        char16_t* str = (char16_t*)buf->data(); -         -        utf8_to_utf16((const uint8_t*)p, len, str, bufSize); +        u8cur = (const uint8_t*) u8str; +        char16_t* u16str = (char16_t*)buf->data(); + +        utf8_to_utf16(u8cur, u8len, u16str);          //printf("Created UTF-16 string from UTF-8 \"%s\":", in);          //printHexData(1, str, buf->size(), 16, 1);          //printf("\n"); -        return str; +        return u16str;      } -     +      return getEmptyString();  } diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp index 6358fc4..c8dc083 100644 --- a/libs/utils/String8.cpp +++ b/libs/utils/String8.cpp @@ -17,6 +17,8 @@  #include <utils/String8.h>  #include <utils/Log.h> +#include <utils/Unicode.h> +#include <utils/SharedBuffer.h>  #include <utils/String16.h>  #include <utils/TextOutput.h>  #include <utils/threads.h> @@ -34,94 +36,10 @@  namespace android { -static const char32_t kByteMask = 0x000000BF; -static const char32_t kByteMark = 0x00000080; - -// Surrogates aren't valid for UTF-32 characters, so define some -// constants that will let us screen them out. -static const char32_t kUnicodeSurrogateHighStart  = 0x0000D800; -static const char32_t kUnicodeSurrogateHighEnd    = 0x0000DBFF; -static const char32_t kUnicodeSurrogateLowStart   = 0x0000DC00; -static const char32_t kUnicodeSurrogateLowEnd     = 0x0000DFFF; -static const char32_t kUnicodeSurrogateStart      = kUnicodeSurrogateHighStart; -static const char32_t kUnicodeSurrogateEnd        = kUnicodeSurrogateLowEnd; -static const char32_t kUnicodeMaxCodepoint        = 0x0010FFFF; - -// Mask used to set appropriate bits in first byte of UTF-8 sequence, -// indexed by number of bytes in the sequence. -// 0xxxxxxx -// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 -// 110yyyyx 10xxxxxx -// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 -// 1110yyyy 10yxxxxx 10xxxxxx -// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 -// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx -// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 -static const char32_t kFirstByteMark[] = { -    0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 -}; -  // Separator used by resource paths. This is not platform dependent contrary  // to OS_PATH_SEPARATOR.  #define RES_PATH_SEPARATOR '/' -// Return number of utf8 bytes required for the character. -static size_t utf32_to_utf8_bytes(char32_t srcChar) -{ -    size_t bytesToWrite; - -    // Figure out how many bytes the result will require. -    if (srcChar < 0x00000080) -    { -        bytesToWrite = 1; -    } -    else if (srcChar < 0x00000800) -    { -        bytesToWrite = 2; -    } -    else if (srcChar < 0x00010000) -    { -        if ((srcChar < kUnicodeSurrogateStart) -         || (srcChar > kUnicodeSurrogateEnd)) -        { -            bytesToWrite = 3; -        } -        else -        { -            // Surrogates are invalid UTF-32 characters. -            return 0; -        } -    } -    // Max code point for Unicode is 0x0010FFFF. -    else if (srcChar <= kUnicodeMaxCodepoint) -    { -        bytesToWrite = 4; -    } -    else -    { -        // Invalid UTF-32 character. -        return 0; -    } - -    return bytesToWrite; -} - -// Write out the source character to <dstP>. - -static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) -{ -    dstP += bytes; -    switch (bytes) -    {   /* note: everything falls through. */ -        case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; -        case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; -        case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; -        case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); -    } -} - -// --------------------------------------------------------------------------- -  static SharedBuffer* gEmptyStringBuf = NULL;  static char* gEmptyString = NULL; @@ -175,62 +93,47 @@ static char* allocFromUTF8(const char* in, size_t len)      return getEmptyString();  } -template<typename T, typename L> -static char* allocFromUTF16OrUTF32(const T* in, L len) +static char* allocFromUTF16(const char16_t* in, size_t len)  {      if (len == 0) return getEmptyString(); -    size_t bytes = 0; -    const T* end = in+len; -    const T* p = in; - -    while (p < end) { -        bytes += utf32_to_utf8_bytes(*p); -        p++; +    const ssize_t bytes = utf16_to_utf8_length(in, len); +    if (bytes < 0) { +        return getEmptyString();      }      SharedBuffer* buf = SharedBuffer::alloc(bytes+1);      LOG_ASSERT(buf, "Unable to allocate shared buffer"); -    if (buf) { -        p = in; -        char* str = (char*)buf->data(); -        char* d = str; -        while (p < end) { -            const T c = *p++; -            size_t len = utf32_to_utf8_bytes(c); -            utf32_to_utf8((uint8_t*)d, c, len); -            d += len; -        } -        *d = 0; - -        return str; +    if (!buf) { +        return getEmptyString();      } -    return getEmptyString(); +    char* str = (char*)buf->data(); +    utf16_to_utf8(in, len, str); +    return str;  } -static char* allocFromUTF16(const char16_t* in, size_t len) +static char* allocFromUTF32(const char32_t* in, size_t len)  { -    if (len == 0) return getEmptyString(); +    if (len == 0) { +        return getEmptyString(); +    } -    const size_t bytes = utf8_length_from_utf16(in, len); +    const ssize_t bytes = utf32_to_utf8_length(in, len); +    if (bytes < 0) { +        return getEmptyString(); +    }      SharedBuffer* buf = SharedBuffer::alloc(bytes+1);      LOG_ASSERT(buf, "Unable to allocate shared buffer"); -    if (buf) { -        char* str = (char*)buf->data(); - -        utf16_to_utf8(in, len, str, bytes+1); - -        return str; +    if (!buf) { +        return getEmptyString();      } -    return getEmptyString(); -} +    char* str = (char*) buf->data(); +    utf32_to_utf8(in, len, str); -static char* allocFromUTF32(const char32_t* in, size_t len) -{ -    return allocFromUTF16OrUTF32<char32_t, size_t>(in, len); +    return str;  }  // --------------------------------------------------------------------------- @@ -510,17 +413,17 @@ void String8::toUpper(size_t start, size_t length)  size_t String8::getUtf32Length() const  { -    return utf32_length(mString, length()); +    return utf8_to_utf32_length(mString, length());  }  int32_t String8::getUtf32At(size_t index, size_t *next_index) const  { -    return utf32_at(mString, length(), index, next_index); +    return utf32_from_utf8_at(mString, length(), index, next_index);  } -size_t String8::getUtf32(char32_t* dst, size_t dst_len) const +void String8::getUtf32(char32_t* dst) const  { -    return utf8_to_utf32(mString, length(), dst, dst_len); +    utf8_to_utf32(mString, length(), dst);  }  TextOutput& operator<<(TextOutput& to, const String8& val) @@ -705,241 +608,3 @@ String8& String8::convertToResPath()  }  }; // namespace android - -// --------------------------------------------------------------------------- - -size_t strlen32(const char32_t *s) -{ -  const char32_t *ss = s; -  while ( *ss ) -    ss++; -  return ss-s; -} - -size_t strnlen32(const char32_t *s, size_t maxlen) -{ -  const char32_t *ss = s; -  while ((maxlen > 0) && *ss) { -    ss++; -    maxlen--; -  } -  return ss-s; -} - -size_t utf8_length(const char *src) -{ -    const char *cur = src; -    size_t ret = 0; -    while (*cur != '\0') { -        const char first_char = *cur++; -        if ((first_char & 0x80) == 0) { // ASCII -            ret += 1; -            continue; -        } -        // (UTF-8's character must not be like 10xxxxxx, -        //  but 110xxxxx, 1110xxxx, ... or 1111110x) -        if ((first_char & 0x40) == 0) { -            return 0; -        } - -        int32_t mask, to_ignore_mask; -        size_t num_to_read = 0; -        char32_t utf32 = 0; -        for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; -             num_to_read < 5 && (first_char & mask); -             num_to_read++, to_ignore_mask |= mask, mask >>= 1) { -            if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx -                return 0; -            } -            // 0x3F == 00111111 -            utf32 = (utf32 << 6) + (*cur++ & 0x3F); -        } -        // "first_char" must be (110xxxxx - 11110xxx) -        if (num_to_read == 5) { -            return 0; -        } -        to_ignore_mask |= mask; -        utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); -        if (utf32 > android::kUnicodeMaxCodepoint) { -            return 0; -        } - -        ret += num_to_read; -    } -    return ret; -} - -size_t utf32_length(const char *src, size_t src_len) -{ -    if (src == NULL || src_len == 0) { -        return 0; -    } -    size_t ret = 0; -    const char* cur; -    const char* end; -    size_t num_to_skip; -    for (cur = src, end = src + src_len, num_to_skip = 1; -         cur < end; -         cur += num_to_skip, ret++) { -        const char first_char = *cur; -        num_to_skip = 1; -        if ((first_char & 0x80) == 0) {  // ASCII -            continue; -        } -        int32_t mask; - -        for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { -        } -    } -    return ret; -} - -size_t utf8_length_from_utf32(const char32_t *src, size_t src_len) -{ -    if (src == NULL || src_len == 0) { -        return 0; -    } -    size_t ret = 0; -    const char32_t *end = src + src_len; -    while (src < end) { -        ret += android::utf32_to_utf8_bytes(*src++); -    } -    return ret; -} - -size_t utf8_length_from_utf16(const char16_t *src, size_t src_len) -{ -    if (src == NULL || src_len == 0) { -        return 0; -    } -    size_t ret = 0; -    const char16_t* const end = src + src_len; -    while (src < end) { -        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end -                && (*++src & 0xFC00) == 0xDC00) { -            // surrogate pairs are always 4 bytes. -            ret += 4; -            src++; -        } else { -            ret += android::utf32_to_utf8_bytes((char32_t) *src++); -        } -    } -    return ret; -} - -static int32_t utf32_at_internal(const char* cur, size_t *num_read) -{ -    const char first_char = *cur; -    if ((first_char & 0x80) == 0) { // ASCII -        *num_read = 1; -        return *cur; -    } -    cur++; -    char32_t mask, to_ignore_mask; -    size_t num_to_read = 0; -    char32_t utf32 = first_char; -    for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; -         (first_char & mask); -         num_to_read++, to_ignore_mask |= mask, mask >>= 1) { -        // 0x3F == 00111111 -        utf32 = (utf32 << 6) + (*cur++ & 0x3F); -    } -    to_ignore_mask |= mask; -    utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); - -    *num_read = num_to_read; -    return static_cast<int32_t>(utf32); -} - -int32_t utf32_at(const char *src, size_t src_len, -                 size_t index, size_t *next_index) -{ -    if (index >= src_len) { -        return -1; -    } -    size_t dummy_index; -    if (next_index == NULL) { -        next_index = &dummy_index; -    } -    size_t num_read; -    int32_t ret = utf32_at_internal(src + index, &num_read); -    if (ret >= 0) { -        *next_index = index + num_read; -    } - -    return ret; -} - -size_t utf8_to_utf32(const char* src, size_t src_len, -                     char32_t* dst, size_t dst_len) -{ -    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { -        return 0; -    } - -    const char* cur = src; -    const char* end = src + src_len; -    char32_t* cur_utf32 = dst; -    const char32_t* end_utf32 = dst + dst_len; -    while (cur_utf32 < end_utf32 && cur < end) { -        size_t num_read; -        *cur_utf32++ = -                static_cast<char32_t>(utf32_at_internal(cur, &num_read)); -        cur += num_read; -    } -    if (cur_utf32 < end_utf32) { -        *cur_utf32 = 0; -    } -    return static_cast<size_t>(cur_utf32 - dst); -} - -size_t utf32_to_utf8(const char32_t* src, size_t src_len, -                     char* dst, size_t dst_len) -{ -    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { -        return 0; -    } -    const char32_t *cur_utf32 = src; -    const char32_t *end_utf32 = src + src_len; -    char *cur = dst; -    const char *end = dst + dst_len; -    while (cur_utf32 < end_utf32 && cur < end) { -        size_t len = android::utf32_to_utf8_bytes(*cur_utf32); -        android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len); -        cur += len; -    } -    if (cur < end) { -        *cur = '\0'; -    } -    return cur - dst; -} - -size_t utf16_to_utf8(const char16_t* src, size_t src_len, -                     char* dst, size_t dst_len) -{ -    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { -        return 0; -    } -    const char16_t* cur_utf16 = src; -    const char16_t* const end_utf16 = src + src_len; -    char *cur = dst; -    const char* const end = dst + dst_len; -    while (cur_utf16 < end_utf16 && cur < end) { -        char32_t utf32; -        // surrogate pairs -        if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) { -            utf32 = (*cur_utf16++ - 0xD800) << 10; -            utf32 |= *cur_utf16++ - 0xDC00; -            utf32 += 0x10000; -        } else { -            utf32 = (char32_t) *cur_utf16++; -        } -        size_t len = android::utf32_to_utf8_bytes(utf32); -        android::utf32_to_utf8((uint8_t*)cur, utf32, len); -        cur += len; -    } -    if (cur < end) { -        *cur = '\0'; -    } -    return cur - dst; -} diff --git a/libs/utils/Unicode.cpp b/libs/utils/Unicode.cpp new file mode 100644 index 0000000..78c61b4 --- /dev/null +++ b/libs/utils/Unicode.cpp @@ -0,0 +1,575 @@ +/* + * Copyright (C) 2005 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <utils/Unicode.h> + +#include <stddef.h> + +#ifdef HAVE_WINSOCK +# undef  nhtol +# undef  htonl +# undef  nhtos +# undef  htons + +# ifdef HAVE_LITTLE_ENDIAN +#  define ntohl(x)    ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) +#  define htonl(x)    ntohl(x) +#  define ntohs(x)    ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) +#  define htons(x)    ntohs(x) +# else +#  define ntohl(x)    (x) +#  define htonl(x)    (x) +#  define ntohs(x)    (x) +#  define htons(x)    (x) +# endif +#else +# include <netinet/in.h> +#endif + +extern "C" { + +static const char32_t kByteMask = 0x000000BF; +static const char32_t kByteMark = 0x00000080; + +// Surrogates aren't valid for UTF-32 characters, so define some +// constants that will let us screen them out. +static const char32_t kUnicodeSurrogateHighStart  = 0x0000D800; +static const char32_t kUnicodeSurrogateHighEnd    = 0x0000DBFF; +static const char32_t kUnicodeSurrogateLowStart   = 0x0000DC00; +static const char32_t kUnicodeSurrogateLowEnd     = 0x0000DFFF; +static const char32_t kUnicodeSurrogateStart      = kUnicodeSurrogateHighStart; +static const char32_t kUnicodeSurrogateEnd        = kUnicodeSurrogateLowEnd; +static const char32_t kUnicodeMaxCodepoint        = 0x0010FFFF; + +// Mask used to set appropriate bits in first byte of UTF-8 sequence, +// indexed by number of bytes in the sequence. +// 0xxxxxxx +// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 +// 110yyyyx 10xxxxxx +// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 +// 1110yyyy 10yxxxxx 10xxxxxx +// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 +// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx +// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 +static const char32_t kFirstByteMark[] = { +    0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 +}; + +// -------------------------------------------------------------------------- +// UTF-32 +// -------------------------------------------------------------------------- + +/** + * Return number of UTF-8 bytes required for the character. If the character + * is invalid, return size of 0. + */ +static inline size_t utf32_codepoint_utf8_length(char32_t srcChar) +{ +    // Figure out how many bytes the result will require. +    if (srcChar < 0x00000080) { +        return 1; +    } else if (srcChar < 0x00000800) { +        return 2; +    } else if (srcChar < 0x00010000) { +        if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) { +            return 3; +        } else { +            // Surrogates are invalid UTF-32 characters. +            return 0; +        } +    } +    // Max code point for Unicode is 0x0010FFFF. +    else if (srcChar <= kUnicodeMaxCodepoint) { +        return 4; +    } else { +        // Invalid UTF-32 character. +        return 0; +    } +} + +// Write out the source character to <dstP>. + +static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) +{ +    dstP += bytes; +    switch (bytes) +    {   /* note: everything falls through. */ +        case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; +        case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; +        case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; +        case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); +    } +} + +size_t strlen32(const char32_t *s) +{ +  const char32_t *ss = s; +  while ( *ss ) +    ss++; +  return ss-s; +} + +size_t strnlen32(const char32_t *s, size_t maxlen) +{ +  const char32_t *ss = s; +  while ((maxlen > 0) && *ss) { +    ss++; +    maxlen--; +  } +  return ss-s; +} + +static inline int32_t utf32_at_internal(const char* cur, size_t *num_read) +{ +    const char first_char = *cur; +    if ((first_char & 0x80) == 0) { // ASCII +        *num_read = 1; +        return *cur; +    } +    cur++; +    char32_t mask, to_ignore_mask; +    size_t num_to_read = 0; +    char32_t utf32 = first_char; +    for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; +         (first_char & mask); +         num_to_read++, to_ignore_mask |= mask, mask >>= 1) { +        // 0x3F == 00111111 +        utf32 = (utf32 << 6) + (*cur++ & 0x3F); +    } +    to_ignore_mask |= mask; +    utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); + +    *num_read = num_to_read; +    return static_cast<int32_t>(utf32); +} + +int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) +{ +    if (index >= src_len) { +        return -1; +    } +    size_t dummy_index; +    if (next_index == NULL) { +        next_index = &dummy_index; +    } +    size_t num_read; +    int32_t ret = utf32_at_internal(src + index, &num_read); +    if (ret >= 0) { +        *next_index = index + num_read; +    } + +    return ret; +} + +ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) +{ +    if (src == NULL || src_len == 0) { +        return -1; +    } + +    size_t ret = 0; +    const char32_t *end = src + src_len; +    while (src < end) { +        ret += utf32_codepoint_utf8_length(*src++); +    } +    return ret; +} + +void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst) +{ +    if (src == NULL || src_len == 0 || dst == NULL) { +        return; +    } + +    const char32_t *cur_utf32 = src; +    const char32_t *end_utf32 = src + src_len; +    char *cur = dst; +    while (cur_utf32 < end_utf32) { +        size_t len = utf32_codepoint_utf8_length(*cur_utf32); +        utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); +        cur += len; +    } +    *cur = '\0'; +} + +// -------------------------------------------------------------------------- +// UTF-16 +// -------------------------------------------------------------------------- + +int strcmp16(const char16_t *s1, const char16_t *s2) +{ +  char16_t ch; +  int d = 0; + +  while ( 1 ) { +    d = (int)(ch = *s1++) - (int)*s2++; +    if ( d || !ch ) +      break; +  } + +  return d; +} + +int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) +{ +  char16_t ch; +  int d = 0; + +  while ( n-- ) { +    d = (int)(ch = *s1++) - (int)*s2++; +    if ( d || !ch ) +      break; +  } + +  return d; +} + +char16_t *strcpy16(char16_t *dst, const char16_t *src) +{ +  char16_t *q = dst; +  const char16_t *p = src; +  char16_t ch; + +  do { +    *q++ = ch = *p++; +  } while ( ch ); + +  return dst; +} + +size_t strlen16(const char16_t *s) +{ +  const char16_t *ss = s; +  while ( *ss ) +    ss++; +  return ss-s; +} + + +char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) +{ +  char16_t *q = dst; +  const char16_t *p = src; +  char ch; + +  while (n) { +    n--; +    *q++ = ch = *p++; +    if ( !ch ) +      break; +  } + +  *q = 0; + +  return dst; +} + +size_t strnlen16(const char16_t *s, size_t maxlen) +{ +  const char16_t *ss = s; + +  /* Important: the maxlen test must precede the reference through ss; +     since the byte beyond the maximum may segfault */ +  while ((maxlen > 0) && *ss) { +    ss++; +    maxlen--; +  } +  return ss-s; +} + +int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) +{ +    const char16_t* e1 = s1+n1; +    const char16_t* e2 = s2+n2; + +    while (s1 < e1 && s2 < e2) { +        const int d = (int)*s1++ - (int)*s2++; +        if (d) { +            return d; +        } +    } + +    return n1 < n2 +        ? (0 - (int)*s2) +        : (n1 > n2 +           ? ((int)*s1 - 0) +           : 0); +} + +int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) +{ +    const char16_t* e1 = s1H+n1; +    const char16_t* e2 = s2N+n2; + +    while (s1H < e1 && s2N < e2) { +        const char16_t c2 = ntohs(*s2N); +        const int d = (int)*s1H++ - (int)c2; +        s2N++; +        if (d) { +            return d; +        } +    } + +    return n1 < n2 +        ? (0 - (int)ntohs(*s2N)) +        : (n1 > n2 +           ? ((int)*s1H - 0) +           : 0); +} + +void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst) +{ +    if (src == NULL || src_len == 0 || dst == NULL) { +        return; +    } + +    const char16_t* cur_utf16 = src; +    const char16_t* const end_utf16 = src + src_len; +    char *cur = dst; +    while (cur_utf16 < end_utf16) { +        char32_t utf32; +        // surrogate pairs +        if ((*cur_utf16 & 0xFC00) == 0xD800) { +            utf32 = (*cur_utf16++ - 0xD800) << 10; +            utf32 |= *cur_utf16++ - 0xDC00; +            utf32 += 0x10000; +        } else { +            utf32 = (char32_t) *cur_utf16++; +        } +        const size_t len = utf32_codepoint_utf8_length(utf32); +        utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); +        cur += len; +    } +    *cur = '\0'; +} + +// -------------------------------------------------------------------------- +// UTF-8 +// -------------------------------------------------------------------------- + +ssize_t utf8_length(const char *src) +{ +    const char *cur = src; +    size_t ret = 0; +    while (*cur != '\0') { +        const char first_char = *cur++; +        if ((first_char & 0x80) == 0) { // ASCII +            ret += 1; +            continue; +        } +        // (UTF-8's character must not be like 10xxxxxx, +        //  but 110xxxxx, 1110xxxx, ... or 1111110x) +        if ((first_char & 0x40) == 0) { +            return -1; +        } + +        int32_t mask, to_ignore_mask; +        size_t num_to_read = 0; +        char32_t utf32 = 0; +        for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; +             num_to_read < 5 && (first_char & mask); +             num_to_read++, to_ignore_mask |= mask, mask >>= 1) { +            if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx +                return -1; +            } +            // 0x3F == 00111111 +            utf32 = (utf32 << 6) + (*cur++ & 0x3F); +        } +        // "first_char" must be (110xxxxx - 11110xxx) +        if (num_to_read == 5) { +            return -1; +        } +        to_ignore_mask |= mask; +        utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); +        if (utf32 > kUnicodeMaxCodepoint) { +            return -1; +        } + +        ret += num_to_read; +    } +    return ret; +} + +ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) +{ +    if (src == NULL || src_len == 0) { +        return -1; +    } + +    size_t ret = 0; +    const char16_t* const end = src + src_len; +    while (src < end) { +        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end +                && (*++src & 0xFC00) == 0xDC00) { +            // surrogate pairs are always 4 bytes. +            ret += 4; +            src++; +        } else { +            ret += utf32_codepoint_utf8_length((char32_t) *src++); +        } +    } +    return ret; +} + +/** + * Returns 1-4 based on the number of leading bits. + * + * 1111 -> 4 + * 1110 -> 3 + * 110x -> 2 + * 10xx -> 1 + * 0xxx -> 1 + */ +static inline size_t utf8_codepoint_len(uint8_t ch) +{ +    return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; +} + +static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) +{ +    *codePoint <<= 6; +    *codePoint |= 0x3F & byte; +} + +size_t utf8_to_utf32_length(const char *src, size_t src_len) +{ +    if (src == NULL || src_len == 0) { +        return 0; +    } +    size_t ret = 0; +    const char* cur; +    const char* end; +    size_t num_to_skip; +    for (cur = src, end = src + src_len, num_to_skip = 1; +         cur < end; +         cur += num_to_skip, ret++) { +        const char first_char = *cur; +        num_to_skip = 1; +        if ((first_char & 0x80) == 0) {  // ASCII +            continue; +        } +        int32_t mask; + +        for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { +        } +    } +    return ret; +} + +void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) +{ +    if (src == NULL || src_len == 0 || dst == NULL) { +        return; +    } + +    const char* cur = src; +    const char* const end = src + src_len; +    char32_t* cur_utf32 = dst; +    while (cur < end) { +        size_t num_read; +        *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); +        cur += num_read; +    } +    *cur_utf32 = 0; +} + +static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) +{ +    uint32_t unicode; + +    switch (length) +    { +        case 1: +            return src[0]; +        case 2: +            unicode = src[0] & 0x1f; +            utf8_shift_and_mask(&unicode, src[1]); +            return unicode; +        case 3: +            unicode = src[0] & 0x0f; +            utf8_shift_and_mask(&unicode, src[1]); +            utf8_shift_and_mask(&unicode, src[2]); +            return unicode; +        case 4: +            unicode = src[0] & 0x07; +            utf8_shift_and_mask(&unicode, src[1]); +            utf8_shift_and_mask(&unicode, src[2]); +            utf8_shift_and_mask(&unicode, src[3]); +            return unicode; +        default: +            return 0xffff; +    } + +    //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); +} + +ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len) +{ +    const uint8_t* const u8end = u8str + u8len; +    const uint8_t* u8cur = u8str; + +    /* Validate that the UTF-8 is the correct len */ +    size_t u16measuredLen = 0; +    while (u8cur < u8end) { +        u16measuredLen++; +        int u8charLen = utf8_codepoint_len(*u8cur); +        uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); +        if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 +        u8cur += u8charLen; +    } + +    /** +     * Make sure that we ended where we thought we would and the output UTF-16 +     * will be exactly how long we were told it would be. +     */ +    if (u8cur != u8end) { +        return -1; +    } + +    return u16measuredLen; +} + +/** + * Convert a UTF-8 string to UTF-16. The destination UTF-16 buffer must have + * space for NULL at the end. + */ +void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) +{ +    const uint8_t* const u8end = u8str + u8len; +    const uint8_t* u8cur = u8str; +    char16_t* u16cur = u16str; + +    while (u8cur < u8end) { +        size_t u8len = utf8_codepoint_len(*u8cur); +        uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); + +        // Convert the UTF32 codepoint to one or more UTF16 codepoints +        if (codepoint <= 0xFFFF) { +            // Single UTF16 character +            *u16cur++ = (char16_t) codepoint; +        } else { +            // Multiple UTF16 characters with surrogates +            codepoint = codepoint - 0x10000; +            *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); +            *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); +        } + +        u8cur += u8len; +    } +    *u16cur = 0; +} + +} diff --git a/libs/utils/tests/Android.mk b/libs/utils/tests/Android.mk index 00077ee..72d4876 100644 --- a/libs/utils/tests/Android.mk +++ b/libs/utils/tests/Android.mk @@ -8,7 +8,8 @@ ifneq ($(TARGET_SIMULATOR),true)  test_src_files := \  	ObbFile_test.cpp \  	Looper_test.cpp \ -	String8_test.cpp +	String8_test.cpp \ +	Unicode_test.cpp  shared_libraries := \  	libz \ diff --git a/libs/utils/tests/Unicode_test.cpp b/libs/utils/tests/Unicode_test.cpp new file mode 100644 index 0000000..18c130c --- /dev/null +++ b/libs/utils/tests/Unicode_test.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "Unicode_test" +#include <utils/Log.h> +#include <utils/Unicode.h> + +#include <gtest/gtest.h> + +namespace android { + +class UnicodeTest : public testing::Test { +protected: +    virtual void SetUp() { +    } + +    virtual void TearDown() { +    } +}; + +TEST_F(UnicodeTest, UTF8toUTF16ZeroLength) { +    ssize_t measured; + +    const uint8_t str[] = { }; + +    measured = utf8_to_utf16_length(str, 0); +    EXPECT_EQ(0, measured) +            << "Zero length input should return zero length output."; +} + +TEST_F(UnicodeTest, UTF8toUTF16ASCIILength) { +    ssize_t measured; + +    // U+0030 or ASCII '0' +    const uint8_t str[] = { 0x30 }; + +    measured = utf8_to_utf16_length(str, sizeof(str)); +    EXPECT_EQ(1, measured) +            << "ASCII glyphs should have a length of 1 char16_t"; +} + +TEST_F(UnicodeTest, UTF8toUTF16Plane1Length) { +    ssize_t measured; + +    // U+2323 SMILE +    const uint8_t str[] = { 0xE2, 0x8C, 0xA3 }; + +    measured = utf8_to_utf16_length(str, sizeof(str)); +    EXPECT_EQ(1, measured) +            << "Plane 1 glyphs should have a length of 1 char16_t"; +} + +TEST_F(UnicodeTest, UTF8toUTF16SurrogateLength) { +    ssize_t measured; + +    // U+10000 +    const uint8_t str[] = { 0xF0, 0x90, 0x80, 0x80 }; + +    measured = utf8_to_utf16_length(str, sizeof(str)); +    EXPECT_EQ(2, measured) +            << "Surrogate pairs should have a length of 2 char16_t"; +} + +TEST_F(UnicodeTest, UTF8toUTF16TruncatedUTF8) { +    ssize_t measured; + +    // Truncated U+2323 SMILE +    // U+2323 SMILE +    const uint8_t str[] = { 0xE2, 0x8C }; + +    measured = utf8_to_utf16_length(str, sizeof(str)); +    EXPECT_EQ(-1, measured) +            << "Truncated UTF-8 should return -1 to indicate invalid"; +} + +TEST_F(UnicodeTest, UTF8toUTF16Normal) { +    const uint8_t str[] = { +        0x30, // U+0030, 1 UTF-16 character +        0xC4, 0x80, // U+0100, 1 UTF-16 character +        0xE2, 0x8C, 0xA3, // U+2323, 1 UTF-16 character +        0xF0, 0x90, 0x80, 0x80, // U+10000, 2 UTF-16 character +    }; + +    char16_t output[1 + 1 + 1 + 2 + 1]; // Room for NULL + +    utf8_to_utf16(str, sizeof(str), output); + +    EXPECT_EQ(0x0030, output[0]) +            << "should be U+0030"; +    EXPECT_EQ(0x0100, output[1]) +            << "should be U+0100"; +    EXPECT_EQ(0x2323, output[2]) +            << "should be U+2323"; +    EXPECT_EQ(0xD800, output[3]) +            << "should be first half of surrogate U+10000"; +    EXPECT_EQ(0xDC00, output[4]) +            << "should be second half of surrogate U+10000"; +    EXPECT_EQ(NULL, output[5]) +            << "should be NULL terminated"; +} + +} | 
