diff options
author | Kenny Root <kroot@google.com> | 2010-11-09 14:37:23 -0800 |
---|---|---|
committer | Kenny Root <kroot@google.com> | 2010-11-12 15:53:40 -0800 |
commit | 300ba6846949f5b21c6d93d7698dbc39716cf832 (patch) | |
tree | f00aed47d06332a09aa6909a6605b0743661c981 | |
parent | d781089731127bd9199d47f53b170895868b8750 (diff) | |
download | frameworks_base-300ba6846949f5b21c6d93d7698dbc39716cf832.zip frameworks_base-300ba6846949f5b21c6d93d7698dbc39716cf832.tar.gz frameworks_base-300ba6846949f5b21c6d93d7698dbc39716cf832.tar.bz2 |
Split UTF functions from String8/16
Split out all the UTF-8/16/32 handling code from String8/16 to its own
file to allow better reuse of code.
Change-Id: If9ce63920edc75472c38da4adce0d13cda9ad2f7
-rw-r--r-- | core/jni/android_emoji_EmojiFactory.cpp | 1 | ||||
-rw-r--r-- | include/utils/String16.h | 29 | ||||
-rw-r--r-- | include/utils/String8.h | 116 | ||||
-rw-r--r-- | include/utils/Unicode.h | 161 | ||||
-rw-r--r-- | libs/rs/rsFont.cpp | 2 | ||||
-rw-r--r-- | libs/utils/Android.mk | 1 | ||||
-rw-r--r-- | libs/utils/ResourceTypes.cpp | 99 | ||||
-rw-r--r-- | libs/utils/String16.cpp | 253 | ||||
-rw-r--r-- | libs/utils/String8.cpp | 391 | ||||
-rw-r--r-- | libs/utils/Unicode.cpp | 575 | ||||
-rw-r--r-- | libs/utils/tests/Android.mk | 3 | ||||
-rw-r--r-- | libs/utils/tests/Unicode_test.cpp | 115 |
12 files changed, 983 insertions, 763 deletions
diff --git a/core/jni/android_emoji_EmojiFactory.cpp b/core/jni/android_emoji_EmojiFactory.cpp index 63550fb..f653b36 100644 --- a/core/jni/android_emoji_EmojiFactory.cpp +++ b/core/jni/android_emoji_EmojiFactory.cpp @@ -4,6 +4,7 @@ #define LOG_TAG "EmojiFactory_jni" #include <utils/Log.h> #include <utils/String8.h> +#include <utils/String16.h> #include "EmojiFactory.h" #include <nativehelper/JNIHelp.h> diff --git a/include/utils/String16.h b/include/utils/String16.h index 07a0c11..584f53f 100644 --- a/include/utils/String16.h +++ b/include/utils/String16.h @@ -19,39 +19,12 @@ #include <utils/Errors.h> #include <utils/SharedBuffer.h> - -#include <stdint.h> -#include <sys/types.h> +#include <utils/Unicode.h> // --------------------------------------------------------------------------- extern "C" { -typedef uint16_t char16_t; - -// Standard string functions on char16 strings. -int strcmp16(const char16_t *, const char16_t *); -int strncmp16(const char16_t *s1, const char16_t *s2, size_t n); -size_t strlen16(const char16_t *); -size_t strnlen16(const char16_t *, size_t); -char16_t *strcpy16(char16_t *, const char16_t *); -char16_t *strncpy16(char16_t *, const char16_t *, size_t); - -// Version of comparison that supports embedded nulls. -// This is different than strncmp() because we don't stop -// at a nul character and consider the strings to be different -// if the lengths are different (thus we need to supply the -// lengths of both strings). This can also be used when -// your string is not nul-terminated as it will have the -// equivalent result as strcmp16 (unlike strncmp16). -int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2); - -// Version of strzcmp16 for comparing strings in different endianness. 
-int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2); - -// Convert UTF-8 to UTF-16 including surrogate pairs -void utf8_to_utf16(const uint8_t *src, size_t srcLen, char16_t* dst, const size_t dstLen); - } // --------------------------------------------------------------------------- diff --git a/include/utils/String8.h b/include/utils/String8.h index cef8eca..b36f128 100644 --- a/include/utils/String8.h +++ b/include/utils/String8.h @@ -18,122 +18,16 @@ #define ANDROID_STRING8_H #include <utils/Errors.h> +#include <utils/SharedBuffer.h> +#include <utils/Unicode.h> -// Need this for the char16_t type; String8.h should not -// be depedent on the String16 class. -#include <utils/String16.h> - -#include <stdint.h> -#include <string.h> -#include <sys/types.h> - -// --------------------------------------------------------------------------- - -extern "C" { - -typedef uint32_t char32_t; - -size_t strlen32(const char32_t *); -size_t strnlen32(const char32_t *, size_t); - -/* - * Returns the length of "src" when "src" is valid UTF-8 string. - * Returns 0 if src is NULL, 0-length string or non UTF-8 string. - * This function should be used to determine whether "src" is valid UTF-8 - * characters with valid unicode codepoints. "src" must be null-terminated. - * - * If you are going to use other GetUtf... functions defined in this header - * with string which may not be valid UTF-8 with valid codepoint (form 0 to - * 0x10FFFF), you should use this function before calling others, since the - * other functions do not check whether the string is valid UTF-8 or not. - * - * If you do not care whether "src" is valid UTF-8 or not, you should use - * strlen() as usual, which should be much faster. - */ -size_t utf8_length(const char *src); - -/* - * Returns the UTF-32 length of "src". - */ -size_t utf32_length(const char *src, size_t src_len); - -/* - * Returns the UTF-8 length of "src". 
- */ -size_t utf8_length_from_utf16(const char16_t *src, size_t src_len); - -/* - * Returns the UTF-8 length of "src". - */ -size_t utf8_length_from_utf32(const char32_t *src, size_t src_len); - -/* - * Returns the unicode value at "index". - * Returns -1 when the index is invalid (equals to or more than "src_len"). - * If returned value is positive, it is able to be converted to char32_t, which - * is unsigned. Then, if "next_index" is not NULL, the next index to be used is - * stored in "next_index". "next_index" can be NULL. - */ -int32_t utf32_at(const char *src, size_t src_len, - size_t index, size_t *next_index); - -/* - * Stores a UTF-32 string converted from "src" in "dst", if "dst_length" is not - * large enough to store the string, the part of the "src" string is stored - * into "dst". - * Returns the size actually used for storing the string. - * "dst" is not null-terminated when dst_len is fully used (like strncpy). - */ -size_t utf8_to_utf32(const char* src, size_t src_len, - char32_t* dst, size_t dst_len); - -/* - * Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not - * large enough to store the string, the part of the "src" string is stored - * into "dst" as much as possible. See the examples for more detail. - * Returns the size actually used for storing the string. - * dst" is not null-terminated when dst_len is fully used (like strncpy). 
- * - * Example 1 - * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) - * "src_len" == 2 - * "dst_len" >= 7 - * -> - * Returned value == 6 - * "dst" becomes \xE3\x81\x82\xE3\x81\x84\0 - * (note that "dst" is null-terminated) - * - * Example 2 - * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) - * "src_len" == 2 - * "dst_len" == 5 - * -> - * Returned value == 3 - * "dst" becomes \xE3\x81\x82\0 - * (note that "dst" is null-terminated, but \u3044 is not stored in "dst" - * since "dst" does not have enough size to store the character) - * - * Example 3 - * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) - * "src_len" == 2 - * "dst_len" == 6 - * -> - * Returned value == 6 - * "dst" becomes \xE3\x81\x82\xE3\x81\x84 - * (note that "dst" is NOT null-terminated, like strncpy) - */ -size_t utf32_to_utf8(const char32_t* src, size_t src_len, - char* dst, size_t dst_len); - -size_t utf16_to_utf8(const char16_t* src, size_t src_len, - char* dst, size_t dst_len); - -} +#include <string.h> // for strcmp // --------------------------------------------------------------------------- namespace android { +class String16; class TextOutput; //! This is a string holding UTF-8 characters. Does not allow the value more @@ -182,7 +76,7 @@ public: size_t getUtf32Length() const; int32_t getUtf32At(size_t index, size_t *next_index) const; - size_t getUtf32(char32_t* dst, size_t dst_len) const; + void getUtf32(char32_t* dst) const; inline String8& operator=(const String8& other); inline String8& operator=(const char* other); diff --git a/include/utils/Unicode.h b/include/utils/Unicode.h new file mode 100644 index 0000000..6afb291 --- /dev/null +++ b/include/utils/Unicode.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2005 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_UNICODE_H +#define ANDROID_UNICODE_H + +#include <sys/types.h> +#include <stdint.h> + +extern "C" { + +typedef uint32_t char32_t; +typedef uint16_t char16_t; + +// Standard string functions on char16_t strings. +int strcmp16(const char16_t *, const char16_t *); +int strncmp16(const char16_t *s1, const char16_t *s2, size_t n); +size_t strlen16(const char16_t *); +size_t strnlen16(const char16_t *, size_t); +char16_t *strcpy16(char16_t *, const char16_t *); +char16_t *strncpy16(char16_t *, const char16_t *, size_t); + +// Version of comparison that supports embedded nulls. +// This is different than strncmp() because we don't stop +// at a nul character and consider the strings to be different +// if the lengths are different (thus we need to supply the +// lengths of both strings). This can also be used when +// your string is not nul-terminated as it will have the +// equivalent result as strcmp16 (unlike strncmp16). +int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2); + +// Version of strzcmp16 for comparing strings in different endianness. +int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2); + +// Standard string functions on char32_t strings. +size_t strlen32(const char32_t *); +size_t strnlen32(const char32_t *, size_t); + +/** + * Measure the length of a UTF-32 string in UTF-8. If the string is invalid + * such as containing a surrogate character, -1 will be returned. 
+ */ +ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len); + +/** + * Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not + * large enough to store the string, the part of the "src" string is stored + * into "dst" as much as possible. See the examples for more detail. + * Returns the size actually used for storing the string. + * dst" is not null-terminated when dst_len is fully used (like strncpy). + * + * Example 1 + * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) + * "src_len" == 2 + * "dst_len" >= 7 + * -> + * Returned value == 6 + * "dst" becomes \xE3\x81\x82\xE3\x81\x84\0 + * (note that "dst" is null-terminated) + * + * Example 2 + * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) + * "src_len" == 2 + * "dst_len" == 5 + * -> + * Returned value == 3 + * "dst" becomes \xE3\x81\x82\0 + * (note that "dst" is null-terminated, but \u3044 is not stored in "dst" + * since "dst" does not have enough size to store the character) + * + * Example 3 + * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) + * "src_len" == 2 + * "dst_len" == 6 + * -> + * Returned value == 6 + * "dst" becomes \xE3\x81\x82\xE3\x81\x84 + * (note that "dst" is NOT null-terminated, like strncpy) + */ +void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst); + +/** + * Returns the unicode value at "index". + * Returns -1 when the index is invalid (equals to or more than "src_len"). + * If returned value is positive, it is able to be converted to char32_t, which + * is unsigned. Then, if "next_index" is not NULL, the next index to be used is + * stored in "next_index". "next_index" can be NULL. + */ +int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index); + + +/** + * Returns the UTF-8 length of UTF-16 string "src". + */ +ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len); + +/** + * Converts a UTF-16 string to UTF-8. 
The destination buffer must be large + * enough to fit the UTF-16 as measured by utf16_to_utf8_length with an added + * NULL terminator. + */ +void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst); + +/** + * Returns the length of "src" when "src" is valid UTF-8 string. + * Returns 0 if src is NULL or 0-length string. Returns -1 when the source + * is an invalid string. + * + * This function should be used to determine whether "src" is valid UTF-8 + * characters with valid unicode codepoints. "src" must be null-terminated. + * + * If you are going to use other utf8_to_... functions defined in this header + * with string which may not be valid UTF-8 with valid codepoint (from 0 to + * 0x10FFFF), you should use this function before calling others, since the + * other functions do not check whether the string is valid UTF-8 or not. + * + * If you do not care whether "src" is valid UTF-8 or not, you should use + * strlen() as usual, which should be much faster. + */ +ssize_t utf8_length(const char *src); + +/** + * Measure the length of a UTF-32 string. + */ +size_t utf8_to_utf32_length(const char *src, size_t src_len); + +/** + * Stores a UTF-32 string converted from "src" in "dst". "dst" must be large + * enough to store the entire converted string as measured by + * utf8_to_utf32_length plus space for a NULL terminator. + */ +void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst); + +/** + * Returns the UTF-16 length of UTF-8 string "src". + */ +ssize_t utf8_to_utf16_length(const uint8_t* src, size_t srcLen); + +/** + * Convert UTF-8 to UTF-16 including surrogate pairs. The destination buffer + * must be large enough to hold the result as measured by utf8_to_utf16_length + * plus an added NULL terminator. 
+ */ +void utf8_to_utf16(const uint8_t* src, size_t srcLen, char16_t* dst); + +} + +#endif diff --git a/libs/rs/rsFont.cpp b/libs/rs/rsFont.cpp index 33b0f51..96e350d 100644 --- a/libs/rs/rsFont.cpp +++ b/libs/rs/rsFont.cpp @@ -174,7 +174,7 @@ void Font::renderUTF(const char *text, uint32_t len, int32_t x, int32_t y, while (glyphsLeft > 0) { - int32_t utfChar = utf32_at(text, len, index, &nextIndex); + int32_t utfChar = utf32_from_utf8_at(text, len, index, &nextIndex); // Reached the end of the string or encountered if (utfChar < 0) { diff --git a/libs/utils/Android.mk b/libs/utils/Android.mk index eb75ed8..05a9674 100644 --- a/libs/utils/Android.mk +++ b/libs/utils/Android.mk @@ -41,6 +41,7 @@ commonSources:= \ TextOutput.cpp \ Threads.cpp \ Timers.cpp \ + Unicode.cpp \ VectorImpl.cpp \ ZipFileCRO.cpp \ ZipFileRO.cpp \ diff --git a/libs/utils/ResourceTypes.cpp b/libs/utils/ResourceTypes.cpp index f287298..bbf5093 100644 --- a/libs/utils/ResourceTypes.cpp +++ b/libs/utils/ResourceTypes.cpp @@ -444,15 +444,51 @@ void ResStringPool::uninit() } } -#define DECODE_LENGTH(str, chrsz, len) \ - len = *(str); \ - if (*(str)&(1<<(chrsz*8-1))) { \ - (str)++; \ - len = (((len)&((1<<(chrsz*8-1))-1))<<(chrsz*8)) + *(str); \ - } \ - (str)++; - -const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const +/** + * Strings in UTF-16 format have length indicated by a length encoded in the + * stored data. It is either 1 or 2 characters of length data. This allows a + * maximum length of 0x7FFFFFFF (2147483647 bytes), but if you're storing that + * much data in a string, you're abusing them. + * + * If the high bit is set, then there are two characters or 4 bytes of length + * data encoded. In that case, drop the high bit of the first character and + * add it together with the next character. 
+ */ +static inline size_t +decodeLength(const char16_t** str) +{ + size_t len = **str; + if ((len & 0x8000) != 0) { + (*str)++; + len = ((len & 0x7FFF) << 16) | **str; + } + (*str)++; + return len; +} + +/** + * Strings in UTF-8 format have length indicated by a length encoded in the + * stored data. It is either 1 or 2 characters of length data. This allows a + * maximum length of 0x7FFF (32767 bytes), but you should consider storing + * text in another way if you're using that much data in a single string. + * + * If the high bit is set, then there are two characters or 2 bytes of length + * data encoded. In that case, drop the high bit of the first character and + * add it together with the next character. + */ +static inline size_t +decodeLength(const uint8_t** str) +{ + size_t len = **str; + if ((len & 0x80) != 0) { + (*str)++; + len = ((len & 0x7F) << 8) | **str; + } + (*str)++; + return len; +} + +const uint16_t* ResStringPool::stringAt(size_t idx, size_t* u16len) const { if (mError == NO_ERROR && idx < mHeader->stringCount) { const bool isUTF8 = (mHeader->flags&ResStringPool_header::UTF8_FLAG) != 0; @@ -461,37 +497,51 @@ const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const if (!isUTF8) { const char16_t* strings = (char16_t*)mStrings; const char16_t* str = strings+off; - DECODE_LENGTH(str, sizeof(char16_t), *outLen) - if ((uint32_t)(str+*outLen-strings) < mStringPoolSize) { + + *u16len = decodeLength(&str); + if ((uint32_t)(str+*u16len-strings) < mStringPoolSize) { return str; } else { LOGW("Bad string block: string #%d extends to %d, past end at %d\n", - (int)idx, (int)(str+*outLen-strings), (int)mStringPoolSize); + (int)idx, (int)(str+*u16len-strings), (int)mStringPoolSize); } } else { const uint8_t* strings = (uint8_t*)mStrings; - const uint8_t* str = strings+off; - DECODE_LENGTH(str, sizeof(uint8_t), *outLen) - size_t encLen; - DECODE_LENGTH(str, sizeof(uint8_t), encLen) - if ((uint32_t)(str+encLen-strings) < mStringPoolSize) { + 
const uint8_t* u8str = strings+off; + + *u16len = decodeLength(&u8str); + size_t u8len = decodeLength(&u8str); + + // encLen must be less than 0x7FFF due to encoding. + if ((uint32_t)(u8str+u8len-strings) < mStringPoolSize) { AutoMutex lock(mDecodeLock); + if (mCache[idx] != NULL) { return mCache[idx]; } - char16_t *u16str = (char16_t *)calloc(*outLen+1, sizeof(char16_t)); + + ssize_t actualLen = utf8_to_utf16_length(u8str, u8len); + if (actualLen < 0 || (size_t)actualLen != *u16len) { + LOGW("Bad string block: string #%lld decoded length is not correct " + "%lld vs %llu\n", + (long long)idx, (long long)actualLen, (long long)*u16len); + return NULL; + } + + char16_t *u16str = (char16_t *)calloc(*u16len+1, sizeof(char16_t)); if (!u16str) { LOGW("No memory when trying to allocate decode cache for string #%d\n", (int)idx); return NULL; } - const unsigned char *u8src = reinterpret_cast<const unsigned char *>(str); - utf8_to_utf16(u8src, encLen, u16str, *outLen); + + utf8_to_utf16(u8str, u8len, u16str); mCache[idx] = u16str; return u16str; } else { - LOGW("Bad string block: string #%d extends to %d, past end at %d\n", - (int)idx, (int)(str+encLen-strings), (int)mStringPoolSize); + LOGW("Bad string block: string #%lld extends to %lld, past end at %lld\n", + (long long)idx, (long long)(u8str+u8len-strings), + (long long)mStringPoolSize); } } } else { @@ -512,9 +562,8 @@ const char* ResStringPool::string8At(size_t idx, size_t* outLen) const if (isUTF8) { const uint8_t* strings = (uint8_t*)mStrings; const uint8_t* str = strings+off; - DECODE_LENGTH(str, sizeof(uint8_t), *outLen) - size_t encLen; - DECODE_LENGTH(str, sizeof(uint8_t), encLen) + *outLen = decodeLength(&str); + size_t encLen = decodeLength(&str); if ((uint32_t)(str+encLen-strings) < mStringPoolSize) { return (const char*)str; } else { diff --git a/libs/utils/String16.cpp b/libs/utils/String16.cpp index eab7b2b..4ce1664 100644 --- a/libs/utils/String16.cpp +++ b/libs/utils/String16.cpp @@ -18,228 +18,17 @@ 
#include <utils/Debug.h> #include <utils/Log.h> +#include <utils/Unicode.h> #include <utils/String8.h> #include <utils/TextOutput.h> #include <utils/threads.h> #include <private/utils/Static.h> -#ifdef HAVE_WINSOCK -# undef nhtol -# undef htonl -# undef nhtos -# undef htons - -# ifdef HAVE_LITTLE_ENDIAN -# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) -# define htonl(x) ntohl(x) -# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) -# define htons(x) ntohs(x) -# else -# define ntohl(x) (x) -# define htonl(x) (x) -# define ntohs(x) (x) -# define htons(x) (x) -# endif -#else -# include <netinet/in.h> -#endif - #include <memory.h> #include <stdio.h> #include <ctype.h> -// --------------------------------------------------------------------------- - -int strcmp16(const char16_t *s1, const char16_t *s2) -{ - char16_t ch; - int d = 0; - - while ( 1 ) { - d = (int)(ch = *s1++) - (int)*s2++; - if ( d || !ch ) - break; - } - - return d; -} - -int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) -{ - char16_t ch; - int d = 0; - - while ( n-- ) { - d = (int)(ch = *s1++) - (int)*s2++; - if ( d || !ch ) - break; - } - - return d; -} - -char16_t *strcpy16(char16_t *dst, const char16_t *src) -{ - char16_t *q = dst; - const char16_t *p = src; - char16_t ch; - - do { - *q++ = ch = *p++; - } while ( ch ); - - return dst; -} - -size_t strlen16(const char16_t *s) -{ - const char16_t *ss = s; - while ( *ss ) - ss++; - return ss-s; -} - - -char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) -{ - char16_t *q = dst; - const char16_t *p = src; - char ch; - - while (n) { - n--; - *q++ = ch = *p++; - if ( !ch ) - break; - } - - *q = 0; - - return dst; -} - -size_t strnlen16(const char16_t *s, size_t maxlen) -{ - const char16_t *ss = s; - - /* Important: the maxlen test must precede the reference through ss; - since the byte beyond the maximum may segfault */ - while ((maxlen > 0) && *ss) { - ss++; 
- maxlen--; - } - return ss-s; -} - -int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) -{ - const char16_t* e1 = s1+n1; - const char16_t* e2 = s2+n2; - - while (s1 < e1 && s2 < e2) { - const int d = (int)*s1++ - (int)*s2++; - if (d) { - return d; - } - } - - return n1 < n2 - ? (0 - (int)*s2) - : (n1 > n2 - ? ((int)*s1 - 0) - : 0); -} - -int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) -{ - const char16_t* e1 = s1H+n1; - const char16_t* e2 = s2N+n2; - - while (s1H < e1 && s2N < e2) { - const char16_t c2 = ntohs(*s2N); - const int d = (int)*s1H++ - (int)c2; - s2N++; - if (d) { - return d; - } - } - - return n1 < n2 - ? (0 - (int)ntohs(*s2N)) - : (n1 > n2 - ? ((int)*s1H - 0) - : 0); -} - -static inline size_t -utf8_char_len(uint8_t ch) -{ - return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; -} - -#define UTF8_SHIFT_AND_MASK(unicode, byte) (unicode)<<=6; (unicode) |= (0x3f & (byte)); - -static inline uint32_t -utf8_to_utf32(const uint8_t *src, size_t length) -{ - uint32_t unicode; - - switch (length) - { - case 1: - return src[0]; - case 2: - unicode = src[0] & 0x1f; - UTF8_SHIFT_AND_MASK(unicode, src[1]) - return unicode; - case 3: - unicode = src[0] & 0x0f; - UTF8_SHIFT_AND_MASK(unicode, src[1]) - UTF8_SHIFT_AND_MASK(unicode, src[2]) - return unicode; - case 4: - unicode = src[0] & 0x07; - UTF8_SHIFT_AND_MASK(unicode, src[1]) - UTF8_SHIFT_AND_MASK(unicode, src[2]) - UTF8_SHIFT_AND_MASK(unicode, src[3]) - return unicode; - default: - return 0xffff; - } - - //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); -} - -void -utf8_to_utf16(const uint8_t *src, size_t srcLen, - char16_t* dst, const size_t dstLen) -{ - const uint8_t* const end = src + srcLen; - const char16_t* const dstEnd = dst + dstLen; - while (src < end && dst < dstEnd) { - size_t len = utf8_char_len(*src); - uint32_t codepoint = utf8_to_utf32((const uint8_t*)src, len); - - // Convert the UTF32 codepoint to one or more UTF16 
codepoints - if (codepoint <= 0xFFFF) { - // Single UTF16 character - *dst++ = (char16_t) codepoint; - } else { - // Multiple UTF16 characters with surrogates - codepoint = codepoint - 0x10000; - *dst++ = (char16_t) ((codepoint >> 10) + 0xD800); - *dst++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); - } - - src += len; - } - if (dst < dstEnd) { - *dst = 0; - } -} - -// --------------------------------------------------------------------------- namespace android { @@ -270,37 +59,33 @@ void terminate_string16() // --------------------------------------------------------------------------- -static char16_t* allocFromUTF8(const char* in, size_t len) +static char16_t* allocFromUTF8(const char* u8str, size_t u8len) { - if (len == 0) return getEmptyString(); - - size_t chars = 0; - const char* end = in+len; - const char* p = in; - - while (p < end) { - chars++; - int utf8len = utf8_char_len(*p); - uint32_t codepoint = utf8_to_utf32((const uint8_t*)p, utf8len); - if (codepoint > 0xFFFF) chars++; // this will be a surrogate pair in utf16 - p += utf8len; + if (u8len == 0) return getEmptyString(); + + const uint8_t* u8cur = (const uint8_t*) u8str; + + const ssize_t u16len = utf8_to_utf16_length(u8cur, u8len); + if (u16len < 0) { + return getEmptyString(); } - - size_t bufSize = (chars+1)*sizeof(char16_t); - SharedBuffer* buf = SharedBuffer::alloc(bufSize); + + const uint8_t* const u8end = u8cur + u8len; + + SharedBuffer* buf = SharedBuffer::alloc(sizeof(char16_t)*(u16len+1)); if (buf) { - p = in; - char16_t* str = (char16_t*)buf->data(); - - utf8_to_utf16((const uint8_t*)p, len, str, bufSize); + u8cur = (const uint8_t*) u8str; + char16_t* u16str = (char16_t*)buf->data(); + + utf8_to_utf16(u8cur, u8len, u16str); //printf("Created UTF-16 string from UTF-8 \"%s\":", in); //printHexData(1, str, buf->size(), 16, 1); //printf("\n"); - return str; + return u16str; } - + return getEmptyString(); } diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp index 6358fc4..c8dc083 
100644 --- a/libs/utils/String8.cpp +++ b/libs/utils/String8.cpp @@ -17,6 +17,8 @@ #include <utils/String8.h> #include <utils/Log.h> +#include <utils/Unicode.h> +#include <utils/SharedBuffer.h> #include <utils/String16.h> #include <utils/TextOutput.h> #include <utils/threads.h> @@ -34,94 +36,10 @@ namespace android { -static const char32_t kByteMask = 0x000000BF; -static const char32_t kByteMark = 0x00000080; - -// Surrogates aren't valid for UTF-32 characters, so define some -// constants that will let us screen them out. -static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; -static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; -static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; -static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; -static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; -static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; -static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; - -// Mask used to set appropriate bits in first byte of UTF-8 sequence, -// indexed by number of bytes in the sequence. -// 0xxxxxxx -// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 -// 110yyyyx 10xxxxxx -// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 -// 1110yyyy 10yxxxxx 10xxxxxx -// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 -// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx -// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 -static const char32_t kFirstByteMark[] = { - 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 -}; - // Separator used by resource paths. This is not platform dependent contrary // to OS_PATH_SEPARATOR. #define RES_PATH_SEPARATOR '/' -// Return number of utf8 bytes required for the character. -static size_t utf32_to_utf8_bytes(char32_t srcChar) -{ - size_t bytesToWrite; - - // Figure out how many bytes the result will require. 
- if (srcChar < 0x00000080) - { - bytesToWrite = 1; - } - else if (srcChar < 0x00000800) - { - bytesToWrite = 2; - } - else if (srcChar < 0x00010000) - { - if ((srcChar < kUnicodeSurrogateStart) - || (srcChar > kUnicodeSurrogateEnd)) - { - bytesToWrite = 3; - } - else - { - // Surrogates are invalid UTF-32 characters. - return 0; - } - } - // Max code point for Unicode is 0x0010FFFF. - else if (srcChar <= kUnicodeMaxCodepoint) - { - bytesToWrite = 4; - } - else - { - // Invalid UTF-32 character. - return 0; - } - - return bytesToWrite; -} - -// Write out the source character to <dstP>. - -static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) -{ - dstP += bytes; - switch (bytes) - { /* note: everything falls through. */ - case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; - case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; - case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; - case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); - } -} - -// --------------------------------------------------------------------------- - static SharedBuffer* gEmptyStringBuf = NULL; static char* gEmptyString = NULL; @@ -175,62 +93,47 @@ static char* allocFromUTF8(const char* in, size_t len) return getEmptyString(); } -template<typename T, typename L> -static char* allocFromUTF16OrUTF32(const T* in, L len) +static char* allocFromUTF16(const char16_t* in, size_t len) { if (len == 0) return getEmptyString(); - size_t bytes = 0; - const T* end = in+len; - const T* p = in; - - while (p < end) { - bytes += utf32_to_utf8_bytes(*p); - p++; + const ssize_t bytes = utf16_to_utf8_length(in, len); + if (bytes < 0) { + return getEmptyString(); } SharedBuffer* buf = SharedBuffer::alloc(bytes+1); LOG_ASSERT(buf, "Unable to allocate shared buffer"); - if (buf) { - p = in; - char* str = (char*)buf->data(); - char* d = str; - while (p < end) { - const T c = *p++; - size_t len = 
utf32_to_utf8_bytes(c); - utf32_to_utf8((uint8_t*)d, c, len); - d += len; - } - *d = 0; - - return str; + if (!buf) { + return getEmptyString(); } - return getEmptyString(); + char* str = (char*)buf->data(); + utf16_to_utf8(in, len, str); + return str; } -static char* allocFromUTF16(const char16_t* in, size_t len) +static char* allocFromUTF32(const char32_t* in, size_t len) { - if (len == 0) return getEmptyString(); + if (len == 0) { + return getEmptyString(); + } - const size_t bytes = utf8_length_from_utf16(in, len); + const ssize_t bytes = utf32_to_utf8_length(in, len); + if (bytes < 0) { + return getEmptyString(); + } SharedBuffer* buf = SharedBuffer::alloc(bytes+1); LOG_ASSERT(buf, "Unable to allocate shared buffer"); - if (buf) { - char* str = (char*)buf->data(); - - utf16_to_utf8(in, len, str, bytes+1); - - return str; + if (!buf) { + return getEmptyString(); } - return getEmptyString(); -} + char* str = (char*) buf->data(); + utf32_to_utf8(in, len, str); -static char* allocFromUTF32(const char32_t* in, size_t len) -{ - return allocFromUTF16OrUTF32<char32_t, size_t>(in, len); + return str; } // --------------------------------------------------------------------------- @@ -510,17 +413,17 @@ void String8::toUpper(size_t start, size_t length) size_t String8::getUtf32Length() const { - return utf32_length(mString, length()); + return utf8_to_utf32_length(mString, length()); } int32_t String8::getUtf32At(size_t index, size_t *next_index) const { - return utf32_at(mString, length(), index, next_index); + return utf32_from_utf8_at(mString, length(), index, next_index); } -size_t String8::getUtf32(char32_t* dst, size_t dst_len) const +void String8::getUtf32(char32_t* dst) const { - return utf8_to_utf32(mString, length(), dst, dst_len); + utf8_to_utf32(mString, length(), dst); } TextOutput& operator<<(TextOutput& to, const String8& val) @@ -705,241 +608,3 @@ String8& String8::convertToResPath() } }; // namespace android - -// 
--------------------------------------------------------------------------- - -size_t strlen32(const char32_t *s) -{ - const char32_t *ss = s; - while ( *ss ) - ss++; - return ss-s; -} - -size_t strnlen32(const char32_t *s, size_t maxlen) -{ - const char32_t *ss = s; - while ((maxlen > 0) && *ss) { - ss++; - maxlen--; - } - return ss-s; -} - -size_t utf8_length(const char *src) -{ - const char *cur = src; - size_t ret = 0; - while (*cur != '\0') { - const char first_char = *cur++; - if ((first_char & 0x80) == 0) { // ASCII - ret += 1; - continue; - } - // (UTF-8's character must not be like 10xxxxxx, - // but 110xxxxx, 1110xxxx, ... or 1111110x) - if ((first_char & 0x40) == 0) { - return 0; - } - - int32_t mask, to_ignore_mask; - size_t num_to_read = 0; - char32_t utf32 = 0; - for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; - num_to_read < 5 && (first_char & mask); - num_to_read++, to_ignore_mask |= mask, mask >>= 1) { - if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx - return 0; - } - // 0x3F == 00111111 - utf32 = (utf32 << 6) + (*cur++ & 0x3F); - } - // "first_char" must be (110xxxxx - 11110xxx) - if (num_to_read == 5) { - return 0; - } - to_ignore_mask |= mask; - utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); - if (utf32 > android::kUnicodeMaxCodepoint) { - return 0; - } - - ret += num_to_read; - } - return ret; -} - -size_t utf32_length(const char *src, size_t src_len) -{ - if (src == NULL || src_len == 0) { - return 0; - } - size_t ret = 0; - const char* cur; - const char* end; - size_t num_to_skip; - for (cur = src, end = src + src_len, num_to_skip = 1; - cur < end; - cur += num_to_skip, ret++) { - const char first_char = *cur; - num_to_skip = 1; - if ((first_char & 0x80) == 0) { // ASCII - continue; - } - int32_t mask; - - for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { - } - } - return ret; -} - -size_t utf8_length_from_utf32(const char32_t *src, size_t src_len) -{ - if (src == NULL || src_len == 0) { - 
return 0; - } - size_t ret = 0; - const char32_t *end = src + src_len; - while (src < end) { - ret += android::utf32_to_utf8_bytes(*src++); - } - return ret; -} - -size_t utf8_length_from_utf16(const char16_t *src, size_t src_len) -{ - if (src == NULL || src_len == 0) { - return 0; - } - size_t ret = 0; - const char16_t* const end = src + src_len; - while (src < end) { - if ((*src & 0xFC00) == 0xD800 && (src + 1) < end - && (*++src & 0xFC00) == 0xDC00) { - // surrogate pairs are always 4 bytes. - ret += 4; - src++; - } else { - ret += android::utf32_to_utf8_bytes((char32_t) *src++); - } - } - return ret; -} - -static int32_t utf32_at_internal(const char* cur, size_t *num_read) -{ - const char first_char = *cur; - if ((first_char & 0x80) == 0) { // ASCII - *num_read = 1; - return *cur; - } - cur++; - char32_t mask, to_ignore_mask; - size_t num_to_read = 0; - char32_t utf32 = first_char; - for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; - (first_char & mask); - num_to_read++, to_ignore_mask |= mask, mask >>= 1) { - // 0x3F == 00111111 - utf32 = (utf32 << 6) + (*cur++ & 0x3F); - } - to_ignore_mask |= mask; - utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); - - *num_read = num_to_read; - return static_cast<int32_t>(utf32); -} - -int32_t utf32_at(const char *src, size_t src_len, - size_t index, size_t *next_index) -{ - if (index >= src_len) { - return -1; - } - size_t dummy_index; - if (next_index == NULL) { - next_index = &dummy_index; - } - size_t num_read; - int32_t ret = utf32_at_internal(src + index, &num_read); - if (ret >= 0) { - *next_index = index + num_read; - } - - return ret; -} - -size_t utf8_to_utf32(const char* src, size_t src_len, - char32_t* dst, size_t dst_len) -{ - if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { - return 0; - } - - const char* cur = src; - const char* end = src + src_len; - char32_t* cur_utf32 = dst; - const char32_t* end_utf32 = dst + dst_len; - while (cur_utf32 < end_utf32 && cur < end) { - 
size_t num_read; - *cur_utf32++ = - static_cast<char32_t>(utf32_at_internal(cur, &num_read)); - cur += num_read; - } - if (cur_utf32 < end_utf32) { - *cur_utf32 = 0; - } - return static_cast<size_t>(cur_utf32 - dst); -} - -size_t utf32_to_utf8(const char32_t* src, size_t src_len, - char* dst, size_t dst_len) -{ - if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { - return 0; - } - const char32_t *cur_utf32 = src; - const char32_t *end_utf32 = src + src_len; - char *cur = dst; - const char *end = dst + dst_len; - while (cur_utf32 < end_utf32 && cur < end) { - size_t len = android::utf32_to_utf8_bytes(*cur_utf32); - android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len); - cur += len; - } - if (cur < end) { - *cur = '\0'; - } - return cur - dst; -} - -size_t utf16_to_utf8(const char16_t* src, size_t src_len, - char* dst, size_t dst_len) -{ - if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { - return 0; - } - const char16_t* cur_utf16 = src; - const char16_t* const end_utf16 = src + src_len; - char *cur = dst; - const char* const end = dst + dst_len; - while (cur_utf16 < end_utf16 && cur < end) { - char32_t utf32; - // surrogate pairs - if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) { - utf32 = (*cur_utf16++ - 0xD800) << 10; - utf32 |= *cur_utf16++ - 0xDC00; - utf32 += 0x10000; - } else { - utf32 = (char32_t) *cur_utf16++; - } - size_t len = android::utf32_to_utf8_bytes(utf32); - android::utf32_to_utf8((uint8_t*)cur, utf32, len); - cur += len; - } - if (cur < end) { - *cur = '\0'; - } - return cur - dst; -} diff --git a/libs/utils/Unicode.cpp b/libs/utils/Unicode.cpp new file mode 100644 index 0000000..78c61b4 --- /dev/null +++ b/libs/utils/Unicode.cpp @@ -0,0 +1,575 @@ +/* + * Copyright (C) 2005 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <utils/Unicode.h> + +#include <stddef.h> + +#ifdef HAVE_WINSOCK +# undef nhtol +# undef htonl +# undef nhtos +# undef htons + +# ifdef HAVE_LITTLE_ENDIAN +# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) +# define htonl(x) ntohl(x) +# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) +# define htons(x) ntohs(x) +# else +# define ntohl(x) (x) +# define htonl(x) (x) +# define ntohs(x) (x) +# define htons(x) (x) +# endif +#else +# include <netinet/in.h> +#endif + +extern "C" { + +static const char32_t kByteMask = 0x000000BF; +static const char32_t kByteMark = 0x00000080; + +// Surrogates aren't valid for UTF-32 characters, so define some +// constants that will let us screen them out. +static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; +static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; +static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; +static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; +static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; +static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; +static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; + +// Mask used to set appropriate bits in first byte of UTF-8 sequence, +// indexed by number of bytes in the sequence. +// 0xxxxxxx +// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 +// 110yyyyx 10xxxxxx +// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 +// 1110yyyy 10yxxxxx 10xxxxxx +// -> (e0-ef)(80-bf)(80-bf) 16bit. 
Bit mask is 0x000000E0 +// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx +// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 +static const char32_t kFirstByteMark[] = { + 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 +}; + +// -------------------------------------------------------------------------- +// UTF-32 +// -------------------------------------------------------------------------- + +/** + * Return number of UTF-8 bytes required for the character. If the character + * is invalid, return size of 0. + */ +static inline size_t utf32_codepoint_utf8_length(char32_t srcChar) +{ + // Figure out how many bytes the result will require. + if (srcChar < 0x00000080) { + return 1; + } else if (srcChar < 0x00000800) { + return 2; + } else if (srcChar < 0x00010000) { + if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) { + return 3; + } else { + // Surrogates are invalid UTF-32 characters. + return 0; + } + } + // Max code point for Unicode is 0x0010FFFF. + else if (srcChar <= kUnicodeMaxCodepoint) { + return 4; + } else { + // Invalid UTF-32 character. + return 0; + } +} + +// Write out the source character to <dstP>. + +static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) +{ + dstP += bytes; + switch (bytes) + { /* note: everything falls through. 
*/ + case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; + case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; + case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; + case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); + } +} + +size_t strlen32(const char32_t *s) +{ + const char32_t *ss = s; + while ( *ss ) + ss++; + return ss-s; +} + +size_t strnlen32(const char32_t *s, size_t maxlen) +{ + const char32_t *ss = s; + while ((maxlen > 0) && *ss) { + ss++; + maxlen--; + } + return ss-s; +} + +static inline int32_t utf32_at_internal(const char* cur, size_t *num_read) +{ + const char first_char = *cur; + if ((first_char & 0x80) == 0) { // ASCII + *num_read = 1; + return *cur; + } + cur++; + char32_t mask, to_ignore_mask; + size_t num_to_read = 0; + char32_t utf32 = first_char; + for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; + (first_char & mask); + num_to_read++, to_ignore_mask |= mask, mask >>= 1) { + // 0x3F == 00111111 + utf32 = (utf32 << 6) + (*cur++ & 0x3F); + } + to_ignore_mask |= mask; + utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); + + *num_read = num_to_read; + return static_cast<int32_t>(utf32); +} + +int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) +{ + if (index >= src_len) { + return -1; + } + size_t dummy_index; + if (next_index == NULL) { + next_index = &dummy_index; + } + size_t num_read; + int32_t ret = utf32_at_internal(src + index, &num_read); + if (ret >= 0) { + *next_index = index + num_read; + } + + return ret; +} + +ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) +{ + if (src == NULL || src_len == 0) { + return -1; + } + + size_t ret = 0; + const char32_t *end = src + src_len; + while (src < end) { + ret += utf32_codepoint_utf8_length(*src++); + } + return ret; +} + +void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst) +{ + if (src == NULL || src_len == 0 
|| dst == NULL) { + return; + } + + const char32_t *cur_utf32 = src; + const char32_t *end_utf32 = src + src_len; + char *cur = dst; + while (cur_utf32 < end_utf32) { + size_t len = utf32_codepoint_utf8_length(*cur_utf32); + utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); + cur += len; + } + *cur = '\0'; +} + +// -------------------------------------------------------------------------- +// UTF-16 +// -------------------------------------------------------------------------- + +int strcmp16(const char16_t *s1, const char16_t *s2) +{ + char16_t ch; + int d = 0; + + while ( 1 ) { + d = (int)(ch = *s1++) - (int)*s2++; + if ( d || !ch ) + break; + } + + return d; +} + +int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) +{ + char16_t ch; + int d = 0; + + while ( n-- ) { + d = (int)(ch = *s1++) - (int)*s2++; + if ( d || !ch ) + break; + } + + return d; +} + +char16_t *strcpy16(char16_t *dst, const char16_t *src) +{ + char16_t *q = dst; + const char16_t *p = src; + char16_t ch; + + do { + *q++ = ch = *p++; + } while ( ch ); + + return dst; +} + +size_t strlen16(const char16_t *s) +{ + const char16_t *ss = s; + while ( *ss ) + ss++; + return ss-s; +} + + +char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) +{ + char16_t *q = dst; + const char16_t *p = src; + char ch; + + while (n) { + n--; + *q++ = ch = *p++; + if ( !ch ) + break; + } + + *q = 0; + + return dst; +} + +size_t strnlen16(const char16_t *s, size_t maxlen) +{ + const char16_t *ss = s; + + /* Important: the maxlen test must precede the reference through ss; + since the byte beyond the maximum may segfault */ + while ((maxlen > 0) && *ss) { + ss++; + maxlen--; + } + return ss-s; +} + +int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) +{ + const char16_t* e1 = s1+n1; + const char16_t* e2 = s2+n2; + + while (s1 < e1 && s2 < e2) { + const int d = (int)*s1++ - (int)*s2++; + if (d) { + return d; + } + } + + return n1 < n2 + ? 
(0 - (int)*s2) + : (n1 > n2 + ? ((int)*s1 - 0) + : 0); +} + +int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) +{ + const char16_t* e1 = s1H+n1; + const char16_t* e2 = s2N+n2; + + while (s1H < e1 && s2N < e2) { + const char16_t c2 = ntohs(*s2N); + const int d = (int)*s1H++ - (int)c2; + s2N++; + if (d) { + return d; + } + } + + return n1 < n2 + ? (0 - (int)ntohs(*s2N)) + : (n1 > n2 + ? ((int)*s1H - 0) + : 0); +} + +void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst) +{ + if (src == NULL || src_len == 0 || dst == NULL) { + return; + } + + const char16_t* cur_utf16 = src; + const char16_t* const end_utf16 = src + src_len; + char *cur = dst; + while (cur_utf16 < end_utf16) { + char32_t utf32; + // surrogate pairs + if ((*cur_utf16 & 0xFC00) == 0xD800) { + utf32 = (*cur_utf16++ - 0xD800) << 10; + utf32 |= *cur_utf16++ - 0xDC00; + utf32 += 0x10000; + } else { + utf32 = (char32_t) *cur_utf16++; + } + const size_t len = utf32_codepoint_utf8_length(utf32); + utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); + cur += len; + } + *cur = '\0'; +} + +// -------------------------------------------------------------------------- +// UTF-8 +// -------------------------------------------------------------------------- + +ssize_t utf8_length(const char *src) +{ + const char *cur = src; + size_t ret = 0; + while (*cur != '\0') { + const char first_char = *cur++; + if ((first_char & 0x80) == 0) { // ASCII + ret += 1; + continue; + } + // (UTF-8's character must not be like 10xxxxxx, + // but 110xxxxx, 1110xxxx, ... 
or 1111110x) + if ((first_char & 0x40) == 0) { + return -1; + } + + int32_t mask, to_ignore_mask; + size_t num_to_read = 0; + char32_t utf32 = 0; + for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; + num_to_read < 5 && (first_char & mask); + num_to_read++, to_ignore_mask |= mask, mask >>= 1) { + if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx + return -1; + } + // 0x3F == 00111111 + utf32 = (utf32 << 6) + (*cur++ & 0x3F); + } + // "first_char" must be (110xxxxx - 11110xxx) + if (num_to_read == 5) { + return -1; + } + to_ignore_mask |= mask; + utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); + if (utf32 > kUnicodeMaxCodepoint) { + return -1; + } + + ret += num_to_read; + } + return ret; +} + +ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) +{ + if (src == NULL || src_len == 0) { + return -1; + } + + size_t ret = 0; + const char16_t* const end = src + src_len; + while (src < end) { + if ((*src & 0xFC00) == 0xD800 && (src + 1) < end + && (*++src & 0xFC00) == 0xDC00) { + // surrogate pairs are always 4 bytes. + ret += 4; + src++; + } else { + ret += utf32_codepoint_utf8_length((char32_t) *src++); + } + } + return ret; +} + +/** + * Returns 1-4 based on the number of leading bits. 
+ * + * 1111 -> 4 + * 1110 -> 3 + * 110x -> 2 + * 10xx -> 1 + * 0xxx -> 1 + */ +static inline size_t utf8_codepoint_len(uint8_t ch) +{ + return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; +} + +static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) +{ + *codePoint <<= 6; + *codePoint |= 0x3F & byte; +} + +size_t utf8_to_utf32_length(const char *src, size_t src_len) +{ + if (src == NULL || src_len == 0) { + return 0; + } + size_t ret = 0; + const char* cur; + const char* end; + size_t num_to_skip; + for (cur = src, end = src + src_len, num_to_skip = 1; + cur < end; + cur += num_to_skip, ret++) { + const char first_char = *cur; + num_to_skip = 1; + if ((first_char & 0x80) == 0) { // ASCII + continue; + } + int32_t mask; + + for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { + } + } + return ret; +} + +void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) +{ + if (src == NULL || src_len == 0 || dst == NULL) { + return; + } + + const char* cur = src; + const char* const end = src + src_len; + char32_t* cur_utf32 = dst; + while (cur < end) { + size_t num_read; + *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); + cur += num_read; + } + *cur_utf32 = 0; +} + +static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) +{ + uint32_t unicode; + + switch (length) + { + case 1: + return src[0]; + case 2: + unicode = src[0] & 0x1f; + utf8_shift_and_mask(&unicode, src[1]); + return unicode; + case 3: + unicode = src[0] & 0x0f; + utf8_shift_and_mask(&unicode, src[1]); + utf8_shift_and_mask(&unicode, src[2]); + return unicode; + case 4: + unicode = src[0] & 0x07; + utf8_shift_and_mask(&unicode, src[1]); + utf8_shift_and_mask(&unicode, src[2]); + utf8_shift_and_mask(&unicode, src[3]); + return unicode; + default: + return 0xffff; + } + + //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); +} + +ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len) 
+{ + const uint8_t* const u8end = u8str + u8len; + const uint8_t* u8cur = u8str; + + /* Validate that the UTF-8 is the correct len */ + size_t u16measuredLen = 0; + while (u8cur < u8end) { + u16measuredLen++; + int u8charLen = utf8_codepoint_len(*u8cur); + uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); + if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 + u8cur += u8charLen; + } + + /** + * Make sure that we ended where we thought we would and the output UTF-16 + * will be exactly how long we were told it would be. + */ + if (u8cur != u8end) { + return -1; + } + + return u16measuredLen; +} + +/** + * Convert a UTF-8 string to UTF-16. The destination UTF-16 buffer must have + * space for NULL at the end. + */ +void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) +{ + const uint8_t* const u8end = u8str + u8len; + const uint8_t* u8cur = u8str; + char16_t* u16cur = u16str; + + while (u8cur < u8end) { + size_t u8len = utf8_codepoint_len(*u8cur); + uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); + + // Convert the UTF32 codepoint to one or more UTF16 codepoints + if (codepoint <= 0xFFFF) { + // Single UTF16 character + *u16cur++ = (char16_t) codepoint; + } else { + // Multiple UTF16 characters with surrogates + codepoint = codepoint - 0x10000; + *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); + *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); + } + + u8cur += u8len; + } + *u16cur = 0; +} + +} diff --git a/libs/utils/tests/Android.mk b/libs/utils/tests/Android.mk index 00077ee..72d4876 100644 --- a/libs/utils/tests/Android.mk +++ b/libs/utils/tests/Android.mk @@ -8,7 +8,8 @@ ifneq ($(TARGET_SIMULATOR),true) test_src_files := \ ObbFile_test.cpp \ Looper_test.cpp \ - String8_test.cpp + String8_test.cpp \ + Unicode_test.cpp shared_libraries := \ libz \ diff --git a/libs/utils/tests/Unicode_test.cpp b/libs/utils/tests/Unicode_test.cpp new file mode 100644 index 0000000..18c130c --- 
/dev/null +++ b/libs/utils/tests/Unicode_test.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "Unicode_test" +#include <utils/Log.h> +#include <utils/Unicode.h> + +#include <gtest/gtest.h> + +namespace android { + +class UnicodeTest : public testing::Test { +protected: + virtual void SetUp() { + } + + virtual void TearDown() { + } +}; + +TEST_F(UnicodeTest, UTF8toUTF16ZeroLength) { + ssize_t measured; + + const uint8_t str[] = { }; + + measured = utf8_to_utf16_length(str, 0); + EXPECT_EQ(0, measured) + << "Zero length input should return zero length output."; +} + +TEST_F(UnicodeTest, UTF8toUTF16ASCIILength) { + ssize_t measured; + + // U+0030 or ASCII '0' + const uint8_t str[] = { 0x30 }; + + measured = utf8_to_utf16_length(str, sizeof(str)); + EXPECT_EQ(1, measured) + << "ASCII glyphs should have a length of 1 char16_t"; +} + +TEST_F(UnicodeTest, UTF8toUTF16Plane1Length) { + ssize_t measured; + + // U+2323 SMILE + const uint8_t str[] = { 0xE2, 0x8C, 0xA3 }; + + measured = utf8_to_utf16_length(str, sizeof(str)); + EXPECT_EQ(1, measured) + << "Plane 1 glyphs should have a length of 1 char16_t"; +} + +TEST_F(UnicodeTest, UTF8toUTF16SurrogateLength) { + ssize_t measured; + + // U+10000 + const uint8_t str[] = { 0xF0, 0x90, 0x80, 0x80 }; + + measured = utf8_to_utf16_length(str, sizeof(str)); + EXPECT_EQ(2, measured) + << "Surrogate pairs should have a 
length of 2 char16_t"; +} + +TEST_F(UnicodeTest, UTF8toUTF16TruncatedUTF8) { + ssize_t measured; + + // Truncated U+2323 SMILE + // U+2323 SMILE + const uint8_t str[] = { 0xE2, 0x8C }; + + measured = utf8_to_utf16_length(str, sizeof(str)); + EXPECT_EQ(-1, measured) + << "Truncated UTF-8 should return -1 to indicate invalid"; +} + +TEST_F(UnicodeTest, UTF8toUTF16Normal) { + const uint8_t str[] = { + 0x30, // U+0030, 1 UTF-16 character + 0xC4, 0x80, // U+0100, 1 UTF-16 character + 0xE2, 0x8C, 0xA3, // U+2323, 1 UTF-16 character + 0xF0, 0x90, 0x80, 0x80, // U+10000, 2 UTF-16 character + }; + + char16_t output[1 + 1 + 1 + 2 + 1]; // Room for NULL + + utf8_to_utf16(str, sizeof(str), output); + + EXPECT_EQ(0x0030, output[0]) + << "should be U+0030"; + EXPECT_EQ(0x0100, output[1]) + << "should be U+0100"; + EXPECT_EQ(0x2323, output[2]) + << "should be U+2323"; + EXPECT_EQ(0xD800, output[3]) + << "should be first half of surrogate U+10000"; + EXPECT_EQ(0xDC00, output[4]) + << "should be second half of surrogate U+10000"; + EXPECT_EQ(NULL, output[5]) + << "should be NULL terminated"; +} + +} |