summaryrefslogtreecommitdiffstats
path: root/media/libmedia/MediaScannerClient.cpp
diff options
context:
space:
mode:
authorMarco Nelissen <marcone@google.com>2013-11-13 14:18:21 -0800
committerMarco Nelissen <marcone@google.com>2013-12-11 10:28:44 -0800
commit544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch)
tree1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libmedia/MediaScannerClient.cpp
parent35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff)
downloadframeworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2
Better character set encoding detection
Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't. To better detect the real encoding we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
Diffstat (limited to 'media/libmedia/MediaScannerClient.cpp')
-rw-r--r--media/libmedia/MediaScannerClient.cpp202
1 files changed, 21 insertions, 181 deletions
diff --git a/media/libmedia/MediaScannerClient.cpp b/media/libmedia/MediaScannerClient.cpp
index 93a4a4c..1661f04 100644
--- a/media/libmedia/MediaScannerClient.cpp
+++ b/media/libmedia/MediaScannerClient.cpp
@@ -14,217 +14,57 @@
* limitations under the License.
*/
+//#define LOG_NDEBUG 0
+#define LOG_TAG "MediaScannerClient"
+#include <utils/Log.h>
+
#include <media/mediascanner.h>
+#include "CharacterEncodingDetector.h"
#include "StringArray.h"
-#include "autodetect.h"
-#include "unicode/ucnv.h"
-#include "unicode/ustring.h"
-
namespace android {
MediaScannerClient::MediaScannerClient()
- : mNames(NULL),
- mValues(NULL),
- mLocaleEncoding(kEncodingNone)
+ : mEncodingDetector(NULL)
{
}
MediaScannerClient::~MediaScannerClient()
{
- delete mNames;
- delete mValues;
+ delete mEncodingDetector;
}
void MediaScannerClient::setLocale(const char* locale)
{
- if (!locale) return;
-
- if (!strncmp(locale, "ja", 2))
- mLocaleEncoding = kEncodingShiftJIS;
- else if (!strncmp(locale, "ko", 2))
- mLocaleEncoding = kEncodingEUCKR;
- else if (!strncmp(locale, "zh", 2)) {
- if (!strcmp(locale, "zh_CN")) {
- // simplified chinese for mainland China
- mLocaleEncoding = kEncodingGBK;
- } else {
- // assume traditional for non-mainland Chinese locales (Taiwan, Hong Kong, Singapore)
- mLocaleEncoding = kEncodingBig5;
- }
- }
+ mLocale = locale; // not currently used
}
void MediaScannerClient::beginFile()
{
- mNames = new StringArray;
- mValues = new StringArray;
+ delete mEncodingDetector;
+ mEncodingDetector = new CharacterEncodingDetector();
}
status_t MediaScannerClient::addStringTag(const char* name, const char* value)
{
- if (mLocaleEncoding != kEncodingNone) {
- // don't bother caching strings that are all ASCII.
- // call handleStringTag directly instead.
- // check to see if value (which should be utf8) has any non-ASCII characters
- bool nonAscii = false;
- const char* chp = value;
- char ch;
- while ((ch = *chp++)) {
- if (ch & 0x80) {
- nonAscii = true;
- break;
- }
- }
-
- if (nonAscii) {
- // save the strings for later so they can be used for native encoding detection
- mNames->push_back(name);
- mValues->push_back(value);
- return OK;
- }
- // else fall through
- }
-
- // autodetection is not necessary, so no need to cache the values
- // pass directly to the client instead
- return handleStringTag(name, value);
-}
-
-static uint32_t possibleEncodings(const char* s)
-{
- uint32_t result = kEncodingAll;
- // if s contains a native encoding, then it was mistakenly encoded in utf8 as if it were latin-1
- // so we need to reverse the latin-1 -> utf8 conversion to get the native chars back
- uint8_t ch1, ch2;
- uint8_t* chp = (uint8_t *)s;
-
- while ((ch1 = *chp++)) {
- if (ch1 & 0x80) {
- ch2 = *chp++;
- ch1 = ((ch1 << 6) & 0xC0) | (ch2 & 0x3F);
- // ch1 is now the first byte of the potential native char
-
- ch2 = *chp++;
- if (ch2 & 0x80)
- ch2 = ((ch2 << 6) & 0xC0) | (*chp++ & 0x3F);
- // ch2 is now the second byte of the potential native char
- int ch = (int)ch1 << 8 | (int)ch2;
- result &= findPossibleEncodings(ch);
- }
- // else ASCII character, which could be anything
- }
-
- return result;
-}
-
-void MediaScannerClient::convertValues(uint32_t encoding)
-{
- const char* enc = NULL;
- switch (encoding) {
- case kEncodingShiftJIS:
- enc = "shift-jis";
- break;
- case kEncodingGBK:
- enc = "gbk";
- break;
- case kEncodingBig5:
- enc = "Big5";
- break;
- case kEncodingEUCKR:
- enc = "EUC-KR";
- break;
- }
-
- if (enc) {
- UErrorCode status = U_ZERO_ERROR;
-
- UConverter *conv = ucnv_open(enc, &status);
- if (U_FAILURE(status)) {
- ALOGE("could not create UConverter for %s", enc);
- return;
- }
- UConverter *utf8Conv = ucnv_open("UTF-8", &status);
- if (U_FAILURE(status)) {
- ALOGE("could not create UConverter for UTF-8");
- ucnv_close(conv);
- return;
- }
-
- // for each value string, convert from native encoding to UTF-8
- for (int i = 0; i < mNames->size(); i++) {
- // first we need to untangle the utf8 and convert it back to the original bytes
- // since we are reducing the length of the string, we can do this in place
- uint8_t* src = (uint8_t *)mValues->getEntry(i);
- int len = strlen((char *)src);
- uint8_t* dest = src;
-
- uint8_t uch;
- while ((uch = *src++)) {
- if (uch & 0x80)
- *dest++ = ((uch << 6) & 0xC0) | (*src++ & 0x3F);
- else
- *dest++ = uch;
- }
- *dest = 0;
-
- // now convert from native encoding to UTF-8
- const char* source = mValues->getEntry(i);
- int targetLength = len * 3 + 1;
- char* buffer = new char[targetLength];
- // don't normally check for NULL, but in this case targetLength may be large
- if (!buffer)
- break;
- char* target = buffer;
-
- ucnv_convertEx(utf8Conv, conv, &target, target + targetLength,
- &source, (const char *)dest, NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
- if (U_FAILURE(status)) {
- ALOGE("ucnv_convertEx failed: %d", status);
- mValues->setEntry(i, "???");
- } else {
- // zero terminate
- *target = 0;
- mValues->setEntry(i, buffer);
- }
-
- delete[] buffer;
- }
-
- ucnv_close(conv);
- ucnv_close(utf8Conv);
- }
+ mEncodingDetector->addTag(name, value);
+ return OK;
}
void MediaScannerClient::endFile()
{
- if (mLocaleEncoding != kEncodingNone) {
- int size = mNames->size();
- uint32_t encoding = kEncodingAll;
-
- // compute a bit mask containing all possible encodings
- for (int i = 0; i < mNames->size(); i++)
- encoding &= possibleEncodings(mValues->getEntry(i));
-
- // if the locale encoding matches, then assume we have a native encoding.
- if (encoding & mLocaleEncoding)
- convertValues(mLocaleEncoding);
-
- // finally, push all name/value pairs to the client
- for (int i = 0; i < mNames->size(); i++) {
- status_t status = handleStringTag(mNames->getEntry(i), mValues->getEntry(i));
- if (status) {
- break;
- }
+ mEncodingDetector->detectAndConvert();
+
+ int size = mEncodingDetector->size();
+ if (size) {
+ for (int i = 0; i < size; i++) {
+ const char *name;
+ const char *value;
+ mEncodingDetector->getTag(i, &name, &value);
+ handleStringTag(name, value);
}
}
- // else addStringTag() has done all the work so we have nothing to do
-
- delete mNames;
- delete mValues;
- mNames = NULL;
- mValues = NULL;
}
} // namespace android