summaryrefslogtreecommitdiffstats
path: root/media/libmedia/CharacterEncodingDetector.cpp
diff options
context:
space:
mode:
authorMarco Nelissen <marcone@google.com>2013-11-13 14:18:21 -0800
committerMarco Nelissen <marcone@google.com>2013-12-11 10:28:44 -0800
commit544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch)
tree1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libmedia/CharacterEncodingDetector.cpp
parent35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff)
downloadframeworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2
Better character set encoding detection
Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't. To better detect the real encoding we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
Diffstat (limited to 'media/libmedia/CharacterEncodingDetector.cpp')
-rw-r--r--media/libmedia/CharacterEncodingDetector.cpp364
1 files changed, 364 insertions, 0 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp
new file mode 100644
index 0000000..eb091ac
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//#define LOG_NDEBUG 0
+#define LOG_TAG "CharacterEncodingDector"
+#include <utils/Log.h>
+
+#include "CharacterEncodingDetector.h"
+#include "CharacterEncodingDetectorTables.h"
+
+#include "utils/Vector.h"
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+CharacterEncodingDetector::CharacterEncodingDetector() {
+
+ UErrorCode status = U_ZERO_ERROR;
+ mUtf8Conv = ucnv_open("UTF-8", &status);
+ if (U_FAILURE(status)) {
+ ALOGE("could not create UConverter for UTF-8");
+ mUtf8Conv = NULL;
+ }
+}
+
+CharacterEncodingDetector::~CharacterEncodingDetector() {
+ ucnv_close(mUtf8Conv);
+}
+
+void CharacterEncodingDetector::addTag(const char *name, const char *value) {
+ mNames.push_back(name);
+ mValues.push_back(value);
+}
+
+size_t CharacterEncodingDetector::size() {
+ return mNames.size();
+}
+
+status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
+ if (index >= mNames.size()) {
+ return BAD_VALUE;
+ }
+
+ *name = mNames.getEntry(index);
+ *value = mValues.getEntry(index);
+ return OK;
+}
+
+static bool isPrintableAscii(const char *value, size_t len) {
+ for (size_t i = 0; i < len; i++) {
+ if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void CharacterEncodingDetector::detectAndConvert() {
+
+ int size = mNames.size();
+ ALOGV("%d tags before conversion", size);
+ for (int i = 0; i < size; i++) {
+ ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
+ }
+
+ if (size && mUtf8Conv) {
+
+ UErrorCode status = U_ZERO_ERROR;
+ UCharsetDetector *csd = ucsdet_open(&status);
+ const UCharsetMatch *ucm;
+
+ // try combined detection of artist/album/title etc.
+ char buf[1024];
+ buf[0] = 0;
+ int idx;
+ for (int i = 0; i < size; i++) {
+ const char *name = mNames.getEntry(i);
+ const char *value = mValues.getEntry(i);
+ if (!isPrintableAscii(value, strlen(value)) && (
+ !strcmp(name, "artist") ||
+ !strcmp(name, "albumartist") ||
+ !strcmp(name, "composer") ||
+ !strcmp(name, "genre") ||
+ !strcmp(name, "album") ||
+ !strcmp(name, "title"))) {
+ strlcat(buf, value, sizeof(buf));
+ // separate tags by space so ICU's ngram detector can do its job
+ strlcat(buf, " ", sizeof(buf));
+ }
+ }
+ ucsdet_setText(csd, buf, strlen(buf), &status);
+
+ int32_t matches;
+ const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
+ const char *combinedenc = "???";
+
+ const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches);
+
+ if (bestCombinedMatch != NULL) {
+ combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+ }
+
+ for (int i = 0; i < size; i++) {
+ const char *name = mNames.getEntry(i);
+ uint8_t* src = (uint8_t *)mValues.getEntry(i);
+ int len = strlen((char *)src);
+ uint8_t* dest = src;
+
+ ALOGV("@@@ checking %s", name);
+ const char *s = mValues.getEntry(i);
+ int32_t inputLength = strlen(s);
+ const char *enc;
+
+ if (!strcmp(name, "artist") ||
+ !strcmp(name, "albumartist") ||
+ !strcmp(name, "composer") ||
+ !strcmp(name, "genre") ||
+ !strcmp(name, "album") ||
+ !strcmp(name, "title")) {
+ // use encoding determined from the combination of artist/album/title etc.
+ enc = combinedenc;
+ } else {
+ ucsdet_setText(csd, s, inputLength, &status);
+ ucm = ucsdet_detect(csd, &status);
+ if (!ucm) {
+ mValues.setEntry(i, "???");
+ continue;
+ }
+ enc = ucsdet_getName(ucm, &status);
+ ALOGV("@@@@ recognized charset: %s for %s confidence %d",
+ enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
+ }
+
+ if (strcmp(enc,"UTF-8") != 0) {
+ // only convert if the source encoding isn't already UTF-8
+ ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
+ UConverter *conv = ucnv_open(enc, &status);
+ if (U_FAILURE(status)) {
+ ALOGE("could not create UConverter for %s", enc);
+ continue;
+ }
+
+ // convert from native encoding to UTF-8
+ const char* source = mValues.getEntry(i);
+ int targetLength = len * 3 + 1;
+ char* buffer = new char[targetLength];
+ // don't normally check for NULL, but in this case targetLength may be large
+ if (!buffer)
+ break;
+ char* target = buffer;
+
+ ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
+ &source, source + strlen(source),
+ NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
+
+ if (U_FAILURE(status)) {
+ ALOGE("ucnv_convertEx failed: %d", status);
+ mValues.setEntry(i, "???");
+ } else {
+ // zero terminate
+ *target = 0;
+ mValues.setEntry(i, buffer);
+ }
+
+ delete[] buffer;
+
+ ucnv_close(conv);
+ }
+ }
+
+ for (int i = size - 1; i >= 0; --i) {
+ if (strlen(mValues.getEntry(i)) == 0) {
+ ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
+ mNames.erase(i);
+ mValues.erase(i);
+ }
+ }
+
+ ucsdet_close(csd);
+ }
+}
+
+/*
+ * When ICU detects multiple encoding matches, apply additional heuristics to determine
+ * which one is the best match, since ICU can't always be trusted to make the right choice.
+ *
+ * What this method does is:
+ * - decode the input using each of the matches found
+ * - recalculate the starting confidence level for multibyte encodings using a different
+ * algorithm and larger frequent character lists than ICU
+ * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
+ * - pick the highest match
+ */
+const UCharsetMatch *CharacterEncodingDetector::getPreferred(
+ const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) {
+
+ Vector<const UCharsetMatch*> matches;
+ UErrorCode status = U_ZERO_ERROR;
+
+ ALOGV("%d matches", nummatches);
+ for (size_t i = 0; i < nummatches; i++) {
+ const char *encname = ucsdet_getName(ucma[i], &status);
+ int confidence = ucsdet_getConfidence(ucma[i], &status);
+ ALOGV("%d: %s %d", i, encname, confidence);
+ matches.push_back(ucma[i]);
+ }
+
+ size_t num = matches.size();
+ if (num == 0) {
+ return NULL;
+ }
+ if (num == 1) {
+ return matches[0];
+ }
+
+ ALOGV("considering %d matches", num);
+
+ // keep track of how many "special" characters result when converting the input using each
+ // encoding
+ Vector<int> newconfidence;
+ for (size_t i = 0; i < num; i++) {
+ const uint16_t *freqdata = NULL;
+ float freqcoverage = 0;
+ status = U_ZERO_ERROR;
+ const char *encname = ucsdet_getName(matches[i], &status);
+ int confidence = ucsdet_getConfidence(matches[i], &status);
+ if (!strcmp("GB18030", encname)) {
+ freqdata = frequent_zhCN;
+ freqcoverage = frequent_zhCN_coverage;
+ } else if (!strcmp("Big5", encname)) {
+ freqdata = frequent_zhTW;
+ freqcoverage = frequent_zhTW_coverage;
+ } else if (!strcmp("EUC-KR", encname)) {
+ freqdata = frequent_ko;
+ freqcoverage = frequent_ko_coverage;
+ } else if (!strcmp("EUC-JP", encname)) {
+ freqdata = frequent_ja;
+ freqcoverage = frequent_ja_coverage;
+ } else if (!strcmp("Shift_JIS", encname)) {
+ freqdata = frequent_ja;
+ freqcoverage = frequent_ja_coverage;
+ }
+
+ ALOGV("%d: %s %d", i, encname, confidence);
+ UConverter *conv = ucnv_open(encname, &status);
+ const char *source = input;
+ const char *sourceLimit = input + len;
+ status = U_ZERO_ERROR;
+ int demerit = 0;
+ int frequentchars = 0;
+ int totalchars = 0;
+ while (true) {
+ // demerit the current encoding for each "special" character found after conversion.
+ // The amount of demerit is somewhat arbitrarily chosen.
+ int inchar;
+ if (source != sourceLimit) {
+ inchar = (source[0] << 8) + source[1];
+ }
+ UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
+ if (!U_SUCCESS(status)) {
+ break;
+ }
+ if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
+ ALOGV("control character %x", c);
+ demerit += 100;
+ } else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts
+ || (c == 0xd7) || (c == 0xf7) // multiplication and division signs
+ || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts
+ ALOGV("unlikely character %x", c);
+ demerit += 10;
+ } else if (c >= 0xe000 && c <= 0xf8ff) {
+ ALOGV("private use character %x", c);
+ demerit += 30;
+ } else if (c >= 0x2190 && c <= 0x2bff) {
+ // this range comprises various symbol ranges that are unlikely to appear in
+ // music file metadata.
+ ALOGV("symbol %x", c);
+ demerit += 10;
+ } else if (c == 0xfffd) {
+ ALOGV("replacement character");
+ demerit += 50;
+ } else if (c >= 0xfff0 && c <= 0xfffc) {
+ ALOGV("unicode special %x", c);
+ demerit += 50;
+ } else if (freqdata != NULL) {
+ totalchars++;
+ if (isFrequent(freqdata, c)) {
+ frequentchars++;
+ }
+ }
+ }
+ if (freqdata != NULL && totalchars != 0) {
+ int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
+ ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
+ totalchars, frequentchars);
+ if (myconfidence > 100) myconfidence = 100;
+ if (myconfidence < 0) myconfidence = 0;
+ confidence = myconfidence;
+ }
+ ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
+ newconfidence.push_back(confidence - demerit);
+ ucnv_close(conv);
+ if (i == 0 && (confidence - demerit) == 100) {
+ // no need to check any further, we'll end up using this match anyway
+ break;
+ }
+ }
+
+ // find match with highest confidence after adjusting for unlikely characters
+ int highest = newconfidence[0];
+ size_t highestidx = 0;
+ num = newconfidence.size();
+ for (size_t i = 1; i < num; i++) {
+ if (newconfidence[i] > highest) {
+ highest = newconfidence[i];
+ highestidx = i;
+ }
+ }
+ status = U_ZERO_ERROR;
+ ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest);
+ return matches[highestidx];
+}
+
+
+bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
+
+ int start = 0;
+ int end = 511; // All the tables have 512 entries
+ int mid = (start+end)/2;
+
+ while(start <= end) {
+ if(c == values[mid]) {
+ return true;
+ } else if (c > values[mid]) {
+ start = mid + 1;
+ } else {
+ end = mid - 1;
+ }
+
+ mid = (start + end) / 2;
+ }
+
+ return false;
+}
+
+
+} // namespace android