From 544ad2be674423238c47650d2c8588ba7dfc9ed2 Mon Sep 17 00:00:00 2001 From: Marco Nelissen Date: Wed, 13 Nov 2013 14:18:21 -0800 Subject: Better character set encoding detection Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't. To better detect the real encoding we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0 --- media/libmedia/CharacterEncodingDetector.cpp | 364 +++++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 media/libmedia/CharacterEncodingDetector.cpp (limited to 'media/libmedia/CharacterEncodingDetector.cpp') diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp new file mode 100644 index 0000000..eb091ac --- /dev/null +++ b/media/libmedia/CharacterEncodingDetector.cpp @@ -0,0 +1,364 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#define LOG_NDEBUG 0 +#define LOG_TAG "CharacterEncodingDector" +#include + +#include "CharacterEncodingDetector.h" +#include "CharacterEncodingDetectorTables.h" + +#include "utils/Vector.h" +#include "StringArray.h" + +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#include "unicode/ustring.h" + +namespace android { + +CharacterEncodingDetector::CharacterEncodingDetector() { + + UErrorCode status = U_ZERO_ERROR; + mUtf8Conv = ucnv_open("UTF-8", &status); + if (U_FAILURE(status)) { + ALOGE("could not create UConverter for UTF-8"); + mUtf8Conv = NULL; + } +} + +CharacterEncodingDetector::~CharacterEncodingDetector() { + ucnv_close(mUtf8Conv); +} + +void CharacterEncodingDetector::addTag(const char *name, const char *value) { + mNames.push_back(name); + mValues.push_back(value); +} + +size_t CharacterEncodingDetector::size() { + return mNames.size(); +} + +status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) { + if (index >= mNames.size()) { + return BAD_VALUE; + } + + *name = mNames.getEntry(index); + *value = mValues.getEntry(index); + return OK; +} + +static bool isPrintableAscii(const char *value, size_t len) { + for (size_t i = 0; i < len; i++) { + if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) { + return false; + } + } + return true; +} + +void CharacterEncodingDetector::detectAndConvert() { + + int size = mNames.size(); + ALOGV("%d tags before conversion", size); + for (int i = 0; i < size; i++) { + ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i)); + } + + if (size && mUtf8Conv) { + + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *ucm; + + // try combined detection of artist/album/title etc. + char buf[1024]; + buf[0] = 0; + int idx; + for (int i = 0; i < size; i++) { + const char *name = mNames.getEntry(i); + const char *value = mValues.getEntry(i); + if (!isPrintableAscii(value, strlen(value)) && ( + !strcmp(name, "artist") || + !strcmp(name, "albumartist") || + !strcmp(name, "composer") || + !strcmp(name, "genre") || + !strcmp(name, "album") || + !strcmp(name, "title"))) { + strlcat(buf, value, sizeof(buf)); + // separate tags by space so ICU's ngram detector can do its job + strlcat(buf, " ", sizeof(buf)); + } + } + ucsdet_setText(csd, buf, strlen(buf), &status); + + int32_t matches; + const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); + const char *combinedenc = "???"; + + const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches); + + if (bestCombinedMatch != NULL) { + combinedenc = ucsdet_getName(bestCombinedMatch, &status); + } + + for (int i = 0; i < size; i++) { + const char *name = mNames.getEntry(i); + uint8_t* src = (uint8_t *)mValues.getEntry(i); + int len = strlen((char *)src); + uint8_t* dest = src; + + ALOGV("@@@ checking %s", name); + const char *s = mValues.getEntry(i); + int32_t inputLength = strlen(s); + const char *enc; + + if (!strcmp(name, "artist") || + !strcmp(name, "albumartist") || + !strcmp(name, "composer") || + !strcmp(name, "genre") || + !strcmp(name, "album") || + !strcmp(name, "title")) { + // use encoding determined from the combination of artist/album/title etc. + enc = combinedenc; + } else { + ucsdet_setText(csd, s, inputLength, &status); + ucm = ucsdet_detect(csd, &status); + if (!ucm) { + mValues.setEntry(i, "???"); + continue; + } + enc = ucsdet_getName(ucm, &status); + ALOGV("@@@@ recognized charset: %s for %s confidence %d", + enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); + } + + if (strcmp(enc,"UTF-8") != 0) { + // only convert if the source encoding isn't already UTF-8 + ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i)); + UConverter *conv = ucnv_open(enc, &status); + if (U_FAILURE(status)) { + ALOGE("could not create UConverter for %s", enc); + continue; + } + + // convert from native encoding to UTF-8 + const char* source = mValues.getEntry(i); + int targetLength = len * 3 + 1; + char* buffer = new char[targetLength]; + // don't normally check for NULL, but in this case targetLength may be large + if (!buffer) + break; + char* target = buffer; + + ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength, + &source, source + strlen(source), + NULL, NULL, NULL, NULL, TRUE, TRUE, &status); + + if (U_FAILURE(status)) { + ALOGE("ucnv_convertEx failed: %d", status); + mValues.setEntry(i, "???"); + } else { + // zero terminate + *target = 0; + mValues.setEntry(i, buffer); + } + + delete[] buffer; + + ucnv_close(conv); + } + } + + for (int i = size - 1; i >= 0; --i) { + if (strlen(mValues.getEntry(i)) == 0) { + ALOGV("erasing %s because entry is empty", mNames.getEntry(i)); + mNames.erase(i); + mValues.erase(i); + } + } + + ucsdet_close(csd); + } +} + +/* + * When ICU detects multiple encoding matches, apply additional heuristics to determine + * which one is the best match, since ICU can't always be trusted to make the right choice. + * + * What this method does is: + * - decode the input using each of the matches found + * - recalculate the starting confidence level for multibyte encodings using a different + * algorithm and larger frequent character lists than ICU + * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) + * - pick the highest match + */ +const UCharsetMatch *CharacterEncodingDetector::getPreferred( + const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) { + + Vector matches; + UErrorCode status = U_ZERO_ERROR; + + ALOGV("%d matches", nummatches); + for (size_t i = 0; i < nummatches; i++) { + const char *encname = ucsdet_getName(ucma[i], &status); + int confidence = ucsdet_getConfidence(ucma[i], &status); + ALOGV("%d: %s %d", i, encname, confidence); + matches.push_back(ucma[i]); + } + + size_t num = matches.size(); + if (num == 0) { + return NULL; + } + if (num == 1) { + return matches[0]; + } + + ALOGV("considering %d matches", num); + + // keep track of how many "special" characters result when converting the input using each + // encoding + Vector newconfidence; + for (size_t i = 0; i < num; i++) { + const uint16_t *freqdata = NULL; + float freqcoverage = 0; + status = U_ZERO_ERROR; + const char *encname = ucsdet_getName(matches[i], &status); + int confidence = ucsdet_getConfidence(matches[i], &status); + if (!strcmp("GB18030", encname)) { + freqdata = frequent_zhCN; + freqcoverage = frequent_zhCN_coverage; + } else if (!strcmp("Big5", encname)) { + freqdata = frequent_zhTW; + freqcoverage = frequent_zhTW_coverage; + } else if (!strcmp("EUC-KR", encname)) { + freqdata = frequent_ko; + freqcoverage = frequent_ko_coverage; + } else if (!strcmp("EUC-JP", encname)) { + freqdata = frequent_ja; + freqcoverage = frequent_ja_coverage; + } else if (!strcmp("Shift_JIS", encname)) { + freqdata = frequent_ja; + freqcoverage = frequent_ja_coverage; + } + + ALOGV("%d: %s %d", i, encname, confidence); + UConverter *conv = ucnv_open(encname, &status); + const char *source = input; + const char *sourceLimit = input + len; + status = U_ZERO_ERROR; + int demerit = 0; + int frequentchars = 0; + int totalchars = 0; + while (true) { + // demerit the current encoding for each "special" character found after conversion. + // The amount of demerit is somewhat arbitrarily chosen. + int inchar; + if (source != sourceLimit) { + inchar = (source[0] << 8) + source[1]; + } + UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status); + if (!U_SUCCESS(status)) { + break; + } + if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) { + ALOGV("control character %x", c); + demerit += 100; + } else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts + || (c == 0xd7) || (c == 0xf7) // multiplication and division signs + || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts + ALOGV("unlikely character %x", c); + demerit += 10; + } else if (c >= 0xe000 && c <= 0xf8ff) { + ALOGV("private use character %x", c); + demerit += 30; + } else if (c >= 0x2190 && c <= 0x2bff) { + // this range comprises various symbol ranges that are unlikely to appear in + // music file metadata. + ALOGV("symbol %x", c); + demerit += 10; + } else if (c == 0xfffd) { + ALOGV("replacement character"); + demerit += 50; + } else if (c >= 0xfff0 && c <= 0xfffc) { + ALOGV("unicode special %x", c); + demerit += 50; + } else if (freqdata != NULL) { + totalchars++; + if (isFrequent(freqdata, c)) { + frequentchars++; + } + } + } + if (freqdata != NULL && totalchars != 0) { + int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage; + ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence, + totalchars, frequentchars); + if (myconfidence > 100) myconfidence = 100; + if (myconfidence < 0) myconfidence = 0; + confidence = myconfidence; + } + ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit); + newconfidence.push_back(confidence - demerit); + ucnv_close(conv); + if (i == 0 && (confidence - demerit) == 100) { + // no need to check any further, we'll end up using this match anyway + break; + } + } + + // find match with highest confidence after adjusting for unlikely characters + int highest = newconfidence[0]; + size_t highestidx = 0; + num = newconfidence.size(); + for (size_t i = 1; i < num; i++) { + if (newconfidence[i] > highest) { + highest = newconfidence[i]; + highestidx = i; + } + } + status = U_ZERO_ERROR; + ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest); + return matches[highestidx]; +} + + +bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) { + + int start = 0; + int end = 511; // All the tables have 512 entries + int mid = (start+end)/2; + + while(start <= end) { + if(c == values[mid]) { + return true; + } else if (c > values[mid]) { + start = mid + 1; + } else { + end = mid - 1; + } + + mid = (start + end) / 2; + } + + return false; +} + + +} // namespace android -- cgit v1.1