diff options
Diffstat (limited to 'media/libmedia/CharacterEncodingDetector.cpp')
-rw-r--r-- | media/libmedia/CharacterEncodingDetector.cpp | 473 |
1 files changed, 473 insertions, 0 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp new file mode 100644 index 0000000..41994dc --- /dev/null +++ b/media/libmedia/CharacterEncodingDetector.cpp @@ -0,0 +1,473 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#define LOG_NDEBUG 0 +#define LOG_TAG "CharacterEncodingDector" +#include <utils/Log.h> + +#include <CharacterEncodingDetector.h> +#include "CharacterEncodingDetectorTables.h" + +#include "utils/Vector.h" +#include "StringArray.h" + +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#include "unicode/ustring.h" + +namespace android { + +CharacterEncodingDetector::CharacterEncodingDetector() { + + UErrorCode status = U_ZERO_ERROR; + mUtf8Conv = ucnv_open("UTF-8", &status); + if (U_FAILURE(status)) { + ALOGE("could not create UConverter for UTF-8"); + mUtf8Conv = NULL; + } +} + +CharacterEncodingDetector::~CharacterEncodingDetector() { + ucnv_close(mUtf8Conv); +} + +void CharacterEncodingDetector::addTag(const char *name, const char *value) { + mNames.push_back(name); + mValues.push_back(value); +} + +size_t CharacterEncodingDetector::size() { + return mNames.size(); +} + +status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) { + if (index >= mNames.size()) { + return BAD_VALUE; + } + + *name = mNames.getEntry(index); + *value = mValues.getEntry(index); + return OK; +} + +static bool isPrintableAscii(const char *value, size_t len) { + for (size_t i = 0; i < len; i++) { + if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) { + return false; + } + } + return true; +} + +void CharacterEncodingDetector::detectAndConvert() { + + int size = mNames.size(); + ALOGV("%d tags before conversion", size); + for (int i = 0; i < size; i++) { + ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i)); + } + + if (size && mUtf8Conv) { + + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *ucm; + + // try combined detection of artist/album/title etc. + char buf[1024]; + buf[0] = 0; + int idx; + bool allprintable = true; + for (int i = 0; i < size; i++) { + const char *name = mNames.getEntry(i); + const char *value = mValues.getEntry(i); + if (!isPrintableAscii(value, strlen(value)) && ( + !strcmp(name, "artist") || + !strcmp(name, "albumartist") || + !strcmp(name, "composer") || + !strcmp(name, "genre") || + !strcmp(name, "album") || + !strcmp(name, "title"))) { + strlcat(buf, value, sizeof(buf)); + // separate tags by space so ICU's ngram detector can do its job + strlcat(buf, " ", sizeof(buf)); + allprintable = false; + } + } + + const char *combinedenc = "UTF-8"; + if (allprintable) { + // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so + // no need to even call it + ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf)); + } else { + ucsdet_setText(csd, buf, strlen(buf), &status); + int32_t matches; + const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); + bool goodmatch = true; + int highest = 0; + const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), + ucma, matches, &goodmatch, &highest); + + ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest); + if (!goodmatch && (highest < 15 || strlen(buf) < 20)) { + ALOGV("not a good match, trying with more data"); + // This string might be too short for ICU to do anything useful with. + // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because + // the ISO detector reports a confidence of 0, while the GB18030 detector reports + // a confidence of 10 with no invalid characters) + // Append artist, album and title if they were previously omitted because they + // were printable ascii. + bool added = false; + for (int i = 0; i < size; i++) { + const char *name = mNames.getEntry(i); + const char *value = mValues.getEntry(i); + if (isPrintableAscii(value, strlen(value)) && ( + !strcmp(name, "artist") || + !strcmp(name, "album") || + !strcmp(name, "title"))) { + strlcat(buf, value, sizeof(buf)); + strlcat(buf, " ", sizeof(buf)); + added = true; + } + } + if (added) { + ucsdet_setText(csd, buf, strlen(buf), &status); + ucma = ucsdet_detectAll(csd, &matches, &status); + bestCombinedMatch = getPreferred(buf, strlen(buf), + ucma, matches, &goodmatch, &highest); + if (!goodmatch && highest <= 15) { + ALOGV("still not a good match after adding printable tags"); + bestCombinedMatch = NULL; + } + } else { + ALOGV("no printable tags to add"); + } + } + + if (bestCombinedMatch != NULL) { + combinedenc = ucsdet_getName(bestCombinedMatch, &status); + } else { + combinedenc = "ISO-8859-1"; + } + } + + for (int i = 0; i < size; i++) { + const char *name = mNames.getEntry(i); + uint8_t* src = (uint8_t *)mValues.getEntry(i); + int len = strlen((char *)src); + uint8_t* dest = src; + + ALOGV("@@@ checking %s", name); + const char *s = mValues.getEntry(i); + int32_t inputLength = strlen(s); + const char *enc; + + if (!allprintable && (!strcmp(name, "artist") || + !strcmp(name, "albumartist") || + !strcmp(name, "composer") || + !strcmp(name, "genre") || + !strcmp(name, "album") || + !strcmp(name, "title"))) { + // use encoding determined from the combination of artist/album/title etc. + enc = combinedenc; + } else { + if (isPrintableAscii(s, inputLength)) { + enc = "UTF-8"; + ALOGV("@@@@ %s is ascii", mNames.getEntry(i)); + } else { + ucsdet_setText(csd, s, inputLength, &status); + ucm = ucsdet_detect(csd, &status); + if (!ucm) { + mValues.setEntry(i, "???"); + continue; + } + enc = ucsdet_getName(ucm, &status); + ALOGV("@@@@ recognized charset: %s for %s confidence %d", + enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); + } + } + + if (strcmp(enc,"UTF-8") != 0) { + // only convert if the source encoding isn't already UTF-8 + ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i)); + status = U_ZERO_ERROR; + UConverter *conv = ucnv_open(enc, &status); + if (U_FAILURE(status)) { + ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1", + enc, status); + status = U_ZERO_ERROR; + conv = ucnv_open("ISO-8859-1", &status); + if (U_FAILURE(status)) { + ALOGW("could not create UConverter for ISO-8859-1 either"); + continue; + } + } + + // convert from native encoding to UTF-8 + const char* source = mValues.getEntry(i); + int targetLength = len * 3 + 1; + char* buffer = new char[targetLength]; + // don't normally check for NULL, but in this case targetLength may be large + if (!buffer) + break; + char* target = buffer; + + ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength, + &source, source + strlen(source), + NULL, NULL, NULL, NULL, TRUE, TRUE, &status); + + if (U_FAILURE(status)) { + ALOGE("ucnv_convertEx failed: %d", status); + mValues.setEntry(i, "???"); + } else { + // zero terminate + *target = 0; + // strip trailing spaces + while (--target > buffer && *target == ' ') { + *target = 0; + } + // skip leading spaces + char *start = buffer; + while (*start == ' ') { + start++; + } + mValues.setEntry(i, start); + } + + delete[] buffer; + + ucnv_close(conv); + } + } + + for (int i = size - 1; i >= 0; --i) { + if (strlen(mValues.getEntry(i)) == 0) { + ALOGV("erasing %s because entry is empty", mNames.getEntry(i)); + mNames.erase(i); + mValues.erase(i); + } + } + + ucsdet_close(csd); + } +} + +/* + * When ICU detects multiple encoding matches, apply additional heuristics to determine + * which one is the best match, since ICU can't always be trusted to make the right choice. + * + * What this method does is: + * - decode the input using each of the matches found + * - recalculate the starting confidence level for multibyte encodings using a different + * algorithm and larger frequent character lists than ICU + * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) + * - pick the highest match + * - signal to the caller whether this match is considered good: confidence > 15, and confidence + * delta with the next runner up > 15 + */ +const UCharsetMatch *CharacterEncodingDetector::getPreferred( + const char *input, size_t len, + const UCharsetMatch** ucma, size_t nummatches, + bool *goodmatch, int *highestmatch) { + + *goodmatch = false; + Vector<const UCharsetMatch*> matches; + UErrorCode status = U_ZERO_ERROR; + + ALOGV("%zu matches", nummatches); + for (size_t i = 0; i < nummatches; i++) { + const char *encname = ucsdet_getName(ucma[i], &status); + int confidence = ucsdet_getConfidence(ucma[i], &status); + ALOGV("%zu: %s %d", i, encname, confidence); + matches.push_back(ucma[i]); + } + + size_t num = matches.size(); + if (num == 0) { + return NULL; + } + if (num == 1) { + int confidence = ucsdet_getConfidence(matches[0], &status); + if (confidence > 15) { + *goodmatch = true; + } + return matches[0]; + } + + ALOGV("considering %zu matches", num); + + // keep track of how many "special" characters result when converting the input using each + // encoding + Vector<int> newconfidence; + for (size_t i = 0; i < num; i++) { + const uint16_t *freqdata = NULL; + float freqcoverage = 0; + status = U_ZERO_ERROR; + const char *encname = ucsdet_getName(matches[i], &status); + int confidence = ucsdet_getConfidence(matches[i], &status); + if (!strcmp("GB18030", encname)) { + freqdata = frequent_zhCN; + freqcoverage = frequent_zhCN_coverage; + } else if (!strcmp("Big5", encname)) { + freqdata = frequent_zhTW; + freqcoverage = frequent_zhTW_coverage; + } else if (!strcmp("EUC-KR", encname)) { + freqdata = frequent_ko; + freqcoverage = frequent_ko_coverage; + } else if (!strcmp("EUC-JP", encname)) { + freqdata = frequent_ja; + freqcoverage = frequent_ja_coverage; + } else if (!strcmp("Shift_JIS", encname)) { + freqdata = frequent_ja; + freqcoverage = frequent_ja_coverage; + } + + ALOGV("%zu: %s %d", i, encname, confidence); + status = U_ZERO_ERROR; + UConverter *conv = ucnv_open(encname, &status); + int demerit = 0; + if (U_FAILURE(status)) { + ALOGV("failed to open %s: %d", encname, status); + confidence = 0; + demerit += 1000; + } + const char *source = input; + const char *sourceLimit = input + len; + status = U_ZERO_ERROR; + int frequentchars = 0; + int totalchars = 0; + while (true) { + // demerit the current encoding for each "special" character found after conversion. + // The amount of demerit is somewhat arbitrarily chosen. + int inchar; + if (source != sourceLimit) { + inchar = (source[0] << 8) + source[1]; + } + UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status); + if (!U_SUCCESS(status)) { + break; + } + if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) { + ALOGV("control character %x", c); + demerit += 100; + } else if ((c == 0xa0) // no-break space + || (c >= 0xa2 && c <= 0xbe) // symbols, superscripts + || (c == 0xd7) || (c == 0xf7) // multiplication and division signs + || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts + ALOGV("unlikely character %x", c); + demerit += 10; + } else if (c >= 0xe000 && c <= 0xf8ff) { + ALOGV("private use character %x", c); + demerit += 30; + } else if (c >= 0x2190 && c <= 0x2bff) { + // this range comprises various symbol ranges that are unlikely to appear in + // music file metadata. + ALOGV("symbol %x", c); + demerit += 10; + } else if (c == 0xfffd) { + ALOGV("replacement character"); + demerit += 50; + } else if (c >= 0xfff0 && c <= 0xfffc) { + ALOGV("unicode special %x", c); + demerit += 50; + } else if (freqdata != NULL) { + totalchars++; + if (isFrequent(freqdata, c)) { + frequentchars++; + } + } + } + if (freqdata != NULL && totalchars != 0) { + int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage; + ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence, + totalchars, frequentchars); + if (myconfidence > 100) myconfidence = 100; + if (myconfidence < 0) myconfidence = 0; + confidence = myconfidence; + } + ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit); + newconfidence.push_back(confidence - demerit); + ucnv_close(conv); + if (i == 0 && (confidence - demerit) == 100) { + // no need to check any further, we'll end up using this match anyway + break; + } + } + + // find match with highest confidence after adjusting for unlikely characters + int highest = newconfidence[0]; + size_t highestidx = 0; + int runnerup = -10000; + int runnerupidx = -10000; + num = newconfidence.size(); + for (size_t i = 1; i < num; i++) { + if (newconfidence[i] > highest) { + runnerup = highest; + runnerupidx = highestidx; + highest = newconfidence[i]; + highestidx = i; + } else if (newconfidence[i] > runnerup){ + runnerup = newconfidence[i]; + runnerupidx = i; + } + } + status = U_ZERO_ERROR; + ALOGV("selecting: '%s' w/ %d confidence", + ucsdet_getName(matches[highestidx], &status), highest); + if (runnerupidx < 0) { + ALOGV("no runner up"); + if (highest > 15) { + *goodmatch = true; + } + } else { + ALOGV("runner up: '%s' w/ %d confidence", + ucsdet_getName(matches[runnerupidx], &status), runnerup); + if (runnerup < 0) { + runnerup = 0; + } + if ((highest - runnerup) > 15) { + *goodmatch = true; + } + } + *highestmatch = highest; + return matches[highestidx]; +} + + +bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) { + + int start = 0; + int end = 511; // All the tables have 512 entries + int mid = (start+end)/2; + + while(start <= end) { + if(c == values[mid]) { + return true; + } else if (c > values[mid]) { + start = mid + 1; + } else { + end = mid - 1; + } + + mid = (start + end) / 2; + } + + return false; +} + + +} // namespace android |