1 files changed, 473 insertions, 0 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp
new file mode 100644
index 0000000..41994dc
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.cpp
@@ -0,0 +1,473 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//#define LOG_NDEBUG 0
+#define LOG_TAG "CharacterEncodingDector"
+#include <utils/Log.h>
+
+#include <CharacterEncodingDetector.h>
+#include "CharacterEncodingDetectorTables.h"
+
+#include "utils/Vector.h"
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+CharacterEncodingDetector::CharacterEncodingDetector() {
+
+    UErrorCode status = U_ZERO_ERROR;
+    mUtf8Conv = ucnv_open("UTF-8", &status);
+    if (U_FAILURE(status)) {
+        ALOGE("could not create UConverter for UTF-8");
+        mUtf8Conv = NULL;
+    }
+}
+
+CharacterEncodingDetector::~CharacterEncodingDetector() {
+    ucnv_close(mUtf8Conv);
+}
+
+void CharacterEncodingDetector::addTag(const char *name, const char *value) {
+    mNames.push_back(name);
+    mValues.push_back(value);
+}
+
+size_t CharacterEncodingDetector::size() {
+    return mNames.size();
+}
+
+status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
+    if (index >= mNames.size()) {
+        return BAD_VALUE;
+    }
+
+    *name = mNames.getEntry(index);
+    *value = mValues.getEntry(index);
+    return OK;
+}
+
+static bool isPrintableAscii(const char *value, size_t len) {
+    for (size_t i = 0; i < len; i++) {
+        if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void CharacterEncodingDetector::detectAndConvert() {
+
+    int size = mNames.size();
+    ALOGV("%d tags before conversion", size);
+    for (int i = 0; i < size; i++) {
+        ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
+    }
+
+    if (size && mUtf8Conv) {
+
+        UErrorCode status = U_ZERO_ERROR;
+        UCharsetDetector *csd = ucsdet_open(&status);
+        const UCharsetMatch *ucm;
+
+        // try combined detection of artist/album/title etc.
+        char buf[1024];
+        buf[0] = 0;
+        int idx;
+        bool allprintable = true;
+        for (int i = 0; i < size; i++) {
+            const char *name = mNames.getEntry(i);
+            const char *value = mValues.getEntry(i);
+            if (!isPrintableAscii(value, strlen(value)) && (
+                        !strcmp(name, "artist") ||
+                        !strcmp(name, "albumartist") ||
+                        !strcmp(name, "composer") ||
+                        !strcmp(name, "genre") ||
+                        !strcmp(name, "album") ||
+                        !strcmp(name, "title"))) {
+                strlcat(buf, value, sizeof(buf));
+                // separate tags by space so ICU's ngram detector can do its job
+                strlcat(buf, " ", sizeof(buf));
+                allprintable = false;
+            }
+        }
+
+        const char *combinedenc = "UTF-8";
+        if (allprintable) {
+            // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
+            // no need to even call it
+            ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
+        } else {
+            ucsdet_setText(csd, buf, strlen(buf), &status);
+            int32_t matches;
+            const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
+            bool goodmatch = true;
+            int highest = 0;
+            const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
+                    ucma, matches, &goodmatch, &highest);
+
+            ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
+            if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
+                ALOGV("not a good match, trying with more data");
+                // This string might be too short for ICU to do anything useful with.
+                // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
+                //  the ISO detector reports a confidence of 0, while the GB18030 detector reports
+                //  a confidence of 10 with no invalid characters)
+                // Append artist, album and title if they were previously omitted because they
+                // were printable ascii.
+                bool added = false;
+                for (int i = 0; i < size; i++) {
+                    const char *name = mNames.getEntry(i);
+                    const char *value = mValues.getEntry(i);
+                    if (isPrintableAscii(value, strlen(value)) && (
+                                !strcmp(name, "artist") ||
+                                !strcmp(name, "album") ||
+                                !strcmp(name, "title"))) {
+                        strlcat(buf, value, sizeof(buf));
+                        strlcat(buf, " ", sizeof(buf));
+                        added = true;
+                    }
+                }
+                if (added) {
+                    ucsdet_setText(csd, buf, strlen(buf), &status);
+                    ucma = ucsdet_detectAll(csd, &matches, &status);
+                    bestCombinedMatch = getPreferred(buf, strlen(buf),
+                            ucma, matches, &goodmatch, &highest);
+                    if (!goodmatch && highest <= 15) {
+                        ALOGV("still not a good match after adding printable tags");
+                        bestCombinedMatch = NULL;
+                    }
+                } else {
+                    ALOGV("no printable tags to add");
+                }
+            }
+
+            if (bestCombinedMatch != NULL) {
+                combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+            } else {
+                combinedenc = "ISO-8859-1";
+            }
+        }
+
+        for (int i = 0; i < size; i++) {
+            const char *name = mNames.getEntry(i);
+            uint8_t* src = (uint8_t *)mValues.getEntry(i);
+            int len = strlen((char *)src);
+            uint8_t* dest = src;
+
+            ALOGV("@@@ checking %s", name);
+            const char *s = mValues.getEntry(i);
+            int32_t inputLength = strlen(s);
+            const char *enc;
+
+            if (!allprintable && (!strcmp(name, "artist") ||
+                    !strcmp(name, "albumartist") ||
+                    !strcmp(name, "composer") ||
+                    !strcmp(name, "genre") ||
+                    !strcmp(name, "album") ||
+                    !strcmp(name, "title"))) {
+                // use encoding determined from the combination of artist/album/title etc.
+                enc = combinedenc;
+            } else {
+                if (isPrintableAscii(s, inputLength)) {
+                    enc = "UTF-8";
+                    ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
+                } else {
+                    ucsdet_setText(csd, s, inputLength, &status);
+                    ucm = ucsdet_detect(csd, &status);
+                    if (!ucm) {
+                        mValues.setEntry(i, "???");
+                        continue;
+                    }
+                    enc = ucsdet_getName(ucm, &status);
+                    ALOGV("@@@@ recognized charset: %s for %s confidence %d",
+                            enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
+                }
+            }
+
+            if (strcmp(enc,"UTF-8") != 0) {
+                // only convert if the source encoding isn't already UTF-8
+                ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
+                status = U_ZERO_ERROR;
+                UConverter *conv = ucnv_open(enc, &status);
+                if (U_FAILURE(status)) {
+                    ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
+                            enc, status);
+                    status = U_ZERO_ERROR;
+                    conv = ucnv_open("ISO-8859-1", &status);
+                    if (U_FAILURE(status)) {
+                        ALOGW("could not create UConverter for ISO-8859-1 either");
+                        continue;
+                    }
+                }
+
+                // convert from native encoding to UTF-8
+                const char* source = mValues.getEntry(i);
+                int targetLength = len * 3 + 1;
+                char* buffer = new char[targetLength];
+                // don't normally check for NULL, but in this case targetLength may be large
+                if (!buffer)
+                    break;
+                char* target = buffer;
+
+                ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
+                        &source, source + strlen(source),
+                        NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
+
+                if (U_FAILURE(status)) {
+                    ALOGE("ucnv_convertEx failed: %d", status);
+                    mValues.setEntry(i, "???");
+                } else {
+                    // zero terminate
+                    *target = 0;
+                    // strip trailing spaces
+                    while (--target > buffer && *target == ' ') {
+                        *target = 0;
+                    }
+                    // skip leading spaces
+                    char *start = buffer;
+                    while (*start == ' ') {
+                        start++;
+                    }
+                    mValues.setEntry(i, start);
+                }
+
+                delete[] buffer;
+
+                ucnv_close(conv);
+            }
+        }
+
+        for (int i = size - 1; i >= 0; --i) {
+            if (strlen(mValues.getEntry(i)) == 0) {
+                ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
+                mNames.erase(i);
+                mValues.erase(i);
+            }
+        }
+
+        ucsdet_close(csd);
+    }
+}
+
+/*
+ * When ICU detects multiple encoding matches, apply additional heuristics to determine
+ * which one is the best match, since ICU can't always be trusted to make the right choice.
+ *
+ * What this method does is:
+ * - decode the input using each of the matches found
+ * - recalculate the starting confidence level for multibyte encodings using a different
+ *   algorithm and larger frequent character lists than ICU
+ * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
+ * - pick the highest match
+ * - signal to the caller whether this match is considered good: confidence > 15, and confidence
+ *   delta with the next runner up > 15
+ */
+const UCharsetMatch *CharacterEncodingDetector::getPreferred(
+        const char *input, size_t len,
+        const UCharsetMatch** ucma, size_t nummatches,
+        bool *goodmatch, int *highestmatch) {
+
+    *goodmatch = false;
+    Vector<const UCharsetMatch*> matches;
+    UErrorCode status = U_ZERO_ERROR;
+
+    ALOGV("%zu matches", nummatches);
+    for (size_t i = 0; i < nummatches; i++) {
+        const char *encname = ucsdet_getName(ucma[i], &status);
+        int confidence = ucsdet_getConfidence(ucma[i], &status);
+        ALOGV("%zu: %s %d", i, encname, confidence);
+        matches.push_back(ucma[i]);
+    }
+
+    size_t num = matches.size();
+    if (num == 0) {
+        return NULL;
+    }
+    if (num == 1) {
+        int confidence = ucsdet_getConfidence(matches[0], &status);
+        if (confidence > 15) {
+            *goodmatch = true;
+        }
+        return matches[0];
+    }
+
+    ALOGV("considering %zu matches", num);
+
+    // keep track of how many "special" characters result when converting the input using each
+    // encoding
+    Vector<int> newconfidence;
+    for (size_t i = 0; i < num; i++) {
+        const uint16_t *freqdata = NULL;
+        float freqcoverage = 0;
+        status = U_ZERO_ERROR;
+        const char *encname = ucsdet_getName(matches[i], &status);
+        int confidence = ucsdet_getConfidence(matches[i], &status);
+        if (!strcmp("GB18030", encname)) {
+            freqdata = frequent_zhCN;
+            freqcoverage = frequent_zhCN_coverage;
+        } else if (!strcmp("Big5", encname)) {
+            freqdata = frequent_zhTW;
+            freqcoverage = frequent_zhTW_coverage;
+        } else if (!strcmp("EUC-KR", encname)) {
+            freqdata = frequent_ko;
+            freqcoverage = frequent_ko_coverage;
+        } else if (!strcmp("EUC-JP", encname)) {
+            freqdata = frequent_ja;
+            freqcoverage = frequent_ja_coverage;
+        } else if (!strcmp("Shift_JIS", encname)) {
+            freqdata = frequent_ja;
+            freqcoverage = frequent_ja_coverage;
+        }
+
+        ALOGV("%zu: %s %d", i, encname, confidence);
+        status = U_ZERO_ERROR;
+        UConverter *conv = ucnv_open(encname, &status);
+        int demerit = 0;
+        if (U_FAILURE(status)) {
+            ALOGV("failed to open %s: %d", encname, status);
+            confidence = 0;
+            demerit += 1000;
+        }
+        const char *source = input;
+        const char *sourceLimit = input + len;
+        status = U_ZERO_ERROR;
+        int frequentchars = 0;
+        int totalchars = 0;
+        while (true) {
+            // demerit the current encoding for each "special" character found after conversion.
+            // The amount of demerit is somewhat arbitrarily chosen.
+            int inchar;
+            if (source != sourceLimit) {
+                inchar = (source[0] << 8) + source[1];
+            }
+            UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
+            if (!U_SUCCESS(status)) {
+                break;
+            }
+            if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
+                ALOGV("control character %x", c);
+                demerit += 100;
+            } else if ((c == 0xa0)                      // no-break space
+                    || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
+                    || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
+                    || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
+                ALOGV("unlikely character %x", c);
+                demerit += 10;
+            } else if (c >= 0xe000 && c <= 0xf8ff) {
+                ALOGV("private use character %x", c);
+                demerit += 30;
+            } else if (c >= 0x2190 && c <= 0x2bff) {
+                // this range comprises various symbol ranges that are unlikely to appear in
+                // music file metadata.
+                ALOGV("symbol %x", c);
+                demerit += 10;
+            } else if (c == 0xfffd) {
+                ALOGV("replacement character");
+                demerit += 50;
+            } else if (c >= 0xfff0 && c <= 0xfffc) {
+                ALOGV("unicode special %x", c);
+                demerit += 50;
+            } else if (freqdata != NULL) {
+                totalchars++;
+                if (isFrequent(freqdata, c)) {
+                    frequentchars++;
+                }
+            }
+        }
+        if (freqdata != NULL && totalchars != 0) {
+            int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
+            ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
+                    totalchars, frequentchars);
+            if (myconfidence > 100) myconfidence = 100;
+            if (myconfidence < 0) myconfidence = 0;
+            confidence = myconfidence;
+        }
+        ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
+        newconfidence.push_back(confidence - demerit);
+        ucnv_close(conv);
+        if (i == 0 && (confidence - demerit) == 100) {
+            // no need to check any further, we'll end up using this match anyway
+            break;
+        }
+    }
+
+    // find match with highest confidence after adjusting for unlikely characters
+    int highest = newconfidence[0];
+    size_t highestidx = 0;
+    int runnerup = -10000;
+    int runnerupidx = -10000;
+    num = newconfidence.size();
+    for (size_t i = 1; i < num; i++) {
+        if (newconfidence[i] > highest) {
+            runnerup = highest;
+            runnerupidx = highestidx;
+            highest = newconfidence[i];
+            highestidx = i;
+        } else if (newconfidence[i] > runnerup){
+            runnerup = newconfidence[i];
+            runnerupidx = i;
+        }
+    }
+    status = U_ZERO_ERROR;
+    ALOGV("selecting: '%s' w/ %d confidence",
+            ucsdet_getName(matches[highestidx], &status), highest);
+    if (runnerupidx < 0) {
+        ALOGV("no runner up");
+        if (highest > 15) {
+            *goodmatch = true;
+        }
+    } else {
+        ALOGV("runner up: '%s' w/ %d confidence",
+                ucsdet_getName(matches[runnerupidx], &status), runnerup);
+        if (runnerup < 0) {
+            runnerup = 0;
+        }
+        if ((highest - runnerup) > 15) {
+            *goodmatch = true;
+        }
+    }
+    *highestmatch = highest;
+    return matches[highestidx];
+}
+
+
+bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
+
+    int start = 0;
+    int end = 511; // All the tables have 512 entries
+    int mid = (start+end)/2;
+
+    while(start <= end) {
+        if(c == values[mid]) {
+            return true;
+        } else if (c > values[mid]) {
+            start = mid + 1;
+        } else {
+            end = mid - 1;
+        }
+
+        mid = (start + end) / 2;
+    }
+
+    return false;
+}
+
+
+}  // namespace android