Use more tags to help the ICU detector.

The detector only gave non-ascii data to ICU. In some cases that could result in very short data, for which ICU would issue a low confidence level for the actual encoding. By padding the data with additional (ascii) tags, we improve accuracy for such files. Becauses this can reduce accuracy in other cases, only do this when the initial confidence is low. b/13473604 Change-Id: I63d932043155c310b0e358cdf2d37787961e94b7
author: Marco Nelissen <marcone@google.com> 2014-03-18 14:00:39 -0700
committer: Marco Nelissen <marcone@google.com> 2014-03-19 09:48:18 -0700
commit: bfd55f243feb3f04e26ad07aae035475768ada8a (patch)
tree: ace7974e03aa66e5858c58e3205171d786e0882e /media/libmedia/CharacterEncodingDetector.cpp
parent: e848bd9abb3bbbd4c66f9fb9d1442f7663d7ba40 (diff)
download: frameworks_av-bfd55f243feb3f04e26ad07aae035475768ada8a.zip
frameworks_av-bfd55f243feb3f04e26ad07aae035475768ada8a.tar.gz
frameworks_av-bfd55f243feb3f04e26ad07aae035475768ada8a.tar.bz2
1 files changed, 96 insertions, 19 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp
index eb091ac..5a3bf9d 100644
--- a/media/libmedia/CharacterEncodingDetector.cpp
+++ b/media/libmedia/CharacterEncodingDetector.cpp
@@ -90,6 +90,7 @@ void CharacterEncodingDetector::detectAndConvert() {
         char buf[1024];
         buf[0] = 0;
         int idx;
+        bool allprintable = true;
         for (int i = 0; i < size; i++) {
             const char *name = mNames.getEntry(i);
             const char *value = mValues.getEntry(i);
@@ -103,18 +104,60 @@ void CharacterEncodingDetector::detectAndConvert() {
                 strlcat(buf, value, sizeof(buf));
                 // separate tags by space so ICU's ngram detector can do its job
                 strlcat(buf, " ", sizeof(buf));
+                allprintable = false;
             }
         }
-        ucsdet_setText(csd, buf, strlen(buf), &status);
 
-        int32_t matches;
-        const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
-        const char *combinedenc = "???";
-
-        const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches);
+        const char *combinedenc = "UTF-8";
+        if (allprintable) {
+            // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
+            // no need to even call it
+            ALOGV("all tags are printable, assuming ascii (%d)", strlen(buf));
+        } else {
+            ucsdet_setText(csd, buf, strlen(buf), &status);
+            int32_t matches;
+            const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
+            bool goodmatch = true;
+            const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
+                    ucma, matches, &goodmatch);
+
+            if (!goodmatch && strlen(buf) < 20) {
+                ALOGV("not a good match, trying with more data");
+                // This string might be too short for ICU to do anything useful with.
+                // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
+                //  the ISO detector reports a confidence of 0, while the GB18030 detector reports
+                //  a confidence of 10 with no invalid characters)
+                // Append artist, album and title if they were previously omitted because they
+                // were printable ascii.
+                bool added = false;
+                for (int i = 0; i < size; i++) {
+                    const char *name = mNames.getEntry(i);
+                    const char *value = mValues.getEntry(i);
+                    if (isPrintableAscii(value, strlen(value)) && (
+                                !strcmp(name, "artist") ||
+                                !strcmp(name, "album") ||
+                                !strcmp(name, "title"))) {
+                        strlcat(buf, value, sizeof(buf));
+                        strlcat(buf, " ", sizeof(buf));
+                        added = true;
+                    }
+                }
+                if (added) {
+                    ucsdet_setText(csd, buf, strlen(buf), &status);
+                    ucma = ucsdet_detectAll(csd, &matches, &status);
+                    bestCombinedMatch = getPreferred(buf, strlen(buf),
+                            ucma, matches, &goodmatch);
+                    if (!goodmatch) {
+                        ALOGV("still not a good match after adding printable tags");
+                    }
+                } else {
+                    ALOGV("no printable tags to add");
+                }
+            }
 
-        if (bestCombinedMatch != NULL) {
-            combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+            if (bestCombinedMatch != NULL) {
+                combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+            }
         }
 
         for (int i = 0; i < size; i++) {
@@ -128,7 +171,7 @@ void CharacterEncodingDetector::detectAndConvert() {
             int32_t inputLength = strlen(s);
             const char *enc;
 
-            if (!strcmp(name, "artist") ||
+            if (!allprintable && !strcmp(name, "artist") ||
                     !strcmp(name, "albumartist") ||
                     !strcmp(name, "composer") ||
                     !strcmp(name, "genre") ||
@@ -137,15 +180,20 @@ void CharacterEncodingDetector::detectAndConvert() {
                 // use encoding determined from the combination of artist/album/title etc.
                 enc = combinedenc;
             } else {
-                ucsdet_setText(csd, s, inputLength, &status);
-                ucm = ucsdet_detect(csd, &status);
-                if (!ucm) {
-                    mValues.setEntry(i, "???");
-                    continue;
+                if (isPrintableAscii(s, inputLength)) {
+                    enc = "UTF-8";
+                    ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
+                } else {
+                    ucsdet_setText(csd, s, inputLength, &status);
+                    ucm = ucsdet_detect(csd, &status);
+                    if (!ucm) {
+                        mValues.setEntry(i, "???");
+                        continue;
+                    }
+                    enc = ucsdet_getName(ucm, &status);
+                    ALOGV("@@@@ recognized charset: %s for %s confidence %d",
+                            enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
                 }
-                enc = ucsdet_getName(ucm, &status);
-                ALOGV("@@@@ recognized charset: %s for %s confidence %d",
-                        enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
             }
 
             if (strcmp(enc,"UTF-8") != 0) {
@@ -207,10 +255,15 @@ void CharacterEncodingDetector::detectAndConvert() {
  *   algorithm and larger frequent character lists than ICU
  * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
  * - pick the highest match
+ * - signal to the caller whether this match is considered good: confidence > 15, and confidence
+ *   delta with the next runner up > 15
  */
 const UCharsetMatch *CharacterEncodingDetector::getPreferred(
-        const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) {
+        const char *input, size_t len,
+        const UCharsetMatch** ucma, size_t nummatches,
+        bool *goodmatch) {
 
+    *goodmatch = false;
     Vector<const UCharsetMatch*> matches;
     UErrorCode status = U_ZERO_ERROR;
 
@@ -227,6 +280,10 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
         return NULL;
     }
     if (num == 1) {
+        int confidence = ucsdet_getConfidence(matches[0], &status);
+        if (confidence > 15) {
+            *goodmatch = true;
+        }
         return matches[0];
     }
 
@@ -326,15 +383,35 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
     // find match with highest confidence after adjusting for unlikely characters
     int highest = newconfidence[0];
     size_t highestidx = 0;
+    int runnerup = -10000;
+    int runnerupidx = -10000;
     num = newconfidence.size();
     for (size_t i = 1; i < num; i++) {
         if (newconfidence[i] > highest) {
+            runnerup = highest;
+            runnerupidx = highestidx;
             highest = newconfidence[i];
             highestidx = i;
+        } else if (newconfidence[i] > runnerup){
+            runnerup = newconfidence[i];
+            runnerupidx = i;
         }
     }
     status = U_ZERO_ERROR;
-    ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest);
+    ALOGV("selecting: '%s' w/ %d confidence",
+            ucsdet_getName(matches[highestidx], &status), highest);
+    if (runnerupidx < 0) {
+        ALOGV("no runner up");
+        if (highest > 15) {
+            *goodmatch = true;
+        }
+    } else {
+        ALOGV("runner up: '%s' w/ %d confidence",
+                ucsdet_getName(matches[runnerupidx], &status), runnerup);
+        if ((highest - runnerup) > 15) {
+            *goodmatch = true;
+        }
+    }
     return matches[highestidx];
 }
author	Marco Nelissen <marcone@google.com>	2014-03-18 14:00:39 -0700
committer	Marco Nelissen <marcone@google.com>	2014-03-19 09:48:18 -0700
commit	bfd55f243feb3f04e26ad07aae035475768ada8a (patch)
tree	ace7974e03aa66e5858c58e3205171d786e0882e /media/libmedia/CharacterEncodingDetector.cpp
parent	e848bd9abb3bbbd4c66f9fb9d1442f7663d7ba40 (diff)
download	frameworks_av-bfd55f243feb3f04e26ad07aae035475768ada8a.zip frameworks_av-bfd55f243feb3f04e26ad07aae035475768ada8a.tar.gz frameworks_av-bfd55f243feb3f04e26ad07aae035475768ada8a.tar.bz2