Merge "Use CharacterEncodingDetector in metadataretriever" into lmp-dev

author: Marco Nelissen <marcone@google.com> 2014-09-03 19:09:43 +0000
committer: Android (Google) Code Review <android-gerrit@google.com> 2014-09-03 19:09:43 +0000
commit: d71233a846aca7035a851941c1530d04c6a65086 (patch)
tree: a1f528af0c52fd0d78a1843743f2930a2684f2b2 /media/libmedia
parent: acdae5d7865b604acaadd3be1c45c84ca4bf3952 (diff)
parent: 34581f44cde67960fbac3ba1f191a2c063ea5145 (diff)
download: frameworks_av-d71233a846aca7035a851941c1530d04c6a65086.zip
frameworks_av-d71233a846aca7035a851941c1530d04c6a65086.tar.gz
frameworks_av-d71233a846aca7035a851941c1530d04c6a65086.tar.bz2
5 files changed, 51 insertions, 183 deletions
diff --git a/media/libmedia/Android.mk b/media/libmedia/Android.mk
index 37bc418..e012116 100644
--- a/media/libmedia/Android.mk
+++ b/media/libmedia/Android.mk
@@ -76,9 +76,10 @@ LOCAL_MODULE:= libmedia
 
 LOCAL_C_INCLUDES := \
     $(TOP)/frameworks/native/include/media/openmax \
+    $(TOP)/frameworks/av/include/media/ \
     $(TOP)/frameworks/av/media/libstagefright \
-    external/icu/icu4c/source/common \
-    external/icu/icu4c/source/i18n \
+    $(TOP)/external/icu/icu4c/source/common \
+    $(TOP)/external/icu/icu4c/source/i18n \
     $(call include-path-for, audio-effects) \
     $(call include-path-for, audio-utils)
 
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp
index 7d1ddfd..41994dc 100644
--- a/media/libmedia/CharacterEncodingDetector.cpp
+++ b/media/libmedia/CharacterEncodingDetector.cpp
@@ -18,7 +18,7 @@
 #define LOG_TAG "CharacterEncodingDector"
 #include <utils/Log.h>
 
-#include "CharacterEncodingDetector.h"
+#include <CharacterEncodingDetector.h>
 #include "CharacterEncodingDetectorTables.h"
 
 #include "utils/Vector.h"
@@ -118,10 +118,12 @@ void CharacterEncodingDetector::detectAndConvert() {
             int32_t matches;
             const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
             bool goodmatch = true;
+            int highest = 0;
             const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
-                    ucma, matches, &goodmatch);
+                    ucma, matches, &goodmatch, &highest);
 
-            if (!goodmatch && strlen(buf) < 20) {
+            ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
+            if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
                 ALOGV("not a good match, trying with more data");
                 // This string might be too short for ICU to do anything useful with.
                 // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
@@ -146,9 +148,10 @@ void CharacterEncodingDetector::detectAndConvert() {
                     ucsdet_setText(csd, buf, strlen(buf), &status);
                     ucma = ucsdet_detectAll(csd, &matches, &status);
                     bestCombinedMatch = getPreferred(buf, strlen(buf),
-                            ucma, matches, &goodmatch);
-                    if (!goodmatch) {
+                            ucma, matches, &goodmatch, &highest);
+                    if (!goodmatch && highest <= 15) {
                         ALOGV("still not a good match after adding printable tags");
+                        bestCombinedMatch = NULL;
                     }
                 } else {
                     ALOGV("no printable tags to add");
@@ -157,6 +160,8 @@ void CharacterEncodingDetector::detectAndConvert() {
 
             if (bestCombinedMatch != NULL) {
                 combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+            } else {
+                combinedenc = "ISO-8859-1";
             }
         }
 
@@ -199,10 +204,17 @@ void CharacterEncodingDetector::detectAndConvert() {
             if (strcmp(enc,"UTF-8") != 0) {
                 // only convert if the source encoding isn't already UTF-8
                 ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
+                status = U_ZERO_ERROR;
                 UConverter *conv = ucnv_open(enc, &status);
                 if (U_FAILURE(status)) {
-                    ALOGE("could not create UConverter for %s", enc);
-                    continue;
+                    ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
+                            enc, status);
+                    status = U_ZERO_ERROR;
+                    conv = ucnv_open("ISO-8859-1", &status);
+                    if (U_FAILURE(status)) {
+                        ALOGW("could not create UConverter for ISO-8859-1 either");
+                        continue;
+                    }
                 }
 
                 // convert from native encoding to UTF-8
@@ -224,7 +236,16 @@ void CharacterEncodingDetector::detectAndConvert() {
                 } else {
                     // zero terminate
                     *target = 0;
-                    mValues.setEntry(i, buffer);
+                    // strip trailing spaces
+                    while (--target > buffer && *target == ' ') {
+                        *target = 0;
+                    }
+                    // skip leading spaces
+                    char *start = buffer;
+                    while (*start == ' ') {
+                        start++;
+                    }
+                    mValues.setEntry(i, start);
                 }
 
                 delete[] buffer;
@@ -261,7 +282,7 @@ void CharacterEncodingDetector::detectAndConvert() {
 const UCharsetMatch *CharacterEncodingDetector::getPreferred(
         const char *input, size_t len,
         const UCharsetMatch** ucma, size_t nummatches,
-        bool *goodmatch) {
+        bool *goodmatch, int *highestmatch) {
 
     *goodmatch = false;
     Vector<const UCharsetMatch*> matches;
@@ -316,11 +337,17 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
         }
 
         ALOGV("%zu: %s %d", i, encname, confidence);
+        status = U_ZERO_ERROR;
         UConverter *conv = ucnv_open(encname, &status);
+        int demerit = 0;
+        if (U_FAILURE(status)) {
+            ALOGV("failed to open %s: %d", encname, status);
+            confidence = 0;
+            demerit += 1000;
+        }
         const char *source = input;
         const char *sourceLimit = input + len;
         status = U_ZERO_ERROR;
-        int demerit = 0;
         int frequentchars = 0;
         int totalchars = 0;
         while (true) {
@@ -337,7 +364,8 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
             if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
                 ALOGV("control character %x", c);
                 demerit += 100;
-            } else if ((c >= 0xa0 && c <= 0xbe)         // symbols, superscripts
+            } else if ((c == 0xa0)                      // no-break space
+                    || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
                     || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
                     || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
                 ALOGV("unlikely character %x", c);
@@ -408,10 +436,14 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
     } else {
         ALOGV("runner up: '%s' w/ %d confidence",
                 ucsdet_getName(matches[runnerupidx], &status), runnerup);
+        if (runnerup < 0) {
+            runnerup = 0;
+        }
         if ((highest - runnerup) > 15) {
             *goodmatch = true;
         }
     }
+    *highestmatch = highest;
     return matches[highestidx];
 }
 
diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h
deleted file mode 100644
index 7b5ed86..0000000
--- a/media/libmedia/CharacterEncodingDetector.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _CHARACTER_ENCODING_DETECTOR_H
-#define _CHARACTER_ENCODING_DETECTOR_H
-
-#include <media/mediascanner.h>
-
-#include "StringArray.h"
-
-#include "unicode/ucnv.h"
-#include "unicode/ucsdet.h"
-#include "unicode/ustring.h"
-
-namespace android {
-
-class CharacterEncodingDetector {
-
-    public:
-    CharacterEncodingDetector();
-        ~CharacterEncodingDetector();
-
-        void addTag(const char *name, const char *value);
-        size_t size();
-
-        void detectAndConvert();
-        status_t getTag(int index, const char **name, const char**value);
-
-    private:
-        const UCharsetMatch *getPreferred(
-                const char *input, size_t len,
-                const UCharsetMatch** ucma, size_t matches,
-                bool *goodmatch);
-
-        bool isFrequent(const uint16_t *values, uint32_t c);
-
-        // cached name and value strings, for native encoding support.
-        // TODO: replace these with byte blob arrays that don't require the data to be
-        // singlenullbyte-terminated
-        StringArray     mNames;
-        StringArray     mValues;
-
-        UConverter*     mUtf8Conv;
-};
-
-
-
-};  // namespace android
-
-#endif
diff --git a/media/libmedia/MediaScannerClient.cpp b/media/libmedia/MediaScannerClient.cpp
index 1661f04..9f803cb 100644
--- a/media/libmedia/MediaScannerClient.cpp
+++ b/media/libmedia/MediaScannerClient.cpp
@@ -25,14 +25,10 @@
 
 namespace android {
 
-MediaScannerClient::MediaScannerClient()
-    :   mEncodingDetector(NULL)
-{
+MediaScannerClient::MediaScannerClient() {
 }
 
-MediaScannerClient::~MediaScannerClient()
-{
-    delete mEncodingDetector;
+MediaScannerClient::~MediaScannerClient() {
 }
 
 void MediaScannerClient::setLocale(const char* locale)
@@ -40,31 +36,16 @@ void MediaScannerClient::setLocale(const char* locale)
     mLocale = locale; // not currently used
 }
 
-void MediaScannerClient::beginFile()
-{
-    delete mEncodingDetector;
-    mEncodingDetector = new CharacterEncodingDetector();
+void MediaScannerClient::beginFile() {
 }
 
 status_t MediaScannerClient::addStringTag(const char* name, const char* value)
 {
-    mEncodingDetector->addTag(name, value);
+    handleStringTag(name, value);
     return OK;
 }
 
-void MediaScannerClient::endFile()
-{
-    mEncodingDetector->detectAndConvert();
-
-    int size = mEncodingDetector->size();
-    if (size) {
-        for (int i = 0; i < size; i++) {
-            const char *name;
-            const char *value;
-            mEncodingDetector->getTag(i, &name, &value);
-            handleStringTag(name, value);
-        }
-    }
+void MediaScannerClient::endFile() {
 }
 
 }  // namespace android
diff --git a/media/libmedia/StringArray.h b/media/libmedia/StringArray.h
deleted file mode 100644
index ae47085..0000000
--- a/media/libmedia/StringArray.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-//
-// Sortable array of strings.  STL-ish, but STL-free.
-//  
-#ifndef _LIBS_MEDIA_STRING_ARRAY_H
-#define _LIBS_MEDIA_STRING_ARRAY_H
-
-#include <stdlib.h>
-#include <string.h>
-
-namespace android {
-
-//
-// An expanding array of strings.  Add, get, sort, delete.
-//
-class StringArray {
-public:
-    StringArray();
-    virtual ~StringArray();
-
-    //
-    // Add a string.  A copy of the string is made.
-    //
-    bool push_back(const char* str);
-
-    //
-    // Delete an entry.
-    //
-    void erase(int idx);
-
-    //
-    // Sort the array.
-    //
-    void sort(int (*compare)(const void*, const void*));
-
-    //
-    // Pass this to the sort routine to do an ascending alphabetical sort.
-    //
-    static int cmpAscendingAlpha(const void* pstr1, const void* pstr2);
-
-    //
-    // Get the #of items in the array.
-    //
-    inline int size(void) const { return mCurrent; }
-
-    //
-    // Return entry N.
-    // [should use operator[] here]
-    //
-    const char* getEntry(int idx) const {
-        return (unsigned(idx) >= unsigned(mCurrent)) ? NULL : mArray[idx];
-    }
-
-    //
-    // Set entry N to specified string.
-    // [should use operator[] here]
-    //
-    void setEntry(int idx, const char* str);
-
-private:
-    int     mMax;
-    int     mCurrent;
-    char**  mArray;
-};
-
-}; // namespace android
-
-#endif // _LIBS_MEDIA_STRING_ARRAY_H
author	Marco Nelissen <marcone@google.com>	2014-09-03 19:09:43 +0000
committer	Android (Google) Code Review <android-gerrit@google.com>	2014-09-03 19:09:43 +0000
commit	d71233a846aca7035a851941c1530d04c6a65086 (patch)
tree	a1f528af0c52fd0d78a1843743f2930a2684f2b2 /media/libmedia
parent	acdae5d7865b604acaadd3be1c45c84ca4bf3952 (diff)
parent	34581f44cde67960fbac3ba1f191a2c063ea5145 (diff)
download	frameworks_av-d71233a846aca7035a851941c1530d04c6a65086.zip frameworks_av-d71233a846aca7035a851941c1530d04c6a65086.tar.gz frameworks_av-d71233a846aca7035a851941c1530d04c6a65086.tar.bz2