From 34581f44cde67960fbac3ba1f191a2c063ea5145 Mon Sep 17 00:00:00 2001 From: Marco Nelissen Date: Fri, 29 Aug 2014 16:00:28 -0700 Subject: Use CharacterEncodingDetector in metadataretriever instead of media scanner. This way the java MediaMetadataRetriever API will give the same result as the media scanner. Also apply some tweaks to the encoding detector to improve handling of ISO-8859-1 tags. Bug: 16302581, 17205395 Change-Id: I1682a7a6a8bf04cffaa455044ba72dd7fd152d49 --- include/media/CharacterEncodingDetector.h | 63 ++++++++++++++++ include/media/StringArray.h | 83 ++++++++++++++++++++++ include/media/mediascanner.h | 1 - media/libmedia/Android.mk | 5 +- media/libmedia/CharacterEncodingDetector.cpp | 54 +++++++++++--- media/libmedia/CharacterEncodingDetector.h | 63 ---------------- media/libmedia/MediaScannerClient.cpp | 29 ++------ media/libmedia/StringArray.h | 83 ---------------------- media/libstagefright/Android.mk | 3 + .../StagefrightMetadataRetriever.cpp | 59 ++++++++++----- 10 files changed, 243 insertions(+), 200 deletions(-) create mode 100644 include/media/CharacterEncodingDetector.h create mode 100644 include/media/StringArray.h delete mode 100644 media/libmedia/CharacterEncodingDetector.h delete mode 100644 media/libmedia/StringArray.h diff --git a/include/media/CharacterEncodingDetector.h b/include/media/CharacterEncodingDetector.h new file mode 100644 index 0000000..deaa377 --- /dev/null +++ b/include/media/CharacterEncodingDetector.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _CHARACTER_ENCODING_DETECTOR_H +#define _CHARACTER_ENCODING_DETECTOR_H + +#include + +#include "StringArray.h" + +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#include "unicode/ustring.h" + +namespace android { + +class CharacterEncodingDetector { + + public: + CharacterEncodingDetector(); + ~CharacterEncodingDetector(); + + void addTag(const char *name, const char *value); + size_t size(); + + void detectAndConvert(); + status_t getTag(int index, const char **name, const char**value); + + private: + const UCharsetMatch *getPreferred( + const char *input, size_t len, + const UCharsetMatch** ucma, size_t matches, + bool *goodmatch, int *highestmatch); + + bool isFrequent(const uint16_t *values, uint32_t c); + + // cached name and value strings, for native encoding support. + // TODO: replace these with byte blob arrays that don't require the data to be + // singlenullbyte-terminated + StringArray mNames; + StringArray mValues; + + UConverter* mUtf8Conv; +}; + + + +}; // namespace android + +#endif diff --git a/include/media/StringArray.h b/include/media/StringArray.h new file mode 100644 index 0000000..ae47085 --- /dev/null +++ b/include/media/StringArray.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +// Sortable array of strings. STL-ish, but STL-free. +// +#ifndef _LIBS_MEDIA_STRING_ARRAY_H +#define _LIBS_MEDIA_STRING_ARRAY_H + +#include +#include + +namespace android { + +// +// An expanding array of strings. Add, get, sort, delete. +// +class StringArray { +public: + StringArray(); + virtual ~StringArray(); + + // + // Add a string. A copy of the string is made. + // + bool push_back(const char* str); + + // + // Delete an entry. + // + void erase(int idx); + + // + // Sort the array. + // + void sort(int (*compare)(const void*, const void*)); + + // + // Pass this to the sort routine to do an ascending alphabetical sort. + // + static int cmpAscendingAlpha(const void* pstr1, const void* pstr2); + + // + // Get the #of items in the array. + // + inline int size(void) const { return mCurrent; } + + // + // Return entry N. + // [should use operator[] here] + // + const char* getEntry(int idx) const { + return (unsigned(idx) >= unsigned(mCurrent)) ? NULL : mArray[idx]; + } + + // + // Set entry N to specified string. + // [should use operator[] here] + // + void setEntry(int idx, const char* str); + +private: + int mMax; + int mCurrent; + char** mArray; +}; + +}; // namespace android + +#endif // _LIBS_MEDIA_STRING_ARRAY_H diff --git a/include/media/mediascanner.h b/include/media/mediascanner.h index 5213bdc..d555279 100644 --- a/include/media/mediascanner.h +++ b/include/media/mediascanner.h @@ -122,7 +122,6 @@ public: protected: // default encoding from MediaScanner::mLocale String8 mLocale; - CharacterEncodingDetector *mEncodingDetector; }; }; // namespace android diff --git a/media/libmedia/Android.mk b/media/libmedia/Android.mk index 3be0651..ffadb23 100644 --- a/media/libmedia/Android.mk +++ b/media/libmedia/Android.mk @@ -76,9 +76,10 @@ LOCAL_MODULE:= libmedia LOCAL_C_INCLUDES := \ $(TOP)/frameworks/native/include/media/openmax \ + $(TOP)/frameworks/av/include/media/ \ $(TOP)/frameworks/av/media/libstagefright \ - external/icu/icu4c/source/common \ - external/icu/icu4c/source/i18n \ + $(TOP)/external/icu/icu4c/source/common \ + $(TOP)/external/icu/icu4c/source/i18n \ $(call include-path-for, audio-effects) \ $(call include-path-for, audio-utils) diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp index 7d1ddfd..41994dc 100644 --- a/media/libmedia/CharacterEncodingDetector.cpp +++ b/media/libmedia/CharacterEncodingDetector.cpp @@ -18,7 +18,7 @@ #define LOG_TAG "CharacterEncodingDector" #include -#include "CharacterEncodingDetector.h" +#include #include "CharacterEncodingDetectorTables.h" #include "utils/Vector.h" @@ -118,10 +118,12 @@ void CharacterEncodingDetector::detectAndConvert() { int32_t matches; const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); bool goodmatch = true; + int highest = 0; const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), - ucma, matches, &goodmatch); + ucma, matches, &goodmatch, &highest); - if (!goodmatch && strlen(buf) < 20) { + ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest); + if (!goodmatch && (highest < 15 || strlen(buf) < 20)) { ALOGV("not a good match, trying with more data"); // This string might be too short for ICU to do anything useful with. // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because @@ -146,9 +148,10 @@ void CharacterEncodingDetector::detectAndConvert() { ucsdet_setText(csd, buf, strlen(buf), &status); ucma = ucsdet_detectAll(csd, &matches, &status); bestCombinedMatch = getPreferred(buf, strlen(buf), - ucma, matches, &goodmatch); - if (!goodmatch) { + ucma, matches, &goodmatch, &highest); + if (!goodmatch && highest <= 15) { ALOGV("still not a good match after adding printable tags"); + bestCombinedMatch = NULL; } } else { ALOGV("no printable tags to add"); @@ -157,6 +160,8 @@ void CharacterEncodingDetector::detectAndConvert() { if (bestCombinedMatch != NULL) { combinedenc = ucsdet_getName(bestCombinedMatch, &status); + } else { + combinedenc = "ISO-8859-1"; } } @@ -199,10 +204,17 @@ void CharacterEncodingDetector::detectAndConvert() { if (strcmp(enc,"UTF-8") != 0) { // only convert if the source encoding isn't already UTF-8 ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i)); + status = U_ZERO_ERROR; UConverter *conv = ucnv_open(enc, &status); if (U_FAILURE(status)) { - ALOGE("could not create UConverter for %s", enc); - continue; + ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1", + enc, status); + status = U_ZERO_ERROR; + conv = ucnv_open("ISO-8859-1", &status); + if (U_FAILURE(status)) { + ALOGW("could not create UConverter for ISO-8859-1 either"); + continue; + } } // convert from native encoding to UTF-8 @@ -224,7 +236,16 @@ void CharacterEncodingDetector::detectAndConvert() { } else { // zero terminate *target = 0; - mValues.setEntry(i, buffer); + // strip trailing spaces + while (--target > buffer && *target == ' ') { + *target = 0; + } + // skip leading spaces + char *start = buffer; + while (*start == ' ') { + start++; + } + mValues.setEntry(i, start); } delete[] buffer; @@ -261,7 +282,7 @@ void CharacterEncodingDetector::detectAndConvert() { const UCharsetMatch *CharacterEncodingDetector::getPreferred( const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches, - bool *goodmatch) { + bool *goodmatch, int *highestmatch) { *goodmatch = false; Vector matches; @@ -316,11 +337,17 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( } ALOGV("%zu: %s %d", i, encname, confidence); + status = U_ZERO_ERROR; UConverter *conv = ucnv_open(encname, &status); + int demerit = 0; + if (U_FAILURE(status)) { + ALOGV("failed to open %s: %d", encname, status); + confidence = 0; + demerit += 1000; + } const char *source = input; const char *sourceLimit = input + len; status = U_ZERO_ERROR; - int demerit = 0; int frequentchars = 0; int totalchars = 0; while (true) { @@ -337,7 +364,8 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) { ALOGV("control character %x", c); demerit += 100; - } else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts + } else if ((c == 0xa0) // no-break space + || (c >= 0xa2 && c <= 0xbe) // symbols, superscripts || (c == 0xd7) || (c == 0xf7) // multiplication and division signs || (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts ALOGV("unlikely character %x", c); @@ -408,10 +436,14 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( } else { ALOGV("runner up: '%s' w/ %d confidence", ucsdet_getName(matches[runnerupidx], &status), runnerup); + if (runnerup < 0) { + runnerup = 0; + } if ((highest - runnerup) > 15) { *goodmatch = true; } } + *highestmatch = highest; return matches[highestidx]; } diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h deleted file mode 100644 index 7b5ed86..0000000 --- a/media/libmedia/CharacterEncodingDetector.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _CHARACTER_ENCODING_DETECTOR_H -#define _CHARACTER_ENCODING_DETECTOR_H - -#include - -#include "StringArray.h" - -#include "unicode/ucnv.h" -#include "unicode/ucsdet.h" -#include "unicode/ustring.h" - -namespace android { - -class CharacterEncodingDetector { - - public: - CharacterEncodingDetector(); - ~CharacterEncodingDetector(); - - void addTag(const char *name, const char *value); - size_t size(); - - void detectAndConvert(); - status_t getTag(int index, const char **name, const char**value); - - private: - const UCharsetMatch *getPreferred( - const char *input, size_t len, - const UCharsetMatch** ucma, size_t matches, - bool *goodmatch); - - bool isFrequent(const uint16_t *values, uint32_t c); - - // cached name and value strings, for native encoding support. - // TODO: replace these with byte blob arrays that don't require the data to be - // singlenullbyte-terminated - StringArray mNames; - StringArray mValues; - - UConverter* mUtf8Conv; -}; - - - -}; // namespace android - -#endif diff --git a/media/libmedia/MediaScannerClient.cpp b/media/libmedia/MediaScannerClient.cpp index 1661f04..9f803cb 100644 --- a/media/libmedia/MediaScannerClient.cpp +++ b/media/libmedia/MediaScannerClient.cpp @@ -25,14 +25,10 @@ namespace android { -MediaScannerClient::MediaScannerClient() - : mEncodingDetector(NULL) -{ +MediaScannerClient::MediaScannerClient() { } -MediaScannerClient::~MediaScannerClient() -{ - delete mEncodingDetector; +MediaScannerClient::~MediaScannerClient() { } void MediaScannerClient::setLocale(const char* locale) @@ -40,31 +36,16 @@ void MediaScannerClient::setLocale(const char* locale) mLocale = locale; // not currently used } -void MediaScannerClient::beginFile() -{ - delete mEncodingDetector; - mEncodingDetector = new CharacterEncodingDetector(); +void MediaScannerClient::beginFile() { } status_t MediaScannerClient::addStringTag(const char* name, const char* value) { - mEncodingDetector->addTag(name, value); + handleStringTag(name, value); return OK; } -void MediaScannerClient::endFile() -{ - mEncodingDetector->detectAndConvert(); - - int size = mEncodingDetector->size(); - if (size) { - for (int i = 0; i < size; i++) { - const char *name; - const char *value; - mEncodingDetector->getTag(i, &name, &value); - handleStringTag(name, value); - } - } +void MediaScannerClient::endFile() { } } // namespace android diff --git a/media/libmedia/StringArray.h b/media/libmedia/StringArray.h deleted file mode 100644 index ae47085..0000000 --- a/media/libmedia/StringArray.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// -// Sortable array of strings. STL-ish, but STL-free. -// -#ifndef _LIBS_MEDIA_STRING_ARRAY_H -#define _LIBS_MEDIA_STRING_ARRAY_H - -#include -#include - -namespace android { - -// -// An expanding array of strings. Add, get, sort, delete. -// -class StringArray { -public: - StringArray(); - virtual ~StringArray(); - - // - // Add a string. A copy of the string is made. - // - bool push_back(const char* str); - - // - // Delete an entry. - // - void erase(int idx); - - // - // Sort the array. - // - void sort(int (*compare)(const void*, const void*)); - - // - // Pass this to the sort routine to do an ascending alphabetical sort. - // - static int cmpAscendingAlpha(const void* pstr1, const void* pstr2); - - // - // Get the #of items in the array. - // - inline int size(void) const { return mCurrent; } - - // - // Return entry N. - // [should use operator[] here] - // - const char* getEntry(int idx) const { - return (unsigned(idx) >= unsigned(mCurrent)) ? NULL : mArray[idx]; - } - - // - // Set entry N to specified string. - // [should use operator[] here] - // - void setEntry(int idx, const char* str); - -private: - int mMax; - int mCurrent; - char** mArray; -}; - -}; // namespace android - -#endif // _LIBS_MEDIA_STRING_ARRAY_H diff --git a/media/libstagefright/Android.mk b/media/libstagefright/Android.mk index be9af5e..193f8a7 100644 --- a/media/libstagefright/Android.mk +++ b/media/libstagefright/Android.mk @@ -62,6 +62,7 @@ LOCAL_SRC_FILES:= \ avc_utils.cpp \ LOCAL_C_INCLUDES:= \ + $(TOP)/frameworks/av/include/media/ \ $(TOP)/frameworks/av/include/media/stagefright/timedtext \ $(TOP)/frameworks/native/include/media/hardware \ $(TOP)/frameworks/native/include/media/openmax \ @@ -70,6 +71,8 @@ LOCAL_C_INCLUDES:= \ $(TOP)/external/openssl/include \ $(TOP)/external/libvpx/libwebm \ $(TOP)/system/netd/include \ + $(TOP)/external/icu/icu4c/source/common \ + $(TOP)/external/icu/icu4c/source/i18n \ LOCAL_SHARED_LIBRARIES := \ libbinder \ diff --git a/media/libstagefright/StagefrightMetadataRetriever.cpp b/media/libstagefright/StagefrightMetadataRetriever.cpp index 8cc41e7..101fc8a 100644 --- a/media/libstagefright/StagefrightMetadataRetriever.cpp +++ b/media/libstagefright/StagefrightMetadataRetriever.cpp @@ -32,6 +32,7 @@ #include #include #include +#include namespace android { @@ -450,32 +451,58 @@ void StagefrightMetadataRetriever::parseMetaData() { struct Map { int from; int to; + const char *name; }; static const Map kMap[] = { - { kKeyMIMEType, METADATA_KEY_MIMETYPE }, - { kKeyCDTrackNumber, METADATA_KEY_CD_TRACK_NUMBER }, - { kKeyDiscNumber, METADATA_KEY_DISC_NUMBER }, - { kKeyAlbum, METADATA_KEY_ALBUM }, - { kKeyArtist, METADATA_KEY_ARTIST }, - { kKeyAlbumArtist, METADATA_KEY_ALBUMARTIST }, - { kKeyAuthor, METADATA_KEY_AUTHOR }, - { kKeyComposer, METADATA_KEY_COMPOSER }, - { kKeyDate, METADATA_KEY_DATE }, - { kKeyGenre, METADATA_KEY_GENRE }, - { kKeyTitle, METADATA_KEY_TITLE }, - { kKeyYear, METADATA_KEY_YEAR }, - { kKeyWriter, METADATA_KEY_WRITER }, - { kKeyCompilation, METADATA_KEY_COMPILATION }, - { kKeyLocation, METADATA_KEY_LOCATION }, + { kKeyMIMEType, METADATA_KEY_MIMETYPE, NULL }, + { kKeyCDTrackNumber, METADATA_KEY_CD_TRACK_NUMBER, "tracknumber" }, + { kKeyDiscNumber, METADATA_KEY_DISC_NUMBER, "discnumber" }, + { kKeyAlbum, METADATA_KEY_ALBUM, "album" }, + { kKeyArtist, METADATA_KEY_ARTIST, "artist" }, + { kKeyAlbumArtist, METADATA_KEY_ALBUMARTIST, "albumartist" }, + { kKeyAuthor, METADATA_KEY_AUTHOR, NULL }, + { kKeyComposer, METADATA_KEY_COMPOSER, "composer" }, + { kKeyDate, METADATA_KEY_DATE, NULL }, + { kKeyGenre, METADATA_KEY_GENRE, "genre" }, + { kKeyTitle, METADATA_KEY_TITLE, "title" }, + { kKeyYear, METADATA_KEY_YEAR, "year" }, + { kKeyWriter, METADATA_KEY_WRITER, "writer" }, + { kKeyCompilation, METADATA_KEY_COMPILATION, "compilation" }, + { kKeyLocation, METADATA_KEY_LOCATION, NULL }, }; + static const size_t kNumMapEntries = sizeof(kMap) / sizeof(kMap[0]); + CharacterEncodingDetector *detector = new CharacterEncodingDetector(); + for (size_t i = 0; i < kNumMapEntries; ++i) { const char *value; if (meta->findCString(kMap[i].from, &value)) { - mMetaData.add(kMap[i].to, String8(value)); + if (kMap[i].name) { + // add to charset detector + detector->addTag(kMap[i].name, value); + } else { + // directly add to output list + mMetaData.add(kMap[i].to, String8(value)); + } + } + } + + detector->detectAndConvert(); + int size = detector->size(); + if (size) { + for (int i = 0; i < size; i++) { + const char *name; + const char *value; + detector->getTag(i, &name, &value); + for (size_t j = 0; j < kNumMapEntries; ++j) { + if (kMap[j].name && !strcmp(kMap[j].name, name)) { + mMetaData.add(kMap[j].to, String8(value)); + } + } } } + delete detector; const void *data; uint32_t type; -- cgit v1.1