Better character set encoding detection

ID3 tags are supposed to be ISO-8859-1 or Unicode, but often aren't. To better detect the real encoding, we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
author: Marco Nelissen <marcone@google.com> 2013-11-13 14:18:21 -0800
committer: Marco Nelissen <marcone@google.com> 2013-12-11 10:28:44 -0800
commit: 544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch)
tree: 1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libmedia/CharacterEncodingDetector.h
parent: 35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff)
download: frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2
1 files changed, 61 insertions, 0 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h
new file mode 100644
index 0000000..3655a91
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _CHARACTER_ENCODING_DETECTOR_H
+#define _CHARACTER_ENCODING_DETECTOR_H
+
+#include <media/mediascanner.h>
+
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+class CharacterEncodingDetector {
+    // Detects the likely character encoding of tag strings, using ICU's charset detector.
+    public:
+    CharacterEncodingDetector();
+        ~CharacterEncodingDetector();
+        // Adds one tag name/value pair; presumably cached in mNames/mValues -- confirm.
+        void addTag(const char *name, const char *value);
+        size_t size();  // presumably the number of tags added so far -- confirm in .cpp
+        // Per its name: detects the encoding of the added tags and converts them.
+        void detectAndConvert();
+        status_t getTag(int index, const char **name, const char**value);
+        // getTag(): name/value look like out-params for the tag at index -- confirm in .cpp.
+    private:
+        const UCharsetMatch *getPreferred(
+                const char *input, size_t len, const UCharsetMatch** ucma, size_t matches);
+        // getPreferred(): presumably picks one of the `matches` candidates in ucma -- confirm.
+        bool isFrequent(const uint16_t *values, uint32_t c);  // presumably: is c listed in values?
+
+        // Cached tag name and value strings, for native encoding support.
+        // TODO: replace these with byte blob arrays that don't require the data to be
+        // terminated by a single null byte.
+        StringArray     mNames;
+        StringArray     mValues;
+        // ICU converter handle; presumably a UTF-8 converter, per its name -- confirm in .cpp.
+        UConverter*     mUtf8Conv;
+};
+
+
+
+};  // namespace android
+
+#endif
author	Marco Nelissen <marcone@google.com>	2013-11-13 14:18:21 -0800
committer	Marco Nelissen <marcone@google.com>	2013-12-11 10:28:44 -0800
commit	544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch)
tree	1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libmedia/CharacterEncodingDetector.h
parent	35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff)
download	frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2