From 544ad2be674423238c47650d2c8588ba7dfc9ed2 Mon Sep 17 00:00:00 2001
From: Marco Nelissen <marcone@google.com>
Date: Wed, 13 Nov 2013 14:18:21 -0800
Subject: Better character set encoding detection

Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't.
To better detect the real encoding we now use ICU to detect possible
encodings for a given byte sequence, then apply additional heuristics
to determine the most likely one.
b/5564857

Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
---
 media/libmedia/CharacterEncodingDetector.h | 61 ++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 media/libmedia/CharacterEncodingDetector.h

(limited to 'media/libmedia/CharacterEncodingDetector.h')

diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h
new file mode 100644
index 0000000..3655a91
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _CHARACTER_ENCODING_DETECTOR_H
+#define _CHARACTER_ENCODING_DETECTOR_H
+
+#include <media/mediascanner.h>
+
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+class CharacterEncodingDetector {
+
+    public:
+    CharacterEncodingDetector();
+        ~CharacterEncodingDetector();
+
+        void addTag(const char *name, const char *value);
+        size_t size();
+
+        void detectAndConvert();
+        status_t getTag(int index, const char **name, const char**value);
+
+    private:
+        const UCharsetMatch *getPreferred(
+                const char *input, size_t len, const UCharsetMatch** ucma, size_t matches);
+
+        bool isFrequent(const uint16_t *values, uint32_t c);
+
+        // cached name and value strings, for native encoding support.
+        // TODO: replace these with byte blob arrays that don't require the data to be
+        // singlenullbyte-terminated
+        StringArray     mNames;
+        StringArray     mValues;
+
+        UConverter*     mUtf8Conv;
+};
+
+
+
+};  // namespace android
+
+#endif
-- 
cgit v1.1