summaryrefslogtreecommitdiffstats
path: root/media/libmedia/CharacterEncodingDetector.h
diff options
context:
space:
mode:
authorMarco Nelissen <marcone@google.com>2013-11-13 14:18:21 -0800
committerMarco Nelissen <marcone@google.com>2013-12-11 10:28:44 -0800
commit544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch)
tree1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libmedia/CharacterEncodingDetector.h
parent35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff)
downloadframeworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2
Better character set encoding detection
Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't. To better detect the real encoding we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
Diffstat (limited to 'media/libmedia/CharacterEncodingDetector.h')
-rw-r--r--media/libmedia/CharacterEncodingDetector.h61
1 files changed, 61 insertions, 0 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h
new file mode 100644
index 0000000..3655a91
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _CHARACTER_ENCODING_DETECTOR_H
+#define _CHARACTER_ENCODING_DETECTOR_H
+
+#include <media/mediascanner.h>
+
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+class CharacterEncodingDetector {
+ // Gathers tag name/value strings so their likely character encoding can be detected (ICU plus heuristics, per the change description).
+ public:
+ CharacterEncodingDetector();
+ ~CharacterEncodingDetector();
+ // Presumably appends one name/value pair to mNames/mValues; the definition is not visible in this header - confirm.
+ void addTag(const char *name, const char *value);
+ size_t size();  // presumably the number of tags added so far - confirm
+ // Presumably runs detection over the collected tags and converts their text; see the implementation file for the actual behavior.
+ void detectAndConvert();
+ status_t getTag(int index, const char **name, const char**value);  // presumably returns tag `index` via the out-params; string ownership is not shown here - verify
+
+ private:
+ const UCharsetMatch *getPreferred(
+ const char *input, size_t len, const UCharsetMatch** ucma, size_t matches);  // presumably picks one of the `matches` candidates in `ucma` for `input` - confirm
+ // NOTE(review): purpose is not evident from the signature alone; document what `values` must contain and what `c` is.
+ bool isFrequent(const uint16_t *values, uint32_t c);
+
+ // cached name and value strings, for native encoding support.
+ // TODO: replace these with byte blob arrays that don't require the data to be
+ // terminated by a single null byte
+ StringArray mNames;
+ StringArray mValues;
+ // ICU converter handle; the name suggests a UTF-8 converter. NOTE(review): no copy ctor/assignment is declared - confirm copies cannot double-close it.
+ UConverter* mUtf8Conv;
+};
+
+
+
+}; // namespace android
+
+#endif