diff options
author | Marco Nelissen <marcone@google.com> | 2013-11-13 14:18:21 -0800 |
---|---|---|
committer | Marco Nelissen <marcone@google.com> | 2013-12-11 10:28:44 -0800 |
commit | 544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch) | |
tree | 1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libmedia/CharacterEncodingDetector.h | |
parent | 35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff) | |
download | frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2 |
Better character set encoding detection
ID3 tags are supposed to be ISO-8859-1 or Unicode, but often aren't.
To better detect the real encoding we now use ICU to detect possible
encodings for a given byte sequence, then apply additional heuristics
to determine the most likely one.
b/5564857
Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
Diffstat (limited to 'media/libmedia/CharacterEncodingDetector.h')
-rw-r--r-- | media/libmedia/CharacterEncodingDetector.h | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h new file mode 100644 index 0000000..3655a91 --- /dev/null +++ b/media/libmedia/CharacterEncodingDetector.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _CHARACTER_ENCODING_DETECTOR_H +#define _CHARACTER_ENCODING_DETECTOR_H + +#include <media/mediascanner.h> + +#include "StringArray.h" + +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#include "unicode/ustring.h" + +namespace android { + +class CharacterEncodingDetector { + + public: + CharacterEncodingDetector(); + ~CharacterEncodingDetector(); + + void addTag(const char *name, const char *value); + size_t size(); + + void detectAndConvert(); + status_t getTag(int index, const char **name, const char**value); + + private: + const UCharsetMatch *getPreferred( + const char *input, size_t len, const UCharsetMatch** ucma, size_t matches); + + bool isFrequent(const uint16_t *values, uint32_t c); + + // cached name and value strings, for native encoding support. + // TODO: replace these with byte blob arrays that don't require the data to be + // singlenullbyte-terminated + StringArray mNames; + StringArray mValues; + + UConverter* mUtf8Conv; +}; + + + +}; // namespace android + +#endif |