From 544ad2be674423238c47650d2c8588ba7dfc9ed2 Mon Sep 17 00:00:00 2001 From: Marco Nelissen Date: Wed, 13 Nov 2013 14:18:21 -0800 Subject: Better character set encoding detection Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't. To better detect the real encoding we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0 --- media/libmedia/CharacterEncodingDetector.h | 61 ++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 media/libmedia/CharacterEncodingDetector.h (limited to 'media/libmedia/CharacterEncodingDetector.h') diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h new file mode 100644 index 0000000..3655a91 --- /dev/null +++ b/media/libmedia/CharacterEncodingDetector.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _CHARACTER_ENCODING_DETECTOR_H +#define _CHARACTER_ENCODING_DETECTOR_H + +#include + +#include "StringArray.h" + +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#include "unicode/ustring.h" + +namespace android { + +class CharacterEncodingDetector { + + public: + CharacterEncodingDetector(); + ~CharacterEncodingDetector(); + + void addTag(const char *name, const char *value); + size_t size(); + + void detectAndConvert(); + status_t getTag(int index, const char **name, const char**value); + + private: + const UCharsetMatch *getPreferred( + const char *input, size_t len, const UCharsetMatch** ucma, size_t matches); + + bool isFrequent(const uint16_t *values, uint32_t c); + + // cached name and value strings, for native encoding support. + // TODO: replace these with byte blob arrays that don't require the data to be + // singlenullbyte-terminated + StringArray mNames; + StringArray mValues; + + UConverter* mUtf8Conv; +}; + + + +}; // namespace android + +#endif -- cgit v1.1