From 544ad2be674423238c47650d2c8588ba7dfc9ed2 Mon Sep 17 00:00:00 2001 From: Marco Nelissen Date: Wed, 13 Nov 2013 14:18:21 -0800 Subject: Better character set encoding detection Id3 tags are supposed to be ISO-8859-1 or unicode, but often aren't. To better detect the real encoding we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0 --- media/libstagefright/id3/ID3.cpp | 79 +++++++++++++++------------------------- 1 file changed, 29 insertions(+), 50 deletions(-) (limited to 'media/libstagefright/id3/ID3.cpp') diff --git a/media/libstagefright/id3/ID3.cpp b/media/libstagefright/id3/ID3.cpp index 34d671a..a486522 100644 --- a/media/libstagefright/id3/ID3.cpp +++ b/media/libstagefright/id3/ID3.cpp @@ -468,49 +468,6 @@ void ID3::Iterator::getID(String8 *id) const { } } -static void convertISO8859ToString8( - const uint8_t *data, size_t size, - String8 *s) { - size_t utf8len = 0; - for (size_t i = 0; i < size; ++i) { - if (data[i] == '\0') { - size = i; - break; - } else if (data[i] < 0x80) { - ++utf8len; - } else { - utf8len += 2; - } - } - - if (utf8len == size) { - // Only ASCII characters present. - - s->setTo((const char *)data, size); - return; - } - - char *tmp = new char[utf8len]; - char *ptr = tmp; - for (size_t i = 0; i < size; ++i) { - if (data[i] == '\0') { - break; - } else if (data[i] < 0x80) { - *ptr++ = data[i]; - } else if (data[i] < 0xc0) { - *ptr++ = 0xc2; - *ptr++ = data[i]; - } else { - *ptr++ = 0xc3; - *ptr++ = data[i] - 64; - } - } - - s->setTo(tmp, utf8len); - - delete[] tmp; - tmp = NULL; -} // the 2nd argument is used to get the data following the \0 in a comment field void ID3::Iterator::getString(String8 *id, String8 *comment) const { @@ -543,7 +500,9 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const { return; } - convertISO8859ToString8(frameData, mFrameSize, id); + // this is supposed to be ISO-8859-1, but pass it up as-is to the caller, who will figure + // out the real encoding + id->setTo((const char*)frameData, mFrameSize); return; } @@ -561,13 +520,13 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const { } if (encoding == 0x00) { - // ISO 8859-1 - convertISO8859ToString8(frameData + 1, n, id); + // supposedly ISO 8859-1 + id->setTo((const char*)frameData + 1, n); } else if (encoding == 0x03) { - // UTF-8 + // supposedly UTF-8 id->setTo((const char *)(frameData + 1), n); } else if (encoding == 0x02) { - // UTF-16 BE, no byte order mark. + // supposedly UTF-16 BE, no byte order mark. // API wants number of characters, not number of bytes... int len = n / 2; const char16_t *framedata = (const char16_t *) (frameData + 1); @@ -583,7 +542,7 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const { if (framedatacopy != NULL) { delete[] framedatacopy; } - } else { + } else if (encoding == 0x01) { // UCS-2 // API wants number of characters, not number of bytes... int len = n / 2; @@ -602,7 +561,27 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const { framedata++; len--; } - id->setTo(framedata, len); + + // check if the resulting data consists entirely of 8-bit values + bool eightBit = true; + for (int i = 0; i < len; i++) { + if (framedata[i] > 0xff) { + eightBit = false; + break; + } + } + if (eightBit) { + // collapse to 8 bit, then let the media scanner client figure out the real encoding + char *frame8 = new char[len]; + for (int i = 0; i < len; i++) { + frame8[i] = framedata[i]; + } + id->setTo(frame8, len); + delete [] frame8; + } else { + id->setTo(framedata, len); + } + if (framedatacopy != NULL) { delete[] framedatacopy; } -- cgit v1.1