summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/id3/ID3.cpp
diff options
context:
space:
mode:
author: Marco Nelissen <marcone@google.com> 2013-11-13 14:18:21 -0800
committer: Marco Nelissen <marcone@google.com> 2013-12-11 10:28:44 -0800
commit: 544ad2be674423238c47650d2c8588ba7dfc9ed2 (patch)
tree: 1167d14b0e345cd7ad6c2a415a7134c915507b86 /media/libstagefright/id3/ID3.cpp
parent: 35a9e7d49ff5ff99fedb0bf2a35c39f7ea5a2f9e (diff)
downloadframeworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.zip
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.gz
frameworks_av-544ad2be674423238c47650d2c8588ba7dfc9ed2.tar.bz2
Better character set encoding detection
ID3 tags are supposed to be ISO-8859-1 or Unicode, but often aren't. To better detect the real encoding, we now use ICU to detect possible encodings for a given byte sequence, then apply additional heuristics to determine the most likely one. b/5564857 Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
Diffstat (limited to 'media/libstagefright/id3/ID3.cpp')
-rw-r--r--media/libstagefright/id3/ID3.cpp79
1 files changed, 29 insertions, 50 deletions
diff --git a/media/libstagefright/id3/ID3.cpp b/media/libstagefright/id3/ID3.cpp
index 34d671a..a486522 100644
--- a/media/libstagefright/id3/ID3.cpp
+++ b/media/libstagefright/id3/ID3.cpp
@@ -468,49 +468,6 @@ void ID3::Iterator::getID(String8 *id) const {
}
}
-static void convertISO8859ToString8(
- const uint8_t *data, size_t size,
- String8 *s) {
- size_t utf8len = 0;
- for (size_t i = 0; i < size; ++i) {
- if (data[i] == '\0') {
- size = i;
- break;
- } else if (data[i] < 0x80) {
- ++utf8len;
- } else {
- utf8len += 2;
- }
- }
-
- if (utf8len == size) {
- // Only ASCII characters present.
-
- s->setTo((const char *)data, size);
- return;
- }
-
- char *tmp = new char[utf8len];
- char *ptr = tmp;
- for (size_t i = 0; i < size; ++i) {
- if (data[i] == '\0') {
- break;
- } else if (data[i] < 0x80) {
- *ptr++ = data[i];
- } else if (data[i] < 0xc0) {
- *ptr++ = 0xc2;
- *ptr++ = data[i];
- } else {
- *ptr++ = 0xc3;
- *ptr++ = data[i] - 64;
- }
- }
-
- s->setTo(tmp, utf8len);
-
- delete[] tmp;
- tmp = NULL;
-}
// the 2nd argument is used to get the data following the \0 in a comment field
void ID3::Iterator::getString(String8 *id, String8 *comment) const {
@@ -543,7 +500,9 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
return;
}
- convertISO8859ToString8(frameData, mFrameSize, id);
+ // this is supposed to be ISO-8859-1, but pass it up as-is to the caller, who will figure
+ // out the real encoding
+ id->setTo((const char*)frameData, mFrameSize);
return;
}
@@ -561,13 +520,13 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
}
if (encoding == 0x00) {
- // ISO 8859-1
- convertISO8859ToString8(frameData + 1, n, id);
+ // supposedly ISO 8859-1
+ id->setTo((const char*)frameData + 1, n);
} else if (encoding == 0x03) {
- // UTF-8
+ // supposedly UTF-8
id->setTo((const char *)(frameData + 1), n);
} else if (encoding == 0x02) {
- // UTF-16 BE, no byte order mark.
+ // supposedly UTF-16 BE, no byte order mark.
// API wants number of characters, not number of bytes...
int len = n / 2;
const char16_t *framedata = (const char16_t *) (frameData + 1);
@@ -583,7 +542,7 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
if (framedatacopy != NULL) {
delete[] framedatacopy;
}
- } else {
+ } else if (encoding == 0x01) {
// UCS-2
// API wants number of characters, not number of bytes...
int len = n / 2;
@@ -602,7 +561,27 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
framedata++;
len--;
}
- id->setTo(framedata, len);
+
+ // check if the resulting data consists entirely of 8-bit values
+ bool eightBit = true;
+ for (int i = 0; i < len; i++) {
+ if (framedata[i] > 0xff) {
+ eightBit = false;
+ break;
+ }
+ }
+ if (eightBit) {
+ // collapse to 8 bit, then let the media scanner client figure out the real encoding
+ char *frame8 = new char[len];
+ for (int i = 0; i < len; i++) {
+ frame8[i] = framedata[i];
+ }
+ id->setTo(frame8, len);
+ delete [] frame8;
+ } else {
+ id->setTo(framedata, len);
+ }
+
if (framedatacopy != NULL) {
delete[] framedatacopy;
}