From 544ad2be674423238c47650d2c8588ba7dfc9ed2 Mon Sep 17 00:00:00 2001
From: Marco Nelissen <marcone@google.com>
Date: Wed, 13 Nov 2013 14:18:21 -0800
Subject: Better character set encoding detection

ID3 tags are supposed to be ISO-8859-1 or Unicode, but often aren't.
To better detect the real encoding we now use ICU to detect possible
encodings for a given byte sequence, then apply additional heuristics
to determine the most likely one.
b/5564857

Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
---
 media/libstagefright/id3/ID3.cpp | 79 +++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 50 deletions(-)

(limited to 'media/libstagefright/id3/ID3.cpp')

diff --git a/media/libstagefright/id3/ID3.cpp b/media/libstagefright/id3/ID3.cpp
index 34d671a..a486522 100644
--- a/media/libstagefright/id3/ID3.cpp
+++ b/media/libstagefright/id3/ID3.cpp
@@ -468,49 +468,6 @@ void ID3::Iterator::getID(String8 *id) const {
     }
 }
 
-static void convertISO8859ToString8(
-        const uint8_t *data, size_t size,
-        String8 *s) {
-    size_t utf8len = 0;
-    for (size_t i = 0; i < size; ++i) {
-        if (data[i] == '\0') {
-            size = i;
-            break;
-        } else if (data[i] < 0x80) {
-            ++utf8len;
-        } else {
-            utf8len += 2;
-        }
-    }
-
-    if (utf8len == size) {
-        // Only ASCII characters present.
-
-        s->setTo((const char *)data, size);
-        return;
-    }
-
-    char *tmp = new char[utf8len];
-    char *ptr = tmp;
-    for (size_t i = 0; i < size; ++i) {
-        if (data[i] == '\0') {
-            break;
-        } else if (data[i] < 0x80) {
-            *ptr++ = data[i];
-        } else if (data[i] < 0xc0) {
-            *ptr++ = 0xc2;
-            *ptr++ = data[i];
-        } else {
-            *ptr++ = 0xc3;
-            *ptr++ = data[i] - 64;
-        }
-    }
-
-    s->setTo(tmp, utf8len);
-
-    delete[] tmp;
-    tmp = NULL;
-}
 
 // the 2nd argument is used to get the data following the \0 in a comment field
 void ID3::Iterator::getString(String8 *id, String8 *comment) const {
@@ -543,7 +500,9 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
             return;
         }
 
-        convertISO8859ToString8(frameData, mFrameSize, id);
+        // this is supposed to be ISO-8859-1, but pass it up as-is to the caller, who will figure
+        // out the real encoding
+        id->setTo((const char*)frameData, mFrameSize);
         return;
     }
 
@@ -561,13 +520,13 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
     }
 
     if (encoding == 0x00) {
-        // ISO 8859-1
-        convertISO8859ToString8(frameData + 1, n, id);
+        // supposedly ISO 8859-1
+        id->setTo((const char*)frameData + 1, n);
     } else if (encoding == 0x03) {
-        // UTF-8
+        // supposedly UTF-8
         id->setTo((const char *)(frameData + 1), n);
     } else if (encoding == 0x02) {
-        // UTF-16 BE, no byte order mark.
+        // supposedly UTF-16 BE, no byte order mark.
         // API wants number of characters, not number of bytes...
         int len = n / 2;
         const char16_t *framedata = (const char16_t *) (frameData + 1);
@@ -583,7 +542,7 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
         if (framedatacopy != NULL) {
             delete[] framedatacopy;
         }
-    } else {
+    } else if (encoding == 0x01) {
         // UCS-2
         // API wants number of characters, not number of bytes...
         int len = n / 2;
@@ -602,7 +561,27 @@ void ID3::Iterator::getstring(String8 *id, bool otherdata) const {
             framedata++;
             len--;
         }
-        id->setTo(framedata, len);
+
+        // check if the resulting data consists entirely of 8-bit values
+        bool eightBit = true;
+        for (int i = 0; i < len; i++) {
+            if (framedata[i] > 0xff) {
+                eightBit = false;
+                break;
+            }
+        }
+        if (eightBit) {
+            // collapse to 8 bit, then let the media scanner client figure out the real encoding
+            char *frame8 = new char[len];
+            for (int i = 0; i < len; i++) {
+                frame8[i] = framedata[i];
+            }
+            id->setTo(frame8, len);
+            delete [] frame8;
+        } else {
+            id->setTo(framedata, len);
+        }
+
         if (framedatacopy != NULL) {
             delete[] framedatacopy;
         }
-- 
cgit v1.1