about summary refs log tree commit diff stats
path: root/lib/Support/ConvertUTF.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Support/ConvertUTF.c')
-rw-r--r-- lib/Support/ConvertUTF.c 153
1 files changed, 145 insertions, 8 deletions
diff --git a/lib/Support/ConvertUTF.c b/lib/Support/ConvertUTF.c
index 23f17ca..128459a 100644
--- a/lib/Support/ConvertUTF.c
+++ b/lib/Support/ConvertUTF.c
@@ -51,6 +51,7 @@
#ifdef CVTUTF_DEBUG
#include <stdio.h>
#endif
+#include <assert.h>
static const int halfShift = 10; /* used for shifting by 10 bits */
@@ -392,6 +393,99 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
/* --------------------------------------------------------------------- */
+static unsigned
+findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
+ const UTF8 *sourceEnd) {
+ UTF8 b1, b2, b3; /* first, second and third bytes read from source */
+
+ assert(!isLegalUTF8Sequence(source, sourceEnd)); /* precondition: input is ill-formed */
+
+ /*
+ * Returns the length, in bytes, of the maximal subpart of the ill-formed
+ * UTF-8 sequence that starts at source and is bounded by sourceEnd
+ * (exclusive). The result is 0 only when the range is empty; otherwise it
+ * is 1, 2 or 3. The caller is expected to skip that many bytes and emit a
+ * single replacement character for them.
+ *
+ * Because the whole sequence is already known to be ill-formed (see the
+ * assert above), finding N leading bytes that are individually acceptable
+ * means byte N+1 is the bad or missing one, so the subpart has length N.
+ *
+ * NOTE(review): the empty-range check below runs after the assert, and
+ * isLegalUTF8Sequence is defined outside this hunk. If it reads source[0]
+ * before comparing against sourceEnd, the assert would touch one byte past
+ * an empty range in builds with assertions enabled -- confirm.
+ *
+ * Unicode 6.3.0, D93b:
+ *
+ * Maximal subpart of an ill-formed subsequence: The longest code unit
+ * subsequence starting at an unconvertible offset that is either:
+ * a. the initial subsequence of a well-formed code unit sequence, or
+ * b. a subsequence of length one.
+ */
+
+ if (source == sourceEnd)
+ return 0;
+
+ /*
+ * Perform case analysis on the lead byte, then on the following bytes.
+ * The accepted byte ranges below are the rows of Unicode 6.3.0,
+ * Table 3-7. Well-Formed UTF-8 Byte Sequences. After each read, source
+ * points just past the bytes consumed so far.
+ */
+
+ b1 = *source;
+ ++source;
+ if (b1 >= 0xC2 && b1 <= 0xDF) {
+ /*
+ * Lead byte of a two-byte sequence. The first byte is valid, but we know
+ * that this code unit sequence is invalid, so the second byte must be bad
+ * or missing and the maximal subpart has to end after the first byte.
+ */
+ return 1;
+ }
+
+ if (source == sourceEnd) /* the lead byte is the only byte available */
+ return 1;
+
+ b2 = *source;
+ ++source;
+
+ /* Three-byte leads (E0..EF): a second byte in range extends the subpart to 2. */
+ if (b1 == 0xE0) {
+ return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; /* E0 accepts only A0..BF */
+ }
+ if (b1 >= 0xE1 && b1 <= 0xEC) {
+ return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 == 0xED) {
+ return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; /* ED accepts only 80..9F */
+ }
+ if (b1 >= 0xEE && b1 <= 0xEF) {
+ return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
+ }
+ /* Four-byte leads (F0..F4): up to three leading bytes may be acceptable. */
+ if (b1 == 0xF0) {
+ if (b2 >= 0x90 && b2 <= 0xBF) { /* F0 accepts only 90..BF */
+ if (source == sourceEnd) /* input ends after the second byte */
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; /* 3: fourth byte is the bad or missing one */
+ }
+ return 1;
+ }
+ if (b1 >= 0xF1 && b1 <= 0xF3) {
+ if (b2 >= 0x80 && b2 <= 0xBF) {
+ if (source == sourceEnd) /* input ends after the second byte */
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+ if (b1 == 0xF4) {
+ if (b2 >= 0x80 && b2 <= 0x8F) { /* F4 accepts only 80..8F */
+ if (source == sourceEnd) /* input ends after the second byte */
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+
+ assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
+ /*
+ * Every lead byte handled above has returned, so only 80..C1 and F5..FF
+ * can reach this point. There are no valid sequences that start with these
+ * bytes. Maximal subpart is defined to have length 1 in these cases.
+ */
+ return 1;
+}
+
+/* --------------------------------------------------------------------- */
+
/*
* Exported function to return the total number of bytes in a codepoint
* represented in UTF-8, given the value of the first byte.
@@ -491,9 +585,10 @@ ConversionResult ConvertUTF8toUTF16 (
/* --------------------------------------------------------------------- */
-ConversionResult ConvertUTF8toUTF32 (
+static ConversionResult ConvertUTF8toUTF32Impl(
const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
+ Boolean InputIsPartial) {
ConversionResult result = conversionOK;
const UTF8* source = *sourceStart;
UTF32* target = *targetStart;
@@ -501,12 +596,42 @@ ConversionResult ConvertUTF8toUTF32 (
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (extraBytesToRead >= sourceEnd - source) {
- result = sourceExhausted; break;
+ if (flags == strictConversion || InputIsPartial) {
+ result = sourceExhausted;
+ break;
+ } else {
+ result = sourceIllegal;
+
+ /*
+ * Replace the maximal subpart of ill-formed sequence with
+ * replacement character.
+ */
+ source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
+ sourceEnd);
+ *target++ = UNI_REPLACEMENT_CHAR;
+ continue;
+ }
}
+ if (target >= targetEnd) {
+ result = targetExhausted; break;
+ }
+
/* Do this check whether lenient or strict */
if (!isLegalUTF8(source, extraBytesToRead+1)) {
result = sourceIllegal;
- break;
+ if (flags == strictConversion) {
+ /* Abort conversion. */
+ break;
+ } else {
+ /*
+ * Replace the maximal subpart of ill-formed sequence with
+ * replacement character.
+ */
+ source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
+ sourceEnd);
+ *target++ = UNI_REPLACEMENT_CHAR;
+ continue;
+ }
}
/*
* The cases all fall through. See "Note A" below.
@@ -521,10 +646,6 @@ ConversionResult ConvertUTF8toUTF32 (
}
ch -= offsetsFromUTF8[extraBytesToRead];
- if (target >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up the source pointer! */
- result = targetExhausted; break;
- }
if (ch <= UNI_MAX_LEGAL_UTF32) {
/*
* UTF-16 surrogate values are illegal in UTF-32, and anything
@@ -551,6 +672,22 @@ ConversionResult ConvertUTF8toUTF32 (
return result;
}
+ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
+ const UTF8 *sourceEnd,
+ UTF32 **targetStart,
+ UTF32 *targetEnd,
+ ConversionFlags flags) { /*
+ * UTF-8 to UTF-32 conversion for input that may be an incomplete prefix of
+ * a longer stream. Forwards every argument unchanged to
+ * ConvertUTF8toUTF32Impl with InputIsPartial set, so a sequence cut short
+ * by sourceEnd is reported as sourceExhausted even when flags is not
+ * strictConversion, instead of being replaced. Returns whatever the
+ * implementation returns.
+ */
+ return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
+ flags, /*InputIsPartial=*/true);
+}
+
+ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
+ const UTF8 *sourceEnd, UTF32 **targetStart,
+ UTF32 *targetEnd, ConversionFlags flags) { /*
+ * UTF-8 to UTF-32 conversion for input that is complete. Forwards every
+ * argument unchanged to ConvertUTF8toUTF32Impl with InputIsPartial clear.
+ * Under strictConversion a sequence cut short by sourceEnd is still
+ * reported as sourceExhausted; otherwise its maximal subpart is replaced
+ * with UNI_REPLACEMENT_CHAR and the result is set to sourceIllegal.
+ * Returns whatever the implementation returns.
+ */
+ return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
+ flags, /*InputIsPartial=*/false);
+}
+
/* ---------------------------------------------------------------------
Note A.