|
|
eceae4 |
From f0370bfa9c47575d4e47c94e443aa91983683a43 Mon Sep 17 00:00:00 2001
|
|
|
eceae4 |
From: Erik Larsson <mechie@users.sourceforge.net>
|
|
|
eceae4 |
Date: Tue, 12 Apr 2016 17:02:40 +0200
|
|
|
eceae4 |
Subject: [PATCH 2/2] unistr.c: Unify the two defines NOREVBOM and
|
|
|
eceae4 |
ALLOW_BROKEN_SURROGATES.
|
|
|
eceae4 |
|
|
|
eceae4 |
In the mailing list discussion we came to the conclusion that there
|
|
|
eceae4 |
doesn't seem to be any reason to keep these declarations separate since
|
|
|
eceae4 |
they address the same issue, namely libntfs-3g's tolerance for bad
|
|
|
eceae4 |
Unicode data in filenames and other UTF-16 strings in the file system,
|
|
|
eceae4 |
so merge the two defines into the new define ALLOW_BROKEN_UNICODE.
|
|
|
eceae4 |
---
|
|
|
eceae4 |
libntfs-3g/unistr.c | 58 +++++++++++++++++++++++------------------------------
|
|
|
eceae4 |
1 file changed, 25 insertions(+), 33 deletions(-)
|
|
|
eceae4 |
|
|
|
eceae4 |
diff --git a/libntfs-3g/unistr.c b/libntfs-3g/unistr.c
|
|
|
eceae4 |
index 71802aa..753acc0 100644
|
|
|
eceae4 |
--- a/libntfs-3g/unistr.c
|
|
|
eceae4 |
+++ b/libntfs-3g/unistr.c
|
|
|
eceae4 |
@@ -59,12 +59,11 @@
|
|
|
eceae4 |
#include "logging.h"
|
|
|
eceae4 |
#include "misc.h"
|
|
|
eceae4 |
|
|
|
eceae4 |
-#define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */
|
|
|
eceae4 |
-
|
|
|
eceae4 |
-#ifndef ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
-/* Erik allowing broken UTF-16 surrogate pairs by default, open to debate. */
|
|
|
eceae4 |
-#define ALLOW_BROKEN_SURROGATES 1
|
|
|
eceae4 |
-#endif /* !defined(ALLOW_BROKEN_SURROGATES) */
|
|
|
eceae4 |
+#ifndef ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
+/* Erik allowing broken UTF-16 surrogate pairs and U+FFFE and U+FFFF by default,
|
|
|
eceae4 |
+ * open to debate. */
|
|
|
eceae4 |
+#define ALLOW_BROKEN_UNICODE 1
|
|
|
eceae4 |
+#endif /* !defined(ALLOW_BROKEN_UNICODE) */
|
|
|
eceae4 |
|
|
|
eceae4 |
/*
|
|
|
eceae4 |
* IMPORTANT
|
|
|
eceae4 |
@@ -468,7 +467,7 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
|
|
eceae4 |
surrog = FALSE;
|
|
|
eceae4 |
count += 4;
|
|
|
eceae4 |
} else {
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
/* The first UTF-16 unit of a surrogate pair has
|
|
|
eceae4 |
* a value between 0xd800 and 0xdc00. It can be
|
|
|
eceae4 |
* encoded as an individual UTF-8 sequence if we
|
|
|
eceae4 |
@@ -481,7 +480,7 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
|
|
eceae4 |
continue;
|
|
|
eceae4 |
#else
|
|
|
eceae4 |
goto fail;
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
}
|
|
|
eceae4 |
} else
|
|
|
eceae4 |
if (c < 0x80)
|
|
|
eceae4 |
@@ -492,15 +491,13 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
|
|
eceae4 |
count += 3;
|
|
|
eceae4 |
else if (c < 0xdc00)
|
|
|
eceae4 |
surrog = TRUE;
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
else if (c < 0xe000)
|
|
|
eceae4 |
count += 3;
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
-#if NOREVBOM
|
|
|
eceae4 |
- else if ((c >= 0xe000) && (c < 0xfffe))
|
|
|
eceae4 |
-#else
|
|
|
eceae4 |
else if (c >= 0xe000)
|
|
|
eceae4 |
-#endif
|
|
|
eceae4 |
+#else
|
|
|
eceae4 |
+ else if ((c >= 0xe000) && (c < 0xfffe))
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
count += 3;
|
|
|
eceae4 |
else
|
|
|
eceae4 |
goto fail;
|
|
|
eceae4 |
@@ -510,11 +507,11 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
|
|
eceae4 |
}
|
|
|
eceae4 |
}
|
|
|
eceae4 |
if (surrog)
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
count += 3; /* ending with a single surrogate */
|
|
|
eceae4 |
#else
|
|
|
eceae4 |
goto fail;
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
|
|
|
eceae4 |
ret = count;
|
|
|
eceae4 |
out:
|
|
|
eceae4 |
@@ -576,7 +573,7 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
|
|
eceae4 |
*t++ = 0x80 + (c & 63);
|
|
|
eceae4 |
halfpair = 0;
|
|
|
eceae4 |
} else {
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
/* The first UTF-16 unit of a surrogate pair has
|
|
|
eceae4 |
* a value between 0xd800 and 0xdc00. It can be
|
|
|
eceae4 |
* encoded as an individual UTF-8 sequence if we
|
|
|
eceae4 |
@@ -591,7 +588,7 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
|
|
eceae4 |
continue;
|
|
|
eceae4 |
#else
|
|
|
eceae4 |
goto fail;
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
}
|
|
|
eceae4 |
} else if (c < 0x80) {
|
|
|
eceae4 |
*t++ = c;
|
|
|
eceae4 |
@@ -605,13 +602,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
|
|
eceae4 |
*t++ = 0x80 | (c & 0x3f);
|
|
|
eceae4 |
} else if (c < 0xdc00)
|
|
|
eceae4 |
halfpair = c;
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
else if (c < 0xe000) {
|
|
|
eceae4 |
*t++ = 0xe0 | (c >> 12);
|
|
|
eceae4 |
*t++ = 0x80 | ((c >> 6) & 0x3f);
|
|
|
eceae4 |
*t++ = 0x80 | (c & 0x3f);
|
|
|
eceae4 |
}
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
else if (c >= 0xe000) {
|
|
|
eceae4 |
*t++ = 0xe0 | (c >> 12);
|
|
|
eceae4 |
*t++ = 0x80 | ((c >> 6) & 0x3f);
|
|
|
eceae4 |
@@ -620,13 +617,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
|
|
eceae4 |
goto fail;
|
|
|
eceae4 |
}
|
|
|
eceae4 |
}
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
if (halfpair) { /* ending with a single surrogate */
|
|
|
eceae4 |
*t++ = 0xe0 | (halfpair >> 12);
|
|
|
eceae4 |
*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
|
|
|
eceae4 |
*t++ = 0x80 | (halfpair & 0x3f);
|
|
|
eceae4 |
}
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
*t = '\0';
|
|
|
eceae4 |
|
|
|
eceae4 |
#if defined(__APPLE__) || defined(__DARWIN__)
|
|
|
eceae4 |
@@ -748,21 +745,16 @@ static int utf8_to_unicode(u32 *wc, const char *s)
|
|
|
eceae4 |
| ((u32)(s[1] & 0x3F) << 6)
|
|
|
eceae4 |
| ((u32)(s[2] & 0x3F));
|
|
|
eceae4 |
/* Check valid ranges */
|
|
|
eceae4 |
-#if NOREVBOM
|
|
|
eceae4 |
+#if ALLOW_BROKEN_UNICODE
|
|
|
eceae4 |
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
|| ((*wc >= 0xD800) && (*wc <= 0xDFFF))
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
- || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
|
|
|
eceae4 |
- return 3;
|
|
|
eceae4 |
-#else
|
|
|
eceae4 |
- if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|
|
|
eceae4 |
-#if ALLOW_BROKEN_SURROGATES
|
|
|
eceae4 |
- || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
|
|
|
eceae4 |
-#endif /* ALLOW_BROKEN_SURROGATES */
|
|
|
eceae4 |
|| ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
|
|
|
eceae4 |
return 3;
|
|
|
eceae4 |
-#endif
|
|
|
eceae4 |
+#else
|
|
|
eceae4 |
+ if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|
|
|
eceae4 |
+ || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
|
|
|
eceae4 |
+ return 3;
|
|
|
eceae4 |
+#endif /* ALLOW_BROKEN_UNICODE */
|
|
|
eceae4 |
}
|
|
|
eceae4 |
goto fail;
|
|
|
eceae4 |
/* four-byte */
|
|
|
eceae4 |
--
|
|
|
eceae4 |
2.10.2
|
|
|
eceae4 |
|