Blame SOURCES/0001-unistr.c-Enable-encoding-broken-UTF-16-into-broken-U.patch

eceae4
From d9c61dd60ec484909f70b7a916ada3a93af94b60 Mon Sep 17 00:00:00 2001
eceae4
From: Erik Larsson <mechie@users.sourceforge.net>
eceae4
Date: Fri, 8 Apr 2016 05:39:48 +0200
eceae4
Subject: [PATCH 1/2] unistr.c: Enable encoding broken UTF-16 into broken
eceae4
 UTF-8, A.K.A. WTF-8.
eceae4
eceae4
Windows filenames may contain invalid UTF-16 sequences (specifically
eceae4
broken surrogate pairs), which cannot be converted to UTF-8 if we do
eceae4
strict conversion.
eceae4
eceae4
This patch enables encoding broken UTF-16 into similarly broken UTF-8 by
eceae4
encoding any surrogate character that doesn't have a match into a separate
eceae4
3-byte UTF-8 sequence.
eceae4
eceae4
This is "sort of" valid UTF-8, but not valid Unicode since the code
eceae4
points used for surrogate pair encoding are not supposed to occur in a
eceae4
valid Unicode string... but on the other hand the source UTF-16 data is
eceae4
also broken, so we aren't really making things any worse.
eceae4
eceae4
This format is sometimes referred to as WTF-8 (Wobbly Translation
eceae4
Format, 8-bit encoding) and is a common solution to represent broken
eceae4
UTF-16 as UTF-8.
eceae4
eceae4
It is a lossless round-trip conversion, i.e. converting from broken
eceae4
UTF-16 to "WTF-8" and back to UTF-16 yields the same broken UTF-16
eceae4
sequence. Because of this property it enables accessing these files
eceae4
by filename through ntfs-3g and the ntfsprogs (e.g. ls -la works as
eceae4
expected).
eceae4
eceae4
To disable this behaviour you can pass the preprocessor/compiler flag
eceae4
'-DALLOW_BROKEN_SURROGATES=0' when building ntfs-3g.
eceae4
---
eceae4
 libntfs-3g/unistr.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++--
eceae4
 1 file changed, 65 insertions(+), 2 deletions(-)
eceae4
eceae4
diff --git a/libntfs-3g/unistr.c b/libntfs-3g/unistr.c
eceae4
index 7f278cd..71802aa 100644
eceae4
--- a/libntfs-3g/unistr.c
eceae4
+++ b/libntfs-3g/unistr.c
eceae4
@@ -61,6 +61,11 @@
eceae4
 
eceae4
 #define NOREVBOM 0  /* JPA rejecting U+FFFE and U+FFFF, open to debate */
eceae4
 
eceae4
+#ifndef ALLOW_BROKEN_SURROGATES
eceae4
+/* Erik allowing broken UTF-16 surrogate pairs by default, open to debate. */
eceae4
+#define ALLOW_BROKEN_SURROGATES 1
eceae4
+#endif /* !defined(ALLOW_BROKEN_SURROGATES) */
eceae4
+
eceae4
 /*
eceae4
  * IMPORTANT
eceae4
  * =========
eceae4
@@ -462,8 +467,22 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
eceae4
 			if ((c >= 0xdc00) && (c < 0xe000)) {
eceae4
 				surrog = FALSE;
eceae4
 				count += 4;
eceae4
-			} else 
eceae4
+			} else {
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+				/* The first UTF-16 unit of a surrogate pair has
eceae4
+				 * a value between 0xd800 and 0xdc00. It can be
eceae4
+				 * encoded as an individual UTF-8 sequence if we
eceae4
+				 * cannot combine it with the next UTF-16 unit
eceae4
+				 * as a surrogate pair. */
eceae4
+				surrog = FALSE;
eceae4
+				count += 3;
eceae4
+
eceae4
+				--i;
eceae4
+				continue;
eceae4
+#else
eceae4
 				goto fail;
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
+			}
eceae4
 		} else
eceae4
 			if (c < 0x80)
eceae4
 				count++;
eceae4
@@ -473,6 +492,10 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
eceae4
 				count += 3;
eceae4
 			else if (c < 0xdc00)
eceae4
 				surrog = TRUE;
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+			else if (c < 0xe000)
eceae4
+				count += 3;
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
 #if NOREVBOM
eceae4
 			else if ((c >= 0xe000) && (c < 0xfffe))
eceae4
 #else
eceae4
@@ -487,7 +510,11 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
eceae4
 		}
eceae4
 	}
eceae4
 	if (surrog) 
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+		count += 3; /* ending with a single surrogate */
eceae4
+#else
eceae4
 		goto fail;
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
 
eceae4
 	ret = count;
eceae4
 out:
eceae4
@@ -548,8 +575,24 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
eceae4
 				*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
eceae4
 				*t++ = 0x80 + (c & 63);
eceae4
 				halfpair = 0;
eceae4
-			} else 
eceae4
+			} else {
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+				/* The first UTF-16 unit of a surrogate pair has
eceae4
+				 * a value between 0xd800 and 0xdc00. It can be
eceae4
+				 * encoded as an individual UTF-8 sequence if we
eceae4
+				 * cannot combine it with the next UTF-16 unit
eceae4
+				 * as a surrogate pair. */
eceae4
+				*t++ = 0xe0 | (halfpair >> 12);
eceae4
+				*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
eceae4
+				*t++ = 0x80 | (halfpair & 0x3f);
eceae4
+				halfpair = 0;
eceae4
+
eceae4
+				--i;
eceae4
+				continue;
eceae4
+#else
eceae4
 				goto fail;
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
+			}
eceae4
 		} else if (c < 0x80) {
eceae4
 			*t++ = c;
eceae4
 	    	} else {
eceae4
@@ -562,6 +605,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
eceae4
 		        	*t++ = 0x80 | (c & 0x3f);
eceae4
 			} else if (c < 0xdc00)
eceae4
 				halfpair = c;
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+			else if (c < 0xe000) {
eceae4
+				*t++ = 0xe0 | (c >> 12);
eceae4
+				*t++ = 0x80 | ((c >> 6) & 0x3f);
eceae4
+				*t++ = 0x80 | (c & 0x3f);
eceae4
+			}
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
 			else if (c >= 0xe000) {
eceae4
 				*t++ = 0xe0 | (c >> 12);
eceae4
 				*t++ = 0x80 | ((c >> 6) & 0x3f);
eceae4
@@ -570,6 +620,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
eceae4
 				goto fail;
eceae4
 	        }
eceae4
 	}
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+	if (halfpair) { /* ending with a single surrogate */
eceae4
+		*t++ = 0xe0 | (halfpair >> 12);
eceae4
+		*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
eceae4
+		*t++ = 0x80 | (halfpair & 0x3f);
eceae4
+	}
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
 	*t = '\0';
eceae4
 	
eceae4
 #if defined(__APPLE__) || defined(__DARWIN__)
eceae4
@@ -693,10 +750,16 @@ static int utf8_to_unicode(u32 *wc, const char *s)
eceae4
 			/* Check valid ranges */
eceae4
 #if NOREVBOM
eceae4
 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+			  || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
eceae4
 				return 3;
eceae4
 #else
eceae4
 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
eceae4
+#if ALLOW_BROKEN_SURROGATES
eceae4
+			  || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
eceae4
+#endif /* ALLOW_BROKEN_SURROGATES */
eceae4
 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
eceae4
 				return 3;
eceae4
 #endif
eceae4
-- 
eceae4
2.10.2
eceae4