Blame SOURCES/wget-1.14-support-non-ASCII-characters.patch

bc22e6
From 0a33fa22c597234ab133f63127b4a5e00cf048b9 Mon Sep 17 00:00:00 2001
bc22e6
From: Tomas Hozza <thozza@redhat.com>
bc22e6
Date: Mon, 20 Jun 2016 12:10:38 +0200
bc22e6
Subject: [PATCH] Support non-ASCII characters
bc22e6
bc22e6
Upstream commit 59b920874daa565a1323ffa1e756e80493190686
bc22e6
bc22e6
Signed-off-by: Tomas Hozza <thozza@redhat.com>
bc22e6
---
bc22e6
 src/url.c             | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--
bc22e6
 tests/Test-ftp-iri.px |  4 +--
bc22e6
 2 files changed, 87 insertions(+), 4 deletions(-)
bc22e6
bc22e6
diff --git a/src/url.c b/src/url.c
bc22e6
index 6bca719..d0d9e27 100644
bc22e6
--- a/src/url.c
bc22e6
+++ b/src/url.c
bc22e6
@@ -42,6 +42,11 @@ as that of the covered work.  */
bc22e6
 #include "url.h"
bc22e6
 #include "host.h"  /* for is_valid_ipv6_address */
bc22e6
 
bc22e6
+#if HAVE_ICONV
bc22e6
+#include <iconv.h>
bc22e6
+#include <langinfo.h>
bc22e6
+#endif
bc22e6
+
bc22e6
 #ifdef __VMS
bc22e6
 #include "vms.h"
bc22e6
 #endif /* def __VMS */
bc22e6
@@ -1335,8 +1340,8 @@ UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
bc22e6
   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
bc22e6
   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */
bc22e6
 
bc22e6
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
bc22e6
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
bc22e6
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 128-143 */
bc22e6
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 144-159 */
bc22e6
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
bc22e6
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
bc22e6
 
bc22e6
@@ -1456,6 +1461,82 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
bc22e6
   TAIL_INCR (dest, outlen);
bc22e6
 }
bc22e6
 
bc22e6
+/* Convert FNAME from the remote encoding to the local one.  On success
bc22e6
+   FNAME is freed and a newly allocated string is returned; on failure,
bc22e6
+   or when built without iconv, FNAME itself is returned unchanged.  */
bc22e6
+static char *
bc22e6
+convert_fname (const char *fname)
bc22e6
+{
bc22e6
+  char *converted_fname = (char *)fname;
bc22e6
+#if HAVE_ICONV
bc22e6
+  const char *from_encoding = opt.encoding_remote;
bc22e6
+  const char *to_encoding = opt.locale;
bc22e6
+  iconv_t cd;
bc22e6
+  size_t done, inlen, outlen;
bc22e6
+  char *s;
bc22e6
+  const char *orig_fname = fname;
bc22e6
+
bc22e6
+  /* Defaults for remote and local encodings.  */
bc22e6
+  if (!from_encoding)
bc22e6
+    from_encoding = "UTF-8";
bc22e6
+  if (!to_encoding)
bc22e6
+    to_encoding = nl_langinfo (CODESET);
bc22e6
+
bc22e6
+  cd = iconv_open (to_encoding, from_encoding);
bc22e6
+  if (cd == (iconv_t)(-1))
bc22e6
+    logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
bc22e6
+	       quote (from_encoding), quote (to_encoding));
bc22e6
+  else
bc22e6
+    {
bc22e6
+      inlen = strlen (fname);
bc22e6
+      outlen = inlen * 2;
bc22e6
+      converted_fname = s = xmalloc (outlen + 1);
bc22e6
+
bc22e6
+      for (;;)
bc22e6
+	{
bc22e6
+	  if (iconv (cd, &fname, &inlen, &s, &outlen) != (size_t)(-1)
bc22e6
+	      && iconv (cd, NULL, NULL, &s, &outlen) != (size_t)(-1))
bc22e6
+	    {
bc22e6
+	      *s = '\0'; /* iconv leaves S just past the last byte written */
bc22e6
+	      iconv_close(cd);
bc22e6
+	      DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
bc22e6
+		       orig_fname, from_encoding, converted_fname, to_encoding));
bc22e6
+	      xfree (orig_fname);
bc22e6
+	      return converted_fname;
bc22e6
+	    }
bc22e6
+
bc22e6
+	  /* Incomplete or invalid multibyte sequence */
bc22e6
+	  if (errno == EINVAL || errno == EILSEQ)
bc22e6
+	    {
bc22e6
+	      logprintf (LOG_VERBOSE,
bc22e6
+			 _("Incomplete or invalid multibyte sequence encountered\n"));
bc22e6
+	      break;
bc22e6
+	    }
bc22e6
+	  else if (errno == E2BIG) /* Output buffer full */
bc22e6
+	    {
bc22e6
+	      done = s - converted_fname; /* bytes really written so far */
bc22e6
+	      outlen += inlen * 2 + 16; /* always grow, even if inlen is 0 */
bc22e6
+	      converted_fname = xrealloc (converted_fname, done + outlen + 1);
bc22e6
+	      s = converted_fname + done;
bc22e6
+	    }
bc22e6
+	  else /* Weird, we got an unspecified error */
bc22e6
+	    {
bc22e6
+	      logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
bc22e6
+	      break;
bc22e6
+	    }
bc22e6
+	}
bc22e6
+      /* Only reached on failure: drop the partial result, keep the name.  */
bc22e6
+      xfree (converted_fname);
bc22e6
+      converted_fname = (char *)orig_fname;
bc22e6
+      DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
bc22e6
+	       orig_fname, from_encoding, to_encoding));
bc22e6
+      iconv_close (cd); /* inside the else: CD is a valid descriptor here */
bc22e6
+    }
bc22e6
+#endif
bc22e6
+
bc22e6
+  return converted_fname;
bc22e6
+}
bc22e6
+
bc22e6
 /* Append to DEST the directory structure that corresponds the
bc22e6
    directory part of URL's path.  For example, if the URL is
bc22e6
    http://server/dir1/dir2/file, this appends "/dir1/dir2".
bc22e6
@@ -1582,6 +1663,8 @@ url_file_name (const struct url *u, char *replaced_filename)
bc22e6
 
bc22e6
   fname = fnres.base;
bc22e6
 
bc22e6
+  fname = convert_fname (fname);
bc22e6
+
bc22e6
   /* Check the cases in which the unique extensions are not used:
bc22e6
      1) Clobbering is turned off (-nc).
bc22e6
      2) Retrieval with regetting.
bc22e6
diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px
bc22e6
index a4b7fe1..24ac467 100755
bc22e6
--- a/tests/Test-ftp-iri.px
bc22e6
+++ b/tests/Test-ftp-iri.px
bc22e6
@@ -26,12 +26,12 @@ my %urls = (
bc22e6
     },
bc22e6
 );
bc22e6
 
bc22e6
-my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
bc22e6
+my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 --remote-encoding=utf-8 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
bc22e6
 
bc22e6
 my $expected_error_code = 0;
bc22e6
 
bc22e6
 my %expected_downloaded_files = (
bc22e6
-    "fran${ccedilla_u8}ais.txt" => {
bc22e6
+    "fran${ccedilla_l1}ais.txt" => {
bc22e6
         content => $francais,
bc22e6
     },
bc22e6
 );
bc22e6
-- 
bc22e6
2.5.5
bc22e6