Blame SOURCES/XML-LibXML-2.0202-Parse-an-ampersand-entity-in-SAX-interface.patch

5fc6d7
From 3d0adda7560137309be8b10c63ff41e41dfb1516 Mon Sep 17 00:00:00 2001
5fc6d7
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
5fc6d7
Date: Tue, 28 Jan 2020 17:05:32 +0100
5fc6d7
Subject: [PATCH] Parse an ampersand entity in SAX interface
5fc6d7
MIME-Version: 1.0
5fc6d7
Content-Type: text/plain; charset=UTF-8
5fc6d7
Content-Transfer-Encoding: 8bit
5fc6d7
5fc6d7
After disabling parsing external entities in XML-LibXML-2.0202,
5fc6d7
XML::LibXML::SAX interface stopped expanding &amp; and &#38; entities
5fc6d7
in attribute values (often found in href XHTML attributes) and
5fc6d7
returned "&#38;" instead. This was discovered by a RDF-Trine test
5fc6d7
suite failure <https://github.com/kasei/perlrdf/issues/167>.
5fc6d7
5fc6d7
First, I suspected XML-LibXML
5fc6d7
<https://rt.cpan.org/Ticket/Display.html?id=131498>, but it turned out
5fc6d7
that the unexpanded entity comes from libxml2 C library itself. And
5fc6d7
that it's not just an omitted expansion, but that it's actually an
5fc6d7
escape sequence for "&" characters. Other XML metacharacters (like
5fc6d7
"<") are not affected. Text nodes are also not affected.  My
5fc6d7
finding was confirmed by an old libxml2 bug report
5fc6d7
<https://bugzilla.gnome.org/show_bug.cgi?id=316487>.
5fc6d7
5fc6d7
This patch "fixes" this discrepancy by replacing all "&#38;"
5fc6d7
substrings with a literal "&" in SAX interface of start_element()
5fc6d7
callbacks.
5fc6d7
5fc6d7
Signed-off-by: Petr Písař <ppisar@redhat.com>
5fc6d7
---
5fc6d7
 MANIFEST          |  1 +
5fc6d7
 perl-libxml-sax.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
5fc6d7
 t/52_sax_intent.t | 40 ++++++++++++++++++++++++++++++++++++++++
5fc6d7
 3 files changed, 83 insertions(+), 2 deletions(-)
5fc6d7
 create mode 100755 t/52_sax_intent.t
5fc6d7
5fc6d7
diff --git a/MANIFEST b/MANIFEST
5fc6d7
index 5248ea5..ccc3410 100644
5fc6d7
--- a/MANIFEST
5fc6d7
+++ b/MANIFEST
5fc6d7
@@ -174,6 +174,7 @@ t/49callbacks_returning_undef.t
5fc6d7
 t/49global_extent.t
5fc6d7
 t/50devel.t
5fc6d7
 t/51_parse_html_string_rt87089.t
5fc6d7
+t/52_sax_intent.t
5fc6d7
 t/60error_prev_chain.t
5fc6d7
 t/60struct_error.t
5fc6d7
 t/61error.t
5fc6d7
diff --git a/perl-libxml-sax.c b/perl-libxml-sax.c
5fc6d7
index b949d3c..232a879 100644
5fc6d7
--- a/perl-libxml-sax.c
5fc6d7
+++ b/perl-libxml-sax.c
5fc6d7
@@ -20,6 +20,7 @@ extern "C" {
5fc6d7
 #include "ppport.h"
5fc6d7
 
5fc6d7
 #include <stdlib.h>
5fc6d7
+#include <string.h>
5fc6d7
 #include <libxml/xmlmemory.h>
5fc6d7
 #include <libxml/parser.h>
5fc6d7
 #include <libxml/parserInternals.h>
5fc6d7
@@ -639,6 +640,34 @@ PmmGenNsName( const xmlChar * name, const xmlChar * nsURI )
5fc6d7
     return retval;
5fc6d7
 }
5fc6d7
 
5fc6d7
+/* If a value argument does not contain "&#38;", the value pointer is returned.
5fc6d7
+ * Otherwise a new xmlChar * string is allocated, the value copied there and
5fc6d7
+ * "&#38;" occurrences replaced with "&". Then the caller must free it. */
5fc6d7
+static
5fc6d7
+xmlChar *
5fc6d7
+_expandAmp( const xmlChar *value )
5fc6d7
+{
5fc6d7
+    xmlChar *expanded = NULL;
5fc6d7
+    const xmlChar *entity;
5fc6d7
+    int length;
5fc6d7
+
5fc6d7
+    if (value == NULL ||
5fc6d7
+            (NULL == (entity = (const xmlChar *)strstr((const char *)value, "&#38;")))) {
5fc6d7
+        return (xmlChar *)value;
5fc6d7
+    }
5fc6d7
+
5fc6d7
+    do {
5fc6d7
+        length = entity - value;
5fc6d7
+        expanded = xmlStrncat(expanded, value, length);
5fc6d7
+        expanded = xmlStrncat(expanded, (const xmlChar *)"&", 1);
5fc6d7
+        value += length + 5; /* "&#38;" */
5fc6d7
+    } while (NULL != (entity = (const xmlChar*)strstr((const char *)value, "&#38;")));
5fc6d7
+
5fc6d7
+    expanded = xmlStrcat(expanded, value);
5fc6d7
+
5fc6d7
+    return expanded;
5fc6d7
+}
5fc6d7
+
5fc6d7
 HV *
5fc6d7
 PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
5fc6d7
                        const xmlChar **attr, SV * handler )
5fc6d7
@@ -653,8 +682,8 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
5fc6d7
     const xmlChar * nsURI = NULL;
5fc6d7
     const xmlChar **ta    = attr;
5fc6d7
     const xmlChar * name  = NULL;
5fc6d7
-    const xmlChar * value = NULL;
5fc6d7
 
5fc6d7
+    xmlChar * value       = NULL;
5fc6d7
     xmlChar * keyname     = NULL;
5fc6d7
     xmlChar * localname   = NULL;
5fc6d7
     xmlChar * prefix      = NULL;
5fc6d7
@@ -665,7 +694,13 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
5fc6d7
         while ( *ta != NULL ) {
5fc6d7
             atV = newHV();
5fc6d7
             name = *ta;  ta++;
5fc6d7
-            value = *ta; ta++;
5fc6d7
+            /* XXX: libxml2 SAX2 interface does not expand &#38;
5fc6d7
+             * entity in the attribute values
5fc6d7
+             * <https://bugzilla.gnome.org/show_bug.cgi?id=316487>
5fc6d7
+             * resulting in stray "&#38;" sequences after disabling
5fc6d7
+             * external entity expansion
5fc6d7
+             * <https://rt.cpan.org/Ticket/Display.html?id=131498>. */
5fc6d7
+            value = _expandAmp(*ta);
5fc6d7
 
5fc6d7
             if ( name != NULL && XML_STR_NOT_EMPTY( name ) ) {
5fc6d7
                 localname = xmlSplitQName(NULL, name, &prefix);
5fc6d7
@@ -754,6 +789,11 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
5fc6d7
                 prefix    = NULL;
5fc6d7
 
5fc6d7
             }
5fc6d7
+
5fc6d7
+            if (value != *ta) {
5fc6d7
+                xmlFree(value);
5fc6d7
+            }
5fc6d7
+            ta++;
5fc6d7
         }
5fc6d7
     }
5fc6d7
 
5fc6d7
diff --git a/t/52_sax_intent.t b/t/52_sax_intent.t
5fc6d7
new file mode 100755
5fc6d7
index 0000000..a45b4d1
5fc6d7
--- /dev/null
5fc6d7
+++ b/t/52_sax_intent.t
5fc6d7
@@ -0,0 +1,40 @@
5fc6d7
+use strict;
5fc6d7
+use warnings;
5fc6d7
+use Test::More;
5fc6d7
+
5fc6d7
+my %tests = (
5fc6d7
+    # attribute name    raw attrib. value   expected parsed value
5fc6d7
+    predefined =>       ['&quot;',          '"'],       # always worked
5fc6d7
+    numeric =>          ['&#65;',           'A'],       # always worked
5fc6d7
+    numericampersand => ['&#38;',           '&'],       # a regression
5fc6d7
+    ampA =>             ['&amp;A',          '&A'],      # a corner case
5fc6d7
+    Aamp =>             ['A&amp;',          'A&'],      # a corner case
5fc6d7
+    AampBampC =>        ['A&amp;B&amp;C',   'A&B&C'],   # a corner case
5fc6d7
+);
5fc6d7
+plan tests => scalar (keys %tests);
5fc6d7
+
5fc6d7
+my $input = '<root';
5fc6d7
+for my $test (sort keys %tests) {
5fc6d7
+    $input .= sprintf(" %s='%s'", $test, $tests{$test}->[0]);
5fc6d7
+}
5fc6d7
+$input .= '/>';
5fc6d7
+
5fc6d7
+diag("Parsing $input");
5fc6d7
+use XML::LibXML::SAX;
5fc6d7
+
5fc6d7
+XML::LibXML::SAX->new(Handler => 'Handler')->parse_string($input);
5fc6d7
+
5fc6d7
+
5fc6d7
+package Handler;
5fc6d7
+sub start_element {
5fc6d7
+    my ($self, $node) = @_;
5fc6d7
+    for my $attribute (sort keys %{$node->{Attributes}}) {
5fc6d7
+        my $name = $node->{Attributes}->{$attribute}->{Name};
5fc6d7
+        Test::More::is(
5fc6d7
+            $node->{Attributes}->{$attribute}->{Value},
5fc6d7
+            $tests{$name}->[1],
5fc6d7
+            sprintf("%s='%s' attribute", $name, $tests{$name}->[0])
5fc6d7
+        );
5fc6d7
+    }
5fc6d7
+}
5fc6d7
+
5fc6d7
-- 
5fc6d7
2.21.1
5fc6d7