Blame SOURCES/XML-LibXML-2.0202-Parse-an-ampersand-entity-in-SAX-interface.patch

2f6a4c
From 3d0adda7560137309be8b10c63ff41e41dfb1516 Mon Sep 17 00:00:00 2001
2f6a4c
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
2f6a4c
Date: Tue, 28 Jan 2020 17:05:32 +0100
2f6a4c
Subject: [PATCH] Parse an ampersand entity in SAX interface
2f6a4c
MIME-Version: 1.0
2f6a4c
Content-Type: text/plain; charset=UTF-8
2f6a4c
Content-Transfer-Encoding: 8bit
2f6a4c
2f6a4c
After disabling parsing external entities in XML-LibXML-2.0202,
2f6a4c
XML::LibXML::SAX interface stopped expanding &amp; and &#38; entities
2f6a4c
in attribute values (often found in href XHTML attributes) and
2f6a4c
returned "&#38;" instead. This was discovered by a RDF-Trine test
2f6a4c
suite failure <https://github.com/kasei/perlrdf/issues/167>.
2f6a4c
2f6a4c
First, I suspected XML-LibXML
2f6a4c
<https://rt.cpan.org/Ticket/Display.html?id=131498>, but it turned out
2f6a4c
that the unexpanded entity comes from libxml2 C library itself. And
2f6a4c
that it's not just an omitted expansion, but that it's actually an
2f6a4c
escape sequence for "&" characters. Other XML metacharacters (like
2f6a4c
"<") are not affected. Text nodes are also not affected.  My
2f6a4c
finding was confirmed by an old libxml2 bug report
2f6a4c
<https://bugzilla.gnome.org/show_bug.cgi?id=316487>.
2f6a4c
2f6a4c
This patch "fixes" this discrepancy by replacing all "&#38;"
2f6a4c
substrings with a literal "&" in SAX interface of start_element()
2f6a4c
callbacks.
2f6a4c
2f6a4c
Signed-off-by: Petr Písař <ppisar@redhat.com>
2f6a4c
---
2f6a4c
 MANIFEST          |  1 +
2f6a4c
 perl-libxml-sax.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
2f6a4c
 t/52_sax_intent.t | 40 ++++++++++++++++++++++++++++++++++++++++
2f6a4c
 3 files changed, 83 insertions(+), 2 deletions(-)
2f6a4c
 create mode 100755 t/52_sax_intent.t
2f6a4c
2f6a4c
diff --git a/MANIFEST b/MANIFEST
2f6a4c
index 5248ea5..ccc3410 100644
2f6a4c
--- a/MANIFEST
2f6a4c
+++ b/MANIFEST
2f6a4c
@@ -174,6 +174,7 @@ t/49callbacks_returning_undef.t
2f6a4c
 t/49global_extent.t
2f6a4c
 t/50devel.t
2f6a4c
 t/51_parse_html_string_rt87089.t
2f6a4c
+t/52_sax_intent.t
2f6a4c
 t/60error_prev_chain.t
2f6a4c
 t/60struct_error.t
2f6a4c
 t/61error.t
2f6a4c
diff --git a/perl-libxml-sax.c b/perl-libxml-sax.c
2f6a4c
index b949d3c..232a879 100644
2f6a4c
--- a/perl-libxml-sax.c
2f6a4c
+++ b/perl-libxml-sax.c
2f6a4c
@@ -20,6 +20,7 @@ extern "C" {
2f6a4c
 #include "ppport.h"
2f6a4c
 
2f6a4c
 #include <stdlib.h>
2f6a4c
+#include <string.h>
2f6a4c
 #include <libxml/xmlmemory.h>
2f6a4c
 #include <libxml/parser.h>
2f6a4c
 #include <libxml/parserInternals.h>
2f6a4c
@@ -639,6 +640,34 @@ PmmGenNsName( const xmlChar * name, const xmlChar * nsURI )
2f6a4c
     return retval;
2f6a4c
 }
2f6a4c
 
2f6a4c
+/* If a value argument does not contain "&#38;", the value pointer is returned.
2f6a4c
+ * Otherwise a new xmlChar * string is allocated, the value copied there and
2f6a4c
+ * "&#38;" occurrences replaced with "&". Then the caller must free it. */
2f6a4c
+static
2f6a4c
+xmlChar *
2f6a4c
+_expandAmp( const xmlChar *value )
2f6a4c
+{
2f6a4c
+    xmlChar *expanded = NULL;
2f6a4c
+    const xmlChar *entity;
2f6a4c
+    int length;
2f6a4c
+
2f6a4c
+    if (value == NULL ||
2f6a4c
+            (NULL == (entity = (const xmlChar *)strstr((const char *)value, "&#38;")))) {
2f6a4c
+        return (xmlChar *)value;
2f6a4c
+    }
2f6a4c
+
2f6a4c
+    do {
2f6a4c
+        length = entity - value;
2f6a4c
+        expanded = xmlStrncat(expanded, value, length);
2f6a4c
+        expanded = xmlStrncat(expanded, (const xmlChar *)"&", 1);
2f6a4c
+        value += length + 5; /* "&#38;" */
2f6a4c
+    } while (NULL != (entity = (const xmlChar*)strstr((const char *)value, "&#38;")));
2f6a4c
+
2f6a4c
+    expanded = xmlStrcat(expanded, value);
2f6a4c
+
2f6a4c
+    return expanded;
2f6a4c
+}
2f6a4c
+
2f6a4c
 HV *
2f6a4c
 PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
2f6a4c
                        const xmlChar **attr, SV * handler )
2f6a4c
@@ -653,8 +682,8 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
2f6a4c
     const xmlChar * nsURI = NULL;
2f6a4c
     const xmlChar **ta    = attr;
2f6a4c
     const xmlChar * name  = NULL;
2f6a4c
-    const xmlChar * value = NULL;
2f6a4c
 
2f6a4c
+    xmlChar * value       = NULL;
2f6a4c
     xmlChar * keyname     = NULL;
2f6a4c
     xmlChar * localname   = NULL;
2f6a4c
     xmlChar * prefix      = NULL;
2f6a4c
@@ -665,7 +694,13 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
2f6a4c
         while ( *ta != NULL ) {
2f6a4c
             atV = newHV();
2f6a4c
             name = *ta;  ta++;
2f6a4c
-            value = *ta; ta++;
2f6a4c
+            /* XXX: libxml2 SAX2 interface does not expand &#38;
2f6a4c
+             * entity in the attribute values
2f6a4c
+             * <https://bugzilla.gnome.org/show_bug.cgi?id=316487>
2f6a4c
+             * resulting in stray "&#38;" sequences after disabling
2f6a4c
+             * external entity expansion
2f6a4c
+             * <https://rt.cpan.org/Ticket/Display.html?id=131498>. */
2f6a4c
+            value = _expandAmp(*ta);
2f6a4c
 
2f6a4c
             if ( name != NULL && XML_STR_NOT_EMPTY( name ) ) {
2f6a4c
                 localname = xmlSplitQName(NULL, name, &prefix);
2f6a4c
@@ -754,6 +789,11 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
2f6a4c
                 prefix    = NULL;
2f6a4c
 
2f6a4c
             }
2f6a4c
+
2f6a4c
+            if (value != *ta) {
2f6a4c
+                xmlFree(value);
2f6a4c
+            }
2f6a4c
+            ta++;
2f6a4c
         }
2f6a4c
     }
2f6a4c
 
2f6a4c
diff --git a/t/52_sax_intent.t b/t/52_sax_intent.t
2f6a4c
new file mode 100755
2f6a4c
index 0000000..a45b4d1
2f6a4c
--- /dev/null
2f6a4c
+++ b/t/52_sax_intent.t
2f6a4c
@@ -0,0 +1,40 @@
2f6a4c
+use strict;
2f6a4c
+use warnings;
2f6a4c
+use Test::More;
2f6a4c
+
2f6a4c
+my %tests = (
2f6a4c
+    # attribute name    raw attrib. value   expected parsed value
2f6a4c
+    predefined =>       ['&quot;',          '"'],       # always worked
2f6a4c
+    numeric =>          ['&#65;',           'A'],       # always worked
2f6a4c
+    numericampersand => ['&#38;',           '&'],       # a regression
2f6a4c
+    ampA =>             ['&amp;A',          '&A'],      # a corner case
2f6a4c
+    Aamp =>             ['A&amp;',          'A&'],      # a corner case
2f6a4c
+    AampBampC =>        ['A&amp;B&amp;C',   'A&B&C'],   # a corner case
2f6a4c
+);
2f6a4c
+plan tests => scalar (keys %tests);
2f6a4c
+
2f6a4c
+my $input = '<root';
2f6a4c
+for my $test (sort keys %tests) {
2f6a4c
+    $input .= sprintf(" %s='%s'", $test, $tests{$test}->[0]);
2f6a4c
+}
2f6a4c
+$input .= '/>';
2f6a4c
+
2f6a4c
+diag("Parsing $input");
2f6a4c
+use XML::LibXML::SAX;
2f6a4c
+
2f6a4c
+XML::LibXML::SAX->new(Handler => 'Handler')->parse_string($input);
2f6a4c
+
2f6a4c
+
2f6a4c
+package Handler;
2f6a4c
+sub start_element {
2f6a4c
+    my ($self, $node) = @_;
2f6a4c
+    for my $attribute (sort keys %{$node->{Attributes}}) {
2f6a4c
+        my $name = $node->{Attributes}->{$attribute}->{Name};
2f6a4c
+        Test::More::is(
2f6a4c
+            $node->{Attributes}->{$attribute}->{Value},
2f6a4c
+            $tests{$name}->[1],
2f6a4c
+            sprintf("%s='%s' attribute", $name, $tests{$name}->[0])
2f6a4c
+        );
2f6a4c
+    }
2f6a4c
+}
2f6a4c
+
2f6a4c
-- 
2f6a4c
2.21.1
2f6a4c