|
|
2f6a4c |
From 3d0adda7560137309be8b10c63ff41e41dfb1516 Mon Sep 17 00:00:00 2001
|
|
|
2f6a4c |
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
|
|
|
2f6a4c |
Date: Tue, 28 Jan 2020 17:05:32 +0100
|
|
|
2f6a4c |
Subject: [PATCH] Parse an ampersand entity in SAX interface
|
|
|
2f6a4c |
MIME-Version: 1.0
|
|
|
2f6a4c |
Content-Type: text/plain; charset=UTF-8
|
|
|
2f6a4c |
Content-Transfer-Encoding: 8bit
|
|
|
2f6a4c |
|
|
|
2f6a4c |
After disabling parsing external entities in XML-LibXML-2.0202,
|
|
|
2f6a4c |
XML::LibXML::SAX interface stopped expanding &amp; and &#38; entities
|
|
|
2f6a4c |
in attribute values (often found in href XHTML attributes) and
|
|
|
2f6a4c |
returned "&#38;" instead. This was discovered by a RDF-Trine test
|
|
|
2f6a4c |
suite failure <https://github.com/kasei/perlrdf/issues/167>.
|
|
|
2f6a4c |
|
|
|
2f6a4c |
First, I suspected XML-LibXML
|
|
|
2f6a4c |
<https://rt.cpan.org/Ticket/Display.html?id=131498>, but it turned out
|
|
|
2f6a4c |
that the unexpanded entity comes from libxml2 C library itself. And
|
|
|
2f6a4c |
that it's not just an omitted expansion, but that it's actually an
|
|
|
2f6a4c |
escape sequence for "&" characters. Other XML metacharacters (like
|
|
|
2f6a4c |
"<") are not affected. Text nodes are also not affected. My
|
|
|
2f6a4c |
finding was confirmed by an old libxml2 bug report
|
|
|
2f6a4c |
<https://bugzilla.gnome.org/show_bug.cgi?id=316487>.
|
|
|
2f6a4c |
|
|
|
2f6a4c |
This patch "fixes" this discrepancy by replacing all "&#38;"
|
|
|
2f6a4c |
substrings with a literal "&" in SAX interface of start_element()
|
|
|
2f6a4c |
callbacks.
|
|
|
2f6a4c |
|
|
|
2f6a4c |
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|
|
2f6a4c |
---
|
|
|
2f6a4c |
MANIFEST | 1 +
|
|
|
2f6a4c |
perl-libxml-sax.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
|
|
|
2f6a4c |
t/52_sax_intent.t | 40 ++++++++++++++++++++++++++++++++++++++++
|
|
|
2f6a4c |
3 files changed, 83 insertions(+), 2 deletions(-)
|
|
|
2f6a4c |
create mode 100755 t/52_sax_intent.t
|
|
|
2f6a4c |
|
|
|
2f6a4c |
diff --git a/MANIFEST b/MANIFEST
|
|
|
2f6a4c |
index 5248ea5..ccc3410 100644
|
|
|
2f6a4c |
--- a/MANIFEST
|
|
|
2f6a4c |
+++ b/MANIFEST
|
|
|
2f6a4c |
@@ -174,6 +174,7 @@ t/49callbacks_returning_undef.t
|
|
|
2f6a4c |
t/49global_extent.t
|
|
|
2f6a4c |
t/50devel.t
|
|
|
2f6a4c |
t/51_parse_html_string_rt87089.t
|
|
|
2f6a4c |
+t/52_sax_intent.t
|
|
|
2f6a4c |
t/60error_prev_chain.t
|
|
|
2f6a4c |
t/60struct_error.t
|
|
|
2f6a4c |
t/61error.t
|
|
|
2f6a4c |
diff --git a/perl-libxml-sax.c b/perl-libxml-sax.c
|
|
|
2f6a4c |
index b949d3c..232a879 100644
|
|
|
2f6a4c |
--- a/perl-libxml-sax.c
|
|
|
2f6a4c |
+++ b/perl-libxml-sax.c
|
|
|
2f6a4c |
@@ -20,6 +20,7 @@ extern "C" {
|
|
|
2f6a4c |
#include "ppport.h"
|
|
|
2f6a4c |
|
|
|
2f6a4c |
#include <stdlib.h>
|
|
|
2f6a4c |
+#include <string.h>
|
|
|
2f6a4c |
#include <libxml/xmlmemory.h>
|
|
|
2f6a4c |
#include <libxml/parser.h>
|
|
|
2f6a4c |
#include <libxml/parserInternals.h>
|
|
|
2f6a4c |
@@ -639,6 +640,34 @@ PmmGenNsName( const xmlChar * name, const xmlChar * nsURI )
|
|
|
2f6a4c |
return retval;
|
|
|
2f6a4c |
}
|
|
|
2f6a4c |
|
|
|
2f6a4c |
+/* If a value argument does not contain "&#38;", the value pointer is returned.
|
|
|
2f6a4c |
+ * Otherwise a new xmlChar * string is allocated, the value copied there and
|
|
|
2f6a4c |
+ * "&#38;" occurrences replaced with "&". Then the caller must free it. */
|
|
|
2f6a4c |
+static
|
|
|
2f6a4c |
+xmlChar *
|
|
|
2f6a4c |
+_expandAmp( const xmlChar *value )
|
|
|
2f6a4c |
+{
|
|
|
2f6a4c |
+ xmlChar *expanded = NULL;
|
|
|
2f6a4c |
+ const xmlChar *entity;
|
|
|
2f6a4c |
+ int length;
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+ if (value == NULL ||
|
|
|
2f6a4c |
+ (NULL == (entity = (const xmlChar *)strstr((const char *)value, "&#38;")))) {
|
|
|
2f6a4c |
+ return (xmlChar *)value;
|
|
|
2f6a4c |
+ }
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+ do {
|
|
|
2f6a4c |
+ length = entity - value;
|
|
|
2f6a4c |
+ expanded = xmlStrncat(expanded, value, length);
|
|
|
2f6a4c |
+ expanded = xmlStrncat(expanded, (const xmlChar *)"&", 1);
|
|
|
2f6a4c |
+ value += length + 5; /* "&#38;" */
|
|
|
2f6a4c |
+ } while (NULL != (entity = (const xmlChar*)strstr((const char *)value, "&#38;")));
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+ expanded = xmlStrcat(expanded, value);
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+ return expanded;
|
|
|
2f6a4c |
+}
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
HV *
|
|
|
2f6a4c |
PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
|
|
|
2f6a4c |
const xmlChar **attr, SV * handler )
|
|
|
2f6a4c |
@@ -653,8 +682,8 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
|
|
|
2f6a4c |
const xmlChar * nsURI = NULL;
|
|
|
2f6a4c |
const xmlChar **ta = attr;
|
|
|
2f6a4c |
const xmlChar * name = NULL;
|
|
|
2f6a4c |
- const xmlChar * value = NULL;
|
|
|
2f6a4c |
|
|
|
2f6a4c |
+ xmlChar * value = NULL;
|
|
|
2f6a4c |
xmlChar * keyname = NULL;
|
|
|
2f6a4c |
xmlChar * localname = NULL;
|
|
|
2f6a4c |
xmlChar * prefix = NULL;
|
|
|
2f6a4c |
@@ -665,7 +694,13 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
|
|
|
2f6a4c |
while ( *ta != NULL ) {
|
|
|
2f6a4c |
atV = newHV();
|
|
|
2f6a4c |
name = *ta; ta++;
|
|
|
2f6a4c |
- value = *ta; ta++;
|
|
|
2f6a4c |
+ /* XXX: libxml2 SAX2 interface does not expand &#38;
|
|
|
2f6a4c |
+ * entity in the attribute values
|
|
|
2f6a4c |
+ * <https://bugzilla.gnome.org/show_bug.cgi?id=316487>
|
|
|
2f6a4c |
+ * resulting in stray "&#38;" sequences after disabling
|
|
|
2f6a4c |
+ * external entity expansion
|
|
|
2f6a4c |
+ * <https://rt.cpan.org/Ticket/Display.html?id=131498>. */
|
|
|
2f6a4c |
+ value = _expandAmp(*ta);
|
|
|
2f6a4c |
|
|
|
2f6a4c |
if ( name != NULL && XML_STR_NOT_EMPTY( name ) ) {
|
|
|
2f6a4c |
localname = xmlSplitQName(NULL, name, &prefix);
|
|
|
2f6a4c |
@@ -754,6 +789,11 @@ PmmGenAttributeHashSV( pTHX_ PmmSAXVectorPtr sax,
|
|
|
2f6a4c |
prefix = NULL;
|
|
|
2f6a4c |
|
|
|
2f6a4c |
}
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+ if (value != *ta) {
|
|
|
2f6a4c |
+ xmlFree(value);
|
|
|
2f6a4c |
+ }
|
|
|
2f6a4c |
+ ta++;
|
|
|
2f6a4c |
}
|
|
|
2f6a4c |
}
|
|
|
2f6a4c |
|
|
|
2f6a4c |
diff --git a/t/52_sax_intent.t b/t/52_sax_intent.t
|
|
|
2f6a4c |
new file mode 100755
|
|
|
2f6a4c |
index 0000000..a45b4d1
|
|
|
2f6a4c |
--- /dev/null
|
|
|
2f6a4c |
+++ b/t/52_sax_intent.t
|
|
|
2f6a4c |
@@ -0,0 +1,40 @@
|
|
|
2f6a4c |
+use strict;
|
|
|
2f6a4c |
+use warnings;
|
|
|
2f6a4c |
+use Test::More;
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+my %tests = (
|
|
|
2f6a4c |
+ # attribute name raw attrib. value expected parsed value
|
|
|
2f6a4c |
+ predefined => ['&quot;', '"'], # always worked
|
|
|
2f6a4c |
+ numeric => ['&#65;', 'A'], # always worked
|
|
|
2f6a4c |
+ numericampersand => ['&#38;', '&'], # a regression
|
|
|
2f6a4c |
+ ampA => ['&amp;A', '&A'], # a corner case
|
|
|
2f6a4c |
+ Aamp => ['A&amp;', 'A&'], # a corner case
|
|
|
2f6a4c |
+ AampBampC => ['A&amp;B&amp;C', 'A&B&C'], # a corner case
|
|
|
2f6a4c |
+);
|
|
|
2f6a4c |
+plan tests => scalar (keys %tests);
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+my $input = '<root';
|
|
|
2f6a4c |
+for my $test (sort keys %tests) {
|
|
|
2f6a4c |
+ $input .= sprintf(" %s='%s'", $test, $tests{$test}->[0]);
|
|
|
2f6a4c |
+}
|
|
|
2f6a4c |
+$input .= '/>';
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+diag("Parsing $input");
|
|
|
2f6a4c |
+use XML::LibXML::SAX;
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+XML::LibXML::SAX->new(Handler => 'Handler')->parse_string($input);
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
+package Handler;
|
|
|
2f6a4c |
+sub start_element {
|
|
|
2f6a4c |
+ my ($self, $node) = @_;
|
|
|
2f6a4c |
+ for my $attribute (sort keys %{$node->{Attributes}}) {
|
|
|
2f6a4c |
+ my $name = $node->{Attributes}->{$attribute}->{Name};
|
|
|
2f6a4c |
+ Test::More::is(
|
|
|
2f6a4c |
+ $node->{Attributes}->{$attribute}->{Value},
|
|
|
2f6a4c |
+ $tests{$name}->[1],
|
|
|
2f6a4c |
+ sprintf("%s='%s' attribute", $name, $tests{$name}->[0])
|
|
|
2f6a4c |
+ );
|
|
|
2f6a4c |
+ }
|
|
|
2f6a4c |
+}
|
|
|
2f6a4c |
+
|
|
|
2f6a4c |
--
|
|
|
2f6a4c |
2.21.1
|
|
|
2f6a4c |
|