Blame SOURCES/libxml2-2.9.12-fix-lxml-corrupted-tree.patch

8bf870
From 85b1792e37b131e7a51af98a37f92472e8de5f3f Mon Sep 17 00:00:00 2001
8bf870
From: Nick Wellnhofer <wellnhofer@aevum.de>
8bf870
Date: Tue, 18 May 2021 20:08:28 +0200
8bf870
Subject: [PATCH] Work around lxml API abuse
8bf870
8bf870
Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted
8bf870
parent pointers. This used to work with the old recursive code, but the
8bf870
non-recursive rewrite required parent pointers to be set correctly.
8bf870
8bf870
Unfortunately, lxml relies on the old behavior and passes subtrees with
8bf870
a corrupted structure. Fall back to a recursive function call if an
8bf870
invalid parent pointer is detected.
8bf870
8bf870
Fixes #255.
8bf870
---
8bf870
 HTMLtree.c | 46 ++++++++++++++++++++++++++++------------------
8bf870
 xmlsave.c  | 31 +++++++++++++++++++++----------
8bf870
 2 files changed, 49 insertions(+), 28 deletions(-)
8bf870
8bf870
diff --git a/HTMLtree.c b/HTMLtree.c
8bf870
index 24434d45..bdd639c7 100644
8bf870
--- a/HTMLtree.c
8bf870
+++ b/HTMLtree.c
8bf870
@@ -744,7 +744,7 @@ void
8bf870
 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
 	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
8bf870
                          int format) {
8bf870
-    xmlNodePtr root;
8bf870
+    xmlNodePtr root, parent;
8bf870
     xmlAttrPtr attr;
8bf870
     const htmlElemDesc * info;
8bf870
 
8bf870
@@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
     }
8bf870
 
8bf870
     root = cur;
8bf870
+    parent = cur->parent;
8bf870
     while (1) {
8bf870
         switch (cur->type) {
8bf870
         case XML_HTML_DOCUMENT_NODE:
8bf870
@@ -762,13 +763,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
             if (((xmlDocPtr) cur)->intSubset != NULL) {
8bf870
                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
8bf870
             }
8bf870
-            if (cur->children != NULL) {
8bf870
+            /* Always validate cur->parent when descending. */
8bf870
+            if ((cur->parent == parent) && (cur->children != NULL)) {
8bf870
+                parent = cur;
8bf870
                 cur = cur->children;
8bf870
                 continue;
8bf870
             }
8bf870
             break;
8bf870
 
8bf870
         case XML_ELEMENT_NODE:
8bf870
+            /*
8bf870
+             * Some users like lxml are known to pass nodes with a corrupted
8bf870
+             * tree structure. Fall back to a recursive call to handle this
8bf870
+             * case.
8bf870
+             */
8bf870
+            if ((cur->parent != parent) && (cur->children != NULL)) {
8bf870
+                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
8bf870
+                break;
8bf870
+            }
8bf870
+
8bf870
             /*
8bf870
              * Get specific HTML info for that node.
8bf870
              */
8bf870
@@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
                     (cur->name != NULL) &&
8bf870
                     (cur->name[0] != 'p')) /* p, pre, param */
8bf870
                     xmlOutputBufferWriteString(buf, "\n");
8bf870
+                parent = cur;
8bf870
                 cur = cur->children;
8bf870
                 continue;
8bf870
             }
8bf870
@@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
                 (info != NULL) && (!info->isinline)) {
8bf870
                 if ((cur->next->type != HTML_TEXT_NODE) &&
8bf870
                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
8bf870
-                    (cur->parent != NULL) &&
8bf870
-                    (cur->parent->name != NULL) &&
8bf870
-                    (cur->parent->name[0] != 'p')) /* p, pre, param */
8bf870
+                    (parent != NULL) &&
8bf870
+                    (parent->name != NULL) &&
8bf870
+                    (parent->name[0] != 'p')) /* p, pre, param */
8bf870
                     xmlOutputBufferWriteString(buf, "\n");
8bf870
             }
8bf870
 
8bf870
@@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
                 break;
8bf870
             if (((cur->name == (const xmlChar *)xmlStringText) ||
8bf870
                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
8bf870
-                ((cur->parent == NULL) ||
8bf870
-                 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
8bf870
-                  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
8bf870
+                ((parent == NULL) ||
8bf870
+                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
8bf870
+                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
8bf870
                 xmlChar *buffer;
8bf870
 
8bf870
                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
8bf870
@@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
                 break;
8bf870
             }
8bf870
 
8bf870
-            /*
8bf870
-             * The parent should never be NULL here but we want to handle
8bf870
-             * corrupted documents gracefully.
8bf870
-             */
8bf870
-            if (cur->parent == NULL)
8bf870
-                return;
8bf870
-            cur = cur->parent;
8bf870
+            cur = parent;
8bf870
+            /* cur->parent was validated when descending. */
8bf870
+            parent = cur->parent;
8bf870
 
8bf870
             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
8bf870
                 (cur->type == XML_DOCUMENT_NODE)) {
8bf870
@@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
8bf870
                     (cur->next != NULL)) {
8bf870
                     if ((cur->next->type != HTML_TEXT_NODE) &&
8bf870
                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
8bf870
-                        (cur->parent != NULL) &&
8bf870
-                        (cur->parent->name != NULL) &&
8bf870
-                        (cur->parent->name[0] != 'p')) /* p, pre, param */
8bf870
+                        (parent != NULL) &&
8bf870
+                        (parent->name != NULL) &&
8bf870
+                        (parent->name[0] != 'p')) /* p, pre, param */
8bf870
                         xmlOutputBufferWriteString(buf, "\n");
8bf870
                 }
8bf870
             }
8bf870
diff --git a/xmlsave.c b/xmlsave.c
8bf870
index 61a40459..aedbd5e7 100644
8bf870
--- a/xmlsave.c
8bf870
+++ b/xmlsave.c
8bf870
@@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
 static void
8bf870
 xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
     int format = ctxt->format;
8bf870
-    xmlNodePtr tmp, root, unformattedNode = NULL;
8bf870
+    xmlNodePtr tmp, root, unformattedNode = NULL, parent;
8bf870
     xmlAttrPtr attr;
8bf870
     xmlChar *start, *end;
8bf870
     xmlOutputBufferPtr buf;
8bf870
@@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
     buf = ctxt->buf;
8bf870
 
8bf870
     root = cur;
8bf870
+    parent = cur->parent;
8bf870
     while (1) {
8bf870
         switch (cur->type) {
8bf870
         case XML_DOCUMENT_NODE:
8bf870
@@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
             break;
8bf870
 
8bf870
         case XML_DOCUMENT_FRAG_NODE:
8bf870
-            if (cur->children != NULL) {
8bf870
+            /* Always validate cur->parent when descending. */
8bf870
+            if ((cur->parent == parent) && (cur->children != NULL)) {
8bf870
+                parent = cur;
8bf870
                 cur = cur->children;
8bf870
                 continue;
8bf870
             }
8bf870
@@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
             break;
8bf870
 
8bf870
         case XML_ELEMENT_NODE:
8bf870
-	    if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput))
8bf870
+            /*
8bf870
+             * Some users like lxml are known to pass nodes with a corrupted
8bf870
+             * tree structure. Fall back to a recursive call to handle this
8bf870
+             * case.
8bf870
+             */
8bf870
+            if ((cur->parent != parent) && (cur->children != NULL)) {
8bf870
+                xmlNodeDumpOutputInternal(ctxt, cur);
8bf870
+                break;
8bf870
+            }
8bf870
+
8bf870
+	    if ((ctxt->level > 0) && (ctxt->format == 1) &&
8bf870
+                (xmlIndentTreeOutput))
8bf870
 		xmlOutputBufferWrite(buf, ctxt->indent_size *
8bf870
 				     (ctxt->level > ctxt->indent_nr ?
8bf870
 				      ctxt->indent_nr : ctxt->level),
8bf870
@@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
                 xmlOutputBufferWrite(buf, 1, ">");
8bf870
                 if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n");
8bf870
                 if (ctxt->level >= 0) ctxt->level++;
8bf870
+                parent = cur;
8bf870
                 cur = cur->children;
8bf870
                 continue;
8bf870
             }
8bf870
@@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
8bf870
                 break;
8bf870
             }
8bf870
 
8bf870
-            /*
8bf870
-             * The parent should never be NULL here but we want to handle
8bf870
-             * corrupted documents gracefully.
8bf870
-             */
8bf870
-            if (cur->parent == NULL)
8bf870
-                return;
8bf870
-            cur = cur->parent;
8bf870
+            cur = parent;
8bf870
+            /* cur->parent was validated when descending. */
8bf870
+            parent = cur->parent;
8bf870
 
8bf870
             if (cur->type == XML_ELEMENT_NODE) {
8bf870
                 if (ctxt->level > 0) ctxt->level--;
8bf870
-- 
8bf870
GitLab
8bf870