BUG: iterparse of read_xml not parsing duplicate element and attribute names (#47414)

ParfaitG · web-flow · commit 10967ce78be5 · 2022-06-21T16:10:45.000-07:00
* BUG: iterparse of read_xml not parsing duplicate element and attribute names

* Refactor duplicative code in each parser to shared base class

* Add lxml preceding-sibling iterparse cleanup

* Revert code refactoring back to bug fix only

* Remove whatsnew bug fix note on unreleased version feature
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -413,11 +413,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                     row = {}
 
             if row is not None:
-                for col in self.iterparse[row_node]:
-                    if curr_elem == col:
-                        row[col] = elem.text.strip() if elem.text else None
-                    if col in elem.attrib:
-                        row[col] = elem.attrib[col]
+                if self.names:
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
 
             if event == "end":
                 if curr_elem == row_node and row is not None:
@@ -661,11 +671,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                     row = {}
 
             if row is not None:
-                for col in self.iterparse[row_node]:
-                    if curr_elem == col:
-                        row[col] = elem.text.strip() if elem.text else None
-                    if col in elem.attrib:
-                        row[col] = elem.attrib[col]
+                if self.names:
+                    for col, nm in zip(self.iterparse[row_node], self.names):
+                        if curr_elem == col:
+                            elem_val = elem.text.strip() if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text.strip() if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
 
             if event == "end":
                 if curr_elem == row_node and row is not None:
@@ -1020,7 +1040,8 @@ def read_xml(
 
     names :  list-like, optional
         Column names for DataFrame of parsed XML data. Use this parameter to
-        rename original element names and distinguish same named elements.
+        rename original element names and distinguish same named elements and
+        attributes.
 
     dtype : Type name or dict of column -> type, optional
         Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -789,6 +789,41 @@ def test_names_option_output(datapath, parser):
     tm.assert_frame_equal(df_iter, df_expected)
 
 
+def test_repeat_names(parser):
+    xml = """\
+<shapes>
+  <shape type="2D">
+    <name>circle</name>
+    <type>curved</type>
+  </shape>
+  <shape type="3D">
+    <name>sphere</name>
+    <type>curved</type>
+  </shape>
+</shapes>"""
+    df_xpath = read_xml(
+        xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"]
+    )
+
+    df_iter = read_xml_iterparse(
+        xml,
+        parser=parser,
+        iterparse={"shape": ["type", "name", "type"]},
+        names=["type_dim", "shape", "type_edge"],
+    )
+
+    df_expected = DataFrame(
+        {
+            "type_dim": ["2D", "3D"],
+            "shape": ["circle", "sphere"],
+            "type_edge": ["curved", "curved"],
+        }
+    )
+
+    tm.assert_frame_equal(df_xpath, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
 def test_names_option_wrong_length(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")