Skip to content

Commit 10967ce

Browse files
authored
BUG: iterparse of read_xml not parsing duplicate element and attribute names (#47414)
* BUG: iterparse of read_xml not parsing duplicate element and attribute names
* Refactor duplicative code in each parser to shared base class
* Add lxml preceding-sibling iterparse cleanup
* Revert code refactoring back to bug fix only
* Remove whatsnew bug fix note on unreleased version feature
1 parent c7c2bcf commit 10967ce

File tree

2 files changed

+67
-11
lines changed

2 files changed

+67
-11
lines changed

pandas/io/xml.py

Lines changed: 32 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -413,11 +413,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
413413
row = {}
414414

415415
if row is not None:
416-
for col in self.iterparse[row_node]:
417-
if curr_elem == col:
418-
row[col] = elem.text.strip() if elem.text else None
419-
if col in elem.attrib:
420-
row[col] = elem.attrib[col]
416+
if self.names:
417+
for col, nm in zip(self.iterparse[row_node], self.names):
418+
if curr_elem == col:
419+
elem_val = elem.text.strip() if elem.text else None
420+
if elem_val not in row.values() and nm not in row:
421+
row[nm] = elem_val
422+
if col in elem.attrib:
423+
if elem.attrib[col] not in row.values() and nm not in row:
424+
row[nm] = elem.attrib[col]
425+
else:
426+
for col in self.iterparse[row_node]:
427+
if curr_elem == col:
428+
row[col] = elem.text.strip() if elem.text else None
429+
if col in elem.attrib:
430+
row[col] = elem.attrib[col]
421431

422432
if event == "end":
423433
if curr_elem == row_node and row is not None:
@@ -661,11 +671,21 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
661671
row = {}
662672

663673
if row is not None:
664-
for col in self.iterparse[row_node]:
665-
if curr_elem == col:
666-
row[col] = elem.text.strip() if elem.text else None
667-
if col in elem.attrib:
668-
row[col] = elem.attrib[col]
674+
if self.names:
675+
for col, nm in zip(self.iterparse[row_node], self.names):
676+
if curr_elem == col:
677+
elem_val = elem.text.strip() if elem.text else None
678+
if elem_val not in row.values() and nm not in row:
679+
row[nm] = elem_val
680+
if col in elem.attrib:
681+
if elem.attrib[col] not in row.values() and nm not in row:
682+
row[nm] = elem.attrib[col]
683+
else:
684+
for col in self.iterparse[row_node]:
685+
if curr_elem == col:
686+
row[col] = elem.text.strip() if elem.text else None
687+
if col in elem.attrib:
688+
row[col] = elem.attrib[col]
669689

670690
if event == "end":
671691
if curr_elem == row_node and row is not None:
@@ -1020,7 +1040,8 @@ def read_xml(
10201040
10211041
names : list-like, optional
10221042
Column names for DataFrame of parsed XML data. Use this parameter to
1023-
rename original element names and distinguish same named elements.
1043+
rename original element names and distinguish same named elements and
1044+
attributes.
10241045
10251046
dtype : Type name or dict of column -> type, optional
10261047
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,

pandas/tests/io/xml/test_xml.py

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -789,6 +789,41 @@ def test_names_option_output(datapath, parser):
789789
tm.assert_frame_equal(df_iter, df_expected)
790790

791791

792+
def test_repeat_names(parser):
793+
xml = """\
794+
<shapes>
795+
<shape type="2D">
796+
<name>circle</name>
797+
<type>curved</type>
798+
</shape>
799+
<shape type="3D">
800+
<name>sphere</name>
801+
<type>curved</type>
802+
</shape>
803+
</shapes>"""
804+
df_xpath = read_xml(
805+
xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"]
806+
)
807+
808+
df_iter = read_xml_iterparse(
809+
xml,
810+
parser=parser,
811+
iterparse={"shape": ["type", "name", "type"]},
812+
names=["type_dim", "shape", "type_edge"],
813+
)
814+
815+
df_expected = DataFrame(
816+
{
817+
"type_dim": ["2D", "3D"],
818+
"shape": ["circle", "sphere"],
819+
"type_edge": ["curved", "curved"],
820+
}
821+
)
822+
823+
tm.assert_frame_equal(df_xpath, df_expected)
824+
tm.assert_frame_equal(df_iter, df_expected)
825+
826+
792827
def test_names_option_wrong_length(datapath, parser):
793828
filename = datapath("io", "data", "xml", "books.xml")
794829

0 commit comments

Comments
 (0)