Skip to content

CLN/DOC: Adjust xpath validation and error messaging in read_xml with IO tools doc note and example #48386

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 6, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
However, if XPath does not reference node names such as default, ``/*``, then
``namespaces`` is not required.

.. note::

Since ``xpath`` identifies the parent of content to be parsed, only immediate
descendants which include child nodes or current attributes are parsed.
Therefore, ``read_xml`` will not parse the text of grandchildren or other
descendants and will not parse attributes of any descendant. To retrieve
lower level content, adjust xpath to lower level. For example,

.. ipython:: python
:okwarning:

xml = """
<data>
<row>
<shape sides="4">square</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="0">circle</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="3">triangle</shape>
<degrees>180</degrees>
</row>
</data>"""

df = pd.read_xml(xml, xpath="./row")
df

shows the attribute sides on shape element was not parsed as expected
since this attribute resides on the child of row element and not row
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
since this attribute resides on the child of row element and not row
since this attribute resides on the child of a row element and not the row

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Articles are not needed here since note is specifically referring to element in XML which is literally named <row>.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Then put double backticks instead.

element itself. In other words, sides attribute is a grandchild level
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
element itself. In other words, sides attribute is a grandchild level
element itself. In other words, the ``sides`` attribute is a grandchild level

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good to me. Will adjust.

descendant of row element. The ``xpath`` targets only row element
content which includes its children and its attributes.

With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
script which also can be string/file/URL types. As background, `XSLT`_ is
a special-purpose language written in a special XML file that can transform
Expand Down
52 changes: 35 additions & 17 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:

return dicts

def _validate_path(self) -> None:
def _validate_path(self) -> list[Any]:
"""
Validate xpath.

Expand Down Expand Up @@ -446,8 +446,7 @@ def parse_data(self) -> list[dict[str, str | None]]:

if self.iterparse is None:
self.xml_doc = self._parse_doc(self.path_or_buffer)
self._validate_path()
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
elems = self._validate_path()

self._validate_names()

Expand All @@ -459,7 +458,7 @@ def parse_data(self) -> list[dict[str, str | None]]:

return xml_dicts

def _validate_path(self) -> None:
def _validate_path(self) -> list[Any]:
"""
Notes
-----
Expand All @@ -468,18 +467,28 @@ def _validate_path(self) -> None:
"""

msg = (
"xpath does not return any nodes. "
"xpath does not return any nodes or attributes. "
"Be sure to specify in `xpath` the parent nodes of "
"children and attributes to parse. "
"If document uses namespaces denoted with "
"xmlns, be sure to define namespaces and "
"use them in xpath."
)
try:
elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
children = [el.findall("*") for el in elems]
attrs = {k: v for el in elems for k, v in el.attrib.items()}

if elems is None:
raise ValueError(msg)

if elems is not None and elems.find("*") is None and elems.attrib is None:
raise ValueError(msg)
if elems is not None:
if self.elems_only and children == []:
raise ValueError(msg)
elif self.attrs_only and attrs == {}:
raise ValueError(msg)
elif children == [] and attrs == {}:
raise ValueError(msg)

except (KeyError, SyntaxError):
raise SyntaxError(
Expand All @@ -488,6 +497,8 @@ def _validate_path(self) -> None:
"undeclared namespace prefix."
)

return elems

def _validate_names(self) -> None:
children: list[Any]

Expand Down Expand Up @@ -554,8 +565,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
self.xsl_doc = self._parse_doc(self.stylesheet)
self.xml_doc = self._transform_doc()

self._validate_path()
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
elems = self._validate_path()

self._validate_names()

Expand All @@ -567,25 +577,33 @@ def parse_data(self) -> list[dict[str, str | None]]:

return xml_dicts

def _validate_path(self) -> None:
def _validate_path(self) -> list[Any]:

msg = (
"xpath does not return any nodes. "
"Be sure row level nodes are in xpath. "
"xpath does not return any nodes or attributes. "
"Be sure to specify in `xpath` the parent nodes of "
"children and attributes to parse. "
"If document uses namespaces denoted with "
"xmlns, be sure to define namespaces and "
"use them in xpath."
)

elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)
children = [el.xpath("*") for el in elems]
attrs = {k: v for el in elems for k, v in el.attrib.items()}

if elems == []:
raise ValueError(msg)

if elems != [] and attrs == [] and children == []:
raise ValueError(msg)
if elems != []:
if self.elems_only and children == []:
raise ValueError(msg)
elif self.attrs_only and attrs == {}:
raise ValueError(msg)
elif children == [] and attrs == {}:
raise ValueError(msg)

return elems

def _validate_names(self) -> None:
children: list[Any]
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,30 @@ def test_elem_and_attrs_only(datapath, parser):
read_xml(filename, elems_only=True, attrs_only=True, parser=parser)


def test_empty_attrs_only(parser):
    """
    Check that ``attrs_only=True`` raises ``ValueError`` when the nodes
    selected by ``xpath`` carry no attributes of their own.
    """
    # Only the <shape> children have attributes here; the <row> nodes
    # targeted by the xpath below have none.
    xml = """
<data>
<row>
<shape sides="4">square</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="0">circle</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="3">triangle</shape>
<degrees>180</degrees>
</row>
</data>"""

    expected_msg = "xpath does not return any nodes or attributes"

    with pytest.raises(ValueError, match=expected_msg):
        read_xml(xml, xpath="./row", attrs_only=True, parser=parser)


@td.skip_if_no("lxml")
def test_attribute_centric_xml():
xml = """\
Expand Down