1178 lines
38 KiB
Python
1178 lines
38 KiB
Python
|
"""
|
||
|
:mod:`pandas.io.xml` is a module for reading XML.
|
||
|
"""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import io
|
||
|
from os import PathLike
|
||
|
from typing import (
|
||
|
TYPE_CHECKING,
|
||
|
Any,
|
||
|
Callable,
|
||
|
)
|
||
|
import warnings
|
||
|
|
||
|
from pandas._libs import lib
|
||
|
from pandas.compat._optional import import_optional_dependency
|
||
|
from pandas.errors import (
|
||
|
AbstractMethodError,
|
||
|
ParserError,
|
||
|
)
|
||
|
from pandas.util._decorators import doc
|
||
|
from pandas.util._exceptions import find_stack_level
|
||
|
from pandas.util._validators import check_dtype_backend
|
||
|
|
||
|
from pandas.core.dtypes.common import is_list_like
|
||
|
|
||
|
from pandas.core.shared_docs import _shared_docs
|
||
|
|
||
|
from pandas.io.common import (
|
||
|
file_exists,
|
||
|
get_handle,
|
||
|
infer_compression,
|
||
|
is_file_like,
|
||
|
is_fsspec_url,
|
||
|
is_url,
|
||
|
stringify_path,
|
||
|
)
|
||
|
from pandas.io.parsers import TextParser
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
from collections.abc import Sequence
|
||
|
from xml.etree.ElementTree import Element
|
||
|
|
||
|
from lxml import etree
|
||
|
|
||
|
from pandas._typing import (
|
||
|
CompressionOptions,
|
||
|
ConvertersArg,
|
||
|
DtypeArg,
|
||
|
DtypeBackend,
|
||
|
FilePath,
|
||
|
ParseDatesArg,
|
||
|
ReadBuffer,
|
||
|
StorageOptions,
|
||
|
XMLParsers,
|
||
|
)
|
||
|
|
||
|
from pandas import DataFrame
|
||
|
|
||
|
|
||
|
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal base class to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid XML ``str``, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str
        The ``XPath`` expression to parse required set of nodes for
        migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.

    namespaces : dict
        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified ``xpath``.

    attrs_only : bool
        Parse only the attributes at the specified ``xpath``.

    names : list
        Column names for :class:`~pandas.DataFrame` of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        ``etree`` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`parse_data`
        * :func:`_parse_doc`
        * :func:`_validate_names`
        * :func:`_validate_path`

    :func:`_parse_nodes` and :func:`_iterparse_nodes` are implemented here and
    shared by the subclasses.

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        # Plain attribute storage; no validation or I/O happens here.
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        self.is_style = None
        self.compression: CompressionOptions = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in ``xpath``, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Parameters
        ----------
        elems : list
            Elements (``etree`` or ``lxml``) selected by ``xpath``; each one
            becomes one row.

        Returns
        -------
        list of dict
            One dict per element; all dicts share the same keys in the same
            order.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        def own_text(el) -> dict[str, str | None]:
            # Text directly inside the row element, ignored when blank.
            if el.text and not el.text.isspace():
                return {el.tag: el.text}
            return {}

        def child_values(el) -> dict[str, str | None]:
            # Child element texts keyed positionally by ``names`` when given
            # (extra children or extra names are dropped by ``zip``),
            # otherwise keyed by the child's tag.
            children = el.findall("*")
            if self.names:
                return {
                    nm: ch.text if ch.text else None
                    for nm, ch in zip(self.names, children)
                }
            return {ch.tag: ch.text if ch.text else None for ch in children}

        dicts: list[dict[str, str | None]]

        if self.elems_only:
            if self.names:
                dicts = [{**own_text(el), **child_values(el)} for el in elems]
            else:
                # Without names the element's own text is not included.
                dicts = [child_values(el) for el in elems]
        elif self.attrs_only:
            dicts = [
                {k: v if v else None for k, v in el.attrib.items()} for el in elems
            ]
        else:
            # Later sources win on key clashes: attributes, own text, children.
            dicts = [
                {**el.attrib, **own_text(el), **child_values(el)} for el in elems
            ]

        # Drop the "{uri}" namespace prefix from qualified names.
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Union of keys in first-seen order; rows missing a key get None.
        keys = list(dict.fromkeys(k for d in dicts for k in d))
        dicts = [{k: d.get(k) for k in keys} for d in dicts]

        if self.names:
            # Positional rename of the aligned columns.
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Parameters
        ----------
        iterparse : callable
            The ``iterparse`` function of the selected XML library; called with
            ``path_or_buffer`` and ``events=("start", "end")``.

        Raises
        ------
        TypeError
            * If ``iterparse`` is not a dict or its dict value is not list-like.
        ParserError
            * If ``path_or_buffer`` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in ``iterparse``.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        # Only the first key of the dict is used as the repeating row element.
        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # Objects with ``read`` are accepted as-is; anything else must be a
        # plain, uncompressed local path (not a URL and not literal XML).
        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, (str, PathLike))
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or (
                isinstance(self.path_or_buffer, str)
                and self.path_or_buffer.startswith(("<?xml", "<"))
            )
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        columns = self.iterparse[row_node]
        # True when the same element/attribute name is requested more than once.
        iterparse_repeats = len(columns) != len(set(columns))

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    # Repeated names: pair each requested column with its new
                    # name and fill a name only once, skipping values that are
                    # already present in the row.
                    for col, nm in zip(columns, self.names):
                        if curr_elem == col:
                            elem_val = elem.text if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in columns:
                        if curr_elem == col:
                            row[col] = elem.text if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Release the finished element; when the element exposes
                # ``getprevious``/``getparent`` also drop preceding siblings.
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if not dicts:
            raise ParserError("No result from selected items in iterparse.")

        # Union of keys in first-seen order; rows missing a key get None.
        keys = list(dict.fromkeys(k for d in dicts for k in d))
        dicts = [{k: d.get(k) for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath``.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If names has fewer entries than the parsed nodes.
        TypeError
            * If names is not list-like.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
|
||
|
|
||
|
|
||
|
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data with ``xml.etree.ElementTree``.

        Without ``iterparse`` the document is parsed in full, ``xpath`` is
        validated and the selected nodes are converted; with ``iterparse``
        the document is streamed instead.

        Raises
        ------
        ValueError
            * If a stylesheet was supplied (XSLT needs the lxml parser).
        """
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        if self.iterparse is None:
            return self._parse_nodes(elems)
        return self._iterparse_nodes(iterparse)

    def _validate_path(self) -> list[Any]:
        """
        Evaluate ``xpath`` against the parsed document and return the matches.

        Raises
        ------
        SyntaxError
            * If ``findall`` rejects the expression or a namespace prefix
              (reported by the library as ``KeyError`` or ``SyntaxError``).
        ValueError
            * If nothing is matched, or the matches carry no child elements
              and/or attributes required by ``elems_only`` / ``attrs_only``.

        Notes
        -----
        ``etree`` supports limited ``XPath``. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        # Only the xpath evaluation itself can raise KeyError/SyntaxError.
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

        # ``findall`` returns a list (never None), so test for emptiness.
        if not elems:
            raise ValueError(msg)

        children = [ch for el in elems for ch in el.findall("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if self.elems_only and not children:
            raise ValueError(msg)
        if self.attrs_only and not attrs:
            raise ValueError(msg)
        if not children and not attrs:
            raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        """
        Check ``names`` against the columns it is meant to rename.

        The reference length is the ``iterparse`` column list when
        iterparsing, otherwise the children of the first ``xpath`` match.

        Raises
        ------
        ValueError
            * If ``names`` has fewer entries than the reference columns.
        TypeError
            * If ``names`` is not list-like.
        """
        children: list[Any]

        if not self.names:
            return

        if self.iterparse:
            children = self.iterparse[next(iter(self.iterparse))]
        else:
            parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
            children = parent.findall("*") if parent is not None else []

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )
        if len(self.names) < len(children):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ``ElementTree`` from ``raw_doc`` and return its root element.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        # The buffer is closed when the ``with`` block exits.
        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
|
||
|
|
||
|
|
||
|
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
    full-featured XML library, ``lxml``, that supports
    ``XPath`` 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Validates ``xpath`` and names, optionally parses and applies the
        XSLT stylesheet, then converts the selected nodes of the original
        or transformed document. With ``iterparse`` the document is
        streamed instead of being parsed in full.
        """
        from lxml.etree import iterparse

        streaming = self.iterparse is not None

        if not streaming:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        if streaming:
            return self._iterparse_nodes(iterparse)
        return self._parse_nodes(elems)

    def _validate_path(self) -> list[Any]:
        """
        Evaluate ``xpath`` and return the matches.

        Raises
        ------
        ValueError
            * If nothing is matched, or the matches carry no child elements
              and/or attributes required by ``elems_only`` / ``attrs_only``.
        """
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        no_children = children == []
        no_attrs = attrs == {}

        if (
            elems == []
            or (self.elems_only and no_children)
            or (self.attrs_only and no_attrs)
            or (no_children and no_attrs)
        ):
            raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        """
        Check ``names`` against the columns it is meant to rename.

        The reference length is the ``iterparse`` column list when
        iterparsing, otherwise the children of the first ``xpath`` match.

        Raises
        ------
        ValueError
            * If ``names`` has fewer entries than the reference columns.
        TypeError
            * If ``names`` is not list-like.
        """
        children: list[Any]

        if not self.names:
            return

        if self.iterparse:
            first_key = next(iter(self.iterparse))
            children = self.iterparse[first_key]
        else:
            first_row_children = self.xpath + "[1]/*"
            children = self.xml_doc.xpath(
                first_row_children, namespaces=self.namespaces
            )

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )
        if len(self.names) < len(children):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        """
        Parse ``raw_doc`` with ``lxml`` and return the resulting document.

        Raises
        ------
        TypeError
            * If the data is a ``StringIO`` and ``encoding`` is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if not isinstance(xml_data, io.StringIO):
                document = parse(xml_data, parser=curr_parser)
            elif self.encoding is None:
                raise TypeError(
                    "Can not pass encoding None when input is StringIO."
                )
            else:
                # Text input is encoded to bytes before it reaches lxml.
                encoded = xml_data.getvalue().encode(self.encoding)
                document = fromstring(encoded, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        Applies the parsed XSLT in ``self.xsl_doc`` to ``self.xml_doc``,
        ideally producing a flatter xml document for easier parsing and
        migration to Data Frame.
        """
        from lxml.etree import XSLT

        return XSLT(self.xsl_doc)(self.xml_doc)
|
||
|
|
||
|
|
||
|
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    Input of type (1) that is a URL, an fsspec URL or an existing file is
    opened and its content is returned. Input types (2) and (3), and
    strings that resolve to none of the above, are returned unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Buffers, bytes and literal XML (leading "<") are passed through as-is.
    if not isinstance(filepath_or_buffer, str) or filepath_or_buffer.startswith(
        ("<?xml", "<")
    ):
        return filepath_or_buffer

    if (
        is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            # Read while the handle is still open.
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
|
||
|
|
||
|
|
||
|
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    A ``str`` is wrapped in ``io.StringIO`` and ``bytes`` in ``io.BytesIO``;
    any other object (e.g. a file object or an existing StringIO/BytesIO)
    is handed back untouched.
    """
    if isinstance(data, str):
        return io.StringIO(data)
    if isinstance(data, bytes):
        return io.BytesIO(data)
    return data
|
||
|
|
||
|
|
||
|
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    This method will bind xml dictionary data of keys and values
    into named columns of Data Frame using the built-in TextParser
    class that build Data Frame and infers specific dtypes.

    Parameters
    ----------
    data : list of dict
        Parsed rows; the first row supplies the column names.
    **kwargs
        Passed through to ``TextParser``.

    Raises
    ------
    ParserError
        * If ``TextParser`` cannot read the rows; the original error is
          kept as the cause.
    """

    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError as err:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        ) from err
|
||
|
|
||
|
|
||
|
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    Selects ``_LxmlFrameParser`` or ``_EtreeFrameParser`` according to
    ``parser``, parses the document into row dicts and converts them
    into a DataFrame. A ``FutureWarning`` is emitted first when
    ``path_or_buffer`` is a string that is neither an existing file nor a
    URL, i.e. literal XML.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """

    parser_cls: type[_EtreeFrameParser] | type[_LxmlFrameParser]

    if isinstance(path_or_buffer, str):
        resolves_to_source = (
            is_file_like(path_or_buffer)
            or file_exists(path_or_buffer)
            or is_url(path_or_buffer)
            or is_fsspec_url(path_or_buffer)
        )
        if not resolves_to_source:
            warnings.warn(
                "Passing literal xml to 'read_xml' is deprecated and "
                "will be removed in a future version. To read from a "
                "literal string, wrap it in a 'StringIO' object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if parser == "lxml":
        # lxml is optional; fail with a hint when it is not installed.
        if import_optional_dependency("lxml.etree", errors="ignore") is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        parser_cls = _LxmlFrameParser
    elif parser == "etree":
        parser_cls = _EtreeFrameParser
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    # Both parser classes take the same positional arguments.
    p = parser_cls(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
|
||
|
|
||
|
|
||
|
@doc(
|
||
|
storage_options=_shared_docs["storage_options"],
|
||
|
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
|
||
|
)
|
||
|
def read_xml(
|
||
|
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
|
||
|
*,
|
||
|
xpath: str = "./*",
|
||
|
namespaces: dict[str, str] | None = None,
|
||
|
elems_only: bool = False,
|
||
|
attrs_only: bool = False,
|
||
|
names: Sequence[str] | None = None,
|
||
|
dtype: DtypeArg | None = None,
|
||
|
converters: ConvertersArg | None = None,
|
||
|
parse_dates: ParseDatesArg | None = None,
|
||
|
# encoding can not be None for lxml and StringIO input
|
||
|
encoding: str | None = "utf-8",
|
||
|
parser: XMLParsers = "lxml",
|
||
|
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
|
||
|
iterparse: dict[str, list[str]] | None = None,
|
||
|
compression: CompressionOptions = "infer",
|
||
|
storage_options: StorageOptions | None = None,
|
||
|
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
|
||
|
) -> DataFrame:
|
||
|
r"""
|
||
|
Read XML document into a :class:`~pandas.DataFrame` object.
|
||
|
|
||
|
.. versionadded:: 1.3.0
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
path_or_buffer : str, path object, or file-like object
|
||
|
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||
|
object implementing a ``read()`` function. The string can be any valid XML
|
||
|
string or a path. The string can further be a URL. Valid URL schemes
|
||
|
include http, ftp, s3, and file.
|
||
|
|
||
|
.. deprecated:: 2.1.0
|
||
|
Passing xml literal strings is deprecated.
|
||
|
Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead.
|
||
|
|
||
|
xpath : str, optional, default './\*'
|
||
|
The ``XPath`` to parse required set of nodes for migration to
|
||
|
:class:`~pandas.DataFrame`.``XPath`` should return a collection of elements
|
||
|
and not a single element. Note: The ``etree`` parser supports limited ``XPath``
|
||
|
expressions. For more complex ``XPath``, use ``lxml`` which requires
|
||
|
installation.
|
||
|
|
||
|
namespaces : dict, optional
|
||
|
The namespaces defined in XML document as dicts with key being
|
||
|
namespace prefix and value the URI. There is no need to include all
|
||
|
namespaces in XML, only the ones used in ``xpath`` expression.
|
||
|
Note: if XML document uses default namespace denoted as
|
||
|
`xmlns='<URI>'` without a prefix, you must assign any temporary
|
||
|
namespace prefix such as 'doc' to the URI in order to parse
|
||
|
underlying nodes and/or attributes. For example, ::
|
||
|
|
||
|
namespaces = {{"doc": "https://example.com"}}
|
||
|
|
||
|
elems_only : bool, optional, default False
|
||
|
Parse only the child elements at the specified ``xpath``. By default,
|
||
|
all child elements and non-empty text nodes are returned.
|
||
|
|
||
|
attrs_only : bool, optional, default False
|
||
|
Parse only the attributes at the specified ``xpath``.
|
||
|
By default, all attributes are returned.
|
||
|
|
||
|
names : list-like, optional
|
||
|
Column names for DataFrame of parsed XML data. Use this parameter to
|
||
|
rename original element names and distinguish same named elements and
|
||
|
attributes.
|
||
|
|
||
|
dtype : Type name or dict of column -> type, optional
|
||
|
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
|
||
|
'c': 'Int64'}}
|
||
|
Use `str` or `object` together with suitable `na_values` settings
|
||
|
to preserve and not interpret dtype.
|
||
|
If converters are specified, they will be applied INSTEAD
|
||
|
of dtype conversion.
|
||
|
|
||
|
.. versionadded:: 1.5.0
|
||
|
|
||
|
converters : dict, optional
|
||
|
Dict of functions for converting values in certain columns. Keys can either
|
||
|
be integers or column labels.
|
||
|
|
||
|
.. versionadded:: 1.5.0
|
||
|
|
||
|
parse_dates : bool or list of int or names or list of lists or dict, default False
|
||
|
Identifiers to parse index or columns to datetime. The behavior is as follows:
|
||
|
|
||
|
* boolean. If True -> try parsing the index.
|
||
|
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
|
||
|
each as a separate date column.
|
||
|
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
|
||
|
a single date column.
|
||
|
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
|
||
|
result 'foo'
|
||
|
|
||
|
.. versionadded:: 1.5.0
|
||
|
|
||
|
encoding : str, optional, default 'utf-8'
|
||
|
Encoding of XML document.
|
||
|
|
||
|
parser : {{'lxml','etree'}}, default 'lxml'
|
||
|
Parser module to use for retrieval of data. Only 'lxml' and
|
||
|
'etree' are supported. With 'lxml' more complex ``XPath`` searches
|
||
|
and ability to use XSLT stylesheet are supported.
|
||
|
|
||
|
stylesheet : str, path object or file-like object
|
||
|
A URL, file-like object, or a raw string containing an XSLT script.
|
||
|
This stylesheet should flatten complex, deeply nested XML documents
|
||
|
for easier parsing. To use this feature you must have ``lxml`` module
|
||
|
installed and specify 'lxml' as ``parser``. The ``xpath`` must
|
||
|
reference nodes of transformed XML document generated after XSLT
|
||
|
transformation and not the original XML document. Only XSLT 1.0
|
||
|
scripts and not later versions are currently supported.
|
||
|
|
||
|
iterparse : dict, optional
|
||
|
The nodes or attributes to retrieve in iterparsing of XML document
|
||
|
as a dict with key being the name of repeating element and value being
|
||
|
list of elements or attribute names that are descendants of the repeated
|
||
|
element. Note: If this option is used, it will replace ``xpath`` parsing
|
||
|
and unlike ``xpath``, descendants do not need to relate to each other but can
|
||
|
exist anywhere in the document under the repeating element. This memory-
|
||
|
efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
|
||
|
For example, ::
|
||
|
|
||
|
iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}
|
||
|
|
||
|
.. versionadded:: 1.5.0
|
||
|
|
||
|
{decompression_options}
|
||
|
|
||
|
.. versionchanged:: 1.4.0 Zstandard support.
|
||
|
|
||
|
{storage_options}
|
||
|
|
||
|
dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
|
||
|
Back-end data type applied to the resultant :class:`DataFrame`
|
||
|
(still experimental). Behaviour is as follows:
|
||
|
|
||
|
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
|
||
|
(default).
|
||
|
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
|
||
|
DataFrame.
|
||
|
|
||
|
.. versionadded:: 2.0
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
df
|
||
|
A DataFrame.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
read_json : Convert a JSON string to pandas object.
|
||
|
read_html : Read HTML tables into a list of DataFrame objects.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
This method is best designed to import shallow XML documents in
|
||
|
following format, which is the ideal fit for the two dimensions of a
|
||
|
``DataFrame`` (row by column). ::
|
||
|
|
||
|
<root>
|
||
|
<row>
|
||
|
<column1>data</column1>
|
||
|
<column2>data</column2>
|
||
|
<column3>data</column3>
|
||
|
...
|
||
|
</row>
|
||
|
<row>
|
||
|
...
|
||
|
</row>
|
||
|
...
|
||
|
</root>
|
||
|
|
||
|
As a file format, XML documents can be designed any way including
|
||
|
layout of elements and attributes as long as it conforms to W3C
|
||
|
specifications. Therefore, this method is a convenience handler for
|
||
|
a specific flatter design and not all possible XML structures.
|
||
|
|
||
|
However, for more complex XML documents, ``stylesheet`` allows you to
|
||
|
temporarily redesign original document with XSLT (a special purpose
|
||
|
language) for a flatter version for migration to a DataFrame.
|
||
|
|
||
|
This function will *always* return a single :class:`DataFrame` or raise
|
||
|
exceptions due to issues with XML document, ``xpath``, or other
|
||
|
parameters.
|
||
|
|
||
|
See the :ref:`read_xml documentation in the IO section of the docs
|
||
|
<io.read_xml>` for more information on using this method to parse XML
|
||
|
files to DataFrames.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> from io import StringIO
|
||
|
>>> xml = '''<?xml version='1.0' encoding='utf-8'?>
|
||
|
... <data xmlns="http://example.com">
|
||
|
... <row>
|
||
|
... <shape>square</shape>
|
||
|
... <degrees>360</degrees>
|
||
|
... <sides>4.0</sides>
|
||
|
... </row>
|
||
|
... <row>
|
||
|
... <shape>circle</shape>
|
||
|
... <degrees>360</degrees>
|
||
|
... <sides/>
|
||
|
... </row>
|
||
|
... <row>
|
||
|
... <shape>triangle</shape>
|
||
|
... <degrees>180</degrees>
|
||
|
... <sides>3.0</sides>
|
||
|
... </row>
|
||
|
... </data>'''
|
||
|
|
||
|
>>> df = pd.read_xml(StringIO(xml))
|
||
|
>>> df
|
||
|
shape degrees sides
|
||
|
0 square 360 4.0
|
||
|
1 circle 360 NaN
|
||
|
2 triangle 180 3.0
|
||
|
|
||
|
>>> xml = '''<?xml version='1.0' encoding='utf-8'?>
|
||
|
... <data>
|
||
|
... <row shape="square" degrees="360" sides="4.0"/>
|
||
|
... <row shape="circle" degrees="360"/>
|
||
|
... <row shape="triangle" degrees="180" sides="3.0"/>
|
||
|
... </data>'''
|
||
|
|
||
|
>>> df = pd.read_xml(StringIO(xml), xpath=".//row")
|
||
|
>>> df
|
||
|
shape degrees sides
|
||
|
0 square 360 4.0
|
||
|
1 circle 360 NaN
|
||
|
2 triangle 180 3.0
|
||
|
|
||
|
>>> xml = '''<?xml version='1.0' encoding='utf-8'?>
|
||
|
... <doc:data xmlns:doc="https://example.com">
|
||
|
... <doc:row>
|
||
|
... <doc:shape>square</doc:shape>
|
||
|
... <doc:degrees>360</doc:degrees>
|
||
|
... <doc:sides>4.0</doc:sides>
|
||
|
... </doc:row>
|
||
|
... <doc:row>
|
||
|
... <doc:shape>circle</doc:shape>
|
||
|
... <doc:degrees>360</doc:degrees>
|
||
|
... <doc:sides/>
|
||
|
... </doc:row>
|
||
|
... <doc:row>
|
||
|
... <doc:shape>triangle</doc:shape>
|
||
|
... <doc:degrees>180</doc:degrees>
|
||
|
... <doc:sides>3.0</doc:sides>
|
||
|
... </doc:row>
|
||
|
... </doc:data>'''
|
||
|
|
||
|
>>> df = pd.read_xml(StringIO(xml),
|
||
|
... xpath="//doc:row",
|
||
|
... namespaces={{"doc": "https://example.com"}})
|
||
|
>>> df
|
||
|
shape degrees sides
|
||
|
0 square 360 4.0
|
||
|
1 circle 360 NaN
|
||
|
2 triangle 180 3.0
|
||
|
|
||
|
>>> xml_data = '''
|
||
|
... <data>
|
||
|
... <row>
|
||
|
... <index>0</index>
|
||
|
... <a>1</a>
|
||
|
... <b>2.5</b>
|
||
|
... <c>True</c>
|
||
|
... <d>a</d>
|
||
|
... <e>2019-12-31 00:00:00</e>
|
||
|
... </row>
|
||
|
... <row>
|
||
|
... <index>1</index>
|
||
|
... <b>4.5</b>
|
||
|
... <c>False</c>
|
||
|
... <d>b</d>
|
||
|
... <e>2019-12-31 00:00:00</e>
|
||
|
... </row>
|
||
|
... </data>
|
||
|
... '''
|
||
|
|
||
|
>>> df = pd.read_xml(StringIO(xml_data),
|
||
|
... dtype_backend="numpy_nullable",
|
||
|
... parse_dates=["e"])
|
||
|
>>> df
|
||
|
index a b c d e
|
||
|
0 0 1 2.5 True a 2019-12-31
|
||
|
1 1 <NA> 4.5 False b 2019-12-31
|
||
|
"""
|
||
|
check_dtype_backend(dtype_backend)
|
||
|
|
||
|
return _parse(
|
||
|
path_or_buffer=path_or_buffer,
|
||
|
xpath=xpath,
|
||
|
namespaces=namespaces,
|
||
|
elems_only=elems_only,
|
||
|
attrs_only=attrs_only,
|
||
|
names=names,
|
||
|
dtype=dtype,
|
||
|
converters=converters,
|
||
|
parse_dates=parse_dates,
|
||
|
encoding=encoding,
|
||
|
parser=parser,
|
||
|
stylesheet=stylesheet,
|
||
|
iterparse=iterparse,
|
||
|
compression=compression,
|
||
|
storage_options=storage_options,
|
||
|
dtype_backend=dtype_backend,
|
||
|
)
|