""" orc compat """
from __future__ import annotations

import io
from types import ModuleType
from typing import (
    Any,
    Literal,
)

from pandas._libs import lib
from pandas._typing import (
    DtypeBackend,
    FilePath,
    ReadBuffer,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_interval_dtype,
    is_period_dtype,
    is_unsigned_integer_dtype,
)

import pandas as pd
from pandas.core.frame import DataFrame

from pandas.io.common import get_handle


def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays. When "numpy_nullable" is set, nullable dtypes are used for all
        dtypes that have a nullable implementation; when "pyarrow" is set,
        pyarrow dtypes are used for all dtypes.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
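
    Examples
    --------
    A minimal sketch; ``example.orc`` is a hypothetical local file and
    pyarrow must be installed.

    >>> result = pd.read_orc("example.orc")  # doctest: +SKIP
    >>> subset = pd.read_orc("example.orc", columns=["a", "b"])  # doctest: +SKIP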
    """
    # we require a newer version of pyarrow than we support for parquet

    orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        orc_file = orc.ORCFile(handles.handle)
        pa_table = orc_file.read(columns=columns, **kwargs)
    if dtype_backend is not lib.no_default:
        if dtype_backend == "pyarrow":
            df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        else:
            from pandas.io._util import _arrow_dtype_mapping

            mapping = _arrow_dtype_mapping()
            df = pa_table.to_pandas(types_mapper=mapping.get)
        return df
    else:
        return pa_table.to_pandas()


def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if the dtype of one or more columns is category, unsigned integer,
        interval, period or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer``, the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        The dtype of one or more columns is category, unsigned integer,
        interval, period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
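
    Examples
    --------
    A minimal sketch; ``df.orc`` is a hypothetical output path and pyarrow
    must be installed. Omitting ``path`` returns the ORC data as ``bytes``.

    >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4.0, 3.0]})
    >>> to_orc(df, path="df.orc")  # doctest: +SKIP
    >>> raw_bytes = to_orc(df)  # doctest: +SKIP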
    """
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    for dtype in df.dtypes:
        if (
            is_categorical_dtype(dtype)
            or is_interval_dtype(dtype)
            or is_period_dtype(dtype)
            or is_unsigned_integer_dtype(dtype)
        ):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None