1038 lines
35 KiB
Python
1038 lines
35 KiB
Python
|
""" test parquet compat """
|
||
|
import datetime
|
||
|
from distutils.version import LooseVersion
|
||
|
from io import BytesIO
|
||
|
import os
|
||
|
import pathlib
|
||
|
from warnings import catch_warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas.compat import PY38, is_platform_windows
|
||
|
import pandas.util._test_decorators as td
|
||
|
|
||
|
import pandas as pd
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
from pandas.io.parquet import (
|
||
|
FastParquetImpl,
|
||
|
PyArrowImpl,
|
||
|
get_engine,
|
||
|
read_parquet,
|
||
|
to_parquet,
|
||
|
)
|
||
|
|
||
|
# Optional parquet engines: remember whether each one could be imported so
# that the fixtures and tests below can skip when an engine is unavailable.
try:
    import pyarrow
except ImportError:
    _HAVE_PYARROW = False
else:
    _HAVE_PYARROW = True

try:
    import fastparquet
except ImportError:
    _HAVE_FASTPARQUET = False
else:
    _HAVE_FASTPARQUET = True


# Module-wide marker: ignore the RangeIndex DeprecationWarning in every test.
pytestmark = pytest.mark.filterwarnings(
    "ignore:RangeIndex.* is deprecated:DeprecationWarning"
)
|
||
|
|
||
|
|
||
|
# setup engines & skips
|
||
|
@pytest.fixture(
    params=[
        pytest.param(
            "fastparquet",
            marks=pytest.mark.skipif(
                _HAVE_FASTPARQUET is False, reason="fastparquet is not installed"
            ),
        ),
        pytest.param(
            "pyarrow",
            marks=pytest.mark.skipif(
                _HAVE_PYARROW is False, reason="pyarrow is not installed"
            ),
        ),
    ]
)
def engine(request):
    """
    Parametrized engine name: "fastparquet", then "pyarrow".

    Each parameter carries a skipif mark keyed on the matching
    ``_HAVE_*`` module flag.
    """
    engine_name = request.param
    return engine_name
|
||
|
|
||
|
|
||
|
@pytest.fixture
def pa():
    """Return the engine name "pyarrow"; skip the test if pyarrow is missing."""
    if _HAVE_PYARROW:
        return "pyarrow"
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("pyarrow is not installed")
|
||
|
|
||
|
|
||
|
@pytest.fixture
def fp():
    """Return the engine name "fastparquet"; skip the test if it is missing."""
    if _HAVE_FASTPARQUET:
        return "fastparquet"
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("fastparquet is not installed")
|
||
|
|
||
|
|
||
|
@pytest.fixture
def df_compat():
    """Small two-column frame: integers in "A", the scalar "foo" broadcast in "B"."""
    columns = {"A": [1, 2, 3], "B": "foo"}
    return pd.DataFrame(columns)
|
||
|
|
||
|
|
||
|
@pytest.fixture
def df_cross_compat():
    """
    Frame used by the cross-engine tests (write with one engine, read with the other).

    Columns "c" (uint8), "g" (tz-aware datetimes) and "h" (nanosecond-frequency
    datetimes) are deliberately left out; they were commented out in this
    fixture's column list.
    """
    return pd.DataFrame(
        {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("20130101", periods=3),
        }
    )
|
||
|
|
||
|
|
||
|
@pytest.fixture
def df_full():
    """Three-row frame with one column per basic dtype, including missing values."""
    timestamps_with_gap = [
        pd.Timestamp("20130101"),
        pd.NaT,
        pd.Timestamp("20130103"),
    ]
    columns = {
        "string": list("abc"),
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "bytes": [b"foo", b"bar", b"baz"],
        "unicode": ["foo", "bar", "baz"],
        "int": list(range(1, 4)),
        "uint": np.arange(3, 6).astype("u1"),
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "datetime": pd.date_range("20130101", periods=3),
        "datetime_with_nat": timestamps_with_gap,
    }
    return pd.DataFrame(columns)
|
||
|
|
||
|
|
||
|
@pytest.fixture(
    params=[
        # "now" in UTC and in the smallest / largest offsets datetime.timezone allows
        *(
            datetime.datetime.now(zone)
            for zone in (
                datetime.timezone.utc,
                datetime.timezone.min,
                datetime.timezone.max,
            )
        ),
        # fixed timestamps parsed with whole-hour and quarter-hour offsets
        *(
            datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S%z")
            for stamp in (
                "2019-01-04T16:41:24+0200",
                "2019-01-04T16:41:24+0215",
                "2019-01-04T16:41:24-0200",
                "2019-01-04T16:41:24-0215",
            )
        ),
    ]
)
def timezone_aware_date_list(request):
    """Parametrized timezone-aware ``datetime.datetime`` value (one per param)."""
    aware_datetime = request.param
    return aware_datetime
|
||
|
|
||
|
|
||
|
def check_round_trip(
    df,
    engine=None,
    path=None,
    write_kwargs=None,
    read_kwargs=None,
    expected=None,
    check_names=True,
    check_like=False,
    check_dtype=True,
    repeat=2,
):
    """Verify parquet serializer and deserializer produce the same results.

    Performs a pandas to disk and disk to pandas round trip,
    then compares the 2 resulting DataFrames to verify equality.

    Parameters
    ----------
    df: Dataframe
    engine: str, optional
        'pyarrow' or 'fastparquet'
    path: str, optional
        Where to write; a temporary file from ``tm.ensure_clean`` is used
        when omitted.
    write_kwargs: dict, optional
        Extra keyword arguments for ``DataFrame.to_parquet``. Defaults to
        ``{"compression": None}`` when omitted or empty. Never modified.
    read_kwargs: dict, optional
        Extra keyword arguments for ``read_parquet``. Never modified.
    expected: DataFrame, optional
        Expected deserialization result, otherwise will be equal to `df`
    check_names: bool, default True
        Forwarded to ``tm.assert_frame_equal``.
    check_like: bool, optional
        If True, ignore the order of index & columns.
    check_dtype: bool, default True
        Forwarded to ``tm.assert_frame_equal``.
    repeat: int, optional
        How many times to repeat the test
    """
    # Work on copies: callers reuse one kwargs dict across several calls (and
    # sometimes pass the same dict for reading and writing), so inserting
    # "engine" below must not leak back into their objects.
    write_kwargs = dict(write_kwargs) if write_kwargs else {"compression": None}
    read_kwargs = dict(read_kwargs) if read_kwargs else {}

    if expected is None:
        expected = df

    if engine:
        write_kwargs["engine"] = engine
        read_kwargs["engine"] = engine

    def compare(target):
        # Write then read back `repeat` times against the same target.
        for _ in range(repeat):
            df.to_parquet(target, **write_kwargs)
            with catch_warnings(record=True):
                actual = read_parquet(target, **read_kwargs)

            tm.assert_frame_equal(
                expected,
                actual,
                check_names=check_names,
                check_like=check_like,
                check_dtype=check_dtype,
            )

    if path is None:
        with tm.ensure_clean() as tmp_path:
            compare(tmp_path)
    else:
        compare(path)
|
||
|
|
||
|
|
||
|
def test_invalid_engine(df_compat):
    # "foo" is not a known engine name; "bar" is the target path.
    bogus_engine, target = "foo", "bar"
    with pytest.raises(ValueError):
        check_round_trip(df_compat, bogus_engine, target)
|
||
|
|
||
|
|
||
|
def test_options_py(df_compat, pa):
    # Round trip with no explicit engine while the option selects pyarrow.
    engine_option = pd.option_context("io.parquet.engine", "pyarrow")
    with engine_option:
        check_round_trip(df_compat)
|
||
|
|
||
|
|
||
|
def test_options_fp(df_compat, fp):
    # Round trip with no explicit engine while the option selects fastparquet.
    engine_option = pd.option_context("io.parquet.engine", "fastparquet")
    with engine_option:
        check_round_trip(df_compat)
|
||
|
|
||
|
|
||
|
def test_options_auto(df_compat, fp, pa):
    # Round trip with no explicit engine while the option is left on "auto".
    engine_option = pd.option_context("io.parquet.engine", "auto")
    with engine_option:
        check_round_trip(df_compat)
|
||
|
|
||
|
|
||
|
def test_options_get_engine(fp, pa):
    # Explicit engine names resolve to their implementation class regardless
    # of the option; "auto" follows the option.
    assert isinstance(get_engine("pyarrow"), PyArrowImpl)
    assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    # (value of io.parquet.engine, implementation expected for "auto")
    cases = (
        ("pyarrow", PyArrowImpl),
        ("fastparquet", FastParquetImpl),
        ("auto", PyArrowImpl),
    )
    for option_value, auto_impl in cases:
        with pd.option_context("io.parquet.engine", option_value):
            assert isinstance(get_engine("auto"), auto_impl)
            assert isinstance(get_engine("pyarrow"), PyArrowImpl)
            assert isinstance(get_engine("fastparquet"), FastParquetImpl)
|
||
|
|
||
|
|
||
|
def test_get_engine_auto_error_message():
    # get_engine("auto") is expected to raise a different ImportError message
    # when an engine is not installed than when it is installed but too old.
    from pandas.compat._optional import VERSIONS

    pa_min_ver = VERSIONS.get("pyarrow")
    fp_min_ver = VERSIONS.get("fastparquet")

    # Installed, but older than the minimum version listed in VERSIONS?
    have_pa_bad_version = _HAVE_PYARROW and (
        LooseVersion(pyarrow.__version__) < LooseVersion(pa_min_ver)
    )
    have_fp_bad_version = _HAVE_FASTPARQUET and (
        LooseVersion(fastparquet.__version__) < LooseVersion(fp_min_ver)
    )

    # Installed at an acceptable version?
    have_usable_pa = _HAVE_PYARROW and not have_pa_bad_version
    have_usable_fp = _HAVE_FASTPARQUET and not have_fp_bad_version

    if have_usable_pa or have_usable_fp:
        # The error messages are only checked when no engine is usable.
        return

    if have_pa_bad_version:
        pa_match = f"Pandas requires version .{pa_min_ver}. or newer of .pyarrow."
    else:
        pa_match = "Missing optional dependency .pyarrow."
    with pytest.raises(ImportError, match=pa_match):
        get_engine("auto")

    if have_fp_bad_version:
        fp_match = f"Pandas requires version .{fp_min_ver}. or newer of .fastparquet."
    else:
        fp_match = "Missing optional dependency .fastparquet."
    with pytest.raises(ImportError, match=fp_match):
        get_engine("auto")
|
||
|
|
||
|
|
||
|
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # Write with pyarrow, read back with fastparquet.
    frame = df_cross_compat
    subset = ["a", "d"]

    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=pa, compression=None)

        # whole file
        everything = read_parquet(path, engine=fp)
        tm.assert_frame_equal(everything, frame)

        # column selection
        selected = read_parquet(path, engine=fp, columns=["a", "d"])
        tm.assert_frame_equal(selected, frame[subset])
|
||
|
|
||
|
|
||
|
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # Write with fastparquet, read back with pyarrow.
    installed = LooseVersion(pyarrow.__version__)
    if installed < "0.15" and installed >= "0.13":
        pytest.xfail(
            "Reading fastparquet with pyarrow in 0.14 fails: "
            "https://issues.apache.org/jira/browse/ARROW-6492"
        )

    frame = df_cross_compat
    subset = ["a", "d"]

    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=fp, compression=None)

        # Warnings emitted while reading are recorded rather than shown.
        with catch_warnings(record=True):
            everything = read_parquet(path, engine=pa)
            tm.assert_frame_equal(everything, frame)

            selected = read_parquet(path, engine=pa, columns=["a", "d"])
            tm.assert_frame_equal(selected, frame[subset])
|
||
|
|
||
|
|
||
|
class Base:
    """Helpers and tests inherited by the engine-specific test classes."""

    def check_error_on_write(self, df, engine, exc):
        # Writing `df` with `engine` to a temporary file must raise `exc`.
        with tm.ensure_clean() as path, pytest.raises(exc):
            to_parquet(df, path, engine, compression=None)

    @tm.network
    def test_parquet_read_from_url(self, df_compat, engine):
        if engine != "auto":
            pytest.importorskip(engine)

        base = "https://raw.githubusercontent.com/pandas-dev/pandas/"
        location = "master/pandas/tests/io/data/parquet/simple.parquet"
        url = base + location

        # NOTE(review): `engine` is not forwarded to read_parquet here, so the
        # default engine is used for every parametrization - confirm intent.
        result = pd.read_parquet(url)
        tm.assert_frame_equal(result, df_compat)
|
||
|
|
||
|
|
||
|
class TestBasic(Base):
    """Round-trip and write-error tests for general DataFrame layouts."""

    def test_error(self, engine):
        # Objects that are not DataFrames are rejected on write.
        non_frames = (
            pd.Series([1, 2, 3]),
            1,
            "foo",
            pd.Timestamp("20130101"),
            np.array([1, 2, 3]),
        )
        for candidate in non_frames:
            self.check_error_on_write(candidate, engine, ValueError)

    def test_columns_dtypes(self, engine):
        frame = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        # unicode column labels round-trip
        frame.columns = ["foo", "bar"]
        check_round_trip(frame, engine)

    def test_columns_dtypes_invalid(self, engine):
        frame = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        invalid_labels = (
            # numeric
            [0, 1],
            # bytes
            [b"foo", b"bar"],
            # python objects
            [
                datetime.datetime(2011, 1, 1, 0, 0),
                datetime.datetime(2011, 1, 1, 1, 1),
            ],
        )
        for labels in invalid_labels:
            frame.columns = labels
            self.check_error_on_write(frame, engine, ValueError)

    @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"])
    def test_compression(self, engine, compression):
        # snappy / brotli are skipped unless a module of that name imports
        if compression in ("snappy", "brotli"):
            pytest.importorskip(compression)

        frame = pd.DataFrame({"A": [1, 2, 3]})
        check_round_trip(frame, engine, write_kwargs={"compression": compression})

    def test_read_columns(self, engine):
        # GH18154
        frame = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
        only_string = pd.DataFrame({"string": list("abc")})

        check_round_trip(
            frame,
            engine,
            expected=only_string,
            read_kwargs={"columns": ["string"]},
        )

    def test_write_index(self, engine):
        # index names are not compared for fastparquet
        check_names = engine != "fastparquet"

        frame = pd.DataFrame({"A": [1, 2, 3]})
        check_round_trip(frame, engine)

        # non-default indexes
        candidate_indexes = [
            [2, 3, 4],
            pd.date_range("20130101", periods=3),
            list("abc"),
            [1, 3, 4],
        ]
        for new_index in candidate_indexes:
            frame.index = new_index
            if isinstance(new_index, pd.DatetimeIndex):
                # freq does not round-trip, so drop it before comparing
                frame.index = frame.index._with_freq(None)
            check_round_trip(frame, engine, check_names=check_names)

        # index with meta-data (a name)
        frame.index = [0, 1, 2]
        frame.index.name = "foo"
        check_round_trip(frame, engine)

    def test_write_multiindex(self, pa):
        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
        engine = pa

        frame = pd.DataFrame({"A": [1, 2, 3]})
        frame.index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
        check_round_trip(frame, engine)

    def test_multiindex_with_columns(self, pa):
        engine = pa
        dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS")
        frame = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list("ABC"))

        named_index = pd.MultiIndex.from_product(
            [["Level1", "Level2"], dates], names=["level", "date"]
        )
        unnamed_index = named_index.copy(names=None)

        for row_index in (named_index, unnamed_index):
            frame.index = row_index

            # full read, then a read restricted to two columns
            check_round_trip(frame, engine)
            check_round_trip(
                frame,
                engine,
                read_kwargs={"columns": ["A", "B"]},
                expected=frame[["A", "B"]],
            )

    def test_write_ignoring_index(self, engine):
        # ENH 20768
        # Ensure index=False omits the index from the written Parquet file.
        write_kwargs = {"compression": None, "index": False}

        frame = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]})

        # With the index dropped, the loaded frame has the default integer index.
        default_indexed = frame.reset_index(drop=True)
        check_round_trip(
            frame, engine, write_kwargs=write_kwargs, expected=default_indexed
        )

        # A custom index is ignored as well (same data, same expectation).
        frame = pd.DataFrame(
            {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"]
        )
        check_round_trip(
            frame, engine, write_kwargs=write_kwargs, expected=default_indexed
        )

        # Multi-indexes are ignored too.
        levels = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        frame = pd.DataFrame(
            {"one": list(range(8)), "two": [-i for i in range(8)]}, index=levels
        )
        default_indexed = frame.reset_index(drop=True)
        check_round_trip(
            frame, engine, write_kwargs=write_kwargs, expected=default_indexed
        )

    def test_write_column_multiindex(self, engine):
        # Not able to write column multi-indexes with non-string column names.
        mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
        frame = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
        self.check_error_on_write(frame, engine, ValueError)

    def test_write_column_multiindex_nonstring(self, pa):
        # GH #34777
        # Not supported in fastparquet as of 0.1.3
        engine = pa

        # Column multi-index whose second level holds integers: write must fail.
        levels = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            [1, 2, 1, 2, 1, 2, 1, 2],
        ]
        frame = pd.DataFrame(np.random.randn(8, 8), columns=levels)
        frame.columns.names = ["Level1", "Level2"]

        self.check_error_on_write(frame, engine, ValueError)

    def test_write_column_multiindex_string(self, pa):
        # GH #34777
        # Not supported in fastparquet as of 0.1.3
        engine = pa

        # Column multi-index with string labels on both levels round-trips.
        levels = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        frame = pd.DataFrame(np.random.randn(8, 8), columns=levels)
        frame.columns.names = ["ColLevel1", "ColLevel2"]

        check_round_trip(frame, engine)

    def test_write_column_index_string(self, pa):
        # GH #34777
        # Not supported in fastparquet as of 0.1.3
        engine = pa

        # Named column index with string labels round-trips.
        labels = ["bar", "baz", "foo", "qux"]
        frame = pd.DataFrame(np.random.randn(8, 4), columns=labels)
        frame.columns.name = "StringCol"

        check_round_trip(frame, engine)

    def test_write_column_index_nonstring(self, pa):
        # GH #34777
        # Not supported in fastparquet as of 0.1.3
        engine = pa

        # Named column index with NON-string (integer) labels: write must fail.
        labels = [1, 2, 3, 4]
        frame = pd.DataFrame(np.random.randn(8, 4), columns=labels)
        frame.columns.name = "NonStringCol"

        self.check_error_on_write(frame, engine, ValueError)
|
||
|
|
||
|
|
||
|
class TestParquetPyArrow(Base):
    """Tests that exercise the pyarrow engine specifically."""

    def test_basic(self, pa, df_full):
        frame = df_full

        # additional supported types for pyarrow
        tz_index = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
        frame["datetime_tz"] = tz_index._with_freq(None)  # freq doesnt round-trip
        frame["bool_with_none"] = [True, None, True]

        check_round_trip(frame, pa)

    def test_basic_subset_columns(self, pa, df_full):
        # GH18628
        frame = df_full
        wanted = ["string", "int"]

        # additional supported types for pyarrow
        frame["datetime_tz"] = pd.date_range(
            "20130101", periods=3, tz="Europe/Brussels"
        )

        check_round_trip(
            frame,
            pa,
            expected=frame[wanted],
            read_kwargs={"columns": ["string", "int"]},
        )

    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
        # GH 37105
        # With no path given, to_parquet hands back the serialized bytes.
        payload = df_full.to_parquet(engine=pa)
        assert isinstance(payload, bytes)

        restored = pd.read_parquet(BytesIO(payload))
        tm.assert_frame_equal(df_full, restored)

    def test_duplicate_columns(self, pa):
        # not currently able to handle duplicate columns
        values = np.arange(12).reshape(4, 3)
        frame = pd.DataFrame(values, columns=list("aaa")).copy()
        self.check_error_on_write(frame, pa, ValueError)

    def test_unsupported(self, pa):
        if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"):
            # period - will be supported using an extension type with pyarrow 1.0
            periods = pd.DataFrame(
                {"a": pd.period_range("2013", freq="M", periods=3)}
            )
            # pyarrow 0.11 raises ArrowTypeError
            # older pyarrows raise ArrowInvalid
            self.check_error_on_write(periods, pa, Exception)

        # timedelta
        timedeltas = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)})
        self.check_error_on_write(timedeltas, pa, NotImplementedError)

        # mixed python objects
        mixed = pd.DataFrame({"a": ["a", 1, 2.0]})
        # pyarrow 0.11 raises ArrowTypeError
        # older pyarrows raise ArrowInvalid
        self.check_error_on_write(mixed, pa, Exception)

    def test_categorical(self, pa):
        # supported in >= 0.7.0
        frame = pd.DataFrame()
        frame["a"] = pd.Categorical(list("abcdef"))

        # test for null, out-of-order values, and unobserved category
        frame["b"] = pd.Categorical(
            ["bar", "foo", "foo", "bar", None, "bar"],
            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
        )

        # test for ordered flag
        frame["c"] = pd.Categorical(
            ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
        )

        if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.0"):
            # de-serialized as object for pyarrow < 0.15
            check_round_trip(frame, pa, expected=frame.astype(object))
        else:
            check_round_trip(frame, pa)

    @pytest.mark.xfail(
        is_platform_windows() and PY38,
        reason="localhost connection rejected",
        strict=False,
    )
    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so):
        s3fs = pytest.importorskip("s3fs")
        if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
            pytest.skip()

        # The same filesystem kwargs are used for writing and for reading.
        filesystem_kwargs = {"filesystem": s3fs.S3FileSystem(**s3so)}
        check_round_trip(
            df_compat,
            pa,
            path="pandas-test/pyarrow.parquet",
            read_kwargs=filesystem_kwargs,
            write_kwargs=filesystem_kwargs,
        )

    def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so):
        if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
            pytest.skip()

        # GH #19134
        storage_kwargs = {"storage_options": s3so}
        check_round_trip(
            df_compat,
            pa,
            path="s3://pandas-test/pyarrow.parquet",
            read_kwargs=storage_kwargs,
            write_kwargs=storage_kwargs,
        )

    @td.skip_if_no("s3fs")  # also requires flask
    @pytest.mark.parametrize(
        "partition_col",
        [
            ["A"],
            [],
        ],
    )
    def test_s3_roundtrip_for_dir(
        self, df_compat, s3_resource, pa, partition_col, s3so
    ):
        # GH #26388
        expected_df = df_compat.copy()

        # GH #35791
        # read_table uses the new Arrow Datasets API since pyarrow 1.0.0.
        # Before that, partitioned columns came back as 'category' dtype and
        # were appended to the back of the frame on read. The new API used
        # category only for string partition fields, and pyarrow 2.0.0 went
        # back to category for all types.
        installed = LooseVersion(pyarrow.__version__)
        pa10 = LooseVersion("1.0.0") <= installed < LooseVersion("2.0.0")

        if partition_col:
            partition_col_type = "int32" if pa10 else "category"
            expected_df[partition_col] = expected_df[partition_col].astype(
                partition_col_type
            )

        write_kwargs = {
            "partition_cols": partition_col,
            "compression": None,
            "storage_options": s3so,
        }
        check_round_trip(
            df_compat,
            pa,
            expected=expected_df,
            path="s3://pandas-test/parquet_dir",
            read_kwargs={"storage_options": s3so},
            write_kwargs=write_kwargs,
            check_like=True,
            repeat=1,
        )

    @td.skip_if_no("pyarrow")
    def test_read_file_like_obj_support(self, df_compat):
        # Write into an in-memory buffer and read straight back from it.
        stream = BytesIO()
        df_compat.to_parquet(stream)
        restored = pd.read_parquet(stream)
        tm.assert_frame_equal(df_compat, restored)

    @td.skip_if_no("pyarrow")
    def test_expand_user(self, df_compat, monkeypatch):
        # Point both home-directory variables at a name that should show up
        # in the OSError raised for "~/file.parquet".
        for variable in ("HOME", "USERPROFILE"):
            monkeypatch.setenv(variable, "TestingUser")

        with pytest.raises(OSError, match=r".*TestingUser.*"):
            pd.read_parquet("~/file.parquet")
        with pytest.raises(OSError, match=r".*TestingUser.*"):
            df_compat.to_parquet("~/file.parquet")

    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        import pyarrow.parquet as pq

        partition_cols = ["bool", "int"]
        frame = df_full
        with tm.ensure_clean_dir() as path:
            frame.to_parquet(path, partition_cols=partition_cols, compression=None)

            dataset = pq.ParquetDataset(path, validate_schema=False)
            found = dataset.partitions.partition_names
            assert len(found) == 2
            assert found == set(partition_cols)
            assert read_parquet(path).shape == frame.shape

    def test_partition_cols_string(self, pa, df_full):
        # GH #27117
        import pyarrow.parquet as pq

        # a single column name given as a plain string rather than a list
        partition_cols = "bool"
        partition_cols_list = [partition_cols]
        frame = df_full
        with tm.ensure_clean_dir() as path:
            frame.to_parquet(path, partition_cols=partition_cols, compression=None)

            dataset = pq.ParquetDataset(path, validate_schema=False)
            found = dataset.partitions.partition_names
            assert len(found) == 1
            assert found == set(partition_cols_list)
            assert read_parquet(path).shape == frame.shape

    @pytest.mark.parametrize("path_type", [str, pathlib.Path])
    def test_partition_cols_pathlib(self, pa, df_compat, path_type):
        # GH 35902
        partition_cols = "B"
        partition_cols_list = [partition_cols]
        frame = df_compat

        with tm.ensure_clean_dir() as path_str:
            # the target directory is passed either as str or as pathlib.Path
            target = path_type(path_str)
            frame.to_parquet(target, partition_cols=partition_cols_list)
            assert read_parquet(target).shape == frame.shape

    def test_empty_dataframe(self, pa):
        # GH #27339
        empty = pd.DataFrame()
        check_round_trip(empty, pa)

    def test_write_with_schema(self, pa):
        import pyarrow

        # Integers written under a boolean schema are expected back as bool.
        frame = pd.DataFrame({"x": [0, 1]})
        bool_field = pyarrow.field("x", type=pyarrow.bool_())
        schema = pyarrow.schema([bool_field])
        as_bool = frame.astype(bool)
        check_round_trip(
            frame, pa, write_kwargs={"schema": schema}, expected=as_bool
        )

    @td.skip_if_no("pyarrow", min_version="0.15.0")
    def test_additional_extension_arrays(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype="Int64"),
                "b": pd.Series([1, 2, 3], dtype="UInt32"),
                "c": pd.Series(["a", None, "c"], dtype="string"),
            }
        )
        if LooseVersion(pyarrow.__version__) < LooseVersion("0.16.0"):
            # de-serialized as plain int / object
            expected = frame.assign(
                a=frame.a.astype("int64"),
                b=frame.b.astype("int64"),
                c=frame.c.astype("object"),
            )
        else:
            expected = frame
        check_round_trip(frame, pa, expected=expected)

        frame = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
        if LooseVersion(pyarrow.__version__) < LooseVersion("0.16.0"):
            # if missing values in integer, currently de-serialized as float
            expected = frame.assign(a=frame.a.astype("float64"))
        else:
            expected = frame
        check_round_trip(frame, pa, expected=expected)

    @td.skip_if_no("pyarrow", min_version="0.16.0")
    def test_additional_extension_types(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol + by defining a custom ExtensionType
        #
        # An interval column ("c") is left out: Arrow does not yet support
        # struct in writing to Parquet (ARROW-1644).
        period_column = pd.period_range("2012-01-01", periods=3, freq="D")
        frame = pd.DataFrame({"d": period_column})
        check_round_trip(frame, pa)

    @td.skip_if_no("pyarrow", min_version="0.16")
    def test_use_nullable_dtypes(self, pa):
        import pyarrow.parquet as pq

        table = pyarrow.table(
            {
                "a": pyarrow.array([1, 2, 3, None], "int64"),
                "b": pyarrow.array([1, 2, 3, None], "uint8"),
                "c": pyarrow.array(["a", "b", "c", None]),
                "d": pyarrow.array([True, False, True, None]),
            }
        )
        with tm.ensure_clean() as path:
            # write manually with pyarrow to write integers
            pq.write_table(table, path)
            default_read = read_parquet(path)
            nullable_read = read_parquet(path, use_nullable_dtypes=True)

        # default read: the integer column with a null comes back as float64
        assert default_read["a"].dtype == np.dtype("float64")

        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        tm.assert_frame_equal(nullable_read, expected)

    @td.skip_if_no("pyarrow", min_version="0.14")
    def test_timestamp_nanoseconds(self, pa):
        # with version 2.0, pyarrow defaults to writing the nanoseconds, so
        # this should work without error
        stamps = pd.date_range("2017-01-01", freq="1n", periods=10)
        frame = pd.DataFrame({"a": stamps})
        check_round_trip(frame, pa, write_kwargs={"version": "2.0"})

    def test_timezone_aware_index(self, pa, timezone_aware_date_list):
        if LooseVersion(pyarrow.__version__) >= LooseVersion("2.0.0"):
            # temporary skip this test until it is properly resolved
            # https://github.com/pandas-dev/pandas/issues/37286
            pytest.skip()

        # the same aware datetime five times, as index and as a column
        repeated = 5 * [timezone_aware_date_list]
        frame = pd.DataFrame(index=repeated, data={"index_as_col": repeated})

        # see gh-36004
        # compare time(zone) values only, skip their class:
        # pyarrow always creates fixed offset timezones using pytz.FixedOffset()
        # even if it was datetime.timezone() originally
        #
        # technically they are the same:
        # they both implement datetime.tzinfo
        # they both wrap datetime.timedelta()
        # this use-case sets the resolution to 1 minute
        check_round_trip(frame, pa, check_dtype=False)

    @td.skip_if_no("pyarrow", min_version="1.0.0")
    def test_filter_row_groups(self, pa):
        # https://github.com/pandas-dev/pandas/issues/26551
        frame = pd.DataFrame({"a": list(range(0, 3))})
        with tm.ensure_clean() as path:
            frame.to_parquet(path, pa)
            filtered = read_parquet(
                path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False
            )
        assert len(filtered) == 1
|
||
|
|
||
|
|
||
|
class TestParquetFastParquet(Base):
    """Tests that exercise the fastparquet engine specifically."""

    @td.skip_if_no("fastparquet", min_version="0.3.2")
    def test_basic(self, fp, df_full):
        frame = df_full

        tz_index = pd.date_range("20130101", periods=3, tz="US/Eastern")
        frame["datetime_tz"] = tz_index._with_freq(None)  # freq doesnt round-trip
        frame["timedelta"] = pd.timedelta_range("1 day", periods=3)
        check_round_trip(frame, fp)

    @pytest.mark.skip(reason="not supported")
    def test_duplicate_columns(self, fp):
        # not currently able to handle duplicate columns
        values = np.arange(12).reshape(4, 3)
        frame = pd.DataFrame(values, columns=list("aaa")).copy()
        self.check_error_on_write(frame, fp, ValueError)

    def test_bool_with_none(self, fp):
        # A boolean column containing None is expected back as float16.
        frame = pd.DataFrame({"a": [True, None, False]})
        as_float = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
        check_round_trip(frame, fp, expected=as_float)

    def test_unsupported(self, fp):
        # period
        periods = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
        self.check_error_on_write(periods, fp, ValueError)

        # mixed
        mixed = pd.DataFrame({"a": ["a", 1, 2.0]})
        self.check_error_on_write(mixed, fp, ValueError)

    def test_categorical(self, fp):
        categories = pd.Categorical(list("abc"))
        frame = pd.DataFrame({"a": categories})
        check_round_trip(frame, fp)

    def test_filter_row_groups(self, fp):
        frame = pd.DataFrame({"a": list(range(0, 3))})
        with tm.ensure_clean() as path:
            frame.to_parquet(path, fp, compression=None, row_group_offsets=1)
            filtered = read_parquet(path, fp, filters=[("a", "==", 0)])
        assert len(filtered) == 1

    def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so):
        # GH #19134
        read_kwargs = {"storage_options": s3so}
        write_kwargs = {"compression": None, "storage_options": s3so}
        check_round_trip(
            df_compat,
            fp,
            path="s3://pandas-test/fastparquet.parquet",
            read_kwargs=read_kwargs,
            write_kwargs=write_kwargs,
        )

    def test_partition_cols_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        frame = df_full
        with tm.ensure_clean_dir() as path:
            frame.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet

            # one entry in .cats per partition column
            written = fastparquet.ParquetFile(path, False)
            assert len(written.cats) == 2

    def test_partition_cols_string(self, fp, df_full):
        # GH #27117
        # a single column name given as a plain string rather than a list
        partition_cols = "bool"
        frame = df_full
        with tm.ensure_clean_dir() as path:
            frame.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet

            written = fastparquet.ParquetFile(path, False)
            assert len(written.cats) == 1

    def test_partition_on_supported(self, fp, df_full):
        # GH #23283
        # same as test_partition_cols_supported, but via the `partition_on` keyword
        partition_cols = ["bool", "int"]
        frame = df_full
        with tm.ensure_clean_dir() as path:
            frame.to_parquet(
                path,
                engine="fastparquet",
                compression=None,
                partition_on=partition_cols,
            )
            assert os.path.exists(path)
            import fastparquet

            written = fastparquet.ParquetFile(path, False)
            assert len(written.cats) == 2

    def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
        # GH #23283
        # passing both keywords at once is rejected
        partition_cols = ["bool", "int"]
        frame = df_full
        with pytest.raises(ValueError):
            with tm.ensure_clean_dir() as path:
                frame.to_parquet(
                    path,
                    engine="fastparquet",
                    compression=None,
                    partition_on=partition_cols,
                    partition_cols=partition_cols,
                )

    def test_empty_dataframe(self, fp):
        # GH #27339
        empty = pd.DataFrame()

        # the frame read back is expected to have its index named "index"
        expected = empty.copy()
        expected.index.name = "index"
        check_round_trip(empty, fp, expected=expected)

    def test_timezone_aware_index(self, fp, timezone_aware_date_list):
        # the same aware datetime five times, as index and as a column
        repeated = 5 * [timezone_aware_date_list]
        frame = pd.DataFrame(index=repeated, data={"index_as_col": repeated})

        # the frame read back is expected to have its index named "index"
        expected = frame.copy()
        expected.index.name = "index"
        check_round_trip(frame, fp, expected=expected)

    def test_use_nullable_dtypes_not_supported(self, fp):
        frame = pd.DataFrame({"a": [1, 2]})

        with tm.ensure_clean() as path:
            frame.to_parquet(path)
            with pytest.raises(ValueError, match="not supported for the fastparquet"):
                read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)
|