""" test parquet compat """ import datetime from distutils.version import LooseVersion import os from warnings import catch_warnings import numpy as np import pytest import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm from pandas.io.parquet import ( FastParquetImpl, PyArrowImpl, get_engine, read_parquet, to_parquet, ) try: import pyarrow # noqa _HAVE_PYARROW = True except ImportError: _HAVE_PYARROW = False try: import fastparquet # noqa _HAVE_FASTPARQUET = True except ImportError: _HAVE_FASTPARQUET = False pytestmark = pytest.mark.filterwarnings( "ignore:RangeIndex.* is deprecated:DeprecationWarning" ) # setup engines & skips @pytest.fixture( params=[ pytest.param( "fastparquet", marks=pytest.mark.skipif( not _HAVE_FASTPARQUET, reason="fastparquet is not installed" ), ), pytest.param( "pyarrow", marks=pytest.mark.skipif( not _HAVE_PYARROW, reason="pyarrow is not installed" ), ), ] ) def engine(request): return request.param @pytest.fixture def pa(): if not _HAVE_PYARROW: pytest.skip("pyarrow is not installed") return "pyarrow" @pytest.fixture def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") return "fastparquet" @pytest.fixture def df_compat(): return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) @pytest.fixture def df_cross_compat(): df = pd.DataFrame( { "a": list("abc"), "b": list(range(1, 4)), # 'c': np.arange(3, 6).astype('u1'), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("20130101", periods=3), # 'g': pd.date_range('20130101', periods=3, # tz='US/Eastern'), # 'h': pd.date_range('20130101', periods=3, freq='ns') } ) return df @pytest.fixture def df_full(): return pd.DataFrame( { "string": list("abc"), "string_with_nan": ["a", np.nan, "c"], "string_with_none": ["a", None, "c"], "bytes": [b"foo", b"bar", b"baz"], "unicode": ["foo", "bar", "baz"], "int": list(range(1, 4)), "uint": np.arange(3, 6).astype("u1"), "float": np.arange(4.0, 7.0, dtype="float64"), "float_with_nan": [2.0, np.nan, 3.0], "bool": [True, False, True], "datetime": pd.date_range("20130101", periods=3), "datetime_with_nat": [ pd.Timestamp("20130101"), pd.NaT, pd.Timestamp("20130103"), ], } ) def check_round_trip( df, engine=None, path=None, write_kwargs=None, read_kwargs=None, expected=None, check_names=True, repeat=2, ): """Verify parquet serializer and deserializer produce the same results. Performs a pandas to disk and disk to pandas round trip, then compares the 2 resulting DataFrames to verify equality. 
def check_round_trip(
    df,
    engine=None,
    path=None,
    write_kwargs=None,
    read_kwargs=None,
    expected=None,
    check_names=True,
    repeat=2,
):
    """Verify parquet serializer and deserializer produce the same results.

    Performs a pandas-to-disk and disk-to-pandas round trip, then compares
    the two resulting DataFrames to verify equality.

    Parameters
    ----------
    df: DataFrame
    engine: str, optional
        'pyarrow' or 'fastparquet'
    path: str, optional
    write_kwargs: dict of str:str, optional
    read_kwargs: dict of str:str, optional
    expected: DataFrame, optional
        Expected deserialization result, otherwise will be equal to `df`
    check_names: bool, optional
        Whether to check the index and column names for equality
    repeat: int, optional
        How many times to repeat the test
    """
    write_kwargs = write_kwargs or {"compression": None}
    read_kwargs = read_kwargs or {}

    if expected is None:
        expected = df

    if engine:
        write_kwargs["engine"] = engine
        read_kwargs["engine"] = engine

    def compare(repeat):
        for _ in range(repeat):
            df.to_parquet(path, **write_kwargs)
            with catch_warnings(record=True):
                actual = read_parquet(path, **read_kwargs)
            tm.assert_frame_equal(expected, actual, check_names=check_names)

    if path is None:
        with tm.ensure_clean() as path:
            compare(repeat)
    else:
        compare(repeat)
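# A minimal sketch of the round trip that check_round_trip exercises
# (illustrative only; assumes pyarrow is installed and that "tmp.parquet"
# is a writable path):
#
#     df = pd.DataFrame({"A": [1, 2, 3], "B": "foo"})
#     df.to_parquet("tmp.parquet", engine="pyarrow", compression=None)
#     result = read_parquet("tmp.parquet", engine="pyarrow")
#     tm.assert_frame_equal(result, df)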
def test_invalid_engine(df_compat):
    with pytest.raises(ValueError):
        check_round_trip(df_compat, "foo", "bar")


def test_options_py(df_compat, pa):
    # use the set option
    with pd.option_context("io.parquet.engine", "pyarrow"):
        check_round_trip(df_compat)


def test_options_fp(df_compat, fp):
    # use the set option
    with pd.option_context("io.parquet.engine", "fastparquet"):
        check_round_trip(df_compat)


def test_options_auto(df_compat, fp, pa):
    # use the set option
    with pd.option_context("io.parquet.engine", "auto"):
        check_round_trip(df_compat)


def test_options_get_engine(fp, pa):
    assert isinstance(get_engine("pyarrow"), PyArrowImpl)
    assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    with pd.option_context("io.parquet.engine", "pyarrow"):
        assert isinstance(get_engine("auto"), PyArrowImpl)
        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
        assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    with pd.option_context("io.parquet.engine", "fastparquet"):
        assert isinstance(get_engine("auto"), FastParquetImpl)
        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
        assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    with pd.option_context("io.parquet.engine", "auto"):
        assert isinstance(get_engine("auto"), PyArrowImpl)
        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
        assert isinstance(get_engine("fastparquet"), FastParquetImpl)


def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp)
        tm.assert_frame_equal(result, df)

        result = read_parquet(path, engine=fp, columns=["a", "d"])
        tm.assert_frame_equal(result, df[["a", "d"]])


def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    if (
        LooseVersion(pyarrow.__version__) < "0.15"
        and LooseVersion(pyarrow.__version__) >= "0.13"
    ):
        pytest.xfail(
            "Reading fastparquet with pyarrow in 0.14 fails: "
            "https://issues.apache.org/jira/browse/ARROW-6492"
        )

    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        with catch_warnings(record=True):
            result = read_parquet(path, engine=pa)
            tm.assert_frame_equal(result, df)

            result = read_parquet(path, engine=pa, columns=["a", "d"])
            tm.assert_frame_equal(result, df[["a", "d"]])


class Base:
    def check_error_on_write(self, df, engine, exc):
        # check that we are raising the exception on writing
        with tm.ensure_clean() as path:
            with pytest.raises(exc):
                to_parquet(df, path, engine, compression=None)


class TestBasic(Base):
    def test_error(self, engine):
        for obj in [
            pd.Series([1, 2, 3]),
            1,
            "foo",
            pd.Timestamp("20130101"),
            np.array([1, 2, 3]),
        ]:
            self.check_error_on_write(obj, engine, ValueError)

    def test_columns_dtypes(self, engine):
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        # unicode
        df.columns = ["foo", "bar"]
        check_round_trip(df, engine)

    def test_columns_dtypes_invalid(self, engine):
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        # numeric
        df.columns = [0, 1]
        self.check_error_on_write(df, engine, ValueError)

        # bytes
        df.columns = [b"foo", b"bar"]
        self.check_error_on_write(df, engine, ValueError)

        # python object
        df.columns = [
            datetime.datetime(2011, 1, 1, 0, 0),
            datetime.datetime(2011, 1, 1, 1, 1),
        ]
        self.check_error_on_write(df, engine, ValueError)

    @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"])
    def test_compression(self, engine, compression):

        if compression == "snappy":
            pytest.importorskip("snappy")

        elif compression == "brotli":
            pytest.importorskip("brotli")

        df = pd.DataFrame({"A": [1, 2, 3]})
        check_round_trip(df, engine, write_kwargs={"compression": compression})

    def test_read_columns(self, engine):
        # GH18154
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        expected = pd.DataFrame({"string": list("abc")})
        check_round_trip(
            df, engine, expected=expected, read_kwargs={"columns": ["string"]}
        )

    def test_write_index(self, engine):
        check_names = engine != "fastparquet"

        df = pd.DataFrame({"A": [1, 2, 3]})
        check_round_trip(df, engine)

        indexes = [
            [2, 3, 4],
            pd.date_range("20130101", periods=3),
            list("abc"),
            [1, 3, 4],
        ]
        # non-default index
        for index in indexes:
            df.index = index
            check_round_trip(df, engine, check_names=check_names)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = "foo"
        check_round_trip(df, engine)

    def test_write_multiindex(self, pa):
        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
        engine = pa

        df = pd.DataFrame({"A": [1, 2, 3]})
        index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
        df.index = index
        check_round_trip(df, engine)

    def test_write_column_multiindex(self, engine):
        # column multi-index
        mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
        self.check_error_on_write(df, engine, ValueError)

    def test_multiindex_with_columns(self, pa):
        engine = pa
        dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS")
        df = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list("ABC"))
        index1 = pd.MultiIndex.from_product(
            [["Level1", "Level2"], dates], names=["level", "date"]
        )
        index2 = index1.copy(names=None)
        for index in [index1, index2]:
            df.index = index

            check_round_trip(df, engine)
            check_round_trip(
                df, engine, read_kwargs={"columns": ["A", "B"]}, expected=df[["A", "B"]]
            )
    def test_write_ignoring_index(self, engine):
        # ENH 20768
        # Ensure index=False omits the index from the written Parquet file.
        df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]})

        write_kwargs = {"compression": None, "index": False}

        # Because we're dropping the index, we expect the loaded dataframe to
        # have the default integer index.
        expected = df.reset_index(drop=True)

        check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)

        # Ignore custom index
        df = pd.DataFrame(
            {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"]
        )

        check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)

        # Ignore multi-indexes as well.
        arrays = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        df = pd.DataFrame(
            {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays
        )

        expected = df.reset_index(drop=True)
        check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)


class TestParquetPyArrow(Base):
    def test_basic(self, pa, df_full):

        df = df_full

        # additional supported types for pyarrow
        df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
        df["bool_with_none"] = [True, None, True]

        check_round_trip(df, pa)

    def test_basic_subset_columns(self, pa, df_full):
        # GH18628

        df = df_full
        # additional supported types for pyarrow
        df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")

        check_round_trip(
            df,
            pa,
            expected=df[["string", "int"]],
            read_kwargs={"columns": ["string", "int"]},
        )

    def test_duplicate_columns(self, pa):
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
        self.check_error_on_write(df, pa, ValueError)

    def test_unsupported(self, pa):
        if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"):
            # period - will be supported using an extension type with pyarrow 1.0
            df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
            # pyarrow 0.11 raises ArrowTypeError
            # older pyarrows raise ArrowInvalid
            self.check_error_on_write(df, pa, Exception)

        # timedelta
        df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)})
        self.check_error_on_write(df, pa, NotImplementedError)

        # mixed python objects
        df = pd.DataFrame({"a": ["a", 1, 2.0]})
        # pyarrow 0.11 raises ArrowTypeError
        # older pyarrows raise ArrowInvalid
        self.check_error_on_write(df, pa, Exception)

    def test_categorical(self, pa):

        # supported in >= 0.7.0
        df = pd.DataFrame()
        df["a"] = pd.Categorical(list("abcdef"))

        # test for null, out-of-order values, and unobserved category
        df["b"] = pd.Categorical(
            ["bar", "foo", "foo", "bar", None, "bar"],
            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
        )

        # test for ordered flag
        df["c"] = pd.Categorical(
            ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
        )

        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"):
            check_round_trip(df, pa)
        else:
            # de-serialized as object for pyarrow < 0.15
            expected = df.astype(object)
            check_round_trip(df, pa, expected=expected)

    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
        # GH #19134
        check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")

    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols, compression=None)
            import pyarrow.parquet as pq

            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 2
            assert dataset.partitions.partition_names == set(partition_cols)

    def test_partition_cols_string(self, pa, df_full):
        # GH #27117
        partition_cols = "bool"
        partition_cols_list = [partition_cols]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols, compression=None)
            import pyarrow.parquet as pq

            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 1
            assert dataset.partitions.partition_names == set(partition_cols_list)

    def test_empty_dataframe(self, pa):
        # GH #27339
        df = pd.DataFrame()
        check_round_trip(df, pa)
    def test_write_with_schema(self, pa):
        import pyarrow

        df = pd.DataFrame({"x": [0, 1]})
        schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())])
        out_df = df.astype(bool)
        check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)

    @td.skip_if_no("pyarrow", min_version="0.15.0")
    def test_additional_extension_arrays(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype="Int64"),
                "b": pd.Series([1, 2, 3], dtype="UInt32"),
                "c": pd.Series(["a", None, "c"], dtype="string"),
            }
        )
        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.16.0"):
            expected = df
        else:
            # de-serialized as plain int / object
            expected = df.assign(
                a=df.a.astype("int64"), b=df.b.astype("int64"), c=df.c.astype("object")
            )
        check_round_trip(df, pa, expected=expected)

        df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.16.0"):
            expected = df
        else:
            # if missing values in integer, currently de-serialized as float
            expected = df.assign(a=df.a.astype("float64"))
        check_round_trip(df, pa, expected=expected)

    @td.skip_if_no("pyarrow", min_version="0.16.0")
    def test_additional_extension_types(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol + by defining a custom ExtensionType
        df = pd.DataFrame(
            {
                # Arrow does not yet support struct in writing to Parquet (ARROW-1644)
                # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]),
                "d": pd.period_range("2012-01-01", periods=3, freq="D"),
            }
        )
        check_round_trip(df, pa)


class TestParquetFastParquet(Base):
    @td.skip_if_no("fastparquet", min_version="0.3.2")
    def test_basic(self, fp, df_full):
        df = df_full

        df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="US/Eastern")
        df["timedelta"] = pd.timedelta_range("1 day", periods=3)
        check_round_trip(df, fp)

    @pytest.mark.skip(reason="not supported")
    def test_duplicate_columns(self, fp):

        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
        self.check_error_on_write(df, fp, ValueError)

    def test_bool_with_none(self, fp):
        df = pd.DataFrame({"a": [True, None, False]})
        expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
        check_round_trip(df, fp, expected=expected)

    def test_unsupported(self, fp):

        # period
        df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
        self.check_error_on_write(df, fp, ValueError)

        # mixed
        df = pd.DataFrame({"a": ["a", 1, 2.0]})
        self.check_error_on_write(df, fp, ValueError)

    def test_categorical(self, fp):
        df = pd.DataFrame({"a": pd.Categorical(list("abc"))})
        check_round_trip(df, fp)

    def test_filter_row_groups(self, fp):
        d = {"a": list(range(0, 3))}
        df = pd.DataFrame(d)
        with tm.ensure_clean() as path:
            df.to_parquet(path, fp, compression=None, row_group_offsets=1)
            result = read_parquet(path, fp, filters=[("a", "==", 0)])
        assert len(result) == 1

    def test_s3_roundtrip(self, df_compat, s3_resource, fp):
        # GH #19134
        check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet")

    def test_partition_cols_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet  # noqa: F811

            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 2
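    # For reference, a partitioned write such as the one above lays the data
    # out in a hive-style directory structure, roughly of the form below
    # (a sketch only; the exact data file names depend on the engine and its
    # version):
    #
    #     <path>/bool=True/int=1/<part>.parquet
    #     <path>/bool=False/int=2/<part>.parquet
    #     ...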
    def test_partition_cols_string(self, fp, df_full):
        # GH #27117
        partition_cols = "bool"
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet  # noqa: F811

            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 1

    def test_partition_on_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                compression=None,
                partition_on=partition_cols,
            )
            assert os.path.exists(path)
            import fastparquet  # noqa: F811

            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 2

    def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with pytest.raises(ValueError):
            with tm.ensure_clean_dir() as path:
                df.to_parquet(
                    path,
                    engine="fastparquet",
                    compression=None,
                    partition_on=partition_cols,
                    partition_cols=partition_cols,
                )

    def test_empty_dataframe(self, fp):
        # GH #27339
        df = pd.DataFrame()
        expected = df.copy()
        expected.index.name = "index"
        check_round_trip(df, fp, expected=expected)