projektAI/venv/Lib/site-packages/pandas/tests/io/test_pickle.py
2021-06-06 22:13:05 +02:00

603 lines
18 KiB
Python

"""
manage legacy pickle tests
How to add pickle tests:
1. Install pandas version intended to output the pickle.
2. Execute "generate_legacy_storage_files.py" to create the pickle.
$ python generate_legacy_storage_files.py <output_dir> pickle
3. Move the created pickle to "data/legacy_pickle/<version>" directory.
"""
import bz2
import datetime
import functools
from functools import partial
import glob
import gzip
import io
import os
from pathlib import Path
import pickle
import shutil
from warnings import catch_warnings, simplefilter
import zipfile
import numpy as np
import pytest
from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian
import pandas.util._test_decorators as td
import pandas as pd
from pandas import Index, Series, period_range
import pandas._testing as tm
from pandas.tseries.offsets import Day, MonthEnd
lzma = import_lzma()
@pytest.fixture(scope="module")
def current_pickle_data():
    """Return the pickle data set generated by the current pandas version.

    The data is produced by ``create_pickle_data`` from
    ``pandas.tests.io.generate_legacy_storage_files``. The fixture is
    module-scoped.
    """
    # Imported inside the fixture body, exactly as the original does, so the
    # generator module is only loaded when the fixture is requested.
    from pandas.tests.io import generate_legacy_storage_files

    return generate_legacy_storage_files.create_pickle_data()
# ---------------------
# comparison functions
# ---------------------
def compare_element(result, expected, typ, version=None):
    """Assert that ``result`` equals ``expected`` for the data kind ``typ``.

    Parameters
    ----------
    result : object
        Object under test (typically the unpickled value).
    expected : object
        Reference object.
    typ : str
        Kind of data being compared; selects the comparison routine.
    version : str, optional
        Accepted so the signature matches the specialised comparators;
        not used here.

    Raises
    ------
    AssertionError
        If the two objects do not compare equal.
    """
    # Index objects always use the index comparator, whatever ``typ`` says.
    if isinstance(expected, Index):
        tm.assert_index_equal(expected, result)
        return

    # ``sp_``-prefixed kinds (presumably sparse data) use the generic
    # ``tm.assert_equal`` dispatcher.
    if typ.startswith("sp_"):
        tm.assert_equal(result, expected)
        return

    if typ != "timestamp":
        # Prefer ``tm.assert_<typ>_equal`` when such a helper exists,
        # otherwise fall back to the approximate comparison.
        check = getattr(tm, f"assert_{typ}_equal", tm.assert_almost_equal)
        check(result, expected)
        return

    # Timestamps: NaT is a singleton and must be compared by identity.
    if expected is pd.NaT:
        assert result is pd.NaT
        return

    assert result == expected
    assert result.freq == expected.freq
def compare(data, vf, version):
    """Load the pickle at ``vf`` and run a comparator over every entry.

    For each ``(typ, dt)`` entry a module-level function named
    ``compare_<typ>_<dt>`` is used when one exists; otherwise
    ``compare_element`` is used.

    Parameters
    ----------
    data : dict
        Replaced by the loaded pickle before it is read (see note below).
    vf : str
        Path of the pickle file to load.
    version : str
        Passed through to the comparator.

    Returns
    -------
    dict
        The data loaded from ``vf``.
    """
    # NOTE(review): the ``data`` argument is discarded here, so every
    # ``expected`` below is taken from the same loaded object as ``result``.
    # Behaviour is kept as-is; confirm whether comparing against the
    # caller's data was intended.
    loaded = pd.read_pickle(vf)
    registry = globals()
    for typ, by_dtype in loaded.items():
        for dt, result in by_dtype.items():
            expected = loaded[typ][dt]
            # use a specific comparator if available
            check = registry.get(f"compare_{typ}_{dt}", registry["compare_element"])
            check(result, expected, typ, version)
    return loaded
def compare_series_ts(result, expected, typ, version):
    """Compare a time-series ``Series`` and check its index frequency.

    Besides series equality, this checks that the index frequency
    survived, is not normalized, and still supports arithmetic with
    ``Day`` and ``Timedelta``. ``typ`` and ``version`` are accepted for
    signature compatibility and are not used.
    """
    # GH 7748
    tm.assert_series_equal(result, expected)
    freq = result.index.freq
    assert freq == expected.index.freq
    assert not freq.normalize
    tm.assert_series_equal(result > 0, expected > 0)

    # GH 9291
    assert freq + Day(1) == Day(2)
    # Adding a Timedelta to the frequency must yield a Timedelta that is
    # one day larger than the added amount.
    for extra in ({"hours": 1}, {"nanoseconds": 1}):
        shifted = freq + pd.Timedelta(**extra)
        assert isinstance(shifted, pd.Timedelta)
        assert shifted == pd.Timedelta(days=1, **extra)
def compare_series_dt_tz(result, expected, typ, version):
    """Assert that two series are equal (comparator for ``series``/``dt_tz``).

    ``typ`` and ``version`` are accepted for signature compatibility and
    are not used.
    """
    tm.assert_series_equal(left=result, right=expected)
def compare_series_cat(result, expected, typ, version):
    """Assert that two series are equal (comparator for ``series``/``cat``).

    ``typ`` and ``version`` are accepted for signature compatibility and
    are not used.
    """
    tm.assert_series_equal(left=result, right=expected)
def compare_frame_dt_mixed_tzs(result, expected, typ, version):
    """Assert that two frames are equal (comparator for ``frame``/``dt_mixed_tzs``).

    ``typ`` and ``version`` are accepted for signature compatibility and
    are not used.
    """
    tm.assert_frame_equal(left=result, right=expected)
def compare_frame_cat_onecol(result, expected, typ, version):
    """Assert that two frames are equal (comparator for ``frame``/``cat_onecol``).

    ``typ`` and ``version`` are accepted for signature compatibility and
    are not used.
    """
    tm.assert_frame_equal(left=result, right=expected)
def compare_frame_cat_and_float(result, expected, typ, version):
    """Compare frames for ``frame``/``cat_and_float``.

    Delegates to ``compare_frame_cat_onecol`` with the same arguments.
    """
    compare_frame_cat_onecol(
        result=result, expected=expected, typ=typ, version=version
    )
def compare_index_period(result, expected, typ, version):
    """Compare period indexes and check the monthly frequency is intact.

    In addition to index equality, the frequency must be a ``MonthEnd``
    with alias ``"M"`` and shifting must behave the same on both indexes.
    ``typ`` and ``version`` are accepted for signature compatibility and
    are not used.
    """
    tm.assert_index_equal(result, expected)

    freq = result.freq
    assert isinstance(freq, MonthEnd)
    assert freq == MonthEnd()
    assert result.freqstr == "M"

    periods = 2
    tm.assert_index_equal(result.shift(periods), expected.shift(periods))
# Every ``*.pickle`` file found one directory level below
# ``data/legacy_pickle`` next to this module. The list is used as the
# parameter set of the ``legacy_pickle`` fixture.
files = glob.glob(
os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle")
)
@pytest.fixture(params=files)
def legacy_pickle(request, datapath):
    """Resolve one entry of ``files`` through the ``datapath`` fixture."""
    pickle_path = request.param
    return datapath(pickle_path)
# ---------------------
# tests
# ---------------------
def test_pickles(current_pickle_data, legacy_pickle):
    """Run the comparators over one legacy pickle file.

    The version string handed to ``compare`` is the name of the directory
    that contains the pickle file.
    """
    if not is_platform_little_endian():
        pytest.skip("known failure on non-little endian")

    version_dir = os.path.dirname(legacy_pickle)
    version = os.path.basename(version_dir)

    # Warnings raised while loading are recorded and ignored.
    with catch_warnings(record=True):
        simplefilter("ignore")
        compare(current_pickle_data, legacy_pickle, version)
def python_pickler(obj, path):
    """Pickle ``obj`` into the file at ``path`` with the stdlib pickler.

    ``protocol=-1`` selects the highest protocol available to the running
    interpreter. The file is created or truncated.
    """
    with open(path, "wb") as sink:
        pickle.Pickler(sink, protocol=-1).dump(obj)
def python_unpickler(path):
    """Load and return the object pickled in the file at ``path``.

    Uses the stdlib ``pickle.load``; only call this on trusted files, since
    unpickling can execute arbitrary code.
    """
    # A freshly opened file is already positioned at offset 0, so the
    # explicit ``seek(0)`` the original performed was a no-op and is dropped.
    with open(path, "rb") as fh:
        return pickle.load(fh)
@pytest.mark.parametrize(
    "pickle_writer",
    [
        pytest.param(python_pickler, id="python"),
        pytest.param(pd.to_pickle, id="pandas_proto_default"),
        pytest.param(
            partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL),
            id="pandas_proto_highest",
        ),
        pytest.param(partial(pd.to_pickle, protocol=4), id="pandas_proto_4"),
        pytest.param(
            partial(pd.to_pickle, protocol=5),
            id="pandas_proto_5",
            marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"),
        ),
    ],
)
def test_round_trip_current(current_pickle_data, pickle_writer):
    """Write every current-version object and read it back with each reader.

    Each object is written with ``pickle_writer`` and read with both
    ``pd.read_pickle`` and ``python_unpickler``; a second pass reads the
    file through an open file object (GH 35679).
    """
    for typ, by_dtype in current_pickle_data.items():
        for expected in by_dtype.values():
            for writer in (pd.to_pickle, python_pickler):
                with tm.ensure_clean() as path:
                    # test writing with each pickler
                    pickle_writer(expected, path)

                    # test reading with each unpickler
                    for reader in (pd.read_pickle, python_unpickler):
                        compare_element(reader(path), expected, typ)

                    # and the same for file objects (GH 35679)
                    # NOTE(review): ``handle`` is opened for writing but
                    # ``writer`` is given ``path``, so the file object itself
                    # is never written to. Kept as in the original; confirm
                    # whether passing ``handle`` was intended.
                    with open(path, mode="wb") as handle:
                        writer(expected, path)
                        handle.seek(0)  # shouldn't close file handle
                    with open(path, mode="rb") as handle:
                        result = pd.read_pickle(handle)
                        handle.seek(0)  # shouldn't close file handle
                    compare_element(result, expected, typ)
def test_pickle_path_pathlib():
    """Round-trip a frame through ``tm.round_trip_pathlib``."""
    frame = tm.makeDataFrame()
    roundtripped = tm.round_trip_pathlib(frame.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(frame, roundtripped)
def test_pickle_path_localpath():
    """Round-trip a frame through ``tm.round_trip_localpath``."""
    frame = tm.makeDataFrame()
    roundtripped = tm.round_trip_localpath(frame.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(frame, roundtripped)
def test_legacy_sparse_warning(datapath):
    """
    Reading the legacy sparse pickles must emit a FutureWarning.

    Generated with
    >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 0, 1, 1]}).to_sparse()
    >>> df.to_pickle("pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz",
    ...              compression="gzip")
    >>> s = df['B']
    >>> s.to_pickle("pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz",
    ...             compression="gzip")
    """
    # Series first, then frame: the same order as the original stanzas.
    legacy_files = (
        "sparseseries-0.20.3.pickle.gz",
        "sparseframe-0.20.3.pickle.gz",
    )
    for filename in legacy_files:
        with tm.assert_produces_warning(FutureWarning):
            simplefilter("ignore", DeprecationWarning)  # from boto
            pd.read_pickle(
                datapath("io", "data", "pickle", filename),
                compression="gzip",
            )
# ---------------------
# test pickle compression
# ---------------------
@pytest.fixture
def get_random_path():
    """Return a pickle file name built around ``tm.rands(10)``."""
    token = tm.rands(10)
    return f"__{token}__.pickle"
class TestCompression:
    """Tests for writing and reading pickles with explicit or inferred compression."""

    # Extension the inference tests associate with each compression method.
    _compression_to_extension = {
        None: ".none",
        "gzip": ".gz",
        "bz2": ".bz2",
        "zip": ".zip",
        "xz": ".xz",
    }

    def _compression_for_extension(self, ext):
        """Return the method whose registered extension equals ``ext``.

        Returns None when no method is registered for ``ext``.
        """
        for method, extension in self._compression_to_extension.items():
            if extension == ext:
                return method
        return None

    def compress_file(self, src_path, dest_path, compression):
        """Write the content of ``src_path`` to ``dest_path`` using ``compression``.

        Parameters
        ----------
        src_path : str
            Existing file to read.
        dest_path : str
            File to create.
        compression : {None, "gzip", "bz2", "zip", "xz"}
            ``None`` copies the file unchanged.

        Raises
        ------
        ValueError
            If ``compression`` is not one of the recognised methods.
        """
        if compression is None:
            shutil.copyfile(src_path, dest_path)
            return

        if compression == "zip":
            # Zip archives store the file under its base name.
            with zipfile.ZipFile(
                dest_path, "w", compression=zipfile.ZIP_DEFLATED
            ) as archive:
                archive.write(src_path, os.path.basename(src_path))
            return

        if compression == "gzip":
            opener = gzip.open
        elif compression == "bz2":
            opener = bz2.BZ2File
        elif compression == "xz":
            opener = get_lzma_file(lzma)
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        # Open the source first and the destination inside the same ``with``
        # so neither handle is left open if the other fails to open.
        with open(src_path, "rb") as src, opener(dest_path, "w") as dest:
            dest.write(src.read())

    def test_write_explicit(self, compression, get_random_path):
        """Write with an explicit method, decompress by hand, read the raw pickle."""
        base = get_random_path
        path1 = base + ".compressed"
        path2 = base + ".raw"

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to compressed file
            df.to_pickle(p1, compression=compression)

            # decompress
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # read decompressed file
            df2 = pd.read_pickle(p2, compression=None)

            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"])
    def test_write_explicit_bad(self, compression, get_random_path):
        """Unknown compression names must raise ValueError on write."""
        with pytest.raises(ValueError, match="Unrecognized compression type"):
            with tm.ensure_clean(get_random_path) as path:
                df = tm.makeDataFrame()
                df.to_pickle(path, compression=compression)

    @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".no_compress", ".xz"])
    def test_write_infer(self, ext, get_random_path):
        """Write with the method inferred from ``ext`` and verify by manual decompression."""
        base = get_random_path
        path1 = base + ext
        path2 = base + ".raw"
        compression = self._compression_for_extension(ext)

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to compressed file by inferred compression method
            df.to_pickle(p1)

            # decompress
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # read decompressed file
            df2 = pd.read_pickle(p2, compression=None)

            tm.assert_frame_equal(df, df2)

    def test_read_explicit(self, compression, get_random_path):
        """Compress a raw pickle by hand and read it with an explicit method."""
        base = get_random_path
        path1 = base + ".raw"
        path2 = base + ".compressed"

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to uncompressed file
            df.to_pickle(p1, compression=None)

            # compress
            self.compress_file(p1, p2, compression=compression)

            # read compressed file
            df2 = pd.read_pickle(p2, compression=compression)

            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".zip", ".no_compress", ".xz"])
    def test_read_infer(self, ext, get_random_path):
        """Compress a raw pickle by hand and read it with the method inferred from ``ext``."""
        base = get_random_path
        path1 = base + ".raw"
        path2 = base + ext
        compression = self._compression_for_extension(ext)

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to uncompressed file
            df.to_pickle(p1, compression=None)

            # compress
            self.compress_file(p1, p2, compression=compression)

            # read compressed file by inferred compression method
            df2 = pd.read_pickle(p2)

            tm.assert_frame_equal(df, df2)
# ---------------------
# test pickle protocol
# ---------------------
class TestProtocol:
    """Check that pickles written with several protocols can be read back."""

    @pytest.mark.parametrize("protocol", [-1, 0, 1, 2])
    def test_read(self, protocol, get_random_path):
        """Write a frame with ``protocol`` and compare it with what is read back."""
        with tm.ensure_clean(get_random_path) as path:
            original = tm.makeDataFrame()
            original.to_pickle(path, protocol=protocol)
            restored = pd.read_pickle(path)
            tm.assert_frame_equal(original, restored)
@pytest.mark.parametrize(
    ["pickle_file", "excols"],
    [
        ("test_py27.pkl", pd.Index(["a", "b", "c"])),
        (
            "test_mi_py27.pkl",
            pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]),
        ),
    ],
)
def test_unicode_decode_error(datapath, pickle_file, excols):
    """Pickles written with py27 must load without UnicodeDecodeError.

    See GH#28645 and GH#31988.
    """
    loaded = pd.read_pickle(datapath("io", "data", "pickle", pickle_file))

    # only the columns are checked, since the values are random
    tm.assert_index_equal(loaded.columns, excols)
# ---------------------
# tests for buffer I/O
# ---------------------
def test_pickle_buffer_roundtrip():
    """Write to and read from open binary file objects instead of paths."""
    with tm.ensure_clean() as path:
        expected = tm.makeDataFrame()
        with open(path, "wb") as sink:
            expected.to_pickle(sink)
        with open(path, "rb") as source:
            roundtripped = pd.read_pickle(source)
        tm.assert_frame_equal(expected, roundtripped)
# ---------------------
# tests for URL I/O
# ---------------------
@pytest.mark.parametrize(
    "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"]
)
def test_pickle_generalurl_read(monkeypatch, mockurl):
    """Read a pickle through a URL with ``urllib.request.urlopen`` mocked out.

    The mock serves the bytes of a local temporary file, which is written
    with the module-level ``python_pickler`` helper. The original defined
    an identical nested copy of that helper, which shadowed the shared one.
    """

    class MockReadResponse:
        """Minimal stand-in for the response object returned by ``urlopen``."""

        def __init__(self, path):
            self.file = open(path, "rb")
            # NOTE(review): the substring check is made against the local
            # file path handed to the mock, not against the requested URL.
            encoding = "gzip" if "gzip" in path else None
            self.headers = {"Content-Encoding": encoding}

        def read(self):
            return self.file.read()

        def close(self):
            return self.file.close()

    with tm.ensure_clean() as path:

        def mock_urlopen_read(*args, **kwargs):
            return MockReadResponse(path)

        df = tm.makeDataFrame()
        python_pickler(df, path)
        monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read)
        result = pd.read_pickle(mockurl)
        tm.assert_frame_equal(df, result)
@td.skip_if_no("fsspec")
def test_pickle_fsspec_roundtrip():
    """Round-trip a frame through a ``memory://`` URL."""
    with tm.ensure_clean():
        url = "memory://afile"
        expected = tm.makeDataFrame()
        expected.to_pickle(url)
        roundtripped = pd.read_pickle(url)
        tm.assert_frame_equal(expected, roundtripped)
class MyTz(datetime.tzinfo):
    """``tzinfo`` subclass with an argument-free, do-nothing ``__init__``.

    Used by ``test_read_pickle_with_subclass`` as the subclass instance
    that is round-tripped through pickle.
    """

    def __init__(self):
        """Accept no arguments and set no state."""
def test_read_pickle_with_subclass():
    """A tuple holding a Series and a ``MyTz`` instance survives pickling (GH 12163)."""
    # GH 12163
    expected = (Series(dtype=object), MyTz())
    result = tm.round_trip_pickle(expected)

    result_series, result_tz = result[0], result[1]
    tm.assert_series_equal(result_series, expected[0])
    assert isinstance(result_tz, MyTz)
def test_pickle_binary_object_compression(compression):
    """
    Read/write from binary file-objects w/wo compression.

    GH 26237, GH 29054, and GH 29570
    """
    df = tm.makeDataFrame()

    # reference for compression
    with tm.ensure_clean() as path:
        df.to_pickle(path, compression=compression)
        with open(path, "rb") as fh:
            reference = fh.read()

    # write
    buffer = io.BytesIO()
    df.to_pickle(buffer, compression=compression)
    buffer.seek(0)

    # gzip and zip save the filename: cannot compare the compressed content
    written = buffer.getvalue()
    if written != reference:
        assert compression in ("gzip", "zip")

    # read
    read_df = pd.read_pickle(buffer, compression=compression)
    buffer.seek(0)
    tm.assert_frame_equal(df, read_df)
def test_pickle_dataframe_with_multilevel_index(
    multiindex_year_month_day_dataframe_random_data,
    multiindex_dataframe_random_data,
):
    """Both fixture frames and their transposes survive a pickle round trip."""
    ymd = multiindex_year_month_day_dataframe_random_data
    frame = multiindex_dataframe_random_data

    # Same order as the original: frame, frame.T, ymd, ymd.T.
    for candidate in (frame, frame.T, ymd, ymd.T):
        unpickled = tm.round_trip_pickle(candidate)
        tm.assert_frame_equal(candidate, unpickled)
def test_pickle_timeseries_periodindex():
    """The monthly frequency of a period-indexed series survives pickling (GH#2891)."""
    # GH#2891
    prng = period_range("1/1/2011", "1/1/2012", freq="M")
    values = np.random.randn(len(prng))
    ts = Series(values, prng)

    restored = tm.round_trip_pickle(ts)
    assert restored.index.freq == "M"
@pytest.mark.parametrize(
    "name", [777, 777.0, "name", datetime.datetime(2001, 11, 11), (1, 2)]
)
def test_pickle_preserve_name(name):
    """The ``name`` of a series is unchanged after a pickle round trip."""
    series = tm.makeTimeSeries(name=name)
    unpickled = tm.round_trip_pickle(series)
    assert unpickled.name == name
def test_pickle_datetimes(datetime_series):
    """The ``datetime_series`` fixture survives a pickle round trip."""
    roundtripped = tm.round_trip_pickle(datetime_series)
    tm.assert_series_equal(left=roundtripped, right=datetime_series)
def test_pickle_strings(string_series):
    """The ``string_series`` fixture survives a pickle round trip."""
    roundtripped = tm.round_trip_pickle(string_series)
    tm.assert_series_equal(left=roundtripped, right=string_series)
def test_pickle_preserves_block_ndim():
    """A one-row categorical series keeps a 1-D block after pickling (GH#37631)."""
    # GH#37631
    categorical = Series(["a", "b", "c"]).astype("category")
    ser = categorical.iloc[[0]]
    res = tm.round_trip_pickle(ser)

    first_block = res._mgr.blocks[0]
    assert first_block.ndim == 1
    assert first_block.shape == (1,)

    # GH#37631 OP issue was about indexing, underlying problem was pickle
    tm.assert_series_equal(res[[True]], ser)
@pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL])
def test_pickle_big_dataframe_compression(protocol, compression):
    """Round-trip a 100000-row frame for each protocol/compression pair (GH#39002)."""
    # GH#39002
    df = pd.DataFrame(range(100000))
    writer = partial(df.to_pickle, protocol=protocol, compression=compression)
    reader = partial(pd.read_pickle, compression=compression)
    result = tm.round_trip_pathlib(writer, reader)
    tm.assert_frame_equal(df, result)