Inzynierka/Lib/site-packages/pandas/tests/io/pytables/test_store.py
2023-06-02 12:51:02 +02:00

987 lines
31 KiB
Python

import datetime as dt
import hashlib
import os
import tempfile
import time
from warnings import (
catch_warnings,
simplefilter,
)
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
concat,
date_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
safe_close,
)
_default_compressor = "blosc"
from pandas.io.pytables import (
HDFStore,
read_hdf,
)
pytestmark = pytest.mark.single_cpu
def test_context(setup_path):
with tm.ensure_clean(setup_path) as path:
try:
with HDFStore(path) as tbl:
raise ValueError("blah")
except ValueError:
pass
with tm.ensure_clean(setup_path) as path:
with HDFStore(path) as tbl:
tbl["a"] = tm.makeDataFrame()
assert len(tbl) == 1
assert type(tbl["a"]) == DataFrame
def test_no_track_times(tmp_path, setup_path):
# GH 32682
# enables to set track_times (see `pytables` `create_table` documentation)
def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128):
h = hash_factory()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""):
h.update(chunk)
return h.digest()
def create_h5_and_return_checksum(tmp_path, track_times):
path = tmp_path / setup_path
df = DataFrame({"a": [1]})
with HDFStore(path, mode="w") as hdf:
hdf.put(
"table",
df,
format="table",
data_columns=True,
index=None,
track_times=track_times,
)
return checksum(path)
checksum_0_tt_false = create_h5_and_return_checksum(tmp_path, track_times=False)
checksum_0_tt_true = create_h5_and_return_checksum(tmp_path, track_times=True)
# sleep is necessary to create h5 with different creation time
time.sleep(1)
checksum_1_tt_false = create_h5_and_return_checksum(tmp_path, track_times=False)
checksum_1_tt_true = create_h5_and_return_checksum(tmp_path, track_times=True)
# checksums are the same if track_time = False
assert checksum_0_tt_false == checksum_1_tt_false
# checksums are NOT same if track_time = True
assert checksum_0_tt_true != checksum_1_tt_true
def test_iter_empty(setup_path):
with ensure_clean_store(setup_path) as store:
# GH 12221
assert list(store) == []
def test_repr(setup_path):
with ensure_clean_store(setup_path) as store:
repr(store)
store.info()
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeStringSeries()
store["c"] = tm.makeDataFrame()
df = tm.makeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["bool1"] = df["A"] > 0
df["bool2"] = df["B"] > 0
df["bool3"] = True
df["int1"] = 1
df["int2"] = 2
df["timestamp1"] = Timestamp("20010102")
df["timestamp2"] = Timestamp("20010103")
df["datetime1"] = dt.datetime(2001, 1, 2, 0, 0)
df["datetime2"] = dt.datetime(2001, 1, 3, 0, 0)
df.loc[df.index[3:6], ["obj1"]] = np.nan
df = df._consolidate()
with catch_warnings(record=True):
simplefilter("ignore", pd.errors.PerformanceWarning)
store["df"] = df
# make a random group in hdf space
store._handle.create_group(store._handle.root, "bah")
assert store.filename in repr(store)
assert store.filename in str(store)
store.info()
# storers
with ensure_clean_store(setup_path) as store:
df = tm.makeDataFrame()
store.append("df", df)
s = store.get_storer("df")
repr(s)
str(s)
def test_contains(setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeDataFrame()
store["foo/bar"] = tm.makeDataFrame()
assert "a" in store
assert "b" in store
assert "c" not in store
assert "foo/bar" in store
assert "/foo/bar" in store
assert "/foo/b" not in store
assert "bar" not in store
# gh-2694: tables.NaturalNameWarning
with catch_warnings(record=True):
store["node())"] = tm.makeDataFrame()
assert "node())" in store
def test_versioning(setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeDataFrame()
df = tm.makeTimeDataFrame()
_maybe_remove(store, "df1")
store.append("df1", df[:10])
store.append("df1", df[10:])
assert store.root.a._v_attrs.pandas_version == "0.15.2"
assert store.root.b._v_attrs.pandas_version == "0.15.2"
assert store.root.df1._v_attrs.pandas_version == "0.15.2"
# write a file and wipe its versioning
_maybe_remove(store, "df2")
store.append("df2", df)
# this is an error because its table_type is appendable, but no
# version info
store.get_node("df2")._v_attrs.pandas_version = None
msg = "'NoneType' object has no attribute 'startswith'"
with pytest.raises(Exception, match=msg):
store.select("df2")
@pytest.mark.parametrize(
"where, expected",
[
(
"/",
{
"": ({"first_group", "second_group"}, set()),
"/first_group": (set(), {"df1", "df2"}),
"/second_group": ({"third_group"}, {"df3", "s1"}),
"/second_group/third_group": (set(), {"df4"}),
},
),
(
"/second_group",
{
"/second_group": ({"third_group"}, {"df3", "s1"}),
"/second_group/third_group": (set(), {"df4"}),
},
),
],
)
def test_walk(where, expected):
# GH10143
objs = {
"df1": DataFrame([1, 2, 3]),
"df2": DataFrame([4, 5, 6]),
"df3": DataFrame([6, 7, 8]),
"df4": DataFrame([9, 10, 11]),
"s1": Series([10, 9, 8]),
# Next 3 items aren't pandas objects and should be ignored
"a1": np.array([[1, 2, 3], [4, 5, 6]]),
"tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"),
"tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"),
}
with ensure_clean_store("walk_groups.hdf", mode="w") as store:
store.put("/first_group/df1", objs["df1"])
store.put("/first_group/df2", objs["df2"])
store.put("/second_group/df3", objs["df3"])
store.put("/second_group/s1", objs["s1"])
store.put("/second_group/third_group/df4", objs["df4"])
# Create non-pandas objects
store._handle.create_array("/first_group", "a1", objs["a1"])
store._handle.create_table("/first_group", "tb1", obj=objs["tb1"])
store._handle.create_table("/second_group", "tb2", obj=objs["tb2"])
assert len(list(store.walk(where=where))) == len(expected)
for path, groups, leaves in store.walk(where=where):
assert path in expected
expected_groups, expected_frames = expected[path]
assert expected_groups == set(groups)
assert expected_frames == set(leaves)
for leaf in leaves:
frame_path = "/".join([path, leaf])
obj = store.get(frame_path)
if "df" in leaf:
tm.assert_frame_equal(obj, objs[leaf])
else:
tm.assert_series_equal(obj, objs[leaf])
def test_getattr(setup_path):
with ensure_clean_store(setup_path) as store:
s = tm.makeTimeSeries()
store["a"] = s
# test attribute access
result = store.a
tm.assert_series_equal(result, s)
result = getattr(store, "a")
tm.assert_series_equal(result, s)
df = tm.makeTimeDataFrame()
store["df"] = df
result = store.df
tm.assert_frame_equal(result, df)
# errors
for x in ["d", "mode", "path", "handle", "complib"]:
msg = f"'HDFStore' object has no attribute '{x}'"
with pytest.raises(AttributeError, match=msg):
getattr(store, x)
# not stores
for x in ["mode", "path", "handle", "complib"]:
getattr(store, f"_{x}")
def test_store_dropna(tmp_path, setup_path):
df_with_missing = DataFrame(
{"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]},
index=list("abc"),
)
df_without_missing = DataFrame(
{"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac")
)
# # Test to make sure defaults are to not drop.
# # Corresponding to Issue 9382
path = tmp_path / setup_path
df_with_missing.to_hdf(path, "df", format="table")
reloaded = read_hdf(path, "df")
tm.assert_frame_equal(df_with_missing, reloaded)
path = tmp_path / setup_path
df_with_missing.to_hdf(path, "df", format="table", dropna=False)
reloaded = read_hdf(path, "df")
tm.assert_frame_equal(df_with_missing, reloaded)
path = tmp_path / setup_path
df_with_missing.to_hdf(path, "df", format="table", dropna=True)
reloaded = read_hdf(path, "df")
tm.assert_frame_equal(df_without_missing, reloaded)
def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
path = tmp_path / setup_path
# min_itemsize in index with to_hdf (GH 10381)
df = tm.makeMixedDataFrame().set_index("C")
df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6})
# just make sure there is a longer string:
df2 = df.copy().reset_index().assign(C="longer").set_index("C")
df2.to_hdf(path, "ss3", append=True, format="table")
tm.assert_frame_equal(read_hdf(path, "ss3"), concat([df, df2]))
# same as above, with a Series
df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6})
df2["B"].to_hdf(path, "ss4", append=True, format="table")
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_to_hdf_errors(tmp_path, format, setup_path):
data = ["\ud800foo"]
ser = Series(data, index=Index(data))
path = tmp_path / setup_path
# GH 20835
ser.to_hdf(path, "table", format=format, errors="surrogatepass")
result = read_hdf(path, "table", errors="surrogatepass")
tm.assert_series_equal(result, ser)
def test_create_table_index(setup_path):
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
def col(t, column):
return getattr(store.get_storer(t).table.cols, column)
# data columns
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df["string2"] = "bar"
store.append("f", df, data_columns=["string", "string2"])
assert col("f", "index").is_indexed is True
assert col("f", "string").is_indexed is True
assert col("f", "string2").is_indexed is True
# specify index=columns
store.append("f2", df, index=["string"], data_columns=["string", "string2"])
assert col("f2", "index").is_indexed is False
assert col("f2", "string").is_indexed is True
assert col("f2", "string2").is_indexed is False
# try to index a non-table
_maybe_remove(store, "f2")
store.put("f2", df)
msg = "cannot create table index on a Fixed format store"
with pytest.raises(TypeError, match=msg):
store.create_table_index("f2")
def test_create_table_index_data_columns_argument(setup_path):
# GH 28156
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
def col(t, column):
return getattr(store.get_storer(t).table.cols, column)
# data columns
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df["string2"] = "bar"
store.append("f", df, data_columns=["string"])
assert col("f", "index").is_indexed is True
assert col("f", "string").is_indexed is True
msg = "'Cols' object has no attribute 'string2'"
with pytest.raises(AttributeError, match=msg):
col("f", "string2").is_indexed
# try to index a col which isn't a data_column
msg = (
"column string2 is not a data_column.\n"
"In order to read column string2 you must reload the dataframe \n"
"into HDFStore and include string2 with the data_columns argument."
)
with pytest.raises(AttributeError, match=msg):
store.create_table_index("f", columns=["string2"])
def test_mi_data_columns(setup_path):
# GH 14435
idx = MultiIndex.from_arrays(
[date_range("2000-01-01", periods=5), range(5)], names=["date", "id"]
)
df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx)
with ensure_clean_store(setup_path) as store:
store.append("df", df, data_columns=True)
actual = store.select("df", where="id == 1")
expected = df.iloc[[1], :]
tm.assert_frame_equal(actual, expected)
def test_table_mixed_dtypes(setup_path):
# frame
df = tm.makeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["bool1"] = df["A"] > 0
df["bool2"] = df["B"] > 0
df["bool3"] = True
df["int1"] = 1
df["int2"] = 2
df["timestamp1"] = Timestamp("20010102")
df["timestamp2"] = Timestamp("20010103")
df["datetime1"] = dt.datetime(2001, 1, 2, 0, 0)
df["datetime2"] = dt.datetime(2001, 1, 3, 0, 0)
df.loc[df.index[3:6], ["obj1"]] = np.nan
df = df._consolidate()
with ensure_clean_store(setup_path) as store:
store.append("df1_mixed", df)
tm.assert_frame_equal(store.select("df1_mixed"), df)
def test_calendar_roundtrip_issue(setup_path):
# 8591
# doc example from tseries holiday section
weekmask_egypt = "Sun Mon Tue Wed Thu"
holidays = [
"2012-05-01",
dt.datetime(2013, 5, 1),
np.datetime64("2014-05-01"),
]
bday_egypt = pd.offsets.CustomBusinessDay(
holidays=holidays, weekmask=weekmask_egypt
)
mydt = dt.datetime(2013, 4, 30)
dts = date_range(mydt, periods=5, freq=bday_egypt)
s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split()))
with ensure_clean_store(setup_path) as store:
store.put("fixed", s)
result = store.select("fixed")
tm.assert_series_equal(result, s)
store.append("table", s)
result = store.select("table")
tm.assert_series_equal(result, s)
def test_remove(setup_path):
with ensure_clean_store(setup_path) as store:
ts = tm.makeTimeSeries()
df = tm.makeDataFrame()
store["a"] = ts
store["b"] = df
_maybe_remove(store, "a")
assert len(store) == 1
tm.assert_frame_equal(df, store["b"])
_maybe_remove(store, "b")
assert len(store) == 0
# nonexistence
with pytest.raises(
KeyError, match="'No object named a_nonexistent_store in the file'"
):
store.remove("a_nonexistent_store")
# pathing
store["a"] = ts
store["b/foo"] = df
_maybe_remove(store, "foo")
_maybe_remove(store, "b/foo")
assert len(store) == 1
store["a"] = ts
store["b/foo"] = df
_maybe_remove(store, "b")
assert len(store) == 1
# __delitem__
store["a"] = ts
store["b"] = df
del store["a"]
del store["b"]
assert len(store) == 0
def test_same_name_scoping(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(np.random.randn(20, 2), index=date_range("20130101", periods=20))
store.put("df", df, format="table")
expected = df[df.index > Timestamp("20130105")]
result = store.select("df", "index>datetime.datetime(2013,1,5)")
tm.assert_frame_equal(result, expected)
# changes what 'datetime' points to in the namespace where
# 'select' does the lookup
# technically an error, but allow it
result = store.select("df", "index>datetime.datetime(2013,1,5)")
tm.assert_frame_equal(result, expected)
result = store.select("df", "index>datetime(2013,1,5)")
tm.assert_frame_equal(result, expected)
def test_store_index_name(setup_path):
df = tm.makeDataFrame()
df.index.name = "foo"
with ensure_clean_store(setup_path) as store:
store["frame"] = df
recons = store["frame"]
tm.assert_frame_equal(recons, df)
@pytest.mark.parametrize("table_format", ["table", "fixed"])
def test_store_index_name_numpy_str(tmp_path, table_format, setup_path):
# GH #13492
idx = Index(
pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]),
name="cols\u05d2",
)
idx1 = Index(
pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]),
name="rows\u05d0",
)
df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
# This used to fail, returning numpy strings instead of python strings.
path = tmp_path / setup_path
df.to_hdf(path, "df", format=table_format)
df2 = read_hdf(path, "df")
tm.assert_frame_equal(df, df2, check_names=True)
assert type(df2.index.name) == str
assert type(df2.columns.name) == str
def test_store_series_name(setup_path):
df = tm.makeDataFrame()
series = df["A"]
with ensure_clean_store(setup_path) as store:
store["series"] = series
recons = store["series"]
tm.assert_series_equal(recons, series)
def test_overwrite_node(setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeDataFrame()
ts = tm.makeTimeSeries()
store["a"] = ts
tm.assert_series_equal(store["a"], ts)
def test_coordinates(setup_path):
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
_maybe_remove(store, "df")
store.append("df", df)
# all
c = store.select_as_coordinates("df")
assert (c.values == np.arange(len(df.index))).all()
# get coordinates back & test vs frame
_maybe_remove(store, "df")
df = DataFrame({"A": range(5), "B": range(5)})
store.append("df", df)
c = store.select_as_coordinates("df", ["index<3"])
assert (c.values == np.arange(3)).all()
result = store.select("df", where=c)
expected = df.loc[0:2, :]
tm.assert_frame_equal(result, expected)
c = store.select_as_coordinates("df", ["index>=3", "index<=4"])
assert (c.values == np.arange(2) + 3).all()
result = store.select("df", where=c)
expected = df.loc[3:4, :]
tm.assert_frame_equal(result, expected)
assert isinstance(c, Index)
# multiple tables
_maybe_remove(store, "df1")
_maybe_remove(store, "df2")
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
store.append("df1", df1, data_columns=["A", "B"])
store.append("df2", df2)
c = store.select_as_coordinates("df1", ["A>0", "B>0"])
df1_result = store.select("df1", c)
df2_result = store.select("df2", c)
result = concat([df1_result, df2_result], axis=1)
expected = concat([df1, df2], axis=1)
expected = expected[(expected.A > 0) & (expected.B > 0)]
tm.assert_frame_equal(result, expected, check_freq=False)
# FIXME: 2021-01-18 on some (mostly windows) builds we get freq=None
# but expect freq="18B"
# pass array/mask as the coordinates
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.randn(1000, 2), index=date_range("20000101", periods=1000)
)
store.append("df", df)
c = store.select_column("df", "index")
where = c[DatetimeIndex(c).month == 5].index
expected = df.iloc[where]
# locations
result = store.select("df", where=where)
tm.assert_frame_equal(result, expected)
# boolean
result = store.select("df", where=where)
tm.assert_frame_equal(result, expected)
# invalid
msg = (
"where must be passed as a string, PyTablesExpr, "
"or list-like of PyTablesExpr"
)
with pytest.raises(TypeError, match=msg):
store.select("df", where=np.arange(len(df), dtype="float64"))
with pytest.raises(TypeError, match=msg):
store.select("df", where=np.arange(len(df) + 1))
with pytest.raises(TypeError, match=msg):
store.select("df", where=np.arange(len(df)), start=5)
with pytest.raises(TypeError, match=msg):
store.select("df", where=np.arange(len(df)), start=5, stop=10)
# selection with filter
selection = date_range("20000101", periods=500)
result = store.select("df", where="index in selection")
expected = df[df.index.isin(selection)]
tm.assert_frame_equal(result, expected)
# list
df = DataFrame(np.random.randn(10, 2))
store.append("df2", df)
result = store.select("df2", where=[0, 3, 5])
expected = df.iloc[[0, 3, 5]]
tm.assert_frame_equal(result, expected)
# boolean
where = [True] * 10
where[-2] = False
result = store.select("df2", where=where)
expected = df.loc[where]
tm.assert_frame_equal(result, expected)
# start/stop
result = store.select("df2", start=5, stop=10)
expected = df[5:10]
tm.assert_frame_equal(result, expected)
def test_start_stop_table(setup_path):
with ensure_clean_store(setup_path) as store:
# table
df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
store.append("df", df)
result = store.select("df", "columns=['A']", start=0, stop=5)
expected = df.loc[0:4, ["A"]]
tm.assert_frame_equal(result, expected)
# out of range
result = store.select("df", "columns=['A']", start=30, stop=40)
assert len(result) == 0
expected = df.loc[30:40, ["A"]]
tm.assert_frame_equal(result, expected)
def test_start_stop_multiple(setup_path):
# GH 16209
with ensure_clean_store(setup_path) as store:
df = DataFrame({"foo": [1, 2], "bar": [1, 2]})
store.append_to_multiple(
{"selector": ["foo"], "data": None}, df, selector="selector"
)
result = store.select_as_multiple(
["selector", "data"], selector="selector", start=0, stop=1
)
expected = df.loc[[0], ["foo", "bar"]]
tm.assert_frame_equal(result, expected)
def test_start_stop_fixed(setup_path):
with ensure_clean_store(setup_path) as store:
# fixed, GH 8287
df = DataFrame(
{"A": np.random.rand(20), "B": np.random.rand(20)},
index=date_range("20130101", periods=20),
)
store.put("df", df)
result = store.select("df", start=0, stop=5)
expected = df.iloc[0:5, :]
tm.assert_frame_equal(result, expected)
result = store.select("df", start=5, stop=10)
expected = df.iloc[5:10, :]
tm.assert_frame_equal(result, expected)
# out of range
result = store.select("df", start=30, stop=40)
expected = df.iloc[30:40, :]
tm.assert_frame_equal(result, expected)
# series
s = df.A
store.put("s", s)
result = store.select("s", start=0, stop=5)
expected = s.iloc[0:5]
tm.assert_series_equal(result, expected)
result = store.select("s", start=5, stop=10)
expected = s.iloc[5:10]
tm.assert_series_equal(result, expected)
# sparse; not implemented
df = tm.makeDataFrame()
df.iloc[3:5, 1:3] = np.nan
df.iloc[8:10, -2] = np.nan
def test_select_filter_corner(setup_path):
df = DataFrame(np.random.randn(50, 100))
df.index = [f"{c:3d}" for c in df.index]
df.columns = [f"{c:3d}" for c in df.columns]
with ensure_clean_store(setup_path) as store:
store.put("frame", df, format="table")
crit = "columns=df.columns[:75]"
result = store.select("frame", [crit])
tm.assert_frame_equal(result, df.loc[:, df.columns[:75]])
crit = "columns=df.columns[:75:2]"
result = store.select("frame", [crit])
tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])
def test_path_pathlib():
df = tm.makeDataFrame()
result = tm.round_trip_pathlib(
lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df")
)
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)])
def test_contiguous_mixed_data_table(start, stop, setup_path):
# GH 17021
df = DataFrame(
{
"a": Series([20111010, 20111011, 20111012]),
"b": Series(["ab", "cd", "ab"]),
}
)
with ensure_clean_store(setup_path) as store:
store.append("test_dataset", df)
result = store.select("test_dataset", start=start, stop=stop)
tm.assert_frame_equal(df[start:stop], result)
def test_path_pathlib_hdfstore():
df = tm.makeDataFrame()
def writer(path):
with HDFStore(path) as store:
df.to_hdf(store, "df")
def reader(path):
with HDFStore(path) as store:
return read_hdf(store, "df")
result = tm.round_trip_pathlib(writer, reader)
tm.assert_frame_equal(df, result)
def test_pickle_path_localpath():
df = tm.makeDataFrame()
result = tm.round_trip_pathlib(
lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df")
)
tm.assert_frame_equal(df, result)
def test_path_localpath_hdfstore():
df = tm.makeDataFrame()
def writer(path):
with HDFStore(path) as store:
df.to_hdf(store, "df")
def reader(path):
with HDFStore(path) as store:
return read_hdf(store, "df")
result = tm.round_trip_localpath(writer, reader)
tm.assert_frame_equal(df, result)
def test_copy():
with catch_warnings(record=True):
def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs):
if new_f is None:
fd, new_f = tempfile.mkstemp()
try:
store = HDFStore(f, "r")
tstore = store.copy(new_f, keys=keys, propindexes=propindexes, **kwargs)
# check keys
if keys is None:
keys = store.keys()
assert set(keys) == set(tstore.keys())
# check indices & nrows
for k in tstore.keys():
if tstore.get_storer(k).is_table:
new_t = tstore.get_storer(k)
orig_t = store.get_storer(k)
assert orig_t.nrows == new_t.nrows
# check propindixes
if propindexes:
for a in orig_t.axes:
if a.is_indexed:
assert new_t[a.name].is_indexed
finally:
safe_close(store)
safe_close(tstore)
try:
os.close(fd)
except (OSError, ValueError):
pass
os.remove(new_f)
# new table
df = tm.makeDataFrame()
with tm.ensure_clean() as path:
with HDFStore(path) as st:
st.append("df", df, data_columns=["A"])
do_copy(f=path)
do_copy(f=path, propindexes=False)
def test_duplicate_column_name(tmp_path, setup_path):
df = DataFrame(columns=["a", "a"], data=[[0, 0]])
path = tmp_path / setup_path
msg = "Columns index has to be unique for fixed format"
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, "df", format="fixed")
df.to_hdf(path, "df", format="table")
other = read_hdf(path, "df")
tm.assert_frame_equal(df, other)
assert df.equals(other)
assert other.equals(df)
def test_preserve_timedeltaindex_type(setup_path):
# GH9635
df = DataFrame(np.random.normal(size=(10, 5)))
df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example")
with ensure_clean_store(setup_path) as store:
store["df"] = df
tm.assert_frame_equal(store["df"], df)
def test_columns_multiindex_modified(tmp_path, setup_path):
# BUG: 7212
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
df.index.name = "letters"
df = df.set_index(keys="E", append=True)
data_columns = df.index.names + df.columns.tolist()
path = tmp_path / setup_path
df.to_hdf(
path,
"df",
mode="a",
append=True,
data_columns=data_columns,
index=False,
)
cols2load = list("BCD")
cols2load_original = list(cols2load)
# GH#10055 make sure read_hdf call does not alter cols2load inplace
read_hdf(path, "df", columns=cols2load)
assert cols2load_original == cols2load
def test_to_hdf_with_object_column_names(tmp_path, setup_path):
# GH9057
types_should_fail = [
tm.makeIntIndex,
tm.makeFloatIndex,
tm.makeDateIndex,
tm.makeTimedeltaIndex,
tm.makePeriodIndex,
]
types_should_run = [
tm.makeStringIndex,
tm.makeCategoricalIndex,
]
for index in types_should_fail:
df = DataFrame(np.random.randn(10, 2), columns=index(2))
path = tmp_path / setup_path
with catch_warnings(record=True):
msg = "cannot have non-object label DataIndexableCol"
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, "df", format="table", data_columns=True)
for index in types_should_run:
df = DataFrame(np.random.randn(10, 2), columns=index(2))
path = tmp_path / setup_path
with catch_warnings(record=True):
df.to_hdf(path, "df", format="table", data_columns=True)
result = read_hdf(path, "df", where=f"index = [{df.index[0]}]")
assert len(result)
def test_hdfstore_strides(setup_path):
# GH22073
df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
with ensure_clean_store(setup_path) as store:
store.put("df", df)
assert df["a"].values.strides == store["df"]["a"].values.strides
def test_store_bool_index(tmp_path, setup_path):
# GH#48667
df = DataFrame([[1]], columns=[True], index=Index([False], dtype="bool"))
expected = df.copy()
# # Test to make sure defaults are to not drop.
# # Corresponding to Issue 9382
path = tmp_path / setup_path
df.to_hdf(path, "a")
result = read_hdf(path, "a")
tm.assert_frame_equal(expected, result)