projektAI/venv/Lib/site-packages/pandas/tests/io/pytables/test_store.py
2021-06-06 22:13:05 +02:00

4899 lines
177 KiB
Python

import datetime
from datetime import timedelta
from distutils.version import LooseVersion
import hashlib
from io import BytesIO
import os
from pathlib import Path
import re
import time
from warnings import catch_warnings, simplefilter
import numpy as np
import pytest
from pandas.compat import is_platform_little_endian, is_platform_windows
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
Int64Index,
MultiIndex,
RangeIndex,
Series,
Timestamp,
bdate_range,
concat,
date_range,
isna,
timedelta_range,
)
import pandas._testing as tm
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_path,
ensure_clean_store,
safe_close,
tables,
)
from pandas.io.pytables import (
ClosedFileError,
HDFStore,
PossibleDataLossError,
Term,
_maybe_adjust_name,
read_hdf,
)
from pandas.io import pytables as pytables # isort:skip
from pandas.io.pytables import TableIterator # isort:skip
_default_compressor = "blosc"
ignore_natural_naming_warning = pytest.mark.filterwarnings(
"ignore:object name:tables.exceptions.NaturalNameWarning"
)
@pytest.mark.single
class TestHDFStore:
def test_format_type(self, setup_path):
df = DataFrame({"A": [1, 2]})
with ensure_clean_path(setup_path) as path:
with HDFStore(path) as store:
store.put("a", df, format="fixed")
store.put("b", df, format="table")
assert store.get_storer("a").format_type == "fixed"
assert store.get_storer("b").format_type == "table"
def test_format_kwarg_in_constructor(self, setup_path):
# GH 13291
msg = "format is not a defined argument for HDFStore"
with tm.ensure_clean(setup_path) as path:
with pytest.raises(ValueError, match=msg):
HDFStore(path, format="table")
def test_context(self, setup_path):
with tm.ensure_clean(setup_path) as path:
try:
with HDFStore(path) as tbl:
raise ValueError("blah")
except ValueError:
pass
with tm.ensure_clean(setup_path) as path:
with HDFStore(path) as tbl:
tbl["a"] = tm.makeDataFrame()
assert len(tbl) == 1
assert type(tbl["a"]) == DataFrame
def test_conv_read_write(self, setup_path):
with tm.ensure_clean() as path:
def roundtrip(key, obj, **kwargs):
obj.to_hdf(path, key, **kwargs)
return read_hdf(path, key)
o = tm.makeTimeSeries()
tm.assert_series_equal(o, roundtrip("series", o))
o = tm.makeStringSeries()
tm.assert_series_equal(o, roundtrip("string_series", o))
o = tm.makeDataFrame()
tm.assert_frame_equal(o, roundtrip("frame", o))
# table
df = DataFrame({"A": range(5), "B": range(5)})
df.to_hdf(path, "table", append=True)
result = read_hdf(path, "table", where=["index>2"])
tm.assert_frame_equal(df[df.index > 2], result)
def test_long_strings(self, setup_path):
# GH6166
df = DataFrame(
{"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10)
)
with ensure_clean_store(setup_path) as store:
store.append("df", df, data_columns=["a"])
result = store.select("df")
tm.assert_frame_equal(df, result)
def test_api(self, setup_path):
# GH4584
# API issue when to_hdf doesn't accept append AND format args
with ensure_clean_path(setup_path) as path:
df = tm.makeDataFrame()
df.iloc[:10].to_hdf(path, "df", append=True, format="table")
df.iloc[10:].to_hdf(path, "df", append=True, format="table")
tm.assert_frame_equal(read_hdf(path, "df"), df)
# append to False
df.iloc[:10].to_hdf(path, "df", append=False, format="table")
df.iloc[10:].to_hdf(path, "df", append=True, format="table")
tm.assert_frame_equal(read_hdf(path, "df"), df)
with ensure_clean_path(setup_path) as path:
df = tm.makeDataFrame()
df.iloc[:10].to_hdf(path, "df", append=True)
df.iloc[10:].to_hdf(path, "df", append=True, format="table")
tm.assert_frame_equal(read_hdf(path, "df"), df)
# append to False
df.iloc[:10].to_hdf(path, "df", append=False, format="table")
df.iloc[10:].to_hdf(path, "df", append=True)
tm.assert_frame_equal(read_hdf(path, "df"), df)
with ensure_clean_path(setup_path) as path:
df = tm.makeDataFrame()
df.to_hdf(path, "df", append=False, format="fixed")
tm.assert_frame_equal(read_hdf(path, "df"), df)
df.to_hdf(path, "df", append=False, format="f")
tm.assert_frame_equal(read_hdf(path, "df"), df)
df.to_hdf(path, "df", append=False)
tm.assert_frame_equal(read_hdf(path, "df"), df)
df.to_hdf(path, "df")
tm.assert_frame_equal(read_hdf(path, "df"), df)
with ensure_clean_store(setup_path) as store:
path = store._path
df = tm.makeDataFrame()
_maybe_remove(store, "df")
store.append("df", df.iloc[:10], append=True, format="table")
store.append("df", df.iloc[10:], append=True, format="table")
tm.assert_frame_equal(store.select("df"), df)
# append to False
_maybe_remove(store, "df")
store.append("df", df.iloc[:10], append=False, format="table")
store.append("df", df.iloc[10:], append=True, format="table")
tm.assert_frame_equal(store.select("df"), df)
# formats
_maybe_remove(store, "df")
store.append("df", df.iloc[:10], append=False, format="table")
store.append("df", df.iloc[10:], append=True, format="table")
tm.assert_frame_equal(store.select("df"), df)
_maybe_remove(store, "df")
store.append("df", df.iloc[:10], append=False, format="table")
store.append("df", df.iloc[10:], append=True, format=None)
tm.assert_frame_equal(store.select("df"), df)
with ensure_clean_path(setup_path) as path:
# Invalid.
df = tm.makeDataFrame()
msg = "Can only append to Tables"
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, "df", append=True, format="f")
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, "df", append=True, format="fixed")
msg = r"invalid HDFStore format specified \[foo\]"
with pytest.raises(TypeError, match=msg):
df.to_hdf(path, "df", append=True, format="foo")
with pytest.raises(TypeError, match=msg):
df.to_hdf(path, "df", append=False, format="foo")
# File path doesn't exist
path = ""
msg = f"File {path} does not exist"
with pytest.raises(FileNotFoundError, match=msg):
read_hdf(path, "df")
def test_api_default_format(self, setup_path):
# default_format option
with ensure_clean_store(setup_path) as store:
df = tm.makeDataFrame()
pd.set_option("io.hdf.default_format", "fixed")
_maybe_remove(store, "df")
store.put("df", df)
assert not store.get_storer("df").is_table
msg = "Can only append to Tables"
with pytest.raises(ValueError, match=msg):
store.append("df2", df)
pd.set_option("io.hdf.default_format", "table")
_maybe_remove(store, "df")
store.put("df", df)
assert store.get_storer("df").is_table
_maybe_remove(store, "df2")
store.append("df2", df)
assert store.get_storer("df").is_table
pd.set_option("io.hdf.default_format", None)
with ensure_clean_path(setup_path) as path:
df = tm.makeDataFrame()
pd.set_option("io.hdf.default_format", "fixed")
df.to_hdf(path, "df")
with HDFStore(path) as store:
assert not store.get_storer("df").is_table
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, "df2", append=True)
pd.set_option("io.hdf.default_format", "table")
df.to_hdf(path, "df3")
with HDFStore(path) as store:
assert store.get_storer("df3").is_table
df.to_hdf(path, "df4", append=True)
with HDFStore(path) as store:
assert store.get_storer("df4").is_table
pd.set_option("io.hdf.default_format", None)
def test_keys(self, setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeStringSeries()
store["c"] = tm.makeDataFrame()
assert len(store) == 3
expected = {"/a", "/b", "/c"}
assert set(store.keys()) == expected
assert set(store) == expected
def test_no_track_times(self, setup_path):
# GH 32682
# enables to set track_times (see `pytables` `create_table` documentation)
def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128):
h = hash_factory()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""):
h.update(chunk)
return h.digest()
def create_h5_and_return_checksum(track_times):
with ensure_clean_path(setup_path) as path:
df = DataFrame({"a": [1]})
with HDFStore(path, mode="w") as hdf:
hdf.put(
"table",
df,
format="table",
data_columns=True,
index=None,
track_times=track_times,
)
return checksum(path)
checksum_0_tt_false = create_h5_and_return_checksum(track_times=False)
checksum_0_tt_true = create_h5_and_return_checksum(track_times=True)
# sleep is necessary to create h5 with different creation time
time.sleep(1)
checksum_1_tt_false = create_h5_and_return_checksum(track_times=False)
checksum_1_tt_true = create_h5_and_return_checksum(track_times=True)
# checksums are the same if track_time = False
assert checksum_0_tt_false == checksum_1_tt_false
# checksums are NOT same if track_time = True
assert checksum_0_tt_true != checksum_1_tt_true
def test_non_pandas_keys(self, setup_path):
class Table1(tables.IsDescription):
value1 = tables.Float32Col()
class Table2(tables.IsDescription):
value2 = tables.Float32Col()
class Table3(tables.IsDescription):
value3 = tables.Float32Col()
with ensure_clean_path(setup_path) as path:
with tables.open_file(path, mode="w") as h5file:
group = h5file.create_group("/", "group")
h5file.create_table(group, "table1", Table1, "Table 1")
h5file.create_table(group, "table2", Table2, "Table 2")
h5file.create_table(group, "table3", Table3, "Table 3")
with HDFStore(path) as store:
assert len(store.keys(include="native")) == 3
expected = {"/group/table1", "/group/table2", "/group/table3"}
assert set(store.keys(include="native")) == expected
assert set(store.keys(include="pandas")) == set()
for name in expected:
df = store.get(name)
assert len(df.columns) == 1
def test_keys_illegal_include_keyword_value(self, setup_path):
with ensure_clean_store(setup_path) as store:
with pytest.raises(
ValueError,
match="`include` should be either 'pandas' or 'native' "
"but is 'illegal'",
):
store.keys(include="illegal")
def test_keys_ignore_hdf_softlink(self, setup_path):
# GH 20523
# Puts a softlink into HDF file and rereads
with ensure_clean_store(setup_path) as store:
df = DataFrame({"A": range(5), "B": range(5)})
store.put("df", df)
assert store.keys() == ["/df"]
store._handle.create_soft_link(store._handle.root, "symlink", "df")
# Should ignore the softlink
assert store.keys() == ["/df"]
def test_iter_empty(self, setup_path):
with ensure_clean_store(setup_path) as store:
# GH 12221
assert list(store) == []
def test_repr(self, setup_path):
with ensure_clean_store(setup_path) as store:
repr(store)
store.info()
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeStringSeries()
store["c"] = tm.makeDataFrame()
df = tm.makeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["bool1"] = df["A"] > 0
df["bool2"] = df["B"] > 0
df["bool3"] = True
df["int1"] = 1
df["int2"] = 2
df["timestamp1"] = Timestamp("20010102")
df["timestamp2"] = Timestamp("20010103")
df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
df.loc[df.index[3:6], ["obj1"]] = np.nan
df = df._consolidate()._convert(datetime=True)
with catch_warnings(record=True):
simplefilter("ignore", pd.errors.PerformanceWarning)
store["df"] = df
# make a random group in hdf space
store._handle.create_group(store._handle.root, "bah")
assert store.filename in repr(store)
assert store.filename in str(store)
store.info()
# storers
with ensure_clean_store(setup_path) as store:
df = tm.makeDataFrame()
store.append("df", df)
s = store.get_storer("df")
repr(s)
str(s)
@ignore_natural_naming_warning
def test_contains(self, setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeDataFrame()
store["foo/bar"] = tm.makeDataFrame()
assert "a" in store
assert "b" in store
assert "c" not in store
assert "foo/bar" in store
assert "/foo/bar" in store
assert "/foo/b" not in store
assert "bar" not in store
# gh-2694: tables.NaturalNameWarning
with catch_warnings(record=True):
store["node())"] = tm.makeDataFrame()
assert "node())" in store
def test_versioning(self, setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
store["b"] = tm.makeDataFrame()
df = tm.makeTimeDataFrame()
_maybe_remove(store, "df1")
store.append("df1", df[:10])
store.append("df1", df[10:])
assert store.root.a._v_attrs.pandas_version == "0.15.2"
assert store.root.b._v_attrs.pandas_version == "0.15.2"
assert store.root.df1._v_attrs.pandas_version == "0.15.2"
# write a file and wipe its versioning
_maybe_remove(store, "df2")
store.append("df2", df)
# this is an error because its table_type is appendable, but no
# version info
store.get_node("df2")._v_attrs.pandas_version = None
msg = "'NoneType' object has no attribute 'startswith'"
with pytest.raises(Exception, match=msg):
store.select("df2")
def test_mode(self, setup_path):
df = tm.makeTimeDataFrame()
def check(mode):
with ensure_clean_path(setup_path) as path:
# constructor
if mode in ["r", "r+"]:
with pytest.raises(IOError):
HDFStore(path, mode=mode)
else:
store = HDFStore(path, mode=mode)
assert store._handle.mode == mode
store.close()
with ensure_clean_path(setup_path) as path:
# context
if mode in ["r", "r+"]:
with pytest.raises(IOError):
with HDFStore(path, mode=mode) as store:
pass
else:
with HDFStore(path, mode=mode) as store:
assert store._handle.mode == mode
with ensure_clean_path(setup_path) as path:
# conv write
if mode in ["r", "r+"]:
with pytest.raises(IOError):
df.to_hdf(path, "df", mode=mode)
df.to_hdf(path, "df", mode="w")
else:
df.to_hdf(path, "df", mode=mode)
# conv read
if mode in ["w"]:
msg = (
"mode w is not allowed while performing a read. "
r"Allowed modes are r, r\+ and a."
)
with pytest.raises(ValueError, match=msg):
read_hdf(path, "df", mode=mode)
else:
result = read_hdf(path, "df", mode=mode)
tm.assert_frame_equal(result, df)
def check_default_mode():
# read_hdf uses default mode
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", mode="w")
result = read_hdf(path, "df")
tm.assert_frame_equal(result, df)
check("r")
check("r+")
check("a")
check("w")
check_default_mode()
def test_reopen_handle(self, setup_path):
with ensure_clean_path(setup_path) as path:
store = HDFStore(path, mode="a")
store["a"] = tm.makeTimeSeries()
# invalid mode change
with pytest.raises(PossibleDataLossError):
store.open("w")
store.close()
assert not store.is_open
# truncation ok here
store.open("w")
assert store.is_open
assert len(store) == 0
store.close()
assert not store.is_open
store = HDFStore(path, mode="a")
store["a"] = tm.makeTimeSeries()
# reopen as read
store.open("r")
assert store.is_open
assert len(store) == 1
assert store._mode == "r"
store.close()
assert not store.is_open
# reopen as append
store.open("a")
assert store.is_open
assert len(store) == 1
assert store._mode == "a"
store.close()
assert not store.is_open
# reopen as append (again)
store.open("a")
assert store.is_open
assert len(store) == 1
assert store._mode == "a"
store.close()
assert not store.is_open
def test_open_args(self, setup_path):
with tm.ensure_clean(setup_path) as path:
df = tm.makeDataFrame()
# create an in memory store
store = HDFStore(
path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0
)
store["df"] = df
store.append("df2", df)
tm.assert_frame_equal(store["df"], df)
tm.assert_frame_equal(store["df2"], df)
store.close()
# the file should not have actually been written
assert not os.path.exists(path)
def test_flush(self, setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
store.flush()
store.flush(fsync=True)
def test_get(self, setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeSeries()
left = store.get("a")
right = store["a"]
tm.assert_series_equal(left, right)
left = store.get("/a")
right = store["/a"]
tm.assert_series_equal(left, right)
with pytest.raises(KeyError, match="'No object named b in the file'"):
store.get("b")
@pytest.mark.parametrize(
"where, expected",
[
(
"/",
{
"": ({"first_group", "second_group"}, set()),
"/first_group": (set(), {"df1", "df2"}),
"/second_group": ({"third_group"}, {"df3", "s1"}),
"/second_group/third_group": (set(), {"df4"}),
},
),
(
"/second_group",
{
"/second_group": ({"third_group"}, {"df3", "s1"}),
"/second_group/third_group": (set(), {"df4"}),
},
),
],
)
def test_walk(self, where, expected, setup_path):
# GH10143
objs = {
"df1": DataFrame([1, 2, 3]),
"df2": DataFrame([4, 5, 6]),
"df3": DataFrame([6, 7, 8]),
"df4": DataFrame([9, 10, 11]),
"s1": Series([10, 9, 8]),
# Next 3 items aren't pandas objects and should be ignored
"a1": np.array([[1, 2, 3], [4, 5, 6]]),
"tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"),
"tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"),
}
with ensure_clean_store("walk_groups.hdf", mode="w") as store:
store.put("/first_group/df1", objs["df1"])
store.put("/first_group/df2", objs["df2"])
store.put("/second_group/df3", objs["df3"])
store.put("/second_group/s1", objs["s1"])
store.put("/second_group/third_group/df4", objs["df4"])
# Create non-pandas objects
store._handle.create_array("/first_group", "a1", objs["a1"])
store._handle.create_table("/first_group", "tb1", obj=objs["tb1"])
store._handle.create_table("/second_group", "tb2", obj=objs["tb2"])
assert len(list(store.walk(where=where))) == len(expected)
for path, groups, leaves in store.walk(where=where):
assert path in expected
expected_groups, expected_frames = expected[path]
assert expected_groups == set(groups)
assert expected_frames == set(leaves)
for leaf in leaves:
frame_path = "/".join([path, leaf])
obj = store.get(frame_path)
if "df" in leaf:
tm.assert_frame_equal(obj, objs[leaf])
else:
tm.assert_series_equal(obj, objs[leaf])
def test_getattr(self, setup_path):
with ensure_clean_store(setup_path) as store:
s = tm.makeTimeSeries()
store["a"] = s
# test attribute access
result = store.a
tm.assert_series_equal(result, s)
result = getattr(store, "a")
tm.assert_series_equal(result, s)
df = tm.makeTimeDataFrame()
store["df"] = df
result = store.df
tm.assert_frame_equal(result, df)
# errors
for x in ["d", "mode", "path", "handle", "complib"]:
with pytest.raises(AttributeError):
getattr(store, x)
# not stores
for x in ["mode", "path", "handle", "complib"]:
getattr(store, f"_{x}")
def test_put(self, setup_path):
with ensure_clean_store(setup_path) as store:
ts = tm.makeTimeSeries()
df = tm.makeTimeDataFrame()
store["a"] = ts
store["b"] = df[:10]
store["foo/bar/bah"] = df[:10]
store["foo"] = df[:10]
store["/foo"] = df[:10]
store.put("c", df[:10], format="table")
# not OK, not a table
with pytest.raises(ValueError):
store.put("b", df[10:], append=True)
# node does not currently exist, test _is_table_type returns False
# in this case
_maybe_remove(store, "f")
with pytest.raises(ValueError):
store.put("f", df[10:], append=True)
# can't put to a table (use append instead)
with pytest.raises(ValueError):
store.put("c", df[10:], append=True)
# overwrite table
store.put("c", df[:10], format="table", append=False)
tm.assert_frame_equal(df[:10], store["c"])
def test_put_string_index(self, setup_path):
with ensure_clean_store(setup_path) as store:
index = Index([f"I am a very long string index: {i}" for i in range(20)])
s = Series(np.arange(20), index=index)
df = DataFrame({"A": s, "B": s})
store["a"] = s
tm.assert_series_equal(store["a"], s)
store["b"] = df
tm.assert_frame_equal(store["b"], df)
# mixed length
index = Index(
["abcdefghijklmnopqrstuvwxyz1234567890"]
+ [f"I am a very long string index: {i}" for i in range(20)]
)
s = Series(np.arange(21), index=index)
df = DataFrame({"A": s, "B": s})
store["a"] = s
tm.assert_series_equal(store["a"], s)
store["b"] = df
tm.assert_frame_equal(store["b"], df)
def test_put_compression(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = tm.makeTimeDataFrame()
store.put("c", df, format="table", complib="zlib")
tm.assert_frame_equal(store["c"], df)
# can't compress if format='fixed'
with pytest.raises(ValueError):
store.put("b", df, format="fixed", complib="zlib")
@td.skip_if_windows_python_3
def test_put_compression_blosc(self, setup_path):
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
# can't compress if format='fixed'
with pytest.raises(ValueError):
store.put("b", df, format="fixed", complib="blosc")
store.put("c", df, format="table", complib="blosc")
tm.assert_frame_equal(store["c"], df)
def test_complibs_default_settings(self, setup_path):
# GH15943
df = tm.makeDataFrame()
# Set complevel and check if complib is automatically set to
# default value
with ensure_clean_path(setup_path) as tmpfile:
df.to_hdf(tmpfile, "df", complevel=9)
result = pd.read_hdf(tmpfile, "df")
tm.assert_frame_equal(result, df)
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 9
assert node.filters.complib == "zlib"
# Set complib and check to see if compression is disabled
with ensure_clean_path(setup_path) as tmpfile:
df.to_hdf(tmpfile, "df", complib="zlib")
result = pd.read_hdf(tmpfile, "df")
tm.assert_frame_equal(result, df)
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 0
assert node.filters.complib is None
# Check if not setting complib or complevel results in no compression
with ensure_clean_path(setup_path) as tmpfile:
df.to_hdf(tmpfile, "df")
result = pd.read_hdf(tmpfile, "df")
tm.assert_frame_equal(result, df)
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 0
assert node.filters.complib is None
# Check if file-defaults can be overridden on a per table basis
with ensure_clean_path(setup_path) as tmpfile:
store = HDFStore(tmpfile)
store.append("dfc", df, complevel=9, complib="blosc")
store.append("df", df)
store.close()
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 0
assert node.filters.complib is None
for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
assert node.filters.complevel == 9
assert node.filters.complib == "blosc"
def test_complibs(self, setup_path):
# GH14478
df = tm.makeDataFrame()
# Building list of all complibs and complevels tuples
all_complibs = tables.filters.all_complibs
# Remove lzo if its not available on this platform
if not tables.which_lib_version("lzo"):
all_complibs.remove("lzo")
# Remove bzip2 if its not available on this platform
if not tables.which_lib_version("bzip2"):
all_complibs.remove("bzip2")
all_levels = range(0, 10)
all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]
for (lib, lvl) in all_tests:
with ensure_clean_path(setup_path) as tmpfile:
gname = "foo"
# Write and read file to see if data is consistent
df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
result = pd.read_hdf(tmpfile, gname)
tm.assert_frame_equal(result, df)
# Open file and check metadata
# for correct amount of compression
h5table = tables.open_file(tmpfile, mode="r")
for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
assert node.filters.complevel == lvl
if lvl == 0:
assert node.filters.complib is None
else:
assert node.filters.complib == lib
h5table.close()
def test_put_integer(self, setup_path):
# non-date, non-string index
df = DataFrame(np.random.randn(50, 100))
self._check_roundtrip(df, tm.assert_frame_equal, setup_path)
def test_put_mixed_type(self, setup_path):
df = tm.makeTimeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["bool1"] = df["A"] > 0
df["bool2"] = df["B"] > 0
df["bool3"] = True
df["int1"] = 1
df["int2"] = 2
df["timestamp1"] = Timestamp("20010102")
df["timestamp2"] = Timestamp("20010103")
df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
df.loc[df.index[3:6], ["obj1"]] = np.nan
df = df._consolidate()._convert(datetime=True)
with ensure_clean_store(setup_path) as store:
_maybe_remove(store, "df")
# PerformanceWarning
with catch_warnings(record=True):
simplefilter("ignore", pd.errors.PerformanceWarning)
store.put("df", df)
expected = store.get("df")
tm.assert_frame_equal(expected, df)
@pytest.mark.filterwarnings(
"ignore:object name:tables.exceptions.NaturalNameWarning"
)
def test_append(self, setup_path):
with ensure_clean_store(setup_path) as store:
# this is allowed by almost always don't want to do it
# tables.NaturalNameWarning):
with catch_warnings(record=True):
df = tm.makeTimeDataFrame()
_maybe_remove(store, "df1")
store.append("df1", df[:10])
store.append("df1", df[10:])
tm.assert_frame_equal(store["df1"], df)
_maybe_remove(store, "df2")
store.put("df2", df[:10], format="table")
store.append("df2", df[10:])
tm.assert_frame_equal(store["df2"], df)
_maybe_remove(store, "df3")
store.append("/df3", df[:10])
store.append("/df3", df[10:])
tm.assert_frame_equal(store["df3"], df)
# this is allowed by almost always don't want to do it
# tables.NaturalNameWarning
_maybe_remove(store, "/df3 foo")
store.append("/df3 foo", df[:10])
store.append("/df3 foo", df[10:])
tm.assert_frame_equal(store["df3 foo"], df)
# dtype issues - mizxed type in a single object column
df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
df["mixed_column"] = "testing"
df.loc[2, "mixed_column"] = np.nan
_maybe_remove(store, "df")
store.append("df", df)
tm.assert_frame_equal(store["df"], df)
# uints - test storage of uints
uint_data = DataFrame(
{
"u08": Series(
np.random.randint(0, high=255, size=5), dtype=np.uint8
),
"u16": Series(
np.random.randint(0, high=65535, size=5), dtype=np.uint16
),
"u32": Series(
np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32
),
"u64": Series(
[2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62],
dtype=np.uint64,
),
},
index=np.arange(5),
)
_maybe_remove(store, "uints")
store.append("uints", uint_data)
tm.assert_frame_equal(store["uints"], uint_data)
# uints - test storage of uints in indexable columns
_maybe_remove(store, "uints")
# 64-bit indices not yet supported
store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
tm.assert_frame_equal(store["uints"], uint_data)
def test_append_series(self, setup_path):
with ensure_clean_store(setup_path) as store:
# basic
ss = tm.makeStringSeries()
ts = tm.makeTimeSeries()
ns = Series(np.arange(100))
store.append("ss", ss)
result = store["ss"]
tm.assert_series_equal(result, ss)
assert result.name is None
store.append("ts", ts)
result = store["ts"]
tm.assert_series_equal(result, ts)
assert result.name is None
ns.name = "foo"
store.append("ns", ns)
result = store["ns"]
tm.assert_series_equal(result, ns)
assert result.name == ns.name
# select on the values
expected = ns[ns > 60]
result = store.select("ns", "foo>60")
tm.assert_series_equal(result, expected)
# select on the index and values
expected = ns[(ns > 70) & (ns.index < 90)]
result = store.select("ns", "foo>70 and index<90")
tm.assert_series_equal(result, expected)
# multi-index
mi = DataFrame(np.random.randn(5, 1), columns=["A"])
mi["B"] = np.arange(len(mi))
mi["C"] = "foo"
mi.loc[3:5, "C"] = "bar"
mi.set_index(["C", "B"], inplace=True)
s = mi.stack()
s.index = s.index.droplevel(2)
store.append("mi", s)
tm.assert_series_equal(store["mi"], s)
def test_store_index_types(self, setup_path):
# GH5386
# test storing various index types
with ensure_clean_store(setup_path) as store:
def check(format, index):
df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
df.index = index(len(df))
_maybe_remove(store, "df")
store.put("df", df, format=format)
tm.assert_frame_equal(df, store["df"])
for index in [
tm.makeFloatIndex,
tm.makeStringIndex,
tm.makeIntIndex,
tm.makeDateIndex,
]:
check("table", index)
check("fixed", index)
# period index currently broken for table
# seee GH7796 FIXME
check("fixed", tm.makePeriodIndex)
# check('table',tm.makePeriodIndex)
# unicode
index = tm.makeUnicodeIndex
check("table", index)
check("fixed", index)
@pytest.mark.skipif(
not is_platform_little_endian(), reason="reason platform is not little endian"
)
def test_encoding(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
df.loc[2, "A"] = np.nan
df.loc[3, "B"] = np.nan
_maybe_remove(store, "df")
store.append("df", df, encoding="ascii")
tm.assert_frame_equal(store["df"], df)
expected = df.reindex(columns=["A"])
result = store.select("df", Term("columns=A", encoding="ascii"))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"val",
[
[b"E\xc9, 17", b"", b"a", b"b", b"c"],
[b"E\xc9, 17", b"a", b"b", b"c"],
[b"EE, 17", b"", b"a", b"b", b"c"],
[b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
[b"", b"a", b"b", b"c"],
[b"\xf8\xfc", b"a", b"b", b"c"],
[b"A\xf8\xfc", b"", b"a", b"b", b"c"],
[np.nan, b"", b"b", b"c"],
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
],
)
@pytest.mark.parametrize("dtype", ["category", object])
def test_latin_encoding(self, setup_path, dtype, val):
enc = "latin-1"
nan_rep = ""
key = "data"
val = [x.decode(enc) if isinstance(x, bytes) else x for x in val]
ser = Series(val, dtype=dtype)
with ensure_clean_path(setup_path) as store:
ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep)
retr = read_hdf(store, key)
s_nan = ser.replace(nan_rep, np.nan)
tm.assert_series_equal(s_nan, retr)
def test_append_some_nans(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
{
"A": Series(np.random.randn(20)).astype("int32"),
"A1": np.random.randn(20),
"A2": np.random.randn(20),
"B": "foo",
"C": "bar",
"D": Timestamp("20010101"),
"E": datetime.datetime(2001, 1, 2, 0, 0),
},
index=np.arange(20),
)
# some nans
_maybe_remove(store, "df1")
df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
store.append("df1", df[:10])
store.append("df1", df[10:])
tm.assert_frame_equal(store["df1"], df)
# first column
df1 = df.copy()
df1.loc[:, "A1"] = np.nan
_maybe_remove(store, "df1")
store.append("df1", df1[:10])
store.append("df1", df1[10:])
tm.assert_frame_equal(store["df1"], df1)
# 2nd column
df2 = df.copy()
df2.loc[:, "A2"] = np.nan
_maybe_remove(store, "df2")
store.append("df2", df2[:10])
store.append("df2", df2[10:])
tm.assert_frame_equal(store["df2"], df2)
# datetimes
df3 = df.copy()
df3.loc[:, "E"] = np.nan
_maybe_remove(store, "df3")
store.append("df3", df3[:10])
store.append("df3", df3[10:])
tm.assert_frame_equal(store["df3"], df3)
def test_append_all_nans(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
{"A1": np.random.randn(20), "A2": np.random.randn(20)},
index=np.arange(20),
)
df.loc[0:15, :] = np.nan
# nan some entire rows (dropna=True)
_maybe_remove(store, "df")
store.append("df", df[:10], dropna=True)
store.append("df", df[10:], dropna=True)
tm.assert_frame_equal(store["df"], df[-4:])
# nan some entire rows (dropna=False)
_maybe_remove(store, "df2")
store.append("df2", df[:10], dropna=False)
store.append("df2", df[10:], dropna=False)
tm.assert_frame_equal(store["df2"], df)
# tests the option io.hdf.dropna_table
pd.set_option("io.hdf.dropna_table", False)
_maybe_remove(store, "df3")
store.append("df3", df[:10])
store.append("df3", df[10:])
tm.assert_frame_equal(store["df3"], df)
pd.set_option("io.hdf.dropna_table", True)
_maybe_remove(store, "df4")
store.append("df4", df[:10])
store.append("df4", df[10:])
tm.assert_frame_equal(store["df4"], df[-4:])
# nan some entire rows (string are still written!)
df = DataFrame(
{
"A1": np.random.randn(20),
"A2": np.random.randn(20),
"B": "foo",
"C": "bar",
},
index=np.arange(20),
)
df.loc[0:15, :] = np.nan
_maybe_remove(store, "df")
store.append("df", df[:10], dropna=True)
store.append("df", df[10:], dropna=True)
tm.assert_frame_equal(store["df"], df)
_maybe_remove(store, "df2")
store.append("df2", df[:10], dropna=False)
store.append("df2", df[10:], dropna=False)
tm.assert_frame_equal(store["df2"], df)
# nan some entire rows (but since we have dates they are still
# written!)
df = DataFrame(
{
"A1": np.random.randn(20),
"A2": np.random.randn(20),
"B": "foo",
"C": "bar",
"D": Timestamp("20010101"),
"E": datetime.datetime(2001, 1, 2, 0, 0),
},
index=np.arange(20),
)
df.loc[0:15, :] = np.nan
_maybe_remove(store, "df")
store.append("df", df[:10], dropna=True)
store.append("df", df[10:], dropna=True)
tm.assert_frame_equal(store["df"], df)
_maybe_remove(store, "df2")
store.append("df2", df[:10], dropna=False)
store.append("df2", df[10:], dropna=False)
tm.assert_frame_equal(store["df2"], df)
def test_store_dropna(self, setup_path):
df_with_missing = DataFrame(
{"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]},
index=list("abc"),
)
df_without_missing = DataFrame(
{"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac")
)
# # Test to make sure defaults are to not drop.
# # Corresponding to Issue 9382
with ensure_clean_path(setup_path) as path:
df_with_missing.to_hdf(path, "df", format="table")
reloaded = read_hdf(path, "df")
tm.assert_frame_equal(df_with_missing, reloaded)
with ensure_clean_path(setup_path) as path:
df_with_missing.to_hdf(path, "df", format="table", dropna=False)
reloaded = read_hdf(path, "df")
tm.assert_frame_equal(df_with_missing, reloaded)
with ensure_clean_path(setup_path) as path:
df_with_missing.to_hdf(path, "df", format="table", dropna=True)
reloaded = read_hdf(path, "df")
tm.assert_frame_equal(df_without_missing, reloaded)
def test_read_missing_key_close_store(self, setup_path):
# GH 25766
with ensure_clean_path(setup_path) as path:
df = DataFrame({"a": range(2), "b": range(2)})
df.to_hdf(path, "k1")
with pytest.raises(KeyError, match="'No object named k2 in the file'"):
pd.read_hdf(path, "k2")
# smoke test to test that file is properly closed after
# read with KeyError before another write
df.to_hdf(path, "k2")
def test_read_missing_key_opened_store(self, setup_path):
# GH 28699
with ensure_clean_path(setup_path) as path:
df = DataFrame({"a": range(2), "b": range(2)})
df.to_hdf(path, "k1")
with HDFStore(path, "r") as store:
with pytest.raises(KeyError, match="'No object named k2 in the file'"):
pd.read_hdf(store, "k2")
# Test that the file is still open after a KeyError and that we can
# still read from it.
pd.read_hdf(store, "k1")
def test_append_frame_column_oriented(self, setup_path):
with ensure_clean_store(setup_path) as store:
# column oriented
df = tm.makeTimeDataFrame()
df.index = df.index._with_freq(None) # freq doesnt round-trip
_maybe_remove(store, "df1")
store.append("df1", df.iloc[:, :2], axes=["columns"])
store.append("df1", df.iloc[:, 2:])
tm.assert_frame_equal(store["df1"], df)
result = store.select("df1", "columns=A")
expected = df.reindex(columns=["A"])
tm.assert_frame_equal(expected, result)
# selection on the non-indexable
result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
expected = df.reindex(columns=["A"], index=df.index[0:4])
tm.assert_frame_equal(expected, result)
# this isn't supported
with pytest.raises(TypeError):
store.select("df1", "columns=A and index>df.index[4]")
def test_append_with_different_block_ordering(self, setup_path):
# GH 4096; using same frames, but different block orderings
with ensure_clean_store(setup_path) as store:
for i in range(10):
df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
df["index"] = range(10)
df["index"] += i * 10
df["int64"] = Series([1] * len(df), dtype="int64")
df["int16"] = Series([1] * len(df), dtype="int16")
if i % 2 == 0:
del df["int64"]
df["int64"] = Series([1] * len(df), dtype="int64")
if i % 3 == 0:
a = df.pop("A")
df["A"] = a
df.set_index("index", inplace=True)
store.append("df", df)
# test a different ordering but with more fields (like invalid
# combinate)
with ensure_clean_store(setup_path) as store:
df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64")
df["int64"] = Series([1] * len(df), dtype="int64")
df["int16"] = Series([1] * len(df), dtype="int16")
store.append("df", df)
# store additional fields in different blocks
df["int16_2"] = Series([1] * len(df), dtype="int16")
with pytest.raises(ValueError):
store.append("df", df)
# store multiple additional fields in different blocks
df["float_3"] = Series([1.0] * len(df), dtype="float64")
with pytest.raises(ValueError):
store.append("df", df)
def test_append_with_strings(self, setup_path):
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
def check_col(key, name, size):
assert (
getattr(store.get_storer(key).table.description, name).itemsize
== size
)
# avoid truncation on elements
df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
store.append("df_big", df)
tm.assert_frame_equal(store.select("df_big"), df)
check_col("df_big", "values_block_1", 15)
# appending smaller string ok
df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
store.append("df_big", df2)
expected = concat([df, df2])
tm.assert_frame_equal(store.select("df_big"), expected)
check_col("df_big", "values_block_1", 15)
# avoid truncation on elements
df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
store.append("df_big2", df, min_itemsize={"values": 50})
tm.assert_frame_equal(store.select("df_big2"), df)
check_col("df_big2", "values_block_1", 50)
# bigger string on next append
store.append("df_new", df)
df_new = DataFrame(
[[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
)
with pytest.raises(ValueError):
store.append("df_new", df_new)
# min_itemsize on Series index (GH 11412)
df = tm.makeMixedDataFrame().set_index("C")
store.append("ss", df["B"], min_itemsize={"index": 4})
tm.assert_series_equal(store.select("ss"), df["B"])
# same as above, with data_columns=True
store.append(
"ss2", df["B"], data_columns=True, min_itemsize={"index": 4}
)
tm.assert_series_equal(store.select("ss2"), df["B"])
# min_itemsize in index without appending (GH 10381)
store.put("ss3", df, format="table", min_itemsize={"index": 6})
# just make sure there is a longer string:
df2 = df.copy().reset_index().assign(C="longer").set_index("C")
store.append("ss3", df2)
tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2]))
# same as above, with a Series
store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
store.append("ss4", df2["B"])
tm.assert_series_equal(
store.select("ss4"), pd.concat([df["B"], df2["B"]])
)
# with nans
_maybe_remove(store, "df")
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df.loc[df.index[1:4], "string"] = np.nan
df["string2"] = "bar"
df.loc[df.index[4:8], "string2"] = np.nan
df["string3"] = "bah"
df.loc[df.index[1:], "string3"] = np.nan
store.append("df", df)
result = store.select("df")
tm.assert_frame_equal(result, df)
with ensure_clean_store(setup_path) as store:
def check_col(key, name, size):
assert getattr(
store.get_storer(key).table.description, name
).itemsize, size
df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))
# a min_itemsize that creates a data_column
_maybe_remove(store, "df")
store.append("df", df, min_itemsize={"A": 200})
check_col("df", "A", 200)
assert store.get_storer("df").data_columns == ["A"]
# a min_itemsize that creates a data_column2
_maybe_remove(store, "df")
store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
check_col("df", "A", 200)
assert store.get_storer("df").data_columns == ["B", "A"]
# a min_itemsize that creates a data_column2
_maybe_remove(store, "df")
store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
check_col("df", "B", 200)
check_col("df", "values_block_0", 200)
assert store.get_storer("df").data_columns == ["B"]
# infer the .typ on subsequent appends
_maybe_remove(store, "df")
store.append("df", df[:5], min_itemsize=200)
store.append("df", df[5:], min_itemsize=200)
tm.assert_frame_equal(store["df"], df)
# invalid min_itemsize keys
df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
_maybe_remove(store, "df")
with pytest.raises(ValueError):
store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
def test_append_with_empty_string(self, setup_path):
with ensure_clean_store(setup_path) as store:
# with all empty strings (GH 12242)
df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
store.append("df", df[:-1], min_itemsize={"x": 1})
store.append("df", df[-1:], min_itemsize={"x": 1})
tm.assert_frame_equal(store.select("df"), df)
def test_to_hdf_with_min_itemsize(self, setup_path):
with ensure_clean_path(setup_path) as path:
# min_itemsize in index with to_hdf (GH 10381)
df = tm.makeMixedDataFrame().set_index("C")
df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6})
# just make sure there is a longer string:
df2 = df.copy().reset_index().assign(C="longer").set_index("C")
df2.to_hdf(path, "ss3", append=True, format="table")
tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2]))
# same as above, with a Series
df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6})
df2["B"].to_hdf(path, "ss4", append=True, format="table")
tm.assert_series_equal(
pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]])
)
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_to_hdf_errors(self, format, setup_path):
data = ["\ud800foo"]
ser = Series(data, index=Index(data))
with ensure_clean_path(setup_path) as path:
# GH 20835
ser.to_hdf(path, "table", format=format, errors="surrogatepass")
result = pd.read_hdf(path, "table", errors="surrogatepass")
tm.assert_series_equal(result, ser)
def test_append_with_data_columns(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = tm.makeTimeDataFrame()
df.iloc[0, df.columns.get_loc("B")] = 1.0
_maybe_remove(store, "df")
store.append("df", df[:2], data_columns=["B"])
store.append("df", df[2:])
tm.assert_frame_equal(store["df"], df)
# check that we have indices created
assert store._handle.root.df.table.cols.index.is_indexed is True
assert store._handle.root.df.table.cols.B.is_indexed is True
# data column searching
result = store.select("df", "B>0")
expected = df[df.B > 0]
tm.assert_frame_equal(result, expected)
# data column searching (with an indexable and a data_columns)
result = store.select("df", "B>0 and index>df.index[3]")
df_new = df.reindex(index=df.index[4:])
expected = df_new[df_new.B > 0]
tm.assert_frame_equal(result, expected)
# data column selection with a string data_column
df_new = df.copy()
df_new["string"] = "foo"
df_new.loc[df_new.index[1:4], "string"] = np.nan
df_new.loc[df_new.index[5:6], "string"] = "bar"
_maybe_remove(store, "df")
store.append("df", df_new, data_columns=["string"])
result = store.select("df", "string='foo'")
expected = df_new[df_new.string == "foo"]
tm.assert_frame_equal(result, expected)
# using min_itemsize and a data column
def check_col(key, name, size):
assert (
getattr(store.get_storer(key).table.description, name).itemsize
== size
)
with ensure_clean_store(setup_path) as store:
_maybe_remove(store, "df")
store.append(
"df", df_new, data_columns=["string"], min_itemsize={"string": 30}
)
check_col("df", "string", 30)
_maybe_remove(store, "df")
store.append("df", df_new, data_columns=["string"], min_itemsize=30)
check_col("df", "string", 30)
_maybe_remove(store, "df")
store.append(
"df", df_new, data_columns=["string"], min_itemsize={"values": 30}
)
check_col("df", "string", 30)
with ensure_clean_store(setup_path) as store:
df_new["string2"] = "foobarbah"
df_new["string_block1"] = "foobarbah1"
df_new["string_block2"] = "foobarbah2"
_maybe_remove(store, "df")
store.append(
"df",
df_new,
data_columns=["string", "string2"],
min_itemsize={"string": 30, "string2": 40, "values": 50},
)
check_col("df", "string", 30)
check_col("df", "string2", 40)
check_col("df", "values_block_1", 50)
with ensure_clean_store(setup_path) as store:
# multiple data columns
df_new = df.copy()
df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
df_new["string"] = "foo"
sl = df_new.columns.get_loc("string")
df_new.iloc[1:4, sl] = np.nan
df_new.iloc[5:6, sl] = "bar"
df_new["string2"] = "foo"
sl = df_new.columns.get_loc("string2")
df_new.iloc[2:5, sl] = np.nan
df_new.iloc[7:8, sl] = "bar"
_maybe_remove(store, "df")
store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
result = store.select(
"df", "string='foo' and string2='foo' and A>0 and B<0"
)
expected = df_new[
(df_new.string == "foo")
& (df_new.string2 == "foo")
& (df_new.A > 0)
& (df_new.B < 0)
]
tm.assert_frame_equal(result, expected, check_freq=False)
# FIXME: 2020-05-07 freq check randomly fails in the CI
# yield an empty frame
result = store.select("df", "string='foo' and string2='cool'")
expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
tm.assert_frame_equal(result, expected)
with ensure_clean_store(setup_path) as store:
# doc example
df_dc = df.copy()
df_dc["string"] = "foo"
df_dc.loc[df_dc.index[4:6], "string"] = np.nan
df_dc.loc[df_dc.index[7:9], "string"] = "bar"
df_dc["string2"] = "cool"
df_dc["datetime"] = Timestamp("20010102")
df_dc = df_dc._convert(datetime=True)
df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan
_maybe_remove(store, "df_dc")
store.append(
"df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
)
result = store.select("df_dc", "B>0")
expected = df_dc[df_dc.B > 0]
tm.assert_frame_equal(result, expected)
result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
tm.assert_frame_equal(result, expected, check_freq=False)
# FIXME: 2020-12-07 intermittent build failures here with freq of
# None instead of BDay(4)
with ensure_clean_store(setup_path) as store:
# doc example part 2
np.random.seed(1234)
index = date_range("1/1/2000", periods=8)
df_dc = DataFrame(
np.random.randn(8, 3), index=index, columns=["A", "B", "C"]
)
df_dc["string"] = "foo"
df_dc.loc[df_dc.index[4:6], "string"] = np.nan
df_dc.loc[df_dc.index[7:9], "string"] = "bar"
df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs()
df_dc["string2"] = "cool"
# on-disk operations
store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])
result = store.select("df_dc", "B>0")
expected = df_dc[df_dc.B > 0]
tm.assert_frame_equal(result, expected)
result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
tm.assert_frame_equal(result, expected)
def test_create_table_index(self, setup_path):
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
def col(t, column):
return getattr(store.get_storer(t).table.cols, column)
# data columns
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df["string2"] = "bar"
store.append("f", df, data_columns=["string", "string2"])
assert col("f", "index").is_indexed is True
assert col("f", "string").is_indexed is True
assert col("f", "string2").is_indexed is True
# specify index=columns
store.append(
"f2", df, index=["string"], data_columns=["string", "string2"]
)
assert col("f2", "index").is_indexed is False
assert col("f2", "string").is_indexed is True
assert col("f2", "string2").is_indexed is False
# try to index a non-table
_maybe_remove(store, "f2")
store.put("f2", df)
with pytest.raises(TypeError):
store.create_table_index("f2")
def test_create_table_index_data_columns_argument(self, setup_path):
# GH 28156
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
def col(t, column):
return getattr(store.get_storer(t).table.cols, column)
# data columns
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df["string2"] = "bar"
store.append("f", df, data_columns=["string"])
assert col("f", "index").is_indexed is True
assert col("f", "string").is_indexed is True
msg = "'Cols' object has no attribute 'string2'"
with pytest.raises(AttributeError, match=msg):
col("f", "string2").is_indexed
# try to index a col which isn't a data_column
msg = (
"column string2 is not a data_column.\n"
"In order to read column string2 you must reload the dataframe \n"
"into HDFStore and include string2 with the data_columns argument."
)
with pytest.raises(AttributeError, match=msg):
store.create_table_index("f", columns=["string2"])
def test_append_hierarchical(self, setup_path):
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["foo", "bar"],
)
df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
with ensure_clean_store(setup_path) as store:
store.append("mi", df)
result = store.select("mi")
tm.assert_frame_equal(result, df)
# GH 3748
result = store.select("mi", columns=["A", "B"])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(result, expected)
with ensure_clean_path("test.hdf") as path:
df.to_hdf(path, "df", format="table")
result = read_hdf(path, "df", columns=["A", "B"])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(result, expected)
def test_column_multiindex(self, setup_path):
# GH 4710
# recreate multi-indexes properly
index = MultiIndex.from_tuples(
[("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"]
)
df = DataFrame(np.arange(12).reshape(3, 4), columns=index)
expected = df.copy()
if isinstance(expected.index, RangeIndex):
expected.index = Int64Index(expected.index)
with ensure_clean_store(setup_path) as store:
store.put("df", df)
tm.assert_frame_equal(
store["df"], expected, check_index_type=True, check_column_type=True
)
store.put("df1", df, format="table")
tm.assert_frame_equal(
store["df1"], expected, check_index_type=True, check_column_type=True
)
with pytest.raises(ValueError):
store.put("df2", df, format="table", data_columns=["A"])
with pytest.raises(ValueError):
store.put("df3", df, format="table", data_columns=True)
# appending multi-column on existing table (see GH 6167)
with ensure_clean_store(setup_path) as store:
store.append("df2", df)
store.append("df2", df)
tm.assert_frame_equal(store["df2"], concat((df, df)))
# non_index_axes name
df = DataFrame(
np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo")
)
expected = df.copy()
if isinstance(expected.index, RangeIndex):
expected.index = Int64Index(expected.index)
with ensure_clean_store(setup_path) as store:
store.put("df1", df, format="table")
tm.assert_frame_equal(
store["df1"], expected, check_index_type=True, check_column_type=True
)
def test_store_multiindex(self, setup_path):
# validate multi-index names
# GH 5527
with ensure_clean_store(setup_path) as store:
def make_index(names=None):
return MultiIndex.from_tuples(
[
(datetime.datetime(2013, 12, d), s, t)
for d in range(1, 3)
for s in range(2)
for t in range(3)
],
names=names,
)
# no names
_maybe_remove(store, "df")
df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index())
store.append("df", df)
tm.assert_frame_equal(store.select("df"), df)
# partial names
_maybe_remove(store, "df")
df = DataFrame(
np.zeros((12, 2)),
columns=["a", "b"],
index=make_index(["date", None, None]),
)
store.append("df", df)
tm.assert_frame_equal(store.select("df"), df)
# series
_maybe_remove(store, "s")
s = Series(np.zeros(12), index=make_index(["date", None, None]))
store.append("s", s)
xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"]))
tm.assert_series_equal(store.select("s"), xp)
# dup with column
_maybe_remove(store, "df")
df = DataFrame(
np.zeros((12, 2)),
columns=["a", "b"],
index=make_index(["date", "a", "t"]),
)
with pytest.raises(ValueError):
store.append("df", df)
# dup within level
_maybe_remove(store, "df")
df = DataFrame(
np.zeros((12, 2)),
columns=["a", "b"],
index=make_index(["date", "date", "date"]),
)
with pytest.raises(ValueError):
store.append("df", df)
# fully names
_maybe_remove(store, "df")
df = DataFrame(
np.zeros((12, 2)),
columns=["a", "b"],
index=make_index(["date", "s", "t"]),
)
store.append("df", df)
tm.assert_frame_equal(store.select("df"), df)
def test_select_columns_in_where(self, setup_path):
# GH 6169
# recreate multi-indexes when columns is passed
# in the `where` argument
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["foo_name", "bar_name"],
)
# With a DataFrame
df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table")
expected = df[["A"]]
tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
# With a Series
s = Series(np.random.randn(10), index=index, name="A")
with ensure_clean_store(setup_path) as store:
store.put("s", s, format="table")
tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
def test_mi_data_columns(self, setup_path):
# GH 14435
idx = MultiIndex.from_arrays(
[date_range("2000-01-01", periods=5), range(5)], names=["date", "id"]
)
df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx)
with ensure_clean_store(setup_path) as store:
store.append("df", df, data_columns=True)
actual = store.select("df", where="id == 1")
expected = df.iloc[[1], :]
tm.assert_frame_equal(actual, expected)
def test_pass_spec_to_storer(self, setup_path):
df = tm.makeDataFrame()
with ensure_clean_store(setup_path) as store:
store.put("df", df)
with pytest.raises(TypeError):
store.select("df", columns=["A"])
with pytest.raises(TypeError):
store.select("df", where=[("columns=A")])
def test_append_misc(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = tm.makeDataFrame()
store.append("df", df, chunksize=1)
result = store.select("df")
tm.assert_frame_equal(result, df)
store.append("df1", df, expectedrows=10)
result = store.select("df1")
tm.assert_frame_equal(result, df)
# more chunksize in append tests
def check(obj, comparator):
for c in [10, 200, 1000]:
with ensure_clean_store(setup_path, mode="w") as store:
store.append("obj", obj, chunksize=c)
result = store.select("obj")
comparator(result, obj)
df = tm.makeDataFrame()
df["string"] = "foo"
df["float322"] = 1.0
df["float322"] = df["float322"].astype("float32")
df["bool"] = df["float322"] > 0
df["time1"] = Timestamp("20130101")
df["time2"] = Timestamp("20130102")
check(df, tm.assert_frame_equal)
# empty frame, GH4273
with ensure_clean_store(setup_path) as store:
# 0 len
df_empty = DataFrame(columns=list("ABC"))
store.append("df", df_empty)
with pytest.raises(KeyError, match="'No object named df in the file'"):
store.select("df")
# repeated append of 0/non-zero frames
df = DataFrame(np.random.rand(10, 3), columns=list("ABC"))
store.append("df", df)
tm.assert_frame_equal(store.select("df"), df)
store.append("df", df_empty)
tm.assert_frame_equal(store.select("df"), df)
# store
df = DataFrame(columns=list("ABC"))
store.put("df2", df)
tm.assert_frame_equal(store.select("df2"), df)
def test_append_raise(self, setup_path):
with ensure_clean_store(setup_path) as store:
# test append with invalid input to get good error messages
# list in column
df = tm.makeDataFrame()
df["invalid"] = [["a"]] * len(df)
assert df.dtypes["invalid"] == np.object_
with pytest.raises(TypeError):
store.append("df", df)
# multiple invalid columns
df["invalid2"] = [["a"]] * len(df)
df["invalid3"] = [["a"]] * len(df)
with pytest.raises(TypeError):
store.append("df", df)
# datetime with embedded nans as object
df = tm.makeDataFrame()
s = Series(datetime.datetime(2001, 1, 2), index=df.index)
s = s.astype(object)
s[0:5] = np.nan
df["invalid"] = s
assert df.dtypes["invalid"] == np.object_
with pytest.raises(TypeError):
store.append("df", df)
# directly ndarray
with pytest.raises(TypeError):
store.append("df", np.arange(10))
# series directly
with pytest.raises(TypeError):
store.append("df", Series(np.arange(10)))
# appending an incompatible table
df = tm.makeDataFrame()
store.append("df", df)
df["foo"] = "foo"
with pytest.raises(ValueError):
store.append("df", df)
def test_table_index_incompatible_dtypes(self, setup_path):
df1 = DataFrame({"a": [1, 2, 3]})
df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3))
with ensure_clean_store(setup_path) as store:
store.put("frame", df1, format="table")
with pytest.raises(TypeError):
store.put("frame", df2, format="table", append=True)
def test_table_values_dtypes_roundtrip(self, setup_path):
with ensure_clean_store(setup_path) as store:
df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
store.append("df_f8", df1)
tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes)
df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8")
store.append("df_i8", df2)
tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes)
# incompatible dtype
with pytest.raises(ValueError):
store.append("df_i8", df1)
# check creation/storage/retrieval of float32 (a bit hacky to
# actually create them thought)
df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"])
store.append("df_f4", df1)
tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes)
assert df1.dtypes[0] == "float32"
# check with mixed dtypes
df1 = DataFrame(
{
c: Series(np.random.randint(5), dtype=c)
for c in ["float32", "float64", "int32", "int64", "int16", "int8"]
}
)
df1["string"] = "foo"
df1["float322"] = 1.0
df1["float322"] = df1["float322"].astype("float32")
df1["bool"] = df1["float32"] > 0
df1["time1"] = Timestamp("20130101")
df1["time2"] = Timestamp("20130102")
store.append("df_mixed_dtypes1", df1)
result = store.select("df_mixed_dtypes1").dtypes.value_counts()
result.index = [str(i) for i in result.index]
expected = Series(
{
"float32": 2,
"float64": 1,
"int32": 1,
"bool": 1,
"int16": 1,
"int8": 1,
"int64": 1,
"object": 1,
"datetime64[ns]": 2,
}
)
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
def test_table_mixed_dtypes(self, setup_path):
# frame
df = tm.makeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["bool1"] = df["A"] > 0
df["bool2"] = df["B"] > 0
df["bool3"] = True
df["int1"] = 1
df["int2"] = 2
df["timestamp1"] = Timestamp("20010102")
df["timestamp2"] = Timestamp("20010103")
df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
df.loc[df.index[3:6], ["obj1"]] = np.nan
df = df._consolidate()._convert(datetime=True)
with ensure_clean_store(setup_path) as store:
store.append("df1_mixed", df)
tm.assert_frame_equal(store.select("df1_mixed"), df)
def test_unimplemented_dtypes_table_columns(self, setup_path):
with ensure_clean_store(setup_path) as store:
dtypes = [("date", datetime.date(2001, 1, 2))]
# currently not supported dtypes ####
for n, f in dtypes:
df = tm.makeDataFrame()
df[n] = f
with pytest.raises(TypeError):
store.append(f"df1_{n}", df)
# frame
df = tm.makeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["datetime1"] = datetime.date(2001, 1, 2)
df = df._consolidate()._convert(datetime=True)
with ensure_clean_store(setup_path) as store:
# this fails because we have a date in the object block......
with pytest.raises(TypeError):
store.append("df_unimplemented", df)
def test_calendar_roundtrip_issue(self, setup_path):
# 8591
# doc example from tseries holiday section
weekmask_egypt = "Sun Mon Tue Wed Thu"
holidays = [
"2012-05-01",
datetime.datetime(2013, 5, 1),
np.datetime64("2014-05-01"),
]
bday_egypt = pd.offsets.CustomBusinessDay(
holidays=holidays, weekmask=weekmask_egypt
)
dt = datetime.datetime(2013, 4, 30)
dts = date_range(dt, periods=5, freq=bday_egypt)
s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split()))
with ensure_clean_store(setup_path) as store:
store.put("fixed", s)
result = store.select("fixed")
tm.assert_series_equal(result, s)
store.append("table", s)
result = store.select("table")
tm.assert_series_equal(result, s)
def test_append_with_timedelta(self, setup_path):
# GH 3577
# append timedelta
df = DataFrame(
{
"A": Timestamp("20130101"),
"B": [
Timestamp("20130101") + timedelta(days=i, seconds=10)
for i in range(10)
],
}
)
df["C"] = df["A"] - df["B"]
df.loc[3:5, "C"] = np.nan
with ensure_clean_store(setup_path) as store:
# table
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
result = store.select("df")
tm.assert_frame_equal(result, df)
result = store.select("df", where="C<100000")
tm.assert_frame_equal(result, df)
result = store.select("df", where="C<pd.Timedelta('-3D')")
tm.assert_frame_equal(result, df.iloc[3:])
result = store.select("df", "C<'-3D'")
tm.assert_frame_equal(result, df.iloc[3:])
# a bit hacky here as we don't really deal with the NaT properly
result = store.select("df", "C<'-500000s'")
result = result.dropna(subset=["C"])
tm.assert_frame_equal(result, df.iloc[6:])
result = store.select("df", "C<'-3.5D'")
result = result.iloc[1:]
tm.assert_frame_equal(result, df.iloc[4:])
# fixed
_maybe_remove(store, "df2")
store.put("df2", df)
result = store.select("df2")
tm.assert_frame_equal(result, df)
def test_remove(self, setup_path):
with ensure_clean_store(setup_path) as store:
ts = tm.makeTimeSeries()
df = tm.makeDataFrame()
store["a"] = ts
store["b"] = df
_maybe_remove(store, "a")
assert len(store) == 1
tm.assert_frame_equal(df, store["b"])
_maybe_remove(store, "b")
assert len(store) == 0
# nonexistence
with pytest.raises(
KeyError, match="'No object named a_nonexistent_store in the file'"
):
store.remove("a_nonexistent_store")
# pathing
store["a"] = ts
store["b/foo"] = df
_maybe_remove(store, "foo")
_maybe_remove(store, "b/foo")
assert len(store) == 1
store["a"] = ts
store["b/foo"] = df
_maybe_remove(store, "b")
assert len(store) == 1
# __delitem__
store["a"] = ts
store["b"] = df
del store["a"]
del store["b"]
assert len(store) == 0
def test_invalid_terms(self, setup_path):
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df.loc[df.index[0:4], "string"] = "bar"
store.put("df", df, format="table")
# some invalid terms
with pytest.raises(TypeError):
Term()
# more invalid
with pytest.raises(ValueError):
store.select("df", "df.index[3]")
with pytest.raises(SyntaxError):
store.select("df", "index>")
# from the docs
with ensure_clean_path(setup_path) as path:
dfq = DataFrame(
np.random.randn(10, 4),
columns=list("ABCD"),
index=date_range("20130101", periods=10),
)
dfq.to_hdf(path, "dfq", format="table", data_columns=True)
# check ok
read_hdf(
path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']"
)
read_hdf(path, "dfq", where="A>0 or C>0")
# catch the invalid reference
with ensure_clean_path(setup_path) as path:
dfq = DataFrame(
np.random.randn(10, 4),
columns=list("ABCD"),
index=date_range("20130101", periods=10),
)
dfq.to_hdf(path, "dfq", format="table")
with pytest.raises(ValueError):
read_hdf(path, "dfq", where="A>0 or C>0")
def test_same_name_scoping(self, setup_path):
with ensure_clean_store(setup_path) as store:
import pandas as pd
df = DataFrame(
np.random.randn(20, 2), index=pd.date_range("20130101", periods=20)
)
store.put("df", df, format="table")
expected = df[df.index > Timestamp("20130105")]
import datetime
result = store.select("df", "index>datetime.datetime(2013,1,5)")
tm.assert_frame_equal(result, expected)
from datetime import datetime # noqa
# technically an error, but allow it
result = store.select("df", "index>datetime.datetime(2013,1,5)")
tm.assert_frame_equal(result, expected)
result = store.select("df", "index>datetime(2013,1,5)")
tm.assert_frame_equal(result, expected)
def test_series(self, setup_path):
s = tm.makeStringSeries()
self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
ts = tm.makeTimeSeries()
self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
ts2 = Series(ts.index, Index(ts.index, dtype=object))
self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)
ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
self._check_roundtrip(
ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
)
def test_float_index(self, setup_path):
# GH #454
index = np.random.randn(10)
s = Series(np.random.randn(10), index=index)
self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
def test_tuple_index(self, setup_path):
# GH #492
col = np.arange(10)
idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)]
data = np.random.randn(30).reshape((3, 10))
DF = DataFrame(data, index=idx, columns=col)
with catch_warnings(record=True):
simplefilter("ignore", pd.errors.PerformanceWarning)
self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path)
@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
def test_index_types(self, setup_path):
with catch_warnings(record=True):
values = np.random.randn(2)
func = lambda l, r: tm.assert_series_equal(l, r, check_index_type=True)
with catch_warnings(record=True):
ser = Series(values, [0, "y"])
self._check_roundtrip(ser, func, path=setup_path)
with catch_warnings(record=True):
ser = Series(values, [datetime.datetime.today(), 0])
self._check_roundtrip(ser, func, path=setup_path)
with catch_warnings(record=True):
ser = Series(values, ["y", 0])
self._check_roundtrip(ser, func, path=setup_path)
with catch_warnings(record=True):
ser = Series(values, [datetime.date.today(), "a"])
self._check_roundtrip(ser, func, path=setup_path)
with catch_warnings(record=True):
ser = Series(values, [0, "y"])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(values, [datetime.datetime.today(), 0])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(values, ["y", 0])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(values, [datetime.date.today(), "a"])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(values, [1.23, "b"])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(values, [1, 1.53])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(values, [1, 5])
self._check_roundtrip(ser, func, path=setup_path)
ser = Series(
values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]
)
self._check_roundtrip(ser, func, path=setup_path)
def test_timeseries_preepoch(self, setup_path):
dr = bdate_range("1/1/1940", "1/1/1960")
ts = Series(np.random.randn(len(dr)), index=dr)
try:
self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
except OverflowError:
pytest.skip("known failer on some windows platforms")
@pytest.mark.parametrize(
"compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
)
def test_frame(self, compression, setup_path):
df = tm.makeDataFrame()
# put in some random NAs
df.values[0, 0] = np.nan
df.values[5, 3] = np.nan
self._check_roundtrip_table(
df, tm.assert_frame_equal, path=setup_path, compression=compression
)
self._check_roundtrip(
df, tm.assert_frame_equal, path=setup_path, compression=compression
)
tdf = tm.makeTimeDataFrame()
self._check_roundtrip(
tdf, tm.assert_frame_equal, path=setup_path, compression=compression
)
with ensure_clean_store(setup_path) as store:
# not consolidated
df["foo"] = np.random.randn(len(df))
store["df"] = df
recons = store["df"]
assert recons._mgr.is_consolidated()
# empty
self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
def test_empty_series_frame(self, setup_path):
s0 = Series(dtype=object)
s1 = Series(name="myseries", dtype=object)
df0 = DataFrame()
df1 = DataFrame(index=["a", "b", "c"])
df2 = DataFrame(columns=["d", "e", "f"])
self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path)
self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path)
self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path)
self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)
@pytest.mark.parametrize(
"dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"]
)
def test_empty_series(self, dtype, setup_path):
s = Series(dtype=dtype)
self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
def test_can_serialize_dates(self, setup_path):
rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")]
frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
def test_store_hierarchical(self, setup_path):
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["foo", "bar"],
)
frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
self._check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
# check that the names are stored
with ensure_clean_store(setup_path) as store:
store["frame"] = frame
recons = store["frame"]
tm.assert_frame_equal(recons, frame)
def test_store_index_name(self, setup_path):
df = tm.makeDataFrame()
df.index.name = "foo"
with ensure_clean_store(setup_path) as store:
store["frame"] = df
recons = store["frame"]
tm.assert_frame_equal(recons, df)
@pytest.mark.parametrize("table_format", ["table", "fixed"])
def test_store_index_name_numpy_str(self, table_format, setup_path):
# GH #13492
idx = Index(
pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]),
name="cols\u05d2",
)
idx1 = Index(
pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]),
name="rows\u05d0",
)
df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
# This used to fail, returning numpy strings instead of python strings.
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", format=table_format)
df2 = read_hdf(path, "df")
tm.assert_frame_equal(df, df2, check_names=True)
assert type(df2.index.name) == str
assert type(df2.columns.name) == str
def test_store_series_name(self, setup_path):
df = tm.makeDataFrame()
series = df["A"]
with ensure_clean_store(setup_path) as store:
store["series"] = series
recons = store["series"]
tm.assert_series_equal(recons, series)
@pytest.mark.parametrize(
"compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
)
def test_store_mixed(self, compression, setup_path):
def _make_one():
df = tm.makeDataFrame()
df["obj1"] = "foo"
df["obj2"] = "bar"
df["bool1"] = df["A"] > 0
df["bool2"] = df["B"] > 0
df["int1"] = 1
df["int2"] = 2
return df._consolidate()
df1 = _make_one()
df2 = _make_one()
self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)
with ensure_clean_store(setup_path) as store:
store["obj"] = df1
tm.assert_frame_equal(store["obj"], df1)
store["obj"] = df2
tm.assert_frame_equal(store["obj"], df2)
# check that can store Series of all of these types
self._check_roundtrip(
df1["obj1"],
tm.assert_series_equal,
path=setup_path,
compression=compression,
)
self._check_roundtrip(
df1["bool1"],
tm.assert_series_equal,
path=setup_path,
compression=compression,
)
self._check_roundtrip(
df1["int1"],
tm.assert_series_equal,
path=setup_path,
compression=compression,
)
@pytest.mark.filterwarnings(
"ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning"
)
def test_select_with_dups(self, setup_path):
# single dtypes
df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"])
df.index = date_range("20130101 9:30", periods=10, freq="T")
with ensure_clean_store(setup_path) as store:
store.append("df", df)
result = store.select("df")
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=df.columns)
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=["A"])
expected = df.loc[:, ["A"]]
tm.assert_frame_equal(result, expected)
# dups across dtypes
df = concat(
[
DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
DataFrame(
np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
),
],
axis=1,
)
df.index = date_range("20130101 9:30", periods=10, freq="T")
with ensure_clean_store(setup_path) as store:
store.append("df", df)
result = store.select("df")
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=df.columns)
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
expected = df.loc[:, ["A"]]
result = store.select("df", columns=["A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
expected = df.loc[:, ["B", "A"]]
result = store.select("df", columns=["B", "A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
# duplicates on both index and columns
with ensure_clean_store(setup_path) as store:
store.append("df", df)
store.append("df", df)
expected = df.loc[:, ["B", "A"]]
expected = concat([expected, expected])
result = store.select("df", columns=["B", "A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
def test_overwrite_node(self, setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = tm.makeTimeDataFrame()
ts = tm.makeTimeSeries()
store["a"] = ts
tm.assert_series_equal(store["a"], ts)
def test_select(self, setup_path):
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
# select with columns=
df = tm.makeTimeDataFrame()
_maybe_remove(store, "df")
store.append("df", df)
result = store.select("df", columns=["A", "B"])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# equivalently
result = store.select("df", [("columns=['A', 'B']")])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# with a data column
_maybe_remove(store, "df")
store.append("df", df, data_columns=["A"])
result = store.select("df", ["A > 0"], columns=["A", "B"])
expected = df[df.A > 0].reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# all a data columns
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
result = store.select("df", ["A > 0"], columns=["A", "B"])
expected = df[df.A > 0].reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# with a data column, but different columns
_maybe_remove(store, "df")
store.append("df", df, data_columns=["A"])
result = store.select("df", ["A > 0"], columns=["C", "D"])
expected = df[df.A > 0].reindex(columns=["C", "D"])
tm.assert_frame_equal(expected, result)
def test_select_dtypes(self, setup_path):
with ensure_clean_store(setup_path) as store:
# with a Timestamp data column (GH #2637)
df = DataFrame(
{
"ts": bdate_range("2012-01-01", periods=300),
"A": np.random.randn(300),
}
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A"])
result = store.select("df", "ts>=Timestamp('2012-02-01')")
expected = df[df.ts >= Timestamp("2012-02-01")]
tm.assert_frame_equal(expected, result)
# bool columns (GH #2849)
df = DataFrame(np.random.randn(5, 2), columns=["A", "B"])
df["object"] = "foo"
df.loc[4:5, "object"] = "bar"
df["boolv"] = df["A"] > 0
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa
for v in [True, "true", 1]:
result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
tm.assert_frame_equal(expected, result)
expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa
for v in [False, "false", 0]:
result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
tm.assert_frame_equal(expected, result)
# integer index
df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
_maybe_remove(store, "df_int")
store.append("df_int", df)
result = store.select("df_int", "index<10 and columns=['A']")
expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
tm.assert_frame_equal(expected, result)
# float index
df = DataFrame(
{
"A": np.random.rand(20),
"B": np.random.rand(20),
"index": np.arange(20, dtype="f8"),
}
)
_maybe_remove(store, "df_float")
store.append("df_float", df)
result = store.select("df_float", "index<10.0 and columns=['A']")
expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
tm.assert_frame_equal(expected, result)
with ensure_clean_store(setup_path) as store:
# floats w/o NaN
df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
store.append("df1", df, data_columns=True)
result = store.select("df1", where="values>2.0")
expected = df[df["values"] > 2.0]
tm.assert_frame_equal(expected, result)
# floats with NaN
df.iloc[0] = np.nan
expected = df[df["values"] > 2.0]
store.append("df2", df, data_columns=True, index=False)
result = store.select("df2", where="values>2.0")
tm.assert_frame_equal(expected, result)
# https://github.com/PyTables/PyTables/issues/282
# bug in selection when 0th row has a np.nan and an index
# store.append('df3',df,data_columns=True)
# result = store.select(
# 'df3', where='values>2.0')
# tm.assert_frame_equal(expected, result)
# not in first position float with NaN ok too
df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[1] = np.nan
expected = df[df["values"] > 2.0]
store.append("df4", df, data_columns=True)
result = store.select("df4", where="values>2.0")
tm.assert_frame_equal(expected, result)
# test selection with comparison against numpy scalar
# GH 11283
with ensure_clean_store(setup_path) as store:
df = tm.makeDataFrame()
expected = df[df["A"] > 0]
store.append("df", df, data_columns=True)
np_zero = np.float64(0) # noqa
result = store.select("df", where=["A>np_zero"])
tm.assert_frame_equal(expected, result)
def test_select_with_many_inputs(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
{
"ts": bdate_range("2012-01-01", periods=300),
"A": np.random.randn(300),
"B": range(300),
"users": ["a"] * 50
+ ["b"] * 50
+ ["c"] * 100
+ [f"a{i:03d}" for i in range(100)],
}
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A", "B", "users"])
# regular select
result = store.select("df", "ts>=Timestamp('2012-02-01')")
expected = df[df.ts >= Timestamp("2012-02-01")]
tm.assert_frame_equal(expected, result)
# small selector
result = store.select(
"df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']"
)
expected = df[
(df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])
]
tm.assert_frame_equal(expected, result)
# big selector along the columns
selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
result = store.select(
"df", "ts>=Timestamp('2012-02-01') and users=selector"
)
expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)]
tm.assert_frame_equal(expected, result)
selector = range(100, 200)
result = store.select("df", "B=selector")
expected = df[df.B.isin(selector)]
tm.assert_frame_equal(expected, result)
assert len(result) == 100
# big selector along the index
selector = Index(df.ts[0:100].values)
result = store.select("df", "ts=selector")
expected = df[df.ts.isin(selector.values)]
tm.assert_frame_equal(expected, result)
assert len(result) == 100
def test_select_iterator(self, setup_path):
# single table
with ensure_clean_store(setup_path) as store:
df = tm.makeTimeDataFrame(500)
_maybe_remove(store, "df")
store.append("df", df)
expected = store.select("df")
results = list(store.select("df", iterator=True))
result = concat(results)
tm.assert_frame_equal(expected, result)
results = list(store.select("df", chunksize=100))
assert len(results) == 5
result = concat(results)
tm.assert_frame_equal(expected, result)
results = list(store.select("df", chunksize=150))
result = concat(results)
tm.assert_frame_equal(result, expected)
with ensure_clean_path(setup_path) as path:
df = tm.makeTimeDataFrame(500)
df.to_hdf(path, "df_non_table")
with pytest.raises(TypeError):
read_hdf(path, "df_non_table", chunksize=100)
with pytest.raises(TypeError):
read_hdf(path, "df_non_table", iterator=True)
with ensure_clean_path(setup_path) as path:
df = tm.makeTimeDataFrame(500)
df.to_hdf(path, "df", format="table")
results = list(read_hdf(path, "df", chunksize=100))
result = concat(results)
assert len(results) == 5
tm.assert_frame_equal(result, df)
tm.assert_frame_equal(result, read_hdf(path, "df"))
# multiple
with ensure_clean_store(setup_path) as store:
df1 = tm.makeTimeDataFrame(500)
store.append("df1", df1, data_columns=True)
df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format)
df2["foo"] = "bar"
store.append("df2", df2)
df = concat([df1, df2], axis=1)
# full selection
expected = store.select_as_multiple(["df1", "df2"], selector="df1")
results = list(
store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150)
)
result = concat(results)
tm.assert_frame_equal(expected, result)
def test_select_iterator_complete_8014(self, setup_path):
# GH 8014
# using iterator and where clause
chunksize = 1e4
# no iterator
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[-1]
# select w/o iteration and no where clause works
result = store.select("df")
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, single term, begin
# of range, works
where = f"index >= '{beg_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, single term, end
# of range, works
where = f"index <= '{end_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, inclusive range,
# works
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# with iterator, full range
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[-1]
# select w/iterator and no where clause works
results = list(store.select("df", chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
def test_select_iterator_non_complete_8014(self, setup_path):
# GH 8014
# using iterator and where clause
chunksize = 1e4
# with iterator, non complete range
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[1]
end_dt = expected.index[-2]
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index >= beg_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index <= end_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[
(expected.index >= beg_dt) & (expected.index <= end_dt)
]
tm.assert_frame_equal(rexpected, result)
# with iterator, empty where
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
end_dt = expected.index[-1]
# select w/iterator and where clause, single term, begin of range
where = f"index > '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
assert 0 == len(results)
def test_select_iterator_many_empty_frames(self, setup_path):
# GH 8014
# using iterator and where clause can return many empty
# frames.
chunksize = int(1e4)
# with iterator, range limited to the first chunk
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100000, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[chunksize - 1]
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index >= beg_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
assert len(results) == 1
result = concat(results)
rexpected = expected[expected.index <= end_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
# should be 1, is 10
assert len(results) == 1
result = concat(results)
rexpected = expected[
(expected.index >= beg_dt) & (expected.index <= end_dt)
]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause which selects
# *nothing*.
#
# To be consistent with Python idiom I suggest this should
# return [] e.g. `for e in []: print True` never prints
# True.
where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
# should be []
assert len(results) == 0
@pytest.mark.filterwarnings(
"ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
)
def test_retain_index_attributes(self, setup_path):
# GH 3499, losing frequency info on index recreation
df = DataFrame(
{"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))}
)
with ensure_clean_store(setup_path) as store:
_maybe_remove(store, "data")
store.put("data", df, format="table")
result = store.get("data")
tm.assert_frame_equal(df, result)
for attr in ["freq", "tz", "name"]:
for idx in ["index", "columns"]:
assert getattr(getattr(df, idx), attr, None) == getattr(
getattr(result, idx), attr, None
)
# try to append a table with a different frequency
with catch_warnings(record=True):
df2 = DataFrame(
{
"A": Series(
range(3), index=date_range("2002-1-1", periods=3, freq="D")
)
}
)
store.append("data", df2)
assert store.get_storer("data").info["index"]["freq"] is None
# this is ok
_maybe_remove(store, "df2")
df2 = DataFrame(
{
"A": Series(
range(3),
index=[
Timestamp("20010101"),
Timestamp("20010102"),
Timestamp("20020101"),
],
)
}
)
store.append("df2", df2)
df3 = DataFrame(
{
"A": Series(
range(3), index=date_range("2002-1-1", periods=3, freq="D")
)
}
)
store.append("df2", df3)
@pytest.mark.filterwarnings(
"ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
)
def test_retain_index_attributes2(self, setup_path):
with ensure_clean_path(setup_path) as path:
with catch_warnings(record=True):
df = DataFrame(
{
"A": Series(
range(3), index=date_range("2000-1-1", periods=3, freq="H")
)
}
)
df.to_hdf(path, "data", mode="w", append=True)
df2 = DataFrame(
{
"A": Series(
range(3), index=date_range("2002-1-1", periods=3, freq="D")
)
}
)
df2.to_hdf(path, "data", append=True)
idx = date_range("2000-1-1", periods=3, freq="H")
idx.name = "foo"
df = DataFrame({"A": Series(range(3), index=idx)})
df.to_hdf(path, "data", mode="w", append=True)
assert read_hdf(path, "data").index.name == "foo"
with catch_warnings(record=True):
idx2 = date_range("2001-1-1", periods=3, freq="H")
idx2.name = "bar"
df2 = DataFrame({"A": Series(range(3), index=idx2)})
df2.to_hdf(path, "data", append=True)
assert read_hdf(path, "data").index.name is None
def test_frame_select(self, setup_path):
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
store.put("frame", df, format="table")
date = df.index[len(df) // 2]
crit1 = Term("index>=date")
assert crit1.env.scope["date"] == date
crit2 = "columns=['A', 'D']"
crit3 = "columns=A"
result = store.select("frame", [crit1, crit2])
expected = df.loc[date:, ["A", "D"]]
tm.assert_frame_equal(result, expected)
result = store.select("frame", [crit3])
expected = df.loc[:, ["A"]]
tm.assert_frame_equal(result, expected)
# invalid terms
df = tm.makeTimeDataFrame()
store.append("df_time", df)
with pytest.raises(ValueError):
store.select("df_time", "index>0")
# can't select if not written as table
# store['frame'] = df
# with pytest.raises(ValueError):
# store.select('frame', [crit1, crit2])
def test_frame_select_complex(self, setup_path):
# select via complex criteria
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df.loc[df.index[0:4], "string"] = "bar"
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table", data_columns=["string"])
# empty
result = store.select("df", 'index>df.index[3] & string="bar"')
expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
tm.assert_frame_equal(result, expected)
result = store.select("df", 'index>df.index[3] & string="foo"')
expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
tm.assert_frame_equal(result, expected)
# or
result = store.select("df", 'index>df.index[3] | string="bar"')
expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
tm.assert_frame_equal(result, expected)
result = store.select(
"df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
)
expected = df.loc[
((df.index > df.index[3]) & (df.index <= df.index[6]))
| (df.string == "bar")
]
tm.assert_frame_equal(result, expected)
# invert
result = store.select("df", 'string!="bar"')
expected = df.loc[df.string != "bar"]
tm.assert_frame_equal(result, expected)
# invert not implemented in numexpr :(
with pytest.raises(NotImplementedError):
store.select("df", '~(string="bar")')
# invert ok for filters
result = store.select("df", "~(columns=['A','B'])")
expected = df.loc[:, df.columns.difference(["A", "B"])]
tm.assert_frame_equal(result, expected)
# in
result = store.select("df", "index>df.index[3] & columns in ['A','B']")
expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
tm.assert_frame_equal(result, expected)
def test_frame_select_complex2(self, setup_path):
with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths:
pp, hh = paths
# use non-trivial selection criteria
parms = DataFrame({"A": [1, 1, 2, 2, 3]})
parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])
selection = read_hdf(pp, "df", where="A=[2,3]")
hist = DataFrame(
np.random.randn(25, 1),
columns=["data"],
index=MultiIndex.from_tuples(
[(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
),
)
hist.to_hdf(hh, "df", mode="w", format="table")
expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")
# scope with list like
l = selection.index.tolist() # noqa
store = HDFStore(hh)
result = store.select("df", where="l1=l")
tm.assert_frame_equal(result, expected)
store.close()
result = read_hdf(hh, "df", where="l1=l")
tm.assert_frame_equal(result, expected)
# index
index = selection.index # noqa
result = read_hdf(hh, "df", where="l1=index")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=selection.index")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=list(selection.index)")
tm.assert_frame_equal(result, expected)
# scope with index
store = HDFStore(hh)
result = store.select("df", where="l1=index")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=selection.index")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=selection.index.tolist()")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=list(selection.index)")
tm.assert_frame_equal(result, expected)
store.close()
def test_invalid_filtering(self, setup_path):
# can't use more than one filter (atm)
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table")
# not implemented
with pytest.raises(NotImplementedError):
store.select("df", "columns=['A'] | columns=['B']")
# in theory we could deal with this
with pytest.raises(NotImplementedError):
store.select("df", "columns=['A','B'] & columns=['C']")
def test_string_select(self, setup_path):
# GH 2973
with ensure_clean_store(setup_path) as store:
df = tm.makeTimeDataFrame()
# test string ==/!=
df["x"] = "none"
df.loc[df.index[2:7], "x"] = ""
store.append("df", df, data_columns=["x"])
result = store.select("df", "x=none")
expected = df[df.x == "none"]
tm.assert_frame_equal(result, expected)
result = store.select("df", "x!=none")
expected = df[df.x != "none"]
tm.assert_frame_equal(result, expected)
df2 = df.copy()
df2.loc[df2.x == "", "x"] = np.nan
store.append("df2", df2, data_columns=["x"])
result = store.select("df2", "x!=none")
expected = df2[isna(df2.x)]
tm.assert_frame_equal(result, expected)
# int ==/!=
df["int"] = 1
df.loc[df.index[2:7], "int"] = 2
store.append("df3", df, data_columns=["int"])
result = store.select("df3", "int=2")
expected = df[df.int == 2]
tm.assert_frame_equal(result, expected)
result = store.select("df3", "int!=2")
expected = df[df.int != 2]
tm.assert_frame_equal(result, expected)
def test_read_column(self, setup_path):
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
_maybe_remove(store, "df")
# GH 17912
# HDFStore.select_column should raise a KeyError
# exception if the key is not a valid store
with pytest.raises(KeyError, match="No object named df in the file"):
store.select_column("df", "index")
store.append("df", df)
# error
with pytest.raises(
KeyError, match=re.escape("'column [foo] not found in the table'")
):
store.select_column("df", "foo")
with pytest.raises(Exception):
store.select_column("df", "index", where=["index>5"])
# valid
result = store.select_column("df", "index")
tm.assert_almost_equal(result.values, Series(df.index).values)
assert isinstance(result, Series)
# not a data indexable column
with pytest.raises(ValueError):
store.select_column("df", "values_block_0")
# a data column
df2 = df.copy()
df2["string"] = "foo"
store.append("df2", df2, data_columns=["string"])
result = store.select_column("df2", "string")
tm.assert_almost_equal(result.values, df2["string"].values)
# a data column with NaNs, result excludes the NaNs
df3 = df.copy()
df3["string"] = "foo"
df3.loc[df3.index[4:6], "string"] = np.nan
store.append("df3", df3, data_columns=["string"])
result = store.select_column("df3", "string")
tm.assert_almost_equal(result.values, df3["string"].values)
# start/stop
result = store.select_column("df3", "string", start=2)
tm.assert_almost_equal(result.values, df3["string"].values[2:])
result = store.select_column("df3", "string", start=-2)
tm.assert_almost_equal(result.values, df3["string"].values[-2:])
result = store.select_column("df3", "string", stop=2)
tm.assert_almost_equal(result.values, df3["string"].values[:2])
result = store.select_column("df3", "string", stop=-2)
tm.assert_almost_equal(result.values, df3["string"].values[:-2])
result = store.select_column("df3", "string", start=2, stop=-2)
tm.assert_almost_equal(result.values, df3["string"].values[2:-2])
result = store.select_column("df3", "string", start=-2, stop=2)
tm.assert_almost_equal(result.values, df3["string"].values[-2:2])
# GH 10392 - make sure column name is preserved
df4 = DataFrame({"A": np.random.randn(10), "B": "foo"})
store.append("df4", df4, data_columns=True)
expected = df4["B"]
result = store.select_column("df4", "B")
tm.assert_series_equal(result, expected)
def test_coordinates(self, setup_path):
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
_maybe_remove(store, "df")
store.append("df", df)
# all
c = store.select_as_coordinates("df")
assert (c.values == np.arange(len(df.index))).all()
# get coordinates back & test vs frame
_maybe_remove(store, "df")
df = DataFrame({"A": range(5), "B": range(5)})
store.append("df", df)
c = store.select_as_coordinates("df", ["index<3"])
assert (c.values == np.arange(3)).all()
result = store.select("df", where=c)
expected = df.loc[0:2, :]
tm.assert_frame_equal(result, expected)
c = store.select_as_coordinates("df", ["index>=3", "index<=4"])
assert (c.values == np.arange(2) + 3).all()
result = store.select("df", where=c)
expected = df.loc[3:4, :]
tm.assert_frame_equal(result, expected)
assert isinstance(c, Index)
# multiple tables
_maybe_remove(store, "df1")
_maybe_remove(store, "df2")
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
store.append("df1", df1, data_columns=["A", "B"])
store.append("df2", df2)
c = store.select_as_coordinates("df1", ["A>0", "B>0"])
df1_result = store.select("df1", c)
df2_result = store.select("df2", c)
result = concat([df1_result, df2_result], axis=1)
expected = concat([df1, df2], axis=1)
expected = expected[(expected.A > 0) & (expected.B > 0)]
tm.assert_frame_equal(result, expected)
# pass array/mask as the coordinates
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.randn(1000, 2), index=date_range("20000101", periods=1000)
)
store.append("df", df)
c = store.select_column("df", "index")
where = c[DatetimeIndex(c).month == 5].index
expected = df.iloc[where]
# locations
result = store.select("df", where=where)
tm.assert_frame_equal(result, expected)
# boolean
result = store.select("df", where=where)
tm.assert_frame_equal(result, expected)
# invalid
with pytest.raises(ValueError):
store.select("df", where=np.arange(len(df), dtype="float64"))
with pytest.raises(ValueError):
store.select("df", where=np.arange(len(df) + 1))
with pytest.raises(ValueError):
store.select("df", where=np.arange(len(df)), start=5)
with pytest.raises(ValueError):
store.select("df", where=np.arange(len(df)), start=5, stop=10)
# selection with filter
selection = date_range("20000101", periods=500)
result = store.select("df", where="index in selection")
expected = df[df.index.isin(selection)]
tm.assert_frame_equal(result, expected)
# list
df = DataFrame(np.random.randn(10, 2))
store.append("df2", df)
result = store.select("df2", where=[0, 3, 5])
expected = df.iloc[[0, 3, 5]]
tm.assert_frame_equal(result, expected)
# boolean
where = [True] * 10
where[-2] = False
result = store.select("df2", where=where)
expected = df.loc[where]
tm.assert_frame_equal(result, expected)
# start/stop
result = store.select("df2", start=5, stop=10)
expected = df[5:10]
tm.assert_frame_equal(result, expected)
def test_append_to_multiple(self, setup_path):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
df2["foo"] = "bar"
df = concat([df1, df2], axis=1)
with ensure_clean_store(setup_path) as store:
# exceptions
with pytest.raises(ValueError):
store.append_to_multiple(
{"df1": ["A", "B"], "df2": None}, df, selector="df3"
)
with pytest.raises(ValueError):
store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3")
with pytest.raises(ValueError):
store.append_to_multiple("df1", df, "df1")
# regular operation
store.append_to_multiple(
{"df1": ["A", "B"], "df2": None}, df, selector="df1"
)
result = store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df1"
)
expected = df[(df.A > 0) & (df.B > 0)]
tm.assert_frame_equal(result, expected)
def test_append_to_multiple_dropna(self, setup_path):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
df = concat([df1, df2], axis=1)
with ensure_clean_store(setup_path) as store:
# dropna=True should guarantee rows are synchronized
store.append_to_multiple(
{"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
)
result = store.select_as_multiple(["df1", "df2"])
expected = df.dropna()
tm.assert_frame_equal(result, expected)
tm.assert_index_equal(store.select("df1").index, store.select("df2").index)
@pytest.mark.xfail(
run=False, reason="append_to_multiple_dropna_false is not raising as failed"
)
def test_append_to_multiple_dropna_false(self, setup_path):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
df = concat([df1, df2], axis=1)
with ensure_clean_store(setup_path) as store:
# dropna=False shouldn't synchronize row indexes
store.append_to_multiple(
{"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
)
with pytest.raises(ValueError):
store.select_as_multiple(["df1a", "df2a"])
assert not store.select("df1a").index.equals(store.select("df2a").index)
def test_append_to_multiple_min_itemsize(self, setup_path):
# GH 11238
df = DataFrame(
{
"IX": np.arange(1, 21),
"Num": np.arange(1, 21),
"BigNum": np.arange(1, 21) * 88,
"Str": ["a" for _ in range(20)],
"LongStr": ["abcde" for _ in range(20)],
}
)
expected = df.iloc[[0]]
with ensure_clean_store(setup_path) as store:
store.append_to_multiple(
{
"index": ["IX"],
"nums": ["Num", "BigNum"],
"strs": ["Str", "LongStr"],
},
df.iloc[[0]],
"index",
min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
)
result = store.select_as_multiple(["index", "nums", "strs"])
tm.assert_frame_equal(result, expected)
def test_select_as_multiple(self, setup_path):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
df2["foo"] = "bar"
with ensure_clean_store(setup_path) as store:
# no tables stored
with pytest.raises(Exception):
store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
store.append("df1", df1, data_columns=["A", "B"])
store.append("df2", df2)
# exceptions
with pytest.raises(Exception):
store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
with pytest.raises(Exception):
store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1")
msg = "'No object named df3 in the file'"
with pytest.raises(KeyError, match=msg):
store.select_as_multiple(
["df1", "df3"], where=["A>0", "B>0"], selector="df1"
)
with pytest.raises(KeyError, match=msg):
store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1")
with pytest.raises(KeyError, match="'No object named df4 in the file'"):
store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df4"
)
# default select
result = store.select("df1", ["A>0", "B>0"])
expected = store.select_as_multiple(
["df1"], where=["A>0", "B>0"], selector="df1"
)
tm.assert_frame_equal(result, expected)
expected = store.select_as_multiple(
"df1", where=["A>0", "B>0"], selector="df1"
)
tm.assert_frame_equal(result, expected)
# multiple
result = store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df1"
)
expected = concat([df1, df2], axis=1)
expected = expected[(expected.A > 0) & (expected.B > 0)]
tm.assert_frame_equal(result, expected)
# multiple (diff selector)
result = store.select_as_multiple(
["df1", "df2"], where="index>df2.index[4]", selector="df2"
)
expected = concat([df1, df2], axis=1)
expected = expected[5:]
tm.assert_frame_equal(result, expected)
# test exception for diff rows
store.append("df3", tm.makeTimeDataFrame(nper=50))
with pytest.raises(ValueError):
store.select_as_multiple(
["df1", "df3"], where=["A>0", "B>0"], selector="df1"
)
@pytest.mark.skipif(
LooseVersion(tables.__version__) < LooseVersion("3.1.0"),
reason=("tables version does not support fix for nan selection bug: GH 4858"),
)
def test_nan_selection_bug_4858(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[0] = np.nan
expected = DataFrame(
{"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
index=[3, 4, 5],
)
# write w/o the index on that particular column
store.append("df", df, data_columns=True, index=["cols"])
result = store.select("df", where="values>2.0")
tm.assert_frame_equal(result, expected)
def test_start_stop_table(self, setup_path):
with ensure_clean_store(setup_path) as store:
# table
df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
store.append("df", df)
result = store.select("df", "columns=['A']", start=0, stop=5)
expected = df.loc[0:4, ["A"]]
tm.assert_frame_equal(result, expected)
# out of range
result = store.select("df", "columns=['A']", start=30, stop=40)
assert len(result) == 0
expected = df.loc[30:40, ["A"]]
tm.assert_frame_equal(result, expected)
def test_start_stop_multiple(self, setup_path):
# GH 16209
with ensure_clean_store(setup_path) as store:
df = DataFrame({"foo": [1, 2], "bar": [1, 2]})
store.append_to_multiple(
{"selector": ["foo"], "data": None}, df, selector="selector"
)
result = store.select_as_multiple(
["selector", "data"], selector="selector", start=0, stop=1
)
expected = df.loc[[0], ["foo", "bar"]]
tm.assert_frame_equal(result, expected)
def test_start_stop_fixed(self, setup_path):
with ensure_clean_store(setup_path) as store:
# fixed, GH 8287
df = DataFrame(
{"A": np.random.rand(20), "B": np.random.rand(20)},
index=pd.date_range("20130101", periods=20),
)
store.put("df", df)
result = store.select("df", start=0, stop=5)
expected = df.iloc[0:5, :]
tm.assert_frame_equal(result, expected)
result = store.select("df", start=5, stop=10)
expected = df.iloc[5:10, :]
tm.assert_frame_equal(result, expected)
# out of range
result = store.select("df", start=30, stop=40)
expected = df.iloc[30:40, :]
tm.assert_frame_equal(result, expected)
# series
s = df.A
store.put("s", s)
result = store.select("s", start=0, stop=5)
expected = s.iloc[0:5]
tm.assert_series_equal(result, expected)
result = store.select("s", start=5, stop=10)
expected = s.iloc[5:10]
tm.assert_series_equal(result, expected)
# sparse; not implemented
df = tm.makeDataFrame()
df.iloc[3:5, 1:3] = np.nan
df.iloc[8:10, -2] = np.nan
def test_select_filter_corner(self, setup_path):
df = DataFrame(np.random.randn(50, 100))
df.index = [f"{c:3d}" for c in df.index]
df.columns = [f"{c:3d}" for c in df.columns]
with ensure_clean_store(setup_path) as store:
store.put("frame", df, format="table")
crit = "columns=df.columns[:75]"
result = store.select("frame", [crit])
tm.assert_frame_equal(result, df.loc[:, df.columns[:75]])
crit = "columns=df.columns[:75:2]"
result = store.select("frame", [crit])
tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])
def test_path_pathlib(self, setup_path):
df = tm.makeDataFrame()
result = tm.round_trip_pathlib(
lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df")
)
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)])
def test_contiguous_mixed_data_table(self, start, stop, setup_path):
# GH 17021
# ValueError when reading a contiguous mixed-data table ft. VLArray
df = DataFrame(
{
"a": Series([20111010, 20111011, 20111012]),
"b": Series(["ab", "cd", "ab"]),
}
)
with ensure_clean_store(setup_path) as store:
store.append("test_dataset", df)
result = store.select("test_dataset", start=start, stop=stop)
tm.assert_frame_equal(df[start:stop], result)
def test_path_pathlib_hdfstore(self, setup_path):
df = tm.makeDataFrame()
def writer(path):
with HDFStore(path) as store:
df.to_hdf(store, "df")
def reader(path):
with HDFStore(path) as store:
return pd.read_hdf(store, "df")
result = tm.round_trip_pathlib(writer, reader)
tm.assert_frame_equal(df, result)
def test_pickle_path_localpath(self, setup_path):
df = tm.makeDataFrame()
result = tm.round_trip_pathlib(
lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df")
)
tm.assert_frame_equal(df, result)
def test_path_localpath_hdfstore(self, setup_path):
df = tm.makeDataFrame()
def writer(path):
with HDFStore(path) as store:
df.to_hdf(store, "df")
def reader(path):
with HDFStore(path) as store:
return pd.read_hdf(store, "df")
result = tm.round_trip_localpath(writer, reader)
tm.assert_frame_equal(df, result)
def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs):
options = {}
if compression:
options["complib"] = _default_compressor
with ensure_clean_store(path, "w", **options) as store:
store["obj"] = obj
retrieved = store["obj"]
comparator(retrieved, obj, **kwargs)
def _check_double_roundtrip(
self, obj, comparator, path, compression=False, **kwargs
):
options = {}
if compression:
options["complib"] = compression or _default_compressor
with ensure_clean_store(path, "w", **options) as store:
store["obj"] = obj
retrieved = store["obj"]
comparator(retrieved, obj, **kwargs)
store["obj"] = retrieved
again = store["obj"]
comparator(again, obj, **kwargs)
def _check_roundtrip_table(self, obj, comparator, path, compression=False):
options = {}
if compression:
options["complib"] = _default_compressor
with ensure_clean_store(path, "w", **options) as store:
store.put("obj", obj, format="table")
retrieved = store["obj"]
comparator(retrieved, obj)
def test_multiple_open_close(self, setup_path):
# gh-4409: open & close multiple times
with ensure_clean_path(setup_path) as path:
df = tm.makeDataFrame()
df.to_hdf(path, "df", mode="w", format="table")
# single
store = HDFStore(path)
assert "CLOSED" not in store.info()
assert store.is_open
store.close()
assert "CLOSED" in store.info()
assert not store.is_open
with ensure_clean_path(setup_path) as path:
if pytables._table_file_open_policy_is_strict:
# multiples
store1 = HDFStore(path)
with pytest.raises(ValueError):
HDFStore(path)
store1.close()
else:
# multiples
store1 = HDFStore(path)
store2 = HDFStore(path)
assert "CLOSED" not in store1.info()
assert "CLOSED" not in store2.info()
assert store1.is_open
assert store2.is_open
store1.close()
assert "CLOSED" in store1.info()
assert not store1.is_open
assert "CLOSED" not in store2.info()
assert store2.is_open
store2.close()
assert "CLOSED" in store1.info()
assert "CLOSED" in store2.info()
assert not store1.is_open
assert not store2.is_open
# nested close
store = HDFStore(path, mode="w")
store.append("df", df)
store2 = HDFStore(path)
store2.append("df2", df)
store2.close()
assert "CLOSED" in store2.info()
assert not store2.is_open
store.close()
assert "CLOSED" in store.info()
assert not store.is_open
# double closing
store = HDFStore(path, mode="w")
store.append("df", df)
store2 = HDFStore(path)
store.close()
assert "CLOSED" in store.info()
assert not store.is_open
store2.close()
assert "CLOSED" in store2.info()
assert not store2.is_open
# ops on a closed store
with ensure_clean_path(setup_path) as path:
df = tm.makeDataFrame()
df.to_hdf(path, "df", mode="w", format="table")
store = HDFStore(path)
store.close()
with pytest.raises(ClosedFileError):
store.keys()
with pytest.raises(ClosedFileError):
"df" in store
with pytest.raises(ClosedFileError):
len(store)
with pytest.raises(ClosedFileError):
store["df"]
with pytest.raises(AttributeError):
store.df
with pytest.raises(ClosedFileError):
store.select("df")
with pytest.raises(ClosedFileError):
store.get("df")
with pytest.raises(ClosedFileError):
store.append("df2", df)
with pytest.raises(ClosedFileError):
store.put("df3", df)
with pytest.raises(ClosedFileError):
store.get_storer("df2")
with pytest.raises(ClosedFileError):
store.remove("df2")
with pytest.raises(ClosedFileError, match="file is not open"):
store.select("df")
def test_pytables_native_read(self, datapath, setup_path):
with ensure_clean_store(
datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r"
) as store:
d2 = store["detector/readout"]
assert isinstance(d2, DataFrame)
@pytest.mark.skipif(
is_platform_windows(), reason="native2 read fails oddly on windows"
)
def test_pytables_native2_read(self, datapath, setup_path):
with ensure_clean_store(
datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r"
) as store:
str(store)
d1 = store["detector"]
assert isinstance(d1, DataFrame)
def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path):
# GH 24510
# legacy table with fixed format written in Python 2
with ensure_clean_store(
datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
) as store:
result = store.select("df")
expected = DataFrame(
[[1, 2, 3, "D"]],
columns=["A", "B", "C", "D"],
index=Index(["ABC"], name="INDEX_NAME"),
)
tm.assert_frame_equal(expected, result)
def test_legacy_table_fixed_format_read_datetime_py2(self, datapath, setup_path):
# GH 31750
# legacy table with fixed format and datetime64 column written in Python 2
with ensure_clean_store(
datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
mode="r",
) as store:
result = store.select("df")
expected = DataFrame(
[[Timestamp("2020-02-06T18:00")]],
columns=["A"],
index=Index(["date"]),
)
tm.assert_frame_equal(expected, result)
def test_legacy_table_read_py2(self, datapath, setup_path):
# issue: 24925
# legacy table written in Python 2
with ensure_clean_store(
datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
) as store:
result = store.select("table")
expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
tm.assert_frame_equal(expected, result)
def test_copy(self, setup_path):
with catch_warnings(record=True):
def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs):
try:
store = HDFStore(f, "r")
if new_f is None:
import tempfile
fd, new_f = tempfile.mkstemp()
tstore = store.copy(
new_f, keys=keys, propindexes=propindexes, **kwargs
)
# check keys
if keys is None:
keys = store.keys()
assert set(keys) == set(tstore.keys())
# check indices & nrows
for k in tstore.keys():
if tstore.get_storer(k).is_table:
new_t = tstore.get_storer(k)
orig_t = store.get_storer(k)
assert orig_t.nrows == new_t.nrows
# check propindixes
if propindexes:
for a in orig_t.axes:
if a.is_indexed:
assert new_t[a.name].is_indexed
finally:
safe_close(store)
safe_close(tstore)
try:
os.close(fd)
except (OSError, ValueError):
pass
os.remove(new_f)
# new table
df = tm.makeDataFrame()
with tm.ensure_clean() as path:
st = HDFStore(path)
st.append("df", df, data_columns=["A"])
st.close()
do_copy(f=path)
do_copy(f=path, propindexes=False)
def test_store_datetime_fractional_secs(self, setup_path):
with ensure_clean_store(setup_path) as store:
dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
series = Series([0], [dt])
store["a"] = series
assert store["a"].index[0] == dt
def test_tseries_indices_series(self, setup_path):
with ensure_clean_store(setup_path) as store:
idx = tm.makeDateIndex(10)
ser = Series(np.random.randn(len(idx)), idx)
store["a"] = ser
result = store["a"]
tm.assert_series_equal(result, ser)
assert result.index.freq == ser.index.freq
tm.assert_class_equal(result.index, ser.index, obj="series index")
idx = tm.makePeriodIndex(10)
ser = Series(np.random.randn(len(idx)), idx)
store["a"] = ser
result = store["a"]
tm.assert_series_equal(result, ser)
assert result.index.freq == ser.index.freq
tm.assert_class_equal(result.index, ser.index, obj="series index")
def test_tseries_indices_frame(self, setup_path):
with ensure_clean_store(setup_path) as store:
idx = tm.makeDateIndex(10)
df = DataFrame(np.random.randn(len(idx), 3), index=idx)
store["a"] = df
result = store["a"]
tm.assert_frame_equal(result, df)
assert result.index.freq == df.index.freq
tm.assert_class_equal(result.index, df.index, obj="dataframe index")
idx = tm.makePeriodIndex(10)
df = DataFrame(np.random.randn(len(idx), 3), idx)
store["a"] = df
result = store["a"]
tm.assert_frame_equal(result, df)
assert result.index.freq == df.index.freq
tm.assert_class_equal(result.index, df.index, obj="dataframe index")
def test_unicode_index(self, setup_path):
unicode_values = ["\u03c3", "\u03c3\u03c3"]
# PerformanceWarning
with catch_warnings(record=True):
simplefilter("ignore", pd.errors.PerformanceWarning)
s = Series(np.random.randn(len(unicode_values)), unicode_values)
self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
def test_unicode_longer_encoded(self, setup_path):
# GH 11234
char = "\u0394"
df = DataFrame({"A": [char]})
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table", encoding="utf-8")
result = store.get("df")
tm.assert_frame_equal(result, df)
df = DataFrame({"A": ["a", char], "B": ["b", "b"]})
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table", encoding="utf-8")
result = store.get("df")
tm.assert_frame_equal(result, df)
def test_store_datetime_mixed(self, setup_path):
df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
ts = tm.makeTimeSeries()
df["d"] = ts.index[:3]
self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path)
# FIXME: don't leave commented-out code
# def test_cant_write_multiindex_table(self):
# # for now, #1848
# df = DataFrame(np.random.randn(10, 4),
# index=[np.arange(5).repeat(2),
# np.tile(np.arange(2), 5)])
#
# with pytest.raises(Exception):
# store.put('foo', df, format='table')
def test_append_with_diff_col_name_types_raises_value_error(self, setup_path):
df = DataFrame(np.random.randn(10, 1))
df2 = DataFrame({"a": np.random.randn(10)})
df3 = DataFrame({(1, 2): np.random.randn(10)})
df4 = DataFrame({("1", 2): np.random.randn(10)})
df5 = DataFrame({("1", 2, object): np.random.randn(10)})
with ensure_clean_store(setup_path) as store:
name = f"df_{tm.rands(10)}"
store.append(name, df)
for d in (df2, df3, df4, df5):
with pytest.raises(ValueError):
store.append(name, d)
def test_query_with_nested_special_character(self, setup_path):
df = DataFrame(
{
"a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
"b": [1, 2, 3, 4, 5, 6, 7, 8],
}
)
expected = df[df.a == "test & test"]
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
result = store.select("test", 'a = "test & test"')
tm.assert_frame_equal(expected, result)
def test_categorical(self, setup_path):
with ensure_clean_store(setup_path) as store:
# Basic
_maybe_remove(store, "s")
s = Series(
Categorical(
["a", "b", "b", "a", "a", "c"],
categories=["a", "b", "c", "d"],
ordered=False,
)
)
store.append("s", s, format="table")
result = store.select("s")
tm.assert_series_equal(s, result)
_maybe_remove(store, "s_ordered")
s = Series(
Categorical(
["a", "b", "b", "a", "a", "c"],
categories=["a", "b", "c", "d"],
ordered=True,
)
)
store.append("s_ordered", s, format="table")
result = store.select("s_ordered")
tm.assert_series_equal(s, result)
_maybe_remove(store, "df")
df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
store.append("df", df, format="table")
result = store.select("df")
tm.assert_frame_equal(result, df)
# Dtypes
_maybe_remove(store, "si")
s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
store.append("si", s)
result = store.select("si")
tm.assert_series_equal(result, s)
_maybe_remove(store, "si2")
s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
store.append("si2", s)
result = store.select("si2")
tm.assert_series_equal(result, s)
# Multiple
_maybe_remove(store, "df2")
df2 = df.copy()
df2["s2"] = Series(list("abcdefg")).astype("category")
store.append("df2", df2)
result = store.select("df2")
tm.assert_frame_equal(result, df2)
# Make sure the metadata is OK
info = store.info()
assert "/df2 " in info
# assert '/df2/meta/values_block_0/meta' in info
assert "/df2/meta/values_block_1/meta" in info
# unordered
_maybe_remove(store, "s2")
s = Series(
Categorical(
["a", "b", "b", "a", "a", "c"],
categories=["a", "b", "c", "d"],
ordered=False,
)
)
store.append("s2", s, format="table")
result = store.select("s2")
tm.assert_series_equal(result, s)
# Query
_maybe_remove(store, "df3")
store.append("df3", df, data_columns=["s"])
expected = df[df.s.isin(["b", "c"])]
result = store.select("df3", where=['s in ["b","c"]'])
tm.assert_frame_equal(result, expected)
expected = df[df.s.isin(["b", "c"])]
result = store.select("df3", where=['s = ["b","c"]'])
tm.assert_frame_equal(result, expected)
expected = df[df.s.isin(["d"])]
result = store.select("df3", where=['s in ["d"]'])
tm.assert_frame_equal(result, expected)
expected = df[df.s.isin(["f"])]
result = store.select("df3", where=['s in ["f"]'])
tm.assert_frame_equal(result, expected)
# Appending with same categories is ok
store.append("df3", df)
df = concat([df, df])
expected = df[df.s.isin(["b", "c"])]
result = store.select("df3", where=['s in ["b","c"]'])
tm.assert_frame_equal(result, expected)
# Appending must have the same categories
df3 = df.copy()
df3["s"] = df3["s"].cat.remove_unused_categories()
with pytest.raises(ValueError):
store.append("df3", df3)
# Remove, and make sure meta data is removed (its a recursive
# removal so should be).
result = store.select("df3/meta/s/meta")
assert result is not None
store.remove("df3")
with pytest.raises(
KeyError, match="'No object named df3/meta/s/meta in the file'"
):
store.select("df3/meta/s/meta")
def test_categorical_conversion(self, setup_path):
# GH13322
# Check that read_hdf with categorical columns doesn't return rows if
# where criteria isn't met.
obsids = ["ESP_012345_6789", "ESP_987654_3210"]
imgids = ["APF00006np", "APF0001imm"]
data = [4.3, 9.8]
# Test without categories
df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})
# We are expecting an empty DataFrame matching types of df
expected = df.iloc[[], :]
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", format="table", data_columns=True)
result = read_hdf(path, "df", where="obsids=B")
tm.assert_frame_equal(result, expected)
# Test with categories
df.obsids = df.obsids.astype("category")
df.imgids = df.imgids.astype("category")
# We are expecting an empty DataFrame matching types of df
expected = df.iloc[[], :]
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", format="table", data_columns=True)
result = read_hdf(path, "df", where="obsids=B")
tm.assert_frame_equal(result, expected)
def test_categorical_nan_only_columns(self, setup_path):
# GH18413
# Check that read_hdf with categorical columns with NaN-only values can
# be read back.
df = DataFrame(
{
"a": ["a", "b", "c", np.nan],
"b": [np.nan, np.nan, np.nan, np.nan],
"c": [1, 2, 3, 4],
"d": Series([None] * 4, dtype=object),
}
)
df["a"] = df.a.astype("category")
df["b"] = df.b.astype("category")
df["d"] = df.b.astype("category")
expected = df
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", format="table", data_columns=True)
result = read_hdf(path, "df")
tm.assert_frame_equal(result, expected)
def test_duplicate_column_name(self, setup_path):
df = DataFrame(columns=["a", "a"], data=[[0, 0]])
with ensure_clean_path(setup_path) as path:
with pytest.raises(ValueError):
df.to_hdf(path, "df", format="fixed")
df.to_hdf(path, "df", format="table")
other = read_hdf(path, "df")
tm.assert_frame_equal(df, other)
assert df.equals(other)
assert other.equals(df)
def test_round_trip_equals(self, setup_path):
# GH 9330
df = DataFrame({"B": [1, 2], "A": ["x", "y"]})
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", format="table")
other = read_hdf(path, "df")
tm.assert_frame_equal(df, other)
assert df.equals(other)
assert other.equals(df)
def test_preserve_timedeltaindex_type(self, setup_path):
# GH9635
# Storing TimedeltaIndexed DataFrames in fixed stores did not preserve
# the type of the index.
df = DataFrame(np.random.normal(size=(10, 5)))
df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example")
with ensure_clean_store(setup_path) as store:
store["df"] = df
tm.assert_frame_equal(store["df"], df)
def test_columns_multiindex_modified(self, setup_path):
# BUG: 7212
# read_hdf store.select modified the passed columns parameters
# when multi-indexed.
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
df.index.name = "letters"
df = df.set_index(keys="E", append=True)
data_columns = df.index.names + df.columns.tolist()
with ensure_clean_path(setup_path) as path:
df.to_hdf(
path,
"df",
mode="a",
append=True,
data_columns=data_columns,
index=False,
)
cols2load = list("BCD")
cols2load_original = list(cols2load)
df_loaded = read_hdf(path, "df", columns=cols2load) # noqa
assert cols2load_original == cols2load
@ignore_natural_naming_warning
def test_to_hdf_with_object_column_names(self, setup_path):
# GH9057
# Writing HDF5 table format should only work for string-like
# column types
types_should_fail = [
tm.makeIntIndex,
tm.makeFloatIndex,
tm.makeDateIndex,
tm.makeTimedeltaIndex,
tm.makePeriodIndex,
]
types_should_run = [
tm.makeStringIndex,
tm.makeCategoricalIndex,
tm.makeUnicodeIndex,
]
for index in types_should_fail:
df = DataFrame(np.random.randn(10, 2), columns=index(2))
with ensure_clean_path(setup_path) as path:
with catch_warnings(record=True):
msg = "cannot have non-object label DataIndexableCol"
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, "df", format="table", data_columns=True)
for index in types_should_run:
df = DataFrame(np.random.randn(10, 2), columns=index(2))
with ensure_clean_path(setup_path) as path:
with catch_warnings(record=True):
df.to_hdf(path, "df", format="table", data_columns=True)
result = pd.read_hdf(path, "df", where=f"index = [{df.index[0]}]")
assert len(result)
def test_read_hdf_open_store(self, setup_path):
# GH10330
# No check for non-string path_or-buf, and no test of open store
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
df.index.name = "letters"
df = df.set_index(keys="E", append=True)
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", mode="w")
direct = read_hdf(path, "df")
store = HDFStore(path, mode="r")
indirect = read_hdf(store, "df")
tm.assert_frame_equal(direct, indirect)
assert store.is_open
store.close()
def test_read_hdf_iterator(self, setup_path):
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
df.index.name = "letters"
df = df.set_index(keys="E", append=True)
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", mode="w", format="t")
direct = read_hdf(path, "df")
iterator = read_hdf(path, "df", iterator=True)
assert isinstance(iterator, TableIterator)
indirect = next(iterator.__iter__())
tm.assert_frame_equal(direct, indirect)
iterator.store.close()
def test_read_hdf_errors(self, setup_path):
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
with ensure_clean_path(setup_path) as path:
with pytest.raises(IOError):
read_hdf(path, "key")
df.to_hdf(path, "df")
store = HDFStore(path, mode="r")
store.close()
with pytest.raises(IOError):
read_hdf(store, "df")
def test_read_hdf_generic_buffer_errors(self):
with pytest.raises(NotImplementedError):
read_hdf(BytesIO(b""), "df")
def test_invalid_complib(self, setup_path):
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
with tm.ensure_clean(setup_path) as path:
with pytest.raises(ValueError):
df.to_hdf(path, "df", complib="foolib")
# GH10443
def test_read_nokey(self, setup_path):
df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
# Categorical dtype not supported for "fixed" format. So no need
# to test with that dtype in the dataframe here.
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", mode="a")
reread = read_hdf(path)
tm.assert_frame_equal(df, reread)
df.to_hdf(path, "df2", mode="a")
with pytest.raises(ValueError):
read_hdf(path)
def test_read_nokey_table(self, setup_path):
# GH13231
df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")})
with ensure_clean_path(setup_path) as path:
df.to_hdf(path, "df", mode="a", format="table")
reread = read_hdf(path)
tm.assert_frame_equal(df, reread)
df.to_hdf(path, "df2", mode="a", format="table")
with pytest.raises(ValueError):
read_hdf(path)
def test_read_nokey_empty(self, setup_path):
with ensure_clean_path(setup_path) as path:
store = HDFStore(path)
store.close()
with pytest.raises(ValueError):
read_hdf(path)
def test_read_from_pathlib_path(self, setup_path):
# GH11773
expected = DataFrame(
np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")
)
with ensure_clean_path(setup_path) as filename:
path_obj = Path(filename)
expected.to_hdf(path_obj, "df", mode="a")
actual = read_hdf(path_obj, "df")
tm.assert_frame_equal(expected, actual)
@td.skip_if_no("py.path")
def test_read_from_py_localpath(self, setup_path):
# GH11773
from py.path import local as LocalPath
expected = DataFrame(
np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")
)
with ensure_clean_path(setup_path) as filename:
path_obj = LocalPath(filename)
expected.to_hdf(path_obj, "df", mode="a")
actual = read_hdf(path_obj, "df")
tm.assert_frame_equal(expected, actual)
def test_query_long_float_literal(self, setup_path):
# GH 14241
df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
cutoff = 1000000000.0006
result = store.select("test", f"A < {cutoff:.4f}")
assert result.empty
cutoff = 1000000000.0010
result = store.select("test", f"A > {cutoff:.4f}")
expected = df.loc[[1, 2], :]
tm.assert_frame_equal(expected, result)
exact = 1000000000.0011
result = store.select("test", f"A == {exact:.4f}")
expected = df.loc[[1], :]
tm.assert_frame_equal(expected, result)
def test_query_compare_column_type(self, setup_path):
# GH 15492
df = DataFrame(
{
"date": ["2014-01-01", "2014-01-02"],
"real_date": date_range("2014-01-01", periods=2),
"float": [1.1, 1.2],
"int": [1, 2],
},
columns=["date", "real_date", "float", "int"],
)
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
ts = Timestamp("2014-01-01") # noqa
result = store.select("test", where="real_date > ts")
expected = df.loc[[1], :]
tm.assert_frame_equal(expected, result)
for op in ["<", ">", "=="]:
# non strings to string column always fail
for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]:
query = f"date {op} v"
with pytest.raises(TypeError):
store.select("test", where=query)
# strings to other columns must be convertible to type
v = "a"
for col in ["int", "float", "real_date"]:
query = f"{col} {op} v"
with pytest.raises(ValueError):
store.select("test", where=query)
for v, col in zip(
["1", "1.1", "2014-01-01"], ["int", "float", "real_date"]
):
query = f"{col} {op} v"
result = store.select("test", where=query)
if op == "==":
expected = df.loc[[0], :]
elif op == ">":
expected = df.loc[[1], :]
else:
expected = df.loc[[], :]
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_read_hdf_series_mode_r(self, format, setup_path):
# GH 16583
# Tests that reading a Series saved to an HDF file
# still works if a mode='r' argument is supplied
series = tm.makeFloatSeries()
with ensure_clean_path(setup_path) as path:
series.to_hdf(path, key="data", format=format)
result = pd.read_hdf(path, key="data", mode="r")
tm.assert_series_equal(result, series)
def test_fspath(self):
with tm.ensure_clean("foo.h5") as path:
with HDFStore(path) as store:
assert os.fspath(store) == str(path)
def test_read_py2_hdf_file_in_py3(self, datapath):
# GH 16781
# tests reading a PeriodIndex DataFrame written in Python2 in Python3
# the file was generated in Python 2.7 like so:
#
# df = DataFrame([1.,2,3], index=pd.PeriodIndex(
# ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
# df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
expected = DataFrame(
[1.0, 2, 3],
index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
)
with ensure_clean_store(
datapath(
"io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
),
mode="r",
) as store:
result = store["p"]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("where", ["", (), (None,), [], [None]])
def test_select_empty_where(self, where):
# GH26610
# Using keyword `where` as '' or (), or [None], etc
# while reading from HDF store raises
# "SyntaxError: only a single expression is allowed"
df = DataFrame([1, 2, 3])
with ensure_clean_path("empty_where.h5") as path:
with HDFStore(path) as store:
store.put("df", df, "t")
result = pd.read_hdf(store, "df", where=where)
tm.assert_frame_equal(result, df)
@pytest.mark.parametrize(
"idx",
[
date_range("2019", freq="D", periods=3, tz="UTC"),
CategoricalIndex(list("abc")),
],
)
def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path):
# GH 7775
mi = MultiIndex.from_arrays([idx, idx])
df = DataFrame(0, index=mi, columns=["a"])
with ensure_clean_path(setup_path) as path:
with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
df.to_hdf(path, "df")
def test_unsuppored_hdf_file_error(self, datapath):
# GH 9539
data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5")
message = (
r"Dataset\(s\) incompatible with Pandas data types, "
"not table, or no datasets found in HDF5 file."
)
with pytest.raises(ValueError, match=message):
pd.read_hdf(data_path)
@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"])
def test_maybe_adjust_name_bad_version_raises(bad_version):
msg = "Version is incorrect, expected sequence of 3 integers"
with pytest.raises(ValueError, match=msg):
_maybe_adjust_name("values_block_0", version=bad_version)