import datetime from datetime import timedelta from distutils.version import LooseVersion import hashlib from io import BytesIO import os from pathlib import Path import re import time from warnings import catch_warnings, simplefilter import numpy as np import pytest from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, RangeIndex, Series, Timestamp, bdate_range, concat, date_range, isna, timedelta_range, ) import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_path, ensure_clean_store, safe_close, tables, ) from pandas.io.pytables import ( ClosedFileError, HDFStore, PossibleDataLossError, Term, _maybe_adjust_name, read_hdf, ) from pandas.io import pytables as pytables # isort:skip from pandas.io.pytables import TableIterator # isort:skip _default_compressor = "blosc" ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" ) @pytest.mark.single class TestHDFStore: def test_format_type(self, setup_path): df = DataFrame({"A": [1, 2]}) with ensure_clean_path(setup_path) as path: with HDFStore(path) as store: store.put("a", df, format="fixed") store.put("b", df, format="table") assert store.get_storer("a").format_type == "fixed" assert store.get_storer("b").format_type == "table" def test_format_kwarg_in_constructor(self, setup_path): # GH 13291 msg = "format is not a defined argument for HDFStore" with tm.ensure_clean(setup_path) as path: with pytest.raises(ValueError, match=msg): HDFStore(path, format="table") def test_context(self, setup_path): with tm.ensure_clean(setup_path) as path: try: with HDFStore(path) as tbl: raise ValueError("blah") except ValueError: pass with tm.ensure_clean(setup_path) as path: with HDFStore(path) as tbl: tbl["a"] = tm.makeDataFrame() assert len(tbl) == 1 assert type(tbl["a"]) == DataFrame def test_conv_read_write(self, setup_path): with tm.ensure_clean() as path: def roundtrip(key, obj, **kwargs): obj.to_hdf(path, key, **kwargs) return read_hdf(path, key) o = tm.makeTimeSeries() tm.assert_series_equal(o, roundtrip("series", o)) o = tm.makeStringSeries() tm.assert_series_equal(o, roundtrip("string_series", o)) o = tm.makeDataFrame() tm.assert_frame_equal(o, roundtrip("frame", o)) # table df = DataFrame({"A": range(5), "B": range(5)}) df.to_hdf(path, "table", append=True) result = read_hdf(path, "table", where=["index>2"]) tm.assert_frame_equal(df[df.index > 2], result) def test_long_strings(self, setup_path): # GH6166 df = DataFrame( {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) ) with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=["a"]) result = store.select("df") tm.assert_frame_equal(df, result) def test_api(self, setup_path): # GH4584 # API issue when to_hdf doesn't accept append AND format args with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.iloc[:10].to_hdf(path, "df", append=True, format="table") df.iloc[10:].to_hdf(path, "df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) # append to False df.iloc[:10].to_hdf(path, "df", append=False, format="table") df.iloc[10:].to_hdf(path, "df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.iloc[:10].to_hdf(path, "df", append=True) df.iloc[10:].to_hdf(path, "df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) # append to False df.iloc[:10].to_hdf(path, "df", append=False, format="table") df.iloc[10:].to_hdf(path, "df", append=True) tm.assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.to_hdf(path, "df", append=False, format="fixed") tm.assert_frame_equal(read_hdf(path, "df"), df) df.to_hdf(path, "df", append=False, format="f") tm.assert_frame_equal(read_hdf(path, "df"), df) df.to_hdf(path, "df", append=False) tm.assert_frame_equal(read_hdf(path, "df"), df) df.to_hdf(path, "df") tm.assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_store(setup_path) as store: path = store._path df = tm.makeDataFrame() _maybe_remove(store, "df") store.append("df", df.iloc[:10], append=True, format="table") store.append("df", df.iloc[10:], append=True, format="table") tm.assert_frame_equal(store.select("df"), df) # append to False _maybe_remove(store, "df") store.append("df", df.iloc[:10], append=False, format="table") store.append("df", df.iloc[10:], append=True, format="table") tm.assert_frame_equal(store.select("df"), df) # formats _maybe_remove(store, "df") store.append("df", df.iloc[:10], append=False, format="table") store.append("df", df.iloc[10:], append=True, format="table") tm.assert_frame_equal(store.select("df"), df) _maybe_remove(store, "df") store.append("df", df.iloc[:10], append=False, format="table") store.append("df", df.iloc[10:], append=True, format=None) tm.assert_frame_equal(store.select("df"), df) with ensure_clean_path(setup_path) as path: # Invalid. df = tm.makeDataFrame() msg = "Can only append to Tables" with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="f") with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="fixed") msg = r"invalid HDFStore format specified \[foo\]" with pytest.raises(TypeError, match=msg): df.to_hdf(path, "df", append=True, format="foo") with pytest.raises(TypeError, match=msg): df.to_hdf(path, "df", append=False, format="foo") # File path doesn't exist path = "" msg = f"File {path} does not exist" with pytest.raises(FileNotFoundError, match=msg): read_hdf(path, "df") def test_api_default_format(self, setup_path): # default_format option with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() pd.set_option("io.hdf.default_format", "fixed") _maybe_remove(store, "df") store.put("df", df) assert not store.get_storer("df").is_table msg = "Can only append to Tables" with pytest.raises(ValueError, match=msg): store.append("df2", df) pd.set_option("io.hdf.default_format", "table") _maybe_remove(store, "df") store.put("df", df) assert store.get_storer("df").is_table _maybe_remove(store, "df2") store.append("df2", df) assert store.get_storer("df").is_table pd.set_option("io.hdf.default_format", None) with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() pd.set_option("io.hdf.default_format", "fixed") df.to_hdf(path, "df") with HDFStore(path) as store: assert not store.get_storer("df").is_table with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df2", append=True) pd.set_option("io.hdf.default_format", "table") df.to_hdf(path, "df3") with HDFStore(path) as store: assert store.get_storer("df3").is_table df.to_hdf(path, "df4", append=True) with HDFStore(path) as store: assert store.get_storer("df4").is_table pd.set_option("io.hdf.default_format", None) def test_keys(self, setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeStringSeries() store["c"] = tm.makeDataFrame() assert len(store) == 3 expected = {"/a", "/b", "/c"} assert set(store.keys()) == expected assert set(store) == expected def test_no_track_times(self, setup_path): # GH 32682 # enables to set track_times (see `pytables` `create_table` documentation) def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): h = hash_factory() with open(filename, "rb") as f: for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""): h.update(chunk) return h.digest() def create_h5_and_return_checksum(track_times): with ensure_clean_path(setup_path) as path: df = DataFrame({"a": [1]}) with HDFStore(path, mode="w") as hdf: hdf.put( "table", df, format="table", data_columns=True, index=None, track_times=track_times, ) return checksum(path) checksum_0_tt_false = create_h5_and_return_checksum(track_times=False) checksum_0_tt_true = create_h5_and_return_checksum(track_times=True) # sleep is necessary to create h5 with different creation time time.sleep(1) checksum_1_tt_false = create_h5_and_return_checksum(track_times=False) checksum_1_tt_true = create_h5_and_return_checksum(track_times=True) # checksums are the same if track_time = False assert checksum_0_tt_false == checksum_1_tt_false # checksums are NOT same if track_time = True assert checksum_0_tt_true != checksum_1_tt_true def test_non_pandas_keys(self, setup_path): class Table1(tables.IsDescription): value1 = tables.Float32Col() class Table2(tables.IsDescription): value2 = tables.Float32Col() class Table3(tables.IsDescription): value3 = tables.Float32Col() with ensure_clean_path(setup_path) as path: with tables.open_file(path, mode="w") as h5file: group = h5file.create_group("/", "group") h5file.create_table(group, "table1", Table1, "Table 1") h5file.create_table(group, "table2", Table2, "Table 2") h5file.create_table(group, "table3", Table3, "Table 3") with HDFStore(path) as store: assert len(store.keys(include="native")) == 3 expected = {"/group/table1", "/group/table2", "/group/table3"} assert set(store.keys(include="native")) == expected assert set(store.keys(include="pandas")) == set() for name in expected: df = store.get(name) assert len(df.columns) == 1 def test_keys_illegal_include_keyword_value(self, setup_path): with ensure_clean_store(setup_path) as store: with pytest.raises( ValueError, match="`include` should be either 'pandas' or 'native' " "but is 'illegal'", ): store.keys(include="illegal") def test_keys_ignore_hdf_softlink(self, setup_path): # GH 20523 # Puts a softlink into HDF file and rereads with ensure_clean_store(setup_path) as store: df = DataFrame({"A": range(5), "B": range(5)}) store.put("df", df) assert store.keys() == ["/df"] store._handle.create_soft_link(store._handle.root, "symlink", "df") # Should ignore the softlink assert store.keys() == ["/df"] def test_iter_empty(self, setup_path): with ensure_clean_store(setup_path) as store: # GH 12221 assert list(store) == [] def test_repr(self, setup_path): with ensure_clean_store(setup_path) as store: repr(store) store.info() store["a"] = tm.makeTimeSeries() store["b"] = tm.makeStringSeries() store["c"] = tm.makeDataFrame() df = tm.makeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["bool3"] = True df["int1"] = 1 df["int2"] = 2 df["timestamp1"] = Timestamp("20010102") df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) store["df"] = df # make a random group in hdf space store._handle.create_group(store._handle.root, "bah") assert store.filename in repr(store) assert store.filename in str(store) store.info() # storers with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() store.append("df", df) s = store.get_storer("df") repr(s) str(s) @ignore_natural_naming_warning def test_contains(self, setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeDataFrame() store["foo/bar"] = tm.makeDataFrame() assert "a" in store assert "b" in store assert "c" not in store assert "foo/bar" in store assert "/foo/bar" in store assert "/foo/b" not in store assert "bar" not in store # gh-2694: tables.NaturalNameWarning with catch_warnings(record=True): store["node())"] = tm.makeDataFrame() assert "node())" in store def test_versioning(self, setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeDataFrame() df = tm.makeTimeDataFrame() _maybe_remove(store, "df1") store.append("df1", df[:10]) store.append("df1", df[10:]) assert store.root.a._v_attrs.pandas_version == "0.15.2" assert store.root.b._v_attrs.pandas_version == "0.15.2" assert store.root.df1._v_attrs.pandas_version == "0.15.2" # write a file and wipe its versioning _maybe_remove(store, "df2") store.append("df2", df) # this is an error because its table_type is appendable, but no # version info store.get_node("df2")._v_attrs.pandas_version = None msg = "'NoneType' object has no attribute 'startswith'" with pytest.raises(Exception, match=msg): store.select("df2") def test_mode(self, setup_path): df = tm.makeTimeDataFrame() def check(mode): with ensure_clean_path(setup_path) as path: # constructor if mode in ["r", "r+"]: with pytest.raises(IOError): HDFStore(path, mode=mode) else: store = HDFStore(path, mode=mode) assert store._handle.mode == mode store.close() with ensure_clean_path(setup_path) as path: # context if mode in ["r", "r+"]: with pytest.raises(IOError): with HDFStore(path, mode=mode) as store: pass else: with HDFStore(path, mode=mode) as store: assert store._handle.mode == mode with ensure_clean_path(setup_path) as path: # conv write if mode in ["r", "r+"]: with pytest.raises(IOError): df.to_hdf(path, "df", mode=mode) df.to_hdf(path, "df", mode="w") else: df.to_hdf(path, "df", mode=mode) # conv read if mode in ["w"]: msg = ( "mode w is not allowed while performing a read. " r"Allowed modes are r, r\+ and a." ) with pytest.raises(ValueError, match=msg): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) tm.assert_frame_equal(result, df) def check_default_mode(): # read_hdf uses default mode with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", mode="w") result = read_hdf(path, "df") tm.assert_frame_equal(result, df) check("r") check("r+") check("a") check("w") check_default_mode() def test_reopen_handle(self, setup_path): with ensure_clean_path(setup_path) as path: store = HDFStore(path, mode="a") store["a"] = tm.makeTimeSeries() # invalid mode change with pytest.raises(PossibleDataLossError): store.open("w") store.close() assert not store.is_open # truncation ok here store.open("w") assert store.is_open assert len(store) == 0 store.close() assert not store.is_open store = HDFStore(path, mode="a") store["a"] = tm.makeTimeSeries() # reopen as read store.open("r") assert store.is_open assert len(store) == 1 assert store._mode == "r" store.close() assert not store.is_open # reopen as append store.open("a") assert store.is_open assert len(store) == 1 assert store._mode == "a" store.close() assert not store.is_open # reopen as append (again) store.open("a") assert store.is_open assert len(store) == 1 assert store._mode == "a" store.close() assert not store.is_open def test_open_args(self, setup_path): with tm.ensure_clean(setup_path) as path: df = tm.makeDataFrame() # create an in memory store store = HDFStore( path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 ) store["df"] = df store.append("df2", df) tm.assert_frame_equal(store["df"], df) tm.assert_frame_equal(store["df2"], df) store.close() # the file should not have actually been written assert not os.path.exists(path) def test_flush(self, setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store.flush() store.flush(fsync=True) def test_get(self, setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() left = store.get("a") right = store["a"] tm.assert_series_equal(left, right) left = store.get("/a") right = store["/a"] tm.assert_series_equal(left, right) with pytest.raises(KeyError, match="'No object named b in the file'"): store.get("b") @pytest.mark.parametrize( "where, expected", [ ( "/", { "": ({"first_group", "second_group"}, set()), "/first_group": (set(), {"df1", "df2"}), "/second_group": ({"third_group"}, {"df3", "s1"}), "/second_group/third_group": (set(), {"df4"}), }, ), ( "/second_group", { "/second_group": ({"third_group"}, {"df3", "s1"}), "/second_group/third_group": (set(), {"df4"}), }, ), ], ) def test_walk(self, where, expected, setup_path): # GH10143 objs = { "df1": DataFrame([1, 2, 3]), "df2": DataFrame([4, 5, 6]), "df3": DataFrame([6, 7, 8]), "df4": DataFrame([9, 10, 11]), "s1": Series([10, 9, 8]), # Next 3 items aren't pandas objects and should be ignored "a1": np.array([[1, 2, 3], [4, 5, 6]]), "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), } with ensure_clean_store("walk_groups.hdf", mode="w") as store: store.put("/first_group/df1", objs["df1"]) store.put("/first_group/df2", objs["df2"]) store.put("/second_group/df3", objs["df3"]) store.put("/second_group/s1", objs["s1"]) store.put("/second_group/third_group/df4", objs["df4"]) # Create non-pandas objects store._handle.create_array("/first_group", "a1", objs["a1"]) store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) assert len(list(store.walk(where=where))) == len(expected) for path, groups, leaves in store.walk(where=where): assert path in expected expected_groups, expected_frames = expected[path] assert expected_groups == set(groups) assert expected_frames == set(leaves) for leaf in leaves: frame_path = "/".join([path, leaf]) obj = store.get(frame_path) if "df" in leaf: tm.assert_frame_equal(obj, objs[leaf]) else: tm.assert_series_equal(obj, objs[leaf]) def test_getattr(self, setup_path): with ensure_clean_store(setup_path) as store: s = tm.makeTimeSeries() store["a"] = s # test attribute access result = store.a tm.assert_series_equal(result, s) result = getattr(store, "a") tm.assert_series_equal(result, s) df = tm.makeTimeDataFrame() store["df"] = df result = store.df tm.assert_frame_equal(result, df) # errors for x in ["d", "mode", "path", "handle", "complib"]: with pytest.raises(AttributeError): getattr(store, x) # not stores for x in ["mode", "path", "handle", "complib"]: getattr(store, f"_{x}") def test_put(self, setup_path): with ensure_clean_store(setup_path) as store: ts = tm.makeTimeSeries() df = tm.makeTimeDataFrame() store["a"] = ts store["b"] = df[:10] store["foo/bar/bah"] = df[:10] store["foo"] = df[:10] store["/foo"] = df[:10] store.put("c", df[:10], format="table") # not OK, not a table with pytest.raises(ValueError): store.put("b", df[10:], append=True) # node does not currently exist, test _is_table_type returns False # in this case _maybe_remove(store, "f") with pytest.raises(ValueError): store.put("f", df[10:], append=True) # can't put to a table (use append instead) with pytest.raises(ValueError): store.put("c", df[10:], append=True) # overwrite table store.put("c", df[:10], format="table", append=False) tm.assert_frame_equal(df[:10], store["c"]) def test_put_string_index(self, setup_path): with ensure_clean_store(setup_path) as store: index = Index([f"I am a very long string index: {i}" for i in range(20)]) s = Series(np.arange(20), index=index) df = DataFrame({"A": s, "B": s}) store["a"] = s tm.assert_series_equal(store["a"], s) store["b"] = df tm.assert_frame_equal(store["b"], df) # mixed length index = Index( ["abcdefghijklmnopqrstuvwxyz1234567890"] + [f"I am a very long string index: {i}" for i in range(20)] ) s = Series(np.arange(21), index=index) df = DataFrame({"A": s, "B": s}) store["a"] = s tm.assert_series_equal(store["a"], s) store["b"] = df tm.assert_frame_equal(store["b"], df) def test_put_compression(self, setup_path): with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame() store.put("c", df, format="table", complib="zlib") tm.assert_frame_equal(store["c"], df) # can't compress if format='fixed' with pytest.raises(ValueError): store.put("b", df, format="fixed", complib="zlib") @td.skip_if_windows_python_3 def test_put_compression_blosc(self, setup_path): df = tm.makeTimeDataFrame() with ensure_clean_store(setup_path) as store: # can't compress if format='fixed' with pytest.raises(ValueError): store.put("b", df, format="fixed", complib="blosc") store.put("c", df, format="table", complib="blosc") tm.assert_frame_equal(store["c"], df) def test_complibs_default_settings(self, setup_path): # GH15943 df = tm.makeDataFrame() # Set complevel and check if complib is automatically set to # default value with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df", complevel=9) result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 9 assert node.filters.complib == "zlib" # Set complib and check to see if compression is disabled with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df", complib="zlib") result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None # Check if not setting complib or complevel results in no compression with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df") result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None # Check if file-defaults can be overridden on a per table basis with ensure_clean_path(setup_path) as tmpfile: store = HDFStore(tmpfile) store.append("dfc", df, complevel=9, complib="blosc") store.append("df", df) store.close() with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): assert node.filters.complevel == 9 assert node.filters.complib == "blosc" def test_complibs(self, setup_path): # GH14478 df = tm.makeDataFrame() # Building list of all complibs and complevels tuples all_complibs = tables.filters.all_complibs # Remove lzo if its not available on this platform if not tables.which_lib_version("lzo"): all_complibs.remove("lzo") # Remove bzip2 if its not available on this platform if not tables.which_lib_version("bzip2"): all_complibs.remove("bzip2") all_levels = range(0, 10) all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] for (lib, lvl) in all_tests: with ensure_clean_path(setup_path) as tmpfile: gname = "foo" # Write and read file to see if data is consistent df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) result = pd.read_hdf(tmpfile, gname) tm.assert_frame_equal(result, df) # Open file and check metadata # for correct amount of compression h5table = tables.open_file(tmpfile, mode="r") for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): assert node.filters.complevel == lvl if lvl == 0: assert node.filters.complib is None else: assert node.filters.complib == lib h5table.close() def test_put_integer(self, setup_path): # non-date, non-string index df = DataFrame(np.random.randn(50, 100)) self._check_roundtrip(df, tm.assert_frame_equal, setup_path) def test_put_mixed_type(self, setup_path): df = tm.makeTimeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["bool3"] = True df["int1"] = 1 df["int2"] = 2 df["timestamp1"] = Timestamp("20010102") df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") # PerformanceWarning with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df) @pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" ) def test_append(self, setup_path): with ensure_clean_store(setup_path) as store: # this is allowed by almost always don't want to do it # tables.NaturalNameWarning): with catch_warnings(record=True): df = tm.makeTimeDataFrame() _maybe_remove(store, "df1") store.append("df1", df[:10]) store.append("df1", df[10:]) tm.assert_frame_equal(store["df1"], df) _maybe_remove(store, "df2") store.put("df2", df[:10], format="table") store.append("df2", df[10:]) tm.assert_frame_equal(store["df2"], df) _maybe_remove(store, "df3") store.append("/df3", df[:10]) store.append("/df3", df[10:]) tm.assert_frame_equal(store["df3"], df) # this is allowed by almost always don't want to do it # tables.NaturalNameWarning _maybe_remove(store, "/df3 foo") store.append("/df3 foo", df[:10]) store.append("/df3 foo", df[10:]) tm.assert_frame_equal(store["df3 foo"], df) # dtype issues - mizxed type in a single object column df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) df["mixed_column"] = "testing" df.loc[2, "mixed_column"] = np.nan _maybe_remove(store, "df") store.append("df", df) tm.assert_frame_equal(store["df"], df) # uints - test storage of uints uint_data = DataFrame( { "u08": Series( np.random.randint(0, high=255, size=5), dtype=np.uint8 ), "u16": Series( np.random.randint(0, high=65535, size=5), dtype=np.uint16 ), "u32": Series( np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 ), "u64": Series( [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], dtype=np.uint64, ), }, index=np.arange(5), ) _maybe_remove(store, "uints") store.append("uints", uint_data) tm.assert_frame_equal(store["uints"], uint_data) # uints - test storage of uints in indexable columns _maybe_remove(store, "uints") # 64-bit indices not yet supported store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) tm.assert_frame_equal(store["uints"], uint_data) def test_append_series(self, setup_path): with ensure_clean_store(setup_path) as store: # basic ss = tm.makeStringSeries() ts = tm.makeTimeSeries() ns = Series(np.arange(100)) store.append("ss", ss) result = store["ss"] tm.assert_series_equal(result, ss) assert result.name is None store.append("ts", ts) result = store["ts"] tm.assert_series_equal(result, ts) assert result.name is None ns.name = "foo" store.append("ns", ns) result = store["ns"] tm.assert_series_equal(result, ns) assert result.name == ns.name # select on the values expected = ns[ns > 60] result = store.select("ns", "foo>60") tm.assert_series_equal(result, expected) # select on the index and values expected = ns[(ns > 70) & (ns.index < 90)] result = store.select("ns", "foo>70 and index<90") tm.assert_series_equal(result, expected) # multi-index mi = DataFrame(np.random.randn(5, 1), columns=["A"]) mi["B"] = np.arange(len(mi)) mi["C"] = "foo" mi.loc[3:5, "C"] = "bar" mi.set_index(["C", "B"], inplace=True) s = mi.stack() s.index = s.index.droplevel(2) store.append("mi", s) tm.assert_series_equal(store["mi"], s) def test_store_index_types(self, setup_path): # GH5386 # test storing various index types with ensure_clean_store(setup_path) as store: def check(format, index): df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.index = index(len(df)) _maybe_remove(store, "df") store.put("df", df, format=format) tm.assert_frame_equal(df, store["df"]) for index in [ tm.makeFloatIndex, tm.makeStringIndex, tm.makeIntIndex, tm.makeDateIndex, ]: check("table", index) check("fixed", index) # period index currently broken for table # seee GH7796 FIXME check("fixed", tm.makePeriodIndex) # check('table',tm.makePeriodIndex) # unicode index = tm.makeUnicodeIndex check("table", index) check("fixed", index) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) def test_encoding(self, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame({"A": "foo", "B": "bar"}, index=range(5)) df.loc[2, "A"] = np.nan df.loc[3, "B"] = np.nan _maybe_remove(store, "df") store.append("df", df, encoding="ascii") tm.assert_frame_equal(store["df"], df) expected = df.reindex(columns=["A"]) result = store.select("df", Term("columns=A", encoding="ascii")) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "val", [ [b"E\xc9, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"a", b"b", b"c"], [b"EE, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], [b"", b"a", b"b", b"c"], [b"\xf8\xfc", b"a", b"b", b"c"], [b"A\xf8\xfc", b"", b"a", b"b", b"c"], [np.nan, b"", b"b", b"c"], [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ], ) @pytest.mark.parametrize("dtype", ["category", object]) def test_latin_encoding(self, setup_path, dtype, val): enc = "latin-1" nan_rep = "" key = "data" val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] ser = Series(val, dtype=dtype) with ensure_clean_path(setup_path) as store: ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = ser.replace(nan_rep, np.nan) tm.assert_series_equal(s_nan, retr) def test_append_some_nans(self, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( { "A": Series(np.random.randn(20)).astype("int32"), "A1": np.random.randn(20), "A2": np.random.randn(20), "B": "foo", "C": "bar", "D": Timestamp("20010101"), "E": datetime.datetime(2001, 1, 2, 0, 0), }, index=np.arange(20), ) # some nans _maybe_remove(store, "df1") df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan store.append("df1", df[:10]) store.append("df1", df[10:]) tm.assert_frame_equal(store["df1"], df) # first column df1 = df.copy() df1.loc[:, "A1"] = np.nan _maybe_remove(store, "df1") store.append("df1", df1[:10]) store.append("df1", df1[10:]) tm.assert_frame_equal(store["df1"], df1) # 2nd column df2 = df.copy() df2.loc[:, "A2"] = np.nan _maybe_remove(store, "df2") store.append("df2", df2[:10]) store.append("df2", df2[10:]) tm.assert_frame_equal(store["df2"], df2) # datetimes df3 = df.copy() df3.loc[:, "E"] = np.nan _maybe_remove(store, "df3") store.append("df3", df3[:10]) store.append("df3", df3[10:]) tm.assert_frame_equal(store["df3"], df3) def test_append_all_nans(self, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( {"A1": np.random.randn(20), "A2": np.random.randn(20)}, index=np.arange(20), ) df.loc[0:15, :] = np.nan # nan some entire rows (dropna=True) _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) tm.assert_frame_equal(store["df"], df[-4:]) # nan some entire rows (dropna=False) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) # tests the option io.hdf.dropna_table pd.set_option("io.hdf.dropna_table", False) _maybe_remove(store, "df3") store.append("df3", df[:10]) store.append("df3", df[10:]) tm.assert_frame_equal(store["df3"], df) pd.set_option("io.hdf.dropna_table", True) _maybe_remove(store, "df4") store.append("df4", df[:10]) store.append("df4", df[10:]) tm.assert_frame_equal(store["df4"], df[-4:]) # nan some entire rows (string are still written!) df = DataFrame( { "A1": np.random.randn(20), "A2": np.random.randn(20), "B": "foo", "C": "bar", }, index=np.arange(20), ) df.loc[0:15, :] = np.nan _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) tm.assert_frame_equal(store["df"], df) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) # nan some entire rows (but since we have dates they are still # written!) df = DataFrame( { "A1": np.random.randn(20), "A2": np.random.randn(20), "B": "foo", "C": "bar", "D": Timestamp("20010101"), "E": datetime.datetime(2001, 1, 2, 0, 0), }, index=np.arange(20), ) df.loc[0:15, :] = np.nan _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) tm.assert_frame_equal(store["df"], df) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) def test_store_dropna(self, setup_path): df_with_missing = DataFrame( {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, index=list("abc"), ) df_without_missing = DataFrame( {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") ) # # Test to make sure defaults are to not drop. # # Corresponding to Issue 9382 with ensure_clean_path(setup_path) as path: df_with_missing.to_hdf(path, "df", format="table") reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_with_missing, reloaded) with ensure_clean_path(setup_path) as path: df_with_missing.to_hdf(path, "df", format="table", dropna=False) reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_with_missing, reloaded) with ensure_clean_path(setup_path) as path: df_with_missing.to_hdf(path, "df", format="table", dropna=True) reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_without_missing, reloaded) def test_read_missing_key_close_store(self, setup_path): # GH 25766 with ensure_clean_path(setup_path) as path: df = DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") with pytest.raises(KeyError, match="'No object named k2 in the file'"): pd.read_hdf(path, "k2") # smoke test to test that file is properly closed after # read with KeyError before another write df.to_hdf(path, "k2") def test_read_missing_key_opened_store(self, setup_path): # GH 28699 with ensure_clean_path(setup_path) as path: df = DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") with HDFStore(path, "r") as store: with pytest.raises(KeyError, match="'No object named k2 in the file'"): pd.read_hdf(store, "k2") # Test that the file is still open after a KeyError and that we can # still read from it. pd.read_hdf(store, "k1") def test_append_frame_column_oriented(self, setup_path): with ensure_clean_store(setup_path) as store: # column oriented df = tm.makeTimeDataFrame() df.index = df.index._with_freq(None) # freq doesnt round-trip _maybe_remove(store, "df1") store.append("df1", df.iloc[:, :2], axes=["columns"]) store.append("df1", df.iloc[:, 2:]) tm.assert_frame_equal(store["df1"], df) result = store.select("df1", "columns=A") expected = df.reindex(columns=["A"]) tm.assert_frame_equal(expected, result) # selection on the non-indexable result = store.select("df1", ("columns=A", "index=df.index[0:4]")) expected = df.reindex(columns=["A"], index=df.index[0:4]) tm.assert_frame_equal(expected, result) # this isn't supported with pytest.raises(TypeError): store.select("df1", "columns=A and index>df.index[4]") def test_append_with_different_block_ordering(self, setup_path): # GH 4096; using same frames, but different block orderings with ensure_clean_store(setup_path) as store: for i in range(10): df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df["index"] = range(10) df["index"] += i * 10 df["int64"] = Series([1] * len(df), dtype="int64") df["int16"] = Series([1] * len(df), dtype="int16") if i % 2 == 0: del df["int64"] df["int64"] = Series([1] * len(df), dtype="int64") if i % 3 == 0: a = df.pop("A") df["A"] = a df.set_index("index", inplace=True) store.append("df", df) # test a different ordering but with more fields (like invalid # combinate) with ensure_clean_store(setup_path) as store: df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") df["int64"] = Series([1] * len(df), dtype="int64") df["int16"] = Series([1] * len(df), dtype="int16") store.append("df", df) # store additional fields in different blocks df["int16_2"] = Series([1] * len(df), dtype="int16") with pytest.raises(ValueError): store.append("df", df) # store multiple additional fields in different blocks df["float_3"] = Series([1.0] * len(df), dtype="float64") with pytest.raises(ValueError): store.append("df", df) def test_append_with_strings(self, setup_path): with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): def check_col(key, name, size): assert ( getattr(store.get_storer(key).table.description, name).itemsize == size ) # avoid truncation on elements df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) store.append("df_big", df) tm.assert_frame_equal(store.select("df_big"), df) check_col("df_big", "values_block_1", 15) # appending smaller string ok df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) store.append("df_big", df2) expected = concat([df, df2]) tm.assert_frame_equal(store.select("df_big"), expected) check_col("df_big", "values_block_1", 15) # avoid truncation on elements df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) store.append("df_big2", df, min_itemsize={"values": 50}) tm.assert_frame_equal(store.select("df_big2"), df) check_col("df_big2", "values_block_1", 50) # bigger string on next append store.append("df_new", df) df_new = DataFrame( [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] ) with pytest.raises(ValueError): store.append("df_new", df_new) # min_itemsize on Series index (GH 11412) df = tm.makeMixedDataFrame().set_index("C") store.append("ss", df["B"], min_itemsize={"index": 4}) tm.assert_series_equal(store.select("ss"), df["B"]) # same as above, with data_columns=True store.append( "ss2", df["B"], data_columns=True, min_itemsize={"index": 4} ) tm.assert_series_equal(store.select("ss2"), df["B"]) # min_itemsize in index without appending (GH 10381) store.put("ss3", df, format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: df2 = df.copy().reset_index().assign(C="longer").set_index("C") store.append("ss3", df2) tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) # same as above, with a Series store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) store.append("ss4", df2["B"]) tm.assert_series_equal( store.select("ss4"), pd.concat([df["B"], df2["B"]]) ) # with nans _maybe_remove(store, "df") df = tm.makeTimeDataFrame() df["string"] = "foo" df.loc[df.index[1:4], "string"] = np.nan df["string2"] = "bar" df.loc[df.index[4:8], "string2"] = np.nan df["string3"] = "bah" df.loc[df.index[1:], "string3"] = np.nan store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df) with ensure_clean_store(setup_path) as store: def check_col(key, name, size): assert getattr( store.get_storer(key).table.description, name ).itemsize, size df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) # a min_itemsize that creates a data_column _maybe_remove(store, "df") store.append("df", df, min_itemsize={"A": 200}) check_col("df", "A", 200) assert store.get_storer("df").data_columns == ["A"] # a min_itemsize that creates a data_column2 _maybe_remove(store, "df") store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) check_col("df", "A", 200) assert store.get_storer("df").data_columns == ["B", "A"] # a min_itemsize that creates a data_column2 _maybe_remove(store, "df") store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) check_col("df", "B", 200) check_col("df", "values_block_0", 200) assert store.get_storer("df").data_columns == ["B"] # infer the .typ on subsequent appends _maybe_remove(store, "df") store.append("df", df[:5], min_itemsize=200) store.append("df", df[5:], min_itemsize=200) tm.assert_frame_equal(store["df"], df) # invalid min_itemsize keys df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) _maybe_remove(store, "df") with pytest.raises(ValueError): store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) def test_append_with_empty_string(self, setup_path): with ensure_clean_store(setup_path) as store: # with all empty strings (GH 12242) df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) store.append("df", df[:-1], min_itemsize={"x": 1}) store.append("df", df[-1:], min_itemsize={"x": 1}) tm.assert_frame_equal(store.select("df"), df) def test_to_hdf_with_min_itemsize(self, setup_path): with ensure_clean_path(setup_path) as path: # min_itemsize in index with to_hdf (GH 10381) df = tm.makeMixedDataFrame().set_index("C") df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: df2 = df.copy().reset_index().assign(C="longer").set_index("C") df2.to_hdf(path, "ss3", append=True, format="table") tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) # same as above, with a Series df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) df2["B"].to_hdf(path, "ss4", append=True, format="table") tm.assert_series_equal( pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]]) ) @pytest.mark.parametrize("format", ["fixed", "table"]) def test_to_hdf_errors(self, format, setup_path): data = ["\ud800foo"] ser = Series(data, index=Index(data)) with ensure_clean_path(setup_path) as path: # GH 20835 ser.to_hdf(path, "table", format=format, errors="surrogatepass") result = pd.read_hdf(path, "table", errors="surrogatepass") tm.assert_series_equal(result, ser) def test_append_with_data_columns(self, setup_path): with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame() df.iloc[0, df.columns.get_loc("B")] = 1.0 _maybe_remove(store, "df") store.append("df", df[:2], data_columns=["B"]) store.append("df", df[2:]) tm.assert_frame_equal(store["df"], df) # check that we have indices created assert store._handle.root.df.table.cols.index.is_indexed is True assert store._handle.root.df.table.cols.B.is_indexed is True # data column searching result = store.select("df", "B>0") expected = df[df.B > 0] tm.assert_frame_equal(result, expected) # data column searching (with an indexable and a data_columns) result = store.select("df", "B>0 and index>df.index[3]") df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) # data column selection with a string data_column df_new = df.copy() df_new["string"] = "foo" df_new.loc[df_new.index[1:4], "string"] = np.nan df_new.loc[df_new.index[5:6], "string"] = "bar" _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"]) result = store.select("df", "string='foo'") expected = df_new[df_new.string == "foo"] tm.assert_frame_equal(result, expected) # using min_itemsize and a data column def check_col(key, name, size): assert ( getattr(store.get_storer(key).table.description, name).itemsize == size ) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") store.append( "df", df_new, data_columns=["string"], min_itemsize={"string": 30} ) check_col("df", "string", 30) _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"], min_itemsize=30) check_col("df", "string", 30) _maybe_remove(store, "df") store.append( "df", df_new, data_columns=["string"], min_itemsize={"values": 30} ) check_col("df", "string", 30) with ensure_clean_store(setup_path) as store: df_new["string2"] = "foobarbah" df_new["string_block1"] = "foobarbah1" df_new["string_block2"] = "foobarbah2" _maybe_remove(store, "df") store.append( "df", df_new, data_columns=["string", "string2"], min_itemsize={"string": 30, "string2": 40, "values": 50}, ) check_col("df", "string", 30) check_col("df", "string2", 40) check_col("df", "values_block_1", 50) with ensure_clean_store(setup_path) as store: # multiple data columns df_new = df.copy() df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 df_new["string"] = "foo" sl = df_new.columns.get_loc("string") df_new.iloc[1:4, sl] = np.nan df_new.iloc[5:6, sl] = "bar" df_new["string2"] = "foo" sl = df_new.columns.get_loc("string2") df_new.iloc[2:5, sl] = np.nan df_new.iloc[7:8, sl] = "bar" _maybe_remove(store, "df") store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) result = store.select( "df", "string='foo' and string2='foo' and A>0 and B<0" ) expected = df_new[ (df_new.string == "foo") & (df_new.string2 == "foo") & (df_new.A > 0) & (df_new.B < 0) ] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2020-05-07 freq check randomly fails in the CI # yield an empty frame result = store.select("df", "string='foo' and string2='cool'") expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] tm.assert_frame_equal(result, expected) with ensure_clean_store(setup_path) as store: # doc example df_dc = df.copy() df_dc["string"] = "foo" df_dc.loc[df_dc.index[4:6], "string"] = np.nan df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc["string2"] = "cool" df_dc["datetime"] = Timestamp("20010102") df_dc = df_dc._convert(datetime=True) df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan _maybe_remove(store, "df_dc") store.append( "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] ) result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2020-12-07 intermittent build failures here with freq of # None instead of BDay(4) with ensure_clean_store(setup_path) as store: # doc example part 2 np.random.seed(1234) index = date_range("1/1/2000", periods=8) df_dc = DataFrame( np.random.randn(8, 3), index=index, columns=["A", "B", "C"] ) df_dc["string"] = "foo" df_dc.loc[df_dc.index[4:6], "string"] = np.nan df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() df_dc["string2"] = "cool" # on-disk operations store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected) def test_create_table_index(self, setup_path): with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): def col(t, column): return getattr(store.get_storer(t).table.cols, column) # data columns df = tm.makeTimeDataFrame() df["string"] = "foo" df["string2"] = "bar" store.append("f", df, data_columns=["string", "string2"]) assert col("f", "index").is_indexed is True assert col("f", "string").is_indexed is True assert col("f", "string2").is_indexed is True # specify index=columns store.append( "f2", df, index=["string"], data_columns=["string", "string2"] ) assert col("f2", "index").is_indexed is False assert col("f2", "string").is_indexed is True assert col("f2", "string2").is_indexed is False # try to index a non-table _maybe_remove(store, "f2") store.put("f2", df) with pytest.raises(TypeError): store.create_table_index("f2") def test_create_table_index_data_columns_argument(self, setup_path): # GH 28156 with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): def col(t, column): return getattr(store.get_storer(t).table.cols, column) # data columns df = tm.makeTimeDataFrame() df["string"] = "foo" df["string2"] = "bar" store.append("f", df, data_columns=["string"]) assert col("f", "index").is_indexed is True assert col("f", "string").is_indexed is True msg = "'Cols' object has no attribute 'string2'" with pytest.raises(AttributeError, match=msg): col("f", "string2").is_indexed # try to index a col which isn't a data_column msg = ( "column string2 is not a data_column.\n" "In order to read column string2 you must reload the dataframe \n" "into HDFStore and include string2 with the data_columns argument." ) with pytest.raises(AttributeError, match=msg): store.create_table_index("f", columns=["string2"]) def test_append_hierarchical(self, setup_path): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["foo", "bar"], ) df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) with ensure_clean_store(setup_path) as store: store.append("mi", df) result = store.select("mi") tm.assert_frame_equal(result, df) # GH 3748 result = store.select("mi", columns=["A", "B"]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) with ensure_clean_path("test.hdf") as path: df.to_hdf(path, "df", format="table") result = read_hdf(path, "df", columns=["A", "B"]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_column_multiindex(self, setup_path): # GH 4710 # recreate multi-indexes properly index = MultiIndex.from_tuples( [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] ) df = DataFrame(np.arange(12).reshape(3, 4), columns=index) expected = df.copy() if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) with ensure_clean_store(setup_path) as store: store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True ) store.put("df1", df, format="table") tm.assert_frame_equal( store["df1"], expected, check_index_type=True, check_column_type=True ) with pytest.raises(ValueError): store.put("df2", df, format="table", data_columns=["A"]) with pytest.raises(ValueError): store.put("df3", df, format="table", data_columns=True) # appending multi-column on existing table (see GH 6167) with ensure_clean_store(setup_path) as store: store.append("df2", df) store.append("df2", df) tm.assert_frame_equal(store["df2"], concat((df, df))) # non_index_axes name df = DataFrame( np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo") ) expected = df.copy() if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) with ensure_clean_store(setup_path) as store: store.put("df1", df, format="table") tm.assert_frame_equal( store["df1"], expected, check_index_type=True, check_column_type=True ) def test_store_multiindex(self, setup_path): # validate multi-index names # GH 5527 with ensure_clean_store(setup_path) as store: def make_index(names=None): return MultiIndex.from_tuples( [ (datetime.datetime(2013, 12, d), s, t) for d in range(1, 3) for s in range(2) for t in range(3) ], names=names, ) # no names _maybe_remove(store, "df") df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) store.append("df", df) tm.assert_frame_equal(store.select("df"), df) # partial names _maybe_remove(store, "df") df = DataFrame( np.zeros((12, 2)), columns=["a", "b"], index=make_index(["date", None, None]), ) store.append("df", df) tm.assert_frame_equal(store.select("df"), df) # series _maybe_remove(store, "s") s = Series(np.zeros(12), index=make_index(["date", None, None])) store.append("s", s) xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) tm.assert_series_equal(store.select("s"), xp) # dup with column _maybe_remove(store, "df") df = DataFrame( np.zeros((12, 2)), columns=["a", "b"], index=make_index(["date", "a", "t"]), ) with pytest.raises(ValueError): store.append("df", df) # dup within level _maybe_remove(store, "df") df = DataFrame( np.zeros((12, 2)), columns=["a", "b"], index=make_index(["date", "date", "date"]), ) with pytest.raises(ValueError): store.append("df", df) # fully names _maybe_remove(store, "df") df = DataFrame( np.zeros((12, 2)), columns=["a", "b"], index=make_index(["date", "s", "t"]), ) store.append("df", df) tm.assert_frame_equal(store.select("df"), df) def test_select_columns_in_where(self, setup_path): # GH 6169 # recreate multi-indexes when columns is passed # in the `where` argument index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["foo_name", "bar_name"], ) # With a DataFrame df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) with ensure_clean_store(setup_path) as store: store.put("df", df, format="table") expected = df[["A"]] tm.assert_frame_equal(store.select("df", columns=["A"]), expected) tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) # With a Series s = Series(np.random.randn(10), index=index, name="A") with ensure_clean_store(setup_path) as store: store.put("s", s, format="table") tm.assert_series_equal(store.select("s", where="columns=['A']"), s) def test_mi_data_columns(self, setup_path): # GH 14435 idx = MultiIndex.from_arrays( [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] ) df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=True) actual = store.select("df", where="id == 1") expected = df.iloc[[1], :] tm.assert_frame_equal(actual, expected) def test_pass_spec_to_storer(self, setup_path): df = tm.makeDataFrame() with ensure_clean_store(setup_path) as store: store.put("df", df) with pytest.raises(TypeError): store.select("df", columns=["A"]) with pytest.raises(TypeError): store.select("df", where=[("columns=A")]) def test_append_misc(self, setup_path): with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() store.append("df", df, chunksize=1) result = store.select("df") tm.assert_frame_equal(result, df) store.append("df1", df, expectedrows=10) result = store.select("df1") tm.assert_frame_equal(result, df) # more chunksize in append tests def check(obj, comparator): for c in [10, 200, 1000]: with ensure_clean_store(setup_path, mode="w") as store: store.append("obj", obj, chunksize=c) result = store.select("obj") comparator(result, obj) df = tm.makeDataFrame() df["string"] = "foo" df["float322"] = 1.0 df["float322"] = df["float322"].astype("float32") df["bool"] = df["float322"] > 0 df["time1"] = Timestamp("20130101") df["time2"] = Timestamp("20130102") check(df, tm.assert_frame_equal) # empty frame, GH4273 with ensure_clean_store(setup_path) as store: # 0 len df_empty = DataFrame(columns=list("ABC")) store.append("df", df_empty) with pytest.raises(KeyError, match="'No object named df in the file'"): store.select("df") # repeated append of 0/non-zero frames df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) store.append("df", df) tm.assert_frame_equal(store.select("df"), df) store.append("df", df_empty) tm.assert_frame_equal(store.select("df"), df) # store df = DataFrame(columns=list("ABC")) store.put("df2", df) tm.assert_frame_equal(store.select("df2"), df) def test_append_raise(self, setup_path): with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages # list in column df = tm.makeDataFrame() df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ with pytest.raises(TypeError): store.append("df", df) # multiple invalid columns df["invalid2"] = [["a"]] * len(df) df["invalid3"] = [["a"]] * len(df) with pytest.raises(TypeError): store.append("df", df) # datetime with embedded nans as object df = tm.makeDataFrame() s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) s[0:5] = np.nan df["invalid"] = s assert df.dtypes["invalid"] == np.object_ with pytest.raises(TypeError): store.append("df", df) # directly ndarray with pytest.raises(TypeError): store.append("df", np.arange(10)) # series directly with pytest.raises(TypeError): store.append("df", Series(np.arange(10))) # appending an incompatible table df = tm.makeDataFrame() store.append("df", df) df["foo"] = "foo" with pytest.raises(ValueError): store.append("df", df) def test_table_index_incompatible_dtypes(self, setup_path): df1 = DataFrame({"a": [1, 2, 3]}) df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) with ensure_clean_store(setup_path) as store: store.put("frame", df1, format="table") with pytest.raises(TypeError): store.put("frame", df2, format="table", append=True) def test_table_values_dtypes_roundtrip(self, setup_path): with ensure_clean_store(setup_path) as store: df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") store.append("df_f8", df1) tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes) df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") store.append("df_i8", df2) tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes) # incompatible dtype with pytest.raises(ValueError): store.append("df_i8", df1) # check creation/storage/retrieval of float32 (a bit hacky to # actually create them thought) df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) store.append("df_f4", df1) tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes) assert df1.dtypes[0] == "float32" # check with mixed dtypes df1 = DataFrame( { c: Series(np.random.randint(5), dtype=c) for c in ["float32", "float64", "int32", "int64", "int16", "int8"] } ) df1["string"] = "foo" df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 df1["time1"] = Timestamp("20130101") df1["time2"] = Timestamp("20130102") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() result.index = [str(i) for i in result.index] expected = Series( { "float32": 2, "float64": 1, "int32": 1, "bool": 1, "int16": 1, "int8": 1, "int64": 1, "object": 1, "datetime64[ns]": 2, } ) result = result.sort_index() expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_table_mixed_dtypes(self, setup_path): # frame df = tm.makeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["bool3"] = True df["int1"] = 1 df["int2"] = 2 df["timestamp1"] = Timestamp("20010102") df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: store.append("df1_mixed", df) tm.assert_frame_equal(store.select("df1_mixed"), df) def test_unimplemented_dtypes_table_columns(self, setup_path): with ensure_clean_store(setup_path) as store: dtypes = [("date", datetime.date(2001, 1, 2))] # currently not supported dtypes #### for n, f in dtypes: df = tm.makeDataFrame() df[n] = f with pytest.raises(TypeError): store.append(f"df1_{n}", df) # frame df = tm.makeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" df["datetime1"] = datetime.date(2001, 1, 2) df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... with pytest.raises(TypeError): store.append("df_unimplemented", df) def test_calendar_roundtrip_issue(self, setup_path): # 8591 # doc example from tseries holiday section weekmask_egypt = "Sun Mon Tue Wed Thu" holidays = [ "2012-05-01", datetime.datetime(2013, 5, 1), np.datetime64("2014-05-01"), ] bday_egypt = pd.offsets.CustomBusinessDay( holidays=holidays, weekmask=weekmask_egypt ) dt = datetime.datetime(2013, 4, 30) dts = date_range(dt, periods=5, freq=bday_egypt) s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) with ensure_clean_store(setup_path) as store: store.put("fixed", s) result = store.select("fixed") tm.assert_series_equal(result, s) store.append("table", s) result = store.select("table") tm.assert_series_equal(result, s) def test_append_with_timedelta(self, setup_path): # GH 3577 # append timedelta df = DataFrame( { "A": Timestamp("20130101"), "B": [ Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) ], } ) df["C"] = df["A"] - df["B"] df.loc[3:5, "C"] = np.nan with ensure_clean_store(setup_path) as store: # table _maybe_remove(store, "df") store.append("df", df, data_columns=True) result = store.select("df") tm.assert_frame_equal(result, df) result = store.select("df", where="C<100000") tm.assert_frame_equal(result, df) result = store.select("df", where="C") # from the docs with ensure_clean_path(setup_path) as path: dfq = DataFrame( np.random.randn(10, 4), columns=list("ABCD"), index=date_range("20130101", periods=10), ) dfq.to_hdf(path, "dfq", format="table", data_columns=True) # check ok read_hdf( path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']" ) read_hdf(path, "dfq", where="A>0 or C>0") # catch the invalid reference with ensure_clean_path(setup_path) as path: dfq = DataFrame( np.random.randn(10, 4), columns=list("ABCD"), index=date_range("20130101", periods=10), ) dfq.to_hdf(path, "dfq", format="table") with pytest.raises(ValueError): read_hdf(path, "dfq", where="A>0 or C>0") def test_same_name_scoping(self, setup_path): with ensure_clean_store(setup_path) as store: import pandas as pd df = DataFrame( np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) ) store.put("df", df, format="table") expected = df[df.index > Timestamp("20130105")] import datetime result = store.select("df", "index>datetime.datetime(2013,1,5)") tm.assert_frame_equal(result, expected) from datetime import datetime # noqa # technically an error, but allow it result = store.select("df", "index>datetime.datetime(2013,1,5)") tm.assert_frame_equal(result, expected) result = store.select("df", "index>datetime(2013,1,5)") tm.assert_frame_equal(result, expected) def test_series(self, setup_path): s = tm.makeStringSeries() self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) ts = tm.makeTimeSeries() self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) self._check_roundtrip( ts3, tm.assert_series_equal, path=setup_path, check_index_type=False ) def test_float_index(self, setup_path): # GH #454 index = np.random.randn(10) s = Series(np.random.randn(10), index=index) self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) def test_tuple_index(self, setup_path): # GH #492 col = np.arange(10) idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] data = np.random.randn(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") def test_index_types(self, setup_path): with catch_warnings(record=True): values = np.random.randn(2) func = lambda l, r: tm.assert_series_equal(l, r, check_index_type=True) with catch_warnings(record=True): ser = Series(values, [0, "y"]) self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, ["y", 0]) self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, [datetime.date.today(), "a"]) self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, [0, "y"]) self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, ["y", 0]) self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [datetime.date.today(), "a"]) self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [1.23, "b"]) self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [1, 1.53]) self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [1, 5]) self._check_roundtrip(ser, func, path=setup_path) ser = Series( values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] ) self._check_roundtrip(ser, func, path=setup_path) def test_timeseries_preepoch(self, setup_path): dr = bdate_range("1/1/1940", "1/1/1960") ts = Series(np.random.randn(len(dr)), index=dr) try: self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) except OverflowError: pytest.skip("known failer on some windows platforms") @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) def test_frame(self, compression, setup_path): df = tm.makeDataFrame() # put in some random NAs df.values[0, 0] = np.nan df.values[5, 3] = np.nan self._check_roundtrip_table( df, tm.assert_frame_equal, path=setup_path, compression=compression ) self._check_roundtrip( df, tm.assert_frame_equal, path=setup_path, compression=compression ) tdf = tm.makeTimeDataFrame() self._check_roundtrip( tdf, tm.assert_frame_equal, path=setup_path, compression=compression ) with ensure_clean_store(setup_path) as store: # not consolidated df["foo"] = np.random.randn(len(df)) store["df"] = df recons = store["df"] assert recons._mgr.is_consolidated() # empty self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) def test_empty_series_frame(self, setup_path): s0 = Series(dtype=object) s1 = Series(name="myseries", dtype=object) df0 = DataFrame() df1 = DataFrame(index=["a", "b", "c"]) df2 = DataFrame(columns=["d", "e", "f"]) self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path) self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path) self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) @pytest.mark.parametrize( "dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"] ) def test_empty_series(self, dtype, setup_path): s = Series(dtype=dtype) self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) def test_can_serialize_dates(self, setup_path): rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] frame = DataFrame(np.random.randn(len(rng), 4), index=rng) self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) def test_store_hierarchical(self, setup_path): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["foo", "bar"], ) frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) # check that the names are stored with ensure_clean_store(setup_path) as store: store["frame"] = frame recons = store["frame"] tm.assert_frame_equal(recons, frame) def test_store_index_name(self, setup_path): df = tm.makeDataFrame() df.index.name = "foo" with ensure_clean_store(setup_path) as store: store["frame"] = df recons = store["frame"] tm.assert_frame_equal(recons, df) @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(self, table_format, setup_path): # GH #13492 idx = Index( pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), name="cols\u05d2", ) idx1 = Index( pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), name="rows\u05d0", ) df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) # This used to fail, returning numpy strings instead of python strings. with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", format=table_format) df2 = read_hdf(path, "df") tm.assert_frame_equal(df, df2, check_names=True) assert type(df2.index.name) == str assert type(df2.columns.name) == str def test_store_series_name(self, setup_path): df = tm.makeDataFrame() series = df["A"] with ensure_clean_store(setup_path) as store: store["series"] = series recons = store["series"] tm.assert_series_equal(recons, series) @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) def test_store_mixed(self, compression, setup_path): def _make_one(): df = tm.makeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["int1"] = 1 df["int2"] = 2 return df._consolidate() df1 = _make_one() df2 = _make_one() self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) with ensure_clean_store(setup_path) as store: store["obj"] = df1 tm.assert_frame_equal(store["obj"], df1) store["obj"] = df2 tm.assert_frame_equal(store["obj"], df2) # check that can store Series of all of these types self._check_roundtrip( df1["obj1"], tm.assert_series_equal, path=setup_path, compression=compression, ) self._check_roundtrip( df1["bool1"], tm.assert_series_equal, path=setup_path, compression=compression, ) self._check_roundtrip( df1["int1"], tm.assert_series_equal, path=setup_path, compression=compression, ) @pytest.mark.filterwarnings( "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" ) def test_select_with_dups(self, setup_path): # single dtypes df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) df.index = date_range("20130101 9:30", periods=10, freq="T") with ensure_clean_store(setup_path) as store: store.append("df", df) result = store.select("df") expected = df tm.assert_frame_equal(result, expected, by_blocks=True) result = store.select("df", columns=df.columns) expected = df tm.assert_frame_equal(result, expected, by_blocks=True) result = store.select("df", columns=["A"]) expected = df.loc[:, ["A"]] tm.assert_frame_equal(result, expected) # dups across dtypes df = concat( [ DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), DataFrame( np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] ), ], axis=1, ) df.index = date_range("20130101 9:30", periods=10, freq="T") with ensure_clean_store(setup_path) as store: store.append("df", df) result = store.select("df") expected = df tm.assert_frame_equal(result, expected, by_blocks=True) result = store.select("df", columns=df.columns) expected = df tm.assert_frame_equal(result, expected, by_blocks=True) expected = df.loc[:, ["A"]] result = store.select("df", columns=["A"]) tm.assert_frame_equal(result, expected, by_blocks=True) expected = df.loc[:, ["B", "A"]] result = store.select("df", columns=["B", "A"]) tm.assert_frame_equal(result, expected, by_blocks=True) # duplicates on both index and columns with ensure_clean_store(setup_path) as store: store.append("df", df) store.append("df", df) expected = df.loc[:, ["B", "A"]] expected = concat([expected, expected]) result = store.select("df", columns=["B", "A"]) tm.assert_frame_equal(result, expected, by_blocks=True) def test_overwrite_node(self, setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeDataFrame() ts = tm.makeTimeSeries() store["a"] = ts tm.assert_series_equal(store["a"], ts) def test_select(self, setup_path): with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): # select with columns= df = tm.makeTimeDataFrame() _maybe_remove(store, "df") store.append("df", df) result = store.select("df", columns=["A", "B"]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # equivalently result = store.select("df", [("columns=['A', 'B']")]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column _maybe_remove(store, "df") store.append("df", df, data_columns=["A"]) result = store.select("df", ["A > 0"], columns=["A", "B"]) expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # all a data columns _maybe_remove(store, "df") store.append("df", df, data_columns=True) result = store.select("df", ["A > 0"], columns=["A", "B"]) expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column, but different columns _maybe_remove(store, "df") store.append("df", df, data_columns=["A"]) result = store.select("df", ["A > 0"], columns=["C", "D"]) expected = df[df.A > 0].reindex(columns=["C", "D"]) tm.assert_frame_equal(expected, result) def test_select_dtypes(self, setup_path): with ensure_clean_store(setup_path) as store: # with a Timestamp data column (GH #2637) df = DataFrame( { "ts": bdate_range("2012-01-01", periods=300), "A": np.random.randn(300), } ) _maybe_remove(store, "df") store.append("df", df, data_columns=["ts", "A"]) result = store.select("df", "ts>=Timestamp('2012-02-01')") expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # bool columns (GH #2849) df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) df["object"] = "foo" df.loc[4:5, "object"] = "bar" df["boolv"] = df["A"] > 0 _maybe_remove(store, "df") store.append("df", df, data_columns=True) expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa for v in [True, "true", 1]: result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) tm.assert_frame_equal(expected, result) expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa for v in [False, "false", 0]: result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) tm.assert_frame_equal(expected, result) # integer index df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) _maybe_remove(store, "df_int") store.append("df_int", df) result = store.select("df_int", "index<10 and columns=['A']") expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) # float index df = DataFrame( { "A": np.random.rand(20), "B": np.random.rand(20), "index": np.arange(20, dtype="f8"), } ) _maybe_remove(store, "df_float") store.append("df_float", df) result = store.select("df_float", "index<10.0 and columns=['A']") expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) with ensure_clean_store(setup_path) as store: # floats w/o NaN df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) store.append("df1", df, data_columns=True) result = store.select("df1", where="values>2.0") expected = df[df["values"] > 2.0] tm.assert_frame_equal(expected, result) # floats with NaN df.iloc[0] = np.nan expected = df[df["values"] > 2.0] store.append("df2", df, data_columns=True, index=False) result = store.select("df2", where="values>2.0") tm.assert_frame_equal(expected, result) # https://github.com/PyTables/PyTables/issues/282 # bug in selection when 0th row has a np.nan and an index # store.append('df3',df,data_columns=True) # result = store.select( # 'df3', where='values>2.0') # tm.assert_frame_equal(expected, result) # not in first position float with NaN ok too df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) df.iloc[1] = np.nan expected = df[df["values"] > 2.0] store.append("df4", df, data_columns=True) result = store.select("df4", where="values>2.0") tm.assert_frame_equal(expected, result) # test selection with comparison against numpy scalar # GH 11283 with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() expected = df[df["A"] > 0] store.append("df", df, data_columns=True) np_zero = np.float64(0) # noqa result = store.select("df", where=["A>np_zero"]) tm.assert_frame_equal(expected, result) def test_select_with_many_inputs(self, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( { "ts": bdate_range("2012-01-01", periods=300), "A": np.random.randn(300), "B": range(300), "users": ["a"] * 50 + ["b"] * 50 + ["c"] * 100 + [f"a{i:03d}" for i in range(100)], } ) _maybe_remove(store, "df") store.append("df", df, data_columns=["ts", "A", "B", "users"]) # regular select result = store.select("df", "ts>=Timestamp('2012-02-01')") expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # small selector result = store.select( "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" ) expected = df[ (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) ] tm.assert_frame_equal(expected, result) # big selector along the columns selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)] result = store.select( "df", "ts>=Timestamp('2012-02-01') and users=selector" ) expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] tm.assert_frame_equal(expected, result) selector = range(100, 200) result = store.select("df", "B=selector") expected = df[df.B.isin(selector)] tm.assert_frame_equal(expected, result) assert len(result) == 100 # big selector along the index selector = Index(df.ts[0:100].values) result = store.select("df", "ts=selector") expected = df[df.ts.isin(selector.values)] tm.assert_frame_equal(expected, result) assert len(result) == 100 def test_select_iterator(self, setup_path): # single table with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame(500) _maybe_remove(store, "df") store.append("df", df) expected = store.select("df") results = list(store.select("df", iterator=True)) result = concat(results) tm.assert_frame_equal(expected, result) results = list(store.select("df", chunksize=100)) assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) results = list(store.select("df", chunksize=150)) result = concat(results) tm.assert_frame_equal(result, expected) with ensure_clean_path(setup_path) as path: df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df_non_table") with pytest.raises(TypeError): read_hdf(path, "df_non_table", chunksize=100) with pytest.raises(TypeError): read_hdf(path, "df_non_table", iterator=True) with ensure_clean_path(setup_path) as path: df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df", format="table") results = list(read_hdf(path, "df", chunksize=100)) result = concat(results) assert len(results) == 5 tm.assert_frame_equal(result, df) tm.assert_frame_equal(result, read_hdf(path, "df")) # multiple with ensure_clean_store(setup_path) as store: df1 = tm.makeTimeDataFrame(500) store.append("df1", df1, data_columns=True) df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) df2["foo"] = "bar" store.append("df2", df2) df = concat([df1, df2], axis=1) # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") results = list( store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) ) result = concat(results) tm.assert_frame_equal(expected, result) def test_select_iterator_complete_8014(self, setup_path): # GH 8014 # using iterator and where clause chunksize = 1e4 # no iterator with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/o iteration and no where clause works result = store.select("df") tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, begin # of range, works where = f"index >= '{beg_dt}'" result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, end # of range, works where = f"index <= '{end_dt}'" result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, inclusive range, # works where = f"index >= '{beg_dt}' & index <= '{end_dt}'" result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # with iterator, full range with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/iterator and no where clause works results = list(store.select("df", chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = f"index >= '{beg_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = f"index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, inclusive range where = f"index >= '{beg_dt}' & index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) def test_select_iterator_non_complete_8014(self, setup_path): # GH 8014 # using iterator and where clause chunksize = 1e4 # with iterator, non complete range with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[1] end_dt = expected.index[-2] # select w/iterator and where clause, single term, begin of range where = f"index >= '{beg_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = f"index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = f"index >= '{beg_dt}' & index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[ (expected.index >= beg_dt) & (expected.index <= end_dt) ] tm.assert_frame_equal(rexpected, result) # with iterator, empty where with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) end_dt = expected.index[-1] # select w/iterator and where clause, single term, begin of range where = f"index > '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) assert 0 == len(results) def test_select_iterator_many_empty_frames(self, setup_path): # GH 8014 # using iterator and where clause can return many empty # frames. chunksize = int(1e4) # with iterator, range limited to the first chunk with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100000, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[chunksize - 1] # select w/iterator and where clause, single term, begin of range where = f"index >= '{beg_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = f"index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) assert len(results) == 1 result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = f"index >= '{beg_dt}' & index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) # should be 1, is 10 assert len(results) == 1 result = concat(results) rexpected = expected[ (expected.index >= beg_dt) & (expected.index <= end_dt) ] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause which selects # *nothing*. # # To be consistent with Python idiom I suggest this should # return [] e.g. `for e in []: print True` never prints # True. where = f"index <= '{beg_dt}' & index >= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) # should be [] assert len(results) == 0 @pytest.mark.filterwarnings( "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" ) def test_retain_index_attributes(self, setup_path): # GH 3499, losing frequency info on index recreation df = DataFrame( {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} ) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "data") store.put("data", df, format="table") result = store.get("data") tm.assert_frame_equal(df, result) for attr in ["freq", "tz", "name"]: for idx in ["index", "columns"]: assert getattr(getattr(df, idx), attr, None) == getattr( getattr(result, idx), attr, None ) # try to append a table with a different frequency with catch_warnings(record=True): df2 = DataFrame( { "A": Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) } ) store.append("data", df2) assert store.get_storer("data").info["index"]["freq"] is None # this is ok _maybe_remove(store, "df2") df2 = DataFrame( { "A": Series( range(3), index=[ Timestamp("20010101"), Timestamp("20010102"), Timestamp("20020101"), ], ) } ) store.append("df2", df2) df3 = DataFrame( { "A": Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) } ) store.append("df2", df3) @pytest.mark.filterwarnings( "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" ) def test_retain_index_attributes2(self, setup_path): with ensure_clean_path(setup_path) as path: with catch_warnings(record=True): df = DataFrame( { "A": Series( range(3), index=date_range("2000-1-1", periods=3, freq="H") ) } ) df.to_hdf(path, "data", mode="w", append=True) df2 = DataFrame( { "A": Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) } ) df2.to_hdf(path, "data", append=True) idx = date_range("2000-1-1", periods=3, freq="H") idx.name = "foo" df = DataFrame({"A": Series(range(3), index=idx)}) df.to_hdf(path, "data", mode="w", append=True) assert read_hdf(path, "data").index.name == "foo" with catch_warnings(record=True): idx2 = date_range("2001-1-1", periods=3, freq="H") idx2.name = "bar" df2 = DataFrame({"A": Series(range(3), index=idx2)}) df2.to_hdf(path, "data", append=True) assert read_hdf(path, "data").index.name is None def test_frame_select(self, setup_path): df = tm.makeTimeDataFrame() with ensure_clean_store(setup_path) as store: store.put("frame", df, format="table") date = df.index[len(df) // 2] crit1 = Term("index>=date") assert crit1.env.scope["date"] == date crit2 = "columns=['A', 'D']" crit3 = "columns=A" result = store.select("frame", [crit1, crit2]) expected = df.loc[date:, ["A", "D"]] tm.assert_frame_equal(result, expected) result = store.select("frame", [crit3]) expected = df.loc[:, ["A"]] tm.assert_frame_equal(result, expected) # invalid terms df = tm.makeTimeDataFrame() store.append("df_time", df) with pytest.raises(ValueError): store.select("df_time", "index>0") # can't select if not written as table # store['frame'] = df # with pytest.raises(ValueError): # store.select('frame', [crit1, crit2]) def test_frame_select_complex(self, setup_path): # select via complex criteria df = tm.makeTimeDataFrame() df["string"] = "foo" df.loc[df.index[0:4], "string"] = "bar" with ensure_clean_store(setup_path) as store: store.put("df", df, format="table", data_columns=["string"]) # empty result = store.select("df", 'index>df.index[3] & string="bar"') expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] tm.assert_frame_equal(result, expected) result = store.select("df", 'index>df.index[3] & string="foo"') expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] tm.assert_frame_equal(result, expected) # or result = store.select("df", 'index>df.index[3] | string="bar"') expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] tm.assert_frame_equal(result, expected) result = store.select( "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"' ) expected = df.loc[ ((df.index > df.index[3]) & (df.index <= df.index[6])) | (df.string == "bar") ] tm.assert_frame_equal(result, expected) # invert result = store.select("df", 'string!="bar"') expected = df.loc[df.string != "bar"] tm.assert_frame_equal(result, expected) # invert not implemented in numexpr :( with pytest.raises(NotImplementedError): store.select("df", '~(string="bar")') # invert ok for filters result = store.select("df", "~(columns=['A','B'])") expected = df.loc[:, df.columns.difference(["A", "B"])] tm.assert_frame_equal(result, expected) # in result = store.select("df", "index>df.index[3] & columns in ['A','B']") expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_frame_select_complex2(self, setup_path): with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: pp, hh = paths # use non-trivial selection criteria parms = DataFrame({"A": [1, 1, 2, 2, 3]}) parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) selection = read_hdf(pp, "df", where="A=[2,3]") hist = DataFrame( np.random.randn(25, 1), columns=["data"], index=MultiIndex.from_tuples( [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] ), ) hist.to_hdf(hh, "df", mode="w", format="table") expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") # scope with list like l = selection.index.tolist() # noqa store = HDFStore(hh) result = store.select("df", where="l1=l") tm.assert_frame_equal(result, expected) store.close() result = read_hdf(hh, "df", where="l1=l") tm.assert_frame_equal(result, expected) # index index = selection.index # noqa result = read_hdf(hh, "df", where="l1=index") tm.assert_frame_equal(result, expected) result = read_hdf(hh, "df", where="l1=selection.index") tm.assert_frame_equal(result, expected) result = read_hdf(hh, "df", where="l1=selection.index.tolist()") tm.assert_frame_equal(result, expected) result = read_hdf(hh, "df", where="l1=list(selection.index)") tm.assert_frame_equal(result, expected) # scope with index store = HDFStore(hh) result = store.select("df", where="l1=index") tm.assert_frame_equal(result, expected) result = store.select("df", where="l1=selection.index") tm.assert_frame_equal(result, expected) result = store.select("df", where="l1=selection.index.tolist()") tm.assert_frame_equal(result, expected) result = store.select("df", where="l1=list(selection.index)") tm.assert_frame_equal(result, expected) store.close() def test_invalid_filtering(self, setup_path): # can't use more than one filter (atm) df = tm.makeTimeDataFrame() with ensure_clean_store(setup_path) as store: store.put("df", df, format="table") # not implemented with pytest.raises(NotImplementedError): store.select("df", "columns=['A'] | columns=['B']") # in theory we could deal with this with pytest.raises(NotImplementedError): store.select("df", "columns=['A','B'] & columns=['C']") def test_string_select(self, setup_path): # GH 2973 with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame() # test string ==/!= df["x"] = "none" df.loc[df.index[2:7], "x"] = "" store.append("df", df, data_columns=["x"]) result = store.select("df", "x=none") expected = df[df.x == "none"] tm.assert_frame_equal(result, expected) result = store.select("df", "x!=none") expected = df[df.x != "none"] tm.assert_frame_equal(result, expected) df2 = df.copy() df2.loc[df2.x == "", "x"] = np.nan store.append("df2", df2, data_columns=["x"]) result = store.select("df2", "x!=none") expected = df2[isna(df2.x)] tm.assert_frame_equal(result, expected) # int ==/!= df["int"] = 1 df.loc[df.index[2:7], "int"] = 2 store.append("df3", df, data_columns=["int"]) result = store.select("df3", "int=2") expected = df[df.int == 2] tm.assert_frame_equal(result, expected) result = store.select("df3", "int!=2") expected = df[df.int != 2] tm.assert_frame_equal(result, expected) def test_read_column(self, setup_path): df = tm.makeTimeDataFrame() with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") # GH 17912 # HDFStore.select_column should raise a KeyError # exception if the key is not a valid store with pytest.raises(KeyError, match="No object named df in the file"): store.select_column("df", "index") store.append("df", df) # error with pytest.raises( KeyError, match=re.escape("'column [foo] not found in the table'") ): store.select_column("df", "foo") with pytest.raises(Exception): store.select_column("df", "index", where=["index>5"]) # valid result = store.select_column("df", "index") tm.assert_almost_equal(result.values, Series(df.index).values) assert isinstance(result, Series) # not a data indexable column with pytest.raises(ValueError): store.select_column("df", "values_block_0") # a data column df2 = df.copy() df2["string"] = "foo" store.append("df2", df2, data_columns=["string"]) result = store.select_column("df2", "string") tm.assert_almost_equal(result.values, df2["string"].values) # a data column with NaNs, result excludes the NaNs df3 = df.copy() df3["string"] = "foo" df3.loc[df3.index[4:6], "string"] = np.nan store.append("df3", df3, data_columns=["string"]) result = store.select_column("df3", "string") tm.assert_almost_equal(result.values, df3["string"].values) # start/stop result = store.select_column("df3", "string", start=2) tm.assert_almost_equal(result.values, df3["string"].values[2:]) result = store.select_column("df3", "string", start=-2) tm.assert_almost_equal(result.values, df3["string"].values[-2:]) result = store.select_column("df3", "string", stop=2) tm.assert_almost_equal(result.values, df3["string"].values[:2]) result = store.select_column("df3", "string", stop=-2) tm.assert_almost_equal(result.values, df3["string"].values[:-2]) result = store.select_column("df3", "string", start=2, stop=-2) tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) result = store.select_column("df3", "string", start=-2, stop=2) tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) # GH 10392 - make sure column name is preserved df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) store.append("df4", df4, data_columns=True) expected = df4["B"] result = store.select_column("df4", "B") tm.assert_series_equal(result, expected) def test_coordinates(self, setup_path): df = tm.makeTimeDataFrame() with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") store.append("df", df) # all c = store.select_as_coordinates("df") assert (c.values == np.arange(len(df.index))).all() # get coordinates back & test vs frame _maybe_remove(store, "df") df = DataFrame({"A": range(5), "B": range(5)}) store.append("df", df) c = store.select_as_coordinates("df", ["index<3"]) assert (c.values == np.arange(3)).all() result = store.select("df", where=c) expected = df.loc[0:2, :] tm.assert_frame_equal(result, expected) c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) assert (c.values == np.arange(2) + 3).all() result = store.select("df", where=c) expected = df.loc[3:4, :] tm.assert_frame_equal(result, expected) assert isinstance(c, Index) # multiple tables _maybe_remove(store, "df1") _maybe_remove(store, "df2") df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) store.append("df1", df1, data_columns=["A", "B"]) store.append("df2", df2) c = store.select_as_coordinates("df1", ["A>0", "B>0"]) df1_result = store.select("df1", c) df2_result = store.select("df2", c) result = concat([df1_result, df2_result], axis=1) expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) # pass array/mask as the coordinates with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.randn(1000, 2), index=date_range("20000101", periods=1000) ) store.append("df", df) c = store.select_column("df", "index") where = c[DatetimeIndex(c).month == 5].index expected = df.iloc[where] # locations result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # boolean result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # invalid with pytest.raises(ValueError): store.select("df", where=np.arange(len(df), dtype="float64")) with pytest.raises(ValueError): store.select("df", where=np.arange(len(df) + 1)) with pytest.raises(ValueError): store.select("df", where=np.arange(len(df)), start=5) with pytest.raises(ValueError): store.select("df", where=np.arange(len(df)), start=5, stop=10) # selection with filter selection = date_range("20000101", periods=500) result = store.select("df", where="index in selection") expected = df[df.index.isin(selection)] tm.assert_frame_equal(result, expected) # list df = DataFrame(np.random.randn(10, 2)) store.append("df2", df) result = store.select("df2", where=[0, 3, 5]) expected = df.iloc[[0, 3, 5]] tm.assert_frame_equal(result, expected) # boolean where = [True] * 10 where[-2] = False result = store.select("df2", where=where) expected = df.loc[where] tm.assert_frame_equal(result, expected) # start/stop result = store.select("df2", start=5, stop=10) expected = df[5:10] tm.assert_frame_equal(result, expected) def test_append_to_multiple(self, setup_path): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df2["foo"] = "bar" df = concat([df1, df2], axis=1) with ensure_clean_store(setup_path) as store: # exceptions with pytest.raises(ValueError): store.append_to_multiple( {"df1": ["A", "B"], "df2": None}, df, selector="df3" ) with pytest.raises(ValueError): store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") with pytest.raises(ValueError): store.append_to_multiple("df1", df, "df1") # regular operation store.append_to_multiple( {"df1": ["A", "B"], "df2": None}, df, selector="df1" ) result = store.select_as_multiple( ["df1", "df2"], where=["A>0", "B>0"], selector="df1" ) expected = df[(df.A > 0) & (df.B > 0)] tm.assert_frame_equal(result, expected) def test_append_to_multiple_dropna(self, setup_path): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) with ensure_clean_store(setup_path) as store: # dropna=True should guarantee rows are synchronized store.append_to_multiple( {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True ) result = store.select_as_multiple(["df1", "df2"]) expected = df.dropna() tm.assert_frame_equal(result, expected) tm.assert_index_equal(store.select("df1").index, store.select("df2").index) @pytest.mark.xfail( run=False, reason="append_to_multiple_dropna_false is not raising as failed" ) def test_append_to_multiple_dropna_false(self, setup_path): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) with ensure_clean_store(setup_path) as store: # dropna=False shouldn't synchronize row indexes store.append_to_multiple( {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False ) with pytest.raises(ValueError): store.select_as_multiple(["df1a", "df2a"]) assert not store.select("df1a").index.equals(store.select("df2a").index) def test_append_to_multiple_min_itemsize(self, setup_path): # GH 11238 df = DataFrame( { "IX": np.arange(1, 21), "Num": np.arange(1, 21), "BigNum": np.arange(1, 21) * 88, "Str": ["a" for _ in range(20)], "LongStr": ["abcde" for _ in range(20)], } ) expected = df.iloc[[0]] with ensure_clean_store(setup_path) as store: store.append_to_multiple( { "index": ["IX"], "nums": ["Num", "BigNum"], "strs": ["Str", "LongStr"], }, df.iloc[[0]], "index", min_itemsize={"Str": 10, "LongStr": 100, "Num": 2}, ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected) def test_select_as_multiple(self, setup_path): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df2["foo"] = "bar" with ensure_clean_store(setup_path) as store: # no tables stored with pytest.raises(Exception): store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") store.append("df1", df1, data_columns=["A", "B"]) store.append("df2", df2) # exceptions with pytest.raises(Exception): store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") with pytest.raises(Exception): store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") msg = "'No object named df3 in the file'" with pytest.raises(KeyError, match=msg): store.select_as_multiple( ["df1", "df3"], where=["A>0", "B>0"], selector="df1" ) with pytest.raises(KeyError, match=msg): store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") with pytest.raises(KeyError, match="'No object named df4 in the file'"): store.select_as_multiple( ["df1", "df2"], where=["A>0", "B>0"], selector="df4" ) # default select result = store.select("df1", ["A>0", "B>0"]) expected = store.select_as_multiple( ["df1"], where=["A>0", "B>0"], selector="df1" ) tm.assert_frame_equal(result, expected) expected = store.select_as_multiple( "df1", where=["A>0", "B>0"], selector="df1" ) tm.assert_frame_equal(result, expected) # multiple result = store.select_as_multiple( ["df1", "df2"], where=["A>0", "B>0"], selector="df1" ) expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) # multiple (diff selector) result = store.select_as_multiple( ["df1", "df2"], where="index>df2.index[4]", selector="df2" ) expected = concat([df1, df2], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) # test exception for diff rows store.append("df3", tm.makeTimeDataFrame(nper=50)) with pytest.raises(ValueError): store.select_as_multiple( ["df1", "df3"], where=["A>0", "B>0"], selector="df1" ) @pytest.mark.skipif( LooseVersion(tables.__version__) < LooseVersion("3.1.0"), reason=("tables version does not support fix for nan selection bug: GH 4858"), ) def test_nan_selection_bug_4858(self, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) df.iloc[0] = np.nan expected = DataFrame( {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]}, index=[3, 4, 5], ) # write w/o the index on that particular column store.append("df", df, data_columns=True, index=["cols"]) result = store.select("df", where="values>2.0") tm.assert_frame_equal(result, expected) def test_start_stop_table(self, setup_path): with ensure_clean_store(setup_path) as store: # table df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) store.append("df", df) result = store.select("df", "columns=['A']", start=0, stop=5) expected = df.loc[0:4, ["A"]] tm.assert_frame_equal(result, expected) # out of range result = store.select("df", "columns=['A']", start=30, stop=40) assert len(result) == 0 expected = df.loc[30:40, ["A"]] tm.assert_frame_equal(result, expected) def test_start_stop_multiple(self, setup_path): # GH 16209 with ensure_clean_store(setup_path) as store: df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) store.append_to_multiple( {"selector": ["foo"], "data": None}, df, selector="selector" ) result = store.select_as_multiple( ["selector", "data"], selector="selector", start=0, stop=1 ) expected = df.loc[[0], ["foo", "bar"]] tm.assert_frame_equal(result, expected) def test_start_stop_fixed(self, setup_path): with ensure_clean_store(setup_path) as store: # fixed, GH 8287 df = DataFrame( {"A": np.random.rand(20), "B": np.random.rand(20)}, index=pd.date_range("20130101", periods=20), ) store.put("df", df) result = store.select("df", start=0, stop=5) expected = df.iloc[0:5, :] tm.assert_frame_equal(result, expected) result = store.select("df", start=5, stop=10) expected = df.iloc[5:10, :] tm.assert_frame_equal(result, expected) # out of range result = store.select("df", start=30, stop=40) expected = df.iloc[30:40, :] tm.assert_frame_equal(result, expected) # series s = df.A store.put("s", s) result = store.select("s", start=0, stop=5) expected = s.iloc[0:5] tm.assert_series_equal(result, expected) result = store.select("s", start=5, stop=10) expected = s.iloc[5:10] tm.assert_series_equal(result, expected) # sparse; not implemented df = tm.makeDataFrame() df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan def test_select_filter_corner(self, setup_path): df = DataFrame(np.random.randn(50, 100)) df.index = [f"{c:3d}" for c in df.index] df.columns = [f"{c:3d}" for c in df.columns] with ensure_clean_store(setup_path) as store: store.put("frame", df, format="table") crit = "columns=df.columns[:75]" result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) crit = "columns=df.columns[:75:2]" result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) def test_path_pathlib(self, setup_path): df = tm.makeDataFrame() result = tm.round_trip_pathlib( lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") ) tm.assert_frame_equal(df, result) @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) def test_contiguous_mixed_data_table(self, start, stop, setup_path): # GH 17021 # ValueError when reading a contiguous mixed-data table ft. VLArray df = DataFrame( { "a": Series([20111010, 20111011, 20111012]), "b": Series(["ab", "cd", "ab"]), } ) with ensure_clean_store(setup_path) as store: store.append("test_dataset", df) result = store.select("test_dataset", start=start, stop=stop) tm.assert_frame_equal(df[start:stop], result) def test_path_pathlib_hdfstore(self, setup_path): df = tm.makeDataFrame() def writer(path): with HDFStore(path) as store: df.to_hdf(store, "df") def reader(path): with HDFStore(path) as store: return pd.read_hdf(store, "df") result = tm.round_trip_pathlib(writer, reader) tm.assert_frame_equal(df, result) def test_pickle_path_localpath(self, setup_path): df = tm.makeDataFrame() result = tm.round_trip_pathlib( lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") ) tm.assert_frame_equal(df, result) def test_path_localpath_hdfstore(self, setup_path): df = tm.makeDataFrame() def writer(path): with HDFStore(path) as store: df.to_hdf(store, "df") def reader(path): with HDFStore(path) as store: return pd.read_hdf(store, "df") result = tm.round_trip_localpath(writer, reader) tm.assert_frame_equal(df, result) def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs): options = {} if compression: options["complib"] = _default_compressor with ensure_clean_store(path, "w", **options) as store: store["obj"] = obj retrieved = store["obj"] comparator(retrieved, obj, **kwargs) def _check_double_roundtrip( self, obj, comparator, path, compression=False, **kwargs ): options = {} if compression: options["complib"] = compression or _default_compressor with ensure_clean_store(path, "w", **options) as store: store["obj"] = obj retrieved = store["obj"] comparator(retrieved, obj, **kwargs) store["obj"] = retrieved again = store["obj"] comparator(again, obj, **kwargs) def _check_roundtrip_table(self, obj, comparator, path, compression=False): options = {} if compression: options["complib"] = _default_compressor with ensure_clean_store(path, "w", **options) as store: store.put("obj", obj, format="table") retrieved = store["obj"] comparator(retrieved, obj) def test_multiple_open_close(self, setup_path): # gh-4409: open & close multiple times with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.to_hdf(path, "df", mode="w", format="table") # single store = HDFStore(path) assert "CLOSED" not in store.info() assert store.is_open store.close() assert "CLOSED" in store.info() assert not store.is_open with ensure_clean_path(setup_path) as path: if pytables._table_file_open_policy_is_strict: # multiples store1 = HDFStore(path) with pytest.raises(ValueError): HDFStore(path) store1.close() else: # multiples store1 = HDFStore(path) store2 = HDFStore(path) assert "CLOSED" not in store1.info() assert "CLOSED" not in store2.info() assert store1.is_open assert store2.is_open store1.close() assert "CLOSED" in store1.info() assert not store1.is_open assert "CLOSED" not in store2.info() assert store2.is_open store2.close() assert "CLOSED" in store1.info() assert "CLOSED" in store2.info() assert not store1.is_open assert not store2.is_open # nested close store = HDFStore(path, mode="w") store.append("df", df) store2 = HDFStore(path) store2.append("df2", df) store2.close() assert "CLOSED" in store2.info() assert not store2.is_open store.close() assert "CLOSED" in store.info() assert not store.is_open # double closing store = HDFStore(path, mode="w") store.append("df", df) store2 = HDFStore(path) store.close() assert "CLOSED" in store.info() assert not store.is_open store2.close() assert "CLOSED" in store2.info() assert not store2.is_open # ops on a closed store with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.to_hdf(path, "df", mode="w", format="table") store = HDFStore(path) store.close() with pytest.raises(ClosedFileError): store.keys() with pytest.raises(ClosedFileError): "df" in store with pytest.raises(ClosedFileError): len(store) with pytest.raises(ClosedFileError): store["df"] with pytest.raises(AttributeError): store.df with pytest.raises(ClosedFileError): store.select("df") with pytest.raises(ClosedFileError): store.get("df") with pytest.raises(ClosedFileError): store.append("df2", df) with pytest.raises(ClosedFileError): store.put("df3", df) with pytest.raises(ClosedFileError): store.get_storer("df2") with pytest.raises(ClosedFileError): store.remove("df2") with pytest.raises(ClosedFileError, match="file is not open"): store.select("df") def test_pytables_native_read(self, datapath, setup_path): with ensure_clean_store( datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" ) as store: d2 = store["detector/readout"] assert isinstance(d2, DataFrame) @pytest.mark.skipif( is_platform_windows(), reason="native2 read fails oddly on windows" ) def test_pytables_native2_read(self, datapath, setup_path): with ensure_clean_store( datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" ) as store: str(store) d1 = store["detector"] assert isinstance(d1, DataFrame) def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): # GH 24510 # legacy table with fixed format written in Python 2 with ensure_clean_store( datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" ) as store: result = store.select("df") expected = DataFrame( [[1, 2, 3, "D"]], columns=["A", "B", "C", "D"], index=Index(["ABC"], name="INDEX_NAME"), ) tm.assert_frame_equal(expected, result) def test_legacy_table_fixed_format_read_datetime_py2(self, datapath, setup_path): # GH 31750 # legacy table with fixed format and datetime64 column written in Python 2 with ensure_clean_store( datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), mode="r", ) as store: result = store.select("df") expected = DataFrame( [[Timestamp("2020-02-06T18:00")]], columns=["A"], index=Index(["date"]), ) tm.assert_frame_equal(expected, result) def test_legacy_table_read_py2(self, datapath, setup_path): # issue: 24925 # legacy table written in Python 2 with ensure_clean_store( datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" ) as store: result = store.select("table") expected = DataFrame({"a": ["a", "b"], "b": [2, 3]}) tm.assert_frame_equal(expected, result) def test_copy(self, setup_path): with catch_warnings(record=True): def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): try: store = HDFStore(f, "r") if new_f is None: import tempfile fd, new_f = tempfile.mkstemp() tstore = store.copy( new_f, keys=keys, propindexes=propindexes, **kwargs ) # check keys if keys is None: keys = store.keys() assert set(keys) == set(tstore.keys()) # check indices & nrows for k in tstore.keys(): if tstore.get_storer(k).is_table: new_t = tstore.get_storer(k) orig_t = store.get_storer(k) assert orig_t.nrows == new_t.nrows # check propindixes if propindexes: for a in orig_t.axes: if a.is_indexed: assert new_t[a.name].is_indexed finally: safe_close(store) safe_close(tstore) try: os.close(fd) except (OSError, ValueError): pass os.remove(new_f) # new table df = tm.makeDataFrame() with tm.ensure_clean() as path: st = HDFStore(path) st.append("df", df, data_columns=["A"]) st.close() do_copy(f=path) do_copy(f=path, propindexes=False) def test_store_datetime_fractional_secs(self, setup_path): with ensure_clean_store(setup_path) as store: dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) series = Series([0], [dt]) store["a"] = series assert store["a"].index[0] == dt def test_tseries_indices_series(self, setup_path): with ensure_clean_store(setup_path) as store: idx = tm.makeDateIndex(10) ser = Series(np.random.randn(len(idx)), idx) store["a"] = ser result = store["a"] tm.assert_series_equal(result, ser) assert result.index.freq == ser.index.freq tm.assert_class_equal(result.index, ser.index, obj="series index") idx = tm.makePeriodIndex(10) ser = Series(np.random.randn(len(idx)), idx) store["a"] = ser result = store["a"] tm.assert_series_equal(result, ser) assert result.index.freq == ser.index.freq tm.assert_class_equal(result.index, ser.index, obj="series index") def test_tseries_indices_frame(self, setup_path): with ensure_clean_store(setup_path) as store: idx = tm.makeDateIndex(10) df = DataFrame(np.random.randn(len(idx), 3), index=idx) store["a"] = df result = store["a"] tm.assert_frame_equal(result, df) assert result.index.freq == df.index.freq tm.assert_class_equal(result.index, df.index, obj="dataframe index") idx = tm.makePeriodIndex(10) df = DataFrame(np.random.randn(len(idx), 3), idx) store["a"] = df result = store["a"] tm.assert_frame_equal(result, df) assert result.index.freq == df.index.freq tm.assert_class_equal(result.index, df.index, obj="dataframe index") def test_unicode_index(self, setup_path): unicode_values = ["\u03c3", "\u03c3\u03c3"] # PerformanceWarning with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) def test_unicode_longer_encoded(self, setup_path): # GH 11234 char = "\u0394" df = DataFrame({"A": [char]}) with ensure_clean_store(setup_path) as store: store.put("df", df, format="table", encoding="utf-8") result = store.get("df") tm.assert_frame_equal(result, df) df = DataFrame({"A": ["a", char], "B": ["b", "b"]}) with ensure_clean_store(setup_path) as store: store.put("df", df, format="table", encoding="utf-8") result = store.get("df") tm.assert_frame_equal(result, df) def test_store_datetime_mixed(self, setup_path): df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) ts = tm.makeTimeSeries() df["d"] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path) # FIXME: don't leave commented-out code # def test_cant_write_multiindex_table(self): # # for now, #1848 # df = DataFrame(np.random.randn(10, 4), # index=[np.arange(5).repeat(2), # np.tile(np.arange(2), 5)]) # # with pytest.raises(Exception): # store.put('foo', df, format='table') def test_append_with_diff_col_name_types_raises_value_error(self, setup_path): df = DataFrame(np.random.randn(10, 1)) df2 = DataFrame({"a": np.random.randn(10)}) df3 = DataFrame({(1, 2): np.random.randn(10)}) df4 = DataFrame({("1", 2): np.random.randn(10)}) df5 = DataFrame({("1", 2, object): np.random.randn(10)}) with ensure_clean_store(setup_path) as store: name = f"df_{tm.rands(10)}" store.append(name, df) for d in (df2, df3, df4, df5): with pytest.raises(ValueError): store.append(name, d) def test_query_with_nested_special_character(self, setup_path): df = DataFrame( { "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], "b": [1, 2, 3, 4, 5, 6, 7, 8], } ) expected = df[df.a == "test & test"] with ensure_clean_store(setup_path) as store: store.append("test", df, format="table", data_columns=True) result = store.select("test", 'a = "test & test"') tm.assert_frame_equal(expected, result) def test_categorical(self, setup_path): with ensure_clean_store(setup_path) as store: # Basic _maybe_remove(store, "s") s = Series( Categorical( ["a", "b", "b", "a", "a", "c"], categories=["a", "b", "c", "d"], ordered=False, ) ) store.append("s", s, format="table") result = store.select("s") tm.assert_series_equal(s, result) _maybe_remove(store, "s_ordered") s = Series( Categorical( ["a", "b", "b", "a", "a", "c"], categories=["a", "b", "c", "d"], ordered=True, ) ) store.append("s_ordered", s, format="table") result = store.select("s_ordered") tm.assert_series_equal(s, result) _maybe_remove(store, "df") df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) store.append("df", df, format="table") result = store.select("df") tm.assert_frame_equal(result, df) # Dtypes _maybe_remove(store, "si") s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") store.append("si", s) result = store.select("si") tm.assert_series_equal(result, s) _maybe_remove(store, "si2") s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") store.append("si2", s) result = store.select("si2") tm.assert_series_equal(result, s) # Multiple _maybe_remove(store, "df2") df2 = df.copy() df2["s2"] = Series(list("abcdefg")).astype("category") store.append("df2", df2) result = store.select("df2") tm.assert_frame_equal(result, df2) # Make sure the metadata is OK info = store.info() assert "/df2 " in info # assert '/df2/meta/values_block_0/meta' in info assert "/df2/meta/values_block_1/meta" in info # unordered _maybe_remove(store, "s2") s = Series( Categorical( ["a", "b", "b", "a", "a", "c"], categories=["a", "b", "c", "d"], ordered=False, ) ) store.append("s2", s, format="table") result = store.select("s2") tm.assert_series_equal(result, s) # Query _maybe_remove(store, "df3") store.append("df3", df, data_columns=["s"]) expected = df[df.s.isin(["b", "c"])] result = store.select("df3", where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) expected = df[df.s.isin(["b", "c"])] result = store.select("df3", where=['s = ["b","c"]']) tm.assert_frame_equal(result, expected) expected = df[df.s.isin(["d"])] result = store.select("df3", where=['s in ["d"]']) tm.assert_frame_equal(result, expected) expected = df[df.s.isin(["f"])] result = store.select("df3", where=['s in ["f"]']) tm.assert_frame_equal(result, expected) # Appending with same categories is ok store.append("df3", df) df = concat([df, df]) expected = df[df.s.isin(["b", "c"])] result = store.select("df3", where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) # Appending must have the same categories df3 = df.copy() df3["s"] = df3["s"].cat.remove_unused_categories() with pytest.raises(ValueError): store.append("df3", df3) # Remove, and make sure meta data is removed (its a recursive # removal so should be). result = store.select("df3/meta/s/meta") assert result is not None store.remove("df3") with pytest.raises( KeyError, match="'No object named df3/meta/s/meta in the file'" ): store.select("df3/meta/s/meta") def test_categorical_conversion(self, setup_path): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if # where criteria isn't met. obsids = ["ESP_012345_6789", "ESP_987654_3210"] imgids = ["APF00006np", "APF0001imm"] data = [4.3, 9.8] # Test without categories df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data}) # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", format="table", data_columns=True) result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) # Test with categories df.obsids = df.obsids.astype("category") df.imgids = df.imgids.astype("category") # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", format="table", data_columns=True) result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) def test_categorical_nan_only_columns(self, setup_path): # GH18413 # Check that read_hdf with categorical columns with NaN-only values can # be read back. df = DataFrame( { "a": ["a", "b", "c", np.nan], "b": [np.nan, np.nan, np.nan, np.nan], "c": [1, 2, 3, 4], "d": Series([None] * 4, dtype=object), } ) df["a"] = df.a.astype("category") df["b"] = df.b.astype("category") df["d"] = df.b.astype("category") expected = df with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", format="table", data_columns=True) result = read_hdf(path, "df") tm.assert_frame_equal(result, expected) def test_duplicate_column_name(self, setup_path): df = DataFrame(columns=["a", "a"], data=[[0, 0]]) with ensure_clean_path(setup_path) as path: with pytest.raises(ValueError): df.to_hdf(path, "df", format="fixed") df.to_hdf(path, "df", format="table") other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) def test_round_trip_equals(self, setup_path): # GH 9330 df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", format="table") other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) def test_preserve_timedeltaindex_type(self, setup_path): # GH9635 # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve # the type of the index. df = DataFrame(np.random.normal(size=(10, 5))) df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") with ensure_clean_store(setup_path) as store: store["df"] = df tm.assert_frame_equal(store["df"], df) def test_columns_multiindex_modified(self, setup_path): # BUG: 7212 # read_hdf store.select modified the passed columns parameters # when multi-indexed. df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) df.index.name = "letters" df = df.set_index(keys="E", append=True) data_columns = df.index.names + df.columns.tolist() with ensure_clean_path(setup_path) as path: df.to_hdf( path, "df", mode="a", append=True, data_columns=data_columns, index=False, ) cols2load = list("BCD") cols2load_original = list(cols2load) df_loaded = read_hdf(path, "df", columns=cols2load) # noqa assert cols2load_original == cols2load @ignore_natural_naming_warning def test_to_hdf_with_object_column_names(self, setup_path): # GH9057 # Writing HDF5 table format should only work for string-like # column types types_should_fail = [ tm.makeIntIndex, tm.makeFloatIndex, tm.makeDateIndex, tm.makeTimedeltaIndex, tm.makePeriodIndex, ] types_should_run = [ tm.makeStringIndex, tm.makeCategoricalIndex, tm.makeUnicodeIndex, ] for index in types_should_fail: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(setup_path) as path: with catch_warnings(record=True): msg = "cannot have non-object label DataIndexableCol" with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", format="table", data_columns=True) for index in types_should_run: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(setup_path) as path: with catch_warnings(record=True): df.to_hdf(path, "df", format="table", data_columns=True) result = pd.read_hdf(path, "df", where=f"index = [{df.index[0]}]") assert len(result) def test_read_hdf_open_store(self, setup_path): # GH10330 # No check for non-string path_or-buf, and no test of open store df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) df.index.name = "letters" df = df.set_index(keys="E", append=True) with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", mode="w") direct = read_hdf(path, "df") store = HDFStore(path, mode="r") indirect = read_hdf(store, "df") tm.assert_frame_equal(direct, indirect) assert store.is_open store.close() def test_read_hdf_iterator(self, setup_path): df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) df.index.name = "letters" df = df.set_index(keys="E", append=True) with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", mode="w", format="t") direct = read_hdf(path, "df") iterator = read_hdf(path, "df", iterator=True) assert isinstance(iterator, TableIterator) indirect = next(iterator.__iter__()) tm.assert_frame_equal(direct, indirect) iterator.store.close() def test_read_hdf_errors(self, setup_path): df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with ensure_clean_path(setup_path) as path: with pytest.raises(IOError): read_hdf(path, "key") df.to_hdf(path, "df") store = HDFStore(path, mode="r") store.close() with pytest.raises(IOError): read_hdf(store, "df") def test_read_hdf_generic_buffer_errors(self): with pytest.raises(NotImplementedError): read_hdf(BytesIO(b""), "df") def test_invalid_complib(self, setup_path): df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with tm.ensure_clean(setup_path) as path: with pytest.raises(ValueError): df.to_hdf(path, "df", complib="foolib") # GH10443 def test_read_nokey(self, setup_path): df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) # Categorical dtype not supported for "fixed" format. So no need # to test with that dtype in the dataframe here. with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", mode="a") reread = read_hdf(path) tm.assert_frame_equal(df, reread) df.to_hdf(path, "df2", mode="a") with pytest.raises(ValueError): read_hdf(path) def test_read_nokey_table(self, setup_path): # GH13231 df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", mode="a", format="table") reread = read_hdf(path) tm.assert_frame_equal(df, reread) df.to_hdf(path, "df2", mode="a", format="table") with pytest.raises(ValueError): read_hdf(path) def test_read_nokey_empty(self, setup_path): with ensure_clean_path(setup_path) as path: store = HDFStore(path) store.close() with pytest.raises(ValueError): read_hdf(path) def test_read_from_pathlib_path(self, setup_path): # GH11773 expected = DataFrame( np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") ) with ensure_clean_path(setup_path) as filename: path_obj = Path(filename) expected.to_hdf(path_obj, "df", mode="a") actual = read_hdf(path_obj, "df") tm.assert_frame_equal(expected, actual) @td.skip_if_no("py.path") def test_read_from_py_localpath(self, setup_path): # GH11773 from py.path import local as LocalPath expected = DataFrame( np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") ) with ensure_clean_path(setup_path) as filename: path_obj = LocalPath(filename) expected.to_hdf(path_obj, "df", mode="a") actual = read_hdf(path_obj, "df") tm.assert_frame_equal(expected, actual) def test_query_long_float_literal(self, setup_path): # GH 14241 df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) with ensure_clean_store(setup_path) as store: store.append("test", df, format="table", data_columns=True) cutoff = 1000000000.0006 result = store.select("test", f"A < {cutoff:.4f}") assert result.empty cutoff = 1000000000.0010 result = store.select("test", f"A > {cutoff:.4f}") expected = df.loc[[1, 2], :] tm.assert_frame_equal(expected, result) exact = 1000000000.0011 result = store.select("test", f"A == {exact:.4f}") expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) def test_query_compare_column_type(self, setup_path): # GH 15492 df = DataFrame( { "date": ["2014-01-01", "2014-01-02"], "real_date": date_range("2014-01-01", periods=2), "float": [1.1, 1.2], "int": [1, 2], }, columns=["date", "real_date", "float", "int"], ) with ensure_clean_store(setup_path) as store: store.append("test", df, format="table", data_columns=True) ts = Timestamp("2014-01-01") # noqa result = store.select("test", where="real_date > ts") expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) for op in ["<", ">", "=="]: # non strings to string column always fail for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: query = f"date {op} v" with pytest.raises(TypeError): store.select("test", where=query) # strings to other columns must be convertible to type v = "a" for col in ["int", "float", "real_date"]: query = f"{col} {op} v" with pytest.raises(ValueError): store.select("test", where=query) for v, col in zip( ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] ): query = f"{col} {op} v" result = store.select("test", where=query) if op == "==": expected = df.loc[[0], :] elif op == ">": expected = df.loc[[1], :] else: expected = df.loc[[], :] tm.assert_frame_equal(expected, result) @pytest.mark.parametrize("format", ["fixed", "table"]) def test_read_hdf_series_mode_r(self, format, setup_path): # GH 16583 # Tests that reading a Series saved to an HDF file # still works if a mode='r' argument is supplied series = tm.makeFloatSeries() with ensure_clean_path(setup_path) as path: series.to_hdf(path, key="data", format=format) result = pd.read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) def test_fspath(self): with tm.ensure_clean("foo.h5") as path: with HDFStore(path) as store: assert os.fspath(store) == str(path) def test_read_py2_hdf_file_in_py3(self, datapath): # GH 16781 # tests reading a PeriodIndex DataFrame written in Python2 in Python3 # the file was generated in Python 2.7 like so: # # df = DataFrame([1.,2,3], index=pd.PeriodIndex( # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') expected = DataFrame( [1.0, 2, 3], index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), ) with ensure_clean_store( datapath( "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" ), mode="r", ) as store: result = store["p"] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) def test_select_empty_where(self, where): # GH26610 # Using keyword `where` as '' or (), or [None], etc # while reading from HDF store raises # "SyntaxError: only a single expression is allowed" df = DataFrame([1, 2, 3]) with ensure_clean_path("empty_where.h5") as path: with HDFStore(path) as store: store.put("df", df, "t") result = pd.read_hdf(store, "df", where=where) tm.assert_frame_equal(result, df) @pytest.mark.parametrize( "idx", [ date_range("2019", freq="D", periods=3, tz="UTC"), CategoricalIndex(list("abc")), ], ) def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): # GH 7775 mi = MultiIndex.from_arrays([idx, idx]) df = DataFrame(0, index=mi, columns=["a"]) with ensure_clean_path(setup_path) as path: with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): df.to_hdf(path, "df") def test_unsuppored_hdf_file_error(self, datapath): # GH 9539 data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5") message = ( r"Dataset\(s\) incompatible with Pandas data types, " "not table, or no datasets found in HDF5 file." ) with pytest.raises(ValueError, match=message): pd.read_hdf(data_path) @pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) def test_maybe_adjust_name_bad_version_raises(bad_version): msg = "Version is incorrect, expected sequence of 3 integers" with pytest.raises(ValueError, match=msg): _maybe_adjust_name("values_block_0", version=bad_version)