from collections import OrderedDict, abc from datetime import date, datetime, timedelta import functools import itertools import re import numpy as np import numpy.ma as ma import numpy.ma.mrecords as mrecords import pytest import pytz from pandas.compat import is_platform_little_endian from pandas.compat.numpy import _np_version_under1p19 from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, Index, Interval, MultiIndex, Period, RangeIndex, Series, Timedelta, Timestamp, date_range, isna, ) import pandas._testing as tm from pandas.arrays import IntervalArray, PeriodArray, SparseArray from pandas.core.construction import create_series_with_explicit_dtype MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] MIXED_INT_DTYPES = [ "uint8", "uint16", "uint32", "uint64", "int8", "int16", "int32", "int64", ] class TestDataFrameConstructors: def test_series_with_name_not_matching_column(self): # GH#9232 x = Series(range(5), name=1) y = Series(range(5), name=0) result = DataFrame(x, columns=[0]) expected = DataFrame([], columns=[0]) tm.assert_frame_equal(result, expected) result = DataFrame(y, columns=[1]) expected = DataFrame([], columns=[1]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "constructor", [ lambda: DataFrame(), lambda: DataFrame(None), lambda: DataFrame({}), lambda: DataFrame(()), lambda: DataFrame([]), lambda: DataFrame(_ for _ in []), lambda: DataFrame(range(0)), lambda: DataFrame(data=None), lambda: DataFrame(data={}), lambda: DataFrame(data=()), lambda: DataFrame(data=[]), lambda: DataFrame(data=(_ for _ in [])), lambda: DataFrame(data=range(0)), ], ) def test_empty_constructor(self, constructor): expected = DataFrame() result = constructor() assert len(result.index) == 0 assert len(result.columns) == 0 tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "emptylike,expected_index,expected_columns", [ ([[]], RangeIndex(1), RangeIndex(0)), ([[], []], RangeIndex(2), RangeIndex(0)), ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)), ], ) def test_emptylike_constructor(self, emptylike, expected_index, expected_columns): expected = DataFrame(index=expected_index, columns=expected_columns) result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) def test_constructor_mixed(self, float_string_frame): index, data = tm.getMixedTypeDict() # TODO(wesm), incomplete test? indexed_frame = DataFrame(data, index=index) # noqa unindexed_frame = DataFrame(data) # noqa assert float_string_frame["foo"].dtype == np.object_ def test_constructor_cast_failure(self): foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64) assert foo["a"].dtype == object # GH 3010, constructing with odd arrays df = DataFrame(np.ones((4, 2))) # this is ok df["foo"] = np.ones((4, 2)).tolist() # this is not ok msg = "Wrong number of items passed 2, placement implies 1" with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) # this is ok df["foo2"] = np.ones((4, 2)).tolist() def test_constructor_dtype_copy(self): orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) new_df = DataFrame(orig_df, dtype=float, copy=True) new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 assert df.values[0, 0] == 99 should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 def test_constructor_dtype_list_data(self): df = DataFrame([[1, "2"], [None, "a"]], dtype=object) assert df.loc[1, 0] is None assert df.loc[0, 1] == "2" @pytest.mark.skipif(_np_version_under1p19, reason="NumPy change.") def test_constructor_list_of_2d_raises(self): # https://github.com/pandas-dev/pandas/issues/32289 a = DataFrame() b = np.empty((0, 0)) with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"): DataFrame([a]) with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"): DataFrame([b]) a = DataFrame({"A": [1, 2]}) with pytest.raises(ValueError, match=r"shape=\(2, 2, 1\)"): DataFrame([a, a]) def test_constructor_mixed_dtypes(self): def _make_mixed_dtypes_df(typ, ad=None): if typ == "int": dtypes = MIXED_INT_DTYPES arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes] elif typ == "float": dtypes = MIXED_FLOAT_DTYPES arrays = [ np.array(np.random.randint(10, size=10), dtype=d) for d in dtypes ] for d, a in zip(dtypes, arrays): assert a.dtype == d if ad is None: ad = {} ad.update({d: a for d, a in zip(dtypes, arrays)}) return DataFrame(ad) def _check_mixed_dtypes(df, dtypes=None): if dtypes is None: dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES for d in dtypes: if d in df: assert df.dtypes[d] == d # mixed floating and integer coexist in the same frame df = _make_mixed_dtypes_df("float") _check_mixed_dtypes(df) # add lots of types df = _make_mixed_dtypes_df("float", {"A": 1, "B": "foo", "C": "bar"}) _check_mixed_dtypes(df) # GH 622 df = _make_mixed_dtypes_df("int") _check_mixed_dtypes(df) def test_constructor_complex_dtypes(self): # GH10952 a = np.random.rand(10).astype(np.complex64) b = np.random.rand(10).astype(np.complex128) df = DataFrame({"a": a, "b": b}) assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 df = DataFrame({"A": ["x", None]}, dtype=string_dtype) result = df.isna() expected = DataFrame({"A": [False, True]}) tm.assert_frame_equal(result, expected) assert df.iloc[1, 0] is None df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype) assert np.isnan(df.iloc[1, 0]) def test_constructor_rec(self, float_frame): rec = float_frame.to_records(index=False) rec.dtype.names = list(rec.dtype.names)[::-1] index = float_frame.index df = DataFrame(rec) tm.assert_index_equal(df.columns, Index(rec.dtype.names)) df2 = DataFrame(rec, index=index) tm.assert_index_equal(df2.columns, Index(rec.dtype.names)) tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] df3 = DataFrame(rec, index=rng, columns=["C", "B"]) expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"]) tm.assert_frame_equal(df3, expected) def test_constructor_bool(self): df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)}) assert df.values.dtype == np.bool_ def test_constructor_overflow_int64(self): # see gh-14881 values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64) result = DataFrame({"a": values}) assert result["a"].dtype == np.uint64 # see gh-2355 data_scores = [ (6311132704823138710, 273), (2685045978526272070, 23), (8921811264899370420, 45), (17019687244989530680, 270), (9930107427299601010, 273), ] dtype = [("uid", "u8"), ("score", "u8")] data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) assert df_crawls["uid"].dtype == np.uint64 @pytest.mark.parametrize( "values", [ np.array([2 ** 64], dtype=object), np.array([2 ** 65]), [2 ** 64 + 1], np.array([-(2 ** 63) - 4], dtype=object), np.array([-(2 ** 64) - 1]), [-(2 ** 65) - 2], ], ) def test_constructor_int_overflow(self, values): # see gh-18584 value = values[0] result = DataFrame(values) assert result[0].dtype == object assert result[0][0] == value def test_constructor_ordereddict(self): import random nitems = 100 nums = list(range(nitems)) random.shuffle(nums) expected = [f"A{i:d}" for i in nums] df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) assert expected == list(df.columns) def test_constructor_dict(self): datetime_series = tm.makeTimeSeries(nper=30) # test expects index shifted by 5 datetime_series_short = tm.makeTimeSeries(nper=30)[5:] frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short}) # col2 is padded with NaN assert len(datetime_series) == 30 assert len(datetime_series_short) == 25 tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) exp = Series( np.concatenate([[np.nan] * 5, datetime_series_short.values]), index=datetime_series.index, name="col2", ) tm.assert_series_equal(exp, frame["col2"]) frame = DataFrame( {"col1": datetime_series, "col2": datetime_series_short}, columns=["col2", "col3", "col4"], ) assert len(frame) == len(datetime_series_short) assert "col1" not in frame assert isna(frame["col3"]).all() # Corner cases assert len(DataFrame()) == 0 # mix dict and array, wrong size - no spec for which error should raise # first msg = "Mixing dicts with non-Series may lead to ambiguous ordering." with pytest.raises(ValueError, match=msg): DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) # Length-one dict micro-optimization frame = DataFrame({"A": {"1": 1, "2": 2}}) tm.assert_index_equal(frame.index, Index(["1", "2"])) # empty dict plus index idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx) assert frame.index is idx # empty dict with index and columns idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx, columns=idx) assert frame.index is idx assert frame.columns is idx assert len(frame._series) == 3 # with dict of empty list and Series frame = DataFrame({"A": [], "B": []}, columns=["A", "B"]) tm.assert_index_equal(frame.index, RangeIndex(0), exact=True) # GH 14381 # Dict with None value frame_none = DataFrame({"a": None}, index=[0]) frame_none_list = DataFrame({"a": [None]}, index=[0]) assert frame_none._get_value(0, "a") is None assert frame_none_list._get_value(0, "a") is None tm.assert_frame_equal(frame_none, frame_none_list) # GH10856 # dict with scalar values should raise error, even if columns passed msg = "If using all scalar values, you must pass an index" with pytest.raises(ValueError, match=msg): DataFrame({"a": 0.7}) with pytest.raises(ValueError, match=msg): DataFrame({"a": 0.7}, columns=["a"]) @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"]) def test_constructor_invalid_items_unused(self, scalar): # No error if invalid (scalar) value is in fact not used: result = DataFrame({"a": scalar}, columns=["b"]) expected = DataFrame(columns=["b"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18455 cols = [1, value, 3] idx = ["a", value] values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} result = DataFrame(data).sort_values(1).sort_values("a", axis=1) expected = DataFrame( np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols ) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx).sort_values("a", axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("value", [np.nan, None, float("nan")]) def test_constructor_dict_nan_tuple_key(self, value): # GH 18455 cols = Index([(11, 21), (value, 22), (13, value)]) idx = Index([("a", value), (value, 2)]) values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} result = DataFrame(data).sort_values((11, 21)).sort_values(("a", value), axis=1) expected = DataFrame( np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols ) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx).sort_values(("a", value), axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) def test_constructor_dict_order_insertion(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) # GH19018 # initialization ordering: by insertion order if python>= 3.6 d = {"b": datetime_series_short, "a": datetime_series} frame = DataFrame(data=d) expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) def test_constructor_dict_nan_key_and_columns(self): # GH 16894 result = DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) expected = DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) tm.assert_frame_equal(result, expected) def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame tuples = [(2, 3), (3, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) assert isna(df).values.ravel().all() tuples = [(3, 3), (2, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) assert isna(df).values.ravel().all() def test_constructor_2d_index(self): # GH 25416 # handling of 2d index in construction df = DataFrame([[1]], columns=[[1]], index=[1, 2]) expected = DataFrame( [1, 1], index=pd.Int64Index([1, 2], dtype="int64"), columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) df = DataFrame([[1]], columns=[[1]], index=[[1, 2]]) expected = DataFrame( [1, 1], index=MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." # passing an empty array with columns specified. with pytest.raises(ValueError, match=msg): DataFrame(np.empty(0), columns=list("abc")) msg = "Mixing dicts with non-Series may lead to ambiguous ordering." # mix dict and array, wrong size with pytest.raises(ValueError, match=msg): DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) # wrong size ndarray, GH 3105 msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)" with pytest.raises(ValueError, match=msg): DataFrame( np.arange(12).reshape((4, 3)), columns=["foo", "bar", "baz"], index=date_range("2000-01-01", periods=3), ) arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): DataFrame(index=[0], columns=range(0, 4), data=arr) arr = np.array([4, 5, 6]) msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): DataFrame(index=[0], columns=range(0, 4), data=arr) # higher dim raise exception with pytest.raises(ValueError, match="Must pass 2-d input"): DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # wrong size axis labels msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) # gh-26429 msg = "2 columns passed, passed data had 10 columns" with pytest.raises(ValueError, match=msg): DataFrame((range(10), range(10, 20)), columns=("ones", "twos")) msg = "If using all scalar values, you must pass an index" with pytest.raises(ValueError, match=msg): DataFrame({"a": False, "b": True}) def test_constructor_subclass_dict(self, float_frame, dict_subclass): # Test for passing dict subclass to constructor data = { "col1": dict_subclass((x, 10.0 * x) for x in range(10)), "col2": dict_subclass((x, 20.0 * x) for x in range(10)), } df = DataFrame(data) refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) tm.assert_frame_equal(refdf, df) data = dict_subclass(data.items()) df = DataFrame(data) tm.assert_frame_equal(refdf, df) # try with defaultdict from collections import defaultdict data = {} float_frame["B"][:10] = np.nan for k, v in float_frame.items(): dct = defaultdict(dict) dct.update(v.to_dict()) data[k] = dct frame = DataFrame(data) expected = frame.reindex(index=float_frame.index) tm.assert_frame_equal(float_frame, expected) def test_constructor_dict_block(self): expected = np.array([[4.0, 3.0, 2.0, 1.0]]) df = DataFrame( {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]}, columns=["d", "c", "b", "a"], ) tm.assert_numpy_array_equal(df.values, expected) def test_constructor_dict_cast(self): # cast float tests test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) assert len(frame) == 3 assert frame["B"].dtype == np.float64 assert frame["A"].dtype == np.float64 frame = DataFrame(test_data) assert len(frame) == 3 assert frame["B"].dtype == np.object_ assert frame["A"].dtype == np.float64 # can't cast to float test_data = { "A": dict(zip(range(20), tm.makeStringIndex(20))), "B": dict(zip(range(15), np.random.randn(15))), } frame = DataFrame(test_data, dtype=float) assert len(frame) == 20 assert frame["A"].dtype == np.object_ assert frame["B"].dtype == np.float64 def test_constructor_dict_dont_upcast(self): d = {"Col1": {"Row1": "A String", "Row2": np.nan}} df = DataFrame(d) assert isinstance(df["Col1"]["Row2"], float) dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2]) assert isinstance(dm[1][1], int) def test_constructor_dict_of_tuples(self): # GH #1491 data = {"a": (1, 2, 3), "b": (4, 5, 6)} result = DataFrame(data) expected = DataFrame({k: list(v) for k, v in data.items()}) tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_dict_of_ranges(self): # GH 26356 data = {"a": range(3), "b": range(3, 6)} result = DataFrame(data) expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_of_iterators(self): # GH 26349 data = {"a": iter(range(3)), "b": reversed(range(3))} result = DataFrame(data) expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_of_generators(self): # GH 26349 data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))} result = DataFrame(data) expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_multiindex(self): def check(result, expected): return tm.assert_frame_equal( result, expected, check_dtype=True, check_index_type=True, check_column_type=True, check_names=True, ) d = { ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2}, ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4}, ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9}, } _d = sorted(d.items()) df = DataFrame(d) expected = DataFrame( [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) ).T expected.index = MultiIndex.from_tuples(expected.index) check(df, expected) d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111} _d.insert(0, ("z", d["z"])) expected = DataFrame( [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) ).T expected.index = Index(expected.index, tupleize_cols=False) df = DataFrame(d) df = df.reindex(columns=expected.columns, index=expected.index) check(df, expected) def test_constructor_dict_datetime64_index(self): # GH 10160 dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] def create_data(constructor): return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)} data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) data_Timestamp = create_data(Timestamp) expected = DataFrame( [ {0: 0, 1: None, 2: None, 3: None}, {0: None, 1: 2, 2: None, 3: None}, {0: None, 1: None, 2: 4, 3: None}, {0: None, 1: None, 2: None, 3: 6}, ], index=[Timestamp(dt) for dt in dates_as_str], ) result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) result_Timestamp = DataFrame(data_Timestamp) tm.assert_frame_equal(result_datetime64, expected) tm.assert_frame_equal(result_datetime, expected) tm.assert_frame_equal(result_Timestamp, expected) def test_constructor_dict_timedelta64_index(self): # GH 10160 td_as_int = [1, 2, 3, 4] def create_data(constructor): return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)} data_timedelta64 = create_data(lambda x: np.timedelta64(x, "D")) data_timedelta = create_data(lambda x: timedelta(days=x)) data_Timedelta = create_data(lambda x: Timedelta(x, "D")) expected = DataFrame( [ {0: 0, 1: None, 2: None, 3: None}, {0: None, 1: 2, 2: None, 3: None}, {0: None, 1: None, 2: 4, 3: None}, {0: None, 1: None, 2: None, 3: 6}, ], index=[Timedelta(td, "D") for td in td_as_int], ) result_timedelta64 = DataFrame(data_timedelta64) result_timedelta = DataFrame(data_timedelta) result_Timedelta = DataFrame(data_Timedelta) tm.assert_frame_equal(result_timedelta64, expected) tm.assert_frame_equal(result_timedelta, expected) tm.assert_frame_equal(result_Timedelta, expected) def test_constructor_period_dict(self): # PeriodIndex a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") df = DataFrame({"a": a, "b": b}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype # list of periods df = DataFrame({"a": a.astype(object).tolist(), "b": b.astype(object).tolist()}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype df = DataFrame({"a": ea_scalar}, index=[0]) assert df["a"].dtype == ea_dtype expected = DataFrame(index=[0], columns=["a"], data=ea_scalar) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( "data,dtype", [ (Period("2020-01"), PeriodDtype("M")), (Interval(left=0, right=5), IntervalDtype("int64")), ( Timestamp("2011-01-01", tz="US/Eastern"), DatetimeTZDtype(tz="US/Eastern"), ), ], ) def test_constructor_extension_scalar_data(self, data, dtype): # GH 34832 df = DataFrame(index=[0, 1], columns=["a", "b"], data=data) assert df["a"].dtype == dtype assert df["b"].dtype == dtype arr = pd.array([data] * 2, dtype=dtype) expected = DataFrame({"a": arr, "b": arr}) tm.assert_frame_equal(df, expected) def test_nested_dict_frame_constructor(self): rng = pd.period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) data = {} for col in df.columns: for row in df.index: data.setdefault(col, {})[row] = df._get_value(row, col) result = DataFrame(data, columns=rng) tm.assert_frame_equal(result, df) data = {} for col in df.columns: for row in df.index: data.setdefault(row, {})[col] = df._get_value(row, col) result = DataFrame(data, index=rng).T tm.assert_frame_equal(result, df) def _check_basic_constructor(self, empty): # mat: 2d matrix with shape (3, 2) to input. empty - makes sized # objects mat = empty((2, 3), dtype=float) # 2-D input frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 # 1-D input frame = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3]) assert len(frame.index) == 3 assert len(frame.columns) == 1 # cast type frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # wrong size axis labels msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame(mat, columns=["A", "B", "C"], index=[1]) msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): DataFrame(mat, columns=["A", "B"], index=[1, 2]) # higher dim raise exception with pytest.raises(ValueError, match="Must pass 2-d input"): DataFrame(empty((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # automatic labeling frame = DataFrame(mat) tm.assert_index_equal(frame.index, Index(range(2)), exact=True) tm.assert_index_equal(frame.columns, Index(range(3)), exact=True) frame = DataFrame(mat, index=[1, 2]) tm.assert_index_equal(frame.columns, Index(range(3)), exact=True) frame = DataFrame(mat, columns=["A", "B", "C"]) tm.assert_index_equal(frame.index, Index(range(2)), exact=True) # 0-length axis frame = DataFrame(empty((0, 3))) assert len(frame.index) == 0 frame = DataFrame(empty((3, 0))) assert len(frame.columns) == 0 def test_constructor_ndarray(self): self._check_basic_constructor(np.ones) frame = DataFrame(["foo", "bar"], index=[0, 1], columns=["A"]) assert len(frame) == 2 def test_constructor_maskedarray(self): self._check_basic_constructor(ma.masked_all) # Check non-masked values mat = ma.masked_all((2, 3), dtype=float) mat[0, 0] = 1.0 mat[1, 2] = 2.0 frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert 1.0 == frame["A"][1] assert 2.0 == frame["C"][2] # what is this even checking?? mat = ma.masked_all((2, 3), dtype=float) frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) def test_constructor_maskedarray_nonfloat(self): # masked int promoted to float mat = ma.masked_all((2, 3), dtype=int) # 2-D input frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.float64) assert frame.values.dtype == np.float64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) assert 1 == frame["A"][1] assert 2 == frame["C"][2] # masked np.datetime64 stays (use NaT as null) mat = ma.masked_all((2, 3), dtype="M8[ns]") # 2-D input frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert isna(frame).values.all() # cast type frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) assert 1 == frame["A"].view("i8")[1] assert 2 == frame["C"].view("i8")[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) # 2-D input frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=object) assert frame.values.dtype == object # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = True mat2[1, 2] = False frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) assert frame["A"][1] is True assert frame["C"][2] is False def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) expected = DataFrame( {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, columns=["A", "B"], index=[1, 2], dtype=float, ) tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) expected = DataFrame( {"A": [1.0, 1.0], "B": [1.0, 1.0]}, columns=["A", "B"], index=[1, 2], dtype=float, ) tm.assert_frame_equal(result, expected) def test_constructor_maskedrecarray_dtype(self): # Ensure constructor honors dtype data = np.ma.array( np.ma.zeros(5, dtype=[("date", "0 df = DataFrame( { "a": 1.0, "b": 2, "c": "foo", floatname: np.array([1.0] * 10, dtype=floatname), intname: np.array([1] * 10, dtype=intname), }, index=np.arange(10), ) result = df.dtypes expected = Series( [np.dtype("float64")] + [np.dtype("int64")] + [np.dtype("object")] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], ) tm.assert_series_equal(result, expected) # GH 2809 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) assert datetime_s.dtype == "M8[ns]" # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] dates = [ts.date() for ts in ind] df = DataFrame(datetimes, columns=["datetimes"]) df["dates"] = dates result = df.dtypes expected = Series( [np.dtype("datetime64[ns]"), np.dtype("object")], index=["datetimes", "dates"], ) tm.assert_series_equal(result, expected) # GH 7594 # don't coerce tz-aware import pytz tz = pytz.timezone("US/Eastern") dt = tz.localize(datetime(2012, 1, 1)) df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal( df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) ) df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) ) # tz-aware (UTC and other tz's) # GH 8411 dr = date_range("20130101", periods=3) df = DataFrame({"value": dr}) assert df.iat[0, 0].tz is None dr = date_range("20130101", periods=3, tz="UTC") df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "UTC" dr = date_range("20130101", periods=3, tz="US/Eastern") df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" # GH 7822 # preserver an index with a tz on dict construction i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") expected = DataFrame({"a": i.to_series().reset_index(drop=True)}) df = DataFrame() df["a"] = i tm.assert_frame_equal(df, expected) df = DataFrame({"a": i}) tm.assert_frame_equal(df, expected) # multiples i_no_tz = date_range("1/1/2011", periods=5, freq="10s") df = DataFrame({"a": i, "b": i_no_tz}) expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz}) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( "arr", [ np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None]), [[np.datetime64("NaT")], [None]], [[np.datetime64("NaT")], [pd.NaT]], [[None], [np.datetime64("NaT")]], [[None], [pd.NaT]], [[pd.NaT], [np.datetime64("NaT")]], [[pd.NaT], [None]], ], ) def test_constructor_datetimes_with_nulls(self, arr): # gh-15869, GH#11220 result = DataFrame(arr).dtypes expected = Series([np.dtype("datetime64[ns]")]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("order", ["K", "A", "C", "F"]) @pytest.mark.parametrize( "dtype", [ "datetime64[M]", "datetime64[D]", "datetime64[h]", "datetime64[m]", "datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]", ], ) def test_constructor_datetimes_non_ns(self, order, dtype): na = np.array( [ ["2015-01-01", "2015-01-02", "2015-01-03"], ["2017-01-01", "2017-01-02", "2017-02-03"], ], dtype=dtype, order=order, ) df = DataFrame(na) expected = DataFrame( [ ["2015-01-01", "2015-01-02", "2015-01-03"], ["2017-01-01", "2017-01-02", "2017-02-03"], ] ) expected = expected.astype(dtype=dtype) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("order", ["K", "A", "C", "F"]) @pytest.mark.parametrize( "dtype", [ "timedelta64[D]", "timedelta64[h]", "timedelta64[m]", "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]", ], ) def test_constructor_timedelta_non_ns(self, order, dtype): na = np.array( [ [np.timedelta64(1, "D"), np.timedelta64(2, "D")], [np.timedelta64(4, "D"), np.timedelta64(5, "D")], ], dtype=dtype, order=order, ) df = DataFrame(na).astype("timedelta64[ns]") expected = DataFrame( [ [Timedelta(1, "D"), Timedelta(2, "D")], [Timedelta(4, "D"), Timedelta(5, "D")], ], ) tm.assert_frame_equal(df, expected) def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes expected = Series([np.dtype("int64")] * 5) tm.assert_series_equal(result, expected) df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) result = df.dtypes expected = Series([np.dtype("int64")] * 5) tm.assert_series_equal(result, expected) # overflow issue? (we always expecte int64 upcasting here) df = DataFrame({"a": [2 ** 31, 2 ** 31 + 1]}) assert df.dtypes.iloc[0] == np.dtype("int64") # GH #2751 (construction with no index specified), make sure we cast to # platform values df = DataFrame([1, 2]) assert df.dtypes.iloc[0] == np.dtype("int64") df = DataFrame([1.0, 2.0]) assert df.dtypes.iloc[0] == np.dtype("float64") df = DataFrame({"a": [1, 2]}) assert df.dtypes.iloc[0] == np.dtype("int64") df = DataFrame({"a": [1.0, 2.0]}) assert df.dtypes.iloc[0] == np.dtype("float64") df = DataFrame({"a": 1}, index=range(3)) assert df.dtypes.iloc[0] == np.dtype("int64") df = DataFrame({"a": 1.0}, index=range(3)) assert df.dtypes.iloc[0] == np.dtype("float64") # with object list df = DataFrame( { "a": [1, 2, 4, 7], "b": [1.2, 2.3, 5.1, 6.3], "c": list("abcd"), "d": [datetime(2000, 1, 1) for i in range(4)], "e": [1.0, 2, 4.0, 7], } ) result = df.dtypes expected = Series( [ np.dtype("int64"), np.dtype("float64"), np.dtype("object"), np.dtype("datetime64[ns]"), np.dtype("float64"), ], index=list("abcde"), ) tm.assert_series_equal(result, expected) def test_constructor_frame_copy(self, float_frame): cop = DataFrame(float_frame, copy=True) cop["A"] = 5 assert (cop["A"] == 5).all() assert not (float_frame["A"] == 5).all() def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) float_frame.values[5] = 5 assert (df.values[5] == 5).all() df = DataFrame(float_frame.values, copy=True) float_frame.values[6] = 6 assert not (df.values[6] == 6).all() def test_constructor_series_copy(self, float_frame): series = float_frame._series df = DataFrame({"A": series["A"]}) df["A"][:] = 5 assert not (series["A"] == 5).all() def test_constructor_with_nas(self): # GH 5016 # na's in indices def check(df): for i in range(len(df.columns)): df.iloc[:, i] indexer = np.arange(len(df.columns))[isna(df.columns)] # No NaN found -> error if len(indexer) == 0: with pytest.raises(KeyError, match="^nan$"): df.loc[:, np.nan] # single nan should result in Series elif len(indexer) == 1: tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan]) # multiple nans should result in DataFrame else: tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan]) df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) check(df) df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]) check(df) df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]) check(df) df = DataFrame( [[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan] ) check(df) # GH 21428 (non-unique columns) df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]) check(df) def test_constructor_lists_to_object_dtype(self): # from #1074 d = DataFrame({"a": [np.nan, False]}) assert d["a"].dtype == np.object_ assert not d["a"][1] def test_constructor_categorical(self): # GH8626 # dict creation df = DataFrame({"A": list("abc")}, dtype="category") expected = Series(list("abc"), dtype="category", name="A") tm.assert_series_equal(df["A"], expected) # to_frame s = Series(list("abc"), dtype="category") result = s.to_frame() expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(result[0], expected) result = s.to_frame(name="foo") expected = Series(list("abc"), dtype="category", name="foo") tm.assert_series_equal(result["foo"], expected) # list-like creation df = DataFrame(list("abc"), dtype="category") expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(df[0], expected) # ndim != 1 df = DataFrame([Categorical(list("abc"))]) expected = DataFrame({0: Series(list("abc"), dtype="category")}) tm.assert_frame_equal(df, expected) df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))]) expected = DataFrame( { 0: Series(list("abc"), dtype="category"), 1: Series(list("abd"), dtype="category"), }, columns=[0, 1], ) tm.assert_frame_equal(df, expected) # mixed df = DataFrame([Categorical(list("abc")), list("def")]) expected = DataFrame( {0: Series(list("abc"), dtype="category"), 1: list("def")}, columns=[0, 1] ) tm.assert_frame_equal(df, expected) # invalid (shape) msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" with pytest.raises(ValueError, match=msg): DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) # ndim > 1 msg = "> 1 ndim Categorical are not supported at this time" with pytest.raises(NotImplementedError, match=msg): Categorical(np.array([list("abcd")])) def test_constructor_categorical_series(self): items = [1, 2, 3, 1] exp = Series(items).astype("category") res = Series(items, dtype="category") tm.assert_series_equal(res, exp) items = ["a", "b", "c", "a"] exp = Series(items).astype("category") res = Series(items, dtype="category") tm.assert_series_equal(res, exp) # insert into frame with different index # GH 8076 index = date_range("20000101", periods=3) expected = Series( Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) ) expected.index = index expected = DataFrame({"x": expected}) df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index) tm.assert_frame_equal(df, expected) def test_from_records_to_records(self): # from numpy documentation arr = np.zeros((2,), dtype=("i4,f4,a10")) arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] # TODO(wesm): unused frame = DataFrame.from_records(arr) # noqa index = Index(np.arange(len(arr))[::-1]) indexed_frame = DataFrame.from_records(arr, index=index) tm.assert_index_equal(indexed_frame.index, index) # without names, it should go to last ditch arr2 = np.zeros((2, 3)) tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) indexed_frame = DataFrame.from_records(arr, index="f1") # what to do? records = indexed_frame.to_records() assert len(records.dtype.names) == 3 records = indexed_frame.to_records(index=False) assert len(records.dtype.names) == 2 assert "index" not in records.dtype.names def test_from_records_nones(self): tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)] df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"]) assert np.isnan(df["c"][0]) def test_from_records_iterator(self): arr = np.array( [(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)], dtype=[ ("x", np.float64), ("u", np.float32), ("y", np.int64), ("z", np.int32), ], ) df = DataFrame.from_records(iter(arr), nrows=2) xp = DataFrame( { "x": np.array([1.0, 3.0], dtype=np.float64), "u": np.array([1.0, 3.0], dtype=np.float32), "y": np.array([2, 4], dtype=np.int64), "z": np.array([2, 4], dtype=np.int32), } ) tm.assert_frame_equal(df.reindex_like(xp), xp) # no dtypes specified here, so just compare with the default arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)] df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2) tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False) def test_from_records_tuples_generator(self): def tuple_generator(length): for i in range(length): letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" yield (i, letters[i % len(letters)], i / length) columns_names = ["Integer", "String", "Float"] columns = [ [i[j] for i in tuple_generator(10)] for j in range(len(columns_names)) ] data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} expected = DataFrame(data, columns=columns_names) generator = tuple_generator(10) result = DataFrame.from_records(generator, columns=columns_names) tm.assert_frame_equal(result, expected) def test_from_records_lists_generator(self): def list_generator(length): for i in range(length): letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" yield [i, letters[i % len(letters)], i / length] columns_names = ["Integer", "String", "Float"] columns = [ [i[j] for i in list_generator(10)] for j in range(len(columns_names)) ] data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} expected = DataFrame(data, columns=columns_names) generator = list_generator(10) result = DataFrame.from_records(generator, columns=columns_names) tm.assert_frame_equal(result, expected) def test_from_records_columns_not_modified(self): tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)] columns = ["a", "b", "c"] original_columns = list(columns) df = DataFrame.from_records(tuples, columns=columns, index="a") # noqa assert columns == original_columns def test_from_records_decimal(self): from decimal import Decimal tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)] df = DataFrame.from_records(tuples, columns=["a"]) assert df["a"].dtype == object df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True) assert df["a"].dtype == np.float64 assert np.isnan(df["a"].values[-1]) def test_from_records_duplicates(self): result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) tm.assert_frame_equal(result, expected) def test_from_records_set_index_name(self): def create_dict(order_id): return { "order_id": order_id, "quantity": np.random.randint(1, 10), "price": np.random.randint(1, 10), } documents = [create_dict(i) for i in range(10)] # demo missing data documents.append({"order_id": 10, "quantity": 5}) result = DataFrame.from_records(documents, index="order_id") assert result.index.name == "order_id" # MultiIndex result = DataFrame.from_records(documents, index=["order_id", "quantity"]) assert result.index.names == ("order_id", "quantity") def test_from_records_misc_brokenness(self): # #2179 data = {1: ["foo"], 2: ["bar"]} result = DataFrame.from_records(data, columns=["a", "b"]) exp = DataFrame(data, columns=["a", "b"]) tm.assert_frame_equal(result, exp) # overlap in index/index_names data = {"a": [1, 2, 3], "b": [4, 5, 6]} result = DataFrame.from_records(data, index=["a", "b", "c"]) exp = DataFrame(data, index=["a", "b", "c"]) tm.assert_frame_equal(result, exp) # GH 2623 rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) result = df2_obj.dtypes expected = Series( [np.dtype("datetime64[ns]"), np.dtype("object")], index=["date", "test"] ) tm.assert_series_equal(result, expected) rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) result = df2_obj.dtypes expected = Series( [np.dtype("datetime64[ns]"), np.dtype("int64")], index=["date", "test"] ) tm.assert_series_equal(result, expected) def test_from_records_empty(self): # 3562 result = DataFrame.from_records([], columns=["a", "b", "c"]) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) result = DataFrame.from_records([], columns=["a", "b", "b"]) expected = DataFrame(columns=["a", "b", "b"]) tm.assert_frame_equal(result, expected) def test_from_records_empty_with_nonempty_fields_gh3682(self): a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) df = DataFrame.from_records(a, index="id") tm.assert_index_equal(df.index, Index([1], name="id")) assert df.index.name == "id" tm.assert_index_equal(df.columns, Index(["value"])) b = np.array([], dtype=[("id", np.int64), ("value", np.int64)]) df = DataFrame.from_records(b, index="id") tm.assert_index_equal(df.index, Index([], name="id")) assert df.index.name == "id" @pytest.mark.parametrize( "dtype", tm.ALL_INT_DTYPES + tm.ALL_EA_INT_DTYPES + tm.FLOAT_DTYPES + tm.COMPLEX_DTYPES + tm.DATETIME64_DTYPES + tm.TIMEDELTA64_DTYPES + tm.BOOL_DTYPES, ) def test_check_dtype_empty_numeric_column(self, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) assert data.b.dtype == dtype @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) def test_check_dtype_empty_string_column(self, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) assert data.b.dtype.name == "object" def test_from_records_with_datetimes(self): # this may fail on certain platforms because of a numpy issue # related GH6140 if not is_platform_little_endian(): pytest.skip("known failure of test on non-little endian") # construction with a null in a recarray # GH 6140 expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] dtypes = [("EXPIRY", "