606 lines
16 KiB
Python
606 lines
16 KiB
Python
"""
|
|
Tests dtype specification during parsing
|
|
for all of the parsers defined in parsers.py
|
|
"""
|
|
from io import StringIO
|
|
import os
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas.errors import ParserWarning
|
|
|
|
from pandas.core.dtypes.dtypes import CategoricalDtype
|
|
|
|
import pandas as pd
|
|
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat
|
|
import pandas._testing as tm
|
|
|
|
|
|
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
def test_dtype_all_columns(all_parsers, dtype, check_orig):
    """Round-trip a float frame through CSV, reading every column as ``dtype``.

    When ``check_orig`` is true the parsed frame is cast back to float and
    compared with a copy of the original frame; otherwise the parsed frame is
    compared with the original frame cast to ``str``.
    """
    # see gh-3795, gh-6607
    values = np.random.rand(5, 2).round(4)
    row_labels = ["1A", "1B", "1C", "1D", "1E"]
    original = DataFrame(values, columns=list("AB"), index=row_labels)

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as csv_path:
        original.to_csv(csv_path)
        parsed = all_parsers.read_csv(csv_path, dtype=dtype, index_col=0)

        if not check_orig:
            tm.assert_frame_equal(parsed, original.astype(str))
        else:
            tm.assert_frame_equal(parsed.astype(float), original.copy())
|
|
|
|
|
|
def test_dtype_all_columns_empty(all_parsers):
    """Check a header-only CSV read with ``dtype=str`` against an empty str frame."""
    # see gh-12048
    empty_columns = {name: [] for name in ("A", "B")}
    expected = DataFrame(empty_columns, index=[], dtype=str)

    header_only = StringIO("A,B")
    parsed = all_parsers.read_csv(header_only, dtype=str)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_dtype_per_column(all_parsers):
    """Check a dtype mapping keyed by column name ("one") and by position (1).

    Column "one" is expected as float64 and column "two" as object holding
    the raw strings.
    """
    csv_text = "one,two\n1,2.5\n2,3.5\n3,4.5\n4,5.5"
    dtype_spec = {"one": np.float64, 1: str}

    expected = DataFrame(
        {
            "one": Series([1, 2, 3, 4], dtype=np.float64),
            "two": Series(["2.5", "3.5", "4.5", "5.5"], dtype=object),
        }
    )

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_spec)
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_invalid_dtype_per_column(all_parsers):
    """Check that an unknown dtype name in a per-column mapping raises ``TypeError``."""
    csv_text = "one,two\n1,2.5\n2,3.5\n3,4.5\n4,5.5"
    bad_spec = {"one": "foo", 1: "int"}
    msg = "data type [\"']foo[\"'] not understood"

    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(StringIO(csv_text), dtype=bad_spec)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    """Check that each way of requesting categoricals yields string categories."""
    # see gh-10153
    csv_text = "a,b,c\n1,a,3.4\n1,a,3.4\n2,b,4.5"
    raw_columns = {
        "a": ["1", "1", "2"],
        "b": ["a", "a", "b"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in raw_columns.items()}
    )

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
    """Check that only column "b" becomes categorical, by name or by position."""
    # see gh-10153
    csv_text = "a,b,c\n1,a,3.4\n1,a,3.4\n2,b,4.5"

    letters = Categorical(["a", "a", "b"])
    expected = DataFrame({"a": [1, 1, 2], "b": letters, "c": [3.4, 3.4, 4.5]})

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_dtype_unsorted(all_parsers):
    """Check ``dtype="category"`` when column "b" values appear out of sorted order."""
    # see gh-10153
    csv_text = "a,b,c\n1,b,3.4\n1,b,3.4\n2,a,4.5"
    raw_columns = {
        "a": ["1", "1", "2"],
        "b": ["b", "b", "a"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in raw_columns.items()}
    )

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_dtype_missing(all_parsers):
    """Check that a "nan" cell is a missing value, not a category, under ``dtype="category"``."""
    # see gh-10153
    csv_text = "a,b,c\n1,b,3.4\n1,nan,3.4\n2,a,4.5"
    raw_columns = {
        "a": ["1", "1", "2"],
        "b": ["b", np.nan, "a"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in raw_columns.items()}
    )

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    """Check a categorical column with 524289 distinct numeric-looking labels.

    The parsed categories are re-sorted and marked ordered before comparison
    so that the check does not depend on the order the parser found them in.
    """
    # see gh-18186
    labels = np.sort([str(number) for number in range(524289)])
    expected = DataFrame({"a": Categorical(labels, ordered=True)})

    csv_text = "a\n" + "\n".join(labels)
    parsed = all_parsers.read_csv(StringIO(csv_text), dtype="category")

    sorted_categories = np.sort(parsed["a"].cat.categories)
    parsed["a"] = parsed["a"].cat.reorder_categories(sorted_categories, ordered=True)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    """Compare categorical parsing of a latin-1 file with a post-hoc conversion.

    The file is read once without a dtype and column 1 is converted with
    ``Categorical``; that frame is the expectation for reading the same file
    with ``dtype={1: "category"}``.
    """
    # see gh-10153
    csv_path = os.path.join(csv_dir_path, "unicode_series.csv")
    read_options = {"header": None, "encoding": "latin-1"}

    expected = all_parsers.read_csv(csv_path, **read_options)
    expected[1] = Categorical(expected[1])

    parsed = all_parsers.read_csv(csv_path, dtype={1: "category"}, **read_options)
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    """Compare categorical parsing of a UTF-16 file with a post-hoc conversion.

    The tab-separated file is read once without a dtype and every column is
    converted with ``Categorical``; that frame is the expectation for reading
    the same file with ``dtype="category"``.
    """
    # see gh-10153
    txt_path = os.path.join(csv_dir_path, "utf16_ex.txt")
    read_options = {"sep": "\t", "encoding": "utf-16"}

    plain = all_parsers.read_csv(txt_path, **read_options)
    expected = plain.apply(Categorical)

    parsed = all_parsers.read_csv(txt_path, dtype="category", **read_options)
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    """Check chunked reads where each chunk infers its own categories.

    With ``chunksize=2`` the four data rows are expected to arrive as two
    frames, each with a categorical "b" column built only from the values
    seen in that chunk.
    """
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]
    with parser.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    ) as reader:
        actuals = list(reader)

    # zip() stops at the shorter input, so without this check a reader that
    # yields too few (or too many) chunks would pass silently.
    assert len(actuals) == len(expecteds)
    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)
|
|
|
|
|
|
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    """Check chunked reads where every chunk uses the same explicit categories.

    With ``chunksize=2`` the four data rows are expected to arrive as two
    frames whose "b" column carries the full ``["a", "b", "c"]`` category
    list passed through ``CategoricalDtype``.
    """
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3]
        ),
    ]
    dtype = CategoricalDtype(cats)
    with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as reader:
        actuals = list(reader)

    # zip() stops at the shorter input, so without this check a reader that
    # yields too few (or too many) chunks would pass silently.
    assert len(actuals) == len(expecteds)
    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    """Check that an explicit ``CategoricalDtype`` keeps its categories and orderedness."""
    csv_text = "a,b\n1,a\n1,b\n1,b\n2,c"

    column_b = Categorical(["a", "b", "b", "c"], categories=categories, ordered=ordered)
    expected = DataFrame({"a": [1, 1, 1, 2], "b": column_b})

    dtype_spec = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
    parsed = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_spec)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_category_dtype_unsorted(all_parsers):
    """Check that categories given in reverse order (c, b, a) are kept in that order."""
    csv_text = "a,b\n1,a\n1,b\n1,b\n2,c"
    reversed_dtype = CategoricalDtype(["c", "b", "a"])

    column_b = Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
    expected = DataFrame({"a": [1, 1, 1, 2], "b": column_b})

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype={"b": reversed_dtype})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_coerces_numeric(all_parsers):
    """Check that parsed values match integer categories from the supplied dtype."""
    csv_text = "b\n1\n1\n2\n3"
    dtype_spec = {"b": CategoricalDtype([1, 2, 3])}

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_spec)

    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_coerces_datetime(all_parsers):
    """Check that date strings match categories given as a ``DatetimeIndex``."""
    csv_text = "b\n2017-01-01\n2018-01-01\n2019-01-01"

    date_categories = pd.DatetimeIndex(
        ["2017-01-01", "2018-01-01", "2019-01-01"], freq=None
    )
    cat_dtype = CategoricalDtype(date_categories)
    expected = DataFrame({"b": Categorical(cat_dtype.categories)})

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_coerces_timestamp(all_parsers):
    """Check that two spellings of the same instant map to one ``Timestamp`` category."""
    csv_text = "b\n2014-01-01\n2014-01-01T00:00:00"
    new_year = Timestamp("2014")

    expected = DataFrame({"b": Categorical([new_year, new_year])})

    dtype_spec = {"b": CategoricalDtype([Timestamp("2014")])}
    parsed = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_spec)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_coerces_timedelta(all_parsers):
    """Check that duration strings match categories given as timedeltas."""
    csv_text = "b\n1H\n2H\n3H"

    cat_dtype = CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))
    expected = DataFrame({"b": Categorical(cat_dtype.categories)})

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    """Check that boolean spellings in any letter case match ``[False, True]`` categories.

    The "NA" cell is expected to come back as a missing value.
    """
    # see gh-20498
    bool_dtype = CategoricalDtype([False, True])

    parsed = all_parsers.read_csv(StringIO(data), dtype={"b": bool_dtype})

    expected = DataFrame({"b": Categorical([True, False, None, False])})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_categorical_unexpected_categories(all_parsers):
    """Check parsing when the data holds a value ("c") absent from the categories.

    The expectation is the same values passed to ``Categorical`` with the
    same dtype.
    """
    cat_dtype = CategoricalDtype(["a", "b", "d", "e"])
    csv_text = "b\nd\na\nc\nd"  # "c" is not among the declared categories

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})

    expected = DataFrame({"b": Categorical(list("dacd"), dtype=cat_dtype)})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_pass_dtype(all_parsers):
    """Check a header-only CSV where only column "one" has a dtype ("u1").

    Column "two" and the index are expected to be object dtype.
    """
    header_only = StringIO("one,two")
    parsed = all_parsers.read_csv(header_only, dtype={"one": "u1"})

    empty_columns = {
        "one": np.empty(0, dtype="u1"),
        "two": np.empty(0, dtype=object),
    }
    empty_index = Index([], dtype=object)
    expected = DataFrame(empty_columns, index=empty_index)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_with_index_pass_dtype(all_parsers):
    """Check that dtypes apply to both the index column and the data column.

    The input is header-only; "one" becomes the index (dtype "u1") and
    column "two", addressed by position 1, gets dtype "f".
    """
    dtype_spec = {"one": "u1", 1: "f"}
    parsed = all_parsers.read_csv(
        StringIO("one,two"), index_col=["one"], dtype=dtype_spec
    )

    empty_index = Index([], dtype="u1", name="one")
    expected = DataFrame({"two": np.empty(0, dtype="f")}, index=empty_index)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_with_multi_index_pass_dtype(all_parsers):
    """Check that dtypes apply to both levels of a MultiIndex on a header-only CSV.

    Index levels "one" and "two" are expected as "u1" and float64; the
    remaining column "three" is expected as object.
    """
    dtype_spec = {"one": "u1", 1: "f8"}
    parsed = all_parsers.read_csv(
        StringIO("one,two,three"), index_col=["one", "two"], dtype=dtype_spec
    )

    level_values = [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)]
    empty_index = MultiIndex.from_arrays(level_values, names=["one", "two"])
    expected = DataFrame({"three": np.empty(0, dtype=object)}, index=empty_index)

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    """Check dtypes keyed by the de-duplicated names "one" and "one.1"."""
    dtype_spec = {"one": "u1", "one.1": "f"}
    parsed = all_parsers.read_csv(StringIO("one,one"), dtype=dtype_spec)

    empty_columns = {
        "one": np.empty(0, dtype="u1"),
        "one.1": np.empty(0, dtype="f"),
    }
    expected = DataFrame(empty_columns, index=Index([], dtype=object))

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    """Check dtypes keyed by position (0 and 1) when the header repeats "one"."""
    dtype_spec = {0: "u1", 1: "f"}
    parsed = all_parsers.read_csv(StringIO("one,one"), dtype=dtype_spec)

    empty_columns = {
        "one": np.empty(0, dtype="u1"),
        "one.1": np.empty(0, dtype="f"),
    }
    expected = DataFrame(empty_columns, index=Index([], dtype=object))

    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    """Check positional dtypes on a duplicated header against a ``concat``-built frame.

    The expected frame joins two empty Series column-wise and then casts its
    index to object.
    """
    # see gh-9424
    empty_u1 = Series([], name="one", dtype="u1")
    empty_f = Series([], name="one.1", dtype="f")
    expected = concat([empty_u1, empty_f], axis=1)
    expected.index = expected.index.astype(object)

    parsed = all_parsers.read_csv(StringIO("one,one"), dtype={0: "u1", 1: "f"})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
    """Check that duplicate entries in ``names`` raise ``ValueError``.

    The input is empty and dtypes are given by position; the error message
    is expected to contain "Duplicate names".
    """
    # see gh-9424
    parser = all_parsers
    data = ""

    # Only the call expected to raise lives inside the context manager, so an
    # unrelated failure during setup cannot be mistaken for the expected error.
    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})
|
|
|
|
|
|
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    """Check that forcing ``np.int64`` on a column with a blank cell raises ``ValueError``.

    The expected message depends on ``parser.engine``: one wording for the
    "c" engine and another for any other engine.
    """
    # see gh-2631
    csv_text = "YEAR, DOY, a\n2001,106380451,10\n2001,,11\n2001,106380451,67"

    if all_parsers.engine == "c":
        msg = "Integer column has NA values"
    else:
        msg = "Unable to convert column DOY"

    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(
            StringIO(csv_text), dtype={"DOY": np.int64}, skipinitialspace=True
        )
|
|
|
|
|
|
def test_dtype_with_converters(all_parsers):
    """Check that a converter wins over a dtype given for the same column.

    A ``ParserWarning`` is expected, and column "a" should hold the
    converter's string output rather than the requested "i8" values.
    """
    csv_text = "a,b\n1.1,2.2\n1.2,2.3"

    def to_text(value):
        return str(value)

    # The dtype spec is ignored when a converter is specified for the column.
    with tm.assert_produces_warning(ParserWarning):
        parsed = all_parsers.read_csv(
            StringIO(csv_text), dtype={"a": "i8"}, converters={"a": to_text}
        )

    expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "dtype,expected",
    [
        (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
        ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])),
        (
            {"a": "category", "b": "category"},
            DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]),
        ),
        ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
        (
            "timedelta64[ns]",
            DataFrame(
                {
                    "a": Series([], dtype="timedelta64[ns]"),
                    "b": Series([], dtype="timedelta64[ns]"),
                },
                index=[],
            ),
        ),
        (
            {"a": np.int64, "b": np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
                index=[],
            ),
        ),
        (
            {0: np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
                index=[],
            ),
        ),
        (
            {"a": np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
                index=[],
            ),
        ),
    ],
)
def test_empty_dtype(all_parsers, dtype, expected):
    """Check a header-only CSV ("a,b") read with each parametrized ``dtype``.

    Each case pairs a dtype specification (scalar, or a mapping keyed by
    name, position, or both) with the empty frame it is expected to produce.
    """
    # see gh-14712
    header_only = StringIO("a,b")
    parsed = all_parsers.read_csv(header_only, header=0, dtype=dtype)
    tm.assert_frame_equal(parsed, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
    """Check that each NumPy integer and float typecode is honoured as ``dtype``.

    A two-row, header-less input ("0" and "1") is parsed with the given
    typecode and compared with a frame built directly with that dtype.
    """
    parser = all_parsers
    data = "0\n1"
    expected = DataFrame([0, 1], dtype=dtype)

    result = parser.read_csv(StringIO(data), header=None, dtype=dtype)

    # Pass (result, expected) in the same order as the other tests in this
    # module so the left/right labels in failure output mean the same thing.
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_boolean_dtype(all_parsers):
    """Check which cell spellings ``dtype="boolean"`` reads as True, False or missing.

    Five spellings are expected to parse as True, five as False and five as
    missing values in a nullable boolean column "a".
    """
    true_cells = ["True", "TRUE", "true", "1", "1.0"]
    false_cells = ["False", "FALSE", "false", "0", "0.0"]
    missing_cells = ["NaN", "nan", "NA", "null", "NULL"]
    csv_text = "\n".join(["a", *true_cells, *false_cells, *missing_cells])

    parsed = all_parsers.read_csv(StringIO(csv_text), dtype="boolean")

    expected_values = [True] * 5 + [False] * 5 + [None] * 5
    expected = DataFrame({"a": pd.array(expected_values, dtype="boolean")})

    tm.assert_frame_equal(parsed, expected)
|