projektAI/venv/Lib/site-packages/pandas/tests/io/parser/test_usecols.py
2021-06-06 22:13:05 +02:00

575 lines
16 KiB
Python

"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs.tslib import Timestamp
from pandas import DataFrame, Index
import pandas._testing as tm
_msg_validate_usecols_arg = (
"'usecols' must either be list-like "
"of all strings, all unicode, all "
"integers or a callable."
)
_msg_validate_usecols_names = (
"Usecols do not match columns, columns expected but not found: {0}"
)
def test_raise_on_mixed_dtype_usecols(all_parsers):
# See gh-12678
data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
usecols = [0, "b", 2]
parser = all_parsers
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
def test_usecols_with_names(all_parsers):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
names = ["foo", "bar"]
result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
def test_usecols_relative_to_names2(all_parsers):
# see gh-5766
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(
StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
)
expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_usecols_name_length_conflict(all_parsers):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
msg = (
"Number of passed names did not match number of header fields in the file"
if parser.engine == "python"
else "Passed header names mismatches usecols"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
def test_usecols_single_string(all_parsers):
# see gh-20558
parser = all_parsers
data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols="foo")
@pytest.mark.parametrize(
"data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
# see gh-9082
parser = all_parsers
usecols = ["a", "c", "d"]
expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_conflict2(all_parsers):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
expected = expected.set_index(["b", "c"])
result = parser.read_csv(
StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
)
tm.assert_frame_equal(result, expected)
def test_usecols_implicit_index_col(all_parsers):
# see gh-2654
parser = all_parsers
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
result = parser.read_csv(StringIO(data), usecols=["a", "b"])
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
def test_usecols_regex_sep(all_parsers):
# see gh-2733
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
def test_usecols_with_whitespace(all_parsers):
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
# Column selection by index.
([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
# Column selection by name.
(["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])),
],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
parser = all_parsers
data = """2,0,1
1000,2000,3000
4000,5000,6000"""
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
# see gh-9755
data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
parser = all_parsers
parse_dates = [[1, 2]]
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates2(all_parsers):
# see gh-13604
parser = all_parsers
data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""
names = ["date", "values"]
usecols = names[:]
parse_dates = [0]
index = Index(
[
Timestamp("2008-02-07 09:40"),
Timestamp("2008-02-07 09:50"),
Timestamp("2008-02-07 10:00"),
],
name="date",
)
cols = {"values": [1032.43, 1042.54, 1051.65]}
expected = DataFrame(cols, index=index)
result = parser.read_csv(
StringIO(data),
parse_dates=parse_dates,
index_col=0,
usecols=usecols,
header=None,
names=names,
)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates3(all_parsers):
# see gh-14792
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
usecols = list("abcdefghij")
parse_dates = [0]
cols = {
"a": Timestamp("2016-09-21"),
"b": [1],
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates4(all_parsers):
data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
usecols = list("abcdefghij")
parse_dates = [[0, 1]]
parser = all_parsers
cols = {
"a_b": "2016/09/21 1",
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
"names",
[
list("abcde"), # Names span all columns in original data.
list("acd"), # Names span only the selected columns.
],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
# see gh-9755
s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
parse_dates = [[1, 2]]
parser = all_parsers
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
result = parser.read_csv(
StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
)
tm.assert_frame_equal(result, expected)
def test_usecols_with_unicode_strings(all_parsers):
# see gh-13219
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
"BBB": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
tm.assert_frame_equal(result, expected)
def test_usecols_with_single_byte_unicode_strings(all_parsers):
# see gh-13219
data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
"B": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
"いい": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
def test_empty_usecols(all_parsers):
data = "a,b,c\n1,2,3\n4,5,6"
expected = DataFrame()
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=set())
tm.assert_frame_equal(result, expected)
def test_np_array_usecols(all_parsers):
# see gh-12546
parser = all_parsers
data = "a,b,c\n1,2,3"
usecols = np.array(["a", "b"])
expected = DataFrame([[1, 2]], columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
(
lambda x: x.upper() in ["AAA", "BBB", "DDD"],
DataFrame(
{
"AaA": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"bBb": {0: 8, 1: 2, 2: 7},
"ddd": {0: "a", 1: "b", 2: "a"},
}
),
),
(lambda x: False, DataFrame()),
],
)
def test_callable_usecols(all_parsers, usecols, expected):
# see gh-14154
data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
# see gh-6710
data = "1,2\n1,2,3"
parser = all_parsers
names = ["a", "b", "c"]
expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,usecols,kwargs,expected",
[
# see gh-8985
(
"19,29,39\n" * 2 + "10,20,30,40",
[0, 1, 2],
{"header": None},
DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
),
# see gh-9549
(
("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
["A", "B", "C"],
{},
DataFrame(
{
"A": [1, 3, 1, 1, 1, 5],
"B": [2, 4, 2, 2, 2, 6],
"C": [3, 5, 4, 3, 3, 7],
}
),
),
],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
# see gh-8985
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,kwargs,expected,msg",
[
(
["a", "b", "c", "d"],
{},
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
None,
),
(
["a", "b", "c", "f"],
{},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
(
["a", "b", "f", "g"],
{},
None,
_msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
),
# see gh-14671
(
None,
{"header": 0, "names": ["A", "B", "C", "D"]},
DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
None,
),
(
["A", "B", "C", "f"],
{"header": 0, "names": ["A", "B", "C", "D"]},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(
["A", "B", "f"],
{"names": ["A", "B", "C", "D"]},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
],
)
def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
kwargs.update(usecols=usecols)
parser = all_parsers
if expected is None:
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
if all_parsers.engine != "c":
reason = "see gh-16469: works on the C engine but not the Python engine"
# Number of passed names did not match number of header fields in the file
request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
names = ["A", "B", "C", "D"]
parser = all_parsers
result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
tm.assert_frame_equal(result, expected)