575 lines
16 KiB
Python
575 lines
16 KiB
Python
"""
|
|
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
|
|
"""
|
|
from io import StringIO
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas._libs.tslib import Timestamp
|
|
|
|
from pandas import DataFrame, Index
|
|
import pandas._testing as tm
|
|
|
|
_msg_validate_usecols_arg = (
|
|
"'usecols' must either be list-like "
|
|
"of all strings, all unicode, all "
|
|
"integers or a callable."
|
|
)
|
|
_msg_validate_usecols_names = (
|
|
"Usecols do not match columns, columns expected but not found: {0}"
|
|
)
|
|
|
|
|
|
def test_raise_on_mixed_dtype_usecols(all_parsers):
    """Mixing positional and label entries in ``usecols`` must raise."""
    # See gh-12678
    csv_text = "a,b,c\n1000,2000,3000\n4000,5000,6000\n"
    mixed_usecols = [0, "b", 2]

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        all_parsers.read_csv(StringIO(csv_text), usecols=mixed_usecols)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
    """Positional and label selections are expected to give the same frame."""
    csv_text = "a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,12"
    want = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])

    got = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_names(all_parsers):
    """With ``header=0``, ``names`` is expected to relabel the selected columns."""
    csv_text = "a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,12"
    new_labels = ["foo", "bar"]

    got = all_parsers.read_csv(
        StringIO(csv_text), names=new_labels, usecols=[1, 2], header=0
    )

    want = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=new_labels)
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "names,usecols",
    [
        (["b", "c"], [1, 2]),
        (["a", "b", "c"], ["b", "c"]),
    ],
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
    """Header-less data: both name/usecols pairings should give columns b and c."""
    csv_text = "1,2,3\n4,5,6\n7,8,9\n10,11,12"
    want = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])

    got = all_parsers.read_csv(
        StringIO(csv_text), names=names, header=None, usecols=usecols
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_relative_to_names2(all_parsers):
    """Two names plus ``usecols=[0, 1]`` should yield the first two columns."""
    # see gh-5766
    csv_text = "1,2,3\n4,5,6\n7,8,9\n10,11,12"
    want = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])

    got = all_parsers.read_csv(
        StringIO(csv_text), names=["a", "b"], header=None, usecols=[0, 1]
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_name_length_conflict(all_parsers):
    """Two names for one selected column must raise ``ValueError``."""
    csv_text = "1,2,3\n4,5,6\n7,8,9\n10,11,12"

    # The expected wording depends on which engine the fixture provides.
    if all_parsers.engine == "python":
        expected_msg = (
            "Number of passed names did not match number of header fields in the file"
        )
    else:
        expected_msg = "Passed header names mismatches usecols"

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(
            StringIO(csv_text), names=["a", "b"], header=None, usecols=[1]
        )
|
|
|
|
|
|
def test_usecols_single_string(all_parsers):
    """A bare string passed as ``usecols`` must raise ``ValueError``."""
    # see gh-20558
    csv_text = "foo, bar, baz\n1000, 2000, 3000\n4000, 5000, 6000"

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        all_parsers.read_csv(StringIO(csv_text), usecols="foo")
|
|
|
|
|
|
@pytest.mark.parametrize(
    "data",
    [
        "a,b,c,d\n1,2,3,4\n5,6,7,8",
        "a,b,c,d\n1,2,3,4,\n5,6,7,8,",
    ],
)
def test_usecols_index_col_false(all_parsers, data):
    """``index_col=False`` with or without trailing delimiters gives one frame."""
    # see gh-9082
    got = all_parsers.read_csv(
        StringIO(data), usecols=["a", "c", "d"], index_col=False
    )

    want = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
    """Every usecols/index_col pairing should index the result by column b."""
    # see gh-4201: test that index_col as integer reflects usecols
    csv_text = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    got = all_parsers.read_csv(
        StringIO(csv_text), usecols=usecols, index_col=index_col
    )

    want = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_index_col_conflict2(all_parsers):
    """A two-level ``index_col`` taken from the ``usecols`` labels."""
    # see gh-4201: test that index_col as integer reflects usecols
    csv_text = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    flat = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
    want = flat.set_index(["b", "c"])

    got = all_parsers.read_csv(
        StringIO(csv_text), usecols=["b", "c", "d"], index_col=["b", "c"]
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_implicit_index_col(all_parsers):
    """Rows have one field more than the header; the first is expected as index."""
    # see gh-2654
    csv_text = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
    want = DataFrame(
        {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]
    )

    got = all_parsers.read_csv(StringIO(csv_text), usecols=["a", "b"])
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_regex_sep(all_parsers):
    """``usecols`` combined with a regular-expression separator."""
    # see gh-2733
    csv_text = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    want = DataFrame(
        {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]
    )

    got = all_parsers.read_csv(StringIO(csv_text), sep=r"\s+", usecols=("a", "b"))
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_whitespace(all_parsers):
    """``usecols`` combined with ``delim_whitespace=True``."""
    csv_text = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    want = DataFrame(
        {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]
    )

    got = all_parsers.read_csv(
        StringIO(csv_text), delim_whitespace=True, usecols=("a", "b")
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "usecols,expected",
    [
        # Column selection by index.
        (
            [0, 1],
            DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"]),
        ),
        # Column selection by name.
        (
            ["0", "1"],
            DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
        ),
    ],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
    """Header labels that look like integers: integer vs. string ``usecols``."""
    csv_text = "2,0,1\n1000,2000,3000\n4000,5000,6000"

    got = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)
    tm.assert_frame_equal(got, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    """Either ordering of ``usecols`` should give a combined ``c_d`` column."""
    # see gh-9755
    csv_text = "a,b,c,d,e\n0,1,20140101,0900,4\n0,1,20140102,1000,4"

    stamps = [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")]
    want = DataFrame({"a": [0, 0], "c_d": stamps}, columns=["c_d", "a"])

    got = all_parsers.read_csv(
        StringIO(csv_text), usecols=usecols, parse_dates=[[1, 2]]
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_parse_dates2(all_parsers):
    """Parsed date column used as index, with ``usecols`` equal to ``names``."""
    # see gh-13604
    csv_text = (
        "2008-02-07 09:40,1032.43\n"
        "2008-02-07 09:50,1042.54\n"
        "2008-02-07 10:00,1051.65"
    )
    names = ["date", "values"]

    want_index = Index(
        [
            Timestamp("2008-02-07 09:40"),
            Timestamp("2008-02-07 09:50"),
            Timestamp("2008-02-07 10:00"),
        ],
        name="date",
    )
    want = DataFrame({"values": [1032.43, 1042.54, 1051.65]}, index=want_index)

    got = all_parsers.read_csv(
        StringIO(csv_text),
        parse_dates=[0],
        index_col=0,
        usecols=list(names),  # separate copy of the names list
        header=None,
        names=names,
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_parse_dates3(all_parsers):
    """All ten columns selected by label, with the first one parsed as a date."""
    # see gh-14792
    csv_text = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    every_col = list("abcdefghij")

    want = DataFrame(
        {
            "a": Timestamp("2016-09-21"),
            "b": [1],
            "c": [1],
            "d": [2],
            "e": [3],
            "f": [4],
            "g": [5],
            "h": [6],
            "i": [7],
            "j": [8],
        },
        columns=every_col,
    )

    got = all_parsers.read_csv(StringIO(csv_text), usecols=every_col, parse_dates=[0])
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_parse_dates4(all_parsers):
    """Columns 0 and 1 combined: the test expects ``a_b`` to hold the raw text."""
    csv_text = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    every_col = list("abcdefghij")

    want = DataFrame(
        {
            "a_b": "2016/09/21 1",
            "c": [1],
            "d": [2],
            "e": [3],
            "f": [4],
            "g": [5],
            "h": [6],
            "i": [7],
            "j": [8],
        },
        columns=["a_b"] + list("cdefghij"),
    )

    got = all_parsers.read_csv(
        StringIO(csv_text), usecols=every_col, parse_dates=[[0, 1]]
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
    "names",
    [
        list("abcde"),  # Names span all columns in original data.
        list("acd"),  # Names span only the selected columns.
    ],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
    """Header-less data with ``names``: a combined ``c_d`` column is expected."""
    # see gh-9755
    csv_text = "0,1,20140101,0900,4\n0,1,20140102,1000,4"

    stamps = [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")]
    want = DataFrame({"a": [0, 0], "c_d": stamps}, columns=["c_d", "a"])

    got = all_parsers.read_csv(
        StringIO(csv_text), names=names, parse_dates=[[1, 2]], usecols=usecols
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_unicode_strings(all_parsers):
    """Select two columns by their multi-character string labels."""
    # see gh-13219
    csv_text = (
        "AAA,BBB,CCC,DDD\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )

    want = DataFrame(
        {
            "AAA": {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002,
            },
            "BBB": {0: 8, 1: 2, 2: 7},
        }
    )

    got = all_parsers.read_csv(StringIO(csv_text), usecols=["AAA", "BBB"])
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
def test_usecols_with_single_byte_unicode_strings(all_parsers):
    """Select two columns by their single-character string labels."""
    # see gh-13219
    csv_text = (
        "A,B,C,D\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )

    want = DataFrame(
        {
            "A": {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002,
            },
            "B": {0: 8, 1: 2, 2: 7},
        }
    )

    got = all_parsers.read_csv(StringIO(csv_text), usecols=["A", "B"])
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
    """Mixing ``str`` and ``bytes`` labels in ``usecols`` must raise."""
    csv_text = (
        "AAA,BBB,CCC,DDD\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        all_parsers.read_csv(StringIO(csv_text), usecols=usecols)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
    """
    Select columns whose labels consist of multi-byte characters.

    The parametrization used to list the same ``usecols`` value twice; the
    two entries were identical, so the second run added no coverage. A single
    entry is kept so that ``usecols`` is still supplied through parametrize.
    """
    data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "あああ": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "いい": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_empty_usecols(all_parsers):
    """An empty ``usecols`` set is expected to give an empty frame."""
    csv_text = "a,b,c\n1,2,3\n4,5,6"

    got = all_parsers.read_csv(StringIO(csv_text), usecols=set())
    tm.assert_frame_equal(got, DataFrame())
|
|
|
|
|
|
def test_np_array_usecols(all_parsers):
    """``usecols`` supplied as a NumPy array of labels."""
    # see gh-12546
    csv_text = "a,b,c\n1,2,3"
    wanted_cols = np.array(["a", "b"])

    want = DataFrame([[1, 2]], columns=wanted_cols)
    got = all_parsers.read_csv(StringIO(csv_text), usecols=wanted_cols)
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "usecols,expected",
    [
        (
            lambda x: x.upper() in ["AAA", "BBB", "DDD"],
            DataFrame(
                {
                    "AaA": {
                        0: 0.056674972999999997,
                        1: 2.6132309819999997,
                        2: 3.5689350380000002,
                    },
                    "bBb": {0: 8, 1: 2, 2: 7},
                    "ddd": {0: "a", 1: "b", 2: "a"},
                }
            ),
        ),
        (lambda x: False, DataFrame()),
    ],
)
def test_callable_usecols(all_parsers, usecols, expected):
    """``usecols`` supplied as a callable applied to the column labels."""
    # see gh-14154
    csv_text = (
        "AaA,bBb,CCC,ddd\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )

    got = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)
    tm.assert_frame_equal(got, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    """The first row lacks the third field; NaN is expected in column c there."""
    # see gh-6710
    csv_text = "1,2\n1,2,3"
    want = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    got = all_parsers.read_csv(
        StringIO(csv_text), names=["a", "b", "c"], usecols=usecols
    )
    tm.assert_frame_equal(got, want)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "data,usecols,kwargs,expected",
    [
        # see gh-8985
        (
            "19,29,39\n" * 2 + "10,20,30,40",
            [0, 1, 2],
            {"header": None},
            DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
        ),
        # see gh-9549
        (
            ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
            ["A", "B", "C"],
            {},
            DataFrame(
                {
                    "A": [1, 3, 1, 1, 1, 5],
                    "B": [2, 4, 2, 2, 2, 6],
                    "C": [3, 5, 4, 3, 3, 7],
                }
            ),
        ),
    ],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    """Rows with extra trailing fields: only the ``usecols`` columns are expected."""
    # see gh-8985
    got = all_parsers.read_csv(StringIO(data), usecols=usecols, **kwargs)
    tm.assert_frame_equal(got, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "usecols,kwargs,expected,msg",
    [
        (
            ["a", "b", "c", "d"],
            {},
            DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
            None,
        ),
        (
            ["a", "b", "c", "f"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
        (
            ["a", "b", "f", "g"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
        ),
        # see gh-14671
        (
            None,
            {"header": 0, "names": ["A", "B", "C", "D"]},
            DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
            None,
        ),
        (
            ["A", "B", "C", "f"],
            {"header": 0, "names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (
            ["A", "B", "f"],
            {"names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
    ],
)
def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg):
    """
    Check ``usecols`` labels against the available column names.

    When ``expected`` is None the read must raise ``ValueError`` matching
    ``msg``; otherwise the parsed frame must equal ``expected``.
    """
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    # Build a fresh dict rather than updating ``kwargs`` in place: pytest hands
    # parametrized values to the test without copying them, so an in-place
    # update would leak into the shared parameter object.
    read_kwargs = dict(kwargs, usecols=usecols)
    parser = all_parsers

    if expected is None:
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **read_kwargs)
    else:
        result = parser.read_csv(StringIO(data), **read_kwargs)
        tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
    """Replacement ``names`` with a ``usecols`` subset; xfail off the C engine."""
    if all_parsers.engine != "c":
        # Number of passed names did not match number of header fields in the file
        xfail_mark = pytest.mark.xfail(
            reason="see gh-16469: works on the C engine but not the Python engine",
            raises=ValueError,
        )
        request.node.add_marker(xfail_mark)

    csv_text = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    new_labels = ["A", "B", "C", "D"]

    got = all_parsers.read_csv(
        StringIO(csv_text), header=0, names=new_labels, usecols=usecols
    )
    want = DataFrame({"A": [1, 5], "C": [3, 7]})
    tm.assert_frame_equal(got, want)
|