2393 lines
67 KiB
Python
2393 lines
67 KiB
Python
![]() |
"""
|
||
|
Tests that work on both the Python and C engines but do not have a
|
||
|
specific classification into the other test modules.
|
||
|
"""
|
||
|
import codecs
|
||
|
import csv
|
||
|
from datetime import datetime
|
||
|
from inspect import signature
|
||
|
from io import BytesIO, StringIO
|
||
|
import os
|
||
|
from pathlib import Path
|
||
|
import platform
|
||
|
from urllib.error import URLError
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas._libs.tslib import Timestamp
|
||
|
from pandas.compat import is_platform_linux
|
||
|
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
|
||
|
import pandas.util._test_decorators as td
|
||
|
|
||
|
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser
|
||
|
|
||
|
|
||
|
def test_override_set_noconvert_columns():
|
||
|
# see gh-17351
|
||
|
#
|
||
|
# Usecols needs to be sorted in _set_noconvert_columns based
|
||
|
# on the test_usecols_with_parse_dates test from test_usecols.py
|
||
|
class MyTextFileReader(TextFileReader):
|
||
|
def __init__(self):
|
||
|
self._currow = 0
|
||
|
self.squeeze = False
|
||
|
|
||
|
class MyCParserWrapper(CParserWrapper):
|
||
|
def _set_noconvert_columns(self):
|
||
|
if self.usecols_dtype == "integer":
|
||
|
# self.usecols is a set, which is documented as unordered
|
||
|
# but in practice, a CPython set of integers is sorted.
|
||
|
# In other implementations this assumption does not hold.
|
||
|
# The following code simulates a different order, which
|
||
|
# before GH 17351 would cause the wrong columns to be
|
||
|
# converted via the parse_dates parameter
|
||
|
self.usecols = list(self.usecols)
|
||
|
self.usecols.reverse()
|
||
|
return CParserWrapper._set_noconvert_columns(self)
|
||
|
|
||
|
data = """a,b,c,d,e
|
||
|
0,1,20140101,0900,4
|
||
|
0,1,20140102,1000,4"""
|
||
|
|
||
|
parse_dates = [[1, 2]]
|
||
|
cols = {
|
||
|
"a": [0, 0],
|
||
|
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
|
||
|
}
|
||
|
expected = DataFrame(cols, columns=["c_d", "a"])
|
||
|
|
||
|
parser = MyTextFileReader()
|
||
|
parser.options = {
|
||
|
"usecols": [0, 2, 3],
|
||
|
"parse_dates": parse_dates,
|
||
|
"delimiter": ",",
|
||
|
}
|
||
|
parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
|
||
|
|
||
|
result = parser.read()
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_empty_decimal_marker(all_parsers):
|
||
|
data = """A|B|C
|
||
|
1|2,334|5
|
||
|
10|13|10.
|
||
|
"""
|
||
|
# Parsers support only length-1 decimals
|
||
|
msg = "Only length-1 decimal markers supported"
|
||
|
parser = all_parsers
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(StringIO(data), decimal="")
|
||
|
|
||
|
|
||
|
def test_bad_stream_exception(all_parsers, csv_dir_path):
|
||
|
# see gh-13652
|
||
|
#
|
||
|
# This test validates that both the Python engine and C engine will
|
||
|
# raise UnicodeDecodeError instead of C engine raising ParserError
|
||
|
# and swallowing the exception that caused read to fail.
|
||
|
path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
|
||
|
codec = codecs.lookup("utf-8")
|
||
|
utf8 = codecs.lookup("utf-8")
|
||
|
parser = all_parsers
|
||
|
msg = "'utf-8' codec can't decode byte"
|
||
|
|
||
|
# Stream must be binary UTF8.
|
||
|
with open(path, "rb") as handle, codecs.StreamRecoder(
|
||
|
handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
|
||
|
) as stream:
|
||
|
|
||
|
with pytest.raises(UnicodeDecodeError, match=msg):
|
||
|
parser.read_csv(stream)
|
||
|
|
||
|
|
||
|
def test_read_csv_local(all_parsers, csv1):
|
||
|
prefix = "file:///" if compat.is_platform_windows() else "file://"
|
||
|
parser = all_parsers
|
||
|
|
||
|
fname = prefix + str(os.path.abspath(csv1))
|
||
|
result = parser.read_csv(fname, index_col=0, parse_dates=True)
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[0.980269, 3.685731, -0.364216805298, -1.159738],
|
||
|
[1.047916, -0.041232, -0.16181208307, 0.212549],
|
||
|
[0.498581, 0.731168, -0.537677223318, 1.346270],
|
||
|
[1.120202, 1.567621, 0.00364077397681, 0.675253],
|
||
|
[-0.487094, 0.571455, -1.6116394093, 0.103469],
|
||
|
[0.836649, 0.246462, 0.588542635376, 1.062782],
|
||
|
[-0.157161, 1.340307, 1.1957779562, -1.097007],
|
||
|
],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
index=Index(
|
||
|
[
|
||
|
datetime(2000, 1, 3),
|
||
|
datetime(2000, 1, 4),
|
||
|
datetime(2000, 1, 5),
|
||
|
datetime(2000, 1, 6),
|
||
|
datetime(2000, 1, 7),
|
||
|
datetime(2000, 1, 10),
|
||
|
datetime(2000, 1, 11),
|
||
|
],
|
||
|
name="index",
|
||
|
),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_1000_sep(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """A|B|C
|
||
|
1|2,334|5
|
||
|
10|13|10.
|
||
|
"""
|
||
|
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), sep="|", thousands=",")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_squeeze(all_parsers):
|
||
|
data = """\
|
||
|
a,1
|
||
|
b,2
|
||
|
c,3
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
index = Index(["a", "b", "c"], name=0)
|
||
|
expected = Series([1, 2, 3], name=1, index=index)
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# see gh-8217
|
||
|
#
|
||
|
# Series should not be a view.
|
||
|
assert not result._is_view
|
||
|
|
||
|
|
||
|
def test_malformed(all_parsers):
|
||
|
# see gh-6607
|
||
|
parser = all_parsers
|
||
|
data = """ignore
|
||
|
A,B,C
|
||
|
1,2,3 # comment
|
||
|
1,2,3,4,5
|
||
|
2,3,4
|
||
|
"""
|
||
|
msg = "Expected 3 fields in line 4, saw 5"
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
parser.read_csv(StringIO(data), header=1, comment="#")
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("nrows", [5, 3, None])
|
||
|
def test_malformed_chunks(all_parsers, nrows):
|
||
|
data = """ignore
|
||
|
A,B,C
|
||
|
skip
|
||
|
1,2,3
|
||
|
3,5,10 # comment
|
||
|
1,2,3,4,5
|
||
|
2,3,4
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
msg = "Expected 3 fields in line 6, saw 5"
|
||
|
with parser.read_csv(
|
||
|
StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
|
||
|
) as reader:
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
reader.read(nrows)
|
||
|
|
||
|
|
||
|
def test_unnamed_columns(all_parsers):
|
||
|
data = """A,B,C,,
|
||
|
1,2,3,4,5
|
||
|
6,7,8,9,10
|
||
|
11,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame(
|
||
|
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
|
||
|
dtype=np.int64,
|
||
|
columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
|
||
|
)
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_csv_mixed_type(all_parsers):
|
||
|
data = """A,B,C
|
||
|
a,1,2
|
||
|
b,3,4
|
||
|
c,4,5
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
|
||
|
# see gh-21141
|
||
|
parser = all_parsers
|
||
|
|
||
|
if not parser.low_memory:
|
||
|
pytest.skip("This is a low-memory specific test")
|
||
|
|
||
|
data = """A,B,C
|
||
|
1,1,1,2
|
||
|
2,2,3,4
|
||
|
3,3,4,5
|
||
|
"""
|
||
|
result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
|
||
|
expected = DataFrame(columns=["A", "B", "C"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_csv_dataframe(all_parsers, csv1):
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(csv1, index_col=0, parse_dates=True)
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[0.980269, 3.685731, -0.364216805298, -1.159738],
|
||
|
[1.047916, -0.041232, -0.16181208307, 0.212549],
|
||
|
[0.498581, 0.731168, -0.537677223318, 1.346270],
|
||
|
[1.120202, 1.567621, 0.00364077397681, 0.675253],
|
||
|
[-0.487094, 0.571455, -1.6116394093, 0.103469],
|
||
|
[0.836649, 0.246462, 0.588542635376, 1.062782],
|
||
|
[-0.157161, 1.340307, 1.1957779562, -1.097007],
|
||
|
],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
index=Index(
|
||
|
[
|
||
|
datetime(2000, 1, 3),
|
||
|
datetime(2000, 1, 4),
|
||
|
datetime(2000, 1, 5),
|
||
|
datetime(2000, 1, 6),
|
||
|
datetime(2000, 1, 7),
|
||
|
datetime(2000, 1, 10),
|
||
|
datetime(2000, 1, 11),
|
||
|
],
|
||
|
name="index",
|
||
|
),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
|
||
|
parser = all_parsers
|
||
|
csv2 = os.path.join(csv_dir_path, "test2.csv")
|
||
|
result = parser.read_csv(csv2, index_col=0, parse_dates=True)
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
|
||
|
[1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
|
||
|
[0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
|
||
|
[1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
|
||
|
[-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
|
||
|
],
|
||
|
columns=["A", "B", "C", "D", "E"],
|
||
|
index=Index(
|
||
|
[
|
||
|
datetime(2000, 1, 3),
|
||
|
datetime(2000, 1, 4),
|
||
|
datetime(2000, 1, 5),
|
||
|
datetime(2000, 1, 6),
|
||
|
datetime(2000, 1, 7),
|
||
|
]
|
||
|
),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_csv_wrong_num_columns(all_parsers):
|
||
|
# Too few columns.
|
||
|
data = """A,B,C,D,E,F
|
||
|
1,2,3,4,5,6
|
||
|
6,7,8,9,10,11,12
|
||
|
11,12,13,14,15,16
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
msg = "Expected 6 fields in line 3, saw 7"
|
||
|
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
parser.read_csv(StringIO(data))
|
||
|
|
||
|
|
||
|
def test_read_duplicate_index_explicit(all_parsers):
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo,12,13,14,15
|
||
|
bar,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), index_col=0)
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[2, 3, 4, 5],
|
||
|
[7, 8, 9, 10],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_duplicate_index_implicit(all_parsers):
|
||
|
data = """A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo,12,13,14,15
|
||
|
bar,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[2, 3, 4, 5],
|
||
|
[7, 8, 9, 10],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,kwargs,expected",
|
||
|
[
|
||
|
(
|
||
|
"A,B\nTrue,1\nFalse,2\nTrue,3",
|
||
|
dict(),
|
||
|
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
|
||
|
),
|
||
|
(
|
||
|
"A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
|
||
|
dict(true_values=["yes", "Yes", "YES"], false_values=["no", "NO", "No"]),
|
||
|
DataFrame(
|
||
|
[[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
|
||
|
columns=["A", "B"],
|
||
|
),
|
||
|
),
|
||
|
(
|
||
|
"A,B\nTRUE,1\nFALSE,2\nTRUE,3",
|
||
|
dict(),
|
||
|
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
|
||
|
),
|
||
|
(
|
||
|
"A,B\nfoo,bar\nbar,foo",
|
||
|
dict(true_values=["foo"], false_values=["bar"]),
|
||
|
DataFrame([[True, False], [False, True]], columns=["A", "B"]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_parse_bool(all_parsers, data, kwargs, expected):
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), **kwargs)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_int_conversion(all_parsers):
|
||
|
data = """A,B
|
||
|
1.0,1
|
||
|
2.0,2
|
||
|
3.0,3
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
|
||
|
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("nrows", [3, 3.0])
|
||
|
def test_read_nrows(all_parsers, nrows):
|
||
|
# see gh-10476
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
expected = DataFrame(
|
||
|
[["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
|
||
|
columns=["index", "A", "B", "C", "D"],
|
||
|
)
|
||
|
parser = all_parsers
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), nrows=nrows)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
|
||
|
def test_read_nrows_bad(all_parsers, nrows):
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
msg = r"'nrows' must be an integer >=0"
|
||
|
parser = all_parsers
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(StringIO(data), nrows=nrows)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("index_col", [0, "index"])
|
||
|
def test_read_chunksize_with_index(all_parsers, index_col):
|
||
|
parser = all_parsers
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
["foo", 2, 3, 4, 5],
|
||
|
["bar", 7, 8, 9, 10],
|
||
|
["baz", 12, 13, 14, 15],
|
||
|
["qux", 12, 13, 14, 15],
|
||
|
["foo2", 12, 13, 14, 15],
|
||
|
["bar2", 12, 13, 14, 15],
|
||
|
],
|
||
|
columns=["index", "A", "B", "C", "D"],
|
||
|
)
|
||
|
expected = expected.set_index("index")
|
||
|
|
||
|
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
|
||
|
chunks = list(reader)
|
||
|
tm.assert_frame_equal(chunks[0], expected[:2])
|
||
|
tm.assert_frame_equal(chunks[1], expected[2:4])
|
||
|
tm.assert_frame_equal(chunks[2], expected[4:])
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
|
||
|
def test_read_chunksize_bad(all_parsers, chunksize):
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
msg = r"'chunksize' must be an integer >=1"
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
|
||
|
pass
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [2, 8])
|
||
|
def test_read_chunksize_and_nrows(all_parsers, chunksize):
|
||
|
# see gh-15755
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(index_col=0, nrows=5)
|
||
|
|
||
|
expected = parser.read_csv(StringIO(data), **kwargs)
|
||
|
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
|
||
|
tm.assert_frame_equal(concat(reader), expected)
|
||
|
|
||
|
|
||
|
def test_read_chunksize_and_nrows_changing_size(all_parsers):
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(index_col=0, nrows=5)
|
||
|
|
||
|
expected = parser.read_csv(StringIO(data), **kwargs)
|
||
|
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
|
||
|
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
|
||
|
tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
|
||
|
|
||
|
with pytest.raises(StopIteration, match=""):
|
||
|
reader.get_chunk(size=3)
|
||
|
|
||
|
|
||
|
def test_get_chunk_passed_chunksize(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """A,B,C
|
||
|
1,2,3
|
||
|
4,5,6
|
||
|
7,8,9
|
||
|
1,2,3"""
|
||
|
|
||
|
with parser.read_csv(StringIO(data), chunksize=2) as reader:
|
||
|
result = reader.get_chunk()
|
||
|
|
||
|
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)])
|
||
|
def test_read_chunksize_compat(all_parsers, kwargs):
|
||
|
# see gh-12185
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), **kwargs)
|
||
|
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
|
||
|
tm.assert_frame_equal(concat(reader), result)
|
||
|
|
||
|
|
||
|
def test_read_chunksize_jagged_names(all_parsers):
|
||
|
# see gh-23509
|
||
|
parser = all_parsers
|
||
|
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
|
||
|
|
||
|
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
|
||
|
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
|
||
|
result = concat(reader)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_data_list(all_parsers):
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(index_col=0)
|
||
|
data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
|
||
|
|
||
|
data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
|
||
|
expected = parser.read_csv(StringIO(data), **kwargs)
|
||
|
|
||
|
with TextParser(data_list, chunksize=2, **kwargs) as parser:
|
||
|
result = parser.read()
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_iterator(all_parsers):
|
||
|
# see gh-6607
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(index_col=0)
|
||
|
|
||
|
expected = parser.read_csv(StringIO(data), **kwargs)
|
||
|
with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
|
||
|
|
||
|
first_chunk = reader.read(3)
|
||
|
tm.assert_frame_equal(first_chunk, expected[:3])
|
||
|
|
||
|
last_chunk = reader.read(5)
|
||
|
tm.assert_frame_equal(last_chunk, expected[3:])
|
||
|
|
||
|
|
||
|
def test_iterator2(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """A,B,C
|
||
|
foo,1,2,3
|
||
|
bar,4,5,6
|
||
|
baz,7,8,9
|
||
|
"""
|
||
|
|
||
|
with parser.read_csv(StringIO(data), iterator=True) as reader:
|
||
|
result = list(reader)
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||
|
index=["foo", "bar", "baz"],
|
||
|
columns=["A", "B", "C"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result[0], expected)
|
||
|
|
||
|
|
||
|
def test_reader_list(all_parsers):
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(index_col=0)
|
||
|
|
||
|
lines = list(csv.reader(StringIO(data)))
|
||
|
with TextParser(lines, chunksize=2, **kwargs) as reader:
|
||
|
chunks = list(reader)
|
||
|
|
||
|
expected = parser.read_csv(StringIO(data), **kwargs)
|
||
|
|
||
|
tm.assert_frame_equal(chunks[0], expected[:2])
|
||
|
tm.assert_frame_equal(chunks[1], expected[2:4])
|
||
|
tm.assert_frame_equal(chunks[2], expected[4:])
|
||
|
|
||
|
|
||
|
def test_reader_list_skiprows(all_parsers):
|
||
|
data = """index,A,B,C,D
|
||
|
foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(index_col=0)
|
||
|
|
||
|
lines = list(csv.reader(StringIO(data)))
|
||
|
with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
|
||
|
chunks = list(reader)
|
||
|
|
||
|
expected = parser.read_csv(StringIO(data), **kwargs)
|
||
|
|
||
|
tm.assert_frame_equal(chunks[0], expected[1:3])
|
||
|
|
||
|
|
||
|
def test_iterator_stop_on_chunksize(all_parsers):
|
||
|
# gh-3967: stopping iteration when chunksize is specified
|
||
|
parser = all_parsers
|
||
|
data = """A,B,C
|
||
|
foo,1,2,3
|
||
|
bar,4,5,6
|
||
|
baz,7,8,9
|
||
|
"""
|
||
|
|
||
|
with parser.read_csv(StringIO(data), chunksize=1) as reader:
|
||
|
result = list(reader)
|
||
|
|
||
|
assert len(result) == 3
|
||
|
expected = DataFrame(
|
||
|
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||
|
index=["foo", "bar", "baz"],
|
||
|
columns=["A", "B", "C"],
|
||
|
)
|
||
|
tm.assert_frame_equal(concat(result), expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)]
|
||
|
)
|
||
|
def test_iterator_skipfooter_errors(all_parsers, kwargs):
|
||
|
msg = "'skipfooter' not supported for iteration"
|
||
|
parser = all_parsers
|
||
|
data = "a\n1\n2"
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
|
||
|
pass
|
||
|
|
||
|
|
||
|
def test_nrows_skipfooter_errors(all_parsers):
|
||
|
msg = "'skipfooter' not supported with 'nrows'"
|
||
|
data = "a\n1\n2\n3\n4\n5\n6"
|
||
|
parser = all_parsers
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(StringIO(data), skipfooter=1, nrows=5)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,kwargs,expected",
|
||
|
[
|
||
|
(
|
||
|
"""foo,2,3,4,5
|
||
|
bar,7,8,9,10
|
||
|
baz,12,13,14,15
|
||
|
qux,12,13,14,15
|
||
|
foo2,12,13,14,15
|
||
|
bar2,12,13,14,15
|
||
|
""",
|
||
|
dict(index_col=0, names=["index", "A", "B", "C", "D"]),
|
||
|
DataFrame(
|
||
|
[
|
||
|
[2, 3, 4, 5],
|
||
|
[7, 8, 9, 10],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
],
|
||
|
index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
),
|
||
|
),
|
||
|
(
|
||
|
"""foo,one,2,3,4,5
|
||
|
foo,two,7,8,9,10
|
||
|
foo,three,12,13,14,15
|
||
|
bar,one,12,13,14,15
|
||
|
bar,two,12,13,14,15
|
||
|
""",
|
||
|
dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]),
|
||
|
DataFrame(
|
||
|
[
|
||
|
[2, 3, 4, 5],
|
||
|
[7, 8, 9, 10],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
],
|
||
|
index=MultiIndex.from_tuples(
|
||
|
[
|
||
|
("foo", "one"),
|
||
|
("foo", "two"),
|
||
|
("foo", "three"),
|
||
|
("bar", "one"),
|
||
|
("bar", "two"),
|
||
|
],
|
||
|
names=["index1", "index2"],
|
||
|
),
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), **kwargs)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
|
||
|
def test_multi_index_no_level_names(all_parsers, index_col):
|
||
|
data = """index1,index2,A,B,C,D
|
||
|
foo,one,2,3,4,5
|
||
|
foo,two,7,8,9,10
|
||
|
foo,three,12,13,14,15
|
||
|
bar,one,12,13,14,15
|
||
|
bar,two,12,13,14,15
|
||
|
"""
|
||
|
headless_data = "\n".join(data.split("\n")[1:])
|
||
|
|
||
|
names = ["A", "B", "C", "D"]
|
||
|
parser = all_parsers
|
||
|
|
||
|
result = parser.read_csv(
|
||
|
StringIO(headless_data), index_col=index_col, header=None, names=names
|
||
|
)
|
||
|
expected = parser.read_csv(StringIO(data), index_col=index_col)
|
||
|
|
||
|
# No index names in headless data.
|
||
|
expected.index.names = [None] * 2
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_multi_index_no_level_names_implicit(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """A,B,C,D
|
||
|
foo,one,2,3,4,5
|
||
|
foo,two,7,8,9,10
|
||
|
foo,three,12,13,14,15
|
||
|
bar,one,12,13,14,15
|
||
|
bar,two,12,13,14,15
|
||
|
"""
|
||
|
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[2, 3, 4, 5],
|
||
|
[7, 8, 9, 10],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
[12, 13, 14, 15],
|
||
|
],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
index=MultiIndex.from_tuples(
|
||
|
[
|
||
|
("foo", "one"),
|
||
|
("foo", "two"),
|
||
|
("foo", "three"),
|
||
|
("bar", "one"),
|
||
|
("bar", "two"),
|
||
|
]
|
||
|
),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,expected,header",
|
||
|
[
|
||
|
("a,b", DataFrame(columns=["a", "b"]), [0]),
|
||
|
(
|
||
|
"a,b\nc,d",
|
||
|
DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
|
||
|
[0, 1],
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize("round_trip", [True, False])
|
||
|
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
|
||
|
# see gh-14545
|
||
|
parser = all_parsers
|
||
|
data = expected.to_csv(index=False) if round_trip else data
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), header=header)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_no_unnamed_index(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """ id c0 c1 c2
|
||
|
0 1 0 a b
|
||
|
1 2 0 c d
|
||
|
2 2 2 e f
|
||
|
"""
|
||
|
result = parser.read_csv(StringIO(data), sep=" ")
|
||
|
expected = DataFrame(
|
||
|
[[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
|
||
|
columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_csv_parse_simple_list(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """foo
|
||
|
bar baz
|
||
|
qux foo
|
||
|
foo
|
||
|
bar"""
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), header=None)
|
||
|
expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@tm.network
|
||
|
def test_url(all_parsers, csv_dir_path):
|
||
|
# TODO: FTP testing
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(sep="\t")
|
||
|
|
||
|
url = (
|
||
|
"https://raw.github.com/pandas-dev/pandas/master/"
|
||
|
"pandas/tests/io/parser/data/salaries.csv"
|
||
|
)
|
||
|
url_result = parser.read_csv(url, **kwargs)
|
||
|
|
||
|
local_path = os.path.join(csv_dir_path, "salaries.csv")
|
||
|
local_result = parser.read_csv(local_path, **kwargs)
|
||
|
tm.assert_frame_equal(url_result, local_result)
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
|
||
|
def test_local_file(all_parsers, csv_dir_path):
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(sep="\t")
|
||
|
|
||
|
local_path = os.path.join(csv_dir_path, "salaries.csv")
|
||
|
local_result = parser.read_csv(local_path, **kwargs)
|
||
|
url = "file://localhost/" + local_path
|
||
|
|
||
|
try:
|
||
|
url_result = parser.read_csv(url, **kwargs)
|
||
|
tm.assert_frame_equal(url_result, local_result)
|
||
|
except URLError:
|
||
|
# Fails on some systems.
|
||
|
pytest.skip("Failing on: " + " ".join(platform.uname()))
|
||
|
|
||
|
|
||
|
def test_path_path_lib(all_parsers):
|
||
|
parser = all_parsers
|
||
|
df = tm.makeDataFrame()
|
||
|
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
|
||
|
tm.assert_frame_equal(df, result)
|
||
|
|
||
|
|
||
|
def test_path_local_path(all_parsers):
|
||
|
parser = all_parsers
|
||
|
df = tm.makeDataFrame()
|
||
|
result = tm.round_trip_localpath(
|
||
|
df.to_csv, lambda p: parser.read_csv(p, index_col=0)
|
||
|
)
|
||
|
tm.assert_frame_equal(df, result)
|
||
|
|
||
|
|
||
|
def test_nonexistent_path(all_parsers):
|
||
|
# gh-2428: pls no segfault
|
||
|
# gh-14086: raise more helpful FileNotFoundError
|
||
|
# GH#29233 "File foo" instead of "File b'foo'"
|
||
|
parser = all_parsers
|
||
|
path = f"{tm.rands(10)}.csv"
|
||
|
|
||
|
msg = r"\[Errno 2\]"
|
||
|
with pytest.raises(FileNotFoundError, match=msg) as e:
|
||
|
parser.read_csv(path)
|
||
|
assert path == e.value.filename
|
||
|
|
||
|
|
||
|
@td.skip_if_windows # os.chmod does not work in windows
|
||
|
def test_no_permission(all_parsers):
|
||
|
# GH 23784
|
||
|
parser = all_parsers
|
||
|
|
||
|
msg = r"\[Errno 13\]"
|
||
|
with tm.ensure_clean() as path:
|
||
|
os.chmod(path, 0) # make file unreadable
|
||
|
|
||
|
# verify that this process cannot open the file (not running as sudo)
|
||
|
try:
|
||
|
with open(path):
|
||
|
pass
|
||
|
pytest.skip("Running as sudo.")
|
||
|
except PermissionError:
|
||
|
pass
|
||
|
|
||
|
with pytest.raises(PermissionError, match=msg) as e:
|
||
|
parser.read_csv(path)
|
||
|
assert path == e.value.filename
|
||
|
|
||
|
|
||
|
def test_missing_trailing_delimiters(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """A,B,C,D
|
||
|
1,2,3,4
|
||
|
1,3,3,
|
||
|
1,4,5"""
|
||
|
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
expected = DataFrame(
|
||
|
[[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_skip_initial_space(all_parsers):
|
||
|
data = (
|
||
|
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
|
||
|
"1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
|
||
|
"314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
|
||
|
"70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
|
||
|
"0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
|
||
|
"-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
|
||
|
)
|
||
|
parser = all_parsers
|
||
|
|
||
|
result = parser.read_csv(
|
||
|
StringIO(data),
|
||
|
names=list(range(33)),
|
||
|
header=None,
|
||
|
na_values=["-9999.0"],
|
||
|
skipinitialspace=True,
|
||
|
)
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[
|
||
|
"09-Apr-2012",
|
||
|
"01:10:18.300",
|
||
|
2456026.548822908,
|
||
|
12849,
|
||
|
1.00361,
|
||
|
1.12551,
|
||
|
330.65659,
|
||
|
355626618.16711,
|
||
|
73.48821,
|
||
|
314.11625,
|
||
|
1917.09447,
|
||
|
179.71425,
|
||
|
80.0,
|
||
|
240.0,
|
||
|
-350,
|
||
|
70.06056,
|
||
|
344.9837,
|
||
|
1,
|
||
|
1,
|
||
|
-0.689265,
|
||
|
-0.692787,
|
||
|
0.212036,
|
||
|
14.7674,
|
||
|
41.605,
|
||
|
np.nan,
|
||
|
np.nan,
|
||
|
np.nan,
|
||
|
np.nan,
|
||
|
np.nan,
|
||
|
np.nan,
|
||
|
0,
|
||
|
12,
|
||
|
128,
|
||
|
]
|
||
|
]
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_trailing_delimiters(all_parsers):
|
||
|
# see gh-2442
|
||
|
data = """A,B,C
|
||
|
1,2,3,
|
||
|
4,5,6,
|
||
|
7,8,9,"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), index_col=False)
|
||
|
|
||
|
expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_escapechar(all_parsers):
|
||
|
# https://stackoverflow.com/questions/13824840/feature-request-for-
|
||
|
# pandas-read-csv
|
||
|
data = '''SEARCH_TERM,ACTUAL_URL
|
||
|
"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
|
||
|
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
|
||
|
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa
|
||
|
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(
|
||
|
StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
|
||
|
)
|
||
|
|
||
|
assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'
|
||
|
|
||
|
tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
|
||
|
|
||
|
|
||
|
def test_int64_min_issues(all_parsers):
|
||
|
# see gh-2599
|
||
|
parser = all_parsers
|
||
|
data = "A,B\n0,0\n0,"
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
|
||
|
expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_parse_integers_above_fp_precision(all_parsers):
|
||
|
data = """Numbers
|
||
|
17007000002000191
|
||
|
17007000002000191
|
||
|
17007000002000191
|
||
|
17007000002000191
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000194"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"Numbers": [
|
||
|
17007000002000191,
|
||
|
17007000002000191,
|
||
|
17007000002000191,
|
||
|
17007000002000191,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000194,
|
||
|
]
|
||
|
}
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_chunks_have_consistent_numerical_type(all_parsers):
|
||
|
parser = all_parsers
|
||
|
integers = [str(i) for i in range(499999)]
|
||
|
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
|
||
|
|
||
|
# Coercions should work without warnings.
|
||
|
with tm.assert_produces_warning(None):
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
|
||
|
assert type(result.a[0]) is np.float64
|
||
|
assert result.a.dtype == float
|
||
|
|
||
|
|
||
|
def test_warn_if_chunks_have_mismatched_type(all_parsers):
|
||
|
warning_type = None
|
||
|
parser = all_parsers
|
||
|
integers = [str(i) for i in range(499999)]
|
||
|
data = "a\n" + "\n".join(integers + ["a", "b"] + integers)
|
||
|
|
||
|
# see gh-3866: if chunks are different types and can't
|
||
|
# be coerced using numerical types, then issue warning.
|
||
|
if parser.engine == "c" and parser.low_memory:
|
||
|
warning_type = DtypeWarning
|
||
|
|
||
|
with tm.assert_produces_warning(warning_type):
|
||
|
df = parser.read_csv(StringIO(data))
|
||
|
assert df.a.dtype == object
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sep", [" ", r"\s+"])
|
||
|
def test_integer_overflow_bug(all_parsers, sep):
|
||
|
# see gh-2601
|
||
|
data = "65248E10 11\n55555E55 22\n"
|
||
|
parser = all_parsers
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), header=None, sep=sep)
|
||
|
expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_catch_too_many_names(all_parsers):
|
||
|
# see gh-5156
|
||
|
data = """\
|
||
|
1,2,3
|
||
|
4,,6
|
||
|
7,8,9
|
||
|
10,11,12\n"""
|
||
|
parser = all_parsers
|
||
|
msg = (
|
||
|
"Too many columns specified: expected 4 and found 3"
|
||
|
if parser.engine == "c"
|
||
|
else "Number of passed names did not match "
|
||
|
"number of header fields in the file"
|
||
|
)
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
|
||
|
|
||
|
|
||
|
def test_ignore_leading_whitespace(all_parsers):
|
||
|
# see gh-3374, gh-6607
|
||
|
parser = all_parsers
|
||
|
data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
|
||
|
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||
|
|
||
|
expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_chunk_begins_with_newline_whitespace(all_parsers):
|
||
|
# see gh-10022
|
||
|
parser = all_parsers
|
||
|
data = "\n hello\nworld\n"
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), header=None)
|
||
|
expected = DataFrame([" hello", "world"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_empty_with_index(all_parsers):
|
||
|
# see gh-10184
|
||
|
data = "x,y"
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), index_col=0)
|
||
|
|
||
|
expected = DataFrame(columns=["y"], index=Index([], name="x"))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_empty_with_multi_index(all_parsers):
|
||
|
# see gh-10467
|
||
|
data = "x,y,z"
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), index_col=["x", "y"])
|
||
|
|
||
|
expected = DataFrame(
|
||
|
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_empty_with_reversed_multi_index(all_parsers):
|
||
|
data = "x,y,z"
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), index_col=[1, 0])
|
||
|
|
||
|
expected = DataFrame(
|
||
|
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_float_parser(all_parsers):
|
||
|
# see gh-9565
|
||
|
parser = all_parsers
|
||
|
data = "45e-1,4.5,45.,inf,-inf"
|
||
|
result = parser.read_csv(StringIO(data), header=None)
|
||
|
|
||
|
expected = DataFrame([[float(s) for s in data.split(",")]])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_scientific_no_exponent(all_parsers_all_precisions):
|
||
|
# see gh-12215
|
||
|
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
|
||
|
data = df.to_csv(index=False)
|
||
|
parser, precision = all_parsers_all_precisions
|
||
|
|
||
|
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
|
||
|
tm.assert_frame_equal(df_roundtrip, df)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
|
||
|
def test_int64_overflow(all_parsers, conv):
|
||
|
data = """ID
|
||
|
00013007854817840016671868
|
||
|
00013007854817840016749251
|
||
|
00013007854817840016754630
|
||
|
00013007854817840016781876
|
||
|
00013007854817840017028824
|
||
|
00013007854817840017963235
|
||
|
00013007854817840018860166"""
|
||
|
parser = all_parsers
|
||
|
|
||
|
if conv is None:
|
||
|
# 13007854817840016671868 > UINT64_MAX, so this
|
||
|
# will overflow and return object as the dtype.
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
"00013007854817840016671868",
|
||
|
"00013007854817840016749251",
|
||
|
"00013007854817840016754630",
|
||
|
"00013007854817840016781876",
|
||
|
"00013007854817840017028824",
|
||
|
"00013007854817840017963235",
|
||
|
"00013007854817840018860166",
|
||
|
],
|
||
|
columns=["ID"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
else:
|
||
|
# 13007854817840016671868 > UINT64_MAX, so attempts
|
||
|
# to cast to either int64 or uint64 will result in
|
||
|
# an OverflowError being raised.
|
||
|
msg = (
|
||
|
"(Python int too large to convert to C long)|"
|
||
|
"(long too big to convert)|"
|
||
|
"(int too big to convert)"
|
||
|
)
|
||
|
|
||
|
with pytest.raises(OverflowError, match=msg):
|
||
|
parser.read_csv(StringIO(data), converters={"ID": conv})
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
|
||
|
)
|
||
|
def test_int64_uint64_range(all_parsers, val):
|
||
|
# These numbers fall right inside the int64-uint64
|
||
|
# range, so they should be parsed as string.
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(str(val)), header=None)
|
||
|
|
||
|
expected = DataFrame([val])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
|
||
|
)
|
||
|
def test_outside_int64_uint64_range(all_parsers, val):
|
||
|
# These numbers fall just outside the int64-uint64
|
||
|
# range, so they should be parsed as string.
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(str(val)), header=None)
|
||
|
|
||
|
expected = DataFrame([str(val)])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]])
|
||
|
def test_numeric_range_too_wide(all_parsers, exp_data):
|
||
|
# No numerical dtype can hold both negative and uint64
|
||
|
# values, so they should be cast as string.
|
||
|
parser = all_parsers
|
||
|
data = "\n".join(exp_data)
|
||
|
expected = DataFrame(exp_data)
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), header=None)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
|
||
|
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
|
||
|
# GH#38753
|
||
|
parser, precision = all_parsers_all_precisions
|
||
|
data = f"data\n10E{neg_exp}"
|
||
|
result = parser.read_csv(StringIO(data), float_precision=precision)
|
||
|
expected = DataFrame({"data": [0.0]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
|
||
|
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
|
||
|
# GH#38753
|
||
|
parser, precision = all_parsers_all_precisions
|
||
|
data = f"data\n10E{exp}"
|
||
|
result = parser.read_csv(StringIO(data), float_precision=precision)
|
||
|
if precision == "round_trip":
|
||
|
if exp == 999999999999999999 and is_platform_linux():
|
||
|
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
|
||
|
request.node.add_marker(mark)
|
||
|
|
||
|
value = np.inf if exp > 0 else 0.0
|
||
|
expected = DataFrame({"data": [value]})
|
||
|
else:
|
||
|
expected = DataFrame({"data": [f"10E{exp}"]})
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("iterator", [True, False])
|
||
|
def test_empty_with_nrows_chunksize(all_parsers, iterator):
|
||
|
# see gh-9535
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame(columns=["foo", "bar"])
|
||
|
|
||
|
nrows = 10
|
||
|
data = StringIO("foo,bar\n")
|
||
|
|
||
|
if iterator:
|
||
|
with parser.read_csv(data, chunksize=nrows) as reader:
|
||
|
result = next(iter(reader))
|
||
|
else:
|
||
|
result = parser.read_csv(data, nrows=nrows)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,kwargs,expected,msg",
|
||
|
[
|
||
|
# gh-10728: WHITESPACE_LINE
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n ",
|
||
|
dict(),
|
||
|
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# gh-10548: EAT_LINE_COMMENT
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n#comment",
|
||
|
dict(comment="#"),
|
||
|
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# EAT_CRNL_NOP
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n\r",
|
||
|
dict(),
|
||
|
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# EAT_COMMENT
|
||
|
(
|
||
|
"a,b,c\n4,5,6#comment",
|
||
|
dict(comment="#"),
|
||
|
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# SKIP_LINE
|
||
|
(
|
||
|
"a,b,c\n4,5,6\nskipme",
|
||
|
dict(skiprows=[2]),
|
||
|
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# EAT_LINE_COMMENT
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n#comment",
|
||
|
dict(comment="#", skip_blank_lines=False),
|
||
|
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# IN_FIELD
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n ",
|
||
|
dict(skip_blank_lines=False),
|
||
|
DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# EAT_CRNL
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n\r",
|
||
|
dict(skip_blank_lines=False),
|
||
|
DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
|
||
|
None,
|
||
|
),
|
||
|
# ESCAPED_CHAR
|
||
|
(
|
||
|
"a,b,c\n4,5,6\n\\",
|
||
|
dict(escapechar="\\"),
|
||
|
None,
|
||
|
"(EOF following escape character)|(unexpected end of data)",
|
||
|
),
|
||
|
# ESCAPE_IN_QUOTED_FIELD
|
||
|
(
|
||
|
'a,b,c\n4,5,6\n"\\',
|
||
|
dict(escapechar="\\"),
|
||
|
None,
|
||
|
"(EOF inside string starting at row 2)|(unexpected end of data)",
|
||
|
),
|
||
|
# IN_QUOTED_FIELD
|
||
|
(
|
||
|
'a,b,c\n4,5,6\n"',
|
||
|
dict(escapechar="\\"),
|
||
|
None,
|
||
|
"(EOF inside string starting at row 2)|(unexpected end of data)",
|
||
|
),
|
||
|
],
|
||
|
ids=[
|
||
|
"whitespace-line",
|
||
|
"eat-line-comment",
|
||
|
"eat-crnl-nop",
|
||
|
"eat-comment",
|
||
|
"skip-line",
|
||
|
"eat-line-comment",
|
||
|
"in-field",
|
||
|
"eat-crnl",
|
||
|
"escaped-char",
|
||
|
"escape-in-quoted-field",
|
||
|
"in-quoted-field",
|
||
|
],
|
||
|
)
|
||
|
def test_eof_states(all_parsers, data, kwargs, expected, msg):
|
||
|
# see gh-10728, gh-10548
|
||
|
parser = all_parsers
|
||
|
|
||
|
if expected is None:
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
parser.read_csv(StringIO(data), **kwargs)
|
||
|
else:
|
||
|
result = parser.read_csv(StringIO(data), **kwargs)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
|
||
|
def test_uneven_lines_with_usecols(all_parsers, usecols):
|
||
|
# see gh-12203
|
||
|
parser = all_parsers
|
||
|
data = r"""a,b,c
|
||
|
0,1,2
|
||
|
3,4,5,6,7
|
||
|
8,9,10"""
|
||
|
|
||
|
if usecols is None:
|
||
|
# Make sure that an error is still raised
|
||
|
# when the "usecols" parameter is not provided.
|
||
|
msg = r"Expected \d+ fields in line \d+, saw \d+"
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
parser.read_csv(StringIO(data))
|
||
|
else:
|
||
|
expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,kwargs,expected",
|
||
|
[
|
||
|
# First, check to see that the response of parser when faced with no
|
||
|
# provided columns raises the correct error, with or without usecols.
|
||
|
("", dict(), None),
|
||
|
("", dict(usecols=["X"]), None),
|
||
|
(
|
||
|
",,",
|
||
|
dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]),
|
||
|
DataFrame(columns=["X"], index=[0], dtype=np.float64),
|
||
|
),
|
||
|
(
|
||
|
"",
|
||
|
dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]),
|
||
|
DataFrame(columns=["X"]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
|
||
|
# see gh-12493
|
||
|
parser = all_parsers
|
||
|
|
||
|
if expected is None:
|
||
|
msg = "No columns to parse from file"
|
||
|
with pytest.raises(EmptyDataError, match=msg):
|
||
|
parser.read_csv(StringIO(data), **kwargs)
|
||
|
else:
|
||
|
result = parser.read_csv(StringIO(data), **kwargs)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"kwargs,expected",
|
||
|
[
|
||
|
# gh-8661, gh-8679: this should ignore six lines, including
|
||
|
# lines with trailing whitespace and blank lines.
|
||
|
(
|
||
|
dict(
|
||
|
header=None,
|
||
|
delim_whitespace=True,
|
||
|
skiprows=[0, 1, 2, 3, 5, 6],
|
||
|
skip_blank_lines=True,
|
||
|
),
|
||
|
DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
|
||
|
),
|
||
|
# gh-8983: test skipping set of rows after a row with trailing spaces.
|
||
|
(
|
||
|
dict(
|
||
|
delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True
|
||
|
),
|
||
|
DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_trailing_spaces(all_parsers, kwargs, expected):
|
||
|
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa
|
||
|
parser = all_parsers
|
||
|
|
||
|
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_raise_on_sep_with_delim_whitespace(all_parsers):
|
||
|
# see gh-6607
|
||
|
data = "a b c\n1 2 3"
|
||
|
parser = all_parsers
|
||
|
|
||
|
with pytest.raises(ValueError, match="you can only specify one"):
|
||
|
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("delim_whitespace", [True, False])
|
||
|
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
|
||
|
# see gh-9710
|
||
|
parser = all_parsers
|
||
|
data = """\
|
||
|
MyColumn
|
||
|
a
|
||
|
b
|
||
|
a
|
||
|
b\n"""
|
||
|
|
||
|
expected = DataFrame({"MyColumn": list("abab")})
|
||
|
result = parser.read_csv(
|
||
|
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"sep,skip_blank_lines,exp_data",
|
||
|
[
|
||
|
(",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
|
||
|
(r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
|
||
|
(
|
||
|
",",
|
||
|
False,
|
||
|
[
|
||
|
[1.0, 2.0, 4.0],
|
||
|
[np.nan, np.nan, np.nan],
|
||
|
[np.nan, np.nan, np.nan],
|
||
|
[5.0, np.nan, 10.0],
|
||
|
[np.nan, np.nan, np.nan],
|
||
|
[-70.0, 0.4, 1.0],
|
||
|
],
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
|
||
|
parser = all_parsers
|
||
|
data = """\
|
||
|
A,B,C
|
||
|
1,2.,4.
|
||
|
|
||
|
|
||
|
5.,NaN,10.0
|
||
|
|
||
|
-70,.4,1
|
||
|
"""
|
||
|
|
||
|
if sep == r"\s+":
|
||
|
data = data.replace(",", " ")
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
|
||
|
expected = DataFrame(exp_data, columns=["A", "B", "C"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_whitespace_lines(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """
|
||
|
|
||
|
\t \t\t
|
||
|
\t
|
||
|
A,B,C
|
||
|
\t 1,2.,4.
|
||
|
5.,NaN,10.0
|
||
|
"""
|
||
|
expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,expected",
|
||
|
[
|
||
|
(
|
||
|
""" A B C D
|
||
|
a 1 2 3 4
|
||
|
b 1 2 3 4
|
||
|
c 1 2 3 4
|
||
|
""",
|
||
|
DataFrame(
|
||
|
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
|
||
|
columns=["A", "B", "C", "D"],
|
||
|
index=["a", "b", "c"],
|
||
|
),
|
||
|
),
|
||
|
(
|
||
|
" a b c\n1 2 3 \n4 5 6\n 7 8 9",
|
||
|
DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_whitespace_regex_separator(all_parsers, data, expected):
|
||
|
# see gh-6607
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_verbose_read(all_parsers, capsys):
|
||
|
parser = all_parsers
|
||
|
data = """a,b,c,d
|
||
|
one,1,2,3
|
||
|
one,1,2,3
|
||
|
,1,2,3
|
||
|
one,1,2,3
|
||
|
,1,2,3
|
||
|
,1,2,3
|
||
|
one,1,2,3
|
||
|
two,1,2,3"""
|
||
|
|
||
|
# Engines are verbose in different ways.
|
||
|
parser.read_csv(StringIO(data), verbose=True)
|
||
|
captured = capsys.readouterr()
|
||
|
|
||
|
if parser.engine == "c":
|
||
|
assert "Tokenization took:" in captured.out
|
||
|
assert "Parser memory cleanup took:" in captured.out
|
||
|
else: # Python engine
|
||
|
assert captured.out == "Filled 3 NA values in column a\n"
|
||
|
|
||
|
|
||
|
def test_verbose_read2(all_parsers, capsys):
|
||
|
parser = all_parsers
|
||
|
data = """a,b,c,d
|
||
|
one,1,2,3
|
||
|
two,1,2,3
|
||
|
three,1,2,3
|
||
|
four,1,2,3
|
||
|
five,1,2,3
|
||
|
,1,2,3
|
||
|
seven,1,2,3
|
||
|
eight,1,2,3"""
|
||
|
|
||
|
parser.read_csv(StringIO(data), verbose=True, index_col=0)
|
||
|
captured = capsys.readouterr()
|
||
|
|
||
|
# Engines are verbose in different ways.
|
||
|
if parser.engine == "c":
|
||
|
assert "Tokenization took:" in captured.out
|
||
|
assert "Parser memory cleanup took:" in captured.out
|
||
|
else: # Python engine
|
||
|
assert captured.out == "Filled 1 NA values in column a\n"
|
||
|
|
||
|
|
||
|
def test_iteration_open_handle(all_parsers):
|
||
|
parser = all_parsers
|
||
|
kwargs = dict(squeeze=True, header=None)
|
||
|
|
||
|
with tm.ensure_clean() as path:
|
||
|
with open(path, "w") as f:
|
||
|
f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
|
||
|
|
||
|
with open(path) as f:
|
||
|
for line in f:
|
||
|
if "CCC" in line:
|
||
|
break
|
||
|
|
||
|
result = parser.read_csv(f, **kwargs)
|
||
|
expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"data,thousands,decimal",
|
||
|
[
|
||
|
(
|
||
|
"""A|B|C
|
||
|
1|2,334.01|5
|
||
|
10|13|10.
|
||
|
""",
|
||
|
",",
|
||
|
".",
|
||
|
),
|
||
|
(
|
||
|
"""A|B|C
|
||
|
1|2.334,01|5
|
||
|
10|13|10,
|
||
|
""",
|
||
|
".",
|
||
|
",",
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
|
||
|
|
||
|
result = parser.read_csv(
|
||
|
StringIO(data), sep="|", thousands=thousands, decimal=decimal
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_euro_decimal_format(all_parsers):
|
||
|
parser = all_parsers
|
||
|
data = """Id;Number1;Number2;Text1;Text2;Number3
|
||
|
1;1521,1541;187101,9543;ABC;poi;4,738797819
|
||
|
2;121,12;14897,76;DEF;uyt;0,377320872
|
||
|
3;878,158;108013,434;GHI;rez;2,735694704"""
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), sep=";", decimal=",")
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
|
||
|
[2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
|
||
|
[3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
|
||
|
],
|
||
|
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("na_filter", [True, False])
|
||
|
def test_inf_parsing(all_parsers, na_filter):
|
||
|
parser = all_parsers
|
||
|
data = """\
|
||
|
,A
|
||
|
a,inf
|
||
|
b,-inf
|
||
|
c,+Inf
|
||
|
d,-Inf
|
||
|
e,INF
|
||
|
f,-INF
|
||
|
g,+INf
|
||
|
h,-INf
|
||
|
i,inF
|
||
|
j,-inF"""
|
||
|
expected = DataFrame(
|
||
|
{"A": [float("inf"), float("-inf")] * 5},
|
||
|
index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
|
||
|
)
|
||
|
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("na_filter", [True, False])
|
||
|
def test_infinity_parsing(all_parsers, na_filter):
|
||
|
parser = all_parsers
|
||
|
data = """\
|
||
|
,A
|
||
|
a,Infinity
|
||
|
b,-Infinity
|
||
|
c,+Infinity
|
||
|
"""
|
||
|
expected = DataFrame(
|
||
|
{"A": [float("infinity"), float("-infinity"), float("+infinity")]},
|
||
|
index=["a", "b", "c"],
|
||
|
)
|
||
|
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
|
||
|
def test_raise_on_no_columns(all_parsers, nrows):
|
||
|
parser = all_parsers
|
||
|
data = "\n" * nrows
|
||
|
|
||
|
msg = "No columns to parse from file"
|
||
|
with pytest.raises(EmptyDataError, match=msg):
|
||
|
parser.read_csv(StringIO(data))
|
||
|
|
||
|
|
||
|
@td.check_file_leaks
|
||
|
def test_memory_map(all_parsers, csv_dir_path):
|
||
|
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
|
||
|
parser = all_parsers
|
||
|
|
||
|
expected = DataFrame(
|
||
|
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
|
||
|
)
|
||
|
|
||
|
result = parser.read_csv(mmap_file, memory_map=True)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_null_byte_char(all_parsers):
|
||
|
# see gh-2741
|
||
|
data = "\x00,foo"
|
||
|
names = ["a", "b"]
|
||
|
parser = all_parsers
|
||
|
|
||
|
if parser.engine == "c":
|
||
|
expected = DataFrame([[np.nan, "foo"]], columns=names)
|
||
|
out = parser.read_csv(StringIO(data), names=names)
|
||
|
tm.assert_frame_equal(out, expected)
|
||
|
else:
|
||
|
msg = "NULL byte detected"
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
parser.read_csv(StringIO(data), names=names)
|
||
|
|
||
|
|
||
|
def test_temporary_file(all_parsers):
|
||
|
# see gh-13398
|
||
|
parser = all_parsers
|
||
|
data = "0 0"
|
||
|
|
||
|
with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
|
||
|
new_file.write(data)
|
||
|
new_file.flush()
|
||
|
new_file.seek(0)
|
||
|
|
||
|
result = parser.read_csv(new_file, sep=r"\s+", header=None)
|
||
|
|
||
|
expected = DataFrame([[0, 0]])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_internal_eof_byte(all_parsers):
|
||
|
# see gh-5500
|
||
|
parser = all_parsers
|
||
|
data = "a,b\n1\x1a,2"
|
||
|
|
||
|
expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_internal_eof_byte_to_file(all_parsers):
|
||
|
# see gh-16559
|
||
|
parser = all_parsers
|
||
|
data = b'c1,c2\r\n"test \x1a test", test\r\n'
|
||
|
expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
|
||
|
path = f"__{tm.rands(10)}__.csv"
|
||
|
|
||
|
with tm.ensure_clean(path) as path:
|
||
|
with open(path, "wb") as f:
|
||
|
f.write(data)
|
||
|
|
||
|
result = parser.read_csv(path)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_sub_character(all_parsers, csv_dir_path):
|
||
|
# see gh-16893
|
||
|
filename = os.path.join(csv_dir_path, "sub_char.csv")
|
||
|
expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
|
||
|
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(filename)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_file_handle_string_io(all_parsers):
|
||
|
# gh-14418
|
||
|
#
|
||
|
# Don't close user provided file handles.
|
||
|
parser = all_parsers
|
||
|
data = "a,b\n1,2"
|
||
|
|
||
|
fh = StringIO(data)
|
||
|
parser.read_csv(fh)
|
||
|
assert not fh.closed
|
||
|
|
||
|
|
||
|
def test_file_handles_with_open(all_parsers, csv1):
|
||
|
# gh-14418
|
||
|
#
|
||
|
# Don't close user provided file handles.
|
||
|
parser = all_parsers
|
||
|
|
||
|
for mode in ["r", "rb"]:
|
||
|
with open(csv1, mode) as f:
|
||
|
parser.read_csv(f)
|
||
|
assert not f.closed
|
||
|
|
||
|
|
||
|
def test_invalid_file_buffer_class(all_parsers):
|
||
|
# see gh-15337
|
||
|
class InvalidBuffer:
|
||
|
pass
|
||
|
|
||
|
parser = all_parsers
|
||
|
msg = "Invalid file path or buffer object type"
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(InvalidBuffer())
|
||
|
|
||
|
|
||
|
def test_invalid_file_buffer_mock(all_parsers):
|
||
|
# see gh-15337
|
||
|
parser = all_parsers
|
||
|
msg = "Invalid file path or buffer object type"
|
||
|
|
||
|
class Foo:
|
||
|
pass
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(Foo())
|
||
|
|
||
|
|
||
|
def test_valid_file_buffer_seems_invalid(all_parsers):
|
||
|
# gh-16135: we want to ensure that "tell" and "seek"
|
||
|
# aren't actually being used when we call `read_csv`
|
||
|
#
|
||
|
# Thus, while the object may look "invalid" (these
|
||
|
# methods are attributes of the `StringIO` class),
|
||
|
# it is still a valid file-object for our purposes.
|
||
|
class NoSeekTellBuffer(StringIO):
|
||
|
def tell(self):
|
||
|
raise AttributeError("No tell method")
|
||
|
|
||
|
def seek(self, pos, whence=0):
|
||
|
raise AttributeError("No seek method")
|
||
|
|
||
|
data = "a\n1"
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame({"a": [1]})
|
||
|
|
||
|
result = parser.read_csv(NoSeekTellBuffer(data))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"kwargs",
|
||
|
[dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in.
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"warn_kwargs", [dict(), dict(warn_bad_lines=True), dict(warn_bad_lines=False)]
|
||
|
)
|
||
|
def test_error_bad_lines(all_parsers, kwargs, warn_kwargs):
|
||
|
# see gh-15925
|
||
|
parser = all_parsers
|
||
|
kwargs.update(**warn_kwargs)
|
||
|
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||
|
|
||
|
msg = "Expected 1 fields in line 3, saw 3"
|
||
|
with pytest.raises(ParserError, match=msg):
|
||
|
parser.read_csv(StringIO(data), **kwargs)
|
||
|
|
||
|
|
||
|
def test_warn_bad_lines(all_parsers, capsys):
|
||
|
# see gh-15925
|
||
|
parser = all_parsers
|
||
|
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||
|
expected = DataFrame({"a": [1, 4]})
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
captured = capsys.readouterr()
|
||
|
assert "Skipping line 3" in captured.err
|
||
|
assert "Skipping line 5" in captured.err
|
||
|
|
||
|
|
||
|
def test_suppress_error_output(all_parsers, capsys):
|
||
|
# see gh-15925
|
||
|
parser = all_parsers
|
||
|
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||
|
expected = DataFrame({"a": [1, 4]})
|
||
|
|
||
|
result = parser.read_csv(
|
||
|
StringIO(data), error_bad_lines=False, warn_bad_lines=False
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
captured = capsys.readouterr()
|
||
|
assert captured.err == ""
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
|
||
|
def test_filename_with_special_chars(all_parsers, filename):
|
||
|
# see gh-15086.
|
||
|
parser = all_parsers
|
||
|
df = DataFrame({"a": [1, 2, 3]})
|
||
|
|
||
|
with tm.ensure_clean(filename) as path:
|
||
|
df.to_csv(path, index=False)
|
||
|
|
||
|
result = parser.read_csv(path)
|
||
|
tm.assert_frame_equal(result, df)
|
||
|
|
||
|
|
||
|
def test_read_csv_memory_growth_chunksize(all_parsers):
|
||
|
# see gh-24805
|
||
|
#
|
||
|
# Let's just make sure that we don't crash
|
||
|
# as we iteratively process all chunks.
|
||
|
parser = all_parsers
|
||
|
|
||
|
with tm.ensure_clean() as path:
|
||
|
with open(path, "w") as f:
|
||
|
for i in range(1000):
|
||
|
f.write(str(i) + "\n")
|
||
|
|
||
|
with parser.read_csv(path, chunksize=20) as result:
|
||
|
for _ in result:
|
||
|
pass
|
||
|
|
||
|
|
||
|
def test_read_csv_raises_on_header_prefix(all_parsers):
|
||
|
# gh-27394
|
||
|
parser = all_parsers
|
||
|
msg = "Argument prefix must be None if argument header is not None"
|
||
|
|
||
|
s = StringIO("0,1\n2,3")
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(s, header=0, prefix="_X")
|
||
|
|
||
|
|
||
|
def test_unexpected_keyword_parameter_exception(all_parsers):
|
||
|
# GH-34976
|
||
|
parser = all_parsers
|
||
|
|
||
|
msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
|
||
|
with pytest.raises(TypeError, match=msg.format("read_csv")):
|
||
|
parser.read_csv("foo.csv", foo=1)
|
||
|
with pytest.raises(TypeError, match=msg.format("read_table")):
|
||
|
parser.read_table("foo.tsv", foo=1)
|
||
|
|
||
|
|
||
|
def test_read_table_same_signature_as_read_csv(all_parsers):
|
||
|
# GH-34976
|
||
|
parser = all_parsers
|
||
|
|
||
|
table_sign = signature(parser.read_table)
|
||
|
csv_sign = signature(parser.read_csv)
|
||
|
|
||
|
assert table_sign.parameters.keys() == csv_sign.parameters.keys()
|
||
|
assert table_sign.return_annotation == csv_sign.return_annotation
|
||
|
|
||
|
for key, csv_param in csv_sign.parameters.items():
|
||
|
table_param = table_sign.parameters[key]
|
||
|
if key == "sep":
|
||
|
assert csv_param.default == ","
|
||
|
assert table_param.default == "\t"
|
||
|
assert table_param.annotation == csv_param.annotation
|
||
|
assert table_param.kind == csv_param.kind
|
||
|
continue
|
||
|
else:
|
||
|
assert table_param == csv_param
|
||
|
|
||
|
|
||
|
def test_read_table_equivalency_to_read_csv(all_parsers):
|
||
|
# see gh-21948
|
||
|
# As of 0.25.0, read_table is undeprecated
|
||
|
parser = all_parsers
|
||
|
data = "a\tb\n1\t2\n3\t4"
|
||
|
expected = parser.read_csv(StringIO(data), sep="\t")
|
||
|
result = parser.read_table(StringIO(data))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_first_row_bom(all_parsers):
|
||
|
# see gh-26545
|
||
|
parser = all_parsers
|
||
|
data = '''\ufeff"Head1" "Head2" "Head3"'''
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), delimiter="\t")
|
||
|
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_first_row_bom_unquoted(all_parsers):
|
||
|
# see gh-36343
|
||
|
parser = all_parsers
|
||
|
data = """\ufeffHead1 Head2 Head3"""
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), delimiter="\t")
|
||
|
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_integer_precision(all_parsers):
|
||
|
# Gh 7072
|
||
|
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
|
||
|
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(s), header=None)[4]
|
||
|
expected = Series([4321583677327450765, 4321113141090630389], name=4)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_file_descriptor_leak(all_parsers):
|
||
|
# GH 31488
|
||
|
|
||
|
parser = all_parsers
|
||
|
with tm.ensure_clean() as path:
|
||
|
|
||
|
def test():
|
||
|
with pytest.raises(EmptyDataError, match="No columns to parse from file"):
|
||
|
parser.read_csv(path)
|
||
|
|
||
|
td.check_file_leaks(test)()
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("nrows", range(1, 6))
|
||
|
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
|
||
|
# GH 28071
|
||
|
ref = DataFrame(
|
||
|
[[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
|
||
|
columns=list("ab"),
|
||
|
)
|
||
|
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
|
||
|
parser = all_parsers
|
||
|
df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
|
||
|
tm.assert_frame_equal(df, ref[:nrows])
|
||
|
|
||
|
|
||
|
def test_no_header_two_extra_columns(all_parsers):
|
||
|
# GH 26218
|
||
|
column_names = ["one", "two", "three"]
|
||
|
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
|
||
|
stream = StringIO("foo,bar,baz,bam,blah")
|
||
|
parser = all_parsers
|
||
|
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
|
||
|
tm.assert_frame_equal(df, ref)
|
||
|
|
||
|
|
||
|
def test_read_csv_names_not_accepting_sets(all_parsers):
|
||
|
# GH 34946
|
||
|
data = """\
|
||
|
1,2,3
|
||
|
4,5,6\n"""
|
||
|
parser = all_parsers
|
||
|
with pytest.raises(ValueError, match="Names should be an ordered collection."):
|
||
|
parser.read_csv(StringIO(data), names=set("QAZ"))
|
||
|
|
||
|
|
||
|
def test_read_csv_with_use_inf_as_na(all_parsers):
|
||
|
# https://github.com/pandas-dev/pandas/issues/35493
|
||
|
parser = all_parsers
|
||
|
data = "1.0\nNaN\n3.0"
|
||
|
with option_context("use_inf_as_na", True):
|
||
|
result = parser.read_csv(StringIO(data), header=None)
|
||
|
expected = DataFrame([1.0, np.nan, 3.0])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_table_delim_whitespace_default_sep(all_parsers):
|
||
|
# GH: 35958
|
||
|
f = StringIO("a b c\n1 -2 -3\n4 5 6")
|
||
|
parser = all_parsers
|
||
|
result = parser.read_table(f, delim_whitespace=True)
|
||
|
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("delimiter", [",", "\t"])
|
||
|
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
|
||
|
# GH: 35958
|
||
|
f = StringIO("a b c\n1 -2 -3\n4 5 6")
|
||
|
parser = all_parsers
|
||
|
msg = (
|
||
|
"Specified a delimiter with both sep and "
|
||
|
"delim_whitespace=True; you can only specify one."
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("delimiter", [",", "\t"])
|
||
|
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
|
||
|
# GH: 35958
|
||
|
f = StringIO("a b c\n1 -2 -3\n4 5 6")
|
||
|
parser = all_parsers
|
||
|
msg = (
|
||
|
"Specified a delimiter with both sep and "
|
||
|
"delim_whitespace=True; you can only specify one."
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_table(f, delim_whitespace=True, sep=delimiter)
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
|
||
|
|
||
|
|
||
|
def test_dict_keys_as_names(all_parsers):
|
||
|
# GH: 36928
|
||
|
data = "1,2"
|
||
|
|
||
|
keys = {"a": int, "b": int}.keys()
|
||
|
parser = all_parsers
|
||
|
|
||
|
result = parser.read_csv(StringIO(data), names=keys)
|
||
|
expected = DataFrame({"a": [1], "b": [2]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
|
||
|
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||
|
def test_read_csv_file_handle(all_parsers, io_class, encoding):
|
||
|
"""
|
||
|
Test whether read_csv does not close user-provided file handles.
|
||
|
|
||
|
GH 36980
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame({"a": [1], "b": [2]})
|
||
|
|
||
|
content = "a,b\n1,2"
|
||
|
if io_class == BytesIO:
|
||
|
content = content.encode("utf-8")
|
||
|
handle = io_class(content)
|
||
|
|
||
|
tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected)
|
||
|
assert not handle.closed
|
||
|
|
||
|
|
||
|
def test_memory_map_file_handle_silent_fallback(all_parsers, compression):
|
||
|
"""
|
||
|
Do not fail for buffers with memory_map=True (cannot memory map BytesIO).
|
||
|
|
||
|
GH 37621
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame({"a": [1], "b": [2]})
|
||
|
|
||
|
handle = BytesIO()
|
||
|
expected.to_csv(handle, index=False, compression=compression, mode="wb")
|
||
|
handle.seek(0)
|
||
|
|
||
|
tm.assert_frame_equal(
|
||
|
parser.read_csv(handle, memory_map=True, compression=compression),
|
||
|
expected,
|
||
|
)
|
||
|
|
||
|
|
||
|
def test_memory_map_compression(all_parsers, compression):
|
||
|
"""
|
||
|
Support memory map for compressed files.
|
||
|
|
||
|
GH 37621
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
expected = DataFrame({"a": [1], "b": [2]})
|
||
|
|
||
|
with tm.ensure_clean() as path:
|
||
|
expected.to_csv(path, index=False, compression=compression)
|
||
|
|
||
|
tm.assert_frame_equal(
|
||
|
parser.read_csv(path, memory_map=True, compression=compression),
|
||
|
expected,
|
||
|
)
|
||
|
|
||
|
|
||
|
def test_context_manager(all_parsers, datapath):
|
||
|
# make sure that opened files are closed
|
||
|
parser = all_parsers
|
||
|
|
||
|
path = datapath("io", "data", "csv", "iris.csv")
|
||
|
|
||
|
reader = parser.read_csv(path, chunksize=1)
|
||
|
assert not reader._engine.handles.handle.closed
|
||
|
try:
|
||
|
with reader:
|
||
|
next(reader)
|
||
|
assert False
|
||
|
except AssertionError:
|
||
|
assert reader._engine.handles.handle.closed
|
||
|
|
||
|
|
||
|
def test_context_manageri_user_provided(all_parsers, datapath):
|
||
|
# make sure that user-provided handles are not closed
|
||
|
parser = all_parsers
|
||
|
|
||
|
with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path:
|
||
|
|
||
|
reader = parser.read_csv(path, chunksize=1)
|
||
|
assert not reader._engine.handles.handle.closed
|
||
|
try:
|
||
|
with reader:
|
||
|
next(reader)
|
||
|
assert False
|
||
|
except AssertionError:
|
||
|
assert not reader._engine.handles.handle.closed
|
||
|
|
||
|
|
||
|
@td.check_file_leaks
|
||
|
def test_open_file(all_parsers):
|
||
|
# GH 39024
|
||
|
parser = all_parsers
|
||
|
if parser.engine == "c":
|
||
|
pytest.skip()
|
||
|
|
||
|
with tm.ensure_clean() as path:
|
||
|
file = Path(path)
|
||
|
file.write_bytes(b"\xe4\na\n1")
|
||
|
|
||
|
# should not trigger a ResourceWarning
|
||
|
warnings.simplefilter("always", category=ResourceWarning)
|
||
|
with warnings.catch_warnings(record=True) as record:
|
||
|
with pytest.raises(csv.Error, match="Could not determine delimiter"):
|
||
|
parser.read_csv(file, sep=None)
|
||
|
assert len(record) == 0, record[0].message
|