"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
|
||
|
from io import StringIO
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas import (
|
||
|
DataFrame,
|
||
|
Series,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
# (applied as a decorator; relies on the project-level "pyarrow_skip" fixture)
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||
|
|
||
|
|
||
|
def test_int_conversion(all_parsers):
|
||
|
data = """A,B
|
||
|
1.0,1
|
||
|
2.0,2
|
||
|
3.0,3
|
||
|
"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
|
||
|
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "A,B\nTrue,1\nFalse,2\nTrue,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
            {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
            DataFrame(
                [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
                columns=["A", "B"],
            ),
        ),
        (
            "A,B\nTRUE,1\nFALSE,2\nTRUE,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nfoo,bar\nbar,foo",
            {"true_values": ["foo"], "false_values": ["bar"]},
            DataFrame([[True, False], [False, True]], columns=["A", "B"]),
        ),
    ],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
    # Default True/False tokens, plus user-supplied true_values/false_values
    # lists, should all produce boolean columns.
    parsed = all_parsers.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(parsed, expected)
|
||
|
|
||
|
|
||
|
def test_parse_integers_above_fp_precision(all_parsers):
|
||
|
data = """Numbers
|
||
|
17007000002000191
|
||
|
17007000002000191
|
||
|
17007000002000191
|
||
|
17007000002000191
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000192
|
||
|
17007000002000194"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"Numbers": [
|
||
|
17007000002000191,
|
||
|
17007000002000191,
|
||
|
17007000002000191,
|
||
|
17007000002000191,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000192,
|
||
|
17007000002000194,
|
||
|
]
|
||
|
}
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@skip_pyarrow  # Flaky
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    # see gh-2601: exponent-notation values that overflow int parsing
    # must fall back to floats rather than raising.
    csv_text = "65248E10 11\n55555E55 22\n"
    parsed = all_parsers.read_csv(StringIO(csv_text), header=None, sep=sep)
    tm.assert_frame_equal(parsed, DataFrame([[6.5248e14, 11], [5.5555e59, 22]]))
|
||
|
|
||
|
|
||
|
def test_int64_min_issues(all_parsers):
|
||
|
# see gh-2599
|
||
|
parser = all_parsers
|
||
|
data = "A,B\n0,0\n0,"
|
||
|
result = parser.read_csv(StringIO(data))
|
||
|
|
||
|
expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@skip_pyarrow
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
    # 13007854817840016671868 > UINT64_MAX: without a converter the
    # column must come back as object (strings); with an integer
    # converter the cast must raise OverflowError.
    ids = [
        "00013007854817840016671868",
        "00013007854817840016749251",
        "00013007854817840016754630",
        "00013007854817840016781876",
        "00013007854817840017028824",
        "00013007854817840017963235",
        "00013007854817840018860166",
    ]
    data = "ID\n" + "\n".join(ids)
    parser = all_parsers

    if conv is None:
        result = parser.read_csv(StringIO(data))
        tm.assert_frame_equal(result, DataFrame(ids, columns=["ID"]))
    else:
        # Message text differs across Python/platform versions, so match
        # any of the known variants.
        msg = (
            "(Python int too large to convert to C long)|"
            "(long too big to convert)|"
            "(int too big to convert)"
        )
        with pytest.raises(OverflowError, match=msg):
            parser.read_csv(StringIO(data), converters={"ID": conv})
|
||
|
|
||
|
|
||
|
@skip_pyarrow
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    # Boundary values still representable in int64/uint64 should be
    # parsed numerically (the expected frame holds the number itself).
    parsed = all_parsers.read_csv(StringIO(str(val)), header=None)
    tm.assert_frame_equal(parsed, DataFrame([val]))
|
||
|
|
||
|
|
||
|
@skip_pyarrow
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
    # One step past the int64/uint64 boundaries: no integer dtype fits,
    # so the value must be kept as a string.
    parsed = all_parsers.read_csv(StringIO(str(val)), header=None)
    tm.assert_frame_equal(parsed, DataFrame([str(val)]))
|
||
|
|
||
|
|
||
|
@skip_pyarrow
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    # A column mixing a negative value with one above INT64_MAX fits no
    # single numeric dtype, so it must be parsed as strings — in either
    # order of appearance.
    csv_text = "\n".join(exp_data)
    parsed = all_parsers.read_csv(StringIO(csv_text), header=None)
    tm.assert_frame_equal(parsed, DataFrame(exp_data))
|
||
|
|
||
|
|
||
|
def test_integer_precision(all_parsers):
|
||
|
# Gh 7072
|
||
|
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
|
||
|
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
|
||
|
parser = all_parsers
|
||
|
result = parser.read_csv(StringIO(s), header=None)[4]
|
||
|
expected = Series([4321583677327450765, 4321113141090630389], name=4)
|
||
|
tm.assert_series_equal(result, expected)
|