projektAI/venv/Lib/site-packages/pandas/tests/io/parser/test_parse_dates.py

1598 lines
47 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
"""
Tests date parsing functionality for all of the
parsers defined in parsers.py
"""
from datetime import date, datetime
from io import StringIO
from dateutil.parser import parse as du_parse
from hypothesis import given, settings, strategies as st
import numpy as np
import pytest
import pytz
from pandas._libs.tslib import Timestamp
from pandas._libs.tslibs import parsing
from pandas._libs.tslibs.parsing import parse_datetime_string
from pandas.compat import is_platform_windows
from pandas.compat.numpy import np_array_datetime64_compat
import pandas as pd
from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series
import pandas._testing as tm
from pandas.core.indexes.datetimes import date_range
import pandas.io.date_converters as conv
# Fallback datetime passed to dateutil as ``default`` in the hypothesis test.
_DEFAULT_DATETIME = datetime(1, 1, 1)

# Hypothesis strategy for generating datetimes; on Windows the generated
# values are bounded below at 1900-01-01.
date_strategy = (
    st.datetimes(min_value=datetime(1900, 1, 1))
    if is_platform_windows()
    else st.datetimes()
)
def test_separator_date_conflict(all_parsers):
    """Check that a "-" thousands separator does not conflict with date parsing."""
    # Regression test for gh-4678.
    csv_text = "06-02-2013;13:00;1-000.215"
    read_kwargs = {
        "sep": ";",
        "thousands": "-",
        "parse_dates": {"Date": [0, 1]},
        "header": None,
    }

    result = all_parsers.read_csv(StringIO(csv_text), **read_kwargs)

    expected = DataFrame(
        [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2]
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
    """Combine date/time columns into named columns with a custom ``date_parser``."""
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers

    def date_parser(*date_cols):
        """
        Test date parser.

        Parameters
        ----------
        date_cols : args
            The list of data columns to parse.

        Returns
        -------
        parsed : Series
        """
        joined = parsing.concat_date_cols(date_cols)
        return parsing.try_parse_dates(joined)

    result = parser.read_csv(
        StringIO(data),
        header=None,
        date_parser=date_parser,
        prefix="X",
        parse_dates={"actual": [1, 2], "nominal": [1, 3]},
        keep_date_col=keep_date_col,
    )

    rows = [
        [
            datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
            "KORD", "19990127", " 19:00:00", " 18:56:00",
            0.81, 2.81, 7.2, 0.0, 280.0,
        ],
        [
            datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
            "KORD", "19990127", " 20:00:00", " 19:56:00",
            0.01, 2.21, 7.2, 0.0, 260.0,
        ],
        [
            datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
            "KORD", "19990127", " 21:00:00", " 20:56:00",
            -0.59, 2.21, 5.7, 0.0, 280.0,
        ],
        [
            datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
            "KORD", "19990127", " 21:00:00", " 21:18:00",
            -0.99, 2.01, 3.6, 0.0, 270.0,
        ],
        [
            datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
            "KORD", "19990127", " 22:00:00", " 21:56:00",
            -0.59, 1.71, 5.1, 0.0, 290.0,
        ],
        [
            datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
            "KORD", "19990127", " 23:00:00", " 22:56:00",
            -0.59, 1.71, 4.6, 0.0, 280.0,
        ],
    ]
    column_names = ["actual", "nominal"] + [f"X{i}" for i in range(9)]
    expected = DataFrame(rows, columns=column_names)

    if keep_date_col:
        if parser.engine == "python":
            expected["X1"] = expected["X1"].astype(np.int64)
    else:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("container", [list, tuple, Index, Series])
@pytest.mark.parametrize("dim", [1, 2])
def test_concat_date_col_fail(container, dim):
    """``concat_date_cols`` must raise ValueError for non-ndarray columns."""
    expected_msg = "not all elements from date_cols are numpy arrays"
    columns = tuple(container(["19990127"]) for _ in range(dim))

    with pytest.raises(ValueError, match=expected_msg):
        parsing.concat_date_cols(columns)
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
    """Combine columns given as nested lists in ``parse_dates`` (default parser)."""
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data),
        header=None,
        prefix="X",
        parse_dates=[[1, 2], [1, 3]],
        keep_date_col=keep_date_col,
    )

    rows = [
        [
            datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
            "KORD", "19990127", " 19:00:00", " 18:56:00",
            0.81, 2.81, 7.2, 0.0, 280.0,
        ],
        [
            datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
            "KORD", "19990127", " 20:00:00", " 19:56:00",
            0.01, 2.21, 7.2, 0.0, 260.0,
        ],
        [
            datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
            "KORD", "19990127", " 21:00:00", " 20:56:00",
            -0.59, 2.21, 5.7, 0.0, 280.0,
        ],
        [
            datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
            "KORD", "19990127", " 21:00:00", " 21:18:00",
            -0.99, 2.01, 3.6, 0.0, 270.0,
        ],
        [
            datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
            "KORD", "19990127", " 22:00:00", " 21:56:00",
            -0.59, 1.71, 5.1, 0.0, 290.0,
        ],
        [
            datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
            "KORD", "19990127", " 23:00:00", " 22:56:00",
            -0.59, 1.71, 4.6, 0.0, 280.0,
        ],
    ]
    column_names = ["X1_X2", "X1_X3"] + [f"X{i}" for i in range(9)]
    expected = DataFrame(rows, columns=column_names)

    if keep_date_col:
        if parser.engine == "python":
            expected["X1"] = expected["X1"].astype(np.int64)
    else:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)

    tm.assert_frame_equal(result, expected)
def test_date_col_as_index_col(all_parsers):
    """Parse a date column that is also used as the index column."""
    data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
"""
    result = all_parsers.read_csv(
        StringIO(data), header=None, prefix="X", parse_dates=[1], index_col=1
    )

    stamps = [
        datetime(1999, 1, 27, 19, 0),
        datetime(1999, 1, 27, 20, 0),
        datetime(1999, 1, 27, 21, 0),
        datetime(1999, 1, 27, 21, 0),
        datetime(1999, 1, 27, 22, 0),
    ]
    body = [
        ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
    ]
    expected = DataFrame(
        body,
        columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"],
        index=Index(stamps, name="X1"),
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "date_parser, warning",
    ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
)
def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning):
    """Combine an integer-looking date column with time columns via ``date_parser``."""
    data = (
        "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
        "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
        "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
        "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
        "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
        "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
    )
    combine_spec = {"actual": [1, 2], "nominal": [1, 3]}

    with tm.assert_produces_warning(warning, check_stacklevel=False):
        result = all_parsers.read_csv(
            StringIO(data),
            header=None,
            date_parser=date_parser,
            parse_dates=combine_spec,
            prefix="X",
        )

    rows = [
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), "KORD", -0.59],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), "KORD", -0.99],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), "KORD", -0.59],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), "KORD", -0.59],
    ]
    expected = DataFrame(rows, columns=["actual", "nominal", "X0", "X4"])

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
def test_multiple_date_col_timestamp_parse(all_parsers):
    """Use ``Timestamp`` as the ``date_parser`` for a combined date/time column."""
    data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""

    result = all_parsers.read_csv(
        StringIO(data), parse_dates=[[0, 1]], header=None, date_parser=Timestamp
    )

    # The two rows differ only in the value of column 3.
    rows = [
        [Timestamp("05/31/2012, 15:30:00.029"), 1306.25, qty, "E", 0, np.nan, 1306.25]
        for qty in (1, 8)
    ]
    expected = DataFrame(rows, columns=["0_1", 2, 3, 4, 5, 6, 7])
    tm.assert_frame_equal(result, expected)
def test_multiple_date_cols_with_header(all_parsers):
    """Combine two columns into "nominal" when the CSV has a header row."""
    data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    result = all_parsers.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})

    rows = [
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0],
    ]
    column_names = [
        "nominal", "ID", "ActualTime", "TDew", "TAir", "Windspeed", "Precip", "WindDir",
    ]
    expected = DataFrame(rows, columns=column_names)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,parse_dates,msg",
    [
        (
            """\
date_NominalTime,date,NominalTime
KORD1,19990127, 19:00:00
KORD2,19990127, 20:00:00""",
            [[1, 2]],
            "New date column already in dict date_NominalTime",
        ),
        (
            """\
ID,date,nominalTime
KORD,19990127, 19:00:00
KORD,19990127, 20:00:00""",
            {"ID": [1, 2]},
            "Date column ID already in dict",
        ),
    ],
)
def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
    """A combined date column whose name clashes with an existing one must raise."""
    buffer = StringIO(data)
    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(buffer, parse_dates=parse_dates)
def test_date_parser_int_bug(all_parsers):
    """Parse an integer POSIX-timestamp index column with a custom ``date_parser``."""
    # see gh-3071
    from datetime import timezone

    parser = all_parsers
    data = (
        "posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
        "accountid,userid,contactid,level,silo,method\n"
        "1343103150,0.062353,0,4,6,0.01690,3,"
        "12345,1,-1,3,invoice_InvoiceResource,search\n"
    )

    def posix_to_naive_utc(value):
        # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12; build
        # the aware UTC datetime and drop the tzinfo to get the same naive value.
        return datetime.fromtimestamp(int(value), tz=timezone.utc).replace(
            tzinfo=None
        )

    result = parser.read_csv(
        StringIO(data),
        index_col=0,
        parse_dates=[0],
        date_parser=posix_to_naive_utc,
    )
    expected = DataFrame(
        [
            [
                0.062353, 0, 4, 6, 0.01690, 3,
                12345, 1, -1, 3, "invoice_InvoiceResource", "search",
            ]
        ],
        columns=[
            "elapsed", "sys", "user", "queries", "query_time", "rows",
            "accountid", "userid", "contactid", "level", "silo", "method",
        ],
        index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"),
    )
    tm.assert_frame_equal(result, expected)
def test_nat_parse(all_parsers):
    """Round-trip a frame with missing float and datetime values through CSV."""
    # see gh-3062
    frame = DataFrame(
        {"A": np.arange(10, dtype="float64"), "B": Timestamp("20010101")}
    )
    # Blank out rows 3..5 in every column.
    frame.iloc[3:6, :] = np.nan

    with tm.ensure_clean("__nat_parse_.csv") as path:
        frame.to_csv(path)
        result = all_parsers.read_csv(path, index_col=0, parse_dates=["B"])
        tm.assert_frame_equal(result, frame)
def test_csv_custom_parser(all_parsers):
    """Reading with a ``strptime`` date_parser must match ``parse_dates=True``."""
    data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""

    def ymd_parser(text):
        return datetime.strptime(text, "%Y%m%d")

    result = all_parsers.read_csv(StringIO(data), date_parser=ymd_parser)
    expected = all_parsers.read_csv(StringIO(data), parse_dates=True)
    tm.assert_frame_equal(result, expected)
def test_parse_dates_implicit_first_col(all_parsers):
    """``parse_dates=True`` must give the same result with or without ``index_col=0``."""
    data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    implicit = all_parsers.read_csv(StringIO(data), parse_dates=True)
    explicit = all_parsers.read_csv(StringIO(data), index_col=0, parse_dates=True)
    tm.assert_frame_equal(implicit, explicit)
def test_parse_dates_string(all_parsers):
    """Parse a date column referenced by name that is also the index."""
    data = """date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    result = all_parsers.read_csv(
        StringIO(data), index_col="date", parse_dates=["date"]
    )

    # freq doesn't round-trip, so build the expected index without one.
    three_days = list(date_range("1/1/2009", periods=3))
    expected_index = DatetimeIndex(three_days, name="date", freq=None)
    expected = DataFrame(
        {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]},
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
# Bug in https://github.com/dateutil/dateutil/issues/217
# has been addressed, but we just don't pass in the `yearfirst`
@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]])
def test_yy_format_with_year_first(all_parsers, parse_dates):
    """Combine two-digit-year date and time columns (currently expected to fail)."""
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
    result = all_parsers.read_csv(
        StringIO(data), index_col=0, parse_dates=parse_dates
    )

    stamps = [
        datetime(2009, 1, 31, 0, 10, 0),
        datetime(2009, 2, 28, 10, 20, 0),
        datetime(2009, 3, 31, 8, 30, 0),
    ]
    expected_index = DatetimeIndex(stamps, dtype=object, name="date_time")
    expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
def test_parse_dates_column_list(all_parsers, parse_dates):
    """Parse listed date columns (by position or name) with ``dayfirst=True``."""
    data = "a,b,c\n01/01/2010,1,15/02/2010"

    result = all_parsers.read_csv(
        StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True
    )

    flat = DataFrame(
        {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]}
    )
    expected = flat.set_index(["a", "b"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_parse_dates(all_parsers, index_col):
    """Parse dates inside a two-level index, in either level order."""
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    days = (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3))
    labels = ("one", "two", "three")
    expected_index = MultiIndex.from_product(
        [days, labels], names=["index1", "index2"]
    )

    # Out of order.
    if index_col == [1, 0]:
        expected_index = expected_index.swaplevel(0, 1)

    # The same three data rows repeat for each of the three days.
    per_day = [["a", 1, 2], ["b", 3, 4], ["c", 4, 5]]
    expected = DataFrame(per_day * 3, columns=["A", "B", "C"], index=expected_index)

    result = all_parsers.read_csv(
        StringIO(data), index_col=index_col, parse_dates=True
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
    """Forward keyword arguments to dateutil through a custom ``date_parser``.

    The valid ``dayfirst`` keyword must parse day-first dates; the misspelled
    ``day_first`` keyword must surface as a TypeError alongside a FutureWarning.
    """
    parser = all_parsers
    data = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    # Arguments common to both branches.
    shared = {
        "names": ["time", "Q", "NTU"],
        "date_parser": lambda d: du_parse(d, **kwargs),
        "index_col": 0,
        "parse_dates": True,
        "na_values": ["NA"],
    }

    if "dayfirst" in kwargs:
        df = parser.read_csv(StringIO(data), header=0, **shared)
        exp_index = Index(
            [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)],
            name="time",
        )
        expected = DataFrame(
            {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
            index=exp_index,
            columns=["Q", "NTU"],
        )
        tm.assert_frame_equal(df, expected)
    else:
        msg = "got an unexpected keyword argument 'day_first'"
        with pytest.raises(TypeError, match=msg), tm.assert_produces_warning(
            FutureWarning
        ):
            parser.read_csv(StringIO(data), skiprows=[0], **shared)
def test_parse_tz_aware(all_parsers):
    """A trailing "Z" in an index timestamp must yield a UTC-aware index."""
    # See gh-1693
    data = "Date,x\n2012-06-13T01:39:00Z,0.5"
    result = all_parsers.read_csv(StringIO(data), index_col=0, parse_dates=True)

    expected_index = Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date")
    expected = DataFrame({"x": [0.5]}, index=expected_index)

    tm.assert_frame_equal(result, expected)
    assert result.index.tz is pytz.utc
@pytest.mark.parametrize(
    "parse_dates,index_col",
    [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)],
)
def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
    """Use a combined date column as the index, selected by name or position."""
    data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    rows = [
        [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0],
    ]
    column_names = [
        "nominal", "ID", "ActualTime", "TDew", "TAir", "Windspeed", "Precip", "WindDir",
    ]
    expected = DataFrame(rows, columns=column_names).set_index("nominal")

    # When ``parse_dates`` is not a dict, the expected index name is the
    # joined source column names rather than "nominal".
    if not isinstance(parse_dates, dict):
        expected.index.name = "date_NominalTime"

    result = all_parsers.read_csv(
        StringIO(data), parse_dates=parse_dates, index_col=index_col
    )
    tm.assert_frame_equal(result, expected)
def test_multiple_date_cols_chunked(all_parsers):
    """Combined date columns must also work when reading in chunks of two rows."""
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    rows = [
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0],
    ]
    expected = DataFrame(
        rows, columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"]
    ).set_index("nominal")

    with all_parsers.read_csv(
        StringIO(data),
        parse_dates={"nominal": [1, 2]},
        index_col="nominal",
        chunksize=2,
    ) as reader:
        first, second, third = list(reader)[:3]

    tm.assert_frame_equal(first, expected[:2])
    tm.assert_frame_equal(second, expected[2:4])
    tm.assert_frame_equal(third, expected[4:])
def test_multiple_date_col_named_index_compat(all_parsers):
    """Source columns given by position or by name must produce equal frames."""
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

    def read_with(source_columns):
        return all_parsers.read_csv(
            StringIO(data),
            parse_dates={"nominal": source_columns},
            index_col="nominal",
        )

    by_position = read_with([1, 2])
    by_name = read_with(["date", "nominalTime"])
    tm.assert_frame_equal(by_position, by_name)
def test_multiple_date_col_multiple_index_compat(all_parsers):
    """Indexing on a combined date column at read time must equal ``set_index`` later."""
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    index_levels = ["nominal", "ID"]
    combine_spec = {"nominal": [1, 2]}

    result = all_parsers.read_csv(
        StringIO(data), index_col=index_levels, parse_dates=combine_spec
    )

    unindexed = all_parsers.read_csv(StringIO(data), parse_dates=combine_spec)
    expected = unindexed.set_index(index_levels)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}])
def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
    """A non-boolean scalar ``parse_dates`` (here a string) must raise TypeError."""
    # see gh-5636
    data = """A,B,C
1,2,2003-11-1"""
    expected_msg = (
        "Only booleans, lists, and dictionaries "
        "are accepted for the 'parse_dates' parameter"
    )

    with pytest.raises(TypeError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), parse_dates="C", **kwargs)
@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}])
def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
    """Tuple, ndarray and set values for ``parse_dates`` must raise TypeError."""
    parser = all_parsers
    msg = (
        "Only booleans, lists, and dictionaries "
        "are accepted for the 'parse_dates' parameter"
    )
    data = """A,B,C
1,2,2003-11-1"""

    with pytest.raises(TypeError, match=msg):
        # Pass the parametrized value: the previous hard-coded ``(1,)`` meant
        # the ndarray and set cases were never actually exercised.
        parser.read_csv(StringIO(data), parse_dates=parse_dates)
@pytest.mark.parametrize("cache_dates", [True, False])
@pytest.mark.parametrize("value", ["nan", "0", ""])
def test_bad_date_parse(all_parsers, cache_dates, value):
    """Reading many invalid dates must not raise, with or without the date cache."""
    # if we have an invalid date make sure that we handle this with
    # and w/o the cache properly
    repeated_rows = (f"{value},\n") * 50000
    buffer = StringIO(repeated_rows)

    all_parsers.read_csv(
        buffer,
        header=None,
        names=["foo", "bar"],
        parse_dates=["foo"],
        infer_datetime_format=False,
        cache_dates=cache_dates,
    )
def test_parse_dates_empty_string(all_parsers):
    """An empty date field with ``na_filter=False`` must parse to ``NaT``."""
    # see gh-2263
    data = "Date,test\n2012-01-01,1\n,2"
    result = all_parsers.read_csv(
        StringIO(data), parse_dates=["Date"], na_filter=False
    )

    expected_rows = [[datetime(2012, 1, 1), 1], [pd.NaT, 2]]
    expected = DataFrame(expected_rows, columns=["Date", "test"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "a\n04.15.2016",
            {"parse_dates": ["a"]},
            DataFrame([datetime(2016, 4, 15)], columns=["a"]),
        ),
        (
            "a\n04.15.2016",
            {"parse_dates": True, "index_col": 0},
            DataFrame(index=DatetimeIndex(["2016-04-15"], name="a")),
        ),
        (
            "a,b\n04.15.2016,09.16.2013",
            {"parse_dates": ["a", "b"]},
            DataFrame(
                [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"]
            ),
        ),
        (
            "a,b\n04.15.2016,09.16.2013",
            {"parse_dates": True, "index_col": [0, 1]},
            DataFrame(
                index=MultiIndex.from_tuples(
                    [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]
                )
            ),
        ),
    ],
)
def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
    """Dot-delimited dates must still parse when "." is the thousands separator."""
    # see gh-14066
    buffer = StringIO(data)
    result = all_parsers.read_csv(buffer, thousands=".", **kwargs)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "date_parser, warning",
    ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
)
def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warning):
    """Combine date and time columns when the CSV has a two-row header."""
    data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    with tm.assert_produces_warning(warning, check_stacklevel=False):
        result = all_parsers.read_csv(
            StringIO(data),
            header=[0, 1],
            parse_dates={"date_time": [0, 1]},
            date_parser=date_parser,
        )

    expected = DataFrame(
        [
            [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0],
            [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0],
        ],
        columns=["date_time", ("A", "a"), ("B", "b")],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "date_parser, warning",
    ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
)
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """\
date,time,a,b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""",
            {"header": 0, "parse_dates": {"date_time": [0, 1]}},
            DataFrame(
                [
                    [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
                    [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0],
                ],
                columns=["date_time", "a", "b"],
            ),
        ),
        (
            (
                "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
                "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
            ),
            {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}},
            DataFrame(
                [
                    [
                        datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
                        "KORD", 0.81,
                    ],
                    [
                        datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
                        "KORD", 0.01,
                    ],
                    [
                        datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
                        "KORD", -0.59,
                    ],
                    [
                        datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
                        "KORD", -0.99,
                    ],
                    [
                        datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
                        "KORD", -0.59,
                    ],
                    [
                        datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
                        "KORD", -0.59,
                    ],
                ],
                columns=["actual", "nominal", 0, 4],
            ),
        ),
    ],
)
def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warning):
    """Combine date and time columns with each parametrized ``date_parser``."""
    with tm.assert_produces_warning(warning, check_stacklevel=False):
        parsed = all_parsers.read_csv(
            StringIO(data), date_parser=date_parser, **kwargs
        )

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    parsed = parsed[expected.columns]
    tm.assert_frame_equal(parsed, expected)
@pytest.mark.parametrize(
    "date_parser, warning",
    ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]),
)
def test_parse_date_fields(all_parsers, date_parser, warning):
    """Build a single date column from separate year, month and day columns."""
    data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

    with tm.assert_produces_warning(warning, check_stacklevel=False):
        result = all_parsers.read_csv(
            StringIO(data),
            header=0,
            parse_dates={"ymd": [0, 1, 2]},
            date_parser=date_parser,
        )

    expected_rows = [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]]
    expected = DataFrame(expected_rows, columns=["ymd", "a"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "date_parser, warning",
    (
        [conv.parse_all_fields, FutureWarning],
        [lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), None],
    ),
)
def test_parse_date_all_fields(all_parsers, date_parser, warning):
    """Build one datetime column from six separate date and time columns."""
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
    with tm.assert_produces_warning(warning, check_stacklevel=False):
        result = all_parsers.read_csv(
            StringIO(data),
            header=0,
            date_parser=date_parser,
            parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
        )

    # Both rows parse to the same timestamp.
    stamp = datetime(2001, 1, 5, 10, 0, 0)
    expected = DataFrame(
        [[stamp, 0.0, 10.0], [stamp, 1.0, 11.0]],
        columns=["ymdHMS", "a", "b"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "date_parser, warning",
    (
        [conv.parse_all_fields, FutureWarning],
        [lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), None],
    ),
)
def test_datetime_fractional_seconds(all_parsers, date_parser, warning):
    """Fractional seconds must survive combining six date and time columns."""
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0.123456,0.0,10.
2001,01,5,10,0,0.500000,1.,11.
"""
    with tm.assert_produces_warning(warning, check_stacklevel=False):
        result = all_parsers.read_csv(
            StringIO(data),
            header=0,
            date_parser=date_parser,
            parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
        )

    expected_rows = [
        [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0],
        [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0],
    ]
    expected = DataFrame(expected_rows, columns=["ymdHMS", "a", "b"])
    tm.assert_frame_equal(result, expected)
def test_generic(all_parsers):
    """A two-argument scalar ``date_parser`` returning ``date`` objects (warns)."""
    data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = all_parsers.read_csv(
            StringIO(data),
            header=0,
            parse_dates={"ym": [0, 1]},
            date_parser=lambda y, m: date(year=int(y), month=int(m), day=1),
        )

    expected_rows = [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]]
    expected = DataFrame(expected_rows, columns=["ym", "day", "a"])
    tm.assert_frame_equal(result, expected)
def test_date_parser_resolution_if_not_ns(all_parsers):
    """A ``date_parser`` returning ``datetime64[s]`` values must still be accepted."""
    # see gh-10245
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(dt, time):
        iso_text = dt + "T" + time + "Z"
        return np_array_datetime64_compat(iso_text, dtype="datetime64[s]")

    result = all_parsers.read_csv(
        StringIO(data),
        date_parser=date_parser,
        parse_dates={"datetime": ["date", "time"]},
        index_col=["datetime", "prn"],
    )

    datetimes = np_array_datetime64_compat(
        ["2013-11-03T19:00:00Z"] * 3, dtype="datetime64[s]"
    )
    # Pair each timestamp with its "prn" value from the data above.
    index_tuples = list(zip(datetimes, [126, 23, 13]))
    expected = DataFrame(
        data={"rxstatus": ["00E80000"] * 3},
        index=MultiIndex.from_tuples(index_tuples, names=["datetime", "prn"]),
    )
    tm.assert_frame_equal(result, expected)
def test_parse_date_column_with_empty_string(all_parsers):
    """A date column containing a blank (" ") entry is left as strings."""
    # see gh-6428
    data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "
    result = all_parsers.read_csv(StringIO(data), parse_dates=["opdate"])

    expected = DataFrame(
        [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]],
        columns=["case", "opdate"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            "a\n135217135789158401\n1352171357E+5",
            DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"),
        ),
        (
            "a\n99999999999\n123456789012345\n1234E+0",
            DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"),
        ),
    ],
)
@pytest.mark.parametrize("parse_dates", [True, False])
def test_parse_date_float(all_parsers, data, expected, parse_dates):
    """Float-like data must come back unchanged whether or not dates are parsed."""
    # see gh-2697
    #
    # Date parsing should fail, so we leave the data untouched
    # (i.e. float precision should remain unchanged).
    buffer = StringIO(data)
    result = all_parsers.read_csv(buffer, parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
def test_parse_timezone(all_parsers):
    """Timestamps with a +09:00 offset must parse to a fixed-offset tz-aware column."""
    # see gh-22256
    data = """dt,val
2018-01-04 09:01:00+09:00,23350
2018-01-04 09:02:00+09:00,23400
2018-01-04 09:03:00+09:00,23400
2018-01-04 09:04:00+09:00,23400
2018-01-04 09:05:00+09:00,23400"""
    result = all_parsers.read_csv(StringIO(data), parse_dates=["dt"])

    minute_range = pd.date_range(
        start="2018-01-04 09:01:00",
        end="2018-01-04 09:05:00",
        freq="1min",
        tz=pytz.FixedOffset(540),
    )
    # Rebuild from a list so the expected index carries no freq.
    dti = DatetimeIndex(list(minute_range), freq=None)

    expected = DataFrame({"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "date_string",
    ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
)
def test_invalid_parse_delimited_date(all_parsers, date_string):
    """Invalid delimited dates must be returned as the original object strings."""
    result = all_parsers.read_csv(
        StringIO(date_string), header=None, parse_dates=[0]
    )
    unparsed = DataFrame({0: [date_string]}, dtype="object")
    tm.assert_frame_equal(result, unparsed)
@pytest.mark.parametrize(
    "date_string,dayfirst,expected",
    [
        # %d/%m/%Y; month > 12 thus replacement
        ("13/02/2019", False, datetime(2019, 2, 13)),
        ("13/02/2019", True, datetime(2019, 2, 13)),
        # %m/%d/%Y; day > 12 thus there will be no replacement
        ("02/13/2019", False, datetime(2019, 2, 13)),
        ("02/13/2019", True, datetime(2019, 2, 13)),
        # %d/%m/%Y; dayfirst==True thus replacement
        ("04/02/2019", True, datetime(2019, 2, 4)),
    ],
)
def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected):
    """Check day/month resolution of delimited dates for both ``dayfirst`` values."""
    expected_frame = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    parsed = all_parsers.read_csv(
        StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0]
    )
    tm.assert_frame_equal(parsed, expected_frame)
def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
msg, result = None, None
try:
result = call(date_string, **kwargs)
except ValueError as er:
msg = str(er)
pass
return msg, result
@given(date_strategy)
@settings(deadline=None)
@pytest.mark.parametrize("delimiter", list(" -./"))
@pytest.mark.parametrize("dayfirst", [True, False])
@pytest.mark.parametrize(
    "date_format",
    ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"],
)
def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_datetime):
    """Compare ``parse_datetime_string`` against dateutil on generated date strings.

    Both the parsed value and any ``ValueError`` message must agree.
    """
    if date_format == "%m %Y" and delimiter == ".":
        pytest.skip(
            "parse_datetime_string cannot reliably tell whether \
e.g. %m.%Y is a float or a date, thus we skip it"
        )

    # Substitute the chosen delimiter for the spaces in the format.
    delimited_format = date_format.replace(" ", delimiter)
    date_string = test_datetime.strftime(delimited_format)

    pandas_error, pandas_value = _helper_hypothesis_delimited_date(
        parse_datetime_string, date_string, dayfirst=dayfirst
    )
    dateutil_error, dateutil_value = _helper_hypothesis_delimited_date(
        du_parse,
        date_string,
        default=_DEFAULT_DATETIME,
        dayfirst=dayfirst,
        yearfirst=False,
    )

    assert pandas_error == dateutil_error
    assert pandas_value == dateutil_value
@pytest.mark.parametrize(
    "names, usecols, parse_dates, missing_cols",
    [
        (None, ["val"], ["date", "time"], "date, time"),
        (None, ["val"], [0, "time"], "time"),
        (None, ["val"], [["date", "time"]], "date, time"),
        (None, ["val"], [[0, "time"]], "time"),
        (None, ["val"], {"date": [0, "time"]}, "time"),
        (None, ["val"], {"date": ["date", "time"]}, "date, time"),
        (None, ["val"], [["date", "time"], "date"], "date, time"),
        (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
        (
            ["date1", "time1", "temperature"],
            ["date1", "temperature"],
            ["date1", "time"],
            "time",
        ),
    ],
)
def test_missing_parse_dates_column_raises(
    all_parsers, names, usecols, parse_dates, missing_cols
):
    """Naming an unavailable column in ``parse_dates`` must raise ValueError."""
    # gh-31251 column names provided in parse_dates could be missing.
    csv_buffer = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
    expected_msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"

    read_kwargs = {
        "sep": ",",
        "names": names,
        "usecols": usecols,
        "parse_dates": parse_dates,
    }
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(csv_buffer, **read_kwargs)