projektAI/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_csv.py

1341 lines
46 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
import csv
from io import StringIO
import os
import numpy as np
import pytest
from pandas.errors import ParserError
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
read_csv,
to_datetime,
)
import pandas._testing as tm
import pandas.core.common as com
from pandas.io.common import get_handle
MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
MIXED_INT_DTYPES = [
"uint8",
"uint16",
"uint32",
"uint64",
"int8",
"int16",
"int32",
"int64",
]
class TestDataFrameToCSV:
def read_csv(self, path, **kwargs):
params = {"index_col": 0, "parse_dates": True}
params.update(**kwargs)
return pd.read_csv(path, **params)
def test_to_csv_from_csv1(self, float_frame, datetime_frame):
with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path:
float_frame["A"][:5] = np.nan
float_frame.to_csv(path)
float_frame.to_csv(path, columns=["A", "B"])
float_frame.to_csv(path, header=False)
float_frame.to_csv(path, index=False)
# test roundtrip
# freq does not roundtrip
datetime_frame.index = datetime_frame.index._with_freq(None)
datetime_frame.to_csv(path)
recons = self.read_csv(path)
tm.assert_frame_equal(datetime_frame, recons)
datetime_frame.to_csv(path, index_label="index")
recons = self.read_csv(path, index_col=None)
assert len(recons.columns) == len(datetime_frame.columns) + 1
# no index
datetime_frame.to_csv(path, index=False)
recons = self.read_csv(path, index_col=None)
tm.assert_almost_equal(datetime_frame.values, recons.values)
# corner case
dm = DataFrame(
{
"s1": Series(range(3), index=np.arange(3)),
"s2": Series(range(2), index=np.arange(2)),
}
)
dm.to_csv(path)
recons = self.read_csv(path)
tm.assert_frame_equal(dm, recons)
def test_to_csv_from_csv2(self, float_frame):
with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path:
# duplicate index
df = DataFrame(
np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"]
)
df.to_csv(path)
result = self.read_csv(path)
tm.assert_frame_equal(result, df)
midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"])
df.to_csv(path)
result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False)
tm.assert_frame_equal(result, df, check_names=False)
# column aliases
col_aliases = Index(["AA", "X", "Y", "Z"])
float_frame.to_csv(path, header=col_aliases)
rs = self.read_csv(path)
xp = float_frame.copy()
xp.columns = col_aliases
tm.assert_frame_equal(xp, rs)
msg = "Writing 4 cols but got 2 aliases"
with pytest.raises(ValueError, match=msg):
float_frame.to_csv(path, header=["AA", "X"])
def test_to_csv_from_csv3(self):
with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path:
df1 = DataFrame(np.random.randn(3, 1))
df2 = DataFrame(np.random.randn(3, 1))
df1.to_csv(path)
df2.to_csv(path, mode="a", header=False)
xp = pd.concat([df1, df2])
rs = pd.read_csv(path, index_col=0)
rs.columns = [int(label) for label in rs.columns]
xp.columns = [int(label) for label in xp.columns]
tm.assert_frame_equal(xp, rs)
def test_to_csv_from_csv4(self):
with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path:
# GH 10833 (TimedeltaIndex formatting)
dt = pd.Timedelta(seconds=1)
df = DataFrame(
{"dt_data": [i * dt for i in range(3)]},
index=Index([i * dt for i in range(3)], name="dt_index"),
)
df.to_csv(path)
result = pd.read_csv(path, index_col="dt_index")
result.index = pd.to_timedelta(result.index)
# TODO: remove renaming when GH 10875 is solved
result.index = result.index.rename("dt_index")
result["dt_data"] = pd.to_timedelta(result["dt_data"])
tm.assert_frame_equal(df, result, check_index_type=True)
def test_to_csv_from_csv5(self, timezone_frame):
# tz, 8260
with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path:
timezone_frame.to_csv(path)
result = pd.read_csv(path, index_col=0, parse_dates=["A"])
converter = (
lambda c: to_datetime(result[c])
.dt.tz_convert("UTC")
.dt.tz_convert(timezone_frame[c].dt.tz)
)
result["B"] = converter("B")
result["C"] = converter("C")
tm.assert_frame_equal(result, timezone_frame)
def test_to_csv_cols_reordering(self):
# GH3454
import pandas as pd
chunksize = 5
N = int(chunksize * 2.5)
df = tm.makeCustomDataframe(N, 3)
cs = df.columns
cols = [cs[2], cs[0]]
with tm.ensure_clean() as path:
df.to_csv(path, columns=cols, chunksize=chunksize)
rs_c = pd.read_csv(path, index_col=0)
tm.assert_frame_equal(df[cols], rs_c, check_names=False)
def test_to_csv_new_dupe_cols(self):
import pandas as pd
def _check_df(df, cols=None):
with tm.ensure_clean() as path:
df.to_csv(path, columns=cols, chunksize=chunksize)
rs_c = pd.read_csv(path, index_col=0)
# we wrote them in a different order
# so compare them in that order
if cols is not None:
if df.columns.is_unique:
rs_c.columns = cols
else:
indexer, missing = df.columns.get_indexer_non_unique(cols)
rs_c.columns = df.columns.take(indexer)
for c in cols:
obj_df = df[c]
obj_rs = rs_c[c]
if isinstance(obj_df, Series):
tm.assert_series_equal(obj_df, obj_rs)
else:
tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
# wrote in the same order
else:
rs_c.columns = df.columns
tm.assert_frame_equal(df, rs_c, check_names=False)
chunksize = 5
N = int(chunksize * 2.5)
# dupe cols
df = tm.makeCustomDataframe(N, 3)
df.columns = ["a", "a", "b"]
_check_df(df, None)
# dupe cols with selection
cols = ["b", "a"]
_check_df(df, cols)
@pytest.mark.slow
def test_to_csv_dtnat(self):
# GH3437
from pandas import NaT
def make_dtnat_arr(n, nnat=None):
if nnat is None:
nnat = int(n * 0.1) # 10%
s = list(date_range("2000", freq="5min", periods=n))
if nnat:
for i in np.random.randint(0, len(s), nnat):
s[i] = NaT
i = np.random.randint(100)
s[-i] = NaT
s[i] = NaT
return s
chunksize = 1000
# N=35000
s1 = make_dtnat_arr(chunksize + 5)
s2 = make_dtnat_arr(chunksize + 5, 0)
# s3=make_dtnjat_arr(chunksize+5,0)
with tm.ensure_clean("1.csv") as pth:
df = DataFrame({"a": s1, "b": s2})
df.to_csv(pth, chunksize=chunksize)
recons = self.read_csv(pth).apply(to_datetime)
tm.assert_frame_equal(df, recons, check_names=False)
@pytest.mark.slow
def test_to_csv_moar(self):
def _do_test(
df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
):
kwargs = {"parse_dates": False}
if cnlvl:
if rnlvl is not None:
kwargs["index_col"] = list(range(rnlvl))
kwargs["header"] = list(range(cnlvl))
with tm.ensure_clean("__tmp_to_csv_moar__") as path:
df.to_csv(path, encoding="utf8", chunksize=chunksize)
recons = self.read_csv(path, **kwargs)
else:
kwargs["header"] = 0
with tm.ensure_clean("__tmp_to_csv_moar__") as path:
df.to_csv(path, encoding="utf8", chunksize=chunksize)
recons = self.read_csv(path, **kwargs)
def _to_uni(x):
if not isinstance(x, str):
return x.decode("utf8")
return x
if dupe_col:
# read_Csv disambiguates the columns by
# labeling them dupe.1,dupe.2, etc'. monkey patch columns
recons.columns = df.columns
if rnlvl and not cnlvl:
delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
recons.index = ix
recons = recons.iloc[:, rnlvl - 1 :]
type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}
if r_dtype:
if r_dtype == "u": # unicode
r_dtype = "O"
recons.index = np.array(
[_to_uni(label) for label in recons.index], dtype=r_dtype
)
df.index = np.array(
[_to_uni(label) for label in df.index], dtype=r_dtype
)
elif r_dtype == "dt": # unicode
r_dtype = "O"
recons.index = np.array(
[Timestamp(label) for label in recons.index], dtype=r_dtype
)
df.index = np.array(
[Timestamp(label) for label in df.index], dtype=r_dtype
)
elif r_dtype == "p":
r_dtype = "O"
idx_list = to_datetime(recons.index)
recons.index = np.array(
[Timestamp(label) for label in idx_list], dtype=r_dtype
)
df.index = np.array(
list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype
)
else:
r_dtype = type_map.get(r_dtype)
recons.index = np.array(recons.index, dtype=r_dtype)
df.index = np.array(df.index, dtype=r_dtype)
if c_dtype:
if c_dtype == "u":
c_dtype = "O"
recons.columns = np.array(
[_to_uni(label) for label in recons.columns], dtype=c_dtype
)
df.columns = np.array(
[_to_uni(label) for label in df.columns], dtype=c_dtype
)
elif c_dtype == "dt":
c_dtype = "O"
recons.columns = np.array(
[Timestamp(label) for label in recons.columns], dtype=c_dtype
)
df.columns = np.array(
[Timestamp(label) for label in df.columns], dtype=c_dtype
)
elif c_dtype == "p":
c_dtype = "O"
col_list = to_datetime(recons.columns)
recons.columns = np.array(
[Timestamp(label) for label in col_list], dtype=c_dtype
)
col_list = df.columns.to_timestamp()
df.columns = np.array(
[Timestamp(label) for label in col_list], dtype=c_dtype
)
else:
c_dtype = type_map.get(c_dtype)
recons.columns = np.array(recons.columns, dtype=c_dtype)
df.columns = np.array(df.columns, dtype=c_dtype)
tm.assert_frame_equal(df, recons, check_names=False)
N = 100
chunksize = 1000
ncols = 4
base = chunksize // ncols
for nrows in [
2,
10,
N - 1,
N,
N + 1,
N + 2,
2 * N - 2,
2 * N - 1,
2 * N,
2 * N + 1,
2 * N + 2,
base - 1,
base,
base + 1,
]:
_do_test(
tm.makeCustomDataframe(nrows, ncols, r_idx_type="dt", c_idx_type="s"),
"dt",
"s",
)
for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]:
for ncols in [1, 2, 3, 4]:
base = chunksize // ncols
for nrows in [
2,
10,
N - 1,
N,
N + 1,
N + 2,
2 * N - 2,
2 * N - 1,
2 * N,
2 * N + 1,
2 * N + 2,
base - 1,
base,
base + 1,
]:
_do_test(
tm.makeCustomDataframe(
nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
),
r_idx_type,
c_idx_type,
)
for ncols in [1, 2, 3, 4]:
base = chunksize // ncols
for nrows in [
10,
N - 2,
N - 1,
N,
N + 1,
N + 2,
2 * N - 2,
2 * N - 1,
2 * N,
2 * N + 1,
2 * N + 2,
base - 1,
base,
base + 1,
]:
_do_test(tm.makeCustomDataframe(nrows, ncols))
for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
df = tm.makeCustomDataframe(nrows, 3)
cols = list(df.columns)
cols[:2] = ["dupe", "dupe"]
cols[-2:] = ["dupe", "dupe"]
ix = list(df.index)
ix[:2] = ["rdupe", "rdupe"]
ix[-2:] = ["rdupe", "rdupe"]
df.index = ix
df.columns = cols
_do_test(df, dupe_col=True)
_do_test(DataFrame(index=np.arange(10)))
_do_test(
tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2
)
for ncols in [2, 3, 4]:
base = int(chunksize // ncols)
for nrows in [
10,
N - 2,
N - 1,
N,
N + 1,
N + 2,
2 * N - 2,
2 * N - 1,
2 * N,
2 * N + 1,
2 * N + 2,
base - 1,
base,
base + 1,
]:
_do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
_do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
_do_test(
tm.makeCustomDataframe(
nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2
),
rnlvl=2,
cnlvl=2,
)
def test_to_csv_from_csv_w_some_infs(self, float_frame):
# test roundtrip with inf, -inf, nan, as full columns and mix
float_frame["G"] = np.nan
f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5]
float_frame["H"] = float_frame.index.map(f)
with tm.ensure_clean() as path:
float_frame.to_csv(path)
recons = self.read_csv(path)
# TODO to_csv drops column name
tm.assert_frame_equal(float_frame, recons, check_names=False)
tm.assert_frame_equal(
np.isinf(float_frame), np.isinf(recons), check_names=False
)
def test_to_csv_from_csv_w_all_infs(self, float_frame):
# test roundtrip with inf, -inf, nan, as full columns and mix
float_frame["E"] = np.inf
float_frame["F"] = -np.inf
with tm.ensure_clean() as path:
float_frame.to_csv(path)
recons = self.read_csv(path)
# TODO to_csv drops column name
tm.assert_frame_equal(float_frame, recons, check_names=False)
tm.assert_frame_equal(
np.isinf(float_frame), np.isinf(recons), check_names=False
)
def test_to_csv_no_index(self):
# GH 3624, after appending columns, to_csv fails
with tm.ensure_clean("__tmp_to_csv_no_index__") as path:
df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]})
df.to_csv(path, index=False)
result = read_csv(path)
tm.assert_frame_equal(df, result)
df["c3"] = Series([7, 8, 9], dtype="int64")
df.to_csv(path, index=False)
result = read_csv(path)
tm.assert_frame_equal(df, result)
def test_to_csv_with_mix_columns(self):
# gh-11637: incorrect output when a mix of integer and string column
# names passed as columns parameter in to_csv
df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]})
df["test"] = "txt"
assert df.to_csv() == df.to_csv(columns=[0, 1, "test"])
def test_to_csv_headers(self):
# GH6186, the presence or absence of `index` incorrectly
# causes to_csv to have different header semantics.
from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"])
with tm.ensure_clean("__tmp_to_csv_headers__") as path:
from_df.to_csv(path, header=["X", "Y"])
recons = self.read_csv(path)
tm.assert_frame_equal(to_df, recons)
from_df.to_csv(path, index=False, header=["X", "Y"])
recons = self.read_csv(path)
return_value = recons.reset_index(inplace=True)
assert return_value is None
tm.assert_frame_equal(to_df, recons)
def test_to_csv_multiindex(self, float_frame, datetime_frame):
frame = float_frame
old_index = frame.index
arrays = np.arange(len(old_index) * 2).reshape(2, -1)
new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
frame.index = new_index
with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
frame.to_csv(path, header=False)
frame.to_csv(path, columns=["A", "B"])
# round trip
frame.to_csv(path)
df = self.read_csv(path, index_col=[0, 1], parse_dates=False)
# TODO to_csv drops column name
tm.assert_frame_equal(frame, df, check_names=False)
assert frame.index.names == df.index.names
# needed if setUp becomes a class method
float_frame.index = old_index
# try multiindex with dates
tsframe = datetime_frame
old_index = tsframe.index
new_index = [old_index, np.arange(len(old_index))]
tsframe.index = MultiIndex.from_arrays(new_index)
tsframe.to_csv(path, index_label=["time", "foo"])
recons = self.read_csv(path, index_col=[0, 1])
# TODO to_csv drops column name
tm.assert_frame_equal(tsframe, recons, check_names=False)
# do not load index
tsframe.to_csv(path)
recons = self.read_csv(path, index_col=None)
assert len(recons.columns) == len(tsframe.columns) + 2
# no index
tsframe.to_csv(path, index=False)
recons = self.read_csv(path, index_col=None)
tm.assert_almost_equal(recons.values, datetime_frame.values)
# needed if setUp becomes class method
datetime_frame.index = old_index
with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
# GH3571, GH1651, GH3141
def _make_frame(names=None):
if names is True:
names = ["first", "second"]
return DataFrame(
np.random.randint(0, 10, size=(3, 3)),
columns=MultiIndex.from_tuples(
[("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names
),
dtype="int64",
)
# column & index are multi-index
df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
df.to_csv(path)
result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
tm.assert_frame_equal(df, result)
# column is mi
df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
df.to_csv(path)
result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
tm.assert_frame_equal(df, result)
# dup column names?
df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
df.to_csv(path)
result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
tm.assert_frame_equal(df, result)
# writing with no index
df = _make_frame()
df.to_csv(path, index=False)
result = read_csv(path, header=[0, 1])
tm.assert_frame_equal(df, result)
# we lose the names here
df = _make_frame(True)
df.to_csv(path, index=False)
result = read_csv(path, header=[0, 1])
assert com.all_none(*result.columns.names)
result.columns.names = df.columns.names
tm.assert_frame_equal(df, result)
# whatsnew example
df = _make_frame()
df.to_csv(path)
result = read_csv(path, header=[0, 1], index_col=[0])
tm.assert_frame_equal(df, result)
df = _make_frame(True)
df.to_csv(path)
result = read_csv(path, header=[0, 1], index_col=[0])
tm.assert_frame_equal(df, result)
# invalid options
df = _make_frame(True)
df.to_csv(path)
for i in [6, 7]:
msg = f"len of {i}, but only 5 lines in file"
with pytest.raises(ParserError, match=msg):
read_csv(path, header=list(range(i)), index_col=0)
# write with cols
msg = "cannot specify cols with a MultiIndex"
with pytest.raises(TypeError, match=msg):
df.to_csv(path, columns=["foo", "bar"])
with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
# empty
tsframe[:0].to_csv(path)
recons = self.read_csv(path)
exp = tsframe[:0]
exp.index = []
tm.assert_index_equal(recons.columns, exp.columns)
assert len(recons) == 0
def test_to_csv_interval_index(self):
# GH 28210
df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3))
with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
df.to_csv(path)
result = self.read_csv(path, index_col=0)
# can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
expected = df.copy()
expected.index = expected.index.astype(str)
tm.assert_frame_equal(result, expected)
def test_to_csv_float32_nanrep(self):
df = DataFrame(np.random.randn(1, 4).astype(np.float32))
df[1] = np.nan
with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path:
df.to_csv(path, na_rep=999)
with open(path) as f:
lines = f.readlines()
assert lines[1].split(",")[2] == "999"
def test_to_csv_withcommas(self):
# Commas inside fields should be correctly escaped when saving as CSV.
df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]})
with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path:
df.to_csv(path)
df2 = self.read_csv(path)
tm.assert_frame_equal(df2, df)
def test_to_csv_mixed(self):
def create_cols(name):
return [f"{name}{i:03d}" for i in range(5)]
df_float = DataFrame(
np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
)
df_int = DataFrame(
np.random.randn(100, 5), dtype="int64", columns=create_cols("int")
)
df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
df_object = DataFrame(
"foo", index=df_float.index, columns=create_cols("object")
)
df_dt = DataFrame(
Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
)
# add in some nans
df_float.iloc[30:50, 1:3] = np.nan
# ## this is a bug in read_csv right now ####
# df_dt.loc[30:50,1:3] = np.nan
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
# dtype
dtypes = {}
for n, dtype in [
("float", np.float64),
("int", np.int64),
("bool", np.bool_),
("object", object),
]:
for c in create_cols(n):
dtypes[c] = dtype
with tm.ensure_clean() as filename:
df.to_csv(filename)
rs = read_csv(
filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date")
)
tm.assert_frame_equal(rs, df)
def test_to_csv_dups_cols(self):
df = DataFrame(
np.random.randn(1000, 30),
columns=list(range(15)) + list(range(15)),
dtype="float64",
)
with tm.ensure_clean() as filename:
df.to_csv(filename) # single dtype, fine
result = read_csv(filename, index_col=0)
result.columns = df.columns
tm.assert_frame_equal(result, df)
df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
df_int = DataFrame(np.random.randn(1000, 3), dtype="int64")
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
df = pd.concat(
[df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
)
cols = []
for i in range(5):
cols.extend([0, 1, 2])
df.columns = cols
with tm.ensure_clean() as filename:
df.to_csv(filename)
result = read_csv(filename, index_col=0)
# date cols
for i in ["0.4", "1.4", "2.4"]:
result[i] = to_datetime(result[i])
result.columns = df.columns
tm.assert_frame_equal(result, df)
# GH3457
N = 10
df = tm.makeCustomDataframe(N, 3)
df.columns = ["a", "a", "b"]
with tm.ensure_clean() as filename:
df.to_csv(filename)
# read_csv will rename the dups columns
result = read_csv(filename, index_col=0)
result = result.rename(columns={"a.1": "a"})
tm.assert_frame_equal(result, df)
def test_to_csv_chunking(self):
aa = DataFrame({"A": range(100000)})
aa["B"] = aa.A + 1.0
aa["C"] = aa.A + 2.0
aa["D"] = aa.A + 3.0
for chunksize in [10000, 50000, 100000]:
with tm.ensure_clean() as filename:
aa.to_csv(filename, chunksize=chunksize)
rs = read_csv(filename, index_col=0)
tm.assert_frame_equal(rs, aa)
@pytest.mark.slow
def test_to_csv_wide_frame_formatting(self):
# Issue #8621
df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
with tm.ensure_clean() as filename:
df.to_csv(filename, header=False, index=False)
rs = read_csv(filename, header=None)
tm.assert_frame_equal(rs, df)
def test_to_csv_bug(self):
f1 = StringIO("a,1.0\nb,2.0")
df = self.read_csv(f1, header=None)
newdf = DataFrame({"t": df[df.columns[0]]})
with tm.ensure_clean() as path:
newdf.to_csv(path)
recons = read_csv(path, index_col=0)
# don't check_names as t != 1
tm.assert_frame_equal(recons, newdf, check_names=False)
def test_to_csv_unicode(self):
df = DataFrame({"c/\u03c3": [1, 2, 3]})
with tm.ensure_clean() as path:
df.to_csv(path, encoding="UTF-8")
df2 = read_csv(path, index_col=0, encoding="UTF-8")
tm.assert_frame_equal(df, df2)
df.to_csv(path, encoding="UTF-8", index=False)
df2 = read_csv(path, index_col=None, encoding="UTF-8")
tm.assert_frame_equal(df, df2)
def test_to_csv_unicode_index_col(self):
buf = StringIO("")
df = DataFrame(
[["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]],
columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"],
index=["\u05d0", "\u05d1"],
)
df.to_csv(buf, encoding="UTF-8")
buf.seek(0)
df2 = read_csv(buf, index_col=0, encoding="UTF-8")
tm.assert_frame_equal(df, df2)
def test_to_csv_stringio(self, float_frame):
buf = StringIO()
float_frame.to_csv(buf)
buf.seek(0)
recons = read_csv(buf, index_col=0)
# TODO to_csv drops column name
tm.assert_frame_equal(recons, float_frame, check_names=False)
def test_to_csv_float_format(self):
df = DataFrame(
[[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
index=["A", "B"],
columns=["X", "Y", "Z"],
)
with tm.ensure_clean() as filename:
df.to_csv(filename, float_format="%.2f")
rs = read_csv(filename, index_col=0)
xp = DataFrame(
[[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
index=["A", "B"],
columns=["X", "Y", "Z"],
)
tm.assert_frame_equal(rs, xp)
def test_to_csv_unicodewriter_quoting(self):
df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]})
buf = StringIO()
df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8")
result = buf.getvalue()
expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"']
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
def test_to_csv_quote_none(self):
# GH4328
df = DataFrame({"A": ["hello", '{"hello"}']})
for encoding in (None, "utf-8"):
buf = StringIO()
df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False)
result = buf.getvalue()
expected_rows = ["A", "hello", '{"hello"}']
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
def test_to_csv_index_no_leading_comma(self):
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
buf = StringIO()
df.to_csv(buf, index_label=False)
expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert buf.getvalue() == expected
def test_to_csv_line_terminators(self):
# see gh-20353
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
with tm.ensure_clean() as path:
# case 1: CRLF as line terminator
df.to_csv(path, line_terminator="\r\n")
expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n"
with open(path, mode="rb") as f:
assert f.read() == expected
with tm.ensure_clean() as path:
# case 2: LF as line terminator
df.to_csv(path, line_terminator="\n")
expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n"
with open(path, mode="rb") as f:
assert f.read() == expected
with tm.ensure_clean() as path:
# case 3: The default line terminator(=os.linesep)(gh-21406)
df.to_csv(path)
os_linesep = os.linesep.encode("utf-8")
expected = (
b",A,B"
+ os_linesep
+ b"one,1,4"
+ os_linesep
+ b"two,2,5"
+ os_linesep
+ b"three,3,6"
+ os_linesep
)
with open(path, mode="rb") as f:
assert f.read() == expected
def test_to_csv_from_csv_categorical(self):
# CSV with categoricals should result in the same output
# as when one would add a "normal" Series/DataFrame.
s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
res = StringIO()
s.to_csv(res, header=False)
exp = StringIO()
s2.to_csv(exp, header=False)
assert res.getvalue() == exp.getvalue()
df = DataFrame({"s": s})
df2 = DataFrame({"s": s2})
res = StringIO()
df.to_csv(res)
exp = StringIO()
df2.to_csv(exp)
assert res.getvalue() == exp.getvalue()
def test_to_csv_path_is_none(self, float_frame):
# GH 8215
# Make sure we return string for consistency with
# Series.to_csv()
csv_str = float_frame.to_csv(path_or_buf=None)
assert isinstance(csv_str, str)
recons = pd.read_csv(StringIO(csv_str), index_col=0)
tm.assert_frame_equal(float_frame, recons)
@pytest.mark.parametrize(
"df,encoding",
[
(
DataFrame(
[[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
index=["A", "B"],
columns=["X", "Y", "Z"],
),
None,
),
# GH 21241, 21118
(DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"),
(DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"),
(
DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], columns=["X", "Y", "Z"]),
"cp737",
),
],
)
def test_to_csv_compression(self, df, encoding, compression):
with tm.ensure_clean() as filename:
df.to_csv(filename, compression=compression, encoding=encoding)
# test the round trip - to_csv -> read_csv
result = read_csv(
filename, compression=compression, index_col=0, encoding=encoding
)
tm.assert_frame_equal(df, result)
# test the round trip using file handle - to_csv -> read_csv
with get_handle(
filename, "w", compression=compression, encoding=encoding
) as handles:
df.to_csv(handles.handle, encoding=encoding)
assert not handles.handle.closed
result = pd.read_csv(
filename,
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
tm.assert_frame_equal(df, result)
# explicitly make sure file is compressed
with tm.decompress_file(filename, compression) as fh:
text = fh.read().decode(encoding or "utf8")
for col in df.columns:
assert col in text
with tm.decompress_file(filename, compression) as fh:
tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding))
def test_to_csv_date_format(self, datetime_frame):
with tm.ensure_clean("__tmp_to_csv_date_format__") as path:
dt_index = datetime_frame.index
datetime_frame = DataFrame(
{"A": dt_index, "B": dt_index.shift(1)}, index=dt_index
)
datetime_frame.to_csv(path, date_format="%Y%m%d")
# Check that the data was put in the specified format
test = read_csv(path, index_col=0)
datetime_frame_int = datetime_frame.applymap(
lambda x: int(x.strftime("%Y%m%d"))
)
datetime_frame_int.index = datetime_frame_int.index.map(
lambda x: int(x.strftime("%Y%m%d"))
)
tm.assert_frame_equal(test, datetime_frame_int)
datetime_frame.to_csv(path, date_format="%Y-%m-%d")
# Check that the data was put in the specified format
test = read_csv(path, index_col=0)
datetime_frame_str = datetime_frame.applymap(
lambda x: x.strftime("%Y-%m-%d")
)
datetime_frame_str.index = datetime_frame_str.index.map(
lambda x: x.strftime("%Y-%m-%d")
)
tm.assert_frame_equal(test, datetime_frame_str)
# Check that columns get converted
datetime_frame_columns = datetime_frame.T
datetime_frame_columns.to_csv(path, date_format="%Y%m%d")
test = read_csv(path, index_col=0)
datetime_frame_columns = datetime_frame_columns.applymap(
lambda x: int(x.strftime("%Y%m%d"))
)
# Columns don't get converted to ints by read_csv
datetime_frame_columns.columns = datetime_frame_columns.columns.map(
lambda x: x.strftime("%Y%m%d")
)
tm.assert_frame_equal(test, datetime_frame_columns)
# test NaTs
nat_index = to_datetime(
["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"]
)
nat_frame = DataFrame({"A": nat_index}, index=nat_index)
nat_frame.to_csv(path, date_format="%Y-%m-%d")
test = read_csv(path, parse_dates=[0, 1], index_col=0)
tm.assert_frame_equal(test, nat_frame)
def test_to_csv_with_dst_transitions(self):
with tm.ensure_clean("csv_date_format_with_dst") as path:
# make sure we are not failing on transitions
times = pd.date_range(
"2013-10-26 23:00",
"2013-10-27 01:00",
tz="Europe/London",
freq="H",
ambiguous="infer",
)
for i in [times, times + pd.Timedelta("10s")]:
i = i._with_freq(None) # freq is not preserved by read_csv
time_range = np.array(range(len(i)), dtype="int64")
df = DataFrame({"A": time_range}, index=i)
df.to_csv(path, index=True)
# we have to reconvert the index as we
# don't parse the tz's
result = read_csv(path, index_col=0)
result.index = to_datetime(result.index, utc=True).tz_convert(
"Europe/London"
)
tm.assert_frame_equal(result, df)
# GH11619
idx = pd.date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris")
idx = idx._with_freq(None) # freq does not round-trip
idx._data._freq = None # otherwise there is trouble on unpickle
df = DataFrame({"values": 1, "idx": idx}, index=idx)
with tm.ensure_clean("csv_date_format_with_dst") as path:
df.to_csv(path, index=True)
result = read_csv(path, index_col=0)
result.index = to_datetime(result.index, utc=True).tz_convert(
"Europe/Paris"
)
result["idx"] = to_datetime(result["idx"], utc=True).astype(
"datetime64[ns, Europe/Paris]"
)
tm.assert_frame_equal(result, df)
# assert working
df.astype(str)
with tm.ensure_clean("csv_date_format_with_dst") as path:
df.to_pickle(path)
result = pd.read_pickle(path)
tm.assert_frame_equal(result, df)
def test_to_csv_quoting(self):
df = DataFrame(
{
"c_bool": [True, False],
"c_float": [1.0, 3.2],
"c_int": [42, np.nan],
"c_string": ["a", "b,c"],
}
)
expected_rows = [
",c_bool,c_float,c_int,c_string",
"0,True,1.0,42.0,a",
'1,False,3.2,,"b,c"',
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv()
assert result == expected
result = df.to_csv(quoting=None)
assert result == expected
expected_rows = [
",c_bool,c_float,c_int,c_string",
"0,True,1.0,42.0,a",
'1,False,3.2,,"b,c"',
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
assert result == expected
expected_rows = [
'"","c_bool","c_float","c_int","c_string"',
'"0","True","1.0","42.0","a"',
'"1","False","3.2","","b,c"',
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv(quoting=csv.QUOTE_ALL)
assert result == expected
# see gh-12922, gh-13259: make sure changes to
# the formatters do not break this behaviour
expected_rows = [
'"","c_bool","c_float","c_int","c_string"',
'0,True,1.0,42.0,"a"',
'1,False,3.2,"","b,c"',
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
assert result == expected
msg = "need to escape, but no escapechar set"
with pytest.raises(csv.Error, match=msg):
df.to_csv(quoting=csv.QUOTE_NONE)
with pytest.raises(csv.Error, match=msg):
df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None)
expected_rows = [
",c_bool,c_float,c_int,c_string",
"0,True,1.0,42.0,a",
"1,False,3.2,,b!,c",
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!")
assert result == expected
expected_rows = [
",c_bool,c_ffloat,c_int,c_string",
"0,True,1.0,42.0,a",
"1,False,3.2,,bf,c",
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f")
assert result == expected
# see gh-3503: quoting Windows line terminators
# presents with encoding?
text_rows = ["a,b,c", '1,"test \r\n",3']
text = tm.convert_rows_list_to_csv_str(text_rows)
df = pd.read_csv(StringIO(text))
buf = StringIO()
df.to_csv(buf, encoding="utf-8", index=False)
assert buf.getvalue() == text
# xref gh-7791: make sure the quoting parameter is passed through
# with multi-indexes
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
df = df.set_index(["a", "b"])
expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"']
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
def test_period_index_date_overflow(self):
# see gh-15982
dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
index = pd.PeriodIndex(dates, freq="D")
df = DataFrame([4, 5, 6], index=index)
result = df.to_csv()
expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
date_format = "%m-%d-%Y"
result = df.to_csv(date_format=date_format)
expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
# Overflow with pd.NaT
dates = ["1990-01-01", pd.NaT, "3005-01-01"]
index = pd.PeriodIndex(dates, freq="D")
df = DataFrame([4, 5, 6], index=index)
result = df.to_csv()
expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
def test_multi_index_header(self):
# see gh-5539
columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
df.columns = columns
header = ["a", "b", "c", "d"]
result = df.to_csv(header=header)
expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
def test_to_csv_single_level_multi_index(self):
# see gh-26303
index = Index([(1,), (2,), (3,)])
df = DataFrame([[1, 2, 3]], columns=index)
df = df.reindex(columns=[(1,), (3,)])
expected = ",1,3\n0,1,3\n"
result = df.to_csv(line_terminator="\n")
tm.assert_almost_equal(result, expected)
def test_gz_lineend(self):
# GH 25311
df = DataFrame({"a": [1, 2]})
expected_rows = ["a", "1", "2"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
with tm.ensure_clean("__test_gz_lineend.csv.gz") as path:
df.to_csv(path, index=False)
with tm.decompress_file(path, compression="gzip") as f:
result = f.read().decode("utf-8")
assert result == expected
def test_to_csv_numpy_16_bug(self):
frame = DataFrame({"a": date_range("1/1/2000", periods=10)})
buf = StringIO()
frame.to_csv(buf)
result = buf.getvalue()
assert "2000-01-01" in result