from datetime import (
    datetime,
    time,
)
from functools import partial
import os
from pathlib import Path
import platform
from urllib.error import URLError
from zipfile import BadZipFile

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
)
import pandas._testing as tm
from pandas.core.arrays import (
    ArrowStringArray,
    StringArray,
)

read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
    # Add any engines to test here
    # When defusedxml is installed it triggers deprecation warnings for
    # xlrd and openpyxl, so catch those here
    pytest.param(
        "xlrd",
        marks=[
            td.skip_if_no("xlrd"),
        ],
    ),
    pytest.param(
        "openpyxl",
        marks=[
            td.skip_if_no("openpyxl"),
        ],
    ),
    pytest.param(
        None,
        marks=[
            td.skip_if_no("xlrd"),
        ],
    ),
    pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
    pytest.param("odf", marks=td.skip_if_no("odf")),
]


def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
    """
    Filter out invalid (engine, ext) pairs instead of skipping, as that
    produces 500+ pytest.skips.
    """
    engine = engine.values[0]
    if engine == "openpyxl" and read_ext == ".xls":
        return False
    if engine == "odf" and read_ext != ".ods":
        return False
    if read_ext == ".ods" and engine != "odf":
        return False
    if engine == "pyxlsb" and read_ext != ".xlsb":
        return False
    if read_ext == ".xlsb" and engine != "pyxlsb":
        return False
    if engine == "xlrd" and read_ext != ".xls":
        return False
    return True


def _transfer_marks(engine, read_ext):
    """
    engine gives us a pytest.param object with some marks, read_ext is just
    a string. We need to generate a new pytest.param inheriting the marks.
    """
    values = engine.values + (read_ext,)
    new_param = pytest.param(values, marks=engine.marks)
    return new_param


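# For illustration: _transfer_marks(engine_params[0], ".xls") yields a
# pytest.param whose value is the tuple ("xlrd", ".xls") and which carries
# over the xlrd skip mark.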
@pytest.fixture(
    params=[
        _transfer_marks(eng, ext)
        for eng in engine_params
        for ext in read_ext_params
        if _is_valid_engine_ext_pair(eng, ext)
    ],
    ids=str,
)
def engine_and_read_ext(request):
    """
    Fixture for Excel reader engine and read_ext, only including valid pairs.
    """
    return request.param


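# Convenience fixtures that unpack the combined (engine, read_ext) pair for
# tests that only need one of the two values.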
@pytest.fixture
def engine(engine_and_read_ext):
    engine, read_ext = engine_and_read_ext
    return engine


@pytest.fixture
def read_ext(engine_and_read_ext):
    engine, read_ext = engine_and_read_ext
    return read_ext


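# The tests below go through the top-level pd.read_excel entry point; the
# autouse fixture monkeypatches it so each test can use bare filenames
# relative to the excel test-data directory.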
class TestReaders:
    @pytest.fixture(autouse=True)
    def cd_and_set_engine(self, engine, datapath, monkeypatch):
        """
        Change directory and set engine for read_excel calls.
        """
        func = partial(pd.read_excel, engine=engine)
        monkeypatch.chdir(datapath("io", "data", "excel"))
        monkeypatch.setattr(pd, "read_excel", func)

    def test_engine_used(self, read_ext, engine, monkeypatch):
        # GH 38884
        def parser(self, *args, **kwargs):
            return self.engine

        monkeypatch.setattr(pd.ExcelFile, "parse", parser)

        expected_defaults = {
            "xlsx": "openpyxl",
            "xlsm": "openpyxl",
            "xlsb": "pyxlsb",
            "xls": "xlrd",
            "ods": "odf",
        }

        with open("test1" + read_ext, "rb") as f:
            result = pd.read_excel(f)

        if engine is not None:
            expected = engine
        else:
            expected = expected_defaults[read_ext[1:]]
        assert result == expected

    def test_usecols_int(self, read_ext):
        # usecols as int
        msg = "Passing an integer for `usecols`"
        with pytest.raises(ValueError, match=msg):
            pd.read_excel(
                "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
            )

        # usecols as int
        with pytest.raises(ValueError, match=msg):
            pd.read_excel(
                "test1" + read_ext,
                sheet_name="Sheet2",
                skiprows=[1],
                index_col=0,
                usecols=3,
            )

    def test_usecols_list(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        df_ref = df_ref.reindex(columns=["B", "C"])
        df1 = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
        )
        df2 = pd.read_excel(
            "test1" + read_ext,
            sheet_name="Sheet2",
            skiprows=[1],
            index_col=0,
            usecols=[0, 2, 3],
        )

        # TODO add index to xls file
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

    def test_usecols_str(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        df1 = df_ref.reindex(columns=["A", "B", "C"])
        df2 = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
        )
        df3 = pd.read_excel(
            "test1" + read_ext,
            sheet_name="Sheet2",
            skiprows=[1],
            index_col=0,
            usecols="A:D",
        )

        # TODO add index to xls, read xls ignores index name ?
        tm.assert_frame_equal(df2, df1, check_names=False)
        tm.assert_frame_equal(df3, df1, check_names=False)

        df1 = df_ref.reindex(columns=["B", "C"])
        df2 = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
        )
        df3 = pd.read_excel(
            "test1" + read_ext,
            sheet_name="Sheet2",
            skiprows=[1],
            index_col=0,
            usecols="A,C,D",
        )
        # TODO add index to xls file
        tm.assert_frame_equal(df2, df1, check_names=False)
        tm.assert_frame_equal(df3, df1, check_names=False)

        df1 = df_ref.reindex(columns=["B", "C"])
        df2 = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D"
        )
        df3 = pd.read_excel(
            "test1" + read_ext,
            sheet_name="Sheet2",
            skiprows=[1],
            index_col=0,
            usecols="A,C:D",
        )
        tm.assert_frame_equal(df2, df1, check_names=False)
        tm.assert_frame_equal(df3, df1, check_names=False)

    @pytest.mark.parametrize(
        "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
    )
    def test_usecols_diff_positional_int_columns_order(
        self, request, read_ext, usecols, df_ref
    ):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        expected = df_ref[["A", "C"]]
        result = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
        )
        tm.assert_frame_equal(result, expected, check_names=False)

    @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
    def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
        expected = df_ref[["B", "D"]]
        expected.index = range(len(expected))

        result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols)
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_read_excel_without_slicing(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        expected = df_ref
        result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_usecols_excel_range_str(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        expected = df_ref[["C", "D"]]
        result = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
        )
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_usecols_excel_range_str_invalid(self, read_ext):
        msg = "Invalid column name: E1"

        with pytest.raises(ValueError, match=msg):
            pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1")

    def test_index_col_label_error(self, read_ext):
        msg = "list indices must be integers.*, not str"

        with pytest.raises(TypeError, match=msg):
            pd.read_excel(
                "test1" + read_ext,
                sheet_name="Sheet1",
                index_col=["A"],
                usecols=["A", "C"],
            )

    def test_index_col_empty(self, read_ext):
        # see gh-9208
        result = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"]
        )
        expected = DataFrame(
            columns=["D", "E", "F"],
            index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]),
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("index_col", [None, 2])
    def test_index_col_with_unnamed(self, read_ext, index_col):
        # see gh-18792
        result = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet4", index_col=index_col
        )
        expected = DataFrame(
            [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]
        )
        if index_col:
            expected = expected.set_index(expected.columns[index_col])

        tm.assert_frame_equal(result, expected)

    def test_usecols_pass_non_existent_column(self, read_ext):
        msg = (
            "Usecols do not match columns, "
            "columns expected but not found: " + r"\['E'\]"
        )

        with pytest.raises(ValueError, match=msg):
            pd.read_excel("test1" + read_ext, usecols=["E"])

    def test_usecols_wrong_type(self, read_ext):
        msg = (
            "'usecols' must either be list-like of "
            "all strings, all unicode, all integers or a callable."
        )

        with pytest.raises(ValueError, match=msg):
            pd.read_excel("test1" + read_ext, usecols=["E1", 0])

    def test_excel_stop_iterator(self, read_ext):
        parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1")
        expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
        tm.assert_frame_equal(parsed, expected)

    def test_excel_cell_error_na(self, request, read_ext):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1")
        expected = DataFrame([[np.nan]], columns=["Test"])
        tm.assert_frame_equal(parsed, expected)

    def test_excel_table(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
        df2 = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0
        )
        # TODO add index to file
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        df3 = pd.read_excel(
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1
        )
        tm.assert_frame_equal(df3, df1.iloc[:-1])

    def test_reader_special_dtypes(self, request, read_ext):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        expected = DataFrame.from_dict(
            {
                "IntCol": [1, 2, -3, 4, 0],
                "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
                "BoolCol": [True, False, True, True, False],
                "StrCol": [1, 2, 3, 4, 5],
                "Str2Col": ["a", 3, "c", "d", "e"],
                "DateCol": [
                    datetime(2013, 10, 30),
                    datetime(2013, 10, 31),
                    datetime(1905, 1, 1),
                    datetime(2013, 12, 14),
                    datetime(2015, 3, 14),
                ],
            },
        )
        basename = "test_types"

        # should read in correctly and infer types
        actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
        tm.assert_frame_equal(actual, expected)

        # if not coercing number, then int comes in as float
        float_expected = expected.copy()
        float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
        actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
        tm.assert_frame_equal(actual, float_expected)

        # check setting Index (assuming xls and xlsx are the same here)
        for icol, name in enumerate(expected.columns):
            actual = pd.read_excel(
                basename + read_ext, sheet_name="Sheet1", index_col=icol
            )
            exp = expected.set_index(name)
            tm.assert_frame_equal(actual, exp)

        expected["StrCol"] = expected["StrCol"].apply(str)
        actual = pd.read_excel(
            basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
        )
        tm.assert_frame_equal(actual, expected)

    # GH8212 - support for converters and missing values
    def test_reader_converters(self, read_ext):
        basename = "test_converters"

        expected = DataFrame.from_dict(
            {
                "IntCol": [1, 2, -3, -1000, 0],
                "FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005],
                "BoolCol": ["Found", "Found", "Found", "Not found", "Found"],
                "StrCol": ["1", np.nan, "3", "4", "5"],
            }
        )

        converters = {
            "IntCol": lambda x: int(x) if x != "" else -1000,
            "FloatCol": lambda x: 10 * x if x else np.nan,
            2: lambda x: "Found" if x != "" else "Not found",
            3: lambda x: str(x) if x else "",
        }

        # should read in correctly and set types of single cells (not array
        # dtypes)
        actual = pd.read_excel(
            basename + read_ext, sheet_name="Sheet1", converters=converters
        )
        tm.assert_frame_equal(actual, expected)

    def test_reader_dtype(self, read_ext):
        # GH 8212
        basename = "testdtype"
        actual = pd.read_excel(basename + read_ext)

        expected = DataFrame(
            {
                "a": [1, 2, 3, 4],
                "b": [2.5, 3.5, 4.5, 5.5],
                "c": [1, 2, 3, 4],
                "d": [1.0, 2.0, np.nan, 4.0],
            }
        ).reindex(columns=["a", "b", "c", "d"])

        tm.assert_frame_equal(actual, expected)

        actual = pd.read_excel(
            basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str}
        )

        expected["a"] = expected["a"].astype("float64")
        expected["b"] = expected["b"].astype("float32")
        expected["c"] = ["001", "002", "003", "004"]
        tm.assert_frame_equal(actual, expected)

        msg = "Unable to convert column d to type int64"
        with pytest.raises(ValueError, match=msg):
            pd.read_excel(basename + read_ext, dtype={"d": "int64"})

    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (
                None,
                DataFrame(
                    {
                        "a": [1, 2, 3, 4],
                        "b": [2.5, 3.5, 4.5, 5.5],
                        "c": [1, 2, 3, 4],
                        "d": [1.0, 2.0, np.nan, 4.0],
                    }
                ),
            ),
            (
                {"a": "float64", "b": "float32", "c": str, "d": str},
                DataFrame(
                    {
                        "a": Series([1, 2, 3, 4], dtype="float64"),
                        "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
                        "c": ["001", "002", "003", "004"],
                        "d": ["1", "2", np.nan, "4"],
                    }
                ),
            ),
        ],
    )
    def test_reader_dtype_str(self, read_ext, dtype, expected):
        # see gh-20377
        basename = "testdtype"

        actual = pd.read_excel(basename + read_ext, dtype=dtype)
        tm.assert_frame_equal(actual, expected)

    def test_dtype_backend(self, read_ext, dtype_backend):
        # GH#36712
        if read_ext in (".xlsb", ".xls"):
            pytest.skip(f"No engine for filetype: '{read_ext}'")

        df = DataFrame(
            {
                "a": Series([1, 3], dtype="Int64"),
                "b": Series([2.5, 4.5], dtype="Float64"),
                "c": Series([True, False], dtype="boolean"),
                "d": Series(["a", "b"], dtype="string"),
                "e": Series([pd.NA, 6], dtype="Int64"),
                "f": Series([pd.NA, 7.5], dtype="Float64"),
                "g": Series([pd.NA, True], dtype="boolean"),
                "h": Series([pd.NA, "a"], dtype="string"),
                "i": Series([pd.Timestamp("2019-12-31")] * 2),
                "j": Series([pd.NA, pd.NA], dtype="Int64"),
            }
        )
        with tm.ensure_clean(read_ext) as file_path:
            df.to_excel(file_path, "test", index=False)
            result = pd.read_excel(
                file_path, sheet_name="test", dtype_backend=dtype_backend
            )
        if dtype_backend == "pyarrow":
            import pyarrow as pa

            from pandas.arrays import ArrowExtensionArray

            expected = DataFrame(
                {
                    col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
                    for col in df.columns
                }
            )
            # pyarrow by default infers timestamp resolution as us, not ns
            expected["i"] = ArrowExtensionArray(
                expected["i"].array._data.cast(pa.timestamp(unit="us"))
            )
            # pyarrow supports a null type, so don't have to default to Int64
            expected["j"] = ArrowExtensionArray(pa.array([None, None]))
        else:
            expected = df
        tm.assert_frame_equal(result, expected)

    def test_dtype_backend_and_dtype(self, read_ext):
        # GH#36712
        if read_ext in (".xlsb", ".xls"):
            pytest.skip(f"No engine for filetype: '{read_ext}'")

        df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]})
        with tm.ensure_clean(read_ext) as file_path:
            df.to_excel(file_path, "test", index=False)
            result = pd.read_excel(
                file_path,
                sheet_name="test",
                dtype_backend="numpy_nullable",
                dtype="float64",
            )
        tm.assert_frame_equal(result, df)

    @td.skip_if_no("pyarrow")
    def test_dtype_backend_string(self, read_ext, string_storage):
        # GH#36712
        if read_ext in (".xlsb", ".xls"):
            pytest.skip(f"No engine for filetype: '{read_ext}'")

        import pyarrow as pa

        with pd.option_context("mode.string_storage", string_storage):
            df = DataFrame(
                {
                    "a": np.array(["a", "b"], dtype=np.object_),
                    "b": np.array(["x", pd.NA], dtype=np.object_),
                }
            )
            with tm.ensure_clean(read_ext) as file_path:
                df.to_excel(file_path, "test", index=False)
                result = pd.read_excel(
                    file_path, sheet_name="test", dtype_backend="numpy_nullable"
                )

            if string_storage == "python":
                expected = DataFrame(
                    {
                        "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
                        "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
                    }
                )
            else:
                expected = DataFrame(
                    {
                        "a": ArrowStringArray(pa.array(["a", "b"])),
                        "b": ArrowStringArray(pa.array(["x", None])),
                    }
                )
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
    def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
        # GH#35211
        basename = "df_mangle_dup_col_dtypes"
        dtype_dict = {"a": str, **dtypes}
        dtype_dict_copy = dtype_dict.copy()
        # GH#42462
        result = pd.read_excel(basename + read_ext, dtype=dtype_dict)
        expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
        assert dtype_dict == dtype_dict_copy, "dtype dict changed"
        tm.assert_frame_equal(result, expected)

    def test_reader_spaces(self, read_ext):
        # see gh-32207
        basename = "test_spaces"

        actual = pd.read_excel(basename + read_ext)
        expected = DataFrame(
            {
                "testcol": [
                    "this is great",
                    "4 spaces",
                    "1 trailing ",
                    " 1 leading",
                    "2 spaces multiple times",
                ]
            }
        )
        tm.assert_frame_equal(actual, expected)

    # gh-36122, gh-35802
    @pytest.mark.parametrize(
        "basename,expected",
        [
            ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})),
            ("gh-36122", DataFrame(columns=["got 2nd sa"])),
        ],
    )
    def test_read_excel_ods_nested_xml(self, engine, read_ext, basename, expected):
        # see gh-35802
        if engine != "odf":
            pytest.skip(f"Skipped for engine: {engine}")

        actual = pd.read_excel(basename + read_ext)
        tm.assert_frame_equal(actual, expected)

    def test_reading_all_sheets(self, read_ext):
        # Test reading all sheet names by setting sheet_name to None,
        # Ensure a dict is returned.
        # See PR #9450
        basename = "test_multisheet"
        dfs = pd.read_excel(basename + read_ext, sheet_name=None)
        # ensure this is not alphabetical to test order preservation
        expected_keys = ["Charlie", "Alpha", "Beta"]
        tm.assert_contains_all(expected_keys, dfs.keys())
        # Issue 9930
        # Ensure sheet order is preserved
        assert expected_keys == list(dfs.keys())

    def test_reading_multiple_specific_sheets(self, read_ext):
        # Test reading specific sheet names by specifying a mixed list
        # of integers and strings, and confirm that duplicated sheet
        # references (positions/names) are removed properly.
        # Ensure a dict is returned
        # See PR #9450
        basename = "test_multisheet"
        # Explicitly request duplicates. Only the set should be returned.
        expected_keys = [2, "Charlie", "Charlie"]
        dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys)
        expected_keys = list(set(expected_keys))
        tm.assert_contains_all(expected_keys, dfs.keys())
        assert len(expected_keys) == len(dfs.keys())

    def test_reading_all_sheets_with_blank(self, read_ext):
        # Test reading all sheet names by setting sheet_name to None,
        # In the case where some sheets are blank.
        # Issue #11711
        basename = "blank_with_header"
        dfs = pd.read_excel(basename + read_ext, sheet_name=None)
        expected_keys = ["Sheet1", "Sheet2", "Sheet3"]
        tm.assert_contains_all(expected_keys, dfs.keys())

    # GH6403
    def test_read_excel_blank(self, read_ext):
        actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1")
        tm.assert_frame_equal(actual, DataFrame())

    def test_read_excel_blank_with_header(self, read_ext):
        expected = DataFrame(columns=["col_1", "col_2"])
        actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1")
        tm.assert_frame_equal(actual, expected)

    def test_exception_message_includes_sheet_name(self, read_ext):
        # GH 48706
        with pytest.raises(ValueError, match=r" \(sheet: Sheet1\)$"):
            pd.read_excel("blank_with_header" + read_ext, header=[1], sheet_name=None)
        with pytest.raises(ZeroDivisionError, match=r" \(sheet: Sheet1\)$"):
            pd.read_excel("test1" + read_ext, usecols=lambda x: 1 / 0, sheet_name=None)

    @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl")
    def test_date_conversion_overflow(self, request, engine, read_ext):
        # GH 10001 : pandas.ExcelFile ignore parse_dates=False
        if engine == "pyxlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        expected = DataFrame(
            [
                [pd.Timestamp("2016-03-12"), "Marc Johnson"],
                [pd.Timestamp("2016-03-16"), "Jack Black"],
                [1e20, "Timothy Brown"],
            ],
            columns=["DateColWithBigInt", "StringCol"],
        )

        if engine == "openpyxl":
            request.node.add_marker(
                pytest.mark.xfail(reason="Maybe not supported by openpyxl")
            )

        if engine is None and read_ext in (".xlsx", ".xlsm"):
            # GH 35029
            request.node.add_marker(
                pytest.mark.xfail(reason="Defaults to openpyxl, maybe not supported")
            )

        result = pd.read_excel("testdateoverflow" + read_ext)
        tm.assert_frame_equal(result, expected)

    def test_sheet_name(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )
        filename = "test1"
        sheet_name = "Sheet1"

        df1 = pd.read_excel(
            filename + read_ext, sheet_name=sheet_name, index_col=0
        )  # doc
        df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)

        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

    def test_excel_read_buffer(self, read_ext):
        pth = "test1" + read_ext
        expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0)
        with open(pth, "rb") as f:
            actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
            tm.assert_frame_equal(expected, actual)

    def test_bad_engine_raises(self):
        bad_engine = "foo"
        with pytest.raises(ValueError, match="Unknown engine: foo"):
            pd.read_excel("", engine=bad_engine)

    @pytest.mark.parametrize(
        "sheet_name",
        [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
    )
    def test_bad_sheetname_raises(self, read_ext, sheet_name):
        # GH 39250
        msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
        with pytest.raises(ValueError, match=msg):
            pd.read_excel("blank" + read_ext, sheet_name=sheet_name)

    def test_missing_file_raises(self, read_ext):
        bad_file = f"foo{read_ext}"
        # CI tests with other languages, translates to "No such file or directory"
        match = r"(No such file or directory|没有那个文件或目录|File o directory non esistente)"
        with pytest.raises(FileNotFoundError, match=match):
            pd.read_excel(bad_file)

    def test_corrupt_bytes_raises(self, engine):
        bad_stream = b"foo"
        if engine is None:
            error = ValueError
            msg = (
                "Excel file format cannot be determined, you must "
                "specify an engine manually."
            )
        elif engine == "xlrd":
            from xlrd import XLRDError

            error = XLRDError
            msg = (
                "Unsupported format, or corrupt file: Expected BOF "
                "record; found b'foo'"
            )
        else:
            error = BadZipFile
            msg = "File is not a zip file"
        with pytest.raises(error, match=msg):
            pd.read_excel(bad_stream)

    @pytest.mark.network
    @tm.network(
        url=(
            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
            "pandas/tests/io/data/excel/test1.xlsx"
        ),
        check_before_test=True,
    )
    def test_read_from_http_url(self, read_ext):
        url = (
            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
            "pandas/tests/io/data/excel/test1" + read_ext
        )
        url_table = pd.read_excel(url)
        local_table = pd.read_excel("test1" + read_ext)
        tm.assert_frame_equal(url_table, local_table)

    @td.skip_if_not_us_locale
    @pytest.mark.single_cpu
    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
        # Bucket "pandas-test" created in tests/io/conftest.py
        with open("test1" + read_ext, "rb") as f:
            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)

        url = "s3://pandas-test/test1" + read_ext

        url_table = pd.read_excel(url, storage_options=s3so)
        local_table = pd.read_excel("test1" + read_ext)
        tm.assert_frame_equal(url_table, local_table)

    @pytest.mark.single_cpu
    def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
        # GH 38788
        # Bucket "pandas-test" created in tests/io/conftest.py
        with open("test1" + read_ext, "rb") as f:
            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)

        import s3fs

        s3 = s3fs.S3FileSystem(**s3so)

        with s3.open("s3://pandas-test/test1" + read_ext) as f:
            url_table = pd.read_excel(f)

        local_table = pd.read_excel("test1" + read_ext)
        tm.assert_frame_equal(url_table, local_table)

    @pytest.mark.slow
    def test_read_from_file_url(self, read_ext, datapath):
        # FILE
        localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext)
        local_table = pd.read_excel(localtable)

        try:
            url_table = pd.read_excel("file://localhost/" + localtable)
        except URLError:
            # fails on some systems
            platform_info = " ".join(platform.uname()).strip()
            pytest.skip(f"failing on {platform_info}")

        tm.assert_frame_equal(url_table, local_table)

    def test_read_from_pathlib_path(self, read_ext):
        # GH12655
        str_path = "test1" + read_ext
        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)

        path_obj = Path("test1" + read_ext)
        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)

        tm.assert_frame_equal(expected, actual)

    @td.skip_if_no("py.path")
    def test_read_from_py_localpath(self, read_ext):
        # GH12655
        from py.path import local as LocalPath

        str_path = os.path.join("test1" + read_ext)
        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)

        path_obj = LocalPath().join("test1" + read_ext)
        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)

        tm.assert_frame_equal(expected, actual)

    def test_close_from_py_localpath(self, read_ext):
        # GH31467
        str_path = os.path.join("test1" + read_ext)
        with open(str_path, "rb") as f:
            x = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
            del x
            # should not throw an exception because the passed file was closed
            f.read()

    def test_reader_seconds(self, request, engine, read_ext):
        if engine == "pyxlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        # Test reading times with and without milliseconds. GH5945.
        expected = DataFrame.from_dict(
            {
                "Time": [
                    time(1, 2, 3),
                    time(2, 45, 56, 100000),
                    time(4, 29, 49, 200000),
                    time(6, 13, 42, 300000),
                    time(7, 57, 35, 400000),
                    time(9, 41, 28, 500000),
                    time(11, 25, 21, 600000),
                    time(13, 9, 14, 700000),
                    time(14, 53, 7, 800000),
                    time(16, 37, 0, 900000),
                    time(18, 20, 54),
                ]
            }
        )

        actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
        tm.assert_frame_equal(actual, expected)

        actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
        tm.assert_frame_equal(actual, expected)

    def test_read_excel_multiindex(self, request, read_ext):
        # see gh-4679
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
        mi_file = "testmultiindex" + read_ext

        # "mi_column" sheet
        expected = DataFrame(
            [
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=mi,
        )

        actual = pd.read_excel(
            mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
        )
        tm.assert_frame_equal(actual, expected)

        # "mi_index" sheet
        expected.index = mi
        expected.columns = ["a", "b", "c", "d"]

        actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)

        # "both" sheet
        expected.columns = mi

        actual = pd.read_excel(
            mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
        )
        tm.assert_frame_equal(actual, expected, check_names=False)

        # "mi_index_name" sheet
        expected.columns = ["a", "b", "c", "d"]
        expected.index = mi.set_names(["ilvl1", "ilvl2"])

        actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # "mi_column_name" sheet
        expected.index = list(range(4))
        expected.columns = mi.set_names(["c1", "c2"])
        actual = pd.read_excel(
            mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0
        )
        tm.assert_frame_equal(actual, expected)

        # see gh-11317
        # "name_with_int" sheet
        expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])

        actual = pd.read_excel(
            mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1]
        )
        tm.assert_frame_equal(actual, expected)

        # "both_name" sheet
        expected.columns = mi.set_names(["c1", "c2"])
        expected.index = mi.set_names(["ilvl1", "ilvl2"])

        actual = pd.read_excel(
            mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1]
        )
        tm.assert_frame_equal(actual, expected)

        # "both_skiprows" sheet
        actual = pd.read_excel(
            mi_file,
            sheet_name="both_name_skiprows",
            index_col=[0, 1],
            header=[0, 1],
            skiprows=2,
        )
        tm.assert_frame_equal(actual, expected)

    @pytest.mark.parametrize(
        "sheet_name,idx_lvl2",
        [
            ("both_name_blank_after_mi_name", [np.nan, "b", "a", "b"]),
            ("both_name_multiple_blanks", [np.nan] * 4),
        ],
    )
    def test_read_excel_multiindex_blank_after_name(
        self, request, read_ext, sheet_name, idx_lvl2
    ):
        # GH34673
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb (GH4679)"
                )
            )

        mi_file = "testmultiindex" + read_ext
        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"])
        expected = DataFrame(
            [
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=mi,
            index=MultiIndex.from_arrays(
                (["foo", "foo", "bar", "bar"], idx_lvl2),
                names=["ilvl1", "ilvl2"],
            ),
        )
        result = pd.read_excel(
            mi_file,
            sheet_name=sheet_name,
            index_col=[0, 1],
            header=[0, 1],
        )
        tm.assert_frame_equal(result, expected)

    def test_read_excel_multiindex_header_only(self, read_ext):
        # see gh-11733.
        #
        # Don't try to parse a header name if there isn't one.
        mi_file = "testmultiindex" + read_ext
        result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1])

        exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
        expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
        tm.assert_frame_equal(result, expected)

    def test_excel_old_index_format(self, read_ext):
        # see gh-4679
        filename = "test_index_name_pre17" + read_ext

        # We detect headers to determine if index names exist, so
        # that "index" name in the "names" version of the data will
        # now be interpreted as rows that include null data.
        data = np.array(
            [
                [None, None, None, None, None],
                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
            ]
        )
        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
        mi = MultiIndex(
            levels=[
                ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
                ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
            ],
            codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
            names=[None, None],
        )
        si = Index(
            ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
        )

        expected = DataFrame(data, index=si, columns=columns)

        actual = pd.read_excel(filename, sheet_name="single_names", index_col=0)
        tm.assert_frame_equal(actual, expected)

        expected.index = mi

        actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # The analogous versions of the "names" version data
        # where there are explicitly no names for the indices.
        data = np.array(
            [
                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
            ]
        )
        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
        mi = MultiIndex(
            levels=[
                ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
                ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
            ],
            codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
            names=[None, None],
        )
        si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None)

        expected = DataFrame(data, index=si, columns=columns)

        actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0)
        tm.assert_frame_equal(actual, expected)

        expected.index = mi

        actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)

    def test_read_excel_bool_header_arg(self, read_ext):
        # GH 6114
        msg = "Passing a bool to header is invalid"
        for arg in [True, False]:
            with pytest.raises(TypeError, match=msg):
                pd.read_excel("test1" + read_ext, header=arg)

    def test_read_excel_skiprows(self, request, read_ext):
        # GH 4903
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        actual = pd.read_excel(
            "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
        )
        expected = DataFrame(
            [
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=["a", "b", "c", "d"],
        )
        tm.assert_frame_equal(actual, expected)

        actual = pd.read_excel(
            "testskiprows" + read_ext,
            sheet_name="skiprows_list",
            skiprows=np.array([0, 2]),
        )
        tm.assert_frame_equal(actual, expected)

        # GH36435
        actual = pd.read_excel(
            "testskiprows" + read_ext,
            sheet_name="skiprows_list",
            skiprows=lambda x: x in [0, 2],
        )
        tm.assert_frame_equal(actual, expected)

        actual = pd.read_excel(
            "testskiprows" + read_ext,
            sheet_name="skiprows_list",
            skiprows=3,
            names=["a", "b", "c", "d"],
        )
        expected = DataFrame(
            [
                # [1, 2.5, pd.Timestamp("2015-01-01"), True],
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=["a", "b", "c", "d"],
        )
        tm.assert_frame_equal(actual, expected)

    def test_read_excel_skiprows_callable_not_in(self, request, read_ext):
        # GH 4903
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        actual = pd.read_excel(
            "testskiprows" + read_ext,
            sheet_name="skiprows_list",
            skiprows=lambda x: x not in [1, 3, 5],
        )
        expected = DataFrame(
            [
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
                # [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                # [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=["a", "b", "c", "d"],
        )
        tm.assert_frame_equal(actual, expected)

    def test_read_excel_nrows(self, read_ext):
        # GH 16645
        num_rows_to_pull = 5
        actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
        expected = pd.read_excel("test1" + read_ext)
        expected = expected[:num_rows_to_pull]
        tm.assert_frame_equal(actual, expected)

    def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
        # GH 16645
        expected = pd.read_excel("test1" + read_ext)
        num_records_in_file = len(expected)
        num_rows_to_pull = num_records_in_file + 10
        actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
        tm.assert_frame_equal(actual, expected)

    def test_read_excel_nrows_non_integer_parameter(self, read_ext):
        # GH 16645
        msg = "'nrows' must be an integer >=0"
        with pytest.raises(ValueError, match=msg):
            pd.read_excel("test1" + read_ext, nrows="5")

    @pytest.mark.parametrize(
        "filename,sheet_name,header,index_col,skiprows",
        [
            ("testmultiindex", "mi_column", [0, 1], 0, None),
            ("testmultiindex", "mi_index", None, [0, 1], None),
            ("testmultiindex", "both", [0, 1], [0, 1], None),
            ("testmultiindex", "mi_column_name", [0, 1], 0, None),
            ("testskiprows", "skiprows_list", None, None, [0, 2]),
            ("testskiprows", "skiprows_list", None, None, lambda x: x in (0, 2)),
        ],
    )
    def test_read_excel_nrows_params(
        self, read_ext, filename, sheet_name, header, index_col, skiprows
    ):
        """
        For various parameters, we should get the same result whether we
        limit the rows during load (nrows=3) or after (df.iloc[:3]).
        """
        # GH 46894
        expected = pd.read_excel(
            filename + read_ext,
            sheet_name=sheet_name,
            header=header,
            index_col=index_col,
            skiprows=skiprows,
        ).iloc[:3]
        actual = pd.read_excel(
            filename + read_ext,
            sheet_name=sheet_name,
            header=header,
            index_col=index_col,
            skiprows=skiprows,
            nrows=3,
        )
        tm.assert_frame_equal(actual, expected)

    def test_deprecated_kwargs(self, read_ext):
        with pytest.raises(TypeError, match="but 3 positional arguments"):
            pd.read_excel("test1" + read_ext, "Sheet1", 0)

    def test_no_header_with_list_index_col(self, read_ext):
        # GH 31783
        file_name = "testmultiindex" + read_ext
        data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)]
        idx = MultiIndex.from_tuples(
            [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1)
        )
        expected = DataFrame(data, index=idx, columns=(2, 3))
        result = pd.read_excel(
            file_name, sheet_name="index_col_none", index_col=[0, 1], header=None
        )
        tm.assert_frame_equal(expected, result)

    def test_one_col_noskip_blank_line(self, read_ext):
        # GH 39808
        file_name = "one_col_blank_line" + read_ext
        data = [0.5, np.nan, 1, 2]
        expected = DataFrame(data, columns=["numbers"])
        result = pd.read_excel(file_name)
        tm.assert_frame_equal(result, expected)

    def test_multiheader_two_blank_lines(self, read_ext):
        # GH 40442
        file_name = "testmultiindex" + read_ext
        columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
        data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]]
        expected = DataFrame(data, columns=columns)
        result = pd.read_excel(
            file_name, sheet_name="mi_column_empty_rows", header=[0, 1]
        )
        tm.assert_frame_equal(result, expected)

    def test_trailing_blanks(self, read_ext):
        """
        Sheets can contain blank cells with no data. Some of our readers
        were including those cells, creating many empty rows and columns
        """
        file_name = "trailing_blanks" + read_ext
        result = pd.read_excel(file_name)
        assert result.shape == (3, 3)

    def test_ignore_chartsheets_by_str(self, request, engine, read_ext):
        # GH 41448
        if engine == "odf":
            pytest.skip("chartsheets do not exist in the ODF format")
        if engine == "pyxlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="pyxlsb can't distinguish chartsheets from worksheets"
                )
            )
        with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"):
            pd.read_excel("chartsheet" + read_ext, sheet_name="Chart1")

    def test_ignore_chartsheets_by_int(self, request, engine, read_ext):
        # GH 41448
        if engine == "odf":
            pytest.skip("chartsheets do not exist in the ODF format")
        if engine == "pyxlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="pyxlsb can't distinguish chartsheets from worksheets"
                )
            )
        with pytest.raises(
            ValueError, match="Worksheet index 1 is invalid, 1 worksheets found"
        ):
            pd.read_excel("chartsheet" + read_ext, sheet_name=1)

    def test_euro_decimal_format(self, read_ext):
        # copied from read_csv
        result = pd.read_excel("test_decimal" + read_ext, decimal=",", skiprows=1)
        expected = DataFrame(
            [
                [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
                [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
                [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
            ],
            columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
        )
        tm.assert_frame_equal(result, expected)


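# The tests below exercise the pd.ExcelFile interface; the autouse fixture
# monkeypatches pd.ExcelFile (rather than pd.read_excel) with the
# parametrized engine.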
class TestExcelFileRead:
    @pytest.fixture(autouse=True)
    def cd_and_set_engine(self, engine, datapath, monkeypatch):
        """
        Change directory and set engine for ExcelFile objects.
        """
        func = partial(pd.ExcelFile, engine=engine)
        monkeypatch.chdir(datapath("io", "data", "excel"))
        monkeypatch.setattr(pd, "ExcelFile", func)

    def test_engine_used(self, read_ext, engine):
        expected_defaults = {
            "xlsx": "openpyxl",
            "xlsm": "openpyxl",
            "xlsb": "pyxlsb",
            "xls": "xlrd",
            "ods": "odf",
        }

        with pd.ExcelFile("test1" + read_ext) as excel:
            result = excel.engine

        if engine is not None:
            expected = engine
        else:
            expected = expected_defaults[read_ext[1:]]
        assert result == expected

    def test_excel_passes_na(self, read_ext):
        with pd.ExcelFile("test4" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
            )
        expected = DataFrame(
            [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

        with pd.ExcelFile("test4" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
            )
        expected = DataFrame(
            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

        # 13967
        with pd.ExcelFile("test5" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
            )
        expected = DataFrame(
            [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

        with pd.ExcelFile("test5" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
            )
        expected = DataFrame(
            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

    @pytest.mark.parametrize("na_filter", [None, True, False])
    def test_excel_passes_na_filter(self, read_ext, na_filter):
        # gh-25453
        kwargs = {}

        if na_filter is not None:
            kwargs["na_filter"] = na_filter

        with pd.ExcelFile("test5" + read_ext) as excel:
            parsed = pd.read_excel(
                excel,
                sheet_name="Sheet1",
                keep_default_na=True,
                na_values=["apple"],
                **kwargs,
            )

        if na_filter is False:
            expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]]
        else:
            expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]]

        expected = DataFrame(expected, columns=["Test"])
        tm.assert_frame_equal(parsed, expected)

    def test_excel_table_sheet_by_index(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        with pd.ExcelFile("test1" + read_ext) as excel:
            df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
            df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        with pd.ExcelFile("test1" + read_ext) as excel:
            df1 = excel.parse(0, index_col=0)
            df2 = excel.parse(1, skiprows=[1], index_col=0)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        with pd.ExcelFile("test1" + read_ext) as excel:
            df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1)
        tm.assert_frame_equal(df3, df1.iloc[:-1])

        with pd.ExcelFile("test1" + read_ext) as excel:
            df3 = excel.parse(0, index_col=0, skipfooter=1)

        tm.assert_frame_equal(df3, df1.iloc[:-1])

    def test_sheet_name(self, request, read_ext, df_ref):
        if read_ext == ".xlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        filename = "test1"
        sheet_name = "Sheet1"

        with pd.ExcelFile(filename + read_ext) as excel:
            df1_parse = excel.parse(sheet_name=sheet_name, index_col=0)  # doc

        with pd.ExcelFile(filename + read_ext) as excel:
            df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)

        tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
        tm.assert_frame_equal(df2_parse, df_ref, check_names=False)

    @pytest.mark.parametrize(
        "sheet_name",
        [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
    )
    def test_bad_sheetname_raises(self, read_ext, sheet_name):
        # GH 39250
        msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
        with pytest.raises(ValueError, match=msg):
            with pd.ExcelFile("blank" + read_ext) as excel:
                excel.parse(sheet_name=sheet_name)

    def test_excel_read_buffer(self, engine, read_ext):
        pth = "test1" + read_ext
        expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine)

        with open(pth, "rb") as f:
            with pd.ExcelFile(f) as xls:
                actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0)

            tm.assert_frame_equal(expected, actual)

    def test_reader_closes_file(self, engine, read_ext):
        with open("test1" + read_ext, "rb") as f:
            with pd.ExcelFile(f) as xlsx:
                # parses okay
                pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine)

        assert f.closed

    def test_conflicting_excel_engines(self, read_ext):
        # GH 26566
        msg = "Engine should not be specified when passing an ExcelFile"

        with pd.ExcelFile("test1" + read_ext) as xl:
            with pytest.raises(ValueError, match=msg):
                pd.read_excel(xl, engine="foo")

    def test_excel_read_binary(self, engine, read_ext):
        # GH 15914
        expected = pd.read_excel("test1" + read_ext, engine=engine)

        with open("test1" + read_ext, "rb") as f:
            data = f.read()

        actual = pd.read_excel(data, engine=engine)
        tm.assert_frame_equal(expected, actual)

    def test_excel_read_binary_via_read_excel(self, read_ext, engine):
        # GH 38424
        with open("test1" + read_ext, "rb") as f:
            result = pd.read_excel(f)
        expected = pd.read_excel("test1" + read_ext, engine=engine)
        tm.assert_frame_equal(result, expected)

    def test_read_excel_header_index_out_of_range(self, engine):
        # GH#43143
        with open("df_header_oob.xlsx", "rb") as f:
            with pytest.raises(ValueError, match="exceeds maximum"):
                pd.read_excel(f, header=[0, 1])

    @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
    def test_header_with_index_col(self, filename):
        # GH 33476
        idx = Index(["Z"], name="I2")
        cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
        expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64")
        result = pd.read_excel(
            filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
        )
        tm.assert_frame_equal(expected, result)

    def test_read_datetime_multiindex(self, request, engine, read_ext):
        # GH 34748
        if engine == "pyxlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="Sheets containing datetimes not supported by pyxlsb"
                )
            )

        f = "test_datetime_mi" + read_ext
        with pd.ExcelFile(f) as excel:
            actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
        expected_column_index = MultiIndex.from_tuples(
            [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],
            names=[
                pd.to_datetime("02/29/2020").to_pydatetime(),
                pd.to_datetime("03/01/2020").to_pydatetime(),
            ],
        )
        expected = DataFrame([], index=[], columns=expected_column_index)

        tm.assert_frame_equal(expected, actual)

    def test_engine_invalid_option(self, read_ext):
        # read_ext includes the '.' hence the weird formatting
        with pytest.raises(ValueError, match="Value must be one of *"):
            with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
                pass

    def test_ignore_chartsheets(self, request, engine, read_ext):
        # GH 41448
        if engine == "odf":
            pytest.skip("chartsheets do not exist in the ODF format")
        if engine == "pyxlsb":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="pyxlsb can't distinguish chartsheets from worksheets"
                )
            )
        with pd.ExcelFile("chartsheet" + read_ext) as excel:
            assert excel.sheet_names == ["Sheet1"]

    def test_corrupt_files_closed(self, engine, read_ext):
        # GH41778
        errors = (BadZipFile,)
        if engine is None:
            pytest.skip(f"Invalid test for engine={engine}")
        elif engine == "xlrd":
            import xlrd

            errors = (BadZipFile, xlrd.biffh.XLRDError)

        with tm.ensure_clean(f"corrupt{read_ext}") as file:
            Path(file).write_text("corrupt")
            with tm.assert_produces_warning(False):
                try:
                    pd.ExcelFile(file, engine=engine)
                except errors:
                    pass