projektAI/venv/Lib/site-packages/pandas/tests/io/excel/test_readers.py

1276 lines
46 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
from datetime import datetime, time
from functools import partial
import os
from urllib.error import URLError
from zipfile import BadZipFile
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm
from pandas.tests.io.excel import xlrd_version
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
# Add any engines to test here
# When defusedxml is installed it triggers deprecation warnings for
# xlrd and openpyxl, so catch those here
pytest.param(
"xlrd",
marks=[
td.skip_if_no("xlrd"),
],
),
pytest.param(
"openpyxl",
marks=[
td.skip_if_no("openpyxl"),
pytest.mark.filterwarnings("ignore:.*html argument"),
],
),
pytest.param(
None,
marks=[
td.skip_if_no("xlrd"),
],
),
pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
pytest.param("odf", marks=td.skip_if_no("odf")),
]
def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
"""
Filter out invalid (engine, ext) pairs instead of skipping, as that
produces 500+ pytest.skips.
"""
engine = engine.values[0]
if engine == "openpyxl" and read_ext == ".xls":
return False
if engine == "odf" and read_ext != ".ods":
return False
if read_ext == ".ods" and engine != "odf":
return False
if engine == "pyxlsb" and read_ext != ".xlsb":
return False
if read_ext == ".xlsb" and engine != "pyxlsb":
return False
if (
engine == "xlrd"
and xlrd_version is not None
and xlrd_version >= "2"
and read_ext != ".xls"
):
return False
return True
def _transfer_marks(engine, read_ext):
"""
engine gives us a pytest.param objec with some marks, read_ext is just
a string. We need to generate a new pytest.param inheriting the marks.
"""
values = engine.values + (read_ext,)
new_param = pytest.param(values, marks=engine.marks)
return new_param
@pytest.fixture(
autouse=True,
params=[
_transfer_marks(eng, ext)
for eng in engine_params
for ext in read_ext_params
if _is_valid_engine_ext_pair(eng, ext)
],
)
def engine_and_read_ext(request):
"""
Fixture for Excel reader engine and read_ext, only including valid pairs.
"""
return request.param
@pytest.fixture
def engine(engine_and_read_ext):
engine, read_ext = engine_and_read_ext
return engine
@pytest.fixture
def read_ext(engine_and_read_ext):
engine, read_ext = engine_and_read_ext
return read_ext
class TestReaders:
@pytest.fixture(autouse=True)
def cd_and_set_engine(self, engine, datapath, monkeypatch):
"""
Change directory and set engine for read_excel calls.
"""
func = partial(pd.read_excel, engine=engine)
monkeypatch.chdir(datapath("io", "data", "excel"))
monkeypatch.setattr(pd, "read_excel", func)
def test_usecols_int(self, read_ext, df_ref):
df_ref = df_ref.reindex(columns=["A", "B", "C"])
# usecols as int
msg = "Passing an integer for `usecols`"
with pytest.raises(ValueError, match=msg):
pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
)
# usecols as int
with pytest.raises(ValueError, match=msg):
pd.read_excel(
"test1" + read_ext,
sheet_name="Sheet2",
skiprows=[1],
index_col=0,
usecols=3,
)
def test_usecols_list(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
df_ref = df_ref.reindex(columns=["B", "C"])
df1 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
)
df2 = pd.read_excel(
"test1" + read_ext,
sheet_name="Sheet2",
skiprows=[1],
index_col=0,
usecols=[0, 2, 3],
)
# TODO add index to xls file)
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
def test_usecols_str(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
df1 = df_ref.reindex(columns=["A", "B", "C"])
df2 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
)
df3 = pd.read_excel(
"test1" + read_ext,
sheet_name="Sheet2",
skiprows=[1],
index_col=0,
usecols="A:D",
)
# TODO add index to xls, read xls ignores index name ?
tm.assert_frame_equal(df2, df1, check_names=False)
tm.assert_frame_equal(df3, df1, check_names=False)
df1 = df_ref.reindex(columns=["B", "C"])
df2 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
)
df3 = pd.read_excel(
"test1" + read_ext,
sheet_name="Sheet2",
skiprows=[1],
index_col=0,
usecols="A,C,D",
)
# TODO add index to xls file
tm.assert_frame_equal(df2, df1, check_names=False)
tm.assert_frame_equal(df3, df1, check_names=False)
df1 = df_ref.reindex(columns=["B", "C"])
df2 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D"
)
df3 = pd.read_excel(
"test1" + read_ext,
sheet_name="Sheet2",
skiprows=[1],
index_col=0,
usecols="A,C:D",
)
tm.assert_frame_equal(df2, df1, check_names=False)
tm.assert_frame_equal(df3, df1, check_names=False)
@pytest.mark.parametrize(
"usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
)
def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
expected = df_ref[["A", "C"]]
result = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
)
tm.assert_frame_equal(result, expected, check_names=False)
@pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
expected = df_ref[["B", "D"]]
expected.index = range(len(expected))
result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols)
tm.assert_frame_equal(result, expected, check_names=False)
def test_read_excel_without_slicing(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
expected = df_ref
result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
tm.assert_frame_equal(result, expected, check_names=False)
def test_usecols_excel_range_str(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
expected = df_ref[["C", "D"]]
result = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
)
tm.assert_frame_equal(result, expected, check_names=False)
def test_usecols_excel_range_str_invalid(self, read_ext):
msg = "Invalid column name: E1"
with pytest.raises(ValueError, match=msg):
pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1")
def test_index_col_label_error(self, read_ext):
msg = "list indices must be integers.*, not str"
with pytest.raises(TypeError, match=msg):
pd.read_excel(
"test1" + read_ext,
sheet_name="Sheet1",
index_col=["A"],
usecols=["A", "C"],
)
def test_index_col_empty(self, read_ext):
# see gh-9208
result = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"]
)
expected = DataFrame(
columns=["D", "E", "F"],
index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [None, 2])
def test_index_col_with_unnamed(self, read_ext, index_col):
# see gh-18792
result = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet4", index_col=index_col
)
expected = DataFrame(
[["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]
)
if index_col:
expected = expected.set_index(expected.columns[index_col])
tm.assert_frame_equal(result, expected)
def test_usecols_pass_non_existent_column(self, read_ext):
msg = (
"Usecols do not match columns, "
"columns expected but not found: " + r"\['E'\]"
)
with pytest.raises(ValueError, match=msg):
pd.read_excel("test1" + read_ext, usecols=["E"])
def test_usecols_wrong_type(self, read_ext):
msg = (
"'usecols' must either be list-like of "
"all strings, all unicode, all integers or a callable."
)
with pytest.raises(ValueError, match=msg):
pd.read_excel("test1" + read_ext, usecols=["E1", 0])
def test_excel_stop_iterator(self, read_ext):
parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1")
expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
tm.assert_frame_equal(parsed, expected)
def test_excel_cell_error_na(self, read_ext):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1")
expected = DataFrame([[np.nan]], columns=["Test"])
tm.assert_frame_equal(parsed, expected)
def test_excel_table(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
df2 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0
)
# TODO add index to file
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
df3 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1
)
tm.assert_frame_equal(df3, df1.iloc[:-1])
def test_reader_special_dtypes(self, read_ext):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
expected = DataFrame.from_dict(
{
"IntCol": [1, 2, -3, 4, 0],
"FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
"BoolCol": [True, False, True, True, False],
"StrCol": [1, 2, 3, 4, 5],
# GH5394 - this is why convert_float isn't vectorized
"Str2Col": ["a", 3, "c", "d", "e"],
"DateCol": [
datetime(2013, 10, 30),
datetime(2013, 10, 31),
datetime(1905, 1, 1),
datetime(2013, 12, 14),
datetime(2015, 3, 14),
],
},
)
basename = "test_types"
# should read in correctly and infer types
actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, expected)
# if not coercing number, then int comes in as float
float_expected = expected.copy()
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
actual = pd.read_excel(
basename + read_ext, sheet_name="Sheet1", convert_float=False
)
tm.assert_frame_equal(actual, float_expected)
# check setting Index (assuming xls and xlsx are the same here)
for icol, name in enumerate(expected.columns):
actual = pd.read_excel(
basename + read_ext, sheet_name="Sheet1", index_col=icol
)
exp = expected.set_index(name)
tm.assert_frame_equal(actual, exp)
# convert_float and converters should be different but both accepted
expected["StrCol"] = expected["StrCol"].apply(str)
actual = pd.read_excel(
basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
)
tm.assert_frame_equal(actual, expected)
no_convert_float = float_expected.copy()
no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
actual = pd.read_excel(
basename + read_ext,
sheet_name="Sheet1",
convert_float=False,
converters={"StrCol": str},
)
tm.assert_frame_equal(actual, no_convert_float)
# GH8212 - support for converters and missing values
def test_reader_converters(self, read_ext):
basename = "test_converters"
expected = DataFrame.from_dict(
{
"IntCol": [1, 2, -3, -1000, 0],
"FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005],
"BoolCol": ["Found", "Found", "Found", "Not found", "Found"],
"StrCol": ["1", np.nan, "3", "4", "5"],
}
)
converters = {
"IntCol": lambda x: int(x) if x != "" else -1000,
"FloatCol": lambda x: 10 * x if x else np.nan,
2: lambda x: "Found" if x != "" else "Not found",
3: lambda x: str(x) if x else "",
}
# should read in correctly and set types of single cells (not array
# dtypes)
actual = pd.read_excel(
basename + read_ext, sheet_name="Sheet1", converters=converters
)
tm.assert_frame_equal(actual, expected)
def test_reader_dtype(self, read_ext):
# GH 8212
basename = "testdtype"
actual = pd.read_excel(basename + read_ext)
expected = DataFrame(
{
"a": [1, 2, 3, 4],
"b": [2.5, 3.5, 4.5, 5.5],
"c": [1, 2, 3, 4],
"d": [1.0, 2.0, np.nan, 4.0],
}
).reindex(columns=["a", "b", "c", "d"])
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(
basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str}
)
expected["a"] = expected["a"].astype("float64")
expected["b"] = expected["b"].astype("float32")
expected["c"] = ["001", "002", "003", "004"]
tm.assert_frame_equal(actual, expected)
msg = "Unable to convert column d to type int64"
with pytest.raises(ValueError, match=msg):
pd.read_excel(basename + read_ext, dtype={"d": "int64"})
@pytest.mark.parametrize(
"dtype,expected",
[
(
None,
DataFrame(
{
"a": [1, 2, 3, 4],
"b": [2.5, 3.5, 4.5, 5.5],
"c": [1, 2, 3, 4],
"d": [1.0, 2.0, np.nan, 4.0],
}
),
),
(
{"a": "float64", "b": "float32", "c": str, "d": str},
DataFrame(
{
"a": Series([1, 2, 3, 4], dtype="float64"),
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
"c": ["001", "002", "003", "004"],
"d": ["1", "2", np.nan, "4"],
}
),
),
],
)
def test_reader_dtype_str(self, read_ext, dtype, expected):
# see gh-20377
basename = "testdtype"
actual = pd.read_excel(basename + read_ext, dtype=dtype)
tm.assert_frame_equal(actual, expected)
def test_reader_spaces(self, read_ext):
# see gh-32207
basename = "test_spaces"
actual = pd.read_excel(basename + read_ext)
expected = DataFrame(
{
"testcol": [
"this is great",
"4 spaces",
"1 trailing ",
" 1 leading",
"2 spaces multiple times",
]
}
)
tm.assert_frame_equal(actual, expected)
# gh-36122, gh-35802
@pytest.mark.parametrize(
"basename,expected",
[
("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})),
("gh-36122", DataFrame(columns=["got 2nd sa"])),
],
)
def test_read_excel_ods_nested_xml(self, read_ext, basename, expected):
# see gh-35802
engine = pd.read_excel.keywords["engine"]
if engine != "odf":
pytest.skip(f"Skipped for engine: {engine}")
actual = pd.read_excel(basename + read_ext)
tm.assert_frame_equal(actual, expected)
def test_reading_all_sheets(self, read_ext):
# Test reading all sheet names by setting sheet_name to None,
# Ensure a dict is returned.
# See PR #9450
basename = "test_multisheet"
dfs = pd.read_excel(basename + read_ext, sheet_name=None)
# ensure this is not alphabetical to test order preservation
expected_keys = ["Charlie", "Alpha", "Beta"]
tm.assert_contains_all(expected_keys, dfs.keys())
# Issue 9930
# Ensure sheet order is preserved
assert expected_keys == list(dfs.keys())
def test_reading_multiple_specific_sheets(self, read_ext):
# Test reading specific sheet names by specifying a mixed list
# of integers and strings, and confirm that duplicated sheet
# references (positions/names) are removed properly.
# Ensure a dict is returned
# See PR #9450
basename = "test_multisheet"
# Explicitly request duplicates. Only the set should be returned.
expected_keys = [2, "Charlie", "Charlie"]
dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys)
expected_keys = list(set(expected_keys))
tm.assert_contains_all(expected_keys, dfs.keys())
assert len(expected_keys) == len(dfs.keys())
def test_reading_all_sheets_with_blank(self, read_ext):
# Test reading all sheet names by setting sheet_name to None,
# In the case where some sheets are blank.
# Issue #11711
basename = "blank_with_header"
dfs = pd.read_excel(basename + read_ext, sheet_name=None)
expected_keys = ["Sheet1", "Sheet2", "Sheet3"]
tm.assert_contains_all(expected_keys, dfs.keys())
# GH6403
def test_read_excel_blank(self, read_ext):
actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, DataFrame())
def test_read_excel_blank_with_header(self, read_ext):
expected = DataFrame(columns=["col_1", "col_2"])
actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, expected)
def test_date_conversion_overflow(self, read_ext):
# GH 10001 : pandas.ExcelFile ignore parse_dates=False
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
expected = DataFrame(
[
[pd.Timestamp("2016-03-12"), "Marc Johnson"],
[pd.Timestamp("2016-03-16"), "Jack Black"],
[1e20, "Timothy Brown"],
],
columns=["DateColWithBigInt", "StringCol"],
)
if pd.read_excel.keywords["engine"] == "openpyxl":
pytest.xfail("Maybe not supported by openpyxl")
if pd.read_excel.keywords["engine"] is None:
# GH 35029
pytest.xfail("Defaults to openpyxl, maybe not supported")
result = pd.read_excel("testdateoverflow" + read_ext)
tm.assert_frame_equal(result, expected)
def test_sheet_name(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
filename = "test1"
sheet_name = "Sheet1"
if pd.read_excel.keywords["engine"] == "openpyxl":
pytest.xfail("Maybe not supported by openpyxl")
df1 = pd.read_excel(
filename + read_ext, sheet_name=sheet_name, index_col=0
) # doc
df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
def test_excel_read_buffer(self, read_ext):
pth = "test1" + read_ext
expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0)
with open(pth, "rb") as f:
actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
tm.assert_frame_equal(expected, actual)
def test_bad_engine_raises(self, read_ext):
bad_engine = "foo"
with pytest.raises(ValueError, match="Unknown engine: foo"):
pd.read_excel("", engine=bad_engine)
@pytest.mark.parametrize(
"sheet_name",
[3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
)
def test_bad_sheetname_raises(self, read_ext, sheet_name):
# GH 39250
msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
with pytest.raises(ValueError, match=msg):
pd.read_excel("blank" + read_ext, sheet_name=sheet_name)
def test_missing_file_raises(self, read_ext):
bad_file = f"foo{read_ext}"
# CI tests with zh_CN.utf8, translates to "No such file or directory"
with pytest.raises(
FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)"
):
pd.read_excel(bad_file)
def test_corrupt_bytes_raises(self, read_ext, engine):
bad_stream = b"foo"
if engine is None or engine == "xlrd":
error = ValueError
msg = "File is not a recognized excel file"
else:
error = BadZipFile
msg = "File is not a zip file"
with pytest.raises(error, match=msg):
pd.read_excel(bad_stream)
@tm.network
def test_read_from_http_url(self, read_ext):
url = (
"https://raw.githubusercontent.com/pandas-dev/pandas/master/"
"pandas/tests/io/data/excel/test1" + read_ext
)
url_table = pd.read_excel(url)
local_table = pd.read_excel("test1" + read_ext)
tm.assert_frame_equal(url_table, local_table)
@td.skip_if_not_us_locale
def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
# Bucket "pandas-test" created in tests/io/conftest.py
with open("test1" + read_ext, "rb") as f:
s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
url = "s3://pandas-test/test1" + read_ext
url_table = pd.read_excel(url, storage_options=s3so)
local_table = pd.read_excel("test1" + read_ext)
tm.assert_frame_equal(url_table, local_table)
def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
# GH 38788
# Bucket "pandas-test" created in tests/io/conftest.py
with open("test1" + read_ext, "rb") as f:
s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
import s3fs
s3 = s3fs.S3FileSystem(**s3so)
with s3.open("s3://pandas-test/test1" + read_ext) as f:
url_table = pd.read_excel(f)
local_table = pd.read_excel("test1" + read_ext)
tm.assert_frame_equal(url_table, local_table)
@pytest.mark.slow
def test_read_from_file_url(self, read_ext, datapath):
# FILE
localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext)
local_table = pd.read_excel(localtable)
try:
url_table = pd.read_excel("file://localhost/" + localtable)
except URLError:
# fails on some systems
import platform
platform_info = " ".join(platform.uname()).strip()
pytest.skip(f"failing on {platform_info}")
tm.assert_frame_equal(url_table, local_table)
def test_read_from_pathlib_path(self, read_ext):
# GH12655
from pathlib import Path
str_path = "test1" + read_ext
expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
path_obj = Path("test1" + read_ext)
actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
tm.assert_frame_equal(expected, actual)
@td.skip_if_no("py.path")
@td.check_file_leaks
def test_read_from_py_localpath(self, read_ext):
# GH12655
from py.path import local as LocalPath
str_path = os.path.join("test1" + read_ext)
expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
path_obj = LocalPath().join("test1" + read_ext)
actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
tm.assert_frame_equal(expected, actual)
@td.check_file_leaks
def test_close_from_py_localpath(self, read_ext):
# GH31467
str_path = os.path.join("test1" + read_ext)
with open(str_path, "rb") as f:
x = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
del x
# should not throw an exception because the passed file was closed
f.read()
def test_reader_seconds(self, read_ext):
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
# Test reading times with and without milliseconds. GH5945.
expected = DataFrame.from_dict(
{
"Time": [
time(1, 2, 3),
time(2, 45, 56, 100000),
time(4, 29, 49, 200000),
time(6, 13, 42, 300000),
time(7, 57, 35, 400000),
time(9, 41, 28, 500000),
time(11, 25, 21, 600000),
time(13, 9, 14, 700000),
time(14, 53, 7, 800000),
time(16, 37, 0, 900000),
time(18, 20, 54),
]
}
)
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, expected)
def test_read_excel_multiindex(self, read_ext):
# see gh-4679
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
mi_file = "testmultiindex" + read_ext
# "mi_column" sheet
expected = DataFrame(
[
[1, 2.5, pd.Timestamp("2015-01-01"), True],
[2, 3.5, pd.Timestamp("2015-01-02"), False],
[3, 4.5, pd.Timestamp("2015-01-03"), False],
[4, 5.5, pd.Timestamp("2015-01-04"), True],
],
columns=mi,
)
actual = pd.read_excel(
mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
)
tm.assert_frame_equal(actual, expected)
# "mi_index" sheet
expected.index = mi
expected.columns = ["a", "b", "c", "d"]
actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
tm.assert_frame_equal(actual, expected, check_names=False)
# "both" sheet
expected.columns = mi
actual = pd.read_excel(
mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
)
tm.assert_frame_equal(actual, expected, check_names=False)
# "mi_index_name" sheet
expected.columns = ["a", "b", "c", "d"]
expected.index = mi.set_names(["ilvl1", "ilvl2"])
actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1])
tm.assert_frame_equal(actual, expected)
# "mi_column_name" sheet
expected.index = list(range(4))
expected.columns = mi.set_names(["c1", "c2"])
actual = pd.read_excel(
mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0
)
tm.assert_frame_equal(actual, expected)
# see gh-11317
# "name_with_int" sheet
expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])
actual = pd.read_excel(
mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1]
)
tm.assert_frame_equal(actual, expected)
# "both_name" sheet
expected.columns = mi.set_names(["c1", "c2"])
expected.index = mi.set_names(["ilvl1", "ilvl2"])
actual = pd.read_excel(
mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1]
)
tm.assert_frame_equal(actual, expected)
# "both_skiprows" sheet
actual = pd.read_excel(
mi_file,
sheet_name="both_name_skiprows",
index_col=[0, 1],
header=[0, 1],
skiprows=2,
)
tm.assert_frame_equal(actual, expected)
def test_read_excel_multiindex_header_only(self, read_ext):
# see gh-11733.
#
# Don't try to parse a header name if there isn't one.
mi_file = "testmultiindex" + read_ext
result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1])
exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
tm.assert_frame_equal(result, expected)
def test_excel_old_index_format(self, read_ext):
# see gh-4679
filename = "test_index_name_pre17" + read_ext
# We detect headers to determine if index names exist, so
# that "index" name in the "names" version of the data will
# now be interpreted as rows that include null data.
data = np.array(
[
[None, None, None, None, None],
["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
]
)
columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
mi = MultiIndex(
levels=[
["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
],
codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
names=[None, None],
)
si = Index(
["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
)
expected = DataFrame(data, index=si, columns=columns)
actual = pd.read_excel(filename, sheet_name="single_names", index_col=0)
tm.assert_frame_equal(actual, expected)
expected.index = mi
actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1])
tm.assert_frame_equal(actual, expected)
# The analogous versions of the "names" version data
# where there are explicitly no names for the indices.
data = np.array(
[
["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
]
)
columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
mi = MultiIndex(
levels=[
["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
],
codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
names=[None, None],
)
si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None)
expected = DataFrame(data, index=si, columns=columns)
actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0)
tm.assert_frame_equal(actual, expected)
expected.index = mi
actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
tm.assert_frame_equal(actual, expected, check_names=False)
def test_read_excel_bool_header_arg(self, read_ext):
# GH 6114
msg = "Passing a bool to header is invalid"
for arg in [True, False]:
with pytest.raises(TypeError, match=msg):
pd.read_excel("test1" + read_ext, header=arg)
def test_read_excel_skiprows(self, read_ext):
# GH 4903
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
actual = pd.read_excel(
"testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
)
expected = DataFrame(
[
[1, 2.5, pd.Timestamp("2015-01-01"), True],
[2, 3.5, pd.Timestamp("2015-01-02"), False],
[3, 4.5, pd.Timestamp("2015-01-03"), False],
[4, 5.5, pd.Timestamp("2015-01-04"), True],
],
columns=["a", "b", "c", "d"],
)
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(
"testskiprows" + read_ext,
sheet_name="skiprows_list",
skiprows=np.array([0, 2]),
)
tm.assert_frame_equal(actual, expected)
# GH36435
actual = pd.read_excel(
"testskiprows" + read_ext,
sheet_name="skiprows_list",
skiprows=lambda x: x in [0, 2],
)
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(
"testskiprows" + read_ext,
sheet_name="skiprows_list",
skiprows=3,
names=["a", "b", "c", "d"],
)
expected = DataFrame(
[
# [1, 2.5, pd.Timestamp("2015-01-01"), True],
[2, 3.5, pd.Timestamp("2015-01-02"), False],
[3, 4.5, pd.Timestamp("2015-01-03"), False],
[4, 5.5, pd.Timestamp("2015-01-04"), True],
],
columns=["a", "b", "c", "d"],
)
tm.assert_frame_equal(actual, expected)
def test_read_excel_nrows(self, read_ext):
# GH 16645
num_rows_to_pull = 5
actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
expected = pd.read_excel("test1" + read_ext)
expected = expected[:num_rows_to_pull]
tm.assert_frame_equal(actual, expected)
def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
# GH 16645
expected = pd.read_excel("test1" + read_ext)
num_records_in_file = len(expected)
num_rows_to_pull = num_records_in_file + 10
actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
tm.assert_frame_equal(actual, expected)
def test_read_excel_nrows_non_integer_parameter(self, read_ext):
# GH 16645
msg = "'nrows' must be an integer >=0"
with pytest.raises(ValueError, match=msg):
pd.read_excel("test1" + read_ext, nrows="5")
def test_read_excel_squeeze(self, read_ext):
# GH 12157
f = "test_squeeze" + read_ext
actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)
def test_deprecated_kwargs(self, read_ext):
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
pd.read_excel("test1" + read_ext, "Sheet1", 0)
pd.read_excel("test1" + read_ext)
def test_no_header_with_list_index_col(self, read_ext):
# GH 31783
file_name = "testmultiindex" + read_ext
data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)]
idx = MultiIndex.from_tuples(
[("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1)
)
expected = DataFrame(data, index=idx, columns=(2, 3))
result = pd.read_excel(
file_name, sheet_name="index_col_none", index_col=[0, 1], header=None
)
tm.assert_frame_equal(expected, result)
class TestExcelFileRead:
@pytest.fixture(autouse=True)
def cd_and_set_engine(self, engine, datapath, monkeypatch):
"""
Change directory and set engine for ExcelFile objects.
"""
func = partial(pd.ExcelFile, engine=engine)
monkeypatch.chdir(datapath("io", "data", "excel"))
monkeypatch.setattr(pd, "ExcelFile", func)
def test_excel_passes_na(self, read_ext):
with pd.ExcelFile("test4" + read_ext) as excel:
parsed = pd.read_excel(
excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
)
expected = DataFrame(
[["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"]
)
tm.assert_frame_equal(parsed, expected)
with pd.ExcelFile("test4" + read_ext) as excel:
parsed = pd.read_excel(
excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
)
expected = DataFrame(
[[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
)
tm.assert_frame_equal(parsed, expected)
# 13967
with pd.ExcelFile("test5" + read_ext) as excel:
parsed = pd.read_excel(
excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
)
expected = DataFrame(
[["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"]
)
tm.assert_frame_equal(parsed, expected)
with pd.ExcelFile("test5" + read_ext) as excel:
parsed = pd.read_excel(
excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
)
expected = DataFrame(
[[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
)
tm.assert_frame_equal(parsed, expected)
@pytest.mark.parametrize("na_filter", [None, True, False])
def test_excel_passes_na_filter(self, read_ext, na_filter):
# gh-25453
kwargs = {}
if na_filter is not None:
kwargs["na_filter"] = na_filter
with pd.ExcelFile("test5" + read_ext) as excel:
parsed = pd.read_excel(
excel,
sheet_name="Sheet1",
keep_default_na=True,
na_values=["apple"],
**kwargs,
)
if na_filter is False:
expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]]
else:
expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]]
expected = DataFrame(expected, columns=["Test"])
tm.assert_frame_equal(parsed, expected)
def test_excel_table_sheet_by_index(self, read_ext, df_ref):
# For some reason pd.read_excel has no attribute 'keywords' here.
# Skipping based on read_ext instead.
if read_ext == ".xlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
with pd.ExcelFile("test1" + read_ext) as excel:
df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0)
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
with pd.ExcelFile("test1" + read_ext) as excel:
df1 = excel.parse(0, index_col=0)
df2 = excel.parse(1, skiprows=[1], index_col=0)
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
with pd.ExcelFile("test1" + read_ext) as excel:
df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1)
tm.assert_frame_equal(df3, df1.iloc[:-1])
with pd.ExcelFile("test1" + read_ext) as excel:
df3 = excel.parse(0, index_col=0, skipfooter=1)
tm.assert_frame_equal(df3, df1.iloc[:-1])
def test_sheet_name(self, read_ext, df_ref):
# For some reason pd.read_excel has no attribute 'keywords' here.
# Skipping based on read_ext instead.
if read_ext == ".xlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
filename = "test1"
sheet_name = "Sheet1"
with pd.ExcelFile(filename + read_ext) as excel:
df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc
with pd.ExcelFile(filename + read_ext) as excel:
df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)
tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
@pytest.mark.parametrize(
"sheet_name",
[3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
)
def test_bad_sheetname_raises(self, read_ext, sheet_name):
# GH 39250
msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
with pytest.raises(ValueError, match=msg):
with pd.ExcelFile("blank" + read_ext) as excel:
excel.parse(sheet_name=sheet_name)
def test_excel_read_buffer(self, engine, read_ext):
pth = "test1" + read_ext
expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine)
with open(pth, "rb") as f:
with pd.ExcelFile(f) as xls:
actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0)
tm.assert_frame_equal(expected, actual)
def test_reader_closes_file(self, engine, read_ext):
with open("test1" + read_ext, "rb") as f:
with pd.ExcelFile(f) as xlsx:
# parses okay
pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine)
assert f.closed
def test_conflicting_excel_engines(self, read_ext):
# GH 26566
msg = "Engine should not be specified when passing an ExcelFile"
with pd.ExcelFile("test1" + read_ext) as xl:
with pytest.raises(ValueError, match=msg):
pd.read_excel(xl, engine="foo")
def test_excel_read_binary(self, engine, read_ext):
# GH 15914
expected = pd.read_excel("test1" + read_ext, engine=engine)
with open("test1" + read_ext, "rb") as f:
data = f.read()
actual = pd.read_excel(data, engine=engine)
tm.assert_frame_equal(expected, actual)
def test_excel_read_binary_via_read_excel(self, read_ext, engine):
# GH 38424
if read_ext == ".xlsb" and engine == "pyxlsb":
pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
with open("test1" + read_ext, "rb") as f:
result = pd.read_excel(f)
expected = pd.read_excel("test1" + read_ext, engine=engine)
tm.assert_frame_equal(result, expected)
@pytest.mark.skipif(
xlrd_version is not None and xlrd_version >= "2",
reason="xlrd no longer supports xlsx",
)
def test_excel_high_surrogate(self, engine):
# GH 23809
expected = DataFrame(["\udc88"], columns=["Column1"])
# should not produce a segmentation violation
actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
tm.assert_frame_equal(expected, actual)
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
def test_header_with_index_col(self, engine, filename):
# GH 33476
idx = Index(["Z"], name="I2")
cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64")
result = pd.read_excel(
filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
)
tm.assert_frame_equal(expected, result)
def test_read_datetime_multiindex(self, engine, read_ext):
# GH 34748
if engine == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
f = "test_datetime_mi" + read_ext
with pd.ExcelFile(f) as excel:
actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
expected_column_index = MultiIndex.from_tuples(
[(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],
names=[
pd.to_datetime("02/29/2020").to_pydatetime(),
pd.to_datetime("03/01/2020").to_pydatetime(),
],
)
expected = DataFrame([], columns=expected_column_index)
tm.assert_frame_equal(expected, actual)