3RNN/Lib/site-packages/pandas/tests/interchange/test_spec_conformance.py
2024-05-26 19:49:15 +02:00

176 lines
5.5 KiB
Python

"""
A verbatim copy (vendored) of the spec tests.
Taken from https://github.com/data-apis/dataframe-api
"""
import ctypes
import math
import pytest
import pandas as pd
@pytest.fixture
def df_from_dict():
def maker(dct, is_categorical=False):
df = pd.DataFrame(dct)
return df.astype("category") if is_categorical else df
return maker
@pytest.mark.parametrize(
"test_data",
[
{"a": ["foo", "bar"], "b": ["baz", "qux"]},
{"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]},
{"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]},
],
ids=["str_data", "float_data", "int_data"],
)
def test_only_one_dtype(test_data, df_from_dict):
columns = list(test_data.keys())
df = df_from_dict(test_data)
dfX = df.__dataframe__()
column_size = len(test_data[columns[0]])
for column in columns:
null_count = dfX.get_column_by_name(column).null_count
assert null_count == 0
assert isinstance(null_count, int)
assert dfX.get_column_by_name(column).size() == column_size
assert dfX.get_column_by_name(column).offset == 0
def test_mixed_dtypes(df_from_dict):
df = df_from_dict(
{
"a": [1, 2, 3], # dtype kind INT = 0
"b": [3, 4, 5], # dtype kind INT = 0
"c": [1.5, 2.5, 3.5], # dtype kind FLOAT = 2
"d": [9, 10, 11], # dtype kind INT = 0
"e": [True, False, True], # dtype kind BOOLEAN = 20
"f": ["a", "", "c"], # dtype kind STRING = 21
}
)
dfX = df.__dataframe__()
# for meanings of dtype[0] see the spec; we cannot import the spec here as this
# file is expected to be vendored *anywhere*;
# values for dtype[0] are explained above
columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21}
for column, kind in columns.items():
colX = dfX.get_column_by_name(column)
assert colX.null_count == 0
assert isinstance(colX.null_count, int)
assert colX.size() == 3
assert colX.offset == 0
assert colX.dtype[0] == kind
assert dfX.get_column_by_name("c").dtype[1] == 64
def test_na_float(df_from_dict):
df = df_from_dict({"a": [1.0, math.nan, 2.0]})
dfX = df.__dataframe__()
colX = dfX.get_column_by_name("a")
assert colX.null_count == 1
assert isinstance(colX.null_count, int)
def test_noncategorical(df_from_dict):
df = df_from_dict({"a": [1, 2, 3]})
dfX = df.__dataframe__()
colX = dfX.get_column_by_name("a")
with pytest.raises(TypeError, match=".*categorical.*"):
colX.describe_categorical
def test_categorical(df_from_dict):
df = df_from_dict(
{"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]},
is_categorical=True,
)
colX = df.__dataframe__().get_column_by_name("weekday")
categorical = colX.describe_categorical
assert isinstance(categorical["is_ordered"], bool)
assert isinstance(categorical["is_dictionary"], bool)
def test_dataframe(df_from_dict):
df = df_from_dict(
{"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}
)
dfX = df.__dataframe__()
assert dfX.num_columns() == 3
assert dfX.num_rows() == 3
assert dfX.num_chunks() == 1
assert list(dfX.column_names()) == ["x", "y", "z"]
assert list(dfX.select_columns((0, 2)).column_names()) == list(
dfX.select_columns_by_name(("x", "z")).column_names()
)
@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
def test_df_get_chunks(size, n_chunks, df_from_dict):
df = df_from_dict({"x": list(range(size))})
dfX = df.__dataframe__()
chunks = list(dfX.get_chunks(n_chunks))
assert len(chunks) == n_chunks
assert sum(chunk.num_rows() for chunk in chunks) == size
@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
def test_column_get_chunks(size, n_chunks, df_from_dict):
df = df_from_dict({"x": list(range(size))})
dfX = df.__dataframe__()
chunks = list(dfX.get_column(0).get_chunks(n_chunks))
assert len(chunks) == n_chunks
assert sum(chunk.size() for chunk in chunks) == size
def test_get_columns(df_from_dict):
df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
dfX = df.__dataframe__()
for colX in dfX.get_columns():
assert colX.size() == 2
assert colX.num_chunks() == 1
# for meanings of dtype[0] see the spec; we cannot import the spec here as this
# file is expected to be vendored *anywhere*
assert dfX.get_column(0).dtype[0] == 0 # INT
assert dfX.get_column(1).dtype[0] == 2 # FLOAT
def test_buffer(df_from_dict):
arr = [0, 1, -1]
df = df_from_dict({"a": arr})
dfX = df.__dataframe__()
colX = dfX.get_column(0)
bufX = colX.get_buffers()
dataBuf, dataDtype = bufX["data"]
assert dataBuf.bufsize > 0
assert dataBuf.ptr != 0
device, _ = dataBuf.__dlpack_device__()
# for meanings of dtype[0] see the spec; we cannot import the spec here as this
# file is expected to be vendored *anywhere*
assert dataDtype[0] == 0 # INT
if device == 1: # CPU-only as we're going to directly read memory here
bitwidth = dataDtype[1]
ctype = {
8: ctypes.c_int8,
16: ctypes.c_int16,
32: ctypes.c_int32,
64: ctypes.c_int64,
}[bitwidth]
for idx, truth in enumerate(arr):
val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
assert val == truth, f"Buffer at index {idx} mismatch"