520 lines
18 KiB
Python
520 lines
18 KiB
Python
import numpy as np
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
from pandas import DataFrame, Series, Timestamp
|
|
import pandas._testing as tm
|
|
|
|
|
|
class TestDataFrameQuantile:
|
|
@pytest.mark.parametrize(
    "df,expected",
    [
        (
            # two integer sparse columns -> one median per column
            DataFrame(
                {
                    0: Series(pd.arrays.SparseArray([1, 2])),
                    1: Series(pd.arrays.SparseArray([3, 4])),
                }
            ),
            Series([1.5, 3.5], name=0.5),
        ),
        (
            # single float sparse column containing a missing value
            DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")),
            Series([1.0], name=0.5),
        ),
    ],
)
def test_quantile_sparse(self, df, expected):
    """Default (median) quantile of sparse-backed frames matches ``expected``."""
    # GH#17198
    # GH#24600
    tm.assert_series_equal(df.quantile(), expected)
|
|
|
|
def test_quantile(self, datetime_frame):
|
|
from numpy import percentile
|
|
|
|
df = datetime_frame
|
|
q = df.quantile(0.1, axis=0)
|
|
assert q["A"] == percentile(df["A"], 10)
|
|
tm.assert_index_equal(q.index, df.columns)
|
|
|
|
q = df.quantile(0.9, axis=1)
|
|
assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90)
|
|
tm.assert_index_equal(q.index, df.index)
|
|
|
|
# test degenerate case
|
|
q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0)
|
|
assert np.isnan(q["x"]) and np.isnan(q["y"])
|
|
|
|
# non-numeric exclusion
|
|
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
|
|
rs = df.quantile(0.5)
|
|
xp = df.median().rename(0.5)
|
|
tm.assert_series_equal(rs, xp)
|
|
|
|
# axis
|
|
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
|
result = df.quantile(0.5, axis=1)
|
|
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.quantile([0.5, 0.75], axis=1)
|
|
expected = DataFrame(
|
|
{1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
|
|
)
|
|
tm.assert_frame_equal(result, expected, check_index_type=True)
|
|
|
|
# We may want to break API in the future to change this
|
|
# so that we exclude non-numeric along the same axis
|
|
# See GH #7312
|
|
df = DataFrame([[1, 2, 3], ["a", "b", 4]])
|
|
result = df.quantile(0.5, axis=1)
|
|
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_quantile_date_range(self):
|
|
# GH 2460
|
|
|
|
dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
|
|
ser = Series(dti)
|
|
df = DataFrame(ser)
|
|
|
|
result = df.quantile(numeric_only=False)
|
|
expected = Series(
|
|
["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
|
|
)
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_quantile_axis_mixed(self):
    """Row-wise quantile on mixed dtypes: numeric-only works, full frame raises."""
    # mixed on axis=1
    data = {
        "A": [1, 2, 3],
        "B": [2.0, 3.0, 4.0],
        "C": pd.date_range("20130101", periods=3),
        "D": ["foo", "bar", "baz"],
    }
    df = DataFrame(data)

    # with the default arguments only the numeric columns A and B contribute
    tm.assert_series_equal(
        df.quantile(0.5, axis=1), Series([1.5, 2.5, 3.5], name=0.5)
    )

    # must raise
    msg = "'<' not supported between instances of 'Timestamp' and 'float'"
    with pytest.raises(TypeError, match=msg):
        df.quantile(0.5, axis=1, numeric_only=False)
|
|
|
|
def test_quantile_axis_parameter(self):
    """Integer and string spellings of ``axis`` agree; unknown axes raise."""
    # GH 9543/9544
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

    # axis=0 and its "index" alias
    down_rows = df.quantile(0.5, axis=0)
    tm.assert_series_equal(
        down_rows, Series([2.0, 3.0], index=["A", "B"], name=0.5)
    )
    tm.assert_series_equal(down_rows, df.quantile(0.5, axis="index"))

    # axis=1 and its "columns" alias
    across_cols = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    for axis in (1, "columns"):
        tm.assert_series_equal(df.quantile(0.5, axis=axis), across_cols)

    # anything that is not a recognised axis is rejected
    rejected = [
        (-1, "No axis named -1 for object type DataFrame"),
        ("column", "No axis named column for object type DataFrame"),
    ]
    for bad_axis, msg in rejected:
        with pytest.raises(ValueError, match=msg):
            df.quantile(0.1, axis=bad_axis)
|
|
|
|
def test_quantile_interpolation(self):
|
|
# see gh-10174
|
|
|
|
# interpolation method other than default linear
|
|
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
|
result = df.quantile(0.5, axis=1, interpolation="nearest")
|
|
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# cross-check interpolation=nearest results in original dtype
|
|
exp = np.percentile(
|
|
np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="nearest"
|
|
)
|
|
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# float
|
|
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3])
|
|
result = df.quantile(0.5, axis=1, interpolation="nearest")
|
|
expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5)
|
|
tm.assert_series_equal(result, expected)
|
|
exp = np.percentile(
|
|
np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
|
|
0.5,
|
|
axis=0,
|
|
interpolation="nearest",
|
|
)
|
|
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# axis
|
|
result = df.quantile([0.5, 0.75], axis=1, interpolation="lower")
|
|
expected = DataFrame(
|
|
{1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# test degenerate case
|
|
df = DataFrame({"x": [], "y": []})
|
|
q = df.quantile(0.1, axis=0, interpolation="higher")
|
|
assert np.isnan(q["x"]) and np.isnan(q["y"])
|
|
|
|
# multi
|
|
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
|
result = df.quantile([0.25, 0.5], interpolation="midpoint")
|
|
|
|
# https://github.com/numpy/numpy/issues/7163
|
|
expected = DataFrame(
|
|
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
|
index=[0.25, 0.5],
|
|
columns=["a", "b", "c"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_quantile_interpolation_datetime(self, datetime_frame):
|
|
# see gh-10174
|
|
|
|
# interpolation = linear (default case)
|
|
df = datetime_frame
|
|
q = df.quantile(0.1, axis=0, interpolation="linear")
|
|
assert q["A"] == np.percentile(df["A"], 10)
|
|
|
|
def test_quantile_interpolation_int(self, int_frame):
|
|
# see gh-10174
|
|
|
|
df = int_frame
|
|
# interpolation = linear (default case)
|
|
q = df.quantile(0.1)
|
|
assert q["A"] == np.percentile(df["A"], 10)
|
|
|
|
# test with and without interpolation keyword
|
|
q1 = df.quantile(0.1, axis=0, interpolation="linear")
|
|
assert q1["A"] == np.percentile(df["A"], 10)
|
|
tm.assert_series_equal(q, q1)
|
|
|
|
def test_quantile_multi(self):
|
|
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
|
result = df.quantile([0.25, 0.5])
|
|
expected = DataFrame(
|
|
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
|
index=[0.25, 0.5],
|
|
columns=["a", "b", "c"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# axis = 1
|
|
result = df.quantile([0.25, 0.5], axis=1)
|
|
expected = DataFrame(
|
|
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2]
|
|
)
|
|
|
|
# empty
|
|
result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0)
|
|
expected = DataFrame(
|
|
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_quantile_datetime(self):
|
|
df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]})
|
|
|
|
# exclude datetime
|
|
result = df.quantile(0.5)
|
|
expected = Series([2.5], index=["b"])
|
|
|
|
# datetime
|
|
result = df.quantile(0.5, numeric_only=False)
|
|
expected = Series(
|
|
[Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# datetime w/ multi
|
|
result = df.quantile([0.5], numeric_only=False)
|
|
expected = DataFrame(
|
|
[[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# axis = 1
|
|
df["c"] = pd.to_datetime(["2011", "2012"])
|
|
result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
|
|
expected = Series(
|
|
[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
|
|
index=[0, 1],
|
|
name=0.5,
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
|
|
expected = DataFrame(
|
|
[[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
|
|
index=[0.5],
|
|
columns=[0, 1],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# empty when numeric_only=True
|
|
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
|
# result = df[['a', 'c']].quantile(.5)
|
|
# result = df[['a', 'c']].quantile([.5])
|
|
|
|
def test_quantile_invalid(self, datetime_frame):
    """Quantiles outside [0, 1], scalar or inside a list, raise ValueError."""
    out_of_range = (-1, 2, [0.5, -1], [0.5, 2])
    msg = "percentiles should all be in the interval \\[0, 1\\]"
    for bad_q in out_of_range:
        with pytest.raises(ValueError, match=msg):
            datetime_frame.quantile(bad_q)
|
|
|
|
def test_quantile_box(self):
|
|
df = DataFrame(
|
|
{
|
|
"A": [
|
|
Timestamp("2011-01-01"),
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-03"),
|
|
],
|
|
"B": [
|
|
Timestamp("2011-01-01", tz="US/Eastern"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
Timestamp("2011-01-03", tz="US/Eastern"),
|
|
],
|
|
"C": [
|
|
pd.Timedelta("1 days"),
|
|
pd.Timedelta("2 days"),
|
|
pd.Timedelta("3 days"),
|
|
],
|
|
}
|
|
)
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
|
|
exp = Series(
|
|
[
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
pd.Timedelta("2 days"),
|
|
],
|
|
name=0.5,
|
|
index=["A", "B", "C"],
|
|
)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = DataFrame(
|
|
[
|
|
[
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
pd.Timedelta("2 days"),
|
|
]
|
|
],
|
|
index=[0.5],
|
|
columns=["A", "B", "C"],
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# DatetimeBlock may be consolidated and contain NaT in different loc
|
|
df = DataFrame(
|
|
{
|
|
"A": [
|
|
Timestamp("2011-01-01"),
|
|
pd.NaT,
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-03"),
|
|
],
|
|
"a": [
|
|
Timestamp("2011-01-01"),
|
|
Timestamp("2011-01-02"),
|
|
pd.NaT,
|
|
Timestamp("2011-01-03"),
|
|
],
|
|
"B": [
|
|
Timestamp("2011-01-01", tz="US/Eastern"),
|
|
pd.NaT,
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
Timestamp("2011-01-03", tz="US/Eastern"),
|
|
],
|
|
"b": [
|
|
Timestamp("2011-01-01", tz="US/Eastern"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
pd.NaT,
|
|
Timestamp("2011-01-03", tz="US/Eastern"),
|
|
],
|
|
"C": [
|
|
pd.Timedelta("1 days"),
|
|
pd.Timedelta("2 days"),
|
|
pd.Timedelta("3 days"),
|
|
pd.NaT,
|
|
],
|
|
"c": [
|
|
pd.NaT,
|
|
pd.Timedelta("1 days"),
|
|
pd.Timedelta("2 days"),
|
|
pd.Timedelta("3 days"),
|
|
],
|
|
},
|
|
columns=list("AaBbCc"),
|
|
)
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
exp = Series(
|
|
[
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
pd.Timedelta("2 days"),
|
|
pd.Timedelta("2 days"),
|
|
],
|
|
name=0.5,
|
|
index=list("AaBbCc"),
|
|
)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = DataFrame(
|
|
[
|
|
[
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-02"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
Timestamp("2011-01-02", tz="US/Eastern"),
|
|
pd.Timedelta("2 days"),
|
|
pd.Timedelta("2 days"),
|
|
]
|
|
],
|
|
index=[0.5],
|
|
columns=list("AaBbCc"),
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_quantile_nan(self):
|
|
|
|
# GH 14357 - float block where some cols have missing values
|
|
df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
|
|
df.iloc[-1, 1] = np.nan
|
|
|
|
res = df.quantile(0.5)
|
|
exp = Series([3.0, 2.5], index=["a", "b"], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5, 0.75])
|
|
exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
res = df.quantile(0.5, axis=1)
|
|
exp = Series(np.arange(1.0, 6.0), name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5, 0.75], axis=1)
|
|
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# full-nan column
|
|
df["b"] = np.nan
|
|
|
|
res = df.quantile(0.5)
|
|
exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5, 0.75])
|
|
exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_quantile_nat(self):
|
|
|
|
# full NaT column
|
|
df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]})
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
exp = Series([pd.NaT], index=["a"], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = DataFrame({"a": [pd.NaT]}, index=[0.5])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# mixed non-null / full null column
|
|
df = DataFrame(
|
|
{
|
|
"a": [
|
|
Timestamp("2012-01-01"),
|
|
Timestamp("2012-01-02"),
|
|
Timestamp("2012-01-03"),
|
|
],
|
|
"b": [pd.NaT, pd.NaT, pd.NaT],
|
|
}
|
|
)
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = DataFrame(
|
|
[[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"]
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_quantile_empty_no_rows(self):
|
|
|
|
# floats
|
|
df = DataFrame(columns=["a", "b"], dtype="float64")
|
|
|
|
res = df.quantile(0.5)
|
|
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5])
|
|
exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
|
# res = df.quantile(0.5, axis=1)
|
|
# res = df.quantile([0.5], axis=1)
|
|
|
|
# ints
|
|
df = DataFrame(columns=["a", "b"], dtype="int64")
|
|
|
|
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
|
# res = df.quantile(0.5)
|
|
|
|
# datetimes
|
|
df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
|
|
|
|
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
|
|
# res = df.quantile(0.5, numeric_only=False)
|
|
|
|
def test_quantile_empty_no_columns(self):
|
|
# GH#23925 _get_numeric_data may drop all columns
|
|
df = DataFrame(pd.date_range("1/1/18", periods=5))
|
|
df.columns.name = "captain tightpants"
|
|
result = df.quantile(0.5)
|
|
expected = Series([], index=[], name=0.5, dtype=np.float64)
|
|
expected.index.name = "captain tightpants"
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.quantile([0.5])
|
|
expected = DataFrame([], index=[0.5], columns=[])
|
|
expected.columns.name = "captain tightpants"
|
|
tm.assert_frame_equal(result, expected)
|