200 lines
6.7 KiB
Python
200 lines
6.7 KiB
Python
import numpy as np
|
|
|
|
from pandas.core.dtypes.dtypes import CategoricalDtype
|
|
|
|
import pandas as pd
|
|
from pandas import Categorical, DataFrame, Series
|
|
import pandas._testing as tm
|
|
|
|
|
|
class TestCategoricalConcat:
|
|
def test_categorical_concat(self, sort):
|
|
# See GH 10177
|
|
df1 = DataFrame(
|
|
np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
|
|
)
|
|
|
|
df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])
|
|
|
|
cat_values = ["one", "one", "two", "one", "two", "two", "one"]
|
|
df2["h"] = Series(Categorical(cat_values))
|
|
|
|
res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
|
|
exp = DataFrame(
|
|
{
|
|
"a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
|
|
"b": [
|
|
1,
|
|
4,
|
|
7,
|
|
10,
|
|
13,
|
|
16,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
],
|
|
"c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
|
|
"h": [None] * 6 + cat_values,
|
|
}
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_categorical_concat_dtypes(self):
|
|
|
|
# GH8143
|
|
index = ["cat", "obj", "num"]
|
|
cat = Categorical(["a", "b", "c"])
|
|
obj = Series(["a", "b", "c"])
|
|
num = Series([1, 2, 3])
|
|
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
|
|
|
|
result = df.dtypes == "object"
|
|
expected = Series([False, True, False], index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.dtypes == "int64"
|
|
expected = Series([False, False, True], index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.dtypes == "category"
|
|
expected = Series([True, False, False], index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_concat_categoricalindex(self):
|
|
# GH 16111, categories that aren't lexsorted
|
|
categories = [9, 0, 1, 2, 3]
|
|
|
|
a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
|
|
b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
|
|
c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))
|
|
|
|
result = pd.concat([a, b, c], axis=1)
|
|
|
|
exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
|
|
exp = DataFrame(
|
|
{
|
|
0: [1, 1, np.nan, np.nan],
|
|
1: [np.nan, 2, 2, np.nan],
|
|
2: [np.nan, np.nan, 3, 3],
|
|
},
|
|
columns=[0, 1, 2],
|
|
index=exp_idx,
|
|
)
|
|
tm.assert_frame_equal(result, exp)
|
|
|
|
def test_categorical_concat_preserve(self):
|
|
|
|
# GH 8641 series concat not preserving category dtype
|
|
# GH 13524 can concat different categories
|
|
s = Series(list("abc"), dtype="category")
|
|
s2 = Series(list("abd"), dtype="category")
|
|
|
|
exp = Series(list("abcabd"))
|
|
res = pd.concat([s, s2], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
exp = Series(list("abcabc"), dtype="category")
|
|
res = pd.concat([s, s], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
|
|
res = pd.concat([s, s])
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
a = Series(np.arange(6, dtype="int64"))
|
|
b = Series(list("aabbca"))
|
|
|
|
df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
|
|
res = pd.concat([df2, df2])
|
|
exp = DataFrame(
|
|
{
|
|
"A": pd.concat([a, a]),
|
|
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
|
|
}
|
|
)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_categorical_index_preserver(self):
|
|
|
|
a = Series(np.arange(6, dtype="int64"))
|
|
b = Series(list("aabbca"))
|
|
|
|
df2 = DataFrame(
|
|
{"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
|
|
).set_index("B")
|
|
result = pd.concat([df2, df2])
|
|
expected = DataFrame(
|
|
{
|
|
"A": pd.concat([a, a]),
|
|
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
|
|
}
|
|
).set_index("B")
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# wrong categories -> uses concat_compat, which casts to object
|
|
df3 = DataFrame(
|
|
{"A": a, "B": Categorical(b, categories=list("abe"))}
|
|
).set_index("B")
|
|
result = pd.concat([df2, df3])
|
|
expected = pd.concat(
|
|
[
|
|
df2.set_axis(df2.index.astype(object), 0),
|
|
df3.set_axis(df3.index.astype(object), 0),
|
|
]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_categorical_tz(self):
|
|
# GH-23816
|
|
a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
|
|
b = Series(["a", "b"], dtype="category")
|
|
result = pd.concat([a, b], ignore_index=True)
|
|
expected = Series(
|
|
[
|
|
pd.Timestamp("2017-01-01", tz="US/Pacific"),
|
|
pd.Timestamp("2017-01-02", tz="US/Pacific"),
|
|
"a",
|
|
"b",
|
|
]
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_concat_categorical_unchanged(self):
|
|
# GH-12007
|
|
# test fix for when concat on categorical and float
|
|
# coerces dtype categorical -> float
|
|
df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
|
|
ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
|
|
result = pd.concat([df, ser], axis=1)
|
|
expected = DataFrame(
|
|
{
|
|
"A": Series(["a", "b", "c", np.nan], dtype="category"),
|
|
"B": Series([0, 1, np.nan, 2], dtype="float"),
|
|
}
|
|
)
|
|
tm.assert_equal(result, expected)
|
|
|
|
def test_categorical_concat_gh7864(self):
|
|
# GH 7864
|
|
# make sure ordering is preserved
|
|
df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
|
|
df["grade"] = Categorical(df["raw_grade"])
|
|
df["grade"].cat.set_categories(["e", "a", "b"])
|
|
|
|
df1 = df[0:3]
|
|
df2 = df[3:]
|
|
|
|
tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
|
|
tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)
|
|
|
|
dfx = pd.concat([df1, df2])
|
|
tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)
|
|
|
|
dfa = df1.append(df2)
|
|
tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories)
|