""" Utility functions related to concat. """ from typing import Set, cast import numpy as np from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, is_dtype_equal, is_extension_array_dtype, is_sparse, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import array, ensure_wrapped_if_datetimelike def _get_dtype_kinds(arrays) -> Set[str]: """ Parameters ---------- arrays : list of arrays Returns ------- set[str] A set of kinds that exist in this list of arrays. """ typs: Set[str] = set() for arr in arrays: # Note: we use dtype.kind checks because they are much more performant # than is_foo_dtype dtype = arr.dtype if not isinstance(dtype, np.dtype): # ExtensionDtype so we get # e.g. "categorical", "datetime64[ns, US/Central]", "Sparse[itn64, 0]" typ = str(dtype) elif isinstance(arr, ABCRangeIndex): typ = "range" elif dtype.kind == "M": typ = "datetime" elif dtype.kind == "m": typ = "timedelta" elif dtype.kind in ["O", "b"]: typ = str(dtype) # i.e. "object", "bool" else: typ = dtype.kind typs.add(typ) return typs def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ Helper function for `arr.astype(common_dtype)` but handling all special cases. """ if ( is_categorical_dtype(arr.dtype) and isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.integer) ): # problem case: categorical of int -> gives int as result dtype, # but categorical can contain NAs -> fall back to object dtype try: return arr.astype(dtype, copy=False) except ValueError: return arr.astype(object, copy=False) if is_sparse(arr) and not is_sparse(dtype): # problem case: SparseArray.astype(dtype) doesn't follow the specified # dtype exactly, but converts this to Sparse[dtype] -> first manually # convert to dense array arr = cast(SparseArray, arr) return arr.to_dense().astype(dtype, copy=False) if ( isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"] and dtype is np.dtype("object") ): # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta # this can happen when concat_compat is called directly on arrays (when arrays # are not coming from Index/Series._values), eg in BlockManager.quantile arr = array(arr) if is_extension_array_dtype(dtype): if isinstance(arr, np.ndarray): # numpy's astype cannot handle ExtensionDtypes return array(arr, dtype=dtype, copy=False) return arr.astype(dtype, copy=False) def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a non-datetimelike and provide a combined dtype for the resulting array that preserves the overall dtype if possible) Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation Returns ------- a single array, preserving the combined dtypes """ # filter empty arrays # 1-d dtypes always are included here def is_nonempty(x) -> bool: if x.ndim <= axis: return True return x.shape[axis] > 0 # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. # # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. non_empties = [x for x in to_concat if is_nonempty(x)] if non_empties and axis == 0: to_concat = non_empties typs = _get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: return _concat_datetime(to_concat, axis=axis) elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) typs = _get_dtype_kinds(to_concat) if len(typs) != 1: if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): # let numpy coerce pass else: # coerce to object to_concat = [x.astype("object") for x in to_concat] return np.concatenate(to_concat, axis=axis) def union_categoricals( to_union, sort_categories: bool = False, ignore_order: bool = False ): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. Parameters ---------- to_union : list-like Categorical, CategoricalIndex, or Series with dtype='category'. sort_categories : bool, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order : bool, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. Returns ------- Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link `__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) ['b', 'c', 'a', 'b'] Categories (3, object): ['b', 'c', 'a'] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) ['b', 'c', 'a', 'b'] Categories (3, object): ['a', 'b', 'c'] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) ['a', 'b', 'a', 'b', 'a'] Categories (2, object): ['a' < 'b'] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) Traceback (most recent call last): ... TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) ['a', 'b', 'c', 'c', 'b', 'a'] Categories (3, object): ['a', 'b', 'c'] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) ['b', 'c', 'a', 'b'] Categories (3, object): ['b', 'c', 'a'] """ from pandas import Categorical from pandas.core.arrays.categorical import recode_for_categories if len(to_union) == 0: raise ValueError("No Categoricals to union") def _maybe_unwrap(x): if isinstance(x, (ABCCategoricalIndex, ABCSeries)): return x._values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all( is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:] ): raise TypeError("dtype of categories must be the same") ordered = False if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered all_codes = [first._encode_with_my_categories(x)._codes for x in to_union] new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = cats.unique() if sort_categories: categories = categories.sort_values() new_codes = [ recode_for_categories(c.codes, c.categories, categories) for c in to_union ] new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = "to union ordered Categoricals, all categories must be the same" raise TypeError(msg) else: raise TypeError("Categorical.ordered must be the same") if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) def _concatenate_2d(to_concat, axis: int): # coerce to 2d if needed & concatenate if axis == 1: to_concat = [np.atleast_2d(x) for x in to_concat] return np.concatenate(to_concat, axis=axis) def _concat_datetime(to_concat, axis=0): """ provide concatenation of an datetimelike array of arrays each of which is a single M8[ns], datetime64[ns, tz] or m8[ns] dtype Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation Returns ------- a single array, preserving the combined dtypes """ to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] single_dtype = len({x.dtype for x in to_concat}) == 1 # multiple types, need to coerce to object if not single_dtype: # ensure_wrapped_if_datetimelike ensures that astype(object) wraps # in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: # TODO(EA2D): kludge not necessary with 2D EAs to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) if result.ndim == 2 and is_extension_array_dtype(result.dtype): # TODO(EA2D): kludge not necessary with 2D EAs assert result.shape[0] == 1 result = result[0] return result