3RNN/Lib/site-packages/pandas/core/interchange/utils.py

"""
Utility functions and objects for implementing the interchange API.
"""

from __future__ import annotations

import typing

import numpy as np

from pandas._libs import lib

from pandas.core.dtypes.dtypes import (
    ArrowDtype,
    CategoricalDtype,
    DatetimeTZDtype,
)

import pandas as pd

if typing.TYPE_CHECKING:
    from pandas._typing import DtypeObj


# Maps str(pyarrow.DataType) -> Arrow C data interface format string.
# pyarrow currently offers no API for this lookup, so the table is kept by hand.
# Integer codes are case-sensitive: a signed type uses the lowercase letter and
# the unsigned type of the same width uses the matching uppercase letter.
PYARROW_CTYPES = {
    "null": "n",
    "bool": "b",
    "uint8": "C",
    "uint16": "S",
    "uint32": "I",
    "uint64": "L",
    "int8": "c",
    "int16": "s",  # lowercase: "S" is the unsigned 16-bit code
    "int32": "i",
    "int64": "l",
    "halffloat": "e",  # float16
    "float": "f",  # float32
    "double": "g",  # float64
    "string": "u",
    "large_string": "U",
    "binary": "z",
    "time32[s]": "tts",
    "time32[ms]": "ttm",
    "time64[us]": "ttu",
    "time64[ns]": "ttn",
    "date32[day]": "tdD",
    "date64[ms]": "tdm",
    # Timestamps listed here carry no timezone, hence nothing after the colon.
    "timestamp[s]": "tss:",
    "timestamp[ms]": "tsm:",
    "timestamp[us]": "tsu:",
    "timestamp[ns]": "tsn:",
    "duration[s]": "tDs",
    "duration[ms]": "tDm",
    "duration[us]": "tDu",
    "duration[ns]": "tDn",
}


class ArrowCTypes:
    """
    Format-string constants of the Apache Arrow C data interface.

    The members are plain string class attributes (not ``enum`` members), so
    each one can be used directly wherever a format string is expected.

    Reference:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
    """

    NULL = "n"
    BOOL = "b"

    # Signed integers.
    INT8 = "c"
    INT16 = "s"
    INT32 = "i"
    INT64 = "l"

    # Unsigned integers: same letters as the signed types, in uppercase.
    UINT8 = "C"
    UINT16 = "S"
    UINT32 = "I"
    UINT64 = "L"

    # Floating point.
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"

    # Text, utf-8 encoded.
    STRING = "u"
    LARGE_STRING = "U"

    # Dates.
    DATE32 = "tdD"
    DATE64 = "tdm"

    # Templates to be completed with ``str.format``.
    # ``resolution`` is a single character:
    #   - 's' -> seconds
    #   - 'm' -> milliseconds
    #   - 'u' -> microseconds
    #   - 'n' -> nanoseconds
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"


class Endianness:
    """Byte-order markers for a data type, exposed as plain string constants."""

    NATIVE = "="  # byte order of the running platform
    LITTLE = "<"  # little-endian
    BIG = ">"  # big-endian
    NA = "|"  # byte order not applicable


def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
    """
    Represent pandas `dtype` as a format string in Apache Arrow C notation.

    Parameters
    ----------
    dtype : np.dtype
        Datatype of pandas DataFrame to represent.

    Returns
    -------
    str
        Format string in Apache Arrow C notation of the given `dtype`.
    """
    if isinstance(dtype, CategoricalDtype):
        return ArrowCTypes.INT64
    elif dtype == np.dtype("O"):
        return ArrowCTypes.STRING
    elif isinstance(dtype, ArrowDtype):
        import pyarrow as pa

        pa_type = dtype.pyarrow_dtype
        if pa.types.is_decimal(pa_type):
            return f"d:{pa_type.precision},{pa_type.scale}"
        elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
            return f"ts{pa_type.unit[0]}:{pa_type.tz}"
        format_str = PYARROW_CTYPES.get(str(pa_type), None)
        if format_str is not None:
            return format_str

    format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
    if format_str is not None:
        return format_str

    if lib.is_np_dtype(dtype, "M"):
        # Selecting the first char of resolution string:
        # dtype.str -> '<M8[ns]' -> 'n'
        resolution = np.datetime_data(dtype)[0][0]
        return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")

    elif isinstance(dtype, DatetimeTZDtype):
        return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)

    elif isinstance(dtype, pd.BooleanDtype):
        return ArrowCTypes.BOOL

    raise NotImplementedError(
        f"Conversion of {dtype} to Arrow C format string is not implemented."
    )


def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
    """
    Collapse a multi-chunk pyarrow-backed Series into a single chunk when needed.

    Parameters
    ----------
    series : pd.Series
        Series to inspect.
    allow_copy : bool
        Whether combining the chunks (which copies data) is permitted.

    Returns
    -------
    pd.Series or None
        ``None`` if `series` does not have an ``ArrowDtype`` or its pyarrow
        array has exactly one chunk, so there is nothing to do. Otherwise a
        new Series with the chunks combined and the same dtype, name and
        index as `series`.

    Raises
    ------
    RuntimeError
        If the chunks would have to be combined but `allow_copy` is ``False``.
    """
    if not isinstance(series.dtype, pd.ArrowDtype):
        return None

    pa_chunked = series.array._pa_array  # type: ignore[attr-defined]
    is_single_chunk = len(pa_chunked.chunks) == 1
    if is_single_chunk:
        return None

    if allow_copy:
        combined = pa_chunked.combine_chunks()
        return pd.Series(
            combined, dtype=series.dtype, name=series.name, index=series.index
        )

    raise RuntimeError(
        "Found multi-chunk pyarrow array, but `allow_copy` is False. "
        "Please rechunk the array before calling this function, or set "
        "`allow_copy=True`."
    )
1.0 2024-05-26 19:49:15 +02:00			`"""`
			`Utility functions and objects for implementing the interchange API.`
			`"""`

			`from __future__ import annotations`

			`import typing`

			`import numpy as np`

			`from pandas._libs import lib`

			`from pandas.core.dtypes.dtypes import (`
			`ArrowDtype,`
			`CategoricalDtype,`
			`DatetimeTZDtype,`
			`)`

			`import pandas as pd`

			`if typing.TYPE_CHECKING:`
			`from pandas._typing import DtypeObj`


			`# Maps str(pyarrow.DataType) to C type format string`
			`# Currently, no pyarrow API for this`
			`PYARROW_CTYPES = {`
			`"null": "n",`
			`"bool": "b",`
			`"uint8": "C",`
			`"uint16": "S",`
			`"uint32": "I",`
			`"uint64": "L",`
			`"int8": "c",`
			`"int16": "S",`
			`"int32": "i",`
			`"int64": "l",`
			`"halffloat": "e", # float16`
			`"float": "f", # float32`
			`"double": "g", # float64`
			`"string": "u",`
			`"large_string": "U",`
			`"binary": "z",`
			`"time32[s]": "tts",`
			`"time32[ms]": "ttm",`
			`"time64[us]": "ttu",`
			`"time64[ns]": "ttn",`
			`"date32[day]": "tdD",`
			`"date64[ms]": "tdm",`
			`"timestamp[s]": "tss:",`
			`"timestamp[ms]": "tsm:",`
			`"timestamp[us]": "tsu:",`
			`"timestamp[ns]": "tsn:",`
			`"duration[s]": "tDs",`
			`"duration[ms]": "tDm",`
			`"duration[us]": "tDu",`
			`"duration[ns]": "tDn",`
			`}`


			`class ArrowCTypes:`
			`"""`
			`Enum for Apache Arrow C type format strings.`

			`The Arrow C data interface:`
			`https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings`
			`"""`

			`NULL = "n"`
			`BOOL = "b"`
			`INT8 = "c"`
			`UINT8 = "C"`
			`INT16 = "s"`
			`UINT16 = "S"`
			`INT32 = "i"`
			`UINT32 = "I"`
			`INT64 = "l"`
			`UINT64 = "L"`
			`FLOAT16 = "e"`
			`FLOAT32 = "f"`
			`FLOAT64 = "g"`
			`STRING = "u" # utf-8`
			`LARGE_STRING = "U" # utf-8`
			`DATE32 = "tdD"`
			`DATE64 = "tdm"`
			`# Resolution:`
			`# - seconds -> 's'`
			`# - milliseconds -> 'm'`
			`# - microseconds -> 'u'`
			`# - nanoseconds -> 'n'`
			`TIMESTAMP = "ts{resolution}:{tz}"`
			`TIME = "tt{resolution}"`


			`class Endianness:`
			`"""Enum indicating the byte-order of a data-type."""`

			`LITTLE = "<"`
			`BIG = ">"`
			`NATIVE = "="`
			`NA = "\|"`


			`def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:`
			`"""`
			Represent pandas `dtype` as a format string in Apache Arrow C notation.

			`Parameters`
			`----------`
			`dtype : np.dtype`
			`Datatype of pandas DataFrame to represent.`

			`Returns`
			`-------`
			`str`
			Format string in Apache Arrow C notation of the given `dtype`.
			`"""`
			`if isinstance(dtype, CategoricalDtype):`
			`return ArrowCTypes.INT64`
			`elif dtype == np.dtype("O"):`
			`return ArrowCTypes.STRING`
			`elif isinstance(dtype, ArrowDtype):`
			`import pyarrow as pa`

			`pa_type = dtype.pyarrow_dtype`
			`if pa.types.is_decimal(pa_type):`
			`return f"d:{pa_type.precision},{pa_type.scale}"`
			`elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:`
			`return f"ts{pa_type.unit[0]}:{pa_type.tz}"`
			`format_str = PYARROW_CTYPES.get(str(pa_type), None)`
			`if format_str is not None:`
			`return format_str`

			`format_str = getattr(ArrowCTypes, dtype.name.upper(), None)`
			`if format_str is not None:`
			`return format_str`

			`if lib.is_np_dtype(dtype, "M"):`
			`# Selecting the first char of resolution string:`
			`# dtype.str -> '<M8[ns]' -> 'n'`
			`resolution = np.datetime_data(dtype)[0][0]`
			`return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")`

			`elif isinstance(dtype, DatetimeTZDtype):`
			`return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)`

			`elif isinstance(dtype, pd.BooleanDtype):`
			`return ArrowCTypes.BOOL`

			`raise NotImplementedError(`
			`f"Conversion of {dtype} to Arrow C format string is not implemented."`
			`)`


			`def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series \| None:`
			`"""`
			`Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary.`

			- Returns `None` if the input series is not backed by a multi-chunk pyarrow array
			`(and so doesn't need rechunking)`
			`- Returns a single-chunk-backed-Series if the input is backed by a multi-chunk`
			pyarrow array and `allow_copy` is `True`.
			- Raises a `RuntimeError` if `allow_copy` is `False` and input is
			`backed by a multi-chunk pyarrow array.`
			`"""`
			`if not isinstance(series.dtype, pd.ArrowDtype):`
			`return None`
			`chunked_array = series.array._pa_array # type: ignore[attr-defined]`
			`if len(chunked_array.chunks) == 1:`
			`return None`
			`if not allow_copy:`
			`raise RuntimeError(`
			"Found multi-chunk pyarrow array, but `allow_copy` is False. "
			`"Please rechunk the array before calling this function, or set "`
			"`allow_copy=True`."
			`)`
			`arr = chunked_array.combine_chunks()`
			`return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index)`