from __future__ import annotations

from typing import Any

import numpy as np

from pandas._libs.lib import infer_dtype
from pandas._libs.tslibs import iNaT
from pandas.errors import NoBufferPresent
from pandas.util._decorators import cache_readonly

import pandas as pd
from pandas.api.types import (
    is_categorical_dtype,
    is_string_dtype,
)
from pandas.core.interchange.buffer import PandasBuffer
from pandas.core.interchange.dataframe_protocol import (
    Column,
    ColumnBuffers,
    ColumnNullType,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
    dtype_to_arrow_c_fmt,
)

_NP_KINDS = {
    "i": DtypeKind.INT,
    "u": DtypeKind.UINT,
    "f": DtypeKind.FLOAT,
    "b": DtypeKind.BOOL,
    "U": DtypeKind.STRING,
    "M": DtypeKind.DATETIME,
    "m": DtypeKind.DATETIME,
}
_NULL_DESCRIPTION = {
    DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
    DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
    DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
    # Null values for categoricals are stored as `-1` sentinel values
    # in the category data (e.g., `col.values.codes` is an int8 np.ndarray)
    DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
    # follow Arrow in using 1 as a valid value and 0 for a missing/null value
    DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
}

_NO_VALIDITY_BUFFER = {
    ColumnNullType.NON_NULLABLE: "This column is non-nullable",
    ColumnNullType.USE_NAN: "This column uses NaN as null",
    ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
}


class PandasColumn(Column):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    Note: this Column object can only be produced by ``__dataframe__``, so
    doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
        """
        Note: doesn't deal with extension arrays yet, just assume a regular
        Series/ndarray for now.
        """
        if not isinstance(column, pd.Series):
            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

        # Store the column as a private attribute
        self._col = column
        self._allow_copy = allow_copy

    def size(self) -> int:
        """
        Size of the column, in elements.
        """
        return self._col.size

    @property
    def offset(self) -> int:
        """
        Offset of first element. Always zero.
        """
        # TODO: chunks are implemented now, probably this should return something
        return 0

    @cache_readonly
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        dtype = self._col.dtype

        if is_categorical_dtype(dtype):
            codes = self._col.values.codes
            (
                _,
                bitwidth,
                c_arrow_dtype_f_str,
                _,
            ) = self._dtype_from_pandasdtype(codes.dtype)
            return (
                DtypeKind.CATEGORICAL,
                bitwidth,
                c_arrow_dtype_f_str,
                Endianness.NATIVE,
            )
        elif is_string_dtype(dtype):
            if infer_dtype(self._col) == "string":
                return (
                    DtypeKind.STRING,
                    8,
                    dtype_to_arrow_c_fmt(dtype),
                    Endianness.NATIVE,
                )
            raise NotImplementedError("Non-string object dtypes are not supported yet")
        else:
            return self._dtype_from_pandasdtype(dtype)
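
    # Example (illustrative): for an int64 column, ``dtype`` returns a tuple
    # like ``(DtypeKind.INT, 64, "l", "=")`` - kind, bit width, Arrow C format
    # string, and byte order.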

    def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        # 'b', 'B' (bytes), 'S', 'a' (old-style string), 'V' (void) not handled
        # datetime and timedelta both map to datetime (is timedelta handled?)

        kind = _NP_KINDS.get(dtype.kind, None)
        if kind is None:
            # Not a NumPy dtype. Check if it's a categorical maybe
            raise ValueError(f"Data type {dtype} not supported by interchange protocol")

        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder

    @property
    def describe_categorical(self):
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Content of returned dict:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a dictionary-style mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.
        """
        if self.dtype[0] != DtypeKind.CATEGORICAL:
            raise TypeError(
                "describe_categorical only works on a column with categorical dtype!"
            )

        return {
            "is_ordered": self._col.cat.ordered,
            "is_dictionary": True,
            "categories": PandasColumn(pd.Series(self._col.cat.categories)),
        }
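
    # Example (illustrative): for ``pd.Series(["a", "b", "a"], dtype="category")``
    # this returns ``{"is_ordered": False, "is_dictionary": True,
    # "categories": <PandasColumn of ["a", "b"]>}``.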

    @property
    def describe_null(self):
        kind = self.dtype[0]
        try:
            null, value = _NULL_DESCRIPTION[kind]
        except KeyError:
            raise NotImplementedError(f"Data type {kind} not yet supported")

        return null, value
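
    # Example (illustrative): a float column reports
    # ``(ColumnNullType.USE_NAN, None)``, while a datetime column reports
    # ``(ColumnNullType.USE_SENTINEL, iNaT)``.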

    @cache_readonly
    def null_count(self) -> int:
        """
        Number of null elements. Should always be known.
        """
        return self._col.isna().sum().item()

    @property
    def metadata(self) -> dict[str, pd.Index]:
        """
        Store specific metadata of the column.
        """
        return {"pandas.index": self._col.index}

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(self, n_chunks: int | None = None):
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        if n_chunks and n_chunks > 1:
            size = len(self._col)
            step = size // n_chunks
            if size % n_chunks != 0:
                step += 1
            for start in range(0, step * n_chunks, step):
                yield PandasColumn(
                    self._col.iloc[start : start + step], self._allow_copy
                )
        else:
            yield self
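
    # Example (illustrative): for a column of 10 elements and ``n_chunks=3``,
    # the step is ceil(10 / 3) = 4, so the chunks cover rows 0-3, 4-7, and 8-9.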

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:
            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers
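
    # Example (illustrative): for an int64 column only "data" is populated
    # (validity and offsets raise NoBufferPresent); for a string column all
    # three of "data", "validity", and "offsets" are populated.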

    def _get_data_buffer(
        self,
    ) -> tuple[PandasBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's associated dtype.
        """
        if self.dtype[0] in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
            DtypeKind.DATETIME,
        ):
            buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
            dtype = self.dtype
        elif self.dtype[0] == DtypeKind.CATEGORICAL:
            codes = self._col.values._codes
            buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
            dtype = self._dtype_from_pandasdtype(codes.dtype)
        elif self.dtype[0] == DtypeKind.STRING:
            # Marshal the strings from a NumPy object array into a byte array
            buf = self._col.to_numpy()
            b = bytearray()

            # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
            for obj in buf:
                if isinstance(obj, str):
                    b.extend(obj.encode(encoding="utf-8"))

            # Convert the byte array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))

            # Define the dtype for the returned buffer
            dtype = (
                DtypeKind.STRING,
                8,
                ArrowCTypes.STRING,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

        return buffer, dtype
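
    # Example (illustrative): for ``pd.Series(["ab", None, "c"])`` the data
    # buffer holds the concatenated UTF-8 bytes ``b"abc"``; null entries
    # contribute no bytes.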

    def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data and
        the buffer's associated dtype.

        Raises NoBufferPresent if null representation is not a bit or byte mask.
        """
        null, invalid = self.describe_null

        if self.dtype[0] == DtypeKind.STRING:
            # For now, use byte array as the mask.
            # TODO: maybe store as bit array to save space?..
            buf = self._col.to_numpy()

            # Determine the encoding for valid values
            valid = invalid == 0
            invalid = not valid

            mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
            for i, obj in enumerate(buf):
                mask[i] = valid if isinstance(obj, str) else invalid

            # Convert the mask array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(mask)

            # Define the dtype of the returned buffer
            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)

            return buffer, dtype

        try:
            msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask"
        except KeyError:
            # TODO: implement for other bit/byte masks?
            raise NotImplementedError("See self.describe_null")

        raise NoBufferPresent(msg)
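
    # Example (illustrative): for ``pd.Series(["a", None])`` the validity
    # buffer is the byte mask ``[True, False]`` (1 = valid, 0 = null).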

    def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.

        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        if self.dtype[0] == DtypeKind.STRING:
            # For each string, we need to manually determine the next offset
            values = self._col.to_numpy()
            ptr = 0
            offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
            for i, v in enumerate(values):
                # For missing values (in this case, `np.nan` values)
                # we don't increment the pointer
                if isinstance(v, str):
                    b = v.encode(encoding="utf-8")
                    ptr += len(b)

                offsets[i + 1] = ptr

            # Convert the offsets to a Pandas "buffer" using
            # the NumPy array as the backing store
            buffer = PandasBuffer(offsets)

            # Assemble the buffer dtype info
            dtype = (
                DtypeKind.INT,
                64,
                ArrowCTypes.INT64,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )

        return buffer, dtype
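
    # Example (illustrative): for ``pd.Series(["ab", None, "c"])`` the offsets
    # buffer is ``[0, 2, 2, 3]``; the null contributes a zero-length slice.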