Inzynierka/Lib/site-packages/pandas/io/formats/format.py
2023-06-02 12:51:02 +02:00

2241 lines
70 KiB
Python

"""
Internal module for formatting output data in csv, html, xml,
and latex files. This module also applies to display formatting.
"""
from __future__ import annotations
from contextlib import contextmanager
from csv import (
QUOTE_NONE,
QUOTE_NONNUMERIC,
)
from decimal import Decimal
from functools import partial
from io import StringIO
import math
import re
from shutil import get_terminal_size
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
Final,
Generator,
Hashable,
Iterable,
List,
Mapping,
Sequence,
cast,
)
from unicodedata import east_asian_width
import numpy as np
from pandas._config.config import (
get_option,
set_option,
)
from pandas._libs import lib
from pandas._libs.missing import NA
from pandas._libs.tslibs import (
NaT,
Timedelta,
Timestamp,
get_unit_from_dtype,
iNaT,
periods_per_day,
)
from pandas._libs.tslibs.nattype import NaTType
from pandas._typing import (
ArrayLike,
Axes,
ColspaceArgType,
ColspaceType,
CompressionOptions,
FilePath,
FloatFormatType,
FormattersType,
IndexLabel,
StorageOptions,
WriteBuffer,
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_complex_dtype,
is_datetime64_dtype,
is_extension_array_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_scalar,
is_timedelta64_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core.arrays import (
Categorical,
DatetimeArray,
TimedeltaArray,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexes.api import (
Index,
MultiIndex,
PeriodIndex,
ensure_index,
)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
from pandas.core.reshape.concat import concat
from pandas.io.common import (
check_parent_directory,
stringify_path,
)
from pandas.io.formats import printing
if TYPE_CHECKING:
from pandas import (
DataFrame,
Series,
)
common_docstring: Final = """
Parameters
----------
buf : str, Path or StringIO-like, optional, default None
Buffer to write to. If None, the output is returned as a string.
columns : sequence, optional, default None
The subset of columns to write. Writes all columns by default.
col_space : %(col_space_type)s, optional
%(col_space)s.
header : %(header_type)s, optional
%(header)s.
index : bool, optional, default True
Whether to print index (row) labels.
na_rep : str, optional, default 'NaN'
String representation of ``NaN`` to use.
formatters : list, tuple or dict of one-param. functions, optional
Formatter functions to apply to columns' elements by position or
name.
The result of each function must be a unicode string.
List/tuple must be of length equal to the number of columns.
float_format : one-parameter function, optional, default None
Formatter function to apply to columns' elements if they are
floats. This function must return a unicode string and will be
applied only to the non-``NaN`` elements, with ``NaN`` being
handled by ``na_rep``.
.. versionchanged:: 1.2.0
sparsify : bool, optional, default True
Set to False for a DataFrame with a hierarchical index to print
every multiindex key at each row.
index_names : bool, optional, default True
Prints the names of the indexes.
justify : str, default None
How to justify the column labels. If None uses the option from
the print configuration (controlled by set_option), 'right' out
of the box. Valid values are
* left
* right
* center
* justify
* justify-all
* start
* end
* inherit
* match-parent
* initial
* unset.
max_rows : int, optional
Maximum number of rows to display in the console.
max_cols : int, optional
Maximum number of columns to display in the console.
show_dimensions : bool, default False
Display DataFrame dimensions (number of rows by number of columns).
decimal : str, default '.'
Character recognized as decimal separator, e.g. ',' in Europe.
"""
_VALID_JUSTIFY_PARAMETERS = (
"left",
"right",
"center",
"justify",
"justify-all",
"start",
"end",
"inherit",
"match-parent",
"initial",
"unset",
)
return_docstring: Final = """
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns
None.
"""
class CategoricalFormatter:
def __init__(
self,
categorical: Categorical,
buf: IO[str] | None = None,
length: bool = True,
na_rep: str = "NaN",
footer: bool = True,
) -> None:
self.categorical = categorical
self.buf = buf if buf is not None else StringIO("")
self.na_rep = na_rep
self.length = length
self.footer = footer
self.quoting = QUOTE_NONNUMERIC
def _get_footer(self) -> str:
footer = ""
if self.length:
if footer:
footer += ", "
footer += f"Length: {len(self.categorical)}"
level_info = self.categorical._repr_categories_info()
# Levels are added in a newline
if footer:
footer += "\n"
footer += level_info
return str(footer)
def _get_formatted_values(self) -> list[str]:
return format_array(
self.categorical._internal_get_values(),
None,
float_format=None,
na_rep=self.na_rep,
quoting=self.quoting,
)
def to_string(self) -> str:
categorical = self.categorical
if len(categorical) == 0:
if self.footer:
return self._get_footer()
else:
return ""
fmt_values = self._get_formatted_values()
fmt_values = [i.strip() for i in fmt_values]
values = ", ".join(fmt_values)
result = ["[" + values + "]"]
if self.footer:
footer = self._get_footer()
if footer:
result.append(footer)
return str("\n".join(result))
class SeriesFormatter:
def __init__(
self,
series: Series,
buf: IO[str] | None = None,
length: bool | str = True,
header: bool = True,
index: bool = True,
na_rep: str = "NaN",
name: bool = False,
float_format: str | None = None,
dtype: bool = True,
max_rows: int | None = None,
min_rows: int | None = None,
) -> None:
self.series = series
self.buf = buf if buf is not None else StringIO()
self.name = name
self.na_rep = na_rep
self.header = header
self.length = length
self.index = index
self.max_rows = max_rows
self.min_rows = min_rows
if float_format is None:
float_format = get_option("display.float_format")
self.float_format = float_format
self.dtype = dtype
self.adj = get_adjustment()
self._chk_truncate()
def _chk_truncate(self) -> None:
self.tr_row_num: int | None
min_rows = self.min_rows
max_rows = self.max_rows
# truncation determined by max_rows, actual truncated number of rows
# used below by min_rows
is_truncated_vertically = max_rows and (len(self.series) > max_rows)
series = self.series
if is_truncated_vertically:
max_rows = cast(int, max_rows)
if min_rows:
# if min_rows is set (not None or 0), set max_rows to minimum
# of both
max_rows = min(min_rows, max_rows)
if max_rows == 1:
row_num = max_rows
series = series.iloc[:max_rows]
else:
row_num = max_rows // 2
series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
self.tr_row_num = row_num
else:
self.tr_row_num = None
self.tr_series = series
self.is_truncated_vertically = is_truncated_vertically
def _get_footer(self) -> str:
name = self.series.name
footer = ""
if getattr(self.series.index, "freq", None) is not None:
assert isinstance(
self.series.index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)
)
footer += f"Freq: {self.series.index.freqstr}"
if self.name is not False and name is not None:
if footer:
footer += ", "
series_name = printing.pprint_thing(name, escape_chars=("\t", "\r", "\n"))
footer += f"Name: {series_name}"
if self.length is True or (
self.length == "truncate" and self.is_truncated_vertically
):
if footer:
footer += ", "
footer += f"Length: {len(self.series)}"
if self.dtype is not False and self.dtype is not None:
dtype_name = getattr(self.tr_series.dtype, "name", None)
if dtype_name:
if footer:
footer += ", "
footer += f"dtype: {printing.pprint_thing(dtype_name)}"
# level infos are added to the end and in a new line, like it is done
# for Categoricals
if is_categorical_dtype(self.tr_series.dtype):
level_info = self.tr_series._values._repr_categories_info()
if footer:
footer += "\n"
footer += level_info
return str(footer)
def _get_formatted_index(self) -> tuple[list[str], bool]:
index = self.tr_series.index
if isinstance(index, MultiIndex):
have_header = any(name for name in index.names)
fmt_index = index.format(names=True)
else:
have_header = index.name is not None
fmt_index = index.format(name=True)
return fmt_index, have_header
def _get_formatted_values(self) -> list[str]:
return format_array(
self.tr_series._values,
None,
float_format=self.float_format,
na_rep=self.na_rep,
leading_space=self.index,
)
def to_string(self) -> str:
series = self.tr_series
footer = self._get_footer()
if len(series) == 0:
return f"{type(self.series).__name__}([], {footer})"
fmt_index, have_header = self._get_formatted_index()
fmt_values = self._get_formatted_values()
if self.is_truncated_vertically:
n_header_rows = 0
row_num = self.tr_row_num
row_num = cast(int, row_num)
width = self.adj.len(fmt_values[row_num - 1])
if width > 3:
dot_str = "..."
else:
dot_str = ".."
# Series uses mode=center because it has single value columns
# DataFrame uses mode=left
dot_str = self.adj.justify([dot_str], width, mode="center")[0]
fmt_values.insert(row_num + n_header_rows, dot_str)
fmt_index.insert(row_num + 1, "")
if self.index:
result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
else:
result = self.adj.adjoin(3, fmt_values)
if self.header and have_header:
result = fmt_index[0] + "\n" + result
if footer:
result += "\n" + footer
return str("".join(result))
class TextAdjustment:
def __init__(self) -> None:
self.encoding = get_option("display.encoding")
def len(self, text: str) -> int:
return len(text)
def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
return printing.justify(texts, max_len, mode=mode)
def adjoin(self, space: int, *lists, **kwargs) -> str:
return printing.adjoin(
space, *lists, strlen=self.len, justfunc=self.justify, **kwargs
)
class EastAsianTextAdjustment(TextAdjustment):
def __init__(self) -> None:
super().__init__()
if get_option("display.unicode.ambiguous_as_wide"):
self.ambiguous_width = 2
else:
self.ambiguous_width = 1
# Definition of East Asian Width
# https://unicode.org/reports/tr11/
# Ambiguous width can be changed by option
self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
def len(self, text: str) -> int:
"""
Calculate display width considering unicode East Asian Width
"""
if not isinstance(text, str):
return len(text)
return sum(
self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
)
def justify(
self, texts: Iterable[str], max_len: int, mode: str = "right"
) -> list[str]:
# re-calculate padding space per str considering East Asian Width
def _get_pad(t):
return max_len - self.len(t) + len(t)
if mode == "left":
return [x.ljust(_get_pad(x)) for x in texts]
elif mode == "center":
return [x.center(_get_pad(x)) for x in texts]
else:
return [x.rjust(_get_pad(x)) for x in texts]
def get_adjustment() -> TextAdjustment:
use_east_asian_width = get_option("display.unicode.east_asian_width")
if use_east_asian_width:
return EastAsianTextAdjustment()
else:
return TextAdjustment()
def get_dataframe_repr_params() -> dict[str, Any]:
"""Get the parameters used to repr(dataFrame) calls using DataFrame.to_string.
Supplying these parameters to DataFrame.to_string is equivalent to calling
``repr(DataFrame)``. This is useful if you want to adjust the repr output.
.. versionadded:: 1.4.0
Example
-------
>>> import pandas as pd
>>>
>>> df = pd.DataFrame([[1, 2], [3, 4]])
>>> repr_params = pd.io.formats.format.get_dataframe_repr_params()
>>> repr(df) == df.to_string(**repr_params)
True
"""
from pandas.io.formats import console
if get_option("display.expand_frame_repr"):
line_width, _ = console.get_console_size()
else:
line_width = None
return {
"max_rows": get_option("display.max_rows"),
"min_rows": get_option("display.min_rows"),
"max_cols": get_option("display.max_columns"),
"max_colwidth": get_option("display.max_colwidth"),
"show_dimensions": get_option("display.show_dimensions"),
"line_width": line_width,
}
def get_series_repr_params() -> dict[str, Any]:
"""Get the parameters used to repr(Series) calls using Series.to_string.
Supplying these parameters to Series.to_string is equivalent to calling
``repr(series)``. This is useful if you want to adjust the series repr output.
.. versionadded:: 1.4.0
Example
-------
>>> import pandas as pd
>>>
>>> ser = pd.Series([1, 2, 3, 4])
>>> repr_params = pd.io.formats.format.get_series_repr_params()
>>> repr(ser) == ser.to_string(**repr_params)
True
"""
width, height = get_terminal_size()
max_rows = (
height
if get_option("display.max_rows") == 0
else get_option("display.max_rows")
)
min_rows = (
height
if get_option("display.max_rows") == 0
else get_option("display.min_rows")
)
return {
"name": True,
"dtype": True,
"min_rows": min_rows,
"max_rows": max_rows,
"length": get_option("display.show_dimensions"),
}
class DataFrameFormatter:
"""Class for processing dataframe formatting options and data."""
__doc__ = __doc__ if __doc__ else ""
__doc__ += common_docstring + return_docstring
def __init__(
self,
frame: DataFrame,
columns: Sequence[Hashable] | None = None,
col_space: ColspaceArgType | None = None,
header: bool | Sequence[str] = True,
index: bool = True,
na_rep: str = "NaN",
formatters: FormattersType | None = None,
justify: str | None = None,
float_format: FloatFormatType | None = None,
sparsify: bool | None = None,
index_names: bool = True,
max_rows: int | None = None,
min_rows: int | None = None,
max_cols: int | None = None,
show_dimensions: bool | str = False,
decimal: str = ".",
bold_rows: bool = False,
escape: bool = True,
) -> None:
self.frame = frame
self.columns = self._initialize_columns(columns)
self.col_space = self._initialize_colspace(col_space)
self.header = header
self.index = index
self.na_rep = na_rep
self.formatters = self._initialize_formatters(formatters)
self.justify = self._initialize_justify(justify)
self.float_format = float_format
self.sparsify = self._initialize_sparsify(sparsify)
self.show_index_names = index_names
self.decimal = decimal
self.bold_rows = bold_rows
self.escape = escape
self.max_rows = max_rows
self.min_rows = min_rows
self.max_cols = max_cols
self.show_dimensions = show_dimensions
self.max_cols_fitted = self._calc_max_cols_fitted()
self.max_rows_fitted = self._calc_max_rows_fitted()
self.tr_frame = self.frame
self.truncate()
self.adj = get_adjustment()
def get_strcols(self) -> list[list[str]]:
"""
Render a DataFrame to a list of columns (as lists of strings).
"""
strcols = self._get_strcols_without_index()
if self.index:
str_index = self._get_formatted_index(self.tr_frame)
strcols.insert(0, str_index)
return strcols
@property
def should_show_dimensions(self) -> bool:
return self.show_dimensions is True or (
self.show_dimensions == "truncate" and self.is_truncated
)
@property
def is_truncated(self) -> bool:
return bool(self.is_truncated_horizontally or self.is_truncated_vertically)
@property
def is_truncated_horizontally(self) -> bool:
return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted))
@property
def is_truncated_vertically(self) -> bool:
return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted))
@property
def dimensions_info(self) -> str:
return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]"
@property
def has_index_names(self) -> bool:
return _has_names(self.frame.index)
@property
def has_column_names(self) -> bool:
return _has_names(self.frame.columns)
@property
def show_row_idx_names(self) -> bool:
return all((self.has_index_names, self.index, self.show_index_names))
@property
def show_col_idx_names(self) -> bool:
return all((self.has_column_names, self.show_index_names, self.header))
@property
def max_rows_displayed(self) -> int:
return min(self.max_rows or len(self.frame), len(self.frame))
def _initialize_sparsify(self, sparsify: bool | None) -> bool:
if sparsify is None:
return get_option("display.multi_sparse")
return sparsify
def _initialize_formatters(
self, formatters: FormattersType | None
) -> FormattersType:
if formatters is None:
return {}
elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict):
return formatters
else:
raise ValueError(
f"Formatters length({len(formatters)}) should match "
f"DataFrame number of columns({len(self.frame.columns)})"
)
def _initialize_justify(self, justify: str | None) -> str:
if justify is None:
return get_option("display.colheader_justify")
else:
return justify
def _initialize_columns(self, columns: Sequence[Hashable] | None) -> Index:
if columns is not None:
# GH 47231 - columns doesn't have to be `Sequence[str]`
# Will fix in later PR
cols = ensure_index(cast(Axes, columns))
self.frame = self.frame[cols]
return cols
else:
return self.frame.columns
def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceType:
result: ColspaceType
if col_space is None:
result = {}
elif isinstance(col_space, (int, str)):
result = {"": col_space}
result.update({column: col_space for column in self.frame.columns})
elif isinstance(col_space, Mapping):
for column in col_space.keys():
if column not in self.frame.columns and column != "":
raise ValueError(
f"Col_space is defined for an unknown column: {column}"
)
result = col_space
else:
if len(self.frame.columns) != len(col_space):
raise ValueError(
f"Col_space length({len(col_space)}) should match "
f"DataFrame number of columns({len(self.frame.columns)})"
)
result = dict(zip(self.frame.columns, col_space))
return result
def _calc_max_cols_fitted(self) -> int | None:
"""Number of columns fitting the screen."""
if not self._is_in_terminal():
return self.max_cols
width, _ = get_terminal_size()
if self._is_screen_narrow(width):
return width
else:
return self.max_cols
def _calc_max_rows_fitted(self) -> int | None:
"""Number of rows with data fitting the screen."""
max_rows: int | None
if self._is_in_terminal():
_, height = get_terminal_size()
if self.max_rows == 0:
# rows available to fill with actual data
return height - self._get_number_of_auxillary_rows()
if self._is_screen_short(height):
max_rows = height
else:
max_rows = self.max_rows
else:
max_rows = self.max_rows
return self._adjust_max_rows(max_rows)
def _adjust_max_rows(self, max_rows: int | None) -> int | None:
"""Adjust max_rows using display logic.
See description here:
https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options
GH #37359
"""
if max_rows:
if (len(self.frame) > max_rows) and self.min_rows:
# if truncated, set max_rows showed to min_rows
max_rows = min(self.min_rows, max_rows)
return max_rows
def _is_in_terminal(self) -> bool:
"""Check if the output is to be shown in terminal."""
return bool(self.max_cols == 0 or self.max_rows == 0)
def _is_screen_narrow(self, max_width) -> bool:
return bool(self.max_cols == 0 and len(self.frame.columns) > max_width)
def _is_screen_short(self, max_height) -> bool:
return bool(self.max_rows == 0 and len(self.frame) > max_height)
def _get_number_of_auxillary_rows(self) -> int:
"""Get number of rows occupied by prompt, dots and dimension info."""
dot_row = 1
prompt_row = 1
num_rows = dot_row + prompt_row
if self.show_dimensions:
num_rows += len(self.dimensions_info.splitlines())
if self.header:
num_rows += 1
return num_rows
def truncate(self) -> None:
"""
Check whether the frame should be truncated. If so, slice the frame up.
"""
if self.is_truncated_horizontally:
self._truncate_horizontally()
if self.is_truncated_vertically:
self._truncate_vertically()
def _truncate_horizontally(self) -> None:
"""Remove columns, which are not to be displayed and adjust formatters.
Attributes affected:
- tr_frame
- formatters
- tr_col_num
"""
assert self.max_cols_fitted is not None
col_num = self.max_cols_fitted // 2
if col_num >= 1:
left = self.tr_frame.iloc[:, :col_num]
right = self.tr_frame.iloc[:, -col_num:]
self.tr_frame = concat((left, right), axis=1)
# truncate formatter
if isinstance(self.formatters, (list, tuple)):
self.formatters = [
*self.formatters[:col_num],
*self.formatters[-col_num:],
]
else:
col_num = cast(int, self.max_cols)
self.tr_frame = self.tr_frame.iloc[:, :col_num]
self.tr_col_num = col_num
def _truncate_vertically(self) -> None:
"""Remove rows, which are not to be displayed.
Attributes affected:
- tr_frame
- tr_row_num
"""
assert self.max_rows_fitted is not None
row_num = self.max_rows_fitted // 2
if row_num >= 1:
head = self.tr_frame.iloc[:row_num, :]
tail = self.tr_frame.iloc[-row_num:, :]
self.tr_frame = concat((head, tail))
else:
row_num = cast(int, self.max_rows)
self.tr_frame = self.tr_frame.iloc[:row_num, :]
self.tr_row_num = row_num
def _get_strcols_without_index(self) -> list[list[str]]:
strcols: list[list[str]] = []
if not is_list_like(self.header) and not self.header:
for i, c in enumerate(self.tr_frame):
fmt_values = self.format_col(i)
fmt_values = _make_fixed_width(
strings=fmt_values,
justify=self.justify,
minimum=int(self.col_space.get(c, 0)),
adj=self.adj,
)
strcols.append(fmt_values)
return strcols
if is_list_like(self.header):
# cast here since can't be bool if is_list_like
self.header = cast(List[str], self.header)
if len(self.header) != len(self.columns):
raise ValueError(
f"Writing {len(self.columns)} cols "
f"but got {len(self.header)} aliases"
)
str_columns = [[label] for label in self.header]
else:
str_columns = self._get_formatted_column_labels(self.tr_frame)
if self.show_row_idx_names:
for x in str_columns:
x.append("")
for i, c in enumerate(self.tr_frame):
cheader = str_columns[i]
header_colwidth = max(
int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
)
fmt_values = self.format_col(i)
fmt_values = _make_fixed_width(
fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
)
max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
cheader = self.adj.justify(cheader, max_len, mode=self.justify)
strcols.append(cheader + fmt_values)
return strcols
def format_col(self, i: int) -> list[str]:
frame = self.tr_frame
formatter = self._get_formatter(i)
return format_array(
frame.iloc[:, i]._values,
formatter,
float_format=self.float_format,
na_rep=self.na_rep,
space=self.col_space.get(frame.columns[i]),
decimal=self.decimal,
leading_space=self.index,
)
def _get_formatter(self, i: str | int) -> Callable | None:
if isinstance(self.formatters, (list, tuple)):
if is_integer(i):
i = cast(int, i)
return self.formatters[i]
else:
return None
else:
if is_integer(i) and i not in self.columns:
i = self.columns[i]
return self.formatters.get(i, None)
def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
from pandas.core.indexes.multi import sparsify_labels
columns = frame.columns
if isinstance(columns, MultiIndex):
fmt_columns = columns.format(sparsify=False, adjoin=False)
fmt_columns = list(zip(*fmt_columns))
dtypes = self.frame.dtypes._values
# if we have a Float level, they don't use leading space at all
restrict_formatting = any(level.is_floating for level in columns.levels)
need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
def space_format(x, y):
if (
y not in self.formatters
and need_leadsp[x]
and not restrict_formatting
):
return " " + y
return y
str_columns = list(
zip(*([space_format(x, y) for y in x] for x in fmt_columns))
)
if self.sparsify and len(str_columns):
str_columns = sparsify_labels(str_columns)
str_columns = [list(x) for x in zip(*str_columns)]
else:
fmt_columns = columns.format()
dtypes = self.frame.dtypes
need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
str_columns = [
[" " + x if not self._get_formatter(i) and need_leadsp[x] else x]
for i, x in enumerate(fmt_columns)
]
# self.str_columns = str_columns
return str_columns
def _get_formatted_index(self, frame: DataFrame) -> list[str]:
# Note: this is only used by to_string() and to_latex(), not by
# to_html(). so safe to cast col_space here.
col_space = {k: cast(int, v) for k, v in self.col_space.items()}
index = frame.index
columns = frame.columns
fmt = self._get_formatter("__index__")
if isinstance(index, MultiIndex):
fmt_index = index.format(
sparsify=self.sparsify,
adjoin=False,
names=self.show_row_idx_names,
formatter=fmt,
)
else:
fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
fmt_index = [
tuple(
_make_fixed_width(
list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj
)
)
for x in fmt_index
]
adjoined = self.adj.adjoin(1, *fmt_index).split("\n")
# empty space for columns
if self.show_col_idx_names:
col_header = [str(x) for x in self._get_column_name_list()]
else:
col_header = [""] * columns.nlevels
if self.header:
return col_header + adjoined
else:
return adjoined
def _get_column_name_list(self) -> list[Hashable]:
names: list[Hashable] = []
columns = self.frame.columns
if isinstance(columns, MultiIndex):
names.extend("" if name is None else name for name in columns.names)
else:
names.append("" if columns.name is None else columns.name)
return names
class DataFrameRenderer:
"""Class for creating dataframe output in multiple formats.
Called in pandas.core.generic.NDFrame:
- to_csv
- to_latex
Called in pandas.core.frame.DataFrame:
- to_html
- to_string
Parameters
----------
fmt : DataFrameFormatter
Formatter with the formatting options.
"""
def __init__(self, fmt: DataFrameFormatter) -> None:
self.fmt = fmt
def to_latex(
self,
buf: FilePath | WriteBuffer[str] | None = None,
column_format: str | None = None,
longtable: bool = False,
encoding: str | None = None,
multicolumn: bool = False,
multicolumn_format: str | None = None,
multirow: bool = False,
caption: str | tuple[str, str] | None = None,
label: str | None = None,
position: str | None = None,
) -> str | None:
"""
Render a DataFrame to a LaTeX tabular/longtable environment output.
"""
from pandas.io.formats.latex import LatexFormatter
latex_formatter = LatexFormatter(
self.fmt,
longtable=longtable,
column_format=column_format,
multicolumn=multicolumn,
multicolumn_format=multicolumn_format,
multirow=multirow,
caption=caption,
label=label,
position=position,
)
string = latex_formatter.to_string()
return save_to_buffer(string, buf=buf, encoding=encoding)
def to_html(
self,
buf: FilePath | WriteBuffer[str] | None = None,
encoding: str | None = None,
classes: str | list | tuple | None = None,
notebook: bool = False,
border: int | bool | None = None,
table_id: str | None = None,
render_links: bool = False,
) -> str | None:
"""
Render a DataFrame to a html table.
Parameters
----------
buf : str, path object, file-like object, or None, default None
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a string ``write()`` function. If None, the result is
returned as a string.
encoding : str, default “utf-8”
Set character encoding.
classes : str or list-like
classes to include in the `class` attribute of the opening
``<table>`` tag, in addition to the default "dataframe".
notebook : {True, False}, optional, default False
Whether the generated HTML is for IPython Notebook.
border : int
A ``border=border`` attribute is included in the opening
``<table>`` tag. Default ``pd.options.display.html.border``.
table_id : str, optional
A css id is included in the opening `<table>` tag if specified.
render_links : bool, default False
Convert URLs to HTML links.
"""
from pandas.io.formats.html import (
HTMLFormatter,
NotebookFormatter,
)
Klass = NotebookFormatter if notebook else HTMLFormatter
html_formatter = Klass(
self.fmt,
classes=classes,
border=border,
table_id=table_id,
render_links=render_links,
)
string = html_formatter.to_string()
return save_to_buffer(string, buf=buf, encoding=encoding)
def to_string(
self,
buf: FilePath | WriteBuffer[str] | None = None,
encoding: str | None = None,
line_width: int | None = None,
) -> str | None:
"""
Render a DataFrame to a console-friendly tabular output.
Parameters
----------
buf : str, path object, file-like object, or None, default None
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a string ``write()`` function. If None, the result is
returned as a string.
encoding: str, default “utf-8”
Set character encoding.
line_width : int, optional
Width to wrap a line in characters.
"""
from pandas.io.formats.string import StringFormatter
string_formatter = StringFormatter(self.fmt, line_width=line_width)
string = string_formatter.to_string()
return save_to_buffer(string, buf=buf, encoding=encoding)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
encoding: str | None = None,
sep: str = ",",
columns: Sequence[Hashable] | None = None,
index_label: IndexLabel | None = None,
mode: str = "w",
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool = True,
escapechar: str | None = None,
errors: str = "strict",
storage_options: StorageOptions = None,
) -> str | None:
"""
Render dataframe as comma-separated file.
"""
from pandas.io.formats.csvs import CSVFormatter
if path_or_buf is None:
created_buffer = True
path_or_buf = StringIO()
else:
created_buffer = False
csv_formatter = CSVFormatter(
path_or_buf=path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
cols=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
formatter=self.fmt,
)
csv_formatter.save()
if created_buffer:
assert isinstance(path_or_buf, StringIO)
content = path_or_buf.getvalue()
path_or_buf.close()
return content
return None
def save_to_buffer(
string: str,
buf: FilePath | WriteBuffer[str] | None = None,
encoding: str | None = None,
) -> str | None:
"""
Perform serialization. Write to buf or return as string if buf is None.
"""
with get_buffer(buf, encoding=encoding) as f:
f.write(string)
if buf is None:
# error: "WriteBuffer[str]" has no attribute "getvalue"
return f.getvalue() # type: ignore[attr-defined]
return None
@contextmanager
def get_buffer(
buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None
) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]:
"""
Context manager to open, yield and close buffer for filenames or Path-like
objects, otherwise yield buf unchanged.
"""
if buf is not None:
buf = stringify_path(buf)
else:
buf = StringIO()
if encoding is None:
encoding = "utf-8"
elif not isinstance(buf, str):
raise ValueError("buf is not a file name and encoding is specified.")
if hasattr(buf, "write"):
# Incompatible types in "yield" (actual type "Union[str, WriteBuffer[str],
# StringIO]", expected type "Union[WriteBuffer[str], StringIO]")
yield buf # type: ignore[misc]
elif isinstance(buf, str):
check_parent_directory(str(buf))
with open(buf, "w", encoding=encoding, newline="") as f:
# GH#30034 open instead of codecs.open prevents a file leak
# if we have an invalid encoding argument.
# newline="" is needed to roundtrip correctly on
# windows test_to_latex_filename
yield f
else:
raise TypeError("buf is not a file name and it has no write method")
# ----------------------------------------------------------------------
# Array formatters
def format_array(
values: Any,
formatter: Callable | None,
float_format: FloatFormatType | None = None,
na_rep: str = "NaN",
digits: int | None = None,
space: str | int | None = None,
justify: str = "right",
decimal: str = ".",
leading_space: bool | None = True,
quoting: int | None = None,
fallback_formatter: Callable | None = None,
) -> list[str]:
"""
Format an array for printing.
Parameters
----------
values
formatter
float_format
na_rep
digits
space
justify
decimal
leading_space : bool, optional, default True
Whether the array should be formatted with a leading space.
When an array as a column of a Series or DataFrame, we do want
the leading space to pad between columns.
When formatting an Index subclass
(e.g. IntervalIndex._format_native_types), we don't want the
leading space since it should be left-aligned.
fallback_formatter
Returns
-------
List[str]
"""
fmt_klass: type[GenericArrayFormatter]
if is_datetime64_dtype(values.dtype):
fmt_klass = Datetime64Formatter
elif isinstance(values.dtype, DatetimeTZDtype):
fmt_klass = Datetime64TZFormatter
elif is_timedelta64_dtype(values.dtype):
fmt_klass = Timedelta64Formatter
elif is_extension_array_dtype(values.dtype):
fmt_klass = ExtensionArrayFormatter
elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype):
fmt_klass = FloatArrayFormatter
elif is_integer_dtype(values.dtype):
fmt_klass = IntArrayFormatter
else:
fmt_klass = GenericArrayFormatter
if space is None:
space = 12
if float_format is None:
float_format = get_option("display.float_format")
if digits is None:
digits = get_option("display.precision")
fmt_obj = fmt_klass(
values,
digits=digits,
na_rep=na_rep,
float_format=float_format,
formatter=formatter,
space=space,
justify=justify,
decimal=decimal,
leading_space=leading_space,
quoting=quoting,
fallback_formatter=fallback_formatter,
)
return fmt_obj.get_result()
class GenericArrayFormatter:
def __init__(
self,
values: Any,
digits: int = 7,
formatter: Callable | None = None,
na_rep: str = "NaN",
space: str | int = 12,
float_format: FloatFormatType | None = None,
justify: str = "right",
decimal: str = ".",
quoting: int | None = None,
fixed_width: bool = True,
leading_space: bool | None = True,
fallback_formatter: Callable | None = None,
) -> None:
self.values = values
self.digits = digits
self.na_rep = na_rep
self.space = space
self.formatter = formatter
self.float_format = float_format
self.justify = justify
self.decimal = decimal
self.quoting = quoting
self.fixed_width = fixed_width
self.leading_space = leading_space
self.fallback_formatter = fallback_formatter
def get_result(self) -> list[str]:
fmt_values = self._format_strings()
return _make_fixed_width(fmt_values, self.justify)
def _format_strings(self) -> list[str]:
if self.float_format is None:
float_format = get_option("display.float_format")
if float_format is None:
precision = get_option("display.precision")
float_format = lambda x: _trim_zeros_single_float(
f"{x: .{precision:d}f}"
)
else:
float_format = self.float_format
if self.formatter is not None:
formatter = self.formatter
elif self.fallback_formatter is not None:
formatter = self.fallback_formatter
else:
quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
formatter = partial(
printing.pprint_thing,
escape_chars=("\t", "\r", "\n"),
quote_strings=quote_strings,
)
def _format(x):
if self.na_rep is not None and is_scalar(x) and isna(x):
try:
# try block for np.isnat specifically
# determine na_rep if x is None or NaT-like
if x is None:
return "None"
elif x is NA:
return str(NA)
elif x is NaT or np.isnat(x):
return "NaT"
except (TypeError, ValueError):
# np.isnat only handles datetime or timedelta objects
pass
return self.na_rep
elif isinstance(x, PandasObject):
return str(x)
elif isinstance(x, StringDtype):
return repr(x)
else:
# object dtype
return str(formatter(x))
vals = extract_array(self.values, extract_numpy=True)
if not isinstance(vals, np.ndarray):
raise TypeError(
"ExtensionArray formatting should use ExtensionArrayFormatter"
)
inferred = lib.map_infer(vals, is_float)
is_float_type = (
inferred
# vals may have 2 or more dimensions
& np.all(notna(vals), axis=tuple(range(1, len(vals.shape))))
)
leading_space = self.leading_space
if leading_space is None:
leading_space = is_float_type.any()
fmt_values = []
for i, v in enumerate(vals):
if not is_float_type[i] and leading_space or self.formatter is not None:
fmt_values.append(f" {_format(v)}")
elif is_float_type[i]:
fmt_values.append(float_format(v))
else:
if leading_space is False:
# False specifically, so that the default is
# to include a space if we get here.
tpl = "{v}"
else:
tpl = " {v}"
fmt_values.append(tpl.format(v=_format(v)))
return fmt_values
class FloatArrayFormatter(GenericArrayFormatter):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
# float_format is expected to be a string
# formatter should be used to pass a function
if self.float_format is not None and self.formatter is None:
# GH21625, GH22270
self.fixed_width = False
if callable(self.float_format):
self.formatter = self.float_format
self.float_format = None
def _value_formatter(
self,
float_format: FloatFormatType | None = None,
threshold: float | None = None,
) -> Callable:
"""Returns a function to be applied on each value to format it"""
# the float_format parameter supersedes self.float_format
if float_format is None:
float_format = self.float_format
# we are going to compose different functions, to first convert to
# a string, then replace the decimal symbol, and finally chop according
# to the threshold
# when there is no float_format, we use str instead of '%g'
# because str(0.0) = '0.0' while '%g' % 0.0 = '0'
if float_format:
def base_formatter(v):
assert float_format is not None # for mypy
# error: "str" not callable
# error: Unexpected keyword argument "value" for "__call__" of
# "EngFormatter"
return (
float_format(value=v) # type: ignore[operator,call-arg]
if notna(v)
else self.na_rep
)
else:
def base_formatter(v):
return str(v) if notna(v) else self.na_rep
if self.decimal != ".":
def decimal_formatter(v):
return base_formatter(v).replace(".", self.decimal, 1)
else:
decimal_formatter = base_formatter
if threshold is None:
return decimal_formatter
def formatter(value):
if notna(value):
if abs(value) > threshold:
return decimal_formatter(value)
else:
return decimal_formatter(0.0)
else:
return self.na_rep
return formatter
def get_result_as_array(self) -> np.ndarray:
"""
Returns the float values converted into strings using
the parameters given at initialisation, as a numpy array
"""
def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str):
mask = isna(values)
formatted = np.array(
[
formatter(val) if not m else na_rep
for val, m in zip(values.ravel(), mask.ravel())
]
).reshape(values.shape)
return formatted
if self.formatter is not None:
return format_with_na_rep(self.values, self.formatter, self.na_rep)
if self.fixed_width:
threshold = get_option("display.chop_threshold")
else:
threshold = None
# if we have a fixed_width, we'll need to try different float_format
def format_values_with(float_format):
formatter = self._value_formatter(float_format, threshold)
# default formatter leaves a space to the left when formatting
# floats, must be consistent for left-justifying NaNs (GH #25061)
if self.justify == "left":
na_rep = " " + self.na_rep
else:
na_rep = self.na_rep
# separate the wheat from the chaff
values = self.values
is_complex = is_complex_dtype(values)
values = format_with_na_rep(values, formatter, na_rep)
if self.fixed_width:
if is_complex:
result = _trim_zeros_complex(values, self.decimal)
else:
result = _trim_zeros_float(values, self.decimal)
return np.asarray(result, dtype="object")
return values
# There is a special default string when we are fixed-width
# The default is otherwise to use str instead of a formatting string
float_format: FloatFormatType | None
if self.float_format is None:
if self.fixed_width:
if self.leading_space is True:
fmt_str = "{value: .{digits:d}f}"
else:
fmt_str = "{value:.{digits:d}f}"
float_format = partial(fmt_str.format, digits=self.digits)
else:
float_format = self.float_format
else:
float_format = lambda value: self.float_format % value
formatted_values = format_values_with(float_format)
if not self.fixed_width:
return formatted_values
# we need do convert to engineering format if some values are too small
# and would appear as 0, or if some values are too big and take too
# much space
if len(formatted_values) > 0:
maxlen = max(len(x) for x in formatted_values)
too_long = maxlen > self.digits + 6
else:
too_long = False
with np.errstate(invalid="ignore"):
abs_vals = np.abs(self.values)
# this is pretty arbitrary for now
# large values: more that 8 characters including decimal symbol
# and first digit, hence > 1e6
has_large_values = (abs_vals > 1e6).any()
has_small_values = (
(abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)
).any()
if has_small_values or (too_long and has_large_values):
if self.leading_space is True:
fmt_str = "{value: .{digits:d}e}"
else:
fmt_str = "{value:.{digits:d}e}"
float_format = partial(fmt_str.format, digits=self.digits)
formatted_values = format_values_with(float_format)
return formatted_values
def _format_strings(self) -> list[str]:
return list(self.get_result_as_array())
class IntArrayFormatter(GenericArrayFormatter):
def _format_strings(self) -> list[str]:
if self.leading_space is False:
formatter_str = lambda x: f"{x:d}".format(x=x)
else:
formatter_str = lambda x: f"{x: d}".format(x=x)
formatter = self.formatter or formatter_str
fmt_values = [formatter(x) for x in self.values]
return fmt_values
class Datetime64Formatter(GenericArrayFormatter):
def __init__(
self,
values: np.ndarray | Series | DatetimeIndex | DatetimeArray,
nat_rep: str = "NaT",
date_format: None = None,
**kwargs,
) -> None:
super().__init__(values, **kwargs)
self.nat_rep = nat_rep
self.date_format = date_format
def _format_strings(self) -> list[str]:
"""we by definition have DO NOT have a TZ"""
values = self.values
if not isinstance(values, DatetimeIndex):
values = DatetimeIndex(values)
if self.formatter is not None and callable(self.formatter):
return [self.formatter(x) for x in values]
fmt_values = values._data._format_native_types(
na_rep=self.nat_rep, date_format=self.date_format
)
return fmt_values.tolist()
class ExtensionArrayFormatter(GenericArrayFormatter):
def _format_strings(self) -> list[str]:
values = extract_array(self.values, extract_numpy=True)
formatter = self.formatter
fallback_formatter = None
if formatter is None:
fallback_formatter = values._formatter(boxed=True)
if isinstance(values, Categorical):
# Categorical is special for now, so that we can preserve tzinfo
array = values._internal_get_values()
else:
array = np.asarray(values)
fmt_values = format_array(
array,
formatter,
float_format=self.float_format,
na_rep=self.na_rep,
digits=self.digits,
space=self.space,
justify=self.justify,
decimal=self.decimal,
leading_space=self.leading_space,
quoting=self.quoting,
fallback_formatter=fallback_formatter,
)
return fmt_values
def format_percentiles(
percentiles: (np.ndarray | Sequence[float]),
) -> list[str]:
"""
Outputs rounded and formatted percentiles.
Parameters
----------
percentiles : list-like, containing floats from interval [0,1]
Returns
-------
formatted : list of strings
Notes
-----
Rounding precision is chosen so that: (1) if any two elements of
``percentiles`` differ, they remain different after rounding
(2) no entry is *rounded* to 0% or 100%.
Any non-integer is always rounded to at least 1 decimal place.
Examples
--------
Keeps all entries different after rounding:
>>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
['1.999%', '2.001%', '50%', '66.667%', '99.99%']
No element is rounded to 0% or 100% (unless already equal to it).
Duplicates are allowed:
>>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
"""
percentiles = np.asarray(percentiles)
# It checks for np.NaN as well
with np.errstate(invalid="ignore"):
if (
not is_numeric_dtype(percentiles)
or not np.all(percentiles >= 0)
or not np.all(percentiles <= 1)
):
raise ValueError("percentiles should all be in the interval [0,1]")
percentiles = 100 * percentiles
percentiles_round_type = percentiles.round().astype(int)
int_idx = np.isclose(percentiles_round_type, percentiles)
if np.all(int_idx):
out = percentiles_round_type.astype(str)
return [i + "%" for i in out]
unique_pcts = np.unique(percentiles)
to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
# Least precision that keeps percentiles unique after rounding
prec = -np.floor(
np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)))
).astype(int)
prec = max(1, prec)
out = np.empty_like(percentiles, dtype=object)
out[int_idx] = percentiles[int_idx].round().astype(int).astype(str)
out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
return [i + "%" for i in out]
def is_dates_only(values: np.ndarray | DatetimeArray | Index | DatetimeIndex) -> bool:
# return a boolean if we are only dates (and don't have a timezone)
if not isinstance(values, Index):
values = values.ravel()
if not isinstance(values, (DatetimeArray, DatetimeIndex)):
values = DatetimeIndex(values)
if values.tz is not None:
return False
values_int = values.asi8
consider_values = values_int != iNaT
# error: Argument 1 to "py_get_unit_from_dtype" has incompatible type
# "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
reso = get_unit_from_dtype(values.dtype) # type: ignore[arg-type]
ppd = periods_per_day(reso)
# TODO: can we reuse is_date_array_normalized? would need a skipna kwd
even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0
if even_days:
return True
return False
def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
if x is NaT:
return nat_rep
# Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
# so it already uses string formatting rather than strftime (faster).
return str(x)
def _format_datetime64_dateonly(
x: NaTType | Timestamp,
nat_rep: str = "NaT",
date_format: str | None = None,
) -> str:
if isinstance(x, NaTType):
return nat_rep
if date_format:
return x.strftime(date_format)
else:
# Timestamp._date_repr relies on string formatting (faster than strftime)
return x._date_repr
def get_format_datetime64(
is_dates_only_: bool, nat_rep: str = "NaT", date_format: str | None = None
) -> Callable:
"""Return a formatter callable taking a datetime64 as input and providing
a string as output"""
if is_dates_only_:
return lambda x: _format_datetime64_dateonly(
x, nat_rep=nat_rep, date_format=date_format
)
else:
return lambda x: _format_datetime64(x, nat_rep=nat_rep)
def get_format_datetime64_from_values(
values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None
) -> str | None:
"""given values and a date_format, return a string format"""
if isinstance(values, np.ndarray) and values.ndim > 1:
# We don't actually care about the order of values, and DatetimeIndex
# only accepts 1D values
values = values.ravel()
ido = is_dates_only(values)
if ido:
# Only dates and no timezone: provide a default format
return date_format or "%Y-%m-%d"
return date_format
class Datetime64TZFormatter(Datetime64Formatter):
def _format_strings(self) -> list[str]:
"""we by definition have a TZ"""
values = self.values.astype(object)
ido = is_dates_only(values)
formatter = self.formatter or get_format_datetime64(
ido, date_format=self.date_format
)
fmt_values = [formatter(x) for x in values]
return fmt_values
class Timedelta64Formatter(GenericArrayFormatter):
def __init__(
self,
values: np.ndarray | TimedeltaIndex,
nat_rep: str = "NaT",
box: bool = False,
**kwargs,
) -> None:
super().__init__(values, **kwargs)
self.nat_rep = nat_rep
self.box = box
def _format_strings(self) -> list[str]:
formatter = self.formatter or get_format_timedelta64(
self.values, nat_rep=self.nat_rep, box=self.box
)
return [formatter(x) for x in self.values]
def get_format_timedelta64(
values: np.ndarray | TimedeltaIndex | TimedeltaArray,
nat_rep: str | float = "NaT",
box: bool = False,
) -> Callable:
"""
Return a formatter function for a range of timedeltas.
These will all have the same format argument
If box, then show the return in quotes
"""
values_int = values.view(np.int64)
consider_values = values_int != iNaT
one_day_nanos = 86400 * 10**9
# error: Unsupported operand types for % ("ExtensionArray" and "int")
not_midnight = values_int % one_day_nanos != 0 # type: ignore[operator]
# error: Argument 1 to "__call__" of "ufunc" has incompatible type
# "Union[Any, ExtensionArray, ndarray]"; expected
# "Union[Union[int, float, complex, str, bytes, generic],
# Sequence[Union[int, float, complex, str, bytes, generic]],
# Sequence[Sequence[Any]], _SupportsArray]"
both = np.logical_and(consider_values, not_midnight) # type: ignore[arg-type]
even_days = both.sum() == 0
if even_days:
format = None
else:
format = "long"
def _formatter(x):
if x is None or (is_scalar(x) and isna(x)):
return nat_rep
if not isinstance(x, Timedelta):
x = Timedelta(x)
# Timedelta._repr_base uses string formatting (faster than strftime)
result = x._repr_base(format=format)
if box:
result = f"'{result}'"
return result
return _formatter
def _make_fixed_width(
strings: list[str],
justify: str = "right",
minimum: int | None = None,
adj: TextAdjustment | None = None,
) -> list[str]:
if len(strings) == 0 or justify == "all":
return strings
if adj is None:
adjustment = get_adjustment()
else:
adjustment = adj
max_len = max(adjustment.len(x) for x in strings)
if minimum is not None:
max_len = max(minimum, max_len)
conf_max = get_option("display.max_colwidth")
if conf_max is not None and max_len > conf_max:
max_len = conf_max
def just(x: str) -> str:
if conf_max is not None:
if (conf_max > 3) & (adjustment.len(x) > max_len):
x = x[: max_len - 3] + "..."
return x
strings = [just(x) for x in strings]
result = adjustment.justify(strings, max_len, mode=justify)
return result
def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> list[str]:
"""
Separates the real and imaginary parts from the complex number, and
executes the _trim_zeros_float method on each of those.
"""
trimmed = [
"".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal))
for x in str_complexes
]
# pad strings to the length of the longest trimmed string for alignment
lengths = [len(s) for s in trimmed]
max_length = max(lengths)
padded = [
s[: -((k - 1) // 2 + 1)] # real part
+ (max_length - k) // 2 * "0"
+ s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / -
+ s[-((k - 1) // 2) : -1] # imaginary part
+ (max_length - k) // 2 * "0"
+ s[-1]
for s, k in zip(trimmed, lengths)
]
return padded
def _trim_zeros_single_float(str_float: str) -> str:
"""
Trims trailing zeros after a decimal point,
leaving just one if necessary.
"""
str_float = str_float.rstrip("0")
if str_float.endswith("."):
str_float += "0"
return str_float
def _trim_zeros_float(
str_floats: np.ndarray | list[str], decimal: str = "."
) -> list[str]:
"""
Trims the maximum number of trailing zeros equally from
all numbers containing decimals, leaving just one if
necessary.
"""
trimmed = str_floats
number_regex = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$")
def is_number_with_decimal(x) -> bool:
return re.match(number_regex, x) is not None
def should_trim(values: np.ndarray | list[str]) -> bool:
"""
Determine if an array of strings should be trimmed.
Returns True if all numbers containing decimals (defined by the
above regular expression) within the array end in a zero, otherwise
returns False.
"""
numbers = [x for x in values if is_number_with_decimal(x)]
return len(numbers) > 0 and all(x.endswith("0") for x in numbers)
while should_trim(trimmed):
trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed]
# leave one 0 after the decimal points if need be.
result = [
x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x
for x in trimmed
]
return result
def _has_names(index: Index) -> bool:
if isinstance(index, MultiIndex):
return com.any_not_none(*index.names)
else:
return index.name is not None
class EngFormatter:
"""
Formats float values according to engineering format.
Based on matplotlib.ticker.EngFormatter
"""
# The SI engineering prefixes
ENG_PREFIXES = {
-24: "y",
-21: "z",
-18: "a",
-15: "f",
-12: "p",
-9: "n",
-6: "u",
-3: "m",
0: "",
3: "k",
6: "M",
9: "G",
12: "T",
15: "P",
18: "E",
21: "Z",
24: "Y",
}
def __init__(
self, accuracy: int | None = None, use_eng_prefix: bool = False
) -> None:
self.accuracy = accuracy
self.use_eng_prefix = use_eng_prefix
def __call__(self, num: float) -> str:
"""
Formats a number in engineering notation, appending a letter
representing the power of 1000 of the original number. Some examples:
>>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True)
>>> format_eng(0)
' 0'
>>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True)
>>> format_eng(1_000_000)
' 1.0M'
>>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False)
>>> format_eng("-1e-6")
'-1.00E-06'
@param num: the value to represent
@type num: either a numeric value or a string that can be converted to
a numeric value (as per decimal.Decimal constructor)
@return: engineering formatted string
"""
dnum = Decimal(str(num))
if Decimal.is_nan(dnum):
return "NaN"
if Decimal.is_infinite(dnum):
return "inf"
sign = 1
if dnum < 0: # pragma: no cover
sign = -1
dnum = -dnum
if dnum != 0:
pow10 = Decimal(int(math.floor(dnum.log10() / 3) * 3))
else:
pow10 = Decimal(0)
pow10 = pow10.min(max(self.ENG_PREFIXES.keys()))
pow10 = pow10.max(min(self.ENG_PREFIXES.keys()))
int_pow10 = int(pow10)
if self.use_eng_prefix:
prefix = self.ENG_PREFIXES[int_pow10]
else:
if int_pow10 < 0:
prefix = f"E-{-int_pow10:02d}"
else:
prefix = f"E+{int_pow10:02d}"
mant = sign * dnum / (10**pow10)
if self.accuracy is None: # pragma: no cover
format_str = "{mant: g}{prefix}"
else:
format_str = f"{{mant: .{self.accuracy:d}f}}{{prefix}}"
formatted = format_str.format(mant=mant, prefix=prefix)
return formatted
def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None:
"""
Format float representation in DataFrame with SI notation.
Parameters
----------
accuracy : int, default 3
Number of decimal digits after the floating point.
use_eng_prefix : bool, default False
Whether to represent a value with SI prefixes.
Returns
-------
None
Examples
--------
>>> df = pd.DataFrame([1e-9, 1e-3, 1, 1e3, 1e6])
>>> df
0
0 1.000000e-09
1 1.000000e-03
2 1.000000e+00
3 1.000000e+03
4 1.000000e+06
>>> pd.set_eng_float_format(accuracy=1)
>>> df
0
0 1.0E-09
1 1.0E-03
2 1.0E+00
3 1.0E+03
4 1.0E+06
>>> pd.set_eng_float_format(use_eng_prefix=True)
>>> df
0
0 1.000n
1 1.000m
2 1.000
3 1.000k
4 1.000M
>>> pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
>>> df
0
0 1.0n
1 1.0m
2 1.0
3 1.0k
4 1.0M
>>> pd.set_option("display.float_format", None) # unset option
"""
set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix))
def get_level_lengths(
levels: Any, sentinel: bool | object | str = ""
) -> list[dict[int, int]]:
"""
For each index in each level the function returns lengths of indexes.
Parameters
----------
levels : list of lists
List of values on for level.
sentinel : string, optional
Value which states that no new index starts on there.
Returns
-------
Returns list of maps. For each level returns map of indexes (key is index
in row and value is length of index).
"""
if len(levels) == 0:
return []
control = [True] * len(levels[0])
result = []
for level in levels:
last_index = 0
lengths = {}
for i, key in enumerate(level):
if control[i] and key == sentinel:
pass
else:
control[i] = False
lengths[last_index] = i - last_index
last_index = i
lengths[last_index] = len(level) - last_index
result.append(lengths)
return result
def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None:
"""
Appends lines to a buffer.
Parameters
----------
buf
The buffer to write to
lines
The lines to append.
"""
if any(isinstance(x, str) for x in lines):
lines = [str(x) for x in lines]
buf.write("\n".join(lines))