"""
Module for formatting output data in Latex.
"""
from abc import ABC, abstractmethod
from typing import Iterator, List, Optional, Sequence, Tuple, Type, Union

import numpy as np

from pandas.core.dtypes.generic import ABCMultiIndex

from pandas.io.formats.format import DataFrameFormatter


def _split_into_full_short_caption(
    caption: Optional[Union[str, Tuple[str, str]]]
) -> Tuple[str, str]:
    """Extract full and short captions from caption string/tuple.

    Parameters
    ----------
    caption : str or tuple, optional
        Either table caption string or tuple (full_caption, short_caption).
        If string is provided, then it is treated as table full caption,
        while short_caption is considered an empty string.

    Returns
    -------
    full_caption, short_caption : tuple
        Tuple of full_caption, short_caption strings.

    Raises
    ------
    ValueError
        If ``caption`` is truthy but is neither a string nor something that
        unpacks into exactly two items.
    """
    if not caption:
        return "", ""

    if isinstance(caption, str):
        return caption, ""

    try:
        full_caption, short_caption = caption
    except (TypeError, ValueError) as err:
        # ValueError: iterable of the wrong length.
        # TypeError: not iterable at all (e.g. an int); report it with the
        # same message instead of leaking an unpacking error to the user.
        msg = "caption must be either a string or a tuple of two strings"
        raise ValueError(msg) from err
    return full_caption, short_caption


class RowStringConverter(ABC):
    r"""Converter for dataframe rows into LaTeX strings.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
        Instance of `DataFrameFormatter`.
    multicolumn: bool, optional
        Whether to use \multicolumn macro.
    multicolumn_format: str, optional
        Multicolumn format.
    multirow: bool, optional
        Whether to use \multirow macro.
    """

    def __init__(
        self,
        formatter: DataFrameFormatter,
        multicolumn: bool = False,
        multicolumn_format: Optional[str] = None,
        multirow: bool = False,
    ):
        self.fmt = formatter
        self.frame = self.fmt.frame
        self.multicolumn = multicolumn
        self.multicolumn_format = multicolumn_format
        self.multirow = multirow
        # Pending \cline commands as [row number to emit after, first column].
        self.clinebuf: List[List[int]] = []
        self.strcols = self._get_strcols()
        # Transpose the column-wise strings into row-wise tuples.
        self.strrows = list(zip(*self.strcols))

    def get_strrow(self, row_num: int) -> str:
        """Get string representation of the row."""
        row = self.strrows[row_num]

        is_multicol = (
            row_num < self.column_levels and self.fmt.header and self.multicolumn
        )

        is_multirow = (
            row_num >= self.header_levels
            and self.fmt.index
            and self.multirow
            and self.index_levels > 1
        )

        # No \cline is ever written after the very last row.
        is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1

        crow = self._preprocess_row(row)

        if is_multicol:
            crow = self._format_multicolumn(crow)
        if is_multirow:
            crow = self._format_multirow(crow, row_num)

        lst = []
        lst.append(" & ".join(crow))
        lst.append(" \\\\")
        if is_cline_maybe_required:
            cline = self._compose_cline(row_num, len(self.strcols))
            lst.append(cline)
        return "".join(lst)

    @property
    def _header_row_num(self) -> int:
        """Number of rows in header."""
        return self.header_levels if self.fmt.header else 0

    @property
    def index_levels(self) -> int:
        """Integer number of levels in index."""
        return self.frame.index.nlevels

    @property
    def column_levels(self) -> int:
        """Integer number of levels in columns."""
        return self.frame.columns.nlevels

    @property
    def header_levels(self) -> int:
        """Number of column levels, plus one if index names are shown."""
        nlevels = self.column_levels
        if self.fmt.has_index_names and self.fmt.show_index_names:
            nlevels += 1
        return nlevels

    def _get_strcols(self) -> List[List[str]]:
        """String representation of the columns."""
        if self.fmt.frame.empty:
            strcols = [[self._empty_info_line]]
        else:
            strcols = self.fmt.get_strcols()

        # reestablish the MultiIndex that has been joined by get_strcols()
        if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
            out = self.frame.index.format(
                adjoin=False,
                sparsify=self.fmt.sparsify,
                names=self.fmt.has_index_names,
                na_rep=self.fmt.na_rep,
            )

            # index.format will sparsify repeated entries with empty strings
            # so pad these with some empty space
            def pad_empties(x):
                # Width is taken from the last non-empty entry of the level.
                pad = next((item for item in reversed(x) if item), "")
                return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]

            gen = (pad_empties(i) for i in out)

            # Add empty spaces for each column level
            clevels = self.frame.columns.nlevels
            out = [[" " * len(i[-1])] * clevels + i for i in gen]

            # Add the column names to the last index column
            cnames = self.frame.columns.names
            if any(cnames):
                new_names = [i if i else "{}" for i in cnames]
                out[self.frame.index.nlevels - 1][:clevels] = new_names

            # Get rid of old multiindex column and add new ones
            strcols = out + strcols[1:]
        return strcols

    @property
    def _empty_info_line(self):
        """Placeholder text used as the only cell of an empty frame."""
        return (
            f"Empty {type(self.frame).__name__}\n"
            f"Columns: {self.frame.columns}\n"
            f"Index: {self.frame.index}"
        )

    def _preprocess_row(self, row: Sequence[str]) -> List[str]:
        """Preprocess elements of the row."""
        if self.fmt.escape:
            crow = _escape_symbols(row)
        else:
            # Empty cells are written as "{}" even when escaping is disabled.
            crow = [x if x else "{}" for x in row]
        if self.fmt.bold_rows and self.fmt.index:
            crow = _convert_to_bold(crow, self.index_levels)
        return crow

    def _format_multicolumn(self, row: List[str]) -> List[str]:
        r"""
        Combine columns belonging to a group to a single multicolumn entry
        according to self.multicolumn_format

        e.g.:
        a &  &  & b & c &
        will become
        \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
        """
        # Index cells are copied through untouched.
        row2 = row[: self.index_levels]
        ncol = 1
        coltext = ""

        def append_col():
            # write multicolumn if needed
            if ncol > 1:
                row2.append(
                    f"\\multicolumn{{{ncol:d}}}{{{self.multicolumn_format}}}"
                    f"{{{coltext.strip()}}}"
                )
            # don't modify where not needed
            else:
                row2.append(coltext)

        for c in row[self.index_levels :]:
            # if next col has text, write the previous
            if c.strip():
                if coltext:
                    append_col()
                coltext = c
                ncol = 1
            # if not, add it to the previous multicolumn
            else:
                ncol += 1
        # write last column name
        if coltext:
            append_col()
        return row2

    def _format_multirow(self, row: List[str], i: int) -> List[str]:
        r"""
        Check following rows, whether row should be a multirow

        e.g.:     becomes:
        a & 0 &   \multirow{2}{*}{a} & 0 &
          & 1 &     & 1 &
        b & 0 &   \cline{1-2}
                  b & 0 &
        """
        for j in range(self.index_levels):
            if row[j].strip():
                # Count the blank cells that directly follow in this column.
                nrow = 1
                for r in self.strrows[i + 1 :]:
                    if not r[j].strip():
                        nrow += 1
                    else:
                        break
                if nrow > 1:
                    # overwrite non-multirow entry
                    row[j] = f"\\multirow{{{nrow:d}}}{{*}}{{{row[j].strip()}}}"
                    # save when to end the current block with \cline
                    self.clinebuf.append([i + nrow - 1, j + 1])
        return row

    def _compose_cline(self, i: int, icol: int) -> str:
        """
        Create clines after multirow-blocks are finished.
        """
        lst = []
        for cl in self.clinebuf:
            if cl[0] == i:
                lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}")
        # remove entries that have been written to buffer
        self.clinebuf = [x for x in self.clinebuf if x[0] != i]
        return "".join(lst)


class RowStringIterator(RowStringConverter):
    """Iterator over rows of the header or the body of the table."""

    @abstractmethod
    def __iter__(self) -> Iterator[str]:
        """Iterate over LaTeX string representations of rows."""


class RowHeaderIterator(RowStringIterator):
    """Iterator for the table header rows."""

    def __iter__(self) -> Iterator[str]:
        # Header rows come first; never run past the rows that exist.
        n_header = min(self._header_row_num, len(self.strrows))
        for row_num in range(n_header):
            yield self.get_strrow(row_num)


class RowBodyIterator(RowStringIterator):
    """Iterator for the table body rows."""

    def __iter__(self) -> Iterator[str]:
        # Body rows are everything after the header rows.
        for row_num in range(self._header_row_num, len(self.strrows)):
            yield self.get_strrow(row_num)


class TableBuilderAbstract(ABC):
    """
    Abstract table builder producing string representation of LaTeX table.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
        Instance of `DataFrameFormatter`.
    column_format: str, optional
        Column format, for example, 'rcl' for three columns.
    multicolumn: bool, optional
        Use multicolumn to enhance MultiIndex columns.
    multicolumn_format: str, optional
        The alignment for multicolumns, similar to column_format.
    multirow: bool, optional
        Use multirow to enhance MultiIndex rows.
    caption: str, optional
        Table caption.
    short_caption: str, optional
        Table short caption.
    label: str, optional
        LaTeX label.
    position: str, optional
        Float placement specifier, for example, 'htb'.
    """

    def __init__(
        self,
        formatter: DataFrameFormatter,
        column_format: Optional[str] = None,
        multicolumn: bool = False,
        multicolumn_format: Optional[str] = None,
        multirow: bool = False,
        caption: Optional[str] = None,
        short_caption: Optional[str] = None,
        label: Optional[str] = None,
        position: Optional[str] = None,
    ):
        self.fmt = formatter
        self.column_format = column_format
        self.multicolumn = multicolumn
        self.multicolumn_format = multicolumn_format
        self.multirow = multirow
        self.caption = caption
        self.short_caption = short_caption
        self.label = label
        self.position = position

    def get_result(self) -> str:
        """String representation of LaTeX table."""
        elements = [
            self.env_begin,
            self.top_separator,
            self.header,
            self.middle_separator,
            self.env_body,
            self.bottom_separator,
            self.env_end,
        ]
        # Empty sections are dropped so they do not produce blank lines.
        result = "\n".join([item for item in elements if item])
        trailing_newline = "\n"
        result += trailing_newline
        return result

    @property
    @abstractmethod
    def env_begin(self) -> str:
        """Beginning of the environment."""

    @property
    @abstractmethod
    def top_separator(self) -> str:
        """Top level separator."""

    @property
    @abstractmethod
    def header(self) -> str:
        """Header lines."""

    @property
    @abstractmethod
    def middle_separator(self) -> str:
        """Middle level separator."""

    @property
    @abstractmethod
    def env_body(self) -> str:
        """Environment body."""

    @property
    @abstractmethod
    def bottom_separator(self) -> str:
        """Bottom level separator."""

    @property
    @abstractmethod
    def env_end(self) -> str:
        """End of the environment."""


class GenericTableBuilder(TableBuilderAbstract):
    """Table builder producing string representation of LaTeX table."""

    @property
    def header(self) -> str:
        iterator = self._create_row_iterator(over="header")
        return "\n".join(list(iterator))

    @property
    def top_separator(self) -> str:
        return "\\toprule"

    @property
    def middle_separator(self) -> str:
        return "\\midrule" if self._is_separator_required() else ""

    @property
    def env_body(self) -> str:
        iterator = self._create_row_iterator(over="body")
        return "\n".join(list(iterator))

    def _is_separator_required(self) -> bool:
        """Whether both the header and the body are non-empty."""
        return bool(self.header and self.env_body)

    @property
    def _position_macro(self) -> str:
        r"""Position macro, extracted from self.position, like [h]."""
        return f"[{self.position}]" if self.position else ""

    @property
    def _caption_macro(self) -> str:
        r"""Caption macro, extracted from self.caption.

        With short caption:
            \caption[short_caption]{caption_string}.

        Without short caption:
            \caption{caption_string}.
        """
        if self.caption:
            return "".join(
                [
                    r"\caption",
                    f"[{self.short_caption}]" if self.short_caption else "",
                    f"{{{self.caption}}}",
                ]
            )
        return ""

    @property
    def _label_macro(self) -> str:
        r"""Label macro, extracted from self.label, like \label{ref}."""
        return f"\\label{{{self.label}}}" if self.label else ""

    def _create_row_iterator(self, over: str) -> RowStringIterator:
        """Create iterator over header or body of the table.

        Parameters
        ----------
        over : {'body', 'header'}
            Over what to iterate.

        Returns
        -------
        RowStringIterator
            Iterator over body or header.
        """
        iterator_kind = self._select_iterator(over)
        return iterator_kind(
            formatter=self.fmt,
            multicolumn=self.multicolumn,
            multicolumn_format=self.multicolumn_format,
            multirow=self.multirow,
        )

    def _select_iterator(self, over: str) -> Type[RowStringIterator]:
        """Select proper iterator over table rows."""
        if over == "header":
            return RowHeaderIterator
        elif over == "body":
            return RowBodyIterator
        else:
            msg = f"'over' must be either 'header' or 'body', but {over} was provided"
            raise ValueError(msg)


class LongTableBuilder(GenericTableBuilder):
    """Concrete table builder for longtable.

    The expected output below is compared with whitespace normalized, so
    exact column padding and blank lines (one follows ``\\endfoot``) are
    not significant.

    >>> from pandas import DataFrame
    >>> from pandas.io.formats import format as fmt
    >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
    >>> formatter = fmt.DataFrameFormatter(df)
    >>> builder = LongTableBuilder(formatter, caption='a long table',
    ...                            label='tab:long', column_format='lrl')
    >>> table = builder.get_result()
    >>> print(table)  # doctest: +NORMALIZE_WHITESPACE
    \\begin{longtable}{lrl}
    \\caption{a long table}
    \\label{tab:long}\\\\
    \\toprule
    {} & a & b \\\\
    \\midrule
    \\endfirsthead
    \\caption[]{a long table} \\\\
    \\toprule
    {} & a & b \\\\
    \\midrule
    \\endhead
    \\midrule
    \\multicolumn{3}{r}{{Continued on next page}} \\\\
    \\midrule
    \\endfoot
    \\bottomrule
    \\endlastfoot
    0 & 1 & b1 \\\\
    1 & 2 & b2 \\\\
    \\end{longtable}
    """

    @property
    def env_begin(self) -> str:
        first_row = (
            f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}"
        )
        elements = [first_row, f"{self._caption_and_label()}"]
        return "\n".join([item for item in elements if item])

    def _caption_and_label(self) -> str:
        """Caption and label macros, terminated by a double backslash."""
        if self.caption or self.label:
            double_backslash = "\\\\"
            elements = [f"{self._caption_macro}", f"{self._label_macro}"]
            caption_and_label = "\n".join([item for item in elements if item])
            caption_and_label += double_backslash
            return caption_and_label
        else:
            return ""

    @property
    def middle_separator(self) -> str:
        # Nothing to separate: skip building the repeated header entirely.
        if not self._is_separator_required():
            return ""

        iterator = self._create_row_iterator(over="header")

        # the content between \endfirsthead and \endhead commands
        # mitigates repeated List of Tables entries in the final LaTeX
        # document when dealing with longtable environments; GH #34360
        elements = [
            "\\midrule",
            "\\endfirsthead",
            f"\\caption[]{{{self.caption}}} \\\\" if self.caption else "",
            self.top_separator,
            self.header,
            "\\midrule",
            "\\endhead",
            "\\midrule",
            f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}"
            "{{Continued on next page}} \\\\",
            "\\midrule",
            "\\endfoot\n",
            "\\bottomrule",
            "\\endlastfoot",
        ]
        return "\n".join(elements)

    @property
    def bottom_separator(self) -> str:
        return ""

    @property
    def env_end(self) -> str:
        return "\\end{longtable}"


class RegularTableBuilder(GenericTableBuilder):
    """Concrete table builder for regular table.

    The expected output below is compared with whitespace normalized, so
    exact column padding is not significant.

    >>> from pandas import DataFrame
    >>> from pandas.io.formats import format as fmt
    >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
    >>> formatter = fmt.DataFrameFormatter(df)
    >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab',
    ...                               column_format='lrc')
    >>> table = builder.get_result()
    >>> print(table)  # doctest: +NORMALIZE_WHITESPACE
    \\begin{table}
    \\centering
    \\caption{caption}
    \\label{lab}
    \\begin{tabular}{lrc}
    \\toprule
    {} & a & b \\\\
    \\midrule
    0 & 1 & b1 \\\\
    1 & 2 & b2 \\\\
    \\bottomrule
    \\end{tabular}
    \\end{table}
    """

    @property
    def env_begin(self) -> str:
        elements = [
            f"\\begin{{table}}{self._position_macro}",
            "\\centering",
            f"{self._caption_macro}",
            f"{self._label_macro}",
            f"\\begin{{tabular}}{{{self.column_format}}}",
        ]
        return "\n".join([item for item in elements if item])

    @property
    def bottom_separator(self) -> str:
        return "\\bottomrule"

    @property
    def env_end(self) -> str:
        return "\n".join(["\\end{tabular}", "\\end{table}"])


class TabularBuilder(GenericTableBuilder):
    """Concrete table builder for tabular environment.

    The expected output below is compared with whitespace normalized, so
    exact column padding is not significant.

    >>> from pandas import DataFrame
    >>> from pandas.io.formats import format as fmt
    >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
    >>> formatter = fmt.DataFrameFormatter(df)
    >>> builder = TabularBuilder(formatter, column_format='lrc')
    >>> table = builder.get_result()
    >>> print(table)  # doctest: +NORMALIZE_WHITESPACE
    \\begin{tabular}{lrc}
    \\toprule
    {} & a & b \\\\
    \\midrule
    0 & 1 & b1 \\\\
    1 & 2 & b2 \\\\
    \\bottomrule
    \\end{tabular}
    """

    @property
    def env_begin(self) -> str:
        return f"\\begin{{tabular}}{{{self.column_format}}}"

    @property
    def bottom_separator(self) -> str:
        return "\\bottomrule"

    @property
    def env_end(self) -> str:
        return "\\end{tabular}"


class LatexFormatter:
    r"""
    Used to render a DataFrame to a LaTeX tabular/longtable environment output.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
    longtable : bool, default False
        Use longtable environment.
    column_format : str, default None
        The columns format as specified in the LaTeX table format
        documentation (https://en.wikibooks.org/wiki/LaTeX/Tables),
        e.g. 'rcl' for 3 columns
    multicolumn : bool, default False
        Use \multicolumn to enhance MultiIndex columns.
    multicolumn_format : str, default 'l'
        The alignment for multicolumns, similar to `column_format`
    multirow : bool, default False
        Use \multirow to enhance MultiIndex rows.
    caption : str or tuple, optional
        Tuple (full_caption, short_caption),
        which results in \caption[short_caption]{full_caption};
        if a single string is passed, no short caption will be set.
    label : str, optional
        The LaTeX label to be placed inside ``\label{}`` in the output.
    position : str, optional
        The LaTeX positional argument for tables, to be placed after
        ``\begin{}`` in the output.

    See Also
    --------
    HTMLFormatter
    """

    def __init__(
        self,
        formatter: DataFrameFormatter,
        longtable: bool = False,
        column_format: Optional[str] = None,
        multicolumn: bool = False,
        multicolumn_format: Optional[str] = None,
        multirow: bool = False,
        caption: Optional[Union[str, Tuple[str, str]]] = None,
        label: Optional[str] = None,
        position: Optional[str] = None,
    ):
        self.fmt = formatter
        self.frame = self.fmt.frame
        self.longtable = longtable
        # Goes through the property setter, which reads self.fmt/self.frame
        # when no explicit format is given, so it must come after them.
        self.column_format = column_format
        self.multicolumn = multicolumn
        self.multicolumn_format = multicolumn_format
        self.multirow = multirow
        self.caption, self.short_caption = _split_into_full_short_caption(caption)
        self.label = label
        self.position = position

    def to_string(self) -> str:
        """
        Render a DataFrame to a LaTeX tabular, longtable, or table/tabular
        environment output.
        """
        return self.builder.get_result()

    @property
    def builder(self) -> TableBuilderAbstract:
        """Concrete table builder.

        Returns
        -------
        TableBuilder
        """
        builder = self._select_builder()
        return builder(
            formatter=self.fmt,
            column_format=self.column_format,
            multicolumn=self.multicolumn,
            multicolumn_format=self.multicolumn_format,
            multirow=self.multirow,
            caption=self.caption,
            short_caption=self.short_caption,
            label=self.label,
            position=self.position,
        )

    def _select_builder(self) -> Type[TableBuilderAbstract]:
        """Select proper table builder."""
        if self.longtable:
            return LongTableBuilder
        # Caption, label and position all need the enclosing table float.
        if any([self.caption, self.label, self.position]):
            return RegularTableBuilder
        return TabularBuilder

    @property
    def column_format(self) -> Optional[str]:
        """Column format."""
        return self._column_format

    @column_format.setter
    def column_format(self, input_column_format: Optional[str]) -> None:
        """Setter for column format."""
        if input_column_format is None:
            self._column_format = (
                self._get_index_format() + self._get_column_format_based_on_dtypes()
            )
        elif not isinstance(input_column_format, str):
            raise ValueError(
                f"column_format must be str or unicode, "
                f"not {type(input_column_format)}"
            )
        else:
            self._column_format = input_column_format

    def _get_column_format_based_on_dtypes(self) -> str:
        """Get column format based on data type.

        Right alignment for numbers and left - for strings.
        """

        def get_col_type(dtype):
            if issubclass(dtype.type, np.number):
                return "r"
            return "l"

        dtypes = self.frame.dtypes._values
        return "".join(map(get_col_type, dtypes))

    def _get_index_format(self) -> str:
        """Get index column format."""
        return "l" * self.frame.index.nlevels if self.fmt.index else ""


# Single-pass replacement table for LaTeX special characters.  None of the
# replacement texts contains a character that a later entry would rewrite,
# so one simultaneous pass gives the same result as replacing them in turn.
_LATEX_ESCAPE_TABLE = str.maketrans(
    {
        "\\": "\\textbackslash ",
        "_": "\\_",
        "%": "\\%",
        "$": "\\$",
        "#": "\\#",
        "{": "\\{",
        "}": "\\}",
        "~": "\\textasciitilde ",
        "^": "\\textasciicircum ",
        "&": "\\&",
    }
)


def _escape_symbols(row: Sequence[str]) -> List[str]:
    """Carry out string replacements for special symbols.

    Parameters
    ----------
    row : list
        List of string, that may contain special symbols.

    Returns
    -------
    list
        list of strings with the special symbols replaced.
        Empty strings and the literal ``"{}"`` are returned as ``"{}"``.
    """
    return [
        x.translate(_LATEX_ESCAPE_TABLE) if (x and x != "{}") else "{}"
        for x in row
    ]


def _convert_to_bold(crow: Sequence[str], ilevels: int) -> List[str]:
    """Convert elements in ``crow`` to bold.

    Only the first ``ilevels`` entries (the index cells) are wrapped, and
    blank or ``"{}"`` cells are left as they are.
    """
    return [
        f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x
        for j, x in enumerate(crow)
    ]


if __name__ == "__main__":
    import doctest

    doctest.testmod()