from typing import List, cast import numpy as np from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency import pandas as pd from pandas.io.excel._base import BaseExcelReader class ODFReader(BaseExcelReader): """ Read tables out of OpenDocument formatted files. Parameters ---------- filepath_or_buffer : string, path to be parsed or an open readable stream. storage_options : dict, optional passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ def __init__( self, filepath_or_buffer: FilePathOrBuffer, storage_options: StorageOptions = None, ): import_optional_dependency("odf") super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): from odf.opendocument import OpenDocument return OpenDocument def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): from odf.opendocument import load return load(filepath_or_buffer) @property def empty_value(self) -> str: """Property for compat with other readers.""" return "" @property def sheet_names(self) -> List[str]: """Return a list of sheet names present in the document""" from odf.table import Table tables = self.book.getElementsByType(Table) return [t.getAttribute("name") for t in tables] def get_sheet_by_index(self, index: int): from odf.table import Table self.raise_if_bad_sheet_by_index(index) tables = self.book.getElementsByType(Table) return tables[index] def get_sheet_by_name(self, name: str): from odf.table import Table self.raise_if_bad_sheet_by_name(name) tables = self.book.getElementsByType(Table) for table in tables: if table.getAttribute("name") == name: return table self.close() raise ValueError(f"sheet {name} not found") def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ from odf.table import CoveredTableCell, TableCell, TableRow covered_cell_name = CoveredTableCell().qname table_cell_name = TableCell().qname cell_names = {covered_cell_name, table_cell_name} sheet_rows = sheet.getElementsByType(TableRow) empty_rows = 0 max_row_len = 0 table: List[List[Scalar]] = [] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 table_row: List[Scalar] = [] for j, sheet_cell in enumerate(sheet_cells): if sheet_cell.qname == table_cell_name: value = self._get_cell_value(sheet_cell, convert_float) else: value = self.empty_value column_repeat = self._get_column_repeat(sheet_cell) # Queue up empty values, writing only if content succeeds them if value == self.empty_value: empty_cells += column_repeat else: table_row.extend([self.empty_value] * empty_cells) empty_cells = 0 table_row.extend([value] * column_repeat) if max_row_len < len(table_row): max_row_len = len(table_row) row_repeat = self._get_row_repeat(sheet_row) if self._is_empty_row(sheet_row): empty_rows += row_repeat else: # add blank rows to our table table.extend([[self.empty_value]] * empty_rows) empty_rows = 0 for _ in range(row_repeat): table.append(table_row) # Make our table square for row in table: if len(row) < max_row_len: row.extend([self.empty_value] * (max_row_len - len(row))) return table def _get_row_repeat(self, row) -> int: """ Return number of times this row was repeated Repeating an empty row appeared to be a common way of representing sparse rows in the table. """ from odf.namespaces import TABLENS return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1)) def _get_column_repeat(self, cell) -> int: from odf.namespaces import TABLENS return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) def _is_empty_row(self, row) -> bool: """ Helper function to find empty rows """ for column in row.childNodes: if len(column.childNodes) > 0: return False return True def _get_cell_value(self, cell, convert_float: bool) -> Scalar: from odf.namespaces import OFFICENS if str(cell) == "#N/A": return np.nan cell_type = cell.attributes.get((OFFICENS, "value-type")) if cell_type == "boolean": if str(cell) == "TRUE": return True return False if cell_type is None: return self.empty_value elif cell_type == "float": # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) if convert_float: val = int(cell_value) if val == cell_value: return val return cell_value elif cell_type == "percentage": cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "string": return self._get_cell_string_value(cell) elif cell_type == "currency": cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "date": cell_value = cell.attributes.get((OFFICENS, "date-value")) return pd.to_datetime(cell_value) elif cell_type == "time": result = pd.to_datetime(str(cell)) result = cast(pd.Timestamp, result) return result.time() else: self.close() raise ValueError(f"Unrecognized type {cell_type}") def _get_cell_string_value(self, cell) -> str: """ Find and decode OpenDocument text:s tags that represent a run length encoded sequence of space characters. """ from odf.element import Element from odf.namespaces import TEXTNS from odf.text import S text_s = S().qname value = [] for fragment in cell.childNodes: if isinstance(fragment, Element): if fragment.qname == text_s: spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) value.append(" " * spaces) else: # recursive impl needed in case of nested fragments # with multiple spaces # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704 value.append(self._get_cell_string_value(fragment)) else: value.append(str(fragment)) return "".join(value)