"""Implementation of ARFF parsers: via LIAC-ARFF and pandas.""" import itertools import re from collections import OrderedDict from collections.abc import Generator from typing import List import numpy as np import scipy as sp from ..externals import _arff from ..externals._arff import ArffSparseDataType from ..utils._chunking import chunk_generator, get_chunk_n_rows from ..utils._optional_dependencies import check_pandas_support from ..utils.fixes import pd_fillna def _split_sparse_columns( arff_data: ArffSparseDataType, include_columns: List ) -> ArffSparseDataType: """Obtains several columns from sparse ARFF representation. Additionally, the column indices are re-labelled, given the columns that are not included. (e.g., when including [1, 2, 3], the columns will be relabelled to [0, 1, 2]). Parameters ---------- arff_data : tuple A tuple of three lists of equal size; first list indicating the value, second the x coordinate and the third the y coordinate. include_columns : list A list of columns to include. Returns ------- arff_data_new : tuple Subset of arff data with only the include columns indicated by the include_columns argument. """ arff_data_new: ArffSparseDataType = (list(), list(), list()) reindexed_columns = { column_idx: array_idx for array_idx, column_idx in enumerate(include_columns) } for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): if col_idx in include_columns: arff_data_new[0].append(val) arff_data_new[1].append(row_idx) arff_data_new[2].append(reindexed_columns[col_idx]) return arff_data_new def _sparse_data_to_array( arff_data: ArffSparseDataType, include_columns: List ) -> np.ndarray: # turns the sparse data back into an array (can't use toarray() function, # as this does only work on numeric data) num_obs = max(arff_data[1]) + 1 y_shape = (num_obs, len(include_columns)) reindexed_columns = { column_idx: array_idx for array_idx, column_idx in enumerate(include_columns) } # TODO: improve for efficiency y = np.empty(y_shape, dtype=np.float64) for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): if col_idx in include_columns: y[row_idx, reindexed_columns[col_idx]] = val return y def _post_process_frame(frame, feature_names, target_names): """Post process a dataframe to select the desired columns in `X` and `y`. Parameters ---------- frame : dataframe The dataframe to split into `X` and `y`. feature_names : list of str The list of feature names to populate `X`. target_names : list of str The list of target names to populate `y`. Returns ------- X : dataframe The dataframe containing the features. y : {series, dataframe} or None The series or dataframe containing the target. """ X = frame[feature_names] if len(target_names) >= 2: y = frame[target_names] elif len(target_names) == 1: y = frame[target_names[0]] else: y = None return X, y def _liac_arff_parser( gzip_file, output_arrays_type, openml_columns_info, feature_names_to_select, target_names_to_select, shape=None, ): """ARFF parser using the LIAC-ARFF library coded purely in Python. This parser is quite slow but consumes a generator. Currently it is needed to parse sparse datasets. For dense datasets, it is recommended to instead use the pandas-based parser, although it does not always handles the dtypes exactly the same. Parameters ---------- gzip_file : GzipFile instance The file compressed to be read. output_arrays_type : {"numpy", "sparse", "pandas"} The type of the arrays that will be returned. The possibilities ara: - `"numpy"`: both `X` and `y` will be NumPy arrays; - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a pandas Series or DataFrame. columns_info : dict The information provided by OpenML regarding the columns of the ARFF file. feature_names_to_select : list of str A list of the feature names to be selected. target_names_to_select : list of str A list of the target names to be selected. Returns ------- X : {ndarray, sparse matrix, dataframe} The data matrix. y : {ndarray, dataframe, series} The target. frame : dataframe or None A dataframe containing both `X` and `y`. `None` if `output_array_type != "pandas"`. categories : list of str or None The names of the features that are categorical. `None` if `output_array_type == "pandas"`. """ def _io_to_generator(gzip_file): for line in gzip_file: yield line.decode("utf-8") stream = _io_to_generator(gzip_file) # find which type (dense or sparse) ARFF type we will have to deal with return_type = _arff.COO if output_arrays_type == "sparse" else _arff.DENSE_GEN # we should not let LIAC-ARFF to encode the nominal attributes with NumPy # arrays to have only numerical values. encode_nominal = not (output_arrays_type == "pandas") arff_container = _arff.load( stream, return_type=return_type, encode_nominal=encode_nominal ) columns_to_select = feature_names_to_select + target_names_to_select categories = { name: cat for name, cat in arff_container["attributes"] if isinstance(cat, list) and name in columns_to_select } if output_arrays_type == "pandas": pd = check_pandas_support("fetch_openml with as_frame=True") columns_info = OrderedDict(arff_container["attributes"]) columns_names = list(columns_info.keys()) # calculate chunksize first_row = next(arff_container["data"]) first_df = pd.DataFrame([first_row], columns=columns_names, copy=False) row_bytes = first_df.memory_usage(deep=True).sum() chunksize = get_chunk_n_rows(row_bytes) # read arff data with chunks columns_to_keep = [col for col in columns_names if col in columns_to_select] dfs = [first_df[columns_to_keep]] for data in chunk_generator(arff_container["data"], chunksize): dfs.append( pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] ) # dfs[0] contains only one row, which may not have enough data to infer to # column's dtype. Here we use `dfs[1]` to configure the dtype in dfs[0] if len(dfs) >= 2: dfs[0] = dfs[0].astype(dfs[1].dtypes) # liac-arff parser does not depend on NumPy and uses None to represent # missing values. To be consistent with the pandas parser, we replace # None with np.nan. frame = pd.concat(dfs, ignore_index=True) frame = pd_fillna(pd, frame) del dfs, first_df # cast the columns frame dtypes = {} for name in frame.columns: column_dtype = openml_columns_info[name]["data_type"] if column_dtype.lower() == "integer": # Use a pandas extension array instead of np.int64 to be able # to support missing values. dtypes[name] = "Int64" elif column_dtype.lower() == "nominal": dtypes[name] = "category" else: dtypes[name] = frame.dtypes[name] frame = frame.astype(dtypes) X, y = _post_process_frame( frame, feature_names_to_select, target_names_to_select ) else: arff_data = arff_container["data"] feature_indices_to_select = [ int(openml_columns_info[col_name]["index"]) for col_name in feature_names_to_select ] target_indices_to_select = [ int(openml_columns_info[col_name]["index"]) for col_name in target_names_to_select ] if isinstance(arff_data, Generator): if shape is None: raise ValueError( "shape must be provided when arr['data'] is a Generator" ) if shape[0] == -1: count = -1 else: count = shape[0] * shape[1] data = np.fromiter( itertools.chain.from_iterable(arff_data), dtype="float64", count=count, ) data = data.reshape(*shape) X = data[:, feature_indices_to_select] y = data[:, target_indices_to_select] elif isinstance(arff_data, tuple): arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select) num_obs = max(arff_data[1]) + 1 X_shape = (num_obs, len(feature_indices_to_select)) X = sp.sparse.coo_matrix( (arff_data_X[0], (arff_data_X[1], arff_data_X[2])), shape=X_shape, dtype=np.float64, ) X = X.tocsr() y = _sparse_data_to_array(arff_data, target_indices_to_select) else: # This should never happen raise ValueError( f"Unexpected type for data obtained from arff: {type(arff_data)}" ) is_classification = { col_name in categories for col_name in target_names_to_select } if not is_classification: # No target pass elif all(is_classification): y = np.hstack( [ np.take( np.asarray(categories.pop(col_name), dtype="O"), y[:, i : i + 1].astype(int, copy=False), ) for i, col_name in enumerate(target_names_to_select) ] ) elif any(is_classification): raise ValueError( "Mix of nominal and non-nominal targets is not currently supported" ) # reshape y back to 1-D array, if there is only 1 target column; # back to None if there are not target columns if y.shape[1] == 1: y = y.reshape((-1,)) elif y.shape[1] == 0: y = None if output_arrays_type == "pandas": return X, y, frame, None return X, y, None, categories def _pandas_arff_parser( gzip_file, output_arrays_type, openml_columns_info, feature_names_to_select, target_names_to_select, read_csv_kwargs=None, ): """ARFF parser using `pandas.read_csv`. This parser uses the metadata fetched directly from OpenML and skips the metadata headers of ARFF file itself. The data is loaded as a CSV file. Parameters ---------- gzip_file : GzipFile instance The GZip compressed file with the ARFF formatted payload. output_arrays_type : {"numpy", "sparse", "pandas"} The type of the arrays that will be returned. The possibilities are: - `"numpy"`: both `X` and `y` will be NumPy arrays; - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a pandas Series or DataFrame. openml_columns_info : dict The information provided by OpenML regarding the columns of the ARFF file. feature_names_to_select : list of str A list of the feature names to be selected to build `X`. target_names_to_select : list of str A list of the target names to be selected to build `y`. read_csv_kwargs : dict, default=None Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite the default options. Returns ------- X : {ndarray, sparse matrix, dataframe} The data matrix. y : {ndarray, dataframe, series} The target. frame : dataframe or None A dataframe containing both `X` and `y`. `None` if `output_array_type != "pandas"`. categories : list of str or None The names of the features that are categorical. `None` if `output_array_type == "pandas"`. """ import pandas as pd # read the file until the data section to skip the ARFF metadata headers for line in gzip_file: if line.decode("utf-8").lower().startswith("@data"): break dtypes = {} for name in openml_columns_info: column_dtype = openml_columns_info[name]["data_type"] if column_dtype.lower() == "integer": # Use Int64 to infer missing values from data # XXX: this line is not covered by our tests. Is this really needed? dtypes[name] = "Int64" elif column_dtype.lower() == "nominal": dtypes[name] = "category" # since we will not pass `names` when reading the ARFF file, we need to translate # `dtypes` from column names to column indices to pass to `pandas.read_csv` dtypes_positional = { col_idx: dtypes[name] for col_idx, name in enumerate(openml_columns_info) if name in dtypes } default_read_csv_kwargs = { "header": None, "index_col": False, # always force pandas to not use the first column as index "na_values": ["?"], # missing values are represented by `?` "keep_default_na": False, # only `?` is a missing value given the ARFF specs "comment": "%", # skip line starting by `%` since they are comments "quotechar": '"', # delimiter to use for quoted strings "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs "escapechar": "\\", "dtype": dtypes_positional, } read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})} frame = pd.read_csv(gzip_file, **read_csv_kwargs) try: # Setting the columns while reading the file will select the N first columns # and not raise a ParserError. Instead, we set the columns after reading the # file and raise a ParserError if the number of columns does not match the # number of columns in the metadata given by OpenML. frame.columns = [name for name in openml_columns_info] except ValueError as exc: raise pd.errors.ParserError( "The number of columns provided by OpenML does not match the number of " "columns inferred by pandas when reading the file." ) from exc columns_to_select = feature_names_to_select + target_names_to_select columns_to_keep = [col for col in frame.columns if col in columns_to_select] frame = frame[columns_to_keep] # `pd.read_csv` automatically handles double quotes for quoting non-numeric # CSV cell values. Contrary to LIAC-ARFF, `pd.read_csv` cannot be configured to # consider either single quotes and double quotes as valid quoting chars at # the same time since this case does not occur in regular (non-ARFF) CSV files. # To mimic the behavior of LIAC-ARFF parser, we manually strip single quotes # on categories as a post-processing steps if needed. # # Note however that we intentionally do not attempt to do this kind of manual # post-processing of (non-categorical) string-typed columns because we cannot # resolve the ambiguity of the case of CSV cell with nesting quoting such as # `"'some string value'"` with pandas. single_quote_pattern = re.compile(r"^'(?P.*)'$") def strip_single_quotes(input_string): match = re.search(single_quote_pattern, input_string) if match is None: return input_string return match.group("contents") categorical_columns = [ name for name, dtype in frame.dtypes.items() if isinstance(dtype, pd.CategoricalDtype) ] for col in categorical_columns: frame[col] = frame[col].cat.rename_categories(strip_single_quotes) X, y = _post_process_frame(frame, feature_names_to_select, target_names_to_select) if output_arrays_type == "pandas": return X, y, frame, None else: X, y = X.to_numpy(), y.to_numpy() categories = { name: dtype.categories.tolist() for name, dtype in frame.dtypes.items() if isinstance(dtype, pd.CategoricalDtype) } return X, y, None, categories def load_arff_from_gzip_file( gzip_file, parser, output_type, openml_columns_info, feature_names_to_select, target_names_to_select, shape=None, read_csv_kwargs=None, ): """Load a compressed ARFF file using a given parser. Parameters ---------- gzip_file : GzipFile instance The file compressed to be read. parser : {"pandas", "liac-arff"} The parser used to parse the ARFF file. "pandas" is recommended but only supports loading dense datasets. output_type : {"numpy", "sparse", "pandas"} The type of the arrays that will be returned. The possibilities ara: - `"numpy"`: both `X` and `y` will be NumPy arrays; - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a pandas Series or DataFrame. openml_columns_info : dict The information provided by OpenML regarding the columns of the ARFF file. feature_names_to_select : list of str A list of the feature names to be selected. target_names_to_select : list of str A list of the target names to be selected. read_csv_kwargs : dict, default=None Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite the default options. Returns ------- X : {ndarray, sparse matrix, dataframe} The data matrix. y : {ndarray, dataframe, series} The target. frame : dataframe or None A dataframe containing both `X` and `y`. `None` if `output_array_type != "pandas"`. categories : list of str or None The names of the features that are categorical. `None` if `output_array_type == "pandas"`. """ if parser == "liac-arff": return _liac_arff_parser( gzip_file, output_type, openml_columns_info, feature_names_to_select, target_names_to_select, shape, ) elif parser == "pandas": return _pandas_arff_parser( gzip_file, output_type, openml_columns_info, feature_names_to_select, target_names_to_select, read_csv_kwargs, ) else: raise ValueError( f"Unknown parser: '{parser}'. Should be 'liac-arff' or 'pandas'." )