# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka
#
# License: BSD 3 clause

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin


class TransactionEncoder(BaseEstimator, TransformerMixin):
    """Encoder class for transaction data in Python lists

    Parameters
    ------------
    None

    Attributes
    ------------
    columns_: list
      List of unique names in the `X` input list of lists
    columns_mapping_: dict
      Maps each name in `columns_` to its column index in the
      encoded array

    Examples
    ------------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/preprocessing/TransactionEncoder/

    """

    def __init__(self):
        # No hyperparameters; everything is learned in `fit`.
        return None

    def fit(self, X):
        """Learn unique column names from a list of transactions

        Parameters
        ------------
        X : list of lists
          A python list of lists, where the outer list stores the
          n transactions and the inner list stores the items in each
          transaction.

          For example,
          [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

        Returns
        ------------
        self : TransactionEncoder
          The fitted encoder, with `columns_` and `columns_mapping_` set.

        """
        unique_items = set()
        for transaction in X:
            unique_items.update(transaction)
        # Sorting makes the column order deterministic.
        self.columns_ = sorted(unique_items)
        self.columns_mapping_ = {
            item: col_idx for col_idx, item in enumerate(self.columns_)
        }
        return self

    def transform(self, X, sparse=False):
        """Transform transactions into a one-hot encoded NumPy array.

        Parameters
        ------------
        X : list of lists
          A python list of lists, where the outer list stores the
          n transactions and the inner list stores the items in each
          transaction.

          For example,
          [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

        sparse: bool (default=False)
          If True, transform will return Compressed Sparse Row matrix
          instead of the regular one.

        Returns
        ------------
        array : NumPy array [n_transactions, n_unique_items]
           if sparse=False (default).
           Compressed Sparse Row matrix otherwise
           The one-hot encoded boolean array of the input transactions,
           where the columns represent the unique items found in the input
           array in alphabetic order. Exact representation depends
           on the sparse argument

           For example,
           array([[True , False, True , True , False, True ],
                  [True , False, True , False, False, True ],
                  [True , False, True , False, False, False],
                  [True , True , False, False, False, False],
                  [False, False, True , True , True , True ],
                  [False, False, True , False, True , True ],
                  [False, False, True , False, True , False],
                  [True , True , False, False, False, False]])
          The corresponding column labels are available as self.columns_,
          e.g., ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']

        Raises
        ------------
        KeyError
          If a transaction contains an item that was not seen in `fit`.

        """
        n_columns = len(self.columns_)
        if sparse:
            indptr = [0]
            indices = []
            for transaction in X:
                # set is necessary because conversion to SparseDataFrame
                # will fail if there are duplicate items
                for item in set(transaction):
                    col_idx = self.columns_mapping_[item]
                    indices.append(col_idx)
                indptr.append(len(indices))
            non_sparse_values = [True] * len(indices)
            # Pass the shape explicitly. Otherwise SciPy infers the number
            # of columns from max(indices) + 1, which drops trailing columns
            # that are absent from X and fails outright when there are no
            # stored entries at all (empty X or only empty transactions).
            n_rows = len(indptr) - 1
            array = csr_matrix(
                (non_sparse_values, indices, indptr),
                shape=(n_rows, n_columns),
                dtype=bool,
            )
        else:
            array = np.zeros((len(X), n_columns), dtype=bool)
            for row_idx, transaction in enumerate(X):
                for item in transaction:
                    col_idx = self.columns_mapping_[item]
                    array[row_idx, col_idx] = True
        return array

    def inverse_transform(self, array):
        """Transforms an encoded NumPy array back into transactions.

        Parameters
        ------------
        array : NumPy array [n_transactions, n_unique_items]
            The NumPy one-hot encoded boolean array of the input transactions,
            where the columns represent the unique items found in the input
            array in alphabetic order. A SciPy sparse matrix, as returned by
            `transform(..., sparse=True)`, is accepted as well.

            For example,
            ```
            array([[True , False, True , True , False, True ],
                  [True , False, True , False, False, True ],
                  [True , False, True , False, False, False],
                  [True , True , False, False, False, False],
                  [False, False, True , True , True , True ],
                  [False, False, True , False, True , True ],
                  [False, False, True , False, True , False],
                  [True , True , False, False, False, False]])
            ```
            The corresponding column labels are available as self.columns_,
            e.g., ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']

        Returns
        ------------
        X : list of lists
            A python list of lists, where the outer list stores the
            n transactions and the inner list stores the items in each
            transaction.

            For example,
            ```
            [['Apple', 'Beer', 'Rice', 'Chicken'],
             ['Apple', 'Beer', 'Rice'],
             ['Apple', 'Beer'],
             ['Apple', 'Bananas'],
             ['Milk', 'Beer', 'Rice', 'Chicken'],
             ['Milk', 'Beer', 'Rice'],
             ['Milk', 'Beer'],
             ['Apple', 'Bananas']]
            ```

        """
        # Sparse input (duck-typed via `tocsr`) cannot be walked cell by
        # cell like a dense array, so read the stored entries row by row.
        if hasattr(array, "tocsr"):
            csr = array.tocsr()
            transactions = []
            for start, stop in zip(csr.indptr[:-1], csr.indptr[1:]):
                # Skip explicitly stored False values and sort the column
                # indices so the item order matches the dense code path.
                col_indices = sorted(
                    idx
                    for idx, value in zip(
                        csr.indices[start:stop], csr.data[start:stop]
                    )
                    if value
                )
                transactions.append(
                    [self.columns_[idx] for idx in col_indices]
                )
            return transactions

        return [
            [self.columns_[idx] for idx, cell in enumerate(row) if cell]
            for row in array
        ]

    def fit_transform(self, X, sparse=False):
        """Fit a TransactionEncoder encoder and transform a dataset."""
        return self.fit(X).transform(X, sparse=sparse)