
# Repository-viewer metadata captured with this copy of the file:
# 183 lines, 6.4 KiB; history entry dated 2021-06-06 22:13:05 +02:00.
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
# License: BSD 3 clause
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
class TransactionEncoder(BaseEstimator, TransformerMixin):
    """Encoder class for transaction data in Python lists.

    Parameters
    ------------
    None

    Attributes
    ------------
    columns_: list
      List of unique names in the `X` input list of lists
    columns_mapping_: dict
      Maps each name in `columns_` to its column index.

    Examples
    ------------
    For usage examples, please see the TransactionEncoder page of the
    mlxtend user guide:
    https://rasbt.github.io/mlxtend/user_guide/preprocessing/TransactionEncoder/

    """

    def __init__(self):
        # No hyperparameters; all fitted state is created by `fit`.
        pass

    def fit(self, X):
        """Learn the unique item names (columns) from the transactions.

        Parameters
        ------------
        X : list of lists
          A python list of lists, where the outer list stores the
          n transactions and the inner list stores the items in each
          transaction.

          For example,
          [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

        Returns
        ------------
        self : TransactionEncoder
          The fitted encoder, with `columns_` and `columns_mapping_` set.

        """
        unique_items = set()
        for transaction in X:
            unique_items.update(transaction)
        # Sorting fixes the column order independently of input order.
        self.columns_ = sorted(unique_items)
        self.columns_mapping_ = {
            item: col_idx for col_idx, item in enumerate(self.columns_)
        }
        return self

    def transform(self, X, sparse=False):
        """Transform transactions into a one-hot encoded NumPy array.

        Parameters
        ------------
        X : list of lists
          A python list of lists, where the outer list stores the
          n transactions and the inner list stores the items in each
          transaction.

          For example,
          [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

        sparse: bool (default=False)
          If True, transform will return Compressed Sparse Row matrix
          instead of the regular one.

        Returns
        ------------
        array : NumPy array [n_transactions, n_unique_items]
           if sparse=False (default).
           Compressed Sparse Row matrix otherwise
           The one-hot encoded boolean array of the input transactions,
           where the columns represent the unique items found in the input
           array in alphabetic order. Exact representation depends
           on the sparse argument

           For example,
           array([[True , False, True , True , False, True ],
                  [True , False, True , False, False, True ],
                  [True , False, True , False, False, False],
                  [True , True , False, False, False, False],
                  [False, False, True , True , True , True ],
                  [False, False, True , False, True , True ],
                  [False, False, True , False, True , False],
                  [True , True , False, False, False, False]])
          The corresponding column labels are available as self.columns_,
          e.g., ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']

        Raises
        ------------
        KeyError
          If a transaction contains an item that is not a key of
          `columns_mapping_` (i.e. it was not seen by `fit`).

        """
        mapping = self.columns_mapping_
        if sparse:
            # CSR layout: the column indices of row i are
            # indices[indptr[i]:indptr[i + 1]].
            indptr = [0]
            indices = []
            for transaction in X:
                # A set removes duplicate items within a transaction, which
                # would otherwise yield duplicate entries in the same row
                # (the original note says conversion to a sparse DataFrame
                # fails on duplicates). Sorting keeps the result independent
                # of set iteration order.
                indices.extend(sorted({mapping[item] for item in transaction}))
                indptr.append(len(indices))
            non_sparse_values = [True] * len(indices)
            # Explicit shape: without it the column count is inferred from
            # the largest index present, so trailing columns that do not
            # occur in X would be dropped and the result would not line up
            # with the dense output or with `columns_`.
            array = csr_matrix(
                (non_sparse_values, indices, indptr),
                shape=(len(indptr) - 1, len(self.columns_)),
                dtype=bool,
            )
        else:
            array = np.zeros((len(X), len(self.columns_)), dtype=bool)
            for row_idx, transaction in enumerate(X):
                for item in transaction:
                    array[row_idx, mapping[item]] = True
        return array

    def inverse_transform(self, array):
        """Transforms an encoded NumPy array back into transactions.

        Parameters
        ------------
        array : NumPy array [n_transactions, n_unique_items]
            The NumPy one-hot encoded boolean array of the input
            transactions, where the columns represent the unique items
            found in the input array in alphabetic order

            For example,
            array([[True , False, True , True , False, True ],
                   [True , False, True , False, False, True ],
                   [True , False, True , False, False, False],
                   [True , True , False, False, False, False],
                   [False, False, True , True , True , True ],
                   [False, False, True , False, True , True ],
                   [False, False, True , False, True , False],
                   [True , True , False, False, False, False]])
            The corresponding column labels are available as self.columns_,
            e.g., ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']

        Returns
        ------------
        X : list of lists
            A python list of lists, where the outer list stores the
            n transactions and the inner list stores the items in each
            transaction. Items appear in `columns_` order.

            For example,
            [['Apple', 'Beer', 'Chicken', 'Rice'],
             ['Apple', 'Beer', 'Rice'],
             ['Apple', 'Beer'],
             ['Apple', 'Bananas'],
             ['Beer', 'Chicken', 'Milk', 'Rice'],
             ['Beer', 'Milk', 'Rice'],
             ['Beer', 'Milk'],
             ['Apple', 'Bananas']]

        """
        return [
            [self.columns_[idx] for idx, cell in enumerate(row) if cell]
            for row in array
        ]

    def fit_transform(self, X, sparse=False):
        """Fit a TransactionEncoder encoder and transform a dataset."""
        return self.fit(X).transform(X, sparse=sparse)