335 lines
12 KiB
Python
335 lines
12 KiB
Python
# Sebastian Raschka 2014-2020
|
|
# myxtend Machine Learning Library Extensions
|
|
# Author: Sebastian Raschka <sebastianraschka.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from ..frequent_patterns import fpcommon as fpc
|
|
|
|
|
|
def generate_new_combinations(old_combinations):
    """
    Lazily produce the candidate itemsets for the next Apriori step.

    Every combination from the previous step is extended by each item id
    that is strictly greater than its last (largest) element, where the
    pool of item ids is the set of ids occurring anywhere in
    `old_combinations`.

    Parameters
    -----------
    old_combinations: np.array
        All combinations with enough support in the last step.
        Combinations are represented by a matrix.
        Number of columns is equal to the combination size
        of the previous step.
        Each row represents one combination
        and contains item type ids in the ascending order
        ```
               0        1
        0      15       20
        1      15       22
        2      17       19
        ```

    Returns
    -----------
    Generator yielding item ids one at a time as a flat stream: for each
    new combination, the ids of the old combination followed by the
    appended item id. The consumer is expected to regroup the stream
    into rows of `old_combinations.shape[1] + 1` values.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori

    """

    candidate_items = np.unique(old_combinations.flatten())
    for row in old_combinations:
        largest_item = row[-1]
        # Rows are sorted ascending, so only ids above the last element
        # can extend the row without producing duplicates.
        extensions = candidate_items[candidate_items > largest_item]
        prefix = tuple(row)
        for extension in extensions:
            yield from prefix + (extension,)
|
|
|
|
|
|
def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):
    """
    Lazily produce the next-step Apriori candidates that reach `min_support`.

    Each combination from the previous step is extended by every item id
    greater than its last element; the number of transactions containing
    the extended combination is counted immediately, and only extensions
    whose count reaches `min_support * X.shape[0]` are emitted.

    Parameters
    -----------
    old_combinations: np.array
        All combinations with enough support in the last step.
        Combinations are represented by a matrix.
        Number of columns is equal to the combination size
        of the previous step.
        Each row represents one combination
        and contains item type ids in the ascending order
        ```
               0        1
        0      15       20
        1      15       22
        2      17       19
        ```

    X: np.array or scipy sparse matrix
      The allowed values are either 0/1 or True/False.
      For example,

    ```
        0     True  False  True  True  False  True
        1     True  False  True  False  False  True
        2     True  False  True  False  False  False
        3     True  True  False  False  False  False
        4     False  False  True  True  True  True
        5     False  False  True  False  True  True
        6     False  False  True  False  True  False
        7     True  True  False  False  False  False
    ```

    min_support : float
        A float between 0 and 1 for minimum support of the itemsets returned.
        The support is computed as the fraction
        `transactions_where_item(s)_occur / total_transactions`.

    is_sparse : bool True if X is sparse

    Returns
    -----------
    Generator yielding values one at a time as a flat stream: for each
    retained combination, the number of transactions in which it occurs,
    then the item ids of the old combination, then the appended item id.
    Extensions whose count is below the threshold are not emitted.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/

    """

    candidate_items = np.unique(old_combinations.flatten())
    n_transactions = X.shape[0]
    # Absolute transaction count an itemset must reach to be kept.
    min_count = min_support * n_transactions
    for row in old_combinations:
        largest_item = row[-1]
        extensions = candidate_items[candidate_items > largest_item]
        prefix = tuple(row)
        if not is_sparse:
            # Transactions that contain every item of the old combination.
            has_prefix = X[:, prefix].all(axis=1)
            counts = X[has_prefix][:, extensions].sum(axis=0)
        else:
            # Only the needed columns are converted to dense arrays.
            has_prefix = X[:, prefix].toarray().all(axis=1)
            extension_cols = X[:, extensions].toarray()
            counts = extension_cols[has_prefix].sum(axis=0)
        frequent_positions = (counts >= min_count).nonzero()[0]
        for pos in frequent_positions:
            yield from (counts[pos],) + prefix + (extensions[pos],)
|
|
|
|
|
|
def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
            low_memory=False):
    """Get frequent itemsets from a one-hot DataFrame

    Parameters
    -----------
    df : pandas DataFrame
      pandas DataFrame the encoded format. Also supports
      DataFrames with sparse data; for more info, please
      see (https://pandas.pydata.org/pandas-docs/stable/
           user_guide/sparse.html#sparse-data-structures)

      Please note that the old pandas SparseDataFrame format
      is no longer supported in mlxtend >= 0.17.2.

      The allowed values are either 0/1 or True/False.
      For example,

    ```
             Apple  Bananas   Beer  Chicken   Milk   Rice
        0     True    False   True     True  False   True
        1     True    False   True    False  False   True
        2     True    False   True    False  False  False
        3     True     True  False    False  False  False
        4    False    False   True     True   True   True
        5    False    False   True    False   True   True
        6    False    False   True    False   True  False
        7     True     True  False    False  False  False
    ```

    min_support : float (default: 0.5)
      A float between 0 and 1 for minimum support of the itemsets returned.
      The support is computed as the fraction
      `transactions_where_item(s)_occur / total_transactions`.

    use_colnames : bool (default: False)
      If `True`, uses the DataFrames' column names in the returned DataFrame
      instead of column indices.

    max_len : int (default: None)
      Maximum length of the itemsets generated. If `None` (default) all
      possible itemsets lengths (under the apriori condition) are evaluated.

    verbose : int (default: 0)
      If truthy, prints a progress line for every itemset size processed
      (the size of the candidate array and the itemset size being
      sampled), followed by a final newline.

    low_memory : bool (default: False)
      If `True`, uses an iterator to search for combinations above
      `min_support`.
      Note that while `low_memory=True` should only be used for large dataset
      if memory resources are limited, because this implementation is approx.
      3-6x slower than the default.

    Returns
    -----------
    pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
      whose support is >= `min_support` and whose length is <= `max_len`
      (if `max_len` is not None).
      Each itemset in the 'itemsets' column is of type `frozenset`,
      which is a Python built-in type that behaves similarly to
      sets except that it is immutable
      (For more info, see
      https://docs.python.org/3.6/library/stdtypes.html#frozenset).

    Raises
    -----------
    ValueError
      If `min_support` is not strictly positive.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

    """

    def _support(_x, _n_rows, _is_sparse):
        """DRY private method to calculate support as the
        column-wise sum of values / number of rows

        Parameters
        -----------

        _x : matrix of bools or binary

        _n_rows : numeric, number of rows in _x

        _is_sparse : bool True if _x is sparse
          (currently not used by the computation)

        Returns
        -----------
        np.array, flattened to one dimension, holding one support
          value per column of `_x`

        """
        out = (np.sum(_x, axis=0) / _n_rows)
        # np.sum on a sparse matrix yields a 2-D result; flatten to 1-D.
        return np.array(out).reshape(-1)

    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    fpc.valid_input_check(df)

    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        # dense DataFrame
        X = df.values
        is_sparse = False

    # Step 1: supports of the single items (one per column).
    support = _support(X, X.shape[0], is_sparse)
    ary_col_idx = np.arange(X.shape[1])
    frequent_singles = support >= min_support
    support_dict = {1: support[frequent_singles]}
    itemset_dict = {1: ary_col_idx[frequent_singles].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    # Column of ones used for the sparse comparisons below; only the
    # sparse, non-low-memory branch reads it, so skip the allocation
    # otherwise.
    if is_sparse and not low_memory:
        all_ones = np.ones((int(rows_count), 1))
    else:
        all_ones = None

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1

        # With exceptionally large datasets, the matrix operations can use a
        # substantial amount of memory. For low memory applications or large
        # datasets, set `low_memory=True` to use a slower but more memory-
        # efficient implementation.
        if low_memory:
            combin = generate_new_combinations_low_memory(
                itemset_dict[max_itemset], X, min_support, is_sparse)
            # slightly faster than creating an array from a list of tuples
            combin = np.fromiter(combin, dtype=int)
            # Each row: [transaction count, item ids...]
            combin = combin.reshape(-1, next_max_itemset + 1)

            if combin.size == 0:
                break
            if verbose:
                # NOTE: combin.size is the number of array elements,
                # not the number of rows.
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            itemset_dict[next_max_itemset] = combin[:, 1:]
            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
                / rows_count
            max_itemset = next_max_itemset
        else:
            combin = generate_new_combinations(itemset_dict[max_itemset])
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset)

            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            if is_sparse:
                # AND together, position by position, the columns of every
                # candidate itemset.
                _bools = X[:, combin[:, 0]] == all_ones
                for n in range(1, combin.shape[1]):
                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
            else:
                # X[:, combin] has shape (rows, candidates, itemset size).
                _bools = np.all(X[:, combin], axis=2)

            support = _support(np.array(_bools), rows_count, is_sparse)
            _mask = (support >= min_support).reshape(-1)
            if _mask.any():
                itemset_dict[next_max_itemset] = np.array(combin[_mask])
                support_dict[next_max_itemset] = np.array(support[_mask])
                max_itemset = next_max_itemset
            else:
                # Exit condition
                break

    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = dict(enumerate(df.columns))
        res_df['itemsets'] = res_df['itemsets'].apply(
            lambda x: frozenset([mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used

    return res_df
|