# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
import numpy as np
import pandas as pd

from ..frequent_patterns import fpcommon as fpc


def generate_new_combinations(old_combinations):
"""
    Generator of all combinations based on the last state of the Apriori algorithm
Parameters
-----------
old_combinations: np.array
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
of the previous step.
Each row represents one combination
      and contains item type ids in ascending order
```
0 1
0 15 20
1 15 22
2 17 19
```
Returns
-----------
    Generator of all combinations from the last step x items
    from the previous step. Combinations are yielded as a flat
    stream of item ids; the caller reshapes the stream into a
    matrix (see the sketch below).
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori
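
    A minimal sketch (toy item ids, not from the official docs) of how
    the flat stream is consumed; `np.fromiter` plus `reshape` rebuilds
    the candidate matrix:

    >>> import numpy as np
    >>> old = np.array([[15, 20], [15, 22], [17, 19]])
    >>> flat = generate_new_combinations(old)
    >>> np.fromiter(flat, dtype=int).reshape(-1, 3)
    array([[15, 20, 22],
           [17, 19, 20],
           [17, 19, 22]])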
"""
items_types_in_previous_step = np.unique(old_combinations.flatten())
for old_combination in old_combinations:
max_combination = old_combination[-1]
mask = items_types_in_previous_step > max_combination
valid_items = items_types_in_previous_step[mask]
old_tuple = tuple(old_combination)
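        # Emit each new candidate as a flat stream: the old combination's
        # item ids followed by one strictly larger item id. The caller
        # reshapes the stream into a (n_candidates, size + 1) matrix.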
for item in valid_items:
yield from old_tuple
            yield item


def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):
"""
    Generator of all combinations based on the last state of the Apriori algorithm
Parameters
-----------
old_combinations: np.array
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
of the previous step.
Each row represents one combination
      and contains item type ids in ascending order
```
0 1
0 15 20
1 15 22
2 17 19
```
X: np.array or scipy sparse matrix
The allowed values are either 0/1 or True/False.
For example,
```
0 True False True True False True
1 True False True False False True
2 True False True False False False
3 True True False False False False
4 False False True True True True
5 False False True False True True
6 False False True False True False
7 True True False False False False
```
    min_support : float
      A float between 0 and 1 for minimum support of the itemsets returned.
The support is computed as the fraction
`transactions_where_item(s)_occur / total_transactions`.
is_sparse : bool True if X is sparse
Returns
-----------
Generator of all combinations from the last step x items
    from the previous step. Every combination is yielded as the
    number of transactions in which it occurs, followed by its
    item type ids in ascending order.
    No combinations other than the ones generated here can
    reach the minimum support.
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
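
    A minimal sketch on a toy dense matrix (values are illustrative, not
    from the official docs); each flat record is the support count
    followed by the candidate's item ids:

    >>> import numpy as np
    >>> X = np.array([[1, 1, 1], [1, 1, 0], [1, 0, 1]])
    >>> old = np.array([[0, 1], [0, 2]])
    >>> flat = generate_new_combinations_low_memory(old, X, 0.3, False)
    >>> np.fromiter(flat, dtype=int).reshape(-1, 4)
    array([[1, 0, 1, 2]])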
"""
items_types_in_previous_step = np.unique(old_combinations.flatten())
rows_count = X.shape[0]
threshold = min_support * rows_count
for old_combination in old_combinations:
max_combination = old_combination[-1]
mask = items_types_in_previous_step > max_combination
valid_items = items_types_in_previous_step[mask]
old_tuple = tuple(old_combination)
if is_sparse:
mask_rows = X[:, old_tuple].toarray().all(axis=1)
X_cols = X[:, valid_items].toarray()
supports = X_cols[mask_rows].sum(axis=0)
else:
mask_rows = X[:, old_tuple].all(axis=1)
supports = X[mask_rows][:, valid_items].sum(axis=0)
valid_indices = (supports >= threshold).nonzero()[0]
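        # Emit one flat record per surviving candidate: its transaction
        # count first, then the old item ids, then the new item id.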
for index in valid_indices:
yield supports[index]
yield from old_tuple
            yield valid_items[index]


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
            low_memory=False):
"""Get frequent itemsets from a one-hot DataFrame
Parameters
-----------
df : pandas DataFrame
      pandas DataFrame in one-hot encoded format. Also supports
DataFrames with sparse data; for more info, please
see (https://pandas.pydata.org/pandas-docs/stable/
user_guide/sparse.html#sparse-data-structures)
Please note that the old pandas SparseDataFrame format
is no longer supported in mlxtend >= 0.17.2.
The allowed values are either 0/1 or True/False.
For example,
```
Apple Bananas Beer Chicken Milk Rice
0 True False True True False True
1 True False True False False True
2 True False True False False False
3 True True False False False False
4 False False True True True True
5 False False True False True True
6 False False True False True False
7 True True False False False False
```
min_support : float (default: 0.5)
      A float between 0 and 1 for minimum support of the itemsets returned.
The support is computed as the fraction
`transactions_where_item(s)_occur / total_transactions`.
use_colnames : bool (default: False)
If `True`, uses the DataFrames' column names in the returned DataFrame
instead of column indices.
max_len : int (default: None)
Maximum length of the itemsets generated. If `None` (default) all
possible itemsets lengths (under the apriori condition) are evaluated.
verbose : int (default: 0)
Shows the number of iterations if >= 1 and `low_memory` is `True`. If
>=1 and `low_memory` is `False`, shows the number of combinations.
low_memory : bool (default: False)
If `True`, uses an iterator to search for combinations above
`min_support`.
      Note that `low_memory=True` should only be used for large
      datasets when memory resources are limited, because this
      implementation is approx. 3-6x slower than the default.
Returns
-----------
pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
    that have support >= `min_support` and length < `max_len`
(if `max_len` is not None).
Each itemset in the 'itemsets' column is of type `frozenset`,
which is a Python built-in type that behaves similarly to
sets except that it is immutable
(For more info, see
https://docs.python.org/3.6/library/stdtypes.html#frozenset).
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
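
    A minimal usage sketch on toy data (column names are illustrative,
    not from the official docs); with `use_colnames=True`, itemsets come
    back as frozensets of column names:

    >>> import pandas as pd
    >>> toy = pd.DataFrame({'Apple': [True, True, False],
    ...                     'Beer': [True, True, True],
    ...                     'Milk': [False, True, True]})
    >>> freq = apriori(toy, min_support=0.6, use_colnames=True)
    >>> sorted(freq['itemsets'].map(lambda s: tuple(sorted(s))))
    [('Apple',), ('Apple', 'Beer'), ('Beer',), ('Beer', 'Milk'), ('Milk',)]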
"""
def _support(_x, _n_rows, _is_sparse):
"""DRY private method to calculate support as the
row-wise sum of values / number of rows
Parameters
-----------
_x : matrix of bools or binary
_n_rows : numeric, number of rows in _x
_is_sparse : bool True if _x is sparse
Returns
-----------
        np.array, shape = (n_cols, )
        Support values, one per column of `_x`.
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
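
        A minimal sketch (toy matrix, not from the official docs);
        support is the column-wise mean of a boolean/binary matrix:

        >>> import numpy as np
        >>> _support(np.array([[1, 0], [1, 1]]), 2, False)
        array([1. , 0.5])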
"""
out = (np.sum(_x, axis=0) / _n_rows)
return np.array(out).reshape(-1)
if min_support <= 0.:
raise ValueError('`min_support` must be a positive '
'number within the interval `(0, 1]`. '
'Got %s.' % min_support)
    fpc.valid_input_check(df)

    if hasattr(df, "sparse"):
# DataFrame with SparseArray (pandas >= 0.24)
if df.size == 0:
X = df.values
else:
X = df.sparse.to_coo().tocsc()
is_sparse = True
else:
# dense DataFrame
X = df.values
        is_sparse = False

    support = _support(X, X.shape[0], is_sparse)
ary_col_idx = np.arange(X.shape[1])
support_dict = {1: support[support >= min_support]}
itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
max_itemset = 1
rows_count = float(X.shape[0])
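    # Dense column of ones; comparing candidate columns against it tests
    # membership row-wise and keeps the result a 2-D boolean matrix even
    # when X is sparse.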
    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float('inf')):
next_max_itemset = max_itemset + 1
# With exceptionally large datasets, the matrix operations can use a
# substantial amount of memory. For low memory applications or large
# datasets, set `low_memory=True` to use a slower but more memory-
# efficient implementation.
if low_memory:
combin = generate_new_combinations_low_memory(
itemset_dict[max_itemset], X, min_support, is_sparse)
# slightly faster than creating an array from a list of tuples
combin = np.fromiter(combin, dtype=int)
combin = combin.reshape(-1, next_max_itemset + 1)
if combin.size == 0:
break
if verbose:
print(
'\rProcessing %d combinations | Sampling itemset size %d' %
(combin.size, next_max_itemset), end="")
itemset_dict[next_max_itemset] = combin[:, 1:]
support_dict[next_max_itemset] = combin[:, 0].astype(float) \
/ rows_count
max_itemset = next_max_itemset
else:
combin = generate_new_combinations(itemset_dict[max_itemset])
combin = np.fromiter(combin, dtype=int)
combin = combin.reshape(-1, next_max_itemset)
if combin.size == 0:
break
if verbose:
print(
'\rProcessing %d combinations | Sampling itemset size %d' %
(combin.size, next_max_itemset), end="")
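            # A candidate survives a transaction only if all of its item
            # columns are set; AND the columns together, one item position
            # at a time.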
if is_sparse:
_bools = X[:, combin[:, 0]] == all_ones
for n in range(1, combin.shape[1]):
_bools = _bools & (X[:, combin[:, n]] == all_ones)
else:
_bools = np.all(X[:, combin], axis=2)
support = _support(np.array(_bools), rows_count, is_sparse)
_mask = (support >= min_support).reshape(-1)
if any(_mask):
itemset_dict[next_max_itemset] = np.array(combin[_mask])
support_dict[next_max_itemset] = np.array(support[_mask])
max_itemset = next_max_itemset
else:
# Exit condition
                break

    all_res = []
for k in sorted(itemset_dict):
support = pd.Series(support_dict[k])
itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
dtype='object')
res = pd.concat((support, itemsets), axis=1)
all_res.append(res)
res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']

    if use_colnames:
mapping = {idx: item for idx, item in enumerate(df.columns)}
res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used

    return res_df