# File: projektAI/venv/Lib/site-packages/mlxtend/frequent_patterns/fpgrowth.py
# (134 lines, 4.6 KiB, Python; last changed 2021-06-06 22:13:05 +02:00)

# mlxtend Machine Learning Library Extensions
# Author: Steve Harenberg <harenbergsd@gmail.com>
#
# License: BSD 3 clause
import math
import itertools
from ..frequent_patterns import fpcommon as fpc
def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
    """Get frequent itemsets from a one-hot DataFrame

    Parameters
    -----------
    df : pandas DataFrame
      pandas DataFrame in one-hot encoded format. Also supports
      DataFrames with sparse data; for more info, please
      see (https://pandas.pydata.org/pandas-docs/stable/
      user_guide/sparse.html#sparse-data-structures)

      Please note that the old pandas SparseDataFrame format
      is no longer supported in mlxtend >= 0.17.2.

      The allowed values are either 0/1 or True/False.
      For example,

    ```
           Apple  Bananas   Beer  Chicken   Milk   Rice
        0   True    False   True     True  False   True
        1   True    False   True    False  False   True
        2   True    False   True    False  False  False
        3   True     True  False    False  False  False
        4  False    False   True     True   True   True
        5  False    False   True    False   True   True
        6  False    False   True    False   True  False
        7   True     True  False    False  False  False
    ```

    min_support : float (default: 0.5)
      A float between 0 and 1 for minimum support of the itemsets returned.
      The support is computed as the fraction
      transactions_where_item(s)_occur / total_transactions.

    use_colnames : bool (default: False)
      If true, uses the DataFrames' column names in the returned DataFrame
      instead of column indices.

    max_len : int (default: None)
      Maximum length of the itemsets generated. If `None` (default) all
      possible itemsets lengths are evaluated.

    verbose : int (default: 0)
      Shows the stages of conditional tree generation.

    Returns
    -----------
    pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
      whose support is >= `min_support` and whose length is <= `max_len`
      (if `max_len` is not None).
      Each itemset in the 'itemsets' column is of type `frozenset`,
      which is a Python built-in type that behaves similarly to
      sets except that it is immutable
      (For more info, see
      https://docs.python.org/3.6/library/stdtypes.html#frozenset).

    Raises
    -----------
    ValueError
      If `min_support` is not strictly positive.
      NOTE(review): `fpc.valid_input_check` may raise on malformed
      input as well -- confirm against fpcommon.

    Examples
    ----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/

    """
    fpc.valid_input_check(df)

    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    # Column position -> column label; only needed when the caller wants
    # labels instead of positional indices in the result.
    colname_map = None
    if use_colnames:
        colname_map = dict(enumerate(df.columns))

    tree, _ = fpc.setup_fptree(df, min_support)
    # Relative support -> absolute transaction count (rounded up).
    minsup = math.ceil(min_support * len(df.index))
    generator = fpg_step(tree, minsup, colname_map, max_len, verbose)

    return fpc.generate_itemsets(generator, len(df.index), colname_map)
def fpg_step(tree, minsup, colnames, max_len, verbose):
    """
    Performs a recursive step of the fpgrowth algorithm.

    Parameters
    ----------
    tree : FPTree
        Tree to mine. This function relies on `tree.nodes` (mapping of
        item -> list of nodes carrying a `count`), `tree.cond_items`
        (list of items the tree is conditioned on), `tree.is_path()`,
        `tree.conditional_tree(item, minsup)` and
        `tree.print_status(count, colnames)`.
    minsup : int
        Minimum support as an absolute count; forwarded unchanged to
        `tree.conditional_tree`.
    colnames : dict or None
        Only forwarded to `tree.print_status` when `verbose` is truthy.
    max_len : int or None
        Upper bound on the length of yielded itemsets. A falsy value
        (`None` or 0) disables the bound.
    verbose : int
        If truthy, `tree.print_status` is called once per step with the
        number of itemsets this step produced.

    Yields
    ------
    tuple (support, itemset)
        `support` is the count of the itemset and `itemset` is a list:
        `tree.cond_items` followed by the newly added item(s).

    """
    count = 0
    items = tree.nodes.keys()

    # Both facts are needed for emitting itemsets and again for deciding
    # whether to recurse, so they are computed once up front.
    is_path = tree.is_path()
    prefix_len = len(tree.cond_items)
    can_extend = not max_len or max_len > prefix_len

    if is_path:
        # If the tree is a path, we can combinatorially generate all
        # remaining itemsets without generating additional conditional trees
        size_remain = len(items) + 1
        if max_len:
            # Clamp to the number of items: larger combination sizes are
            # always empty, so iterating over them is wasted work.
            size_remain = min(size_remain, max_len - prefix_len + 1)
        for size in range(1, size_remain):
            for itemset in itertools.combinations(items, size):
                count += 1
                # On a path every item has exactly one node; the rarest
                # item bounds the support of the whole combination.
                support = min(tree.nodes[i][0].count for i in itemset)
                yield support, tree.cond_items + list(itemset)
    elif can_extend:
        for item in items:
            count += 1
            support = sum(node.count for node in tree.nodes[item])
            yield support, tree.cond_items + [item]

    if verbose:
        tree.print_status(count, colnames)

    # Generate conditional trees to generate frequent itemsets one item larger
    if not is_path and can_extend:
        for item in items:
            cond_tree = tree.conditional_tree(item, minsup)
            yield from fpg_step(cond_tree, minsup,
                                colnames, max_len, verbose)