projektAI/venv/Lib/site-packages/mlxtend/frequent_patterns/fpcommon.py
2021-06-06 22:13:05 +02:00

240 lines
8.1 KiB
Python

import numpy as np
import pandas as pd
import collections
from distutils.version import LooseVersion as Version
from pandas import __version__ as pandas_version
def setup_fptree(df, min_support):
    """Build the initial FP-tree from a one-hot encoded DataFrame.

    Parameters
    ----------
    df : pandas DataFrame
        One row per itemset (transaction) and one column per item. A
        non-zero cell marks the item as present in that row. The frame
        may be dense or made of sparse columns.
    min_support : float
        Threshold compared against each column's sum divided by the
        number of rows; columns below it are left out of the tree.

    Returns
    -------
    tree : FPTree
        Tree into which every row has been inserted, restricted to the
        items that met ``min_support``.
    rank : dict
        Maps the column position of each retained item to its rank.
        Rank 0 is the retained item with the lowest support.
    """
    n_rows = len(df.index)

    # Only a non-empty frame exposing the ``sparse`` accessor is converted
    # to a scipy CSR matrix; everything else is handled as a dense array.
    use_csr = hasattr(df, "sparse") and df.size != 0
    if use_csr:
        matrix = df.sparse.to_coo().tocsr()
    else:
        matrix = df.values

    # Per-column support. On a sparse matrix np.sum yields an np.matrix of
    # shape (1, N), hence the conversion to a flat ndarray.
    support = np.array(np.sum(matrix, axis=0) / float(n_rows)).reshape(-1)
    frequent = np.nonzero(support >= min_support)[0]

    # Order the frequent columns by ascending support and number them.
    by_support = frequent[support[frequent].argsort()]
    rank = {col: pos for pos, col in enumerate(by_support)}

    if use_csr:
        # Explicitly stored zeros would otherwise be read as present items.
        matrix.eliminate_zeros()

    # Rows are inserted with their items ordered from most to least
    # frequent, a heuristic that keeps the tree small.
    tree = FPTree(rank)
    for row in range(n_rows):
        if use_csr:
            # CSR layout: indptr[row]:indptr[row + 1] delimits, inside
            # ``indices``, the column numbers stored for this row.
            start, stop = matrix.indptr[row], matrix.indptr[row + 1]
            present = matrix.indices[start:stop]
        else:
            present = np.where(matrix[row, :])[0]
        kept = sorted((col for col in present if col in rank),
                      key=rank.get, reverse=True)
        tree.insert_itemset(kept)

    return tree, rank
def generate_itemsets(generator, num_itemsets, colname_map):
    """Collect mined itemsets into a DataFrame.

    Parameters
    ----------
    generator : iterable of (support, itemset) pairs
        ``support`` is divided by ``num_itemsets`` before being stored;
        ``itemset`` is any iterable of hashable items.
    num_itemsets : int
        Divisor applied to every support value.
    colname_map : dict or None
        When not None, every item of every itemset is replaced by
        ``colname_map[item]``.

    Returns
    -------
    pandas DataFrame
        Columns ``'support'`` and ``'itemsets'`` (frozensets), one row per
        pair produced by ``generator``, in the order they were produced.
    """
    collected_sets = []
    relative_supports = []
    for abs_support, members in generator:
        collected_sets.append(frozenset(members))
        relative_supports.append(abs_support / num_itemsets)

    res_df = pd.DataFrame({'support': relative_supports,
                           'itemsets': collected_sets})
    if colname_map is None:
        return res_df

    def _rename(itemset):
        # Translate each member through the caller-supplied mapping.
        return frozenset(colname_map[member] for member in itemset)

    res_df['itemsets'] = res_df['itemsets'].apply(_rename)
    return res_df
def valid_input_check(df):
    """Validate that ``df`` only holds the values True, False, 0 and 1.

    Parameters
    ----------
    df : pandas DataFrame
        Dense frame, or frame exposing the ``sparse`` accessor.

    Returns
    -------
    None
        Returned when the frame is empty or passes every check.

    Raises
    ------
    TypeError
        If ``df`` is a legacy ``pandas.core.frame.SparseDataFrame``.
    ValueError
        If a sparse frame's first column label is neither a string nor
        ``0``, or if any inspected cell is not one of True, False, 0, 1.
    """
    # The class is matched by name so that no import of the (removed)
    # SparseDataFrame type is needed.
    if f"{type(df)}" == "<class 'pandas.core.frame.SparseDataFrame'>":
        msg = ("SparseDataFrame support has been deprecated in pandas 1.0,"
               " and is no longer supported in mlxtend. "
               " Please"
               " see the pandas migration guide at"
               " https://pandas.pydata.org/pandas-docs/"
               "stable/user_guide/sparse.html#sparse-data-structures"
               " for supporting sparse data in DataFrames.")
        raise TypeError(msg)

    # Nothing to validate; everything below may assume a non-empty frame.
    if df.size == 0:
        return

    is_sparse = hasattr(df, "sparse")
    if is_sparse:
        first_label = df.columns[0]
        if not isinstance(first_label, str) and first_label != 0:
            raise ValueError('Due to current limitations in Pandas, '
                             'if the sparse format has integer column '
                             'names, please make sure they either start '
                             'with `0` or cast them as string column names: '
                             '`df.columns = [str(i) for i in df.columns]`.')

    # Fast path: if all columns are boolean, there is nothing to check.
    if df.dtypes.apply(pd.api.types.is_bool_dtype).all():
        return

    # Pandas is much slower than numpy, so run the comparison on arrays.
    if is_sparse:
        # Only the entries stored in the COO matrix are inspected.
        values = df.sparse.to_coo().data
    else:
        values = df.values

    idxs = np.where((values != 1) & (values != 0))
    if len(idxs[0]) > 0:
        # idxs has 1 dimension with sparse data and 2 with dense data;
        # report the first offending value.
        val = values[tuple(loc[0] for loc in idxs)]
        s = ('The allowed values for a DataFrame'
             ' are True, False, 0, 1. Found value %s' % (val))
        raise ValueError(s)
class FPTree(object):
    """Prefix tree of itemsets used by the FP-growth style miners.

    The tree keeps, next to its root node, an index from every item to the
    list of nodes carrying that item, plus the list of items the tree has
    been conditioned on so far.
    """

    def __init__(self, rank=None):
        # rank: item -> ordering position, stored as given by the caller.
        self.rank = rank
        # Items this tree has been conditioned on (empty for a fresh tree).
        self.cond_items = []
        # item -> list of all nodes in the tree that hold this item.
        self.nodes = collections.defaultdict(list)
        # Root node; its item is None.
        self.root = FPNode(None)

    def conditional_tree(self, cond_item, minsup):
        """
        Creates and returns the subtree of self conditioned on cond_item.

        Parameters
        ----------
        cond_item : int | str
            Item that the tree (self) will be conditioned on.
        minsup : int
            Minimum support threshold.

        Returns
        -------
        cond_tree : FPtree
        """
        # One prefix path (root -> node) per node carrying cond_item.
        anchors = self.nodes[cond_item]
        prefix_paths = [anchor.itempath_from_root() for anchor in anchors]

        # Each item on a path is credited with the anchor node's count.
        tally = collections.defaultdict(int)
        for anchor, path in zip(anchors, prefix_paths):
            for item in path:
                tally[item] += anchor.count

        # A fresh ordering is computed for the conditional tree; reusing
        # the old one could make deep trees blow up combinatorially.
        survivors = sorted((item for item in tally if tally[item] >= minsup),
                           key=tally.get)
        rank = {item: pos for pos, item in enumerate(survivors)}

        # Re-insert every prefix path, filtered and ordered by the new rank.
        cond_tree = FPTree(rank)
        for anchor, path in zip(anchors, prefix_paths):
            ordered = sorted((item for item in path if item in rank),
                             key=rank.get, reverse=True)
            cond_tree.insert_itemset(ordered, anchor.count)
        cond_tree.cond_items = self.cond_items + [cond_item]

        return cond_tree

    def insert_itemset(self, itemset, count=1):
        """
        Inserts a list of items into the tree.

        Parameters
        ----------
        itemset : list
            Items that will be inserted into the tree.
        count : int
            The number of occurrences of the itemset.
        """
        # The root counts every inserted itemset, empty ones included.
        self.root.count += count

        if len(itemset) == 0:
            return

        # Walk down the part of the path that already exists, bumping
        # the count of each node that is reused.
        node = self.root
        matched = 0
        while matched < len(itemset) and itemset[matched] in node.children:
            node = node.children[itemset[matched]]
            node.count += count
            matched += 1

        # Grow a new branch for whatever is left of the itemset.
        for item in itemset[matched:]:
            new_node = FPNode(item, count, node)
            self.nodes[item].append(new_node)
            node = new_node

    def is_path(self):
        """Return True when the tree never branches.

        That is: the root has at most one child, and every indexed item
        has a single node which itself has at most one child.
        """
        if len(self.root.children) > 1:
            return False
        return all(
            len(same_item) <= 1 and len(same_item[0].children) <= 1
            for same_item in self.nodes.values()
        )

    def print_status(self, count, colnames):
        """Print how many itemsets were produced from this tree.

        Parameters
        ----------
        count : int
            Number reported in the message.
        colnames : mapping/sequence or falsy
            When truthy, used to translate the conditioning items into
            display names; otherwise the items are printed as they are.
        """
        if colnames:
            labels = [str(colnames[i]) for i in self.cond_items]
        else:
            labels = [str(i) for i in self.cond_items]
        joined = ", ".join(labels)
        print('\r%d itemset(s) from tree conditioned on items (%s)' %
              (count, joined), end="\n")
class FPNode(object):
    """Single node of an FP-tree.

    Parameters
    ----------
    item : hashable or None
        Item stored in the node; ``None`` marks a root node.
    count : int
        Initial count of the node.
    parent : FPNode or None
        Node to attach to. When given, the new node registers itself in
        ``parent.children`` under ``item``.
    """

    def __init__(self, item, count=0, parent=None):
        self.item = item
        self.count = count
        self.parent = parent
        # item -> child node. A plain dict is used on purpose: FPNode cannot
        # be built without an ``item``, so it can never serve as a
        # defaultdict factory, and a missing key should be a KeyError.
        self.children = {}

        if parent is not None:
            parent.children[item] = self

    def itempath_from_root(self):
        """Return the items of this node's ancestors, ordered top-down.

        The list starts just below the root and ends at this node's
        parent: neither the root (whose item is None) nor this node's own
        item is included. A root node, or a node that has no parent,
        yields an empty list.
        """
        path = []
        if self.item is None:
            return path

        # Climb towards the root, stopping at the first node whose item is
        # None or when there is no parent left to visit.
        node = self.parent
        while node is not None and node.item is not None:
            path.append(node.item)
            node = node.parent

        # Items were gathered bottom-up; flip them into root-first order.
        path.reverse()
        return path