import collections

import numpy as np
import pandas as pd
from pandas import __version__ as pandas_version

try:
    from distutils.version import LooseVersion as Version
except ImportError:
    # distutils was removed from the standard library in Python 3.12.
    # Nothing in this module uses Version, so keep the name defined but
    # do not make the whole module unimportable.
    Version = None


def setup_fptree(df, min_support):
    """
    Builds the initial FP-tree from a one-hot encoded DataFrame.

    Parameters
    ----------
    df : pandas DataFrame
      One row per itemset (transaction) and one column per item. A
      non-zero / True cell means the item is part of the itemset.
      Both dense and sparse-backed DataFrames are handled.
    min_support : float
      Minimum fraction of rows an item must appear in to be kept.

    Returns
    -------
    tree : FPTree
      Tree holding every row, restricted to the retained items.
    rank : dict
      Maps the column position of each retained item to its rank;
      rank 0 is the least frequent retained item.
    """
    num_itemsets = len(df.index)  # number of itemsets in the database

    is_sparse = False
    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            itemsets = df.values
        else:
            itemsets = df.sparse.to_coo().tocsr()
            is_sparse = True
    else:
        # dense DataFrame
        itemsets = df.values

    # support of each individual item
    # if itemsets is sparse, np.sum returns an np.matrix of shape (1, N)
    item_support = np.array(np.sum(itemsets, axis=0) / float(num_itemsets))
    item_support = item_support.reshape(-1)

    items = np.nonzero(item_support >= min_support)[0]

    # Define ordering on items for inserting into FPTree
    indices = item_support[items].argsort()
    rank = {item: i for i, item in enumerate(items[indices])}

    if is_sparse:
        # Ensure that there are no explicitly stored zeros, otherwise the
        # CSR index slicing below would report absent items as present.
        itemsets.eliminate_zeros()

    # Building tree by inserting itemsets in sorted order
    # Heuristic for reducing tree size is inserting in order
    # of most frequent to least frequent
    tree = FPTree(rank)
    for i in range(num_itemsets):
        if is_sparse:
            # itemsets has been converted to CSR format to speed-up the line
            # below. It has 3 attributes:
            #  - itemsets.data contains non null values, shape(#nnz,)
            #  - itemsets.indices contains the column number of non null
            #    elements, shape(#nnz,)
            #  - itemsets.indptr[i] contains the offset in itemset.indices of
            #    the first non null element in row i, shape(1+#nrows,)
            nonnull = itemsets.indices[itemsets.indptr[i]:itemsets.indptr[i+1]]
        else:
            nonnull = np.where(itemsets[i, :])[0]
        itemset = [item for item in nonnull if item in rank]
        itemset.sort(key=rank.get, reverse=True)
        tree.insert_itemset(itemset)

    return tree, rank


def generate_itemsets(generator, num_itemsets, colname_map):
    """
    Collects mined itemsets into a result DataFrame.

    Parameters
    ----------
    generator : iterable of (support_count, items)
      Each element is an absolute support count and an iterable of items.
    num_itemsets : int
      Total number of rows in the database; used to turn the absolute
      count into a relative support.
    colname_map : dict or None
      Optional mapping from item to the column name reported in the
      result. If None the items are reported unchanged.

    Returns
    -------
    res_df : pandas DataFrame
      Columns 'support' (float) and 'itemsets' (frozenset).
    """
    itemsets = []
    supports = []
    for sup, iset in generator:
        itemsets.append(frozenset(iset))
        supports.append(sup / num_itemsets)

    res_df = pd.DataFrame({'support': supports, 'itemsets': itemsets})

    if colname_map is not None:
        res_df['itemsets'] = res_df['itemsets'] \
            .apply(lambda x: frozenset([colname_map[i] for i in x]))

    return res_df


def valid_input_check(df):
    """
    Validates that df is a usable one-hot encoded DataFrame.

    Parameters
    ----------
    df : pandas DataFrame
      Candidate input. Empty frames (df.size == 0) are accepted as is.

    Raises
    ------
    TypeError
      If df is a legacy SparseDataFrame.
    ValueError
      If a sparse-backed frame has non-string column names whose first
      label is not 0, or if any cell holds a value other than
      True, False, 0 or 1.
    """
    # Match on the class name so the check works without importing a class
    # that no longer exists in current pandas releases.
    if type(df).__name__ == "SparseDataFrame":
        msg = ("SparseDataFrame support has been deprecated in pandas 1.0,"
               " and is no longer supported in mlxtend. "
               " Please"
               " see the pandas migration guide at"
               " https://pandas.pydata.org/pandas-docs/"
               "stable/user_guide/sparse.html#sparse-data-structures"
               " for supporting sparse data in DataFrames.")
        raise TypeError(msg)

    if df.size == 0:
        return
    if hasattr(df, "sparse"):
        if not isinstance(df.columns[0], str) and df.columns[0] != 0:
            raise ValueError('Due to current limitations in Pandas, '
                             'if the sparse format has integer column '
                             'names, please make sure they either start '
                             'with `0` or cast them as string column names: '
                             '`df.columns = [str(i) for i in df.columns]`.')

    # Fast path: if all columns are boolean, there is nothing to check
    all_bools = df.dtypes.apply(pd.api.types.is_bool_dtype).all()
    if not all_bools:
        # Pandas is much slower than numpy, so use np.where on Numpy arrays
        if hasattr(df, "sparse"):
            if df.size == 0:
                values = df.values
            else:
                values = df.sparse.to_coo().tocoo().data
        else:
            values = df.values
        idxs = np.where((values != 1) & (values != 0))
        if len(idxs[0]) > 0:
            # idxs has 1 dimension with sparse data and 2 with dense data
            val = values[tuple(loc[0] for loc in idxs)]
            s = ('The allowed values for a DataFrame'
                 ' are True, False, 0, 1. Found value %s' % (val))
            raise ValueError(s)


class FPTree(object):
    """
    Prefix tree of itemsets used by the FP-growth style miners.

    Attributes
    ----------
    root : FPNode
      Item-less root; its count is the total count of inserted itemsets.
    nodes : defaultdict(list)
      Maps each item to every FPNode in the tree that carries it.
    cond_items : list
      Items this tree has been conditioned on, in conditioning order.
    rank : dict or None
      Item ordering that was used to sort itemsets before insertion.
    """

    def __init__(self, rank=None):
        self.root = FPNode(None)
        self.nodes = collections.defaultdict(list)
        self.cond_items = []
        self.rank = rank

    def conditional_tree(self, cond_item, minsup):
        """
        Creates and returns the subtree of self conditioned on cond_item.

        Parameters
        ----------
        cond_item : int | str
          Item that the tree (self) will be conditioned on.
        minsup : int
          Minimum support threshold.

        Returns
        -------
        cond_tree : FPtree
        """
        # Use .get so that asking for an unknown item does not leave an
        # empty list behind in the defaultdict (which would break is_path).
        cond_nodes = self.nodes.get(cond_item, [])

        # Find all path from root node to nodes for item
        branches = []
        count = collections.defaultdict(int)
        for node in cond_nodes:
            branch = node.itempath_from_root()
            branches.append(branch)
            for item in branch:
                count[item] += node.count

        # Define new ordering or deep trees may have combinatorially explosion
        items = [item for item in count if count[item] >= minsup]
        items.sort(key=count.get)
        rank = {item: i for i, item in enumerate(items)}

        # Create conditional tree
        cond_tree = FPTree(rank)
        for node, branch in zip(cond_nodes, branches):
            branch = sorted([i for i in branch if i in rank],
                            key=rank.get, reverse=True)
            cond_tree.insert_itemset(branch, node.count)
        cond_tree.cond_items = self.cond_items + [cond_item]

        return cond_tree

    def insert_itemset(self, itemset, count=1):
        """
        Inserts a list of items into the tree.

        Parameters
        ----------
        itemset : list
          Items that will be inserted into the tree.
        count : int
          The number of occurrences of the itemset.
        """
        self.root.count += count

        if len(itemset) == 0:
            return

        # Follow existing path in tree as long as possible
        index = 0
        node = self.root
        for item in itemset:
            if item in node.children:
                child = node.children[item]
                child.count += count
                node = child
                index += 1
            else:
                break

        # Insert any remaining items
        for item in itemset[index:]:
            child_node = FPNode(item, count, node)
            self.nodes[item].append(child_node)
            node = child_node

    def is_path(self):
        """
        Returns True if the tree consists of a single, unbranched path.
        """
        if len(self.root.children) > 1:
            return False
        for item_nodes in self.nodes.values():
            if not item_nodes:
                # An item without any node cannot introduce a branch.
                continue
            if len(item_nodes) > 1 or len(item_nodes[0].children) > 1:
                return False
        return True

    def print_status(self, count, colnames):
        """
        Prints how many itemsets were produced from this conditional tree.

        Parameters
        ----------
        count : int
          Number of itemsets to report.
        colnames : mapping or sequence, or a falsy value
          If truthy, used to translate the conditioning items into the
          names that are printed.
        """
        cond_items = [str(i) for i in self.cond_items]
        if colnames:
            cond_items = [str(colnames[i]) for i in self.cond_items]
        cond_items = ", ".join(cond_items)
        print('\r%d itemset(s) from tree conditioned on items (%s)' %
              (count, cond_items), end="\n")


class FPNode(object):
    """
    Single node of an FPTree.

    Attributes
    ----------
    item : hashable or None
      Item carried by the node; None marks the root.
    count : int
      Accumulated count of the itemsets whose path goes through the node.
    parent : FPNode or None
      Parent node; None for the root.
    children : defaultdict
      Maps an item to the child node carrying it.
    """

    def __init__(self, item, count=0, parent=None):
        self.item = item
        self.count = count
        self.parent = parent
        # Within this module children are only read after an `in` check,
        # so the default factory is never triggered.
        self.children = collections.defaultdict(FPNode)

        if parent is not None:
            parent.children[item] = self

    def itempath_from_root(self):
        """
        Returns the top-down sequence of items from self to
        (but not including) the root node.
        """
        path = []
        if self.item is None:
            return path

        # Walk upwards collecting ancestors, then flip to top-down order.
        node = self.parent
        while node.item is not None:
            path.append(node.item)
            node = node.parent

        path.reverse()
        return path