198 lines
7.1 KiB
Python
198 lines
7.1 KiB
Python
# Sebastian Raschka 2014-2020
|
|
# mlxtend Machine Learning Library Extensions
|
|
#
|
|
# Function for generating association rules
|
|
#
|
|
# Author: Joshua Goerner <https://github.com/JoshuaGoerner>
|
|
# Sebastian Raschka <sebastianraschka.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
from itertools import combinations
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
|
|
def association_rules(df, metric="confidence",
                      min_threshold=0.8, support_only=False):
    """Generates a DataFrame of association rules including the
    metrics 'support', 'confidence', 'lift', 'leverage', and 'conviction'

    Parameters
    -----------
    df : pandas DataFrame
      pandas DataFrame of frequent itemsets
      with columns ['support', 'itemsets']

    metric : string (default: 'confidence')
      Metric to evaluate if a rule is of interest.
      **Automatically set to 'support' if `support_only=True`.**
      Otherwise, supported metrics are 'support', 'confidence', 'lift',
      'leverage', and 'conviction'
      These metrics are computed as follows:

      - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]\n
      - confidence(A->C) = support(A+C) / support(A), range: [0, 1]\n
      - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]\n
      - leverage(A->C) = support(A->C) - support(A)*support(C),
        range: [-1, 1]\n
      - conviction = [1 - support(C)] / [1 - confidence(A->C)],
        range: [0, inf]\n

    min_threshold : float (default: 0.8)
      Minimal threshold for the evaluation metric,
      via the `metric` parameter,
      to decide whether a candidate rule is of interest.

    support_only : bool (default: False)
      Only computes the rule support and fills the other
      metric columns with NaNs. This is useful if:

      a) the input DataFrame is incomplete, e.g., does
      not contain support values for all rule antecedents
      and consequents

      b) you simply want to speed up the computation because
      you don't need the other metrics.

    Returns
    ----------
    pandas DataFrame with columns "antecedents" and "consequents"
      that store itemsets, plus the scoring metric columns:
      "antecedent support", "consequent support",
      "support", "confidence", "lift",
      "leverage", "conviction"
      of all rules for which
      metric(rule) >= min_threshold.
      Each entry in the "antecedents" and "consequents" columns are
      of type `frozenset`, which is a Python built-in type that
      behaves similarly to sets except that it is immutable
      (For more info, see
      https://docs.python.org/3.6/library/stdtypes.html#frozenset).
      If no rule passes the threshold, an empty DataFrame with the
      same columns is returned.

    Raises
    ----------
    ValueError
      If `df` lacks the 'support' or 'itemsets' column, if `metric`
      is not a known metric (and `support_only=False`), or if `df`
      contains no rows.
    KeyError
      If `support_only=False` and the support of a rule's antecedent
      or consequent is not present in `df`.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

    """

    # check for mandatory columns
    if not all(col in df.columns for col in ["support", "itemsets"]):
        # adjacent literals instead of a backslash continuation, so that
        # source indentation does not leak into the message
        raise ValueError("Dataframe needs to contain the "
                         "columns 'support' and 'itemsets'")

    def conviction_helper(sAC, sA, sC):
        # Works for scalars (threshold check inside the loop) and for
        # arrays (final metric columns); always returns a >= 1-d array.
        confidence = np.atleast_1d(np.asarray(sAC, dtype=float) / sA)
        sC = np.atleast_1d(np.asarray(sC, dtype=float))
        # conviction is defined as +inf wherever confidence is not < 1,
        # which also avoids the division by zero at confidence == 1
        conviction = np.full(confidence.shape, np.inf)
        below_one = confidence < 1.
        conviction[below_one] = ((1. - sC[below_one]) /
                                 (1. - confidence[below_one]))
        return conviction

    # metrics for association rules; every callable takes (sAC, sA, sC)
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC/sA,
        "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC)/sC,
        "leverage": lambda sAC, sA, sC: metric_dict["support"](
            sAC, sA, sC) - sA*sC,
        "conviction": lambda sAC, sA, sC: conviction_helper(sAC, sA, sC)
    }

    columns_ordered = ["antecedent support", "consequent support",
                       "support",
                       "confidence", "lift",
                       "leverage", "conviction"]

    # check for metric compliance
    if support_only:
        metric = 'support'
    elif metric not in metric_dict:
        supported = ", ".join("'{}'".format(m) for m in columns_ordered)
        raise ValueError("Metric must be one of {}, got '{}'"
                         .format(supported, metric))

    # an empty frame cannot yield any rule; fail with a clear message
    if not len(df):
        raise ValueError("The input DataFrame `df` containing "
                         "the frequent itemsets is empty.")

    # get dict of {frequent itemset} -> support
    keys = df['itemsets'].values
    values = df['support'].values
    frequent_items_dict = {frozenset(items): support
                           for items, support in zip(keys, values)}

    # prepare buckets to collect frequent rules
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    # the metric used for thresholding is fixed for the whole run
    score_fn = metric_dict[metric]

    # iterate over all frequent itemsets
    for itemset, sAC in frequent_items_dict.items():
        # to find all possible combinations
        for idx in range(len(itemset) - 1, 0, -1):
            # of antecedent and consequent
            for c in combinations(itemset, r=idx):
                antecedent = frozenset(c)
                consequent = itemset.difference(antecedent)

                if support_only:
                    # support doesn't need these,
                    # hence, placeholders should suffice
                    sA = None
                    sC = None
                else:
                    try:
                        sA = frequent_items_dict[antecedent]
                        sC = frequent_items_dict[consequent]
                    except KeyError as e:
                        s = (str(e) + ' You are likely getting this error'
                             ' because the DataFrame is missing'
                             ' antecedent and/or consequent'
                             ' information.'
                             ' You can try using the'
                             ' `support_only=True` option')
                        raise KeyError(s) from e

                # check for the threshold
                score = score_fn(sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # check if frequent rule was generated
    if not rule_supports:
        return pd.DataFrame(
            columns=["antecedents", "consequents"] + columns_ordered)

    # generate metrics; rows become (sAC, sA, sC), None placeholders -> NaN
    rule_supports = np.array(rule_supports).T.astype(float)
    df_res = pd.DataFrame(
        data=list(zip(rule_antecedents, rule_consequents)),
        columns=["antecedents", "consequents"])

    sAC = rule_supports[0]
    if support_only:
        for m in columns_ordered:
            df_res[m] = np.nan
        df_res['support'] = sAC
    else:
        sA = rule_supports[1]
        sC = rule_supports[2]
        for m in columns_ordered:
            df_res[m] = metric_dict[m](sAC, sA, sC)

    return df_res
|