projektAI/venv/Lib/site-packages/mlxtend/frequent_patterns/association_rules.py

198 lines
7.1 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Function for generating association rules
#
# Author: Joshua Goerner <https://github.com/JoshuaGoerner>
# Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
from itertools import combinations
import numpy as np
import pandas as pd
def association_rules(df, metric="confidence",
                      min_threshold=0.8, support_only=False):
    """Generate a DataFrame of association rules from frequent itemsets.

    Every frequent itemset with at least two items is split into all
    possible (antecedent, consequent) pairs; a pair is kept as a rule if
    the chosen `metric` evaluates to at least `min_threshold`.

    Parameters
    -----------
    df : pandas DataFrame
      pandas DataFrame of frequent itemsets
      with columns ['support', 'itemsets']

    metric : string (default: 'confidence')
      Metric to evaluate if a rule is of interest.
      **Automatically set to 'support' if `support_only=True`.**
      Otherwise, supported metrics are 'support', 'confidence', 'lift',
      'leverage', and 'conviction'
      (the columns 'antecedent support' and 'consequent support' are
      accepted as well).
      These metrics are computed as follows:

      - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]

      - confidence(A->C) = support(A+C) / support(A), range: [0, 1]

      - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]

      - leverage(A->C) = support(A->C) - support(A)*support(C),
        range: [-1, 1]

      - conviction = [1 - support(C)] / [1 - confidence(A->C)],
        range: [0, inf]

    min_threshold : float (default: 0.8)
      Minimal threshold for the evaluation metric,
      via the `metric` parameter,
      to decide whether a candidate rule is of interest.

    support_only : bool (default: False)
      Only computes the rule support and fills the other
      metric columns with NaNs. This is useful if:

      a) the input DataFrame is incomplete, e.g., does
      not contain support values for all rule antecedents
      and consequents

      b) you simply want to speed up the computation because
      you don't need the other metrics.

    Returns
    ----------
    pandas DataFrame with columns "antecedents" and "consequents"
      that store itemsets, plus the scoring metric columns:
      "antecedent support", "consequent support",
      "support", "confidence", "lift",
      "leverage", "conviction"
      of all rules for which
      metric(rule) >= min_threshold.
      Each entry in the "antecedents" and "consequents" columns are
      of type `frozenset`, which is a Python built-in type that
      behaves similarly to sets except that it is immutable
      (For more info, see
      https://docs.python.org/3.6/library/stdtypes.html#frozenset).
      If no rule passes the threshold, an empty DataFrame with the
      same columns is returned.

    Raises
    ----------
    ValueError
      If `df` lacks the 'support' or 'itemsets' column, if `df` has no
      rows, or if `metric` is not a supported metric name
      (the latter is only checked when `support_only=False`).
    KeyError
      If `support_only=False` and the support of an antecedent or
      consequent cannot be found among the itemsets in `df`.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

    """
    # check for mandatory columns
    if not all(col in df.columns for col in ["support", "itemsets"]):
        raise ValueError("Dataframe needs to contain the "
                         "columns 'support' and 'itemsets'")

    # fail early with a readable message instead of an obscure
    # numpy error further down
    if not df.shape[0]:
        raise ValueError("The input DataFrame `df` containing "
                         "the frequent itemsets is empty.")

    def conviction_helper(sAC, sA, sC):
        # promote scalars to 1-d arrays so that boolean-mask
        # assignment works for both the per-rule threshold check
        # (scalars) and the final vectorized computation (arrays)
        confidence = np.atleast_1d(sAC / sA)
        sC = np.atleast_1d(sC)
        # confidence == 1 would divide by zero; conviction is
        # defined as infinity in that case
        conviction = np.full(confidence.shape, np.inf)
        below_one = confidence < 1.
        conviction[below_one] = ((1. - sC[below_one]) /
                                 (1. - confidence[below_one]))
        return conviction

    # metrics for association rules; every entry takes
    # (support(A+C), support(A), support(C))
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC / sA,
        "lift": lambda sAC, sA, sC: (sAC / sA) / sC,
        "leverage": lambda sAC, sA, sC: sAC - sA * sC,
        "conviction": conviction_helper,
    }

    columns_ordered = ["antecedent support", "consequent support",
                       "support",
                       "confidence", "lift",
                       "leverage", "conviction"]

    # check for metric compliance
    if support_only:
        metric = 'support'
    elif metric not in metric_dict:
        raise ValueError("Metric must be one of {}, got '{}'"
                         .format(", ".join("'{}'".format(name)
                                           for name in columns_ordered),
                                 metric))

    # get dict of {frequent itemset} -> support
    frequent_items_dict = {
        frozenset(itemset): support
        for itemset, support in zip(df['itemsets'].values,
                                    df['support'].values)
    }

    # prepare buckets to collect frequent rules
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    # iterate over all frequent itemsets
    for itemset, sAC in frequent_items_dict.items():
        # to find all possible combinations
        # of antecedent and consequent
        for size in range(len(itemset) - 1, 0, -1):
            for combo in combinations(itemset, r=size):
                antecedent = frozenset(combo)
                consequent = itemset.difference(antecedent)

                if support_only:
                    # support doesn't need these,
                    # hence, placeholders should suffice
                    sA = None
                    sC = None
                else:
                    try:
                        sA = frequent_items_dict[antecedent]
                        sC = frequent_items_dict[consequent]
                    except KeyError as e:
                        s = (str(e) + '. You are likely getting this error'
                             ' because the DataFrame is missing'
                             ' antecedent and/or consequent'
                             ' information.'
                             ' You can try using the'
                             ' `support_only=True` option')
                        raise KeyError(s) from e

                # check for the threshold
                score = metric_dict[metric](sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # check if frequent rule was generated
    if not rule_supports:
        return pd.DataFrame(
            columns=["antecedents", "consequents"] + columns_ordered)

    # generate metrics; the `None` placeholders used with
    # `support_only=True` become NaN in the float conversion
    rule_supports = np.array(rule_supports).T.astype(float)
    df_res = pd.DataFrame(
        data=list(zip(rule_antecedents, rule_consequents)),
        columns=["antecedents", "consequents"])

    sAC = rule_supports[0]
    if support_only:
        for m in columns_ordered:
            df_res[m] = np.nan
        df_res['support'] = sAC
    else:
        sA = rule_supports[1]
        sC = rule_supports[2]
        for m in columns_ordered:
            df_res[m] = metric_dict[m](sAC, sA, sC)

    return df_res