# OneR classifier
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# The classic OneR (One Rule) classifier
# Authors: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import warnings

import numpy as np
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError


class OneRClassifier(BaseEstimator, ClassifierMixin):
"""OneR (One Rule) Classifier.
Parameters
----------
resolve_ties : str (default: 'first')
Option for how to resolve ties if two or more features
have the same error. Options are
- 'first' (default): chooses first feature in the list, i.e.,
feature with the lower column index.
- 'chi-squared': performs a chi-squared test for each feature
against the target and selects the feature with the lowest p-value.
Attributes
----------
self.classes_labels_ : array-like, shape = [n_labels]
Array containing the unique class labels found in the
training set.
self.feature_idx_ : int
The index of the rules' feature based on the column in
the training set.
self.p_value_ : float
The p value for a given feature. Only available after calling `fit`
when the OneR attribute `resolve_ties = 'chi-squared'` is set.
self.prediction_dict_ : dict
Dictionary containing information about the
feature's (self.feature_idx_)
rules and total error. E.g.,
`{'total error': 37, 'rules (value: class)': {0: 0, 1: 2}}`
means the total error is 37, and the rules are
"if feature value == 0 classify as 0"
and "if feature value == 1 classify as 2".
(And classify as class 1 otherwise.)
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/classifier/OneRClassifier/
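
    Examples
    --------
    A minimal usage sketch (not part of the original docstring; it
    assumes integer-coded categorical features and class labels
    `0..n-1`, which `fit` requires for its bincount-based counting):

    >>> import numpy as np
    >>> X = np.array([[0], [0], [1], [1]])
    >>> y = np.array([0, 0, 1, 1])
    >>> model = OneRClassifier().fit(X, y)
    >>> model.predict(X)
    array([0, 0, 1, 1])
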
"""
def __init__(self, resolve_ties='first'):
allowed = {'first', 'chi-squared'}
if resolve_ties not in allowed:
raise ValueError('resolve_ties must be in %s. Got %s.'
% (allowed, resolve_ties))
self.resolve_ties = resolve_ties
def fit(self, X, y):
"""Learn rule from training data.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : object
"""
# This check will only catch the most extreme cases
# but better than nothing
for c in range(X.shape[1]):
if np.unique(X[:, c]).shape[0] == X.shape[0]:
warnings.warn('Feature array likely contains at least one'
' non-categorical column.'
' Column %d appears to have a unique value'
' in every row.' % c)
break
        self.class_labels_ = np.unique(y)
        n_class_labels = self.class_labels_.shape[0]

        def compute_class_counts(X, y, feature_index, feature_value):
            mask = X[:, feature_index] == feature_value
            return np.bincount(y[mask], minlength=n_class_labels)
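
        # e.g., if the rows matching a given feature value have labels
        # [0, 0, 2] and n_class_labels is 3, the helper above returns
        # array([2, 0, 1])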
        prediction_dict = {}  # save feature_idx: feature_val, label, error

        # iterate over features
        for feature_index in np.arange(X.shape[1]):

            # iterate over each possible value per feature
            for feature_value in np.unique(X[:, feature_index]):
                class_counts = compute_class_counts(X, y,
                                                    feature_index,
                                                    feature_value)
                most_frequent_class = np.argmax(class_counts)

                # count all classes for that feature match
                # except the most frequent one
                inverse_index = np.ones(n_class_labels, dtype=bool)
                inverse_index[most_frequent_class] = False
                error = np.sum(class_counts[inverse_index])
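
                # e.g., class_counts = [50, 4, 2] with
                # most_frequent_class = 0 gives error = 4 + 2 = 6
                # misclassified samples for this feature value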
                # compute the total error for each feature and
                # save all the corresponding rules for a given feature
                if feature_index not in prediction_dict:
                    prediction_dict[feature_index] = {
                        'total error': 0, 'rules (value: class)': {}
                    }
                prediction_dict[feature_index][
                    'rules (value: class)'][
                        feature_value] = most_frequent_class
                prediction_dict[feature_index]['total error'] += error

        # get best feature (i.e., the feature with the lowest error)
        best_err = np.inf
        best_idx = [None]
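        # note: best_idx is a single-element list so that, under
        # 'chi-squared' tie resolution below, tied features can be
        # appended to it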
        for i in prediction_dict:
            if prediction_dict[i]['total error'] < best_err:
                best_err = prediction_dict[i]['total error']
                best_idx[-1] = i

        if self.resolve_ties == 'chi-squared':
            # collect duplicates
            for i in prediction_dict:
                if i == best_idx[-1]:
                    continue
                if prediction_dict[i]['total error'] == best_err:
                    best_idx.append(i)

            p_values = []
            for feature_idx in best_idx:
                rules = prediction_dict[feature_idx][
                    'rules (value: class)']

                # contingency table for a given feature
                # (e.g., petal_width for iris)
                # is organized as follows (without the sum columns):
                #
                #             petal_width
                # species     (0.0976,0.791]  (0.791,1.63]  (1.63,2.5]  sum
                # setosa                  50             0           0   50
                # versicolor               0            48           2   50
                # virginica                0             4          46   50
                # sum                     50            52          48  150
                ary = np.zeros((n_class_labels, len(rules)))
                for idx, r in enumerate(rules):
                    ary[:, idx] = np.bincount(y[X[:, feature_idx] == r],
                                              minlength=n_class_labels)

                # returns "stat, p, dof, expected"
                _, p, _, _ = chi2_contingency(ary)
                p_values.append(p)
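
            # the lowest p-value corresponds to the strongest
            # feature/target association, matching the behavior
            # documented for resolve_ties='chi-squared' above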
            best_p_idx = np.argmin(p_values)
            best_idx = best_idx[best_p_idx]
            self.p_value_ = p_values[best_p_idx]

        elif self.resolve_ties == 'first':
            best_idx = best_idx[0]

        self.feature_idx_ = best_idx
        self.prediction_dict_ = prediction_dict[best_idx]
        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        y_pred : array-like, shape = [n_samples]
            Predicted class labels.

        """
        if not hasattr(self, 'prediction_dict_'):
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before using the model.")

        rules = self.prediction_dict_['rules (value: class)']
        y_pred = np.zeros(X.shape[0], dtype=int)

        # Set up labels for those class labels in the
        # dataset for which no rule exists. We use the
        # first non-specified class label as the default class label.
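        # e.g., with rules {0: 0, 1: 2} and class_labels_ = [0, 1, 2]
        # (the docstring example above), the default label is 1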
        rule_labels = set()
        for feature_value in rules:
            class_label = rules[feature_value]
            rule_labels.add(class_label)

        other_label = set(self.class_labels_) - rule_labels
        if len(other_label):
            y_pred[:] = list(other_label)[0]
        # else just use "np.zeros"; we could also change this to
        # self.class_labels_[-1]+1 in future

        # classify all class labels for which rules exist
        for feature_value in rules:
            mask = X[:, self.feature_idx_] == feature_value
            y_pred[mask] = rules[feature_value]
        return y_pred