# OneR classifier
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# The classic OneR (One Rule) classifier
# Authors: Sebastian Raschka
#
# License: BSD 3 clause

import warnings

import numpy as np
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError


class OneRClassifier(BaseEstimator, ClassifierMixin):
    """OneR (One Rule) Classifier.

    Parameters
    ----------
    resolve_ties : str (default: 'first')
        Option for how to resolve ties if two or more features
        have the same error. Options are
        - 'first' (default): chooses the first feature in the list, i.e.,
          the feature with the lower column index.
        - 'chi-squared': performs a chi-squared test for each feature
          against the target and selects the feature with the lowest
          p-value.

    Attributes
    ----------
    self.class_labels_ : array-like, shape = [n_labels]
        Array containing the unique class labels found in the
        training set.

    self.feature_idx_ : int
        The index of the rules' feature based on the column in the
        training set.

    self.p_value_ : float
        The p-value for a given feature. Only available after calling
        `fit` when the OneR attribute `resolve_ties = 'chi-squared'`
        is set.

    self.prediction_dict_ : dict
        Dictionary containing information about the feature's
        (self.feature_idx_) rules and total error. E.g.,
        `{'total error': 37, 'rules (value: class)': {0: 0, 1: 2}}`
        means the total error is 37, and the rules are
        "if feature value == 0, classify as 0" and
        "if feature value == 1, classify as 2".
        (And classify as class 1 otherwise.)

    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/classifier/OneRClassifier/

    """
    def __init__(self, resolve_ties='first'):
        allowed = {'first', 'chi-squared'}
        if resolve_ties not in allowed:
            raise ValueError('resolve_ties must be in %s. Got %s.'
                             % (allowed, resolve_ties))
        self.resolve_ties = resolve_ties

    def fit(self, X, y):
        """Learn rule from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object

        """
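        # OneR in a nutshell: for every feature, build one rule per
        # observed feature value (predict the most frequent class among
        # the rows with that value), add up the misclassifications these
        # rules cause per feature, and keep the single feature whose
        # rule set has the lowest total error.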
        # This check will only catch the most extreme cases,
        # but it is better than nothing
        for c in range(X.shape[1]):
            if np.unique(X[:, c]).shape[0] == X.shape[0]:
                warnings.warn('Feature array likely contains at least one'
                              ' non-categorical column.'
                              ' Column %d appears to have a unique value'
                              ' in every row.' % c)
                break

        self.class_labels_ = np.unique(y)
        n_class_labels = self.class_labels_.shape[0]

        def compute_class_counts(X, y, feature_index, feature_value):
            # class frequencies among the rows where the feature
            # takes on the given value
            mask = X[:, feature_index] == feature_value
            return np.bincount(y[mask], minlength=n_class_labels)

        prediction_dict = {}  # save feature_idx: feature_val, label, error

        # iterate over features
        for feature_index in np.arange(X.shape[1]):

            # iterate over each possible value per feature
            for feature_value in np.unique(X[:, feature_index]):

                class_counts = compute_class_counts(X, y,
                                                    feature_index,
                                                    feature_value)
                most_frequent_class = np.argmax(class_counts)

                # count all classes for that feature match
                # except the most frequent one
                inverse_index = np.ones(n_class_labels, dtype=bool)
                inverse_index[most_frequent_class] = False
                error = np.sum(class_counts[inverse_index])

                # compute the total error for each feature and
                # save all the corresponding rules for a given feature
                if feature_index not in prediction_dict:
                    prediction_dict[feature_index] = {
                        'total error': 0,
                        'rules (value: class)': {}
                    }
                prediction_dict[feature_index][
                    'rules (value: class)'][feature_value] = \
                    most_frequent_class
                prediction_dict[feature_index]['total error'] += error

        # get best feature (i.e., the feature with the lowest error)
        best_err = np.inf
        best_idx = [None]
        for i in prediction_dict:
            if prediction_dict[i]['total error'] < best_err:
                best_err = prediction_dict[i]['total error']
                best_idx[-1] = i
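        # Chi-squared tie-breaking (a short note on the branch below):
        # every feature tied at the lowest error gets a chi-squared test
        # of independence on its feature-vs-class contingency table, and
        # the feature with the lowest p-value, i.e., the strongest
        # association with the target, wins.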
        if self.resolve_ties == 'chi-squared':
            # collect duplicates
            for i in prediction_dict:
                if i == best_idx[-1]:
                    continue
                if prediction_dict[i]['total error'] == best_err:
                    best_idx.append(i)

            p_values = []
            for feature_idx in best_idx:
                rules = prediction_dict[feature_idx]['rules (value: class)']

                # the contingency table for a given feature
                # (e.g., petal_width for iris)
                # is organized as follows (without the sum columns):
                #
                #                            petal_width
                # species     (0.0976,0.791]  (0.791,1.63]  (1.63,2.5]  sum
                # setosa                  50             0           0   50
                # versicolor               0            48           2   50
                # virginica                0             4          46   50
                # sum                     50            52          48  150

                ary = np.zeros((n_class_labels, len(rules)))
                for idx, r in enumerate(rules):
                    ary[:, idx] = np.bincount(y[X[:, feature_idx] == r],
                                              minlength=n_class_labels)

                # returns "stat, p, dof, expected"
                _, p, _, _ = chi2_contingency(ary)
                p_values.append(p)

            # lowest p-value wins (cf. the `resolve_ties` docstring)
            best_p_idx = np.argmin(p_values)
            best_idx = best_idx[best_p_idx]
            self.p_value_ = p_values[best_p_idx]

        elif self.resolve_ties == 'first':
            best_idx = best_idx[0]

        self.feature_idx_ = best_idx
        self.prediction_dict_ = prediction_dict[best_idx]
        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        y_pred : array-like, shape = [n_samples]
            Predicted class labels.

        """
        if not hasattr(self, 'prediction_dict_'):
            raise NotFittedError('Estimator not fitted, '
                                 'call `fit` before using the model.')

        rules = self.prediction_dict_['rules (value: class)']
        y_pred = np.zeros(X.shape[0], dtype=int)

        # Set up a default label for those class labels in the
        # dataset for which no rule exists. We use the first
        # non-specified class label as the default class label.
        rule_labels = set()
        for feature_value in rules:
            class_label = rules[feature_value]
            rule_labels.add(class_label)

        other_label = set(self.class_labels_) - rule_labels
        if len(other_label):
            y_pred[:] = list(other_label)[0]
        # else just use "np.zeros"; we could also change this to
        # self.class_labels_[-1]+1 in the future

        # classify all class labels for which rules exist
        for feature_value in rules:
            mask = X[:, self.feature_idx_] == feature_value
            y_pred[mask] = rules[feature_value]

        return y_pred
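

# A minimal usage sketch (illustration only, not part of the library API):
# fit the classifier on a tiny, made-up categorical dataset and inspect
# the learned rule. `X_demo` and `y_demo` are hypothetical example names.
if __name__ == '__main__':
    X_demo = np.array([[0, 1],
                       [0, 0],
                       [1, 1],
                       [1, 0],
                       [2, 1],
                       [2, 0]])
    y_demo = np.array([0, 0, 1, 1, 2, 2])

    oner = OneRClassifier(resolve_ties='first')
    oner.fit(X_demo, y_demo)
    print(oner.feature_idx_)      # -> 0 (the column the rule is based on)
    print(oner.prediction_dict_)  # learned value-to-class rules and error
    print(oner.predict(X_demo))   # -> [0 0 1 1 2 2] on this toy data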