# OneR classifier
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# The classic OneR (One Rule) classifier
# Authors: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import warnings

import numpy as np
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError

class OneRClassifier(BaseEstimator, ClassifierMixin):

    """OneR (One Rule) Classifier.

    Parameters
    ----------
    resolve_ties : str (default: 'first')
        Option for how to resolve ties if two or more features
        have the same error. Options are

        - 'first' (default): chooses the first feature in the list, i.e.,
          the feature with the lower column index.
        - 'chi-squared': performs a chi-squared test for each feature
          against the target and selects the feature with the lowest p-value.

    Attributes
    ----------
    self.class_labels_ : array-like, shape = [n_labels]
        Array containing the unique class labels found in the
        training set.

    self.feature_idx_ : int
        The index of the rules' feature based on the column in
        the training set.

    self.p_value_ : float
        The p-value for the selected feature. Only available after calling
        `fit` when the OneR attribute `resolve_ties = 'chi-squared'` is set.

    self.prediction_dict_ : dict
        Dictionary containing information about the
        feature's (self.feature_idx_)
        rules and total error. E.g.,
        `{'total error': 37, 'rules (value: class)': {0: 0, 1: 2}}`
        means the total error is 37, and the rules are
        "if feature value == 0, classify as 0"
        and "if feature value == 1, classify as 2".
        (And classify as class 1 otherwise.)

    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/classifier/OneRClassifier/
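
    Examples
    --------
    A minimal, illustrative sketch (it assumes integer-encoded categorical
    features and that the package exposes this class via
    `mlxtend.classifier`; see the user guide linked above for complete
    examples):

    >>> import numpy as np
    >>> from mlxtend.classifier import OneRClassifier
    >>> X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    >>> y = np.array([0, 0, 1, 1])
    >>> oner = OneRClassifier()
    >>> oner.fit(X, y).feature_idx_
    0
    >>> oner.predict(X)
    array([0, 0, 1, 1])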
"""
|
||
|
|
||
|
    def __init__(self, resolve_ties='first'):
        allowed = {'first', 'chi-squared'}
        if resolve_ties not in allowed:
            raise ValueError('resolve_ties must be in %s. Got %s.'
                             % (allowed, resolve_ties))
        self.resolve_ties = resolve_ties

    def fit(self, X, y):
        """Learn rule from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object

        """
        # This check will only catch the most extreme cases,
        # but it is better than nothing.
        for c in range(X.shape[1]):
            if np.unique(X[:, c]).shape[0] == X.shape[0]:
                warnings.warn('Feature array likely contains at least one'
                              ' non-categorical column.'
                              ' Column %d appears to have a unique value'
                              ' in every row.' % c)
                break

        self.class_labels_ = np.unique(y)
        n_class_labels = self.class_labels_.shape[0]

        def compute_class_counts(X, y, feature_index, feature_value):
            # Count how often each class label occurs among the rows
            # where the feature takes the given value.
            mask = X[:, feature_index] == feature_value
            return np.bincount(y[mask], minlength=n_class_labels)
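        # For example (illustrative values only): with three classes and
        # y[mask] == [0, 0, 2], the helper returns array([2, 0, 1]).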

        prediction_dict = {}  # save feature_idx: feature_val, label, error

        # iterate over features
        for feature_index in np.arange(X.shape[1]):

            # iterate over each possible value per feature
            for feature_value in np.unique(X[:, feature_index]):

                class_counts = compute_class_counts(X, y,
                                                    feature_index,
                                                    feature_value)
                most_frequent_class = np.argmax(class_counts)

                # count all class occurrences for this feature value
                # except those of the most frequent class
                inverse_index = np.ones(n_class_labels, dtype=bool)
                inverse_index[most_frequent_class] = False

                error = np.sum(class_counts[inverse_index])

                # compute the total error for each feature and
                # save all the corresponding rules for a given feature
                if feature_index not in prediction_dict:
                    prediction_dict[feature_index] = {
                        'total error': 0, 'rules (value: class)': {}
                    }
                prediction_dict[feature_index][
                    'rules (value: class)'][
                        feature_value] = most_frequent_class
                prediction_dict[feature_index]['total error'] += error
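
        # At this point, prediction_dict maps each feature index to its rules
        # and total error; e.g. (illustrative), prediction_dict[0] could be
        # {'total error': 0, 'rules (value: class)': {0: 0, 1: 1}}.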
        # get the best feature (i.e., the feature with the lowest error)
        best_err = np.inf
        best_idx = [None]
        for i in prediction_dict:
            if prediction_dict[i]['total error'] < best_err:
                best_err = prediction_dict[i]['total error']
                best_idx[-1] = i
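
        # best_idx holds a single candidate so far; under
        # resolve_ties='chi-squared' we additionally collect every feature
        # tied at best_err and break the tie by p-value below.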
        if self.resolve_ties == 'chi-squared':

            # collect duplicates
            for i in prediction_dict:
                if i == best_idx[-1]:
                    continue
                if prediction_dict[i]['total error'] == best_err:
                    best_idx.append(i)

            p_values = []
            for feature_idx in best_idx:

                rules = prediction_dict[feature_idx][
                    'rules (value: class)']

                # The contingency table for a given feature
                # (e.g., petal_width for iris)
                # is organized as follows (without the sum columns):
                #
                #                            petal_width
                # species     (0.0976,0.791] (0.791,1.63] (1.63,2.5]  sum
                # setosa                  50            0          0   50
                # versicolor               0           48          2   50
                # virginica                0            4         46   50
                # sum                     50           52         48  150

                ary = np.zeros((n_class_labels, len(rules)))

                for idx, r in enumerate(rules):
                    ary[:, idx] = np.bincount(y[X[:, feature_idx] == r],
                                              minlength=n_class_labels)

                # chi2_contingency returns "stat, p, dof, expected"
                _, p, _, _ = chi2_contingency(ary)
                p_values.append(p)

            # select the feature with the lowest p-value,
            # as documented in the class docstring
            best_p_idx = np.argmin(p_values)
            best_idx = best_idx[best_p_idx]
            self.p_value_ = p_values[best_p_idx]
        elif self.resolve_ties == 'first':
            best_idx = best_idx[0]

        self.feature_idx_ = best_idx
        self.prediction_dict_ = prediction_dict[best_idx]
        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        y_pred : array-like, shape = [n_samples]
            Predicted class labels.

        """
        if not hasattr(self, 'prediction_dict_'):
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before using the model.")

        rules = self.prediction_dict_['rules (value: class)']

        y_pred = np.zeros(X.shape[0], dtype=int)

        # Set up a default label for those samples whose feature value has
        # no rule. We use the first class label that does not appear in the
        # rules as the default class label.
        rule_labels = set(rules.values())
        other_label = set(self.class_labels_) - rule_labels
        if len(other_label):
            y_pred[:] = list(other_label)[0]
        # else just keep the zeros from "np.zeros"; we could also change
        # this to self.class_labels_[-1]+1 in the future

        # classify all samples whose feature value has a rule
        for feature_value in rules:
            mask = X[:, self.feature_idx_] == feature_value
            y_pred[mask] = rules[feature_value]

        return y_pred
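

# A minimal smoke-test sketch (illustrative): the toy data below is made up,
# and the 'chi-squared' option is exercised only to show the API. Features 0
# and 1 are duplicates, so they tie at the lowest total error and the tie is
# broken via the chi-squared test.
if __name__ == '__main__':
    X_demo = np.array([[0, 0, 1],
                       [0, 0, 0],
                       [1, 1, 1],
                       [1, 1, 0]])
    y_demo = np.array([0, 0, 1, 1])

    model = OneRClassifier(resolve_ties='chi-squared')
    model.fit(X_demo, y_demo)
    print('best feature index:', model.feature_idx_)
    print('rules:', model.prediction_dict_)
    print('p-value:', model.p_value_)
    print('predictions:', model.predict(X_demo))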