# naive_bayes.py — Naive Bayes classifier for the fraudulent-job-postings dataset
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import os
from collections import Counter
from ast import literal_eval
# TODO: build word maps for the fraudulent=0 and fraudulent=1 subsets
from prepare_data import read_data
class NaiveBayes:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    Expects pandas Series inputs: ``train_x`` holds token lists (one list of
    words per document) and ``train_y`` holds the matching class labels.
    """

    def __init__(self, train_x, train_y, labels):
        self.train_x = train_x          # pandas Series of token lists
        self.train_y = train_y          # pandas Series of labels, index-aligned with train_x
        self.labels = labels            # list of class labels, e.g. [0, 1]
        self.counts = {}                # label -> number of training documents
        self.prior_prob = {}            # label -> prior probability P(label)
        self.word_counts = {}           # label -> (total_tokens, vocab_size, Counter of words)

    def count_words(self):
        """Collect per-label word statistics: token total, vocabulary size, frequencies."""
        for label in self.labels:
            indexes = self.train_y.index[self.train_y == label].tolist()
            data = self.train_x[self.train_x.index.isin(indexes)]
            vocabulary = []
            for tokens in data:
                vocabulary += tokens
            self.word_counts[label] = (len(vocabulary), len(set(vocabulary)), Counter(vocabulary))

    def fit(self):
        """Estimate class priors and per-label word statistics from the training data."""
        self.counts = {l: self.train_y[self.train_y == l].shape[0] for l in self.labels}
        self.prior_prob = {l: float(self.counts[l]) / float(self.train_y.shape[0]) for l in self.labels}
        self.count_words()

    def get_posteriori(self, text):
        """Return log-posterior scores for ``text`` (a token list), ordered like self.labels.

        Fixes vs. the original implementation:
        - scores are no longer reset inside the label loop (previously only the
          last label's score survived);
        - the log prior is ADDED to the log likelihood — log-space probabilities
          combine by addition, not multiplication;
        - returns a list rather than a dict_values view, so ``np.argmax`` sees a
          1-D sequence (a dict_values view coerces to a 0-d object array, making
          argmax always return 0).
        """
        scores = []
        for label in self.labels:
            total_tokens, vocab_size, freqs = self.word_counts[label]
            # Laplace smoothing: (count + 1) / (total tokens + vocabulary size)
            log_likelihood = sum(
                np.log(float(freqs.get(word, 0) + 1) / (total_tokens + vocab_size))
                for word in text
            )
            scores.append(log_likelihood + np.log(self.prior_prob[label]))
        return scores

    def predict(self, test_x):
        """Predict the most probable label for each token list in ``test_x``."""
        # Map the winning index back through self.labels (identity for labels [0, 1]).
        return [self.labels[int(np.argmax(self.get_posteriori(row)))] for row in test_x]
def main():
    """Train the Naive Bayes classifier on the cleaned dataset and print test accuracy."""
    csv_path = os.path.join(os.path.abspath("./data"), "clean-data.csv")
    dataset = read_data(csv_path)
    # Token lists were serialized to strings on disk; parse them back into Python lists.
    dataset['tokens'] = dataset['tokens'].apply(literal_eval)
    features = dataset['tokens']
    targets = dataset['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=123, stratify=targets
    )
    classifier = NaiveBayes(x_train, y_train, [0, 1])
    classifier.fit()
    print(accuracy_score(y_test, classifier.predict(x_test)))


if __name__ == "__main__":
    main()