mpsic_projekt_1_bayes_class.../naive_bayes.py

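"""Naive Bayes classifier for the `fraudulent` label in clean-data.csv.

A minimal sketch of intent, inferred from the code below: train a simple
multinomial Naive Bayes model on tokenised text and report accuracy on a
held-out test split.
"""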
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import os
from collections import Counter
from ast import literal_eval
# TODO: build separate word maps for the fraudulent == 0 and fraudulent == 1 subsets

from prepare_data import read_data


class NaiveBayes:
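    """Multinomial Naive Bayes classifier over lists of tokens.

    For every label it stores the total word count, the vocabulary size and
    per-word counts, then predicts the label with the highest log-posterior
    log P(label) + sum_w log P(w | label), using add-one (Laplace) smoothing.
    """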

    def __init__(self, train_x, train_y, labels):
        self.train_x = train_x
        self.train_y = train_y
        self.labels = labels
        self.counts = {}
        self.prior_prob = {}
        self.word_counts = {}

    def count_words(self):
        # For each label, gather every token from the training rows with that
        # label and store (total word count, vocabulary size, per-word Counter).
        for label in self.labels:
            indexes = self.train_y.index[self.train_y == label].tolist()
            data = self.train_x[self.train_x.index.isin(indexes)]
            vocabulary = []
            for tokens in data:
                vocabulary += tokens
            self.word_counts[label] = (len(vocabulary), len(set(vocabulary)), Counter(vocabulary))

    def fit(self):
        # Class frequency and prior probability P(label) for every label,
        # then the per-label word statistics.
        self.counts = {l: self.train_y[self.train_y == l].shape[0] for l in self.labels}
        self.prior_prob = {l: float(self.counts[l]) / float(self.train_y.shape[0]) for l in self.labels}
        self.count_words()

    def get_posteriori(self, text):
        # Log-posterior per label: log P(label) + sum over words of
        # log P(word | label), with add-one (Laplace) smoothing over the
        # label's word and vocabulary counts.
        values = {label: np.log(self.prior_prob[label]) for label in self.labels}
        for label in self.labels:
            for word in text:
                values[label] += np.log(float(self.word_counts[label][2].get(word, 0) + 1) /
                                        (self.word_counts[label][0] + self.word_counts[label][1]))
        return list(values.values())

    def predict(self, test_x):
        # Index of the highest log-posterior for each row; with labels [0, 1]
        # the argmax index equals the predicted label.
        predicted = []
        for row in test_x:
            predicted.append(np.argmax(self.get_posteriori(row)))
        return predicted


def main():
    # Load the cleaned data, parse the stringified token lists back into
    # Python lists, and evaluate the classifier on a stratified 80/20 split.
    data = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
    data['tokens'] = data['tokens'].apply(literal_eval)
    x = data['tokens']
    y = data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)
    bayes = NaiveBayes(x_train, y_train, [0, 1])
    bayes.fit()
    predicted = bayes.predict(x_test)
    print(accuracy_score(y_test, predicted))


if __name__ == "__main__":
    main()