68 lines
2.2 KiB
Python
68 lines
2.2 KiB
Python
import numpy as np
|
|
from sklearn.metrics import accuracy_score
|
|
from sklearn.model_selection import train_test_split
|
|
import os
|
|
from collections import Counter
|
|
from ast import literal_eval
|
|
|
|
# TODO: build word maps for the subsets with fraudulent == 0 and fraudulent == 1
|
|
|
|
from prepare_data import read_data
|
|
|
|
|
|
class NaiveBayes:
    """Naive Bayes classifier over tokenised documents with add-one smoothing.

    ``train_x`` is indexed the same way as ``train_y`` (both are accessed
    through a pandas-style ``.index``); each element of ``train_x`` is an
    iterable of tokens and each element of ``train_y`` is one of ``labels``.
    """

    def __init__(self, train_x, train_y, labels):
        """Store the training data; call :meth:`fit` before predicting.

        Args:
            train_x: Indexed collection of token sequences, one per document.
            train_y: Indexed collection of class labels aligned with
                ``train_x``.
            labels: Sequence of the distinct class labels to model.
        """
        self.train_x = train_x
        self.train_y = train_y
        self.labels = labels
        # label -> number of training documents carrying that label
        self.counts = {}
        # label -> fraction of training documents carrying that label
        self.prior_prob = {}
        # label -> (total token count, distinct token count, Counter of tokens)
        self.word_counts = {}

    def count_words(self):
        """Collect per-label token statistics into ``self.word_counts``."""
        for label in self.labels:
            indexes = self.train_y.index[self.train_y == label].tolist()
            documents = self.train_x[self.train_x.index.isin(indexes)]
            # Count incrementally instead of concatenating every document
            # into one big list first.
            frequencies = Counter()
            for tokens in documents:
                frequencies.update(tokens)
            self.word_counts[label] = (
                sum(frequencies.values()),
                len(frequencies),
                frequencies,
            )

    def fit(self):
        """Compute class counts, class priors and per-class token counts."""
        total_documents = self.train_y.shape[0]
        self.counts = {
            label: self.train_y[self.train_y == label].shape[0]
            for label in self.labels
        }
        self.prior_prob = {
            label: float(self.counts[label]) / float(total_documents)
            for label in self.labels
        }
        self.count_words()

    def get_posteriori(self, text):
        """Return the unnormalised log-posterior of ``text`` for every label.

        Args:
            text: Iterable of tokens making up one document.

        Returns:
            A list of scores, one per entry of ``self.labels`` and in the
            same order. Each score is ``log(prior) + sum(log(likelihood))``.
        """
        scores = []
        for label in self.labels:
            total_tokens, distinct_tokens, frequencies = self.word_counts[label]
            # NOTE(review): the smoothing denominator uses the number of
            # distinct tokens seen in this class only; textbook Laplace
            # smoothing uses the vocabulary of the whole training set.
            # Kept as in the original - confirm whether this is intended.
            denominator = total_tokens + distinct_tokens
            # Log-probabilities are summed, so the prior is added, not
            # multiplied in.
            score = np.log(self.prior_prob[label])
            for word in text:
                score += np.log(float(frequencies.get(word, 0) + 1) / denominator)
            scores.append(score)
        # A real list (not a dict view) so that np.argmax sees one score per
        # label instead of a single opaque object.
        return scores

    def predict(self, test_x):
        """Predict a label for every document in ``test_x``.

        Args:
            test_x: Iterable of token sequences.

        Returns:
            A list with the highest-scoring label for each document.
        """
        return [
            self.labels[int(np.argmax(self.get_posteriori(row)))]
            for row in test_x
        ]
|
|
|
|
|
|
def main():
    """Train the classifier on the cleaned data set and print its accuracy.

    Loads ``clean-data.csv`` from the ``data`` directory under the current
    working directory, evaluates the ``tokens`` column with ``literal_eval``,
    holds out 20% of the rows (stratified on ``fraudulent``) and prints the
    accuracy of the predictions on that hold-out.
    """
    csv_path = os.path.join(os.path.abspath("./data"), "clean-data.csv")
    frame = read_data(csv_path)
    # The token column is evaluated as Python literals before use.
    frame['tokens'] = frame['tokens'].apply(literal_eval)

    features = frame['tokens']
    target = frame['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=123,
        stratify=target,
    )

    classifier = NaiveBayes(x_train, y_train, [0, 1])
    classifier.fit()
    predictions = classifier.predict(x_test)

    print(accuracy_score(y_test, predictions))


# Run the experiment only when executed as a script.
if __name__ == "__main__":
    main()
|