Merge branch 'master' of https://git.wmi.amu.edu.pl/s444452/mpsic_projekt_1_bayes_classifier
This commit is contained in:
commit 575da00f8a
data/dataset.csv: 17881 changed lines
File diff suppressed because one or more lines are too long
naive_bayes.py: 222 changed lines
@@ -1,193 +1,67 @@
-import os
-import sys
-from collections import Counter
-
-import nltk
-
-nltk.download('punkt')
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from kaggle import api
-from sklearn.model_selection import train_test_split
-from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
-
-from nltk.corpus import stopwords  # To remove the stop words
-
-from nltk.stem import PorterStemmer, WordNetLemmatizer
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-from wordcloud import WordCloud, STOPWORDS
-
-from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
-from string import punctuation
-from nltk import pos_tag
-from nltk.corpus import wordnet
-
-ps = PorterStemmer()  # To perform stemming
-
-
-def download_data(data_path, dataset_name):
-    if not os.path.exists(os.path.join(data_path, dataset_name)):
-        api.authenticate()
-        api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path,
-                                   unzip=True)
-        os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))
-
-
-def save_dataset(data_path, data, name):
-    data.to_csv(os.path.join(data_path, name), index=False)
-
-
-def preprocess_dataset(data):
-    data = data.replace(np.nan, '', regex=True)
-
-    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
-    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
-    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
-
-    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
-        lambda x: ' '.join(x), axis=1)
-    data['text'] = data['text'].str.lower()
-
-    tokenizer = RegexpTokenizer(r'\w+')
-    data['tokens'] = data['text'].apply(tokenizer.tokenize)
-    # data['tokens'] = data['text'].apply(lambda x: word_tokenize(x))
-
-    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
-                     axis=1)
-
-
-def to_dictionary(stop_words, category):
-    vocab = set()
-    sentences = category
-    for i in sentences:
-        for word in i:
-            word_lower = word.lower()
-            if word_lower not in stop_words and word_lower.isalpha():
-                vocab.add(ps.stem(word_lower))
-    word_dic = Counter(vocab)
-    return word_dic
-
-
-# For tokenizing the words and putting it into the word list
-def return_word_list(stop_words, sentence):
-    word_list = []
-    for word in sentence:
-        word_lower = word.lower()
-        if word_lower not in stop_words and word_lower.isalpha():
-            word_list.append(ps.stem(word_lower))
-    return word_list
-
-
-# For finding the conditional probability
-def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
-    help_dict = {}
-    for i, row in probab.iterrows():
-        for word in word_list:
-            if (word in prob_df.index.tolist()):
-                pro = pro * probab.loc[i, word]
-        help_dict[i] = pro * dict_category_wise_probability[i]
-        pro = 1
-    return help_dict
-
-
-class NaiveBayes:
-    def __init__(self, data, labels, features):
-        self.data = data
-        self.labels = labels
-        self.features = features
-
-    def fit(self):
-        pass
-
-    def transform(self):
-        pass
-
-    def predict(self):
-        pass
-
-    def evaluate(self, test_data):
-        pass
-
-
-def main():
-    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
-    download_data(abs_data_path, dataset_name)
-    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
-    clean_data = preprocess_dataset(data)
-    x, y = clean_data['tokens'], clean_data['fraudulent']
-    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
-                                                        random_state=123, stratify=y)
-    train_data = pd.concat([x_train, y_train], axis=1)
-    print(train_data)
-    test_data = pd.concat([x_test, y_test], axis=1)
-
-    classes = [0, 1]
-    # Building the master dictionary that contains the word frequency
-    master_dict = {}
-    stop_words = set(stopwords.words('english'))
-
-    for category in classes:
-        category_temp = train_data[train_data['fraudulent'] == category]
-        temp_dict = to_dictionary(stop_words, category_temp['tokens'])
-        master_dict[category] = temp_dict
-
-    # Converting the dictionary to a data frame for ease of use
-    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
-    print(word_frequency_df)
-
-    # Building the dictionary that holds category-wise sums and word-wise probabilities
-    categories_to_iterate = list(word_frequency_df)  # Prepared category for zip
-    category_sum = []
-    for category in categories_to_iterate:
-        category_sum.append(word_frequency_df[category].sum())  # Prepared category sum for zip
-    dict_category_sum = dict(zip(categories_to_iterate, category_sum))  # Dictionary with category based sums
-    print(f"The dictionary that holds the category wise sum is {dict_category_sum}")
-
-    dict_category_wise_probability = dict_category_sum.copy()
-
-    total_sentences_values = dict_category_wise_probability.values()
-    total = sum(total_sentences_values)
-
-    for key, value in dict_category_wise_probability.items():
-        dict_category_wise_probability[key] = value / total
-    print(f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}")
-
-    # Building word probability with the application of smoothing
-    prob_df = word_frequency_df
-    for category in categories_to_iterate:
-        for index, row in prob_df.iterrows():
-            row[category] = ((row[category] + 1) / (dict_category_sum[category] + len(prob_df[category])))  # Smoothing
-            prob_df.at[index, category] = row[category]
-    print(prob_df)
-
-    probab = prob_df.transpose()
-    pro = 1
-
-    match = 0
-    total = 0
-    counter = 0
-    for _, row in test_data.iterrows():
-        if counter > 200:
-            break
-        ind = row['fraudulent']
-        text = row['tokens']
-        word_list = return_word_list(stop_words, text)

-        # Get the dictionary that contains the final probability P(word|category)
-        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
-                                                           pro)
-
-        if ind == max(help_dict, key=help_dict.get):
-            match = match + 1
-        total = total + 1
-        counter += 1
-
-    print(f"The model predicted {match} correctly of {total}")
-    print(f"The model accuracy then is {int((match / total) * 100)}%")
-
-
-if __name__ == '__main__':
-    main()
+import numpy as np
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+import os
+from collections import Counter
+from ast import literal_eval
+
+# TODO: create word maps for the fraudulent == 0 and fraudulent == 1 subsets
+from prepare_data import read_data
+
+
+class NaiveBayes:
+    def __init__(self, train_x, train_y, labels):
+        self.train_x = train_x
+        self.train_y = train_y
+        self.labels = labels
+        self.counts = {}
+        self.prior_prob = {}
+        self.word_counts = {}
+
+    def count_words(self):
+        for label in self.labels:
+            indexes = self.train_y.index[self.train_y == label].tolist()
+            data = self.train_x[self.train_x.index.isin(indexes)]
+            vocabulary = []
+            for tokens in data:
+                vocabulary += tokens
+            self.word_counts.update({label: (len(vocabulary), len(set(vocabulary)), Counter(vocabulary))})
+
+    def fit(self):
+        self.counts = {l: self.train_y[self.train_y == l].shape[0] for l in self.labels}
+        self.prior_prob = {l: float(self.counts[l]) / float(self.train_y.shape[0]) for l in self.labels}
+        self.count_words()
+
+    def get_posteriori(self, text):
+        values = {}
+        for label in self.labels:
+            values = {label: 0 for label in self.labels}
+            for word in text:
+                values[label] += np.log((float(self.word_counts[label][2].get(word, 0) + 1)) / (
+                        self.word_counts[label][0] + self.word_counts[label][1]))
+            values[label] *= np.log(self.prior_prob[label])
+        return values.values()
+
+    def predict(self, test_x):
+        predicted = []
+        for row in test_x:
+            predicted.append(np.argmax(self.get_posteriori(row)))
+        return predicted
+
+
+def main():
+    data = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
+    data['tokens'] = data['tokens'].apply(literal_eval)
+    x = data['tokens']
+    y = data['fraudulent']
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)
+    bayes = NaiveBayes(x_train, y_train, [0, 1])
+    bayes.fit()
+    predicted = bayes.predict(x_test)
+    print(accuracy_score(y_test, predicted))
+
+
+if __name__ == "__main__":
+    main()
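For reference, the rewritten classifier estimates P(word | label) with Laplace smoothing as (count(word, label) + 1) / (total_tokens(label) + vocabulary_size(label)) and combines the word likelihoods with the class prior in log space. Below is a minimal, self-contained sketch of that textbook formulation, not the committed code: the toy documents, labels, and helper names are invented for illustration, and the sketch adds the log prior rather than multiplying by it.

from collections import Counter
import numpy as np

# Toy, hypothetical training data: token lists plus binary labels (not from the repository).
toy_docs = [["work", "from", "home", "easy", "money"],
            ["senior", "python", "engineer", "remote"],
            ["easy", "money", "no", "experience"],
            ["data", "engineer", "sql", "python"]]
toy_labels = [1, 0, 1, 0]

labels = sorted(set(toy_labels))
# Class priors P(label) and per-label word counts.
priors = {l: toy_labels.count(l) / len(toy_labels) for l in labels}
word_counts = {l: Counter(tok for doc, y in zip(toy_docs, toy_labels) if y == l for tok in doc)
               for l in labels}
totals = {l: sum(word_counts[l].values()) for l in labels}
vocab_sizes = {l: len(word_counts[l]) for l in labels}

def log_posterior(tokens, label):
    # log P(label) + sum of log P(token | label), Laplace-smoothed.
    score = np.log(priors[label])
    for tok in tokens:
        score += np.log((word_counts[label][tok] + 1) /
                        (totals[label] + vocab_sizes[label]))
    return score

test_doc = ["easy", "remote", "money"]
scores = {l: log_posterior(test_doc, l) for l in labels}
print(scores, "->", max(scores, key=scores.get))

Running the sketch prints both label scores and the argmax label for the toy test document; accumulating the score per label this way keeps every label's total intact before the comparison.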
prepare_data.py: 103 added lines (new file)
@@ -0,0 +1,103 @@
+import os
+import numpy as np
+import pandas as pd
+
+from kaggle import api
+from sklearn.utils import shuffle
+
+import nltk
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem.snowball import SnowballStemmer
+from nltk.corpus import stopwords
+
+nltk.download("punkt")
+nltk.download("stopwords")
+
+stemmer = SnowballStemmer(language="english")
+tokenizer = RegexpTokenizer(r'\w+')
+stop_words = set(stopwords.words('english'))
+
+
+def read_data(data_path: str, prepare_data: bool = False):
+    """Read data from the given path - if @prepare_data is True, the data is also preprocessed and cleaned."""
+    if prepare_data:
+        data = preprocess_dataset(data_path)
+    else:
+        data = pd.read_csv(data_path)
+    return data
+
+
+def download_data(data_path, dataset_name):
+    if not os.path.exists(os.path.join(data_path, dataset_name)):
+        api.authenticate()
+        api.dataset_download_files(
+            "shivamb/real-or-fake-fake-jobposting-prediction",
+            path=data_path,
+            unzip=True,
+        )
+        os.rename(
+            os.path.join(data_path, "fake_job_postings.csv"),
+            os.path.join(data_path, dataset_name),
+        )
+
+
+def tokenize_and_stem_text(text):
+    tokenized_text = tokenizer.tokenize(text)
+    tokens = [token.lower() for token in tokenized_text if token.lower() not in stop_words and len(token) > 3]
+    return [stemmer.stem(token) for token in tokens]
+
+
+def preprocess_dataset(data_path):
+    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
+
+    data_not_fraudulent = data[data['fraudulent'] == 0]
+    data_fraudulent = data[data['fraudulent'] == 1]
+
+    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
+    data = pd.concat([sample.reset_index(), data_fraudulent.reset_index()], axis=0)
+    data = shuffle(data)
+    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
+    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
+    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
+
+    data["text"] = data[[
+        "title",
+        "department",
+        "company_profile",
+        "description",
+        "requirements",
+        "benefits",
+    ]].apply(lambda x: " ".join(x), axis=1)
+    # data["text"] = data[[
+    #     "description"
+    # ]].apply(lambda x: " ".join(x), axis=1)
+
+    data["tokens"] = data["text"].apply(lambda text: tokenize_and_stem_text(text))
+
+    return data.drop(
+        [
+            "job_id",
+            "department",
+            "company_profile",
+            "description",
+            "requirements",
+            "benefits",
+            "text",
+        ],
+        axis=1,
+    )
+
+
+def save_dataset(data, data_path, name):
+    data.to_csv(os.path.join(data_path, name), index=False)
+
+
+if __name__ == "__main__":
+    # * Download the training data
+    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
+    download_data(abs_data_path, dataset_name)
+    # * Data preprocessing
+    data_path = os.path.join(abs_data_path, dataset_name)
+    cleaned_data = preprocess_dataset(data_path)
+    # * Save prepared data to a csv file
+    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
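One detail worth noting when chaining the two scripts: save_dataset writes the tokens column to CSV, where each token list is stored as its string representation, which is why main() in naive_bayes.py re-parses the column with ast.literal_eval after read_data. A small illustration of that round trip follows; the DataFrame contents and the temporary file name are made up for the example.

import os
import tempfile
from ast import literal_eval

import pandas as pd

# Hypothetical frame mimicking the cleaned data: token lists plus the fraudulent label.
df = pd.DataFrame({"tokens": [["senior", "python"], ["easy", "money"]], "fraudulent": [0, 1]})

path = os.path.join(tempfile.gettempdir(), "clean-data-example.csv")
df.to_csv(path, index=False)

loaded = pd.read_csv(path)
print(type(loaded.loc[0, "tokens"]))               # <class 'str'>  (e.g. "['senior', 'python']")
loaded["tokens"] = loaded["tokens"].apply(literal_eval)
print(type(loaded.loc[0, "tokens"]))               # <class 'list'>, ready for NaiveBayes.fit / predict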