mpsic_projekt_1_bayes_class.../naive_bayes.py

import os
import sys
from collections import Counter
from string import punctuation

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from kaggle import api
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet  # stop-word removal
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS

nltk.download('punkt')      # tokenizer models (used by word_tokenize/sent_tokenize)
nltk.download('stopwords')  # required by stopwords.words('english') in main()

ps = PorterStemmer()  # shared stemmer used when building the vocabulary


def download_data(data_path, dataset_name):
    """Download the Kaggle fake-job-postings dataset if it is not already on disk."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path,
                                   unzip=True)
        os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))


def save_dataset(data_path, data, name):
    """Write a dataframe to CSV inside the data directory."""
    data.to_csv(os.path.join(data_path, name), index=False)


def preprocess_dataset(data):
    """Clean the free-text columns and reduce each posting to a single list of tokens."""
    data = data.fillna('')  # treat missing fields as empty strings
    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
        lambda x: ' '.join(x), axis=1)
    data['text'] = data['text'].str.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    data['tokens'] = data['text'].apply(tokenizer.tokenize)
    # data['tokens'] = data['text'].apply(lambda x: word_tokenize(x))
    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
                     axis=1)


def to_dictionary(stop_words, category):
    """Build a word-frequency Counter from every token list in one category."""
    word_dic = Counter()
    for tokens in category:
        for word in tokens:
            word_lower = word.lower()
            if word_lower not in stop_words and word_lower.isalpha():
                word_dic[ps.stem(word_lower)] += 1  # count stems, skipping stop words and non-alphabetic tokens
    return word_dic
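

# Illustrative check (hypothetical tokens); _demo_category_counts is not part of the
# original code and is never called. With 'the' as the only stop word, the two token
# lists below yield Counter({'new': 2, 'open': 1, 'role': 1}) after stemming.
def _demo_category_counts():
    return to_dictionary({'the'}, [['New', 'opening'], ['new', 'role']])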


# Tokenize one posting into the cleaned word list used for scoring
def return_word_list(stop_words, sentence):
    word_list = []
    for word in sentence:
        word_lower = word.lower()
        if word_lower not in stop_words and word_lower.isalpha():
            word_list.append(ps.stem(word_lower))
    return word_list
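

# Illustrative usage (hypothetical tokens), assuming NLTK's English stop words;
# _demo_word_list is not part of the original code and is never called.
def _demo_word_list():
    # stop words ('the', 'is') and the non-alphabetic token '2024' are dropped,
    # and the surviving words are Porter-stemmed to 'posit' and 'run'
    return return_word_list({'the', 'is'}, ['The', 'position', 'is', 'running', '2024'])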


# Find the unnormalised posterior probability of every category for one word list
def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
    help_dict = {}
    vocabulary = set(prob_df.index)  # avoid rebuilding the index list for every word
    for category, row in probab.iterrows():
        pro = 1
        for word in word_list:
            if word in vocabulary:
                pro = pro * row[word]  # multiply P(word|category) for every word seen in training
        # P(category|words) is proportional to P(category) * product of P(word|category)
        help_dict[category] = pro * dict_category_wise_probability[category]
    return help_dict
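

# Hedged alternative sketch (not called anywhere in this file): the same category
# scoring done in log space. Multiplying many small P(word|category) values can
# underflow to 0.0, so summing logarithms is the usual safeguard. The name
# log_space_category_scores is illustrative, not part of the original project.
def log_space_category_scores(dict_category_wise_probability, word_list, probab, prob_df):
    scores = {}
    vocabulary = set(prob_df.index)
    for category, row in probab.iterrows():
        # log P(category) + sum of log P(word|category) over words seen in training
        log_score = float(np.log(dict_category_wise_probability[category]))
        for word in word_list:
            if word in vocabulary:
                log_score += float(np.log(row[word]))
        scores[category] = log_score
    return scores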


class NaiveBayes:
    """Placeholder class for the classifier; all methods are currently unimplemented stubs."""

    def __init__(self, data, labels, features):
        self.data = data
        self.labels = labels
        self.features = features

    def fit(self):
        pass

    def transform(self):
        pass

    def predict(self):
        pass

    def evaluate(self, test_data):
        pass
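

# Illustrative helper (hypothetical name, not used by main()) spelling out the
# add-one (Laplace) smoothing applied to the word counts in main(): for example,
# a word seen 3 times in a category with 100 counted words and a 50-word
# vocabulary gets P(word|category) = (3 + 1) / (100 + 50), roughly 0.027.
def laplace_smoothed_probability(word_count, category_total, vocabulary_size):
    # adding one to every count keeps unseen words from zeroing out the product
    return (word_count + 1) / (category_total + vocabulary_size)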


def main():
    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
    download_data(abs_data_path, dataset_name)
    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
    clean_data = preprocess_dataset(data)
    x, y = clean_data['tokens'], clean_data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=123, stratify=y)
    train_data = pd.concat([x_train, y_train], axis=1)
    print(train_data)
    test_data = pd.concat([x_test, y_test], axis=1)
    classes = [0, 1]  # 0 = genuine posting, 1 = fraudulent posting

    # Build the master dictionary that holds the per-category word frequencies
    master_dict = {}
    stop_words = set(stopwords.words('english'))
    for category in classes:
        category_temp = train_data[train_data['fraudulent'] == category]
        temp_dict = to_dictionary(stop_words, category_temp['tokens'])
        master_dict[category] = temp_dict
    # Convert the dictionary to a data frame for ease of use
    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
    print(word_frequency_df)

    # Build the dictionary of category-wise totals and the category priors
    categories_to_iterate = list(word_frequency_df)  # category labels, i.e. the dataframe columns
    category_sum = []
    for category in categories_to_iterate:
        category_sum.append(word_frequency_df[category].sum())  # total word count per category
    dict_category_sum = dict(zip(categories_to_iterate, category_sum))  # dictionary of category-based sums
    print(f"The dictionary that holds the category wise sums is {dict_category_sum}")
    dict_category_wise_probability = dict_category_sum.copy()
    total = sum(dict_category_wise_probability.values())
    for key, value in dict_category_wise_probability.items():
        dict_category_wise_probability[key] = value / total  # used as the prior P(category)
    print(f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}")

    # Turn the counts into word probabilities with add-one (Laplace) smoothing:
    # P(word|category) = (count + 1) / (category_sum + vocabulary_size)
    prob_df = word_frequency_df.copy()
    vocabulary_size = len(prob_df)
    for category in categories_to_iterate:
        prob_df[category] = (prob_df[category] + 1) / (dict_category_sum[category] + vocabulary_size)
    print(prob_df)

    probab = prob_df.transpose()
    pro = 1
    match = 0
    total = 0
    counter = 0
    # Score only the first ~200 test rows to keep the evaluation quick
    for _, row in test_data.iterrows():
        if counter > 200:
            break
        ind = row['fraudulent']
        text = row['tokens']
        word_list = return_word_list(stop_words, text)
        # Get the dictionary with the posterior score of each category for this posting
        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab,
                                                           prob_df, pro)
        if ind == max(help_dict, key=help_dict.get):
            match = match + 1
        total = total + 1
        counter += 1
    print(f"The model predicted {match} out of {total} correctly")
    print(f"The model accuracy is then {int((match / total) * 100)}%")


if __name__ == '__main__':
    main()
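
# Typical invocation (assumes Kaggle API credentials are already configured,
# e.g. a valid ~/.kaggle/kaggle.json, so download_data() can fetch the dataset):
#   python naive_bayes.py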