init impl of naive bayes classifier

2022-05-16 23:58:37 +02:00 · 2022-05-16 23:58:37 +02:00 · 3b776ce5d6
parent 6b5a68e900
commit 3b776ce5d6
2 changed files with 18074 additions and 0 deletions
--- a/data/dataset.csv
+++ b/data/dataset.csv
--- a/naive_bayes.py
+++ b/naive_bayes.py
@ -0,0 +1,193 @@
+import os
+import sys
+from collections import Counter
+
+import nltk
+
+nltk.download('punkt')
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from kaggle import api
+from sklearn.model_selection import train_test_split
+from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
+
+from nltk.corpus import stopwords  # To Remove the stop words
+
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from wordcloud import WordCloud, STOPWORDS
+
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from string import punctuation
+from nltk import pos_tag
+from nltk.corpus import wordnet
+
+# Module-level Porter stemmer shared by to_dictionary() and return_word_list().
+ps = PorterStemmer()  # To perform stemming
+
+
+def download_data(data_path, dataset_name,
+                  dataset_slug='shivamb/real-or-fake-fake-jobposting-prediction',
+                  source_filename='fake_job_postings.csv'):
+    """Fetch a Kaggle dataset into ``data_path`` unless it is already present.
+
+    Nothing is downloaded when ``data_path/dataset_name`` already exists.
+    Otherwise the Kaggle API client is authenticated, the archive is
+    downloaded and unzipped into ``data_path``, and the extracted CSV is
+    renamed to ``dataset_name``.
+
+    Args:
+        data_path: Directory that holds (or will hold) the dataset.
+        dataset_name: File name the dataset is stored under locally.
+        dataset_slug: Kaggle ``owner/dataset`` identifier to download.
+        source_filename: Name of the CSV inside the downloaded archive.
+    """
+    target_path = os.path.join(data_path, dataset_name)
+    if os.path.exists(target_path):
+        return
+
+    # Make sure the destination exists before anything is written into it.
+    os.makedirs(data_path, exist_ok=True)
+    api.authenticate()
+    api.dataset_download_files(dataset_slug, path=data_path, unzip=True)
+    os.rename(os.path.join(data_path, source_filename), target_path)
+
+
+def save_dataset(data_path, data, name):
+    """Write ``data`` as CSV, without the index column, to ``data_path/name``."""
+    destination = os.path.join(data_path, name)
+    data.to_csv(destination, index=False)
+
+
+def preprocess_dataset(data):
+    """Clean the raw postings and attach a ``tokens`` column.
+
+    Missing values are replaced by empty strings, ``description`` is scrubbed
+    of non-word characters and URL placeholders, and the free-text columns are
+    joined into one lower-cased string that is split into word tokens.
+
+    Args:
+        data: DataFrame holding at least the columns ``job_id``, ``title``,
+            ``department``, ``company_profile``, ``description``,
+            ``requirements`` and ``benefits``.
+
+    Returns:
+        A new DataFrame with the ``tokens`` column added and ``job_id``, the
+        intermediate ``text`` column and every joined text column except
+        ``title`` dropped. The caller's frame is not modified.
+    """
+    data = data.replace(np.nan, '', regex=True)
+
+    description = data['description'].str.replace(r"\W+", " ", regex=True)
+    # The text is only lower-cased further down, so the placeholder match has
+    # to ignore case; a case-sensitive pattern would leave "URL_..." tokens in.
+    # NOTE(review): assumes placeholders may be upper-case - confirm on dataset.
+    description = description.str.replace(r"url_\w+", " ", case=False, regex=True)
+    data['description'] = description.str.replace(r"\s+", " ", regex=True)
+
+    text_columns = ['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']
+    data['text'] = data[text_columns].apply(lambda x: ' '.join(x), axis=1)
+    data['text'] = data['text'].str.lower()
+
+    tokenizer = RegexpTokenizer(r'\w+')
+    data['tokens'] = data['text'].apply(tokenizer.tokenize)
+
+    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
+                     axis=1)
+
+
+def to_dictionary(stop_words, category):
+    """Count stemmed-word occurrences over every document of one category.
+
+    Tokens are lower-cased, stop words and non-alphabetic tokens are dropped,
+    and the rest are stemmed (the filtering done by ``return_word_list``).
+    Each occurrence is counted; collecting the stems in a set first would
+    reduce every frequency to 1.
+
+    Args:
+        stop_words: Collection of lower-case words to ignore.
+        category: Iterable of token lists, one per document.
+
+    Returns:
+        A ``Counter`` mapping each stem to its number of occurrences.
+    """
+    word_dic = Counter()
+    for tokens in category:
+        word_dic.update(return_word_list(stop_words, tokens))
+    return word_dic
+
+
+def return_word_list(stop_words, sentence):
+    """Return the stems of the tokens in ``sentence`` that survive filtering.
+
+    Each token is lower-cased; stop words and tokens that are not purely
+    alphabetic are discarded. Order and duplicates are preserved.
+    """
+    lowered_tokens = (token.lower() for token in sentence)
+    return [
+        ps.stem(token)
+        for token in lowered_tokens
+        if token not in stop_words and token.isalpha()
+    ]
+
+
+def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
+    """Score one document against every category in log space.
+
+    For each category the score is
+    ``log(pro) + log(prior) + sum(log P(word | category))`` over the words of
+    ``word_list`` found in the vocabulary. Multiplying the raw probabilities
+    underflows to 0.0 on long documents and makes every category tie; summing
+    logarithms keeps the ranking, so ``max(result, key=result.get)`` still
+    selects the most probable category.
+
+    Args:
+        dict_category_wise_probability: Mapping category -> prior probability.
+        word_list: Stemmed words of the document; repeats count repeatedly.
+        probab: DataFrame with one row per category and one column per word.
+        prob_df: Transpose of ``probab``; its index defines the vocabulary.
+        pro: Multiplicative seed applied to every category (1 for no effect).
+
+    Returns:
+        Dict mapping each category to its log score (``-inf`` when the seed,
+        the prior or a word probability is zero).
+    """
+    # Build the vocabulary once instead of materialising it for every word.
+    vocabulary = set(prob_df.index)
+    known_words = [word for word in word_list if word in vocabulary]
+
+    help_dict = {}
+    # Zero probabilities legitimately map to -inf; silence the numpy warning.
+    with np.errstate(divide='ignore'):
+        log_seed = np.log(pro) if pro > 0 else -np.inf
+        for category, row in probab.iterrows():
+            word_probabilities = row[known_words].to_numpy(dtype=float)
+            log_likelihood = np.log(word_probabilities).sum()
+            log_prior = np.log(dict_category_wise_probability[category])
+            help_dict[category] = float(log_seed + log_likelihood + log_prior)
+    return help_dict
+
+
+class NaiveBayes:
+    """Placeholder for a class-based classifier; no method is implemented yet."""
+
+    def __init__(self, data, labels, features):
+        """Store the data, labels and features on the instance."""
+        self.data, self.labels, self.features = data, labels, features
+
+    def fit(self):
+        """Not implemented; does nothing and returns ``None``."""
+
+    def transform(self):
+        """Not implemented; does nothing and returns ``None``."""
+
+    def predict(self):
+        """Not implemented; does nothing and returns ``None``."""
+
+    def evaluate(self, test_data):
+        """Not implemented; ignores ``test_data`` and returns ``None``."""
+
+
+def main():
+    """Train and evaluate the naive Bayes classifier on the job-postings data.
+
+    Steps: download and preprocess the dataset, make a stratified 80/20 split,
+    count stemmed words per class, derive class priors and Laplace-smoothed
+    word probabilities, then score a capped sample of the test split and print
+    the accuracy.
+    """
+    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
+    download_data(abs_data_path, dataset_name)
+    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
+    clean_data = preprocess_dataset(data)
+    x, y = clean_data['tokens'], clean_data['fraudulent']
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
+                                                        random_state=123, stratify=y)
+    train_data = pd.concat([x_train, y_train], axis=1)
+    print(train_data)
+    test_data = pd.concat([x_test, y_test], axis=1)
+
+    classes = [0, 1]
+    # Only 'punkt' is downloaded at import time, so fetch the stop-word corpus
+    # on demand instead of failing with LookupError on a fresh machine.
+    try:
+        stop_words = set(stopwords.words('english'))
+    except LookupError:
+        nltk.download('stopwords')
+        stop_words = set(stopwords.words('english'))
+
+    # Building the master dictionary that contains the word frequency
+    master_dict = {}
+    for category in classes:
+        category_tokens = train_data.loc[train_data['fraudulent'] == category, 'tokens']
+        master_dict[category] = to_dictionary(stop_words, category_tokens)
+
+    # Converting the dictionary to data frame for ease of use
+    # (rows are words, columns are categories, missing words count as 0)
+    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
+    print(word_frequency_df)
+
+    # Total word count per category, needed as the smoothing denominator
+    category_totals = word_frequency_df.sum()
+    dict_category_sum = dict(zip(category_totals.index, category_totals))
+    print(f"The dictionary that holds the cateogry wise sum is {dict_category_sum}")
+
+    # Class priors are the share of training documents in each category, not
+    # the share of words, which would bias towards the wordier class.
+    dict_category_wise_probability = {
+        category: (y_train == category).mean() for category in dict_category_sum
+    }
+    print(f"The dictionay that holds the category wise probabilities is {dict_category_wise_probability}")
+
+    # Building word probability with the application of smoothing:
+    # (count + 1) / (category total + vocabulary size), computed column-wise
+    # in one vectorized step on a fresh frame.
+    vocabulary_size = len(word_frequency_df)
+    prob_df = (word_frequency_df + 1) / (category_totals + vocabulary_size)
+    print(prob_df)
+
+    probab = prob_df.transpose()
+    pro = 1
+
+    # Only the first rows of the test split are scored (rows 0..200 inclusive).
+    # NOTE(review): presumably capped to bound runtime - confirm intent.
+    max_eval_rows = 201
+    match = 0
+    total = 0
+    for _, row in test_data.head(max_eval_rows).iterrows():
+        word_list = return_word_list(stop_words, row['tokens'])
+
+        # Per-category score of the document; the prediction is its argmax
+        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
+                                                           pro)
+
+        if row['fraudulent'] == max(help_dict, key=help_dict.get):
+            match += 1
+        total += 1
+
+    print(f"The model predicted {match} correctly of {total}")
+    print(f"The model accuracy then is {int((match / total) * 100)}%")
+
+
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()