Separate data preparation from the model

MatOgr 2022-05-17 13:15:06 +02:00
parent 5797dede2d
commit 06f06c68f2
2 changed files with 185 additions and 111 deletions


@@ -1,62 +1,23 @@
from nltk.corpus import wordnet
from nltk import pos_tag
from string import punctuation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords # *To Remove the stop words
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import sys
from collections import Counter
from prepare_data import preprocess_dataset, save_dataset
import nltk
nltk.download('punkt')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from kaggle import api
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
nltk.download("stopwords")
from nltk.corpus import stopwords # To Remove the stop words
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
ps = PorterStemmer() # To perform stemming
def download_data(data_path, dataset_name):
if not os.path.exists(os.path.join(data_path, dataset_name)):
api.authenticate()
api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path,
unzip=True)
os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))
def save_dataset(data_path, data, name):
data.to_csv(os.path.join(data_path, name), index=False)
def preprocess_dataset(data):
data = data.replace(np.nan, '', regex=True)
data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
lambda x: ' '.join(x), axis=1)
data['text'] = data['text'].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
data['tokens'] = data['text'].apply(tokenizer.tokenize)
# data['tokens'] = data['text'].apply(lambda x: word_tokenize(x))
return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
axis=1)
ps = PorterStemmer() # *To perform stemming
def to_dictionary(stop_words, category):
@@ -71,22 +32,22 @@ def to_dictionary(stop_words, category):
return word_dic
# For tokenizing the words and putting it into the word list
# *For tokenizing the words and putting it into the word list
def return_word_list(stop_words, sentence):
word_list = []
for word in sentence:
word_lower = word.lower()
if word_lower not in stop_words and word_lower.isalpha():
word_list.append(ps.stem(word_lower))
for word in map(str.lower, sentence):  # lower-case each token individually
if word not in stop_words and word.isalpha():
word_list.append(ps.stem(word))
return word_list
# For finding the conditional probability
def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
# *For finding the conditional probability
def return_category_probability_dictionary(dict_category_wise_probability,
word_list, probab, prob_df, pro):
help_dict = {}
for i, row in probab.iterrows():
for i, _ in probab.iterrows():
for word in word_list:
if (word in prob_df.index.tolist()):
if word in prob_df.index.tolist():
pro = pro * probab.loc[i, word]
help_dict[i] = pro * dict_category_wise_probability[i]
pro = 1
@@ -94,73 +55,111 @@ def return_category_probability_dictionary(dict_category_wise_probability, word_
class NaiveBayes:
def __init__(self, data, labels, features):
self.data = data
self.labels = labels
self.features = features
def fit(self):
pass
pass # TODO
def transform(self):
pass
pass # TODO
def predict(self):
pass
pass # TODO
def evaluate(self, test_data):
pass
pass # TODO
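# ---------------------------------------------------------------------------
# Editor's sketch, not part of this commit: one plausible way the TODO methods
# above could eventually be filled in, reusing the same bag-of-words counts and
# Laplace smoothing as the procedural code below. All names here (e.g.
# NaiveBayesSketch) are illustrative, not the author's final design.
from math import log
from collections import defaultdict


class NaiveBayesSketch:
    def __init__(self, classes):
        self.classes = classes
        self.class_priors = {}
        self.word_counts = {}   # per-class word frequencies
        self.class_totals = {}  # per-class total number of words
        self.vocabulary = set()

    def fit(self, token_lists, labels):
        # Count word occurrences per class and estimate the class priors.
        for c in self.classes:
            self.word_counts[c] = defaultdict(int)
            self.class_totals[c] = 0
        for tokens, label in zip(token_lists, labels):
            for token in tokens:
                self.word_counts[label][token] += 1
                self.class_totals[label] += 1
                self.vocabulary.add(token)
        total = len(labels)
        self.class_priors = {
            c: sum(1 for lab in labels if lab == c) / total for c in self.classes
        }

    def predict(self, tokens):
        # Score in log space to avoid underflow from multiplying many small
        # probabilities; Laplace smoothing keeps unseen words from zeroing a class.
        scores = {}
        for c in self.classes:
            score = log(self.class_priors[c])
            for token in tokens:
                smoothed = (self.word_counts[c][token] + 1) / (
                    self.class_totals[c] + len(self.vocabulary))
                score += log(smoothed)
            scores[c] = score
        return max(scores, key=scores.get)
# ---------------------------------------------------------------------------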
def read_data(data_path, prepare_data=False):
if prepare_data:
data = preprocess_dataset(data_path)
else:
data = pd.read_csv(data_path, nrows=1000) # !Delete the nrows option
return data["tokens"], data["fraudulent"]
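# Editor's note, not part of this commit: when clean-data.csv is produced with
# to_csv, the list in the "tokens" column is written as its string representation,
# so read_csv above returns strings such as "['marketing', 'intern', ...]" rather
# than lists. If genuine lists are needed downstream, they can be restored with:
#     from ast import literal_eval
#     data["tokens"] = data["tokens"].apply(literal_eval)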
def build_master_dict(data, classes, stop_words):
master_dict = {}
for category in classes:
category_temp = data[data["fraudulent"] == category]
temp_dict = to_dictionary(stop_words, category_temp["tokens"])
master_dict[category] = temp_dict
return master_dict
def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
category_sum = []
for category in categories_to_iterate:
# *Prepared category sum for zip
category_sum.append(word_frequency_df[category].sum())
# *Dictionary with category based sums
dict_category_sum = dict(zip(categories_to_iterate, category_sum))
cat_wise_probs_dict = dict_category_sum.copy()
total_sentences_values = cat_wise_probs_dict.values()
total = sum(total_sentences_values)
for key, value in cat_wise_probs_dict.items():
cat_wise_probs_dict[key] = value / total
return cat_wise_probs_dict, dict_category_sum
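# Editor's note, not part of this commit: the function above derives the class
# priors P(category) from the per-category word-count totals. A compact pandas
# equivalent of the same computation would be
#     word_frequency_df.sum(axis=0) / word_frequency_df.sum().sum()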
def build_word_probs(word_freqs, categories_to_iterate, dict_category_sum):
prob_df = word_freqs
for category in categories_to_iterate:
for index, row in prob_df.iterrows():
row[category] = (row[category] + 1) / (
dict_category_sum[category] + len(prob_df[category])
) # *Smoothing
prob_df.at[index, category] = row[category]
return prob_df
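# Editor's sketch, not part of this commit: the row-wise loop above applies Laplace
# smoothing, P(word | category) = (count + 1) / (category_total + vocabulary_size).
# A vectorized version of the same rule, assuming `word_freqs` has one column per
# category and one row per word:
def build_word_probs_vectorized(word_freqs):
    # Add 1 to every count, divide column-wise by (category total + vocabulary size).
    return (word_freqs + 1) / (word_freqs.sum(axis=0) + len(word_freqs))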
def main():
abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
download_data(abs_data_path, dataset_name)
data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
clean_data = preprocess_dataset(data)
x, y = clean_data['tokens'], clean_data['fraudulent']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
random_state=123, stratify=y)
# *Reading and splitting data
x, y = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
x_train, x_test, y_train, y_test = train_test_split(x,
y,
test_size=0.2,
random_state=123,
stratify=y)
train_data = pd.concat([x_train, y_train], axis=1)
print(train_data)
print("\tTrain data:\n", train_data)
test_data = pd.concat([x_test, y_test], axis=1)
classes = [0, 1]
# Building the master dictionary that contains the word frequency
master_dict = {}
# *Building the master dictionary that contains the word frequency
stop_words = set(stopwords.words('english'))
master_dict = build_master_dict(train_data, classes, stop_words)
print("Master dictionary with word freqs", master_dict)
for category in classes:
category_temp = train_data[train_data['fraudulent'] == category]
temp_dict = to_dictionary(stop_words, category_temp['tokens'])
master_dict[category] = temp_dict
# Converting the dictionary to data frame for ease of use
# *Converting the dictionary to data frame for ease of use
word_frequency_df = pd.DataFrame(master_dict).fillna(0)
print(word_frequency_df)
print("Dictionary converted to DataFrame\n", word_frequency_df.head())
# Building the dictionary that holds category wise sums and word wise probabilities
categories_to_iterate = list(word_frequency_df) # Prepared category for zip
category_sum = []
for category in categories_to_iterate:
category_sum.append(word_frequency_df[category].sum()) # Prepared category sum for zip
dict_category_sum = dict(zip(categories_to_iterate, category_sum)) # Dictionary with category based sums
print(f"The dictionary that holds the category wise sum is {dict_category_sum}")
# *Building the dictionary that holds category wise sums and word wise probabilities
categories_to_iterate = list(
word_frequency_df) # *Prepared category for zip
dict_category_wise_probability, dict_category_sum = build_category_probs_dicts(
word_frequency_df, categories_to_iterate)
print(
f"The dictionary that holds the category wise sum is {dict_category_sum}"
)
print(
f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}"
)
dict_category_wise_probability = dict_category_sum.copy()
total_sentences_values = dict_category_wise_probability.values()
total = sum(total_sentences_values)
for key, value in dict_category_wise_probability.items():
dict_category_wise_probability[key] = value / total
print(f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}")
# Building word probability with the application of smoothing
prob_df = word_frequency_df
for category in categories_to_iterate:
for index, row in prob_df.iterrows():
row[category] = ((row[category] + 1) / (dict_category_sum[category] + len(prob_df[category]))) # Smoothing
prob_df.at[index, category] = row[category]
# *Building word probability with the application of smoothing
prob_df = build_word_probs(word_frequency_df, categories_to_iterate,
dict_category_sum)
print(prob_df)
probab = prob_df.transpose()
@@ -172,13 +171,13 @@ def main():
for _, row in test_data.iterrows():
if counter > 200:
break
ind = row['fraudulent']
text = row['tokens']
ind = row["fraudulent"]
text = row["tokens"]
word_list = return_word_list(stop_words, text)
# Get the dictionary that contains the final probability P(word|category)
help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
pro)
# *Get the dictionary that contains the final probability P(word|category)
help_dict = return_category_probability_dictionary(
dict_category_wise_probability, word_list, probab, prob_df, pro)
if ind == max(help_dict, key=help_dict.get):
match = match + 1
@@ -189,5 +188,5 @@ def main():
print(f"The model accuracy then is {int((match / total) * 100)}%")
if __name__ == '__main__':
if __name__ == "__main__":
main()
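With the separation introduced by this commit, data preparation runs once and the model script then consumes the cached CSV through read_data (which expects data/clean-data.csv). A minimal sketch of that two-step workflow, assuming the Kaggle API credentials required by download_data are already configured:

import os
from prepare_data import download_data, preprocess_dataset, save_dataset

data_dir = os.path.abspath("./data")
download_data(data_dir, "dataset.csv")  # fetch the Kaggle dataset once
clean = preprocess_dataset(os.path.join(data_dir, "dataset.csv"))
save_dataset(clean, data_dir, "clean-data.csv")  # later read by read_data()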

prepare_data.py (new file, 75 lines)

@@ -0,0 +1,75 @@
import os
import numpy as np
import pandas as pd
from kaggle import api
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download("punkt")
def download_data(data_path, dataset_name):
if not os.path.exists(os.path.join(data_path, dataset_name)):
api.authenticate()
api.dataset_download_files(
"shivamb/real-or-fake-fake-jobposting-prediction",
path=data_path,
unzip=True,
)
os.rename(
os.path.join(data_path, "fake_job_postings.csv"),
os.path.join(data_path, dataset_name),
)
def preprocess_dataset(data_path):
data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
data["text"] = data[
[
"title",
"department",
"company_profile",
"description",
"requirements",
"benefits",
]
].apply(lambda x: " ".join(x).lower(), axis=1)
# data['text'] = data['text'].str.lower()
tokenizer = RegexpTokenizer(r"\w+")
data["tokens"] = data["text"].apply(tokenizer.tokenize)
return data.drop(
[
"job_id",
"department",
"company_profile",
"description",
"requirements",
"benefits",
"text",
],
axis=1,
)
def save_dataset(data, data_path, name):
data.to_csv(os.path.join(data_path, name), index=False)
if __name__ == "__main__":
# * Download the training data
abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
download_data(abs_data_path, dataset_name)
# * Data preprocessing
data_path = os.path.join(abs_data_path, dataset_name)
cleaned_data = preprocess_dataset(data_path)
# * Save prepared data to a csv file
save_dataset(cleaned_data, abs_data_path, "clean-data.csv")