Merge branch 'master' of https://git.wmi.amu.edu.pl/s444452/mpsic_projekt_1_bayes_classifier
This commit is contained in:
commit 575da00f8a
data/dataset.csv: 17881 changed lines
File diff suppressed because one or more lines are too long
naive_bayes.py: 222 changed lines
@@ -1,193 +1,67 @@
-import os
-import sys
-from collections import Counter
-
-import nltk
-
-nltk.download('punkt')
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from kaggle import api
-from sklearn.model_selection import train_test_split
-from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
-
-from nltk.corpus import stopwords  # To remove the stop words
-
-from nltk.stem import PorterStemmer, WordNetLemmatizer
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-from wordcloud import WordCloud, STOPWORDS
-
-from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
-from string import punctuation
-from nltk import pos_tag
-from nltk.corpus import wordnet
-
-ps = PorterStemmer()  # To perform stemming
-
-
-def download_data(data_path, dataset_name):
-    if not os.path.exists(os.path.join(data_path, dataset_name)):
-        api.authenticate()
-        api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path,
-                                   unzip=True)
-        os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))
-
-
-def save_dataset(data_path, data, name):
-    data.to_csv(os.path.join(data_path, name), index=False)
-
-
-def preprocess_dataset(data):
-    data = data.replace(np.nan, '', regex=True)
-
-    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
-    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
-    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
-
-    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
-        lambda x: ' '.join(x), axis=1)
-    data['text'] = data['text'].str.lower()
-
-    tokenizer = RegexpTokenizer(r'\w+')
-    data['tokens'] = data['text'].apply(tokenizer.tokenize)
-    # data['tokens'] = data['text'].apply(lambda x: word_tokenize(x))
-
-    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
-                     axis=1)
-
-
-def to_dictionary(stop_words, category):
-    vocab = set()
-    sentences = category
-    for i in sentences:
-        for word in i:
-            word_lower = word.lower()
-            if word_lower not in stop_words and word_lower.isalpha():
-                vocab.add(ps.stem(word_lower))
-    word_dic = Counter(vocab)
-    return word_dic
-
-
-# For tokenizing the words and putting it into the word list
-def return_word_list(stop_words, sentence):
-    word_list = []
-    for word in sentence:
-        word_lower = word.lower()
-        if word_lower not in stop_words and word_lower.isalpha():
-            word_list.append(ps.stem(word_lower))
-    return word_list
-
-
-# For finding the conditional probability
-def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
-    help_dict = {}
-    for i, row in probab.iterrows():
-        for word in word_list:
-            if (word in prob_df.index.tolist()):
-                pro = pro * probab.loc[i, word]
-        help_dict[i] = pro * dict_category_wise_probability[i]
-        pro = 1
-    return help_dict
-
-
-class NaiveBayes:
-    def __init__(self, data, labels, features):
-        self.data = data
-        self.labels = labels
-        self.features = features
-
-    def fit(self):
-        pass
-
-    def transform(self):
-        pass
-
-    def predict(self):
-        pass
-
-    def evaluate(self, test_data):
-        pass
-
-
-def main():
-    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
-    download_data(abs_data_path, dataset_name)
-    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
-    clean_data = preprocess_dataset(data)
-    x, y = clean_data['tokens'], clean_data['fraudulent']
-    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
-                                                        random_state=123, stratify=y)
-    train_data = pd.concat([x_train, y_train], axis=1)
-    print(train_data)
-    test_data = pd.concat([x_test, y_test], axis=1)
-
-    classes = [0, 1]
-    # Building the master dictionary that contains the word frequency
-    master_dict = {}
-    stop_words = set(stopwords.words('english'))
-
-    for category in classes:
-        category_temp = train_data[train_data['fraudulent'] == category]
-        temp_dict = to_dictionary(stop_words, category_temp['tokens'])
-        master_dict[category] = temp_dict
-
-    # Converting the dictionary to a data frame for ease of use
-    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
-    print(word_frequency_df)
-
-    # Building the dictionary that holds category-wise sums and word-wise probabilities
-    categories_to_iterate = list(word_frequency_df)  # Prepared category for zip
-    category_sum = []
-    for category in categories_to_iterate:
-        category_sum.append(word_frequency_df[category].sum())  # Prepared category sum for zip
-    dict_category_sum = dict(zip(categories_to_iterate, category_sum))  # Dictionary with category based sums
-    print(f"The dictionary that holds the category wise sum is {dict_category_sum}")
-
-    dict_category_wise_probability = dict_category_sum.copy()
-
-    total_sentences_values = dict_category_wise_probability.values()
-    total = sum(total_sentences_values)
-
-    for key, value in dict_category_wise_probability.items():
-        dict_category_wise_probability[key] = value / total
-    print(f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}")
-
-    # Building word probability with the application of smoothing
-    prob_df = word_frequency_df
-    for category in categories_to_iterate:
-        for index, row in prob_df.iterrows():
-            row[category] = ((row[category] + 1) / (dict_category_sum[category] + len(prob_df[category])))  # Smoothing
-            prob_df.at[index, category] = row[category]
-    print(prob_df)
-
-    probab = prob_df.transpose()
-    pro = 1
-
-    match = 0
-    total = 0
-    counter = 0
-    for _, row in test_data.iterrows():
-        if counter > 200:
-            break
-        ind = row['fraudulent']
-        text = row['tokens']
-        word_list = return_word_list(stop_words, text)

-        # Get the dictionary that contains the final probability P(word|category)
-        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
-                                                           pro)
-
-        if ind == max(help_dict, key=help_dict.get):
-            match = match + 1
-        total = total + 1
-        counter += 1
-
-    print(f"The model predicted {match} correctly of {total}")
-    print(f"The model accuracy then is {int((match / total) * 100)}%")
-
-
-if __name__ == '__main__':
-    main()
+import numpy as np
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+import os
+from collections import Counter
+from ast import literal_eval
+
+# TODO: create word maps for the fraudulent == 0 and fraudulent == 1 subsets
+from prepare_data import read_data
+
+
+class NaiveBayes:
+    def __init__(self, train_x, train_y, labels):
+        self.train_x = train_x
+        self.train_y = train_y
+        self.labels = labels
+        self.counts = {}
+        self.prior_prob = {}
+        self.word_counts = {}
+
+    def count_words(self):
+        for label in self.labels:
+            indexes = self.train_y.index[self.train_y == label].tolist()
+            data = self.train_x[self.train_x.index.isin(indexes)]
+            vocabulary = []
+            for tokens in data:
+                vocabulary += tokens
+            self.word_counts.update({label: (len(vocabulary), len(set(vocabulary)), Counter(vocabulary))})
+
+    def fit(self):
+        self.counts = {l: self.train_y[self.train_y == l].shape[0] for l in self.labels}
+        self.prior_prob = {l: float(self.counts[l]) / float(self.train_y.shape[0]) for l in self.labels}
+        self.count_words()
+
+    def get_posteriori(self, text):
+        values = {}
+        for label in self.labels:
+            values = {label: 0 for label in self.labels}
+            for word in text:
+                values[label] += np.log((float(self.word_counts[label][2].get(word, 0) + 1)) / (
+                        self.word_counts[label][0] + self.word_counts[label][1]))
+            values[label] *= np.log(self.prior_prob[label])
+        return values.values()
+
+    def predict(self, test_x):
+        predicted = []
+        for row in test_x:
+            predicted.append(np.argmax(self.get_posteriori(row)))
+        return predicted
+
+
+def main():
+    data = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
+    data['tokens'] = data['tokens'].apply(literal_eval)
+    x = data['tokens']
+    y = data['fraudulent']
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)
+    bayes = NaiveBayes(x_train, y_train, [0, 1])
+    bayes.fit()
+    predicted = bayes.predict(x_test)
+    print(accuracy_score(y_test, predicted))
+
+
+if __name__ == "__main__":
+    main()
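For reference, the rewritten classifier estimates P(word | label) with Laplace smoothing as (count(word, label) + 1) / (total_tokens(label) + vocabulary_size(label)) and combines the word likelihoods with the class prior in log space. Below is a minimal, self-contained sketch of that textbook formulation, not the committed code: the toy documents, labels, and helper names are invented for illustration, and the sketch adds the log prior rather than multiplying by it.

from collections import Counter
import numpy as np

# Toy, hypothetical training data: token lists plus binary labels (not from the repository).
toy_docs = [["work", "from", "home", "easy", "money"],
            ["senior", "python", "engineer", "remote"],
            ["easy", "money", "no", "experience"],
            ["data", "engineer", "sql", "python"]]
toy_labels = [1, 0, 1, 0]

labels = sorted(set(toy_labels))
# Class priors P(label) and per-label word counts.
priors = {l: toy_labels.count(l) / len(toy_labels) for l in labels}
word_counts = {l: Counter(tok for doc, y in zip(toy_docs, toy_labels) if y == l for tok in doc)
               for l in labels}
totals = {l: sum(word_counts[l].values()) for l in labels}
vocab_sizes = {l: len(word_counts[l]) for l in labels}

def log_posterior(tokens, label):
    # log P(label) + sum of log P(token | label), Laplace-smoothed.
    score = np.log(priors[label])
    for tok in tokens:
        score += np.log((word_counts[label][tok] + 1) /
                        (totals[label] + vocab_sizes[label]))
    return score

test_doc = ["easy", "remote", "money"]
scores = {l: log_posterior(test_doc, l) for l in labels}
print(scores, "->", max(scores, key=scores.get))

Running the sketch prints both label scores and the argmax label for the toy test document; accumulating the score per label this way keeps every label's total intact before the comparison.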
prepare_data.py: 103 added lines (new file)
@@ -0,0 +1,103 @@
+import os
+import numpy as np
+import pandas as pd
+
+from kaggle import api
+from sklearn.utils import shuffle
+
+import nltk
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem.snowball import SnowballStemmer
+from nltk.corpus import stopwords
+
+nltk.download("punkt")
+nltk.download("stopwords")
+
+stemmer = SnowballStemmer(language="english")
+tokenizer = RegexpTokenizer(r'\w+')
+stop_words = set(stopwords.words('english'))
+
+
+def read_data(data_path: str, prepare_data: bool = False):
+    """Read data from the given path - if @prepare_data is True, the data is also preprocessed and cleaned."""
+    if prepare_data:
+        data = preprocess_dataset(data_path)
+    else:
+        data = pd.read_csv(data_path)
+    return data
+
+
+def download_data(data_path, dataset_name):
+    if not os.path.exists(os.path.join(data_path, dataset_name)):
+        api.authenticate()
+        api.dataset_download_files(
+            "shivamb/real-or-fake-fake-jobposting-prediction",
+            path=data_path,
+            unzip=True,
+        )
+        os.rename(
+            os.path.join(data_path, "fake_job_postings.csv"),
+            os.path.join(data_path, dataset_name),
+        )
+
+
+def tokenize_and_stem_text(text):
+    tokenized_text = tokenizer.tokenize(text)
+    tokens = [token.lower() for token in tokenized_text if token.lower() not in stop_words and len(token) > 3]
+    return [stemmer.stem(token) for token in tokens]
+
+
+def preprocess_dataset(data_path):
+    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
+
+    data_not_fraudulent = data[data['fraudulent'] == 0]
+    data_fraudulent = data[data['fraudulent'] == 1]
+
+    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
+    data = pd.concat([sample.reset_index(), data_fraudulent.reset_index()], axis=0)
+    data = shuffle(data)
+    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
+    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
+    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
+
+    data["text"] = data[[
+        "title",
+        "department",
+        "company_profile",
+        "description",
+        "requirements",
+        "benefits",
+    ]].apply(lambda x: " ".join(x), axis=1)
+    # data["text"] = data[[
+    #     "description"
+    # ]].apply(lambda x: " ".join(x), axis=1)
+
+    data["tokens"] = data["text"].apply(lambda text: tokenize_and_stem_text(text))
+
+    return data.drop(
+        [
+            "job_id",
+            "department",
+            "company_profile",
+            "description",
+            "requirements",
+            "benefits",
+            "text",
+        ],
+        axis=1,
+    )
+
+
+def save_dataset(data, data_path, name):
+    data.to_csv(os.path.join(data_path, name), index=False)
+
+
+if __name__ == "__main__":
+    # * Download the training data
+    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
+    download_data(abs_data_path, dataset_name)
+    # * Data preprocessing
+    data_path = os.path.join(abs_data_path, dataset_name)
+    cleaned_data = preprocess_dataset(data_path)
+    # * Save prepared data to a csv file
+    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
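One detail worth noting when chaining the two scripts: save_dataset writes the tokens column to CSV, where each token list is stored as its string representation, which is why main() in naive_bayes.py re-parses the column with ast.literal_eval after read_data. A small illustration of that round trip follows; the DataFrame contents and the temporary file name are made up for the example.

import os
import tempfile
from ast import literal_eval

import pandas as pd

# Hypothetical frame mimicking the cleaned data: token lists plus the fraudulent label.
df = pd.DataFrame({"tokens": [["senior", "python"], ["easy", "money"]], "fraudulent": [0, 1]})

path = os.path.join(tempfile.gettempdir(), "clean-data-example.csv")
df.to_csv(path, index=False)

loaded = pd.read_csv(path)
print(type(loaded.loc[0, "tokens"]))               # <class 'str'>  (e.g. "['senior', 'python']")
loaded["tokens"] = loaded["tokens"].apply(literal_eval)
print(type(loaded.loc[0, "tokens"]))               # <class 'list'>, ready for NaiveBayes.fit / predict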