mpsic_projekt_1_bayes_class.../naive_bayes.py

import os
import sys
from collections import Counter

import nltk

nltk.download('punkt')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from kaggle import api
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize

from nltk.corpus import stopwords  # To Remove the stop words

from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from wordcloud import WordCloud, STOPWORDS

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet

# Module-level Porter stemmer shared by to_dictionary() and return_word_list(),
# which call ps.stem() on every word they keep.
ps = PorterStemmer()


def download_data(data_path, dataset_name):
    """Fetch the Kaggle job-postings CSV unless it is already on disk.

    Does nothing when ``data_path/dataset_name`` already exists. Otherwise it
    authenticates against the Kaggle API, downloads and unzips the dataset
    into ``data_path`` and renames the extracted ``fake_job_postings.csv``
    to ``dataset_name``.

    Args:
        data_path: Directory that holds (or will hold) the dataset.
        dataset_name: File name the CSV is stored under inside ``data_path``.
    """
    target_file = os.path.join(data_path, dataset_name)
    if os.path.exists(target_file):
        return

    api.authenticate()
    api.dataset_download_files(
        'shivamb/real-or-fake-fake-jobposting-prediction',
        path=data_path,
        unzip=True,
    )
    extracted_file = os.path.join(data_path, 'fake_job_postings.csv')
    os.rename(extracted_file, target_file)


def save_dataset(data_path, data, name):
    """Write ``data`` as a CSV file called ``name`` inside ``data_path``.

    The index is not written to the file.

    Args:
        data_path: Directory to write into.
        data: Object exposing ``to_csv`` (a pandas DataFrame in this script).
        name: File name of the CSV to create.
    """
    destination = os.path.join(data_path, name)
    data.to_csv(destination, index=False)


def preprocess_dataset(data):
    """Clean the raw postings and attach a ``tokens`` column.

    Steps:
      1. Replace missing values with empty strings (on a copy; the caller's
         frame is not modified).
      2. In ``description`` only: collapse runs of non-word characters to a
         space, blank out ``url_...`` placeholders and squeeze whitespace.
      3. Join title, department, company profile, description, requirements
         and benefits into one lower-cased ``text`` string per row.
      4. Split ``text`` into ``\\w+`` tokens.

    Args:
        data: DataFrame containing at least the columns ``job_id``, ``title``,
            ``department``, ``company_profile``, ``description``,
            ``requirements`` and ``benefits``.

    Returns:
        A new DataFrame without ``job_id``, the merged text columns (except
        ``title``) and the intermediate ``text`` column, plus a ``tokens``
        column holding a list of lower-case tokens per row.
    """
    data = data.replace(np.nan, '', regex=True)

    description = data['description'].str.replace(r"\W+", " ", regex=True)
    # The text has not been lower-cased yet at this point, so the placeholder
    # must be matched case-insensitively or upper-case "URL_..." tokens
    # survive into the vocabulary.
    description = description.str.replace(r"url_\w+", " ", case=False, regex=True)
    data['description'] = description.str.replace(r"\s+", " ", regex=True)

    text_columns = ['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']
    data['text'] = data[text_columns].apply(' '.join, axis=1).str.lower()

    # Vectorised equivalent of RegexpTokenizer(r'\w+').tokenize applied per row.
    data['tokens'] = data['text'].str.findall(r'\w+')

    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
                     axis=1)


def to_dictionary(stop_words, category):
    """Count stemmed word occurrences across all documents of one class.

    Each word is lower-cased; stop words and tokens that are not purely
    alphabetic are skipped; the rest are stemmed with the module-level
    Porter stemmer ``ps`` and tallied.

    Args:
        stop_words: Container of lower-case words to ignore.
        category: Iterable of documents, each an iterable of word strings.

    Returns:
        A ``Counter`` mapping each stem to the number of times it occurs in
        ``category``.
    """
    # Count directly: collecting into a set first would collapse every
    # frequency to 1 and throw away the information the model needs.
    word_dic = Counter()
    for tokens in category:
        for word in tokens:
            word_lower = word.lower()
            if word_lower not in stop_words and word_lower.isalpha():
                word_dic[ps.stem(word_lower)] += 1
    return word_dic


def return_word_list(stop_words, sentence):
    """Return the stemmed, filtered words of one tokenised document.

    Words are lower-cased; stop words and tokens that are not purely
    alphabetic are dropped; the remaining ones are stemmed with the
    module-level Porter stemmer ``ps``. Order and duplicates are kept.

    Args:
        stop_words: Container of lower-case words to ignore.
        sentence: Iterable of word strings.

    Returns:
        List of stems, in input order.
    """
    lowered_tokens = (token.lower() for token in sentence)
    return [
        ps.stem(token)
        for token in lowered_tokens
        if token not in stop_words and token.isalpha()
    ]


def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
    """Score every class for one document and return the scores per class.

    For each class (each row label of ``probab``) the score is

        log(pro) + sum(log P(word | class)) + log P(class)

    taken over the words of ``word_list`` that appear in the vocabulary
    (the index of ``prob_df``); unknown words are ignored. Scores are kept in
    log space because the plain product of many small probabilities
    underflows to 0.0 for every class on long documents, which makes the
    classes indistinguishable. The ranking of the classes is the same as for
    the plain product, so ``max(result, key=result.get)`` still yields the
    most likely class.

    Args:
        dict_category_wise_probability: Mapping class label -> prior P(class).
        word_list: Words of the document (duplicates count once each).
        probab: DataFrame with one row per class and one column per
            vocabulary word, holding P(word | class).
        prob_df: DataFrame whose index is the vocabulary (in this script the
            transpose of ``probab``).
        pro: Initial likelihood factor, applied to every class alike; the
            script passes 1, which contributes nothing.

    Returns:
        Dict mapping each class label to its log-score (``-inf`` when a
        factor is zero).
    """
    # Build the vocabulary once as a set instead of materialising the index
    # as a list for every word of every class.
    vocabulary = set(prob_df.index)
    known_words = [word for word in word_list if word in vocabulary]

    help_dict = {}
    # log(0) == -inf is a meaningful score here (impossible class), so the
    # corresponding NumPy warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_seed = np.log(pro)
        for category in probab.index:
            word_probabilities = probab.loc[category, known_words].to_numpy(dtype=float)
            log_likelihood = np.log(word_probabilities).sum()
            log_prior = np.log(dict_category_wise_probability[category])
            help_dict[category] = float(log_seed + log_likelihood + log_prior)
    return help_dict


class NaiveBayes:
    """Skeleton of a Naive Bayes classifier.

    Only the constructor does anything (it stores its arguments); the
    remaining methods are empty placeholders that return ``None``.
    """

    def __init__(self, data, labels, features):
        """Store ``data``, ``labels`` and ``features`` unchanged on the instance."""
        self.data, self.labels, self.features = data, labels, features

    def fit(self):
        """Placeholder; currently does nothing."""
        return None

    def transform(self):
        """Placeholder; currently does nothing."""
        return None

    def predict(self):
        """Placeholder; currently does nothing."""
        return None

    def evaluate(self, test_data):
        """Placeholder; ``test_data`` is accepted but currently ignored."""
        return None


def main(max_eval_rows=201):
    """Train and evaluate the hand-rolled Naive Bayes classifier.

    Downloads the dataset if necessary, preprocesses it, makes a stratified
    80/20 train/test split, builds per-class stem counts, turns them into
    add-one-smoothed word probabilities and scores test rows with
    ``return_category_probability_dictionary``. Intermediate tables and the
    final hit count / accuracy are printed.

    Args:
        max_eval_rows: Number of test rows (from the top of the test split)
            to score. Defaults to 201, the number the script has always
            evaluated. Pass ``None`` to score the whole test split.
    """
    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
    download_data(abs_data_path, dataset_name)
    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
    clean_data = preprocess_dataset(data)
    x, y = clean_data['tokens'], clean_data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=123, stratify=y)
    train_data = pd.concat([x_train, y_train], axis=1)
    print(train_data)
    test_data = pd.concat([x_test, y_test], axis=1)

    classes = [0, 1]
    # stopwords.words() raises LookupError when the corpus is not installed,
    # and nothing else in this file fetches it.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

    # Master dictionary: class label -> Counter of stems in that class.
    master_dict = {
        category: to_dictionary(stop_words, train_data.loc[train_data['fraudulent'] == category, 'tokens'])
        for category in classes
    }

    # One column per class, one row per stem; stems missing from a class get 0.
    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
    print(word_frequency_df)

    # Per-class totals of the frequency table (denominator of the smoothing).
    categories_to_iterate = list(word_frequency_df)
    dict_category_sum = {category: word_frequency_df[category].sum() for category in categories_to_iterate}
    print(f"The dictionary that holds the cateogry wise sum is {dict_category_sum}")

    # Class priors are the share of training documents per class, not the
    # share of word counts per class.
    dict_category_wise_probability = {
        category: float((y_train == category).mean()) for category in categories_to_iterate
    }
    print(f"The dictionay that holds the category wise probabilities is {dict_category_wise_probability}")

    # Add-one smoothing, column-wise: (count + 1) / (class total + vocabulary size).
    # Building a new frame keeps word_frequency_df intact and avoids writing
    # the table cell by cell.
    vocabulary_size = len(word_frequency_df)
    prob_df = (word_frequency_df + 1) / (word_frequency_df.sum() + vocabulary_size)
    print(prob_df)

    probab = prob_df.transpose()
    pro = 1

    eval_rows = test_data if max_eval_rows is None else test_data.head(max_eval_rows)

    match = 0
    total = 0
    for _, row in eval_rows.iterrows():
        ind = row['fraudulent']
        word_list = return_word_list(stop_words, row['tokens'])

        # Per-class score of this document; the best-scoring class is the prediction.
        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
                                                           pro)

        if ind == max(help_dict, key=help_dict.get):
            match += 1
        total += 1

    print(f"The model predicted {match} correctly of {total}")
    if total:
        print(f"The model accuracy then is {int((match / total) * 100)}%")
    else:
        # Avoid ZeroDivisionError when there was nothing to evaluate.
        print("No test rows were evaluated, so the accuracy is undefined")


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()