"""Naive Bayes classifier, built from scratch, for the Kaggle real-or-fake job postings dataset."""
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from kaggle import api
from nltk.corpus import stopwords  # To remove stop words
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')  # Required by stopwords.words('english') below

ps = PorterStemmer()  # To perform stemming


def download_data(data_path, dataset_name):
    """Download the Kaggle dataset once and rename the CSV to the expected file name."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path, unzip=True)
        os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))


def save_dataset(data_path, data, name):
    """Persist an intermediate data frame (currently unused by main())."""
    data.to_csv(os.path.join(data_path, name), index=False)


def preprocess_dataset(data):
    """Clean the raw postings and tokenize the concatenated free-text fields."""
    data = data.fillna('')
    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"url_\w+", " ", case=False, regex=True)
    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
        lambda x: ' '.join(x), axis=1)
    data['text'] = data['text'].str.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    data['tokens'] = data['text'].apply(tokenizer.tokenize)
    # Alternative: data['tokens'] = data['text'].apply(nltk.word_tokenize)  # needs nltk.download('punkt')
    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
                     axis=1)


def to_dictionary(stop_words, category):
    """Count stemmed word occurrences for one class, skipping stop words and non-alphabetic tokens."""
    word_dic = Counter()
    for tokens in category:
        for word in tokens:
            word_lower = word.lower()
            if word_lower not in stop_words and word_lower.isalpha():
                word_dic[ps.stem(word_lower)] += 1  # Count every occurrence, not just unique words
    return word_dic


# For cleaning a tokenized posting into the stemmed word list used for scoring
def return_word_list(stop_words, sentence):
    word_list = []
    for word in sentence:
        word_lower = word.lower()
        if word_lower not in stop_words and word_lower.isalpha():
            word_list.append(ps.stem(word_lower))
    return word_list


# For finding the (log) posterior score of each category given the word list
def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab):
    help_dict = {}
    known_words = set(probab.columns)  # Set membership test instead of rebuilding a list per word
    for category, row in probab.iterrows():
        # Work in log space so long postings do not underflow the product of many small probabilities
        log_score = np.log(dict_category_wise_probability[category])
        for word in word_list:
            if word in known_words:
                log_score += np.log(row[word])
        help_dict[category] = log_score
    return help_dict


class NaiveBayes:
    """Placeholder for an object-oriented refactor; main() still uses the standalone functions above."""

    def __init__(self, data, labels, features):
        self.data = data
        self.labels = labels
        self.features = features

    def fit(self):
        pass

    def transform(self):
        pass

    def predict(self):
        pass

    def evaluate(self, test_data):
        pass
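# Optional cross-check, a minimal sketch rather than part of the original pipeline:
# it fits scikit-learn's MultinomialNB on the same cleaned postings so the hand-rolled
# counts can be sanity-checked. The helper name `sklearn_baseline` is an illustrative
# choice, and the sketch deliberately skips the stemming step used by the functions above.
def sklearn_baseline(clean_data):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import classification_report
    from sklearn.naive_bayes import MultinomialNB

    # Re-join the token lists so CountVectorizer can build its own count matrix.
    text = clean_data['tokens'].apply(' '.join)
    labels = clean_data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(
        text, labels, test_size=0.2, random_state=123, stratify=labels)

    vectorizer = CountVectorizer(stop_words='english')
    train_counts = vectorizer.fit_transform(x_train)
    test_counts = vectorizer.transform(x_test)

    model = MultinomialNB()  # alpha=1.0 by default, matching the add-one smoothing in main()
    model.fit(train_counts, y_train)
    print(classification_report(y_test, model.predict(test_counts)))
# Example use: sklearn_baseline(preprocess_dataset(pd.read_csv('./data/dataset.csv')))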
def main():
    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
    download_data(abs_data_path, dataset_name)
    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
    clean_data = preprocess_dataset(data)

    x, y = clean_data['tokens'], clean_data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)
    train_data = pd.concat([x_train, y_train], axis=1)
    print(train_data)
    test_data = pd.concat([x_test, y_test], axis=1)
    classes = [0, 1]

    # Building the master dictionary that contains the word frequency per class
    master_dict = {}
    stop_words = set(stopwords.words('english'))
    for category in classes:
        category_temp = train_data[train_data['fraudulent'] == category]
        master_dict[category] = to_dictionary(stop_words, category_temp['tokens'])

    # Converting the dictionary to a data frame for ease of use
    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
    print(word_frequency_df)

    # Building the dictionary that holds category-wise word-count sums
    categories_to_iterate = list(word_frequency_df)
    dict_category_sum = {category: word_frequency_df[category].sum() for category in categories_to_iterate}
    print(f"The dictionary that holds the category-wise sums is {dict_category_sum}")

    # Class priors estimated from each category's share of the total word mass
    total_word_count = sum(dict_category_sum.values())
    dict_category_wise_probability = {key: value / total_word_count for key, value in dict_category_sum.items()}
    print(f"The dictionary that holds the category-wise probabilities is {dict_category_wise_probability}")

    # Building word probabilities with Laplace (add-one) smoothing
    vocab_size = len(word_frequency_df)
    prob_df = word_frequency_df.copy()
    for category in categories_to_iterate:
        prob_df[category] = (word_frequency_df[category] + 1) / (dict_category_sum[category] + vocab_size)
    print(prob_df)

    probab = prob_df.transpose()  # Rows become categories, columns become words

    # Score a sample of the held-out postings (capped at ~200 rows to keep runtime down)
    match = 0
    total = 0
    for _, row in test_data.iterrows():
        if total > 200:
            break
        actual_label = row['fraudulent']
        word_list = return_word_list(stop_words, row['tokens'])
        # Get the dictionary that contains the log-posterior score of each category
        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab)
        if actual_label == max(help_dict, key=help_dict.get):
            match += 1
        total += 1

    print(f"The model predicted {match} correctly of {total}")
    print(f"The model accuracy is {int((match / total) * 100)}%")


if __name__ == '__main__':
    main()