Separate data preparation from the model

MatOgr 2022-05-17 13:15:06 +02:00
parent 5797dede2d
commit 06f06c68f2
2 changed files with 185 additions and 111 deletions


@@ -1,62 +1,23 @@
from nltk.corpus import wordnet
from nltk import pos_tag
from string import punctuation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords # *To Remove the stop words
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import sys
from collections import Counter
from prepare_data import preprocess_dataset, save_dataset
import nltk
nltk.download('punkt')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from kaggle import api
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
nltk.download("stopwords")
from nltk.corpus import stopwords # To Remove the stop words
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
ps = PorterStemmer() # To perform stemming
def download_data(data_path, dataset_name):
if not os.path.exists(os.path.join(data_path, dataset_name)):
api.authenticate()
api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path,
unzip=True)
os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))
def save_dataset(data_path, data, name):
data.to_csv(os.path.join(data_path, name), index=False)
def preprocess_dataset(data):
data = data.replace(np.nan, '', regex=True)
data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
lambda x: ' '.join(x), axis=1)
data['text'] = data['text'].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
data['tokens'] = data['text'].apply(tokenizer.tokenize)
# data['tokens'] = data['text'].apply(lambda x: word_tokenize(x))
return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
axis=1)
ps = PorterStemmer() # *To perform stemming
def to_dictionary(stop_words, category):
@@ -71,22 +32,22 @@ def to_dictionary(stop_words, category):
return word_dic
# For tokenizing the words and putting it into the word list
# *For tokenizing the words and putting it into the word list
def return_word_list(stop_words, sentence):
word_list = []
for word in sentence:
word_lower = word.lower()
if word_lower not in stop_words and word_lower.isalpha():
word_list.append(ps.stem(word_lower))
for word in map(str.lower, sentence):  # lower-case each token individually
if word not in stop_words and word.isalpha():
word_list.append(ps.stem(word))
return word_list
# For finding the conditional probability
def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
# *For finding the conditional probability
def return_category_probability_dictionary(dict_category_wise_probability,
word_list, probab, prob_df, pro):
help_dict = {}
for i, row in probab.iterrows():
for i, _ in probab.iterrows():
for word in word_list:
if (word in prob_df.index.tolist()):
if word in prob_df.index.tolist():
pro = pro * probab.loc[i, word]
help_dict[i] = pro * dict_category_wise_probability[i]
pro = 1
@@ -94,73 +55,111 @@ def return_category_probability_dictionary(dict_category_wise_probability, word_
class NaiveBayes:
def __init__(self, data, labels, features):
self.data = data
self.labels = labels
self.features = features
def fit(self):
pass
pass # TODO
def transform(self):
pass
pass # TODO
def predict(self):
pass
pass # TODO
def evaluate(self, test_data):
pass
pass # TODO
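# ---------------------------------------------------------------------------
# Editor's sketch, not part of this commit: one plausible way the TODO methods
# above could eventually be filled in, reusing the same bag-of-words counts and
# Laplace smoothing as the procedural code below. All names here (e.g.
# NaiveBayesSketch) are illustrative, not the author's final design.
from math import log
from collections import defaultdict


class NaiveBayesSketch:
    def __init__(self, classes):
        self.classes = classes
        self.class_priors = {}
        self.word_counts = {}   # per-class word frequencies
        self.class_totals = {}  # per-class total number of words
        self.vocabulary = set()

    def fit(self, token_lists, labels):
        # Count word occurrences per class and estimate the class priors.
        for c in self.classes:
            self.word_counts[c] = defaultdict(int)
            self.class_totals[c] = 0
        for tokens, label in zip(token_lists, labels):
            for token in tokens:
                self.word_counts[label][token] += 1
                self.class_totals[label] += 1
                self.vocabulary.add(token)
        total = len(labels)
        self.class_priors = {
            c: sum(1 for lab in labels if lab == c) / total for c in self.classes
        }

    def predict(self, tokens):
        # Score in log space to avoid underflow from multiplying many small
        # probabilities; Laplace smoothing keeps unseen words from zeroing a class.
        scores = {}
        for c in self.classes:
            score = log(self.class_priors[c])
            for token in tokens:
                smoothed = (self.word_counts[c][token] + 1) / (
                    self.class_totals[c] + len(self.vocabulary))
                score += log(smoothed)
            scores[c] = score
        return max(scores, key=scores.get)
# ---------------------------------------------------------------------------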
def read_data(data_path, prepare_data=False):
if prepare_data:
data = preprocess_dataset(data_path)
else:
data = pd.read_csv(data_path, nrows=1000) # !Delete the nrows option
return data["tokens"], data["fraudulent"]
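# Editor's note, not part of this commit: when clean-data.csv is produced with
# to_csv, the list in the "tokens" column is written as its string representation,
# so read_csv above returns strings such as "['marketing', 'intern', ...]" rather
# than lists. If genuine lists are needed downstream, they can be restored with:
#     from ast import literal_eval
#     data["tokens"] = data["tokens"].apply(literal_eval)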
def build_master_dict(data, classes, stop_words):
master_dict = {}
for category in classes:
category_temp = data[data["fraudulent"] == category]
temp_dict = to_dictionary(stop_words, category_temp["tokens"])
master_dict[category] = temp_dict
return master_dict
def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
category_sum = []
for category in categories_to_iterate:
# *Prepared category sum for zip
category_sum.append(word_frequency_df[category].sum())
# *Dictionary with category based sums
dict_category_sum = dict(zip(categories_to_iterate, category_sum))
cat_wise_probs_dict = dict_category_sum.copy()
total_sentences_values = cat_wise_probs_dict.values()
total = sum(total_sentences_values)
for key, value in cat_wise_probs_dict.items():
cat_wise_probs_dict[key] = value / total
return cat_wise_probs_dict, dict_category_sum
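# Editor's note, not part of this commit: the function above derives the class
# priors P(category) from the per-category word-count totals. A compact pandas
# equivalent of the same computation would be
#     word_frequency_df.sum(axis=0) / word_frequency_df.sum().sum()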
def build_word_probs(word_freqs, categories_to_iterate, dict_category_sum):
prob_df = word_freqs
for category in categories_to_iterate:
for index, row in prob_df.iterrows():
row[category] = (row[category] + 1) / (
dict_category_sum[category] + len(prob_df[category])
) # *Smoothing
prob_df.at[index, category] = row[category]
return prob_df
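# Editor's sketch, not part of this commit: the row-wise loop above applies Laplace
# smoothing, P(word | category) = (count + 1) / (category_total + vocabulary_size).
# A vectorized version of the same rule, assuming `word_freqs` has one column per
# category and one row per word:
def build_word_probs_vectorized(word_freqs):
    # Add 1 to every count, divide column-wise by (category total + vocabulary size).
    return (word_freqs + 1) / (word_freqs.sum(axis=0) + len(word_freqs))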
def main():
abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
download_data(abs_data_path, dataset_name)
data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
clean_data = preprocess_dataset(data)
x, y = clean_data['tokens'], clean_data['fraudulent']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
random_state=123, stratify=y)
# *Reading and splitting data
x, y = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
x_train, x_test, y_train, y_test = train_test_split(x,
y,
test_size=0.2,
random_state=123,
stratify=y)
train_data = pd.concat([x_train, y_train], axis=1)
print(train_data)
print("\tTrain data:\n", train_data)
test_data = pd.concat([x_test, y_test], axis=1)
classes = [0, 1]
# Building the master dictionary that contains the word frequency
master_dict = {}
# *Building the master dictionary that contains the word frequency
stop_words = set(stopwords.words('english'))
master_dict = build_master_dict(train_data, classes, stop_words)
print("Master dictionary with word freqs", master_dict)
for category in classes:
category_temp = train_data[train_data['fraudulent'] == category]
temp_dict = to_dictionary(stop_words, category_temp['tokens'])
master_dict[category] = temp_dict
# Converting the dictionary to data frame for ease of use
# *Converting the dictionary to data frame for ease of use
word_frequency_df = pd.DataFrame(master_dict).fillna(0)
print(word_frequency_df)
print("Dictionary converted to DataFrame\n", word_frequency_df.head())
# Building the dictionary that holds category wise sums and word wise probabilities
categories_to_iterate = list(word_frequency_df) # Prepared category for zip
category_sum = []
for category in categories_to_iterate:
category_sum.append(word_frequency_df[category].sum()) # Prepared category sum for zip
dict_category_sum = dict(zip(categories_to_iterate, category_sum)) # Dictionary with category based sums
print(f"The dictionary that holds the category wise sum is {dict_category_sum}")
# *Building the dictionary that holds category wise sums and word wise probabilities
categories_to_iterate = list(
word_frequency_df) # *Prepared category for zip
dict_category_wise_probability, dict_category_sum = build_category_probs_dicts(
word_frequency_df, categories_to_iterate)
print(
f"The dictionary that holds the category wise sum is {dict_category_sum}"
)
print(
f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}"
)
dict_category_wise_probability = dict_category_sum.copy()
total_sentences_values = dict_category_wise_probability.values()
total = sum(total_sentences_values)
for key, value in dict_category_wise_probability.items():
dict_category_wise_probability[key] = value / total
print(f"The dictionary that holds the category wise probabilities is {dict_category_wise_probability}")
# Building word probability with the application of smoothing
prob_df = word_frequency_df
for category in categories_to_iterate:
for index, row in prob_df.iterrows():
row[category] = ((row[category] + 1) / (dict_category_sum[category] + len(prob_df[category]))) # Smoothing
prob_df.at[index, category] = row[category]
# *Building word probability with the application of smoothing
prob_df = build_word_probs(word_frequency_df, categories_to_iterate,
dict_category_sum)
print(prob_df)
probab = prob_df.transpose()
@@ -172,13 +171,13 @@ def main():
for _, row in test_data.iterrows():
if counter > 200:
break
ind = row['fraudulent']
text = row['tokens']
ind = row["fraudulent"]
text = row["tokens"]
word_list = return_word_list(stop_words, text)
# Get the dictionary that contains the final probability P(word|category)
help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
pro)
# *Get the dictionary that contains the final probability P(word|category)
help_dict = return_category_probability_dictionary(
dict_category_wise_probability, word_list, probab, prob_df, pro)
if ind == max(help_dict, key=help_dict.get):
match = match + 1
@@ -189,5 +188,5 @@ def main():
print(f"The model accuracy then is {int((match / total) * 100)}%")
if __name__ == '__main__':
if __name__ == "__main__":
main()
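With the separation introduced by this commit, data preparation runs once and the model script then consumes the cached CSV through read_data (which expects data/clean-data.csv). A minimal sketch of that two-step workflow, assuming the Kaggle API credentials required by download_data are already configured:

import os
from prepare_data import download_data, preprocess_dataset, save_dataset

data_dir = os.path.abspath("./data")
download_data(data_dir, "dataset.csv")  # fetch the Kaggle dataset once
clean = preprocess_dataset(os.path.join(data_dir, "dataset.csv"))
save_dataset(clean, data_dir, "clean-data.csv")  # later read by read_data()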

prepare_data.py (new file, 75 lines)

@@ -0,0 +1,75 @@
import os
import numpy as np
import pandas as pd
from kaggle import api
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download("punkt")
def download_data(data_path, dataset_name):
if not os.path.exists(os.path.join(data_path, dataset_name)):
api.authenticate()
api.dataset_download_files(
"shivamb/real-or-fake-fake-jobposting-prediction",
path=data_path,
unzip=True,
)
os.rename(
os.path.join(data_path, "fake_job_postings.csv"),
os.path.join(data_path, dataset_name),
)
def preprocess_dataset(data_path):
data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
data["text"] = data[
[
"title",
"department",
"company_profile",
"description",
"requirements",
"benefits",
]
].apply(lambda x: " ".join(x).lower(), axis=1)
# data['text'] = data['text'].str.lower()
tokenizer = RegexpTokenizer(r"\w+")
data["tokens"] = data["text"].apply(tokenizer.tokenize)
return data.drop(
[
"job_id",
"department",
"company_profile",
"description",
"requirements",
"benefits",
"text",
],
axis=1,
)
def save_dataset(data, data_path, name):
data.to_csv(os.path.join(data_path, name), index=False)
if __name__ == "__main__":
# * Download the training data
abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
download_data(abs_data_path, dataset_name)
# * Data preprocessing
data_path = os.path.join(abs_data_path, dataset_name)
cleaned_data = preprocess_dataset(data_path)
# * Save prepared data to a csv file
save_dataset(cleaned_data, abs_data_path, "clean-data.csv")