Commit 575da00f8a by szymonj98, 2022-05-18 00:17:28 +02:00
3 changed files with 151 additions and 18055 deletions

File diff suppressed because one or more lines are too long


@@ -1,193 +1,67 @@

Removed (previous version of the training script):

import os
import sys
from collections import Counter
import nltk
nltk.download('punkt')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from kaggle import api
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords  # To remove the stop words
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet

ps = PorterStemmer()  # To perform stemming


def download_data(data_path, dataset_name):
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path=data_path,
                                   unzip=True)
        os.rename(os.path.join(data_path, 'fake_job_postings.csv'), os.path.join(data_path, dataset_name))


def save_dataset(data_path, data, name):
    data.to_csv(os.path.join(data_path, name), index=False)


def preprocess_dataset(data):
    data = data.replace(np.nan, '', regex=True)
    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
        lambda x: ' '.join(x), axis=1)
    data['text'] = data['text'].str.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    data['tokens'] = data['text'].apply(tokenizer.tokenize)
    # data['tokens'] = data['text'].apply(lambda x: word_tokenize(x))
    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
                     axis=1)


def to_dictionary(stop_words, category):
    vocab = set()
    sentences = category
    for i in sentences:
        for word in i:
            word_lower = word.lower()
            if word_lower not in stop_words and word_lower.isalpha():
                vocab.add(ps.stem(word_lower))
    word_dic = Counter(vocab)
    return word_dic


# For tokenizing the words and putting them into the word list
def return_word_list(stop_words, sentence):
    word_list = []
    for word in sentence:
        word_lower = word.lower()
        if word_lower not in stop_words and word_lower.isalpha():
            word_list.append(ps.stem(word_lower))
    return word_list


# For finding the conditional probability
def return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df, pro):
    help_dict = {}
    for i, row in probab.iterrows():
        for word in word_list:
            if word in prob_df.index.tolist():
                pro = pro * probab.loc[i, word]
        help_dict[i] = pro * dict_category_wise_probability[i]
        pro = 1
    return help_dict


class NaiveBayes:
    def __init__(self, data, labels, features):
        self.data = data
        self.labels = labels
        self.features = features

    def fit(self):
        pass

    def transform(self):
        pass

    def predict(self):
        pass

    def evaluate(self, test_data):
        pass


def main():
    abs_data_path, dataset_name = os.path.abspath('./data'), 'dataset.csv'
    download_data(abs_data_path, dataset_name)
    data = pd.read_csv(os.path.join(abs_data_path, dataset_name))
    clean_data = preprocess_dataset(data)
    x, y = clean_data['tokens'], clean_data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=123, stratify=y)
    train_data = pd.concat([x_train, y_train], axis=1)
    print(train_data)
    test_data = pd.concat([x_test, y_test], axis=1)
    classes = [0, 1]

    # Building the master dictionary that contains the word frequency
    master_dict = {}
    stop_words = set(stopwords.words('english'))
    for category in classes:
        category_temp = train_data[train_data['fraudulent'] == category]
        temp_dict = to_dictionary(stop_words, category_temp['tokens'])
        master_dict[category] = temp_dict

    # Converting the dictionary to a data frame for ease of use
    word_frequency_df = pd.DataFrame(master_dict).fillna(0)
    print(word_frequency_df)

    # Building the dictionary that holds category-wise sums and word-wise probabilities
    categories_to_iterate = list(word_frequency_df)  # Prepared category for zip
    category_sum = []
    for category in categories_to_iterate:
        category_sum.append(word_frequency_df[category].sum())  # Prepared category sum for zip
    dict_category_sum = dict(zip(categories_to_iterate, category_sum))  # Dictionary with category-based sums
    print(f"The dictionary that holds the category-wise sum is {dict_category_sum}")

    dict_category_wise_probability = dict_category_sum.copy()
    total_sentences_values = dict_category_wise_probability.values()
    total = sum(total_sentences_values)
    for key, value in dict_category_wise_probability.items():
        dict_category_wise_probability[key] = value / total
    print(f"The dictionary that holds the category-wise probabilities is {dict_category_wise_probability}")

    # Building word probability with the application of smoothing
    prob_df = word_frequency_df
    for category in categories_to_iterate:
        for index, row in prob_df.iterrows():
            row[category] = (row[category] + 1) / (dict_category_sum[category] + len(prob_df[category]))  # Smoothing
            prob_df.at[index, category] = row[category]
    print(prob_df)

    probab = prob_df.transpose()
    pro = 1
    match = 0
    total = 0
    counter = 0
    for _, row in test_data.iterrows():
        if counter > 200:
            break
        ind = row['fraudulent']
        text = row['tokens']
        word_list = return_word_list(stop_words, text)
        # Get the dictionary that contains the final probability P(word|category)
        help_dict = return_category_probability_dictionary(dict_category_wise_probability, word_list, probab, prob_df,
                                                           pro)
        if ind == max(help_dict, key=help_dict.get):
            match = match + 1
        total = total + 1
        counter += 1

    print(f"The model predicted {match} correctly of {total}")
    print(f"The model accuracy then is {int((match / total) * 100)}%")


if __name__ == '__main__':
    main()


Added (new version of the training script):

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import os
from collections import Counter
from ast import literal_eval

# TODO: create word maps for the fraudulent == 0 and fraudulent == 1 subsets
from prepare_data import read_data


class NaiveBayes:
    def __init__(self, train_x, train_y, labels):
        self.train_x = train_x
        self.train_y = train_y
        self.labels = labels
        self.counts = {}
        self.prior_prob = {}
        self.word_counts = {}

    def count_words(self):
        # For each label, store (total token count, vocabulary size, per-word counts)
        for label in self.labels:
            indexes = self.train_y.index[self.train_y == label].tolist()
            data = self.train_x[self.train_x.index.isin(indexes)]
            vocabulary = []
            for tokens in data:
                vocabulary += tokens
            self.word_counts.update({label: (len(vocabulary), len(set(vocabulary)), Counter(vocabulary))})

    def fit(self):
        self.counts = {l: self.train_y[self.train_y == l].shape[0] for l in self.labels}
        self.prior_prob = {l: float(self.counts[l]) / float(self.train_y.shape[0]) for l in self.labels}
        self.count_words()

    def get_posteriori(self, text):
        values = {label: 0 for label in self.labels}
        for label in self.labels:
            for word in text:
                # Laplace-smoothed log-likelihood of the word given the label
                values[label] += np.log(float(self.word_counts[label][2].get(word, 0) + 1) / (
                        self.word_counts[label][0] + self.word_counts[label][1]))
            # Note: the standard Naive Bayes score adds the log prior rather than multiplying by it
            values[label] *= np.log(self.prior_prob[label])
        # Return a plain list so np.argmax maps position 0/1 onto the labels [0, 1]
        return list(values.values())

    def predict(self, test_x):
        predicted = []
        for row in test_x:
            predicted.append(np.argmax(self.get_posteriori(row)))
        return predicted


def main():
    data = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
    data['tokens'] = data['tokens'].apply(literal_eval)
    x = data['tokens']
    y = data['fraudulent']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)
    bayes = NaiveBayes(x_train, y_train, [0, 1])
    bayes.fit()
    predicted = bayes.predict(x_test)
    print(accuracy_score(y_test, predicted))


if __name__ == "__main__":
    main()
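For reference (not part of the commit), the per-class score that get_posteriori accumulates is the Laplace-smoothed Naive Bayes log-likelihood. A sketch of the decision rule, written with the per-class word count n(w, c), total token count N_c, and per-class vocabulary size V_c that count_words stores:

\[
\operatorname{score}(c \mid d) = \log P(c) + \sum_{w \in d} \log \frac{n(w, c) + 1}{N_c + V_c},
\qquad
\hat{y} = \arg\max_{c \in \{0, 1\}} \operatorname{score}(c \mid d)
\]

The committed code multiplies the summed log-likelihood by the log prior instead of adding it, as noted in the comment in the reconstruction above.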

prepare_data.py (new file, 103 lines added)

@@ -0,0 +1,103 @@
import os

import numpy as np
import pandas as pd
from kaggle import api
from sklearn.utils import shuffle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

stemmer = SnowballStemmer(language="english")
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))


def read_data(data_path: str, prepare_data: bool = False):
    """Read data from the given path - if prepare_data is True, the data is also preprocessed and cleaned."""
    if prepare_data:
        data = preprocess_dataset(data_path)
    else:
        data = pd.read_csv(data_path)
    return data


def download_data(data_path, dataset_name):
    # Download the Kaggle dataset once and rename it to the expected file name
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )


def tokenize_and_stem_text(text):
    # Lowercase, drop stop words and very short tokens, then stem
    tokenized_text = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokenized_text if token.lower() not in stop_words and len(token) > 3]
    return [stemmer.stem(token) for token in tokens]


def preprocess_dataset(data_path):
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
    # Balance the classes by downsampling the non-fraudulent postings
    data_not_fraudulent = data[data['fraudulent'] == 0]
    data_fraudulent = data[data['fraudulent'] == 1]
    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
    data = pd.concat([sample.reset_index(), data_fraudulent.reset_index()], axis=0)
    data = shuffle(data)
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
    data["text"] = data[[
        "title",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
    ]].apply(lambda x: " ".join(x), axis=1)
    # data["text"] = data[[
    #     "description"
    # ]].apply(lambda x: " ".join(x), axis=1)
    data["tokens"] = data["text"].apply(lambda text: tokenize_and_stem_text(text))
    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )


def save_dataset(data, data_path, name):
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
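For context, a minimal sketch (not part of the commit) of how the helpers in prepare_data.py fit together, assuming Kaggle API credentials are configured locally:

import os

from prepare_data import download_data, read_data, save_dataset

abs_data_path = os.path.abspath("./data")

# Fetch the raw Kaggle dump once (skipped if ./data/dataset.csv already exists)
download_data(abs_data_path, "dataset.csv")

# Option 1: clean and balance the raw dump on the fly, then cache it
cleaned = read_data(os.path.join(abs_data_path, "dataset.csv"), prepare_data=True)
save_dataset(cleaned, abs_data_path, "clean-data.csv")

# Option 2: load the cached CSV; the 'tokens' column comes back as strings,
# so consumers apply ast.literal_eval before using it (as main() in the training script does)
cached = read_data(os.path.join(abs_data_path, "clean-data.csv"))
print(cached[["tokens", "fraudulent"]].head())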