import os
import numpy as np
import pandas as pd
from kaggle import api
from sklearn.utils import shuffle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

stemmer = SnowballStemmer(language="english")
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))


def read_data(data_path: str, prepare_data: bool = False):
    """Read data from the given path; if `prepare_data` is True, the data is also preprocessed and cleaned."""
    if prepare_data:
        data = preprocess_dataset(data_path)
    else:
        data = pd.read_csv(data_path)
    return data
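
# Usage sketch (illustrative only; the file names below come from the __main__ block and
# are not required by read_data itself):
#
#   df = read_data("./data/clean-data.csv")                   # load an already prepared CSV
#   df = read_data("./data/dataset.csv", prepare_data=True)   # or clean the raw Kaggle dump on the fly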


def download_data(data_path, dataset_name):
    """Download the fake job postings dataset from Kaggle (if not already present) and rename the CSV to `dataset_name`."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )
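
# Note (behaviour of the kaggle package, stated as an assumption here): api.authenticate()
# expects credentials to be available, typically via ~/.kaggle/kaggle.json or the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables; without them the download step fails.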


def tokenize_and_stem_text(text):
    """Tokenize the text, lowercase it, drop stop words and tokens shorter than 4 characters, and return the stems."""
    tokenized_text = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokenized_text if token.lower() not in stop_words and len(token) > 3]
    return [stemmer.stem(token) for token in tokens]
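
# Example (illustrative, not executed): for an input such as
# "Exciting opportunity for experienced engineers!" the RegexpTokenizer keeps word
# characters only, stop words and tokens of 3 characters or fewer are dropped, and the
# remaining lowercased tokens are stemmed, giving something like
# ["excit", "opportun", "experienc", "engin"].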


def preprocess_dataset(data_path):
    """Load the raw CSV, balance the classes, clean the description text and build stemmed token lists."""
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)

    # Balance the classes: undersample the non-fraudulent postings down to the number of fraudulent ones.
    data_not_fraudulent = data[data['fraudulent'] == 0]
    data_fraudulent = data[data['fraudulent'] == 1]
    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
    data = pd.concat([sample.reset_index(), data_fraudulent.reset_index()], axis=0)
    data = shuffle(data)

    # Clean the description: strip non-word characters, "url_..." placeholders and repeated whitespace.
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)

    # Concatenate the free-text columns into a single feature.
    data["text"] = data[[
        "title",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
    ]].apply(lambda x: " ".join(x), axis=1)
    # Alternative: build the text feature from the description column only.
    # data["text"] = data[["description"]].apply(lambda x: " ".join(x), axis=1)

    data["tokens"] = data["text"].apply(lambda text: tokenize_and_stem_text(text))

    # Drop the raw text columns; the stemmed tokens and the remaining metadata are kept.
    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )
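
# At this point the frame is class-balanced (as many non-fraudulent as fraudulent rows)
# and the stemmed `tokens` column is the main text feature kept for the downstream
# classifier, alongside the untouched metadata columns.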


def save_dataset(data, data_path, name):
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
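
# Run sketch (assumes valid Kaggle credentials and network access):
#
#   python prepare_data.py
#
# downloads the raw dump to ./data/dataset.csv and writes the balanced, tokenized
# dataset to ./data/clean-data.csv.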