# mpsic_projekt_1_bayes_class.../prepare_data.py

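"""Prepare the real-or-fake job postings dataset for the Bayes classifier.

Downloads the Kaggle dataset (if it is not already present), cleans and
tokenizes its free-text columns, and saves the result to a CSV file.
"""
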
import os
import numpy as np
import pandas as pd
from kaggle import api
import nltk
from nltk.tokenize import RegexpTokenizer

# Download NLTK's punkt tokenizer data.
nltk.download("punkt")


def download_data(data_path, dataset_name):
    """Download the Kaggle dataset to data_path unless it is already there."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        # The archive unzips to fake_job_postings.csv; rename it to the
        # name the rest of the pipeline expects.
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )


def preprocess_dataset(data_path):
    """Load the raw CSV, clean the free-text columns, and tokenize them."""
    # Replace NaNs with empty strings so the text columns concatenate cleanly.
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
    # Drop non-word characters, url_* placeholders, and repeated whitespace.
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
    # Join all free-text fields into one lower-cased string per posting.
    data["text"] = data[
        [
            "title",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
        ]
    ].apply(lambda x: " ".join(x).lower(), axis=1)
    # Tokenize on word characters only, discarding punctuation.
    tokenizer = RegexpTokenizer(r"\w+")
    data["tokens"] = data["text"].apply(tokenizer.tokenize)
    # Drop the raw text columns; keep the tokens and the remaining features.
    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )


def save_dataset(data, data_path, name):
    """Write the prepared dataframe to data_path/name without the row index."""
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
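
# Note: DataFrame.to_csv() writes the "tokens" lists as plain strings
# (e.g. "['token1', 'token2', ...]"). A consumer of clean-data.csv can
# recover the actual lists with ast.literal_eval, for example:
#
#     import ast
#     df = pd.read_csv(os.path.join(abs_data_path, "clean-data.csv"))
#     df["tokens"] = df["tokens"].apply(ast.literal_eval)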