76 lines
2.1 KiB
Python
76 lines
2.1 KiB
Python
import os
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from kaggle import api
|
|
|
|
import nltk
|
|
from nltk.tokenize import RegexpTokenizer
|
|
|
|
nltk.download("punkt")
|
|
|
|
|
|
def download_data(
    data_path,
    dataset_name,
    dataset_slug="shivamb/real-or-fake-fake-jobposting-prediction",
    source_filename="fake_job_postings.csv",
):
    """Fetch a Kaggle dataset into ``data_path`` unless it is already present.

    The download is skipped entirely when ``data_path/dataset_name`` already
    exists, so repeated runs neither re-authenticate nor hit the network.

    Args:
        data_path: Directory the dataset files are downloaded into.
        dataset_name: File name the downloaded CSV is stored under.
        dataset_slug: Kaggle ``owner/dataset`` identifier to download.
            Defaults to the fake job postings dataset.
        source_filename: Name of the file to rename to ``dataset_name`` after
            the download; assumed to be produced by unzipping the archive.

    Raises:
        OSError: If ``source_filename`` is not present in ``data_path`` after
            the download (raised by ``os.rename``).
    """
    target_path = os.path.join(data_path, dataset_name)
    if os.path.exists(target_path):
        # Already downloaded on a previous run; nothing to do.
        return

    api.authenticate()
    api.dataset_download_files(dataset_slug, path=data_path, unzip=True)

    # Store the extracted CSV under the caller's chosen name so the existence
    # check above succeeds on the next run.
    os.rename(os.path.join(data_path, source_filename), target_path)
|
|
|
|
|
|
def preprocess_dataset(data_path):
    """Load the raw postings CSV and turn its free-text columns into tokens.

    Steps:
      1. Read the CSV and replace missing values with empty strings.
      2. Clean ``description``: collapse non-word characters to spaces, remove
         ``url_<...>`` placeholders and squeeze repeated whitespace.
      3. Concatenate the text columns (space separated) and lower-case them.
      4. Split the combined text into ``\\w+`` tokens.

    Args:
        data_path: Path of the CSV file to read. It must contain the columns
            ``job_id``, ``title``, ``department``, ``company_profile``,
            ``description``, ``requirements`` and ``benefits``.

    Returns:
        A ``pandas.DataFrame`` with a new ``tokens`` column (a list of
        lower-case word tokens per row) and without ``job_id`` or the source
        text columns other than ``title``.
    """
    text_columns = [
        "title",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
    ]

    data = pd.read_csv(data_path).fillna("")

    description = data["description"].str.replace(r"\W+", " ", regex=True)
    # The text is only lower-cased further down, so a case-sensitive "url_"
    # pattern can never match upper-case placeholders (presumably of the form
    # URL_<hash>); match case-insensitively so they are removed as well.
    description = description.str.replace(
        r"url_\w+", " ", regex=True, case=False
    )
    data["description"] = description.str.replace(r"\s+", " ", regex=True)

    # Vectorized, space-separated concatenation of all text columns.
    first_column, *other_columns = text_columns
    text = (
        data[first_column]
        .astype(str)
        .str.cat(data[other_columns].astype(str), sep=" ")
        .str.lower()
    )

    # Yields the same tokens as nltk's RegexpTokenizer(r"\w+").tokenize, which
    # is a plain regex findall for this pattern.
    data["tokens"] = text.str.findall(r"\w+")

    return data.drop(
        columns=[
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
        ]
    )
|
|
|
|
|
|
def save_dataset(data, data_path, name):
    """Write ``data`` as a CSV file called ``name`` inside ``data_path``.

    The DataFrame index is not written to the file.
    """
    destination = os.path.join(data_path, name)
    data.to_csv(destination, index=False)
|
|
|
|
|
|
if __name__ == "__main__":
    # Step 1: make sure the raw training data is available locally.
    abs_data_path = os.path.abspath("./data")
    dataset_name = "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # Step 2: clean and tokenize the downloaded dataset.
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # Step 3: store the prepared data next to the raw file.
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
|