import os

import numpy as np
import pandas as pd
from kaggle import api
import nltk
from nltk.tokenize import RegexpTokenizer

# Fetch NLTK tokenizer resources up front (a no-op if already downloaded).
nltk.download("punkt")


def download_data(data_path, dataset_name):
    """Download the Kaggle fake-job-postings dataset if it is not already present."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        # Make sure the target directory exists before downloading.
        os.makedirs(data_path, exist_ok=True)
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )


def preprocess_dataset(data_path):
    """Clean the raw postings, build a combined text column, and tokenize it."""
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
    # Strip non-word characters, URL placeholders, and redundant whitespace.
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
    # Concatenate the free-text columns into a single lowercased "text" field.
    data["text"] = data[
        [
            "title",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
        ]
    ].apply(lambda x: " ".join(x).lower(), axis=1)
    # Tokenize on word characters only, dropping punctuation.
    tokenizer = RegexpTokenizer(r"\w+")
    data["tokens"] = data["text"].apply(tokenizer.tokenize)
    # Drop the raw columns that were merged into "text"/"tokens".
    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )


def save_dataset(data, data_path, name):
    """Write the prepared DataFrame to a CSV file."""
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
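
# Note on the output format: pandas' to_csv serializes the "tokens" lists as
# plain strings, so a downstream reader of clean-data.csv needs to parse them
# back into Python lists. A minimal sketch (illustrative only, assuming this
# script has already produced ./data/clean-data.csv):
#
#   import ast
#   import pandas as pd
#
#   df = pd.read_csv("./data/clean-data.csv")
#   df["tokens"] = df["tokens"].apply(ast.literal_eval)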