import os

import numpy as np
import pandas as pd
from kaggle import api
from sklearn.utils import shuffle

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

stemmer = SnowballStemmer(language="english")
tokenizer = RegexpTokenizer(r"\w+")
stop_words = set(stopwords.words("english"))


def read_data(data_path: str, prepare_data: bool = False):
    """Read data from the given path; if `prepare_data` is True, the data is also preprocessed and cleaned."""
    if prepare_data:
        data = preprocess_dataset(data_path)
    else:
        data = pd.read_csv(data_path)
    return data


def download_data(data_path, dataset_name):
    """Download the Kaggle job-postings dataset into `data_path` and rename it to `dataset_name` (skipped if it already exists)."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )


def tokenize_and_stem_text(text):
    """Tokenize `text`, drop stop words and tokens shorter than 4 characters, and return the stemmed tokens."""
    tokenized_text = tokenizer.tokenize(text)
    tokens = [
        token.lower()
        for token in tokenized_text
        if token.lower() not in stop_words and len(token) > 3
    ]
    return [stemmer.stem(token) for token in tokens]


def preprocess_dataset(data_path):
    """Balance the classes, clean the description text, and build stemmed tokens for each posting."""
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)

    # Undersample the non-fraudulent class to match the number of fraudulent postings;
    # drop=True so the original row index is not kept as an extra column.
    data_not_fraudulent = data[data["fraudulent"] == 0]
    data_fraudulent = data[data["fraudulent"] == 1]
    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
    data = pd.concat(
        [sample.reset_index(drop=True), data_fraudulent.reset_index(drop=True)],
        axis=0,
    )
    data = shuffle(data)

    # Clean the description: strip non-word characters, URL placeholders, and extra whitespace.
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)

    # Concatenate the text columns into a single field used for tokenization.
    data["text"] = data[
        [
            "title",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
        ]
    ].apply(lambda x: " ".join(x), axis=1)
    # data["text"] = data[["description"]].apply(lambda x: " ".join(x), axis=1)

    data["tokens"] = data["text"].apply(tokenize_and_stem_text)

    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )


def save_dataset(data, data_path, name):
    """Save `data` as a CSV file named `name` inside `data_path`."""
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
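# A minimal usage sketch (commented out, not part of the pipeline): once the
# script has run, the prepared file can be loaded back for training with
# read_data; "clean-data.csv" is the name passed to save_dataset above.
#
#   prepared = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
#   print(prepared["fraudulent"].value_counts())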