"""Download the Kaggle real/fake job postings dataset, balance the classes and clean the text."""

import os

import numpy as np
import pandas as pd

from kaggle import api
from sklearn.utils import shuffle

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Make sure the required NLTK resources are available locally.
nltk.download("punkt")
nltk.download("stopwords")

stemmer = SnowballStemmer(language="english")
tokenizer = RegexpTokenizer(r"\w+")
stop_words = set(stopwords.words("english"))


def read_data(data_path: str, prepare_data: bool = False):
    """Read data from the given path; if @prepare_data is True, the data is also preprocessed and cleaned."""
    if prepare_data:
        data = preprocess_dataset(data_path)
    else:
        data = pd.read_csv(data_path)
    return data


def download_data(data_path, dataset_name):
    """Download the Kaggle job-postings dataset into @data_path unless it is already present."""
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        # Rename the extracted file to the name the rest of the pipeline expects.
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )
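
# A roughly equivalent manual download, assuming the Kaggle CLI is installed and
# ~/.kaggle/kaggle.json holds valid API credentials (an assumption, not part of this script):
#   kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction -p ./data --unzip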


def tokenize_and_stem_text(text):
    """Tokenize @text, drop stop words and very short tokens, then stem what remains."""
    tokenized_text = tokenizer.tokenize(text)
    tokens = [
        token.lower()
        for token in tokenized_text
        if token.lower() not in stop_words and len(token) > 3
    ]
    return [stemmer.stem(token) for token in tokens]
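
# Illustrative only (exact output depends on the Snowball stemmer's rules), e.g.:
#   tokenize_and_stem_text("Looking for experienced Python developers")
#   -> roughly ["look", "experienc", "python", "develop"]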


def preprocess_dataset(data_path):
    """Balance the classes, clean the text fields and build a stemmed-token column."""
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)

    data_not_fraudulent = data[data["fraudulent"] == 0]
    data_fraudulent = data[data["fraudulent"] == 1]

    # Undersample the genuine postings so both classes are the same size.
    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
    data = pd.concat([sample.reset_index(drop=True), data_fraudulent.reset_index(drop=True)], axis=0)
    data = shuffle(data)

    # Strip non-word characters, url_xxx placeholders and repeated whitespace from the description.
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)

    # Concatenate the free-text columns into a single field used for tokenization.
    data["text"] = data[[
        "title",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
    ]].apply(lambda x: " ".join(x), axis=1)
    # data["text"] = data[[
    #     "description"
    # ]].apply(lambda x: " ".join(x), axis=1)

    data["tokens"] = data["text"].apply(tokenize_and_stem_text)

    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )


def save_dataset(data, data_path, name):
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
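
# Hypothetical downstream usage once this script has been run (not part of the original pipeline):
#   df = read_data(os.path.join(os.path.abspath("./data"), "clean-data.csv"))
#   X, y = df["tokens"], df["fraudulent"]
# Note that to_csv serialises the token lists as strings, so a consumer may need
# ast.literal_eval to turn the "tokens" column back into Python lists.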