# mpsic_projekt_1_bayes_class.../prepare_data.py
import os
import numpy as np
import pandas as pd
from kaggle import api
from sklearn.utils import shuffle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
stemmer = SnowballStemmer(language="english")
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))


def read_data(data_path: str, prepare_data: bool = False):
    """Read data from the given path; if `prepare_data` is True, the data is also preprocessed and cleaned."""
    if prepare_data:
        data = preprocess_dataset(data_path)
    else:
        data = pd.read_csv(data_path)
    return data
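
# A minimal usage sketch (hypothetical local path): load the raw CSV as-is,
# or clean it in one step:
#   df = read_data("./data/dataset.csv", prepare_data=True)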


def download_data(data_path, dataset_name):
    if not os.path.exists(os.path.join(data_path, dataset_name)):
        api.authenticate()
        api.dataset_download_files(
            "shivamb/real-or-fake-fake-jobposting-prediction",
            path=data_path,
            unzip=True,
        )
        os.rename(
            os.path.join(data_path, "fake_job_postings.csv"),
            os.path.join(data_path, dataset_name),
        )
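
# Note: api.authenticate() expects Kaggle credentials, e.g. a ~/.kaggle/kaggle.json
# file or the KAGGLE_USERNAME / KAGGLE_KEY environment variables; without them the
# download step fails.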


def tokenize_and_stem_text(text):
    tokenized_text = tokenizer.tokenize(text)
    tokens = [
        token.lower()
        for token in tokenized_text
        if token.lower() not in stop_words and len(token) > 3
    ]
    return [stemmer.stem(token) for token in tokens]
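
# Illustrative example (assumed input; stems are approximate Snowball output):
#   tokenize_and_stem_text("Looking for experienced developers")
#   -> ["look", "experienc", "develop"]
# Stop words and tokens of three characters or fewer (e.g. "for") are dropped first.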


def preprocess_dataset(data_path):
    data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
    # Balance the classes by under-sampling the non-fraudulent postings
    data_not_fraudulent = data[data["fraudulent"] == 0]
    data_fraudulent = data[data["fraudulent"] == 1]
    sample = data_not_fraudulent.sample(data_fraudulent.shape[0], replace=False)
    # drop=True so the old index is not carried along as an extra column
    data = pd.concat(
        [sample.reset_index(drop=True), data_fraudulent.reset_index(drop=True)],
        axis=0,
    )
    data = shuffle(data)
    # Clean the description: strip non-word characters, URL placeholders and extra whitespace
    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
    # Combine the text columns into a single field used for tokenization
    data["text"] = data[[
        "title",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
    ]].apply(lambda x: " ".join(x), axis=1)
    # Alternative: use only the description column
    # data["text"] = data[["description"]].apply(lambda x: " ".join(x), axis=1)
    data["tokens"] = data["text"].apply(tokenize_and_stem_text)
    return data.drop(
        [
            "job_id",
            "department",
            "company_profile",
            "description",
            "requirements",
            "benefits",
            "text",
        ],
        axis=1,
    )
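
# Sketch of the result (assumed local path): a class-balanced frame with the raw
# text columns dropped and a "tokens" column holding the stemmed tokens:
#   balanced = preprocess_dataset("./data/dataset.csv")
#   balanced[["tokens", "fraudulent"]].head()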


def save_dataset(data, data_path, name):
    data.to_csv(os.path.join(data_path, name), index=False)


if __name__ == "__main__":
    # * Download the training data
    abs_data_path, dataset_name = os.path.abspath("./data"), "dataset.csv"
    download_data(abs_data_path, dataset_name)

    # * Data preprocessing
    data_path = os.path.join(abs_data_path, dataset_name)
    cleaned_data = preprocess_dataset(data_path)

    # * Save prepared data to a csv file
    save_dataset(cleaned_data, abs_data_path, "clean-data.csv")
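
# Assumed invocation from the repository root (writes ./data/clean-data.csv):
#   python prepare_data.py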