2022-04-23 13:52:09 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
import scipy
|
|
|
|
import torch
|
|
|
|
import pandas as pd
|
|
|
|
from sklearn.model_selection import train_test_split
|
2022-05-03 16:37:18 +02:00
|
|
|
# import kaggle
|
2022-04-23 13:52:09 +02:00
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from torch import nn
|
|
|
|
from torch import optim
|
|
|
|
import matplotlib.pyplot as plt
|
2022-05-03 15:30:24 +02:00
|
|
|
import sys
|
2022-05-07 14:23:09 +02:00
|
|
|
from sacred import Experiment
|
|
|
|
from sacred.observers import FileStorageObserver
|
2022-05-15 14:10:56 +02:00
|
|
|
from sacred.observers import MongoObserver
|
2022-05-03 15:30:24 +02:00
|
|
|
|
2022-05-07 14:23:09 +02:00
|
|
|
import os  # local to this setup block: only needed for the observer overrides below

# Sacred experiment object; the config scope and the main function below are
# registered on it through its decorators.
ex = Experiment()

# Keep a local, file-based record of every run under ./my_runs.
ex.observers.append(FileStorageObserver('my_runs'))

# NOTE(security): the MongoDB URL embeds a username and password.  The values
# can now be supplied through the environment (SACRED_MONGO_URL /
# SACRED_MONGO_DB) so the secret does not have to live in the source tree; the
# original literals remain as defaults so existing setups keep working.
# TODO: rotate the credential and drop the hard-coded fallback.
ex.observers.append(MongoObserver(
    url=os.environ.get('SACRED_MONGO_URL',
                       'mongodb://admin:IUM_2021@172.17.0.1:27017'),
    db_name=os.environ.get('SACRED_MONGO_DB', 'sacred')))

# Module-level TF-IDF vectorizer, shared by my_main (which fits it) and by
# convert_text_to_model_form (which only transforms with it).
vectorizer = TfidfVectorizer()
|
|
|
|
|
|
|
|
@ex.config
def my_config():
    """Default configuration for the experiment.

    Defines a single entry, ``epochs``, which ``my_main`` receives as its
    ``epochs`` argument.
    """
    epochs = 10
|
2022-04-23 13:52:09 +02:00
|
|
|
|
2022-04-23 17:02:26 +02:00
|
|
|
|
|
|
|
def convert_text_to_model_form(text):
    """Turn one raw text into the float tensor form the model consumes.

    The text is wrapped in a one-element list, run through the module-level
    ``vectorizer`` (which must already have been fitted), densified and
    converted to a float tensor.

    Args:
        text: The raw text to vectorize.

    Returns:
        A float ``torch.Tensor`` holding the dense TF-IDF row for ``text``.
    """
    sparse_row = vectorizer.transform([text])
    dense_row = sparse_row.todense()
    return torch.tensor(dense_row).float()
|
|
|
|
|
2022-05-07 14:23:09 +02:00
|
|
|
@ex.automain
def my_main(epochs, _run):
    """Train the classifier, evaluate it on the test split and store results.

    Steps performed:
      1. Fit the module-level TF-IDF ``vectorizer`` on the ``company_profile``
         column of ``fake_job_postings.csv``.
      2. Load the pre-split ``data_train.csv`` / ``data_dev.csv`` /
         ``data_test.csv`` files and vectorize column 5 (text) against
         column 17 (label).
      3. Train a small feed-forward network with full-batch Adam updates,
         printing the dev-set loss and accuracy after every epoch.
      4. Classify the test split, append the F-score, the confusion counts and
         the matching descriptions to ``model_resutls.txt``.
      5. Save the model to ``model`` and attach it to the experiment.

    Args:
        epochs: Number of training epochs; coerced with ``int()`` before use.
        _run: Run object supplied by Sacred.  Not used by this function.
    """
    epochs = int(epochs)

    # --- Data -----------------------------------------------------------
    # The full dataset is only used to fit the TF-IDF vocabulary.
    # NOTE(review): if the split files are derived from this CSV, the
    # vocabulary/IDF weights also see dev/test text -- confirm this is intended.
    data = pd.read_csv('fake_job_postings.csv', engine='python')
    data = data[["company_profile", "fraudulent"]]
    data = data.dropna()
    company_profile = np.array(data["company_profile"])

    # The split files are read without a header row, so columns are addressed
    # by position.  Rows with a missing value in any column are dropped.
    data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna()
    data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna()
    data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna()

    # Column 5 is the model input text and column 17 the class label
    # (presumably company_profile / fraudulent -- verify against the split job).
    text_column, label_column = 5, 17

    # Fitting mutates the shared module-level vectorizer so that
    # convert_text_to_model_form can be used once this has run.
    vectorizer.fit(company_profile)

    def to_features(frame):
        """Vectorize the text column of ``frame`` into a dense float tensor."""
        sparse = vectorizer.transform(np.array(frame[text_column]))
        return torch.tensor(sparse.toarray()).float()

    def to_labels(frame):
        """Return the label column of ``frame`` as a tensor."""
        return torch.tensor(np.array(frame[label_column]))

    x_train, y_train = to_features(data_train), to_labels(data_train)
    x_dev, y_dev = to_features(data_dev), to_labels(data_dev)
    x_test, y_test = to_features(data_test), to_labels(data_test)

    # --- Model ----------------------------------------------------------
    # One hidden layer; the output width follows the number of distinct labels
    # present in the training split.
    model = nn.Sequential(
        nn.Linear(x_train.shape[1], 64),
        nn.ReLU(),
        nn.Linear(64, data_train[label_column].nunique()),
        nn.LogSoftmax(dim=1))

    # LogSoftmax output + NLLLoss.
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.002)

    # Per-epoch history, stored as plain Python floats.
    train_losses = []
    test_losses = []
    test_accuracies = []

    # --- Training -------------------------------------------------------
    for e in range(epochs):
        optimizer.zero_grad()

        output = model(x_train)
        loss = criterion(output, y_train)
        loss.backward()
        train_loss = loss.item()
        train_losses.append(train_loss)

        optimizer.step()

        # Evaluate on the dev split without tracking gradients.
        model.eval()
        with torch.no_grad():
            log_ps = model(x_dev)
            test_loss = criterion(log_ps, y_dev).item()

            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == y_dev.view(*top_class.shape)
            test_accuracy = torch.mean(equals.float()).item()
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)
        model.train()

        print(f"Epoch: {e + 1}/{epochs}.. ",
              f"Training Loss: {train_loss:.3f}.. ",
              f"Test Loss: {test_loss:.3f}.. ",
              f"Test Accuracy: {test_accuracy:.3f}")

    # --- Test-set evaluation ---------------------------------------------
    model.eval()
    print(x_test.size())
    with torch.no_grad():
        log_ps = model(x_test)
        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)

    descriptions = np.array(data_test[text_column])
    predictions = top_class.view(-1).tolist()
    labels = y_test.view(-1).tolist()

    # Bucket every test description by outcome; a truthy predicted class is
    # treated as the positive class.
    true_pos, true_neg, false_pos, false_neg = [], [], [], []
    for description, predicted, actual in zip(descriptions, predictions, labels):
        if predicted == actual:
            (true_pos if predicted else true_neg).append(description)
        else:
            (false_pos if predicted else false_neg).append(description)

    # F1 = TP / (TP + (FP + FN) / 2); report 0.0 instead of dividing by zero
    # when there are no positive predictions and no positive labels at all.
    denominator = len(true_pos) + 0.5 * (len(false_pos) + len(false_neg))
    f_score = len(true_pos) / denominator if denominator else 0.0

    print(f"F- score = {f_score}")

    # Labels written to the report; "TF" (true negatives) and the file name are
    # kept exactly as before so existing consumers of the report keep working.
    buckets = (("TP", true_pos), ("TF", true_neg),
               ("FP", false_pos), ("FN", false_neg))

    # The context manager guarantees the handle is closed even if a write fails.
    with open("model_resutls.txt", "a") as f:
        f.write(f"F-SCORE = {f_score}\n")
        for label, bucket in buckets:
            f.write(f"{label} = {len(bucket)}\n")

        for label, bucket in buckets:
            f.write(f"{label} descriptions:")
            f.writelines(description + "\n" for description in bucket)

    # --- Persist the trained model ----------------------------------------
    torch.save(model, 'model')
    ex.add_artifact("model")
|
2022-05-03 15:14:55 +02:00
|
|
|
|
2022-04-23 17:02:26 +02:00
|
|
|
|
2022-04-23 17:15:28 +02:00
|
|
|
# plt.figure(figsize=(12, 5))
|
|
|
|
# ax = plt.subplot(121)
|
|
|
|
# plt.xlabel('epochs')
|
|
|
|
# plt.ylabel('negative log likelihood loss')
|
|
|
|
# plt.plot(train_losses, label='Training loss')
|
|
|
|
# plt.plot(test_losses, label='Validation loss')
|
|
|
|
# plt.legend(frameon=False)
|
|
|
|
# plt.subplot(122)
|
|
|
|
# plt.xlabel('epochs')
|
|
|
|
# plt.ylabel('test accuracy')
|
|
|
|
# plt.plot(test_accuracies)
|
|
|
|
# plt.show()
|