# ium_444463/main.py
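# Trains a small feed-forward classifier that flags fraudulent job postings
# from TF-IDF features of the posting title (Kaggle dataset
# shivamb/real-or-fake-fake-jobposting-prediction).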
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
# import kaggle  # authenticates against the API on import; enable only for the download below
if __name__ == "__main__":
    # kaggle.api.authenticate()
    # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',
    #                                   path='.', unzip=True)
    data = pd.read_csv('fake_job_postings.csv', engine='python')
    data = data.replace(np.nan, '', regex=True)
    data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
    data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
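    # The posting title is the only input feature; "fraudulent" (0/1) is the label.
    # 3,000 rows were held out above and split evenly into dev and test.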
    x_train = data_train["title"]
    x_dev = data_dev["title"]
    x_test = data_test["title"]
    y_train = data_train["fraudulent"]
    y_dev = data_dev["fraudulent"]
    y_test = data_test["fraudulent"]
    x_train = np.array(x_train)
    x_dev = np.array(x_dev)
    y_train = np.array(y_train)
    y_dev = np.array(y_dev)
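    # Fit TF-IDF on the training titles only, then reuse that vocabulary for dev,
    # so no information leaks from the dev split into the features.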
    vectorizer = TfidfVectorizer()
    x_train = vectorizer.fit_transform(x_train)
    x_dev = vectorizer.transform(x_dev)
    x_train = torch.from_numpy(x_train.toarray()).float()
    x_dev = torch.from_numpy(x_dev.toarray()).float()
    y_train = torch.tensor(y_train)
    y_dev = torch.tensor(y_dev)
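    # A single hidden layer over the dense TF-IDF matrix; LogSoftmax pairs
    # with NLLLoss below.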
    model = nn.Sequential(
        nn.Linear(x_train.shape[1], 64),
        nn.ReLU(),
        nn.Linear(64, 2),  # binary target: 0 = genuine, 1 = fraudulent
        nn.LogSoftmax(dim=1))
    # Define the loss
    criterion = nn.NLLLoss()
    # One forward/backward pass as a sanity check before training
    logps = model(x_train)
    loss = criterion(logps, y_train)
    loss.backward()
    # The optimizer needs the parameters to optimize and a learning rate
    optimizer = optim.Adam(model.parameters(), lr=0.002)
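    # Per-epoch metrics collected for the plots at the end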
    train_losses = []
    test_losses = []
    test_accuracies = []
    epochs = 5
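    # Full-batch training: every epoch pushes the entire TF-IDF matrix through
    # the network in one step.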
    for e in range(epochs):
        optimizer.zero_grad()
        output = model(x_train)
        loss = criterion(output, y_train)
        loss.backward()
        train_loss = loss.item()
        train_losses.append(train_loss)
        optimizer.step()
        # Turn off gradients for validation; saves memory and computation
        with torch.no_grad():
            model.eval()
            log_ps = model(x_dev)
            test_loss = criterion(log_ps, y_dev)
            test_losses.append(test_loss.item())
            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == y_dev.view(*top_class.shape)
            test_accuracy = torch.mean(equals.float())
            test_accuracies.append(test_accuracy.item())
        model.train()
print(f"Epoch: {e + 1}/{epochs}.. ",
f"Training Loss: {train_loss:.3f}.. ",
f"Test Loss: {test_loss:.3f}.. ",
f"Test Accuracy: {test_accuracy:.3f}")
    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    plt.xlabel('epochs')
    plt.ylabel('negative log likelihood loss')
    plt.plot(train_losses, label='Training loss')
    plt.plot(test_losses, label='Validation loss')
    plt.legend(frameon=False)
    plt.subplot(122)
    plt.xlabel('epochs')
    plt.ylabel('test accuracy')
    plt.plot(test_accuracies)
    plt.show()
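    # The test split (x_test, y_test) is created above but never scored; a
    # minimal sketch of a final held-out evaluation (an addition, not part of
    # the original script):
    x_test_t = torch.from_numpy(vectorizer.transform(np.array(x_test)).toarray()).float()
    y_test_t = torch.tensor(np.array(y_test))
    with torch.no_grad():
        model.eval()
        test_ps = torch.exp(model(x_test_t))
        _, test_top = test_ps.topk(1, dim=1)
        final_accuracy = torch.mean((test_top == y_test_t.view(*test_top.shape)).float())
    print(f"Held-out test accuracy: {final_accuracy.item():.3f}")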
    print('Success')