In [232]:
import time, gc

# Timing utilities
start_time = None  # wall-clock timestamp written by start_timer(); None until it has run

def start_timer():
    """Start a timed section.

    Runs the garbage collector and stores the current wall-clock time in the
    module-level ``start_time``. When CUDA is available it also empties the
    CUDA caching allocator, resets the peak-memory counters and synchronizes
    the device, so that previously queued GPU work is not counted in the
    timed section. Without CUDA the GPU steps are skipped instead of raising.
    """
    global start_time
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # reset_max_memory_allocated() is deprecated; this is its replacement.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    """Finish the section opened by ``start_timer()`` and print a short report.

    Prints a blank line followed by ``local_msg``, then the elapsed wall-clock
    time since ``start_time``, then the peak number of bytes allocated for
    CUDA tensors (reported as 0 when CUDA is not available).

    Args:
        local_msg: Heading printed above the timing lines.

    Raises:
        RuntimeError: If ``start_timer()`` has not been called yet.
    """
    if start_time is None:
        raise RuntimeError("start_timer() must be called before end_timer_and_print()")
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # Wait for queued GPU work so it is included in the measured time.
        torch.cuda.synchronize()
    end_time = time.time()
    peak_bytes = torch.cuda.max_memory_allocated() if cuda_ready else 0
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(peak_bytes))

In [233]:
# Ask TensorFlow which local devices it can see; the returned list is shown as
# the cell output.
# NOTE(review): `tensorflow.python.*` looks like an internal module path — confirm
# it is still available in the installed TensorFlow version.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 7116988186229065702
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14465892352
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 10048785647988876421
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [234]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import torch
import scipy

In [235]:
# !unzip real-or-fake-fake-jobposting-prediction.zip

In [236]:
# Load the postings and keep only the text feature and the label.
data = pd.read_csv('fake_job_postings.csv', engine='python')
data = data[["company_profile", "fraudulent"]]
# Shuffle with a fixed seed: an unseeded shuffle would change the row order on
# every run and make the seeded train/dev/test split below non-reproducible.
data = data.sample(frac=1, random_state=1)
# Drop rows where either the company profile or the label is missing.
data = data.dropna()

In [237]:
# Display the shuffled frame after rows with missing values were dropped.
data

Unnamed: 0,company_profile,fraudulent
16503,"At Hayes-Corp, we create the fun stuff. With ...",0
16706,Tribal Worldwide Athens is a digitally centric...,0
3364,About ECHOING GREEN: Echoing Green unleashes ...,0
16856,Daily Secret is the fastest growing digital me...,0
1566,ding* is the world’s largest top-up provider. ...,0
...,...,...
7607,Established on the principles that full time e...,0
682,AGOGO creates a personalized audio channel by ...,0
2759,We are a family run business that has been in ...,0
5751,We have aggressive growth plans in place for t...,1


In [238]:
# Hold out 2000 rows from the training data, then split that hold-out in two:
# 1000 rows for the test set and the remainder for the dev set.
data_train, data_holdout = train_test_split(data, test_size=2000, random_state=1)
data_dev, data_test = train_test_split(data_holdout, test_size=1000, random_state=1)
# Show the three split sizes as the cell output.
tuple(len(part) for part in (data_train, data_dev, data_test))

(12572, 1000, 1000)

In [239]:
# Raw company-profile texts of each split as NumPy arrays.
x_train, x_dev, x_test = [
    np.array(part["company_profile"]) for part in (data_train, data_dev, data_test)
]

# Matching "fraudulent" labels of each split as NumPy arrays.
y_train, y_dev, y_test = [
    np.array(part["fraudulent"]) for part in (data_train, data_dev, data_test)
]

# Separate NumPy copies of the labels; the y_* names are rebound to torch
# tensors later in the notebook, while these stay NumPy arrays.
y_train_np, y_dev_np, y_test_np = [
    np.array(labels) for labels in (y_train, y_dev, y_test)
]

In [240]:
# TF-IDF vectorizer created with the library's default settings; it is fitted
# on the training texts in a later cell.
vectorizer = TfidfVectorizer()

In [241]:
import copy
# Fit the vocabulary/IDF weights on the training texts only, then apply the
# fitted vectorizer to the dev and test texts.
x_train = vectorizer.fit_transform(x_train)
x_dev, x_test = [vectorizer.transform(texts) for texts in (x_dev, x_test)]

# Independent copies of the vectorizer outputs; the x_* names are rebound to
# torch tensors later in the notebook, while these copies are kept as they are.
x_train_np, x_dev_np, x_test_np = [
    matrix.copy() for matrix in (x_train, x_dev, x_test)
]

In [242]:
# Device string passed to torch.tensor(..., device=device) when the data is
# moved to the GPU.
device = 'cuda'

In [243]:
# Convert the sparse TF-IDF matrices to dense float32 tensors on `device`.
# The cast to float32 happens while the data is still sparse, so no dense
# float64 intermediate is created, and `.toarray()` returns a plain ndarray
# rather than the np.matrix produced by `.todense()`. The resulting values are
# the same as densifying first and calling `.float()` afterwards.
x_train = torch.tensor(x_train.astype(np.float32).toarray(), device=device)
x_dev = torch.tensor(x_dev.astype(np.float32).toarray(), device=device)
x_test = torch.tensor(x_test.astype(np.float32).toarray(), device=device)

# Labels keep the integer dtype of their NumPy arrays.
y_train = torch.tensor(y_train, device=device)
y_dev = torch.tensor(y_dev, device=device)
y_test = torch.tensor(y_test, device=device)

In [244]:
from sklearn.linear_model import LogisticRegression
# Baseline: fit a scikit-learn logistic regression (default settings) on the
# NumPy-side copies of the TF-IDF features and labels, and time the fit.
# NOTE(review): the memory line printed by end_timer_and_print comes from
# torch.cuda.max_memory_allocated(), so it presumably reflects tensors already
# placed on the GPU rather than this fit — confirm before comparing numbers.
start_timer()
reg = LogisticRegression().fit(x_train_np, y_train_np)
end_timer_and_print("Logistic regression: ")




Logistic regression: 
Total execution time = 0.365 sec
Max memory used by tensors = 2335263744 bytes


In [245]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Score the logistic-regression baseline on the test split.
y_pred_np = reg.predict(x_test_np)

logreg_f1 = f1_score(y_test_np, y_pred_np, average='macro')
logreg_accuracy = accuracy_score(y_test_np, y_pred_np)

print('F-score: ', logreg_f1)
print('Accuracy: ', logreg_accuracy)

F-score:  0.8685964220682922
Accuracy:  0.993


In [246]:
# Device string for the neural-network cells (the same value is already
# assigned to `device` in an earlier cell).
device="cuda"

In [247]:
def prepare_batches(X, Y, batch_size, drop_last=True):
  """Split X and Y into aligned lists of consecutive mini-batches.

  Each X batch is the slice ``X[i:i+batch_size]``; each Y batch is the
  matching slice reshaped to a column with ``reshape(-1, 1)``.

  Args:
    X: Sliceable sequence of inputs that supports ``len()``.
    Y: Sliceable labels aligned with X; slices must support ``reshape``.
    batch_size: Number of samples per batch; must be positive.
    drop_last: If True (default), a trailing batch with fewer than
      ``batch_size`` samples is discarded so all batches have equal size.
      A full final batch is always kept. If False, the short trailing
      batch is returned as well.

  Returns:
    Tuple ``(data_X, data_Y)`` of two lists with one entry per batch.

  Raises:
    ValueError: If ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")
  n_samples = len(X)
  data_X = []
  data_Y = []
  for i in range(0, n_samples, batch_size):
    # Only an incomplete trailing batch is dropped, never a full one.
    if drop_last and i + batch_size > n_samples:
      break
    data_X.append(X[i:i+batch_size])
    data_Y.append(Y[i:i+batch_size].reshape(-1,1))
  return data_X, data_Y

In [248]:
# Width used for the hidden nn.Linear layers of the networks defined below.
size = 512
# Number of iterations of the training loops below; each iteration runs one
# forward/backward pass over x_train.
epochs = 150



In [249]:
from torch import nn
from torch import optim
# Default (full-precision) model: an MLP over the TF-IDF features that ends in
# log-probabilities over the label classes.
n_features = x_train.shape[1]
n_classes = data_train["fraudulent"].nunique()
n_hidden_layers = 7  # number of size->size Linear+ReLU pairs after the input layer

layers = [nn.Linear(n_features, size), nn.ReLU()]
for _layer_idx in range(n_hidden_layers):
    layers += [nn.Linear(size, size), nn.ReLU()]
layers += [nn.Linear(size, n_classes), nn.LogSoftmax(dim=1)]
model = nn.Sequential(*layers)
model.to(device)

# The network outputs log-probabilities, so negative log-likelihood is the loss.
criterion = nn.NLLLoss()

# Untimed forward/backward pass that runs before the timer starts. The
# gradients it leaves behind are cleared by the first optimizer.zero_grad().
logps = model(x_train)
loss = criterion(logps, y_train)
loss.backward()

optimizer = optim.Adam(model.parameters(), lr=0.002)

train_losses = []
test_losses = []
test_accuracies = []
start_timer()
for e in range(epochs):
    optimizer.zero_grad()

    # One full-batch training step over the whole training set.
    output = model(x_train)
    loss = criterion(output, y_train)
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)

    optimizer.step()

    # Evaluate on the dev split without tracking gradients. The values are
    # reported under the "Test" labels below, and are stored as Python floats
    # so the history lists do not hold on to device tensors.
    model.eval()
    with torch.no_grad():
        log_ps = model(x_dev)
        test_loss = criterion(log_ps, y_dev).item()
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_dev.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float()).item()
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e + 1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")
# This cell trains without autocast/GradScaler, so label the timing accordingly.
end_timer_and_print("Default precision:")



Epoch: 1/150..  Training Loss: 0.666..  Test Loss: 0.580..  Test Accuracy: 0.983
Epoch: 2/150..  Training Loss: 0.581..  Test Loss: 0.454..  Test Accuracy: 0.983
Epoch: 3/150..  Training Loss: 0.455..  Test Loss: 0.191..  Test Accuracy: 0.983
Epoch: 4/150..  Training Loss: 0.195..  Test Loss: 0.103..  Test Accuracy: 0.983
Epoch: 5/150..  Training Loss: 0.115..  Test Loss: 0.177..  Test Accuracy: 0.983
Epoch: 6/150..  Training Loss: 0.193..  Test Loss: 0.166..  Test Accuracy: 0.983
Epoch: 7/150..  Training Loss: 0.178..  Test Loss: 0.122..  Test Accuracy: 0.983
Epoch: 8/150..  Training Loss: 0.131..  Test Loss: 0.085..  Test Accuracy: 0.983
Epoch: 9/150..  Training Loss: 0.093..  Test Loss: 0.072..  Test Accuracy: 0.983
Epoch: 10/150..  Training Loss: 0.079..  Test Loss: 0.091..  Test Accuracy: 0.983
Epoch: 11/150..  Training Loss: 0.096..  Test Loss: 0.098..  Test Accuracy: 0.983
Epoch: 12/150..  Training Loss: 0.103..  Test Loss: 0.081..  Test Accuracy: 0.983
Epoch: 13/150..  Training

In [250]:
# Default model
# Score the trained network on the test split.
model.eval()
with torch.no_grad():  # inference only: no autograd graph is needed
    output = model(x_test)
    ps = torch.exp(output)
    top_p, top_class = ps.topk(1, dim=1)

# top_class has one column holding the predicted class index per row.
predictions = top_class.cpu().numpy()
y_pred = predictions.ravel().tolist()
# Copy all labels to the host in one call instead of one element at a time.
y_true = y_test.tolist()
print('F-score: ', f1_score(y_true, y_pred, average='macro'))

print('Accuracy: ', accuracy_score(y_true, y_pred))

F-score:  0.9845942906441127
Accuracy:  0.999


In [251]:
# Mixed precision model
# Same architecture and optimizer settings as the default model, but the
# training forward pass runs under autocast and the loss is scaled by a
# GradScaler before backpropagation.
use_amp = True

n_features = x_train.shape[1]
n_classes = data_train["fraudulent"].nunique()
n_hidden_layers = 7  # number of size->size Linear+ReLU pairs after the input layer

layers = [nn.Linear(n_features, size), nn.ReLU()]
for _layer_idx in range(n_hidden_layers):
    layers += [nn.Linear(size, size), nn.ReLU()]
layers += [nn.Linear(size, n_classes), nn.LogSoftmax(dim=1)]
model = nn.Sequential(*layers)
model.to(device)

# The network outputs log-probabilities, so negative log-likelihood is the loss.
criterion = nn.NLLLoss()

# Untimed forward/backward pass that runs before the timer starts. The
# gradients it leaves behind are cleared by the first optimizer.zero_grad().
logps = model(x_train)
loss = criterion(logps, y_train)
loss.backward()

optimizer = optim.Adam(model.parameters(), lr=0.002)

train_losses = []
test_losses = []
test_accuracies = []
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
start_timer()
for e in range(epochs):
    optimizer.zero_grad()

    # Forward pass and loss under autocast; backward runs on the scaled loss.
    with torch.cuda.amp.autocast(enabled=use_amp):
        output = model(x_train)
        loss = criterion(output, y_train)
    scaler.scale(loss).backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    # scaler.step() calls optimizer.step(); update() adjusts the scale factor
    # for the next iteration.
    scaler.step(optimizer)
    scaler.update()

    # Evaluate on the dev split without tracking gradients. The values are
    # reported under the "Test" labels below, and are stored as Python floats
    # so the history lists do not hold on to device tensors.
    model.eval()
    with torch.no_grad():
        log_ps = model(x_dev)
        test_loss = criterion(log_ps, y_dev).item()
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_dev.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float()).item()
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e + 1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")
end_timer_and_print("Mixed precision:")



Epoch: 1/150..  Training Loss: 0.729..  Test Loss: 0.643..  Test Accuracy: 0.983
Epoch: 2/150..  Training Loss: 0.644..  Test Loss: 0.518..  Test Accuracy: 0.983
Epoch: 3/150..  Training Loss: 0.519..  Test Loss: 0.245..  Test Accuracy: 0.983
Epoch: 4/150..  Training Loss: 0.249..  Test Loss: 0.087..  Test Accuracy: 0.983
Epoch: 5/150..  Training Loss: 0.098..  Test Loss: 0.171..  Test Accuracy: 0.983
Epoch: 6/150..  Training Loss: 0.187..  Test Loss: 0.178..  Test Accuracy: 0.983
Epoch: 7/150..  Training Loss: 0.191..  Test Loss: 0.135..  Test Accuracy: 0.983
Epoch: 8/150..  Training Loss: 0.145..  Test Loss: 0.093..  Test Accuracy: 0.983
Epoch: 9/150..  Training Loss: 0.101..  Test Loss: 0.070..  Test Accuracy: 0.983
Epoch: 10/150..  Training Loss: 0.077..  Test Loss: 0.088..  Test Accuracy: 0.983
Epoch: 11/150..  Training Loss: 0.093..  Test Loss: 0.100..  Test Accuracy: 0.983
Epoch: 12/150..  Training Loss: 0.104..  Test Loss: 0.080..  Test Accuracy: 0.983
Epoch: 13/150..  Training

In [252]:
# Mixed precision model
# Score the network trained with mixed precision on the test split.
model.eval()
with torch.no_grad():  # inference only: no autograd graph is needed
    output = model(x_test)
    ps = torch.exp(output)
    top_p, top_class = ps.topk(1, dim=1)

# top_class has one column holding the predicted class index per row.
predictions = top_class.cpu().numpy()
y_pred = predictions.ravel().tolist()
# Copy all labels to the host in one call instead of one element at a time.
y_true = y_test.tolist()
print('F-score: ', f1_score(y_true, y_pred, average='macro'))

print('Accuracy: ', accuracy_score(y_true, y_pred))

F-score:  0.9845942906441127
Accuracy:  0.999
