added example evaluation

This commit is contained in:
Mikołaj Pokrywka 2022-05-06 21:05:15 +02:00
parent 0630a56ff2
commit 4378c0ce28
4 changed files with 150 additions and 12 deletions

3
.gitignore vendored
View File

@ -11,4 +11,5 @@ venv
.~lock.fake_job_postings.csv#
.idea
model_resutls.txt
model
model
metrics.txt

View File

@ -24,6 +24,7 @@ if __name__ == "__main__":
print(type(sys.argv[1]))
print(sys.argv[1])
epochs = int(sys.argv[1])
# epochs=10
# kaggle.api.authenticate()
# kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
@ -33,18 +34,23 @@ if __name__ == "__main__":
# data = data.replace(np.nan, '', regex=True)
data = data[["company_profile", "fraudulent"]]
data = data.dropna()
company_profile = data["company_profile"]
data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
# data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
# data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna()
data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna()
data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna()
x_train = data_train["company_profile"]
x_dev = data_dev["company_profile"]
x_test = data_test["company_profile"]
x_train = data_train[5]
x_dev = data_dev[5]
x_test = data_test[5]
y_train = data_train["fraudulent"]
y_dev = data_dev["fraudulent"]
y_test = data_test["fraudulent"]
y_train = data_train[17]
y_dev = data_dev[17]
y_test = data_test[17]
company_profile = np.array(company_profile)
x_train = np.array(x_train)
x_dev = np.array(x_dev)
x_test = np.array(x_test)
@ -55,7 +61,8 @@ if __name__ == "__main__":
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
company_profile = vectorizer.fit_transform(company_profile)
x_train = vectorizer.transform(x_train)
x_dev = vectorizer.transform(x_dev)
x_test = vectorizer.transform(x_test)
@ -72,7 +79,7 @@ if __name__ == "__main__":
model = nn.Sequential(
nn.Linear(x_train.shape[1], 64),
nn.ReLU(),
nn.Linear(64, data_train["fraudulent"].nunique()),
nn.Linear(64, data_train[17].nunique()),
nn.LogSoftmax(dim=1))
# Define the loss
@ -122,10 +129,12 @@ if __name__ == "__main__":
FP = []
FN = []
model.eval()
print(x_test.size())
log_ps = model(x_test)
ps = torch.exp(log_ps)
top_p, top_class = ps.topk(1, dim=1)
descr = np.array(data_test["company_profile"])
descr = np.array(data_test[5])
for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
d = descr[i]
if x == y:

38
evaluation.Jenkinsfile Normal file
View File

@ -0,0 +1,38 @@
// Declarative pipeline for the model-evaluation job: checks out the repo,
// pulls artifacts from the dataset job, runs evaluation.py, archives the
// resulting metrics.txt and e-mails the outcome.
pipeline {
    // Build the agent container from the Dockerfile in the checked-out sources.
    agent {
        dockerfile true
    }
    parameters {
        // Exposed to the build step below as the EPOCH environment variable.
        // NOTE(review): the script is invoked without arguments; confirm that
        // evaluation.py actually consumes EPOCH, otherwise this is unused.
        string (
            defaultValue: '10',
            description: 'Epochs number',
            name: 'EPOCH',
            trim: false
        )
    }
    stages {
        stage('checkout: Check out from version control') {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: ' */master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]])
            }
        }
        stage('bash script') {
            steps {
                withEnv(["EPOCH=${params.EPOCH}"]) {
                    // Fetch every artifact matching '*' from the dataset-creation job.
                    // NOTE(review): evaluation.py loads a file named 'model'; confirm
                    // it is among the artifacts copied here.
                    copyArtifacts filter: '*', projectName: 's444463-create-dataset'
                    sh 'python3 ./evaluation.py'
                    // metrics.txt is the file evaluation.py writes its scores to.
                    archiveArtifacts artifacts: "metrics.txt"
                }
            }
        }
    }
    // Notify the Teams channel mailbox about the build result.
    post {
        success {
            emailext body: "Model successfully evaluation", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
        }
        failure {
            emailext body: "evaluation failure", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
        }
    }
}

90
evaluation.py Normal file
View File

@ -0,0 +1,90 @@
import torch
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
def confusion_counts(predicted, actual):
    """Count the cells of a binary confusion matrix.

    Args:
        predicted: iterable of predicted class labels (truthy = positive).
        actual: iterable of ground-truth labels, aligned with ``predicted``.

    Returns:
        A ``(tp, tn, fp, fn)`` tuple of integer counts.
    """
    tp = tn = fp = fn = 0
    for pred, truth in zip(predicted, actual):
        if pred == truth:
            if pred:
                tp += 1
            else:
                tn += 1
        elif pred:
            fp += 1
        else:
            fn += 1
    return tp, tn, fp, fn


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(tp, tn, fp, fn):
    """Derive F-score, accuracy, precision and recall from confusion counts.

    A metric whose denominator is zero (e.g. precision when nothing was
    predicted positive) is reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    Returns:
        dict with the keys ``f_score``, ``accuracy``, ``precision``, ``recall``.
    """
    return {
        "f_score": _safe_ratio(tp, tp + 0.5 * (fp + fn)),
        "accuracy": _safe_ratio(tp + tn, tp + tn + fp + fn),
        "precision": _safe_ratio(tp, tp + fp),
        "recall": _safe_ratio(tp, tp + fn),
    }


if __name__ == "__main__":
    # Fit the TF-IDF vocabulary on every non-null company profile of the full
    # dataset. NOTE(review): this presumably has to mirror the fit done at
    # training time so the feature width matches the model input -- confirm.
    data = pd.read_csv('fake_job_postings.csv', engine='python')
    company_profile = np.array(data["company_profile"].dropna())
    vectorizer = TfidfVectorizer()
    vectorizer.fit(company_profile)

    # NOTE(review): torch.load unpickles the file; only load a 'model' file
    # that comes from a trusted source.
    model = torch.load('model')

    # The test split has no header row: column 5 is the text that gets
    # vectorized, column 17 is the label the predictions are compared against.
    data_test = pd.read_csv('data_test.csv', engine='python', header=None)
    data_test = data_test.dropna()
    x_test = vectorizer.transform(np.array(data_test[5]))
    x_test = torch.tensor(x_test.toarray()).float()
    y_test = torch.tensor(np.array(data_test[17]))

    model = model.eval()
    print(x_test.size())
    # Inference only: no gradients are needed.
    with torch.no_grad():
        log_ps = model(x_test)
    ps = torch.exp(log_ps)
    _, top_class = ps.topk(1, dim=1)

    tp, tn, fp, fn = confusion_counts(
        top_class.view(-1).tolist(),
        y_test.view(-1).tolist(),
    )
    metrics = compute_metrics(tp, tn, fp, fn)
    f_score = metrics["f_score"]
    accuracy = metrics["accuracy"]
    precision = metrics["precision"]
    recall = metrics["recall"]

    print(f"F- score = {f_score}")
    print(f"Accuracy = {accuracy}")
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")

    # Append so earlier results in metrics.txt are kept; the context manager
    # guarantees the file is flushed and closed.
    with open("metrics.txt", "a") as f:
        f.write(f"F-SCORE = {f_score}\n")
        f.write(f"Accuracy = {accuracy}\n")
        f.write(f"Precision = {precision}\n")
        f.write(f"Recall = {recall}\n")