added example evaluation
This commit is contained in:
parent
0630a56ff2
commit
4378c0ce28
3
.gitignore
vendored
3
.gitignore
vendored
@ -11,4 +11,5 @@ venv
|
||||
.~lock.fake_job_postings.csv#
|
||||
.idea
|
||||
model_resutls.txt
|
||||
model
|
||||
model
|
||||
metrics.txt
|
31
deepl.py
31
deepl.py
@ -24,6 +24,7 @@ if __name__ == "__main__":
|
||||
print(type(sys.argv[1]))
|
||||
print(sys.argv[1])
|
||||
epochs = int(sys.argv[1])
|
||||
# epochs=10
|
||||
|
||||
# kaggle.api.authenticate()
|
||||
# kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
|
||||
@ -33,18 +34,23 @@ if __name__ == "__main__":
|
||||
# data = data.replace(np.nan, '', regex=True)
|
||||
data = data[["company_profile", "fraudulent"]]
|
||||
data = data.dropna()
|
||||
company_profile = data["company_profile"]
|
||||
|
||||
data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
|
||||
data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
|
||||
# data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
|
||||
# data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
|
||||
data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna()
|
||||
data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna()
|
||||
data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna()
|
||||
|
||||
x_train = data_train["company_profile"]
|
||||
x_dev = data_dev["company_profile"]
|
||||
x_test = data_test["company_profile"]
|
||||
x_train = data_train[5]
|
||||
x_dev = data_dev[5]
|
||||
x_test = data_test[5]
|
||||
|
||||
y_train = data_train["fraudulent"]
|
||||
y_dev = data_dev["fraudulent"]
|
||||
y_test = data_test["fraudulent"]
|
||||
y_train = data_train[17]
|
||||
y_dev = data_dev[17]
|
||||
y_test = data_test[17]
|
||||
|
||||
company_profile = np.array(company_profile)
|
||||
x_train = np.array(x_train)
|
||||
x_dev = np.array(x_dev)
|
||||
x_test = np.array(x_test)
|
||||
@ -55,7 +61,8 @@ if __name__ == "__main__":
|
||||
|
||||
vectorizer = TfidfVectorizer()
|
||||
|
||||
x_train = vectorizer.fit_transform(x_train)
|
||||
company_profile = vectorizer.fit_transform(company_profile)
|
||||
x_train = vectorizer.transform(x_train)
|
||||
x_dev = vectorizer.transform(x_dev)
|
||||
x_test = vectorizer.transform(x_test)
|
||||
|
||||
@ -72,7 +79,7 @@ if __name__ == "__main__":
|
||||
model = nn.Sequential(
|
||||
nn.Linear(x_train.shape[1], 64),
|
||||
nn.ReLU(),
|
||||
nn.Linear(64, data_train["fraudulent"].nunique()),
|
||||
nn.Linear(64, data_train[17].nunique()),
|
||||
nn.LogSoftmax(dim=1))
|
||||
|
||||
# Define the loss
|
||||
@ -122,10 +129,12 @@ if __name__ == "__main__":
|
||||
|
||||
FP = []
|
||||
FN = []
|
||||
model.eval()
|
||||
print(x_test.size())
|
||||
log_ps = model(x_test)
|
||||
ps = torch.exp(log_ps)
|
||||
top_p, top_class = ps.topk(1, dim=1)
|
||||
descr = np.array(data_test["company_profile"])
|
||||
descr = np.array(data_test[5])
|
||||
for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
|
||||
d = descr[i]
|
||||
if x == y:
|
||||
|
38
evaluation.Jenkinsfile
Normal file
38
evaluation.Jenkinsfile
Normal file
@ -0,0 +1,38 @@
|
||||
// Jenkins declarative pipeline that evaluates a previously trained model and
// archives the resulting metrics file.
pipeline {
    // Build inside the container described by the repository's Dockerfile.
    agent { dockerfile true }

    parameters {
        // Exposed to the build as params.EPOCH and re-exported as the EPOCH
        // environment variable in the evaluation stage below.
        string(
            name: 'EPOCH',
            defaultValue: '10',
            description: 'Epochs number',
            trim: false
        )
    }

    stages {
        stage('checkout: Check out from version control') {
            steps {
                // NOTE(review): the branch spec below carries a leading space
                // (' */master'); kept verbatim — confirm it is intentional.
                checkout([
                    $class: 'GitSCM',
                    branches: [[name: ' */master']],
                    extensions: [],
                    userRemoteConfigs: [[
                        credentialsId: 's444463',
                        url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git'
                    ]]
                ])
            }
        }

        stage('bash script') {
            steps {
                withEnv(["EPOCH=${params.EPOCH}"]) {
                    // Pull every artifact published by the dataset-creation job
                    // into the workspace before running the evaluation script.
                    copyArtifacts(projectName: 's444463-create-dataset', filter: '*')
                    sh('python3 ./evaluation.py')
                    // Keep the metrics written by evaluation.py with the build.
                    archiveArtifacts(artifacts: "metrics.txt")
                }
            }
        }
    }

    post {
        // Notify the Teams channel mailbox about the outcome of the build.
        success {
            emailext(
                to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms",
                subject: "Model evaluation 444463",
                body: "Model successfully evaluation"
            )
        }

        failure {
            emailext(
                to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms",
                subject: "Model evaluation 444463",
                body: "evaluation failure"
            )
        }
    }
}
|
90
evaluation.py
Normal file
90
evaluation.py
Normal file
@ -0,0 +1,90 @@
|
||||
import torch
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import scipy
|
||||
|
||||
def compute_metrics(tp, tn, fp, fn):
    """Return F-score, accuracy, precision and recall for confusion counts.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        A dict with the keys ``"f_score"``, ``"accuracy"``, ``"precision"``
        and ``"recall"``. A metric whose denominator is zero (for example
        precision when nothing was predicted positive) is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    return {
        "f_score": ratio(tp, tp + 0.5 * (fp + fn)),
        "accuracy": ratio(tp + tn, tp + tn + fp + fn),
        "precision": ratio(tp, tp + fp),
        "recall": ratio(tp, tp + fn),
    }


def bucket_predictions(predicted, actual, descriptions):
    """Group descriptions by confusion-matrix cell.

    Args:
        predicted: Iterable of predicted class indices (truthy = positive).
        actual: Iterable of reference class indices, aligned with *predicted*.
        descriptions: Iterable of the texts that were classified, aligned
            with *predicted*.

    Returns:
        A dict with the keys ``"tp"``, ``"tn"``, ``"fp"`` and ``"fn"``, each
        mapping to the list of descriptions that fell into that cell.
    """
    buckets = {"tp": [], "tn": [], "fp": [], "fn": []}
    for guess, truth, text in zip(predicted, actual, descriptions):
        if guess == truth:
            key = "tp" if guess else "tn"
        else:
            key = "fp" if guess else "fn"
        buckets[key].append(text)
    return buckets


def main():
    """Evaluate the saved model on ``data_test.csv`` and record the metrics.

    Reads ``fake_job_postings.csv`` to fit the TF-IDF vocabulary, loads the
    serialized model from ``model``, classifies the test rows, prints the
    metrics and appends them to ``metrics.txt``.
    """
    # The vectorizer is re-fitted on every non-null company profile of the
    # full dataset; only the fitted state is needed, not the transformed
    # matrix.
    corpus = pd.read_csv('fake_job_postings.csv', engine='python')
    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.array(corpus["company_profile"].dropna()))

    # NOTE(review): torch.load unpickles arbitrary objects — only load a
    # model file produced by a trusted job. Recent PyTorch releases may also
    # require an explicit weights_only=False to load a whole pickled module;
    # confirm against the installed version.
    model = torch.load('model')
    model.eval()

    # The test split has no header row; column 5 is fed to the vectorizer as
    # text and column 17 is used as the class label.
    data_test = pd.read_csv('data_test.csv', engine='python', header=None)
    data_test = data_test.dropna()
    descriptions = data_test[5].tolist()
    actual = data_test[17].astype(int).tolist()

    features = vectorizer.transform(descriptions).toarray()
    x_test = torch.tensor(features).float()
    print(x_test.size())

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        log_ps = model(x_test)
    _, top_class = torch.exp(log_ps).topk(1, dim=1)
    predicted = top_class.view(-1).tolist()

    buckets = bucket_predictions(predicted, actual, descriptions)
    metrics = compute_metrics(
        len(buckets["tp"]),
        len(buckets["tn"]),
        len(buckets["fp"]),
        len(buckets["fn"]),
    )
    f_score = metrics["f_score"]
    accuracy = metrics["accuracy"]
    precision = metrics["precision"]
    recall = metrics["recall"]

    print(f"F- score = {f_score}")
    print(f"Accuracy = {accuracy}")
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")

    # Append mode keeps the results of earlier runs; the context manager
    # guarantees the file is flushed and closed.
    with open("metrics.txt", "a") as f:
        f.write(f"F-SCORE = {f_score}\n")
        f.write(f"Accuracy = {accuracy}\n")
        f.write(f"Precision = {precision}\n")
        f.write(f"Recall = {recall}\n")


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user