added example evaluation
This commit is contained in:
parent 0630a56ff2
commit 4378c0ce28
.gitignore (vendored): 3 changes
@@ -11,4 +11,5 @@ venv
 .~lock.fake_job_postings.csv#
 .idea
 model_resutls.txt
 model
+metrics.txt
deepl.py: 31 changes
@@ -24,6 +24,7 @@ if __name__ == "__main__":
     print(type(sys.argv[1]))
     print(sys.argv[1])
     epochs = int(sys.argv[1])
+    # epochs=10

     # kaggle.api.authenticate()
     # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
@@ -33,18 +34,23 @@ if __name__ == "__main__":
     # data = data.replace(np.nan, '', regex=True)
     data = data[["company_profile", "fraudulent"]]
     data = data.dropna()
+    company_profile = data["company_profile"]

-    data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
-    data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+    # data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
+    # data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+    data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna()
+    data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna()
+    data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna()

-    x_train = data_train["company_profile"]
-    x_dev = data_dev["company_profile"]
-    x_test = data_test["company_profile"]
+    x_train = data_train[5]
+    x_dev = data_dev[5]
+    x_test = data_test[5]

-    y_train = data_train["fraudulent"]
-    y_dev = data_dev["fraudulent"]
-    y_test = data_test["fraudulent"]
+    y_train = data_train[17]
+    y_dev = data_dev[17]
+    y_test = data_test[17]

+    company_profile = np.array(company_profile)
     x_train = np.array(x_train)
     x_dev = np.array(x_dev)
     x_test = np.array(x_test)
@@ -55,7 +61,8 @@ if __name__ == "__main__":

     vectorizer = TfidfVectorizer()

-    x_train = vectorizer.fit_transform(x_train)
+    company_profile = vectorizer.fit_transform(company_profile)
+    x_train = vectorizer.transform(x_train)
     x_dev = vectorizer.transform(x_dev)
     x_test = vectorizer.transform(x_test)

@@ -72,7 +79,7 @@ if __name__ == "__main__":
     model = nn.Sequential(
         nn.Linear(x_train.shape[1], 64),
         nn.ReLU(),
-        nn.Linear(64, data_train["fraudulent"].nunique()),
+        nn.Linear(64, data_train[17].nunique()),
         nn.LogSoftmax(dim=1))

     # Define the loss
@@ -122,10 +129,12 @@ if __name__ == "__main__":

     FP = []
     FN = []
+    model.eval()
+    print(x_test.size())
     log_ps = model(x_test)
     ps = torch.exp(log_ps)
     top_p, top_class = ps.topk(1, dim=1)
-    descr = np.array(data_test["company_profile"])
+    descr = np.array(data_test[5])
     for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
         d = descr[i]
         if x == y:
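The substance of the deepl.py change: the TF-IDF vocabulary is now fit on the whole company_profile column, and the train/dev/test splits are read back from CSV artifacts with positional columns (5 for company_profile, 17 for fraudulent) instead of being split in place. A minimal sketch of that pattern, assuming the file names and column positions shown in the diff (they depend on how the create-dataset job writes the CSVs):

    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Fit the vocabulary on every non-empty company_profile, not just the train split,
    # so this script and evaluation.py can rebuild the same feature space independently.
    data = pd.read_csv("fake_job_postings.csv", engine="python")
    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.array(data["company_profile"].dropna()))

    # The splits arrive as header-less CSVs from the create-dataset job;
    # column 5 is assumed to hold company_profile and column 17 fraudulent.
    data_train = pd.read_csv("data_train.csv", engine="python", header=None).dropna()
    x_train = vectorizer.transform(np.array(data_train[5]))
    y_train = np.array(data_train[17])

Fitting on the full column keeps deepl.py and evaluation.py aligned on one vocabulary, at the cost of letting dev and test text influence the IDF weights.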
evaluation.Jenkinsfile: 38 lines (new file)
@@ -0,0 +1,38 @@
+pipeline {
+    agent {
+        dockerfile true
+    }
+    parameters {
+        string (
+            defaultValue: '10',
+            description: 'Epochs number',
+            name: 'EPOCH',
+            trim: false
+        )
+    }
+    stages {
+        stage('checkout: Check out from version control') {
+            steps {
+                checkout([$class: 'GitSCM', branches: [[name: ' */master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]])
+            }
+        }
+        stage('bash script') {
+            steps {
+                withEnv(["EPOCH=${params.EPOCH}"]) {
+                    copyArtifacts filter: '*', projectName: 's444463-create-dataset'
+                    sh 'python3 ./evaluation.py'
+                    archiveArtifacts artifacts: "metrics.txt"
+                }
+            }
+        }
+    }
+    post {
+        success {
+            emailext body: "Model successfully evaluation", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+        }
+
+        failure {
+            emailext body: "evaluation failure", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+        }
+    }
+}
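The pipeline exports the EPOCH parameter into the environment with withEnv, but the committed evaluation.py never reads it. If the value were ever needed there (for example to tag the metrics per build), a hypothetical way to pick it up would be:

    import os

    # Hypothetical: read the EPOCH value exported by the Jenkins withEnv block.
    # The committed evaluation.py does not do this; shown only to illustrate
    # how the parameter would reach the script.
    epochs = int(os.environ.get("EPOCH", "10"))
    print(f"Evaluating a model trained for {epochs} epochs")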
evaluation.py: 90 lines (new file)
@@ -0,0 +1,90 @@
+import torch
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import scipy
+
+if __name__ == "__main__":
+    data = pd.read_csv('fake_job_postings.csv', engine='python')
+    # data = data.replace(np.nan, '', regex=True)
+    company_profile = data["company_profile"]
+    company_profile = company_profile.dropna()
+    company_profile = np.array(company_profile)
+    vectorizer = TfidfVectorizer()
+
+    company_profile = vectorizer.fit_transform(company_profile)
+    model = torch.load('model')
+
+    data_test = pd.read_csv('data_test.csv', engine='python', header=None)
+    data_test = data_test.dropna()
+    x_test = data_test[5]
+    y_test = data_test[17]
+
+
+    x_test = np.array(x_test)
+
+    y_test = np.array(y_test)
+
+
+    x_test = vectorizer.transform(x_test)
+
+    x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()
+
+    y_test = torch.tensor(y_test)
+
+
+
+    TP = []
+    TF = []
+
+    FP = []
+    FN = []
+    # x_test = x_test.view(x_test.size(0), -1)
+
+    model = model.eval()
+    print(x_test.size())
+    log_ps = model(x_test)
+    ps = torch.exp(log_ps)
+    top_p, top_class = ps.topk(1, dim=1)
+    descr = np.array(data_test[5])
+    for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
+        d = descr[i]
+        if x == y:
+            if x:
+                TP.append(d)
+            else:
+                TF.append(d)
+        else:
+            if x:
+                FP.append(d)
+            else:
+                FN.append(d)
+    f_score = len(TP) / (len(TP) + 0.5 * (len(FP) + len(FN)))
+    accuracy = (len(TP) + len(TF)) / (len(TP) + len(TF) + len(FP) + len(FN))
+    precision = len(TP) / ( len(TP) + len(FP) )
+    recall = len(TP) / ( len(TP) + len(FN) )
+    print(f"F- score = {f_score}")
+    print(f"Accuracy = {accuracy}")
+    print(f"Precision = {precision}")
+    print(f"Recall = {recall}")
+    f = open("metrics.txt", "a")
+
+    f.write(f"F-SCORE = {f_score}\n")
+    f.write(f"Accuracy = {accuracy}\n")
+    f.write(f"Precision = {precision}\n")
+    f.write(f"Recall = {recall}\n")
+
+    # f.write(f"TP descriptions:")
+    # for i in TP:
+    # f.write(i+'\n')
+    # f.write(f"TF descriptions:")
+    # for i in TF:
+    # f.write(i+"\n")
+    # f.write(f"FP descriptions:")
+    # for i in FP:
+    # f.write(i+"\n")
+    # f.write(f"FN descriptions:")
+    # for i in FN:
+    # f.write(i+"\n")
+    # f.close()
+    a=1
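evaluation.py buckets each prediction into TP, TF (true negatives), FP, or FN and computes F1 = TP / (TP + 0.5 * (FP + FN)), accuracy, precision, and recall by hand before appending them to metrics.txt. A small cross-check that could be appended at the end of the script, assuming the top_class and y_test tensors built above; sklearn.metrics implements the same definitions:

    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    # Flatten predictions and labels; relies on top_class of shape (n, 1) and y_test
    # from the script above, so this only runs as a continuation of evaluation.py.
    y_true = np.array(y_test.view(*top_class.shape)).ravel()
    y_pred = np.array(top_class).ravel()

    # These should match the hand-computed values written to metrics.txt.
    print("F1        =", f1_score(y_true, y_pred))
    print("Accuracy  =", accuracy_score(y_true, y_pred))
    print("Precision =", precision_score(y_true, y_pred))
    print("Recall    =", recall_score(y_true, y_pred))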