From 4378c0ce28e998850ddabd563ea55e57e5effd96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pokrywka?= Date: Fri, 6 May 2022 21:05:15 +0200 Subject: [PATCH] added example evaluation --- .gitignore | 3 +- deepl.py | 31 +++++++++------ evaluation.Jenkinsfile | 38 ++++++++++++++++++ evaluation.py | 90 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+), 12 deletions(-) create mode 100644 evaluation.Jenkinsfile create mode 100644 evaluation.py diff --git a/.gitignore b/.gitignore index ef2e133..c89d897 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ venv .~lock.fake_job_postings.csv# .idea model_resutls.txt -model \ No newline at end of file +model +metrics.txt \ No newline at end of file diff --git a/deepl.py b/deepl.py index 10c1011..ccb09b3 100644 --- a/deepl.py +++ b/deepl.py @@ -24,6 +24,7 @@ if __name__ == "__main__": print(type(sys.argv[1])) print(sys.argv[1]) epochs = int(sys.argv[1]) + # epochs=10 # kaggle.api.authenticate() # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.', @@ -33,18 +34,23 @@ if __name__ == "__main__": # data = data.replace(np.nan, '', regex=True) data = data[["company_profile", "fraudulent"]] data = data.dropna() + company_profile = data["company_profile"] - data_train, data_test = train_test_split(data, test_size=3000, random_state=1) - data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1) + # data_train, data_test = train_test_split(data, test_size=3000, random_state=1) + # data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1) + data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna() + data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna() + data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna() - x_train = data_train["company_profile"] - x_dev = data_dev["company_profile"] - x_test = data_test["company_profile"] + x_train = data_train[5] + x_dev = data_dev[5] + x_test = data_test[5] - y_train = data_train["fraudulent"] - y_dev = data_dev["fraudulent"] - y_test = data_test["fraudulent"] + y_train = data_train[17] + y_dev = data_dev[17] + y_test = data_test[17] + company_profile = np.array(company_profile) x_train = np.array(x_train) x_dev = np.array(x_dev) x_test = np.array(x_test) @@ -55,7 +61,8 @@ if __name__ == "__main__": vectorizer = TfidfVectorizer() - x_train = vectorizer.fit_transform(x_train) + company_profile = vectorizer.fit_transform(company_profile) + x_train = vectorizer.transform(x_train) x_dev = vectorizer.transform(x_dev) x_test = vectorizer.transform(x_test) @@ -72,7 +79,7 @@ if __name__ == "__main__": model = nn.Sequential( nn.Linear(x_train.shape[1], 64), nn.ReLU(), - nn.Linear(64, data_train["fraudulent"].nunique()), + nn.Linear(64, data_train[17].nunique()), nn.LogSoftmax(dim=1)) # Define the loss @@ -122,10 +129,12 @@ if __name__ == "__main__": FP = [] FN = [] + model.eval() + print(x_test.size()) log_ps = model(x_test) ps = torch.exp(log_ps) top_p, top_class = ps.topk(1, dim=1) - descr = np.array(data_test["company_profile"]) + descr = np.array(data_test[5]) for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))): d = descr[i] if x == y: diff --git a/evaluation.Jenkinsfile b/evaluation.Jenkinsfile new file mode 100644 index 0000000..c1ffa89 --- /dev/null +++ b/evaluation.Jenkinsfile @@ -0,0 +1,38 @@ +pipeline { + agent { + dockerfile true + } + parameters { + string ( + defaultValue: '10', + description: 'Epochs number', + name: 'EPOCH', + trim: false + ) + } + stages { + stage('checkout: Check out from version control') { + steps { + checkout([$class: 'GitSCM', branches: [[name: ' */master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]]) + } + } + stage('bash script') { + steps { + withEnv(["EPOCH=${params.EPOCH}"]) { + copyArtifacts filter: '*', projectName: 's444463-create-dataset' + sh 'python3 ./evaluation.py' + archiveArtifacts artifacts: "metrics.txt" + } + } + } + } + post { + success { + emailext body: "Model successfully evaluation", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms" + } + + failure { + emailext body: "evaluation failure", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms" + } + } +} diff --git a/evaluation.py b/evaluation.py new file mode 100644 index 0000000..b1c3053 --- /dev/null +++ b/evaluation.py @@ -0,0 +1,90 @@ +import torch +import pandas as pd +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +import scipy + +if __name__ == "__main__": + data = pd.read_csv('fake_job_postings.csv', engine='python') + # data = data.replace(np.nan, '', regex=True) + company_profile = data["company_profile"] + company_profile = company_profile.dropna() + company_profile = np.array(company_profile) + vectorizer = TfidfVectorizer() + + company_profile = vectorizer.fit_transform(company_profile) + model = torch.load('model') + + data_test = pd.read_csv('data_test.csv', engine='python', header=None) + data_test = data_test.dropna() + x_test = data_test[5] + y_test = data_test[17] + + + x_test = np.array(x_test) + + y_test = np.array(y_test) + + + x_test = vectorizer.transform(x_test) + + x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float() + + y_test = torch.tensor(y_test) + + + + TP = [] + TF = [] + + FP = [] + FN = [] + # x_test = x_test.view(x_test.size(0), -1) + + model = model.eval() + print(x_test.size()) + log_ps = model(x_test) + ps = torch.exp(log_ps) + top_p, top_class = ps.topk(1, dim=1) + descr = np.array(data_test[5]) + for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))): + d = descr[i] + if x == y: + if x: + TP.append(d) + else: + TF.append(d) + else: + if x: + FP.append(d) + else: + FN.append(d) + f_score = len(TP) / (len(TP) + 0.5 * (len(FP) + len(FN))) + accuracy = (len(TP) + len(TF)) / (len(TP) + len(TF) + len(FP) + len(FN)) + precision = len(TP) / ( len(TP) + len(FP) ) + recall = len(TP) / ( len(TP) + len(FN) ) + print(f"F- score = {f_score}") + print(f"Accuracy = {accuracy}") + print(f"Precision = {precision}") + print(f"Recall = {recall}") + f = open("metrics.txt", "a") + + f.write(f"F-SCORE = {f_score}\n") + f.write(f"Accuracy = {accuracy}\n") + f.write(f"Precision = {precision}\n") + f.write(f"Recall = {recall}\n") + + # f.write(f"TP descriptions:") + # for i in TP: + # f.write(i+'\n') + # f.write(f"TF descriptions:") + # for i in TF: + # f.write(i+"\n") + # f.write(f"FP descriptions:") + # for i in FP: + # f.write(i+"\n") + # f.write(f"FN descriptions:") + # for i in FN: + # f.write(i+"\n") + # f.close() + a=1 \ No newline at end of file