added example evaluation
This commit is contained in:
parent 0630a56ff2
commit 4378c0ce28
.gitignore (vendored): 3 changes
@@ -11,4 +11,5 @@ venv
 .~lock.fake_job_postings.csv#
 .idea
 model_resutls.txt
 model
+metrics.txt
deepl.py: 31 changes
@@ -24,6 +24,7 @@ if __name__ == "__main__":
     print(type(sys.argv[1]))
     print(sys.argv[1])
     epochs = int(sys.argv[1])
+    # epochs=10

     # kaggle.api.authenticate()
     # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
@@ -33,18 +34,23 @@ if __name__ == "__main__":
     # data = data.replace(np.nan, '', regex=True)
     data = data[["company_profile", "fraudulent"]]
     data = data.dropna()
+    company_profile = data["company_profile"]

-    data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
-    data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+    # data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
+    # data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+    data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna()
+    data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna()
+    data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna()

-    x_train = data_train["company_profile"]
-    x_dev = data_dev["company_profile"]
-    x_test = data_test["company_profile"]
+    x_train = data_train[5]
+    x_dev = data_dev[5]
+    x_test = data_test[5]

-    y_train = data_train["fraudulent"]
-    y_dev = data_dev["fraudulent"]
-    y_test = data_test["fraudulent"]
+    y_train = data_train[17]
+    y_dev = data_dev[17]
+    y_test = data_test[17]

+    company_profile = np.array(company_profile)
     x_train = np.array(x_train)
     x_dev = np.array(x_dev)
     x_test = np.array(x_test)
@@ -55,7 +61,8 @@ if __name__ == "__main__":

     vectorizer = TfidfVectorizer()

-    x_train = vectorizer.fit_transform(x_train)
+    company_profile = vectorizer.fit_transform(company_profile)
+    x_train = vectorizer.transform(x_train)
     x_dev = vectorizer.transform(x_dev)
     x_test = vectorizer.transform(x_test)

@@ -72,7 +79,7 @@ if __name__ == "__main__":
     model = nn.Sequential(
         nn.Linear(x_train.shape[1], 64),
         nn.ReLU(),
-        nn.Linear(64, data_train["fraudulent"].nunique()),
+        nn.Linear(64, data_train[17].nunique()),
         nn.LogSoftmax(dim=1))

     # Define the loss
@@ -122,10 +129,12 @@ if __name__ == "__main__":

     FP = []
     FN = []
+    model.eval()
+    print(x_test.size())
     log_ps = model(x_test)
     ps = torch.exp(log_ps)
     top_p, top_class = ps.topk(1, dim=1)
-    descr = np.array(data_test["company_profile"])
+    descr = np.array(data_test[5])
     for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
         d = descr[i]
         if x == y:
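The substance of the deepl.py change: the TF-IDF vocabulary is now fit on the whole company_profile column, and the train/dev/test splits are read back from CSV artifacts with positional columns (5 for company_profile, 17 for fraudulent) instead of being split in place. A minimal sketch of that pattern, assuming the file names and column positions shown in the diff (they depend on how the create-dataset job writes the CSVs):

    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Fit the vocabulary on every non-empty company_profile, not just the train split,
    # so this script and evaluation.py can rebuild the same feature space independently.
    data = pd.read_csv("fake_job_postings.csv", engine="python")
    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.array(data["company_profile"].dropna()))

    # The splits arrive as header-less CSVs from the create-dataset job;
    # column 5 is assumed to hold company_profile and column 17 fraudulent.
    data_train = pd.read_csv("data_train.csv", engine="python", header=None).dropna()
    x_train = vectorizer.transform(np.array(data_train[5]))
    y_train = np.array(data_train[17])

Fitting on the full column keeps deepl.py and evaluation.py aligned on one vocabulary, at the cost of letting dev and test text influence the IDF weights.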
evaluation.Jenkinsfile: 38 lines (new file)
@@ -0,0 +1,38 @@
+pipeline {
+    agent {
+        dockerfile true
+    }
+    parameters {
+        string (
+            defaultValue: '10',
+            description: 'Epochs number',
+            name: 'EPOCH',
+            trim: false
+        )
+    }
+    stages {
+        stage('checkout: Check out from version control') {
+            steps {
+                checkout([$class: 'GitSCM', branches: [[name: ' */master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]])
+            }
+        }
+        stage('bash script') {
+            steps {
+                withEnv(["EPOCH=${params.EPOCH}"]) {
+                    copyArtifacts filter: '*', projectName: 's444463-create-dataset'
+                    sh 'python3 ./evaluation.py'
+                    archiveArtifacts artifacts: "metrics.txt"
+                }
+            }
+        }
+    }
+    post {
+        success {
+            emailext body: "Model successfully evaluation", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+        }
+
+        failure {
+            emailext body: "evaluation failure", subject: "Model evaluation 444463", to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+        }
+    }
+}
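The pipeline exports the EPOCH parameter into the environment with withEnv, but the committed evaluation.py never reads it. If the value were ever needed there (for example to tag the metrics per build), a hypothetical way to pick it up would be:

    import os

    # Hypothetical: read the EPOCH value exported by the Jenkins withEnv block.
    # The committed evaluation.py does not do this; shown only to illustrate
    # how the parameter would reach the script.
    epochs = int(os.environ.get("EPOCH", "10"))
    print(f"Evaluating a model trained for {epochs} epochs")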
evaluation.py: 90 lines (new file)
@@ -0,0 +1,90 @@
+import torch
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import scipy
+
+if __name__ == "__main__":
+    data = pd.read_csv('fake_job_postings.csv', engine='python')
+    # data = data.replace(np.nan, '', regex=True)
+    company_profile = data["company_profile"]
+    company_profile = company_profile.dropna()
+    company_profile = np.array(company_profile)
+    vectorizer = TfidfVectorizer()
+
+    company_profile = vectorizer.fit_transform(company_profile)
+    model = torch.load('model')
+
+    data_test = pd.read_csv('data_test.csv', engine='python', header=None)
+    data_test = data_test.dropna()
+    x_test = data_test[5]
+    y_test = data_test[17]
+
+
+    x_test = np.array(x_test)
+
+    y_test = np.array(y_test)
+
+
+    x_test = vectorizer.transform(x_test)
+
+    x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()
+
+    y_test = torch.tensor(y_test)
+
+
+
+    TP = []
+    TF = []
+
+    FP = []
+    FN = []
+    # x_test = x_test.view(x_test.size(0), -1)
+
+    model = model.eval()
+    print(x_test.size())
+    log_ps = model(x_test)
+    ps = torch.exp(log_ps)
+    top_p, top_class = ps.topk(1, dim=1)
+    descr = np.array(data_test[5])
+    for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
+        d = descr[i]
+        if x == y:
+            if x:
+                TP.append(d)
+            else:
+                TF.append(d)
+        else:
+            if x:
+                FP.append(d)
+            else:
+                FN.append(d)
+    f_score = len(TP) / (len(TP) + 0.5 * (len(FP) + len(FN)))
+    accuracy = (len(TP) + len(TF)) / (len(TP) + len(TF) + len(FP) + len(FN))
+    precision = len(TP) / ( len(TP) + len(FP) )
+    recall = len(TP) / ( len(TP) + len(FN) )
+    print(f"F- score = {f_score}")
+    print(f"Accuracy = {accuracy}")
+    print(f"Precision = {precision}")
+    print(f"Recall = {recall}")
+    f = open("metrics.txt", "a")
+
+    f.write(f"F-SCORE = {f_score}\n")
+    f.write(f"Accuracy = {accuracy}\n")
+    f.write(f"Precision = {precision}\n")
+    f.write(f"Recall = {recall}\n")
+
+    # f.write(f"TP descriptions:")
+    # for i in TP:
+    # f.write(i+'\n')
+    # f.write(f"TF descriptions:")
+    # for i in TF:
+    # f.write(i+"\n")
+    # f.write(f"FP descriptions:")
+    # for i in FP:
+    # f.write(i+"\n")
+    # f.write(f"FN descriptions:")
+    # for i in FN:
+    # f.write(i+"\n")
+    # f.close()
+    a=1
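evaluation.py buckets each prediction into TP, TF (true negatives), FP, or FN and computes F1 = TP / (TP + 0.5 * (FP + FN)), accuracy, precision, and recall by hand before appending them to metrics.txt. A small cross-check that could be appended at the end of the script, assuming the top_class and y_test tensors built above; sklearn.metrics implements the same definitions:

    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    # Flatten predictions and labels; relies on top_class of shape (n, 1) and y_test
    # from the script above, so this only runs as a continuation of evaluation.py.
    y_true = np.array(y_test.view(*top_class.shape)).ravel()
    y_pred = np.array(top_class).ravel()

    # These should match the hand-computed values written to metrics.txt.
    print("F1        =", f1_score(y_true, y_pred))
    print("Accuracy  =", accuracy_score(y_true, y_pred))
    print("Precision =", precision_score(y_true, y_pred))
    print("Recall    =", recall_score(y_true, y_pred))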