added example evaluation

2022-05-06 21:05:15 +02:00 · 2022-05-06 21:05:15 +02:00 · 4378c0ce28
commit 4378c0ce28
parent 0630a56ff2
4 changed files with 150 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@ -11,4 +11,5 @@ venv
 .~lock.fake_job_postings.csv#
 .idea
 model_resutls.txt
-model
+model
+metrics.txt
--- a/deepl.py
+++ b/deepl.py
@ -24,6 +24,7 @@ if __name__ == "__main__":
    print(type(sys.argv[1]))
    print(sys.argv[1])
    epochs = int(sys.argv[1])
+    # epochs=10

    # kaggle.api.authenticate()
    # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
@ -33,18 +34,23 @@ if __name__ == "__main__":
    # data = data.replace(np.nan, '', regex=True)
    data = data[["company_profile", "fraudulent"]]
    data = data.dropna()
+    company_profile = data["company_profile"]

-    data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
-    data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+    # data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
+    # data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
+    data_train = pd.read_csv('data_train.csv', engine='python', header=None).dropna()
+    data_dev = pd.read_csv('data_dev.csv', engine='python', header=None).dropna()
+    data_test = pd.read_csv('data_test.csv', engine='python', header=None).dropna()

-    x_train = data_train["company_profile"]
-    x_dev = data_dev["company_profile"]
-    x_test = data_test["company_profile"]
+    x_train = data_train[5]
+    x_dev = data_dev[5]
+    x_test = data_test[5]

-    y_train = data_train["fraudulent"]
-    y_dev = data_dev["fraudulent"]
-    y_test = data_test["fraudulent"]
+    y_train = data_train[17]
+    y_dev = data_dev[17]
+    y_test = data_test[17]

+    company_profile = np.array(company_profile)
    x_train = np.array(x_train)
    x_dev = np.array(x_dev)
    x_test = np.array(x_test)
@ -55,7 +61,8 @@ if __name__ == "__main__":

    vectorizer = TfidfVectorizer()

-    x_train = vectorizer.fit_transform(x_train)
+    company_profile = vectorizer.fit_transform(company_profile)
+    x_train = vectorizer.transform(x_train)
    x_dev = vectorizer.transform(x_dev)
    x_test = vectorizer.transform(x_test)

@ -72,7 +79,7 @@ if __name__ == "__main__":
    model = nn.Sequential(
        nn.Linear(x_train.shape[1], 64),
        nn.ReLU(),
-        nn.Linear(64, data_train["fraudulent"].nunique()),
+        nn.Linear(64, data_train[17].nunique()),
        nn.LogSoftmax(dim=1))

    # Define the loss
@ -122,10 +129,12 @@ if __name__ == "__main__":

    FP = []
    FN = []
+    model.eval()
+    print(x_test.size())
    log_ps = model(x_test)
    ps = torch.exp(log_ps)
    top_p, top_class = ps.topk(1, dim=1)
-    descr = np.array(data_test["company_profile"])
+    descr = np.array(data_test[5])
    for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
        d = descr[i]
        if x == y:
--- a/evaluation.Jenkinsfile
+++ b/evaluation.Jenkinsfile
@ -0,0 +1,38 @@
+// Declarative pipeline for the evaluation job: checks out the repository,
+// copies artifacts from the dataset job, runs evaluation.py and archives
+// the metrics.txt it produces. A notification e-mail is sent either way.
+pipeline {
+    agent {
+        dockerfile true
+    }
+    parameters {
+        // Forwarded to the evaluation stage as the EPOCH environment variable.
+        string(
+            name: 'EPOCH',
+            defaultValue: '10',
+            description: 'Epochs number',
+            trim: false
+        )
+    }
+    stages {
+        stage('checkout: Check out from version control') {
+            steps {
+                checkout([
+                    $class: 'GitSCM',
+                    branches: [[name: ' */master']],
+                    extensions: [],
+                    userRemoteConfigs: [[
+                        credentialsId: 's444463',
+                        url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git'
+                    ]]
+                ])
+            }
+        }
+        stage('bash script') {
+            steps {
+                withEnv(["EPOCH=${params.EPOCH}"]) {
+                    // NOTE(review): evaluation.py also loads a file named
+                    // 'model'; confirm the copied artifacts include it.
+                    copyArtifacts(filter: '*', projectName: 's444463-create-dataset')
+                    sh 'python3 ./evaluation.py'
+                    archiveArtifacts(artifacts: "metrics.txt")
+                }
+            }
+        }
+    }
+    post {
+        success {
+            emailext(
+                body: "Model successfully evaluation",
+                subject: "Model evaluation 444463",
+                to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+            )
+        }
+
+        failure {
+            emailext(
+                body: "evaluation failure",
+                subject: "Model evaluation 444463",
+                to: "e19191c5.uam.onmicrosoft.com@emea.teams.ms"
+            )
+        }
+    }
+}
--- a/evaluation.py
+++ b/evaluation.py
@ -0,0 +1,90 @@
+"""Evaluate the saved model on the held-out test split.
+
+Reads ``fake_job_postings.csv`` (to rebuild the TF-IDF vocabulary),
+``data_test.csv`` and the serialized ``model`` from the working directory,
+prints F-score, accuracy, precision and recall for the positive class, and
+appends the same four values to ``metrics.txt``.
+"""
+import torch
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import scipy
+
+
+def _ratio(numerator, denominator):
+    """Return ``numerator / denominator``, or 0.0 when the denominator is 0.
+
+    Keeps the script from crashing with ZeroDivisionError when, for example,
+    the model predicts no positive sample at all.
+    """
+    return numerator / denominator if denominator else 0.0
+
+
+if __name__ == "__main__":
+    # Fit the vectorizer on every non-missing company profile of the full
+    # dataset. NOTE(review): this presumably has to mirror how the training
+    # script fitted its vectorizer, otherwise the feature width will not
+    # match the loaded model's input layer - confirm.
+    data = pd.read_csv('fake_job_postings.csv', engine='python')
+    company_profile = np.array(data["company_profile"].dropna())
+    vectorizer = TfidfVectorizer()
+    vectorizer.fit(company_profile)
+
+    # NOTE(review): torch.load unpickles the file, so only load a trusted
+    # artifact here.
+    model = torch.load('model')
+
+    # data_test.csv is read without a header row; column 5 is used as the
+    # input text and column 17 as the label.
+    data_test = pd.read_csv('data_test.csv', engine='python', header=None)
+    data_test = data_test.dropna()
+
+    x_test = vectorizer.transform(np.array(data_test[5]))
+    x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()
+    y_test = torch.tensor(np.array(data_test[17]))
+
+    model = model.eval()
+    print(x_test.size())
+    # Pure inference: no gradients are needed.
+    with torch.no_grad():
+        log_ps = model(x_test)
+    _, top_class = torch.exp(log_ps).topk(1, dim=1)
+
+    predicted = top_class.view(-1).numpy()
+    actual = y_test.view(-1).numpy()
+    matches = predicted == actual
+    positive = predicted.astype(bool)
+
+    # Confusion-matrix counts; a sample is "positive" when the predicted
+    # class index is non-zero.
+    tp = int(np.sum(matches & positive))
+    tn = int(np.sum(matches & ~positive))
+    fp = int(np.sum(~matches & positive))
+    fn = int(np.sum(~matches & ~positive))
+
+    f_score = _ratio(tp, tp + 0.5 * (fp + fn))
+    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
+    precision = _ratio(tp, tp + fp)
+    recall = _ratio(tp, tp + fn)
+
+    print(f"F- score = {f_score}")
+    print(f"Accuracy = {accuracy}")
+    print(f"Precision = {precision}")
+    print(f"Recall = {recall}")
+
+    # Append mode keeps results of earlier runs in the same file; the
+    # context manager guarantees the file is flushed and closed.
+    with open("metrics.txt", "a") as f:
+        f.write(f"F-SCORE = {f_score}\n")
+        f.write(f"Accuracy = {accuracy}\n")
+        f.write(f"Precision = {precision}\n")
+        f.write(f"Recall = {recall}\n")