diff --git a/Dockerfile b/Dockerfile
index 7f68e74..8af5af7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,4 +12,10 @@ RUN apt-get install zip unzip --yes
 WORKDIR /app
 COPY ./deepl.py .
+
+COPY ./stare_zadania/process_data.sh .
+COPY ./stare_zadania/download_data_and_process.py .
+COPY ./stats.py .
+COPY ./stare_zadania/real-or-fake-fake-jobposting-prediction.zip .
+
 
 CMD python3 deepl.py
diff --git a/Jenkinsfile b/Jenkinsfile
index 789ccc7..111f343 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -6,20 +6,9 @@ pipeline {
     }
     parameters {
         string (
-            defaultValue: 'mikolajpokrywka',
-            description: 'Kaggle username',
-            name: 'KAGGLE_USERNAME',
-            trim: false
-        )
-        password(
-            defaultValue: '',
-            description: 'Kaggle token',
-            name: 'KAGGLE_KEY'
-        )
-        string (
-            defaultValue: '17000',
-            description: 'cut data',
-            name: 'CUTOFF',
+            defaultValue: '10',
+            description: 'Number of epochs',
+            name: 'EPOCH',
             trim: false
         )
     }
@@ -31,11 +20,10 @@ pipeline {
         }
         stage('bash script') {
             steps {
-                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-                    "KAGGLE_KEY=${params.KAGGLE_KEY}",
-                    "CUTOFF=${params.CUTOFF}"]) {
-                    sh 'python3 ./download_data_and_process.py'
-                    archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv"
+                withEnv(["EPOCH=${params.EPOCH}"]) {
+                    copyArtifacts filter: '*', projectName: 's444463-create-dataset'
+                    sh 'python3 ./deepl.py $EPOCH'
+                    archiveArtifacts artifacts: "model"
                 }
             }
         }
diff --git a/deepl.py b/deepl.py
index b5d3fb2..c908806 100644
--- a/deepl.py
+++ b/deepl.py
@@ -9,6 +9,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from torch import nn
 from torch import optim
 import matplotlib.pyplot as plt
+import sys
+
 
 
 def convert_text_to_model_form(text):
@@ -18,9 +20,11 @@ def convert_text_to_model_form(text):
 
 
 if __name__ == "__main__":
-    kaggle.api.authenticate()
-    kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
-                                      unzip=True)
+    epochs = int(sys.argv[1])
+
+    # kaggle.api.authenticate()
+    # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
+    #                                   unzip=True)
 
     data = pd.read_csv('fake_job_postings.csv', engine='python')
     # data = data.replace(np.nan, '', regex=True)
@@ -79,7 +83,6 @@ if __name__ == "__main__":
     test_losses = []
     test_accuracies = []
 
-    epochs = 50
 
     for e in range(epochs):
         optimizer.zero_grad()
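
Note on the EPOCH wiring above: the Jenkins EPOCH parameter is exported as an environment variable, passed to deepl.py as its first positional argument, and read there with int(sys.argv[1]), which crashes if the argument is missing or malformed. A minimal sketch of a more defensive version of that parsing; the parse_epochs helper and the fallback default of 10 (mirroring the Jenkins parameter default) are assumptions, not part of the repo:

import sys

def parse_epochs(argv, default=10):
    # Hypothetical helper: fall back to the default when the epoch argument
    # is missing or not an integer, so a misconfigured pipeline run still
    # trains instead of raising IndexError/ValueError.
    if len(argv) > 1:
        try:
            return int(argv[1])
        except ValueError:
            pass
    return default

epochs = parse_epochs(sys.argv)  # would replace the bare int(sys.argv[1])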