This commit is contained in:
Mikołaj Pokrywka 2022-05-03 15:55:42 +02:00
parent ac8d2b740d
commit 40617e67f9
8 changed files with 4 additions and 19489 deletions

View File

@ -17,4 +17,5 @@ COPY ./download_data_and_process.py .
COPY ./stats.py .
COPY ./real-or-fake-fake-jobposting-prediction.zip .
RUN chmod +x process_data.sh
CMD python3 download_data_and_process.py
RUN ls
# CMD python3 download_data_and_process.py

4
Jenkinsfile vendored
View File

@ -34,8 +34,8 @@ pipeline {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}",
"CUTOFF=${params.CUTOFF}"]) {
sh 'python3 ./download_data_and_process.py'
archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv"
sh './process_data.sh'
archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, fake_job_postings.csv"
}
}
}

View File

@ -1,30 +0,0 @@
// Statistics pipeline: fetches the dataset artifacts produced by another job,
// runs stats.py on them and archives the resulting stats.txt.
pipeline {
    // All stages run inside this Docker image.
    agent {
        docker {
            image 'mikolajpokrywka/ium:0.0.0'
        }
    }

    parameters {
        // Lets the user choose which upstream build the artifacts are copied from;
        // defaults to the last successful one.
        buildSelector(
            name: 'BUILD_SELECTOR',
            description: 'Which build to use for copying artifacts',
            defaultSelector: lastSuccessful()
        )
    }

    stages {
        stage('checkout: Check out from version control') {
            steps {
                // Check out the master branch of the course repository.
                checkout([
                    $class: 'GitSCM',
                    branches: [[name: '*/master']],
                    extensions: [],
                    userRemoteConfigs: [[
                        credentialsId: 's444463',
                        url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git'
                    ]]
                ])
            }
        }

        stage('Copy artifacts') {
            steps {
                // Pull the files archived by the 's444463-create-dataset' job,
                // using the build picked through BUILD_SELECTOR.
                copyArtifacts(
                    fingerprintArtifacts: true,
                    projectName: 's444463-create-dataset',
                    selector: buildParameter('BUILD_SELECTOR')
                )
            }
        }

        stage('Calcualte stats') {
            steps {
                // stats.py is run in the workspace; stats.txt is then archived.
                sh 'python3 stats.py'
                archiveArtifacts artifacts: 'stats.txt'
            }
        }
    }
}

View File

@ -1,4 +0,0 @@
# Run with Docker
`docker build -t ium .`
`docker run -e KAGGLE_USERNAME='<your_kaggle_username>' -e KAGGLE_KEY='<your_kaggle_key>' -e CUTOFF='1600' -it ium:latest`

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,33 +0,0 @@
import subprocess
import pandas as pd
import numpy as np
import os
path = ''
# Build data.csv by concatenating the listed files in order. column_titles.csv
# comes first, so its content ends up at the top of data.csv and is what
# pd.read_csv below picks up as the header row.
all_files = ['column_titles.csv', 'data_train.csv', 'data_dev.csv', 'data_test.csv']
with open("data.csv", "w") as data_file:
    for name in all_files:
        # Context managers guarantee the handles are closed even if a read fails.
        with open(name, "r") as f:
            chunk = f.read()
        data_file.write(chunk)
        # If a part does not end with a newline, its last row would be fused
        # with the first row of the next part; terminate it explicitly.
        if chunk and not chunk.endswith("\n"):
            data_file.write("\n")

data = pd.read_csv('data.csv')
# Replace missing values with empty strings (columns containing NaN become object dtype).
data = data.replace(np.nan, '', regex=True)

print("="*20)
print('Ilość wierszy w zbiorze: ',len(data))

print("="*10, ' data["department"].value_counts() ', 10*'=')
print(data["department"].value_counts())

print("="*10, ' data.median() ', 10*'=')
# numeric_only=True: the frame holds text columns, and DataFrame.median()
# raises TypeError on them in pandas >= 2.0 (older versions silently dropped
# them with a deprecation warning). Restricting to numeric columns keeps the
# historical output and works on current pandas.
print(data.median(numeric_only=True))

print("="*10, ' data.describe(include="all") ', 10*'=')
# Compute the summary once and reuse it for both the console and the file.
summary = data.describe(include='all')
print(summary)
# NOTE(review): mode='a' appends, so stats.txt grows if the workspace is reused
# between runs — confirm whether overwrite ('w') is intended.
summary.to_csv(r'stats.txt', header=None, index=None, sep='\t', mode='a')

View File

@ -1,2 +0,0 @@
#!/bin/bash
# Write the line, word and byte counts of data_train.csv (as reported by wc)
# to stats.txt, overwriting any previous content of that file.
wc data_train.csv > stats.txt