a
This commit is contained in:
commit
ac8d2b740d
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@ -0,0 +1,20 @@
|
||||
# Image that runs download_data_and_process.py, which shells out to
# process_data.sh (needs unzip) to split the shipped Kaggle dataset.
#
# KAGGLE_USERNAME / KAGGLE_KEY / CUTOFF are runtime configuration; pass them
# with `docker run -e ...` (see README). Do not bake them in via ARG/ENV —
# they would be visible in `docker history`.
#
# The previous dead `FROM ubuntu:latest` stage was removed: it was immediately
# superseded by this FROM and contributed nothing to the final image.
FROM python:3.8

# Install OS tools in a single layer, combining update+install (avoids the
# stale-apt-cache bug) and cleaning the package lists in the same layer so
# they do not bloat the image.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        unzip \
        zip \
    && rm -rf /var/lib/apt/lists/*

# Create /app (if missing) and make it the working directory for all
# subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions.
WORKDIR /app

# Copy the dependency manifest before the application code so the pip layer
# is cached until requirements.txt itself changes.
COPY ./requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Application code and bundled dataset archive.
COPY ./process_data.sh .
COPY ./download_data_and_process.py .
COPY ./stats.py .
COPY ./real-or-fake-fake-jobposting-prediction.zip .
RUN chmod +x process_data.sh

# Exec form: python3 runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["python3", "download_data_and_process.py"]
|
43
Jenkinsfile
vendored
Normal file
43
Jenkinsfile
vendored
Normal file
@ -0,0 +1,43 @@
|
||||
// Jenkins declarative pipeline: builds the Docker image from the repo's
// Dockerfile and runs the dataset download/split inside that container.
pipeline {
    agent {
        dockerfile {
            // NOTE(review): passing KAGGLE_USERNAME/KAGGLE_KEY via --build-arg
            // leaks them into the image's `docker history`, and the Dockerfile
            // declares no matching ARG instructions, so these build args are
            // unused anyway — the credentials are actually delivered at run
            // time through withEnv below. Consider dropping the --build-arg
            // flags (and the trailing `-t ium`, which tags the agent image).
            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} --build-arg CUTOFF=${params.CUTOFF} -t ium"
        }
    }
    parameters {
        // Kaggle account used if the kaggle CLI download is re-enabled.
        string (
            defaultValue: 'mikolajpokrywka',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        // Masked by Jenkins; intentionally no default value.
        password(
            defaultValue: '',
            description: 'Kaggle token',
            name: 'KAGGLE_KEY'
        )
        // Number of rows kept from the shuffled dataset
        // (consumed as `head -n $CUTOFF` in process_data.sh).
        string (
            defaultValue: '17000',
            description: 'cut data',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('checkout: Check out from version control') {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]])
            }
        }
        stage('bash script') {
            steps {
                // Expose the credentials and cutoff as environment variables
                // for the python script and the shell script it invokes.
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                    "KAGGLE_KEY=${params.KAGGLE_KEY}",
                    "CUTOFF=${params.CUTOFF}"]) {
                    sh 'python3 ./download_data_and_process.py'
                    // Persist the splits for downstream jobs
                    // (Jenkinsfile.stats fetches them via copyArtifacts).
                    // NOTE(review): process_data.sh deletes data.csv at the
                    // end of its run, so archiving data.csv here may fail —
                    // verify which files actually remain in the workspace.
                    archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv"
                }
            }
        }
    }
}
|
30
Jenkinsfile.stats
Normal file
30
Jenkinsfile.stats
Normal file
@ -0,0 +1,30 @@
|
||||
// Jenkins declarative pipeline: pulls the dataset splits produced by the
// s444463-create-dataset job and computes descriptive statistics inside the
// published ium Docker image.
pipeline {
    agent {
        docker { image 'mikolajpokrywka/ium:0.0.0' }
    }
    parameters {
        // Lets the user pick which create-dataset build to analyse;
        // defaults to the most recent successful one.
        buildSelector(
            defaultSelector: lastSuccessful(),
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    stages {
        stage('checkout: Check out from version control') {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]])
            }
        }
        stage('Copy artifacts') {
            steps {
                // Fetch the data_{train,dev,test}.csv / column_titles.csv
                // artifacts archived by the create-dataset pipeline.
                copyArtifacts fingerprintArtifacts: true, projectName: 's444463-create-dataset', selector: buildParameter('BUILD_SELECTOR')
            }
        }
        // Stage name typo fixed: "Calcualte" -> "Calculate".
        stage("Calculate stats") {
            steps {
                sh "python3 stats.py"
                archiveArtifacts artifacts: "stats.txt"
            }
        }
    }
}
|
4
README.md
Normal file
4
README.md
Normal file
@ -0,0 +1,4 @@
|
||||
# Run with Docker
|
||||
`docker build -t ium .`
|
||||
|
||||
`docker run -e KAGGLE_USERNAME='<your_kaggle_username>' -e KAGGLE_KEY='<your_kaggle_key>' -e CUTOFF='1600' -it ium:latest`
|
1536
download_data.ipynb
Normal file
1536
download_data.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
30
download_data_and_process.py
Normal file
30
download_data_and_process.py
Normal file
@ -0,0 +1,30 @@
|
||||
"""Unzip/split the job-postings dataset via process_data.sh, then print and
save basic statistics of the full fake_job_postings.csv.

Expects in the working directory:
  * process_data.sh (executable)  -- unzips, shuffles and splits the data
  * real-or-fake-fake-jobposting-prediction.zip

Environment: CUTOFF is consumed by process_data.sh; KAGGLE_USERNAME /
KAGGLE_KEY are only needed if the kaggle download lines are re-enabled.
"""
import subprocess
import sys

import numpy as np
import pandas as pd

# Run the shell pipeline that produces data_{train,dev,test}.csv and
# column_titles.csv. Previously the return code was captured but ignored;
# fail loudly instead of continuing with stale or missing files.
rc = subprocess.call("./process_data.sh")
if rc != 0:
    sys.exit(f"process_data.sh failed with exit code {rc}")

# Direct Kaggle download kept for reference; the zip ships with the repo.
# import kaggle
# kaggle.api.authenticate()
# kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='fake_job_postings.csv', unzip=True)

data = pd.read_csv('fake_job_postings.csv')
# Replace NaN with '' so value_counts()/describe() treat missing text uniformly.
data = data.replace(np.nan, '', regex=True)

print("=" * 20)
print('Ilość wierszy w zbiorze: ', len(data))

print("=" * 10, ' data["department"].value_counts() ', 10 * '=')
print(data["department"].value_counts())

print("=" * 10, ' data.median() ', 10 * '=')
# numeric_only=True: after the NaN->'' replacement many columns are
# object-typed; restrict the median to numeric columns explicitly instead of
# relying on pandas' deprecated implicit column dropping.
print(data.median(numeric_only=True))

print("=" * 10, ' data.describe(include="all") ', 10 * '=')
print(data.describe(include='all'))

# mode='w' (was 'a'): regenerate stats.txt on every run instead of appending
# another copy of the table each time the script is executed.
data.describe(include="all").to_csv('stats.txt', header=False, index=False,
                                    sep='\t', mode='w')
|
17881
fake_job_postings.csv
Normal file
17881
fake_job_postings.csv
Normal file
File diff suppressed because one or more lines are too long
15
process_data.sh
Executable file
15
process_data.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Unzip the Kaggle job-postings archive, shuffle the rows and split them into
# test/dev/train CSVs.
#
# Requires: unzip, shuf; env var CUTOFF = number of rows to keep.
# Outputs:  column_titles.csv, data_test.csv (rows 1-2500),
#           data_dev.csv (rows 2501-5000), data_train.csv (rows 5001-CUTOFF).

# Abort on any command failure, on use of an unset variable (e.g. a missing
# CUTOFF), and on failures inside pipelines — previously errors were
# silently ignored and later steps ran on stale/missing files.
set -euo pipefail

echo "Download data from kaggle"
echo "${KAGGLE_USERNAME:-}"
# kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction
unzip -o real-or-fake-fake-jobposting-prediction.zip

echo "Save column titles"
head -n 1 fake_job_postings.csv > column_titles.csv
tail -n +2 fake_job_postings.csv > data_not_shuf.csv

echo "Create sets"
shuf data_not_shuf.csv > data_not_cutted.csv
# Quoted to avoid word-splitting; with `set -u` an unset CUTOFF aborts here.
head -n "$CUTOFF" data_not_cutted.csv > data.csv
sed -n '1,2500p' data.csv > data_test.csv
sed -n '2501,5000p' data.csv > data_dev.csv
tail -n +5001 data.csv > data_train.csv

# Drop intermediates; the split files above are the script's real output.
rm data.csv data_not_shuf.csv data_not_cutted.csv
|
BIN
real-or-fake-fake-jobposting-prediction.zip
Normal file
BIN
real-or-fake-fake-jobposting-prediction.zip
Normal file
Binary file not shown.
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
pandas
|
||||
numpy
|
||||
kaggle
|
33
stats.py
Normal file
33
stats.py
Normal file
@ -0,0 +1,33 @@
|
||||
"""Concatenate the train/dev/test splits back into data.csv and print/save
descriptive statistics.

Run after the splits and column_titles.csv exist in the working directory,
e.g. after copyArtifacts in the Jenkinsfile.stats pipeline.
"""
import subprocess  # kept from original; not used below
import os          # kept from original; not used below

import numpy as np
import pandas as pd

# Header line first, then the three splits, so the result parses as one CSV.
all_files = ['column_titles.csv', 'data_train.csv', 'data_dev.csv', 'data_test.csv']

# Context managers guarantee the handles are closed even if a read fails
# (the original left files open on error).
with open("data.csv", "w") as data_file:
    for name in all_files:
        with open(name, "r") as f:
            data_file.write(f.read())

data = pd.read_csv('data.csv')
# Replace NaN with '' so value_counts()/describe() treat missing text uniformly.
data = data.replace(np.nan, '', regex=True)

print("=" * 20)
print('Ilość wierszy w zbiorze: ', len(data))

print("=" * 10, ' data["department"].value_counts() ', 10 * '=')
print(data["department"].value_counts())

print("=" * 10, ' data.median() ', 10 * '=')
# numeric_only=True: after the NaN->'' replacement many columns are
# object-typed; restrict the median to numeric columns explicitly instead of
# relying on pandas' deprecated implicit column dropping.
print(data.median(numeric_only=True))

print("=" * 10, ' data.describe(include="all") ', 10 * '=')
print(data.describe(include='all'))

# mode='w' (was 'a'): the archived stats.txt should contain exactly one copy
# of the stats table per run, not an ever-growing append log.
data.describe(include="all").to_csv('stats.txt', header=False, index=False,
                                    sep='\t', mode='w')
|
Loading…
Reference in New Issue
Block a user