fix kaggle

This commit is contained in:
AdamOsiowy123 2022-04-10 20:29:59 +02:00
parent 4ac532c38b
commit e58dfd187e
6 changed files with 106 additions and 57 deletions

View File

@ -1,19 +1,11 @@
# Our image inherits from the Ubuntu image, version "latest"
FROM ubuntu:latest
# Install the required dependencies. Note the "-y" flag (assume yes)
RUN apt update && apt install -y python3 \
python3-pip \
vim
python3-pip
# NOTE(review): no ARG for these names is visible in this view; confirm where
# ${CUTOFF}, ${KAGGLE_USERNAME} and ${KAGGLE_KEY} get their values at build time.
ENV CUTOFF=${CUTOFF}
ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
ENV KAGGLE_KEY=${KAGGLE_KEY}
WORKDIR /code
# Create the /app directory in the container (if it does not exist) and switch to it (all subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions will be executed in it)
WORKDIR /app
# Copy our script into the /app directory in the container
COPY . /app/
COPY . /code/
RUN python3 -m pip install -r requirements.txt
RUN mkdir /code/.kaggle && chmod o+w /code/.kaggle

View File

@ -1,5 +1,5 @@
node {
docker.image('s444452/ium:1.0').inside {
docker.image('s444452/ium:1.1').inside {
stage('Preparation') {
properties([
parameters([
@ -31,11 +31,11 @@ node {
sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
sh 'echo KAGGLE_KEY: $KAGGLE_KEY'
sh 'ls'
sh "python3 lab2_data.py"
sh "python3 download_dataset.py '.' 'dataset.csv'"
}
}
stage('Archive artifacts') {
archiveArtifacts 'fake_job_postings.csv'
// archiveArtifacts takes a single comma-separated pattern string, not several string arguments.
archiveArtifacts artifacts: 'dataset.csv,train_data.csv,test_data.csv,dev_data.csv'
}
}
}

View File

@ -1,4 +1,5 @@
node {
docker.image('s444452/ium:1.1').inside {
stage('Preparation') {
properties([parameters([
buildSelector(
@ -12,15 +13,18 @@ node {
}
stage('Copy artifacts') {
copyArtifacts filter: 'dataset.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
copyArtifacts filter: 'train_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
copyArtifacts filter: 'test_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
copyArtifacts filter: 'dev_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
}
stage('Run script') {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
sh "chmod u+x ./dataset_stats.sh"
sh "./dataset_stats.sh"
sh "python3 generate_dataset_stats.py '.'"
}
}
stage('Archive artifacts') {
archiveArtifacts 'stats.txt'
// archiveArtifacts takes a single comma-separated pattern string, not several string arguments.
archiveArtifacts artifacts: 'train_stats.txt,test_stats.txt,dev_stats.txt'
}
}
}

60
download_dataset.py Normal file
View File

@ -0,0 +1,60 @@
#!/usr/bin/python
import os.path
import sys
import pandas as pd
from kaggle import api
from pandas import read_csv
from sklearn.model_selection import train_test_split
def download_and_save_dataset(data_path, dataset_name):
    """Download the Kaggle job-postings dataset unless it is already on disk.

    Nothing happens when ``<data_path>/<dataset_name>`` already exists.
    Otherwise the Kaggle API is authenticated, the dataset archive is
    downloaded and unzipped into ``data_path``, and the file named
    ``fake_job_postings.csv`` in that directory is renamed to
    ``dataset_name``.

    Args:
        data_path: Directory that holds (or will hold) the dataset file.
        dataset_name: File name the dataset should have inside ``data_path``.
    """
    target_file = os.path.join(data_path, dataset_name)
    if os.path.exists(target_file):
        # Already present: skip authentication and download entirely.
        return

    api.authenticate()
    api.dataset_download_files(
        'shivamb/real-or-fake-fake-jobposting-prediction',
        path=data_path,
        unzip=True,
    )
    downloaded_file = os.path.join(data_path, 'fake_job_postings.csv')
    os.rename(downloaded_file, target_file)
def preprocess_dataset(data):
    """Return ``data`` without the columns the author marked as mostly null.

    Args:
        data: DataFrame containing at least the columns listed below.

    Returns:
        The result of ``DataFrame.drop`` for those columns.
    """
    # Columns removed because they contain many nulls.
    sparse_columns = ['job_id', 'department', 'salary_range', 'benefits']
    return data.drop(columns=sparse_columns)
def split_dataset(data_path, dataset_name):
    """Load the dataset and split it 60/20/20 into train, dev and test parts.

    The CSV at ``<data_path>/<dataset_name>`` is read, passed through
    ``preprocess_dataset`` and split twice with a fixed random seed. Both
    splits are stratified on the ``fraudulent`` column so that every part
    keeps the same class proportions.

    Args:
        data_path: Directory containing the dataset file.
        dataset_name: File name of the dataset inside ``data_path``.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``. The ``x``
        parts hold every column except the last one; the ``y`` parts hold the
        last column as a single-column frame.
    """
    data = read_csv(os.path.join(data_path, dataset_name))
    data = preprocess_dataset(data)
    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

    # First cut: training part versus a hold-out pool (dev + test).
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        data_x, data_y,
        test_size=1 - train_ratio,
        random_state=123,
        stratify=data['fraudulent'],
    )

    # Second cut: divide the hold-out pool into dev and test. Stratify again,
    # otherwise the class balance kept by the first cut is lost here.
    holdout_labels = data.loc[x_holdout.index, 'fraudulent']
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=123,
        stratify=holdout_labels,
    )
    return x_train, x_val, x_test, y_train, y_val, y_test
def save_dataset(data_path, data, name):
    """Write ``data`` as a CSV file to ``<data_path>/<name>``.

    Args:
        data_path: Directory to write into.
        data: Object exposing ``to_csv`` (a DataFrame in this script).
        name: File name of the CSV inside ``data_path``.
    """
    destination = os.path.join(data_path, name)
    # to_csv is called with its default options.
    data.to_csv(destination)
def main():
    """Download the dataset, split it and store the three parts as CSV files.

    Command-line arguments:
        1. directory for the dataset and the generated files,
        2. file name of the full dataset inside that directory.
    """
    data_path = sys.argv[1]
    dataset_name = sys.argv[2]
    root_dir = os.path.abspath(data_path)

    download_and_save_dataset(root_dir, dataset_name)
    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(root_dir, dataset_name)

    # Re-attach the label column to each feature frame before writing anything.
    parts = {
        'train_data.csv': (x_train, y_train),
        'test_data.csv': (x_test, y_test),
        'dev_data.csv': (x_val, y_val),
    }
    frames = {
        file_name: pd.concat([features, labels], axis=1)
        for file_name, (features, labels) in parts.items()
    }
    for file_name, frame in frames.items():
        save_dataset(root_dir, frame, file_name)


if __name__ == '__main__':
    main()

29
generate_dataset_stats.py Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/python
import os
import pprint
import sys
from pandas import read_csv
def save_stats_to_file(data_path, data_name, stats_name):
    """Write summary statistics of a CSV file to a text file.

    The CSV at ``<data_path>/<data_name>`` is loaded and three labelled
    sections (description, shape, head) are pretty-printed into
    ``<data_path>/<stats_name>``, overwriting any existing file.

    Args:
        data_path: Directory containing the CSV; the stats file goes there too.
        data_name: File name of the CSV to summarise.
        stats_name: File name of the text file to create.
    """
    # The split CSVs are produced by DataFrame.to_csv with default options,
    # which stores the row index as the first column. Read it back as the
    # index so it does not appear as a bogus "Unnamed: 0" feature.
    data = read_csv(os.path.join(data_path, data_name), index_col=0)
    sections = (
        ('Description: ', data.describe(include='all')),
        ('Shape: ', data.shape),
        ('Head: ', data.head()),
    )
    # Explicit encoding: the data may contain non-ASCII text and the platform
    # default encoding is not guaranteed to handle it.
    with open(os.path.join(data_path, stats_name), "w", encoding="utf-8") as log_file:
        for name, obj in sections:
            pprint.pprint(name, log_file)
            pprint.pprint(obj, log_file)
def main():
    """Generate a stats file for each of the train, test and dev CSV files.

    Command-line arguments:
        1. directory that contains the CSV files and receives the stats files.
    """
    root_dir = os.path.abspath(sys.argv[1])
    # Input CSV name -> output stats file name, processed in this order.
    stats_targets = {
        'train_data.csv': 'train_stats.txt',
        'test_data.csv': 'test_stats.txt',
        'dev_data.csv': 'dev_stats.txt',
    }
    for data_name, stats_name in stats_targets.items():
        save_stats_to_file(root_dir, data_name, stats_name)


if __name__ == '__main__':
    main()

View File

@ -1,36 +0,0 @@
#!/usr/bin/python
from kaggle import api
from pandas import read_csv, DataFrame
from sklearn.model_selection import train_test_split
def download_and_save_dataset():
    """Authenticate with the Kaggle API, then download and unzip the dataset."""
    dataset_slug = 'shivamb/real-or-fake-fake-jobposting-prediction'
    api.authenticate()
    api.dataset_download_files(dataset_slug, unzip=True)
def split_dataset(data: DataFrame):
    """Split ``data`` 60/20/20 into train, validation and test parts.

    Every column except the last is treated as features; the last column is
    the target, kept as a single-column frame. Both splits use a fixed seed.

    Args:
        data: Frame whose last column is the target.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1:]

    # First cut: everything that is not training data goes to a hold-out pool.
    holdout_fraction = 1 - train_ratio
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=holdout_fraction, random_state=123)

    # Second cut: divide the hold-out pool between validation and test.
    test_share = test_ratio / (test_ratio + validation_ratio)
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=test_share, random_state=123)

    return x_train, x_val, x_test, y_train, y_val, y_test
def main():
    """Load the local dataset CSV and print its summary and split shapes."""
    # The download step is disabled here; the CSV is read from the working
    # directory instead.
    frame = read_csv('./fake_job_postings.csv')
    print(frame.describe(include='all'))
    print(frame.shape)

    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(frame)
    feature_parts = (x_train, x_val, x_test)
    label_parts = (y_train, y_val, y_test)
    print(*(part.shape for part in feature_parts))
    print(*(part.shape for part in label_parts))


if __name__ == '__main__':
    main()