fix kaggle
This commit is contained in:
parent
4ac532c38b
commit
e58dfd187e
18
Dockerfile
18
Dockerfile
@ -1,19 +1,11 @@
|
|||||||
# Nasz obraz będzie dziedziczył z obrazu Ubuntu w wersji latest
|
|
||||||
FROM ubuntu:latest
|
FROM ubuntu:latest
|
||||||
|
|
||||||
# Instalujemy niezbędne zależności. Zwróć uwagę na flagę "-y" (assume yes)
|
|
||||||
RUN apt update && apt install -y python3 \
|
RUN apt update && apt install -y python3 \
|
||||||
python3-pip \
|
python3-pip
|
||||||
vim
|
|
||||||
|
|
||||||
ENV CUTOFF=${CUTOFF}
|
WORKDIR /code
|
||||||
ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
|
|
||||||
ENV KAGGLE_KEY=${KAGGLE_KEY}
|
|
||||||
|
|
||||||
# Stwórzmy w kontenerze (jeśli nie istnieje) katalog /app i przejdźmy do niego (wszystkie kolejne polecenia RUN, CMD, ENTRYPOINT, COPY i ADD będą w nim wykonywane)
|
COPY . /code/
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# Skopiujmy nasz skrypt do katalogu /app w kontenerze
|
RUN python3 -m pip install -r requirements.txt
|
||||||
COPY . /app/
|
RUN mkdir /code/.kaggle && chmod o+w /code/.kaggle
|
||||||
|
|
||||||
RUN python3 -m pip install -r requirements.txt
|
|
@ -1,5 +1,5 @@
|
|||||||
node {
|
node {
|
||||||
docker.image('s444452/ium:1.0').inside {
|
docker.image('s444452/ium:1.1').inside {
|
||||||
stage('Preparation') {
|
stage('Preparation') {
|
||||||
properties([
|
properties([
|
||||||
parameters([
|
parameters([
|
||||||
@ -31,11 +31,11 @@ node {
|
|||||||
sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
|
sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
|
||||||
sh 'echo KAGGLE_KEY: $KAGGLE_KEY'
|
sh 'echo KAGGLE_KEY: $KAGGLE_KEY'
|
||||||
sh 'ls'
|
sh 'ls'
|
||||||
sh "python3 lab2_data.py"
|
sh "python3 download_dataset.py '.' 'dataset.csv'"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage('Archive artifacts') {
|
stage('Archive artifacts') {
|
||||||
archiveArtifacts 'fake_job_postings.csv'
|
// archiveArtifacts accepts a single comma-separated pattern string; several
// juxtaposed string literals are not a valid argument list for the step.
archiveArtifacts artifacts: 'dataset.csv, train_data.csv, test_data.csv, dev_data.csv'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,5 +1,6 @@
|
|||||||
node {
|
node {
|
||||||
stage('Preparation') {
|
docker.image('s444452/ium:1.1').inside {
|
||||||
|
stage('Preparation') {
|
||||||
properties([parameters([
|
properties([parameters([
|
||||||
buildSelector(
|
buildSelector(
|
||||||
defaultSelector: lastSuccessful(),
|
defaultSelector: lastSuccessful(),
|
||||||
@ -12,15 +13,18 @@ node {
|
|||||||
}
|
}
|
||||||
stage('Copy artifacts') {
|
stage('Copy artifacts') {
|
||||||
copyArtifacts filter: 'dataset.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
copyArtifacts filter: 'dataset.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||||
|
copyArtifacts filter: 'train_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||||
|
copyArtifacts filter: 'test_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||||
|
copyArtifacts filter: 'dev_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||||
}
|
}
|
||||||
stage('Run script') {
|
stage('Run script') {
|
||||||
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
|
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
|
||||||
"KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
|
"KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
|
||||||
sh "chmod u+x ./dataset_stats.sh"
|
sh "python3 generate_dataset_stats.py '.'"
|
||||||
sh "./dataset_stats.sh"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage('Archive artifacts') {
|
stage('Archive artifacts') {
|
||||||
archiveArtifacts 'stats.txt'
|
// archiveArtifacts accepts a single comma-separated pattern string; several
// juxtaposed string literals are not a valid argument list for the step.
archiveArtifacts artifacts: 'train_stats.txt, test_stats.txt, dev_stats.txt'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
60
download_dataset.py
Normal file
60
download_dataset.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from kaggle import api
|
||||||
|
from pandas import read_csv
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_save_dataset(data_path, dataset_name):
    """Fetch the job-postings dataset from Kaggle unless it is already present.

    Nothing happens when ``data_path/dataset_name`` already exists. Otherwise
    the Kaggle API is authenticated, the dataset is downloaded and unzipped
    into ``data_path``, and ``fake_job_postings.csv`` is renamed to
    ``dataset_name``.

    Args:
        data_path: Directory that receives the downloaded files.
        dataset_name: File name the CSV is stored under inside ``data_path``.
    """
    target_file = os.path.join(data_path, dataset_name)
    if os.path.exists(target_file):
        # A previous run already produced the file; skip the download.
        return

    api.authenticate()
    api.dataset_download_files(
        'shivamb/real-or-fake-fake-jobposting-prediction',
        path=data_path,
        unzip=True,
    )
    extracted_file = os.path.join(data_path, 'fake_job_postings.csv')
    os.rename(extracted_file, target_file)
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_dataset(data):
    """Return ``data`` without the columns excluded from the dataset.

    Args:
        data: DataFrame containing the ``job_id``, ``department``,
            ``salary_range`` and ``benefits`` columns.

    Returns:
        A new DataFrame with those four columns removed; ``data`` itself is
        left unchanged.
    """
    # Per the original note, these are dropped as columns with many nulls.
    sparse_columns = ['job_id', 'department', 'salary_range', 'benefits']
    return data.drop(columns=sparse_columns)
|
||||||
|
|
||||||
|
|
||||||
|
def split_dataset(data_path, dataset_name):
    """Load the dataset and split it into train / validation / test parts.

    The CSV at ``data_path/dataset_name`` is read, passed through
    ``preprocess_dataset`` and split 60/20/20. Both splits are stratified on
    the ``fraudulent`` column so that each part keeps the class ratio of the
    full dataset (previously only the first split was stratified, so the
    validation and test parts could end up with different class ratios).

    Args:
        data_path: Directory containing the dataset CSV.
        dataset_name: File name of the dataset CSV.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    data = read_csv(os.path.join(data_path, dataset_name))
    data = preprocess_dataset(data)
    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
    # Features are every column but the last; the target is the last column.
    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

    # First cut: train vs. everything held out for validation + test.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        data_x, data_y,
        test_size=1 - train_ratio,
        random_state=123,
        stratify=data['fraudulent'],
    )

    # Second cut: divide the held-out rows between validation and test,
    # stratifying on the same label as the first cut.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=123,
        stratify=data.loc[y_holdout.index, 'fraudulent'],
    )

    return x_train, x_val, x_test, y_train, y_val, y_test
|
||||||
|
|
||||||
|
|
||||||
|
def save_dataset(data_path, data, name):
    """Write ``data`` as a CSV file called ``name`` inside ``data_path``.

    ``to_csv`` is called with its defaults, so the DataFrame index is written
    as the first, unnamed column.

    Args:
        data_path: Directory the file is written to.
        data: DataFrame to serialize.
        name: File name of the resulting CSV.
    """
    destination = os.path.join(data_path, name)
    data.to_csv(destination)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Download the dataset, split it and store the three parts as CSV files.

    Command-line arguments:
        1. Directory that holds the dataset and receives the split files.
        2. File name of the full dataset CSV.

    Writes ``train_data.csv``, ``test_data.csv`` and ``dev_data.csv`` into the
    given directory.
    """
    data_path, dataset_name = sys.argv[1], sys.argv[2]
    target_dir = os.path.abspath(data_path)
    download_and_save_dataset(target_dir, dataset_name)

    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(target_dir, dataset_name)

    # Pair each output file with its features and labels; the validation part
    # is stored under the "dev" name.
    parts = {
        'train_data.csv': (x_train, y_train),
        'test_data.csv': (x_test, y_test),
        'dev_data.csv': (x_val, y_val),
    }
    # Re-attach the label column to the features before anything is written.
    frames = {
        file_name: pd.concat([features, labels], axis=1)
        for file_name, (features, labels) in parts.items()
    }
    for file_name, frame in frames.items():
        save_dataset(target_dir, frame, file_name)


if __name__ == '__main__':
    main()
|
29
generate_dataset_stats.py
Normal file
29
generate_dataset_stats.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
import os
|
||||||
|
import pprint
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from pandas import read_csv
|
||||||
|
|
||||||
|
|
||||||
|
def save_stats_to_file(data_path, data_name, stats_name):
    """Write summary statistics of a CSV dataset to a text file.

    The report contains, in order, the ``describe(include='all')`` table, the
    shape and the ``head()`` of the data, each preceded by its label. Labels
    and values are emitted with ``pprint``, so labels appear quoted.

    Args:
        data_path: Directory containing the CSV and receiving the report.
        data_name: File name of the CSV to summarize.
        stats_name: File name of the report to write.
    """
    data = read_csv(os.path.join(data_path, data_name))
    # Compute everything up front so a failure here does not leave an empty
    # report file behind.
    sections = (
        ('Description: ', data.describe(include='all')),
        ('Shape: ', data.shape),
        ('Head: ', data.head()),
    )
    # Explicit UTF-8: the dataset holds free text, and relying on the locale's
    # default encoding can raise UnicodeEncodeError for non-ASCII characters.
    with open(os.path.join(data_path, stats_name), "w", encoding="utf-8") as log_file:
        for name, obj in sections:
            pprint.pprint(name, log_file)
            pprint.pprint(obj, log_file)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Generate a statistics report for each of the three dataset splits.

    Command-line arguments:
        1. Directory that contains ``train_data.csv``, ``test_data.csv`` and
           ``dev_data.csv``; the ``*_stats.txt`` reports are written there too.
    """
    data_path = sys.argv[1]
    target_dir = os.path.abspath(data_path)

    # Maps each split's CSV to the report generated from it.
    reports = {
        'train_data.csv': 'train_stats.txt',
        'test_data.csv': 'test_stats.txt',
        'dev_data.csv': 'dev_stats.txt',
    }
    for csv_name, report_name in reports.items():
        save_stats_to_file(target_dir, csv_name, report_name)


if __name__ == '__main__':
    main()
|
36
lab2_data.py
36
lab2_data.py
@ -1,36 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
from kaggle import api
|
|
||||||
from pandas import read_csv, DataFrame
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
|
|
||||||
|
|
||||||
def download_and_save_dataset():
    """Authenticate against Kaggle, then download and unzip the dataset.

    No ``path`` argument is passed to the Kaggle client.
    """
    api.authenticate()
    dataset_slug = 'shivamb/real-or-fake-fake-jobposting-prediction'
    api.dataset_download_files(dataset_slug, unzip=True)
|
|
||||||
|
|
||||||
|
|
||||||
def split_dataset(data: DataFrame):
    """Split ``data`` into train / validation / test parts (60/20/20).

    Args:
        data: DataFrame whose last column is treated as the target; all other
            columns are treated as features.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1:]

    # First cut: train vs. everything held out for validation + test.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=1 - train_ratio, random_state=123)

    # Second cut: divide the held-out rows between validation and test.
    holdout_test_share = test_ratio / (test_ratio + validation_ratio)
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=holdout_test_share, random_state=123)

    return x_train, x_val, x_test, y_train, y_val, y_test
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Print summary statistics of the dataset and the shapes of its splits.

    The download step is disabled here, so ``./fake_job_postings.csv`` must
    already exist in the working directory.
    """
    # download_and_save_dataset()
    frame = read_csv('./fake_job_postings.csv')
    print(frame.describe(include='all'))
    print(frame.shape)

    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(frame)
    # One line of feature shapes, then one line of target shapes.
    print(*(part.shape for part in (x_train, x_val, x_test)))
    print(*(part.shape for part in (y_train, y_val, y_test)))


if __name__ == '__main__':
    main()
|
|
Loading…
Reference in New Issue
Block a user