fix kaggle
This commit is contained in:
parent
4ac532c38b
commit
e58dfd187e
18
Dockerfile
18
Dockerfile
@ -1,19 +1,11 @@
|
||||
# Nasz obraz będzie dziedziczył z obrazu Ubuntu w wersji latest
|
||||
FROM ubuntu:latest
|
||||
|
||||
# Instalujemy niezbędne zależności. Zwróć uwagę na flagę "-y" (assume yes)
|
||||
RUN apt update && apt install -y python3 \
|
||||
python3-pip \
|
||||
vim
|
||||
python3-pip
|
||||
|
||||
ENV CUTOFF=${CUTOFF}
|
||||
ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
|
||||
ENV KAGGLE_KEY=${KAGGLE_KEY}
|
||||
WORKDIR /code
|
||||
|
||||
# Stwórzmy w kontenerze (jeśli nie istnieje) katalog /app i przejdźmy do niego (wszystkie kolejne polecenia RUN, CMD, ENTRYPOINT, COPY i ADD będą w nim wykonywane)
|
||||
WORKDIR /app
|
||||
COPY . /code/
|
||||
|
||||
# Skopiujmy nasz skrypt do katalogu /app w kontenerze
|
||||
COPY . /app/
|
||||
|
||||
RUN python3 -m pip install -r requirements.txt
|
||||
RUN python3 -m pip install -r requirements.txt
|
||||
RUN mkdir /code/.kaggle && chmod o+w /code/.kaggle
|
@ -1,5 +1,5 @@
|
||||
node {
|
||||
docker.image('s444452/ium:1.0').inside {
|
||||
docker.image('s444452/ium:1.1').inside {
|
||||
stage('Preparation') {
|
||||
properties([
|
||||
parameters([
|
||||
@ -31,11 +31,11 @@ node {
|
||||
sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
|
||||
sh 'echo KAGGLE_KEY: $KAGGLE_KEY'
|
||||
sh 'ls'
|
||||
sh "python3 lab2_data.py"
|
||||
sh "python3 download_dataset.py '.' 'dataset.csv'"
|
||||
}
|
||||
}
|
||||
stage('Archive artifacts') {
|
||||
archiveArtifacts 'fake_job_postings.csv'
|
||||
// archiveArtifacts accepts a single comma-separated list of Ant-style
// patterns; several juxtaposed string literals are not a valid argument list.
archiveArtifacts artifacts: 'dataset.csv, train_data.csv, test_data.csv, dev_data.csv'
|
||||
}
|
||||
}
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
node {
|
||||
stage('Preparation') {
|
||||
docker.image('s444452/ium:1.1').inside {
|
||||
stage('Preparation') {
|
||||
properties([parameters([
|
||||
buildSelector(
|
||||
defaultSelector: lastSuccessful(),
|
||||
@ -12,15 +13,18 @@ node {
|
||||
}
|
||||
stage('Copy artifacts') {
|
||||
copyArtifacts filter: 'dataset.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||
copyArtifacts filter: 'train_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||
copyArtifacts filter: 'test_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||
copyArtifacts filter: 'dev_data.csv', fingerprintArtifacts: true, projectName: 's444452-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||
}
|
||||
stage('Run script') {
|
||||
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
|
||||
"KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
|
||||
sh "chmod u+x ./dataset_stats.sh"
|
||||
sh "./dataset_stats.sh"
|
||||
sh "python3 generate_dataset_stats.py '.'"
|
||||
}
|
||||
}
|
||||
stage('Archive artifacts') {
|
||||
archiveArtifacts 'stats.txt'
|
||||
// archiveArtifacts accepts a single comma-separated list of Ant-style
// patterns; several juxtaposed string literals are not a valid argument list.
archiveArtifacts artifacts: 'train_stats.txt, test_stats.txt, dev_stats.txt'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
60
download_dataset.py
Normal file
60
download_dataset.py
Normal file
@ -0,0 +1,60 @@
|
||||
#!/usr/bin/python
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from kaggle import api
|
||||
from pandas import read_csv
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
def download_and_save_dataset(data_path, dataset_name):
    """Download the Kaggle job-postings dataset unless the target file exists.

    When ``data_path/dataset_name`` is missing, authenticate with the Kaggle
    API, download and unzip ``shivamb/real-or-fake-fake-jobposting-prediction``
    into ``data_path`` and rename ``fake_job_postings.csv`` in that directory
    to ``dataset_name``. When the target file is already there, do nothing.

    Args:
        data_path: Directory that receives the downloaded files.
        dataset_name: File name the dataset CSV should end up with.
    """
    target_file = os.path.join(data_path, dataset_name)
    if os.path.exists(target_file):
        # Target file is already in place; nothing to download or rename.
        return

    api.authenticate()
    api.dataset_download_files(
        'shivamb/real-or-fake-fake-jobposting-prediction',
        path=data_path,
        unzip=True,
    )
    downloaded_file = os.path.join(data_path, 'fake_job_postings.csv')
    os.rename(downloaded_file, target_file)
|
||||
|
||||
|
||||
def preprocess_dataset(data):
    """Return *data* without the columns that are excluded from the dataset.

    The removed columns are ``job_id``, ``department``, ``salary_range`` and
    ``benefits`` (dropped as columns with many nulls). The input frame is not
    modified; a ``KeyError`` is raised by pandas if any of them is missing.
    """
    excluded_columns = ['job_id', 'department', 'salary_range', 'benefits']
    return data.drop(columns=excluded_columns)
|
||||
|
||||
|
||||
def split_dataset(data_path, dataset_name):
    """Load, preprocess and split the dataset into train/validation/test.

    The CSV at ``data_path/dataset_name`` is read, passed through
    ``preprocess_dataset`` and split 60/20/20 with a fixed random seed.
    Both splits are stratified on the ``fraudulent`` column so that the
    class ratio is the same in the train, validation and test parts.

    Args:
        data_path: Directory containing the dataset CSV.
        dataset_name: File name of the dataset CSV.

    Returns:
        The tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
        The ``x_*`` frames hold every column but the last one; the ``y_*``
        frames hold the last column only.

    Raises:
        ValueError: Raised by scikit-learn when a class has too few members
            to be stratified across a split.
    """
    data = read_csv(os.path.join(data_path, dataset_name))
    data = preprocess_dataset(data)
    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
    # Features are all columns except the last; the label stays a one-column frame.
    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

    x_train, x_rest, y_train, y_rest = train_test_split(
        data_x, data_y,
        test_size=1 - train_ratio,
        random_state=123,
        stratify=data['fraudulent'],
    )

    # Stratify the validation/test split on the same column as the first
    # split; without it the class ratio can drift between the two parts.
    x_val, x_test, y_val, y_test = train_test_split(
        x_rest, y_rest,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=123,
        stratify=data.loc[y_rest.index, 'fraudulent'],
    )

    return x_train, x_val, x_test, y_train, y_val, y_test
|
||||
|
||||
|
||||
def save_dataset(data_path, data, name):
    """Write *data* as CSV to ``data_path/name``.

    The row index is not written, so reading the file back with a plain
    ``read_csv`` yields exactly the frame's own columns instead of an extra
    unnamed index column.

    Args:
        data_path: Directory the CSV is written to.
        data: pandas DataFrame to store.
        name: File name of the CSV.
    """
    data.to_csv(os.path.join(data_path, name), index=False)
|
||||
|
||||
|
||||
def main():
    """Download the dataset and write its train/test/dev splits as CSV files.

    Command-line arguments:
        ``sys.argv[1]``: directory for the dataset and the split files.
        ``sys.argv[2]``: file name of the full dataset CSV.
    """
    data_path, dataset_name = sys.argv[1], sys.argv[2]
    abs_data_path = os.path.abspath(data_path)
    download_and_save_dataset(abs_data_path, dataset_name)

    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(abs_data_path, dataset_name)

    # Join features and label back into one frame per split before saving.
    partitions = (
        ('train_data.csv', x_train, y_train),
        ('test_data.csv', x_test, y_test),
        ('dev_data.csv', x_val, y_val),
    )
    frames = [
        (file_name, pd.concat([features, labels], axis=1))
        for file_name, features, labels in partitions
    ]
    for file_name, frame in frames:
        save_dataset(abs_data_path, frame, file_name)


if __name__ == '__main__':
    main()
|
29
generate_dataset_stats.py
Normal file
29
generate_dataset_stats.py
Normal file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/python
|
||||
import os
|
||||
import pprint
|
||||
import sys
|
||||
|
||||
from pandas import read_csv
|
||||
|
||||
|
||||
def save_stats_to_file(data_path, data_name, stats_name):
    """Write summary statistics of a CSV file to a text file.

    Reads ``data_path/data_name`` with pandas and pretty-prints three labelled
    sections to ``data_path/stats_name``: ``describe(include='all')``, the
    frame's shape and its ``head()``.

    Args:
        data_path: Directory holding the CSV and receiving the stats file.
        data_name: File name of the CSV to summarise.
        stats_name: File name of the text file to write (overwritten).
    """
    frame = read_csv(os.path.join(data_path, data_name))
    stats_file = os.path.join(data_path, stats_name)
    with open(stats_file, "w") as log_file:
        sections = (
            ('Description: ', frame.describe(include='all')),
            ('Shape: ', frame.shape),
            ('Head: ', frame.head()),
        )
        for label, value in sections:
            # Each section is a label line followed by the pretty-printed value.
            pprint.pprint(label, log_file)
            pprint.pprint(value, log_file)
|
||||
|
||||
|
||||
def main():
    """Generate a stats file for each of the train, test and dev CSV files.

    Command-line arguments:
        ``sys.argv[1]``: directory that holds the split CSV files and
        receives the generated ``*_stats.txt`` files.
    """
    data_path = sys.argv[1]
    abs_data_path = os.path.abspath(data_path)

    # Input CSV -> output stats file, processed in this order.
    stats_for = {
        'train_data.csv': 'train_stats.txt',
        'test_data.csv': 'test_stats.txt',
        'dev_data.csv': 'dev_stats.txt',
    }
    for data_name, stats_name in stats_for.items():
        save_stats_to_file(abs_data_path, data_name, stats_name)


if __name__ == '__main__':
    main()
|
36
lab2_data.py
36
lab2_data.py
@ -1,36 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
from kaggle import api
|
||||
from pandas import read_csv, DataFrame
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
def download_and_save_dataset():
    """Authenticate with Kaggle and download the job-postings dataset.

    The ``shivamb/real-or-fake-fake-jobposting-prediction`` dataset is
    downloaded with ``unzip=True``; no destination path is passed.
    """
    api.authenticate()
    api.dataset_download_files(
        'shivamb/real-or-fake-fake-jobposting-prediction',
        unzip=True,
    )
|
||||
|
||||
|
||||
def split_dataset(data: DataFrame):
    """Split *data* 60/20/20 into train, validation and test parts.

    The last column is treated as the label (kept as a one-column frame) and
    all other columns as features. Both splits use ``random_state=123``.

    Returns:
        The tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1:]

    # First carve off everything that is not training data ...
    holdout_share = 1 - train_ratio
    x_train, x_rest, y_train, y_rest = train_test_split(
        features, labels, test_size=holdout_share, random_state=123)

    # ... then divide that remainder between validation and test.
    test_share = test_ratio / (test_ratio + validation_ratio)
    x_val, x_test, y_val, y_test = train_test_split(
        x_rest, y_rest, test_size=test_share, random_state=123)

    return x_train, x_val, x_test, y_train, y_val, y_test
|
||||
|
||||
|
||||
def main():
    """Print summary statistics and split shapes for the job-postings CSV.

    Reads ``./fake_job_postings.csv`` (the download step is not invoked
    here), prints its ``describe(include='all')`` output and shape, splits it
    with ``split_dataset`` and prints the shapes of the feature and label
    parts.
    """
    postings = read_csv('./fake_job_postings.csv')
    print(postings.describe(include='all'))
    print(postings.shape)

    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(postings)
    # One line for the feature shapes, one for the label shapes.
    print(*(part.shape for part in (x_train, x_val, x_test)))
    print(*(part.shape for part in (y_train, y_val, y_test)))


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user