zad4 jenkins docker image creation
parent f6e77c61ec
commit af43187263
Dockerfile · Normal file · 24 lines
@@ -0,0 +1,24 @@
FROM ubuntu:latest

ARG KAGGLE_USERNAME=gulczas
ARG KAGGLE_KEY=default_key
ENV KAGGLE_USERNAME=$KAGGLE_USERNAME
ENV KAGGLE_KEY=$KAGGLE_KEY

RUN apt-get update && \
    apt-get install -y \
        python3 \
        python3-pip \
        wget \
        unzip \
    && rm -rf /var/lib/apt/lists/*

RUN pip3 install pandas scikit-learn requests kaggle

WORKDIR /app

COPY data_preparation_script.py /app/

RUN chmod +x data_preparation_script.py

CMD ["python3", "data_preparation_script.py"]
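The ARG/ENV pairs hold only placeholder credentials; the Jenkinsfile below overrides them at build time via --build-arg. The kaggle client authenticates from the KAGGLE_USERNAME and KAGGLE_KEY environment variables, so a quick way to verify the wiring inside the container is a sketch like the following (illustrative only, not part of the commit):

# Sanity-check sketch: confirm the credentials baked into the image are
# visible where the kaggle client looks for them. Assumes only the
# kaggle package installed in the image above.
import os
from kaggle.api.kaggle_api_extended import KaggleApi

for var in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    print(var, "is", "set" if os.environ.get(var) else "MISSING")

api = KaggleApi()
api.authenticate()  # raises if no usable credentials are found
print("Kaggle authentication configured")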
Jenkinsfile-docker-image-build · Normal file · 46 lines
@@ -0,0 +1,46 @@
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                        sh "docker build --build-arg KAGGLE_USERNAME=$KAGGLE_USERNAME --build-arg KAGGLE_KEY=$KAGGLE_KEY -t s464953 ."
                    }
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                        sh "docker run --rm --name s464953 -e KAGGLE_USERNAME=$KAGGLE_USERNAME -e KAGGLE_KEY=$KAGGLE_KEY -v ${WORKSPACE}:/app s464953"
                    }
                }
            }
        }

        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}
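One detail worth spelling out: the run stage bind-mounts ${WORKSPACE} at /app, which is also the container's working directory, so the stats.txt the script writes by relative path surfaces directly in the Jenkins workspace where archiveArtifacts can find it. A minimal sketch of that flow, with a temporary directory standing in for the workspace (hypothetical paths, illustrative only):

# Sketch of the bind-mount flow: /app inside the container and the
# Jenkins workspace are the same directory, so a file written relative
# to /app appears in the workspace. The temp dir stands in for
# ${WORKSPACE}; this snippet is not part of the commit.
import tempfile
from pathlib import Path

workspace = Path(tempfile.mkdtemp())   # plays the role of ${WORKSPACE}
app = workspace                        # -v ${WORKSPACE}:/app aliases them

(app / "stats.txt").write_text("dataset statistics\n")  # what the script does
print([p.name for p in workspace.iterdir()])            # -> ['stats.txt']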
data_preparation_script.py · Normal file · 113 lines
@@ -0,0 +1,113 @@
#!/usr/bin/env python

# Library imports

import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
import requests
from sklearn.preprocessing import MinMaxScaler
from kaggle.api.kaggle_api_extended import KaggleApi


# File download helper

def download_file(url, filename, destination_folder):
    # Kaggle-dataset version: the dataset slug identifies the download,
    # so url and filename are kept only for interface compatibility
    api = KaggleApi()
    api.authenticate()

    api.dataset_download_files('gulczas/spotify-dataset', path=destination_folder, unzip=True)


# Dataset splitting

def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
    # Split into training and test sets
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
    # Split the training part again into training and validation sets;
    # val_size/(1-test_size) keeps the validation share at val_size of the full dataset
    train_data, val_data = train_test_split(train_data, test_size=val_size/(1-test_size), random_state=random_state)

    return train_data, val_data, test_data


# Printing dataset statistics

def print_dataset_stats(data, subset_name):
    with open('stats.txt', 'a') as stats_file:
        print(f"Statistics for the {subset_name} set:", file=stats_file)
        print(f"Size of the {subset_name} set: {len(data)}", file=stats_file)

        print("\nStatistics of the individual features:", file=stats_file)
        print(data.describe(), file=stats_file)

        for column in data.columns:
            print(f"Frequency distribution for column '{column}':", file=stats_file)
            print(data[column].value_counts(), file=stats_file)
            print("\n", file=stats_file)


# Data normalization

def normalize_data(data):
    scaler = MinMaxScaler()
    numeric_columns = data.select_dtypes(include=['int', 'float']).columns
    scaler.fit(data[numeric_columns])
    df_normalized = data.copy()
    df_normalized[numeric_columns] = scaler.transform(df_normalized[numeric_columns])
    return df_normalized


# Data cleaning

def clean_dataset(data):
    data.dropna(inplace=True)
    data.drop_duplicates(inplace=True)
    return data


# main

url = "https://www.kaggle.com/datasets/gulczas/spotify-dataset?select=Spotify_Dataset.csv"
filename = "Spotify_Dataset.csv"
destination_folder = "datasets"

# Download only if the file has not been downloaded yet
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Created folder: {destination_folder}")
else:
    print(f"Folder {destination_folder} already exists.")

if len(os.listdir(destination_folder)) == 0:
    # Download the file
    download_file(url, filename, destination_folder)

# Load the data from the CSV file
data = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")

# Split the dataset into training, validation and test sets
train_data, val_data, test_data = split_dataset(data)

# Save the split datasets to separate CSV files
train_data.to_csv("datasets/train.csv", index=False)
val_data.to_csv("datasets/val.csv", index=False)
test_data.to_csv("datasets/test.csv", index=False)

# Print statistics for each set
print_dataset_stats(train_data, "training")
print("\n")
print_dataset_stats(val_data, "validation")
print("\n")
print_dataset_stats(test_data, "test")

# Normalize and clean each subset
train_data = normalize_data(train_data)
train_data = clean_dataset(train_data)
val_data = normalize_data(val_data)
val_data = clean_dataset(val_data)
test_data = normalize_data(test_data)
test_data = clean_dataset(test_data)
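The two-stage split in split_dataset deserves a quick arithmetic check: test_size=0.2 removes 20% first, then val_size/(1-test_size) = 0.1/0.8 = 0.125 of the remaining 80% equals exactly 10% of the original, leaving a 70/10/20 split. A short sketch verifying this on synthetic data (the split logic is replicated inline because the script runs its main section on import; the dummy frame is illustrative only):

# Sanity check of the two-stage split proportions used by split_dataset.
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.DataFrame({"x": range(1000)})
test_size, val_size = 0.2, 0.1

train, test = train_test_split(data, test_size=test_size, random_state=42)
train, val = train_test_split(train, test_size=val_size / (1 - test_size), random_state=42)

# Expected shares: 70% train, 10% validation, 20% test
print(len(train), len(val), len(test))  # -> 700 100 200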