From af43187263c9b59c9b054b22c3776205dd477169 Mon Sep 17 00:00:00 2001
From: s464953
Date: Sat, 30 Mar 2024 13:04:13 +0100
Subject: [PATCH] zad4: Jenkins Docker image creation

---
 Dockerfile                     |  26 ++++++
 Jenkinsfile-docker-image-build |  46 ++++++++++
 data_preparation_script.py     | 109 +++++++++++++++++++++++++
 3 files changed, 181 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 Jenkinsfile-docker-image-build
 create mode 100644 data_preparation_script.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e47e6ae
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,26 @@
+FROM ubuntu:latest
+
+# Kaggle credentials: declared as ARGs so the --build-arg values passed by
+# Jenkins take effect; they can still be overridden at run time with -e.
+ARG KAGGLE_USERNAME=gulczas
+ARG KAGGLE_KEY=default_key
+ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
+ENV KAGGLE_KEY=${KAGGLE_KEY}
+
+RUN apt-get update && \
+    apt-get install -y \
+    python3 \
+    python3-pip \
+    wget \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install pandas scikit-learn requests kaggle
+
+WORKDIR /app
+
+COPY data_preparation_script.py /app/
+
+RUN chmod +x data_preparation_script.py
+
+CMD ["python3", "data_preparation_script.py"]
diff --git a/Jenkinsfile-docker-image-build b/Jenkinsfile-docker-image-build
new file mode 100644
index 0000000..31a3560
--- /dev/null
+++ b/Jenkinsfile-docker-image-build
@@ -0,0 +1,46 @@
+pipeline {
+    agent any
+
+    parameters {
+        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
+        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
+    }
+
+    stages {
+        stage('Clone Repository') {
+            steps {
+                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
+            }
+        }
+
+        stage('Build Docker image') {
+            steps {
+                script {
+                    withEnv([
+                        "KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                        "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
+                        sh 'docker build --build-arg KAGGLE_USERNAME=$KAGGLE_USERNAME --build-arg KAGGLE_KEY=$KAGGLE_KEY -t s464953 .'
+                    }
+                }
+            }
+        }
+
+        stage('Run Docker container') {
+            steps {
+                script {
+                    withEnv([
+                        "KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                        "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
+                        sh 'docker run --rm --name s464953 -e KAGGLE_USERNAME=$KAGGLE_USERNAME -e KAGGLE_KEY=$KAGGLE_KEY -v $WORKSPACE:/app s464953'
+                    }
+                }
+            }
+        }
+
+        stage('Archive stats.txt artifact') {
+            steps {
+                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
+            }
+        }
+    }
+}
diff --git a/data_preparation_script.py b/data_preparation_script.py
new file mode 100644
index 0000000..7293741
--- /dev/null
+++ b/data_preparation_script.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+# Library imports
+
+import os
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from kaggle.api.kaggle_api_extended import KaggleApi
+
+
+# File download helper
+
+def download_file(url, filename, destination_folder):
+    # Kaggle dataset version: the API identifies the dataset by its slug;
+    # url and filename are unused but kept for interface compatibility
+
+    api = KaggleApi()
+    api.authenticate()
+
+    api.dataset_download_files('gulczas/spotify-dataset', path=destination_folder, unzip=True)
+
+
+# Dataset split helper
+
+def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
+    # Split into training and test sets
+    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
+    # Split the remaining training data into training and validation sets
+    train_data, val_data = train_test_split(train_data, test_size=val_size/(1-test_size), random_state=random_state)
+
+    return train_data, val_data, test_data
+
+
+# Print dataset statistics
+
+def print_dataset_stats(data, subset_name):
+    with open('stats.txt', 'a') as stats_file:
+        print(f"Statistics for the {subset_name} set:", file=stats_file)
+        print(f"Size of the {subset_name} set: {len(data)}", file=stats_file)
+
+        print("\nSummary statistics per feature:", file=stats_file)
+        print(data.describe(), file=stats_file)
+
+        for column in data.columns:
+            print(f"Value counts for column '{column}':", file=stats_file)
+            print(data[column].value_counts(), file=stats_file)
+            print("\n", file=stats_file)
+
+# Data normalization
+
+def normalize_data(data):
+    scaler = MinMaxScaler()
+    numeric_columns = data.select_dtypes(include=['int', 'float']).columns
+    scaler.fit(data[numeric_columns])
+    df_normalized = data.copy()
+    df_normalized[numeric_columns] = scaler.transform(df_normalized[numeric_columns])
+    return df_normalized

+# Data cleaning
+
+def clean_dataset(data):
+    data.dropna(inplace=True)
+    data.drop_duplicates(inplace=True)
+    return data
+
+
+# main
+
+url = "https://www.kaggle.com/datasets/gulczas/spotify-dataset?select=Spotify_Dataset.csv"
+filename = "Spotify_Dataset.csv"
+destination_folder = "datasets"
+
+# Download only if the dataset is not already present
+if not os.path.exists(destination_folder):
+    os.makedirs(destination_folder)
+    print(f"Created folder: {destination_folder}")
+else:
+    print(f"Folder {destination_folder} already exists.")
+
+if len(os.listdir(destination_folder)) == 0:
+    # Download the dataset
+    download_file(url, filename, destination_folder)
+
+# Load the data from the CSV file
+data = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
+
+# Split the dataset into training, validation and test sets
+train_data, val_data, test_data = split_dataset(data)
+
+# Save the split subsets to separate CSV files
+train_data.to_csv("datasets/train.csv", index=False)
+val_data.to_csv("datasets/val.csv", index=False)
+test_data.to_csv("datasets/test.csv", index=False)
+
+# Print statistics for each subset
+print_dataset_stats(train_data, "training")
+print("\n")
+print_dataset_stats(val_data, "validation")
+print("\n")
+print_dataset_stats(test_data, "test")
+
+# Normalize and clean each subset
+train_data = normalize_data(train_data)
+train_data = clean_dataset(train_data)
+val_data = normalize_data(val_data)
+val_data = clean_dataset(val_data)
+test_data = normalize_data(test_data)
+test_data = clean_dataset(test_data)
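
Note on the split arithmetic in split_dataset: test_size=0.2 first carves off
20% of the rows for the test set, and the second call uses
val_size/(1-test_size) = 0.1/0.8 = 0.125, i.e. 12.5% of the remaining 80%, so
the validation set ends up at 10% of the original data and training at 70%. A
minimal standalone sketch verifying those proportions (assuming only pandas
and scikit-learn are installed; the DataFrame is synthetic stand-in data, not
the Spotify dataset):

    # Sanity check for the 70/10/20 split; mirrors split_dataset from the patch.
    import pandas as pd
    from sklearn.model_selection import train_test_split

    def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
        # First split: hold out 20% of all rows as the test set
        train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
        # Second split: 0.1/0.8 = 12.5% of the remainder becomes validation
        train_data, val_data = train_test_split(train_data, test_size=val_size / (1 - test_size), random_state=random_state)
        return train_data, val_data, test_data

    df = pd.DataFrame({"x": range(1000)})  # synthetic stand-in data
    train, val, test = split_dataset(df)
    assert (len(train), len(val), len(test)) == (700, 100, 200)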
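The KAGGLE_USERNAME/KAGGLE_KEY environment variables are what tie the
Dockerfile, the Jenkins parameters, and the script together: KaggleApi's
authenticate() looks for credentials in the environment before falling back to
~/.kaggle/kaggle.json. A minimal sketch of that flow (assumption: current
kaggle package behaviour; the values below are the Dockerfile defaults and a
placeholder, not working credentials):

    # Standalone check that the Kaggle client picks up env-var credentials.
    import os
    from kaggle.api.kaggle_api_extended import KaggleApi

    os.environ.setdefault("KAGGLE_USERNAME", "gulczas")  # Dockerfile default
    os.environ.setdefault("KAGGLE_KEY", "default_key")   # placeholder; a real key is needed to download
    api = KaggleApi()
    api.authenticate()  # raises only if no credentials can be found at all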