updated data download job

This commit is contained in:
s464953 2024-05-08 21:04:42 +02:00
parent b7b992cb8a
commit 4ced43497c
19 changed files with 11 additions and 1305696 deletions

View File

@ -1,26 +0,0 @@
FROM ubuntu:latest
ENV KAGGLE_USERNAME=gulczas
ENV KAGGLE_KEY=default_key
RUN apt-get update && \
apt-get install -y \
python3 \
python3-pip \
wget \
unzip \
&& rm -rf /var/lib/apt/lists/*
RUN pip3 install pandas scikit-learn requests kaggle numpy
WORKDIR /app
COPY model_creator.py /app/
COPY use_model.py /app/
COPY run_py_scripts.sh /app/
RUN chmod +x model_creator.py
RUN chmod +x use_model.py
CMD ["bash", "run_py_scripts.sh"]

3
Jenkinsfile vendored
View File

@ -4,7 +4,6 @@ pipeline {
parameters {
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
string(name: 'CUTOFF', defaultValue: '90', description: 'Number of rows to cut')
}
stages {
@ -27,7 +26,7 @@ pipeline {
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"])
{
sh "bash ./download_dataset.sh ${params.CUTOFF}"
sh "bash ./download_dataset.sh"
}
}
}

View File

@ -1,57 +0,0 @@
pipeline {
agent any
parameters {
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
}
stages {
stage('Clone Repository') {
steps {
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
}
}
stage('Stop and remove existing container') {
steps {
script {
sh "docker stop s464953 || true"
sh "docker rm s464953 || true"
}
}
}
stage('Build Docker image') {
steps {
script {
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
sh "docker build --build-arg KAGGLE_USERNAME=$KAGGLE_USERNAME --build-arg KAGGLE_KEY=$KAGGLE_KEY -t s464953 ."
}
}
}
}
stage('Run Docker container') {
steps {
script {
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
sh "docker run --name s464953 -e KAGGLE_USERNAME=$KAGGLE_USERNAME -e KAGGLE_KEY=$KAGGLE_KEY -v ${WORKSPACE}:/app s464953"
}
}
}
}
stage('Archive stats.txt artifact') {
steps {
archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
}
}
}
}

View File

@ -1,44 +0,0 @@
pipeline {
agent any
parameters {
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
}
stages {
stage('Clone Repository') {
steps {
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
}
}
stage('Stop and remove existing container') {
steps {
script {
sh "docker stop s464953 || true"
sh "docker rm s464953 || true"
}
}
}
stage('Run Docker container') {
steps {
script {
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
sh "docker run --name s464953 -e KAGGLE_USERNAME=$KAGGLE_USERNAME -e KAGGLE_KEY=$KAGGLE_KEY -v ${WORKSPACE}:/app michalgulczynski/ium_s464953:1.0"
}
}
}
}
stage('Archive stats.txt artifact') {
steps {
archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
}
}
}
}

View File

@ -1,57 +0,0 @@
pipeline {
agent any
parameters {
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
}
stages {
stage('Clone Repository') {
steps {
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
}
}
stage('Stop and remove existing container') {
steps {
script {
sh "docker stop s464953 || true"
sh "docker rm s464953 || true"
}
}
}
stage('Build Docker image') {
steps {
script {
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
sh "docker build --build-arg KAGGLE_USERNAME=$KAGGLE_USERNAME --build-arg KAGGLE_KEY=$KAGGLE_KEY -t s464953 ."
}
}
}
}
stage('Run Docker container') {
steps {
script {
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
sh "docker run --name s464953 -e KAGGLE_USERNAME=$KAGGLE_USERNAME -e KAGGLE_KEY=$KAGGLE_KEY -v ${WORKSPACE}:/app s464953"
}
}
}
}
stage('Archive stats.txt artifact') {
steps {
archiveArtifacts artifacts: 'model.pkl', allowEmptyArchive: true
}
}
}
}

View File

@ -1,42 +0,0 @@
pipeline {
agent any
parameters {
buildSelector( defaultSelector: lastSuccessful(), description: 'Build for copying artifacts', name: 'BUILD_SELECTOR')
}
stages {
stage('Clone Repository') {
steps {
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
}
}
stage('Cleanup Artifacts') {
steps {
script {
sh 'rm -rf artifacts'
}
}
}
stage('Copy Artifact') {
steps {
withEnv([
"BUILD_SELECTOR=${params.BUILD_SELECTOR}"
]) {
copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464953-create-dataset', selector: buildParameter('$BUILD_SELECTOR')}
}
}
stage('Execute Shell Script') {
steps {
script {
sh "bash ./dataset_stats.sh"
}
}
}
stage('Archive Results') {
steps {
archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
}
}
}
}

View File

@ -1,113 +0,0 @@
#!/usr/bin/env python
# Import bibliotek
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
import requests
from sklearn.preprocessing import MinMaxScaler
from kaggle.api.kaggle_api_extended import KaggleApi
#funkcja pobierająca plik
def download_file(url, filename, destination_folder):
# Wersja dla datasetów kaggle
api = KaggleApi()
api.authenticate()
api.dataset_download_files('gulczas/spotify-dataset', path=destination_folder, unzip=True)
# funkcja dzieląca zbiór
def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
#Podział na test i trening
train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
#Podział na walidacje i trening
train_data, val_data = train_test_split(train_data, test_size=val_size/(1-test_size), random_state=random_state)
return train_data, val_data, test_data
# Wyświetlanie statystyk zbioru
def print_dataset_stats(data, subset_name):
with open('stats.txt', 'a') as stats_file:
print(f"Statystyki dla zbioru {subset_name}:", file=stats_file)
print(f"Wielkość zbioru {subset_name}: {len(data)}", file=stats_file)
print("\nStatystyki wartości poszczególnych parametrów:", file=stats_file)
print(data.describe(), file=stats_file)
for column in data.columns:
print(f"Rozkład częstości dla kolumny '{column}':", file=stats_file)
print(data[column].value_counts(), file=stats_file)
print("\n", file=stats_file)
# Normalizacja danych
def normalize_data(data):
scaler = MinMaxScaler()
numeric_columns = data.select_dtypes(include=['int', 'float']).columns
scaler.fit(data[numeric_columns])
df_normalized = data.copy()
df_normalized[numeric_columns] = scaler.transform(df_normalized[numeric_columns])
return df_normalized
#Czyszczenie danych
def clean_dataset(data):
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
return data
# main
url = "https://www.kaggle.com/datasets/gulczas/spotify-dataset?select=Spotify_Dataset.csv"
filename = "Spotify_Dataset.csv"
destination_folder = "datasets"
# Pobieranie jeśli nie ma już pobranego pliku
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)
print(f"Utworzono folder: {destination_folder}")
else:
print(f"Folder {destination_folder} już istnieje.")
if 'Spotify_Dataset.csv' not in os.listdir(destination_folder):
# Pobranie pliku
filepath = download_file(url, filename, destination_folder)
# Wczytanie danych z pliku CSV
data = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
# Podział datasetu na zbiory treningowy, walidacyjny i testowy
train_data, val_data, test_data = split_dataset(data)
# Zapisanie podzielonych zbiorów danych do osobnych plików CSV
train_data.to_csv("datasets/train.csv", index=False)
val_data.to_csv("datasets/val.csv", index=False)
test_data.to_csv("datasets/test.csv", index=False)
# Wydrukowanie statystyk dla zbiorów
print_dataset_stats(train_data, "treningowego")
print("\n")
print_dataset_stats(val_data, "walidacyjnego")
print("\n")
print_dataset_stats(test_data, "testowego")
# Normalizacja i czyszczenie zbirów
train_data = normalize_data(train_data)
train_data = clean_dataset(train_data)
val_data = normalize_data(train_data)
val_data = clean_dataset(train_data)
test_data = normalize_data(train_data)
test_data = clean_dataset(train_data)

View File

@ -1,13 +0,0 @@
#!/bin/bash
echo "------------------ Train dataset stats ------------------"
wc -l artifacts/train.csv > stats_train.txt
echo "------------------ Validation dataset stats ------------------"
wc -l artifacts/validation.csv > stats_validation.txt
echo "------------------ Test dataset stats ------------------"
wc -l artifacts/test.csv > stats_test.txt
mkdir -p data
mv stats_train.txt stats_validation.txt stats_test.txt artifacts/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +1,10 @@
#!/bin/bash
pip install kaggle --upgrade
kaggle datasets download -d gulczas/spotify-dataset
unzip -o spotify-dataset.zip
echo "------------------ Shufle ------------------"
shuf Spotify_Dataset.csv -o shuffled_spotify.csv
echo "------------------ Cut off to top $1 rows ------------------"
head -n $1 shuffled_spotify.csv > cutoff_spotify.csv
echo "------------------ Split ------------------"
total_lines=$(wc -l < cutoff_spotify.csv)
num_test=$((total_lines / 10))
num_train=$((total_lines - (num_test * 2)))
num_validation=$num_test
head -n $num_train cutoff_spotify.csv > train.csv
tail -n $((num_test+num_validation)) cutoff_spotify.csv | head -n $num_test > test.csv
tail -n $num_validation cutoff_spotify.csv > validation.csv
mkdir -p artifacts
mv Spotify_Dataset.csv cutoff_spotify.csv train.csv validation.csv test.csv artifacts/
#!/bin/bash
pip install kaggle --upgrade
kaggle datasets download -d gulczas/spotify-dataset --unzip
kaggle datasets download -d joebeachcapital/30000-spotify-songs --unzip
mkdir -p artifacts
mv Spotify_Dataset.csv spotify_songs.csv artifacts/

BIN
model.pkl

Binary file not shown.

View File

@ -1,127 +0,0 @@
import pandas as pd
import os
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn. preprocessing import LabelEncoder
import pickle
def download_dataset(dataset_address, destination_folder):
api = KaggleApi()
api.authenticate()
api.dataset_download_files(dataset_address, path=destination_folder, unzip=True)
def check_datasets_presence():
dataset_1 = "Spotify_Dataset.csv"
dataset_2 = "spotify_songs.csv"
destination_folder = "datasets"
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)
print(f"Utworzono folder: {destination_folder}")
else:
print(f"Folder {destination_folder} już istnieje.")
if dataset_1 not in os.listdir(destination_folder):
download_dataset('gulczas/spotify-dataset', destination_folder)
if dataset_2 not in os.listdir(destination_folder):
download_dataset('joebeachcapital/30000-spotify-songs', destination_folder)
def datasets_preparation():
df_1 = pd.read_csv("datasets/spotify_songs.csv")
df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
df_1 = df_1.dropna()
df_2 = df_2.dropna()
df_2 = df_2.rename(columns={'Title': 'track_name'})
columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
'playlist_id', 'playlist_subgenre']
columns_to_remove_df_2 = ['Date','# of Artist', 'Artist (Ind.)', '# of Nationality',
'Nationality', 'Continent', 'Points (Total)',
'Points (Ind for each Artist/Nat)', 'id', 'Song URL']
df_1 = df_1.drop(columns=columns_to_remove_df_1)
df_2 = df_2.drop(columns=columns_to_remove_df_2)
df_1 = df_1.drop_duplicates(subset=['track_name'])
df_2 = df_2.drop_duplicates(subset=['track_name'])
le = LabelEncoder()
unique_names_df2 = df_2['track_name'].unique()
diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)]
diff_df = diff_df.iloc[:10000]
#diff_df = pd.concat([diff_df, df_1.iloc[:20]], ignore_index=True)
diff_df['track_artist'] = le.fit_transform(diff_df.track_artist)
diff_df['playlist_name'] = le.fit_transform(diff_df.playlist_name)
diff_df['playlist_genre'] = le.fit_transform(diff_df.playlist_genre)
#df_1 = df_1.iloc[20:]
if "docker_test_dataset.csv" not in os.listdir("datasets"):
diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)
result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
result_df = result_df.drop_duplicates(subset=['track_name'])
columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
result_df = result_df.drop(columns=columns_to_remove_result_df)
result_df['track_artist'] = le.fit_transform(result_df.track_artist)
result_df['playlist_name'] = le.fit_transform(result_df.playlist_name)
result_df['playlist_genre'] = le.fit_transform(result_df.playlist_genre)
return result_df
check_datasets_presence()
result_df = datasets_preparation()
Y = result_df[['playlist_genre']]
X = result_df.drop(columns='playlist_genre')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)
Y_train = np.ravel(Y_train)
Y_test = np.ravel(Y_test)
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, Y_train)
Y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
file_path = 'model.pkl'
if os.path.exists(file_path):
os.remove(file_path)
if file_path not in os.listdir("./"):
with open(file_path, 'wb') as file:
pickle.dump(model, file)
print("Model został zapisany do pliku:", file_path)

View File

@ -1,3 +0,0 @@
Real:['edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm']
Predicted: ['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop']
Accuracy:0.1521

View File

@ -1,3 +0,0 @@
#!/bin/bash
python3 model_creator.py

View File

@ -1,36 +0,0 @@
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
np.set_printoptions(threshold=20)
file_path = 'model.pkl'
with open(file_path, 'rb') as file:
model = pickle.load(file)
print("Model został wczytany z pliku:", file_path)
test_df = pd.read_csv("datasets/docker_test_dataset.csv")
Y_test = test_df[['playlist_genre']]
X_test = test_df.drop(columns='playlist_genre')
Y_test = np.ravel(Y_test)
scaler = StandardScaler()
numeric_columns = X_test.select_dtypes(include=['int', 'float']).columns
X_test_scaled = scaler.fit_transform(X_test[numeric_columns])
Y_pred = model.predict(X_test_scaled)
with open('model_predictions.txt', 'w') as f:
pass
with open('model_predictions.txt', 'a') as f:
labels_dict = {0: 'edm', 1 : 'latin', 2 : 'pop', 3 : 'r&b', 4 : 'rap', 5 :'rock'}
Y_test_labels = [labels_dict[number] for number in Y_test]
Y_pred_labels = [labels_dict[number] for number in Y_pred]
f.write("Real:" + str(Y_test_labels[:20])+ " \nPredicted: "+ str(Y_pred_labels[:20]))
accuracy = accuracy_score(Y_test, Y_pred)
f.write("\nAccuracy:" + str(accuracy))

1271
zad1.ipynb

File diff suppressed because it is too large Load Diff