diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5daf552
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM ubuntu:22.04
+
+ENV KAGGLE_USERNAME=adamwieczrek
+# NOTE(review): KAGGLE_KEY is injected at run time by Jenkins (withEnv), so it
+# must not be baked in via ARG/ENV -- both remain visible in `docker history`.
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends python3 python3-pip unzip && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir kaggle pandas seaborn scikit-learn
+
+WORKDIR /app
+
+COPY dataset_stats.py /app/
+COPY IUM_02.py /app/
+
+CMD ["python3", "IUM_02.py"]
+
+# NOTE(review): removed shell-form `CMD bash`; only the last CMD takes effect
diff --git a/IUM_02.py b/IUM_02.py
new file mode 100644
index 0000000..3b97dd0
--- /dev/null
+++ b/IUM_02.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ### Pobieranie zbioru i pakietów
+
+# In[1]:
+from kaggle.api.kaggle_api_extended import KaggleApi
+
+api = KaggleApi()
+api.authenticate()
+
+api.dataset_download_files('thedevastator/1-5-million-beer-reviews-from-beer-advocate', path="/app", unzip=True)
+# get_ipython().run_line_magic('pip', 'install kaggle')
+# get_ipython().run_line_magic('pip', 'install pandas')
+# get_ipython().run_line_magic('pip', 'install numpy')
+# get_ipython().run_line_magic('pip', 'install scikit-learn')
+# get_ipython().run_line_magic('pip', 'install seaborn')
+#
+#
+# # In[3]:
+#
+#
+# get_ipython().system('kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate')
+#
+#
+# # In[ ]:
+#
+#
+# get_ipython().system('kaggle datasets download -d')
+#
+#
+# # In[ ]:
+#
+#
+# get_ipython().system('unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip')
+
+
+# In[43]:
+
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import MinMaxScaler
+
+pd.set_option('float_format', '{:f}'.format)
+
+
+# ## Wczytywanie danych
+
+# In[8]:
+
+
+beers=pd.read_csv('beer_reviews.csv')
+
+beers.head()
+
+
+# In[9]:
+
+
+beers.info()
+
+
+# ### Czyszczenie
+
+# In[49]:
+
+
+beers.dropna(subset=['brewery_name'], inplace=True)
+beers.dropna(subset=['review_profilename'], inplace=True)
+beers.dropna(subset=['beer_abv'], inplace=True)
+
+beers.isnull().sum()
+
+
+# ### Normalizacja
+
+# In[22]:
+
+
+scaler = MinMaxScaler()
+
+beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']] = scaler.fit_transform(beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']])
+
+
+# ### Podział na podzbiory
+
+# In[24]:
+
+
+beers_train, beers_dev_test = train_test_split(beers, test_size=0.2, random_state=1234)
+beers_dev, beers_test = train_test_split(beers_dev_test, test_size=0.5, random_state=1234)
+
+
+# In[25]:
+
+
+print(f"Liczba kolumn w każdym zbiorze: {beers.shape[1]} kolumn")
+print(f"Całość: {beers.shape[0]} rekordów ")
+print(f"Train: {beers_train.shape[0]} rekordów")
+print(f"Dev: {beers_dev.shape[0]} rekordów")
+print(f"Test: {beers_test.shape[0]} rekordów")
+
+
+# ### Przegląd danych
+
+# In[51]:
+
+
+print(f"Suma różnych piw: {beers['beer_name'].nunique()}")
+print(f"Suma różnych styli: {beers['beer_style'].nunique()}")
+print(f"Suma różnych browarów: {beers['brewery_name'].nunique()}")
+
+
+# In[76]:
+
+
+style_counts = beers['beer_style'].value_counts()
+
+top_15_styles = style_counts.head(15)
+
+plt.bar(top_15_styles.index, top_15_styles.values)
+plt.xlabel('Styl')
+plt.ylabel('Liczba piw')
+plt.title('Ilość piw dla najliczniejszych styli')
+plt.xticks(rotation=90)
+plt.tight_layout()
+plt.show()
+
+
+# In[91]:
+
+
+reviews = pd.DataFrame(beers.groupby('beer_name')['review_overall'].mean())
+reviews['Liczba opinii'] = pd.DataFrame(beers.groupby('beer_name')['review_overall'].count())
+reviews = reviews.sort_values(by=['Liczba opinii'], ascending=False)
+reviews.head()
+
+
+# In[32]:
+
+
+beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 
'beer_beerid']].describe().map(lambda x: f"{x:0.3f}") + + +# In[33]: + + +beers_train[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.1f}") + + +# In[34]: + + +beers_dev[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.1f}") + + +# In[35]: + + +beers_test[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.1f}") + + +# In[ ]: + + + + diff --git a/Jenkinsfile b/Jenkinsfile index 571b21f..6c36771 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,5 +1,5 @@ pipeline { - agent any + agent { dockerfile true } parameters { string(name: 'CUTOFF', defaultValue: '10000', description: 'Liczba wierszy do obcięcia ze zbioru danych') @@ -20,7 +20,7 @@ pipeline { "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}", "KAGGLE_KEY=${env.KAGGLE_KEY}" ]) { - sh "bash ./kuggle_download.sh ${params.CUTOFF}" + sh "python3 IUM_02.py" } } } diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats index e9e0935..f12a6bc 100644 --- a/Jenkinsfile-stats +++ b/Jenkinsfile-stats @@ -1,5 +1,9 @@ pipeline { - agent any + agent { + docker { + image 'adamwie123691/ium:1.1' + } + } parameters { buildSelector( defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR') diff --git a/Jenkinsfile-stats.old b/Jenkinsfile-stats.old new file mode 100644 index 0000000..e9e0935 --- /dev/null +++ b/Jenkinsfile-stats.old @@ -0,0 +1,38 @@ +pipeline { + agent any + + parameters { + buildSelector( defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR') + } + + stages { + stage('Clone Repository') { + steps { + git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979" + } + } + stage('Copy Artifact') { + steps { + 
withEnv(["BUILD_SELECTOR=${params.BUILD_SELECTOR}" ]) { + copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464979-create-dataset', selector: buildParameter('$BUILD_SELECTOR')} + } + } + stage('Execute Shell Script') { + steps { + script { + sh "bash ./dataset_stats.sh" + } + } + } + stage('Archive Results') { + steps { + archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true + } + } + } +// post { +// always { +// deleteDir() +// } +// } +} diff --git a/Jenkinsfile.old b/Jenkinsfile.old new file mode 100644 index 0000000..571b21f --- /dev/null +++ b/Jenkinsfile.old @@ -0,0 +1,38 @@ +pipeline { + agent any + + parameters { + string(name: 'CUTOFF', defaultValue: '10000', description: 'Liczba wierszy do obcięcia ze zbioru danych') + string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username') + password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key') + } + + stages { + stage('Clone Repository') { + steps { + git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979" + } + } + + stage('Download, Process, and Split Dataset') { + steps { + withEnv([ + "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}", + "KAGGLE_KEY=${env.KAGGLE_KEY}" + ]) { + sh "bash ./kuggle_download.sh ${params.CUTOFF}" + } + } + } + stage('Archive Results') { + steps { + archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true + } + } + } +// post { +// always { +// deleteDir() +// } +// } +} diff --git a/dataset_stats.py b/dataset_stats.py new file mode 100644 index 0000000..5c6b41e --- /dev/null +++ b/dataset_stats.py @@ -0,0 +1,21 @@ +import os +import subprocess + +train_file = "data/train.csv" +dev_file = "data/dev.csv" +test_file = "data/test.csv" + +def process_file(file_path, output_file): + result = subprocess.run(['wc', '-l', file_path], capture_output=True, text=True) + with open(output_file, 'w') as f: + f.write(result.stdout) + +process_file(train_file, "data/stats_train.txt") +process_file(dev_file, "data/stats_dev.txt") 
+process_file(test_file, "data/stats_test.txt")
+
+os.makedirs("data", exist_ok=True)
+
+# NOTE(review): removed three os.rename(path, path) calls -- source and
+# destination were identical, so they were no-ops; process_file above already
+# writes the stats straight to their final data/stats_*.txt locations.
diff --git a/jupyter_exporter.py b/jupyter_exporter.py
new file mode 100644
index 0000000..f0ac2e8
--- /dev/null
+++ b/jupyter_exporter.py
@@ -0,0 +1,14 @@
+input_file = 'IUM_02.ipynb'
+output_file = 'IUM_02.py'
+
+import nbformat
+from nbconvert import PythonExporter
+
+with open(input_file, 'r', encoding='utf-8') as f:
+    nb = nbformat.read(f, as_version=4)
+
+exporter = PythonExporter()
+source, meta = exporter.from_notebook_node(nb)
+
+with open(output_file, 'w', encoding='utf-8') as f:
+    f.write(source)
\ No newline at end of file
diff --git a/kuggle_download.sh b/kuggle_download.sh
index 247fb7e..2d33f78 100644
--- a/kuggle_download.sh
+++ b/kuggle_download.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 
 # Pobieranie i rozpakowanie
-pip install kaggle
+: "${KAGGLE_USERNAME:?KAGGLE_USERNAME must be set}"
+: "${KAGGLE_KEY:?KAGGLE_KEY must be set}"
 kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate
 unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip
 DATASET_FILE="beer_reviews.csv"