This commit is contained in:
AWieczarek 2024-04-02 22:12:47 +02:00
parent ee6911def2
commit 0af0bea801
9 changed files with 313 additions and 4 deletions

22
Dockerfile Normal file
View File

@ -0,0 +1,22 @@
# Image for the IUM_02 dataset-preparation job.
# Pinned base instead of :latest for reproducibility (also: ubuntu:24.04 blocks
# system-wide `pip install` via PEP 668, which would break this build).
FROM ubuntu:22.04

# Kaggle credentials for the download step.
# WARNING: ENV values are baked into the image and visible via `docker history`
# and `docker inspect` -- prefer supplying KAGGLE_KEY at runtime (docker run -e /
# Jenkins withEnv) or a BuildKit secret mount. Kept here only for compatibility
# with the existing build invocation.
ENV KAGGLE_USERNAME=adamwieczrek
ARG KAGGLE_KEY
ENV KAGGLE_KEY=${KAGGLE_KEY}

# Build-time only (ARG, not ENV): suppress interactive apt prompts without
# polluting the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC

# apt-get (stable CLI, unlike `apt`), update+install in one layer, and clean the
# package lists in the same layer so they never bloat the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        unzip && \
    rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir kaggle pandas seaborn scikit-learn
WORKDIR /app
COPY dataset_stats.py /app/
COPY IUM_02.py /app/

# The original file had two CMDs; only the last one takes effect, so the
# `CMD ["python3", "IUM_02.py"]` line was dead code. Keep bash (exec form) as
# the default -- the Jenkins pipeline runs IUM_02.py explicitly via `sh`.
CMD ["bash"]

171
IUM_02.py Normal file
View File

@ -0,0 +1,171 @@
#!/usr/bin/env python
# coding: utf-8
# ### Downloading the dataset and packages
# In[1]:
from kaggle.api.kaggle_api_extended import KaggleApi
# Authenticate using KAGGLE_USERNAME / KAGGLE_KEY from the environment
# (supplied by the Dockerfile / Jenkins withEnv), then download and unzip
# the 1.5M-beer-reviews dataset into /app (produces /app/beer_reviews.csv).
api = KaggleApi()
api.authenticate()
api.dataset_download_files('thedevastator/1-5-million-beer-reviews-from-beer-advocate', path="/app", unzip=True)
# get_ipython().run_line_magic('pip', 'install kaggle')
# get_ipython().run_line_magic('pip', 'install pandas')
# get_ipython().run_line_magic('pip', 'install numpy')
# get_ipython().run_line_magic('pip', 'install scikit-learn')
# get_ipython().run_line_magic('pip', 'install seaborn')
#
#
# # In[3]:
#
#
# get_ipython().system('kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate')
#
#
# # In[ ]:
#
#
# get_ipython().system('kaggle datasets download -d')
#
#
# # In[ ]:
#
#
# get_ipython().system('unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip')
# In[43]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
pd.set_option('float_format', '{:f}'.format)

# ## Loading the data
# In[8]:
beers = pd.read_csv('beer_reviews.csv')
beers.head()

# In[9]:
beers.info()

# ### Cleaning
# In[49]:
# Drop every row missing any of the fields used downstream -- a single pass
# over the three columns is equivalent to three sequential dropna calls.
beers.dropna(subset=['brewery_name', 'review_profilename', 'beer_abv'], inplace=True)
beers.isna().sum()
# ### Normalization and train/dev/test split
# In[22]:
# Columns that get min-max scaled to [0, 1].
# NOTE(review): beer_beerid is an identifier -- scaling it is probably
# unintended, but it is kept for compatibility with the downstream stats.
scaled_cols = ['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']

# Split FIRST (80/10/10 with a fixed seed), THEN fit the scaler on the train
# partition only. The original fit MinMaxScaler on the full dataset before
# splitting, which leaks dev/test min/max statistics into the scaling.
beers_train, beers_dev_test = train_test_split(beers, test_size=0.2, random_state=1234)
beers_dev, beers_test = train_test_split(beers_dev_test, test_size=0.5, random_state=1234)

scaler = MinMaxScaler()
scaler.fit(beers_train[scaled_cols])

# Transform every frame with the train-fitted parameters; .loc avoids
# chained-assignment issues on the split copies.
beers.loc[:, scaled_cols] = scaler.transform(beers[scaled_cols])
for _subset in (beers_train, beers_dev, beers_test):
    _subset.loc[:, scaled_cols] = scaler.transform(_subset[scaled_cols])

# In[25]:
print(f"Liczba kolumn w każdym zbiorze: {beers.shape[1]} kolumn")
print(f"Całość: {beers.shape[0]} rekordów ")
print(f"Train: {beers_train.shape[0]} rekordów")
print(f"Dev: {beers_dev.shape[0]} rekordów")
print(f"Test: {beers_test.shape[0]} rekordów")
# ### Data overview
# In[51]:
print(f"Suma różnych piw: {beers['beer_name'].nunique()}")
print(f"Suma różnych styli: {beers['beer_style'].nunique()}")
print(f"Suma różnych browarów: {beers['brewery_name'].nunique()}")

# In[76]:
# Bar chart of the 15 most common beer styles.
top_styles = beers['beer_style'].value_counts().head(15)
plt.bar(top_styles.index, top_styles.values)
plt.xlabel('Styl')
plt.ylabel('Liczba piw')
plt.title('Ilość piw dla naliczniejszych styli')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# In[91]:
# Mean overall rating per beer, plus how many reviews each beer received,
# sorted by review count.
by_beer = beers.groupby('beer_name')['review_overall']
reviews = pd.DataFrame(by_beer.mean())
reviews['Liczba opini'] = pd.DataFrame(by_beer.count())
reviews = reviews.sort_values(by=['Liczba opini'], ascending=False)
reviews.head()
# In[32]:
# NOTE(review): in the exported .py these bare describe() expressions are
# evaluated and their results DISCARDED -- they only rendered output in the
# original notebook. Wrap them in print() if the summary stats should appear
# when this runs as a script (e.g. inside the Docker container).
# NOTE(review): DataFrame.map requires pandas >= 2.1 (formerly applymap) --
# TODO confirm the pinned pandas version.
beers[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.3f}")
# In[33]:
beers_train[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.1f}")
# In[34]:
beers_dev[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.1f}")
# In[35]:
beers_test[['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']].describe().map(lambda x: f"{x:0.1f}")
# In[ ]:

4
Jenkinsfile vendored
View File

@ -1,5 +1,5 @@
pipeline {
agent any
agent { dockerfile true }
parameters {
string(name: 'CUTOFF', defaultValue: '10000', description: 'Liczba wierszy do obcięcia ze zbioru danych')
@ -20,7 +20,7 @@ pipeline {
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
sh "bash ./kuggle_download.sh ${params.CUTOFF}"
sh "python3 IUM_02.py"
}
}
}

View File

@ -1,5 +1,9 @@
pipeline {
agent any
agent {
docker {
image 'adamwie123691/ium:1.1'
}
}
parameters {
buildSelector( defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')

38
Jenkinsfile-stats.old Normal file
View File

@ -0,0 +1,38 @@
// Archived Jenkins pipeline: copies dataset artifacts from the
// z-s464979-create-dataset job and records line-count statistics for them.
pipeline {
agent any
parameters {
// Which upstream build's artifacts to copy (defaults to the last successful one).
buildSelector( defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
}
stages {
stage('Clone Repository') {
steps {
git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979"
}
}
stage('Copy Artifact') {
steps {
withEnv(["BUILD_SELECTOR=${params.BUILD_SELECTOR}" ]) {
copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464979-create-dataset', selector: buildParameter('$BUILD_SELECTOR')}
}
}
stage('Execute Shell Script') {
steps {
script {
// Produces data/stats_*.txt from the copied artifacts.
sh "bash ./dataset_stats.sh"
}
}
}
stage('Archive Results') {
steps {
archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
}
}
}
// post {
// always {
// deleteDir()
// }
// }
}

38
Jenkinsfile.old Normal file
View File

@ -0,0 +1,38 @@
// Archived Jenkins pipeline: downloads the Kaggle dataset, trims it to CUTOFF
// rows, splits it, and archives the resulting data/ files.
pipeline {
agent any
parameters {
string(name: 'CUTOFF', defaultValue: '10000', description: 'Liczba wierszy do obcięcia ze zbioru danych')
string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
}
stages {
stage('Clone Repository') {
steps {
git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979"
}
}
stage('Download, Process, and Split Dataset') {
steps {
// Kaggle CLI reads the credentials from these environment variables.
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
// NOTE(review): "kuggle" looks like a typo for "kaggle" but presumably
// matches the actual script filename in the repo -- verify before renaming.
sh "bash ./kuggle_download.sh ${params.CUTOFF}"
}
}
}
stage('Archive Results') {
steps {
archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
}
}
}
// post {
// always {
// deleteDir()
// }
// }
}

21
dataset_stats.py Normal file
View File

@ -0,0 +1,21 @@
import os
import subprocess

# Dataset splits produced by the upstream pipeline job.
train_file = "data/train.csv"
dev_file = "data/dev.csv"
test_file = "data/test.csv"

# Ensure the output directory exists BEFORE any stats file is written.
# (The original called makedirs only AFTER the writes, which fails with
# FileNotFoundError on a clean checkout where data/ does not exist yet.)
os.makedirs("data", exist_ok=True)


def process_file(file_path, output_file):
    """Count the lines of `file_path` with `wc -l` and write wc's raw
    output (``"<count> <path>"``) to `output_file`.

    Note: errors (e.g. a missing input file) are not raised -- wc's empty
    stdout is written as-is, matching the original best-effort behavior.
    """
    result = subprocess.run(['wc', '-l', file_path], capture_output=True, text=True)
    with open(output_file, 'w') as f:
        f.write(result.stdout)


process_file(train_file, "data/stats_train.txt")
process_file(dev_file, "data/stats_dev.txt")
process_file(test_file, "data/stats_test.txt")
# The original ended with three os.rename(src, src) calls that renamed each
# stats file onto itself -- pure no-ops, removed.

14
jupyter_exporter.py Normal file
View File

@ -0,0 +1,14 @@
# Export the IUM_02 notebook to an executable Python script.
import nbformat
from nbconvert import PythonExporter

input_file = 'IUM_02.ipynb'
output_file = 'IUM_02.py'

# Parse the notebook (normalized to the v4 schema).
with open(input_file, 'r', encoding='utf-8') as fh:
    notebook = nbformat.read(fh, as_version=4)

# Render all cells into a single .py source string.
source, _meta = PythonExporter().from_notebook_node(notebook)

with open(output_file, 'w', encoding='utf-8') as fh:
    fh.write(source)

View File

@ -1,7 +1,8 @@
#!/bin/bash
# Download and unpack the beer-reviews dataset from Kaggle.
pip install kaggle
echo "$KAGGLE_USERNAME"
# SECURITY: the original echoed "$KAGGLE_KEY" here, leaking the API secret
# into CI build logs. Only confirm whether it is set, never print it.
[ -n "$KAGGLE_KEY" ] && echo "KAGGLE_KEY is set"
kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate
unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip
DATASET_FILE="beer_reviews.csv"