IUM_04

commit 0af0bea801 (parent ee6911def2)
Dockerfile (new file, 22 lines)
@@ -0,0 +1,22 @@
FROM ubuntu:latest

ENV KAGGLE_USERNAME=adamwieczrek
ARG KAGGLE_KEY
ENV KAGGLE_KEY=${KAGGLE_KEY}
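# Note: values passed via ARG/ENV persist in the image metadata (visible with
# `docker history`), so the Kaggle key is baked into the image; BuildKit
# secrets or a runtime environment variable would avoid that.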

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC

RUN apt update && \
    apt install -y python3 python3-pip unzip

RUN pip install kaggle pandas seaborn scikit-learn

WORKDIR /app

COPY dataset_stats.py /app/
COPY IUM_02.py /app/
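
# Only the last CMD in a Dockerfile takes effect, so an earlier
# CMD ["python3", "IUM_02.py"] would be shadowed; the Jenkins pipeline
# runs the script explicitly instead.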
CMD ["python3", "IUM_02.py"]
|
||||
|
||||
CMD bash
|

IUM_02.py (new file, 171 lines)
@@ -0,0 +1,171 @@
#!/usr/bin/env python
# coding: utf-8

# ### Downloading the dataset and packages

# In[1]:

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

api.dataset_download_files('thedevastator/1-5-million-beer-reviews-from-beer-advocate', path="/app", unzip=True)

# Leftover notebook magics from the original .ipynb, kept commented out:
# get_ipython().run_line_magic('pip', 'install kaggle')
# get_ipython().run_line_magic('pip', 'install pandas')
# get_ipython().run_line_magic('pip', 'install numpy')
# get_ipython().run_line_magic('pip', 'install scikit-learn')
# get_ipython().run_line_magic('pip', 'install seaborn')
# get_ipython().system('kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate')
# get_ipython().system('kaggle datasets download -d')
# get_ipython().system('unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip')

# In[43]:

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.float_format', '{:f}'.format)

# ## Loading the data

# In[8]:

beers = pd.read_csv('beer_reviews.csv')

beers.head()

# In[9]:

beers.info()

# ### Cleaning

# In[49]:

beers.dropna(subset=['brewery_name', 'review_profilename', 'beer_abv'], inplace=True)

beers.isnull().sum()

# ### Normalization

# In[22]:

scaler = MinMaxScaler()

num_cols = ['review_overall', 'review_aroma', 'review_appearance',
            'review_palate', 'review_taste', 'beer_abv', 'beer_beerid']
beers[num_cols] = scaler.fit_transform(beers[num_cols])
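# Note: the scaler is fit on the full dataset before the train/dev/test split
# below, so split statistics are not fully independent; fitting on the training
# set only would avoid this mild leakage.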

# ### Splitting into subsets

# In[24]:

beers_train, beers_dev_test = train_test_split(beers, test_size=0.2, random_state=1234)
beers_dev, beers_test = train_test_split(beers_dev_test, test_size=0.5, random_state=1234)
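# 80% train, 10% dev, 10% test: the held-out 20% is split in half.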

# In[25]:

print(f"Number of columns in each subset: {beers.shape[1]}")
print(f"Total: {beers.shape[0]} records")
print(f"Train: {beers_train.shape[0]} records")
print(f"Dev: {beers_dev.shape[0]} records")
print(f"Test: {beers_test.shape[0]} records")

# ### Data overview

# In[51]:

print(f"Number of distinct beers: {beers['beer_name'].nunique()}")
print(f"Number of distinct styles: {beers['beer_style'].nunique()}")
print(f"Number of distinct breweries: {beers['brewery_name'].nunique()}")

# In[76]:

style_counts = beers['beer_style'].value_counts()

top_15_styles = style_counts.head(15)

plt.bar(top_15_styles.index, top_15_styles.values)
plt.xlabel('Style')
plt.ylabel('Number of beers')
plt.title('Number of beers in the most common styles')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# In[91]:

reviews = pd.DataFrame(beers.groupby('beer_name')['review_overall'].mean())
reviews['Review count'] = pd.DataFrame(beers.groupby('beer_name')['review_overall'].count())
reviews = reviews.sort_values(by=['Review count'], ascending=False)
reviews.head()

# In[32]:

beers[num_cols].describe().map(lambda x: f"{x:0.3f}")

# In[33]:

beers_train[num_cols].describe().map(lambda x: f"{x:0.1f}")

# In[34]:

beers_dev[num_cols].describe().map(lambda x: f"{x:0.1f}")

# In[35]:

beers_test[num_cols].describe().map(lambda x: f"{x:0.1f}")

Jenkinsfile (vendored, 4 lines changed)
@@ -1,5 +1,5 @@
pipeline {
-    agent any
+    agent { dockerfile true }

    parameters {
        string(name: 'CUTOFF', defaultValue: '10000', description: 'Number of rows to truncate the dataset to')
@@ -20,7 +20,7 @@ pipeline {
                "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                "KAGGLE_KEY=${env.KAGGLE_KEY}"
            ]) {
-                sh "bash ./kuggle_download.sh ${params.CUTOFF}"
+                sh "python3 IUM_02.py"
            }
        }
    }

@@ -1,5 +1,9 @@
pipeline {
-    agent any
+    agent {
+        docker {
+            image 'adamwie123691/ium:1.1'
+        }
+    }

    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')

Jenkinsfile-stats.old (new file, 38 lines)
@@ -0,0 +1,38 @@
pipeline {
    agent any

    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979"
            }
        }
        stage('Copy Artifact') {
            steps {
                withEnv(["BUILD_SELECTOR=${params.BUILD_SELECTOR}"]) {
                    copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464979-create-dataset', selector: buildParameter('$BUILD_SELECTOR')
                }
            }
        }
        stage('Execute Shell Script') {
            steps {
                script {
                    sh "bash ./dataset_stats.sh"
                }
            }
        }
        stage('Archive Results') {
            steps {
                archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
            }
        }
    }
    // post {
    //     always {
    //         deleteDir()
    //     }
    // }
}

Jenkinsfile.old (new file, 38 lines)
@@ -0,0 +1,38 @@
pipeline {
    agent any

    parameters {
        string(name: 'CUTOFF', defaultValue: '10000', description: 'Number of rows to truncate the dataset to')
        string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979"
            }
        }

        stage('Download, Process, and Split Dataset') {
            steps {
                withEnv([
                    "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                    "KAGGLE_KEY=${env.KAGGLE_KEY}"
                ]) {
                    sh "bash ./kuggle_download.sh ${params.CUTOFF}"
                }
            }
        }
        stage('Archive Results') {
            steps {
                archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
            }
        }
    }
    // post {
    //     always {
    //         deleteDir()
    //     }
    // }
}

dataset_stats.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import os
import subprocess

# Make sure the output directory exists before any stats files are written.
os.makedirs("data", exist_ok=True)

train_file = "data/train.csv"
dev_file = "data/dev.csv"
test_file = "data/test.csv"

def process_file(file_path, output_file):
    # Count the rows of a split with `wc -l` and write the result to a stats file.
    result = subprocess.run(['wc', '-l', file_path], capture_output=True, text=True)
    with open(output_file, 'w') as f:
        f.write(result.stdout)

process_file(train_file, "data/stats_train.txt")
process_file(dev_file, "data/stats_dev.txt")
process_file(test_file, "data/stats_test.txt")

jupyter_exporter.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import nbformat
from nbconvert import PythonExporter

input_file = 'IUM_02.ipynb'
output_file = 'IUM_02.py'

# Read the notebook and convert it to a plain Python script.
with open(input_file, 'r', encoding='utf-8') as f:
    nb = nbformat.read(f, as_version=4)

exporter = PythonExporter()
source, meta = exporter.from_notebook_node(nb)

with open(output_file, 'w', encoding='utf-8') as f:
    f.write(source)
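
# The same conversion is available from the shell, assuming nbconvert is installed:
#     jupyter nbconvert --to script IUM_02.ipynb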

@@ -1,7 +1,8 @@
#!/bin/bash

# Download and unpack the dataset
pip install kaggle
echo "$KAGGLE_USERNAME"
echo "$KAGGLE_KEY"
kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate
unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip
DATASET_FILE="beer_reviews.csv"