Add Dockerfile and script
This commit is contained in:
parent
f4bb29e2c6
commit
cf61bc31ac
14
Dockerfile
Normal file
14
Dockerfile
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Image that runs stats.py on top of Ubuntu with the pinned Python dependencies.
#
# The base image is pinned instead of tracking `latest`: requirements.txt pins
# 2021-era package versions, so the image needs a matching Python/pip.
# NOTE(review): 20.04 is assumed to be the release `latest` pointed at when
# this file was written - confirm.
FROM ubuntu:20.04

# Refresh the package index and install pip in ONE layer. With two separate
# RUN steps a cached, stale `apt-get update` layer can be reused and the
# install then fails on packages that no longer exist in the index.
# DEBIAN_FRONTEND=noninteractive keeps the build from blocking on prompts
# (e.g. tzdata); the apt lists are removed to keep the layer small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the script so that this layer
# is only rebuilt when requirements.txt changes.
COPY requirements.txt ./
RUN pip3 install --user --no-cache-dir -r ./requirements.txt

WORKDIR /app

COPY ./stats.py ./

# NOTE(review): this bakes the Kaggle API credentials into an image layer.
# Anyone who can pull the image can read them - consider mounting the file
# or passing credentials at run time instead.
COPY /kaggle.json /root/.kaggle/

# Exec form: python3 runs as PID 1 and receives signals directly instead of
# going through an intermediate /bin/sh.
CMD ["python3", "stats.py"]
|
35
Zad1.py
Normal file
35
Zad1.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import kaggle
import numpy as np
import pandas as pd
from sklearn import preprocessing
|
||||||
|
|
||||||
|
|
||||||
|
# Download the COVID vaccination dataset from Kaggle, split it into
# train/validate/test subsets, print basic statistics and min-max normalize
# the float columns.

# Authenticate against the Kaggle API, then download and unzip the dataset
# into the current directory.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    "gpreda/covid-world-vaccination-progress", path=".", unzip=True
)

df = pd.read_csv('country_vaccinations.csv')

# Split the data into train/validate/test (6:2:2): shuffle all rows once and
# cut the shuffled frame at the 60% and 80% marks. Positional slicing yields
# the same pieces as np.split without depending on NumPy's handling of
# DataFrame objects.
shuffled = df.sample(frac=1)
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Print the number of elements in each data frame.
# NOTE: DataFrame.size is rows * columns, not the row count.
print("Whole set size".ljust(20), df.size)
print("Train set size: ".ljust(20), train.size)
print("Validate set size: ".ljust(20), validate.size)
print("Test set size: ".ljust(20), test.size)

# A bare expression is only echoed in a notebook; in a script the summary
# has to be printed explicitly or it is silently discarded.
print(df.describe(include='all'))

# Per-column frequency bar plot.
# NOTE(review): the plot is never saved or shown, and the second print emits
# the repr of the object returned by .plot(), not the counts - this looks
# like a notebook leftover; confirm the intended output.
for col in df.columns:
    axes = df[col].value_counts().plot(kind="bar", figsize=(30, 10))
    print("\n", col)
    print(axes)

# Normalize the numeric values (float64 columns only) to the [0, 1] range.
numeric_columns = df.select_dtypes(include='float64').columns
numeric_values = df[numeric_columns].values
min_max_scaler = preprocessing.MinMaxScaler()
# The original passed an undefined name `values` here (NameError).
x_scaled = min_max_scaler.fit_transform(numeric_values)

# Replace the raw numeric columns with their normalized counterparts in the
# original data frame; x_scaled has the same row and column order.
df[numeric_columns] = x_scaled

# Drop rows that contain NaN fields. dropna() returns a new frame, so the
# result must be assigned back for the removal to take effect.
df = df.dropna()
|
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
kaggle==1.5.12
matplotlib==3.4.1
numpy==1.20.2
pandas==1.2.3
# "sklearn" on PyPI is a deprecated placeholder that installs an unpinned
# scikit-learn; depend on the real distribution with an explicit version.
# NOTE(review): 0.24.1 chosen to match the other pins' release period - confirm.
scikit-learn==0.24.1
|
35
stats.py
Normal file
35
stats.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import zipfile
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn import preprocessing
|
||||||
|
|
||||||
|
# Unpack the COVID vaccination dataset archive, split the data into
# train/validate/test subsets, print basic statistics and min-max normalize
# the float columns.

# Extract the dataset archive into the current directory.
with zipfile.ZipFile('covid-world-vaccination-progress.zip', 'r') as zip_ref:
    zip_ref.extractall(".")

df = pd.read_csv('country_vaccinations.csv')

# Split the data into train/validate/test (6:2:2): shuffle all rows once and
# cut the shuffled frame at the 60% and 80% marks. Positional slicing yields
# the same pieces as np.split without depending on NumPy's handling of
# DataFrame objects.
shuffled = df.sample(frac=1)
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Print the number of elements in each data frame.
# NOTE: DataFrame.size is rows * columns, not the row count.
print("Whole set size".ljust(20), df.size)
print("Train set size: ".ljust(20), train.size)
print("Validate set size: ".ljust(20), validate.size)
print("Test set size: ".ljust(20), test.size)

# A bare expression is only echoed in a notebook; in a script the summary
# has to be printed explicitly or it is silently discarded.
print(df.describe(include='all'))

# Per-column frequency bar plot.
# NOTE(review): the plot is never saved or shown, and the second print emits
# the repr of the object returned by .plot(), not the counts - this looks
# like a notebook leftover; confirm the intended output.
for col in df.columns:
    axes = df[col].value_counts().plot(kind="bar", figsize=(30, 10))
    print("\n", col)
    print(axes)

# Normalize the numeric values (float64 columns only) to the [0, 1] range.
numeric_columns = df.select_dtypes(include='float64').columns
numeric_values = df[numeric_columns].values
min_max_scaler = preprocessing.MinMaxScaler()
# The original passed an undefined name `values` here (NameError).
x_scaled = min_max_scaler.fit_transform(numeric_values)

# Replace the raw numeric columns with their normalized counterparts in the
# original data frame; x_scaled has the same row and column order.
df[numeric_columns] = x_scaled

# Drop rows that contain NaN fields. dropna() returns a new frame, so the
# result must be assigned back for the removal to take effect.
df = df.dropna()
|
Loading…
Reference in New Issue
Block a user