diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c140625
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:latest
+
+COPY requirements.txt ./
+
+RUN apt-get update
+RUN apt-get install -y python3-pip
+RUN pip3 install --user -r ./requirements.txt
+
+WORKDIR /app
+
+COPY ./stats.py ./
+COPY /kaggle.json /root/.kaggle/
+
+CMD python3 stats.py
\ No newline at end of file
diff --git a/Zad1.py b/Zad1.py
new file mode 100644
index 0000000..d30860b
--- /dev/null
+++ b/Zad1.py
@@ -0,0 +1,37 @@
+import kaggle
+import numpy as np
+import pandas as pd
+from sklearn import preprocessing
+
+
+kaggle.api.authenticate()
+kaggle.api.dataset_download_files("gpreda/covid-world-vaccination-progress", path=".", unzip=True)
+
+df = pd.read_csv('country_vaccinations.csv')
+# split the data into train/validate/test sets (6:2:2) using numpy and pandas
+train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
+
+# Print the number of elements in each data frame
+print("Whole set size: ".ljust(20), df.size)
+print("Train set size: ".ljust(20), train.size)
+print("Validate set size: ".ljust(20), validate.size)
+print("Test set size: ".ljust(20), test.size)
+
+print(df.describe(include='all'))
+
+for col in df.columns:
+    column = df[col].value_counts()
+    column.plot(kind="bar", figsize=(30, 10))
+    print("\n", col)
+    print(column)
+
+# normalize the numeric values
+numeric_values = df.select_dtypes(include='float64').values  # numeric columns only
+min_max_scaler = preprocessing.MinMaxScaler()
+x_scaled = min_max_scaler.fit_transform(numeric_values)
+numeric_columns = df.select_dtypes(include='float64').columns
+df_normalized = pd.DataFrame(x_scaled, columns=numeric_columns)
+for col in df.columns:  # swap the raw numeric columns for their normalized counterparts in the original data frame
+    if col in numeric_columns: df[col] = df_normalized[col]
+
+df = df.dropna()  # drop rows that contain NaN fields
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1feaa26
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+kaggle==1.5.12
+matplotlib==3.4.1
+numpy==1.20.2
+pandas==1.2.3
+sklearn==0.0
\ No newline at end of file
diff --git a/stats.py b/stats.py
new file mode 100644
index 0000000..f5c8d6c
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,36 @@
+import zipfile
+import numpy as np
+import pandas as pd
+from sklearn import preprocessing
+
+with zipfile.ZipFile('covid-world-vaccination-progress.zip', 'r') as zip_ref:
+    zip_ref.extractall(".")
+
+df = pd.read_csv('country_vaccinations.csv')
+# split the data into train/validate/test sets (6:2:2) using numpy and pandas
+train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
+
+# Print the number of elements in each data frame
+print("Whole set size: ".ljust(20), df.size)
+print("Train set size: ".ljust(20), train.size)
+print("Validate set size: ".ljust(20), validate.size)
+print("Test set size: ".ljust(20), test.size)
+
+print(df.describe(include='all'))
+
+for col in df.columns:
+    column = df[col].value_counts()
+    column.plot(kind="bar", figsize=(30, 10))
+    print("\n", col)
+    print(column)
+
+# normalize the numeric values
+numeric_values = df.select_dtypes(include='float64').values  # numeric columns only
+min_max_scaler = preprocessing.MinMaxScaler()
+x_scaled = min_max_scaler.fit_transform(numeric_values)
+numeric_columns = df.select_dtypes(include='float64').columns
+df_normalized = pd.DataFrame(x_scaled, columns=numeric_columns)
+for col in df.columns:  # swap the raw numeric columns for their normalized counterparts in the original data frame
+    if col in numeric_columns: df[col] = df_normalized[col]
+
+df = df.dropna()  # drop rows that contain NaN fields
\ No newline at end of file