diff --git a/Dockerfile b/Dockerfile
index 66fc556..5fa0bc5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,12 @@
 FROM ubuntu:latest
 RUN apt update && apt-get install -y python3-pip
+WORKDIR /app
 RUN pip install pandas
 RUN pip install scikit-learn
 RUN pip install datasets
 RUN pip install torch
-RUN pip install seaborn
\ No newline at end of file
+RUN pip install seaborn
+COPY ./zad1.py ./
+CMD ["python3", "zad1.py"]
\ No newline at end of file
diff --git a/zad1.ipynb b/zad1.ipynb
index 94cebc7..d89f3d6 100644
--- a/zad1.ipynb
+++ b/zad1.ipynb
@@ -1727,7 +1727,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1742,8 +1742,7 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.11.2"
-  },
-  "orig_nbformat": 4
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2
diff --git a/zad1.py b/zad1.py
new file mode 100644
index 0000000..ac14eb8
--- /dev/null
+++ b/zad1.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[2]:
+
+
+import pandas as pd
+import sklearn.model_selection
+from datasets import load_dataset
+
+
+# In[3]:
+
+
+dataset = load_dataset("mstz/wine", "wine")
+
+
+# In[4]:
+
+
+dataset["train"]
+
+
+# In[5]:
+
+
+wine_dataset = pd.DataFrame(dataset["train"])
+
+
+# In[6]:
+
+
+wine_dataset.head()  # data preview
+
+
+# In[7]:
+
+
+wine_dataset.describe(include='all')
+
+
+# In[8]:
+
+
+wine_dataset["is_red"].value_counts().plot(kind="bar")
+
+
+
+
+# In[9]:
+
+
+wine_dataset["fixed_acidity"].std()
+
+
+# In[10]:
+
+
+import numpy as np
+np.where(pd.isnull(wine_dataset))  # check whether any values are missing
+
+
+# In[11]:
+
+
+for column in wine_dataset.columns:
+    wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max()  # normalization
+
+
+# In[12]:
+
+
+wine_dataset.describe(include='all')  # inspect the values after normalization
+
+
+# In[13]:
+
+
+wine_dataset["fixed_acidity"].nlargest(10)  # check that the largest values make sense
+
+
+# In[14]:
+
+
+from sklearn.model_selection import train_test_split
+wine_train, wine_test = sklearn.model_selection.train_test_split(wine_dataset, test_size=0.1, random_state=1, stratify=wine_dataset["is_red"])
+wine_train["is_red"].value_counts()
+# split into train and test
+
+
+# In[15]:
+
+
+wine_test["is_red"].value_counts()
+
+
+# In[16]:
+
+
+wine_test, wine_val = sklearn.model_selection.train_test_split(wine_test, test_size=0.5, random_state=1, stratify=wine_test["is_red"])  # split into test and validation
+
+
+# In[17]:
+
+
+wine_test["is_red"].value_counts()
+
+
+# In[18]:
+
+
+wine_val["is_red"].value_counts()
+
+
+# In[19]:
+
+
+import seaborn as sns
+sns.set_theme()
+
+
+# In[20]:
+
+
+len(wine_dataset.columns)
+
+
+# In[ ]:
+
+
+
+
+
+# In[21]:
+
+
+#sns.pairplot(data=wine_dataset, hue="is_red")
+
+
+# In[22]:
+
+
+wine_test.describe()
+
+
+# In[23]:
+
+
+wine_train.describe()
+
+
+# In[24]:
+
+
+wine_val.describe()
+
+
+# In[25]:
+
+
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+
+
+# In[26]:
+
+
+class TabularDataset(Dataset):
+    """Serve DataFrame rows as (features, label) tensors; the last column is the label."""
+
+    def __init__(self, data):
+        self.data = data.values.astype('float32')
+
+    def __getitem__(self, index):
+        x = torch.tensor(self.data[index, :-1])
+        y = torch.tensor(self.data[index, -1])
+        return x, y
+
+    def __len__(self):
+        return len(self.data)
+
+
+# In[27]:
+
+
+batch_size = 64
+train_dataset = TabularDataset(wine_train)
+train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+test_dataset = TabularDataset(wine_test)
+test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+
+# In[ ]:
+
+
+
+
+
+# In[28]:
+
+
+class TabularModel(nn.Module):
+    """Two-layer perceptron (Linear -> ReLU -> Linear) that outputs raw class logits."""
+
+    def __init__(self, input_dim, hidden_dim, output_dim):
+        super(TabularModel, self).__init__()
+        self.fc1 = nn.Linear(input_dim, hidden_dim)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(hidden_dim, output_dim)
+        # Kept so callers can turn logits into probabilities after training;
+        # it is deliberately not applied in forward().
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        """Return the unnormalized class logits for a batch of feature rows."""
+        out = self.fc1(x)
+        out = self.relu(out)
+        # nn.CrossEntropyLoss applies log-softmax itself, so applying softmax
+        # here would normalize twice and flatten the gradients.
+        return self.fc2(out)
+
+
+# In[29]:
+
+
+input_dim = wine_train.shape[1] - 1
+hidden_dim = 32
+output_dim = 2
+model = TabularModel(input_dim, hidden_dim, output_dim)
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(model.parameters())
+
+
+# In[30]:
+
+
+model = TabularModel(input_dim=len(wine_train.columns)-1, hidden_dim=32, output_dim=2)
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+
+# In[31]:
+
+
+num_epochs = 10
+for epoch in range(num_epochs):
+    running_loss = 0.0
+    for i, data in enumerate(train_dataloader, 0):
+        inputs, labels = data
+        labels = labels.type(torch.LongTensor)
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+        running_loss += loss.item()
+
+    # Report the mean batch loss on every second epoch
+    if (epoch%2) == 0:
+        print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')
+
+print('Finished Training')
+
+
+# In[32]:
+
+
+correct = 0
+total = 0
+with torch.no_grad():
+    for data in test_dataloader:
+        inputs, labels = data
+        outputs = model(inputs.float())
+        _, predicted = torch.max(outputs, 1)
+        total += labels.size(0)
+        correct += (predicted == labels.long()).sum().item()
+print('Accuracy on test set: %d %%' % (100 * correct / total))
+