Fixed the issue where the dataset downloaded from the direct link differed from the one loaded via Hugging Face `load_dataset`

This commit is contained in:
Maciej Tyczynski 2023-06-08 20:20:28 +02:00
parent 2105ea32ef
commit 392bed7268
2 changed files with 412 additions and 572 deletions

File diff suppressed because one or more lines are too long

196
zad1.py
View File

@ -1,171 +1,61 @@
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import pandas as pd import pandas as pd
import sklearn.model_selection import sklearn.model_selection
from datasets import load_dataset from datasets import load_dataset
import mlflow
import mlflow.sklearn
# In[3]:
dataset = load_dataset("mstz/wine", "wine")
# In[4]:
dataset["train"]
# In[5]:
wine_dataset = pd.DataFrame(dataset["train"])
# In[6]:
wine_dataset.head()# podgląd danych
# In[7]:
wine_dataset.describe(include='all')
# In[8]:
wine_dataset["is_red"].value_counts().plot(kind="bar")
# In[9]:
wine_dataset["fixed_acidity"].std()
# In[10]:
import numpy as np import numpy as np
np.where(pd.isnull(wine_dataset))## sprawdzanie czy istnieją puste wartości import logging
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# logging.basicConfig(level=logging.WARN)
# logger = logging.getLogger(__name__)
# In[11]: # mlflow.set_tracking_uri("http://localhost:5000")
# mlflow.set_experiment("s123456")
# def eval_metrics(actual, pred):
# rmse = np.sqrt(mean_squared_error(actual, pred))
# mae = mean_absolute_error(actual, pred)
# r2 = r2_score(actual, pred)
# return rmse, mae, r2
import requests
url = "https://huggingface.co/datasets/mstz/wine/raw/main/Wine_Quality_Data.csv"
save_path = "Wine_Quality_Data.csv"
response = requests.get(url)
response.raise_for_status()
with open(save_path, "wb") as f:
f.write(response.content)
wine_dataset = pd.read_csv("Wine_Quality_Data.csv")
wine_dataset['color'] = wine_dataset['color'].replace({'red': 1, 'white': 0})
for column in wine_dataset.columns: for column in wine_dataset.columns:
wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max() # normalizacja wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max() # normalizacja
# In[12]:
wine_dataset.describe(include='all') # sprawdzanie wartości po znormalizowaniu
# In[13]:
wine_dataset["fixed_acidity"].nlargest(10) #sprawdza czy najwyższe wartości mają sens
# In[14]:
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
wine_train, wine_test = sklearn.model_selection.train_test_split(wine_dataset, test_size=0.1, random_state=1, stratify=wine_dataset["is_red"]) wine_train, wine_test = sklearn.model_selection.train_test_split(wine_dataset, test_size=0.1, random_state=1, stratify=wine_dataset["color"])
wine_train["is_red"].value_counts() wine_train["color"].value_counts()
# podzielenie na train i test # podzielenie na train i test
wine_test["color"].value_counts()
# In[15]:
wine_test["is_red"].value_counts() wine_test, wine_val = sklearn.model_selection.train_test_split(wine_test, test_size=0.5, random_state=1, stratify=wine_test["color"]) # podzielenie na test i validation
wine_test["color"].value_counts()
# In[16]: wine_val["color"].value_counts()
wine_test, wine_val = sklearn.model_selection.train_test_split(wine_test, test_size=0.5, random_state=1, stratify=wine_test["is_red"]) # podzielenie na test i validation
# In[17]:
wine_test["is_red"].value_counts()
# In[18]:
wine_val["is_red"].value_counts()
# In[19]:
import seaborn as sns import seaborn as sns
sns.set_theme() sns.set_theme()
# In[20]:
len(wine_dataset.columns)
# In[ ]:
# In[21]:
#sns.pairplot(data=wine_dataset, hue="is_red")
# In[22]:
wine_test.describe()
# In[23]:
wine_train.describe()
# In[24]:
wine_val.describe()
# In[25]:
import torch import torch
from torch import nn from torch import nn
from torch.utils.data import DataLoader, Dataset from torch.utils.data import DataLoader, Dataset
# In[26]:
class TabularDataset(Dataset): class TabularDataset(Dataset):
def __init__(self, data): def __init__(self, data):
self.data = data.values.astype('float32') self.data = data.values.astype('float32')
@ -179,9 +69,6 @@ class TabularDataset(Dataset):
return len(self.data) return len(self.data)
# In[27]:
batch_size = 64 batch_size = 64
train_dataset = TabularDataset(wine_train) train_dataset = TabularDataset(wine_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
@ -189,15 +76,6 @@ test_dataset = TabularDataset(wine_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# In[ ]:
# In[28]:
class TabularModel(nn.Module): class TabularModel(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim): def __init__(self, input_dim, hidden_dim, output_dim):
super(TabularModel, self).__init__() super(TabularModel, self).__init__()
@ -213,10 +91,6 @@ class TabularModel(nn.Module):
out = self.softmax(out) out = self.softmax(out)
return out return out
# In[29]:
input_dim = wine_train.shape[1] - 1 input_dim = wine_train.shape[1] - 1
hidden_dim = 32 hidden_dim = 32
output_dim = 2 output_dim = 2
@ -225,17 +99,10 @@ criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters()) optimizer = torch.optim.Adam(model.parameters())
# In[30]:
model = TabularModel(input_dim=len(wine_train.columns)-1, hidden_dim=32, output_dim=2) model = TabularModel(input_dim=len(wine_train.columns)-1, hidden_dim=32, output_dim=2)
criterion = nn.CrossEntropyLoss() criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# In[31]:
num_epochs = 10 num_epochs = 10
for epoch in range(num_epochs): for epoch in range(num_epochs):
running_loss = 0.0 running_loss = 0.0
@ -256,9 +123,6 @@ for epoch in range(num_epochs):
print('Finished Training') print('Finished Training')
# In[32]:
correct = 0 correct = 0
total = 0 total = 0
with torch.no_grad(): with torch.no_grad():