import zipfile
import torch
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchviz import make_dot
from sklearn import preprocessing

# api = KaggleApi()
# api.authenticate()
# api.dataset_download_file('apoorvaappz/global-super-store-dataset',
#                           file_name='Global_Superstore2.csv', path='./')
# with zipfile.ZipFile('Global_Superstore2.csv.zip', 'r') as zipref:
#     zipref.extractall('.')

data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')
data["Order Date"] = pd.to_datetime(data["Order Date"])
data = data.sort_values(by="Order Date")
# print(data)

# Accumulates sales per day (kept from the original; not used below --
# the groupby aggregation that follows supersedes it)
byMonthsYears = {}
for index, row in data.iterrows():
    # datee = datetime.datetime.strptime(row['Order Date'], "%d-%m-%Y")
    # byMonthsYears.setdefault(datee.strftime("%m-%Y"), 0)
    # byMonthsYears[datee.strftime("%m-%Y")] += row['Sales']
    byMonthsYears.setdefault(row['Order Date'].strftime("%d-%m-%Y"), 0)
    byMonthsYears[row['Order Date'].strftime("%d-%m-%Y")] += row['Sales']

# Aggregates per order date: number of orders and total sales value
df = (data.groupby('Order Date')
          .agg({'Customer Name': 'count', 'Sales': 'sum'})
          .reset_index()
          .rename(columns={'Sales': 'Sales sum', 'Customer Name': 'Sales count'}))

# Data normalization
flcols = df[['Sales count', 'Sales sum']].columns
x = df[['Sales count', 'Sales sum']].values
# min_max_scaler = preprocessing.MinMaxScaler()
max_abs_scaler = preprocessing.MaxAbsScaler()
# x_scaled = min_max_scaler.fit_transform(x)
x_scaled = max_abs_scaler.fit_transform(x)
normcols = pd.DataFrame(x_scaled, columns=flcols)
for col in flcols:
    df[col] = normcols[col]
df.to_csv('mms_norm.csv')
# exit()  # NOTE: this early exit stopped the script here and the training
#         # code below never ran; it is kept commented out so training runs

# fig, ax = plt.subplots()
# fig.set_figheight(15)
# fig.set_figwidth(20)
# ax.scatter(df['Month and Year'], df['Sum of sales'])
# plt.show()

# # Data Generation
# np.random.seed(42)
# x = np.random.rand(100, 1)
# y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# # Shuffles the indices
# idx = np.arange(100)
# np.random.shuffle(idx)

# # Uses first 80 random indices for train
# train_idx = idx[:80]
# # Uses the remaining indices for validation
# val_idx = idx[80:]

# # Generates train and validation sets
# x_train, y_train = x[train_idx], y[train_idx]
# x_val, y_val = x[val_idx], y[val_idx]

# x_tensor = torch.from_numpy(x_train).float()
# y_tensor = torch.from_numpy(y_train).float()

# nn.Linear(1, 1) expects a trailing feature dimension, so reshape to (N, 1)
x_tensor = torch.tensor(df['Sales sum'].values).float().reshape(-1, 1)
y_tensor = torch.tensor(df['Sales count'].values).float().reshape(-1, 1)

dataset = TensorDataset(x_tensor, y_tensor)

# torch.manual_seed(42)
# random_split requires the lengths to sum to len(dataset) exactly, so the
# validation length is derived from the remainder instead of rounding twice
n_train = int(len(dataset) * 0.8)
lengths = [n_train, len(dataset) - n_train]
train_dataset, val_dataset = random_split(dataset, lengths)
train_loader = DataLoader(dataset=train_dataset)
val_loader = DataLoader(dataset=val_dataset)


class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # Instead of our custom parameters, we use a Linear layer
        # with single input and single output
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        # Now it only takes a call to the layer to make predictions
        return self.linear(x)


model = LayerLinearRegression()
# Checks model's parameters
# print(model.state_dict())

lr = 1e-3
n_epochs = 100

loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)


def make_train_step(model, loss_fn, optimizer):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss (nn.MSELoss takes (input, target); the order does not
        # change the value for MSE, but this matches the documented signature)
        loss = loss_fn(yhat, y)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step


# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)
training_losses = []
validation_losses = []
print(model.state_dict())

# For each epoch...
for epoch in range(n_epochs):
    losses = []
    # Uses loader to fetch one mini-batch for training
    for x_batch, y_batch in train_loader:
        # NOW, sends the mini-batch data to the device
        # so it matches location of the MODEL
        # x_batch = x_batch.to(device)
        # y_batch = y_batch.to(device)
        # One step of training
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
    training_loss = np.mean(losses)
    training_losses.append(training_loss)

    # After finishing training steps for all mini-batches,
    # it is time for evaluation!
    # We tell PyTorch to NOT use autograd...
    # Do you remember why?
    with torch.no_grad():
        val_losses = []
        # Uses loader to fetch one mini-batch for validation
        for x_val, y_val in val_loader:
            # Again, sends data to same device as model
            # x_val = x_val.to(device)
            # y_val = y_val.to(device)
            # What is that?!
            model.eval()
            # Makes predictions
            yhat = model(x_val)
            # Computes validation loss
            val_loss = loss_fn(yhat, y_val)
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

# Checks model's parameters
print(model.state_dict())
print(np.mean(losses))
print(np.mean(val_losses))
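
# --- Optional: plot the loss curves ---
# A minimal sketch (not part of the original script) that reuses the
# matplotlib import above to compare the per-epoch training and validation
# losses collected during the loop; the file name 'losses.png' is arbitrary.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(training_losses, label='Training loss')
ax.plot(validation_losses, label='Validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE loss')
ax.legend()
fig.savefig('losses.png')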
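
# --- Optional: visualize the computation graph with torchviz ---
# A sketch using the make_dot import at the top of the file. make_dot builds
# a Graphviz digraph from a forward pass; rendering it to disk assumes the
# graphviz system package is installed, so the block is left commented out.
# dot = make_dot(model(x_tensor[:1]), params=dict(model.named_parameters()))
# dot.render('model_graph', format='png')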
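
# --- Optional: map a prediction back to the original scale ---
# A sketch under the assumption that flcols kept the column order
# ['Sales count', 'Sales sum']: MaxAbsScaler stores the per-column maximum
# absolute values in scale_, so multiplying a normalized prediction by
# scale_[0] recovers an approximate daily sales count in original units.
with torch.no_grad():
    model.eval()
    # Normalized prediction for the first day in the dataset
    sample_pred = model(x_tensor[:1]).item()
print('Predicted sales count (original scale):',
      sample_pred * max_abs_scaler.scale_[0])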