import zipfile
import torch
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchviz import make_dot
from sklearn import preprocessing
# Download the Global Superstore CSV from Kaggle, unpack it and load it.
# The Kaggle client must be authenticated before any download call.
api = KaggleApi()
api.authenticate()
# NOTE(review): the dataset slug was missing from the original (truncated)
# call; 'apoorvaappz/global-super-store-dataset' is presumed - confirm.
api.dataset_download_file(
    'apoorvaappz/global-super-store-dataset',
    file_name='Global_Superstore2.csv',
    path='./',
)
# The file arrives as a zip archive; extract it next to the script so that
# read_csv below can find the plain CSV.
with zipfile.ZipFile('Global_Superstore2.csv.zip', 'r') as zipref:
    zipref.extractall('./')
data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')
# Parse the order dates and put the rows in chronological order.
# NOTE(review): the day-first "%d-%m-%Y" format is taken from the earlier
# strptime-based version of this code; confirm it against the CSV. Without an
# explicit format pandas may read ambiguous dates as month-first.
data["Order Date"] = pd.to_datetime(data["Order Date"], format="%d-%m-%Y")
data = data.sort_values(by="Order Date")

# Total sales per calendar day, keyed by "dd-mm-YYYY" (despite the variable
# name, the key has daily resolution). sort=False keeps the chronological
# first-appearance order that the row-by-row accumulation used to produce.
day_keys = data["Order Date"].dt.strftime("%d-%m-%Y")
byMonthsYears = data.groupby(day_keys, sort=False)["Sales"].sum().to_dict()

# One row per order date: the number of order lines and the summed sales.
df = (
    data.groupby('Order Date')
    .agg({'Customer Name': 'count', 'Sales': 'sum'})
    .reset_index()
    .rename(columns={'Sales': 'Sales sum', 'Customer Name': 'Sales count'})
)
# Data normalisation: divide each feature column by its maximum absolute
# value so both columns end up on a comparable scale.
flcols = df[['Sales count', 'Sales sum']].columns
x = df[['Sales count', 'Sales sum']].values
max_abs_scaler = preprocessing.MaxAbsScaler()
x_scaled = max_abs_scaler.fit_transform(x)
normcols = pd.DataFrame(x_scaled, columns=flcols)
for col in flcols:
    df[col] = normcols[col]

# Model input is the (scaled) daily sales sum, target is the daily sales count.
x_tensor = torch.tensor(df['Sales sum'].values).float()
y_tensor = torch.tensor(df['Sales count'].values).float()
dataset = TensorDataset(x_tensor, y_tensor)

# 80/20 train/validation split. The validation size is the remainder rather
# than int(n * 0.2): truncating both parts independently can leave the two
# lengths one short of len(dataset), which random_split rejects.
train_len = int(len(dataset) * 0.8)
lengths = [train_len, len(dataset) - train_len]
train_dataset, val_dataset = random_split(dataset, lengths)

# Default DataLoader settings are used (one sample per batch, no shuffling).
train_loader = DataLoader(dataset=train_dataset)
val_loader = DataLoader(dataset=val_dataset)
class LayerLinearRegression(nn.Module):
    """Linear regression with one input feature and one output value."""

    def __init__(self):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call, setting self.linear raises an AttributeError.
        super().__init__()
        # Instead of custom parameters, use a Linear layer with a single
        # input and a single output (one weight and one bias).
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        """Return the layer's prediction for ``x`` (last dimension of size 1)."""
        return self.linear(x)
# Optimisation hyper-parameters: number of passes over the training data and
# the SGD learning rate.
n_epochs = 100
lr = 1e-3

# Single-feature linear model, optimised with SGD.
model = LayerLinearRegression()
optimizer = optim.SGD(model.parameters(), lr=lr)

# Mean squared error, averaged over the elements of each batch.
loss_fn = nn.MSELoss(reduction='mean')
def make_train_step(model, loss_fn, optimizer):
    """Build a function that performs one optimisation step.

    Args:
        model: Module whose parameters are being optimised.
        loss_fn: Callable taking ``(prediction, target)`` and returning a
            scalar loss tensor.
        optimizer: Optimizer constructed over ``model``'s parameters.

    Returns:
        A ``train_step(x, y)`` function that runs a forward pass,
        back-propagates, updates the parameters and returns the batch loss
        as a Python float.
    """

    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss; PyTorch losses take (input, target) in that order
        loss = loss_fn(yhat, y)
        # Computes gradients
        loss.backward()
        # Updates parameters and clears gradients so that they do not
        # accumulate into the next step
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss as a plain number
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step
# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)

# Per-epoch mean losses, kept so the learning curves can be inspected later.
training_losses = []
validation_losses = []

for epoch in range(n_epochs):
    # ---- training: one step per mini-batch ----
    losses = []
    for x_batch, y_batch in train_loader:
        loss = train_step(x_batch, y_batch)
        # Record the batch loss; without this the epoch mean is NaN.
        losses.append(loss)
    training_loss = np.mean(losses)
    training_losses.append(training_loss)

    # ---- validation ----
    # Evaluation only needs forward passes, so the model is put in eval mode
    # and autograd is switched off to avoid building a graph.
    model.eval()
    with torch.no_grad():
        val_losses = []
        for x_val, y_val in val_loader:
            # Makes predictions
            yhat = model(x_val)
            # Computes validation loss, in (input, target) order
            val_loss = loss_fn(yhat, y_val)
            val_losses.append(val_loss.item())
    validation_loss = np.mean(val_losses)
    validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

# Final report: mean batch losses of the last epoch.
print("Mean squared error for training: ", np.mean(losses))
print("Mean squared error for valid: ", np.mean(val_losses))