import zipfile
import torch
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
import numpy as np

from kaggle.api.kaggle_api_extended import KaggleApi
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchviz import make_dot
from sklearn import preprocessing

# Download and unpack the dataset from Kaggle (only needed once):
# api = KaggleApi()
# api.authenticate()
# api.dataset_download_file('apoorvaappz/global-super-store-dataset',
#                           file_name='Global_Superstore2.csv', path='./')
#
# with zipfile.ZipFile('Global_Superstore2.csv.zip', 'r') as zipref:
#     zipref.extractall('.')

data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')

# Dates in this file are day-first ("%d-%m-%Y"); without an explicit format,
# pandas may parse them month-first and silently misorder the series.
data["Order Date"] = pd.to_datetime(data["Order Date"], format="%d-%m-%Y")
data = data.sort_values(by="Order Date")

# print(data)

# Daily sales totals keyed by "dd-mm-yyyy". Note: despite the name, this
# aggregates per day, not per month, and the dict is never read again --
# the groupby below does the actual aggregation.
byMonthsYears = {}
for index, row in data.iterrows():
    # datee = datetime.datetime.strptime(row['Order Date'], "%d-%m-%Y")
    # byMonthsYears.setdefault(datee.strftime("%m-%Y"), 0)
    # byMonthsYears[datee.strftime("%m-%Y")] += row['Sales']
    byMonthsYears.setdefault(row['Order Date'].strftime("%d-%m-%Y"), 0)
    byMonthsYears[row['Order Date'].strftime("%d-%m-%Y")] += row['Sales']

df = (data.groupby('Order Date')
          .agg({'Customer Name': 'count', 'Sales': 'sum'})
          .reset_index()
          .rename(columns={'Sales': 'Sales sum', 'Customer Name': 'Sales count'}))

# Data normalization
flcols = df[['Sales count', 'Sales sum']].columns
x = df[['Sales count', 'Sales sum']].values

# min_max_scaler = preprocessing.MinMaxScaler()
max_abs_scaler = preprocessing.MaxAbsScaler()
# x_scaled = min_max_scaler.fit_transform(x)
x_scaled = max_abs_scaler.fit_transform(x)

normcols = pd.DataFrame(x_scaled, columns=flcols)
for col in flcols:
    df[col] = normcols[col]

df.to_csv('mms_norm.csv')
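
# Optional sanity check, a minimal sketch (not part of the original pipeline):
# MaxAbsScaler only divides each column by its maximum absolute value, so
# inverse_transform should recover the raw values.
# x_restored = max_abs_scaler.inverse_transform(x_scaled)
# assert np.allclose(x_restored, x)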

# exit()  # An unconditional exit() here would stop the script before any of
#         # the training code below ever runs, so it is left commented out.

# fig, ax = plt.subplots()
# fig.set_figheight(15)
# fig.set_figwidth(20)
# ax.scatter(df['Month and Year'], df['Sum of sales'])
# plt.show()

# # Data Generation
# np.random.seed(42)
# x = np.random.rand(100, 1)
# y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# # Shuffles the indices
# idx = np.arange(100)
# np.random.shuffle(idx)

# # Uses first 80 random indices for train
# train_idx = idx[:80]
# # Uses the remaining indices for validation
# val_idx = idx[80:]

# # Generates train and validation sets
# x_train, y_train = x[train_idx], y[train_idx]
# x_val, y_val = x[val_idx], y[val_idx]
# x_tensor = torch.from_numpy(x_train).float()
# y_tensor = torch.from_numpy(y_train).float()

# Reshape to (N, 1) so each sample is a vector with one feature,
# matching what nn.Linear(1, 1) expects.
x_tensor = torch.tensor(df['Sales sum'].values).float().unsqueeze(1)
y_tensor = torch.tensor(df['Sales count'].values).float().unsqueeze(1)

dataset = TensorDataset(x_tensor, y_tensor)

# torch.manual_seed(42)
# Truncating both lengths with int() can drop a sample and make random_split
# fail (the lengths must sum to len(dataset)), so derive the second length
# from the first.
n_train = int(len(dataset) * 0.8)
train_dataset, val_dataset = random_split(dataset, [n_train, len(dataset) - n_train])

train_loader = DataLoader(dataset=train_dataset)
val_loader = DataLoader(dataset=val_dataset)
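
# Left as-is, DataLoader defaults to batch_size=1 with no shuffling, so each
# "mini-batch" below is a single day. A sketch of a more conventional setup
# (the batch size of 32 is an arbitrary choice, not from the original):
# train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
# val_loader = DataLoader(dataset=val_dataset, batch_size=32)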


class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # Instead of custom parameters, we use a Linear layer
        # with a single input and a single output
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        # Now it only takes a call to the layer to make predictions
        return self.linear(x)


model = LayerLinearRegression()
# Checks the model's parameters
# print(model.state_dict())
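
# torchviz is imported above but never used. A minimal sketch of how it could
# render this model's computation graph (needs the graphviz binaries installed,
# so it is left commented out):
# dummy_out = model(torch.zeros(1, 1))
# make_dot(dummy_out, params=dict(model.named_parameters())).render("model_graph", format="png")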

lr = 1e-3
n_epochs = 100

loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

def make_train_step(model, loss_fn, optimizer):
    # Builds a function that performs a single step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss -- nn.MSELoss takes (input, target), i.e. (yhat, y)
        loss = loss_fn(yhat, y)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step


# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)

training_losses = []
validation_losses = []
print(model.state_dict())

# For each epoch...
for epoch in range(n_epochs):
    losses = []
    # Uses the loader to fetch one mini-batch at a time for training
    for x_batch, y_batch in train_loader:
        # On a GPU, the mini-batch would be sent to the same device
        # as the model:
        # x_batch = x_batch.to(device)
        # y_batch = y_batch.to(device)

        # One step of training
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
    training_loss = np.mean(losses)
    training_losses.append(training_loss)

    # After finishing the training steps for all mini-batches,
    # it is time for evaluation!

    # Autograd is disabled here: evaluation needs no gradients, and skipping
    # their bookkeeping saves time and memory.
    with torch.no_grad():
        val_losses = []
        # Uses the loader to fetch one mini-batch at a time for validation
        for x_val, y_val in val_loader:
            # Again, the data would go to the same device as the model
            # x_val = x_val.to(device)
            # y_val = y_val.to(device)

            # Sets model to EVAL mode (matters for layers such as dropout
            # or batch norm that behave differently during training)
            model.eval()
            # Makes predictions
            yhat = model(x_val)
            # Computes validation loss
            val_loss = loss_fn(yhat, y_val)
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

# Checks the model's parameters after training
print(model.state_dict())
print(np.mean(losses))
print(np.mean(val_losses))
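
# matplotlib is already imported, so a small sketch of plotting the per-epoch
# losses collected above (axis labels are assumptions, not from the original):
plt.plot(training_losses, label='Training loss')
plt.plot(validation_losses, label='Validation loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
plt.show()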