"""Download the Global Superstore dataset from Kaggle, aggregate sales per
order date, max-abs-normalize the two features, and persist an 80/20
train/validation split as serialized PyTorch datasets."""
import zipfile
import torch
import pandas as pd
import datetime
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
import torch.nn as nn
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, TensorDataset
from sklearn import preprocessing

# Fetch and unpack the raw CSV (requires Kaggle API credentials).
api = KaggleApi()
api.authenticate()
api.dataset_download_file('apoorvaappz/global-super-store-dataset',
                          file_name='Global_Superstore2.csv', path='./')
with zipfile.ZipFile('Global_Superstore2.csv.zip', 'r') as zipref:
    zipref.extractall('.')

data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')
# The CSV stores dates day-first ("dd-mm-yyyy"); without an explicit format
# pandas silently parses ambiguous dates (day <= 12) month-first, which
# would corrupt both the chronological sort and the per-day grouping.
data["Order Date"] = pd.to_datetime(data["Order Date"], format="%d-%m-%Y")
data = data.sort_values(by="Order Date")

# Per-day order count and total sales.
df = (data.groupby('Order Date')
          .agg({'Customer Name': 'count', 'Sales': 'sum'})
          .reset_index()
          .rename(columns={'Sales': 'Sales sum', 'Customer Name': 'Sales count'}))

# Scale each feature into [-1, 1] by its maximum absolute value.
flcols = df[['Sales count', 'Sales sum']].columns
x = df[['Sales count', 'Sales sum']].values
max_abs_scaler = preprocessing.MaxAbsScaler()
x_scaled = max_abs_scaler.fit_transform(x)
df[flcols] = pd.DataFrame(x_scaled, columns=flcols)

x_tensor = torch.tensor(df['Sales sum'].values).float()
y_tensor = torch.tensor(df['Sales count'].values).float()
dataset = TensorDataset(x_tensor, y_tensor)

# The two split lengths must sum exactly to len(dataset); truncating both
# with int() can fall one element short and make random_split raise, so
# give the remainder to the validation split.
n_train = int(len(dataset) * 0.8)
train_dataset, val_dataset = random_split(dataset, [n_train, len(dataset) - n_train])

torch.save(train_dataset, 'train_dataset.pt')
torch.save(val_dataset, 'val_dataset.pt')