59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
|
import zipfile
|
||
|
import torch
|
||
|
import pandas as pd
|
||
|
import datetime
|
||
|
import numpy as np
|
||
|
|
||
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
||
|
import torch.nn as nn
|
||
|
from torch.utils.data.dataset import random_split
|
||
|
from torch.utils.data import Dataset, TensorDataset
|
||
|
from sklearn import preprocessing
|
||
|
|
||
|
|
||
|
# Fetch the Global Superstore CSV from Kaggle into the working directory.
api = KaggleApi()
api.authenticate()
api.dataset_download_file(
    'apoorvaappz/global-super-store-dataset',
    file_name='Global_Superstore2.csv',
    path='./',
)

# The script expects the download to be stored as a zip archive; unpack it
# in place so the CSV can be read directly afterwards.
with zipfile.ZipFile('Global_Superstore2.csv.zip', mode='r') as archive:
    archive.extractall(path='.')
|
||
|
|
||
|
# Load the extracted CSV; the first row holds the column names.
data = pd.read_csv("Global_Superstore2.csv", header=0, sep=',')

# Parse the order dates day-first. An earlier revision of this script parsed
# the raw strings with "%d-%m-%Y", so the column is taken to be
# day-month-year (NOTE(review): confirm against the CSV). Without
# dayfirst=True, ambiguous values such as "03-04-2012" are read month-first,
# which silently moves orders to the wrong date.
data["Order Date"] = pd.to_datetime(data["Order Date"], dayfirst=True)

# Put the rows in chronological order.
data = data.sort_values(by="Order Date")
|
||
|
|
||
|
#print(data)
|
||
|
|
||
|
# Running total of 'Sales' per order date, keyed by a "dd-mm-YYYY" string.
# Despite the variable name, the key includes the day, so the buckets are
# daily rather than monthly.
byMonthsYears = {}
for order_date, sale in zip(data['Order Date'], data['Sales']):
    day_key = order_date.strftime("%d-%m-%Y")
    byMonthsYears[day_key] = byMonthsYears.get(day_key, 0) + sale
|
||
|
# Collapse the orders to one row per order date:
#   'Sales count' - number of non-null 'Customer Name' entries on that date
#   'Sales sum'   - total of 'Sales' on that date
per_date = data.groupby('Order Date').agg(
    {'Customer Name': 'count', 'Sales': 'sum'}
)
df = per_date.reset_index().rename(
    columns={'Sales': 'Sales sum', 'Customer Name': 'Sales count'}
)
|
||
|
|
||
|
# Data normalization: rescale both aggregate columns with scikit-learn's
# MaxAbsScaler, which divides each column by its largest absolute value.
feature_frame = df[['Sales count', 'Sales sum']]
flcols = feature_frame.columns
x = feature_frame.values

max_abs_scaler = preprocessing.MaxAbsScaler()
x_scaled = max_abs_scaler.fit_transform(x)

# Wrap the scaled array in a frame so the values can be written back into
# df by column name.
normcols = pd.DataFrame(x_scaled, columns=flcols)
for col in flcols:
    df[col] = normcols[col]
|
||
|
#df.to_csv('mms_norm.csv')
|
||
|
|
||
|
# Turn the two aggregate columns into float32 tensors: x_tensor holds
# 'Sales sum', y_tensor holds 'Sales count'.
x_tensor = torch.tensor(df['Sales sum'].values).float()
y_tensor = torch.tensor(df['Sales count'].values).float()

# Pair the tensors element-wise so each sample is (sales sum, sales count).
dataset = TensorDataset(x_tensor, y_tensor)

# 80/20 train/validation split. The validation size is computed as the
# remainder so the two lengths always add up to len(dataset). Truncating
# both 0.8*n and 0.2*n independently can leave a sample unassigned, and
# random_split raises ValueError when the lengths do not sum to the dataset
# size.
total_len = len(dataset)
train_len = int(total_len * 0.8)
lengths = [train_len, total_len - train_len]
train_dataset, val_dataset = random_split(dataset, lengths)

# Persist both subsets for the training script.
torch.save(train_dataset, 'train_dataset.pt')
torch.save(val_dataset, 'val_dataset.pt')
|