2022-04-02 15:36:46 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
# Load the raw data set, discarding rows that contain missing values.
lego = pd.read_csv('lego_sets.csv').dropna()

# Prices are kept to at most two decimal places.
lego['list_price'] = lego['list_price'].round(2)

# num_reviews, piece_count and prod_id hold whole numbers. Cast all three in
# one vectorized astype() call instead of a per-element apply(np.int64);
# both truncate finite floats toward zero and yield int64 columns.
lego = lego.astype({
    'num_reviews': np.int64,
    'piece_count': np.int64,
    'prod_id': np.int64,
})
|
|
|
|
|
|
|
|
# Preview the cleaned frame, then show summary statistics for every column
# (include='all' covers non-numeric columns as well).
print(lego)
lego_stats = lego.describe(include='all')
print(lego_stats)
|
2022-04-02 15:36:46 +02:00
|
|
|
|
|
|
|
# First split: carve out the training set (80% of the rows); the rest is
# held back for the second split. A fixed random_state keeps it repeatable.
lego_train, lego_rem = train_test_split(
    lego,
    train_size=0.8,
    random_state=1,
)

# Second split: divide the held-back rows evenly into validation and test.
lego_valid, lego_test = train_test_split(
    lego_rem,
    test_size=0.5,
    random_state=1,
)
|
|
|
|
|
|
|
|
# Save the full cleaned set and each split. index=False (the documented
# boolean, rather than relying on None being falsy) omits the row index;
# the header row is written.
lego.to_csv('lego_sets_clean.csv', index=False, header=True)
lego_train.to_csv('lego_sets_clean_train.csv', index=False, header=True)
lego_valid.to_csv('lego_sets_clean_valid.csv', index=False, header=True)
lego_test.to_csv('lego_sets_clean_test.csv', index=False, header=True)
|