"""Clean the LEGO sets dataset and split it into train/validation/test CSVs.

Reads ``lego_sets.csv`` from the current working directory, drops rows with
missing values, normalizes a few column types, prints a preview plus summary
statistics, and writes four files next to the input:

* ``lego_sets_clean.csv``        - the full cleaned dataset
* ``lego_sets_clean_train.csv``  - 80% of the cleaned rows
* ``lego_sets_clean_valid.csv``  - half of the remaining rows
* ``lego_sets_clean_test.csv``   - the other half of the remaining rows
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw data, dropping every row that has at least one empty field.
lego = pd.read_csv('lego_sets.csv').dropna()

# list_price needs at most two decimal places.
lego['list_price'] = lego['list_price'].round(2)

# num_reviews, piece_count and prod_id can be stored as integers.
# astype converts the whole column at once instead of element by element.
for column in ('num_reviews', 'piece_count', 'prod_id'):
    lego[column] = lego[column].astype(np.int64)

# Preview of the data and summary statistics for every column.
print(lego)
# print() returns None, so nothing may be chained onto the call; the previous
# trailing ``.encode('utf-8')`` raised AttributeError and aborted the script
# before any split or output file was produced.
print(lego.describe(include='all'))

# First split: carve out the training set (80% of the cleaned rows).
lego_train, lego_rem = train_test_split(lego, train_size=0.8, random_state=1)

# Second split: divide the remainder evenly into validation and test sets.
lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)

# Save the results; index=False keeps the DataFrame index out of the files.
lego.to_csv('lego_sets_clean.csv', index=False, header=True)
lego_train.to_csv('lego_sets_clean_train.csv', index=False, header=True)
lego_valid.to_csv('lego_sets_clean_valid.csv', index=False, header=True)
lego_test.to_csv('lego_sets_clean_test.csv', index=False, header=True)