"""Download the international football results dataset and split it.

The script authenticates against the Kaggle API, downloads and unzips the
dataset into the current directory, loads ``results.csv``, drops rows with
missing values, lower-cases the textual columns and splits the data into
train / validation / test subsets in a 60/20/20 proportion.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Fraction of rows that goes to the training subset; the remainder is
# divided equally between the validation and test subsets.
TRAIN_FRACTION = 0.6

# Textual columns that are normalised to lower case.
TEXT_COLUMNS = ['home_team', 'away_team', 'tournament', 'city', 'country']

kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'martj42/international-football-results-from-1872-to-2017',
    path='.',
    unzip=True,
)

results = pd.read_csv('results.csv')

# Remove rows containing NaN. ``dropna`` returns a new frame, so the result
# has to be assigned back; calling it bare leaves ``results`` untouched.
results = results.dropna()

# Normalisation: lower-case every textual column.
for column in TEXT_COLUMNS:
    results[column] = results[column].str.lower()

# Split 6:2:2 -- 60% train, then the held-out 40% is halved into
# validation and test (20% each).
train, test = train_test_split(results, test_size=1 - TRAIN_FRACTION)
valid, test = train_test_split(test, test_size=0.5)

# NOTE: ``DataFrame.size`` is the number of cells (rows * columns), not the
# number of rows.
print("All data: ", results.size)
print("Train size: ", train.size)
print("Test size: ", test.size)
print("Validate size: ", valid.size)

print(results.describe(include='all'))

# Sanity check: the subsets together should be as large as the whole dataset.
print(train.size + test.size + valid.size)