2021-04-10 15:08:18 +02:00
|
|
|
import pandas as pd
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
from sklearn import preprocessing
|
|
|
|
import kaggle
|
|
|
|
|
|
|
|
# Authenticate the Kaggle API client.
# NOTE(review): credentials are presumably read from the standard Kaggle
# configuration (kaggle.json / environment variables) — confirm.
# NOTE(review): the only Kaggle call in this script is currently commented
# out, so this step may be unnecessary — confirm before removing.
kaggle.api.authenticate()
|
|
|
|
|
2021-05-15 12:04:19 +02:00
|
|
|
# Dataset download is disabled; the script reads a local 'results.csv' instead.
# kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)
|
2021-04-10 15:08:18 +02:00
|
|
|
|
|
|
|
# Load the raw match results from the local CSV file.
results = pd.read_csv('results.csv')

# Remove rows that contain NaN values. DataFrame.dropna() returns a new
# frame instead of modifying the original, so the result must be assigned
# back — otherwise the call has no effect.
results = results.dropna()
|
|
|
|
|
|
|
|
# Normalization: lower-case the text columns.
for column in ['home_team', 'away_team', 'tournament', 'city', 'country']:
    results[column] = results[column].str.lower()
|
2021-04-24 12:21:33 +02:00
|
|
|
|
2021-04-10 15:08:18 +02:00
|
|
|
# Split the dataset 6:2:2 (train : validation : test).
# First 40% of the rows are held out, then that holdout is cut in half
# to form the validation and test sets.
# NOTE(review): no random_state is passed, so the split differs between
# runs unless NumPy's global RNG is seeded elsewhere.
train, holdout = train_test_split(results, test_size=1 - 0.6)
valid, test = train_test_split(holdout, test_size=0.5)
|
|
|
|
|
2021-04-11 09:54:05 +02:00
|
|
|
# Persist each split to its own CSV file, without the DataFrame index column.
for csv_path, subset in (
    ("train.csv", train),
    ("valid.csv", valid),
    ("test.csv", test),
):
    subset.to_csv(csv_path, index=False)
|