ium_434732/skrypt.ipynb
2021-03-21 16:18:50 +01:00

11 KiB

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Authenticate against the Kaggle API before requesting any dataset.
kaggle.api.authenticate()

# Download the international football results dataset into the current
# directory and unzip it there.
kaggle.api.dataset_download_files(
    'martj42/international-football-results-from-1872-to-2017',
    path='.',
    unzip=True,
)

# Load results.csv from the working directory
# (presumably extracted by the download above — verify the archive contents).
results = pd.read_csv('results.csv')

# Remove rows that contain NaN values. DataFrame.dropna() returns a new
# frame instead of modifying the original, so the result has to be assigned
# back; calling it as a bare statement leaves `results` unchanged.
results = results.dropna()

# Normalization: lower-case the text columns so that values differing only
# in letter case are treated as the same category.
for column_name in ['home_team', 'away_team', 'tournament', 'city', 'country']:
    results[column_name] = results[column_name].str.lower()
    
# Split the dataset into train / validation / test subsets.
# 60% goes to train and the remaining 40% is held out, which is then halved,
# so the effective ratio is 6:2:2.
# No random_state is passed to train_test_split, so the split is not seeded
# by this code.
train, holdout = train_test_split(results, test_size=1 - 0.6)

# Divide the hold-out portion equally between validation and test.
valid, test = train_test_split(holdout, test_size=0.5)

# Report the size of the full dataset and of every subset.
# Note: DataFrame.size is the number of cells (rows x columns), not the
# number of rows.
for label, frame in (
    ("All data: ", results),
    ("Train size: ", train),
    ("Test size: ", test),
    ("Validate size: ", valid),
):
    print(label, frame.size)

# Summary statistics for every column, numeric and non-numeric alike.
print(results.describe(include='all'))

# Sanity check: the subset sizes added together should equal the size of
# the whole dataset printed above.
print(sum(subset.size for subset in (train, test, valid)))

# For every column, draw a bar chart of how often each distinct value occurs,
# then print the column name followed by the object returned by .plot().
# NOTE(review): no `ax` argument is passed, so pandas may draw each Series on
# the current matplotlib axes — confirm that every column really ends up on
# its own chart rather than all being overlaid.
for name, series in results.items():
    frequencies = series.value_counts()
    axes = frequencies.plot(kind="bar")
    print("\n", name)
    print(axes)
All data:  376884
Train size:  226125
Test size:  75384
Validate size:  75375
              date home_team away_team    home_score    away_score tournament  \
count        41876     41876     41876  41876.000000  41876.000000      41876   
unique       15232       308       306           NaN           NaN        112   
top     2012-02-29    brazil   uruguay           NaN           NaN   friendly   
freq            66       570       543           NaN           NaN      17136   
mean           NaN       NaN       NaN      1.744293      1.186503        NaN   
std            NaN       NaN       NaN      1.752248      1.403053        NaN   
min            NaN       NaN       NaN      0.000000      0.000000        NaN   
25%            NaN       NaN       NaN      1.000000      0.000000        NaN   
50%            NaN       NaN       NaN      1.000000      1.000000        NaN   
75%            NaN       NaN       NaN      2.000000      2.000000        NaN   
max            NaN       NaN       NaN     31.000000     21.000000        NaN   

                city        country neutral  
count          41876          41876   41876  
unique          2026            266       2  
top     kuala lumpur  united states   False  
freq             589           1160   31557  
mean             NaN            NaN     NaN  
std              NaN            NaN     NaN  
min              NaN            NaN     NaN  
25%              NaN            NaN     NaN  
50%              NaN            NaN     NaN  
75%              NaN            NaN     NaN  
max              NaN            NaN     NaN  
376884

 date
AxesSubplot(0.125,0.125;0.775x0.755)

 home_team
AxesSubplot(0.125,0.125;0.775x0.755)

 away_team
AxesSubplot(0.125,0.125;0.775x0.755)

 home_score
AxesSubplot(0.125,0.125;0.775x0.755)

 away_score
AxesSubplot(0.125,0.125;0.775x0.755)

 tournament
AxesSubplot(0.125,0.125;0.775x0.755)

 city
AxesSubplot(0.125,0.125;0.775x0.755)

 country
AxesSubplot(0.125,0.125;0.775x0.755)

 neutral
AxesSubplot(0.125,0.125;0.775x0.755)