35 lines
1.1 KiB
Python
35 lines
1.1 KiB
Python
|
import zipfile
|
||
|
import os
|
||
|
import pandas as pd
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
|
||
|
# Fail fast when the Kaggle credentials are not present in the environment.
if os.getenv("KAGGLE_KEY") is None or os.getenv("KAGGLE_USERNAME") is None:
    print("Brak zmiennych środowiskowych KAGGLE_KEY lub KAGGLE_USERNAME")
    # Exit with a non-zero status so a calling shell or CI job sees the
    # failure; a bare exit() reports success and relies on the `site` module.
    raise SystemExit(1)
|
||
|
|
||
|
# Download the dataset archive with the Kaggle CLI unless it is already here.
if not os.path.isfile('fifa19.zip'):
    status = os.system('kaggle datasets download -d karangadiya/fifa19')
    # os.system only signals failure through its return value; stop here
    # rather than failing later with a misleading error when opening the zip.
    if status != 0 or not os.path.isfile('fifa19.zip'):
        raise SystemExit(
            "Downloading fifa19.zip with the kaggle CLI failed "
            "(status {})".format(status)
        )

# NOTE(review): the original indentation was lost; extraction is assumed to
# run on every invocation, not only after a fresh download — confirm.
with zipfile.ZipFile('fifa19.zip', 'r') as zip_ref:
    zip_ref.extractall('.')
|
||
|
|
||
|
# Load the dataset and keep only the rows that have a release clause.
df = pd.read_csv('data.csv')
# A single notna() filter is enough (notnull() is an alias). Copy the result
# so the column assignments below write to an independent frame.
df = df[df["Release Clause"].notna()].copy()

# Divide "Overall" by 100 unless it already looks scaled (mean <= 1).
if df["Overall"].mean() > 1:
    df["Overall"] = df["Overall"] / 100

# Turn strings such as "€1.5M" or "€500K" into plain numbers.
clause = df["Release Clause"].str.replace("€", "", regex=False)
amount = clause.str.replace(r'[KM]+$', '', regex=True).astype(float)
# Values without a K/M suffix get a multiplier of 1 instead of NaN, which
# would make the integer cast fail. Any other suffix still maps to NaN and
# is rejected by astype(int), as before.
multiplier = (
    clause.str.extract(r'[\d\.]+([KM]+)', expand=False)
    .fillna('')
    .map({'': 1, 'K': 1000, 'M': 1000000})
    .astype(int)
)
df["Release Clause"] = amount * multiplier
|
||
|
|
||
|
# Persist the cleaned frame, then split it 60/20/20 into train/dev/test.
# NOTE(review): this overwrites the data.csv that was read earlier, and every
# to_csv call here also writes the index as an extra unnamed column.
df.to_csv('data.csv')
# A fixed seed keeps the train/dev/test membership reproducible between runs.
train, dev = train_test_split(
    df, train_size=0.6, test_size=0.4, shuffle=True, random_state=42
)
# The held-out 40% was shuffled by the first split, so it is halved in order.
dev, test = train_test_split(dev, train_size=0.5, test_size=0.5, shuffle=False)

test.to_csv('test.csv')
dev.to_csv('dev.csv')
train.to_csv('train.csv')
|