ium_434760/Zadanie 1.py

35 lines
1.1 KiB
Python
Raw Normal View History

2021-04-11 17:20:04 +02:00
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
def release_clause_to_number(values):
    """Convert release-clause strings such as '€226.5M' or '€565K' to floats.

    The numeric part and the optional trailing ``K``/``M`` suffix are pulled
    out with one regular expression, so any leading currency symbol is
    ignored instead of having to be stripped explicitly.

    Args:
        values: pandas Series of release-clause strings. A Series that is
            already numeric is returned as float unchanged, which keeps the
            conversion safe to apply twice.

    Returns:
        pandas Series of floats aligned with ``values``; entries that do not
        contain a number become NaN.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values.astype(float)

    parts = values.astype(str).str.extract(r'([\d.]+)\s*([KM]?)\s*$')
    amount = parts[0].astype(float)
    # A missing suffix maps to NaN and therefore falls back to a factor of 1.
    factor = parts[1].map({'K': 1000, 'M': 1000000}).fillna(1).astype(float)
    return amount * factor


def preprocess(df):
    """Return a cleaned copy of the player table.

    Rows without a "Release Clause" are dropped, "Overall" is divided by 100
    when its mean is above 1, and "Release Clause" is converted to a number.
    The input frame is not modified.
    """
    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame.
    cleaned = df[df["Release Clause"].notna()].copy()

    # The mean check skips the division when the column was already scaled.
    if cleaned["Overall"].mean() > 1:
        cleaned["Overall"] = cleaned["Overall"] / 100

    cleaned["Release Clause"] = release_clause_to_number(cleaned["Release Clause"])
    return cleaned


def fetch_dataset():
    """Download fifa19.zip with the kaggle CLI if absent, then unpack it here."""
    if not os.path.isfile('fifa19.zip'):
        status = os.system('kaggle datasets download -d karangadiya/fifa19')
        # Stop here rather than failing later with an unrelated ZipFile error.
        if status != 0 or not os.path.isfile('fifa19.zip'):
            print("kaggle datasets download failed")
            raise SystemExit(1)

    # Always re-extract: data.csv is overwritten with processed data below,
    # so unpacking restores the raw input on every run.
    with zipfile.ZipFile('fifa19.zip', 'r') as zip_ref:
        zip_ref.extractall('.')


def main():
    """Fetch the FIFA 19 data, clean it and write train/dev/test CSV files."""
    if os.getenv("KAGGLE_KEY") is None or os.getenv("KAGGLE_USERNAME") is None:
        print("Brak zmiennych środowiskowych KAGGLE_KEY lub KAGGLE_USERNAME")
        # Non-zero status so calling tools can detect the failure.
        raise SystemExit(1)

    fetch_dataset()

    df = preprocess(pd.read_csv('data.csv'))
    df.to_csv('data.csv')

    # 60% train; the remaining 40% is halved into dev and test. The second
    # split does not shuffle because the first one already did.
    train, dev = train_test_split(df, train_size=0.6, test_size=0.4, shuffle=True)
    dev, test = train_test_split(dev, train_size=0.5, test_size=0.5, shuffle=False)

    test.to_csv('test.csv')
    dev.to_csv('dev.csv')
    train.to_csv('train.csv')


if __name__ == "__main__":
    main()