diff --git a/data_filters.py b/data_filters.py index 76871e4..5e809a1 100644 --- a/data_filters.py +++ b/data_filters.py @@ -2,12 +2,12 @@ import pandas as pd from fuzzy import * -def zapisz_do_csv(nazwa_pliku, dataframe): - dataframe.to_csv(nazwa_pliku, mode='a', index=False, header=not pd.DataFrame().append(dataframe).empty) +def save_to_csv(filename, dataframe): + dataframe.to_csv(filename, mode='a', index=False, header=not pd.DataFrame().append(dataframe).empty) -def podziel_na_partie(dataframe, rozmiar_partii): - for i in range(0, len(dataframe), rozmiar_partii): - yield dataframe.iloc[i:i + rozmiar_partii] +def split_to_parts(dataframe, part_size): + for i in range(0, len(dataframe), part_size): + yield dataframe.iloc[i:i + part_size] def przetwarzaj_co_50_rekordow(plik_wejsciowy, plik_wyjsciowy): dataframe_wejsciowe = pd.read_csv(plik_wejsciowy) @@ -40,16 +40,16 @@ def generateFuzzyLogicData(dataframe): def last5Matches(season, teamA, data, df): - # Wybierz rekordy dla danej pary drużyn i sezonu + subset = df[((df['season'] == season) & ((df['home_team'] == teamA) | (df['away_team'] == teamA)))] - # Filtruj dane, aby zawierały te przed daną datą + before_given_date = subset[pd.to_datetime(subset['date']) < pd.to_datetime(data)] - # Posortuj wg daty w odwrotnej kolejności + before_given_date = before_given_date.sort_values(by='date', ascending=False) - # Wybierz 5 ostatnich przed daną datą + last_before_date = before_given_date.head(5) return last_before_date, "_5m" diff --git a/main.py b/main.py index b002c79..8853a85 100644 --- a/main.py +++ b/main.py @@ -8,8 +8,7 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.preprocessing import LabelEncoder from sklearn.metrics import classification_report - - +from sklearn.ensemble import GradientBoostingClassifier # Ostatnie 5 spotkań @@ -24,13 +23,12 @@ from sklearn.metrics import classification_report if __name__ == "__main__": - df = pd.read_csv('df_full_premierleague.csv') + df = pd.read_csv('df_parts.csv') - + ''' - ''' df = pd.read_csv('df_full_premierleague.csv') result = last5Matches('10/11', 'Stoke City', '2010-10-02', df)[0] #print(result.to_markdown()) @@ -42,7 +40,6 @@ if __name__ == "__main__": print(calculatePoints(result,'Blackburn Rovers')) print(calculateGoalDifference(result, 'Blackburn Rovers')) - ''' # df = generateTrainingData(df) # df = add_column(df, categorize_passes, "c_away_passes", "away_passes") @@ -158,8 +155,8 @@ if __name__ == "__main__": df.to_csv('df.csv', index=False) #TU sie zapisuje zbior - rozmiar_partii = 50 - for part in podziel_na_partie(df, rozmiar_partii): + part_size = 50 + for part in split_to_parts(df, part_size): part = add_column(part, @@ -206,8 +203,8 @@ if __name__ == "__main__": "c_away_passing_5btw") - zapisz_do_csv("df_parts", part) - + save_to_csv("df_parts", part) +''' df = generateFuzzyLogicData(df) label_encoder = LabelEncoder() @@ -219,9 +216,12 @@ if __name__ == "__main__": df[['home_team', 'away_team']] = temp.unstack() X = df.drop(['result_full', 'date', 'c_home_result', 'c_away_result'], axis=1) y = df['c_home_result'] + #y = label_encoder.fit_transform(df['c_home_result']) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - model = RandomForestClassifier(n_estimators=100, random_state=42) + model = RandomForestClassifier(n_estimators=500, random_state=42) + #model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state = 42) + model.fit(X_train, y_train) y_pred = model.predict(X_test) @@ -232,6 +232,8 @@ if __name__ == "__main__": accuracy = accuracy_score(y_test, y_pred) print(f'Dokładność modelu: {accuracy}') print(classification_report(y_test, y_pred)) + + #print(model.feature_importances_) #print(categorize_fuzzy_passes(450,50))