From 2fff850afc0c0fff8539a344df265dae52550871 Mon Sep 17 00:00:00 2001 From: s452662 Date: Sat, 6 Jan 2024 20:36:34 +0100 Subject: [PATCH] added prototype prediction model --- data_filters.py | 81 +++++++++++++++++++++++++++++ main.py | 134 ++++++++++++++++++++---------------------------- rules.py | 110 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 244 insertions(+), 81 deletions(-) create mode 100644 data_filters.py diff --git a/data_filters.py b/data_filters.py new file mode 100644 index 0000000..a8c2687 --- /dev/null +++ b/data_filters.py @@ -0,0 +1,81 @@ +import pandas as pd +from simpful import * + +def generateTrainingData(dataframe): + columns = ['season','date','home_team','away_team','result_full','home_passes','away_passes', + 'home_possession','away_possession','home_shots','away_shots'] + return dataframe[columns] + + +def generateFuzzyLogicData(dataframe): + columns = ['season','date','home_team','away_team','result_full','c_home_passes','c_away_passes', + 'c_home_possession','c_away_possession','c_home_shots','c_away_shots','c_home_form','c_away_form', + 'c_home_diff', 'c_away_diff'] + return dataframe[columns] + +def last5Matches(season, teamA, data, df): + # Wybierz rekordy dla danej pary drużyn i sezonu + subset = df[((df['season'] == season) & ((df['home_team'] == teamA) | (df['away_team'] == teamA)))] + + # Filtruj dane, aby zawierały te przed daną datą + before_given_date = subset[pd.to_datetime(subset['date']) < pd.to_datetime(data)] + + # Posortuj wg daty w odwrotnej kolejności + before_given_date = before_given_date.sort_values(by='date', ascending=False) + + # Wybierz 5 ostatnich przed daną datą + last_before_date = before_given_date.head(5) + + return last_before_date + + + + +def getResult(score,teamHome): + x,y = score.split('-') + x = int(x) + y = int(y) + + if (x > y and teamHome == True) or (x < y and teamHome == False): + return "win" + elif x == y: + return "draw" + else: + return "loss" + + +def calculatePoints(matches, team): + points = 0 + for index, row in matches.iterrows(): + if team == row['home_team']: + teamHome = True + else: + teamHome = False + x = getResult(row['result_full'], teamHome) + #print(x) + if x == "win": + points = points + 3 + elif x == "draw": + points = points + 1 + if matches.shape[0] != 0: + points_avg = points / matches.shape[0] + else: + points_avg = 0 + return points_avg + + +def calculateGoalDifference(matches, team): + goal_diff = 0 + for index, row in matches.iterrows(): + if team == row['home_team']: + teamHome = True + else: + teamHome = False + x,y = row['result_full'].split('-') + x = int(x) + y = int(y) + if teamHome: + goal_diff = goal_diff + (x-y) + else: + goal_diff = goal_diff + (y-x) + return goal_diff diff --git a/main.py b/main.py index 0574630..d624b57 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,14 @@ import pandas as pd from simpful import * +from rules import * +from data_filters import * +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +from sklearn.preprocessing import LabelEncoder +from sklearn.metrics import classification_report + -df = pd.read_csv('df_full_premierleague.csv') # Ostatnie 5 spotkań @@ -12,95 +19,64 @@ df = pd.read_csv('df_full_premierleague.csv') #Podania ponizej 300-400 słabo powyżej 500 dużo -def generateTrainingData(dataframe): - columns = ['season','date','home_team','away_team','result_full','home_passes','away_passes', - 'home_possession','away_possession','home_shots','away_shots'] - return dataframe[columns] +if __name__ == "__main__": + df = pd.read_csv('df_full_premierleague.csv') -def last5Matches(sezon, druzynaA, data, df): - # Wybierz rekordy dla danej pary drużyn i sezonu - subset = df[((df['season'] == sezon) & ((df['home_team'] == druzynaA) | (df['away_team'] == druzynaA)))] + result = last5Matches('10/11', 'Stoke City', '2010-10-02', df) + #print(result.to_markdown()) + #print(result) + result = last5Matches('10/11', 'Blackburn Rovers', '2010-10-02', df) + #print(result.to_markdown()) + #print(result) - # Filtruj dane, aby zawierały te przed daną datą - przed_dana_data = subset[pd.to_datetime(subset['date']) < pd.to_datetime(data)] + print(calculatePoints(result,'Blackburn Rovers')) + print(calculateGoalDifference(result, 'Blackburn Rovers')) - # Posortuj wg daty w odwrotnej kolejności - przed_dana_data = przed_dana_data.sort_values(by='date', ascending=False) + df = generateTrainingData(df) + df = add_column(df, categorize_passes, "c_away_passes", "away_passes") + df = add_column(df, categorize_passes, "c_home_passes", "home_passes") - # Wybierz 5 ostatnich przed daną datą - ostatnie_przed_data = przed_dana_data.head(5) + df = add_column(df, categorize_possesion, "c_away_possession", "away_possession") + df = add_column(df, categorize_possesion, "c_home_possession", "home_possession") - return ostatnie_przed_data + df = add_column(df, categorize_shots, "c_away_shots", "away_shots") + df = add_column(df, categorize_shots, "c_home_shots", "home_shots") + print(df.columns) + df = add_column(df, get_points_home(df), "c_home_form") + df = add_column(df, get_points_away(df), "c_away_form") + df = add_column(df, get_diff_home(df), "c_home_diff") + df = add_column(df, get_diff_away(df), "c_away_diff") + df = generateFuzzyLogicData(df) -def getResult(score,teamHome): - x,y = score.split('-') - x = int(x) - y = int(y) + label_encoder = LabelEncoder() + df['season'] = label_encoder.fit_transform(df['season']) + df['c_home_result'] = get_result_list(df,True) + df['c_away_result'] = get_result_list(df,True) + temp = df[['home_team', 'away_team']].stack() + temp[:] = temp.factorize()[0] + df[['home_team', 'away_team']] = temp.unstack() + X = df.drop(['result_full', 'date', 'c_home_result', 'c_away_result'], axis=1) + y = df['c_home_result'] + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - if (x > y and teamHome == True) or (x < y and teamHome == False): - return "win" - elif x == y: - return "draw" - else: - return "loss" + model = RandomForestClassifier(n_estimators=100, random_state=42) + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + + accuracy = accuracy_score(y_test, y_pred) + print(f'Dokładność modelu: {accuracy}') + + accuracy = accuracy_score(y_test, y_pred) + print(f'Dokładność modelu: {accuracy}') + print(classification_report(y_test, y_pred)) - -def calculatePoints(matches, team): - points = 0 - for index, row in matches.iterrows(): - if team == row['home_team']: - teamHome = True - else: - teamHome = False - x = getResult(row['result_full'], teamHome) - print(x) - if x == "win": - points = points + 3 - elif x == "draw": - points = points + 1 - return points - - -def calculateGoalDifference(matches, team): - goal_diff = 0 - for index, row in matches.iterrows(): - if team == row['home_team']: - teamHome = True - else: - teamHome = False - x,y = row['result_full'].split('-') - x = int(x) - y = int(y) - if teamHome: - goal_diff = goal_diff + (x-y) - else: - goal_diff = goal_diff + (y-x) - return goal_diff - -def categorize_passes(pass_count): - if pass_count < 400: - return 0 #słabo - elif 400 <= pass_count <= 500: - return 1 #średnio - else: - return 2 #dużo - -wynik = last5Matches('10/11', 'Stoke City', '2010-10-02', df) -#print(wynik.to_markdown()) -print(wynik) -#wynik = last5Matches('10/11', 'Blackburn Rovers', '2010-10-02', df) -#print(wynik.to_markdown()) -#print(wynik) - -print(calculatePoints(wynik,'Stoke City')) -print(calculateGoalDifference(wynik, 'Stoke City')) - -df = generateTrainingData(df) -print(df) - + result = last5Matches('10/11', 'Manchester United', '2010-12-16', df) + print(calculatePoints(result,'Manchester United')) + print(calculateGoalDifference(result, 'Manchester United')) diff --git a/rules.py b/rules.py index c30ba40..6d05fe0 100644 --- a/rules.py +++ b/rules.py @@ -1,5 +1,7 @@ import simpful - +from data_filters import * +import pandas as pd +''' def kategoryzuj_strzaly(ilosc_strzalow): FS = FuzzySystem() TLV = AutoTriangle(3, terms=['mało', 'średnio', 'dużo'], universe_of_discourse=[0, 25]) @@ -38,4 +40,108 @@ def kategorie_strzalow(druzyna, sezon, data, df): shots.append(kategoria) ostatnie_spotkania['cat_shots'] = shots - return ostatnie_spotkania \ No newline at end of file + return ostatnie_spotkania + + ''' + +def categorize_shots(shots): + if shots >= 12: + return 2 + elif shots <= 6: + return 0 + else: + return 1 + +def categorize_passes(pass_count): + if pass_count < 400: + return 0 #słabo + elif 400 <= pass_count <= 500: + return 1 #średnio + else: + return 2 #dużo + +def categorize_possesion(shots): + if shots >= 56: + return 2 + elif shots <= 40: + return 0 + else: + return 1 + +def categorize_points(data, row, teamHome): + if teamHome: + data_5 = last5Matches(row['season'], row['home_team'], row['date'], data) + points = calculatePoints(data_5,row['home_team']) + else: + data_5 = last5Matches(row['season'], row['away_team'], row['date'], data) + points = calculatePoints(data_5,row['away_team']) + if points <=1: + return 0 + elif points >=2: + return 2 + else: + return 1 + + +def get_points_home(data): + points = [] + for index, row in data.iterrows(): + points.append(categorize_points(data, row, True)) + return points + + + + +def get_points_away(data): + points = [] + for index, row in data.iterrows(): + points.append(categorize_points(data, row, False)) + return points + + + +def categorize_diff(data, row, teamHome): + if teamHome: + data_5 = last5Matches(row['season'], row['home_team'], row['date'], data) + diff = calculateGoalDifference(data_5,row['home_team']) + else: + data_5 = last5Matches(row['season'], row['away_team'], row['date'], data) + diff = calculateGoalDifference(data_5,row['away_team']) + if diff <=0: + return 0 + else: + return 1 + + +def get_diff_home(data): + points = [] + for index, row in data.iterrows(): + points.append(categorize_diff(data, row, True)) + return points + + + + +def get_diff_away(data): + points = [] + for index, row in data.iterrows(): + points.append(categorize_diff(data, row, False)) + return points + + + + +def add_column(data_frame, transform_function, new_column, existing_column=None): + if existing_column != None: + new_column_values = data_frame[existing_column].apply(transform_function) + data_frame[new_column] = new_column_values + else: + new_column_values = transform_function + data_frame[new_column] = new_column_values + return data_frame + +def get_result_list(df, home_team): + results = [] + for score in df['result_full']: + results.append(getResult(score,home_team)) + return results