From f6eaba6040b0f9dd8e9650aa2d529c817c60bc4e Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Tue, 26 Dec 2023 17:28:58 +0100 Subject: [PATCH] more data cleaning --- notebooks/dataCleaning.R | 56 +++++++--------------------------------- 1 file changed, 10 insertions(+), 46 deletions(-) diff --git a/notebooks/dataCleaning.R b/notebooks/dataCleaning.R index da49ac2..f6531d1 100644 --- a/notebooks/dataCleaning.R +++ b/notebooks/dataCleaning.R @@ -1,5 +1,3 @@ -##################### The first dataset ############################## - ## Loading R packages and source the "getshots" customized own function library(jsonlite) library(tidyverse) @@ -9,6 +7,8 @@ library(REdaS) library(yd2m) library(purrr) +##################### The first dataset ############################## + # code and data from https://github.com/Dato-Futbol/xg-model get_shots <- function(file_path, name_detail, save_files = F){ @@ -102,10 +102,9 @@ get_data <- function(event_path, info_path) { # data2 <- get_data(event_path = "data/events.csv", info_path = "data/ginf.csv") # write.csv(data2, file = "data/data2.csv") - ##################### The third dataset ############################## -# make angle from the x, y coordinares for the 3rd dataset +# make angle from the x, y coordinates for the 3rd dataset loc2angle <- function(x, y) { rads <- atan(7.32 * x / (x^2 + (y - 34)^2 - (7.32/2)^2)) rads <- ifelse(rads<0, base::pi + rads, rads) @@ -113,21 +112,16 @@ loc2angle <- function(x, y) { deg } +# distance to goal loc2distance <- function(x, y) { sqrt(x^2 + y^2) } +# distance between two points on the pitch loc2locdistance <- function(x1, y1, x2, y2) { sqrt( (x1 - x2)^2 + (y1 - y2)^2 ) } -# TODO -# osobno bramkarz, wartość opponent_Goalkeeper -# kolejność w row_number() wg odlegości do piłki z podziałem na teammate/opponent -# przykład, najbliżej w teammate -> teammate_1, -# przykład najbliżej w opponent -> opponent_1 -# w przypadku NA, wypełnić wartościami teammete_(nt+1):11, opponent_(no+1):11 - get_shots2 <- function(json_file) { data <- fromJSON(json_file) %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot)) @@ -161,13 +155,6 @@ get_shots2 <- function(json_file) { } else if ( !("teammate" %in% groups_count$teammate) ) { groups_count <- groups_count %>% add_row(teammate = "teammate", n = 0) } - # df$distance <- loc2locdistance(x1 = df$x, y1 = df$y, x2 = data$x1, y1 = data$y1) - # df <- df %>% mutate(distance = loc2locdistance(x1 = x, y1 = y, x2 = data$x1, y1 = data$y1)) %>% arrange(distance) - # df <- df %>% arrange(distance) - # df <- df %>% unnest(c(x, y, teammate)) - # df$x <- as.numeric(df$x) - # df$y <- as.numeric(df$y) - # df$teammate <- as.logical(df$teammate) %>% as.numeric() na_df <- as.data.frame(matrix("na", nrow = 21 - nrow(df), ncol = ncol(df))) colnames(na_df) <- colnames(df) @@ -179,28 +166,23 @@ get_shots2 <- function(json_file) { select(-c(teammate, position_name, rown, distance, x1, y1)) %>% mutate(x = ifelse(x == "na", NA, x), y = ifelse(x == "na", NA, y)) - - # dff <- dff %>% mutate(row = row_number()) %>% - # mutate(position_teammate = paste(ifelse(position_name == "Goalkeeper", position_name, row), teammate, sep = "_")) %>% - # select(-c(teammate, position_name, row)) } else { dff <- as.data.frame(matrix("na", nrow = 21, ncol = 3)) colnames(dff) <- c("x", "y", "teammate") dff$teammate <- rep(c("opponent", "teammate"), c(11, 10)) dff <- dff %>% group_by(teammate) %>% mutate(rown = row_number()) %>% ungroup() %>% - mutate(position_teammate = paste(teammate, rown)) %>% - select(c(-teammate, rown)) %>% + mutate(position_teammate = paste(teammate, rown, sep = "_")) %>% + select(-c(teammate, rown)) %>% mutate(x = ifelse(x == "na", NA, x), y = ifelse(x == "na", NA, y)) } - # print(dff) # print(wider_df) # stop("123") # %>% + # stop("123") wider_df <- dff %>% pivot_wider(names_from = position_teammate, values_from = c(x, y), names_sep = "_player_") # wider_df <- apply(wider_df, MARGIN = 2, unlist) - wider_df }, data$shot$freeze_frame) }, error = function(e) { @@ -208,22 +190,7 @@ get_shots2 <- function(json_file) { print(json_file) print(paste("An error occurred:", e$message)) }) - print(df_players_location) df_players_location <- df_players_location %>% t() - print(df_players_location) - - # df <- do.call(rbind, df_test$shot$freeze_frame[[1]]$location) %>% as.data.frame() - # dff <- cbind(df, df_test$shot$freeze_frame[[1]]$teammate) - # colnames(dff) <- c("x", "y", "teammate") - # - # - # na_df <- as.data.frame(matrix(NA, nrow = 21 - nrow(df), ncol = ncol(dff))) - # colnames(na_df) <- colnames(dff) - # dff <- rbind(dff, na_df) - # - # wider_df <- dff %>% - # mutate(row = row_number()) %>% - # pivot_wider(names_from = row, values_from = c(x, y, teammate), names_sep = "_player") tryCatch({ # TODO reduce error cases data$number_of_players_opponents <- mapply(function(sublist, x1_threshold) { @@ -362,18 +329,15 @@ file_names <- list.files(path = "data/la_liga_events/", pattern = "*.json") data_list <- lapply(paste("data/la_liga_events/", file_names, sep = ""), get_shots2) combined_data <- bind_rows(data_list) -# ss <- lapply(data_list, nrow) -# sss <- unlist(ss) -# # # sample data -data <- fromJSON("data/la_liga_events/ (1000).json") %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot)) +# data <- fromJSON("data/la_liga_events/ (1000).json") %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot)) data3_final <- combined_data %>% select(-c(shot_outcome_name, shot_saved_off_target, shot_saved_to_post, kick_off)) %>% mutate(shot_kick_off = ifelse(is.na(shot_kick_off), FALSE, shot_kick_off)) -pattern <- "^(x_player|y_player|teammate_player)[0-9]+$" +pattern <- "^(x_player_|y_player_)([0-9]|Goalkeeper)+$" cols <- names(data3_final)[grepl(pattern, names(data3_final))] data3_final <- data3_final %>% unnest(all_of(cols)) write_csv(data3_final, file = "data/final_data.csv")