more data cleaning

This commit is contained in:
LukaszChrostowski 2023-12-26 17:28:58 +01:00
parent 2ca84b303c
commit f6eaba6040

View File

@ -1,5 +1,3 @@
##################### The first dataset ##############################
## Loading R packages and source the "getshots" customized own function ## Loading R packages and source the "getshots" customized own function
library(jsonlite) library(jsonlite)
library(tidyverse) library(tidyverse)
@ -9,6 +7,8 @@ library(REdaS)
library(yd2m) library(yd2m)
library(purrr) library(purrr)
##################### The first dataset ##############################
# code and data from https://github.com/Dato-Futbol/xg-model # code and data from https://github.com/Dato-Futbol/xg-model
get_shots <- function(file_path, name_detail, save_files = F){ get_shots <- function(file_path, name_detail, save_files = F){
@ -102,10 +102,9 @@ get_data <- function(event_path, info_path) {
# data2 <- get_data(event_path = "data/events.csv", info_path = "data/ginf.csv") # data2 <- get_data(event_path = "data/events.csv", info_path = "data/ginf.csv")
# write.csv(data2, file = "data/data2.csv") # write.csv(data2, file = "data/data2.csv")
##################### The third dataset ############################## ##################### The third dataset ##############################
# make angle from the x, y coordinares for the 3rd dataset # make angle from the x, y coordinates for the 3rd dataset
loc2angle <- function(x, y) { loc2angle <- function(x, y) {
rads <- atan(7.32 * x / (x^2 + (y - 34)^2 - (7.32/2)^2)) rads <- atan(7.32 * x / (x^2 + (y - 34)^2 - (7.32/2)^2))
rads <- ifelse(rads<0, base::pi + rads, rads) rads <- ifelse(rads<0, base::pi + rads, rads)
@ -113,21 +112,16 @@ loc2angle <- function(x, y) {
deg deg
} }
# distance to goal
loc2distance <- function(x, y) { loc2distance <- function(x, y) {
sqrt(x^2 + y^2) sqrt(x^2 + y^2)
} }
# distance between two points on the pitch
loc2locdistance <- function(x1, y1, x2, y2) { loc2locdistance <- function(x1, y1, x2, y2) {
sqrt( (x1 - x2)^2 + (y1 - y2)^2 ) sqrt( (x1 - x2)^2 + (y1 - y2)^2 )
} }
# TODO
# handle the goalkeeper separately, using the value opponent_Goalkeeper
# order in row_number() by distance to the ball, split by teammate/opponent
# example: the closest teammate -> teammate_1,
# example: the closest opponent -> opponent_1
# in case of NA, fill with the values teammate_(nt+1):11, opponent_(no+1):11
get_shots2 <- function(json_file) { get_shots2 <- function(json_file) {
data <- fromJSON(json_file) %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot)) data <- fromJSON(json_file) %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot))
@ -161,13 +155,6 @@ get_shots2 <- function(json_file) {
} else if ( !("teammate" %in% groups_count$teammate) ) { } else if ( !("teammate" %in% groups_count$teammate) ) {
groups_count <- groups_count %>% add_row(teammate = "teammate", n = 0) groups_count <- groups_count %>% add_row(teammate = "teammate", n = 0)
} }
# df$distance <- loc2locdistance(x1 = df$x, y1 = df$y, x2 = data$x1, y1 = data$y1)
# df <- df %>% mutate(distance = loc2locdistance(x1 = x, y1 = y, x2 = data$x1, y1 = data$y1)) %>% arrange(distance)
# df <- df %>% arrange(distance)
# df <- df %>% unnest(c(x, y, teammate))
# df$x <- as.numeric(df$x)
# df$y <- as.numeric(df$y)
# df$teammate <- as.logical(df$teammate) %>% as.numeric()
na_df <- as.data.frame(matrix("na", nrow = 21 - nrow(df), ncol = ncol(df))) na_df <- as.data.frame(matrix("na", nrow = 21 - nrow(df), ncol = ncol(df)))
colnames(na_df) <- colnames(df) colnames(na_df) <- colnames(df)
@ -179,28 +166,23 @@ get_shots2 <- function(json_file) {
select(-c(teammate, position_name, rown, distance, x1, y1)) %>% select(-c(teammate, position_name, rown, distance, x1, y1)) %>%
mutate(x = ifelse(x == "na", NA, x), mutate(x = ifelse(x == "na", NA, x),
y = ifelse(x == "na", NA, y)) y = ifelse(x == "na", NA, y))
# dff <- dff %>% mutate(row = row_number()) %>%
# mutate(position_teammate = paste(ifelse(position_name == "Goalkeeper", position_name, row), teammate, sep = "_")) %>%
# select(-c(teammate, position_name, row))
} else { } else {
dff <- as.data.frame(matrix("na", nrow = 21, ncol = 3)) dff <- as.data.frame(matrix("na", nrow = 21, ncol = 3))
colnames(dff) <- c("x", "y", "teammate") colnames(dff) <- c("x", "y", "teammate")
dff$teammate <- rep(c("opponent", "teammate"), c(11, 10)) dff$teammate <- rep(c("opponent", "teammate"), c(11, 10))
dff <- dff %>% group_by(teammate) %>% mutate(rown = row_number()) %>% ungroup() %>% dff <- dff %>% group_by(teammate) %>% mutate(rown = row_number()) %>% ungroup() %>%
mutate(position_teammate = paste(teammate, rown)) %>% mutate(position_teammate = paste(teammate, rown, sep = "_")) %>%
select(c(-teammate, rown)) %>% select(-c(teammate, rown)) %>%
mutate(x = ifelse(x == "na", NA, x), mutate(x = ifelse(x == "na", NA, x),
y = ifelse(x == "na", NA, y)) y = ifelse(x == "na", NA, y))
} }
# print(dff)
# print(wider_df) # print(wider_df)
# stop("123") # stop("123")
# %>% # %>%
# stop("123")
wider_df <- dff %>% wider_df <- dff %>%
pivot_wider(names_from = position_teammate, values_from = c(x, y), names_sep = "_player_") pivot_wider(names_from = position_teammate, values_from = c(x, y), names_sep = "_player_")
# wider_df <- apply(wider_df, MARGIN = 2, unlist) # wider_df <- apply(wider_df, MARGIN = 2, unlist)
wider_df
}, data$shot$freeze_frame) }, data$shot$freeze_frame)
}, },
error = function(e) { error = function(e) {
@ -208,22 +190,7 @@ get_shots2 <- function(json_file) {
print(json_file) print(json_file)
print(paste("An error occurred:", e$message)) print(paste("An error occurred:", e$message))
}) })
print(df_players_location)
df_players_location <- df_players_location %>% t() df_players_location <- df_players_location %>% t()
print(df_players_location)
# df <- do.call(rbind, df_test$shot$freeze_frame[[1]]$location) %>% as.data.frame()
# dff <- cbind(df, df_test$shot$freeze_frame[[1]]$teammate)
# colnames(dff) <- c("x", "y", "teammate")
#
#
# na_df <- as.data.frame(matrix(NA, nrow = 21 - nrow(df), ncol = ncol(dff)))
# colnames(na_df) <- colnames(dff)
# dff <- rbind(dff, na_df)
#
# wider_df <- dff %>%
# mutate(row = row_number()) %>%
# pivot_wider(names_from = row, values_from = c(x, y, teammate), names_sep = "_player")
tryCatch({ # TODO reduce error cases tryCatch({ # TODO reduce error cases
data$number_of_players_opponents <- mapply(function(sublist, x1_threshold) { data$number_of_players_opponents <- mapply(function(sublist, x1_threshold) {
@ -362,18 +329,15 @@ file_names <- list.files(path = "data/la_liga_events/", pattern = "*.json")
data_list <- lapply(paste("data/la_liga_events/", file_names, sep = ""), get_shots2) data_list <- lapply(paste("data/la_liga_events/", file_names, sep = ""), get_shots2)
combined_data <- bind_rows(data_list) combined_data <- bind_rows(data_list)
# ss <- lapply(data_list, nrow)
# sss <- unlist(ss)
#
# # sample data # # sample data
data <- fromJSON("data/la_liga_events/ (1000).json") %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot)) # data <- fromJSON("data/la_liga_events/ (1000).json") %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot))
data3_final <- combined_data %>% select(-c(shot_outcome_name, data3_final <- combined_data %>% select(-c(shot_outcome_name,
shot_saved_off_target, shot_saved_off_target,
shot_saved_to_post, shot_saved_to_post,
kick_off)) %>% kick_off)) %>%
mutate(shot_kick_off = ifelse(is.na(shot_kick_off), FALSE, shot_kick_off)) mutate(shot_kick_off = ifelse(is.na(shot_kick_off), FALSE, shot_kick_off))
pattern <- "^(x_player|y_player|teammate_player)[0-9]+$" pattern <- "^(x_player_|y_player_)([0-9]|Goalkeeper)+$"
cols <- names(data3_final)[grepl(pattern, names(data3_final))] cols <- names(data3_final)[grepl(pattern, names(data3_final))]
data3_final <- data3_final %>% unnest(all_of(cols)) data3_final <- data3_final %>% unnest(all_of(cols))
write_csv(data3_final, file = "data/final_data.csv") write_csv(data3_final, file = "data/final_data.csv")