# fantastyczne_gole/notebooks/dataCleaning.R
# 2023-11-28 22:07:24 +01:00
#
# 258 lines
# 8.8 KiB
# R

##################### The first dataset ##############################
## Load the R packages and define the custom "get_shots" helper function
library(jsonlite)
library(tidyverse)
library(ggsoccer)
library(dplyr)
# code and data from https://github.com/Dato-Futbol/xg-model
# Extract the unblocked shots from an events JSON file and attach each
# shooter's `foot` from "data/players.json".
#
# Arguments:
#   file_path    Path to an events JSON file
#                (e.g. "data/events/events_England.json").
#   name_detail  Label stored in the `league` column; also used as the suffix
#                of the optional .rds file names.
#   save_files   If TRUE, write the intermediate tables (`shots`, `tags2`,
#                `pos`) and the final table as .rds files named
#                "<table><name_detail>.rds" in the working directory.
#
# Returns one row per unblocked shot with the columns matchId, teamId,
# playerId, eventSec, matchPeriod, y1, x1, is_goal, is_blocked, is_CA,
# body_part, foot and league.
get_shots <- function(file_path, name_detail, save_files = FALSE) {
  players <- fromJSON("data/players.json")

  shots <- fromJSON(file_path) %>%
    filter(subEventName == "Shot")

  # Spread the tag ids of every shot into one column per tag
  # (tags_id1, tags_id2, ...).
  tags <- tibble(tags = shots$tags) %>%
    hoist(tags,
          tags_id = "id") %>%
    unnest_wider(tags_id, names_sep = "")

  # A flag is set when any tag column of the row holds the given tag id.
  # `body_part` falls back to the literal string "NA" (not a missing value)
  # when none of the three body-part tags is present.
  tags2 <- tags %>%
    mutate(is_goal = ifelse(rowSums(. == "101", na.rm = TRUE) > 0, 1, 0),
           is_blocked = ifelse(rowSums(. == "2101", na.rm = TRUE) > 0, 1, 0),
           # is counter attack
           is_CA = ifelse(rowSums(. == "1901", na.rm = TRUE) > 0, 1, 0),
           body_part = ifelse(rowSums(. == "401", na.rm = TRUE) > 0, "left",
                       ifelse(rowSums(. == "402", na.rm = TRUE) > 0, "right",
                       ifelse(rowSums(. == "403", na.rm = TRUE) > 0,
                              "head/body", "NA"))))

  # Each shot carries two positions; only the first one (x1, y1) is kept.
  pos <- tibble(positions = shots$positions) %>%
    hoist(positions,
          y = "y",
          x = "x") %>%
    unnest_wider(y, names_sep = "") %>%
    unnest_wider(x, names_sep = "") %>%
    dplyr::select(-c(x2, y2))

  shots_ok <- shots %>%
    dplyr::select(matchId, teamId, playerId, eventSec, matchPeriod) %>%
    bind_cols(pos, tags2) %>%
    filter(is_blocked == 0) %>%
    # Drop the raw tag columns by name instead of by position (8:13), so the
    # derived flags survive when a file yields a different number of tags.
    dplyr::select(-starts_with("tags")) %>%
    left_join(players %>%
                dplyr::select(c("wyId", "foot")),
              by = c("playerId" = "wyId")) %>%
    mutate(league = name_detail)

  if (save_files) {
    write_rds(shots, paste0("shots", name_detail, ".rds"))
    write_rds(tags2, paste0("tags2", name_detail, ".rds"))
    write_rds(pos, paste0("pos", name_detail, ".rds"))
    write_rds(shots_ok, paste0("unblocked_shots", name_detail, ".rds"))
  }

  shots_ok
}
# shotsEN <- get_shots("data/events/events_England.json", "EN")
# shotsSP <- get_shots("data/events/events_Spain.json", "SP")
# shotsWC <- get_shots("data/events/events_World_Cup.json", "WC")
# shotsIT <- get_shots("data/events/events_Italy.json", "IT")
# shotsGE <- get_shots("data/events/events_Germany.json", "GE")
# shotsFR <- get_shots("data/events/events_France.json", "FR")
# shotsEC <- get_shots("data/events/events_European_Championship.json", "EC")
#
# shots <- shotsEN %>%
# bind_rows(shotsFR, shotsGE, shotsIT, shotsSP, shotsWC, shotsEC)
# Turn raw shot records into model features.
#
# `data` must contain the columns eventSec, y1, x1, is_goal, is_blocked,
# is_CA, body_part and foot. x1 / y1 are treated as percentages (0-100) of the
# pitch length / width and rescaled to a 105 x 68 pitch (presumably metres --
# confirm against the data source).
#
# Returns those eight columns plus `angle`, `distance` and `minute`, where
#   x1        distance from the goal line: (100 - x1) * 105 / 100
#   y1        position across the pitch:   y1 * 68 / 100
#   angle     angle (radians, in [0, pi)) under which the 7.32-wide goal is
#             seen from the shot location
#   distance  distance from the shot location to the centre of the goal
#   minute    eventSec / 60, rounded (computed before eventSec is rounded)
#   eventSec  rounded to whole seconds
get_final_data <- function(data) {
  pitch_length <- 105
  pitch_width <- 68
  goal_width <- 7.32

  keep <- c("eventSec", "y1", "x1", "is_goal", "is_blocked", "is_CA",
            "body_part", "foot")
  data <- data[, keep, drop = FALSE]

  data$x1 <- (100 - data$x1) * pitch_length / 100
  # Previously `y1 * y1 / 100`, which squared the coordinate instead of
  # rescaling it to the pitch width that the `34 - y1` term below relies on.
  data$y1 <- data$y1 * pitch_width / 100

  # Lateral offset of the shot from the centre of the goal.
  offset <- data$y1 - pitch_width / 2

  # tan(angle) = w * x / (x^2 + c^2 - (w / 2)^2), with x the distance from the
  # goal line and c the lateral offset. atan() returns a negative value when
  # the true angle exceeds pi / 2, so those cases are shifted by pi.
  angle <- atan(goal_width * data$x1 /
                  (data$x1^2 + offset^2 - (goal_width / 2)^2))
  data$angle <- ifelse(angle < 0, base::pi + angle, angle)

  # x1 is already the distance from the goal line, so it is used directly
  # (the previous `100 - x1` mixed the percent and rescaled scales).
  data$distance <- sqrt(data$x1^2 + offset^2)
  data$minute <- round(data$eventSec / 60)
  data$eventSec <- round(data$eventSec)
  data
}
# data1 <- get_final_data(shots)
# write.csv(data1, file = "data/data1.csv")
##################### The second dataset ##############################
# Read the events CSV, join match information onto it and return the rows
# with `event_type == 1` (presumably the shot attempts -- confirm against the
# dataset's documentation).
#
# Arguments:
#   event_path  CSV of events; needs `id_odsp`, `event_type` and the columns
#               selected at the end of this function.
#   info_path   CSV of match information; needs `id_odsp`, `country`, `date`.
#
# Returns a data frame with the columns sort_order, time, shot_place,
# shot_outcome, is_goal, location, bodypart, assist_method, situation and
# fast_break.
get_data <- function(event_path, info_path) {
  raw_events <- read.csv(event_path)
  match_info <- read.csv(info_path)

  # Left join on the match id: every event is kept even without match info.
  # NOTE(review): `country` and `date` are joined here but are not part of
  # the final selection.
  match_cols <- match_info[, c("id_odsp", "country", "date")]
  joined <- merge(raw_events, match_cols, by = "id_odsp", all.x = TRUE)

  attempts <- subset(joined, event_type == 1)

  attempts %>%
    select(sort_order, time, shot_place, shot_outcome, is_goal, location,
           bodypart, assist_method, situation, fast_break)
}
# data2 <- get_data(event_path = "data/events.csv", info_path = "data/ginf.csv")
# write.csv(data2, file = "data/data2.csv")
# TODO: add columns with x/y coordinates derived from `location`
# TODO: add a column with the angle to the goal
##################### The third dataset ##############################
# TODO: not all JSON files have the same keys/fields; extend the function to cover all cases
# Build a shot-level table from one event JSON file (the format carrying
# fields such as `statsbomb_xg` and `freeze_frame`).
#
# Arguments:
#   json_file  Path to the event JSON file.
#
# Returns one row per event whose `type$name` is "Shot", with
#   minute, x1, y1               minute and first two `location` coordinates
#   number_of_players_opponents  freeze-frame opponents whose first location
#                                coordinate is greater than the shot's x1
#   number_of_players_teammates  same count for teammates
#   angle, is_goal               see the comments in the body
#   position_*, shot_*           flattened `position` and `shot` fields; the
#                                boolean shot flags listed in `shot_flags`
#                                are always present, with NA replaced by
#                                FALSE
# The two player-count columns are best effort: if counting fails, the error
# message is printed and the column is left out.
get_shots2 <- function(json_file) {
  data <- fromJSON(json_file) %>%
    filter(type$name == "Shot") %>%
    dplyr::select(c(minute, position, location, shot))

  # `location` is a list of coordinate vectors; stack them into a matrix.
  coords <- do.call(rbind, data$location)
  data$x1 <- coords[, 1]
  data$y1 <- coords[, 2]

  # Count the players of one freeze frame that are on the requested side
  # (teammates or opponents) and whose first location coordinate is greater
  # than the shot's x1. Frames without a `teammate` field count as 0.
  count_players_ahead <- function(frame, x1_threshold, teammates) {
    if (!("teammate" %in% names(frame))) {
      return(0)
    }
    first_coord <- vapply(frame$location,
                          function(loc) as.numeric(loc[1]),
                          numeric(1))
    on_side <- if (teammates) frame$teammate else !frame$teammate
    sum(on_side & first_coord > x1_threshold)
  }

  # Add one player-count column; on error, report it and return `data`
  # unchanged.
  add_player_count <- function(data, column, teammates) {
    tryCatch({
      data[[column]] <- mapply(count_players_ahead,
                               data$shot$freeze_frame, data$x1,
                               MoreArgs = list(teammates = teammates))
      data
    },
    error = function(e) {
      print(paste("An error occurred:", e$message))
      data
    })
  }

  data <- add_player_count(data, "number_of_players_opponents",
                           teammates = FALSE)
  data <- add_player_count(data, "number_of_players_teammates",
                           teammates = TRUE)

  # Drop the shot fields that are not exported. any_of() tolerates files in
  # which some of these keys are absent.
  data$shot <- data$shot %>%
    dplyr::select(-any_of(c("freeze_frame", "statsbomb_xg", "key_pass_id",
                            "end_location")))

  # Drop the `id` field of the nested lookup columns.
  data$shot$body_part <- data$shot$body_part %>% dplyr::select(-id)
  data$shot$technique <- data$shot$technique %>% dplyr::select(-id)
  data$shot$type <- data$shot$type %>% dplyr::select(-id)
  data$position <- data$position %>% dplyr::select(-id)

  # Make every boolean shot flag present and NA-free. `kick_off` is now
  # handled like the other flags: the previous code tested `data$kick_off`
  # and wrote to `$shotf`, so `shot$kick_off` was never filled and a stray
  # top-level `kick_off` column was created instead.
  shot_flags <- c("one_on_one", "first_time", "aerial_won", "saved_to_post",
                  "deflected", "saved_off_target", "open_goal",
                  "follows_dribble", "redirect", "kick_off")
  for (flag in shot_flags) {
    if (flag %in% colnames(data$shot)) {
      data$shot[[flag]][is.na(data$shot[[flag]])] <- FALSE
    } else {
      data$shot[[flag]] <- rep(FALSE, nrow(data$shot))
    }
  }

  # NOTE(review): the angle formula is applied to the raw x1 / y1 event
  # coordinates, not to offsets from the goal, and with a goal width of 7.32;
  # confirm this matches the coordinate system and units of the data.
  # is_goal is 1 when the shot outcome id equals 97.
  data <- data %>%
    mutate(angle = atan(7.32 * x1 / (x1^2 + y1^2 - (7.32 / 2)^2)),
           is_goal = ifelse(shot$outcome$id == 97, 1, 0)) %>%
    dplyr::select(-location)
  # atan() is negative when the true angle exceeds pi / 2; shift by pi.
  data$angle <- ifelse(data$angle < 0, base::pi + data$angle, data$angle)

  # The outcome id is only needed for is_goal above.
  data$shot$outcome <- data$shot$outcome %>% dplyr::select(-id)

  # Flatten the nested data-frame columns into prefixed plain columns.
  data <- data %>%
    unnest(shot, names_sep = "_") %>%
    unnest(position, names_sep = "_") %>%
    unnest(shot_type, names_sep = "_") %>%
    unnest(shot_outcome, names_sep = "_") %>%
    unnest(shot_technique, names_sep = "_") %>%
    unnest(shot_body_part, names_sep = "_")
  data
}
# file_names <- list.files(path = "data/la_liga_events/", pattern = "*.json")
# data_list <- lapply(paste("data/la_liga_events/", file_names, sep = ""), get_shots2)
# combined_data <- do.call(rbind, data_list)
# ss <- lapply(data_list, nrow)
# sss <- unlist(ss)
#
# # sample data
# data_test <- get_shots2("data/la_liga_events/303377.json")
# write.csv(data_test, file = "data/data3_test.csv")
##################### The fourth dataset ##############################