From a98353e25757482cc88d3b55c807679f4fc0fb8d Mon Sep 17 00:00:00 2001 From: LukaszChrostowski Date: Mon, 27 Nov 2023 10:20:04 +0100 Subject: [PATCH] more cleaning --- data/data_test.csv | 19 ++++++++++ notebooks/dataCleaning.R | 81 +++++++++++++++++++++++++++++++++++----- 2 files changed, 91 insertions(+), 9 deletions(-) create mode 100644 data/data_test.csv diff --git a/data/data_test.csv b/data/data_test.csv new file mode 100644 index 0000000..d2f8dfb --- /dev/null +++ b/data/data_test.csv @@ -0,0 +1,19 @@ +"","minute","position_name","shot_body_part_name","shot_type_name","shot_outcome_name","shot_technique_name","shot_one_on_one","shot_first_time","shot_aerial_won","shot_saved_to_post","shot_deflected","shot_saved_off_target","shot_open_goal","shot_follows_dribble","shot_redirect","x1","y1","number_of_players","kick_off","angle","is_goal" +"1",11,"Right Wing","Left Foot","Open Play","Blocked","Normal",FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,93.2,44.4,9,FALSE,0.0640059213742957,0 +"2",13,"Right Wing","Left Foot","Open Play","Saved","Lob",TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,100.3,42.8,1,FALSE,0.061730105843871,0 +"3",21,"Right Center Forward","Right Foot","Open Play","Blocked","Normal",TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,110.1,28,1,FALSE,0.0624299048089645,0 +"4",25,"Left Midfield","Left Foot","Open Play","Off T","Half Volley",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,108.8,25,5,FALSE,0.0638870054917821,0 +"5",32,"Center Forward","Left Foot","Open Play","Goal","Lob",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,107.1,42.5,2,FALSE,0.0590398311560372,1 +"6",38,"Right Back","Left Foot","Open Play","Goal","Normal",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,107.7,47.3,6,FALSE,0.0569702959919461,1 +"7",42,"Right Wing","Left Foot","Free Kick","Off T","Normal",FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,87.3,45.3,11,FALSE,0.0660566548690278,0 +"8",47,"Right Wing","Head","Open Play","Saved","Normal",TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,112,32.2,1,FALSE,0.060353615283394,0 +"9",49,"Right Defensive Midfield","Right Foot","Free Kick","Off T","Normal",FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,84.7,35.2,11,FALSE,0.0736787207689688,0 +"10",58,"Right Wing","Right Foot","Open Play","Blocked","Normal",FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,115.6,51.7,2,FALSE,0.052762547731953,0 +"11",65,"Right Center Forward","Right Foot","Open Play","Goal","Volley",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,104.3,37.6,3,FALSE,0.0620981693525753,1 +"12",69,"Left Wing","Right Foot","Open Play","Saved","Normal",FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,101.2,24.7,4,FALSE,0.068243631613985,0 +"13",71,"Right Center Forward","Right Foot","Open Play","Saved","Volley",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,115.5,44.1,1,FALSE,0.0553048979119624,0 +"14",73,"Left Wing","Left Foot","Open Play","Blocked","Normal",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,108.3,33.4,6,FALSE,0.0617057300721472,0 +"15",73,"Center Forward","Right Foot","Open Play","Off T","Normal",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,107.7,39.4,6,FALSE,0.0599333680981008,0 +"16",76,"Center Forward","Right Foot","Open Play","Off T","Half Volley",TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,108.7,52.1,1,FALSE,0.0547568224463756,0 +"17",79,"Left Center Forward","Right Foot","Open Play","Off T","Volley",FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,113.9,34.4,4,FALSE,0.0588824107265361,0 +"18",89,"Right Center Forward","Left Foot","Open Play","Off T","Normal",FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,100.2,51.6,7,FALSE,0.0577379599935821,0 diff --git a/notebooks/dataCleaning.R b/notebooks/dataCleaning.R index 07ccd48..d57aa23 100644 --- a/notebooks/dataCleaning.R +++ b/notebooks/dataCleaning.R @@ -142,11 +142,65 @@ get_shots2 <- function(json_file) { data$shot <- data$shot %>% select(-end_location) tryCatch({ # TODO reduce error cases - if ("one_on_one" %in% colnames(data$shot)) data[is.na(data$shot$one_on_one), ]$shot$one_on_one <- FALSE - if ("first_time" %in% colnames(data$shot)) data[is.na(data$shot$first_time), ]$shot$first_time <- FALSE - if ("aerial_won" %in% colnames(data$shot)) data[is.na(data$shot$aerial_won), ]$shot$aerial_won <- FALSE - # data$shot$one_on_one[is.na(data$shot$one_on_one)] <- FALSE - # data$shot$first_time[is.na(data$shot$first_time)] <- FALSE + if ("one_on_one" %in% colnames(data$shot)) { + data[is.na(data$shot$one_on_one), ]$shot$one_on_one <- FALSE + } else { + data$shot$one_on_one <- FALSE + } + + if ("first_time" %in% colnames(data$shot)) { + data[is.na(data$shot$first_time), ]$shot$first_time <- FALSE + } else { + data$shot$first_time <- FALSE + } + + if ("aerial_won" %in% colnames(data$shot)) { + data[is.na(data$shot$aerial_won), ]$shot$aerial_won <- FALSE + } else { + data$shot$aerial_won <- FALSE + } + + if ("saved_to_post" %in% colnames(data$shot)) { + data[is.na(data$shot$saved_to_post), ]$shot$saved_to_post <- FALSE + } else { + data$shot$saved_to_post <- FALSE + } + + if ("deflected" %in% colnames(data$shot)) { + data[is.na(data$shot$deflected), ]$shot$deflected <- FALSE + } else { + data$shot$deflected <- FALSE + } + + if ("saved_off_target" %in% colnames(data$shot)) { + data[is.na(data$shot$saved_off_target), ]$shot$saved_off_target <- FALSE + } else { + data$shot$saved_off_target <- FALSE + } + + if ("open_goal" %in% colnames(data$shot)) { + data[is.na(data$shot$open_goal), ]$shot$open_goal <- FALSE + } else { + data$shot$open_goal <- FALSE + } + + if ("follows_dribble" %in% colnames(data$shot)) { + data[is.na(data$shot$follows_dribble), ]$shot$follows_dribble <- FALSE + } else { + data$shot$follows_dribble <- FALSE + } + + if ("redirect" %in% colnames(data$shot)) { + data[is.na(data$shot$redirect), ]$shot$redirect <- FALSE + } else { + data$shot$redirect <- FALSE + } + + if ("kick_off" %in% colnames(data$kick_off)) { + data[is.na(data$shot$kick_off), ]$shotf$kick_off <- FALSE + } else { + data$kick_off <- FALSE + } }, error = function(e) { # handle the error @@ -156,13 +210,22 @@ get_shots2 <- function(json_file) { data <- data %>% mutate(angle = atan(7.32 * x1 / (x1^2 + y1^2 - (7.32/2)^2)), is_goal = ifelse(shot$outcome$id == 97, 1, 0)) %>% select(-location) - data$shot$outcome <- data$shot$outcome %>% select(-id) data$angle <- ifelse(data$angle<0, base::pi + data$angle, data$angle) + data$shot$outcome <- data$shot$outcome %>% select(-id) + data <- data %>% unnest(shot, names_sep = "_") %>% + unnest(position, names_sep = "_") %>% + unnest(shot_type, names_sep = "_") %>% + unnest(shot_outcome, names_sep = "_") %>% + unnest(shot_technique, names_sep = "_") %>% + unnest(shot_body_part, names_sep = "_") data } file_names <- list.files(path = "data/la_liga_events/", pattern = "*.json") data_list <- lapply(paste("data/la_liga_events/", file_names, sep = ""), get_shots2) -# combined_data <- do.call(rbind, data_list[1:10]) -combined_data <- data_list[[1]] # TODO unlist data -write.csv(combined_data, file = "data/data3.csv") +combined_data <- do.call(rbind, data_list[1:10]) + +# sample data +data_test <- get_shots2("data/la_liga_events/303377.json") +write.csv(data_test, file = "data/data_test.csv") +