more data cleaning

This commit is contained in:
LukaszChrostowski 2023-12-26 17:18:57 +01:00
parent fc2fef33e6
commit 2ca84b303c
5 changed files with 38404 additions and 38351 deletions

BIN
.DS_Store vendored

Binary file not shown.

File diff suppressed because it is too large Load Diff

BIN
notebooks/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -117,6 +117,17 @@ loc2distance <- function(x, y) {
sqrt(x^2 + y^2) sqrt(x^2 + y^2)
} }
# Euclidean distance between the points (x1, y1) and (x2, y2).
# The arithmetic is elementwise, so vectors of coordinates are accepted
# and a vector of distances is returned.
loc2locdistance <- function(x1, y1, x2, y2) {
  dx <- x1 - x2
  dy <- y1 - y2
  sqrt(dx^2 + dy^2)
}
# TODO
# handle the goalkeeper separately, using the value opponent_Goalkeeper
# order in row_number() by distance to the ball, split by teammate/opponent
# example: closest among teammates -> teammate_1,
# example: closest among opponents -> opponent_1
# in case of NA, fill with the values teammate_(nt+1):11, opponent_(no+1):11
get_shots2 <- function(json_file) { get_shots2 <- function(json_file) {
data <- fromJSON(json_file) %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot)) data <- fromJSON(json_file) %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot))
@ -126,28 +137,68 @@ get_shots2 <- function(json_file) {
data$x1 <- df_temp[,1] data$x1 <- df_temp[,1]
data$y1 <- df_temp[,2] data$y1 <- df_temp[,2]
data$shot$freeze_frame <- Map(function(ff, x1, y1) {
ff$x1 <- yd_to_m(x1)
ff$y1 <- yd_to_m(y1)
return(ff)
},
data$shot$freeze_frame, data$x1, data$y1)
tryCatch({ tryCatch({
df_players_location <- mapply( function(sublist) { df_players_location <- mapply( function(sublist) {
if (!is.null(sublist$teammate)) { if (!is.null(sublist$teammate)) {
df_players <- sapply(sublist$location, function(loc) c(120, 80) - loc %>% as.numeric() %>% yd_to_m() %>% round(., digits = 1)) %>% t() %>% as.data.frame() df_players <- sapply(sublist$location, function(loc) c(120, 80) - loc %>% as.numeric() %>% yd_to_m() %>% round(., digits = 1)) %>% t() %>% as.data.frame()
# df <- sapply(sublist$teammate, function(tmt) cbind(df_players, tmt)) # df <- sapply(sublist$teammate, function(tmt) cbind(df_players, tmt))
df <- cbind(df_players, sublist$teammate) df <- cbind(df_players, sublist$teammate, sublist$position$name, sublist$x1, sublist$y1)
colnames(df) <- c("x", "y", "teammate") colnames(df) <- c("x", "y", "teammate", "position_name", "x1", "y1")
df <- df %>% mutate(teammate = ifelse(teammate, "teammate", "opponent"),
distance = loc2locdistance(x1 = x, y1 = y, x2 = x1, y2 = y1)) %>%
arrange(distance)
groups_count <- df %>% group_by(teammate) %>% count() %>% as.data.frame()
if ( !("opponent" %in% groups_count$teammate) ) {
groups_count <- groups_count %>% add_row(teammate = "opponent", n = 0)
} else if ( !("teammate" %in% groups_count$teammate) ) {
groups_count <- groups_count %>% add_row(teammate = "teammate", n = 0)
}
# df$distance <- loc2locdistance(x1 = df$x, y1 = df$y, x2 = data$x1, y1 = data$y1)
# df <- df %>% mutate(distance = loc2locdistance(x1 = x, y1 = y, x2 = data$x1, y1 = data$y1)) %>% arrange(distance)
# df <- df %>% arrange(distance)
# df <- df %>% unnest(c(x, y, teammate)) # df <- df %>% unnest(c(x, y, teammate))
# df$x <- as.numeric(df$x) # df$x <- as.numeric(df$x)
# df$y <- as.numeric(df$y) # df$y <- as.numeric(df$y)
# df$teammate <- as.logical(df$teammate) %>% as.numeric() # df$teammate <- as.logical(df$teammate) %>% as.numeric()
#
na_df <- as.data.frame(matrix(NA, nrow = 21 - nrow(df), ncol = ncol(df))) na_df <- as.data.frame(matrix("na", nrow = 21 - nrow(df), ncol = ncol(df)))
colnames(na_df) <- colnames(df) colnames(na_df) <- colnames(df)
na_df$teammate <- rep(c("opponent", "teammate"), c(11, 10) - groups_count$n)
dff <- rbind(df, na_df) dff <- rbind(df, na_df)
dff <- dff %>% group_by(teammate) %>% mutate(rown = row_number(distance)) %>% ungroup() %>%
mutate(position_teammate = paste(teammate, ifelse(position_name == "Goalkeeper", position_name, rown), sep = "_")) %>%
select(-c(teammate, position_name, rown, distance, x1, y1)) %>%
mutate(x = ifelse(x == "na", NA, x),
y = ifelse(x == "na", NA, y))
# dff <- dff %>% mutate(row = row_number()) %>%
# mutate(position_teammate = paste(ifelse(position_name == "Goalkeeper", position_name, row), teammate, sep = "_")) %>%
# select(-c(teammate, position_name, row))
} else { } else {
dff <- as.data.frame(matrix(NA, nrow = 21, ncol = 3)) dff <- as.data.frame(matrix("na", nrow = 21, ncol = 3))
colnames(dff) <- c("x", "y", "teammate") colnames(dff) <- c("x", "y", "teammate")
dff$teammate <- rep(c("opponent", "teammate"), c(11, 10))
dff <- dff %>% group_by(teammate) %>% mutate(rown = row_number()) %>% ungroup() %>%
mutate(position_teammate = paste(teammate, rown)) %>%
select(c(-teammate, rown)) %>%
mutate(x = ifelse(x == "na", NA, x),
y = ifelse(x == "na", NA, y))
} }
# print(dff)
# print(wider_df)
# stop("123")
# %>%
wider_df <- dff %>% wider_df <- dff %>%
mutate(row = row_number()) %>% pivot_wider(names_from = position_teammate, values_from = c(x, y), names_sep = "_player_")
pivot_wider(names_from = row, values_from = c(x, y, teammate), names_sep = "_player")
# wider_df <- apply(wider_df, MARGIN = 2, unlist) # wider_df <- apply(wider_df, MARGIN = 2, unlist)
wider_df wider_df
}, data$shot$freeze_frame) }, data$shot$freeze_frame)
@ -157,7 +208,9 @@ get_shots2 <- function(json_file) {
print(json_file) print(json_file)
print(paste("An error occurred:", e$message)) print(paste("An error occurred:", e$message))
}) })
print(df_players_location)
df_players_location <- df_players_location %>% t() df_players_location <- df_players_location %>% t()
print(df_players_location)
# df <- do.call(rbind, df_test$shot$freeze_frame[[1]]$location) %>% as.data.frame() # df <- do.call(rbind, df_test$shot$freeze_frame[[1]]$location) %>% as.data.frame()
# dff <- cbind(df, df_test$shot$freeze_frame[[1]]$teammate) # dff <- cbind(df, df_test$shot$freeze_frame[[1]]$teammate)
@ -313,7 +366,7 @@ combined_data <- bind_rows(data_list)
# sss <- unlist(ss) # sss <- unlist(ss)
# #
# # sample data # # sample data
# data_test <- get_shots2("data/la_liga_events/ (1000).json") data <- fromJSON("data/la_liga_events/ (1000).json") %>% filter(type$name == "Shot") %>% dplyr::select(c(minute, position, location, shot))
data3_final <- combined_data %>% select(-c(shot_outcome_name, data3_final <- combined_data %>% select(-c(shot_outcome_name,
shot_saved_off_target, shot_saved_off_target,