wizualizacja-danych/proj3/projekt.R

442 lines
17 KiB
R
Raw Normal View History

2021-06-23 14:40:48 +02:00
# dataset: https://www.kaggle.com/aiaiaidavid/airplane-crash-fatalities-since-1908-dv-03032020
# setup srodowiska ####
2021-06-23 14:40:48 +02:00
setwd('/Users/patrycjalazna/Desktop/wizualizacja-danych/projekt_3/')
# importy ####
library(shiny)
library(shinyTime)
library(shinyjs)
library(highcharter)
library(splusTimeDate)
library(tidyverse)
library(lubridate)
library(gridExtra)
library(usmap)
library(tidytext)
library(tm)
library(SnowballC)
library(wordcloud)
library(dplyr)
library(streamgraph)
library(plotly)
library(DT)
# zaladowanie datasetu ####
airplane_crashes <- read.csv('airplane_crashes.csv')
# podsumowanie ####
summary <- summary(airplane_crashes)
colnames(airplane_crashes)
# preprocessing ####
# zmiana nazwy kolumn
colnames(airplane_crashes) <- c('Date', 'Time', 'Location', 'Operator', 'Flight', 'Route', 'Type',
'Registration', 'cn/ln', 'Total_Onboard', 'Passengers_Onboard',
'Crew_Onboard', 'Total_Fatalities', 'Passengers_Fatalities',
'Crew_Fatalities', 'Ground', 'Summary')
# konwersja na prawidlowy format Date
airplane_crashes$Date <- mdy(airplane_crashes$Date)
airplane_crashes$Time <- hm(airplane_crashes$Time)
# ekstrakcja informacji z datasetu, zapis w poprawnym formacie
# ekstrakcja informacji z kolumny Date
airplane_crashes$Year <- year(airplane_crashes$Date)
airplane_crashes$Month <- as.factor(month(airplane_crashes$Date))
airplane_crashes$Day <- as.factor(day(airplane_crashes$Date))
airplane_crashes$Weekday <- as.factor(wday(airplane_crashes$Date))
airplane_crashes$Is_Leap_Year <- leap_year(airplane_crashes$Date)
airplane_crashes$Decade <- year(floor_date(airplane_crashes$Date, years(10)))
airplane_crashes$Date <- format(as.Date(airplane_crashes$Date, '%m/%d/%Y'), '%d/%m/%Y')
# ekstrakcja godziny, minuty i AM/PM z kolumny Time
airplane_crashes$Hour <- as.integer(hour(airplane_crashes$Time))
airplane_crashes$Minute <- as.factor(minute(airplane_crashes$Time))
airplane_crashes$AM_PM <- if_else(am(airplane_crashes$Time), 'AM', 'PM')
airplane_crashes$btwn_6PM_6AM <- if_else
# ShinyApp ####
2021-06-23 14:40:48 +02:00
ui <- fluidPage(
2021-06-23 14:40:48 +02:00
id = "main_content",
navbarPage("Airplane crashes from 1908 to 2020",
2021-06-23 16:45:41 +02:00
# tabPanel("General overview",
# sidebarLayout(
# sidebarPanel(
# # wybór daty
# dateRangeInput('dates',
# 'Date range:',
# min(airplane_crashes$Date),
# max(airplane_crashes$Date)),
# # wybor godziny
# timeInput("time", "Time (local):", seconds = FALSE)
# ),
# mainPanel(
# # wykresiki
# plotlyOutput("weekdayCrashes"),
# fixedRow(
# column(3, plotlyOutput("hourCrashes")),
# column(3, plotlyOutput("monthCrashes"))
# )
# )
# )
# ),
2021-06-23 14:40:48 +02:00
tabPanel("General overview",
2021-06-23 16:45:41 +02:00
fixedRow(
2021-06-23 17:46:00 +02:00
column(3,
2021-06-23 16:45:41 +02:00
dateRangeInput('dates',
'Date range:',
min(airplane_crashes$Date),
max(airplane_crashes$Date)),
# wybor godziny
2021-06-23 17:46:00 +02:00
timeInput("time", "Time (local):", seconds = FALSE)
),
column(6, plotOutput("crashes_each_year"))
2021-06-23 16:45:41 +02:00
),
2021-06-23 17:46:00 +02:00
2021-06-23 16:45:41 +02:00
fixedRow(
column(3, plotlyOutput("hourCrashes")),
2021-06-23 17:46:00 +02:00
column(3, plotlyOutput("weekdayCrashes")),
2021-06-23 16:45:41 +02:00
column(3, plotlyOutput("monthCrashes"))
)),
2021-06-23 14:40:48 +02:00
tabPanel("Streamgraph",
mainPanel(
streamgraphOutput("operatorCrashes")
)
),
tabPanel("Map",
mainPanel(
# mapy
plotlyOutput("mapPlot")
#plotOutput("mapPlot")
)
),
tabPanel("Data",
sidebarPanel(
dateRangeInput('dataDates',
'Date range:',
min(airplane_crashes$Date),
2021-06-23 16:45:41 +02:00
max(airplane_crashes$Date)),
downloadButton('downloadData', 'Download')
2021-06-23 14:40:48 +02:00
),
mainPanel(
DT::dataTableOutput('allData')
2021-06-23 16:45:41 +02:00
2021-06-23 14:40:48 +02:00
))
)
)
2021-06-23 14:40:48 +02:00
server <- function(input, output, session) {
# godzina
observe(print(strftime(input$time, "%R")))
2021-06-23 17:46:00 +02:00
output$crashes_each_year <- renderPlot({
year_wise <- airplane_crashes %>% count(Year)
year_wise <- year_wise %>%
filter(Year > year(input$dates[1])) %>%
filter(Year < year(input$dates[2]))
ggplot(year_wise, aes(x = Year, y = n)) +
geom_col(fill = '#0f4c75', col = 'white') +
labs(title = 'No. of Airplane crashes since 1908 each year', x = '', y = '') +
scale_x_continuous(breaks = seq(1908, 2020, 4))
})
output$weekdayCrashes <- renderPlotly({
2021-06-23 17:46:00 +02:00
week_day <- airplane_crashes %>% count(Weekday)
2021-06-22 20:00:43 +02:00
week_day$Weekday <- as.character(week_day$Weekday)
week_day$Weekday[week_day$Weekday == 1] <- "Monday"
week_day$Weekday[week_day$Weekday == 2] <- "Tuesday"
week_day$Weekday[week_day$Weekday == 3] <- "Wednesday"
week_day$Weekday[week_day$Weekday == 4] <- "Thursday"
week_day$Weekday[week_day$Weekday == 5] <- "Friday"
week_day$Weekday[week_day$Weekday == 6] <- "Saturday"
week_day$Weekday[week_day$Weekday == 7] <- "Sunday"
# zmiana kolejnosci dni; dni nie powinny sie wyswietlac alfabetycznie
week_day$Weekday <- factor(week_day$Weekday, levels= c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
2021-06-23 17:46:00 +02:00
# plot_ly(
# x = week_day$Weekday, y = week_day$n,
# name = 'No. of crashes per week day', type = "bar"
# ) %>%
# layout(title = 'Week Day Crashes',
# xaxis = list(title = "Weekdays",
# zeroline = FALSE),
# yaxis = list(title = "No. of crashes",
# zeroline = FALSE))
df <- week_day
df <- df %>% arrange(Weekday)
fig <- df %>% plot_ly(labels = ~Weekday, values = ~n, sort=FALSE)
fig <- fig %>% add_pie(hole = 0.4)
fig <- fig %>% layout(title = "No. of crashes per weekday", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
#
# df <- df %>% arrange(Hour)
# df$Hour <- as.character(df$Hour)
# df$Hour <- paste(df$Hour, ":00", sep = "")
# fig <- df %>% plot_ly(labels = ~Hour, values = ~n, sort=FALSE)
# fig <- fig %>% add_pie(hole = 0.4)
# fig <- fig %>% layout(title = "No. of crashes per hour", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
# yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
#
# fig
})
2021-06-23 15:49:55 +02:00
output$hourCrashes <- renderPlotly({
crashes_per_hours <- airplane_crashes %>% count(Hour)
crashes_per_hours <- crashes_per_hours[-c(1,26,27,28), ]
#dodanie pozycji labeli
# crashes_per_hours <- crashes_per_hours %>%
# arrange(Hour) %>%
# mutate(lab.ypos = cumsum(n) - 0.5*n)
#
# ggplot(crashes_per_hours, aes(x = "", y = n, fill = Hour)) +
# geom_bar(stat="identity", width=1, color="white") +
# coord_polar("y", start=0) +
# geom_text(aes(y = lab.ypos, label = Hour), color = "white")+
# theme_void() +
# theme(legend.position="none")
df <- crashes_per_hours
2021-06-23 16:27:56 +02:00
crashes_per_months$Month[crashes_per_months$Month == 1] <- "January"
2021-06-23 15:49:55 +02:00
df <- df %>% arrange(Hour)
2021-06-23 16:27:56 +02:00
df$Hour <- as.character(df$Hour)
df$Hour <- paste(df$Hour, ":00", sep = "")
2021-06-23 15:49:55 +02:00
fig <- df %>% plot_ly(labels = ~Hour, values = ~n, sort=FALSE)
fig <- fig %>% add_pie(hole = 0.4)
fig <- fig %>% layout(title = "No. of crashes per hour", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
2021-06-23 17:46:00 +02:00
2021-06-23 15:49:55 +02:00
})
output$monthCrashes <- renderPlotly({
crashes_per_months <- airplane_crashes %>% count(Month)
crashes_per_months$Month <- as.character(crashes_per_months$Month)
crashes_per_months$Month[crashes_per_months$Month == 1] <- "January"
crashes_per_months$Month[crashes_per_months$Month == 2] <- "February"
crashes_per_months$Month[crashes_per_months$Month == 3] <- "March"
crashes_per_months$Month[crashes_per_months$Month == 4] <- "April"
crashes_per_months$Month[crashes_per_months$Month == 5] <- "May"
crashes_per_months$Month[crashes_per_months$Month == 6] <- "June"
crashes_per_months$Month[crashes_per_months$Month == 7] <- "July"
crashes_per_months$Month[crashes_per_months$Month == 8] <- "August"
crashes_per_months$Month[crashes_per_months$Month == 9] <- "September"
crashes_per_months$Month[crashes_per_months$Month == 10] <- "October"
crashes_per_months$Month[crashes_per_months$Month == 11] <- "November"
crashes_per_months$Month[crashes_per_months$Month == 12] <- "December"
df <- crashes_per_months
df <- df %>% arrange(match(Month, c("January", "February","March", "April", "May", "June", "July", "August", "September", "October", "November", "December")))
fig <- df %>% plot_ly(labels = ~Month, values = ~n, sort=FALSE)
fig <- fig %>% add_pie(hole = 0.4)
fig <- fig %>% layout(title = "No. of crashes per month", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
})
2021-06-23 15:49:55 +02:00
2021-06-23 14:40:48 +02:00
output$operatorCrashes <- renderStreamgraph({
main_type_wise_year <- airplane_crashes %>%
select(Year, Type) %>%
# replace model number by empty strings
mutate(main_type = str_replace_all(Type, "[A-Za-z]*-?\\d+-?[A-Za-z]*.*", "")) %>%
# skip empty strings row
filter(main_type > 'A') %>%
group_by(Year, main_type) %>%
summarize(n = n()) %>%
group_by(Year) %>%
top_n(5, n)
# i teraz z tego streamgraph
main_type_wise_year <- main_type_wise_year %>% arrange(-n)
streamgraph <- main_type_wise_year %>%
group_by(Year, main_type) %>%
tally(wt=n) %>%
streamgraph("main_type", "n", "Year", interpolate="cardinal") %>%
sg_fill_brewer("PuOr") %>%
sg_legend(TRUE)
streamgraph
})
output$mapPlot <- renderPlotly({
states_list <- c('Alabama','Alaska','Alaksa','Arizona','Arkansas',"California",
"Colorado", "Connecticut","Delaware","Florida","Georgia",
"Hawaii","Idaho","Illinois", "Indiana","Iowa","Kansas",
"Kentucky","Louisiana","Maine","Maryland", "Massachusetts",
"Massachusett", "Michigan","Minnesota","Mississippi","Missouri",
"Montana", "Nebraska","Nevada","New Hampshire","New Jersey",
"New Mexico","New York", "North Carolina","North Dakota","Ohio",
"Oklahoma", "Oklohoma", "Oregon","Pennsylvania", "Rhode Island",
"South Carolina",
"South Dakota",'Tennesee',"Tennessee","Texas","Utah", "Vermont",
'Virginia',"Washington D.C.", "Washington, D.C.", "Washington",
"West Virginia","Wisconsin","Wyoming",
"AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
"NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY")
location <- airplane_crashes %>%
select(Location)
for(state in states_list) {
location <- location %>%
mutate(Location = str_replace_all(Location, state, paste(state, ', USA', sep = ''))) %>%
mutate(Location = str_replace_all(Location, 'USA.*, ', '')) %>%
mutate(Location = str_replace(Location, 'West Virginia, USA,', '')) %>%
mutate(Location = str_replace(Location, 'Afghanstan', 'Afghanistan')) %>%
mutate(Location = str_replace(Location, 'Airzona|Arazona', 'Arizona')) %>%
mutate(Location = str_replace(Location, 'Alakska', 'Alaska')) %>%
mutate(Location = str_replace(Location, 'Cailifornia|Calilfornia', 'California')) %>%
mutate(Location = str_replace(Location, 'D.*Congo', 'DR Congo')) %>%
mutate(Location = str_replace(Location, 'Domincan Republic', 'Dominican Republic')) %>%
mutate(Location = str_replace(Location, 'Hati', 'Haiti')) %>%
mutate(Location = str_replace(Location, ' International Airport', '')) %>%
mutate(Location = str_replace(Location, 'Morrocco|Morroco', 'Morocco')) %>%
mutate(Location = str_replace(Location, 'Phillipines', 'Phillipines')) %>%
mutate(Location = str_replace(Location, 'Burma', 'Myanmar')) %>%
mutate(Location = str_replace(Location, '([Ss]outhern|[Nn]orthern|[Ww]estern|[Ee]astern) ', ''))}
country_state <- location %>%
select(Location) %>%
filter(!str_detect(Location, '[Oo]cean|[Ss]ea|[Cc]hannel|Gulf of')) %>%
mutate(Location = str_replace(Location, '(Near|Off|Over) ', '')) %>%
mutate(Location = str_replace(Location, 'USA, Australia', 'Australia')) %>%
mutate(State_Province = str_replace(Location, '(.*, )?(.*), (.*)', '\\2')) %>%
mutate(Country = str_replace(Location, '.*,\\s*', ''))
cntry <- country_state %>%
group_by(Country) %>%
summarize(n = n()) %>%
arrange(desc(n))
# podzial na kategorie wg liczby wypadkow w danym kraju
cntry <- cntry %>%
mutate(m = case_when(
n > 200 ~ "200 +",
n < 200 & n >= 100 ~ "199 - 100",
n < 100 & n >= 50 ~ "99 - 50",
n < 50 & n >= 10 ~ "49 - 10",
n < 10 ~ "< 10")) %>%
mutate(m = factor(m, levels = c("< 10", "49 - 10", "99 - 50", "199 - 100", "200 +")))
world_map <- map_data("world")
map_data <- cntry %>%
full_join(world_map, by = c('Country' = 'region'))
2021-06-23 15:49:55 +02:00
map_pal = c('#E6CE39', '#F09B3C', '#D94B41', '#CD3CF0', '#555CEB')
ggplot_obj <-
ggplot(map_data, aes(x = long, y = lat, group = group, fill = m)) +
geom_polygon(colour = "white") +
labs(title = 'No. of crashes in each country', x = '', y = '', fill = '') +
scale_fill_manual(values = map_pal, na.value = 'whitesmoke') +
theme(legend.position='right', legend.justification = "top") +
guides(fill = guide_legend(reverse = TRUE))
2021-06-23 14:40:48 +02:00
ggplotly(
p = ggplot_obj,
tooltip = "all",
dynamicTicks = FALSE,
layerData = 1,
originalData = TRUE
)
})
2021-06-23 16:45:41 +02:00
output$downloadData <- downloadHandler(
filename = function() {
paste('airplane_crashes', '.csv', sep='')
},
content = function(file) {
write.csv(airplane_crashes, file)
}
)
output$allData <- DT::renderDataTable({
airplane_crashes %>%
2021-06-23 14:40:48 +02:00
# filter(Year == input$dataDates) %>%
DT::datatable()
})
}
shinyApp(ui = ui, server = server)
2021-06-23 14:40:48 +02:00
# tutaj sb rysuje wykresiki demo, zeby nie odpalac shiny za kazdym razem ####
# number of fatal accidents by operator - top 5
##
# extract and group by manufacturer
# main_type_wise_year <- airplane_crashes %>%
# select(Year, Type) %>%
# # replace model number by empty strings
# mutate(main_type = str_replace_all(Type, "[A-Za-z]*-?\\d+-?[A-Za-z]*.*", "")) %>%
# # skip empty strings row
# filter(main_type > 'A') %>%
# group_by(Year, main_type) %>%
# summarize(n = n()) %>%
# group_by(Year) %>%
# top_n(5, n)
# #%>% mutate(main_type = reorder_within(-n, main_type, Year))
#
# # i teraz z tego streamgraph
# main_type_wise_year <- main_type_wise_year %>% arrange(-n)
# main_type_wise_year %>%
# group_by(Year, main_type) %>%
# tally(wt=n) %>%
# streamgraph("main_type", "n", "Year", interpolate="cardinal") %>%
# sg_fill_brewer("PuOr") %>%
# sg_legend(TRUE)
####
# airplane_operators <- airplane_crashes %>% count(Operator, sort = TRUE)
# airplane_operators <- head(airplane_operators, 5)
# ggplot(airplane_operators, aes(x = Operator, y = n)) +
# geom_point(color = "#765432", alpha = 0.8) +
# labs(title = "No. of fatal accidents by operator", x = '', y = '')
#
# airplane_operators %>%
# streamgraph(key="Operator", value=, date="Year", offset="zero") %>%
# sg_fill_brewer("BuPu")
# wykres ile wypadkow kazdego operatora w kazdym roku