wizualizacja-danych/proj3/projekt.R

348 lines
13 KiB
R
Raw Normal View History

2021-06-23 14:40:48 +02:00
# dataset: https://www.kaggle.com/aiaiaidavid/airplane-crash-fatalities-since-1908-dv-03032020
# setup srodowiska ####
2021-06-23 14:40:48 +02:00
setwd('/Users/patrycjalazna/Desktop/wizualizacja-danych/projekt_3/')
# importy ####
library(shiny)
library(shinyTime)
library(shinyjs)
library(highcharter)
library(splusTimeDate)
library(tidyverse)
library(lubridate)
library(gridExtra)
library(usmap)
library(tidytext)
library(tm)
library(SnowballC)
library(wordcloud)
library(dplyr)
library(streamgraph)
library(plotly)
library(DT)
# zaladowanie datasetu ####
airplane_crashes <- read.csv('airplane_crashes.csv')
# podsumowanie ####
summary <- summary(airplane_crashes)
colnames(airplane_crashes)
# preprocessing ####
# zmiana nazwy kolumn
colnames(airplane_crashes) <- c('Date', 'Time', 'Location', 'Operator', 'Flight', 'Route', 'Type',
'Registration', 'cn/ln', 'Total_Onboard', 'Passengers_Onboard',
'Crew_Onboard', 'Total_Fatalities', 'Passengers_Fatalities',
'Crew_Fatalities', 'Ground', 'Summary')
# konwersja na prawidlowy format Date
airplane_crashes$Date <- mdy(airplane_crashes$Date)
airplane_crashes$Time <- hm(airplane_crashes$Time)
# ekstrakcja informacji z datasetu, zapis w poprawnym formacie
# ekstrakcja informacji z kolumny Date
airplane_crashes$Year <- year(airplane_crashes$Date)
airplane_crashes$Month <- as.factor(month(airplane_crashes$Date))
airplane_crashes$Day <- as.factor(day(airplane_crashes$Date))
airplane_crashes$Weekday <- as.factor(wday(airplane_crashes$Date))
airplane_crashes$Is_Leap_Year <- leap_year(airplane_crashes$Date)
airplane_crashes$Decade <- year(floor_date(airplane_crashes$Date, years(10)))
airplane_crashes$Date <- format(as.Date(airplane_crashes$Date, '%m/%d/%Y'), '%d/%m/%Y')
# ekstrakcja godziny, minuty i AM/PM z kolumny Time
airplane_crashes$Hour <- as.integer(hour(airplane_crashes$Time))
airplane_crashes$Minute <- as.factor(minute(airplane_crashes$Time))
airplane_crashes$AM_PM <- if_else(am(airplane_crashes$Time), 'AM', 'PM')
airplane_crashes$btwn_6PM_6AM <- if_else
# ShinyApp ####
2021-06-23 14:40:48 +02:00
ui <- fluidPage(
2021-06-23 14:40:48 +02:00
id = "main_content",
navbarPage("Airplane crashes from 1908 to 2020",
tabPanel("General overview",
sidebarLayout(
sidebarPanel(
# wybór daty
dateRangeInput('dates',
'Date range:',
min(airplane_crashes$Date),
max(airplane_crashes$Date)),
# wybor godziny
timeInput("time", "Time (local):", seconds = FALSE)
),
mainPanel(
# wykresiki
2021-06-23 15:49:55 +02:00
plotlyOutput("weekdayCrashes"),
plotlyOutput("hourCrashes")
2021-06-23 14:40:48 +02:00
)
)
),
tabPanel("Streamgraph",
mainPanel(
streamgraphOutput("operatorCrashes")
)
),
tabPanel("Map",
mainPanel(
# mapy
plotlyOutput("mapPlot")
#plotOutput("mapPlot")
)
),
tabPanel("Data",
sidebarPanel(
dateRangeInput('dataDates',
'Date range:',
min(airplane_crashes$Date),
max(airplane_crashes$Date))
),
mainPanel(
DT::dataTableOutput('allData')
))
)
)
2021-06-23 14:40:48 +02:00
server <- function(input, output, session) {
# godzina
observe(print(strftime(input$time, "%R")))
output$weekdayCrashes <- renderPlotly({
2021-06-22 20:00:43 +02:00
week_day <- airplane_crashes %>% count(Weekday)
2021-06-22 20:00:43 +02:00
week_day$Weekday <- as.character(week_day$Weekday)
week_day$Weekday[week_day$Weekday == 1] <- "Monday"
week_day$Weekday[week_day$Weekday == 2] <- "Tuesday"
week_day$Weekday[week_day$Weekday == 3] <- "Wednesday"
week_day$Weekday[week_day$Weekday == 4] <- "Thursday"
week_day$Weekday[week_day$Weekday == 5] <- "Friday"
week_day$Weekday[week_day$Weekday == 6] <- "Saturday"
week_day$Weekday[week_day$Weekday == 7] <- "Sunday"
# zmiana kolejnosci dni; dni nie powinny sie wyswietlac alfabetycznie
week_day$Weekday <- factor(week_day$Weekday, levels= c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
plot_ly(
x = week_day$Weekday, y = week_day$n,
2021-06-23 15:49:55 +02:00
name = 'No. of crashes per week day', type = "bar"
) %>%
layout(title = 'Week Day Crashes',
xaxis = list(title = "Weekdays",
zeroline = FALSE),
yaxis = list(title = "No. of crashes",
zeroline = FALSE))
})
2021-06-23 15:49:55 +02:00
output$hourCrashes <- renderPlotly({
crashes_per_hours <- airplane_crashes %>% count(Hour)
crashes_per_hours <- crashes_per_hours[-c(1,26,27,28), ]
#dodanie pozycji labeli
# crashes_per_hours <- crashes_per_hours %>%
# arrange(Hour) %>%
# mutate(lab.ypos = cumsum(n) - 0.5*n)
#
# ggplot(crashes_per_hours, aes(x = "", y = n, fill = Hour)) +
# geom_bar(stat="identity", width=1, color="white") +
# coord_polar("y", start=0) +
# geom_text(aes(y = lab.ypos, label = Hour), color = "white")+
# theme_void() +
# theme(legend.position="none")
df <- crashes_per_hours
#df <- paste(df$Hour, ":00")
df <- df %>% arrange(Hour)
fig <- df %>% plot_ly(labels = ~Hour, values = ~n, sort=FALSE)
fig <- fig %>% add_pie(hole = 0.4)
fig <- fig %>% layout(title = "No. of crashes per hour", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
})
2021-06-23 14:40:48 +02:00
output$operatorCrashes <- renderStreamgraph({
main_type_wise_year <- airplane_crashes %>%
select(Year, Type) %>%
# replace model number by empty strings
mutate(main_type = str_replace_all(Type, "[A-Za-z]*-?\\d+-?[A-Za-z]*.*", "")) %>%
# skip empty strings row
filter(main_type > 'A') %>%
group_by(Year, main_type) %>%
summarize(n = n()) %>%
group_by(Year) %>%
top_n(5, n)
# i teraz z tego streamgraph
main_type_wise_year <- main_type_wise_year %>% arrange(-n)
streamgraph <- main_type_wise_year %>%
group_by(Year, main_type) %>%
tally(wt=n) %>%
streamgraph("main_type", "n", "Year", interpolate="cardinal") %>%
sg_fill_brewer("PuOr") %>%
sg_legend(TRUE)
streamgraph
})
output$mapPlot <- renderPlotly({
states_list <- c('Alabama','Alaska','Alaksa','Arizona','Arkansas',"California",
"Colorado", "Connecticut","Delaware","Florida","Georgia",
"Hawaii","Idaho","Illinois", "Indiana","Iowa","Kansas",
"Kentucky","Louisiana","Maine","Maryland", "Massachusetts",
"Massachusett", "Michigan","Minnesota","Mississippi","Missouri",
"Montana", "Nebraska","Nevada","New Hampshire","New Jersey",
"New Mexico","New York", "North Carolina","North Dakota","Ohio",
"Oklahoma", "Oklohoma", "Oregon","Pennsylvania", "Rhode Island",
"South Carolina",
"South Dakota",'Tennesee',"Tennessee","Texas","Utah", "Vermont",
'Virginia',"Washington D.C.", "Washington, D.C.", "Washington",
"West Virginia","Wisconsin","Wyoming",
"AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
"NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY")
location <- airplane_crashes %>%
select(Location)
for(state in states_list) {
location <- location %>%
mutate(Location = str_replace_all(Location, state, paste(state, ', USA', sep = ''))) %>%
mutate(Location = str_replace_all(Location, 'USA.*, ', '')) %>%
mutate(Location = str_replace(Location, 'West Virginia, USA,', '')) %>%
mutate(Location = str_replace(Location, 'Afghanstan', 'Afghanistan')) %>%
mutate(Location = str_replace(Location, 'Airzona|Arazona', 'Arizona')) %>%
mutate(Location = str_replace(Location, 'Alakska', 'Alaska')) %>%
mutate(Location = str_replace(Location, 'Cailifornia|Calilfornia', 'California')) %>%
mutate(Location = str_replace(Location, 'D.*Congo', 'DR Congo')) %>%
mutate(Location = str_replace(Location, 'Domincan Republic', 'Dominican Republic')) %>%
mutate(Location = str_replace(Location, 'Hati', 'Haiti')) %>%
mutate(Location = str_replace(Location, ' International Airport', '')) %>%
mutate(Location = str_replace(Location, 'Morrocco|Morroco', 'Morocco')) %>%
mutate(Location = str_replace(Location, 'Phillipines', 'Phillipines')) %>%
mutate(Location = str_replace(Location, 'Burma', 'Myanmar')) %>%
mutate(Location = str_replace(Location, '([Ss]outhern|[Nn]orthern|[Ww]estern|[Ee]astern) ', ''))}
country_state <- location %>%
select(Location) %>%
filter(!str_detect(Location, '[Oo]cean|[Ss]ea|[Cc]hannel|Gulf of')) %>%
mutate(Location = str_replace(Location, '(Near|Off|Over) ', '')) %>%
mutate(Location = str_replace(Location, 'USA, Australia', 'Australia')) %>%
mutate(State_Province = str_replace(Location, '(.*, )?(.*), (.*)', '\\2')) %>%
mutate(Country = str_replace(Location, '.*,\\s*', ''))
cntry <- country_state %>%
group_by(Country) %>%
summarize(n = n()) %>%
arrange(desc(n))
# podzial na kategorie wg liczby wypadkow w danym kraju
cntry <- cntry %>%
mutate(m = case_when(
n > 200 ~ "200 +",
n < 200 & n >= 100 ~ "199 - 100",
n < 100 & n >= 50 ~ "99 - 50",
n < 50 & n >= 10 ~ "49 - 10",
n < 10 ~ "< 10")) %>%
mutate(m = factor(m, levels = c("< 10", "49 - 10", "99 - 50", "199 - 100", "200 +")))
world_map <- map_data("world")
map_data <- cntry %>%
full_join(world_map, by = c('Country' = 'region'))
2021-06-23 15:49:55 +02:00
map_pal = c('#E6CE39', '#F09B3C', '#D94B41', '#CD3CF0', '#555CEB')
ggplot_obj <-
ggplot(map_data, aes(x = long, y = lat, group = group, fill = m)) +
geom_polygon(colour = "white") +
labs(title = 'No. of crashes in each country', x = '', y = '', fill = '') +
scale_fill_manual(values = map_pal, na.value = 'whitesmoke') +
theme(legend.position='right', legend.justification = "top") +
guides(fill = guide_legend(reverse = TRUE))
2021-06-23 14:40:48 +02:00
ggplotly(
p = ggplot_obj,
tooltip = "all",
dynamicTicks = FALSE,
layerData = 1,
originalData = TRUE
)
})
# poprawic szerkosc wierszy
output$allData <- DT::renderDataTable({
airplane_crashes %>%
2021-06-23 14:40:48 +02:00
# filter(Year == input$dataDates) %>%
DT::datatable()
})
}
shinyApp(ui = ui, server = server)
2021-06-23 14:40:48 +02:00
# tutaj sb rysuje wykresiki demo, zeby nie odpalac shiny za kazdym razem ####
# number of fatal accidents by operator - top 5
##
# extract and group by manufacturer
# main_type_wise_year <- airplane_crashes %>%
# select(Year, Type) %>%
# # replace model number by empty strings
# mutate(main_type = str_replace_all(Type, "[A-Za-z]*-?\\d+-?[A-Za-z]*.*", "")) %>%
# # skip empty strings row
# filter(main_type > 'A') %>%
# group_by(Year, main_type) %>%
# summarize(n = n()) %>%
# group_by(Year) %>%
# top_n(5, n)
# #%>% mutate(main_type = reorder_within(-n, main_type, Year))
#
# # i teraz z tego streamgraph
# main_type_wise_year <- main_type_wise_year %>% arrange(-n)
# main_type_wise_year %>%
# group_by(Year, main_type) %>%
# tally(wt=n) %>%
# streamgraph("main_type", "n", "Year", interpolate="cardinal") %>%
# sg_fill_brewer("PuOr") %>%
# sg_legend(TRUE)
####
# airplane_operators <- airplane_crashes %>% count(Operator, sort = TRUE)
# airplane_operators <- head(airplane_operators, 5)
# ggplot(airplane_operators, aes(x = Operator, y = n)) +
# geom_point(color = "#765432", alpha = 0.8) +
# labs(title = "No. of fatal accidents by operator", x = '', y = '')
#
# airplane_operators %>%
# streamgraph(key="Operator", value=, date="Year", offset="zero") %>%
# sg_fill_brewer("BuPu")
# wykres ile wypadkow kazdego operatora w kazdym roku