# Project 1: Visual data analysis using base R graphics and/or ggplot2.

# Load libraries ----
library(Hmisc)
library(dplyr)
library(ggplot2)
library(ggridges) # geom_density_ridges_gradient() and theme_ridges() live here
library(RColorBrewer)

# Load data ----
german_credit_risk <- read.csv("proj1/german_credit_data.csv", header = TRUE)

# Inspect data ----
head(german_credit_risk)
tail(german_credit_risk)
str(german_credit_risk)
summary(german_credit_risk)
describe(german_credit_risk)

# Rename the first column.
colnames(german_credit_risk)[1] <- "index"

min(german_credit_risk$Age)

# Violin plot ----
# Drawn from all rows: incomplete rows are only dropped further below,
# before the ridge plot.
ggplot(german_credit_risk, aes(x = Purpose, y = Age, fill = Sex)) +
  geom_violin(trim = TRUE, position = position_dodge(1)) +
  # Dodge the mean markers by the same width as the violins so that each
  # marker sits on the violin of its own Sex group.
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 25,
    size = 2,
    position = position_dodge(1)
  ) +
  labs(title = "Credit purpose by age", x = "Purpose", y = "Age") +
  scale_fill_brewer(palette = "Accent") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Ridge plot ----
# From here on only rows without any missing value are used.
german_credit_risk <- na.omit(german_credit_risk)

ggplot(
  german_credit_risk,
  aes(x = Credit.amount, y = Checking.account, fill = Checking.account)
) +
  geom_density_ridges_gradient(
    scale = 8,
    show.legend = TRUE,
    rel_min_height = 0.00
  ) +
  theme_ridges() +
  scale_fill_brewer(palette = 4) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0)) +
  labs(
    x = "Credit amount [DM]",
    y = "Checking account",
    fill = "Checking account status"
  ) +
  ggtitle(
    "Credit amount density estimation by checking account status "
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Scatter plot: credit amount against duration, one smoother per Sex ----
ggplot(
  german_credit_risk,
  aes(x = Duration, y = Credit.amount, color = Sex)
) +
  geom_point(size = 1.5) +
  geom_smooth(se = FALSE, size = 1.5) +
  labs(
    title = "Credit amount for credit duration",
    x = "Duration",
    y = "Amount"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Stacked bar chart: number of credits per job status, split by purpose ----
ggplot(german_credit_risk, aes(x = factor(Job), fill = Purpose)) +
  geom_bar() +
  scale_x_discrete(
    breaks = 0:3,
    labels = c(
      "Unskilled, non-resident",
      "Unskilled, \nresident",
      "Skilled",
      "Highly skilled"
    )
  ) +
  labs(
    title = "Credit count and purpose for different job statuses",
    x = "Job status",
    y = "Credit count"
  ) +
  theme_minimal()

# Box plot showing which credit amounts people most commonly take for each
# purpose ----

# Build the text annotation for one group of the box plot.
#
# Passed as `fun.data` to stat_summary() below.
#
# y           Numeric vector of values for one group.
# upper_limit Height at which the label is placed; defaults to the largest
#             credit amount in `german_credit_risk`, so every label lines up
#             at the same height.
#
# Returns a one-row data.frame with columns `y` (label position) and
# `label` (the count and the sum of `y`, each followed by a line break).
stat_box_data <- function(y,
                          upper_limit = max(german_credit_risk$Credit.amount)) {
  data.frame(
    y = upper_limit,
    label = paste("count =", length(y), "\n", "sum =", sum(y), "\n")
  )
}

german_credit_risk %>%
  # Order the purposes by their total credit amount.
  mutate(Purpose = reorder(Purpose, Credit.amount, sum)) %>%
  ggplot(aes(x = Purpose, y = Credit.amount, fill = Purpose)) +
  geom_boxplot(alpha = .7) +
  guides(fill = "none") +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  stat_summary(
    fun.data = stat_box_data,
    geom = "text",
    vjust = 1
  ) +
  ylab("Credit amount") +
  theme_bw()

# Histogram of the number of credits taken by age, filled by purpose, with a
# density line on top ----
german_credit_risk %>%
  ggplot(aes(x = Age, fill = Purpose)) +
  geom_histogram(color = "white", binwidth = 1) +
  scale_x_continuous(name = "Age", breaks = scales::pretty_breaks(n = 10)) +
  ylab("Number of credits") +
  # The density is scaled to counts so it shares the histogram's y axis;
  # inherit.aes = FALSE keeps it a single line instead of one per Purpose.
  stat_density(
    aes(x = Age, y = after_stat(count)),
    geom = "line",
    inherit.aes = FALSE,
    size = 1.10,
    color = "#555555",
    adjust = 1
  )