Init commit
This commit is contained in:
commit
431a51a1bf
151
WIZ_Project1.R
Normal file
151
WIZ_Project1.R
Normal file
@ -0,0 +1,151 @@
|
||||
install.packages("ggplot2")
|
||||
install.packages("treemap")
|
||||
library(treemap)
|
||||
library(ggplot2)
|
||||
library(dplyr)
|
||||
|
||||
# Dataset loading ####
|
||||
# Read the prepared CSV and keep only the columns the plots below work with.
imdb_raw <- read.csv(
  "D:\\Studia\\JupyterProjects\\WIZ\\imdb.csv",
  header = TRUE,
  sep = ","
)
imdb <- select(
  imdb_raw,
  Series_Title, Released_Year, Runtime, Genre, IMDB_Rating, Meta_score,
  Director, Star1, Star2, Star3, Star4, No_of_Votes, Gross
)
|
||||
|
||||
# Plot 1 ####
|
||||
# Plot 2 ####
|
||||
# Treemap of summed Gross per Genre value, limited to the 25 largest totals.
imdb_2 <- imdb %>%
  select(Genre, Gross)

imdb_2 %>%
  group_by(Genre) %>%
  # as.numeric() keeps large totals out of integer range; NA gross counts as 0.
  summarise(Income = sum(as.numeric(Gross), na.rm = TRUE)) %>%
  arrange(desc(Income)) %>%
  top_n(25, Income) %>%
  # The pipe supplies the data frame as the first argument. Passing `imdb_2`
  # again would be matched positionally to `vColor`, so it is not repeated.
  treemap(
    # data
    index = "Genre",
    vSize = "Income",
    type = "index",
    # Main
    title = "",
    palette = "Dark2",
    # Borders:
    border.col = c("black"),
    border.lwds = 1,
    # Labels
    fontsize.labels = 0.7,
    fontcolor.labels = "white",
    fontface.labels = 1,
    bg.labels = c("transparent"),
    align.labels = c("left", "top"),
    overlap.labels = 0.5,
    inflate.labels = TRUE
  )
|
||||
# Plot 2b ####
# Treemap of the number of rows per Genre value, limited to the 25 largest
# counts.
imdb_2b <- imdb %>%
  count(Genre)

imdb_2b %>%
  # Sort by the count itself (the original sorted by the genre name).
  arrange(desc(n)) %>%
  top_n(25, n) %>%
  # The pipe supplies the data frame as the first argument. Passing `imdb_2b`
  # again would be matched positionally to `vColor`, so it is not repeated.
  treemap(
    # data
    index = "Genre",
    vSize = "n",
    type = "index",
    # Main
    title = "",
    palette = "Dark2",
    # Borders:
    border.col = c("black"),
    border.lwds = 1,
    # Labels
    fontsize.labels = 0.7,
    fontcolor.labels = "white",
    fontface.labels = 1,
    bg.labels = c("transparent"),
    align.labels = c("left", "top"),
    overlap.labels = 0.5,
    inflate.labels = TRUE
  )
|
||||
|
||||
# Plot 3 ####
|
||||
|
||||
# Plot 4 ####
|
||||
# Prepare data
|
||||
# Directors with the most rows in `imdb`: columns Name, Movies, Role.
imdb_3_d <- imdb %>%
  count(Director) %>%
  arrange(desc(n)) %>%
  # head() keeps exactly the first 15 rows. top_n(15) keeps every row tied
  # with the 15th, which can return more than 15 rows and breaks the
  # fixed-size layout of the circular plot built from this table.
  head(15) %>%
  cbind(Role = "Director")
names(imdb_3_d)[names(imdb_3_d) == "n"] <- "Movies"
names(imdb_3_d)[names(imdb_3_d) == "Director"] <- "Name"
|
||||
|
||||
# Count appearances separately for each of the four star columns. Every
# result has the same two columns (Star, n) so the tables can be stacked.
imdb_3_s1 <- count(imdb, Star = Star1)

imdb_3_s2 <- count(imdb, Star = Star2)

imdb_3_s3 <- count(imdb, Star = Star3)

imdb_3_s4 <- count(imdb, Star = Star4)
|
||||
|
||||
# Stack the four per-column star counts, add them up per person and keep the
# 15 people with the most appearances: columns Name, Movies, Role.
imdb_3_s <- bind_rows(imdb_3_s1, imdb_3_s2, imdb_3_s3, imdb_3_s4) %>%
  group_by(Star) %>%
  # Plain summarise() replaces the deprecated summarise_all(funs(...)).
  summarise(n = sum(n, na.rm = TRUE)) %>%
  arrange(desc(n)) %>%
  # head() keeps exactly the first 15 rows. top_n(15) keeps every row tied
  # with the 15th, which can return more than 15 rows.
  head(15) %>%
  cbind(Role = "Star")
names(imdb_3_s)[names(imdb_3_s) == "n"] <- "Movies"
names(imdb_3_s)[names(imdb_3_s) == "Star"] <- "Name"
|
||||
|
||||
# Combine directors and stars, then append empty (all-NA) rows after each Role
# group so the circular plot shows a gap between the groups.
imdb_3 <- bind_rows(imdb_3_d, imdb_3_s) %>%
  arrange(Role)

# Number of 'empty bars' to add at the end of each group.
empty_bar <- 3

# Append `n_empty` all-NA rows to one group's rows.
add_empty_bars <- function(grp, n_empty) {
  for (i in seq_len(n_empty)) {
    grp <- rbind(grp, NA)
  }
  grp
}

# Pad each Role group separately instead of slicing fixed row ranges
# (1:15 / 16:30), so the layout stays correct when a group does not contain
# exactly 15 rows.
imdb_3 <- do.call(
  rbind,
  lapply(split(imdb_3, imdb_3$Role), add_empty_bars, n_empty = empty_bar)
)
rownames(imdb_3) <- NULL

# Position of each bar around the circle.
imdb_3$id <- seq_len(nrow(imdb_3))
|
||||
|
||||
# Get the name and the y position of each label
|
||||
# Label table: one row per bar, with the rotation and justification of its
# text label.
label_imdb_3 <- imdb_3
number_of_bar <- nrow(label_imdb_3)
# Angle at the centre of each bar; 0.5 is subtracted so the angle is not taken
# at the bar's left (0) or right (1) edge.
angle <- 90 - 360 * (label_imdb_3$id - 0.5) / number_of_bar
# Labels whose angle is below -90 are rotated by 180 degrees and anchored at
# their end (hjust = 1) instead of their start (hjust = 0).
flipped <- angle < -90
label_imdb_3$hjust <- ifelse(flipped, 1, 0)
label_imdb_3$angle <- ifelse(flipped, angle + 180, angle)
|
||||
|
||||
# Make the plot
|
||||
# Circular bar chart: one bar per person, filled by Role, with the person's
# name as a rotated label above the bar.

# Upper y limit. Values outside the ylim() range are dropped from the plot, so
# the limit is at least 20 (the highest reference value) and is widened when a
# label, drawn at Movies + 5, would otherwise fall outside the scale.
y_upper <- max(20, max(imdb_3$Movies, na.rm = TRUE) + 5)

# id is mapped as a factor: with a numeric x there is some space between the
# first bar and the start of the circle.
p <- ggplot(imdb_3, aes(x = as.factor(id), y = Movies, fill = Role)) +
  geom_bar(
    aes(x = as.factor(id), y = Movies, fill = Role),
    stat = "identity", alpha = 0.5
  ) +
  # Reference values 5/10/15/20, written in the gap after each group.
  annotate(
    "text",
    x = rep(max(imdb_3$id) - 1, 4), y = c(5, 10, 15, 20),
    label = c("5", "10", "15", "20"),
    color = "grey", size = 3, angle = 0, fontface = "bold", hjust = 1
  ) +
  annotate(
    "text",
    x = rep(17, 4), y = c(5, 10, 15, 20),
    label = c("5", "10", "15", "20"),
    color = "grey", size = 3, angle = 0, fontface = "bold", hjust = 1
  ) +
  # Bars are drawn a second time so they sit over the annotations.
  geom_bar(
    aes(x = as.factor(id), y = Movies, fill = Role),
    stat = "identity", alpha = 0.5
  ) +
  # The negative lower limit leaves an empty disc in the middle of the circle.
  ylim(-20, y_upper) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(1, 4), "cm")
  ) +
  coord_polar() +
  geom_text(
    data = label_imdb_3,
    aes(x = id, y = Movies + 5, label = Name, hjust = hjust),
    color = "black", fontface = "bold", alpha = 0.6, size = 2.5,
    angle = label_imdb_3$angle, inherit.aes = FALSE
  )

p
|
||||
|
||||
# Plot 5 ####
|
285
WIZ_Project1.ipynb
Normal file
285
WIZ_Project1.ipynb
Normal file
@ -0,0 +1,285 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5ac0bf77",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "b66c8c8e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"package 'ggplot2' successfully unpacked and MD5 sums checked\n",
|
||||
"\n",
|
||||
"The downloaded binary packages are in\n",
|
||||
"\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"also installing the dependencies 'glue', 'rlang', 'tibble', 'tidyselect'\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"package 'glue' successfully unpacked and MD5 sums checked\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning message:\n",
|
||||
"\"cannot remove prior installation of package 'glue'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
|
||||
"\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\glue\\libs\\x64\\glue.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\glue\\libs\\x64\\glue.dll: Permission denied\"Warning message:\n",
|
||||
"\"restored 'glue'\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"package 'rlang' successfully unpacked and MD5 sums checked\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning message:\n",
|
||||
"\"cannot remove prior installation of package 'rlang'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
|
||||
"\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\rlang\\libs\\x64\\rlang.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\rlang\\libs\\x64\\rlang.dll: Permission denied\"Warning message:\n",
|
||||
"\"restored 'rlang'\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"package 'tibble' successfully unpacked and MD5 sums checked\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning message:\n",
|
||||
"\"cannot remove prior installation of package 'tibble'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
|
||||
"\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\tibble\\libs\\x64\\tibble.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\tibble\\libs\\x64\\tibble.dll: Permission denied\"Warning message:\n",
|
||||
"\"restored 'tibble'\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"package 'tidyselect' successfully unpacked and MD5 sums checked\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning message:\n",
|
||||
"\"cannot remove prior installation of package 'tidyselect'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
|
||||
"\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\tidyselect\\libs\\x64\\tidyselect.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\tidyselect\\libs\\x64\\tidyselect.dll: Permission denied\"Warning message:\n",
|
||||
"\"restored 'tidyselect'\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"package 'dplyr' successfully unpacked and MD5 sums checked\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning message:\n",
|
||||
"\"cannot remove prior installation of package 'dplyr'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
|
||||
"\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\dplyr\\libs\\x64\\dplyr.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\dplyr\\libs\\x64\\dplyr.dll: Permission denied\"Warning message:\n",
|
||||
"\"restored 'dplyr'\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"The downloaded binary packages are in\n",
|
||||
"\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n",
|
||||
"package 'magrittr' successfully unpacked and MD5 sums checked\n",
|
||||
"\n",
|
||||
"The downloaded binary packages are in\n",
|
||||
"\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n",
|
||||
"package 'treemap' successfully unpacked and MD5 sums checked\n",
|
||||
"\n",
|
||||
"The downloaded binary packages are in\n",
|
||||
"\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "ERROR",
|
||||
"evalue": "Error: package or namespace load failed for 'dplyr' in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):\n namespace 'rlang' 0.3.4 is already loaded, but >= 0.4.10 is required\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"Error: package or namespace load failed for 'dplyr' in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):\n namespace 'rlang' 0.3.4 is already loaded, but >= 0.4.10 is required\nTraceback:\n",
|
||||
"1. library(dplyr)",
|
||||
"2. tryCatch({\n . attr(package, \"LibPath\") <- which.lib.loc\n . ns <- loadNamespace(package, lib.loc)\n . env <- attachNamespace(ns, pos = pos, deps, exclude, include.only)\n . }, error = function(e) {\n . P <- if (!is.null(cc <- conditionCall(e))) \n . paste(\" in\", deparse(cc)[1L])\n . else \"\"\n . msg <- gettextf(\"package or namespace load failed for %s%s:\\n %s\", \n . sQuote(package), P, conditionMessage(e))\n . if (logical.return) \n . message(paste(\"Error:\", msg), domain = NA)\n . else stop(msg, call. = FALSE, domain = NA)\n . })",
|
||||
"3. tryCatchList(expr, classes, parentenv, handlers)",
|
||||
"4. tryCatchOne(expr, names, parentenv, handlers[[1L]])",
|
||||
"5. value[[3L]](cond)",
|
||||
"6. stop(msg, call. = FALSE, domain = NA)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"install.packages(\"ggplot2\")\n",
|
||||
"install.packages(\"dplyr\") \n",
|
||||
"install.packages(\"magrittr\")\n",
|
||||
"install.packages(\"treemap\")\n",
|
||||
"library(dplyr)\n",
|
||||
"library(magrittr)\n",
|
||||
"library(treemap)\n",
|
||||
"library(ggplot2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2b7d623a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "a2b1b3c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"imdb_raw <- read.csv(\"imdb_top_1000.csv\", header=TRUE, sep=\",\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f3990501",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ERROR",
|
||||
"evalue": "Error in imdb_raw %>% select(Series_Title, Released_Year, Runtime, Genre, : nie udało się znaleźć funkcji '%>%'\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"Error in imdb_raw %>% select(Series_Title, Released_Year, Runtime, Genre, : nie udało się znaleźć funkcji '%>%'\nTraceback:\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"imdb_raw %>%\n",
|
||||
" select(Series_Title, Released_Year, Runtime, Genre, IMDB_Rating, Meta_score, Director, Star1, Star2, Star3, Star4, No_of_Votes, Gross) -> imdb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d7fee283",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"imdb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2d17ceac",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Plot 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "faf1f5d9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Plot\n",
|
||||
"imdb %>%\n",
|
||||
" select(Genre, Gross) -> imdb_p2_raw\n",
|
||||
"imdb_p2_raw %>%\n",
|
||||
" aggregate(imdb_p2_raw, by=list(imdb_p2_raw$Genre), FUN=sum) -> imdb_p2\n",
|
||||
"treemap(imdb_p2,\n",
|
||||
" \n",
|
||||
" # data\n",
|
||||
" index=\"Genre\",\n",
|
||||
" vSize=\"Income\",\n",
|
||||
" type=\"index\",\n",
|
||||
" \n",
|
||||
" # Main\n",
|
||||
" title=\"\",\n",
|
||||
" palette=\"Dark2\",\n",
|
||||
"\n",
|
||||
" # Borders:\n",
|
||||
" border.col=c(\"black\"), \n",
|
||||
" border.lwds=1, \n",
|
||||
" \n",
|
||||
" # Labels\n",
|
||||
" fontsize.labels=0.5,\n",
|
||||
" fontcolor.labels=\"white\",\n",
|
||||
" fontface.labels=1, \n",
|
||||
" bg.labels=c(\"transparent\"), \n",
|
||||
" align.labels=c(\"left\", \"top\"), \n",
|
||||
" overlap.labels=0.5,\n",
|
||||
" inflate.labels=T # If true, labels are bigger when rectangle is bigger.\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d5fb0625",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "R",
|
||||
"language": "R",
|
||||
"name": "ir"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": "r",
|
||||
"file_extension": ".r",
|
||||
"mimetype": "text/x-r-source",
|
||||
"name": "R",
|
||||
"pygments_lexer": "r",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
15
etl.py
Normal file
15
etl.py
Normal file
@ -0,0 +1,15 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import math
|
||||
|
||||
# Convert the 'Gross' column of the raw export into integers and write the
# result to 'imdb.csv'.
df = pd.read_csv("imdb_top_1000.csv")


def _parse_gross(value):
    """Return one 'Gross' cell as an int.

    The cell is stringified; a missing value (which stringifies to 'nan')
    becomes 0, otherwise commas are removed before the int conversion.
    """
    text = str(value)
    if text == 'nan':
        return 0
    return int(text.replace(",", ""))


# Overwrite the column in place so it keeps its position in the frame. The
# former `df.drop(columns=['Gross'])` call discarded its result and therefore
# did nothing, so it is removed.
new_gross = [_parse_gross(x) for x in df['Gross'].tolist()]
df['Gross'] = new_gross
df.to_csv('imdb.csv')
|
1001
imdb_top_1000.csv
Normal file
1001
imdb_top_1000.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user