Init commit

2021-04-12 16:07:00 +02:00 · 2021-04-12 16:07:00 +02:00 · 431a51a1bf
commit 431a51a1bf
5 changed files with 2453 additions and 0 deletions
--- a/WIZ_Project1.R
+++ b/WIZ_Project1.R
@ -0,0 +1,151 @@
+# Install only the packages that are missing. Reinstalling on every run
+# re-downloads everything and can clobber packages that are currently in use.
+required_packages <- c("ggplot2", "treemap", "dplyr")
+is_installed <- vapply(
+  required_packages, requireNamespace, logical(1), quietly = TRUE
+)
+if (any(!is_installed)) {
+  install.packages(required_packages[!is_installed])
+}
+library(treemap)
+library(ggplot2)
+library(dplyr)
+
+# Dataset loading ####
+# Read the pre-processed CSV and keep only the columns the plots below use.
+imdb_raw <- read.csv(
+  "D:\\Studia\\JupyterProjects\\WIZ\\imdb.csv",
+  header = TRUE,
+  sep = ","
+)
+imdb <- select(
+  imdb_raw,
+  Series_Title, Released_Year, Runtime, Genre, IMDB_Rating, Meta_score,
+  Director, Star1, Star2, Star3, Star4, No_of_Votes, Gross
+)
+
+# Plot 1 ####
+# Plot 2 ####
+# Treemap of total gross income per genre, limited to the 25 highest totals.
+imdb_2 <- imdb %>%
+  select(Genre, Gross)
+imdb_2 %>%
+  group_by(Genre) %>%
+  # Sum as double: an integer total above .Machine$integer.max overflows to NA.
+  summarise(Income = sum(as.numeric(Gross), na.rm = TRUE)) %>%
+  arrange(desc(Income)) %>%
+  # Rank by Income explicitly rather than relying on top_n()'s default column.
+  top_n(25, Income) %>%
+  # The piped summary is already treemap()'s data argument, so no second data
+  # frame is passed here.
+  treemap(
+    # data
+    index = "Genre",
+    vSize = "Income",
+    type = "index",
+    # Main
+    title = "",
+    palette = "Dark2",
+    # Borders
+    border.col = c("black"),
+    border.lwds = 1,
+    # Labels
+    fontsize.labels = 0.7,
+    fontcolor.labels = "white",
+    fontface.labels = 1,
+    bg.labels = c("transparent"),
+    align.labels = c("left", "top"),
+    overlap.labels = 0.5,
+    inflate.labels = TRUE
+  )
+# Number of films per genre
+imdb_2b <- imdb %>%
+  count(Genre)
+# Plot 2b ####
+# Treemap of film counts per genre, limited to the 25 most frequent genres.
+imdb_2b %>%
+  # Order by film count, the same column top_n() ranks on below.
+  arrange(desc(n)) %>%
+  top_n(25, n) %>%
+  # The piped counts are already treemap()'s data argument, so no second data
+  # frame is passed here.
+  treemap(
+    # data
+    index = "Genre",
+    vSize = "n",
+    type = "index",
+    # Main
+    title = "",
+    palette = "Dark2",
+    # Borders
+    border.col = c("black"),
+    border.lwds = 1,
+    # Labels
+    fontsize.labels = 0.7,
+    fontcolor.labels = "white",
+    fontface.labels = 1,
+    bg.labels = c("transparent"),
+    align.labels = c("left", "top"),
+    overlap.labels = 0.5,
+    inflate.labels = TRUE
+  )
+
+# Plot 3 ####
+
+# Plot 4 ####
+# Prepare data: the 15 directors with the most films, as Name / Movies / Role
+imdb_3_d <- imdb %>%
+  count(Director) %>%
+  arrange(desc(n)) %>%
+  # head() keeps exactly 15 rows. top_n() also keeps every row tied with the
+  # 15th, so the group size could exceed 15.
+  head(15) %>%
+  mutate(Role = "Director") %>%
+  rename(Name = Director, Movies = n)
+
+# Stack the four billing columns into one column of names, so every appearance
+# of an actor counts once regardless of billing position. as.character() keeps
+# the names intact if the columns were read as factors.
+star_columns <- c("Star1", "Star2", "Star3", "Star4")
+star_names <- data.frame(
+  Star = unlist(lapply(imdb[star_columns], as.character), use.names = FALSE),
+  stringsAsFactors = FALSE
+)
+
+# The 15 actors with the most films, as Name / Movies / Role
+imdb_3_s <- star_names %>%
+  count(Star) %>%
+  arrange(desc(n)) %>%
+  # head() keeps exactly 15 rows. top_n() also keeps every row tied with the
+  # 15th, so the group size could exceed 15.
+  head(15) %>%
+  mutate(Role = "Star") %>%
+  rename(Name = Star, Movies = n)
+
+# Directors and stars in one table, grouped by role
+imdb_3 <- bind_rows(imdb_3_d, imdb_3_s) %>%
+  arrange(Role)
+
+# Append empty rows after each role so the groups are separated by a gap in
+# the circular chart. Splitting by Role avoids assuming that every group has
+# exactly 15 rows.
+empty_bar <- 3
+imdb_3 <- imdb_3 %>%
+  split(imdb_3$Role) %>%
+  lapply(function(role_rows) {
+    # Indexing with NA yields all-NA rows, which act as the empty bars.
+    role_rows[c(seq_len(nrow(role_rows)), rep(NA_integer_, empty_bar)), ]
+  }) %>%
+  bind_rows()
+# Sequential bar position around the circle
+imdb_3$id <- seq_len(nrow(imdb_3))
+
+# Label table: one row per bar, plus the text angle and justification
+label_imdb_3 <- imdb_3
+number_of_bar <- nrow(label_imdb_3)
+# Angle of each bar's centre on the circle; subtracting 0.5 moves from the
+# bar's edge to its middle.
+angle <- 90 - 360 * (label_imdb_3$id - 0.5) / number_of_bar
+# Labels whose angle is below -90 are rotated by 180 degrees and right-aligned.
+flip_label <- angle < -90
+label_imdb_3$hjust <- as.numeric(flip_label)
+label_imdb_3$angle <- angle + 180 * flip_label
+
+# Circular bar chart: one bar per person, filled by role
+value_marks <- c(5, 10, 15, 20)
+p <- ggplot(imdb_3, aes(x = as.factor(id), y = Movies, fill = Role)) +
+  # id is mapped as a factor: with a numeric x there is some space between the
+  # first bar. The bar layers inherit x, y and fill from ggplot().
+  geom_bar(stat = "identity", alpha = 0.5) +
+  # Grey value labels for 5/10/15/20, drawn at x = max(id) - 1 and at x = 17
+  annotate(
+    "text",
+    x = rep(max(imdb_3$id) - 1, 4), y = value_marks,
+    label = as.character(value_marks),
+    color = "grey", size = 3, angle = 0, fontface = "bold", hjust = 1
+  ) +
+  annotate(
+    "text",
+    x = rep(17, 4), y = value_marks,
+    label = as.character(value_marks),
+    color = "grey", size = 3, angle = 0, fontface = "bold", hjust = 1
+  ) +
+  # Second, identical bar layer, drawn after the value labels
+  geom_bar(stat = "identity", alpha = 0.5) +
+  ylim(-20, 20) +
+  theme_minimal() +
+  theme(
+    legend.position = "none",
+    axis.text = element_blank(),
+    axis.title = element_blank(),
+    panel.grid = element_blank(),
+    plot.margin = unit(rep(1, 4), "cm")
+  ) +
+  coord_polar() +
+  # Name labels placed 5 units above each bar, rotated per label_imdb_3
+  geom_text(
+    data = label_imdb_3,
+    aes(x = id, y = Movies + 5, label = Name, hjust = hjust),
+    color = "black", fontface = "bold", alpha = 0.6, size = 2.5,
+    angle = label_imdb_3$angle, inherit.aes = FALSE
+  )
+
+p
+
+# Plot 5 ####
--- a/WIZ_Project1.ipynb
+++ b/WIZ_Project1.ipynb
@ -0,0 +1,285 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5ac0bf77",
+   "metadata": {},
+   "source": [
+    "# Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b66c8c8e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "package 'ggplot2' successfully unpacked and MD5 sums checked\n",
+      "\n",
+      "The downloaded binary packages are in\n",
+      "\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "also installing the dependencies 'glue', 'rlang', 'tibble', 'tidyselect'\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "package 'glue' successfully unpacked and MD5 sums checked\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning message:\n",
+      "\"cannot remove prior installation of package 'glue'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
+      "\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\glue\\libs\\x64\\glue.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\glue\\libs\\x64\\glue.dll: Permission denied\"Warning message:\n",
+      "\"restored 'glue'\""
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "package 'rlang' successfully unpacked and MD5 sums checked\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning message:\n",
+      "\"cannot remove prior installation of package 'rlang'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
+      "\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\rlang\\libs\\x64\\rlang.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\rlang\\libs\\x64\\rlang.dll: Permission denied\"Warning message:\n",
+      "\"restored 'rlang'\""
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "package 'tibble' successfully unpacked and MD5 sums checked\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning message:\n",
+      "\"cannot remove prior installation of package 'tibble'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
+      "\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\tibble\\libs\\x64\\tibble.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\tibble\\libs\\x64\\tibble.dll: Permission denied\"Warning message:\n",
+      "\"restored 'tibble'\""
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "package 'tidyselect' successfully unpacked and MD5 sums checked\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning message:\n",
+      "\"cannot remove prior installation of package 'tidyselect'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
+      "\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\tidyselect\\libs\\x64\\tidyselect.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\tidyselect\\libs\\x64\\tidyselect.dll: Permission denied\"Warning message:\n",
+      "\"restored 'tidyselect'\""
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "package 'dplyr' successfully unpacked and MD5 sums checked\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning message:\n",
+      "\"cannot remove prior installation of package 'dplyr'\"Warning message in file.copy(savedcopy, lib, recursive = TRUE):\n",
+      "\"problem z kopiowaniem D:\\anaconda3\\envs\\R\\Lib\\R\\library\\00LOCK\\dplyr\\libs\\x64\\dplyr.dll do D:\\anaconda3\\envs\\R\\Lib\\R\\library\\dplyr\\libs\\x64\\dplyr.dll: Permission denied\"Warning message:\n",
+      "\"restored 'dplyr'\""
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "The downloaded binary packages are in\n",
+      "\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n",
+      "package 'magrittr' successfully unpacked and MD5 sums checked\n",
+      "\n",
+      "The downloaded binary packages are in\n",
+      "\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n",
+      "package 'treemap' successfully unpacked and MD5 sums checked\n",
+      "\n",
+      "The downloaded binary packages are in\n",
+      "\tC:\\Users\\pkopy\\AppData\\Local\\Temp\\Rtmpecxsl2\\downloaded_packages\n"
+     ]
+    },
+    {
+     "ename": "ERROR",
+     "evalue": "Error: package or namespace load failed for 'dplyr' in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):\n namespace 'rlang' 0.3.4 is already loaded, but >= 0.4.10 is required\n",
+     "output_type": "error",
+     "traceback": [
+      "Error: package or namespace load failed for 'dplyr' in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):\n namespace 'rlang' 0.3.4 is already loaded, but >= 0.4.10 is required\nTraceback:\n",
+      "1. library(dplyr)",
+      "2. tryCatch({\n .     attr(package, \"LibPath\") <- which.lib.loc\n .     ns <- loadNamespace(package, lib.loc)\n .     env <- attachNamespace(ns, pos = pos, deps, exclude, include.only)\n . }, error = function(e) {\n .     P <- if (!is.null(cc <- conditionCall(e))) \n .         paste(\" in\", deparse(cc)[1L])\n .     else \"\"\n .     msg <- gettextf(\"package or namespace load failed for %s%s:\\n %s\", \n .         sQuote(package), P, conditionMessage(e))\n .     if (logical.return) \n .         message(paste(\"Error:\", msg), domain = NA)\n .     else stop(msg, call. = FALSE, domain = NA)\n . })",
+      "3. tryCatchList(expr, classes, parentenv, handlers)",
+      "4. tryCatchOne(expr, names, parentenv, handlers[[1L]])",
+      "5. value[[3L]](cond)",
+      "6. stop(msg, call. = FALSE, domain = NA)"
+     ]
+    }
+   ],
+   "source": [
+    "install.packages(\"ggplot2\")\n",
+    "install.packages(\"dplyr\") \n",
+    "install.packages(\"magrittr\")\n",
+    "install.packages(\"treemap\")\n",
+    "library(dplyr)\n",
+    "library(magrittr)\n",
+    "library(treemap)\n",
+    "library(ggplot2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b7d623a",
+   "metadata": {},
+   "source": [
+    "# Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "a2b1b3c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imdb_raw <- read.csv(\"imdb_top_1000.csv\", header=TRUE, sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "f3990501",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ERROR",
+     "evalue": "Error in imdb_raw %>% select(Series_Title, Released_Year, Runtime, Genre, : nie udało się znaleźć funkcji '%>%'\n",
+     "output_type": "error",
+     "traceback": [
+      "Error in imdb_raw %>% select(Series_Title, Released_Year, Runtime, Genre, : nie udało się znaleźć funkcji '%>%'\nTraceback:\n"
+     ]
+    }
+   ],
+   "source": [
+    "imdb_raw %>%\n",
+    "  select(Series_Title, Released_Year, Runtime, Genre, IMDB_Rating, Meta_score, Director, Star1, Star2, Star3, Star4, No_of_Votes, Gross) -> imdb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7fee283",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imdb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d17ceac",
+   "metadata": {},
+   "source": [
+    "# Plot 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "faf1f5d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot: treemap of total gross income per genre\n",
+    "imdb_p2_raw <- imdb %>%\n",
+    "    select(Genre, Gross)\n",
+    "# Convert Gross to numeric (this drops thousands separators if it was read as text),\n",
+    "# then total it per genre into the Income column that treemap() sizes by.\n",
+    "imdb_p2 <- imdb_p2_raw %>%\n",
+    "    mutate(Gross = as.numeric(gsub(\",\", \"\", Gross))) %>%\n",
+    "    group_by(Genre) %>%\n",
+    "    summarise(Income = sum(Gross, na.rm = TRUE))\n",
+    "treemap(imdb_p2,\n",
+    "\n",
+    "            # data\n",
+    "            index=\"Genre\",\n",
+    "            vSize=\"Income\",\n",
+    "            type=\"index\",\n",
+    "\n",
+    "            # Main\n",
+    "            title=\"\",\n",
+    "            palette=\"Dark2\",\n",
+    "\n",
+    "            # Borders:\n",
+    "            border.col=c(\"black\"),\n",
+    "            border.lwds=1,\n",
+    "\n",
+    "            # Labels\n",
+    "            fontsize.labels=0.5,\n",
+    "            fontcolor.labels=\"white\",\n",
+    "            fontface.labels=1,\n",
+    "            bg.labels=c(\"transparent\"),\n",
+    "            align.labels=c(\"left\", \"top\"),\n",
+    "            overlap.labels=0.5,\n",
+    "            inflate.labels=TRUE                     # If true, labels are bigger when rectangle is bigger.\n",
+    "\n",
+    "            )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5fb0625",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "codemirror_mode": "r",
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R",
+   "pygments_lexer": "r",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/etl.py
+++ b/etl.py
@ -0,0 +1,15 @@
+import pandas as pd
+import numpy as np
+import math
+
+df = pd.read_csv("imdb_top_1000.csv")
+new_gross = []
+old_gross = df['Gross'].tolist()
+old_gross = [str(x) for x in old_gross]
+old_gross = ['0' if x=='nan' else x for x in old_gross]
+old_gross = [x.replace(",", "") for x in old_gross]
+for gross in old_gross:
+    new_gross.append(int(gross))
+df.drop(columns=['Gross'])
+df['Gross'] = new_gross
+df.to_csv('imdb.csv')
--- a/imdb.csv
+++ b/imdb.csv
--- a/imdb_top_1000.csv
+++ b/imdb_top_1000.csv