From c49a82db43b96061ea1a693e8c60291bd585415a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= <andre@amu.edu.pl>
Date: Wed, 13 Jul 2016 13:59:25 +0200
Subject: [PATCH] Updated bank-marketing dataset

---
 .gitignore                                  |  1 +
 README.md                                   | 51 ++++++++++-----------
 data-collection/bank-marketing/config.yaml  |  2 +-
 data-collection/bank-marketing/preprocess.R | 42 ++++++++---------
 data-preprocess.R                           |  1 +
 5 files changed, 45 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9743a80..9096c7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ data-collection/*/preprocessed/*
 
 # markdown outputs
 *.html
+.Rproj.user
diff --git a/README.md b/README.md
index bd2d0f3..e189944 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ Andrzej Wójtowicz
 
 
 
-Document generation date: 2016-06-23 11:44:00.
+Document generation date: 2016-07-13 13:45:45.
 
 
 
@@ -30,7 +30,7 @@ Document generation date: 2016-06-23 11:44:00.
 
 **Source data files**:
 
-* [bank-additional.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip)
+* [bank.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip)
 
 **Cite**:
 ```nohighlight
@@ -40,25 +40,23 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
 **Dataset**:
 
 ```nohighlight
-'data.frame':	38227 obs. of  18 variables:
- $ age           : int  56 57 37 40 56 45 59 24 25 25 ...
- $ job           : Factor w/ 11 levels "admin","blue.collar",..: 4 8 8 1 8 8 1 10 8 8 ...
- $ marital       : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 2 2 3 3 3 ...
- $ education     : Ord.factor w/ 6 levels "basic.4y"<"basic.6y"<..: 1 4 4 2 4 3 5 5 4 4 ...
- $ housing       : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 1 2 2 2 ...
- $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
- $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
- $ month         : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ...
- $ day.of.week   : Ord.factor w/ 5 levels "mon"<"tue"<"wed"<..: 1 1 1 1 1 1 1 1 1 1 ...
- $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
- $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
- $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
- $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
- $ cons.price.idx: num  94 94 94 94 94 ...
- $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
- $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
- $ nr.employed   : num  5191 5191 5191 5191 5191 ...
- $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
+'data.frame':	43193 obs. of  16 variables:
+ $ age      : int  58 44 33 35 28 42 58 43 41 29 ...
+ $ job      : Factor w/ 11 levels "admin","blue.collar",..: 5 10 3 5 5 3 6 10 1 1 ...
+ $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 1 2 3 1 3 ...
+ $ education: Ord.factor w/ 3 levels "primary"<"secondary"<..: 3 2 2 3 3 3 1 2 2 2 ...
+ $ balance  : int  2143 29 2 231 447 2 121 593 270 390 ...
+ $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
+ $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 2 1 1 1 1 1 ...
+ $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
+ $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
+ $ month    : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ...
+ $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
+ $ pdays    : int  999 999 999 999 999 999 999 999 999 999 ...
+ $ pdays.bin: Factor w/ 2 levels "successful","never": 2 2 2 2 2 2 2 2 2 2 ...
+ $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
+ $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
+ $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
 
 ```
 
@@ -66,17 +64,16 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
 
 |Type           | Frequency|
 |:--------------|---------:|
-|factor         |         6|
-|integer        |         3|
-|numeric        |         5|
-|ordered factor |         3|
+|factor         |         7|
+|integer        |         6|
+|ordered factor |         2|
 
 **Class imbalance**:
 
 | class A |  class B |
 |:-------:|:--------:|
-|  11 %   |   89 %   |
-|  4254   |  33973   |
+|  12 %   |   88 %   |
+|  5021   |  38172   |
 
 ---
 
diff --git a/data-collection/bank-marketing/config.yaml b/data-collection/bank-marketing/config.yaml
index d2fb363..c17066b 100644
--- a/data-collection/bank-marketing/config.yaml
+++ b/data-collection/bank-marketing/config.yaml
@@ -4,7 +4,7 @@ name: Bank Marketing
 info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
 
 urls:
-- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
+- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
 
 cite: > 
     S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014
diff --git a/data-collection/bank-marketing/preprocess.R b/data-collection/bank-marketing/preprocess.R
index 4fdb407..7aa2c80 100644
--- a/data-collection/bank-marketing/preprocess.R
+++ b/data-collection/bank-marketing/preprocess.R
@@ -4,8 +4,8 @@ preprocessDataset = function()
     
     temp.dir = tempdir()
     
-    zip.file = "bank-additional.zip"
-    zip.dataset.path = "bank-additional/bank-additional-full.csv"
+    zip.file = "bank.zip"
+    zip.dataset.path = "bank-full.csv"
     
     flog.debug(paste("Unzipping", zip.file))
     
@@ -20,31 +20,25 @@ preprocessDataset = function()
     flog.debug("Preprocessing loaded dataset")
     
     dataset = dataset %>% 
-        select(-c(duration, pdays, default)) %>%
+        select(-c(duration, default)) %>%
         filter(job != "unknown" & marital != "unknown" & education != "unknown" & 
-               education != "illiterate" & housing != "unknown" & loan != "unknown") %>%
+               education != "unknown" & housing != "unknown" & loan != "unknown") %>%
         droplevels()
     
-    #dataset.yes = dataset %>% filter(y == "yes")
-    #dataset.no = dataset %>% filter(y == "no") %>% sample_n(nrow(dataset.yes))
-    #
-    #dataset = rbind(dataset.yes, dataset.no)
-    
-    dataset = dataset %>% mutate(
-                        education=factor(education, levels=c("basic.4y", "basic.6y", 
-                                                             "basic.9y", "high.school",
-                                                             "professional.course",
-                                                             "university.degree"),
-                                         ordered=TRUE),
-                        month=factor(month, levels=c("jan", "feb", "mar", 
-                                                     "apr", "may", "jun", 
-                                                     "jul", "aug", "sep", 
-                                                     "oct", "nov", "dec"), 
-                                     ordered=TRUE),
-                        day_of_week=factor(day_of_week, levels=c("mon", "tue", "wed",
-                                                                 "thu", "fri"),
-                                           ordered=TRUE)
-    )
+    dataset = dataset %>% 
+        mutate(
+            education=factor(education, levels=c("primary", "secondary", 
+                                                 "tertiary"),
+                             ordered=TRUE),
+            month=factor(month, levels=c("jan", "feb", "mar", 
+                                         "apr", "may", "jun", 
+                                         "jul", "aug", "sep", 
+                                         "oct", "nov", "dec"), 
+                         ordered=TRUE),
+            pdays.bin=revalue(factor(pdays==-1), 
+                              c("TRUE"="never", "FALSE"="successful")), 
+            pdays=as.integer(replace(pdays, pdays==-1, 999))) %>%
+        select(age:pdays, pdays.bin, previous:y)
     
     return(dataset)
 }
\ No newline at end of file
diff --git a/data-preprocess.R b/data-preprocess.R
index 2dd9cf3..68eeaac 100644
--- a/data-preprocess.R
+++ b/data-preprocess.R
@@ -3,6 +3,7 @@ rm(list=ls())
 source("config.R")
 source("utils.R")
 
+library(plyr)
 library(dplyr)
 library(foreign)
 library(XLConnect)