From c49a82db43b96061ea1a693e8c60291bd585415a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Wed, 13 Jul 2016 13:59:25 +0200 Subject: [PATCH] Updated bank-marketing dataset --- .gitignore | 1 + README.md | 51 ++++++++++----------- data-collection/bank-marketing/config.yaml | 2 +- data-collection/bank-marketing/preprocess.R | 42 ++++++++--------- data-preprocess.R | 1 + 5 files changed, 45 insertions(+), 52 deletions(-) diff --git a/.gitignore b/.gitignore index 9743a80..9096c7e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ data-collection/*/preprocessed/* # markdown outputs *.html +.Rproj.user diff --git a/README.md b/README.md index bd2d0f3..e189944 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Andrzej Wójtowicz -Document generation date: 2016-06-23 11:44:00. +Document generation date: 2016-07-13 13:45:45. @@ -30,7 +30,7 @@ Document generation date: 2016-06-23 11:44:00. **Source data files**: -* [bank-additional.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip) +* [bank.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip) **Cite**: ```nohighlight @@ -40,25 +40,23 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of **Dataset**: ```nohighlight -'data.frame': 38227 obs. of 18 variables: - $ age : int 56 57 37 40 56 45 59 24 25 25 ... - $ job : Factor w/ 11 levels "admin","blue.collar",..: 4 8 8 1 8 8 1 10 8 8 ... - $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 2 2 3 3 3 ... - $ education : Ord.factor w/ 6 levels "basic.4y"<"basic.6y"<..: 1 4 4 2 4 3 5 5 4 4 ... - $ housing : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 1 2 2 2 ... - $ loan : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ... - $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ... - $ month : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ... - $ day.of.week : Ord.factor w/ 5 levels "mon"<"tue"<"wed"<..: 1 1 1 1 1 1 1 1 1 1 ... - $ campaign : int 1 1 1 1 1 1 1 1 1 1 ... - $ previous : int 0 0 0 0 0 0 0 0 0 0 ... - $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ... - $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ... - $ cons.price.idx: num 94 94 94 94 94 ... - $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ... - $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ... - $ nr.employed : num 5191 5191 5191 5191 5191 ... - $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ... +'data.frame': 43193 obs. of 16 variables: + $ age : int 58 44 33 35 28 42 58 43 41 29 ... + $ job : Factor w/ 11 levels "admin","blue.collar",..: 5 10 3 5 5 3 6 10 1 1 ... + $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 1 2 3 1 3 ... + $ education: Ord.factor w/ 3 levels "primary"<"secondary"<..: 3 2 2 3 3 3 1 2 2 2 ... + $ balance : int 2143 29 2 231 447 2 121 593 270 390 ... + $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ... + $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 2 1 1 1 1 1 ... + $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ... + $ day : int 5 5 5 5 5 5 5 5 5 5 ... + $ month : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ... + $ campaign : int 1 1 1 1 1 1 1 1 1 1 ... + $ pdays : int 999 999 999 999 999 999 999 999 999 999 ... + $ pdays.bin: Factor w/ 2 levels "successful","never": 2 2 2 2 2 2 2 2 2 2 ... + $ previous : int 0 0 0 0 0 0 0 0 0 0 ... + $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ... + $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ... ``` @@ -66,17 +64,16 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of |Type | Frequency| |:--------------|---------:| -|factor | 6| -|integer | 3| -|numeric | 5| -|ordered factor | 3| +|factor | 7| +|integer | 6| +|ordered factor | 2| **Class imbalance**: | class A | class B | |:-------:|:--------:| -| 11 % | 89 % | -| 4254 | 33973 | +| 12 % | 88 % | +| 5021 | 38172 | --- diff --git a/data-collection/bank-marketing/config.yaml b/data-collection/bank-marketing/config.yaml index d2fb363..c17066b 100644 --- a/data-collection/bank-marketing/config.yaml +++ b/data-collection/bank-marketing/config.yaml @@ -4,7 +4,7 @@ name: Bank Marketing info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing urls: -- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip +- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip cite: > S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014 diff --git a/data-collection/bank-marketing/preprocess.R b/data-collection/bank-marketing/preprocess.R index 4fdb407..7aa2c80 100644 --- a/data-collection/bank-marketing/preprocess.R +++ b/data-collection/bank-marketing/preprocess.R @@ -4,8 +4,8 @@ preprocessDataset = function() temp.dir = tempdir() - zip.file = "bank-additional.zip" - zip.dataset.path = "bank-additional/bank-additional-full.csv" + zip.file = "bank.zip" + zip.dataset.path = "bank-full.csv" flog.debug(paste("Unzipping", zip.file)) @@ -20,31 +20,25 @@ preprocessDataset = function() flog.debug("Preprocessing loaded dataset") dataset = dataset %>% - select(-c(duration, pdays, default)) %>% + select(-c(duration, default)) %>% filter(job != "unknown" & marital != "unknown" & education != "unknown" & - education != "illiterate" & housing != "unknown" & loan != "unknown") %>% + education != "unknown" & housing != "unknown" & loan != "unknown") %>% droplevels() - #dataset.yes = dataset %>% filter(y == "yes") - #dataset.no = dataset %>% filter(y == "no") %>% sample_n(nrow(dataset.yes)) - # - #dataset = rbind(dataset.yes, dataset.no) - - dataset = dataset %>% mutate( - education=factor(education, levels=c("basic.4y", "basic.6y", - "basic.9y", "high.school", - "professional.course", - "university.degree"), - ordered=TRUE), - month=factor(month, levels=c("jan", "feb", "mar", - "apr", "may", "jun", - "jul", "aug", "sep", - "oct", "nov", "dec"), - ordered=TRUE), - day_of_week=factor(day_of_week, levels=c("mon", "tue", "wed", - "thu", "fri"), - ordered=TRUE) - ) + dataset = dataset %>% + mutate( + education=factor(education, levels=c("primary", "secondary", + "tertiary"), + ordered=TRUE), + month=factor(month, levels=c("jan", "feb", "mar", + "apr", "may", "jun", + "jul", "aug", "sep", + "oct", "nov", "dec"), + ordered=TRUE), + pdays.bin=revalue(factor(pdays==-1), + c("TRUE"="never", "FALSE"="successful")), + pdays=as.integer(replace(pdays, pdays==-1, 999))) %>% + select(age:pdays, pdays.bin, previous:y) return(dataset) } \ No newline at end of file diff --git a/data-preprocess.R b/data-preprocess.R index 2dd9cf3..68eeaac 100644 --- a/data-preprocess.R +++ b/data-preprocess.R @@ -3,6 +3,7 @@ rm(list=ls()) source("config.R") source("utils.R") +library(plyr) library(dplyr) library(foreign) library(XLConnect)