mirror of
https://github.com/andre-wojtowicz/uci-ml-to-r.git
synced 2024-11-24 16:05:28 +01:00
Updated bank-marketing dataset
This commit is contained in:
parent
abc240bb2d
commit
c49a82db43
1
.gitignore
vendored
1
.gitignore
vendored
@ -23,3 +23,4 @@ data-collection/*/preprocessed/*
|
|||||||
|
|
||||||
# markdown outputs
|
# markdown outputs
|
||||||
*.html
|
*.html
|
||||||
|
.Rproj.user
|
||||||
|
43
README.md
43
README.md
@ -3,7 +3,7 @@ Andrzej Wójtowicz
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
Document generation date: 2016-06-23 11:44:00.
|
Document generation date: 2016-07-13 13:45:45.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ Document generation date: 2016-06-23 11:44:00.
|
|||||||
|
|
||||||
**Source data files**:
|
**Source data files**:
|
||||||
|
|
||||||
* [bank-additional.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip)
|
* [bank.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip)
|
||||||
|
|
||||||
**Cite**:
|
**Cite**:
|
||||||
```nohighlight
|
```nohighlight
|
||||||
@ -40,24 +40,22 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
|
|||||||
**Dataset**:
|
**Dataset**:
|
||||||
|
|
||||||
```nohighlight
|
```nohighlight
|
||||||
'data.frame': 38227 obs. of 18 variables:
|
'data.frame': 43193 obs. of 16 variables:
|
||||||
$ age : int 56 57 37 40 56 45 59 24 25 25 ...
|
$ age : int 58 44 33 35 28 42 58 43 41 29 ...
|
||||||
$ job : Factor w/ 11 levels "admin","blue.collar",..: 4 8 8 1 8 8 1 10 8 8 ...
|
$ job : Factor w/ 11 levels "admin","blue.collar",..: 5 10 3 5 5 3 6 10 1 1 ...
|
||||||
$ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 2 2 3 3 3 ...
|
$ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 1 2 3 1 3 ...
|
||||||
$ education : Ord.factor w/ 6 levels "basic.4y"<"basic.6y"<..: 1 4 4 2 4 3 5 5 4 4 ...
|
$ education: Ord.factor w/ 3 levels "primary"<"secondary"<..: 3 2 2 3 3 3 1 2 2 2 ...
|
||||||
$ housing : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 1 2 2 2 ...
|
$ balance : int 2143 29 2 231 447 2 121 593 270 390 ...
|
||||||
$ loan : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
|
$ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
|
||||||
$ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
|
$ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 2 1 1 1 1 1 ...
|
||||||
|
$ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
|
||||||
|
$ day : int 5 5 5 5 5 5 5 5 5 5 ...
|
||||||
$ month : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ...
|
$ month : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ...
|
||||||
$ day.of.week : Ord.factor w/ 5 levels "mon"<"tue"<"wed"<..: 1 1 1 1 1 1 1 1 1 1 ...
|
|
||||||
$ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
|
$ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
|
||||||
|
$ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
|
||||||
|
$ pdays.bin: Factor w/ 2 levels "successful","never": 2 2 2 2 2 2 2 2 2 2 ...
|
||||||
$ previous : int 0 0 0 0 0 0 0 0 0 0 ...
|
$ previous : int 0 0 0 0 0 0 0 0 0 0 ...
|
||||||
$ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
|
$ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
|
||||||
$ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
|
|
||||||
$ cons.price.idx: num 94 94 94 94 94 ...
|
|
||||||
$ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
|
|
||||||
$ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
|
|
||||||
$ nr.employed : num 5191 5191 5191 5191 5191 ...
|
|
||||||
$ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
|
$ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
|
||||||
|
|
||||||
```
|
```
|
||||||
@ -66,17 +64,16 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
|
|||||||
|
|
||||||
|Type | Frequency|
|
|Type | Frequency|
|
||||||
|:--------------|---------:|
|
|:--------------|---------:|
|
||||||
|factor | 6|
|
|factor | 7|
|
||||||
|integer | 3|
|
|integer | 6|
|
||||||
|numeric | 5|
|
|ordered factor | 2|
|
||||||
|ordered factor | 3|
|
|
||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:--------:|
|
||||||
| 11 % | 89 % |
|
| 12 % | 88 % |
|
||||||
| 4254 | 33973 |
|
| 5021 | 38172 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ name: Bank Marketing
|
|||||||
info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
|
info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
|
||||||
|
|
||||||
urls:
|
urls:
|
||||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
|
- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
|
||||||
|
|
||||||
cite: >
|
cite: >
|
||||||
S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014
|
S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014
|
||||||
|
@ -4,8 +4,8 @@ preprocessDataset = function()
|
|||||||
|
|
||||||
temp.dir = tempdir()
|
temp.dir = tempdir()
|
||||||
|
|
||||||
zip.file = "bank-additional.zip"
|
zip.file = "bank.zip"
|
||||||
zip.dataset.path = "bank-additional/bank-additional-full.csv"
|
zip.dataset.path = "bank-full.csv"
|
||||||
|
|
||||||
flog.debug(paste("Unzipping", zip.file))
|
flog.debug(paste("Unzipping", zip.file))
|
||||||
|
|
||||||
@ -20,31 +20,25 @@ preprocessDataset = function()
|
|||||||
flog.debug("Preprocessing loaded dataset")
|
flog.debug("Preprocessing loaded dataset")
|
||||||
|
|
||||||
dataset = dataset %>%
|
dataset = dataset %>%
|
||||||
select(-c(duration, pdays, default)) %>%
|
select(-c(duration, default)) %>%
|
||||||
filter(job != "unknown" & marital != "unknown" & education != "unknown" &
|
filter(job != "unknown" & marital != "unknown" & education != "unknown" &
|
||||||
education != "illiterate" & housing != "unknown" & loan != "unknown") %>%
|
education != "unknown" & housing != "unknown" & loan != "unknown") %>%
|
||||||
droplevels()
|
droplevels()
|
||||||
|
|
||||||
#dataset.yes = dataset %>% filter(y == "yes")
|
dataset = dataset %>%
|
||||||
#dataset.no = dataset %>% filter(y == "no") %>% sample_n(nrow(dataset.yes))
|
mutate(
|
||||||
#
|
education=factor(education, levels=c("primary", "secondary",
|
||||||
#dataset = rbind(dataset.yes, dataset.no)
|
"tertiary"),
|
||||||
|
|
||||||
dataset = dataset %>% mutate(
|
|
||||||
education=factor(education, levels=c("basic.4y", "basic.6y",
|
|
||||||
"basic.9y", "high.school",
|
|
||||||
"professional.course",
|
|
||||||
"university.degree"),
|
|
||||||
ordered=TRUE),
|
ordered=TRUE),
|
||||||
month=factor(month, levels=c("jan", "feb", "mar",
|
month=factor(month, levels=c("jan", "feb", "mar",
|
||||||
"apr", "may", "jun",
|
"apr", "may", "jun",
|
||||||
"jul", "aug", "sep",
|
"jul", "aug", "sep",
|
||||||
"oct", "nov", "dec"),
|
"oct", "nov", "dec"),
|
||||||
ordered=TRUE),
|
ordered=TRUE),
|
||||||
day_of_week=factor(day_of_week, levels=c("mon", "tue", "wed",
|
pdays.bin=revalue(factor(pdays==-1),
|
||||||
"thu", "fri"),
|
c("TRUE"="never", "FALSE"="successful")),
|
||||||
ordered=TRUE)
|
pdays=as.integer(replace(pdays, pdays==-1, 999))) %>%
|
||||||
)
|
select(age:pdays, pdays.bin, previous:y)
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -3,6 +3,7 @@ rm(list=ls())
|
|||||||
source("config.R")
|
source("config.R")
|
||||||
source("utils.R")
|
source("utils.R")
|
||||||
|
|
||||||
|
library(plyr)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
library(foreign)
|
library(foreign)
|
||||||
library(XLConnect)
|
library(XLConnect)
|
||||||
|
Loading…
Reference in New Issue
Block a user