2016-07-17 02:35:55 +02:00
|
|
|
#' Load and preprocess the dataset stored in "bank.zip".
#'
#' Extracts "bank-full.csv" from the archive found in `orig.dir` (a variable
#' that must be defined by the calling environment; it is not set here),
#' reads it as a semicolon-separated file and then:
#'   * drops the `duration` and `default` columns,
#'   * removes rows where job, marital, education, housing or loan is
#'     "unknown",
#'   * turns `education` and `month` into ordered factors,
#'   * adds `pdays.bin`, a two-level factor derived from `pdays == -1`,
#'   * replaces the -1 sentinel in `pdays` with 999.
#'
#' @return A data frame with columns `age:pdays`, `pdays.bin`, `previous:y`.
preprocess.dataset = function()
{
    temp.dir = tempdir()

    zip.file = "bank.zip"
    zip.dataset.path = "bank-full.csv"
    dataset.path = file.path(temp.dir, zip.dataset.path)

    flog.debug(paste("Unzipping", zip.file))

    # Remove only the extracted file, and do it even when a later step fails.
    # The session temporary directory itself must not be deleted: other code
    # in the same R session may still be using it.
    on.exit(unlink(dataset.path), add = TRUE)

    unzip(zipfile = file.path(orig.dir, zip.file),
          files = zip.dataset.path,
          exdir = temp.dir)

    flog.debug(paste("Loading", zip.dataset.path))

    # stringsAsFactors is set explicitly: droplevels() and the factor handling
    # below rely on text columns being factors, which is no longer the
    # read.csv default since R 4.0.
    dataset = read.csv(dataset.path, sep = ";", stringsAsFactors = TRUE)

    flog.debug("Preprocessing loaded dataset")

    dataset = dataset %>%
        select(-c(duration, default)) %>%
        filter(job != "unknown" & marital != "unknown" &
               education != "unknown" & housing != "unknown" &
               loan != "unknown") %>%
        droplevels()

    dataset = dataset %>%
        mutate(
            education = factor(education,
                               levels = c("primary", "secondary", "tertiary"),
                               ordered = TRUE),
            month = factor(month,
                           levels = c("jan", "feb", "mar", "apr", "may", "jun",
                                      "jul", "aug", "sep", "oct", "nov", "dec"),
                           ordered = TRUE),
            # TRUE (pdays == -1) is labelled "never", FALSE "successful".
            pdays.bin = revalue(factor(pdays == -1),
                                c("TRUE" = "never", "FALSE" = "successful")),
            pdays = as.integer(replace(pdays, pdays == -1, 999))) %>%
        select(age:pdays, pdays.bin, previous:y)

    return(dataset)
}
|