1
0
mirror of https://github.com/andre-wojtowicz/uci-ml-to-r.git synced 2024-11-24 16:05:28 +01:00
uci-ml-to-r/data-collection/bank-marketing/preprocess.R

44 lines
1.5 KiB
R
Raw Normal View History

# Load the bank-marketing data set ("bank-full.csv" stored inside "bank.zip")
# and return it as a cleaned data frame.
#
# Names taken from the enclosing environment (not defined here):
#   orig.dir   - directory that contains "bank.zip"
#   flog.debug - debug logger
#   %>%, select, filter, mutate, revalue - data-manipulation helpers
#
# Processing steps:
#   1. extract "bank-full.csv" into the session temp directory and read it
#      as a semicolon-separated file;
#   2. drop the `duration` and `default` columns;
#   3. drop rows holding "unknown" in job, marital, education, housing or loan
#      and discard factor levels that became unused;
#   4. convert `education` and `month` to ordered factors;
#   5. add `pdays.bin` ("never" where pdays == -1, "successful" otherwise)
#      and recode pdays == -1 to 999.
#
# Returns: a data frame with columns age:pdays, pdays.bin, previous:y.
preprocess.dataset = function()
{
    temp.dir = tempdir()
    zip.file = "bank.zip"
    zip.dataset.path = "bank-full.csv"
    extracted.path = file.path(temp.dir, zip.dataset.path)

    # Remove only the file extracted here, and do it even if a later step
    # fails. tempdir() is the session-wide temp directory, so it must not be
    # deleted itself.
    on.exit(unlink(extracted.path), add = TRUE)

    flog.debug(paste("Unzipping", zip.file))

    unzip(zipfile = file.path(orig.dir, zip.file),
          files = zip.dataset.path,
          exdir = temp.dir)

    flog.debug(paste("Loading", zip.dataset.path))

    # stringsAsFactors is set explicitly: since R 4.0 the default is FALSE,
    # while droplevels() below and the categorical columns of the result
    # expect factors.
    dataset = read.csv(extracted.path, sep = ";", stringsAsFactors = TRUE)

    flog.debug("Preprocessing loaded dataset")

    dataset = dataset %>%
        select(-c(duration, default)) %>%
        filter(job != "unknown" & marital != "unknown" &
               education != "unknown" & housing != "unknown" &
               loan != "unknown") %>%
        droplevels()

    dataset = dataset %>%
        mutate(
            education = factor(education,
                               levels = c("primary", "secondary", "tertiary"),
                               ordered = TRUE),
            month = factor(month,
                           levels = c("jan", "feb", "mar", "apr", "may", "jun",
                                      "jul", "aug", "sep", "oct", "nov", "dec"),
                           ordered = TRUE),
            # Build the indicator before pdays is overwritten on the next line.
            pdays.bin = revalue(factor(pdays == -1),
                                c("TRUE" = "never", "FALSE" = "successful")),
            pdays = as.integer(replace(pdays, pdays == -1, 999))) %>%
        select(age:pdays, pdays.bin, previous:y)

    return(dataset)
}