1
0
mirror of https://github.com/andre-wojtowicz/uci-ml-to-r.git synced 2024-07-22 07:35:30 +02:00
uci-ml-to-r/data-collection/bank-marketing/preprocess.R

50 lines
2.0 KiB
R

preprocessDataset = function()
{
#set.seed(SEED)
temp.dir = tempdir()
zip.file = "bank-additional.zip"
zip.dataset.path = "bank-additional/bank-additional-full.csv"
flog.debug(paste("Unzipping", zip.file))
unzip(zipfile=paste0(orig.dir, zip.file),
files=zip.dataset.path,
exdir=temp.dir)
flog.debug(paste("Loading", zip.dataset.path))
dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";")
flog.debug("Preprocessing loaded dataset")
dataset = dataset %>%
select(-c(duration, pdays, default)) %>%
filter(job != "unknown" & marital != "unknown" & education != "unknown" &
education != "illiterate" & housing != "unknown" & loan != "unknown") %>%
droplevels()
#dataset.yes = dataset %>% filter(y == "yes")
#dataset.no = dataset %>% filter(y == "no") %>% sample_n(nrow(dataset.yes))
#
#dataset = rbind(dataset.yes, dataset.no)
dataset = dataset %>% mutate(
education=factor(education, levels=c("basic.4y", "basic.6y",
"basic.9y", "high.school",
"professional.course",
"university.degree"),
ordered=TRUE),
month=factor(month, levels=c("jan", "feb", "mar",
"apr", "may", "jun",
"jul", "aug", "sep",
"oct", "nov", "dec"),
ordered=TRUE),
day_of_week=factor(day_of_week, levels=c("mon", "tue", "wed",
"thu", "fri"),
ordered=TRUE)
)
return(dataset)
}