1
0
mirror of https://github.com/andre-wojtowicz/uci-ml-to-r.git synced 2024-11-21 15:50:28 +01:00

refactoring, checkpoint cleanup and snapshot update

This commit is contained in:
Andrzej Wójtowicz 2016-07-17 02:35:55 +02:00
parent c49a82db43
commit 982b3b6f9d
20 changed files with 429 additions and 361 deletions

3
.gitignore vendored
View File

@ -24,3 +24,6 @@ data-collection/*/preprocessed/*
# markdown outputs # markdown outputs
*.html *.html
.Rproj.user .Rproj.user
# logger outputs
*.log

View File

@ -3,8 +3,7 @@ Andrzej Wójtowicz
Document generation date: 2016-07-13 13:45:45. Document generation date: 2016-07-17 02:31:21.
# Table of Contents # Table of Contents
@ -71,7 +70,7 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 12 % | 88 % | | 12 % | 88 % |
| 5021 | 38172 | | 5021 | 38172 |
@ -141,7 +140,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 37 % | 63 % | | 37 % | 63 % |
| 212 | 357 | | 212 | 357 |
@ -189,7 +188,7 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming",
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 35 % | 65 % | | 35 % | 65 % |
| 239 | 444 | | 239 | 444 |
@ -259,7 +258,7 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 22 % | 78 % | | 22 % | 78 % |
| 471 | 1655 | | 471 | 1655 |
@ -321,7 +320,7 @@ Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 22 % | 78 % | | 22 % | 78 % |
| 6636 | 23364 | | 6636 | 23364 |
@ -372,7 +371,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 29 % | 71 % | | 29 % | 71 % |
| 167 | 416 | | 167 | 416 |
@ -422,7 +421,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 35 % | 65 % | | 35 % | 65 % |
| 6688 | 12332 | | 6688 | 12332 |
@ -476,7 +475,7 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 7 % | 93 % | | 7 % | 93 % |
| 170 | 2414 | | 170 | 2414 |
@ -575,7 +574,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 39 % | 61 % | | 39 % | 61 % |
| 1813 | 2788 | | 1813 | 2788 |
@ -628,9 +627,8 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferen
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 37 % | 63 % | | 37 % | 63 % |
| 2384 | 4113 | | 2384 | 4113 |
--- ---

View File

@ -1,28 +1,48 @@
# ---- checkpoint ---- # ---- config ----
CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/" # randomization and output files
CHECKPOINT.SNAPSHOT.DATE = "2016-04-10"
library(checkpoint) SEED = 1337
options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL) OVERWRITE.OUTPUT.FILES = TRUE # overwrite downloaded and created datasets
checkpoint(CHECKPOINT.SNAPSHOT.DATE)
# ---- logger ---- # extra user configuration and init
LOGGER_LEVEL = futile.logger::INFO USER.CONFIG.FILE = "config.R.user"
USER.INIT.FILE = "init.R.user"
library(futile.logger) # checkpoint library
flog.threshold(LOGGER_LEVEL)
# ---- other ---- CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/"
CHECKPOINT.SNAPSHOT.DATE = "2016-07-01"
CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url
PATH_DATASETS = "data-collection/" # logging system
PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/")
PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/")
FILE_CONFIG_YAML = "config.yaml" LOGGER.OUTPUT.S1.FILE = "output-s1.log"
FILE_PREPROCESSING_SCRIPT = "preprocess.R" LOGGER.OUTPUT.S2.FILE = "output-s2.log"
FILE_PREPROCESSED_OUTPUT = "dataset.rds" LOGGER.LEVEL = 6 # futile.logger::INFO
LOGGER.OVERWRITE.EXISTING.FILES = TRUE
if (file.exists("config.R.user")) # datasets
source("config.R.user")
DATASETS.DIR = "data-collection"
DATASET.NAME.PATTERN = "DS-NAME"
DATASET.ORIGINAL.DIR =
file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "original")
DATASET.PREPROCESSED.DIR =
file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "preprocessed")
DATASET.CONFIG.FILE = "config.yaml"
DATASET.PREPROCESSING.SCRIPT = "preprocess.R"
DATASET.PREPROCESSED.OUTPUT.FILE = "dataset.rds"
# curl
SSL.VERIFY.PEER = FALSE
# load custom config
if (file.exists(USER.CONFIG.FILE))
{
source(USER.CONFIG.FILE)
}

View File

@ -1,7 +1,5 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
#set.seed(SEED)
temp.dir = tempdir() temp.dir = tempdir()
zip.file = "bank.zip" zip.file = "bank.zip"
@ -9,36 +7,38 @@ preprocessDataset = function()
flog.debug(paste("Unzipping", zip.file)) flog.debug(paste("Unzipping", zip.file))
unzip(zipfile=paste0(orig.dir, zip.file), unzip(zipfile = file.path(orig.dir, zip.file),
files = zip.dataset.path, files = zip.dataset.path,
exdir = temp.dir) exdir = temp.dir)
flog.debug(paste("Loading", zip.dataset.path)) flog.debug(paste("Loading", zip.dataset.path))
dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";") dataset = read.csv(file.path(temp.dir, zip.dataset.path), sep = ";")
flog.debug("Preprocessing loaded dataset") flog.debug("Preprocessing loaded dataset")
dataset = dataset %>% dataset = dataset %>%
select(-c(duration, default)) %>% select(-c(duration, default)) %>%
filter(job != "unknown" & marital != "unknown" & education != "unknown" & filter(job != "unknown" & marital != "unknown" & education != "unknown" &
education != "unknown" & housing != "unknown" & loan != "unknown") %>% education != "unknown" & housing != "unknown" &
loan != "unknown") %>%
droplevels() droplevels()
dataset = dataset %>% dataset = dataset %>%
mutate( mutate(
education=factor(education, levels=c("primary", "secondary", education = factor(education,
"tertiary"), levels = c("primary", "secondary", "tertiary"),
ordered = TRUE), ordered = TRUE),
month=factor(month, levels=c("jan", "feb", "mar", month = factor(month,
"apr", "may", "jun", levels = c("jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "jul", "aug", "sep", "oct", "nov", "dec"),
"oct", "nov", "dec"),
ordered = TRUE), ordered = TRUE),
pdays.bin = revalue(factor(pdays == -1), pdays.bin = revalue(factor(pdays == -1),
c("TRUE" = "never", "FALSE" = "successful")), c("TRUE" = "never", "FALSE" = "successful")),
pdays = as.integer(replace(pdays, pdays == -1, 999))) %>% pdays = as.integer(replace(pdays, pdays == -1, 999))) %>%
select(age:pdays, pdays.bin, previous:y) select(age:pdays, pdays.bin, previous:y)
unlink("temp.dir", recursive = TRUE)
return(dataset) return(dataset)
} }

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "wdbc.data" csv.file = "wdbc.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
colnames(dataset) = c("id", "diagnosis", colnames(dataset) = c("id", "diagnosis",
apply(expand.grid(c("radius", "texture", "perimeter", apply(expand.grid(c("radius", "texture", "perimeter",
@ -12,7 +12,8 @@ preprocessDataset = function()
c("mean", "se", "worst")), c("mean", "se", "worst")),
1, function(x){paste(x[2], x[1])})) 1, function(x){paste(x[2], x[1])}))
dataset = dataset %>% select(`mean radius`:`worst fractal dimension`, diagnosis) dataset = dataset %>%
select(`mean radius`:`worst fractal dimension`, diagnosis)
return(dataset) return(dataset)
} }

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "breast-cancer-wisconsin.data" csv.file = "breast-cancer-wisconsin.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
colnames(dataset) = c("Sample code number", "Clump Thickness", colnames(dataset) = c("Sample code number", "Clump Thickness",
"Uniformity of Cell Size", "Uniformity of Cell Shape", "Uniformity of Cell Size", "Uniformity of Cell Shape",
@ -10,7 +10,8 @@ preprocessDataset = function()
"Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
"Mitoses", "Class") "Mitoses", "Class")
dataset = dataset %>% select(-`Sample code number`) %>% dataset = dataset %>%
select(-`Sample code number`) %>%
filter(`Bare Nuclei` != "?") %>% filter(`Bare Nuclei` != "?") %>%
mutate(Class = factor(Class), mutate(Class = factor(Class),
`Bare Nuclei` = as.integer(`Bare Nuclei`)) %>% `Bare Nuclei` = as.integer(`Bare Nuclei`)) %>%

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
xls.file = "CTG.xls" xls.file = "CTG.xls"
wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) wk = loadWorkbook(file.path(orig.dir, xls.file))
dataset = readWorksheet(wk, sheet = "Raw Data") dataset = readWorksheet(wk, sheet = "Raw Data")
dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>% dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>%
@ -24,7 +24,8 @@ preprocessDataset = function()
Mean = as.integer(Mean), Mean = as.integer(Mean),
Median = as.integer(Median), Median = as.integer(Median),
Variance = as.integer(Variance), Variance = as.integer(Variance),
Tendency=factor(Tendency, levels=c(-1,0,1), ordered=TRUE), Tendency = factor(Tendency, levels = c(-1, 0, 1),
ordered = TRUE),
A = factor(A), A = factor(A),
B = factor(B), B = factor(B),
C = factor(C), C = factor(C),

View File

@ -1,17 +1,15 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
#set.seed(SEED)
xls.file = "default of credit card clients.xls" xls.file = "default of credit card clients.xls"
wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) wk = loadWorkbook(file.path(orig.dir, xls.file))
dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2, dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2,
check.names = FALSE) check.names = FALSE)
dataset = dataset %>% dataset = dataset %>%
mutate(LIMIT_BAL = as.integer(LIMIT_BAL), mutate(LIMIT_BAL = as.integer(LIMIT_BAL),
SEX = factor(SEX), SEX = factor(SEX),
EDUCATION=factor(EDUCATION), # can't order due to EDUCATION = factor(EDUCATION), # can not order due to
# inconsistency with # inconsistency with
# UCI description # UCI description
MARRIAGE = factor(MARRIAGE), MARRIAGE = factor(MARRIAGE),
@ -34,15 +32,9 @@ preprocessDataset = function()
PAY_AMT4 = as.integer(PAY_AMT4), PAY_AMT4 = as.integer(PAY_AMT4),
PAY_AMT5 = as.integer(PAY_AMT5), PAY_AMT5 = as.integer(PAY_AMT5),
PAY_AMT6 = as.integer(PAY_AMT6), PAY_AMT6 = as.integer(PAY_AMT6),
`default payment next month`=factor( `default payment next month` =
`default payment next month`) factor(`default payment next month`)
) )
#dataset.1 = dataset %>% filter(`default payment next month` == 1)
#dataset.0 = dataset %>% filter(`default payment next month` == 0) %>%
# sample_n(nrow(dataset.1))
#
#dataset = rbind(dataset.0, dataset.1)
return(dataset) return(dataset)
} }

View File

@ -1,14 +1,14 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "Indian Liver Patient Dataset (ILPD).csv" csv.file = "Indian Liver Patient Dataset (ILPD).csv"
dataset = read.csv(paste0(orig.dir, "/", csv.file), dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
header=FALSE)
colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt", colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt",
"Sgot", "TP", "ALB", "A/G Ratio", "Selector") "Sgot", "TP", "ALB", "A/G Ratio", "Selector")
dataset = dataset %>% mutate(Selector=factor(Selector)) dataset = dataset %>%
mutate(Selector = factor(Selector))
return(dataset) return(dataset)
} }

View File

@ -1,9 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "magic04.data" csv.file = "magic04.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
header=FALSE)
colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1", colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1",
"fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",

View File

@ -1,12 +1,11 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
#set.seed(SEED)
arff.file = "seismic-bumps.arff" arff.file = "seismic-bumps.arff"
dataset = read.arff(paste0(orig.dir, "/", arff.file)) dataset = read.arff(file.path(orig.dir, arff.file))
dataset = dataset %>% select(-c(nbumps6:nbumps89)) %>% dataset = dataset %>%
select(-c(nbumps6:nbumps89)) %>%
mutate(genergy = as.integer(genergy), mutate(genergy = as.integer(genergy),
gpuls = as.integer(gpuls), gpuls = as.integer(gpuls),
gdenergy = as.integer(gdenergy), gdenergy = as.integer(gdenergy),
@ -20,11 +19,5 @@ preprocessDataset = function()
maxenergy = as.integer(maxenergy) maxenergy = as.integer(maxenergy)
) )
#dataset.1 = dataset %>% filter(class == "1")
#dataset.0 = dataset %>% filter(class == "0") %>%
# sample_n(nrow(dataset.1)*4)
#
#dataset = rbind(dataset.0, dataset.1)
return(dataset) return(dataset)
} }

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "spambase.data" csv.file = "spambase.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), dataset = read.csv(file.path(orig.dir, csv.file),
header = FALSE) header = FALSE)
colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all", colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all",
@ -27,7 +27,8 @@ preprocessDataset = function()
"capital_run_length_longest", "capital_run_length_total", "capital_run_length_longest", "capital_run_length_total",
"class") "class")
dataset = dataset %>% mutate(class=factor(class)) dataset = dataset %>%
mutate(class = factor(class))
return(dataset) return(dataset)
} }

View File

@ -1,17 +1,20 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file.w = "winequality-white.csv" csv.file.w = "winequality-white.csv"
csv.file.r = "winequality-red.csv" csv.file.r = "winequality-red.csv"
dataset.w = read.csv(paste0(orig.dir, "/", csv.file.w), sep=";", dataset.w = read.csv(file.path(orig.dir, "/", csv.file.w), sep = ";",
check.names = FALSE) check.names = FALSE)
dataset.w = dataset.w %>% mutate(color="white") dataset.w = dataset.w %>%
mutate(color = "white")
dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";", dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";",
check.names = FALSE) check.names = FALSE)
dataset.r = dataset.r %>% mutate(color="red") dataset.r = dataset.r %>%
mutate(color = "red")
dataset = rbind(dataset.w, dataset.r) %>% dataset =
rbind(dataset.w, dataset.r) %>%
mutate(color = factor(color), mutate(color = factor(color),
quality = ifelse(quality > 5, 1, 0)) %>% quality = ifelse(quality > 5, 1, 0)) %>%
select(`fixed acidity`:alcohol, color, quality) %>% select(`fixed acidity`:alcohol, color, quality) %>%

View File

@ -1,49 +0,0 @@
# Legacy step-1 script (replaced by s1-download-data.R): downloads every
# dataset listed in each data-collection/*/config.yaml into that dataset's
# original/ directory.  Relies on globals defined in config.R
# (PATH_DATASETS, PATH_DATASET_ORIGINAL, FILE_CONFIG_YAML) and on mkdir()
# from utils.R.
rm(list=ls())
source("config.R")
source("utils.R")
library(RCurl)
library(tools)
library(yaml)
flog.info("Started downloading dataset collection")
# one subdirectory of PATH_DATASETS per dataset
for (dir.name in dir(PATH_DATASETS))
{
flog.info(paste("Dataset:", dir.name))
# PATH_DATASET_ORIGINAL contains a literal "*" placeholder for the name
dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)
# per-dataset config.yaml lists the source URLs under the `urls` key
urls.list = yaml.load_file(config.yaml.file)$urls
mkdir(dest.dir)
for (url in urls.list)
{
flog.info(paste("URL:", url))
# file name is taken from the URL's last path component
dest.file = URLdecode(basename(url))
# NOTE: paste0 concatenation assumes PATH_DATASET_ORIGINAL ends with "/"
dest.file.path = paste0(dest.dir, dest.file)
# already-downloaded files are never re-fetched
if (file.exists(dest.file.path))
{
flog.warn(paste("Target file", basename(dest.file.path),
"already exists; skipping..."))
next
}
# NOTE(review): ssl.verifypeer=FALSE disables TLS certificate checking;
# acceptable here only because the sources are public UCI mirrors
tryCatch(
raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)),
error = function(e){flog.error(e); stop(e)}
)
writeBin(raw.content, dest.file.path)
}
flog.info("*****")
}
flog.info("Finished downloading dataset collection")

View File

@ -1,64 +0,0 @@
# Legacy step-2 script (replaced by s2-preprocess-data.R): runs each
# dataset's preprocess.R, normalizes column names and factor levels, and
# saves the result as an .rds file in the dataset's preprocessed/ directory.
# Relies on globals from config.R and on mkdir()/printDatasetStatistics()
# from utils.R.
rm(list=ls())
source("config.R")
source("utils.R")
library(plyr)
library(dplyr)
library(foreign)
library(XLConnect)
flog.info("Started preprocessing dataset collection")
for (dir.name in dir(PATH_DATASETS))
{
flog.info(paste("Dataset:", dir.name))
# "*" in the path templates is a placeholder for the dataset name
orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT)
# existing outputs are never regenerated
if (file.exists(dest.file.path))
{
flog.warn(paste("Target file", basename(dest.file.path),
"already exists; skipping..."))
flog.info("*****")
next
}
# each dataset ships its own preprocess.R defining preprocessDataset();
# sourcing it redefines the function for this iteration
r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT)
source(r.src.file)
dataset = preprocessDataset() # custom per-dataset preprocessing
# change column names: strip leading/trailing dots, make syntactically
# valid unique names without underscores, then lowercase
colnames(dataset) = tolower(
make.names(
gsub("^\\.|\\.$", "", colnames(dataset)),
unique=TRUE, allow_=FALSE))
# change factor levels: apply the same normalization to every factor column
for (name in colnames(dataset))
{
if (any(class(dataset[[name]]) == "factor"))
{
levels(dataset[[name]]) = tolower(
make.names(
gsub("^\\.|\\.$", "",
levels(dataset[[name]])),
unique=TRUE, allow_=FALSE))
}
}
# log case/attribute/class-balance summary before saving
printDatasetStatistics(dataset)
mkdir(dest.dir)
saveRDS(dataset, dest.file.path)
flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT))
flog.info("*****")
}
flog.info("Finished preprocessing dataset collection")

64
init.R Normal file
View File

@ -0,0 +1,64 @@
# ---- init ----
# Project bootstrap: sourced by every step script.  Loads config.R, pins
# package versions via checkpoint, configures logging, and attaches all
# libraries used by the pipeline.  Statement order matters here.

# clear environment
rm(list = ls())

# load setup variables
source("config.R")

# set randomization
set.seed(SEED)

# load library management system

library(checkpoint)

if (CHECKPOINT.QUICK.LOAD) # approx. x10 faster checkpoint library loading
{
    # assume https
    options(checkpoint.mranUrl = CHECKPOINT.MRAN.URL)
    # disable url checking by stubbing out checkpoint's internal 404 probe
    assignInNamespace("is.404", function(mran, warn = TRUE) { FALSE },
                      "checkpoint")
}

# knitr fix: first pass skips package scanning so we can install knitr
# into the snapshot library if it is missing (needed before the full scan)
checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = FALSE)

if (system.file(package = "knitr") == "")
{
    install.packages("knitr")
}

# actual checkpoint loading
checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = TRUE)

# load logging system

library(futile.logger)

flog.threshold(LOGGER.LEVEL)

# load libraries

library(RCurl)
library(tools)
library(yaml)
library(plyr)
library(dplyr)
library(foreign)
library(XLConnect)

# load helper functions

source("utils.R")

# perform additional custom init

if (file.exists(USER.INIT.FILE))
{
    source(USER.INIT.FILE)
}

48
s1-download-data.R Normal file
View File

@ -0,0 +1,48 @@
# ---- download-data ----
# Step 1: download every dataset listed in each
# data-collection/*/config.yaml into that dataset's original/ directory.
# Uses globals from config.R (via init.R) and setup.logger() from utils.R.

source("init.R")
source("utils.R")

setup.logger(LOGGER.OUTPUT.S1.FILE)

flog.info("Step 1: download dataset collection")

# one subdirectory of DATASETS.DIR per dataset
for (dir.name in dir(DATASETS.DIR))
{
    flog.info(paste("Dataset:", dir.name))

    # DATASET.ORIGINAL.DIR contains the DATASET.NAME.PATTERN placeholder
    dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
    config.yaml.file = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
    # per-dataset config.yaml lists the source URLs under the `urls` key
    urls.list = yaml.load_file(config.yaml.file)$urls

    if (!dir.exists(dest.dir))
    {
        dir.create(dest.dir)
    }

    for (url in urls.list)
    {
        flog.info(paste("URL:", url))
        # file name is taken from the URL's last path component
        dest.file = URLdecode(basename(url))
        dest.file.path = file.path(dest.dir, dest.file)

        # scalar condition: use short-circuit || (was the vectorized |),
        # so OVERWRITE.OUTPUT.FILES is only consulted when the file exists
        if (!file.exists(dest.file.path) || OVERWRITE.OUTPUT.FILES)
        {
            # SSL.VERIFY.PEER comes from config.R; abort the run on any
            # download error after logging it
            tryCatch(
                raw.content <-
                    getBinaryURL(url, .opts = curlOptions(ssl.verifypeer =
                                                              SSL.VERIFY.PEER)),
                error = function(e){flog.error(e); stop(e)}
            )

            writeBin(raw.content, dest.file.path)
        } else {
            flog.warn(paste("Target file", basename(dest.file.path),
                            "already exists, skipping"))
        }
    }

    flog.info(paste(rep("*", 25), collapse = ""))
}

61
s2-preprocess-data.R Normal file
View File

@ -0,0 +1,61 @@
# ---- preprocess-data ----
# Step 2: run each dataset's preprocess.R, normalize column names and
# factor levels, and save the result as an .rds file in the dataset's
# preprocessed/ directory.  Uses globals from config.R (via init.R) and
# setup.logger()/print.dataset.statistics() from utils.R.

source("init.R")
source("utils.R")

setup.logger(LOGGER.OUTPUT.S2.FILE)

flog.info("Step 2: preprocess dataset collection")

for (dir.name in dir(DATASETS.DIR))
{
    flog.info(paste("Dataset:", dir.name))

    # path templates contain the DATASET.NAME.PATTERN placeholder
    orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
    dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR)
    dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE)

    # scalar condition: use short-circuit || (was the vectorized |),
    # so OVERWRITE.OUTPUT.FILES is only consulted when the file exists
    if (!file.exists(dest.file.path) || OVERWRITE.OUTPUT.FILES)
    {
        # each dataset ships its own preprocess.R defining
        # preprocess.dataset(); sourcing redefines it for this iteration
        r.src.file = file.path(DATASETS.DIR, dir.name,
                               DATASET.PREPROCESSING.SCRIPT)
        source(r.src.file)
        dataset = preprocess.dataset() # custom per-dataset preprocessing

        # change column names: strip leading/trailing dots, make unique
        # syntactically valid names without underscores, then lowercase
        colnames(dataset) = tolower(
                                make.names(
                                    gsub("^\\.|\\.$", "", colnames(dataset)),
                                    unique = TRUE, allow_ = FALSE))

        # change factor levels: same normalization for every factor column
        for (name in colnames(dataset))
        {
            if (any(class(dataset[[name]]) == "factor"))
            {
                levels(dataset[[name]]) = tolower(
                                            make.names(
                                                gsub("^\\.|\\.$", "",
                                                     levels(dataset[[name]])),
                                                unique = TRUE, allow_ = FALSE))
            }
        }

        # log case/attribute/class-balance summary before saving
        print.dataset.statistics(dataset)

        if (!dir.exists(dest.dir))
        {
            dir.create(dest.dir)
        }

        saveRDS(dataset, dest.file.path)

        flog.info(paste("Created preprocessed file",
                        DATASET.PREPROCESSED.OUTPUT.FILE))
    } else {
        flog.warn(paste("Target file", basename(dest.file.path),
                        "already exists, skipping"))
    }

    flog.info(paste(rep("*", 25), collapse = ""))
}

View File

@ -7,35 +7,31 @@ output:
--- ---
```{r global-options, include=FALSE} ```{r global-options, include=FALSE}
knitr::opts_chunk$set(comment="", echo=FALSE, knitr::opts_chunk$set(comment = "", echo = FALSE, warning = FALSE, message = FALSE)
warning=FALSE, message=FALSE) source('init.R')
source('config.R')
``` ```
Document generation date: `r Sys.time()`. Document generation date: `r Sys.time()`.
```{r show-datasets, results='asis'} ```{r show-datasets, results='asis'}
library(yaml)
cat("\n# Table of Contents\n\n") cat("\n# Table of Contents\n\n")
for (dir.name in dir(PATH_DATASETS)) for (dir.name in dir(DATASETS.DIR))
{ {
config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
config.yaml = yaml.load_file(config.yaml.file.path) config.yaml = yaml.load_file(config.yaml.file.path)
anchor = gsub(" ", "-", gsub("[[:punct:]]", "", anchor = gsub(" ", "-", gsub("[[:punct:]]", "",
tolower(config.yaml$name))) tolower(config.yaml$name)))
cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" )) cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" ))
} }
cat("\n---\n\n") cat("\n---\n\n")
for (dir.name in dir(PATH_DATASETS)) for (dir.name in dir(DATASETS.DIR))
{ {
config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
config.yaml = yaml.load_file(config.yaml.file.path) config.yaml = yaml.load_file(config.yaml.file.path)
cat(paste("#", config.yaml$name, "\n\n")) cat(paste("#", config.yaml$name, "\n\n"))
@ -55,8 +51,10 @@ for (dir.name in dir(PATH_DATASETS))
cat(paste("**Dataset**:\n\n")) cat(paste("**Dataset**:\n\n"))
preprocessed.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED) preprocessed.dir = gsub(DATASET.NAME.PATTERN, dir.name,
preprocessed.file.path = paste0(preprocessed.dir, FILE_PREPROCESSED_OUTPUT) DATASET.PREPROCESSED.DIR)
preprocessed.file.path = file.path(preprocessed.dir,
DATASET.PREPROCESSED.OUTPUT.FILE)
dataset = readRDS(preprocessed.file.path) dataset = readRDS(preprocessed.file.path)
@ -79,14 +77,14 @@ for (dir.name in dir(PATH_DATASETS))
cat("**Class imbalance**:\n\n") cat("**Class imbalance**:\n\n")
cat(knitr::kable(data.frame(A=c(paste(perc.classes[1], "%"), num.classes[1]), cat(knitr::kable(data.frame(A = c(paste(perc.classes[1], "%"),
B=c(paste(perc.classes[2], "%"), num.classes[2])), num.classes[1]),
B = c(paste(perc.classes[2], "%"),
num.classes[2])),
format = "markdown", col.names = c("class A", "class B"), format = "markdown", col.names = c("class A", "class B"),
align = c("c", "c")), align = c("c", "c")),
sep = "\n") sep = "\n")
cat("\n---\n\n") cat("\n---\n\n")
} }
``` ```

28
utils.R
View File

@ -1,17 +1,4 @@
library(futile.logger) print.dataset.statistics = function(dataset)
# Create dest.dir if it does not already exist, logging either action at
# debug level.  (Legacy utils.R helper.)
mkdir = function(dest.dir)
{
    if (dir.exists(dest.dir))
    {
        # nothing to do; note it for debugging
        flog.debug(paste("Target directory", dest.dir, "already exists"))
    } else {
        flog.debug(paste("Creating directory", dest.dir))
        dir.create(dest.dir)
    }
}
printDatasetStatistics = function(dataset)
{ {
if (ncol(dataset) == 0) # for mockups if (ncol(dataset) == 0) # for mockups
{ {
@ -21,8 +8,19 @@ printDatasetStatistics = function(dataset)
no.cases = nrow(dataset) no.cases = nrow(dataset)
no.attributes = ncol(dataset) - 1 no.attributes = ncol(dataset) - 1
perc.classes = round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0) perc.classes =
round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0)
flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes, flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
", classes: ", perc.classes[1], "%/", perc.classes[2], "%")) ", classes: ", perc.classes[1], "%/", perc.classes[2], "%"))
} }
# Route futile.logger output to a file in addition to the console.
#
# output.file - path of the log file to write to.
#
# When LOGGER.OVERWRITE.EXISTING.FILES (from config.R) is TRUE, an existing
# log file is removed first so each run starts from a fresh log.
setup.logger = function(output.file)
{
    # scalar condition: use short-circuit && (was the vectorized &), so
    # file.exists() is only evaluated when overwriting is enabled
    if (LOGGER.OVERWRITE.EXISTING.FILES && file.exists(output.file))
    {
        file.remove(output.file)
    }

    # appender.tee sends log records to both the console and output.file;
    # invisible() suppresses printing of the appender configuration
    invisible(flog.appender(appender.tee(output.file)))
}