From 982b3b6f9d750edb2ddbb28429cf0568f8e151d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Sun, 17 Jul 2016 02:35:55 +0200 Subject: [PATCH] refactoring, checkpoint cleanup and snapshot update --- .gitignore | 3 + README.md | 84 +++++++++---------- config.R | 58 ++++++++----- data-collection/bank-marketing/preprocess.R | 40 ++++----- .../preprocess.R | 7 +- .../preprocess.R | 15 ++-- data-collection/cardiotocography/preprocess.R | 65 +++++++------- data-collection/credit-card/preprocess.R | 72 +++++++--------- data-collection/indian-liver/preprocess.R | 8 +- data-collection/magic/preprocess.R | 5 +- data-collection/seismic-bumps/preprocess.R | 39 ++++----- data-collection/spambase/preprocess.R | 9 +- data-collection/wine-quality/preprocess.R | 27 +++--- data-download.R | 49 ----------- data-preprocess.R | 64 -------------- init.R | 64 ++++++++++++++ s1-download-data.R | 48 +++++++++++ s2-preprocess-data.R | 61 ++++++++++++++ readme-make.Rmd => s3-make-readme.Rmd | 42 +++++----- utils.R | 30 ++++--- 20 files changed, 429 insertions(+), 361 deletions(-) delete mode 100644 data-download.R delete mode 100644 data-preprocess.R create mode 100644 init.R create mode 100644 s1-download-data.R create mode 100644 s2-preprocess-data.R rename readme-make.Rmd => s3-make-readme.Rmd (61%) diff --git a/.gitignore b/.gitignore index 9096c7e..d741a93 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ data-collection/*/preprocessed/* # markdown outputs *.html .Rproj.user + +# logger outputs +*.log diff --git a/README.md b/README.md index e189944..9024e13 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ Andrzej Wójtowicz -Document generation date: 2016-07-13 13:45:45. - +Document generation date: 2016-07-17 02:31:21. # Table of Contents @@ -70,10 +69,10 @@ S. Moro, P. Cortez and P. Rita. 
A Data-Driven Approach to Predict the Success of **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 12 % | 88 % | -| 5021 | 38172 | +| class A | class B | +|:-------:|:-------:| +| 12 % | 88 % | +| 5021 | 38172 | --- @@ -140,10 +139,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 37 % | 63 % | -| 212 | 357 | +| class A | class B | +|:-------:|:-------:| +| 37 % | 63 % | +| 212 | 357 | --- @@ -188,10 +187,10 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 35 % | 65 % | -| 239 | 444 | +| class A | class B | +|:-------:|:-------:| +| 35 % | 65 % | +| 239 | 444 | --- @@ -258,10 +257,10 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 22 % | 78 % | -| 471 | 1655 | +| class A | class B | +|:-------:|:-------:| +| 22 % | 78 % | +| 471 | 1655 | --- @@ -320,10 +319,10 @@ Yeh, I. C., & Lien, C. H. (2009). 
The comparisons of data mining techniques for **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 22 % | 78 % | -| 6636 | 23364 | +| class A | class B | +|:-------:|:-------:| +| 22 % | 78 % | +| 6636 | 23364 | --- @@ -371,10 +370,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 29 % | 71 % | -| 167 | 416 | +| class A | class B | +|:-------:|:-------:| +| 29 % | 71 % | +| 167 | 416 | --- @@ -421,10 +420,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 35 % | 65 % | -| 6688 | 12332 | +| class A | class B | +|:-------:|:-------:| +| 35 % | 65 % | +| 6688 | 12332 | --- @@ -475,10 +474,10 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 7 % | 93 % | -| 170 | 2414 | +| class A | class B | +|:-------:|:-------:| +| 7 % | 93 % | +| 170 | 2414 | --- @@ -574,10 +573,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 39 % | 61 % | -| 1813 | 2788 | +| class A | class B | +|:-------:|:-------:| +| 39 % | 61 % | +| 1813 | 2788 | --- @@ -627,10 +626,9 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 
Modeling wine preferen **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 37 % | 63 % | -| 2384 | 4113 | +| class A | class B | +|:-------:|:-------:| +| 37 % | 63 % | +| 2384 | 4113 | --- - diff --git a/config.R b/config.R index ce2b693..7042dec 100644 --- a/config.R +++ b/config.R @@ -1,28 +1,48 @@ -# ---- checkpoint ---- +# ---- config ---- -CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/" -CHECKPOINT.SNAPSHOT.DATE = "2016-04-10" +# randomization and output files -library(checkpoint) -options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL) -checkpoint(CHECKPOINT.SNAPSHOT.DATE) +SEED = 1337 +OVERWRITE.OUTPUT.FILES = TRUE # overwrite downloaded and created datasets -# ---- logger ---- +# extra user configuration and init -LOGGER_LEVEL = futile.logger::INFO +USER.CONFIG.FILE = "config.R.user" +USER.INIT.FILE = "init.R.user" -library(futile.logger) -flog.threshold(LOGGER_LEVEL) +# checkpoint library -# ---- other ---- +CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/" +CHECKPOINT.SNAPSHOT.DATE = "2016-07-01" +CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url -PATH_DATASETS = "data-collection/" -PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/") -PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/") +# logging system -FILE_CONFIG_YAML = "config.yaml" -FILE_PREPROCESSING_SCRIPT = "preprocess.R" -FILE_PREPROCESSED_OUTPUT = "dataset.rds" +LOGGER.OUTPUT.S1.FILE = "output-s1.log" +LOGGER.OUTPUT.S2.FILE = "output-s2.log" +LOGGER.LEVEL = 6 # futile.logger::INFO +LOGGER.OVERWRITE.EXISTING.FILES = TRUE -if (file.exists("config.R.user")) - source("config.R.user") +# datasets + +DATASETS.DIR = "data-collection" + +DATASET.NAME.PATTERN = "DS-NAME" +DATASET.ORIGINAL.DIR = + file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "original") +DATASET.PREPROCESSED.DIR = + file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "preprocessed") +DATASET.CONFIG.FILE = "config.yaml" +DATASET.PREPROCESSING.SCRIPT = "preprocess.R" 
+DATASET.PREPROCESSED.OUTPUT.FILE = "dataset.rds" + +# curl + +SSL.VERIFY.PEER = FALSE + +# load custom config + +if (file.exists(USER.CONFIG.FILE)) +{ + source(USER.CONFIG.FILE) +} diff --git a/data-collection/bank-marketing/preprocess.R b/data-collection/bank-marketing/preprocess.R index 7aa2c80..14513ee 100644 --- a/data-collection/bank-marketing/preprocess.R +++ b/data-collection/bank-marketing/preprocess.R @@ -1,44 +1,44 @@ -preprocessDataset = function() +preprocess.dataset = function() { - #set.seed(SEED) - temp.dir = tempdir() - zip.file = "bank.zip" + zip.file = "bank.zip" zip.dataset.path = "bank-full.csv" flog.debug(paste("Unzipping", zip.file)) - unzip(zipfile=paste0(orig.dir, zip.file), - files=zip.dataset.path, - exdir=temp.dir) + unzip(zipfile = file.path(orig.dir, zip.file), + files = zip.dataset.path, + exdir = temp.dir) flog.debug(paste("Loading", zip.dataset.path)) - dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";") + dataset = read.csv(file.path(temp.dir, zip.dataset.path), sep = ";") flog.debug("Preprocessing loaded dataset") dataset = dataset %>% select(-c(duration, default)) %>% filter(job != "unknown" & marital != "unknown" & education != "unknown" & - education != "unknown" & housing != "unknown" & loan != "unknown") %>% + education != "unknown" & housing != "unknown" & + loan != "unknown") %>% droplevels() dataset = dataset %>% mutate( - education=factor(education, levels=c("primary", "secondary", - "tertiary"), - ordered=TRUE), - month=factor(month, levels=c("jan", "feb", "mar", - "apr", "may", "jun", - "jul", "aug", "sep", - "oct", "nov", "dec"), - ordered=TRUE), - pdays.bin=revalue(factor(pdays==-1), - c("TRUE"="never", "FALSE"="successful")), - pdays=as.integer(replace(pdays, pdays==-1, 999))) %>% + education = factor(education, + levels = c("primary", "secondary", "tertiary"), + ordered = TRUE), + month = factor(month, + levels = c("jan", "feb", "mar", "apr", "may", "jun", + "jul", "aug", "sep", "oct", "nov", 
"dec"), + ordered = TRUE), + pdays.bin = revalue(factor(pdays == -1), + c("TRUE" = "never", "FALSE" = "successful")), + pdays = as.integer(replace(pdays, pdays == -1, 999))) %>% select(age:pdays, pdays.bin, previous:y) + unlink("temp.dir", recursive = TRUE) + return(dataset) } \ No newline at end of file diff --git a/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R b/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R index 8c1ad2e..9883013 100644 --- a/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R +++ b/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R @@ -1,8 +1,8 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "wdbc.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("id", "diagnosis", apply(expand.grid(c("radius", "texture", "perimeter", @@ -12,7 +12,8 @@ preprocessDataset = function() c("mean", "se", "worst")), 1, function(x){paste(x[2], x[1])})) - dataset = dataset %>% select(`mean radius`:`worst fractal dimension`, diagnosis) + dataset = dataset %>% + select(`mean radius`:`worst fractal dimension`, diagnosis) return(dataset) } \ No newline at end of file diff --git a/data-collection/breast-cancer-wisconsin-original/preprocess.R b/data-collection/breast-cancer-wisconsin-original/preprocess.R index ce78758..69105fb 100644 --- a/data-collection/breast-cancer-wisconsin-original/preprocess.R +++ b/data-collection/breast-cancer-wisconsin-original/preprocess.R @@ -1,8 +1,8 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "breast-cancer-wisconsin.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", @@ -10,11 +10,12 @@ 
preprocessDataset = function() "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class") - dataset = dataset %>% select(-`Sample code number`) %>% - filter(`Bare Nuclei` != "?") %>% - mutate(Class=factor(Class), - `Bare Nuclei`=as.integer(`Bare Nuclei`)) %>% - droplevels() + dataset = dataset %>% + select(-`Sample code number`) %>% + filter(`Bare Nuclei` != "?") %>% + mutate(Class = factor(Class), + `Bare Nuclei` = as.integer(`Bare Nuclei`)) %>% + droplevels() return(dataset) } \ No newline at end of file diff --git a/data-collection/cardiotocography/preprocess.R b/data-collection/cardiotocography/preprocess.R index 1fa46db..458651c 100644 --- a/data-collection/cardiotocography/preprocess.R +++ b/data-collection/cardiotocography/preprocess.R @@ -1,40 +1,41 @@ -preprocessDataset = function() +preprocess.dataset = function() { xls.file = "CTG.xls" - wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) - dataset = readWorksheet(wk, sheet="Raw Data") + wk = loadWorkbook(file.path(orig.dir, xls.file)) + dataset = readWorksheet(wk, sheet = "Raw Data") dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>% - filter(complete.cases(.)) %>% - mutate(LB=as.integer(LB), - AC=as.integer(AC), - FM=as.integer(FM), - UC=as.integer(UC), - ASTV=as.integer(ASTV), - ALTV=as.integer(ALTV), - DL=as.integer(DL), - DP=as.integer(DP), - Width=as.integer(Width), - Min=as.integer(Min), - Max=as.integer(Max), - Nmax=as.integer(Nmax), - Nzeros=as.integer(Nzeros), - Mode=as.integer(Mode), - Mean=as.integer(Mean), - Median=as.integer(Median), - Variance=as.integer(Variance), - Tendency=factor(Tendency, levels=c(-1,0,1), ordered=TRUE), - A=factor(A), - B=factor(B), - C=factor(C), - D=factor(D), - E=factor(E), - AD=factor(AD), - DE=factor(DE), - LD=factor(LD), - FS=factor(FS), - NSP=factor(replace(NSP, NSP==2, 3))) + filter(complete.cases(.)) %>% + mutate(LB = as.integer(LB), + AC = as.integer(AC), + FM = as.integer(FM), + UC = as.integer(UC), + ASTV = as.integer(ASTV), + ALTV = 
as.integer(ALTV), + DL = as.integer(DL), + DP = as.integer(DP), + Width = as.integer(Width), + Min = as.integer(Min), + Max = as.integer(Max), + Nmax = as.integer(Nmax), + Nzeros = as.integer(Nzeros), + Mode = as.integer(Mode), + Mean = as.integer(Mean), + Median = as.integer(Median), + Variance = as.integer(Variance), + Tendency = factor(Tendency, levels = c(-1, 0, 1), + ordered = TRUE), + A = factor(A), + B = factor(B), + C = factor(C), + D = factor(D), + E = factor(E), + AD = factor(AD), + DE = factor(DE), + LD = factor(LD), + FS = factor(FS), + NSP = factor(replace(NSP, NSP == 2, 3))) return(dataset) } \ No newline at end of file diff --git a/data-collection/credit-card/preprocess.R b/data-collection/credit-card/preprocess.R index 9e2ffeb..2cd44e0 100644 --- a/data-collection/credit-card/preprocess.R +++ b/data-collection/credit-card/preprocess.R @@ -1,48 +1,40 @@ -preprocessDataset = function() +preprocess.dataset = function() { - #set.seed(SEED) - xls.file = "default of credit card clients.xls" - wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) - dataset = readWorksheet(wk, sheet="Data", startRow=2, startCol=2, - check.names=FALSE) + wk = loadWorkbook(file.path(orig.dir, xls.file)) + dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2, + check.names = FALSE) dataset = dataset %>% - mutate(LIMIT_BAL=as.integer(LIMIT_BAL), - SEX=factor(SEX), - EDUCATION=factor(EDUCATION), # can't order due to - # inconsistency with - # UCI description - MARRIAGE=factor(MARRIAGE), - AGE=as.integer(AGE), - PAY_0=as.integer(replace(PAY_0, PAY_0<0, 0)), - PAY_2=as.integer(replace(PAY_2, PAY_2<0, 0)), - PAY_3=as.integer(replace(PAY_3, PAY_3<0, 0)), - PAY_4=as.integer(replace(PAY_4, PAY_4<0, 0)), - PAY_5=as.integer(replace(PAY_5, PAY_5<0, 0)), - PAY_6=as.integer(replace(PAY_6, PAY_6<0, 0)), - BILL_AMT1=as.integer(BILL_AMT1), - BILL_AMT2=as.integer(BILL_AMT2), - BILL_AMT3=as.integer(BILL_AMT3), - BILL_AMT4=as.integer(BILL_AMT4), - BILL_AMT5=as.integer(BILL_AMT5), 
- BILL_AMT6=as.integer(BILL_AMT6), - PAY_AMT1=as.integer(PAY_AMT1), - PAY_AMT2=as.integer(PAY_AMT2), - PAY_AMT3=as.integer(PAY_AMT3), - PAY_AMT4=as.integer(PAY_AMT4), - PAY_AMT5=as.integer(PAY_AMT5), - PAY_AMT6=as.integer(PAY_AMT6), - `default payment next month`=factor( - `default payment next month`) - ) - - #dataset.1 = dataset %>% filter(`default payment next month` == 1) - #dataset.0 = dataset %>% filter(`default payment next month` == 0) %>% - # sample_n(nrow(dataset.1)) - # - #dataset = rbind(dataset.0, dataset.1) + mutate(LIMIT_BAL = as.integer(LIMIT_BAL), + SEX = factor(SEX), + EDUCATION = factor(EDUCATION), # can not order due to + # inconsistency with + # UCI description + MARRIAGE = factor(MARRIAGE), + AGE = as.integer(AGE), + PAY_0 = as.integer(replace(PAY_0, PAY_0 < 0, 0)), + PAY_2 = as.integer(replace(PAY_2, PAY_2 < 0, 0)), + PAY_3 = as.integer(replace(PAY_3, PAY_3 < 0, 0)), + PAY_4 = as.integer(replace(PAY_4, PAY_4 < 0, 0)), + PAY_5 = as.integer(replace(PAY_5, PAY_5 < 0, 0)), + PAY_6 = as.integer(replace(PAY_6, PAY_6 < 0, 0)), + BILL_AMT1 = as.integer(BILL_AMT1), + BILL_AMT2 = as.integer(BILL_AMT2), + BILL_AMT3 = as.integer(BILL_AMT3), + BILL_AMT4 = as.integer(BILL_AMT4), + BILL_AMT5 = as.integer(BILL_AMT5), + BILL_AMT6 = as.integer(BILL_AMT6), + PAY_AMT1 = as.integer(PAY_AMT1), + PAY_AMT2 = as.integer(PAY_AMT2), + PAY_AMT3 = as.integer(PAY_AMT3), + PAY_AMT4 = as.integer(PAY_AMT4), + PAY_AMT5 = as.integer(PAY_AMT5), + PAY_AMT6 = as.integer(PAY_AMT6), + `default payment next month` = + factor(`default payment next month`) + ) return(dataset) } \ No newline at end of file diff --git a/data-collection/indian-liver/preprocess.R b/data-collection/indian-liver/preprocess.R index c149efa..773a047 100644 --- a/data-collection/indian-liver/preprocess.R +++ b/data-collection/indian-liver/preprocess.R @@ -1,14 +1,14 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "Indian Liver Patient Dataset (ILPD).csv" - dataset = 
read.csv(paste0(orig.dir, "/", csv.file), - header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt", "Sgot", "TP", "ALB", "A/G Ratio", "Selector") - dataset = dataset %>% mutate(Selector=factor(Selector)) + dataset = dataset %>% + mutate(Selector = factor(Selector)) return(dataset) } \ No newline at end of file diff --git a/data-collection/magic/preprocess.R b/data-collection/magic/preprocess.R index 5abe672..e95183b 100644 --- a/data-collection/magic/preprocess.R +++ b/data-collection/magic/preprocess.R @@ -1,9 +1,8 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "magic04.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), - header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", diff --git a/data-collection/seismic-bumps/preprocess.R b/data-collection/seismic-bumps/preprocess.R index a4b1e5d..c1e203c 100644 --- a/data-collection/seismic-bumps/preprocess.R +++ b/data-collection/seismic-bumps/preprocess.R @@ -1,30 +1,23 @@ -preprocessDataset = function() +preprocess.dataset = function() { - #set.seed(SEED) - arff.file = "seismic-bumps.arff" - dataset = read.arff(paste0(orig.dir, "/", arff.file)) + dataset = read.arff(file.path(orig.dir, arff.file)) - dataset = dataset %>% select(-c(nbumps6:nbumps89)) %>% - mutate(genergy=as.integer(genergy), - gpuls=as.integer(gpuls), - gdenergy=as.integer(gdenergy), - gdpuls=as.integer(gdpuls), - nbumps=as.integer(nbumps), - nbumps2=as.integer(nbumps2), - nbumps3=as.integer(nbumps3), - nbumps4=as.integer(nbumps4), - nbumps5=as.integer(nbumps5), - energy=as.integer(energy), - maxenergy=as.integer(maxenergy) - ) - - #dataset.1 = dataset %>% filter(class == "1") - #dataset.0 = dataset %>% filter(class == "0") %>% - # sample_n(nrow(dataset.1)*4) - # - 
#dataset = rbind(dataset.0, dataset.1) + dataset = dataset %>% + select(-c(nbumps6:nbumps89)) %>% + mutate(genergy = as.integer(genergy), + gpuls = as.integer(gpuls), + gdenergy = as.integer(gdenergy), + gdpuls = as.integer(gdpuls), + nbumps = as.integer(nbumps), + nbumps2 = as.integer(nbumps2), + nbumps3 = as.integer(nbumps3), + nbumps4 = as.integer(nbumps4), + nbumps5 = as.integer(nbumps5), + energy = as.integer(energy), + maxenergy = as.integer(maxenergy) + ) return(dataset) } \ No newline at end of file diff --git a/data-collection/spambase/preprocess.R b/data-collection/spambase/preprocess.R index d579197..b8ef320 100644 --- a/data-collection/spambase/preprocess.R +++ b/data-collection/spambase/preprocess.R @@ -1,9 +1,9 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "spambase.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), - header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), + header = FALSE) colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our", "word_freq_over", @@ -27,7 +27,8 @@ preprocessDataset = function() "capital_run_length_longest", "capital_run_length_total", "class") - dataset = dataset %>% mutate(class=factor(class)) + dataset = dataset %>% + mutate(class = factor(class)) return(dataset) } \ No newline at end of file diff --git a/data-collection/wine-quality/preprocess.R b/data-collection/wine-quality/preprocess.R index 2ff5c03..d9ad34a 100644 --- a/data-collection/wine-quality/preprocess.R +++ b/data-collection/wine-quality/preprocess.R @@ -1,21 +1,24 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file.w = "winequality-white.csv" csv.file.r = "winequality-red.csv" - dataset.w = read.csv(paste0(orig.dir, "/", csv.file.w), sep=";", - check.names=FALSE) - dataset.w = dataset.w %>% mutate(color="white") + dataset.w = read.csv(file.path(orig.dir, csv.file.w), sep = ";", + check.names = FALSE) + dataset.w = 
dataset.w %>% + mutate(color = "white") - dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep=";", - check.names=FALSE) - dataset.r = dataset.r %>% mutate(color="red") + dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";", + check.names = FALSE) + dataset.r = dataset.r %>% + mutate(color = "red") - dataset = rbind(dataset.w, dataset.r) %>% - mutate(color=factor(color), - quality=ifelse(quality>5, 1, 0)) %>% - select(`fixed acidity`:alcohol, color, quality) %>% - mutate(quality=factor(quality)) + dataset = + rbind(dataset.w, dataset.r) %>% + mutate(color = factor(color), + quality = ifelse(quality > 5, 1, 0)) %>% + select(`fixed acidity`:alcohol, color, quality) %>% + mutate(quality = factor(quality)) return(dataset) } \ No newline at end of file diff --git a/data-download.R b/data-download.R deleted file mode 100644 index 7867c36..0000000 --- a/data-download.R +++ /dev/null @@ -1,49 +0,0 @@ -rm(list=ls()) - -source("config.R") -source("utils.R") - -library(RCurl) -library(tools) -library(yaml) - -flog.info("Started downloading dataset collection") - -for (dir.name in dir(PATH_DATASETS)) -{ - flog.info(paste("Dataset:", dir.name)) - - dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL) - config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) - - urls.list = yaml.load_file(config.yaml.file)$urls - - mkdir(dest.dir) - - for (url in urls.list) - { - flog.info(paste("URL:", url)) - - dest.file = URLdecode(basename(url)) - dest.file.path = paste0(dest.dir, dest.file) - - if (file.exists(dest.file.path)) - { - flog.warn(paste("Target file", basename(dest.file.path), - "already exists; skipping...")) - next - } - - tryCatch( - raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)), - error = function(e){flog.error(e); stop(e)} - ) - - writeBin(raw.content, dest.file.path) - - } - - flog.info("*****") -} - -flog.info("Finished downloading dataset collection") diff --git a/data-preprocess.R b/data-preprocess.R 
deleted file mode 100644 index 68eeaac..0000000 --- a/data-preprocess.R +++ /dev/null @@ -1,64 +0,0 @@ -rm(list=ls()) - -source("config.R") -source("utils.R") - -library(plyr) -library(dplyr) -library(foreign) -library(XLConnect) - -flog.info("Started preprocessing dataset collection") - -for (dir.name in dir(PATH_DATASETS)) -{ - flog.info(paste("Dataset:", dir.name)) - - orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL) - dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED) - dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT) - - if (file.exists(dest.file.path)) - { - flog.warn(paste("Target file", basename(dest.file.path), - "already exists; skipping...")) - flog.info("*****") - next - } - - r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT) - - source(r.src.file) - dataset = preprocessDataset() # custom per-dataset preprocessing - - # change column names - colnames(dataset) = tolower( - make.names( - gsub("^\\.|\\.$", "", colnames(dataset)), - unique=TRUE, allow_=FALSE)) - - # change factor levels - for (name in colnames(dataset)) - { - if (any(class(dataset[[name]]) == "factor")) - { - levels(dataset[[name]]) = tolower( - make.names( - gsub("^\\.|\\.$", "", - levels(dataset[[name]])), - unique=TRUE, allow_=FALSE)) - - } - } - - printDatasetStatistics(dataset) - - mkdir(dest.dir) - saveRDS(dataset, dest.file.path) - - flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT)) - - flog.info("*****") -} - -flog.info("Finished preprocessing dataset collection") \ No newline at end of file diff --git a/init.R b/init.R new file mode 100644 index 0000000..b34d781 --- /dev/null +++ b/init.R @@ -0,0 +1,64 @@ +# ---- init ---- + +# clear envirionment + +rm(list = ls()) + +# load setup variables + +source("config.R") + +# set randomization + +set.seed(SEED) + +# load library management system + +library(checkpoint) + +if (CHECKPOINT.QUICK.LOAD) # approx. 
x10 faster checkpoint library loading +{ + # assume https + options(checkpoint.mranUrl = CHECKPOINT.MRAN.URL) + # disable url checking + assignInNamespace("is.404", function(mran, warn = TRUE) { FALSE }, + "checkpoint") +} + +# knitr fix +checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = FALSE) +if (system.file(package = "knitr") == "") +{ + install.packages("knitr") +} + +# actual checkpoint loading +checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = TRUE) + +# load logging system + +library(futile.logger) + +flog.threshold(LOGGER.LEVEL) + +# load libraries + +library(RCurl) +library(tools) +library(yaml) + +library(plyr) +library(dplyr) +library(foreign) +library(XLConnect) + +# load helper functions + +source("utils.R") + +# perform additional custom init + +if (file.exists(USER.INIT.FILE)) +{ + source(USER.INIT.FILE) +} diff --git a/s1-download-data.R b/s1-download-data.R new file mode 100644 index 0000000..2b602e4 --- /dev/null +++ b/s1-download-data.R @@ -0,0 +1,48 @@ +# ---- download-data ---- + +source("init.R") +source("utils.R") + +setup.logger(LOGGER.OUTPUT.S1.FILE) + +flog.info("Step 1: download dataset collection") + +for (dir.name in dir(DATASETS.DIR)) +{ + flog.info(paste("Dataset:", dir.name)) + + dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR) + config.yaml.file = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE) + + urls.list = yaml.load_file(config.yaml.file)$urls + + if (!dir.exists(dest.dir)) + { + dir.create(dest.dir) + } + + for (url in urls.list) + { + flog.info(paste("URL:", url)) + + dest.file = URLdecode(basename(url)) + dest.file.path = file.path(dest.dir, dest.file) + + if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES) + { + tryCatch( + raw.content <- + getBinaryURL(url, .opts = curlOptions(ssl.verifypeer = + SSL.VERIFY.PEER)), + error = function(e){flog.error(e); stop(e)} + ) + + writeBin(raw.content, dest.file.path) + } else { + flog.warn(paste("Target file", 
basename(dest.file.path), + "already exists, skipping")) + } + } + + flog.info(paste(rep("*", 25), collapse = "")) +} diff --git a/s2-preprocess-data.R b/s2-preprocess-data.R new file mode 100644 index 0000000..da36b51 --- /dev/null +++ b/s2-preprocess-data.R @@ -0,0 +1,61 @@ +# ---- preprocess-data ---- + +source("init.R") +source("utils.R") + +setup.logger(LOGGER.OUTPUT.S2.FILE) + +flog.info("Step 2: preprocess dataset collection") + +for (dir.name in dir(DATASETS.DIR)) +{ + flog.info(paste("Dataset:", dir.name)) + + orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR) + dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR) + dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE) + + if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES) + { + r.src.file = file.path(DATASETS.DIR, dir.name, + DATASET.PREPROCESSING.SCRIPT) + source(r.src.file) + dataset = preprocess.dataset() # custom per-dataset preprocessing + + # change column names + colnames(dataset) = tolower( + make.names( + gsub("^\\.|\\.$", "", colnames(dataset)), + unique = TRUE, allow_ = FALSE)) + + # change factor levels + for (name in colnames(dataset)) + { + if (any(class(dataset[[name]]) == "factor")) + { + levels(dataset[[name]]) = tolower( + make.names( + gsub("^\\.|\\.$", "", + levels(dataset[[name]])), + unique = TRUE, allow_ = FALSE)) + } + } + + print.dataset.statistics(dataset) + + if (!dir.exists(dest.dir)) + { + dir.create(dest.dir) + } + + saveRDS(dataset, dest.file.path) + + flog.info(paste("Created preprocessed file", + DATASET.PREPROCESSED.OUTPUT.FILE)) + } else { + flog.warn(paste("Target file", basename(dest.file.path), + "already exists, skipping")) + } + + flog.info(paste(rep("*", 25), collapse = "")) +} diff --git a/readme-make.Rmd b/s3-make-readme.Rmd similarity index 61% rename from readme-make.Rmd rename to s3-make-readme.Rmd index 0c875f6..7b968b3 100644 --- a/readme-make.Rmd +++ b/s3-make-readme.Rmd @@ -7,35 +7,31 @@ 
output: --- ```{r global-options, include=FALSE} -knitr::opts_chunk$set(comment="", echo=FALSE, - warning=FALSE, message=FALSE) -source('config.R') +knitr::opts_chunk$set(comment = "", echo = FALSE, warning = FALSE, message = FALSE) +source('init.R') ``` Document generation date: `r Sys.time()`. - ```{r show-datasets, results='asis'} -library(yaml) cat("\n# Table of Contents\n\n") -for (dir.name in dir(PATH_DATASETS)) +for (dir.name in dir(DATASETS.DIR)) { - config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) + config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE) config.yaml = yaml.load_file(config.yaml.file.path) anchor = gsub(" ", "-", gsub("[[:punct:]]", "", tolower(config.yaml$name))) cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" )) - } cat("\n---\n\n") -for (dir.name in dir(PATH_DATASETS)) +for (dir.name in dir(DATASETS.DIR)) { - config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) + config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE) config.yaml = yaml.load_file(config.yaml.file.path) cat(paste("#", config.yaml$name, "\n\n")) @@ -55,8 +51,10 @@ for (dir.name in dir(PATH_DATASETS)) cat(paste("**Dataset**:\n\n")) - preprocessed.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED) - preprocessed.file.path = paste0(preprocessed.dir, FILE_PREPROCESSED_OUTPUT) + preprocessed.dir = gsub(DATASET.NAME.PATTERN, dir.name, + DATASET.PREPROCESSED.DIR) + preprocessed.file.path = file.path(preprocessed.dir, + DATASET.PREPROCESSED.OUTPUT.FILE) dataset = readRDS(preprocessed.file.path) @@ -66,11 +64,11 @@ for (dir.name in dir(PATH_DATASETS)) cat("**Predictors**:\n\n") - df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset)-1)], - function(f){paste(class(f), collapse=" ")}))) + df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset) - 1)], + function(f){paste(class(f), collapse = " ")}))) colnames(df.pred) = c("Type", "Frequency") - 
cat(knitr::kable(df.pred, format="markdown"), sep="\n") + cat(knitr::kable(df.pred, format = "markdown"), sep = "\n") cat("\n") perc.classes = sort(round(100*as.numeric( @@ -79,14 +77,14 @@ for (dir.name in dir(PATH_DATASETS)) cat("**Class imbalance**:\n\n") - cat(knitr::kable(data.frame(A=c(paste(perc.classes[1], "%"), num.classes[1]), - B=c(paste(perc.classes[2], "%"), num.classes[2])), - format="markdown", col.names=c("class A", " class B"), - align=c("c", "c")), - sep="\n") - + cat(knitr::kable(data.frame(A = c(paste(perc.classes[1], "%"), + num.classes[1]), + B = c(paste(perc.classes[2], "%"), + num.classes[2])), + format = "markdown", col.names = c("class A", "class B"), + align = c("c", "c")), + sep = "\n") cat("\n---\n\n") } ``` - diff --git a/utils.R b/utils.R index 709ce84..4f421ba 100644 --- a/utils.R +++ b/utils.R @@ -1,19 +1,6 @@ -library(futile.logger) - -mkdir = function(dest.dir) +print.dataset.statistics = function(dataset) { - if (!dir.exists(dest.dir)) - { - flog.debug(paste("Creating directory", dest.dir)) - dir.create(dest.dir) - } else { - flog.debug(paste("Target directory", dest.dir, "already exists")) - } -} - -printDatasetStatistics = function(dataset) -{ - if (ncol(dataset)==0) # for mockups + if (ncol(dataset) == 0) # for mockups { flog.warn("Empty dataset") return() @@ -21,8 +8,19 @@ printDatasetStatistics = function(dataset) no.cases = nrow(dataset) no.attributes = ncol(dataset) - 1 - perc.classes = round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0) + perc.classes = + round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0) flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes, ", classes: ", perc.classes[1], "%/", perc.classes[2], "%")) } + +setup.logger = function(output.file) +{ + if (LOGGER.OVERWRITE.EXISTING.FILES & file.exists(output.file)) + { + file.remove(output.file) + } + + invisible(flog.appender(appender.tee(output.file))) +}