From 982b3b6f9d750edb2ddbb28429cf0568f8e151d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Sun, 17 Jul 2016 02:35:55 +0200 Subject: [PATCH] refactoring, checkpoint cleanup and snapshot update --- .gitignore | 3 + README.md | 84 +++++++++---------- config.R | 58 ++++++++----- data-collection/bank-marketing/preprocess.R | 40 ++++----- .../preprocess.R | 7 +- .../preprocess.R | 15 ++-- data-collection/cardiotocography/preprocess.R | 65 +++++++------- data-collection/credit-card/preprocess.R | 72 +++++++--------- data-collection/indian-liver/preprocess.R | 8 +- data-collection/magic/preprocess.R | 5 +- data-collection/seismic-bumps/preprocess.R | 39 ++++----- data-collection/spambase/preprocess.R | 9 +- data-collection/wine-quality/preprocess.R | 27 +++--- data-download.R | 49 ----------- data-preprocess.R | 64 -------------- init.R | 64 ++++++++++++++ s1-download-data.R | 48 +++++++++++ s2-preprocess-data.R | 61 ++++++++++++++ readme-make.Rmd => s3-make-readme.Rmd | 42 +++++----- utils.R | 30 ++++--- 20 files changed, 429 insertions(+), 361 deletions(-) delete mode 100644 data-download.R delete mode 100644 data-preprocess.R create mode 100644 init.R create mode 100644 s1-download-data.R create mode 100644 s2-preprocess-data.R rename readme-make.Rmd => s3-make-readme.Rmd (61%) diff --git a/.gitignore b/.gitignore index 9096c7e..d741a93 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ data-collection/*/preprocessed/* # markdown outputs *.html .Rproj.user + +# logger outputs +*.log diff --git a/README.md b/README.md index e189944..9024e13 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ Andrzej Wójtowicz -Document generation date: 2016-07-13 13:45:45. - +Document generation date: 2016-07-17 02:31:21. # Table of Contents @@ -70,10 +69,10 @@ S. Moro, P. Cortez and P. Rita. 
A Data-Driven Approach to Predict the Success of **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 12 % | 88 % | -| 5021 | 38172 | +| class A | class B | +|:-------:|:-------:| +| 12 % | 88 % | +| 5021 | 38172 | --- @@ -140,10 +139,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 37 % | 63 % | -| 212 | 357 | +| class A | class B | +|:-------:|:-------:| +| 37 % | 63 % | +| 212 | 357 | --- @@ -188,10 +187,10 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 35 % | 65 % | -| 239 | 444 | +| class A | class B | +|:-------:|:-------:| +| 35 % | 65 % | +| 239 | 444 | --- @@ -258,10 +257,10 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 22 % | 78 % | -| 471 | 1655 | +| class A | class B | +|:-------:|:-------:| +| 22 % | 78 % | +| 471 | 1655 | --- @@ -320,10 +319,10 @@ Yeh, I. C., & Lien, C. H. (2009). 
The comparisons of data mining techniques for **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 22 % | 78 % | -| 6636 | 23364 | +| class A | class B | +|:-------:|:-------:| +| 22 % | 78 % | +| 6636 | 23364 | --- @@ -371,10 +370,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 29 % | 71 % | -| 167 | 416 | +| class A | class B | +|:-------:|:-------:| +| 29 % | 71 % | +| 167 | 416 | --- @@ -421,10 +420,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 35 % | 65 % | -| 6688 | 12332 | +| class A | class B | +|:-------:|:-------:| +| 35 % | 65 % | +| 6688 | 12332 | --- @@ -475,10 +474,10 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 7 % | 93 % | -| 170 | 2414 | +| class A | class B | +|:-------:|:-------:| +| 7 % | 93 % | +| 170 | 2414 | --- @@ -574,10 +573,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 39 % | 61 % | -| 1813 | 2788 | +| class A | class B | +|:-------:|:-------:| +| 39 % | 61 % | +| 1813 | 2788 | --- @@ -627,10 +626,9 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 
Modeling wine preferen **Class imbalance**: -| class A | class B | -|:-------:|:--------:| -| 37 % | 63 % | -| 2384 | 4113 | +| class A | class B | +|:-------:|:-------:| +| 37 % | 63 % | +| 2384 | 4113 | --- - diff --git a/config.R b/config.R index ce2b693..7042dec 100644 --- a/config.R +++ b/config.R @@ -1,28 +1,48 @@ -# ---- checkpoint ---- +# ---- config ---- -CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/" -CHECKPOINT.SNAPSHOT.DATE = "2016-04-10" +# randomization and output files -library(checkpoint) -options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL) -checkpoint(CHECKPOINT.SNAPSHOT.DATE) +SEED = 1337 +OVERWRITE.OUTPUT.FILES = TRUE # overwrite downloaded and created datasets -# ---- logger ---- +# extra user configuration and init -LOGGER_LEVEL = futile.logger::INFO +USER.CONFIG.FILE = "config.R.user" +USER.INIT.FILE = "init.R.user" -library(futile.logger) -flog.threshold(LOGGER_LEVEL) +# checkpoint library -# ---- other ---- +CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/" +CHECKPOINT.SNAPSHOT.DATE = "2016-07-01" +CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url -PATH_DATASETS = "data-collection/" -PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/") -PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/") +# logging system -FILE_CONFIG_YAML = "config.yaml" -FILE_PREPROCESSING_SCRIPT = "preprocess.R" -FILE_PREPROCESSED_OUTPUT = "dataset.rds" +LOGGER.OUTPUT.S1.FILE = "output-s1.log" +LOGGER.OUTPUT.S2.FILE = "output-s2.log" +LOGGER.LEVEL = 6 # futile.logger::INFO +LOGGER.OVERWRITE.EXISTING.FILES = TRUE -if (file.exists("config.R.user")) - source("config.R.user") +# datasets + +DATASETS.DIR = "data-collection" + +DATASET.NAME.PATTERN = "DS-NAME" +DATASET.ORIGINAL.DIR = + file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "original") +DATASET.PREPROCESSED.DIR = + file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "preprocessed") +DATASET.CONFIG.FILE = "config.yaml" +DATASET.PREPROCESSING.SCRIPT = "preprocess.R" 
+DATASET.PREPROCESSED.OUTPUT.FILE = "dataset.rds" + +# curl + +SSL.VERIFY.PEER = FALSE + +# load custom config + +if (file.exists(USER.CONFIG.FILE)) +{ + source(USER.CONFIG.FILE) +} diff --git a/data-collection/bank-marketing/preprocess.R b/data-collection/bank-marketing/preprocess.R index 7aa2c80..14513ee 100644 --- a/data-collection/bank-marketing/preprocess.R +++ b/data-collection/bank-marketing/preprocess.R @@ -1,44 +1,44 @@ -preprocessDataset = function() +preprocess.dataset = function() { - #set.seed(SEED) - temp.dir = tempdir() - zip.file = "bank.zip" + zip.file = "bank.zip" zip.dataset.path = "bank-full.csv" flog.debug(paste("Unzipping", zip.file)) - unzip(zipfile=paste0(orig.dir, zip.file), - files=zip.dataset.path, - exdir=temp.dir) + unzip(zipfile = file.path(orig.dir, zip.file), + files = zip.dataset.path, + exdir = temp.dir) flog.debug(paste("Loading", zip.dataset.path)) - dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";") + dataset = read.csv(file.path(temp.dir, zip.dataset.path), sep = ";") flog.debug("Preprocessing loaded dataset") dataset = dataset %>% select(-c(duration, default)) %>% filter(job != "unknown" & marital != "unknown" & education != "unknown" & - education != "unknown" & housing != "unknown" & loan != "unknown") %>% + education != "unknown" & housing != "unknown" & + loan != "unknown") %>% droplevels() dataset = dataset %>% mutate( - education=factor(education, levels=c("primary", "secondary", - "tertiary"), - ordered=TRUE), - month=factor(month, levels=c("jan", "feb", "mar", - "apr", "may", "jun", - "jul", "aug", "sep", - "oct", "nov", "dec"), - ordered=TRUE), - pdays.bin=revalue(factor(pdays==-1), - c("TRUE"="never", "FALSE"="successful")), - pdays=as.integer(replace(pdays, pdays==-1, 999))) %>% + education = factor(education, + levels = c("primary", "secondary", "tertiary"), + ordered = TRUE), + month = factor(month, + levels = c("jan", "feb", "mar", "apr", "may", "jun", + "jul", "aug", "sep", "oct", "nov", 
"dec"), + ordered = TRUE), + pdays.bin = revalue(factor(pdays == -1), + c("TRUE" = "never", "FALSE" = "successful")), + pdays = as.integer(replace(pdays, pdays == -1, 999))) %>% select(age:pdays, pdays.bin, previous:y) + unlink("temp.dir", recursive = TRUE) + return(dataset) } \ No newline at end of file diff --git a/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R b/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R index 8c1ad2e..9883013 100644 --- a/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R +++ b/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R @@ -1,8 +1,8 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "wdbc.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("id", "diagnosis", apply(expand.grid(c("radius", "texture", "perimeter", @@ -12,7 +12,8 @@ preprocessDataset = function() c("mean", "se", "worst")), 1, function(x){paste(x[2], x[1])})) - dataset = dataset %>% select(`mean radius`:`worst fractal dimension`, diagnosis) + dataset = dataset %>% + select(`mean radius`:`worst fractal dimension`, diagnosis) return(dataset) } \ No newline at end of file diff --git a/data-collection/breast-cancer-wisconsin-original/preprocess.R b/data-collection/breast-cancer-wisconsin-original/preprocess.R index ce78758..69105fb 100644 --- a/data-collection/breast-cancer-wisconsin-original/preprocess.R +++ b/data-collection/breast-cancer-wisconsin-original/preprocess.R @@ -1,8 +1,8 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "breast-cancer-wisconsin.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", @@ -10,11 +10,12 @@ 
preprocessDataset = function() "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class") - dataset = dataset %>% select(-`Sample code number`) %>% - filter(`Bare Nuclei` != "?") %>% - mutate(Class=factor(Class), - `Bare Nuclei`=as.integer(`Bare Nuclei`)) %>% - droplevels() + dataset = dataset %>% + select(-`Sample code number`) %>% + filter(`Bare Nuclei` != "?") %>% + mutate(Class = factor(Class), + `Bare Nuclei` = as.integer(`Bare Nuclei`)) %>% + droplevels() return(dataset) } \ No newline at end of file diff --git a/data-collection/cardiotocography/preprocess.R b/data-collection/cardiotocography/preprocess.R index 1fa46db..458651c 100644 --- a/data-collection/cardiotocography/preprocess.R +++ b/data-collection/cardiotocography/preprocess.R @@ -1,40 +1,41 @@ -preprocessDataset = function() +preprocess.dataset = function() { xls.file = "CTG.xls" - wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) - dataset = readWorksheet(wk, sheet="Raw Data") + wk = loadWorkbook(file.path(orig.dir, xls.file)) + dataset = readWorksheet(wk, sheet = "Raw Data") dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>% - filter(complete.cases(.)) %>% - mutate(LB=as.integer(LB), - AC=as.integer(AC), - FM=as.integer(FM), - UC=as.integer(UC), - ASTV=as.integer(ASTV), - ALTV=as.integer(ALTV), - DL=as.integer(DL), - DP=as.integer(DP), - Width=as.integer(Width), - Min=as.integer(Min), - Max=as.integer(Max), - Nmax=as.integer(Nmax), - Nzeros=as.integer(Nzeros), - Mode=as.integer(Mode), - Mean=as.integer(Mean), - Median=as.integer(Median), - Variance=as.integer(Variance), - Tendency=factor(Tendency, levels=c(-1,0,1), ordered=TRUE), - A=factor(A), - B=factor(B), - C=factor(C), - D=factor(D), - E=factor(E), - AD=factor(AD), - DE=factor(DE), - LD=factor(LD), - FS=factor(FS), - NSP=factor(replace(NSP, NSP==2, 3))) + filter(complete.cases(.)) %>% + mutate(LB = as.integer(LB), + AC = as.integer(AC), + FM = as.integer(FM), + UC = as.integer(UC), + ASTV = as.integer(ASTV), + ALTV = 
as.integer(ALTV), + DL = as.integer(DL), + DP = as.integer(DP), + Width = as.integer(Width), + Min = as.integer(Min), + Max = as.integer(Max), + Nmax = as.integer(Nmax), + Nzeros = as.integer(Nzeros), + Mode = as.integer(Mode), + Mean = as.integer(Mean), + Median = as.integer(Median), + Variance = as.integer(Variance), + Tendency = factor(Tendency, levels = c(-1, 0, 1), + ordered = TRUE), + A = factor(A), + B = factor(B), + C = factor(C), + D = factor(D), + E = factor(E), + AD = factor(AD), + DE = factor(DE), + LD = factor(LD), + FS = factor(FS), + NSP = factor(replace(NSP, NSP == 2, 3))) return(dataset) } \ No newline at end of file diff --git a/data-collection/credit-card/preprocess.R b/data-collection/credit-card/preprocess.R index 9e2ffeb..2cd44e0 100644 --- a/data-collection/credit-card/preprocess.R +++ b/data-collection/credit-card/preprocess.R @@ -1,48 +1,40 @@ -preprocessDataset = function() +preprocess.dataset = function() { - #set.seed(SEED) - xls.file = "default of credit card clients.xls" - wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) - dataset = readWorksheet(wk, sheet="Data", startRow=2, startCol=2, - check.names=FALSE) + wk = loadWorkbook(file.path(orig.dir, xls.file)) + dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2, + check.names = FALSE) dataset = dataset %>% - mutate(LIMIT_BAL=as.integer(LIMIT_BAL), - SEX=factor(SEX), - EDUCATION=factor(EDUCATION), # can't order due to - # inconsistency with - # UCI description - MARRIAGE=factor(MARRIAGE), - AGE=as.integer(AGE), - PAY_0=as.integer(replace(PAY_0, PAY_0<0, 0)), - PAY_2=as.integer(replace(PAY_2, PAY_2<0, 0)), - PAY_3=as.integer(replace(PAY_3, PAY_3<0, 0)), - PAY_4=as.integer(replace(PAY_4, PAY_4<0, 0)), - PAY_5=as.integer(replace(PAY_5, PAY_5<0, 0)), - PAY_6=as.integer(replace(PAY_6, PAY_6<0, 0)), - BILL_AMT1=as.integer(BILL_AMT1), - BILL_AMT2=as.integer(BILL_AMT2), - BILL_AMT3=as.integer(BILL_AMT3), - BILL_AMT4=as.integer(BILL_AMT4), - BILL_AMT5=as.integer(BILL_AMT5), 
- BILL_AMT6=as.integer(BILL_AMT6), - PAY_AMT1=as.integer(PAY_AMT1), - PAY_AMT2=as.integer(PAY_AMT2), - PAY_AMT3=as.integer(PAY_AMT3), - PAY_AMT4=as.integer(PAY_AMT4), - PAY_AMT5=as.integer(PAY_AMT5), - PAY_AMT6=as.integer(PAY_AMT6), - `default payment next month`=factor( - `default payment next month`) - ) - - #dataset.1 = dataset %>% filter(`default payment next month` == 1) - #dataset.0 = dataset %>% filter(`default payment next month` == 0) %>% - # sample_n(nrow(dataset.1)) - # - #dataset = rbind(dataset.0, dataset.1) + mutate(LIMIT_BAL = as.integer(LIMIT_BAL), + SEX = factor(SEX), + EDUCATION = factor(EDUCATION), # can not order due to + # inconsistency with + # UCI description + MARRIAGE = factor(MARRIAGE), + AGE = as.integer(AGE), + PAY_0 = as.integer(replace(PAY_0, PAY_0 < 0, 0)), + PAY_2 = as.integer(replace(PAY_2, PAY_2 < 0, 0)), + PAY_3 = as.integer(replace(PAY_3, PAY_3 < 0, 0)), + PAY_4 = as.integer(replace(PAY_4, PAY_4 < 0, 0)), + PAY_5 = as.integer(replace(PAY_5, PAY_5 < 0, 0)), + PAY_6 = as.integer(replace(PAY_6, PAY_6 < 0, 0)), + BILL_AMT1 = as.integer(BILL_AMT1), + BILL_AMT2 = as.integer(BILL_AMT2), + BILL_AMT3 = as.integer(BILL_AMT3), + BILL_AMT4 = as.integer(BILL_AMT4), + BILL_AMT5 = as.integer(BILL_AMT5), + BILL_AMT6 = as.integer(BILL_AMT6), + PAY_AMT1 = as.integer(PAY_AMT1), + PAY_AMT2 = as.integer(PAY_AMT2), + PAY_AMT3 = as.integer(PAY_AMT3), + PAY_AMT4 = as.integer(PAY_AMT4), + PAY_AMT5 = as.integer(PAY_AMT5), + PAY_AMT6 = as.integer(PAY_AMT6), + `default payment next month` = + factor(`default payment next month`) + ) return(dataset) } \ No newline at end of file diff --git a/data-collection/indian-liver/preprocess.R b/data-collection/indian-liver/preprocess.R index c149efa..773a047 100644 --- a/data-collection/indian-liver/preprocess.R +++ b/data-collection/indian-liver/preprocess.R @@ -1,14 +1,14 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "Indian Liver Patient Dataset (ILPD).csv" - dataset = 
read.csv(paste0(orig.dir, "/", csv.file), - header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt", "Sgot", "TP", "ALB", "A/G Ratio", "Selector") - dataset = dataset %>% mutate(Selector=factor(Selector)) + dataset = dataset %>% + mutate(Selector = factor(Selector)) return(dataset) } \ No newline at end of file diff --git a/data-collection/magic/preprocess.R b/data-collection/magic/preprocess.R index 5abe672..e95183b 100644 --- a/data-collection/magic/preprocess.R +++ b/data-collection/magic/preprocess.R @@ -1,9 +1,8 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "magic04.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), - header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE) colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", diff --git a/data-collection/seismic-bumps/preprocess.R b/data-collection/seismic-bumps/preprocess.R index a4b1e5d..c1e203c 100644 --- a/data-collection/seismic-bumps/preprocess.R +++ b/data-collection/seismic-bumps/preprocess.R @@ -1,30 +1,23 @@ -preprocessDataset = function() +preprocess.dataset = function() { - #set.seed(SEED) - arff.file = "seismic-bumps.arff" - dataset = read.arff(paste0(orig.dir, "/", arff.file)) + dataset = read.arff(file.path(orig.dir, arff.file)) - dataset = dataset %>% select(-c(nbumps6:nbumps89)) %>% - mutate(genergy=as.integer(genergy), - gpuls=as.integer(gpuls), - gdenergy=as.integer(gdenergy), - gdpuls=as.integer(gdpuls), - nbumps=as.integer(nbumps), - nbumps2=as.integer(nbumps2), - nbumps3=as.integer(nbumps3), - nbumps4=as.integer(nbumps4), - nbumps5=as.integer(nbumps5), - energy=as.integer(energy), - maxenergy=as.integer(maxenergy) - ) - - #dataset.1 = dataset %>% filter(class == "1") - #dataset.0 = dataset %>% filter(class == "0") %>% - # sample_n(nrow(dataset.1)*4) - # - 
#dataset = rbind(dataset.0, dataset.1) + dataset = dataset %>% + select(-c(nbumps6:nbumps89)) %>% + mutate(genergy = as.integer(genergy), + gpuls = as.integer(gpuls), + gdenergy = as.integer(gdenergy), + gdpuls = as.integer(gdpuls), + nbumps = as.integer(nbumps), + nbumps2 = as.integer(nbumps2), + nbumps3 = as.integer(nbumps3), + nbumps4 = as.integer(nbumps4), + nbumps5 = as.integer(nbumps5), + energy = as.integer(energy), + maxenergy = as.integer(maxenergy) + ) return(dataset) } \ No newline at end of file diff --git a/data-collection/spambase/preprocess.R b/data-collection/spambase/preprocess.R index d579197..b8ef320 100644 --- a/data-collection/spambase/preprocess.R +++ b/data-collection/spambase/preprocess.R @@ -1,9 +1,9 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file = "spambase.data" - dataset = read.csv(paste0(orig.dir, "/", csv.file), - header=FALSE) + dataset = read.csv(file.path(orig.dir, csv.file), + header = FALSE) colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our", "word_freq_over", @@ -27,7 +27,8 @@ preprocessDataset = function() "capital_run_length_longest", "capital_run_length_total", "class") - dataset = dataset %>% mutate(class=factor(class)) + dataset = dataset %>% + mutate(class = factor(class)) return(dataset) } \ No newline at end of file diff --git a/data-collection/wine-quality/preprocess.R b/data-collection/wine-quality/preprocess.R index 2ff5c03..d9ad34a 100644 --- a/data-collection/wine-quality/preprocess.R +++ b/data-collection/wine-quality/preprocess.R @@ -1,21 +1,24 @@ -preprocessDataset = function() +preprocess.dataset = function() { csv.file.w = "winequality-white.csv" csv.file.r = "winequality-red.csv" - dataset.w = read.csv(paste0(orig.dir, "/", csv.file.w), sep=";", - check.names=FALSE) - dataset.w = dataset.w %>% mutate(color="white") + dataset.w = read.csv(file.path(orig.dir, csv.file.w), sep = ";", + check.names = FALSE) + dataset.w = 
dataset.w %>% + mutate(color = "white") - dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep=";", - check.names=FALSE) - dataset.r = dataset.r %>% mutate(color="red") + dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";", + check.names = FALSE) + dataset.r = dataset.r %>% + mutate(color = "red") - dataset = rbind(dataset.w, dataset.r) %>% - mutate(color=factor(color), - quality=ifelse(quality>5, 1, 0)) %>% - select(`fixed acidity`:alcohol, color, quality) %>% - mutate(quality=factor(quality)) + dataset = + rbind(dataset.w, dataset.r) %>% + mutate(color = factor(color), + quality = ifelse(quality > 5, 1, 0)) %>% + select(`fixed acidity`:alcohol, color, quality) %>% + mutate(quality = factor(quality)) return(dataset) } \ No newline at end of file diff --git a/data-download.R b/data-download.R deleted file mode 100644 index 7867c36..0000000 --- a/data-download.R +++ /dev/null @@ -1,49 +0,0 @@ -rm(list=ls()) - -source("config.R") -source("utils.R") - -library(RCurl) -library(tools) -library(yaml) - -flog.info("Started downloading dataset collection") - -for (dir.name in dir(PATH_DATASETS)) -{ - flog.info(paste("Dataset:", dir.name)) - - dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL) - config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) - - urls.list = yaml.load_file(config.yaml.file)$urls - - mkdir(dest.dir) - - for (url in urls.list) - { - flog.info(paste("URL:", url)) - - dest.file = URLdecode(basename(url)) - dest.file.path = paste0(dest.dir, dest.file) - - if (file.exists(dest.file.path)) - { - flog.warn(paste("Target file", basename(dest.file.path), - "already exists; skipping...")) - next - } - - tryCatch( - raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)), - error = function(e){flog.error(e); stop(e)} - ) - - writeBin(raw.content, dest.file.path) - - } - - flog.info("*****") -} - -flog.info("Finished downloading dataset collection") diff --git a/data-preprocess.R b/data-preprocess.R 
deleted file mode 100644 index 68eeaac..0000000 --- a/data-preprocess.R +++ /dev/null @@ -1,64 +0,0 @@ -rm(list=ls()) - -source("config.R") -source("utils.R") - -library(plyr) -library(dplyr) -library(foreign) -library(XLConnect) - -flog.info("Started preprocessing dataset collection") - -for (dir.name in dir(PATH_DATASETS)) -{ - flog.info(paste("Dataset:", dir.name)) - - orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL) - dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED) - dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT) - - if (file.exists(dest.file.path)) - { - flog.warn(paste("Target file", basename(dest.file.path), - "already exists; skipping...")) - flog.info("*****") - next - } - - r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT) - - source(r.src.file) - dataset = preprocessDataset() # custom per-dataset preprocessing - - # change column names - colnames(dataset) = tolower( - make.names( - gsub("^\\.|\\.$", "", colnames(dataset)), - unique=TRUE, allow_=FALSE)) - - # change factor levels - for (name in colnames(dataset)) - { - if (any(class(dataset[[name]]) == "factor")) - { - levels(dataset[[name]]) = tolower( - make.names( - gsub("^\\.|\\.$", "", - levels(dataset[[name]])), - unique=TRUE, allow_=FALSE)) - - } - } - - printDatasetStatistics(dataset) - - mkdir(dest.dir) - saveRDS(dataset, dest.file.path) - - flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT)) - - flog.info("*****") -} - -flog.info("Finished preprocessing dataset collection") \ No newline at end of file diff --git a/init.R b/init.R new file mode 100644 index 0000000..b34d781 --- /dev/null +++ b/init.R @@ -0,0 +1,64 @@ +# ---- init ---- + +# clear envirionment + +rm(list = ls()) + +# load setup variables + +source("config.R") + +# set randomization + +set.seed(SEED) + +# load library management system + +library(checkpoint) + +if (CHECKPOINT.QUICK.LOAD) # approx. 
x10 faster checkpoint library loading +{ + # assume https + options(checkpoint.mranUrl = CHECKPOINT.MRAN.URL) + # disable url checking + assignInNamespace("is.404", function(mran, warn = TRUE) { FALSE }, + "checkpoint") +} + +# knitr fix +checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = FALSE) +if (system.file(package = "knitr") == "") +{ + install.packages("knitr") +} + +# actual checkpoint loading +checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = TRUE) + +# load logging system + +library(futile.logger) + +flog.threshold(LOGGER.LEVEL) + +# load libraries + +library(RCurl) +library(tools) +library(yaml) + +library(plyr) +library(dplyr) +library(foreign) +library(XLConnect) + +# load helper functions + +source("utils.R") + +# perform additional custom init + +if (file.exists(USER.INIT.FILE)) +{ + source(USER.INIT.FILE) +} diff --git a/s1-download-data.R b/s1-download-data.R new file mode 100644 index 0000000..2b602e4 --- /dev/null +++ b/s1-download-data.R @@ -0,0 +1,48 @@ +# ---- download-data ---- + +source("init.R") +source("utils.R") + +setup.logger(LOGGER.OUTPUT.S1.FILE) + +flog.info("Step 1: download dataset collection") + +for (dir.name in dir(DATASETS.DIR)) +{ + flog.info(paste("Dataset:", dir.name)) + + dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR) + config.yaml.file = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE) + + urls.list = yaml.load_file(config.yaml.file)$urls + + if (!dir.exists(dest.dir)) + { + dir.create(dest.dir) + } + + for (url in urls.list) + { + flog.info(paste("URL:", url)) + + dest.file = URLdecode(basename(url)) + dest.file.path = file.path(dest.dir, dest.file) + + if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES) + { + tryCatch( + raw.content <- + getBinaryURL(url, .opts = curlOptions(ssl.verifypeer = + SSL.VERIFY.PEER)), + error = function(e){flog.error(e); stop(e)} + ) + + writeBin(raw.content, dest.file.path) + } else { + flog.warn(paste("Target file", 
basename(dest.file.path), + "already exists, skipping")) + } + } + + flog.info(paste(rep("*", 25), collapse = "")) +} diff --git a/s2-preprocess-data.R b/s2-preprocess-data.R new file mode 100644 index 0000000..da36b51 --- /dev/null +++ b/s2-preprocess-data.R @@ -0,0 +1,61 @@ +# ---- preprocess-data ---- + +source("init.R") +source("utils.R") + +setup.logger(LOGGER.OUTPUT.S2.FILE) + +flog.info("Step 2: preprocess dataset collection") + +for (dir.name in dir(DATASETS.DIR)) +{ + flog.info(paste("Dataset:", dir.name)) + + orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR) + dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR) + dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE) + + if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES) + { + r.src.file = file.path(DATASETS.DIR, dir.name, + DATASET.PREPROCESSING.SCRIPT) + source(r.src.file) + dataset = preprocess.dataset() # custom per-dataset preprocessing + + # change column names + colnames(dataset) = tolower( + make.names( + gsub("^\\.|\\.$", "", colnames(dataset)), + unique = TRUE, allow_ = FALSE)) + + # change factor levels + for (name in colnames(dataset)) + { + if (any(class(dataset[[name]]) == "factor")) + { + levels(dataset[[name]]) = tolower( + make.names( + gsub("^\\.|\\.$", "", + levels(dataset[[name]])), + unique = TRUE, allow_ = FALSE)) + } + } + + print.dataset.statistics(dataset) + + if (!dir.exists(dest.dir)) + { + dir.create(dest.dir) + } + + saveRDS(dataset, dest.file.path) + + flog.info(paste("Created preprocessed file", + DATASET.PREPROCESSED.OUTPUT.FILE)) + } else { + flog.warn(paste("Target file", basename(dest.file.path), + "already exists, skipping")) + } + + flog.info(paste(rep("*", 25), collapse = "")) +} diff --git a/readme-make.Rmd b/s3-make-readme.Rmd similarity index 61% rename from readme-make.Rmd rename to s3-make-readme.Rmd index 0c875f6..7b968b3 100644 --- a/readme-make.Rmd +++ b/s3-make-readme.Rmd @@ -7,35 +7,31 @@ 
output: --- ```{r global-options, include=FALSE} -knitr::opts_chunk$set(comment="", echo=FALSE, - warning=FALSE, message=FALSE) -source('config.R') +knitr::opts_chunk$set(comment = "", echo = FALSE, warning = FALSE, message = FALSE) +source('init.R') ``` Document generation date: `r Sys.time()`. - ```{r show-datasets, results='asis'} -library(yaml) cat("\n# Table of Contents\n\n") -for (dir.name in dir(PATH_DATASETS)) +for (dir.name in dir(DATASETS.DIR)) { - config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) + config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE) config.yaml = yaml.load_file(config.yaml.file.path) anchor = gsub(" ", "-", gsub("[[:punct:]]", "", tolower(config.yaml$name))) cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" )) - } cat("\n---\n\n") -for (dir.name in dir(PATH_DATASETS)) +for (dir.name in dir(DATASETS.DIR)) { - config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) + config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE) config.yaml = yaml.load_file(config.yaml.file.path) cat(paste("#", config.yaml$name, "\n\n")) @@ -55,8 +51,10 @@ for (dir.name in dir(PATH_DATASETS)) cat(paste("**Dataset**:\n\n")) - preprocessed.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED) - preprocessed.file.path = paste0(preprocessed.dir, FILE_PREPROCESSED_OUTPUT) + preprocessed.dir = gsub(DATASET.NAME.PATTERN, dir.name, + DATASET.PREPROCESSED.DIR) + preprocessed.file.path = file.path(preprocessed.dir, + DATASET.PREPROCESSED.OUTPUT.FILE) dataset = readRDS(preprocessed.file.path) @@ -66,11 +64,11 @@ for (dir.name in dir(PATH_DATASETS)) cat("**Predictors**:\n\n") - df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset)-1)], - function(f){paste(class(f), collapse=" ")}))) + df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset) - 1)], + function(f){paste(class(f), collapse = " ")}))) colnames(df.pred) = c("Type", "Frequency") - 
cat(knitr::kable(df.pred, format="markdown"), sep="\n") + cat(knitr::kable(df.pred, format = "markdown"), sep = "\n") cat("\n") perc.classes = sort(round(100*as.numeric( @@ -79,14 +77,14 @@ for (dir.name in dir(PATH_DATASETS)) cat("**Class imbalance**:\n\n") - cat(knitr::kable(data.frame(A=c(paste(perc.classes[1], "%"), num.classes[1]), - B=c(paste(perc.classes[2], "%"), num.classes[2])), - format="markdown", col.names=c("class A", " class B"), - align=c("c", "c")), - sep="\n") - + cat(knitr::kable(data.frame(A = c(paste(perc.classes[1], "%"), + num.classes[1]), + B = c(paste(perc.classes[2], "%"), + num.classes[2])), + format = "markdown", col.names = c("class A", "class B"), + align = c("c", "c")), + sep = "\n") cat("\n---\n\n") } ``` - diff --git a/utils.R b/utils.R index 709ce84..4f421ba 100644 --- a/utils.R +++ b/utils.R @@ -1,19 +1,6 @@ -library(futile.logger) - -mkdir = function(dest.dir) +print.dataset.statistics = function(dataset) { - if (!dir.exists(dest.dir)) - { - flog.debug(paste("Creating directory", dest.dir)) - dir.create(dest.dir) - } else { - flog.debug(paste("Target directory", dest.dir, "already exists")) - } -} - -printDatasetStatistics = function(dataset) -{ - if (ncol(dataset)==0) # for mockups + if (ncol(dataset) == 0) # for mockups { flog.warn("Empty dataset") return() @@ -21,8 +8,19 @@ printDatasetStatistics = function(dataset) no.cases = nrow(dataset) no.attributes = ncol(dataset) - 1 - perc.classes = round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0) + perc.classes = + round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0) flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes, ", classes: ", perc.classes[1], "%/", perc.classes[2], "%")) } + +setup.logger = function(output.file) +{ + if (LOGGER.OVERWRITE.EXISTING.FILES & file.exists(output.file)) + { + file.remove(output.file) + } + + invisible(flog.appender(appender.tee(output.file))) +}