1
0
mirror of https://github.com/andre-wojtowicz/uci-ml-to-r.git synced 2024-11-21 15:50:28 +01:00

refactoring, checkpoint cleanup and snapshot update

This commit is contained in:
Andrzej Wójtowicz 2016-07-17 02:35:55 +02:00
parent c49a82db43
commit 982b3b6f9d
20 changed files with 429 additions and 361 deletions

3
.gitignore vendored
View File

@ -24,3 +24,6 @@ data-collection/*/preprocessed/*
# markdown outputs # markdown outputs
*.html *.html
.Rproj.user .Rproj.user
# logger outputs
*.log

View File

@ -3,8 +3,7 @@ Andrzej Wójtowicz
Document generation date: 2016-07-13 13:45:45. Document generation date: 2016-07-17 02:31:21.
# Table of Contents # Table of Contents
@ -71,7 +70,7 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 12 % | 88 % | | 12 % | 88 % |
| 5021 | 38172 | | 5021 | 38172 |
@ -141,7 +140,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 37 % | 63 % | | 37 % | 63 % |
| 212 | 357 | | 212 | 357 |
@ -189,7 +188,7 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming",
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 35 % | 65 % | | 35 % | 65 % |
| 239 | 444 | | 239 | 444 |
@ -259,7 +258,7 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 22 % | 78 % | | 22 % | 78 % |
| 471 | 1655 | | 471 | 1655 |
@ -321,7 +320,7 @@ Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 22 % | 78 % | | 22 % | 78 % |
| 6636 | 23364 | | 6636 | 23364 |
@ -372,7 +371,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 29 % | 71 % | | 29 % | 71 % |
| 167 | 416 | | 167 | 416 |
@ -422,7 +421,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 35 % | 65 % | | 35 % | 65 % |
| 6688 | 12332 | | 6688 | 12332 |
@ -476,7 +475,7 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 7 % | 93 % | | 7 % | 93 % |
| 170 | 2414 | | 170 | 2414 |
@ -575,7 +574,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 39 % | 61 % | | 39 % | 61 % |
| 1813 | 2788 | | 1813 | 2788 |
@ -628,9 +627,8 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferen
**Class imbalance**: **Class imbalance**:
| class A | class B | | class A | class B |
|:-------:|:--------:| |:-------:|:-------:|
| 37 % | 63 % | | 37 % | 63 % |
| 2384 | 4113 | | 2384 | 4113 |
--- ---

View File

@ -1,28 +1,48 @@
# ---- checkpoint ---- # ---- config ----
CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/" # randomization and output files
CHECKPOINT.SNAPSHOT.DATE = "2016-04-10"
library(checkpoint) SEED = 1337
options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL) OVERWRITE.OUTPUT.FILES = TRUE # overwrite downloaded and created datasets
checkpoint(CHECKPOINT.SNAPSHOT.DATE)
# ---- logger ---- # extra user configuration and init
LOGGER_LEVEL = futile.logger::INFO USER.CONFIG.FILE = "config.R.user"
USER.INIT.FILE = "init.R.user"
library(futile.logger) # checkpoint library
flog.threshold(LOGGER_LEVEL)
# ---- other ---- CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/"
CHECKPOINT.SNAPSHOT.DATE = "2016-07-01"
CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url
PATH_DATASETS = "data-collection/" # logging system
PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/")
PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/")
FILE_CONFIG_YAML = "config.yaml" LOGGER.OUTPUT.S1.FILE = "output-s1.log"
FILE_PREPROCESSING_SCRIPT = "preprocess.R" LOGGER.OUTPUT.S2.FILE = "output-s2.log"
FILE_PREPROCESSED_OUTPUT = "dataset.rds" LOGGER.LEVEL = 6 # futile.logger::INFO
LOGGER.OVERWRITE.EXISTING.FILES = TRUE
if (file.exists("config.R.user")) # datasets
source("config.R.user")
DATASETS.DIR = "data-collection"
DATASET.NAME.PATTERN = "DS-NAME"
DATASET.ORIGINAL.DIR =
file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "original")
DATASET.PREPROCESSED.DIR =
file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "preprocessed")
DATASET.CONFIG.FILE = "config.yaml"
DATASET.PREPROCESSING.SCRIPT = "preprocess.R"
DATASET.PREPROCESSED.OUTPUT.FILE = "dataset.rds"
# curl
SSL.VERIFY.PEER = FALSE
# load custom config
if (file.exists(USER.CONFIG.FILE))
{
source(USER.CONFIG.FILE)
}

View File

@ -1,7 +1,5 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
#set.seed(SEED)
temp.dir = tempdir() temp.dir = tempdir()
zip.file = "bank.zip" zip.file = "bank.zip"
@ -9,36 +7,38 @@ preprocessDataset = function()
flog.debug(paste("Unzipping", zip.file)) flog.debug(paste("Unzipping", zip.file))
unzip(zipfile=paste0(orig.dir, zip.file), unzip(zipfile = file.path(orig.dir, zip.file),
files = zip.dataset.path, files = zip.dataset.path,
exdir = temp.dir) exdir = temp.dir)
flog.debug(paste("Loading", zip.dataset.path)) flog.debug(paste("Loading", zip.dataset.path))
dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";") dataset = read.csv(file.path(temp.dir, zip.dataset.path), sep = ";")
flog.debug("Preprocessing loaded dataset") flog.debug("Preprocessing loaded dataset")
dataset = dataset %>% dataset = dataset %>%
select(-c(duration, default)) %>% select(-c(duration, default)) %>%
filter(job != "unknown" & marital != "unknown" & education != "unknown" & filter(job != "unknown" & marital != "unknown" & education != "unknown" &
education != "unknown" & housing != "unknown" & loan != "unknown") %>% education != "unknown" & housing != "unknown" &
loan != "unknown") %>%
droplevels() droplevels()
dataset = dataset %>% dataset = dataset %>%
mutate( mutate(
education=factor(education, levels=c("primary", "secondary", education = factor(education,
"tertiary"), levels = c("primary", "secondary", "tertiary"),
ordered = TRUE), ordered = TRUE),
month=factor(month, levels=c("jan", "feb", "mar", month = factor(month,
"apr", "may", "jun", levels = c("jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "jul", "aug", "sep", "oct", "nov", "dec"),
"oct", "nov", "dec"),
ordered = TRUE), ordered = TRUE),
pdays.bin = revalue(factor(pdays == -1), pdays.bin = revalue(factor(pdays == -1),
c("TRUE" = "never", "FALSE" = "successful")), c("TRUE" = "never", "FALSE" = "successful")),
pdays = as.integer(replace(pdays, pdays == -1, 999))) %>% pdays = as.integer(replace(pdays, pdays == -1, 999))) %>%
select(age:pdays, pdays.bin, previous:y) select(age:pdays, pdays.bin, previous:y)
unlink("temp.dir", recursive = TRUE)
return(dataset) return(dataset)
} }

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "wdbc.data" csv.file = "wdbc.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
colnames(dataset) = c("id", "diagnosis", colnames(dataset) = c("id", "diagnosis",
apply(expand.grid(c("radius", "texture", "perimeter", apply(expand.grid(c("radius", "texture", "perimeter",
@ -12,7 +12,8 @@ preprocessDataset = function()
c("mean", "se", "worst")), c("mean", "se", "worst")),
1, function(x){paste(x[2], x[1])})) 1, function(x){paste(x[2], x[1])}))
dataset = dataset %>% select(`mean radius`:`worst fractal dimension`, diagnosis) dataset = dataset %>%
select(`mean radius`:`worst fractal dimension`, diagnosis)
return(dataset) return(dataset)
} }

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "breast-cancer-wisconsin.data" csv.file = "breast-cancer-wisconsin.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE) dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
colnames(dataset) = c("Sample code number", "Clump Thickness", colnames(dataset) = c("Sample code number", "Clump Thickness",
"Uniformity of Cell Size", "Uniformity of Cell Shape", "Uniformity of Cell Size", "Uniformity of Cell Shape",
@ -10,7 +10,8 @@ preprocessDataset = function()
"Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
"Mitoses", "Class") "Mitoses", "Class")
dataset = dataset %>% select(-`Sample code number`) %>% dataset = dataset %>%
select(-`Sample code number`) %>%
filter(`Bare Nuclei` != "?") %>% filter(`Bare Nuclei` != "?") %>%
mutate(Class = factor(Class), mutate(Class = factor(Class),
`Bare Nuclei` = as.integer(`Bare Nuclei`)) %>% `Bare Nuclei` = as.integer(`Bare Nuclei`)) %>%

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
xls.file = "CTG.xls" xls.file = "CTG.xls"
wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) wk = loadWorkbook(file.path(orig.dir, xls.file))
dataset = readWorksheet(wk, sheet = "Raw Data") dataset = readWorksheet(wk, sheet = "Raw Data")
dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>% dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>%
@ -24,7 +24,8 @@ preprocessDataset = function()
Mean = as.integer(Mean), Mean = as.integer(Mean),
Median = as.integer(Median), Median = as.integer(Median),
Variance = as.integer(Variance), Variance = as.integer(Variance),
Tendency=factor(Tendency, levels=c(-1,0,1), ordered=TRUE), Tendency = factor(Tendency, levels = c(-1, 0, 1),
ordered = TRUE),
A = factor(A), A = factor(A),
B = factor(B), B = factor(B),
C = factor(C), C = factor(C),

View File

@ -1,17 +1,15 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
#set.seed(SEED)
xls.file = "default of credit card clients.xls" xls.file = "default of credit card clients.xls"
wk = loadWorkbook(paste0(orig.dir, "/", xls.file)) wk = loadWorkbook(file.path(orig.dir, xls.file))
dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2, dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2,
check.names = FALSE) check.names = FALSE)
dataset = dataset %>% dataset = dataset %>%
mutate(LIMIT_BAL = as.integer(LIMIT_BAL), mutate(LIMIT_BAL = as.integer(LIMIT_BAL),
SEX = factor(SEX), SEX = factor(SEX),
EDUCATION=factor(EDUCATION), # can't order due to EDUCATION = factor(EDUCATION), # can not order due to
# inconsistency with # inconsistency with
# UCI description # UCI description
MARRIAGE = factor(MARRIAGE), MARRIAGE = factor(MARRIAGE),
@ -34,15 +32,9 @@ preprocessDataset = function()
PAY_AMT4 = as.integer(PAY_AMT4), PAY_AMT4 = as.integer(PAY_AMT4),
PAY_AMT5 = as.integer(PAY_AMT5), PAY_AMT5 = as.integer(PAY_AMT5),
PAY_AMT6 = as.integer(PAY_AMT6), PAY_AMT6 = as.integer(PAY_AMT6),
`default payment next month`=factor( `default payment next month` =
`default payment next month`) factor(`default payment next month`)
) )
#dataset.1 = dataset %>% filter(`default payment next month` == 1)
#dataset.0 = dataset %>% filter(`default payment next month` == 0) %>%
# sample_n(nrow(dataset.1))
#
#dataset = rbind(dataset.0, dataset.1)
return(dataset) return(dataset)
} }

View File

@ -1,14 +1,14 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "Indian Liver Patient Dataset (ILPD).csv" csv.file = "Indian Liver Patient Dataset (ILPD).csv"
dataset = read.csv(paste0(orig.dir, "/", csv.file), dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
header=FALSE)
colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt", colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt",
"Sgot", "TP", "ALB", "A/G Ratio", "Selector") "Sgot", "TP", "ALB", "A/G Ratio", "Selector")
dataset = dataset %>% mutate(Selector=factor(Selector)) dataset = dataset %>%
mutate(Selector = factor(Selector))
return(dataset) return(dataset)
} }

View File

@ -1,9 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "magic04.data" csv.file = "magic04.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
header=FALSE)
colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1", colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1",
"fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",

View File

@ -1,12 +1,11 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
#set.seed(SEED)
arff.file = "seismic-bumps.arff" arff.file = "seismic-bumps.arff"
dataset = read.arff(paste0(orig.dir, "/", arff.file)) dataset = read.arff(file.path(orig.dir, arff.file))
dataset = dataset %>% select(-c(nbumps6:nbumps89)) %>% dataset = dataset %>%
select(-c(nbumps6:nbumps89)) %>%
mutate(genergy = as.integer(genergy), mutate(genergy = as.integer(genergy),
gpuls = as.integer(gpuls), gpuls = as.integer(gpuls),
gdenergy = as.integer(gdenergy), gdenergy = as.integer(gdenergy),
@ -20,11 +19,5 @@ preprocessDataset = function()
maxenergy = as.integer(maxenergy) maxenergy = as.integer(maxenergy)
) )
#dataset.1 = dataset %>% filter(class == "1")
#dataset.0 = dataset %>% filter(class == "0") %>%
# sample_n(nrow(dataset.1)*4)
#
#dataset = rbind(dataset.0, dataset.1)
return(dataset) return(dataset)
} }

View File

@ -1,8 +1,8 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file = "spambase.data" csv.file = "spambase.data"
dataset = read.csv(paste0(orig.dir, "/", csv.file), dataset = read.csv(file.path(orig.dir, csv.file),
header = FALSE) header = FALSE)
colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all", colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all",
@ -27,7 +27,8 @@ preprocessDataset = function()
"capital_run_length_longest", "capital_run_length_total", "capital_run_length_longest", "capital_run_length_total",
"class") "class")
dataset = dataset %>% mutate(class=factor(class)) dataset = dataset %>%
mutate(class = factor(class))
return(dataset) return(dataset)
} }

View File

@ -1,17 +1,20 @@
preprocessDataset = function() preprocess.dataset = function()
{ {
csv.file.w = "winequality-white.csv" csv.file.w = "winequality-white.csv"
csv.file.r = "winequality-red.csv" csv.file.r = "winequality-red.csv"
dataset.w = read.csv(paste0(orig.dir, "/", csv.file.w), sep=";", dataset.w = read.csv(file.path(orig.dir, "/", csv.file.w), sep = ";",
check.names = FALSE) check.names = FALSE)
dataset.w = dataset.w %>% mutate(color="white") dataset.w = dataset.w %>%
mutate(color = "white")
dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";", dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";",
check.names = FALSE) check.names = FALSE)
dataset.r = dataset.r %>% mutate(color="red") dataset.r = dataset.r %>%
mutate(color = "red")
dataset = rbind(dataset.w, dataset.r) %>% dataset =
rbind(dataset.w, dataset.r) %>%
mutate(color = factor(color), mutate(color = factor(color),
quality = ifelse(quality > 5, 1, 0)) %>% quality = ifelse(quality > 5, 1, 0)) %>%
select(`fixed acidity`:alcohol, color, quality) %>% select(`fixed acidity`:alcohol, color, quality) %>%

View File

@ -1,49 +0,0 @@
# Legacy step-1 script (replaced by s1-download-data.R): downloads every
# dataset listed in each data-collection/*/config.yaml into that dataset's
# original/ directory.  Relies on globals defined in config.R
# (PATH_DATASETS, PATH_DATASET_ORIGINAL, FILE_CONFIG_YAML) and on mkdir()
# from utils.R.
rm(list=ls())
source("config.R")
source("utils.R")
library(RCurl)
library(tools)
library(yaml)
flog.info("Started downloading dataset collection")
# one subdirectory of PATH_DATASETS per dataset
for (dir.name in dir(PATH_DATASETS))
{
flog.info(paste("Dataset:", dir.name))
# PATH_DATASET_ORIGINAL contains a literal "*" placeholder for the name
dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)
# per-dataset config.yaml lists the source URLs under the `urls` key
urls.list = yaml.load_file(config.yaml.file)$urls
mkdir(dest.dir)
for (url in urls.list)
{
flog.info(paste("URL:", url))
# file name is taken from the URL's last path component
dest.file = URLdecode(basename(url))
# NOTE: paste0 concatenation assumes PATH_DATASET_ORIGINAL ends with "/"
dest.file.path = paste0(dest.dir, dest.file)
# already-downloaded files are never re-fetched
if (file.exists(dest.file.path))
{
flog.warn(paste("Target file", basename(dest.file.path),
"already exists; skipping..."))
next
}
# NOTE(review): ssl.verifypeer=FALSE disables TLS certificate checking;
# acceptable here only because the sources are public UCI mirrors
tryCatch(
raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)),
error = function(e){flog.error(e); stop(e)}
)
writeBin(raw.content, dest.file.path)
}
flog.info("*****")
}
flog.info("Finished downloading dataset collection")

View File

@ -1,64 +0,0 @@
# Legacy step-2 script (replaced by s2-preprocess-data.R): runs each
# dataset's preprocess.R, normalizes column names and factor levels, and
# saves the result as an .rds file in the dataset's preprocessed/ directory.
# Relies on globals from config.R and on mkdir()/printDatasetStatistics()
# from utils.R.
rm(list=ls())
source("config.R")
source("utils.R")
library(plyr)
library(dplyr)
library(foreign)
library(XLConnect)
flog.info("Started preprocessing dataset collection")
for (dir.name in dir(PATH_DATASETS))
{
flog.info(paste("Dataset:", dir.name))
# "*" in the path templates is a placeholder for the dataset name
orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT)
# existing outputs are never regenerated
if (file.exists(dest.file.path))
{
flog.warn(paste("Target file", basename(dest.file.path),
"already exists; skipping..."))
flog.info("*****")
next
}
# each dataset ships its own preprocess.R defining preprocessDataset();
# sourcing it redefines the function for this iteration
r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT)
source(r.src.file)
dataset = preprocessDataset() # custom per-dataset preprocessing
# change column names: strip leading/trailing dots, make syntactically
# valid unique names without underscores, then lowercase
colnames(dataset) = tolower(
make.names(
gsub("^\\.|\\.$", "", colnames(dataset)),
unique=TRUE, allow_=FALSE))
# change factor levels: apply the same normalization to every factor column
for (name in colnames(dataset))
{
if (any(class(dataset[[name]]) == "factor"))
{
levels(dataset[[name]]) = tolower(
make.names(
gsub("^\\.|\\.$", "",
levels(dataset[[name]])),
unique=TRUE, allow_=FALSE))
}
}
# log case/attribute/class-balance summary before saving
printDatasetStatistics(dataset)
mkdir(dest.dir)
saveRDS(dataset, dest.file.path)
flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT))
flog.info("*****")
}
flog.info("Finished preprocessing dataset collection")

64
init.R Normal file
View File

@ -0,0 +1,64 @@
# ---- init ----
# Project bootstrap: sourced by every step script.  Loads config.R, pins
# package versions via checkpoint, configures logging, and attaches all
# libraries used by the pipeline.  Statement order matters here.

# clear environment
rm(list = ls())

# load setup variables
source("config.R")

# set randomization
set.seed(SEED)

# load library management system

library(checkpoint)

if (CHECKPOINT.QUICK.LOAD) # approx. x10 faster checkpoint library loading
{
    # assume https
    options(checkpoint.mranUrl = CHECKPOINT.MRAN.URL)
    # disable url checking by stubbing out checkpoint's internal 404 probe
    assignInNamespace("is.404", function(mran, warn = TRUE) { FALSE },
                      "checkpoint")
}

# knitr fix: first pass skips package scanning so we can install knitr
# into the snapshot library if it is missing (needed before the full scan)
checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = FALSE)

if (system.file(package = "knitr") == "")
{
    install.packages("knitr")
}

# actual checkpoint loading
checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = TRUE)

# load logging system

library(futile.logger)

flog.threshold(LOGGER.LEVEL)

# load libraries

library(RCurl)
library(tools)
library(yaml)
library(plyr)
library(dplyr)
library(foreign)
library(XLConnect)

# load helper functions

source("utils.R")

# perform additional custom init

if (file.exists(USER.INIT.FILE))
{
    source(USER.INIT.FILE)
}

48
s1-download-data.R Normal file
View File

@ -0,0 +1,48 @@
# ---- download-data ----
# Step 1: download every dataset listed in each
# data-collection/*/config.yaml into that dataset's original/ directory.
# Uses globals from config.R (via init.R) and setup.logger() from utils.R.

source("init.R")
source("utils.R")

setup.logger(LOGGER.OUTPUT.S1.FILE)

flog.info("Step 1: download dataset collection")

# one subdirectory of DATASETS.DIR per dataset
for (dir.name in dir(DATASETS.DIR))
{
    flog.info(paste("Dataset:", dir.name))

    # DATASET.ORIGINAL.DIR contains the DATASET.NAME.PATTERN placeholder
    dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
    config.yaml.file = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
    # per-dataset config.yaml lists the source URLs under the `urls` key
    urls.list = yaml.load_file(config.yaml.file)$urls

    if (!dir.exists(dest.dir))
    {
        dir.create(dest.dir)
    }

    for (url in urls.list)
    {
        flog.info(paste("URL:", url))
        # file name is taken from the URL's last path component
        dest.file = URLdecode(basename(url))
        dest.file.path = file.path(dest.dir, dest.file)

        # scalar condition: use short-circuit || (was the vectorized |),
        # so OVERWRITE.OUTPUT.FILES is only consulted when the file exists
        if (!file.exists(dest.file.path) || OVERWRITE.OUTPUT.FILES)
        {
            # SSL.VERIFY.PEER comes from config.R; abort the run on any
            # download error after logging it
            tryCatch(
                raw.content <-
                    getBinaryURL(url, .opts = curlOptions(ssl.verifypeer =
                                                              SSL.VERIFY.PEER)),
                error = function(e){flog.error(e); stop(e)}
            )

            writeBin(raw.content, dest.file.path)
        } else {
            flog.warn(paste("Target file", basename(dest.file.path),
                            "already exists, skipping"))
        }
    }

    flog.info(paste(rep("*", 25), collapse = ""))
}

61
s2-preprocess-data.R Normal file
View File

@ -0,0 +1,61 @@
# ---- preprocess-data ----
# Step 2: run each dataset's preprocess.R, normalize column names and
# factor levels, and save the result as an .rds file in the dataset's
# preprocessed/ directory.  Uses globals from config.R (via init.R) and
# setup.logger()/print.dataset.statistics() from utils.R.

source("init.R")
source("utils.R")

setup.logger(LOGGER.OUTPUT.S2.FILE)

flog.info("Step 2: preprocess dataset collection")

for (dir.name in dir(DATASETS.DIR))
{
    flog.info(paste("Dataset:", dir.name))

    # path templates contain the DATASET.NAME.PATTERN placeholder
    orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
    dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR)
    dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE)

    # scalar condition: use short-circuit || (was the vectorized |),
    # so OVERWRITE.OUTPUT.FILES is only consulted when the file exists
    if (!file.exists(dest.file.path) || OVERWRITE.OUTPUT.FILES)
    {
        # each dataset ships its own preprocess.R defining
        # preprocess.dataset(); sourcing redefines it for this iteration
        r.src.file = file.path(DATASETS.DIR, dir.name,
                               DATASET.PREPROCESSING.SCRIPT)
        source(r.src.file)
        dataset = preprocess.dataset() # custom per-dataset preprocessing

        # change column names: strip leading/trailing dots, make unique
        # syntactically valid names without underscores, then lowercase
        colnames(dataset) = tolower(
                                make.names(
                                    gsub("^\\.|\\.$", "", colnames(dataset)),
                                    unique = TRUE, allow_ = FALSE))

        # change factor levels: same normalization for every factor column
        for (name in colnames(dataset))
        {
            if (any(class(dataset[[name]]) == "factor"))
            {
                levels(dataset[[name]]) = tolower(
                                            make.names(
                                                gsub("^\\.|\\.$", "",
                                                     levels(dataset[[name]])),
                                                unique = TRUE, allow_ = FALSE))
            }
        }

        # log case/attribute/class-balance summary before saving
        print.dataset.statistics(dataset)

        if (!dir.exists(dest.dir))
        {
            dir.create(dest.dir)
        }

        saveRDS(dataset, dest.file.path)

        flog.info(paste("Created preprocessed file",
                        DATASET.PREPROCESSED.OUTPUT.FILE))
    } else {
        flog.warn(paste("Target file", basename(dest.file.path),
                        "already exists, skipping"))
    }

    flog.info(paste(rep("*", 25), collapse = ""))
}

View File

@ -7,35 +7,31 @@ output:
--- ---
```{r global-options, include=FALSE} ```{r global-options, include=FALSE}
knitr::opts_chunk$set(comment="", echo=FALSE, knitr::opts_chunk$set(comment = "", echo = FALSE, warning = FALSE, message = FALSE)
warning=FALSE, message=FALSE) source('init.R')
source('config.R')
``` ```
Document generation date: `r Sys.time()`. Document generation date: `r Sys.time()`.
```{r show-datasets, results='asis'} ```{r show-datasets, results='asis'}
library(yaml)
cat("\n# Table of Contents\n\n") cat("\n# Table of Contents\n\n")
for (dir.name in dir(PATH_DATASETS)) for (dir.name in dir(DATASETS.DIR))
{ {
config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
config.yaml = yaml.load_file(config.yaml.file.path) config.yaml = yaml.load_file(config.yaml.file.path)
anchor = gsub(" ", "-", gsub("[[:punct:]]", "", anchor = gsub(" ", "-", gsub("[[:punct:]]", "",
tolower(config.yaml$name))) tolower(config.yaml$name)))
cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" )) cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" ))
} }
cat("\n---\n\n") cat("\n---\n\n")
for (dir.name in dir(PATH_DATASETS)) for (dir.name in dir(DATASETS.DIR))
{ {
config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML) config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
config.yaml = yaml.load_file(config.yaml.file.path) config.yaml = yaml.load_file(config.yaml.file.path)
cat(paste("#", config.yaml$name, "\n\n")) cat(paste("#", config.yaml$name, "\n\n"))
@ -55,8 +51,10 @@ for (dir.name in dir(PATH_DATASETS))
cat(paste("**Dataset**:\n\n")) cat(paste("**Dataset**:\n\n"))
preprocessed.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED) preprocessed.dir = gsub(DATASET.NAME.PATTERN, dir.name,
preprocessed.file.path = paste0(preprocessed.dir, FILE_PREPROCESSED_OUTPUT) DATASET.PREPROCESSED.DIR)
preprocessed.file.path = file.path(preprocessed.dir,
DATASET.PREPROCESSED.OUTPUT.FILE)
dataset = readRDS(preprocessed.file.path) dataset = readRDS(preprocessed.file.path)
@ -79,14 +77,14 @@ for (dir.name in dir(PATH_DATASETS))
cat("**Class imbalance**:\n\n") cat("**Class imbalance**:\n\n")
cat(knitr::kable(data.frame(A=c(paste(perc.classes[1], "%"), num.classes[1]), cat(knitr::kable(data.frame(A = c(paste(perc.classes[1], "%"),
B=c(paste(perc.classes[2], "%"), num.classes[2])), num.classes[1]),
B = c(paste(perc.classes[2], "%"),
num.classes[2])),
format = "markdown", col.names = c("class A", "class B"), format = "markdown", col.names = c("class A", "class B"),
align = c("c", "c")), align = c("c", "c")),
sep = "\n") sep = "\n")
cat("\n---\n\n") cat("\n---\n\n")
} }
``` ```

28
utils.R
View File

@ -1,17 +1,4 @@
library(futile.logger) print.dataset.statistics = function(dataset)
# Create dest.dir if it does not already exist, logging either action at
# debug level.  (Legacy utils.R helper.)
mkdir = function(dest.dir)
{
    if (dir.exists(dest.dir))
    {
        # nothing to do; note it for debugging
        flog.debug(paste("Target directory", dest.dir, "already exists"))
    } else {
        flog.debug(paste("Creating directory", dest.dir))
        dir.create(dest.dir)
    }
}
printDatasetStatistics = function(dataset)
{ {
if (ncol(dataset) == 0) # for mockups if (ncol(dataset) == 0) # for mockups
{ {
@ -21,8 +8,19 @@ printDatasetStatistics = function(dataset)
no.cases = nrow(dataset) no.cases = nrow(dataset)
no.attributes = ncol(dataset) - 1 no.attributes = ncol(dataset) - 1
perc.classes = round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0) perc.classes =
round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0)
flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes, flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
", classes: ", perc.classes[1], "%/", perc.classes[2], "%")) ", classes: ", perc.classes[1], "%/", perc.classes[2], "%"))
} }
# Route futile.logger output to a file in addition to the console.
#
# output.file - path of the log file to write to.
#
# When LOGGER.OVERWRITE.EXISTING.FILES (from config.R) is TRUE, an existing
# log file is removed first so each run starts from a fresh log.
setup.logger = function(output.file)
{
    # scalar condition: use short-circuit && (was the vectorized &), so
    # file.exists() is only evaluated when overwriting is enabled
    if (LOGGER.OVERWRITE.EXISTING.FILES && file.exists(output.file))
    {
        file.remove(output.file)
    }

    # appender.tee sends log records to both the console and output.file;
    # invisible() suppresses printing of the appender configuration
    invisible(flog.appender(appender.tee(output.file)))
}