mirror of
https://github.com/andre-wojtowicz/uci-ml-to-r.git
synced 2024-11-24 16:05:28 +01:00
refactoring, checkpoint cleanup and snapshot update
This commit is contained in:
parent
c49a82db43
commit
982b3b6f9d
3
.gitignore
vendored
3
.gitignore
vendored
@ -24,3 +24,6 @@ data-collection/*/preprocessed/*
|
|||||||
# markdown outputs
|
# markdown outputs
|
||||||
*.html
|
*.html
|
||||||
.Rproj.user
|
.Rproj.user
|
||||||
|
|
||||||
|
# logger outputs
|
||||||
|
*.log
|
||||||
|
84
README.md
84
README.md
@ -3,8 +3,7 @@ Andrzej Wójtowicz
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
Document generation date: 2016-07-13 13:45:45.
|
Document generation date: 2016-07-17 02:31:21.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Table of Contents
|
# Table of Contents
|
||||||
@ -70,10 +69,10 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 12 % | 88 % |
|
| 12 % | 88 % |
|
||||||
| 5021 | 38172 |
|
| 5021 | 38172 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -140,10 +139,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 37 % | 63 % |
|
| 37 % | 63 % |
|
||||||
| 212 | 357 |
|
| 212 | 357 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -188,10 +187,10 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming",
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 35 % | 65 % |
|
| 35 % | 65 % |
|
||||||
| 239 | 444 |
|
| 239 | 444 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -258,10 +257,10 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 22 % | 78 % |
|
| 22 % | 78 % |
|
||||||
| 471 | 1655 |
|
| 471 | 1655 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -320,10 +319,10 @@ Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 22 % | 78 % |
|
| 22 % | 78 % |
|
||||||
| 6636 | 23364 |
|
| 6636 | 23364 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -371,10 +370,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 29 % | 71 % |
|
| 29 % | 71 % |
|
||||||
| 167 | 416 |
|
| 167 | 416 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -421,10 +420,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 35 % | 65 % |
|
| 35 % | 65 % |
|
||||||
| 6688 | 12332 |
|
| 6688 | 12332 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -475,10 +474,10 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 7 % | 93 % |
|
| 7 % | 93 % |
|
||||||
| 170 | 2414 |
|
| 170 | 2414 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -574,10 +573,10 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 39 % | 61 % |
|
| 39 % | 61 % |
|
||||||
| 1813 | 2788 |
|
| 1813 | 2788 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -627,10 +626,9 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferen
|
|||||||
|
|
||||||
**Class imbalance**:
|
**Class imbalance**:
|
||||||
|
|
||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:--------:|
|
|:-------:|:-------:|
|
||||||
| 37 % | 63 % |
|
| 37 % | 63 % |
|
||||||
| 2384 | 4113 |
|
| 2384 | 4113 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
58
config.R
58
config.R
@ -1,28 +1,48 @@
|
|||||||
# ---- checkpoint ----
|
# ---- config ----
|
||||||
|
|
||||||
CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/"
|
# randomization and output files
|
||||||
CHECKPOINT.SNAPSHOT.DATE = "2016-04-10"
|
|
||||||
|
|
||||||
library(checkpoint)
|
SEED = 1337
|
||||||
options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL)
|
OVERWRITE.OUTPUT.FILES = TRUE # overwrite downloaded and created datasets
|
||||||
checkpoint(CHECKPOINT.SNAPSHOT.DATE)
|
|
||||||
|
|
||||||
# ---- logger ----
|
# extra user configuration and init
|
||||||
|
|
||||||
LOGGER_LEVEL = futile.logger::INFO
|
USER.CONFIG.FILE = "config.R.user"
|
||||||
|
USER.INIT.FILE = "init.R.user"
|
||||||
|
|
||||||
library(futile.logger)
|
# checkpoint library
|
||||||
flog.threshold(LOGGER_LEVEL)
|
|
||||||
|
|
||||||
# ---- other ----
|
CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/"
|
||||||
|
CHECKPOINT.SNAPSHOT.DATE = "2016-07-01"
|
||||||
|
CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url
|
||||||
|
|
||||||
PATH_DATASETS = "data-collection/"
|
# logging system
|
||||||
PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/")
|
|
||||||
PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/")
|
|
||||||
|
|
||||||
FILE_CONFIG_YAML = "config.yaml"
|
LOGGER.OUTPUT.S1.FILE = "output-s1.log"
|
||||||
FILE_PREPROCESSING_SCRIPT = "preprocess.R"
|
LOGGER.OUTPUT.S2.FILE = "output-s2.log"
|
||||||
FILE_PREPROCESSED_OUTPUT = "dataset.rds"
|
LOGGER.LEVEL = 6 # futile.logger::INFO
|
||||||
|
LOGGER.OVERWRITE.EXISTING.FILES = TRUE
|
||||||
|
|
||||||
if (file.exists("config.R.user"))
|
# datasets
|
||||||
source("config.R.user")
|
|
||||||
|
DATASETS.DIR = "data-collection"
|
||||||
|
|
||||||
|
DATASET.NAME.PATTERN = "DS-NAME"
|
||||||
|
DATASET.ORIGINAL.DIR =
|
||||||
|
file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "original")
|
||||||
|
DATASET.PREPROCESSED.DIR =
|
||||||
|
file.path(DATASETS.DIR, DATASET.NAME.PATTERN, "preprocessed")
|
||||||
|
DATASET.CONFIG.FILE = "config.yaml"
|
||||||
|
DATASET.PREPROCESSING.SCRIPT = "preprocess.R"
|
||||||
|
DATASET.PREPROCESSED.OUTPUT.FILE = "dataset.rds"
|
||||||
|
|
||||||
|
# curl
|
||||||
|
|
||||||
|
SSL.VERIFY.PEER = FALSE
|
||||||
|
|
||||||
|
# load custom config
|
||||||
|
|
||||||
|
if (file.exists(USER.CONFIG.FILE))
|
||||||
|
{
|
||||||
|
source(USER.CONFIG.FILE)
|
||||||
|
}
|
||||||
|
@ -1,44 +1,44 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
#set.seed(SEED)
|
|
||||||
|
|
||||||
temp.dir = tempdir()
|
temp.dir = tempdir()
|
||||||
|
|
||||||
zip.file = "bank.zip"
|
zip.file = "bank.zip"
|
||||||
zip.dataset.path = "bank-full.csv"
|
zip.dataset.path = "bank-full.csv"
|
||||||
|
|
||||||
flog.debug(paste("Unzipping", zip.file))
|
flog.debug(paste("Unzipping", zip.file))
|
||||||
|
|
||||||
unzip(zipfile=paste0(orig.dir, zip.file),
|
unzip(zipfile = file.path(orig.dir, zip.file),
|
||||||
files=zip.dataset.path,
|
files = zip.dataset.path,
|
||||||
exdir=temp.dir)
|
exdir = temp.dir)
|
||||||
|
|
||||||
flog.debug(paste("Loading", zip.dataset.path))
|
flog.debug(paste("Loading", zip.dataset.path))
|
||||||
|
|
||||||
dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";")
|
dataset = read.csv(file.path(temp.dir, zip.dataset.path), sep = ";")
|
||||||
|
|
||||||
flog.debug("Preprocessing loaded dataset")
|
flog.debug("Preprocessing loaded dataset")
|
||||||
|
|
||||||
dataset = dataset %>%
|
dataset = dataset %>%
|
||||||
select(-c(duration, default)) %>%
|
select(-c(duration, default)) %>%
|
||||||
filter(job != "unknown" & marital != "unknown" & education != "unknown" &
|
filter(job != "unknown" & marital != "unknown" & education != "unknown" &
|
||||||
education != "unknown" & housing != "unknown" & loan != "unknown") %>%
|
education != "unknown" & housing != "unknown" &
|
||||||
|
loan != "unknown") %>%
|
||||||
droplevels()
|
droplevels()
|
||||||
|
|
||||||
dataset = dataset %>%
|
dataset = dataset %>%
|
||||||
mutate(
|
mutate(
|
||||||
education=factor(education, levels=c("primary", "secondary",
|
education = factor(education,
|
||||||
"tertiary"),
|
levels = c("primary", "secondary", "tertiary"),
|
||||||
ordered=TRUE),
|
ordered = TRUE),
|
||||||
month=factor(month, levels=c("jan", "feb", "mar",
|
month = factor(month,
|
||||||
"apr", "may", "jun",
|
levels = c("jan", "feb", "mar", "apr", "may", "jun",
|
||||||
"jul", "aug", "sep",
|
"jul", "aug", "sep", "oct", "nov", "dec"),
|
||||||
"oct", "nov", "dec"),
|
ordered = TRUE),
|
||||||
ordered=TRUE),
|
pdays.bin = revalue(factor(pdays == -1),
|
||||||
pdays.bin=revalue(factor(pdays==-1),
|
c("TRUE" = "never", "FALSE" = "successful")),
|
||||||
c("TRUE"="never", "FALSE"="successful")),
|
pdays = as.integer(replace(pdays, pdays == -1, 999))) %>%
|
||||||
pdays=as.integer(replace(pdays, pdays==-1, 999))) %>%
|
|
||||||
select(age:pdays, pdays.bin, previous:y)
|
select(age:pdays, pdays.bin, previous:y)
|
||||||
|
|
||||||
|
unlink("temp.dir", recursive = TRUE)
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,8 +1,8 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
csv.file = "wdbc.data"
|
csv.file = "wdbc.data"
|
||||||
|
|
||||||
dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE)
|
dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
|
||||||
|
|
||||||
colnames(dataset) = c("id", "diagnosis",
|
colnames(dataset) = c("id", "diagnosis",
|
||||||
apply(expand.grid(c("radius", "texture", "perimeter",
|
apply(expand.grid(c("radius", "texture", "perimeter",
|
||||||
@ -12,7 +12,8 @@ preprocessDataset = function()
|
|||||||
c("mean", "se", "worst")),
|
c("mean", "se", "worst")),
|
||||||
1, function(x){paste(x[2], x[1])}))
|
1, function(x){paste(x[2], x[1])}))
|
||||||
|
|
||||||
dataset = dataset %>% select(`mean radius`:`worst fractal dimension`, diagnosis)
|
dataset = dataset %>%
|
||||||
|
select(`mean radius`:`worst fractal dimension`, diagnosis)
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,8 +1,8 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
csv.file = "breast-cancer-wisconsin.data"
|
csv.file = "breast-cancer-wisconsin.data"
|
||||||
|
|
||||||
dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE)
|
dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
|
||||||
|
|
||||||
colnames(dataset) = c("Sample code number", "Clump Thickness",
|
colnames(dataset) = c("Sample code number", "Clump Thickness",
|
||||||
"Uniformity of Cell Size", "Uniformity of Cell Shape",
|
"Uniformity of Cell Size", "Uniformity of Cell Shape",
|
||||||
@ -10,11 +10,12 @@ preprocessDataset = function()
|
|||||||
"Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
|
"Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
|
||||||
"Mitoses", "Class")
|
"Mitoses", "Class")
|
||||||
|
|
||||||
dataset = dataset %>% select(-`Sample code number`) %>%
|
dataset = dataset %>%
|
||||||
filter(`Bare Nuclei` != "?") %>%
|
select(-`Sample code number`) %>%
|
||||||
mutate(Class=factor(Class),
|
filter(`Bare Nuclei` != "?") %>%
|
||||||
`Bare Nuclei`=as.integer(`Bare Nuclei`)) %>%
|
mutate(Class = factor(Class),
|
||||||
droplevels()
|
`Bare Nuclei` = as.integer(`Bare Nuclei`)) %>%
|
||||||
|
droplevels()
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,40 +1,41 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
xls.file = "CTG.xls"
|
xls.file = "CTG.xls"
|
||||||
|
|
||||||
wk = loadWorkbook(paste0(orig.dir, "/", xls.file))
|
wk = loadWorkbook(file.path(orig.dir, xls.file))
|
||||||
dataset = readWorksheet(wk, sheet="Raw Data")
|
dataset = readWorksheet(wk, sheet = "Raw Data")
|
||||||
|
|
||||||
dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>%
|
dataset = dataset %>% select(LB:FS, NSP, -c(DS, DR)) %>%
|
||||||
filter(complete.cases(.)) %>%
|
filter(complete.cases(.)) %>%
|
||||||
mutate(LB=as.integer(LB),
|
mutate(LB = as.integer(LB),
|
||||||
AC=as.integer(AC),
|
AC = as.integer(AC),
|
||||||
FM=as.integer(FM),
|
FM = as.integer(FM),
|
||||||
UC=as.integer(UC),
|
UC = as.integer(UC),
|
||||||
ASTV=as.integer(ASTV),
|
ASTV = as.integer(ASTV),
|
||||||
ALTV=as.integer(ALTV),
|
ALTV = as.integer(ALTV),
|
||||||
DL=as.integer(DL),
|
DL = as.integer(DL),
|
||||||
DP=as.integer(DP),
|
DP = as.integer(DP),
|
||||||
Width=as.integer(Width),
|
Width = as.integer(Width),
|
||||||
Min=as.integer(Min),
|
Min = as.integer(Min),
|
||||||
Max=as.integer(Max),
|
Max = as.integer(Max),
|
||||||
Nmax=as.integer(Nmax),
|
Nmax = as.integer(Nmax),
|
||||||
Nzeros=as.integer(Nzeros),
|
Nzeros = as.integer(Nzeros),
|
||||||
Mode=as.integer(Mode),
|
Mode = as.integer(Mode),
|
||||||
Mean=as.integer(Mean),
|
Mean = as.integer(Mean),
|
||||||
Median=as.integer(Median),
|
Median = as.integer(Median),
|
||||||
Variance=as.integer(Variance),
|
Variance = as.integer(Variance),
|
||||||
Tendency=factor(Tendency, levels=c(-1,0,1), ordered=TRUE),
|
Tendency = factor(Tendency, levels = c(-1, 0, 1),
|
||||||
A=factor(A),
|
ordered = TRUE),
|
||||||
B=factor(B),
|
A = factor(A),
|
||||||
C=factor(C),
|
B = factor(B),
|
||||||
D=factor(D),
|
C = factor(C),
|
||||||
E=factor(E),
|
D = factor(D),
|
||||||
AD=factor(AD),
|
E = factor(E),
|
||||||
DE=factor(DE),
|
AD = factor(AD),
|
||||||
LD=factor(LD),
|
DE = factor(DE),
|
||||||
FS=factor(FS),
|
LD = factor(LD),
|
||||||
NSP=factor(replace(NSP, NSP==2, 3)))
|
FS = factor(FS),
|
||||||
|
NSP = factor(replace(NSP, NSP == 2, 3)))
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,48 +1,40 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
#set.seed(SEED)
|
|
||||||
|
|
||||||
xls.file = "default of credit card clients.xls"
|
xls.file = "default of credit card clients.xls"
|
||||||
|
|
||||||
wk = loadWorkbook(paste0(orig.dir, "/", xls.file))
|
wk = loadWorkbook(file.path(orig.dir, xls.file))
|
||||||
dataset = readWorksheet(wk, sheet="Data", startRow=2, startCol=2,
|
dataset = readWorksheet(wk, sheet = "Data", startRow = 2, startCol = 2,
|
||||||
check.names=FALSE)
|
check.names = FALSE)
|
||||||
|
|
||||||
dataset = dataset %>%
|
dataset = dataset %>%
|
||||||
mutate(LIMIT_BAL=as.integer(LIMIT_BAL),
|
mutate(LIMIT_BAL = as.integer(LIMIT_BAL),
|
||||||
SEX=factor(SEX),
|
SEX = factor(SEX),
|
||||||
EDUCATION=factor(EDUCATION), # can't order due to
|
EDUCATION = factor(EDUCATION), # can not order due to
|
||||||
# inconsistency with
|
# inconsistency with
|
||||||
# UCI description
|
# UCI description
|
||||||
MARRIAGE=factor(MARRIAGE),
|
MARRIAGE = factor(MARRIAGE),
|
||||||
AGE=as.integer(AGE),
|
AGE = as.integer(AGE),
|
||||||
PAY_0=as.integer(replace(PAY_0, PAY_0<0, 0)),
|
PAY_0 = as.integer(replace(PAY_0, PAY_0 < 0, 0)),
|
||||||
PAY_2=as.integer(replace(PAY_2, PAY_2<0, 0)),
|
PAY_2 = as.integer(replace(PAY_2, PAY_2 < 0, 0)),
|
||||||
PAY_3=as.integer(replace(PAY_3, PAY_3<0, 0)),
|
PAY_3 = as.integer(replace(PAY_3, PAY_3 < 0, 0)),
|
||||||
PAY_4=as.integer(replace(PAY_4, PAY_4<0, 0)),
|
PAY_4 = as.integer(replace(PAY_4, PAY_4 < 0, 0)),
|
||||||
PAY_5=as.integer(replace(PAY_5, PAY_5<0, 0)),
|
PAY_5 = as.integer(replace(PAY_5, PAY_5 < 0, 0)),
|
||||||
PAY_6=as.integer(replace(PAY_6, PAY_6<0, 0)),
|
PAY_6 = as.integer(replace(PAY_6, PAY_6 < 0, 0)),
|
||||||
BILL_AMT1=as.integer(BILL_AMT1),
|
BILL_AMT1 = as.integer(BILL_AMT1),
|
||||||
BILL_AMT2=as.integer(BILL_AMT2),
|
BILL_AMT2 = as.integer(BILL_AMT2),
|
||||||
BILL_AMT3=as.integer(BILL_AMT3),
|
BILL_AMT3 = as.integer(BILL_AMT3),
|
||||||
BILL_AMT4=as.integer(BILL_AMT4),
|
BILL_AMT4 = as.integer(BILL_AMT4),
|
||||||
BILL_AMT5=as.integer(BILL_AMT5),
|
BILL_AMT5 = as.integer(BILL_AMT5),
|
||||||
BILL_AMT6=as.integer(BILL_AMT6),
|
BILL_AMT6 = as.integer(BILL_AMT6),
|
||||||
PAY_AMT1=as.integer(PAY_AMT1),
|
PAY_AMT1 = as.integer(PAY_AMT1),
|
||||||
PAY_AMT2=as.integer(PAY_AMT2),
|
PAY_AMT2 = as.integer(PAY_AMT2),
|
||||||
PAY_AMT3=as.integer(PAY_AMT3),
|
PAY_AMT3 = as.integer(PAY_AMT3),
|
||||||
PAY_AMT4=as.integer(PAY_AMT4),
|
PAY_AMT4 = as.integer(PAY_AMT4),
|
||||||
PAY_AMT5=as.integer(PAY_AMT5),
|
PAY_AMT5 = as.integer(PAY_AMT5),
|
||||||
PAY_AMT6=as.integer(PAY_AMT6),
|
PAY_AMT6 = as.integer(PAY_AMT6),
|
||||||
`default payment next month`=factor(
|
`default payment next month` =
|
||||||
`default payment next month`)
|
factor(`default payment next month`)
|
||||||
)
|
)
|
||||||
|
|
||||||
#dataset.1 = dataset %>% filter(`default payment next month` == 1)
|
|
||||||
#dataset.0 = dataset %>% filter(`default payment next month` == 0) %>%
|
|
||||||
# sample_n(nrow(dataset.1))
|
|
||||||
#
|
|
||||||
#dataset = rbind(dataset.0, dataset.1)
|
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,14 +1,14 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
csv.file = "Indian Liver Patient Dataset (ILPD).csv"
|
csv.file = "Indian Liver Patient Dataset (ILPD).csv"
|
||||||
|
|
||||||
dataset = read.csv(paste0(orig.dir, "/", csv.file),
|
dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
|
||||||
header=FALSE)
|
|
||||||
|
|
||||||
colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt",
|
colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt",
|
||||||
"Sgot", "TP", "ALB", "A/G Ratio", "Selector")
|
"Sgot", "TP", "ALB", "A/G Ratio", "Selector")
|
||||||
|
|
||||||
dataset = dataset %>% mutate(Selector=factor(Selector))
|
dataset = dataset %>%
|
||||||
|
mutate(Selector = factor(Selector))
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,9 +1,8 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
csv.file = "magic04.data"
|
csv.file = "magic04.data"
|
||||||
|
|
||||||
dataset = read.csv(paste0(orig.dir, "/", csv.file),
|
dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE)
|
||||||
header=FALSE)
|
|
||||||
|
|
||||||
colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1",
|
colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1",
|
||||||
"fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
|
"fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
|
||||||
|
@ -1,30 +1,23 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
#set.seed(SEED)
|
|
||||||
|
|
||||||
arff.file = "seismic-bumps.arff"
|
arff.file = "seismic-bumps.arff"
|
||||||
|
|
||||||
dataset = read.arff(paste0(orig.dir, "/", arff.file))
|
dataset = read.arff(file.path(orig.dir, arff.file))
|
||||||
|
|
||||||
dataset = dataset %>% select(-c(nbumps6:nbumps89)) %>%
|
dataset = dataset %>%
|
||||||
mutate(genergy=as.integer(genergy),
|
select(-c(nbumps6:nbumps89)) %>%
|
||||||
gpuls=as.integer(gpuls),
|
mutate(genergy = as.integer(genergy),
|
||||||
gdenergy=as.integer(gdenergy),
|
gpuls = as.integer(gpuls),
|
||||||
gdpuls=as.integer(gdpuls),
|
gdenergy = as.integer(gdenergy),
|
||||||
nbumps=as.integer(nbumps),
|
gdpuls = as.integer(gdpuls),
|
||||||
nbumps2=as.integer(nbumps2),
|
nbumps = as.integer(nbumps),
|
||||||
nbumps3=as.integer(nbumps3),
|
nbumps2 = as.integer(nbumps2),
|
||||||
nbumps4=as.integer(nbumps4),
|
nbumps3 = as.integer(nbumps3),
|
||||||
nbumps5=as.integer(nbumps5),
|
nbumps4 = as.integer(nbumps4),
|
||||||
energy=as.integer(energy),
|
nbumps5 = as.integer(nbumps5),
|
||||||
maxenergy=as.integer(maxenergy)
|
energy = as.integer(energy),
|
||||||
)
|
maxenergy = as.integer(maxenergy)
|
||||||
|
)
|
||||||
#dataset.1 = dataset %>% filter(class == "1")
|
|
||||||
#dataset.0 = dataset %>% filter(class == "0") %>%
|
|
||||||
# sample_n(nrow(dataset.1)*4)
|
|
||||||
#
|
|
||||||
#dataset = rbind(dataset.0, dataset.1)
|
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,9 +1,9 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
csv.file = "spambase.data"
|
csv.file = "spambase.data"
|
||||||
|
|
||||||
dataset = read.csv(paste0(orig.dir, "/", csv.file),
|
dataset = read.csv(file.path(orig.dir, csv.file),
|
||||||
header=FALSE)
|
header = FALSE)
|
||||||
|
|
||||||
colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all",
|
colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all",
|
||||||
"word_freq_3d", "word_freq_our", "word_freq_over",
|
"word_freq_3d", "word_freq_our", "word_freq_over",
|
||||||
@ -27,7 +27,8 @@ preprocessDataset = function()
|
|||||||
"capital_run_length_longest", "capital_run_length_total",
|
"capital_run_length_longest", "capital_run_length_total",
|
||||||
"class")
|
"class")
|
||||||
|
|
||||||
dataset = dataset %>% mutate(class=factor(class))
|
dataset = dataset %>%
|
||||||
|
mutate(class = factor(class))
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,21 +1,24 @@
|
|||||||
preprocessDataset = function()
|
preprocess.dataset = function()
|
||||||
{
|
{
|
||||||
csv.file.w = "winequality-white.csv"
|
csv.file.w = "winequality-white.csv"
|
||||||
csv.file.r = "winequality-red.csv"
|
csv.file.r = "winequality-red.csv"
|
||||||
|
|
||||||
dataset.w = read.csv(paste0(orig.dir, "/", csv.file.w), sep=";",
|
dataset.w = read.csv(file.path(orig.dir, "/", csv.file.w), sep = ";",
|
||||||
check.names=FALSE)
|
check.names = FALSE)
|
||||||
dataset.w = dataset.w %>% mutate(color="white")
|
dataset.w = dataset.w %>%
|
||||||
|
mutate(color = "white")
|
||||||
|
|
||||||
dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep=";",
|
dataset.r = read.csv(paste0(orig.dir, "/", csv.file.r), sep = ";",
|
||||||
check.names=FALSE)
|
check.names = FALSE)
|
||||||
dataset.r = dataset.r %>% mutate(color="red")
|
dataset.r = dataset.r %>%
|
||||||
|
mutate(color = "red")
|
||||||
|
|
||||||
dataset = rbind(dataset.w, dataset.r) %>%
|
dataset =
|
||||||
mutate(color=factor(color),
|
rbind(dataset.w, dataset.r) %>%
|
||||||
quality=ifelse(quality>5, 1, 0)) %>%
|
mutate(color = factor(color),
|
||||||
select(`fixed acidity`:alcohol, color, quality) %>%
|
quality = ifelse(quality > 5, 1, 0)) %>%
|
||||||
mutate(quality=factor(quality))
|
select(`fixed acidity`:alcohol, color, quality) %>%
|
||||||
|
mutate(quality = factor(quality))
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
@ -1,49 +0,0 @@
|
|||||||
rm(list=ls())
|
|
||||||
|
|
||||||
source("config.R")
|
|
||||||
source("utils.R")
|
|
||||||
|
|
||||||
library(RCurl)
|
|
||||||
library(tools)
|
|
||||||
library(yaml)
|
|
||||||
|
|
||||||
flog.info("Started downloading dataset collection")
|
|
||||||
|
|
||||||
for (dir.name in dir(PATH_DATASETS))
|
|
||||||
{
|
|
||||||
flog.info(paste("Dataset:", dir.name))
|
|
||||||
|
|
||||||
dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
|
|
||||||
config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)
|
|
||||||
|
|
||||||
urls.list = yaml.load_file(config.yaml.file)$urls
|
|
||||||
|
|
||||||
mkdir(dest.dir)
|
|
||||||
|
|
||||||
for (url in urls.list)
|
|
||||||
{
|
|
||||||
flog.info(paste("URL:", url))
|
|
||||||
|
|
||||||
dest.file = URLdecode(basename(url))
|
|
||||||
dest.file.path = paste0(dest.dir, dest.file)
|
|
||||||
|
|
||||||
if (file.exists(dest.file.path))
|
|
||||||
{
|
|
||||||
flog.warn(paste("Target file", basename(dest.file.path),
|
|
||||||
"already exists; skipping..."))
|
|
||||||
next
|
|
||||||
}
|
|
||||||
|
|
||||||
tryCatch(
|
|
||||||
raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)),
|
|
||||||
error = function(e){flog.error(e); stop(e)}
|
|
||||||
)
|
|
||||||
|
|
||||||
writeBin(raw.content, dest.file.path)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
flog.info("*****")
|
|
||||||
}
|
|
||||||
|
|
||||||
flog.info("Finished downloading dataset collection")
|
|
@ -1,64 +0,0 @@
|
|||||||
rm(list=ls())
|
|
||||||
|
|
||||||
source("config.R")
|
|
||||||
source("utils.R")
|
|
||||||
|
|
||||||
library(plyr)
|
|
||||||
library(dplyr)
|
|
||||||
library(foreign)
|
|
||||||
library(XLConnect)
|
|
||||||
|
|
||||||
flog.info("Started preprocessing dataset collection")
|
|
||||||
|
|
||||||
for (dir.name in dir(PATH_DATASETS))
|
|
||||||
{
|
|
||||||
flog.info(paste("Dataset:", dir.name))
|
|
||||||
|
|
||||||
orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
|
|
||||||
dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
|
|
||||||
dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT)
|
|
||||||
|
|
||||||
if (file.exists(dest.file.path))
|
|
||||||
{
|
|
||||||
flog.warn(paste("Target file", basename(dest.file.path),
|
|
||||||
"already exists; skipping..."))
|
|
||||||
flog.info("*****")
|
|
||||||
next
|
|
||||||
}
|
|
||||||
|
|
||||||
r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT)
|
|
||||||
|
|
||||||
source(r.src.file)
|
|
||||||
dataset = preprocessDataset() # custom per-dataset preprocessing
|
|
||||||
|
|
||||||
# change column names
|
|
||||||
colnames(dataset) = tolower(
|
|
||||||
make.names(
|
|
||||||
gsub("^\\.|\\.$", "", colnames(dataset)),
|
|
||||||
unique=TRUE, allow_=FALSE))
|
|
||||||
|
|
||||||
# change factor levels
|
|
||||||
for (name in colnames(dataset))
|
|
||||||
{
|
|
||||||
if (any(class(dataset[[name]]) == "factor"))
|
|
||||||
{
|
|
||||||
levels(dataset[[name]]) = tolower(
|
|
||||||
make.names(
|
|
||||||
gsub("^\\.|\\.$", "",
|
|
||||||
levels(dataset[[name]])),
|
|
||||||
unique=TRUE, allow_=FALSE))
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
printDatasetStatistics(dataset)
|
|
||||||
|
|
||||||
mkdir(dest.dir)
|
|
||||||
saveRDS(dataset, dest.file.path)
|
|
||||||
|
|
||||||
flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT))
|
|
||||||
|
|
||||||
flog.info("*****")
|
|
||||||
}
|
|
||||||
|
|
||||||
flog.info("Finished preprocessing dataset collection")
|
|
64
init.R
Normal file
64
init.R
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
# ---- init ----
|
||||||
|
|
||||||
|
# clear envirionment
|
||||||
|
|
||||||
|
rm(list = ls())
|
||||||
|
|
||||||
|
# load setup variables
|
||||||
|
|
||||||
|
source("config.R")
|
||||||
|
|
||||||
|
# set randomization
|
||||||
|
|
||||||
|
set.seed(SEED)
|
||||||
|
|
||||||
|
# load library management system
|
||||||
|
|
||||||
|
library(checkpoint)
|
||||||
|
|
||||||
|
if (CHECKPOINT.QUICK.LOAD) # approx. x10 faster checkpoint library loading
|
||||||
|
{
|
||||||
|
# assume https
|
||||||
|
options(checkpoint.mranUrl = CHECKPOINT.MRAN.URL)
|
||||||
|
# disable url checking
|
||||||
|
assignInNamespace("is.404", function(mran, warn = TRUE) { FALSE },
|
||||||
|
"checkpoint")
|
||||||
|
}
|
||||||
|
|
||||||
|
# knitr fix
|
||||||
|
checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = FALSE)
|
||||||
|
if (system.file(package = "knitr") == "")
|
||||||
|
{
|
||||||
|
install.packages("knitr")
|
||||||
|
}
|
||||||
|
|
||||||
|
# actual checkpoint loading
|
||||||
|
checkpoint(CHECKPOINT.SNAPSHOT.DATE, verbose = TRUE, scanForPackages = TRUE)
|
||||||
|
|
||||||
|
# load logging system
|
||||||
|
|
||||||
|
library(futile.logger)
|
||||||
|
|
||||||
|
flog.threshold(LOGGER.LEVEL)
|
||||||
|
|
||||||
|
# load libraries
|
||||||
|
|
||||||
|
library(RCurl)
|
||||||
|
library(tools)
|
||||||
|
library(yaml)
|
||||||
|
|
||||||
|
library(plyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(foreign)
|
||||||
|
library(XLConnect)
|
||||||
|
|
||||||
|
# load helper functions
|
||||||
|
|
||||||
|
source("utils.R")
|
||||||
|
|
||||||
|
# perform additional custom init
|
||||||
|
|
||||||
|
if (file.exists(USER.INIT.FILE))
|
||||||
|
{
|
||||||
|
source(USER.INIT.FILE)
|
||||||
|
}
|
48
s1-download-data.R
Normal file
48
s1-download-data.R
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# ---- download-data ----
|
||||||
|
|
||||||
|
source("init.R")
|
||||||
|
source("utils.R")
|
||||||
|
|
||||||
|
setup.logger(LOGGER.OUTPUT.S1.FILE)
|
||||||
|
|
||||||
|
flog.info("Step 1: download dataset collection")
|
||||||
|
|
||||||
|
for (dir.name in dir(DATASETS.DIR))
|
||||||
|
{
|
||||||
|
flog.info(paste("Dataset:", dir.name))
|
||||||
|
|
||||||
|
dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
|
||||||
|
config.yaml.file = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
|
||||||
|
|
||||||
|
urls.list = yaml.load_file(config.yaml.file)$urls
|
||||||
|
|
||||||
|
if (!dir.exists(dest.dir))
|
||||||
|
{
|
||||||
|
dir.create(dest.dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
for (url in urls.list)
|
||||||
|
{
|
||||||
|
flog.info(paste("URL:", url))
|
||||||
|
|
||||||
|
dest.file = URLdecode(basename(url))
|
||||||
|
dest.file.path = file.path(dest.dir, dest.file)
|
||||||
|
|
||||||
|
if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES)
|
||||||
|
{
|
||||||
|
tryCatch(
|
||||||
|
raw.content <-
|
||||||
|
getBinaryURL(url, .opts = curlOptions(ssl.verifypeer =
|
||||||
|
SSL.VERIFY.PEER)),
|
||||||
|
error = function(e){flog.error(e); stop(e)}
|
||||||
|
)
|
||||||
|
|
||||||
|
writeBin(raw.content, dest.file.path)
|
||||||
|
} else {
|
||||||
|
flog.warn(paste("Target file", basename(dest.file.path),
|
||||||
|
"already exists, skipping"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
flog.info(paste(rep("*", 25), collapse = ""))
|
||||||
|
}
|
61
s2-preprocess-data.R
Normal file
61
s2-preprocess-data.R
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
# ---- preprocess-data ----
|
||||||
|
|
||||||
|
source("init.R")
|
||||||
|
source("utils.R")
|
||||||
|
|
||||||
|
setup.logger(LOGGER.OUTPUT.S2.FILE)
|
||||||
|
|
||||||
|
flog.info("Step 2: preprocess dataset collection")
|
||||||
|
|
||||||
|
for (dir.name in dir(DATASETS.DIR))
|
||||||
|
{
|
||||||
|
flog.info(paste("Dataset:", dir.name))
|
||||||
|
|
||||||
|
orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
|
||||||
|
dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR)
|
||||||
|
dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE)
|
||||||
|
|
||||||
|
if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES)
|
||||||
|
{
|
||||||
|
r.src.file = file.path(DATASETS.DIR, dir.name,
|
||||||
|
DATASET.PREPROCESSING.SCRIPT)
|
||||||
|
source(r.src.file)
|
||||||
|
dataset = preprocess.dataset() # custom per-dataset preprocessing
|
||||||
|
|
||||||
|
# change column names
|
||||||
|
colnames(dataset) = tolower(
|
||||||
|
make.names(
|
||||||
|
gsub("^\\.|\\.$", "", colnames(dataset)),
|
||||||
|
unique = TRUE, allow_ = FALSE))
|
||||||
|
|
||||||
|
# change factor levels
|
||||||
|
for (name in colnames(dataset))
|
||||||
|
{
|
||||||
|
if (any(class(dataset[[name]]) == "factor"))
|
||||||
|
{
|
||||||
|
levels(dataset[[name]]) = tolower(
|
||||||
|
make.names(
|
||||||
|
gsub("^\\.|\\.$", "",
|
||||||
|
levels(dataset[[name]])),
|
||||||
|
unique = TRUE, allow_ = FALSE))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print.dataset.statistics(dataset)
|
||||||
|
|
||||||
|
if (!dir.exists(dest.dir))
|
||||||
|
{
|
||||||
|
dir.create(dest.dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
saveRDS(dataset, dest.file.path)
|
||||||
|
|
||||||
|
flog.info(paste("Created preprocessed file",
|
||||||
|
DATASET.PREPROCESSED.OUTPUT.FILE))
|
||||||
|
} else {
|
||||||
|
flog.warn(paste("Target file", basename(dest.file.path),
|
||||||
|
"already exists, skipping"))
|
||||||
|
}
|
||||||
|
|
||||||
|
flog.info(paste(rep("*", 25), collapse = ""))
|
||||||
|
}
|
@ -7,35 +7,31 @@ output:
|
|||||||
---
|
---
|
||||||
|
|
||||||
```{r global-options, include=FALSE}
|
```{r global-options, include=FALSE}
|
||||||
knitr::opts_chunk$set(comment="", echo=FALSE,
|
knitr::opts_chunk$set(comment = "", echo = FALSE, warning = FALSE, message = FALSE)
|
||||||
warning=FALSE, message=FALSE)
|
source('init.R')
|
||||||
source('config.R')
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Document generation date: `r Sys.time()`.
|
Document generation date: `r Sys.time()`.
|
||||||
|
|
||||||
|
|
||||||
```{r show-datasets, results='asis'}
|
```{r show-datasets, results='asis'}
|
||||||
library(yaml)
|
|
||||||
|
|
||||||
cat("\n# Table of Contents\n\n")
|
cat("\n# Table of Contents\n\n")
|
||||||
|
|
||||||
for (dir.name in dir(PATH_DATASETS))
|
for (dir.name in dir(DATASETS.DIR))
|
||||||
{
|
{
|
||||||
config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)
|
config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
|
||||||
config.yaml = yaml.load_file(config.yaml.file.path)
|
config.yaml = yaml.load_file(config.yaml.file.path)
|
||||||
|
|
||||||
anchor = gsub(" ", "-", gsub("[[:punct:]]", "",
|
anchor = gsub(" ", "-", gsub("[[:punct:]]", "",
|
||||||
tolower(config.yaml$name)))
|
tolower(config.yaml$name)))
|
||||||
cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" ))
|
cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n" ))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cat("\n---\n\n")
|
cat("\n---\n\n")
|
||||||
|
|
||||||
for (dir.name in dir(PATH_DATASETS))
|
for (dir.name in dir(DATASETS.DIR))
|
||||||
{
|
{
|
||||||
config.yaml.file.path = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)
|
config.yaml.file.path = file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
|
||||||
config.yaml = yaml.load_file(config.yaml.file.path)
|
config.yaml = yaml.load_file(config.yaml.file.path)
|
||||||
|
|
||||||
cat(paste("#", config.yaml$name, "\n\n"))
|
cat(paste("#", config.yaml$name, "\n\n"))
|
||||||
@ -55,8 +51,10 @@ for (dir.name in dir(PATH_DATASETS))
|
|||||||
|
|
||||||
cat(paste("**Dataset**:\n\n"))
|
cat(paste("**Dataset**:\n\n"))
|
||||||
|
|
||||||
preprocessed.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
|
preprocessed.dir = gsub(DATASET.NAME.PATTERN, dir.name,
|
||||||
preprocessed.file.path = paste0(preprocessed.dir, FILE_PREPROCESSED_OUTPUT)
|
DATASET.PREPROCESSED.DIR)
|
||||||
|
preprocessed.file.path = file.path(preprocessed.dir,
|
||||||
|
DATASET.PREPROCESSED.OUTPUT.FILE)
|
||||||
|
|
||||||
dataset = readRDS(preprocessed.file.path)
|
dataset = readRDS(preprocessed.file.path)
|
||||||
|
|
||||||
@ -66,11 +64,11 @@ for (dir.name in dir(PATH_DATASETS))
|
|||||||
|
|
||||||
cat("**Predictors**:\n\n")
|
cat("**Predictors**:\n\n")
|
||||||
|
|
||||||
df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset)-1)],
|
df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset) - 1)],
|
||||||
function(f){paste(class(f), collapse=" ")})))
|
function(f){paste(class(f), collapse = " ")})))
|
||||||
colnames(df.pred) = c("Type", "Frequency")
|
colnames(df.pred) = c("Type", "Frequency")
|
||||||
|
|
||||||
cat(knitr::kable(df.pred, format="markdown"), sep="\n")
|
cat(knitr::kable(df.pred, format = "markdown"), sep = "\n")
|
||||||
cat("\n")
|
cat("\n")
|
||||||
|
|
||||||
perc.classes = sort(round(100*as.numeric(
|
perc.classes = sort(round(100*as.numeric(
|
||||||
@ -79,14 +77,14 @@ for (dir.name in dir(PATH_DATASETS))
|
|||||||
|
|
||||||
cat("**Class imbalance**:\n\n")
|
cat("**Class imbalance**:\n\n")
|
||||||
|
|
||||||
cat(knitr::kable(data.frame(A=c(paste(perc.classes[1], "%"), num.classes[1]),
|
cat(knitr::kable(data.frame(A = c(paste(perc.classes[1], "%"),
|
||||||
B=c(paste(perc.classes[2], "%"), num.classes[2])),
|
num.classes[1]),
|
||||||
format="markdown", col.names=c("class A", " class B"),
|
B = c(paste(perc.classes[2], "%"),
|
||||||
align=c("c", "c")),
|
num.classes[2])),
|
||||||
sep="\n")
|
format = "markdown", col.names = c("class A", "class B"),
|
||||||
|
align = c("c", "c")),
|
||||||
|
sep = "\n")
|
||||||
|
|
||||||
cat("\n---\n\n")
|
cat("\n---\n\n")
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
30
utils.R
30
utils.R
@ -1,19 +1,6 @@
|
|||||||
library(futile.logger)
|
print.dataset.statistics = function(dataset)
|
||||||
|
|
||||||
mkdir = function(dest.dir)
|
|
||||||
{
|
{
|
||||||
if (!dir.exists(dest.dir))
|
if (ncol(dataset) == 0) # for mockups
|
||||||
{
|
|
||||||
flog.debug(paste("Creating directory", dest.dir))
|
|
||||||
dir.create(dest.dir)
|
|
||||||
} else {
|
|
||||||
flog.debug(paste("Target directory", dest.dir, "already exists"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
printDatasetStatistics = function(dataset)
|
|
||||||
{
|
|
||||||
if (ncol(dataset)==0) # for mockups
|
|
||||||
{
|
{
|
||||||
flog.warn("Empty dataset")
|
flog.warn("Empty dataset")
|
||||||
return()
|
return()
|
||||||
@ -21,8 +8,19 @@ printDatasetStatistics = function(dataset)
|
|||||||
|
|
||||||
no.cases = nrow(dataset)
|
no.cases = nrow(dataset)
|
||||||
no.attributes = ncol(dataset) - 1
|
no.attributes = ncol(dataset) - 1
|
||||||
perc.classes = round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0)
|
perc.classes =
|
||||||
|
round(100*as.numeric(table(dataset[, ncol(dataset)]))/nrow(dataset), 0)
|
||||||
|
|
||||||
flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
|
flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
|
||||||
", classes: ", perc.classes[1], "%/", perc.classes[2], "%"))
|
", classes: ", perc.classes[1], "%/", perc.classes[2], "%"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Configure the futile.logger appender for one pipeline step.
#
# output.file : path of the log file to tee messages into.
#
# Side effects: may delete an existing log file; replaces the global
# flog appender. Returns the appender result invisibly.
setup.logger = function(output.file)
{
    # Scalar guard: use short-circuit && (was vectorized &) so
    # file.exists() is only consulted when overwriting is enabled.
    if (LOGGER.OVERWRITE.EXISTING.FILES && file.exists(output.file))
    {
        file.remove(output.file)
    }

    # Tee log output to both console and file; invisible() suppresses
    # printing of the appender's return value.
    invisible(flog.appender(appender.tee(output.file)))
}
|
||||||
|
Loading…
Reference in New Issue
Block a user