mirror of
https://github.com/andre-wojtowicz/uci-ml-to-r.git
synced 2025-01-02 19:00:28 +01:00
Added first version for 10 UCI datasets
This commit is contained in:
parent
ecc65c56f5
commit
744a4faa0f
4
.gitignore
vendored
4
.gitignore
vendored
@ -16,3 +16,7 @@ vignettes/*.pdf
|
||||
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
|
||||
.httr-oauth
|
||||
|
||||
# data collection
|
||||
|
||||
data-collection/*/original/*
|
||||
data-collection/*/preprocessed/*
|
||||
|
28
config.R
Normal file
28
config.R
Normal file
@ -0,0 +1,28 @@
|
||||
# ---- checkpoint ----

# Package versions are pinned to a dated repository snapshot so that the
# download/preprocessing scripts run against the same library versions.
CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/"
CHECKPOINT.SNAPSHOT.DATE = "2016-04-10"

library(checkpoint)
options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL)
checkpoint(CHECKPOINT.SNAPSHOT.DATE)

# ---- logger ----

# Loaded only after checkpoint() has been called, so the logger package is
# attached after the snapshot has been set up.
library(futile.logger)

LOGGER_LEVEL = INFO
flog.threshold(LOGGER_LEVEL)

# ---- other ----

# "*" in the two path templates below is a placeholder that the driver
# scripts replace with the name of each dataset directory.
PATH_DATASETS = "data-collection/"
PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/")
PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/")

FILE_CONFIG_YAML = "config.yaml"
FILE_PREPROCESSING_SCRIPT = "preprocess.R"
FILE_PREPROCESSED_OUTPUT = "dataset.rds"

# Optional, untracked per-user overrides.
# NOTE(review): this file is sourced after checkpoint() and flog.threshold()
# have already run, so overriding the CHECKPOINT.* or LOGGER_LEVEL values in
# it does not re-apply them; likewise the derived PATH_DATASET_* templates
# are not recomputed if PATH_DATASETS is overridden.
if (file.exists("config.R.user"))
{
    source("config.R.user")
}
|
10
data-collection/bank-marketing/config.yaml
Normal file
10
data-collection/bank-marketing/config.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
name: Bank Marketing
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
|
||||
|
||||
cite: >
|
||||
[Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014
|
50
data-collection/bank-marketing/preprocess.R
Normal file
50
data-collection/bank-marketing/preprocess.R
Normal file
@ -0,0 +1,50 @@
|
||||
# Preprocess the UCI Bank Marketing dataset.
#
# Extracts bank-additional-full.csv from the downloaded zip archive into the
# session temp directory, drops the `duration`, `pdays` and `default`
# columns, removes rows with "unknown" values (and "illiterate" education),
# and encodes education, month and day_of_week as ordered factors.
#
# Expects `orig.dir` (directory holding the downloaded files) to be defined
# by the calling script, with dplyr and futile.logger attached.
#
# Returns: a data frame whose last column is the class label `y`.
preprocessDataset = function()
{
    #set.seed(SEED)

    temp.dir = tempdir()

    zip.file = "bank-additional.zip"
    zip.dataset.path = "bank-additional/bank-additional-full.csv"

    flog.debug(paste("Unzipping", zip.file))

    # Join with an explicit separator, like every other preprocess script,
    # so the path is valid whether or not orig.dir ends with "/".
    unzip(zipfile=paste0(orig.dir, "/", zip.file),
          files=zip.dataset.path,
          exdir=temp.dir)

    flog.debug(paste("Loading", zip.dataset.path))

    # stringsAsFactors is set explicitly: its default changed to FALSE in
    # R 4.0.0, which would otherwise leave the categorical columns (and the
    # class label) as plain character vectors and make droplevels() a no-op.
    dataset = read.csv(paste0(temp.dir, "/", zip.dataset.path), sep=";",
                       stringsAsFactors=TRUE)

    flog.debug("Preprocessing loaded dataset")

    dataset = dataset %>%
        select(-c(duration, pdays, default)) %>%
        filter(job != "unknown" & marital != "unknown" & education != "unknown" &
               education != "illiterate" & housing != "unknown" & loan != "unknown") %>%
        droplevels()

    # Optional class balancing (disabled):
    #dataset.yes = dataset %>% filter(y == "yes")
    #dataset.no  = dataset %>% filter(y == "no") %>% sample_n(nrow(dataset.yes))
    #
    #dataset = rbind(dataset.yes, dataset.no)

    dataset = dataset %>% mutate(
        education=factor(education, levels=c("basic.4y", "basic.6y",
                                             "basic.9y", "high.school",
                                             "professional.course",
                                             "university.degree"),
                         ordered=TRUE),
        month=factor(month, levels=c("jan", "feb", "mar",
                                     "apr", "may", "jun",
                                     "jul", "aug", "sep",
                                     "oct", "nov", "dec"),
                     ordered=TRUE),
        day_of_week=factor(day_of_week, levels=c("mon", "tue", "wed",
                                                 "thu", "fri"),
                           ordered=TRUE)
    )

    return(dataset)
}
|
@ -0,0 +1,18 @@
|
||||
---
|
||||
name: Breast Cancer Wisconsin (Diagnostic)
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names
|
||||
|
||||
cite: >
|
||||
https://archive.ics.uci.edu/ml/citation_policy.html
|
||||
|
||||
@misc{Lichman:2013 ,
|
||||
author = "M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "http://archive.ics.uci.edu/ml",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences" }
|
@ -0,0 +1,18 @@
|
||||
# Preprocess the Breast Cancer Wisconsin (Diagnostic) dataset (wdbc.data).
#
# The raw file has no header: column 1 is the sample id, column 2 the
# diagnosis, followed by 30 measurements laid out as ten features for each of
# the statistics "mean", "se" and "worst" (in that order).
#
# Expects `orig.dir` to be defined by the calling script, with dplyr attached.
#
# Returns: a data frame with the 30 measurement columns followed by
# `diagnosis` as the last (class) column; the id column is dropped.
preprocessDataset = function()
{
    csv.file = "wdbc.data"

    # stringsAsFactors is set explicitly so `diagnosis` is a factor on every
    # R version (the default became FALSE in R 4.0.0).
    dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE,
                       stringsAsFactors=TRUE)

    features = c("radius", "texture", "perimeter",
                 "area", "smoothness", "compactness",
                 "concavity", "concave points",
                 "symmetry", "fractal dimension")
    statistics = c("mean", "se", "worst")

    # Features vary fastest within each statistic:
    # "mean radius", "mean texture", ..., "se radius", ..., "worst fractal dimension".
    measurement.names = paste(rep(statistics, each=length(features)), features)

    colnames(dataset) = c("id", "diagnosis", measurement.names)

    dataset = dataset %>% select(`mean radius`:`worst fractal dimension`, diagnosis)

    return(dataset)
}
|
11
data-collection/breast-cancer-wisconsin-original/config.yaml
Normal file
11
data-collection/breast-cancer-wisconsin-original/config.yaml
Normal file
@ -0,0 +1,11 @@
|
||||
---
|
||||
name: Breast Cancer Wisconsin (Original)
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
|
||||
|
||||
cite: >
|
||||
O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.
|
@ -0,0 +1,20 @@
|
||||
# Preprocess the Breast Cancer Wisconsin (Original) dataset.
#
# The raw file has no header. Missing values in the "Bare Nuclei" column are
# written as "?"; those rows are removed and the column is converted to
# integer. The sample id column is dropped and `Class` becomes a factor.
#
# Expects `orig.dir` to be defined by the calling script, with dplyr attached.
#
# Returns: a data frame with `Class` as the last column.
preprocessDataset = function()
{
    csv.file = "breast-cancer-wisconsin.data"

    # The "?" entries make read.csv treat "Bare Nuclei" as text. It must be
    # read as character, not factor: as.integer() on a factor returns the
    # internal level codes ("1" -> 2, "10" -> 3, ...) rather than the values.
    # The factor default applied on R < 4.0, so the setting is made explicit.
    dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE,
                       stringsAsFactors=FALSE)

    colnames(dataset) = c("Sample code number", "Clump Thickness",
                          "Uniformity of Cell Size", "Uniformity of Cell Shape",
                          "Marginal Adhesion", "Single Epithelial Cell Size",
                          "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
                          "Mitoses", "Class")

    dataset = dataset %>% select(-`Sample code number`) %>%
        filter(`Bare Nuclei` != "?") %>%
        mutate(Class=factor(Class),
               `Bare Nuclei`=as.integer(`Bare Nuclei`)) %>%
        droplevels()

    return(dataset)
}
|
10
data-collection/cardiotocography/config.yaml
Normal file
10
data-collection/cardiotocography/config.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
name: Cardiotocography
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/Cardiotocography
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/00193/CTG.xls
|
||||
|
||||
cite: >
|
||||
Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of Cardiotocograms. J Matern Fetal Med 5:311-318
|
40
data-collection/cardiotocography/preprocess.R
Normal file
40
data-collection/cardiotocography/preprocess.R
Normal file
@ -0,0 +1,40 @@
|
||||
# Preprocess the Cardiotocography dataset ("Raw Data" sheet of CTG.xls).
#
# Keeps columns LB..FS (without DS and DR) plus NSP, drops incomplete rows,
# converts the count-like measurements to integer and the pattern/class
# indicator columns to factors. NSP value 2 is merged into value 3.
#
# Expects `orig.dir` to be defined by the calling script, with dplyr and
# XLConnect attached.
#
# Returns: a data frame with `NSP` as the last column.
preprocessDataset = function()
{
    workbook = loadWorkbook(paste0(orig.dir, "/", "CTG.xls"))
    raw.data = readWorksheet(workbook, sheet="Raw Data")

    selected = select(raw.data, LB:FS, NSP, -c(DS, DR))
    dataset = filter(selected, complete.cases(selected))

    integer.cols = c("LB", "AC", "FM", "UC", "ASTV", "ALTV", "DL", "DP",
                     "Width", "Min", "Max", "Nmax", "Nzeros", "Mode",
                     "Mean", "Median", "Variance")
    nominal.cols = c("A", "B", "C", "D", "E", "AD", "DE", "LD", "FS")

    dataset[integer.cols] = lapply(dataset[integer.cols], as.integer)
    dataset$Tendency = factor(dataset$Tendency, levels=c(-1, 0, 1),
                              ordered=TRUE)
    dataset[nominal.cols] = lapply(dataset[nominal.cols], factor)

    # Collapse NSP code 2 into code 3 before turning it into the class factor.
    dataset$NSP = factor(replace(dataset$NSP, dataset$NSP == 2, 3))

    return(dataset)
}
|
10
data-collection/credit-card/config.yaml
Normal file
10
data-collection/credit-card/config.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
name: Default of credit card clients
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
|
||||
|
||||
cite: >
|
||||
Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.
|
48
data-collection/credit-card/preprocess.R
Normal file
48
data-collection/credit-card/preprocess.R
Normal file
@ -0,0 +1,48 @@
|
||||
# Preprocess the "Default of credit card clients" dataset.
#
# Reads the "Data" sheet starting at row 2 / column 2, converts the amount
# and age columns to integer, clamps negative PAY_* values to 0, and turns
# SEX, EDUCATION, MARRIAGE and the target column into factors.
#
# Expects `orig.dir` to be defined by the calling script, with XLConnect
# attached.
#
# Returns: a data frame with `default payment next month` as the last column.
preprocessDataset = function()
{
    #set.seed(SEED)

    workbook = loadWorkbook(paste0(orig.dir, "/",
                                   "default of credit card clients.xls"))
    dataset = readWorksheet(workbook, sheet="Data", startRow=2, startCol=2,
                            check.names=FALSE)

    target.col = "default payment next month"

    status.cols = paste0("PAY_", c(0, 2:6))   # the sheet has no PAY_1
    integer.cols = c("LIMIT_BAL", "AGE",
                     paste0("BILL_AMT", 1:6),
                     paste0("PAY_AMT", 1:6))
    # EDUCATION can't be ordered due to inconsistency with UCI description
    nominal.cols = c("SEX", "EDUCATION", "MARRIAGE", target.col)

    clampNegativeToZero = function(status)
    {
        as.integer(replace(status, status < 0, 0))
    }

    dataset[integer.cols] = lapply(dataset[integer.cols], as.integer)
    dataset[status.cols] = lapply(dataset[status.cols], clampNegativeToZero)
    dataset[nominal.cols] = lapply(dataset[nominal.cols], factor)

    # Optional class balancing (disabled):
    #dataset.1 = dataset %>% filter(`default payment next month` == 1)
    #dataset.0 = dataset %>% filter(`default payment next month` == 0) %>%
    #    sample_n(nrow(dataset.1))
    #
    #dataset = rbind(dataset.0, dataset.1)

    return(dataset)
}
|
17
data-collection/indian-liver/config.yaml
Normal file
17
data-collection/indian-liver/config.yaml
Normal file
@ -0,0 +1,17 @@
|
||||
---
|
||||
name: ILPD (Indian Liver Patient Dataset)
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset)
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv
|
||||
|
||||
cite: >
|
||||
https://archive.ics.uci.edu/ml/citation_policy.html
|
||||
|
||||
@misc{Lichman:2013 ,
|
||||
author = "M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "http://archive.ics.uci.edu/ml",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences" }
|
14
data-collection/indian-liver/preprocess.R
Normal file
14
data-collection/indian-liver/preprocess.R
Normal file
@ -0,0 +1,14 @@
|
||||
# Preprocess the ILPD (Indian Liver Patient Dataset) CSV.
#
# The raw file has no header; column names are assigned here and the
# `Selector` class label is converted to a factor.
#
# Expects `orig.dir` to be defined by the calling script, with dplyr attached.
#
# Returns: a data frame with `Selector` as the last column.
preprocessDataset = function()
{
    csv.file = "Indian Liver Patient Dataset (ILPD).csv"

    # stringsAsFactors is set explicitly so the textual `Gender` column is a
    # factor on every R version (the default became FALSE in R 4.0.0).
    dataset = read.csv(paste0(orig.dir, "/", csv.file),
                       header=FALSE, stringsAsFactors=TRUE)

    colnames(dataset) = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt",
                          "Sgot", "TP", "ALB", "A/G Ratio", "Selector")

    dataset = dataset %>% mutate(Selector=factor(Selector))

    return(dataset)
}
|
18
data-collection/magic/config.yaml
Normal file
18
data-collection/magic/config.yaml
Normal file
@ -0,0 +1,18 @@
|
||||
---
|
||||
name: MAGIC Gamma Telescope
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.names
|
||||
|
||||
cite: >
|
||||
https://archive.ics.uci.edu/ml/citation_policy.html
|
||||
|
||||
@misc{Lichman:2013 ,
|
||||
author = "M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "http://archive.ics.uci.edu/ml",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences" }
|
13
data-collection/magic/preprocess.R
Normal file
13
data-collection/magic/preprocess.R
Normal file
@ -0,0 +1,13 @@
|
||||
# Preprocess the MAGIC Gamma Telescope dataset (magic04.data).
#
# The raw file has no header; this only assigns the column names.
#
# Expects `orig.dir` to be defined by the calling script.
#
# Returns: a data frame with `class` as the last column.
preprocessDataset = function()
{
    csv.file = "magic04.data"

    # stringsAsFactors is set explicitly so the textual `class` label is a
    # factor on every R version (the default became FALSE in R 4.0.0).
    dataset = read.csv(paste0(orig.dir, "/", csv.file),
                       header=FALSE, stringsAsFactors=TRUE)

    colnames(dataset) = c("fLength", "fWidth", "fSize", "fConc", "fConc1",
                          "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
                          "class")

    return(dataset)
}
|
10
data-collection/seismic-bumps/config.yaml
Normal file
10
data-collection/seismic-bumps/config.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
name: Seismic bumps
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/seismic-bumps
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff
|
||||
|
||||
cite: >
|
||||
Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of data collected by seismic hazard monitoring systems in coal mines. Archives of Mining Sciences, 55(1), 2010, 91-114.
|
30
data-collection/seismic-bumps/preprocess.R
Normal file
30
data-collection/seismic-bumps/preprocess.R
Normal file
@ -0,0 +1,30 @@
|
||||
# Preprocess the Seismic bumps dataset (seismic-bumps.arff).
#
# Drops the columns nbumps6..nbumps89 and converts the energy / pulse / bump
# count columns to integer.
#
# Expects `orig.dir` to be defined by the calling script, with dplyr and
# foreign attached.
#
# Returns: the resulting data frame.
preprocessDataset = function()
{
    #set.seed(SEED)

    raw.data = read.arff(paste0(orig.dir, "/", "seismic-bumps.arff"))

    dataset = select(raw.data, -c(nbumps6:nbumps89))

    integer.cols = c("genergy", "gpuls", "gdenergy", "gdpuls",
                     "nbumps", "nbumps2", "nbumps3", "nbumps4", "nbumps5",
                     "energy", "maxenergy")

    dataset[integer.cols] = lapply(dataset[integer.cols], as.integer)

    # Optional class re-sampling (disabled):
    #dataset.1 = dataset %>% filter(class == "1")
    #dataset.0 = dataset %>% filter(class == "0") %>%
    #    sample_n(nrow(dataset.1)*4)
    #
    #dataset = rbind(dataset.0, dataset.1)

    return(dataset)
}
|
19
data-collection/spambase/config.yaml
Normal file
19
data-collection/spambase/config.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
---
|
||||
name: Spambase
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/Spambase
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.DOCUMENTATION
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names
|
||||
|
||||
cite: >
|
||||
https://archive.ics.uci.edu/ml/citation_policy.html
|
||||
|
||||
@misc{Lichman:2013 ,
|
||||
author = "M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "http://archive.ics.uci.edu/ml",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences" }
|
33
data-collection/spambase/preprocess.R
Normal file
33
data-collection/spambase/preprocess.R
Normal file
@ -0,0 +1,33 @@
|
||||
# Preprocess the Spambase dataset (spambase.data).
#
# The raw file has no header. Columns are, in order: 48 word frequencies,
# 6 character frequencies, 3 capital-run-length statistics and the class
# label, which is converted to a factor.
#
# Expects `orig.dir` to be defined by the calling script.
#
# Returns: a data frame with `class` as the last column.
preprocessDataset = function()
{
    dataset = read.csv(paste0(orig.dir, "/", "spambase.data"),
                       header=FALSE)

    words = c("make", "address", "all", "3d", "our", "over", "remove",
              "internet", "order", "mail", "receive", "will", "people",
              "report", "addresses", "free", "business", "email", "you",
              "credit", "your", "font", "000", "money", "hp", "hpl",
              "george", "650", "lab", "labs", "telnet", "857", "data",
              "415", "85", "technology", "1999", "parts", "pm", "direct",
              "cs", "meeting", "original", "project", "re", "edu", "table",
              "conference")
    chars = c(";", "(", "[", "!", "$", "#")
    run.stats = c("average", "longest", "total")

    colnames(dataset) = c(paste0("word_freq_", words),
                          paste0("char_freq_", chars),
                          paste0("capital_run_length_", run.stats),
                          "class")

    dataset$class = factor(dataset$class)

    return(dataset)
}
|
13
data-collection/wine-quality/config.yaml
Normal file
13
data-collection/wine-quality/config.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
---
|
||||
name: Wine Quality
|
||||
|
||||
info: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
|
||||
|
||||
urls:
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
|
||||
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names
|
||||
|
||||
cite: >
|
||||
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
|
||||
Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
|
21
data-collection/wine-quality/preprocess.R
Normal file
21
data-collection/wine-quality/preprocess.R
Normal file
@ -0,0 +1,21 @@
|
||||
# Preprocess the Wine Quality dataset.
#
# Reads the white and red wine files, tags each row with its `color`, stacks
# them (white first), and binarises `quality` into a factor: 1 when the
# original score is greater than 5, otherwise 0.
#
# Expects `orig.dir` to be defined by the calling script, with dplyr attached.
#
# Returns: a data frame with columns `fixed acidity`..alcohol, color, quality.
preprocessDataset = function()
{
    # Load one semicolon-separated wine file and append its colour label.
    readWine = function(csv.file, wine.color)
    {
        wines = read.csv(paste0(orig.dir, "/", csv.file), sep=";",
                         check.names=FALSE)
        mutate(wines, color=wine.color)
    }

    whites = readWine("winequality-white.csv", "white")
    reds = readWine("winequality-red.csv", "red")

    stacked = rbind(whites, reds)
    stacked = mutate(stacked,
                     color=factor(color),
                     quality=ifelse(quality>5, 1, 0))

    dataset = select(stacked, `fixed acidity`:alcohol, color, quality)
    dataset = mutate(dataset, quality=factor(quality))

    return(dataset)
}
|
49
data-download.R
Normal file
49
data-download.R
Normal file
@ -0,0 +1,49 @@
|
||||
rm(list=ls())
|
||||
|
||||
source("config.R")
|
||||
source("utils.R")
|
||||
|
||||
library(RCurl)
|
||||
library(tools)
|
||||
library(yaml)
|
||||
|
||||
# Download every file listed under `urls` in each dataset's config.yaml into
# that dataset's "original" directory. Files that already exist are skipped;
# any download error is logged and aborts the script.
flog.info("Started downloading dataset collection")

for (dir.name in dir(PATH_DATASETS))
{
    config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)

    # dir() lists every entry, including plain files and unrelated folders;
    # only entries that ship a config file are treated as datasets.
    if (!file.exists(config.yaml.file))
    {
        flog.warn(paste0("No ", FILE_CONFIG_YAML, " found for ", dir.name,
                         "; skipping..."))
        next
    }

    flog.info(paste("Dataset:", dir.name))

    # Substitute the "*" placeholder of the path template with the dataset name.
    dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)

    urls.list = yaml.load_file(config.yaml.file)$urls

    mkdir(dest.dir)

    for (url in urls.list)
    {
        flog.info(paste("URL:", url))

        # URLs are percent-encoded (e.g. "%20"); the local file name is not.
        dest.file = URLdecode(basename(url))
        dest.file.path = paste0(dest.dir, dest.file)

        if (file.exists(dest.file.path))
        {
            flog.warn(paste("Target file", basename(dest.file.path),
                            "already exists; skipping..."))
            next
        }

        # NOTE(review): ssl.verifypeer=FALSE disables TLS certificate
        # verification, so the downloaded content is not authenticated.
        # Left unchanged here; consider pointing RCurl at a CA bundle
        # (cainfo) and enabling verification.
        raw.content = tryCatch(
            getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)),
            error = function(e)
            {
                # Log the message text rather than the condition object.
                flog.error("%s", conditionMessage(e))
                stop(e)
            }
        )

        writeBin(raw.content, dest.file.path)
    }

    flog.info("*****")
}

flog.info("Finished downloading dataset collection")
|
43
data-preprocess.R
Normal file
43
data-preprocess.R
Normal file
@ -0,0 +1,43 @@
|
||||
rm(list=ls())
|
||||
|
||||
source("config.R")
|
||||
source("utils.R")
|
||||
|
||||
library(dplyr)
|
||||
library(foreign)
|
||||
library(XLConnect)
|
||||
|
||||
# Run each dataset's preprocess.R and store the result as an RDS file in the
# dataset's "preprocessed" directory. Datasets whose output already exists
# are skipped.
flog.info("Started preprocessing dataset collection")

for (dir.name in dir(PATH_DATASETS))
{
    r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT)

    # dir() lists every entry, including plain files and unrelated folders;
    # only entries that ship a preprocessing script are treated as datasets.
    if (!file.exists(r.src.file))
    {
        flog.warn(paste0("No ", FILE_PREPROCESSING_SCRIPT, " found for ",
                         dir.name, "; skipping..."))
        next
    }

    flog.info(paste("Dataset:", dir.name))

    # Substitute the "*" placeholder of the path templates with the dataset
    # name. `orig.dir` is deliberately a global: the sourced
    # preprocessDataset() functions read it to locate their input files.
    orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
    dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
    dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT)

    if (file.exists(dest.file.path))
    {
        flog.warn(paste("Target file", basename(dest.file.path),
                        "already exists; skipping..."))
        flog.info("*****")
        next
    }

    # Each script (re)defines preprocessDataset() for its own dataset.
    source(r.src.file)
    dataset = preprocessDataset()

    printDatasetStatistics(dataset)

    mkdir(dest.dir)
    saveRDS(dataset, dest.file.path)

    flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT))

    flog.info("*****")
}

flog.info("Finished preprocessing dataset collection")
|
13
uci-ml-to-r.Rproj
Normal file
13
uci-ml-to-r.Rproj
Normal file
@ -0,0 +1,13 @@
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 4
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: pdfLaTeX
|
28
utils.R
Normal file
28
utils.R
Normal file
@ -0,0 +1,28 @@
|
||||
library(futile.logger)
|
||||
|
||||
# Create a directory if it does not exist yet.
#
# dest.dir: path of the directory to create. Missing parent directories are
#           created as well.
#
# Stops with an error when the directory cannot be created, instead of
# letting a later file write fail with a less specific message.
# Returns: NULL, invisibly.
mkdir = function(dest.dir)
{
    if (!dir.exists(dest.dir))
    {
        flog.debug(paste("Creating directory", dest.dir))

        created = dir.create(dest.dir, recursive=TRUE)

        if (!created)
        {
            stop("Unable to create directory ", dest.dir, call.=FALSE)
        }
    } else {
        flog.debug(paste("Target directory", dest.dir, "already exists"))
    }

    invisible(NULL)
}
|
||||
|
||||
# Log basic statistics of a preprocessed dataset.
#
# dataset: a data frame whose LAST column is treated as the class label;
#          all other columns are counted as attributes.
#
# Logs the number of cases, the number of attributes and the share of each
# class in percent (rounded to whole numbers), e.g. "classes: 63%/37%".
# Any number of classes is supported. A dataset without columns or without
# rows only produces a warning.
# Returns: NULL, invisibly.
printDatasetStatistics = function(dataset)
{
    if (ncol(dataset)==0 || nrow(dataset)==0) # for mockups
    {
        flog.warn("Empty dataset")
        return(invisible(NULL))
    }

    no.cases = nrow(dataset)
    no.attributes = ncol(dataset) - 1

    # [[ always yields the column vector itself, whatever the data frame
    # flavour, so table() counts label values.
    class.labels = dataset[[ncol(dataset)]]
    perc.classes = round(100*as.numeric(table(class.labels))/no.cases, 0)

    flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
                     ", classes: ", paste0(perc.classes, "%", collapse="/")))

    invisible(NULL)
}
|
Loading…
Reference in New Issue
Block a user