Added first version for 10 UCI datasets

This commit is contained in:
Andrzej Wójtowicz 2016-04-15 15:44:49 +02:00
parent ecc65c56f5
commit 744a4faa0f
26 changed files with 588 additions and 0 deletions

4
.gitignore vendored
View File

@ -16,3 +16,7 @@ vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# data collection
data-collection/*/original/*
data-collection/*/preprocessed/*

28
config.R Normal file
View File

@ -0,0 +1,28 @@
# ---- checkpoint ----
# Package versions are resolved through checkpoint() against the snapshot
# date below.
CHECKPOINT.MRAN.URL <- "http://mran.microsoft.com/snapshot/"
CHECKPOINT.SNAPSHOT.DATE <- "2016-04-10"

library(checkpoint)
options(checkpoint.mranUrl = CHECKPOINT.MRAN.URL)
checkpoint(CHECKPOINT.SNAPSHOT.DATE)

# ---- logger ----
LOGGER_LEVEL <- futile.logger::INFO

library(futile.logger)
flog.threshold(LOGGER_LEVEL)

# ---- other ----
# The "*" in the two path templates stands for a dataset directory name; the
# download and preprocessing scripts substitute it with gsub().
PATH_DATASETS <- "data-collection/"
PATH_DATASET_ORIGINAL <- paste0(PATH_DATASETS, "*/original/")
PATH_DATASET_PREPROCESSED <- paste0(PATH_DATASETS, "*/preprocessed/")

FILE_CONFIG_YAML <- "config.yaml"
FILE_PREPROCESSING_SCRIPT <- "preprocess.R"
FILE_PREPROCESSED_OUTPUT <- "dataset.rds"

# Optional local overrides: sourced last so they can replace any value above.
if (file.exists("config.R.user")) {
    source("config.R.user")
}

View File

@ -0,0 +1,10 @@
---
name: Bank Marketing
info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
cite: >
[Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

View File

@ -0,0 +1,50 @@
# Load and clean the UCI "Bank Marketing" dataset.
#
# Extracts bank-additional-full.csv from the downloaded archive into the
# session temporary directory, drops the `duration`, `pdays` and `default`
# columns, removes rows with "unknown" answers (and "illiterate" education),
# and turns `education`, `month` and `day_of_week` into ordered factors.
#
# Reads the global `orig.dir` (directory holding the archive; it is pasted
# directly in front of the file name, so it must end with a path separator).
# dplyr and futile.logger are expected to be attached by the caller.
#
# Returns the preprocessed data frame.
preprocessDataset <- function() {
    archive.name <- "bank-additional.zip"
    csv.in.archive <- "bank-additional/bank-additional-full.csv"
    extract.dir <- tempdir()

    education.order <- c("basic.4y", "basic.6y", "basic.9y", "high.school",
                         "professional.course", "university.degree")
    month.order <- c("jan", "feb", "mar", "apr", "may", "jun",
                     "jul", "aug", "sep", "oct", "nov", "dec")
    weekday.order <- c("mon", "tue", "wed", "thu", "fri")

    flog.debug(paste("Unzipping", archive.name))
    unzip(zipfile = paste0(orig.dir, archive.name),
          files = csv.in.archive,
          exdir = extract.dir)

    flog.debug(paste("Loading", csv.in.archive))
    raw <- read.csv(paste0(extract.dir, "/", csv.in.archive), sep = ";")

    flog.debug("Preprocessing loaded dataset")

    # Class balancing is sketched below but intentionally left disabled:
    #set.seed(SEED)
    #cleaned.yes = cleaned %>% filter(y == "yes")
    #cleaned.no  = cleaned %>% filter(y == "no") %>% sample_n(nrow(cleaned.yes))
    #cleaned = rbind(cleaned.yes, cleaned.no)

    raw %>%
        select(-c(duration, pdays, default)) %>%
        filter(job != "unknown",
               marital != "unknown",
               education != "unknown",
               education != "illiterate",
               housing != "unknown",
               loan != "unknown") %>%
        droplevels() %>%
        mutate(education = factor(education, levels = education.order,
                                  ordered = TRUE),
               month = factor(month, levels = month.order, ordered = TRUE),
               day_of_week = factor(day_of_week, levels = weekday.order,
                                    ordered = TRUE))
}

View File

@ -0,0 +1,18 @@
---
name: Breast Cancer Wisconsin (Diagnostic)
info: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names
cite: >
https://archive.ics.uci.edu/ml/citation_policy.html
@misc{Lichman:2013 ,
author = "M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences" }

View File

@ -0,0 +1,18 @@
# Load the Breast Cancer Wisconsin (Diagnostic) data from wdbc.data.
#
# The file has no header, so column names are assigned here: "id",
# "diagnosis", then one "<statistic> <feature>" name for every combination of
# the three statistics and ten features, with the feature varying fastest.
# The id column is dropped and `diagnosis` is moved to the last position.
#
# Reads the global `orig.dir`; dplyr is expected to be attached by the caller.
# Returns the resulting data frame.
preprocessDataset <- function() {
    features <- c("radius", "texture", "perimeter",
                  "area", "smoothness", "compactness",
                  "concavity", "concave points",
                  "symmetry", "fractal dimension")
    statistics <- c("mean", "se", "worst")

    # All features for "mean", then all for "se", then all for "worst".
    measurement.names <- paste(rep(statistics, each = length(features)),
                               rep(features, times = length(statistics)))

    measurements <- read.csv(paste0(orig.dir, "/", "wdbc.data"),
                             header = FALSE)
    colnames(measurements) <- c("id", "diagnosis", measurement.names)

    measurements %>%
        select(`mean radius`:`worst fractal dimension`, diagnosis)
}

View File

@ -0,0 +1,11 @@
---
name: Breast Cancer Wisconsin (Original)
info: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
cite: >
O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

View File

@ -0,0 +1,20 @@
# Load and clean the Breast Cancer Wisconsin (Original) data.
#
# The file has no header, so column names are assigned here. The sample code
# column is dropped, rows whose `Bare Nuclei` value is the "?" placeholder are
# removed, `Class` becomes a factor and `Bare Nuclei` becomes an integer.
#
# Reads the global `orig.dir`; dplyr is expected to be attached by the caller.
# Returns the preprocessed data frame.
preprocessDataset = function()
{
    csv.file = "breast-cancer-wisconsin.data"

    dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE)

    colnames(dataset) = c("Sample code number", "Clump Thickness",
                          "Uniformity of Cell Size", "Uniformity of Cell Shape",
                          "Marginal Adhesion", "Single Epithelial Cell Size",
                          "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
                          "Mitoses", "Class")

    dataset = dataset %>% select(-`Sample code number`) %>%
        filter(`Bare Nuclei` != "?") %>%
        mutate(Class=factor(Class),
               # Because of the "?" entries the column is read as text (a
               # factor when strings are converted to factors). as.integer()
               # on a factor returns the internal level codes rather than the
               # recorded values, so convert through character first.
               `Bare Nuclei`=as.integer(as.character(`Bare Nuclei`))) %>%
        droplevels()

    return(dataset)
}

View File

@ -0,0 +1,10 @@
---
name: Cardiotocography
info: https://archive.ics.uci.edu/ml/datasets/Cardiotocography
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/00193/CTG.xls
cite: >
Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of Cardiotocograms. J Matern Fetal Med 5:311-318

View File

@ -0,0 +1,40 @@
# Load and clean the Cardiotocography data from the "Raw Data" sheet of
# CTG.xls.
#
# Keeps columns LB..FS (without DS and DR) plus NSP, drops incomplete rows,
# converts the listed count-like columns to integer, the pattern columns to
# factors, `Tendency` to an ordered factor (-1 < 0 < 1), and recodes NSP
# value 2 to 3 before turning NSP into a factor.
#
# Reads the global `orig.dir`; dplyr and XLConnect are expected to be attached
# by the caller. Returns the preprocessed data frame.
preprocessDataset <- function() {
    integer.cols <- c("LB", "AC", "FM", "UC", "ASTV", "ALTV", "DL", "DP",
                      "Width", "Min", "Max", "Nmax", "Nzeros", "Mode",
                      "Mean", "Median", "Variance")
    nominal.cols <- c("A", "B", "C", "D", "E", "AD", "DE", "LD", "FS")

    workbook <- loadWorkbook(paste0(orig.dir, "/", "CTG.xls"))
    sheet <- readWorksheet(workbook, sheet = "Raw Data")

    records <- sheet %>%
        select(LB:FS, NSP, -c(DS, DR)) %>%
        filter(complete.cases(.))

    # Convert in place so that the column order is left untouched.
    records[integer.cols] <- lapply(records[integer.cols], as.integer)
    records[nominal.cols] <- lapply(records[nominal.cols], factor)
    records$Tendency <- factor(records$Tendency, levels = c(-1, 0, 1),
                               ordered = TRUE)
    records$NSP <- factor(replace(records$NSP, records$NSP == 2, 3))

    records
}

View File

@ -0,0 +1,10 @@
---
name: Default of credit card clients
info: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
cite: >
Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.

View File

@ -0,0 +1,48 @@
# Load and clean the "Default of credit card clients" data from the "Data"
# sheet of the original workbook (read from row 2, column 2 onwards).
#
# Amount and age columns become integers; SEX, EDUCATION, MARRIAGE and the
# target `default payment next month` become factors; negative values in the
# PAY_0, PAY_2..PAY_6 columns are replaced by 0 before integer conversion.
#
# Reads the global `orig.dir`; XLConnect is expected to be attached by the
# caller. Returns the preprocessed data frame.
preprocessDataset <- function() {
    workbook <- loadWorkbook(paste0(orig.dir, "/",
                                    "default of credit card clients.xls"))
    clients <- readWorksheet(workbook, sheet = "Data", startRow = 2,
                             startCol = 2, check.names = FALSE)

    delay.cols <- paste0("PAY_", c(0, 2:6))
    integer.cols <- c("LIMIT_BAL", "AGE",
                      paste0("BILL_AMT", 1:6),
                      paste0("PAY_AMT", 1:6))
    # EDUCATION stays unordered: its codes are inconsistent with the UCI
    # description, so no trustworthy ordering can be imposed.
    nominal.cols <- c("SEX", "EDUCATION", "MARRIAGE",
                      "default payment next month")

    clients[integer.cols] <- lapply(clients[integer.cols], as.integer)
    clients[nominal.cols] <- lapply(clients[nominal.cols], factor)
    clients[delay.cols] <- lapply(clients[delay.cols], function(delay) {
        as.integer(replace(delay, delay < 0, 0))
    })

    # Class balancing is sketched below but intentionally left disabled:
    #set.seed(SEED)
    #clients.1 = clients %>% filter(`default payment next month` == 1)
    #clients.0 = clients %>% filter(`default payment next month` == 0) %>%
    #    sample_n(nrow(clients.1))
    #clients = rbind(clients.0, clients.1)

    clients
}

View File

@ -0,0 +1,17 @@
---
name: ILPD (Indian Liver Patient Dataset)
info: https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset)
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv
cite: >
https://archive.ics.uci.edu/ml/citation_policy.html
@misc{Lichman:2013 ,
author = "M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences" }

View File

@ -0,0 +1,14 @@
# Load the Indian Liver Patient Dataset (ILPD).
#
# The CSV file has no header, so column names are assigned here; the last
# column, `Selector`, is converted to a factor.
#
# Reads the global `orig.dir`. Returns the resulting data frame.
preprocessDataset <- function() {
    source.path <- paste0(orig.dir, "/",
                          "Indian Liver Patient Dataset (ILPD).csv")

    patients <- read.csv(source.path, header = FALSE)
    names(patients) <- c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt",
                         "Sgot", "TP", "ALB", "A/G Ratio", "Selector")
    patients$Selector <- factor(patients$Selector)

    patients
}

View File

@ -0,0 +1,18 @@
---
name: MAGIC Gamma Telescope
info: https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
- https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.names
cite: >
https://archive.ics.uci.edu/ml/citation_policy.html
@misc{Lichman:2013 ,
author = "M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences" }

View File

@ -0,0 +1,13 @@
# Load the MAGIC Gamma Telescope data from magic04.data.
#
# The file has no header; the eleven column names are assigned here and no
# other transformation is applied.
#
# Reads the global `orig.dir`. Returns the resulting data frame.
preprocessDataset <- function() {
    column.names <- c("fLength", "fWidth", "fSize", "fConc", "fConc1",
                      "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
                      "class")

    events <- read.csv(paste0(orig.dir, "/", "magic04.data"), header = FALSE)

    setNames(events, column.names)
}

View File

@ -0,0 +1,10 @@
---
name: Seismic bumps
info: https://archive.ics.uci.edu/ml/datasets/seismic-bumps
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff
cite: >
Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of data collected by seismic hazard monitoring systems in coal mines. Archives of Mining Sciences, 55(1), 2010, 91-114.

View File

@ -0,0 +1,30 @@
# Load and clean the Seismic bumps data from seismic-bumps.arff.
#
# Drops the columns from `nbumps6` through `nbumps89` and converts the listed
# energy, pulse and bump-count columns to integer.
#
# Reads the global `orig.dir`; dplyr and foreign are expected to be attached
# by the caller. Returns the preprocessed data frame.
preprocessDataset <- function() {
    integer.cols <- c("genergy", "gpuls", "gdenergy", "gdpuls",
                      "nbumps", "nbumps2", "nbumps3", "nbumps4", "nbumps5",
                      "energy", "maxenergy")

    readings <- read.arff(paste0(orig.dir, "/", "seismic-bumps.arff"))
    readings <- readings %>% select(-c(nbumps6:nbumps89))

    # Convert in place so that the column order is left untouched.
    readings[integer.cols] <- lapply(readings[integer.cols], as.integer)

    # Class balancing is sketched below but intentionally left disabled:
    #set.seed(SEED)
    #readings.1 = readings %>% filter(class == "1")
    #readings.0 = readings %>% filter(class == "0") %>%
    #    sample_n(nrow(readings.1)*4)
    #readings = rbind(readings.0, readings.1)

    readings
}

View File

@ -0,0 +1,19 @@
---
name: Spambase
info: https://archive.ics.uci.edu/ml/datasets/Spambase
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.DOCUMENTATION
- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names
cite: >
https://archive.ics.uci.edu/ml/citation_policy.html
@misc{Lichman:2013 ,
author = "M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences" }

View File

@ -0,0 +1,33 @@
# Load the Spambase data from spambase.data.
#
# The file has no header; the 58 column names are assigned here (48
# "word_freq_*", 6 "char_freq_*", 3 "capital_run_length_*" and "class"), and
# `class` is converted to a factor.
#
# Reads the global `orig.dir`; dplyr is expected to be attached by the caller.
# Returns the resulting data frame.
preprocessDataset <- function() {
    words <- c("make", "address", "all", "3d", "our", "over", "remove",
               "internet", "order", "mail", "receive", "will", "people",
               "report", "addresses", "free", "business", "email", "you",
               "credit", "your", "font", "000", "money", "hp", "hpl",
               "george", "650", "lab", "labs", "telnet", "857", "data",
               "415", "85", "technology", "1999", "parts", "pm", "direct",
               "cs", "meeting", "original", "project", "re", "edu", "table",
               "conference")
    characters <- c(";", "(", "[", "!", "$", "#")
    run.lengths <- c("average", "longest", "total")

    column.names <- c(paste0("word_freq_", words),
                      paste0("char_freq_", characters),
                      paste0("capital_run_length_", run.lengths),
                      "class")

    messages <- read.csv(paste0(orig.dir, "/", "spambase.data"),
                         header = FALSE)
    colnames(messages) <- column.names

    messages %>% mutate(class = factor(class))
}

View File

@ -0,0 +1,13 @@
---
name: Wine Quality
info: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
urls:
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names
cite: >
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

View File

@ -0,0 +1,21 @@
# Load and combine the white and red Wine Quality files.
#
# Each file is tagged with a `color` column, the white rows are stacked on top
# of the red rows, `quality` is binarised (1 when greater than 5, otherwise 0)
# and both `color` and `quality` are converted to factors. The columns are
# ordered as `fixed acidity`..alcohol, color, quality.
#
# Reads the global `orig.dir`; dplyr is expected to be attached by the caller.
# Returns the combined data frame.
preprocessDataset <- function() {
    # Read one semicolon-separated wine file and tag its rows with the color.
    read.wine <- function(file.name, wine.color) {
        wines <- read.csv(paste0(orig.dir, "/", file.name), sep = ";",
                          check.names = FALSE)
        wines %>% mutate(color = wine.color)
    }

    white.wines <- read.wine("winequality-white.csv", "white")
    red.wines <- read.wine("winequality-red.csv", "red")

    rbind(white.wines, red.wines) %>%
        mutate(color = factor(color),
               quality = factor(ifelse(quality > 5, 1, 0))) %>%
        select(`fixed acidity`:alcohol, color, quality)
}

49
data-download.R Normal file
View File

@ -0,0 +1,49 @@
# Download the original files of every dataset in the collection.
#
# For each entry of PATH_DATASETS the `urls` list of its YAML configuration
# file is read and every URL is saved into the dataset's "original" directory
# under its URL-decoded base name. Files that are already present are skipped.
# A failed download is logged and aborts the script.
rm(list=ls())

source("config.R")
source("utils.R")

library(RCurl)
library(tools)
library(yaml)

flog.info("Started downloading dataset collection")

for (dir.name in dir(PATH_DATASETS))
{
    flog.info(paste("Dataset:", dir.name))

    # Replace the "*" placeholder of the path template with the dataset name.
    dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
    config.yaml.file = paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)

    urls.list = yaml.load_file(config.yaml.file)$urls

    mkdir(dest.dir)

    for (url in urls.list)
    {
        flog.info(paste("URL:", url))

        dest.file = URLdecode(basename(url))
        dest.file.path = paste0(dest.dir, dest.file)

        if (file.exists(dest.file.path))
        {
            flog.warn(paste("Target file", basename(dest.file.path),
                            "already exists; skipping..."))
            next
        }

        # NOTE(review): ssl.verifypeer=FALSE turns off TLS certificate
        # verification, so the downloaded files are not protected against
        # tampering in transit. Left unchanged here; confirm whether
        # verification can be enabled in the target environment.
        tryCatch(
            raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)),
            error = function(e)
            {
                # Log the textual message, not the condition object itself,
                # then re-raise the original condition to abort the script.
                flog.error(paste("Download of", url, "failed:",
                                 conditionMessage(e)))
                stop(e)
            }
        )

        # The file is written only after the download completed successfully.
        writeBin(raw.content, dest.file.path)
    }

    flog.info("*****")
}

flog.info("Finished downloading dataset collection")

43
data-preprocess.R Normal file
View File

@ -0,0 +1,43 @@
# Preprocess every dataset in the collection.
#
# For each entry of PATH_DATASETS the dataset's preprocessing script is
# sourced (it defines preprocessDataset()), the function is run, summary
# statistics are logged and the result is stored as an RDS file in the
# dataset's "preprocessed" directory. Datasets whose output file already
# exists are skipped.
rm(list = ls())

source("config.R")
source("utils.R")

library(dplyr)
library(foreign)
library(XLConnect)

flog.info("Started preprocessing dataset collection")

for (dir.name in dir(PATH_DATASETS)) {
    flog.info(paste("Dataset:", dir.name))

    # `orig.dir` is deliberately a global: the sourced preprocessDataset()
    # functions read it to locate their input files.
    orig.dir <- gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
    dest.dir <- gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
    output.path <- paste0(dest.dir, FILE_PREPROCESSED_OUTPUT)

    if (file.exists(output.path)) {
        flog.warn(paste("Target file", basename(output.path),
                        "already exists; skipping..."))
    } else {
        script.path <- paste0(PATH_DATASETS, dir.name, "/",
                              FILE_PREPROCESSING_SCRIPT)
        source(script.path)

        dataset <- preprocessDataset()
        printDatasetStatistics(dataset)

        mkdir(dest.dir)
        saveRDS(dataset, output.path)

        flog.info(paste("Created preprocessed file",
                        FILE_PREPROCESSED_OUTPUT))
    }

    flog.info("*****")
}

flog.info("Finished preprocessing dataset collection")

13
uci-ml-to-r.Rproj Normal file
View File

@ -0,0 +1,13 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX

28
utils.R Normal file
View File

@ -0,0 +1,28 @@
library(futile.logger)
# Create the directory `dest.dir` unless it already exists.
#
# Missing parent directories are created as well, so the function also works
# for nested paths whose parents are not there yet.
#
# dest.dir: path of the directory to create.
#
# Returns (invisibly) TRUE when the directory exists afterwards, FALSE
# otherwise; dir.create() itself warns when the creation fails.
mkdir = function(dest.dir)
{
    if (!dir.exists(dest.dir))
    {
        flog.debug(paste("Creating directory", dest.dir))
        dir.create(dest.dir, recursive=TRUE)
    } else {
        flog.debug(paste("Target directory", dest.dir, "already exists"))
    }

    invisible(dir.exists(dest.dir))
}
# Log the size and the class distribution of a preprocessed dataset.
#
# The last column of `dataset` is treated as the class column; all other
# columns are counted as attributes. The share of every class is reported as
# a whole-number percentage, separated by "/" (for example "75%/25%").
#
# dataset: data frame whose last column holds the class labels.
#
# Returns NULL invisibly; the function is called for its log output. A
# dataset without columns or without rows only produces a warning.
printDatasetStatistics = function(dataset)
{
    if (ncol(dataset)==0 || nrow(dataset)==0) # for mockups
    {
        flog.warn("Empty dataset")
        return(invisible(NULL))
    }

    no.cases = nrow(dataset)
    no.attributes = ncol(dataset) - 1

    # [[ ]] always yields the column itself, also for tibbles.
    class.column = dataset[[ncol(dataset)]]
    perc.classes = round(100*as.numeric(table(class.column))/no.cases, 0)

    # Report every class, not just the first two.
    flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
                     ", classes: ", paste0(perc.classes, "%", collapse="/")))

    invisible(NULL)
}