diff --git a/.gitignore b/.gitignore
index 6a2e391..f1647e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,7 @@ vignettes/*.pdf
 
 # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 .httr-oauth
+# data collection
+
+data-collection/*/original/*
+data-collection/*/preprocessed/*
diff --git a/config.R b/config.R
new file mode 100644
index 0000000..ce2b693
--- /dev/null
+++ b/config.R
@@ -0,0 +1,35 @@
+# ---- checkpoint ----
+# Runs first: checkpoint() is called before any other package is attached.
+
+CHECKPOINT.MRAN.URL = "http://mran.microsoft.com/snapshot/"
+CHECKPOINT.SNAPSHOT.DATE = "2016-04-10"
+
+library(checkpoint)
+options(checkpoint.mranUrl=CHECKPOINT.MRAN.URL)
+checkpoint(CHECKPOINT.SNAPSHOT.DATE)
+
+# ---- logger ----
+# LOGGER_LEVEL is the threshold handed to flog.threshold().
+
+LOGGER_LEVEL = futile.logger::INFO
+
+library(futile.logger)
+flog.threshold(LOGGER_LEVEL)
+
+# ---- other ----
+# The "*" in the two path templates is a placeholder for a dataset
+# directory name.
+
+PATH_DATASETS = "data-collection/"
+PATH_DATASET_ORIGINAL = paste0(PATH_DATASETS, "*/original/")
+PATH_DATASET_PREPROCESSED = paste0(PATH_DATASETS, "*/preprocessed/")
+
+FILE_CONFIG_YAML = "config.yaml"
+FILE_PREPROCESSING_SCRIPT = "preprocess.R"
+FILE_PREPROCESSED_OUTPUT = "dataset.rds"
+
+# Optional local overrides; sourced last so they can replace the values above.
+if (file.exists("config.R.user"))
+{
+    source("config.R.user")
+}
diff --git a/data-collection/bank-marketing/config.yaml b/data-collection/bank-marketing/config.yaml
new file mode 100644
index 0000000..24a8c9c
--- /dev/null
+++ b/data-collection/bank-marketing/config.yaml
@@ -0,0 +1,10 @@
+---
+name: Bank Marketing
+
+info: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
+
+cite: >
+  [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing.
Decision Support Systems, Elsevier, 62:22-31, June 2014
diff --git a/data-collection/bank-marketing/preprocess.R b/data-collection/bank-marketing/preprocess.R
new file mode 100644
index 0000000..4fdb407
--- /dev/null
+++ b/data-collection/bank-marketing/preprocess.R
@@ -0,0 +1,56 @@
+preprocessDataset = function()
+{
+    # Loads bank-additional-full.csv from the downloaded zip archive, drops
+    # the duration, pdays and default columns, removes rows with "unknown"
+    # (or "illiterate") values and converts education, month and day_of_week
+    # to ordered factors. Reads the global `orig.dir` set by the calling
+    # script. Returns the resulting data frame.
+
+    #set.seed(SEED)
+
+    archive.name = "bank-additional.zip"
+    csv.in.archive = "bank-additional/bank-additional-full.csv"
+    extract.dir = tempdir()
+
+    education.levels = c("basic.4y", "basic.6y", "basic.9y", "high.school",
+                         "professional.course", "university.degree")
+    month.levels = c("jan", "feb", "mar", "apr", "may", "jun",
+                     "jul", "aug", "sep", "oct", "nov", "dec")
+    weekday.levels = c("mon", "tue", "wed", "thu", "fri")
+
+    flog.debug(paste("Unzipping", archive.name))
+    unzip(zipfile=paste0(orig.dir, archive.name),
+          files=csv.in.archive,
+          exdir=extract.dir)
+
+    flog.debug(paste("Loading", csv.in.archive))
+    csv.path = paste0(extract.dir, "/", csv.in.archive)
+    dataset = read.csv(csv.path, sep=";")
+
+    flog.debug("Preprocessing loaded dataset")
+
+    # filter() combines its conditions with a logical AND
+    dataset = dataset %>%
+        select(-c(duration, pdays, default)) %>%
+        filter(job != "unknown",
+               marital != "unknown",
+               education != "unknown",
+               education != "illiterate",
+               housing != "unknown",
+               loan != "unknown") %>%
+        droplevels()
+
+    #dataset.yes = dataset %>% filter(y == "yes")
+    #dataset.no = dataset %>% filter(y == "no") %>% sample_n(nrow(dataset.yes))
+    #
+    #dataset = rbind(dataset.yes, dataset.no)
+
+    dataset = dataset %>%
+        mutate(education=factor(education, levels=education.levels,
+                                ordered=TRUE),
+               month=factor(month, levels=month.levels, ordered=TRUE),
+               day_of_week=factor(day_of_week, levels=weekday.levels,
+                                  ordered=TRUE))
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/breast-cancer-wisconsin-diagnostic/config.yaml b/data-collection/breast-cancer-wisconsin-diagnostic/config.yaml
new file mode 100644
index 0000000..c431dd3
--- /dev/null
+++ b/data-collection/breast-cancer-wisconsin-diagnostic/config.yaml
@@ -0,0 +1,18 @@
+---
+name: Breast Cancer Wisconsin (Diagnostic)
+
+info: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
+- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names
+
+cite: >
+  https://archive.ics.uci.edu/ml/citation_policy.html
+
+  @misc{Lichman:2013 ,
+    author = "M. Lichman",
+    year = "2013",
+    title = "{UCI} Machine Learning Repository",
+    url = "http://archive.ics.uci.edu/ml",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" }
diff --git a/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R b/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R
new file mode 100644
index 0000000..8c1ad2e
--- /dev/null
+++ b/data-collection/breast-cancer-wisconsin-diagnostic/preprocess.R
@@ -0,0 +1,25 @@
+preprocessDataset = function()
+{
+    # Loads wdbc.data (no header row) from the global `orig.dir`, names its
+    # columns "id", "diagnosis" and "<statistic> <feature>" for every
+    # statistic/feature pair, then returns the measurement columns followed
+    # by diagnosis (the id column is dropped).
+
+    features = c("radius", "texture", "perimeter", "area", "smoothness",
+                 "compactness", "concavity", "concave points", "symmetry",
+                 "fractal dimension")
+    statistics = c("mean", "se", "worst")
+
+    # column-major unrolling of the feature x statistic grid: all "mean ..."
+    # names first, then "se ...", then "worst ..."
+    measurement.names = as.vector(outer(features, statistics,
+                                        function(f, s) paste(s, f)))
+
+    dataset = read.csv(paste0(orig.dir, "/", "wdbc.data"), header=FALSE)
+    colnames(dataset) = c("id", "diagnosis", measurement.names)
+
+    dataset = select(dataset,
+                     `mean radius`:`worst fractal dimension`, diagnosis)
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/breast-cancer-wisconsin-original/config.yaml b/data-collection/breast-cancer-wisconsin-original/config.yaml
new file mode 100644
index 0000000..90d0a0d
--- /dev/null
+++ b/data-collection/breast-cancer-wisconsin-original/config.yaml
@@ -0,0 +1,11 @@
+---
+name: Breast Cancer Wisconsin (Original)
+
+info: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
+-
https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
+
+cite: >
+  O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.
diff --git a/data-collection/breast-cancer-wisconsin-original/preprocess.R b/data-collection/breast-cancer-wisconsin-original/preprocess.R
new file mode 100644
index 0000000..ce78758
--- /dev/null
+++ b/data-collection/breast-cancer-wisconsin-original/preprocess.R
@@ -0,0 +1,30 @@
+preprocessDataset = function()
+{
+    # Loads breast-cancer-wisconsin.data (no header row) from the global
+    # `orig.dir`, names the columns, drops the sample id, removes rows whose
+    # "Bare Nuclei" value is the "?" placeholder and returns the data frame
+    # with Class as a factor and "Bare Nuclei" as integers.
+
+    csv.file = "breast-cancer-wisconsin.data"
+
+    dataset = read.csv(paste0(orig.dir, "/", csv.file), header=FALSE)
+
+    colnames(dataset) = c("Sample code number", "Clump Thickness",
+                          "Uniformity of Cell Size", "Uniformity of Cell Shape",
+                          "Marginal Adhesion", "Single Epithelial Cell Size",
+                          "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli",
+                          "Mitoses", "Class")
+
+    # Because of the "?" entries read.csv() cannot parse "Bare Nuclei" as a
+    # number; when it arrives as a factor, as.integer() alone would return
+    # the level codes rather than the recorded values, so the column is
+    # converted through as.character() first (also correct if it is already
+    # a character vector).
+    dataset = dataset %>% select(-`Sample code number`) %>%
+        filter(`Bare Nuclei` != "?") %>%
+        mutate(Class=factor(Class),
+               `Bare Nuclei`=as.integer(as.character(`Bare Nuclei`))) %>%
+        droplevels()
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/cardiotocography/config.yaml b/data-collection/cardiotocography/config.yaml
new file mode 100644
index 0000000..2fa6b2b
--- /dev/null
+++ b/data-collection/cardiotocography/config.yaml
@@ -0,0 +1,10 @@
+---
+name: Cardiotocography
+
+info: https://archive.ics.uci.edu/ml/datasets/Cardiotocography
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/00193/CTG.xls
+
+cite: >
+  Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of Cardiotocograms.
J Matern Fetal Med 5:311-318
diff --git a/data-collection/cardiotocography/preprocess.R b/data-collection/cardiotocography/preprocess.R
new file mode 100644
index 0000000..1fa46db
--- /dev/null
+++ b/data-collection/cardiotocography/preprocess.R
@@ -0,0 +1,29 @@
+preprocessDataset = function()
+{
+    # Loads the "Raw Data" sheet of CTG.xls from the global `orig.dir`, keeps
+    # columns LB..FS (without DS and DR) plus NSP, drops incomplete rows and
+    # converts the columns to integers / factors. NSP value 2 is recoded to 3
+    # before NSP becomes a factor. Returns the resulting data frame.
+
+    integer.columns = c("LB", "AC", "FM", "UC", "ASTV", "ALTV", "DL", "DP",
+                        "Width", "Min", "Max", "Nmax", "Nzeros", "Mode",
+                        "Mean", "Median", "Variance")
+    factor.columns = c("A", "B", "C", "D", "E", "AD", "DE", "LD", "FS")
+
+    workbook = loadWorkbook(paste0(orig.dir, "/", "CTG.xls"))
+    raw.data = readWorksheet(workbook, sheet="Raw Data")
+
+    dataset = raw.data %>%
+        select(LB:FS, NSP, -c(DS, DR)) %>%
+        filter(complete.cases(.))
+
+    # same conversion for every listed column
+    dataset[integer.columns] = lapply(dataset[integer.columns], as.integer)
+    dataset[factor.columns] = lapply(dataset[factor.columns], factor)
+
+    dataset = dataset %>%
+        mutate(Tendency=factor(Tendency, levels=c(-1,0,1), ordered=TRUE),
+               NSP=factor(replace(NSP, NSP==2, 3)))
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/credit-card/config.yaml b/data-collection/credit-card/config.yaml
new file mode 100644
index 0000000..ff21096
--- /dev/null
+++ b/data-collection/credit-card/config.yaml
@@ -0,0 +1,10 @@
+---
+name: Default of credit card clients
+
+info: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
+
+cite: >
+  Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.
diff --git a/data-collection/credit-card/preprocess.R b/data-collection/credit-card/preprocess.R
new file mode 100644
index 0000000..9e2ffeb
--- /dev/null
+++ b/data-collection/credit-card/preprocess.R
@@ -0,0 +1,39 @@
+preprocessDataset = function()
+{
+    # Loads the "Data" sheet of the credit card workbook from the global
+    # `orig.dir` (reading starts at row 2, column 2), replaces negative
+    # PAY_0/PAY_2..PAY_6 values with 0 and converts the columns to
+    # integers / factors. Returns the resulting data frame.
+
+    #set.seed(SEED)
+
+    xls.file = "default of credit card clients.xls"
+
+    status.columns = paste0("PAY_", c(0, 2:6))
+    integer.columns = c("LIMIT_BAL", "AGE",
+                        paste0("BILL_AMT", 1:6), paste0("PAY_AMT", 1:6))
+    # EDUCATION: can't order due to inconsistency with UCI description
+    factor.columns = c("SEX", "EDUCATION", "MARRIAGE",
+                       "default payment next month")
+
+    workbook = loadWorkbook(paste0(orig.dir, "/", xls.file))
+    dataset = readWorksheet(workbook, sheet="Data", startRow=2, startCol=2,
+                            check.names=FALSE)
+
+    clampToInteger = function(status)
+    {
+        as.integer(replace(status, status<0, 0))
+    }
+
+    dataset[status.columns] = lapply(dataset[status.columns], clampToInteger)
+    dataset[integer.columns] = lapply(dataset[integer.columns], as.integer)
+    dataset[factor.columns] = lapply(dataset[factor.columns], factor)
+
+    #dataset.1 = dataset %>% filter(`default payment next month` == 1)
+    #dataset.0 = dataset %>% filter(`default payment next month` == 0) %>%
+    #    sample_n(nrow(dataset.1))
+    #
+    #dataset = rbind(dataset.0, dataset.1)
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/indian-liver/config.yaml b/data-collection/indian-liver/config.yaml
new file mode 100644
index 0000000..5120d48
--- /dev/null
+++ b/data-collection/indian-liver/config.yaml
@@ -0,0 +1,17 @@
+---
+name: ILPD (Indian Liver
Patient Dataset)
+
+info: https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset)
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv
+
+cite: >
+  https://archive.ics.uci.edu/ml/citation_policy.html
+
+  @misc{Lichman:2013 ,
+    author = "M. Lichman",
+    year = "2013",
+    title = "{UCI} Machine Learning Repository",
+    url = "http://archive.ics.uci.edu/ml",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" }
diff --git a/data-collection/indian-liver/preprocess.R b/data-collection/indian-liver/preprocess.R
new file mode 100644
index 0000000..c149efa
--- /dev/null
+++ b/data-collection/indian-liver/preprocess.R
@@ -0,0 +1,17 @@
+preprocessDataset = function()
+{
+    # Loads the ILPD csv file (no header row) from the global `orig.dir`,
+    # names its eleven columns and returns the data frame with Selector
+    # converted to a factor.
+
+    column.names = c("Age", "Gender", "TB", "DB", "Alkphos", "Sgpt", "Sgot",
+                     "TP", "ALB", "A/G Ratio", "Selector")
+
+    csv.path = paste0(orig.dir, "/", "Indian Liver Patient Dataset (ILPD).csv")
+    dataset = read.csv(csv.path, header=FALSE)
+    colnames(dataset) = column.names
+
+    dataset$Selector = factor(dataset$Selector)
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/magic/config.yaml b/data-collection/magic/config.yaml
new file mode 100644
index 0000000..c24425c
--- /dev/null
+++ b/data-collection/magic/config.yaml
@@ -0,0 +1,18 @@
+---
+name: MAGIC Gamma Telescope
+
+info: https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
+- https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.names
+
+cite: >
+  https://archive.ics.uci.edu/ml/citation_policy.html
+
+  @misc{Lichman:2013 ,
+    author = "M.
Lichman",
+    year = "2013",
+    title = "{UCI} Machine Learning Repository",
+    url = "http://archive.ics.uci.edu/ml",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" }
diff --git a/data-collection/magic/preprocess.R b/data-collection/magic/preprocess.R
new file mode 100644
index 0000000..5abe672
--- /dev/null
+++ b/data-collection/magic/preprocess.R
@@ -0,0 +1,13 @@
+preprocessDataset = function()
+{
+    # Loads magic04.data (no header row) from the global `orig.dir` and
+    # returns it with the ten feature columns and the class column named.
+
+    feature.names = c("fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym",
+                      "fM3Long", "fM3Trans", "fAlpha", "fDist")
+
+    dataset = read.csv(paste0(orig.dir, "/", "magic04.data"), header=FALSE)
+    colnames(dataset) = c(feature.names, "class")
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/seismic-bumps/config.yaml b/data-collection/seismic-bumps/config.yaml
new file mode 100644
index 0000000..e6e46f6
--- /dev/null
+++ b/data-collection/seismic-bumps/config.yaml
@@ -0,0 +1,10 @@
+---
+name: Seismic bumps
+
+info: https://archive.ics.uci.edu/ml/datasets/seismic-bumps
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff
+
+cite: >
+  Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of data collected by seismic hazard monitoring systems in coal mines. Archives of Mining Sciences, 55(1), 2010, 91-114.
diff --git a/data-collection/seismic-bumps/preprocess.R b/data-collection/seismic-bumps/preprocess.R
new file mode 100644
index 0000000..a4b1e5d
--- /dev/null
+++ b/data-collection/seismic-bumps/preprocess.R
@@ -0,0 +1,26 @@
+preprocessDataset = function()
+{
+    # Loads seismic-bumps.arff from the global `orig.dir`, drops the columns
+    # nbumps6..nbumps89 and converts the energy / pulse / bump-count columns
+    # listed below to integers. Returns the resulting data frame.
+
+    #set.seed(SEED)
+
+    integer.columns = c("genergy", "gpuls", "gdenergy", "gdpuls", "nbumps",
+                        "nbumps2", "nbumps3", "nbumps4", "nbumps5", "energy",
+                        "maxenergy")
+
+    dataset = read.arff(paste0(orig.dir, "/", "seismic-bumps.arff"))
+
+    dataset = select(dataset, -c(nbumps6:nbumps89))
+    dataset[integer.columns] = lapply(dataset[integer.columns], as.integer)
+
+    #dataset.1 = dataset %>% filter(class == "1")
+    #dataset.0 = dataset %>% filter(class == "0") %>%
+    #    sample_n(nrow(dataset.1)*4)
+    #
+    #dataset = rbind(dataset.0, dataset.1)
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/spambase/config.yaml b/data-collection/spambase/config.yaml
new file mode 100644
index 0000000..26c30c8
--- /dev/null
+++ b/data-collection/spambase/config.yaml
@@ -0,0 +1,19 @@
+---
+name: Spambase
+
+info: https://archive.ics.uci.edu/ml/datasets/Spambase
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.DOCUMENTATION
+- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
+- https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names
+
+cite: >
+  https://archive.ics.uci.edu/ml/citation_policy.html
+
+  @misc{Lichman:2013 ,
+    author = "M.
Lichman",
+    year = "2013",
+    title = "{UCI} Machine Learning Repository",
+    url = "http://archive.ics.uci.edu/ml",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" }
diff --git a/data-collection/spambase/preprocess.R b/data-collection/spambase/preprocess.R
new file mode 100644
index 0000000..d579197
--- /dev/null
+++ b/data-collection/spambase/preprocess.R
@@ -0,0 +1,34 @@
+preprocessDataset = function()
+{
+    # Loads spambase.data (no header row) from the global `orig.dir`, names
+    # its 58 columns and returns the data frame with the class column
+    # converted to a factor.
+
+    column.names = c("word_freq_make", "word_freq_address", "word_freq_all",
+                     "word_freq_3d", "word_freq_our", "word_freq_over",
+                     "word_freq_remove", "word_freq_internet",
+                     "word_freq_order", "word_freq_mail", "word_freq_receive",
+                     "word_freq_will", "word_freq_people", "word_freq_report",
+                     "word_freq_addresses", "word_freq_free",
+                     "word_freq_business", "word_freq_email", "word_freq_you",
+                     "word_freq_credit", "word_freq_your", "word_freq_font",
+                     "word_freq_000", "word_freq_money", "word_freq_hp",
+                     "word_freq_hpl", "word_freq_george", "word_freq_650",
+                     "word_freq_lab", "word_freq_labs", "word_freq_telnet",
+                     "word_freq_857", "word_freq_data", "word_freq_415",
+                     "word_freq_85", "word_freq_technology", "word_freq_1999",
+                     "word_freq_parts", "word_freq_pm", "word_freq_direct",
+                     "word_freq_cs", "word_freq_meeting", "word_freq_original",
+                     "word_freq_project", "word_freq_re", "word_freq_edu",
+                     "word_freq_table", "word_freq_conference", "char_freq_;",
+                     "char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$",
+                     "char_freq_#", "capital_run_length_average",
+                     "capital_run_length_longest", "capital_run_length_total",
+                     "class")
+
+    spam.data = read.csv(paste0(orig.dir, "/", "spambase.data"), header=FALSE)
+    colnames(spam.data) = column.names
+    spam.data$class = factor(spam.data$class)
+
+    return(spam.data)
+}
\ No newline at end of file
diff --git a/data-collection/wine-quality/config.yaml b/data-collection/wine-quality/config.yaml
new file mode 100644
index 0000000..2e378ce
--- /dev/null
+++
b/data-collection/wine-quality/config.yaml
@@ -0,0 +1,13 @@
+---
+name: Wine Quality
+
+info: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
+- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
+- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names
+
+cite: >
+  P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
+  Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
diff --git a/data-collection/wine-quality/preprocess.R b/data-collection/wine-quality/preprocess.R
new file mode 100644
index 0000000..2ff5c03
--- /dev/null
+++ b/data-collection/wine-quality/preprocess.R
@@ -0,0 +1,26 @@
+preprocessDataset = function()
+{
+    # Loads the white and the red wine csv files from the global `orig.dir`,
+    # tags each row with its color and stacks both tables (white first).
+    # quality is reduced to two classes (1 when above 5, otherwise 0) and
+    # moved behind the color column. Returns the resulting data frame with
+    # color and quality as factors.
+
+    readWines = function(csv.file, wine.color)
+    {
+        wines = read.csv(paste0(orig.dir, "/", csv.file), sep=";",
+                         check.names=FALSE)
+        mutate(wines, color=wine.color)
+    }
+
+    dataset = rbind(readWines("winequality-white.csv", "white"),
+                    readWines("winequality-red.csv", "red"))
+
+    # quality above 5 becomes class 1, everything else class 0
+    dataset = dataset %>%
+        mutate(color=factor(color),
+               quality=factor(ifelse(quality>5, 1, 0))) %>%
+        select(`fixed acidity`:alcohol, color, quality)
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-download.R b/data-download.R
new file mode 100644
index 0000000..7867c36
--- /dev/null
+++ b/data-download.R
@@ -0,0 +1,49 @@
+rm(list=ls()) # NOTE(review): removes every object from the environment this script runs in
+
+source("config.R")
+source("utils.R")
+
+library(RCurl)
+library(tools)
+library(yaml)
+
+flog.info("Started downloading dataset collection")
+
+for (dir.name in dir(PATH_DATASETS)) # every entry of PATH_DATASETS is treated as a dataset directory
+{
+    flog.info(paste("Dataset:", dir.name))
+
+    dest.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL) # "*" placeholder -> dataset directory name
+    config.yaml.file =
paste0(PATH_DATASETS, dir.name, "/", FILE_CONFIG_YAML)
+
+    urls.list = yaml.load_file(config.yaml.file)$urls # the "urls" entries of the dataset's config.yaml
+
+    mkdir(dest.dir)
+
+    for (url in urls.list)
+    {
+        flog.info(paste("URL:", url))
+
+        dest.file = URLdecode(basename(url)) # local file name: last URL segment, percent-decoded
+        dest.file.path = paste0(dest.dir, dest.file)
+
+        if (file.exists(dest.file.path))
+        {
+            flog.warn(paste("Target file", basename(dest.file.path),
+                            "already exists; skipping..."))
+            next
+        }
+
+        tryCatch( # NOTE(review): ssl.verifypeer=FALSE below switches off TLS certificate verification
+            raw.content <- getBinaryURL(url, .opts=curlOptions(ssl.verifypeer=FALSE)),
+            error = function(e){flog.error(e); stop(e)} # log the failure, then re-raise it
+        )
+
+        writeBin(raw.content, dest.file.path)
+
+    }
+
+    flog.info("*****")
+}
+
+flog.info("Finished downloading dataset collection")
diff --git a/data-preprocess.R b/data-preprocess.R
new file mode 100644
index 0000000..3eb2dc0
--- /dev/null
+++ b/data-preprocess.R
@@ -0,0 +1,45 @@
+rm(list=ls())
+
+source("config.R")
+source("utils.R")
+
+library(dplyr)
+library(foreign)
+library(XLConnect)
+
+flog.info("Started preprocessing dataset collection")
+
+# Every entry of PATH_DATASETS is treated as one dataset directory. Its
+# preprocess.R defines preprocessDataset(), which reads the global `orig.dir`.
+for (dir.name in dir(PATH_DATASETS))
+{
+    flog.info(paste("Dataset:", dir.name))
+
+    orig.dir = gsub("\\*", dir.name, PATH_DATASET_ORIGINAL)
+    dest.dir = gsub("\\*", dir.name, PATH_DATASET_PREPROCESSED)
+    dest.file.path = paste0(dest.dir, FILE_PREPROCESSED_OUTPUT)
+
+    if (!file.exists(dest.file.path))
+    {
+        r.src.file = paste0(PATH_DATASETS, dir.name, "/",
+                            FILE_PREPROCESSING_SCRIPT)
+
+        # (re)defines preprocessDataset() for the current dataset
+        source(r.src.file)
+        dataset = preprocessDataset()
+
+        printDatasetStatistics(dataset)
+
+        mkdir(dest.dir)
+        saveRDS(dataset, dest.file.path)
+
+        flog.info(paste("Created preprocessed file", FILE_PREPROCESSED_OUTPUT))
+    } else {
+        flog.warn(paste("Target file", basename(dest.file.path),
+                        "already exists; skipping..."))
+    }
+
+    flog.info("*****")
+}
+
+flog.info("Finished preprocessing dataset collection")
\ No newline at end of file
diff --git a/uci-ml-to-r.Rproj b/uci-ml-to-r.Rproj
new file mode 100644
index 0000000..066341e
--- /dev/null
+++ b/uci-ml-to-r.Rproj
@@
-0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 4
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/utils.R b/utils.R
new file mode 100644
index 0000000..709ce84
--- /dev/null
+++ b/utils.R
@@ -0,0 +1,40 @@
+library(futile.logger)
+
+# Creates the directory `dest.dir` unless it already exists, logging either
+# outcome at debug level. Missing parent directories are created as well.
+mkdir = function(dest.dir)
+{
+    if (!dir.exists(dest.dir))
+    {
+        flog.debug(paste("Creating directory", dest.dir))
+        # dir.create() without recursive=TRUE does not create missing parent
+        # directories
+        dir.create(dest.dir, recursive=TRUE)
+    } else {
+        flog.debug(paste("Target directory", dest.dir, "already exists"))
+    }
+}
+
+# Logs the number of cases, the number of attributes (all columns except the
+# last) and the percentage share of every class found in the last column of
+# `dataset`. For a dataset without columns only a warning is logged.
+# Called for its logging side effect.
+printDatasetStatistics = function(dataset)
+{
+    if (ncol(dataset)==0) # for mockups
+    {
+        flog.warn("Empty dataset")
+        return(invisible(NULL))
+    }
+
+    no.cases = nrow(dataset)
+    no.attributes = ncol(dataset) - 1
+    # [[ ]] extracts the last column itself, whatever the data frame class
+    class.counts = table(dataset[[ncol(dataset)]])
+    perc.classes = round(100*as.numeric(class.counts)/no.cases, 0)
+
+    # All class shares are joined with "/", so datasets with a single class
+    # or with more than two classes are reported completely.
+    flog.info(paste0("Cases: ", no.cases, ", attributes: ", no.attributes,
+                     ", classes: ", paste0(perc.classes, "%", collapse="/")))
+}