mirror of
https://github.com/andre-wojtowicz/uci-ml-to-r.git
synced 2024-07-22 07:35:30 +02:00
106 lines
3.4 KiB
Plaintext
106 lines
3.4 KiB
Plaintext
---
|
|
title: "UCI Machine Learning datasets for R"
|
|
author: "Andrzej Wójtowicz"
|
|
output:
|
|
html_document:
|
|
keep_md: yes
|
|
---
|
|
|
|
```{r global-options, include=FALSE}
|
|
# Document-wide chunk defaults: no comment prefix on printed results, and no
# echoed source code, warnings or messages in the rendered output.
knitr::opts_chunk$set(
  comment = "",
  echo    = FALSE,
  warning = FALSE,
  message = FALSE
)

# Evaluate the project's init script in this session.
# NOTE(review): presumably this is where DATASETS.DIR, the DATASET.* settings
# and the yaml package come from -- confirm against init.R.
source("init.R")
|
|
```
|
|
|
|
Document generation date: `r Sys.time()`.
|
|
|
|
This project preprocesses a few datasets from the [UC Irvine Machine Learning
|
|
Repository](https://archive.ics.uci.edu/ml/) into tidy R object files.
|
|
It focuses on binary classification datasets and saves only complete cases
|
|
within a dataset.
|
|
|
|
**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (`r paste0(R.version$major, ".", R.version$minor)`)
|
|
|
|
**Reproducibility library**: [checkpoint](https://github.com/RevolutionAnalytics/checkpoint)
|
|
|
|
**Reproducibility procedure**:
|
|
|
|
1. Run *s1-download-data.R* to download original datasets.
|
|
2. Run *s2-preprocess-data.R* to preprocess the datasets.
|
|
3. Optionally knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets.
|
|
|
|
```{r show-datasets, results='asis'}
|
|
|
|
# Table of contents: one numbered Markdown list item per entry found in
# DATASETS.DIR, each linking to the heading that carries the dataset's name.
cat("\n# Table of Contents\n\n")

for (dir.name in list.files(DATASETS.DIR)) {
  config.yaml <- yaml.load_file(
    file.path(DATASETS.DIR, dir.name, DATASET.CONFIG.FILE)
  )

  # Anchor = dataset name in lower case, punctuation removed, spaces turned
  # into hyphens.
  name.lower <- tolower(config.yaml$name)
  name.plain <- gsub("[[:punct:]]", "", name.lower)
  anchor <- gsub(" ", "-", name.plain)

  # Every item is written with the literal marker "1.".
  cat(paste0("1. [", config.yaml$name, "](#", anchor, ")\n"))
}

# Horizontal rule closing the table of contents.
cat("\n---\n\n")
|
|
|
|
# Per-dataset report. For every entry of DATASETS.DIR this prints a Markdown
# section with: heading, local directory, details link, links to the source
# files, citation, str() of the preprocessed data, a table counting predictor
# column types, and a table with the class distribution of the last column.
for (dir.name in dir(DATASETS.DIR)) {
  config.yaml.file.path <- file.path(DATASETS.DIR, dir.name,
                                     DATASET.CONFIG.FILE)
  config.yaml <- yaml.load_file(config.yaml.file.path)

  cat(paste("#", config.yaml$name, "\n\n"))

  cat(paste("**Local directory**:", dir.name, "\n\n"))

  cat(paste0("**Details**: [link](", config.yaml$info, ")\n\n"))

  cat("**Source data files**:\n\n")
  for (file.url in config.yaml$urls) {
    # Link text is the URL-decoded file name; the target is the raw URL.
    cat(paste0("* [", URLdecode(basename(file.url)), "](", file.url, ")\n"))
  }
  cat("\n")

  cat(paste0("**Cite**:\n```nohighlight\n", config.yaml$cite, "\n```\n\n"))

  cat("**Dataset**:\n\n")

  # The preprocessed directory is DATASET.PREPROCESSED.DIR with every match of
  # DATASET.NAME.PATTERN replaced by the dataset's directory name.
  preprocessed.dir <- gsub(DATASET.NAME.PATTERN, dir.name,
                           DATASET.PREPROCESSED.DIR)
  preprocessed.file.path <- file.path(preprocessed.dir,
                                      DATASET.PREPROCESSED.OUTPUT.FILE)

  dataset <- readRDS(preprocessed.file.path)

  cat("```nohighlight\n")
  # str() prints its summary itself and returns NULL invisibly, so wrapping it
  # in cat() adds nothing.
  str(dataset)
  cat("\n```\n\n")

  cat("**Predictors**:\n\n")

  # The last column is the class; all columns before it are predictors.
  # drop = FALSE keeps a data frame even when there is a single predictor;
  # without it the selection collapses to a vector and the type count below
  # would run over observations instead of columns.
  class.col <- ncol(dataset)
  predictors <- dataset[, seq_len(class.col - 1L), drop = FALSE]
  predictor.types <- vapply(predictors,
                            function(f) paste(class(f), collapse = " "),
                            character(1))

  df.pred <- data.frame(table(predictor.types))
  colnames(df.pred) <- c("Type", "Frequency")

  cat(knitr::kable(df.pred, format = "markdown"), sep = "\n")
  cat("\n")

  # Class counts, tabulated once. Counts and rounded percentages are each
  # sorted ascending, so position 1 is the smaller class and 2 the larger.
  class.counts <- as.numeric(table(dataset[[class.col]]))
  perc.classes <- sort(round(100 * class.counts / nrow(dataset), 0))
  num.classes <- sort(class.counts)

  cat("**Class imbalance**:\n\n")

  # Two-row table: percentages on the first row, raw counts on the second.
  # Only the first two classes are shown.
  imbalance <- data.frame(
    A = c(paste(perc.classes[1], "%"), num.classes[1]),
    B = c(paste(perc.classes[2], "%"), num.classes[2])
  )
  cat(knitr::kable(imbalance,
                   format = "markdown",
                   col.names = c("class A", "class B"),
                   align = c("c", "c")),
      sep = "\n")

  cat("\n---\n\n")
}
|
|
```
|