From b1a4cbab733a9a62fa834e5ce8c3841e13b70bc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Thu, 11 Aug 2016 18:15:25 +0200 Subject: [PATCH] added mushroom and census income datasets; removed config variables from utils functions --- README.md | 122 ++++++++++++++++++++- data-collection/census-income/config.yaml | 19 ++++ data-collection/census-income/preprocess.R | 49 +++++++++ data-collection/mushroom/config.yaml | 18 +++ data-collection/mushroom/preprocess.R | 22 ++++ s1-download-data.R | 2 +- s2-preprocess-data.R | 2 +- utils.R | 4 +- 8 files changed, 233 insertions(+), 5 deletions(-) create mode 100644 data-collection/census-income/config.yaml create mode 100644 data-collection/census-income/preprocess.R create mode 100644 data-collection/mushroom/config.yaml create mode 100644 data-collection/mushroom/preprocess.R diff --git a/README.md b/README.md index a8c482e..8c93ab9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Andrzej Wójtowicz -Document generation date: 2016-07-17 02:59:19. +Document generation date: 2016-08-11 18:12:19. This project preprocesses a few datasets from [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/) into tidy R object files. @@ -27,9 +27,11 @@ within a dataset. 1. [Breast Cancer Wisconsin (Diagnostic)](#breast-cancer-wisconsin-diagnostic) 1. [Breast Cancer Wisconsin (Original)](#breast-cancer-wisconsin-original) 1. [Cardiotocography](#cardiotocography) +1. [Census income](#census-income) 1. [Default of credit card clients](#default-of-credit-card-clients) 1. [ILPD (Indian Liver Patient Dataset)](#ilpd-indian-liver-patient-dataset) 1. [MAGIC Gamma Telescope](#magic-gamma-telescope) +1. [Mushroom](#mushroom) 1. [Seismic bumps](#seismic-bumps) 1. [Spambase](#spambase) 1. [Wine Quality](#wine-quality) @@ -279,6 +281,62 @@ Ayres de Campos et al. 
(2000) SisPorto 2.0 A Program for Automated Analysis of C --- +# Census income + +**Local directory**: census-income + +**Details**: [link](https://archive.ics.uci.edu/ml/datasets/Census+Income) + +**Source data files**: + +* [adult.data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data) +* [adult.test](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test) +* [adult.names](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names) + +**Cite**: +```nohighlight +https://archive.ics.uci.edu/ml/citation_policy.html +@misc{Lichman:2013 , author = "M. Lichman", year = "2013", title = "{UCI} Machine Learning Repository", url = "http://archive.ics.uci.edu/ml", institution = "University of California, Irvine, School of Information and Computer Sciences" } +``` + +**Dataset**: + +```nohighlight +'data.frame': 45222 obs. of 14 variables: + $ age : int 39 50 38 53 28 37 49 52 31 42 ... + $ workclass : Factor w/ 8 levels "federal.gov",..: 7 6 4 4 4 4 4 6 4 4 ... + $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ... + $ education : Ord.factor w/ 16 levels "preschool"<"x1st.4th"<..: 13 13 9 7 13 14 5 9 14 13 ... + $ marital.status: Factor w/ 7 levels "divorced","married.af.spouse",..: 5 3 1 3 3 3 4 3 5 3 ... + $ occupation : Factor w/ 14 levels "adm.clerical",..: 1 4 6 6 10 4 8 4 10 4 ... + $ relationship : Factor w/ 6 levels "husband","not.in.family",..: 2 1 2 1 6 6 2 1 2 1 ... + $ race : Factor w/ 5 levels "amer.indian.eskimo",..: 5 5 5 3 3 5 3 5 5 5 ... + $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 1 2 1 2 ... + $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ... + $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ... + $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ... + $ native.country: Factor w/ 41 levels "cambodia","canada",..: 39 39 39 39 5 39 23 39 39 39 ... + $ class : Factor w/ 2 levels "x..50k","x.50k": 1 1 1 1 1 1 1 2 2 2 ... 
+ +``` + +**Predictors**: + +|Type | Frequency| +|:--------------|---------:| +|factor | 7| +|integer | 5| +|ordered factor | 1| + +**Class imbalance**: + +| class A | class B | +|:-------:|:-------:| +| 25 % | 75 % | +| 11208 | 34014 | + +--- + # Default of credit card clients **Local directory**: credit-card @@ -442,6 +500,68 @@ https://archive.ics.uci.edu/ml/citation_policy.html --- +# Mushroom + +**Local directory**: mushroom + +**Details**: [link](https://archive.ics.uci.edu/ml/datasets/Mushroom) + +**Source data files**: + +* [agaricus-lepiota.data](https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data) +* [agaricus-lepiota.names](https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names) + +**Cite**: +```nohighlight +https://archive.ics.uci.edu/ml/citation_policy.html +@misc{Lichman:2013 , author = "M. Lichman", year = "2013", title = "{UCI} Machine Learning Repository", url = "http://archive.ics.uci.edu/ml", institution = "University of California, Irvine, School of Information and Computer Sciences" } +``` + +**Dataset**: + +```nohighlight +'data.frame': 5644 obs. of 22 variables: + $ cap.shape : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ... + $ cap.surface : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ... + $ cap.color : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ... + $ bruises : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ... + $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ... + $ gill.attachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ... + $ gill.spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ... + $ gill.size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ... + $ gill.color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ... + $ stalk.shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ... + $ stalk.root : Factor w/ 4 levels "b","c","e","r": 3 2 2 3 3 2 2 2 3 2 ... 
+ $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ... + $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ... + $ stalk.color.above.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ... + $ stalk.color.below.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ... + $ veil.color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ... + $ ring.number : int 1 1 1 1 1 1 1 1 1 1 ... + $ ring.type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ... + $ spore.print.color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ... + $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ... + $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ... + $ class : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ... + +``` + +**Predictors**: + +|Type | Frequency| +|:-------|---------:| +|factor | 20| +|integer | 1| + +**Class imbalance**: + +| class A | class B | +|:-------:|:-------:| +| 38 % | 62 % | +| 2156 | 3488 | + +--- + # Seismic bumps **Local directory**: seismic-bumps diff --git a/data-collection/census-income/config.yaml b/data-collection/census-income/config.yaml new file mode 100644 index 0000000..f68d2b8 --- /dev/null +++ b/data-collection/census-income/config.yaml @@ -0,0 +1,19 @@ +--- +name: Census income + +info: https://archive.ics.uci.edu/ml/datasets/Census+Income + +urls: +- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data +- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test +- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names + +cite: > + https://archive.ics.uci.edu/ml/citation_policy.html + + @misc{Lichman:2013 , + author = "M. 
Lichman", + year = "2013", + title = "{UCI} Machine Learning Repository", + url = "http://archive.ics.uci.edu/ml", + institution = "University of California, Irvine, School of Information and Computer Sciences" } diff --git a/data-collection/census-income/preprocess.R b/data-collection/census-income/preprocess.R new file mode 100644 index 0000000..4dac273 --- /dev/null +++ b/data-collection/census-income/preprocess.R @@ -0,0 +1,66 @@
+# Build the tidy Census income data frame from the raw UCI files.
+#
+# Reads "adult.data" and "adult.test" from the directory given by `orig.dir`
+# (a variable that must already exist where this function is called), stacks
+# them, trims whitespace from factor labels, converts `education` into an
+# ordered factor ranked by `education.num`, drops `education.num` and keeps
+# only the rows without missing values.
+#
+# Returns: a data.frame with the predictors followed by the `class` column.
+preprocess.dataset = function()
+{
+    csv.file.1 = "adult.data"
+    csv.file.2 = "adult.test"
+
+    # stringsAsFactors is passed explicitly: since R 4.0.0 read.csv() no longer
+    # creates factors by default, and without factors the levels() and
+    # is.factor() steps below have no effect on the labels.
+    dataset.1 = read.csv(file.path(orig.dir, csv.file.1), header = FALSE,
+                         na.strings = " ?", stringsAsFactors = TRUE)
+    dataset.2 = read.csv(file.path(orig.dir, csv.file.2), header = FALSE,
+                         na.strings = " ?", skip = 1,
+                         stringsAsFactors = TRUE)
+
+    column.names = c("age", "workclass", "fnlwgt", "education",
+                     "education.num", "marital.status", "occupation",
+                     "relationship", "race", "sex", "capital.gain",
+                     "capital.loss", "hours.per.week", "native.country",
+                     "class")
+
+    colnames(dataset.1) = column.names
+    colnames(dataset.2) = column.names
+
+    # Remove "." from the class labels of the second file (presumably so that
+    # they match the labels of the first file -- confirm against the raw data).
+    levels(dataset.2$class) = gsub("\\.", "", levels(dataset.2$class))
+
+    dataset = rbind(dataset.1, dataset.2)
+
+    # Strip leading/trailing whitespace from the labels of every factor column.
+    for (column.name in column.names)
+    {
+        if (is.factor(dataset[[column.name]]))
+        {
+            levels(dataset[[column.name]]) = trimws(levels(dataset[[column.name]]))
+        }
+    }
+
+    # Education labels sorted by their numeric code in education.num.
+    education.ordered.levels = dataset %>%
+        select(education.num, education) %>%
+        unique %>%
+        arrange(education.num) %>%
+        select(education) %>%
+        c %>%
+        unlist %>%
+        unname %>%
+        as.character
+
+    dataset = dataset %>%
+        mutate(education = factor(education, levels = education.ordered.levels,
+                                  ordered = TRUE)) %>%
+        select(-education.num) %>%
+        filter(complete.cases(.))
+
+    return(dataset)
+}
\ No newline at end of file diff --git a/data-collection/mushroom/config.yaml b/data-collection/mushroom/config.yaml new file mode 100644 index 0000000..7258363 --- /dev/null +++ b/data-collection/mushroom/config.yaml @@ -0,0 +1,18 @@ +--- +name: Mushroom + +info:
https://archive.ics.uci.edu/ml/datasets/Mushroom + +urls: +- https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data +- https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names + +cite: > + https://archive.ics.uci.edu/ml/citation_policy.html + + @misc{Lichman:2013 , + author = "M. Lichman", + year = "2013", + title = "{UCI} Machine Learning Repository", + url = "http://archive.ics.uci.edu/ml", + institution = "University of California, Irvine, School of Information and Computer Sciences" } diff --git a/data-collection/mushroom/preprocess.R b/data-collection/mushroom/preprocess.R new file mode 100644 index 0000000..92e1c0a --- /dev/null +++ b/data-collection/mushroom/preprocess.R @@ -0,0 +1,39 @@
+# Build the tidy Mushroom data frame from the raw UCI file.
+#
+# Reads "agaricus-lepiota.data" from the directory given by `orig.dir` (a
+# variable that must already exist where this function is called), names the
+# columns, moves `class` to the last position, drops `veil.type`, keeps only
+# the rows without missing values and recodes `ring.number` as an integer.
+#
+# Returns: a data.frame with the predictors followed by the `class` column.
+preprocess.dataset = function()
+{
+    csv.file = "agaricus-lepiota.data"
+
+    # stringsAsFactors is passed explicitly so the columns are factors on every
+    # R version (since R 4.0.0 read.csv() no longer creates them by default).
+    dataset = read.csv(file.path(orig.dir, csv.file), header = FALSE,
+                       na.strings = "?", stringsAsFactors = TRUE)
+
+    colnames(dataset) = c("class", "cap.shape", "cap.surface", "cap.color",
+                          "bruises", "odor", "gill.attachment", "gill.spacing",
+                          "gill.size", "gill.color", "stalk.shape", "stalk.root",
+                          "stalk.surface.above.ring", "stalk.surface.below.ring",
+                          "stalk.color.above.ring", "stalk.color.below.ring",
+                          "veil.type", "veil.color", "ring.number", "ring.type",
+                          "spore.print.color", "population", "habitat")
+
+    # Ring-number codes in the order that maps them to 0, 1, 2 (n/o/t are
+    # none/one/two according to agaricus-lepiota.names). Looking the code up
+    # here replaces the former reliance on the alphabetical order of the
+    # factor levels; a code outside this set becomes NA.
+    ring.number.codes = c("n", "o", "t")
+
+    dataset = dataset %>%
+        select(cap.shape:habitat, class, -veil.type) %>%
+        filter(complete.cases(.)) %>%
+        mutate(ring.number = match(as.character(ring.number),
+                                   ring.number.codes) - 1L)
+
+    return(dataset)
+}
\ No newline at end of file diff --git a/s1-download-data.R b/s1-download-data.R index 2b602e4..8cb485d 100644 --- a/s1-download-data.R +++ b/s1-download-data.R @@ -3,7 +3,7 @@ source("init.R") source("utils.R") -setup.logger(LOGGER.OUTPUT.S1.FILE) +setup.logger(LOGGER.OUTPUT.S1.FILE, LOGGER.OVERWRITE.EXISTING.FILES) flog.info("Step 1: download dataset collection") diff --git a/s2-preprocess-data.R b/s2-preprocess-data.R index da36b51..bb8d559 100644 ---
a/s2-preprocess-data.R +++ b/s2-preprocess-data.R @@ -3,7 +3,7 @@ source("init.R") source("utils.R") -setup.logger(LOGGER.OUTPUT.S2.FILE) +setup.logger(LOGGER.OUTPUT.S2.FILE, LOGGER.OVERWRITE.EXISTING.FILES) flog.info("Step 2: preprocess dataset collection") diff --git a/utils.R b/utils.R index 4f421ba..7b8b41a 100644 --- a/utils.R +++ b/utils.R @@ -15,9 +15,9 @@ print.dataset.statistics = function(dataset) ", classes: ", perc.classes[1], "%/", perc.classes[2], "%")) } -setup.logger = function(output.file) +setup.logger = function(output.file, overwrite.existing.files) { - if (LOGGER.OVERWRITE.EXISTING.FILES & file.exists(output.file)) + if (overwrite.existing.files & file.exists(output.file)) { file.remove(output.file) }