From b1a4cbab733a9a62fa834e5ce8c3841e13b70bc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= <andre@amu.edu.pl>
Date: Thu, 11 Aug 2016 18:15:25 +0200
Subject: [PATCH] added mushroom and census income datasets; removed config
 variables from utils functions

---
 README.md                                  | 122 ++++++++++++++++++++-
 data-collection/census-income/config.yaml  |  19 ++++
 data-collection/census-income/preprocess.R |  49 +++++++++
 data-collection/mushroom/config.yaml       |  18 +++
 data-collection/mushroom/preprocess.R      |  22 ++++
 s1-download-data.R                         |   2 +-
 s2-preprocess-data.R                       |   2 +-
 utils.R                                    |   4 +-
 8 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 data-collection/census-income/config.yaml
 create mode 100644 data-collection/census-income/preprocess.R
 create mode 100644 data-collection/mushroom/config.yaml
 create mode 100644 data-collection/mushroom/preprocess.R

diff --git a/README.md b/README.md
index a8c482e..8c93ab9 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ Andrzej Wójtowicz
 
 
 
-Document generation date: 2016-07-17 02:59:19.
+Document generation date: 2016-08-11 18:12:19.
 
 This project preprocesses a few datasets from [UC Irvine Machine Learning
 Repository](https://archive.ics.uci.edu/ml/) into tidy R object files.
@@ -27,9 +27,11 @@ within a dataset.
 1. [Breast Cancer Wisconsin (Diagnostic)](#breast-cancer-wisconsin-diagnostic)
 1. [Breast Cancer Wisconsin (Original)](#breast-cancer-wisconsin-original)
 1. [Cardiotocography](#cardiotocography)
+1. [Census income](#census-income)
 1. [Default of credit card clients](#default-of-credit-card-clients)
 1. [ILPD (Indian Liver Patient Dataset)](#ilpd-indian-liver-patient-dataset)
 1. [MAGIC Gamma Telescope](#magic-gamma-telescope)
+1. [Mushroom](#mushroom)
 1. [Seismic bumps](#seismic-bumps)
 1. [Spambase](#spambase)
 1. [Wine Quality](#wine-quality)
@@ -279,6 +281,62 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C
 
 ---
 
+# Census income 
+
+**Local directory**: census-income 
+
+**Details**: [link](https://archive.ics.uci.edu/ml/datasets/Census+Income)
+
+**Source data files**:
+
+* [adult.data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data)
+* [adult.test](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test)
+* [adult.names](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)
+
+**Cite**:
+```nohighlight
+https://archive.ics.uci.edu/ml/citation_policy.html
+@misc{Lichman:2013 , author = "M. Lichman", year = "2013", title = "{UCI} Machine Learning Repository", url = "http://archive.ics.uci.edu/ml", institution = "University of California, Irvine, School of Information and Computer Sciences" } 
+```
+
+**Dataset**:
+
+```nohighlight
+'data.frame':	45222 obs. of  14 variables:
+ $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
+ $ workclass     : Factor w/ 8 levels "federal.gov",..: 7 6 4 4 4 4 4 6 4 4 ...
+ $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
+ $ education     : Ord.factor w/ 16 levels "preschool"<"x1st.4th"<..: 13 13 9 7 13 14 5 9 14 13 ...
+ $ marital.status: Factor w/ 7 levels "divorced","married.af.spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
+ $ occupation    : Factor w/ 14 levels "adm.clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
+ $ relationship  : Factor w/ 6 levels "husband","not.in.family",..: 2 1 2 1 6 6 2 1 2 1 ...
+ $ race          : Factor w/ 5 levels "amer.indian.eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
+ $ sex           : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 1 2 1 2 ...
+ $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
+ $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
+ $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
+ $ native.country: Factor w/ 41 levels "cambodia","canada",..: 39 39 39 39 5 39 23 39 39 39 ...
+ $ class         : Factor w/ 2 levels "x..50k","x.50k": 1 1 1 1 1 1 1 2 2 2 ...
+
+```
+
+**Predictors**:
+
+|Type           | Frequency|
+|:--------------|---------:|
+|factor         |         7|
+|integer        |         5|
+|ordered factor |         1|
+
+**Class imbalance**:
+
+| class A | class B |
+|:-------:|:-------:|
+|  25 %   |  75 %   |
+|  11208  |  34014  |
+
+---
+
 # Default of credit card clients 
 
 **Local directory**: credit-card 
@@ -442,6 +500,68 @@ https://archive.ics.uci.edu/ml/citation_policy.html
 
 ---
 
+# Mushroom 
+
+**Local directory**: mushroom 
+
+**Details**: [link](https://archive.ics.uci.edu/ml/datasets/Mushroom)
+
+**Source data files**:
+
+* [agaricus-lepiota.data](https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data)
+* [agaricus-lepiota.names](https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names)
+
+**Cite**:
+```nohighlight
+https://archive.ics.uci.edu/ml/citation_policy.html
+@misc{Lichman:2013 , author = "M. Lichman", year = "2013", title = "{UCI} Machine Learning Repository", url = "http://archive.ics.uci.edu/ml", institution = "University of California, Irvine, School of Information and Computer Sciences" } 
+```
+
+**Dataset**:
+
+```nohighlight
+'data.frame':	5644 obs. of  22 variables:
+ $ cap.shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
+ $ cap.surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
+ $ cap.color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
+ $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
+ $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
+ $ gill.attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
+ $ gill.spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
+ $ gill.size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
+ $ gill.color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
+ $ stalk.shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
+ $ stalk.root              : Factor w/ 4 levels "b","c","e","r": 3 2 2 3 3 2 2 2 3 2 ...
+ $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
+ $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
+ $ stalk.color.above.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
+ $ stalk.color.below.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
+ $ veil.color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
+ $ ring.number             : int  1 1 1 1 1 1 1 1 1 1 ...
+ $ ring.type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
+ $ spore.print.color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
+ $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
+ $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
+ $ class                   : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
+
+```
+
+**Predictors**:
+
+|Type    | Frequency|
+|:-------|---------:|
+|factor  |        20|
+|integer |         1|
+
+**Class imbalance**:
+
+| class A | class B |
+|:-------:|:-------:|
+|  38 %   |  62 %   |
+|  2156   |  3488   |
+
+---
+
 # Seismic bumps 
 
 **Local directory**: seismic-bumps 
diff --git a/data-collection/census-income/config.yaml b/data-collection/census-income/config.yaml
new file mode 100644
index 0000000..f68d2b8
--- /dev/null
+++ b/data-collection/census-income/config.yaml
@@ -0,0 +1,19 @@
+--- 
+name: Census income
+
+info: https://archive.ics.uci.edu/ml/datasets/Census+Income
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
+- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
+- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
+
+cite: > 
+    https://archive.ics.uci.edu/ml/citation_policy.html
+
+    @misc{Lichman:2013 ,
+    author = "M. Lichman",
+    year = "2013",
+    title = "{UCI} Machine Learning Repository",
+    url = "http://archive.ics.uci.edu/ml",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" } 
diff --git a/data-collection/census-income/preprocess.R b/data-collection/census-income/preprocess.R
new file mode 100644
index 0000000..4dac273
--- /dev/null
+++ b/data-collection/census-income/preprocess.R
@@ -0,0 +1,49 @@
+# Builds the census-income dataset from adult.data and adult.test: rows with
+# missing values are dropped, education becomes an ordered factor, class is last.
+preprocess.dataset = function()
+{
+    column.names = c("age", "workclass", "fnlwgt", "education", 
+                     "education.num", "marital.status", "occupation", 
+                     "relationship", "race", "sex", "capital.gain", 
+                     "capital.loss", "hours.per.week", "native.country", 
+                     "class")
+
+    # stringsAsFactors is spelled out because its default became FALSE in
+    # R 4.0.0, while the level handling below requires factor columns.
+    dataset.1 = read.csv(file.path(orig.dir, "adult.data"), header = FALSE,
+                         na.strings = " ?", stringsAsFactors = TRUE)
+    dataset.2 = read.csv(file.path(orig.dir, "adult.test"), header = FALSE,
+                         na.strings = " ?", skip = 1, stringsAsFactors = TRUE)
+
+    colnames(dataset.1) = column.names
+    colnames(dataset.2) = column.names
+
+    # Strip the dots from the adult.test class labels before merging.
+    levels(dataset.2$class) = gsub("\\.", "", levels(dataset.2$class))
+
+    dataset = rbind(dataset.1, dataset.2)
+
+    # Remove the surrounding whitespace from every factor level.
+    for (column.name in column.names)
+    {
+        if (is.factor(dataset[[column.name]]))
+        {
+            levels(dataset[[column.name]]) = trimws(levels(dataset[[column.name]]))
+        }
+    }
+
+    # Order the education labels by their numeric code (education.num).
+    education.codes = dataset %>% 
+        select(education.num, education) %>% 
+        unique %>% 
+        arrange(education.num)
+    education.ordered.levels = as.character(education.codes$education)
+
+    dataset = dataset %>% 
+        mutate(education = factor(education, levels = education.ordered.levels, 
+                                  ordered = TRUE)) %>%
+        select(-education.num) %>% 
+        filter(complete.cases(.))
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/data-collection/mushroom/config.yaml b/data-collection/mushroom/config.yaml
new file mode 100644
index 0000000..7258363
--- /dev/null
+++ b/data-collection/mushroom/config.yaml
@@ -0,0 +1,18 @@
+--- 
+name: Mushroom
+
+info: https://archive.ics.uci.edu/ml/datasets/Mushroom
+
+urls:
+- https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data
+- https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names
+
+cite: > 
+    https://archive.ics.uci.edu/ml/citation_policy.html
+
+    @misc{Lichman:2013 ,
+    author = "M. Lichman",
+    year = "2013",
+    title = "{UCI} Machine Learning Repository",
+    url = "http://archive.ics.uci.edu/ml",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" } 
diff --git a/data-collection/mushroom/preprocess.R b/data-collection/mushroom/preprocess.R
new file mode 100644
index 0000000..92e1c0a
--- /dev/null
+++ b/data-collection/mushroom/preprocess.R
@@ -0,0 +1,22 @@
+# Mushroom data without veil.type and incomplete rows; class is the last column.
+preprocess.dataset = function()
+{
+    # stringsAsFactors is explicit: its default became FALSE in R 4.0.0.
+    dataset = read.csv(file.path(orig.dir, "agaricus-lepiota.data"),
+                       header = FALSE, na.strings = "?", stringsAsFactors = TRUE)
+    colnames(dataset) = c("class", "cap.shape", "cap.surface", "cap.color", 
+                          "bruises", "odor", "gill.attachment", "gill.spacing", 
+                          "gill.size", "gill.color", "stalk.shape", "stalk.root", 
+                          "stalk.surface.above.ring", "stalk.surface.below.ring",
+                          "stalk.color.above.ring", "stalk.color.below.ring", 
+                          "veil.type", "veil.color", "ring.number", "ring.type",
+                          "spore.print.color", "population", "habitat")
+    ring.codes = c(n = 0L, o = 1L, t = 2L)  # assumes only n/o/t occur; else NA
+
+    dataset = dataset %>% 
+        select(cap.shape:habitat, class, -veil.type) %>% 
+        filter(complete.cases(.)) %>% 
+        mutate(ring.number = unname(ring.codes[as.character(ring.number)]))
+
+    return(dataset)
+}
\ No newline at end of file
diff --git a/s1-download-data.R b/s1-download-data.R
index 2b602e4..8cb485d 100644
--- a/s1-download-data.R
+++ b/s1-download-data.R
@@ -3,7 +3,7 @@
 source("init.R")
 source("utils.R")
 
-setup.logger(LOGGER.OUTPUT.S1.FILE)
+setup.logger(LOGGER.OUTPUT.S1.FILE, LOGGER.OVERWRITE.EXISTING.FILES)
 
 flog.info("Step 1: download dataset collection")
 
diff --git a/s2-preprocess-data.R b/s2-preprocess-data.R
index da36b51..bb8d559 100644
--- a/s2-preprocess-data.R
+++ b/s2-preprocess-data.R
@@ -3,7 +3,7 @@
 source("init.R")
 source("utils.R")
 
-setup.logger(LOGGER.OUTPUT.S2.FILE)
+setup.logger(LOGGER.OUTPUT.S2.FILE, LOGGER.OVERWRITE.EXISTING.FILES)
 
 flog.info("Step 2: preprocess dataset collection")
 
diff --git a/utils.R b/utils.R
index 4f421ba..7b8b41a 100644
--- a/utils.R
+++ b/utils.R
@@ -15,9 +15,9 @@ print.dataset.statistics = function(dataset)
                      ", classes: ", perc.classes[1], "%/", perc.classes[2], "%"))
 }
 
-setup.logger = function(output.file)
+setup.logger = function(output.file, overwrite.existing.files)
 {
-    if (LOGGER.OVERWRITE.EXISTING.FILES & file.exists(output.file))
+    if (overwrite.existing.files & file.exists(output.file))
     {
         file.remove(output.file)
     }