From f7debcd1545901815278f204a7cf9a8193b18285 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= <andre@amu.edu.pl>
Date: Fri, 19 Aug 2016 21:51:21 +0200
Subject: [PATCH] in census-income grouped education, filtered occupation and
 removed native country variable; added script to make release zip

---
 .gitignore                                 |  1 +
 README.md                                  | 23 ++++++++++++----------
 config.R                                   |  2 +-
 data-collection/census-income/preprocess.R | 16 +++++++++++++--
 init.R                                     |  1 +
 s3-make-readme.Rmd                         |  6 +++++-
 s4-make-release.sh                         | 14 +++++++++++++
 7 files changed, 49 insertions(+), 14 deletions(-)
 create mode 100644 s4-make-release.sh

diff --git a/.gitignore b/.gitignore
index d741a93..b05ba83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ vignettes/*.pdf
 
 data-collection/*/original/*
 data-collection/*/preprocessed/*
+data-collection.zip
 
 # markdown outputs
 *.html
diff --git a/README.md b/README.md
index 8c93ab9..d7bbac2 100644
--- a/README.md
+++ b/README.md
@@ -3,14 +3,14 @@ Andrzej Wójtowicz
 
 
 
-Document generation date: 2016-08-11 18:12:19.
+Document generation date: 2016-08-19 21:47:14.
 
 This project preprocesses a few datasets from [UC Irvine Machine Learning
 Repository](https://archive.ics.uci.edu/ml/) into tidy R object files.
 It focuses on the binary classification datasets and saves only complete cases
 within a dataset.
 
-**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (3.2.5)
+**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (3.3.0)
 
 **Reproducibility library**: [checkpoint](https://github.com/RevolutionAnalytics/checkpoint)
 
@@ -18,7 +18,11 @@ within a dataset.
 
  1. Run *s1-download-data.R* to download original datasets.
  2. Run *s2-preprocess-data.R* to preprocess the datasets.
- 3. Optionally knit s*3-make-readme.Rmd* to get an overview of the preprocessed datasets.
+ 
+ Optionally:
+ 
+ 3. knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets,
+ 4. run *s4-make-release.sh* to create a zip file with the preprocessed datasets.
 
 
 # Table of Contents
@@ -302,20 +306,19 @@ https://archive.ics.uci.edu/ml/citation_policy.html
 **Dataset**:
 
 ```nohighlight
-'data.frame':	45222 obs. of  14 variables:
+'data.frame':	46018 obs. of  13 variables:
  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
- $ workclass     : Factor w/ 8 levels "federal.gov",..: 7 6 4 4 4 4 4 6 4 4 ...
+ $ workclass     : Factor w/ 7 levels "federal.gov",..: 6 5 3 3 3 3 3 5 3 3 ...
  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
- $ education     : Ord.factor w/ 16 levels "preschool"<"x1st.4th"<..: 13 13 9 7 13 14 5 9 14 13 ...
+ $ education     : Ord.factor w/ 5 levels "school"<"highschool"<..: 4 4 2 1 4 5 1 2 5 4 ...
  $ marital.status: Factor w/ 7 levels "divorced","married.af.spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
- $ occupation    : Factor w/ 14 levels "adm.clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
+ $ occupation    : Factor w/ 13 levels "adm.clerical",..: 1 3 5 5 9 3 7 3 9 3 ...
  $ relationship  : Factor w/ 6 levels "husband","not.in.family",..: 2 1 2 1 6 6 2 1 2 1 ...
  $ race          : Factor w/ 5 levels "amer.indian.eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
  $ sex           : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 1 2 1 2 ...
  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
- $ native.country: Factor w/ 41 levels "cambodia","canada",..: 39 39 39 39 5 39 23 39 39 39 ...
  $ class         : Factor w/ 2 levels "x..50k","x.50k": 1 1 1 1 1 1 1 2 2 2 ...
 
 ```
@@ -324,7 +327,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
 
 |Type           | Frequency|
 |:--------------|---------:|
-|factor         |         7|
+|factor         |         6|
 |integer        |         5|
 |ordered factor |         1|
 
@@ -333,7 +336,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
 | class A | class B |
 |:-------:|:-------:|
 |  25 %   |  75 %   |
-|  11208  |  34014  |
+|  11417  |  34601  |
 
 ---
 
diff --git a/config.R b/config.R
index 7042dec..9452972 100644
--- a/config.R
+++ b/config.R
@@ -13,7 +13,7 @@ USER.INIT.FILE        = "init.R.user"
 # checkpoint library
 
 CHECKPOINT.MRAN.URL      = "https://mran.microsoft.com/"
-CHECKPOINT.SNAPSHOT.DATE = "2016-07-01"
+CHECKPOINT.SNAPSHOT.DATE = "2016-06-01"
 CHECKPOINT.QUICK.LOAD    = TRUE # skip testing https and checking url
 
 # logging system
diff --git a/data-collection/census-income/preprocess.R b/data-collection/census-income/preprocess.R
index 4dac273..edb7413 100644
--- a/data-collection/census-income/preprocess.R
+++ b/data-collection/census-income/preprocess.R
@@ -42,8 +42,20 @@ preprocess.dataset = function()
     dataset = dataset %>% 
         mutate(education = factor(education, levels = education.ordered.levels, 
                                   ordered = TRUE)) %>%
-        select(-education.num) %>% 
-        filter(complete.cases(.))
+        select(-education.num, -native.country) %>%  # native.country is too much
+                                                     # biased into US
+        filter(complete.cases(.) & occupation != "Armed-Forces") %>% # only few
+                                                                     # cases of
+                                                                     # Armed-Forces
+        droplevels
+    
+    dataset$education = factor(combine_factor(dataset$education, # combine into
+                                              c(1, 1, 1, 1, 1,   # more numerous
+                                                1, 1, 1, 2, 3,   # groups
+                                                3, 3, 4, 5, 5, 5)),
+                               ordered = TRUE)
+    levels(dataset$education) = c("school", "highschool", "college", 
+                                  "university", "science")
     
     return(dataset)
 }
\ No newline at end of file
diff --git a/init.R b/init.R
index b34d781..8d29bb0 100644
--- a/init.R
+++ b/init.R
@@ -47,6 +47,7 @@ library(RCurl)
 library(tools)
 library(yaml)
 
+library(reshape)
 library(plyr)
 library(dplyr)
 library(foreign)
diff --git a/s3-make-readme.Rmd b/s3-make-readme.Rmd
index 70c8efe..9a7e03d 100644
--- a/s3-make-readme.Rmd
+++ b/s3-make-readme.Rmd
@@ -26,7 +26,11 @@ within a dataset.
 
  1. Run *s1-download-data.R* to download original datasets.
  2. Run *s2-preprocess-data.R* to preprocess the datasets.
- 3. Optionally knit s*3-make-readme.Rmd* to get an overview of the preprocessed datasets.
+ 
+ Optionally:
+ 
+ 3. knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets,
+ 4. run *s4-make-release.sh* to create a zip file with the preprocessed datasets.
 
 ```{r show-datasets, results='asis'}
 
diff --git a/s4-make-release.sh b/s4-make-release.sh
new file mode 100644
index 0000000..c5b8ce2
--- /dev/null
+++ b/s4-make-release.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Packs every preprocessed dataset (data-collection/<name>/preprocessed/*.rds)
+# into one zip archive and renames each archive entry to <name>.rds.
+#
+# Requires `zip` and `zipnote` on PATH.
+
+set -e -o pipefail # abort on the first failing command, also inside pipes
+shopt -s nullglob  # an unmatched glob yields an empty list, not the pattern
+
+OUT_ZIP_FILE="data-collection.zip"
+
+# Expand the glob directly into an array instead of word-splitting the output
+# of `find`, so that paths containing whitespace stay intact.
+rds_files=(data-collection/*/preprocessed/*.rds)
+
+rm -f "$OUT_ZIP_FILE"
+
+if [ "${#rds_files[@]}" -eq 0 ]; then
+    echo "No preprocessed datasets found in data-collection/*/preprocessed/" >&2
+    exit 1
+fi
+
+zip "$OUT_ZIP_FILE" "${rds_files[@]}"
+
+for f in "${rds_files[@]}" ; do
+    dataset_name=$(echo "$f" | sed -e 's/data-collection\/\(.*\)\/preprocessed\/.*\.rds/\1/')
+    echo "Renaming $f -> $dataset_name.rds"
+    # https://stackoverflow.com/a/16710654
+    # Data goes through %s so a '%' or '\' in a path is not read as a format.
+    printf '@ %s\n@=%s\n' "$f" "$dataset_name.rds" | zipnote -w "$OUT_ZIP_FILE"
+done