From f7debcd1545901815278f204a7cf9a8193b18285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Fri, 19 Aug 2016 21:51:21 +0200 Subject: [PATCH] in census-income grouped education, filtered occupation and removed native country variable; added script to make release zip --- .gitignore | 1 + README.md | 23 ++++++++++++---------- config.R | 2 +- data-collection/census-income/preprocess.R | 16 +++++++++++++-- init.R | 1 + s3-make-readme.Rmd | 6 +++++- s4-make-release.sh | 14 +++++++++++++ 7 files changed, 49 insertions(+), 14 deletions(-) create mode 100644 s4-make-release.sh diff --git a/.gitignore b/.gitignore index d741a93..b05ba83 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ vignettes/*.pdf data-collection/*/original/* data-collection/*/preprocessed/* +data-collection.zip # markdown outputs *.html diff --git a/README.md b/README.md index 8c93ab9..d7bbac2 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ Andrzej Wójtowicz -Document generation date: 2016-08-11 18:12:19. +Document generation date: 2016-08-19 21:47:14. This project preprocesses a few datasets from [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/) into tidy R object files. It focuses on the binary classification datasets and saves only complete cases within a dataset. -**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (3.2.5) +**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (3.3.0) **Reproducibility library**: [checkpoint](https://github.com/RevolutionAnalytics/checkpoint) @@ -18,7 +18,11 @@ within a dataset. 1. Run *s1-download-data.R* to download original datasets. 2. Run *s2-preprocess-data.R* to preprocess the datasets. - 3. Optionally knit s*3-make-readme.Rmd* to get an overview of the preprocessed datasets. + + Optionally: + + 3. knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets, + 4. run *s4-make-release.sh* to create zip file with preprocessed datasets. # Table of Contents @@ -302,20 +306,19 @@ https://archive.ics.uci.edu/ml/citation_policy.html **Dataset**: ```nohighlight -'data.frame': 45222 obs. of 14 variables: +'data.frame': 46018 obs. of 13 variables: $ age : int 39 50 38 53 28 37 49 52 31 42 ... - $ workclass : Factor w/ 8 levels "federal.gov",..: 7 6 4 4 4 4 4 6 4 4 ... + $ workclass : Factor w/ 7 levels "federal.gov",..: 6 5 3 3 3 3 3 5 3 3 ... $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ... - $ education : Ord.factor w/ 16 levels "preschool"<"x1st.4th"<..: 13 13 9 7 13 14 5 9 14 13 ... + $ education : Ord.factor w/ 5 levels "school"<"highschool"<..: 4 4 2 1 4 5 1 2 5 4 ... $ marital.status: Factor w/ 7 levels "divorced","married.af.spouse",..: 5 3 1 3 3 3 4 3 5 3 ... - $ occupation : Factor w/ 14 levels "adm.clerical",..: 1 4 6 6 10 4 8 4 10 4 ... + $ occupation : Factor w/ 13 levels "adm.clerical",..: 1 3 5 5 9 3 7 3 9 3 ... $ relationship : Factor w/ 6 levels "husband","not.in.family",..: 2 1 2 1 6 6 2 1 2 1 ... $ race : Factor w/ 5 levels "amer.indian.eskimo",..: 5 5 5 3 3 5 3 5 5 5 ... $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 1 2 1 2 ... $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ... $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ... $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ... - $ native.country: Factor w/ 41 levels "cambodia","canada",..: 39 39 39 39 5 39 23 39 39 39 ... $ class : Factor w/ 2 levels "x..50k","x.50k": 1 1 1 1 1 1 1 2 2 2 ... ``` @@ -324,7 +327,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html |Type | Frequency| |:--------------|---------:| -|factor | 7| +|factor | 6| |integer | 5| |ordered factor | 1| @@ -333,7 +336,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html | class A | class B | |:-------:|:-------:| | 25 % | 75 % | -| 11208 | 34014 | +| 11417 | 34601 | --- diff --git a/config.R b/config.R index 7042dec..9452972 100644 --- a/config.R +++ b/config.R @@ -13,7 +13,7 @@ USER.INIT.FILE = "init.R.user" # checkpoint library CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/" -CHECKPOINT.SNAPSHOT.DATE = "2016-07-01" +CHECKPOINT.SNAPSHOT.DATE = "2016-06-01" CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url # logging system diff --git a/data-collection/census-income/preprocess.R b/data-collection/census-income/preprocess.R index 4dac273..edb7413 100644 --- a/data-collection/census-income/preprocess.R +++ b/data-collection/census-income/preprocess.R @@ -42,8 +42,20 @@ preprocess.dataset = function() dataset = dataset %>% mutate(education = factor(education, levels = education.ordered.levels, ordered = TRUE)) %>% - select(-education.num) %>% - filter(complete.cases(.)) + select(-education.num, -native.country) %>% # native.country is too much + # biased into US + filter(complete.cases(.) & occupation != "Armed-Forces") %>% # only few + # cases of + # Armed-Forces + droplevels + + dataset$education = factor(combine_factor(dataset$education, # combine into + c(1, 1, 1, 1, 1, # more numerous + 1, 1, 1, 2, 3, # groups + 3, 3, 4, 5, 5, 5)), + ordered = TRUE) + levels(dataset$education) = c("school", "highschool", "college", + "university", "science") return(dataset) } \ No newline at end of file diff --git a/init.R b/init.R index b34d781..8d29bb0 100644 --- a/init.R +++ b/init.R @@ -47,6 +47,7 @@ library(RCurl) library(tools) library(yaml) +library(reshape) library(plyr) library(dplyr) library(foreign) diff --git a/s3-make-readme.Rmd b/s3-make-readme.Rmd index 70c8efe..9a7e03d 100644 --- a/s3-make-readme.Rmd +++ b/s3-make-readme.Rmd @@ -26,7 +26,11 @@ within a dataset. 1. Run *s1-download-data.R* to download original datasets. 2. Run *s2-preprocess-data.R* to preprocess the datasets. - 3. Optionally knit s*3-make-readme.Rmd* to get an overview of the preprocessed datasets. + + Optionally: + + 3. knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets, + 4. run *s4-make-release.sh* to create zip file with preprocessed datasets. ```{r show-datasets, results='asis'} diff --git a/s4-make-release.sh b/s4-make-release.sh new file mode 100644 index 0000000..c5b8ce2 --- /dev/null +++ b/s4-make-release.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +OUT_ZIP_FILE="data-collection.zip" + +rm -f $OUT_ZIP_FILE + +zip $OUT_ZIP_FILE $(find data-collection/*/preprocessed/*.rds) + +for f in $(find data-collection/*/preprocessed/*.rds) ; do + dataset_name=$(echo "$f" | sed -e 's/data-collection\/\(.*\)\/preprocessed\/.*\.rds/\1/') + echo "Renaming $f -> $dataset_name.rds" + # https://stackoverflow.com/a/16710654 + printf "@ $f\n@=$dataset_name.rds\n" | zipnote -w $OUT_ZIP_FILE +done