mirror of
https://github.com/andre-wojtowicz/uci-ml-to-r.git
synced 2024-11-23 16:00:28 +01:00
in census-income grouped education, filtered occupation and removed native country variable;
added script to make release zip
This commit is contained in:
parent
b1a4cbab73
commit
f7debcd154
1
.gitignore
vendored
1
.gitignore
vendored
@ -20,6 +20,7 @@ vignettes/*.pdf
|
|||||||
|
|
||||||
data-collection/*/original/*
|
data-collection/*/original/*
|
||||||
data-collection/*/preprocessed/*
|
data-collection/*/preprocessed/*
|
||||||
|
data-collection.zip
|
||||||
|
|
||||||
# markdown outputs
|
# markdown outputs
|
||||||
*.html
|
*.html
|
||||||
|
23
README.md
23
README.md
@ -3,14 +3,14 @@ Andrzej Wójtowicz
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
Document generation date: 2016-08-11 18:12:19.
|
Document generation date: 2016-08-19 21:47:14.
|
||||||
|
|
||||||
This project preprocesses a few datasets from [UC Irvine Machine Learning
|
This project preprocesses a few datasets from [UC Irvine Machine Learning
|
||||||
Repository](https://archive.ics.uci.edu/ml/) into tidy R object files.
|
Repository](https://archive.ics.uci.edu/ml/) into tidy R object files.
|
||||||
It focuses on the binary classification datasets and saves only complete cases
|
It focuses on the binary classification datasets and saves only complete cases
|
||||||
within a dataset.
|
within a dataset.
|
||||||
|
|
||||||
**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (3.2.5)
|
**R software**: [Microsoft R Open](https://mran.microsoft.com/open/) (3.3.0)
|
||||||
|
|
||||||
**Reproducibility library**: [checkpoint](https://github.com/RevolutionAnalytics/checkpoint)
|
**Reproducibility library**: [checkpoint](https://github.com/RevolutionAnalytics/checkpoint)
|
||||||
|
|
||||||
@ -18,7 +18,11 @@ within a dataset.
|
|||||||
|
|
||||||
1. Run *s1-download-data.R* to download original datasets.
|
1. Run *s1-download-data.R* to download original datasets.
|
||||||
2. Run *s2-preprocess-data.R* to preprocess the datasets.
|
2. Run *s2-preprocess-data.R* to preprocess the datasets.
|
||||||
3. Optionally knit s*3-make-readme.Rmd* to get an overview of the preprocessed datasets.
|
|
||||||
|
Optionally:
|
||||||
|
|
||||||
|
3. knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets,
|
||||||
|
4. run *s4-make-release.sh* to create a zip file with the preprocessed datasets.
|
||||||
|
|
||||||
|
|
||||||
# Table of Contents
|
# Table of Contents
|
||||||
@ -302,20 +306,19 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
**Dataset**:
|
**Dataset**:
|
||||||
|
|
||||||
```nohighlight
|
```nohighlight
|
||||||
'data.frame': 45222 obs. of 14 variables:
|
'data.frame': 46018 obs. of 13 variables:
|
||||||
$ age : int 39 50 38 53 28 37 49 52 31 42 ...
|
$ age : int 39 50 38 53 28 37 49 52 31 42 ...
|
||||||
$ workclass : Factor w/ 8 levels "federal.gov",..: 7 6 4 4 4 4 4 6 4 4 ...
|
$ workclass : Factor w/ 7 levels "federal.gov",..: 6 5 3 3 3 3 3 5 3 3 ...
|
||||||
$ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
|
$ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
|
||||||
$ education : Ord.factor w/ 16 levels "preschool"<"x1st.4th"<..: 13 13 9 7 13 14 5 9 14 13 ...
|
$ education : Ord.factor w/ 5 levels "school"<"highschool"<..: 4 4 2 1 4 5 1 2 5 4 ...
|
||||||
$ marital.status: Factor w/ 7 levels "divorced","married.af.spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
|
$ marital.status: Factor w/ 7 levels "divorced","married.af.spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
|
||||||
$ occupation : Factor w/ 14 levels "adm.clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
|
$ occupation : Factor w/ 13 levels "adm.clerical",..: 1 3 5 5 9 3 7 3 9 3 ...
|
||||||
$ relationship : Factor w/ 6 levels "husband","not.in.family",..: 2 1 2 1 6 6 2 1 2 1 ...
|
$ relationship : Factor w/ 6 levels "husband","not.in.family",..: 2 1 2 1 6 6 2 1 2 1 ...
|
||||||
$ race : Factor w/ 5 levels "amer.indian.eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
|
$ race : Factor w/ 5 levels "amer.indian.eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
|
||||||
$ sex : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 1 2 1 2 ...
|
$ sex : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 1 2 1 2 ...
|
||||||
$ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
|
$ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
|
||||||
$ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
|
$ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
|
||||||
$ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
|
$ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
|
||||||
$ native.country: Factor w/ 41 levels "cambodia","canada",..: 39 39 39 39 5 39 23 39 39 39 ...
|
|
||||||
$ class : Factor w/ 2 levels "x..50k","x.50k": 1 1 1 1 1 1 1 2 2 2 ...
|
$ class : Factor w/ 2 levels "x..50k","x.50k": 1 1 1 1 1 1 1 2 2 2 ...
|
||||||
|
|
||||||
```
|
```
|
||||||
@ -324,7 +327,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
|
|
||||||
|Type | Frequency|
|
|Type | Frequency|
|
||||||
|:--------------|---------:|
|
|:--------------|---------:|
|
||||||
|factor | 7|
|
|factor | 6|
|
||||||
|integer | 5|
|
|integer | 5|
|
||||||
|ordered factor | 1|
|
|ordered factor | 1|
|
||||||
|
|
||||||
@ -333,7 +336,7 @@ https://archive.ics.uci.edu/ml/citation_policy.html
|
|||||||
| class A | class B |
|
| class A | class B |
|
||||||
|:-------:|:-------:|
|
|:-------:|:-------:|
|
||||||
| 25 % | 75 % |
|
| 25 % | 75 % |
|
||||||
| 11208 | 34014 |
|
| 11417 | 34601 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
2
config.R
2
config.R
@ -13,7 +13,7 @@ USER.INIT.FILE = "init.R.user"
|
|||||||
# checkpoint library
|
# checkpoint library
|
||||||
|
|
||||||
CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/"
|
CHECKPOINT.MRAN.URL = "https://mran.microsoft.com/"
|
||||||
CHECKPOINT.SNAPSHOT.DATE = "2016-07-01"
|
CHECKPOINT.SNAPSHOT.DATE = "2016-06-01"
|
||||||
CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url
|
CHECKPOINT.QUICK.LOAD = TRUE # skip testing https and checking url
|
||||||
|
|
||||||
# logging system
|
# logging system
|
||||||
|
@ -42,8 +42,20 @@ preprocess.dataset = function()
|
|||||||
dataset = dataset %>%
|
dataset = dataset %>%
|
||||||
mutate(education = factor(education, levels = education.ordered.levels,
|
mutate(education = factor(education, levels = education.ordered.levels,
|
||||||
ordered = TRUE)) %>%
|
ordered = TRUE)) %>%
|
||||||
select(-education.num) %>%
|
select(-education.num, -native.country) %>% # native.country is too much
|
||||||
filter(complete.cases(.))
|
# biased into US
|
||||||
|
filter(complete.cases(.) & occupation != "Armed-Forces") %>% # only few
|
||||||
|
# cases of
|
||||||
|
# Armed-Forces
|
||||||
|
droplevels
|
||||||
|
|
||||||
|
dataset$education = factor(combine_factor(dataset$education, # combine into
|
||||||
|
c(1, 1, 1, 1, 1, # more numerous
|
||||||
|
1, 1, 1, 2, 3, # groups
|
||||||
|
3, 3, 4, 5, 5, 5)),
|
||||||
|
ordered = TRUE)
|
||||||
|
levels(dataset$education) = c("school", "highschool", "college",
|
||||||
|
"university", "science")
|
||||||
|
|
||||||
return(dataset)
|
return(dataset)
|
||||||
}
|
}
|
1
init.R
1
init.R
@ -47,6 +47,7 @@ library(RCurl)
|
|||||||
library(tools)
|
library(tools)
|
||||||
library(yaml)
|
library(yaml)
|
||||||
|
|
||||||
|
library(reshape)
|
||||||
library(plyr)
|
library(plyr)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
library(foreign)
|
library(foreign)
|
||||||
|
@ -26,7 +26,11 @@ within a dataset.
|
|||||||
|
|
||||||
1. Run *s1-download-data.R* to download original datasets.
|
1. Run *s1-download-data.R* to download original datasets.
|
||||||
2. Run *s2-preprocess-data.R* to preprocess the datasets.
|
2. Run *s2-preprocess-data.R* to preprocess the datasets.
|
||||||
3. Optionally knit s*3-make-readme.Rmd* to get an overview of the preprocessed datasets.
|
|
||||||
|
Optionally:
|
||||||
|
|
||||||
|
3. knit *s3-make-readme.Rmd* to get an overview of the preprocessed datasets,
|
||||||
|
4. run *s4-make-release.sh* to create a zip file with the preprocessed datasets.
|
||||||
|
|
||||||
```{r show-datasets, results='asis'}
|
```{r show-datasets, results='asis'}
|
||||||
|
|
||||||
|
14
s4-make-release.sh
Normal file
14
s4-make-release.sh
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
#
# Packs every preprocessed dataset (data-collection/<name>/preprocessed/*.rds)
# into one release archive, then renames each archive entry to <name>.rds.
# Must be run from the repository root, because all paths are relative.

OUT_ZIP_FILE="data-collection.zip"

# Remove any archive left over from a previous run.
rm -f -- "$OUT_ZIP_FILE"

# With nullglob an unmatched pattern expands to nothing instead of to the
# literal pattern text, so the emptiness check below is reliable.
shopt -s nullglob
rds_files=(data-collection/*/preprocessed/*.rds)

if [ "${#rds_files[@]}" -eq 0 ]; then
    echo "No preprocessed .rds files found under data-collection/" >&2
    exit 1
fi

# The array expansion keeps each path as a single argument, even when it
# contains whitespace.
zip "$OUT_ZIP_FILE" "${rds_files[@]}" || exit 1

for f in "${rds_files[@]}" ; do
    # data-collection/<name>/preprocessed/<file>.rds -> <name>
    dataset_name=$(echo "$f" | sed -e 's/data-collection\/\(.*\)\/preprocessed\/.*\.rds/\1/')
    echo "Renaming $f -> $dataset_name.rds"
    # https://stackoverflow.com/a/16710654
    # File names are passed as printf arguments, never as the format string,
    # so a '%' or '\' in a name cannot corrupt the zipnote input.
    printf "@ %s\n@=%s\n" "$f" "$dataset_name.rds" | zipnote -w "$OUT_ZIP_FILE"
done
|
Loading…
Reference in New Issue
Block a user