From 4da5529cae52a7130619cd691a42db5802016a98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Sat, 16 Apr 2016 13:57:54 +0200 Subject: [PATCH] Added features statistics and raw numbers of class imbalance --- README.md | 94 +++++++++++++++++++++++++++++++++++++++++++------ readme-make.Rmd | 14 +++++++- 2 files changed, 96 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 840a993..9657b7e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Andrzej Wójtowicz -Document generation date: 2016-04-16 01:21:55. +Document generation date: 2016-04-16 13:55:00. @@ -62,7 +62,16 @@ Document generation date: 2016-04-16 01:21:55. ``` -**Class imbalance**: 11% / 89% +**Predictors**: + +|Class | Frequency| +|:--------------|---------:| +|factor | 6| +|integer | 3| +|numeric | 5| +|ordered factor | 3| + +**Class imbalance**: 11% / 89% (4254 / 33973) --- @@ -121,7 +130,13 @@ https://archive.ics.uci.edu/ml/citation_policy.html ``` -**Class imbalance**: 37% / 63% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|numeric | 30| + +**Class imbalance**: 37% / 63% (212 / 357) --- @@ -158,7 +173,13 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", ``` -**Class imbalance**: 35% / 65% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|integer | 9| + +**Class imbalance**: 35% / 65% (239 / 444) --- @@ -214,7 +235,16 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C ``` -**Class imbalance**: 22% / 78% +**Predictors**: + +|Class | Frequency| +|:--------------|---------:| +|factor | 9| +|integer | 17| +|numeric | 2| +|ordered factor | 1| + +**Class imbalance**: 22% / 78% (471 / 1655) --- @@ -264,7 +294,14 @@ Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for ``` -**Class imbalance**: 22% / 78% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|factor | 3| +|integer | 20| + +**Class imbalance**: 22% / 78% (6636 / 23364) --- @@ -302,7 +339,15 @@ https://archive.ics.uci.edu/ml/citation_policy.html ``` -**Class imbalance**: 29% / 71% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|factor | 1| +|integer | 4| +|numeric | 5| + +**Class imbalance**: 29% / 71% (167 / 416) --- @@ -341,7 +386,13 @@ https://archive.ics.uci.edu/ml/citation_policy.html ``` -**Class imbalance**: 35% / 65% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|numeric | 10| + +**Class imbalance**: 35% / 65% (6688 / 12332) --- @@ -383,7 +434,14 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d ``` -**Class imbalance**: 7% / 93% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|factor | 4| +|integer | 11| + +**Class imbalance**: 7% / 93% (170 / 2414) --- @@ -470,7 +528,14 @@ https://archive.ics.uci.edu/ml/citation_policy.html ``` -**Class imbalance**: 39% / 61% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|integer | 2| +|numeric | 55| + +**Class imbalance**: 39% / 61% (1813 / 2788) --- @@ -511,7 +576,14 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferen ``` -**Class imbalance**: 37% / 63% +**Predictors**: + +|Class | Frequency| +|:-------|---------:| +|factor | 1| +|numeric | 11| + +**Class imbalance**: 37% / 63% (2384 / 4113) --- diff --git a/readme-make.Rmd b/readme-make.Rmd index 86970c2..b5496c0 100644 --- a/readme-make.Rmd +++ b/readme-make.Rmd @@ -64,11 +64,23 @@ for (dir.name in dir(PATH_DATASETS)) cat(str(dataset)) cat("\n```\n\n") + cat("**Predictors**:\n\n") + + df.pred = data.frame(table(sapply(dataset[, 1:(ncol(dataset)-1)], + function(f){paste(class(f), collapse=" ")}))) + colnames(df.pred) = c("Class", "Frequency") + + cat(knitr::kable(df.pred, format="markdown"), sep="\n") + cat("\n") + perc.classes = sort(round(100*as.numeric( table(dataset[, ncol(dataset)]))/nrow(dataset), 0)) + num.classes = sort(as.numeric(table(dataset[, ncol(dataset)]))) cat(paste("**Class imbalance**:", paste0(perc.classes[1], "% / ", - perc.classes[2], "%\n\n"))) + perc.classes[2], "% (", + num.classes[1], " / ", + num.classes[2], ")\n\n"))) cat("---\n\n") } ```