From abc240bb2defb62e204d3f26cda3d0c5384acc9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20W=C3=B3jtowicz?= Date: Thu, 23 Jun 2016 12:36:39 +0200 Subject: [PATCH] Added extra preprocessing for column names and factor levels --- README.md | 374 +++++++++++++++++++++++----------------------- data-preprocess.R | 22 ++- 2 files changed, 208 insertions(+), 188 deletions(-) diff --git a/README.md b/README.md index b93e438..bd2d0f3 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Andrzej Wójtowicz -Document generation date: 2016-04-16 14:20:08. +Document generation date: 2016-06-23 11:44:00. @@ -42,14 +42,14 @@ S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of ```nohighlight 'data.frame': 38227 obs. of 18 variables: $ age : int 56 57 37 40 56 45 59 24 25 25 ... - $ job : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 10 8 8 ... + $ job : Factor w/ 11 levels "admin","blue.collar",..: 4 8 8 1 8 8 1 10 8 8 ... $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 2 2 3 3 3 ... $ education : Ord.factor w/ 6 levels "basic.4y"<"basic.6y"<..: 1 4 4 2 4 3 5 5 4 4 ... $ housing : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 1 2 2 2 ... $ loan : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ... $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ... $ month : Ord.factor w/ 12 levels "jan"<"feb"<"mar"<..: 5 5 5 5 5 5 5 5 5 5 ... - $ day_of_week : Ord.factor w/ 5 levels "mon"<"tue"<"wed"<..: 1 1 1 1 1 1 1 1 1 1 ... + $ day.of.week : Ord.factor w/ 5 levels "mon"<"tue"<"wed"<..: 1 1 1 1 1 1 1 1 1 1 ... $ campaign : int 1 1 1 1 1 1 1 1 1 1 ... $ previous : int 0 0 0 0 0 0 0 0 0 0 ... $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ... @@ -101,37 +101,37 @@ https://archive.ics.uci.edu/ml/citation_policy.html ```nohighlight 'data.frame': 569 obs. of 31 variables: - $ mean radius : num 18 20.6 19.7 11.4 20.3 ... - $ mean texture : num 10.4 17.8 21.2 20.4 14.3 ... - $ mean perimeter : num 122.8 132.9 130 77.6 135.1 ... - $ mean area : num 1001 1326 1203 386 1297 ... - $ mean smoothness : num 0.1184 0.0847 0.1096 0.1425 0.1003 ... - $ mean compactness : num 0.2776 0.0786 0.1599 0.2839 0.1328 ... - $ mean concavity : num 0.3001 0.0869 0.1974 0.2414 0.198 ... - $ mean concave points : num 0.1471 0.0702 0.1279 0.1052 0.1043 ... - $ mean symmetry : num 0.242 0.181 0.207 0.26 0.181 ... - $ mean fractal dimension : num 0.0787 0.0567 0.06 0.0974 0.0588 ... - $ se radius : num 1.095 0.543 0.746 0.496 0.757 ... - $ se texture : num 0.905 0.734 0.787 1.156 0.781 ... - $ se perimeter : num 8.59 3.4 4.58 3.44 5.44 ... - $ se area : num 153.4 74.1 94 27.2 94.4 ... - $ se smoothness : num 0.0064 0.00522 0.00615 0.00911 0.01149 ... - $ se compactness : num 0.049 0.0131 0.0401 0.0746 0.0246 ... - $ se concavity : num 0.0537 0.0186 0.0383 0.0566 0.0569 ... - $ se concave points : num 0.0159 0.0134 0.0206 0.0187 0.0188 ... - $ se symmetry : num 0.03 0.0139 0.0225 0.0596 0.0176 ... - $ se fractal dimension : num 0.00619 0.00353 0.00457 0.00921 0.00511 ... - $ worst radius : num 25.4 25 23.6 14.9 22.5 ... - $ worst texture : num 17.3 23.4 25.5 26.5 16.7 ... - $ worst perimeter : num 184.6 158.8 152.5 98.9 152.2 ... - $ worst area : num 2019 1956 1709 568 1575 ... - $ worst smoothness : num 0.162 0.124 0.144 0.21 0.137 ... - $ worst compactness : num 0.666 0.187 0.424 0.866 0.205 ... - $ worst concavity : num 0.712 0.242 0.45 0.687 0.4 ... - $ worst concave points : num 0.265 0.186 0.243 0.258 0.163 ... - $ worst symmetry : num 0.46 0.275 0.361 0.664 0.236 ... - $ worst fractal dimension: num 0.1189 0.089 0.0876 0.173 0.0768 ... - $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ... + $ mean.radius : num 18 20.6 19.7 11.4 20.3 ... + $ mean.texture : num 10.4 17.8 21.2 20.4 14.3 ... + $ mean.perimeter : num 122.8 132.9 130 77.6 135.1 ... + $ mean.area : num 1001 1326 1203 386 1297 ... + $ mean.smoothness : num 0.1184 0.0847 0.1096 0.1425 0.1003 ... + $ mean.compactness : num 0.2776 0.0786 0.1599 0.2839 0.1328 ... + $ mean.concavity : num 0.3001 0.0869 0.1974 0.2414 0.198 ... + $ mean.concave.points : num 0.1471 0.0702 0.1279 0.1052 0.1043 ... + $ mean.symmetry : num 0.242 0.181 0.207 0.26 0.181 ... + $ mean.fractal.dimension : num 0.0787 0.0567 0.06 0.0974 0.0588 ... + $ se.radius : num 1.095 0.543 0.746 0.496 0.757 ... + $ se.texture : num 0.905 0.734 0.787 1.156 0.781 ... + $ se.perimeter : num 8.59 3.4 4.58 3.44 5.44 ... + $ se.area : num 153.4 74.1 94 27.2 94.4 ... + $ se.smoothness : num 0.0064 0.00522 0.00615 0.00911 0.01149 ... + $ se.compactness : num 0.049 0.0131 0.0401 0.0746 0.0246 ... + $ se.concavity : num 0.0537 0.0186 0.0383 0.0566 0.0569 ... + $ se.concave.points : num 0.0159 0.0134 0.0206 0.0187 0.0188 ... + $ se.symmetry : num 0.03 0.0139 0.0225 0.0596 0.0176 ... + $ se.fractal.dimension : num 0.00619 0.00353 0.00457 0.00921 0.00511 ... + $ worst.radius : num 25.4 25 23.6 14.9 22.5 ... + $ worst.texture : num 17.3 23.4 25.5 26.5 16.7 ... + $ worst.perimeter : num 184.6 158.8 152.5 98.9 152.2 ... + $ worst.area : num 2019 1956 1709 568 1575 ... + $ worst.smoothness : num 0.162 0.124 0.144 0.21 0.137 ... + $ worst.compactness : num 0.666 0.187 0.424 0.866 0.205 ... + $ worst.concavity : num 0.712 0.242 0.45 0.687 0.4 ... + $ worst.concave.points : num 0.265 0.186 0.243 0.258 0.163 ... + $ worst.symmetry : num 0.46 0.275 0.361 0.664 0.236 ... + $ worst.fractal.dimension: num 0.1189 0.089 0.0876 0.173 0.0768 ... + $ diagnosis : Factor w/ 2 levels "b","m": 2 2 2 2 2 2 2 2 2 2 ... ``` @@ -170,16 +170,16 @@ O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear programming", ```nohighlight 'data.frame': 683 obs. of 10 variables: - $ Clump Thickness : int 5 5 3 6 4 8 1 2 2 4 ... - $ Uniformity of Cell Size : int 1 4 1 8 1 10 1 1 1 2 ... - $ Uniformity of Cell Shape : int 1 4 1 8 1 10 1 2 1 1 ... - $ Marginal Adhesion : int 1 5 1 1 3 8 1 1 1 1 ... - $ Single Epithelial Cell Size: int 2 7 2 3 2 7 2 2 2 2 ... - $ Bare Nuclei : int 2 3 4 6 2 3 3 2 2 2 ... - $ Bland Chromatin : int 3 3 3 3 3 9 3 3 1 2 ... - $ Normal Nucleoli : int 1 2 1 7 1 7 1 1 1 1 ... - $ Mitoses : int 1 1 1 1 1 1 1 1 5 1 ... - $ Class : Factor w/ 2 levels "2","4": 1 1 1 1 1 2 1 1 1 1 ... + $ clump.thickness : int 5 5 3 6 4 8 1 2 2 4 ... + $ uniformity.of.cell.size : int 1 4 1 8 1 10 1 1 1 2 ... + $ uniformity.of.cell.shape : int 1 4 1 8 1 10 1 2 1 1 ... + $ marginal.adhesion : int 1 5 1 1 3 8 1 1 1 1 ... + $ single.epithelial.cell.size: int 2 7 2 3 2 7 2 2 2 2 ... + $ bare.nuclei : int 2 3 4 6 2 3 3 2 2 2 ... + $ bland.chromatin : int 3 3 3 3 3 9 3 3 1 2 ... + $ normal.nucleoli : int 1 2 1 7 1 7 1 1 1 1 ... + $ mitoses : int 1 1 1 1 1 1 1 1 5 1 ... + $ class : Factor w/ 2 levels "x2","x4": 1 1 1 1 1 2 1 1 1 1 ... ``` @@ -217,36 +217,36 @@ Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of C ```nohighlight 'data.frame': 2126 obs. of 30 variables: - $ LB : int 120 132 133 134 132 134 134 122 122 122 ... - $ AC : int 0 4 2 2 4 1 1 0 0 0 ... - $ FM : int 0 0 0 0 0 0 0 0 0 0 ... - $ UC : int 0 4 5 6 5 10 9 0 1 3 ... - $ ASTV : int 73 17 16 16 16 26 29 83 84 86 ... - $ MSTV : num 0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ... - $ ALTV : int 43 0 0 0 0 0 0 6 5 6 ... - $ MLTV : num 2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ... - $ DL : int 0 2 2 2 0 9 6 0 0 0 ... - $ DP : int 0 0 0 0 0 2 2 0 0 0 ... - $ Width : int 64 130 130 117 117 150 150 68 68 68 ... - $ Min : int 62 68 68 53 53 50 50 62 62 62 ... - $ Max : int 126 198 198 170 170 200 200 130 130 130 ... - $ Nmax : int 2 6 5 11 9 5 6 0 0 1 ... - $ Nzeros : int 0 1 1 0 0 3 3 0 0 0 ... - $ Mode : int 120 141 141 137 137 76 71 122 122 122 ... - $ Mean : int 137 136 135 134 136 107 107 122 122 122 ... - $ Median : int 121 140 138 137 138 107 106 123 123 123 ... - $ Variance: int 73 12 13 13 11 170 215 3 3 1 ... - $ Tendency: Ord.factor w/ 3 levels "-1"<"0"<"1": 3 2 2 3 3 2 2 3 3 3 ... - $ A : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... - $ B : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ... - $ C : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... - $ D : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... - $ E : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... - $ AD : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 1 1 ... - $ DE : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... - $ LD : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 1 1 1 ... - $ FS : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 2 2 2 ... - $ NSP : Factor w/ 2 levels "1","3": 2 1 1 1 1 2 2 2 2 2 ... + $ lb : int 120 132 133 134 132 134 134 122 122 122 ... + $ ac : int 0 4 2 2 4 1 1 0 0 0 ... + $ fm : int 0 0 0 0 0 0 0 0 0 0 ... + $ uc : int 0 4 5 6 5 10 9 0 1 3 ... + $ astv : int 73 17 16 16 16 26 29 83 84 86 ... + $ mstv : num 0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ... + $ altv : int 43 0 0 0 0 0 0 6 5 6 ... + $ mltv : num 2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ... + $ dl : int 0 2 2 2 0 9 6 0 0 0 ... + $ dp : int 0 0 0 0 0 2 2 0 0 0 ... + $ width : int 64 130 130 117 117 150 150 68 68 68 ... + $ min : int 62 68 68 53 53 50 50 62 62 62 ... + $ max : int 126 198 198 170 170 200 200 130 130 130 ... + $ nmax : int 2 6 5 11 9 5 6 0 0 1 ... + $ nzeros : int 0 1 1 0 0 3 3 0 0 0 ... + $ mode : int 120 141 141 137 137 76 71 122 122 122 ... + $ mean : int 137 136 135 134 136 107 107 122 122 122 ... + $ median : int 121 140 138 137 138 107 106 123 123 123 ... + $ variance: int 73 12 13 13 11 170 215 3 3 1 ... + $ tendency: Ord.factor w/ 3 levels "x.1"<"x0"<"x1": 3 2 2 3 3 2 2 3 3 3 ... + $ a : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 1 1 1 1 1 ... + $ b : Factor w/ 2 levels "x0","x1": 1 1 1 1 2 1 1 1 1 1 ... + $ c : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 1 1 1 1 1 ... + $ d : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 1 1 1 1 1 ... + $ e : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 1 1 1 1 1 ... + $ ad : Factor w/ 2 levels "x0","x1": 1 2 2 2 1 1 1 1 1 1 ... + $ de : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 1 1 1 1 1 ... + $ ld : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 2 2 1 1 1 ... + $ fs : Factor w/ 2 levels "x0","x1": 2 1 1 1 1 1 1 2 2 2 ... + $ nsp : Factor w/ 2 levels "x1","x3": 2 1 1 1 1 2 2 2 2 2 ... ``` @@ -287,30 +287,30 @@ Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for ```nohighlight 'data.frame': 30000 obs. of 24 variables: - $ LIMIT_BAL : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ... - $ SEX : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 2 1 ... - $ EDUCATION : Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 2 2 3 4 4 ... - $ MARRIAGE : Factor w/ 4 levels "0","1","2","3": 2 3 3 2 2 3 3 3 2 3 ... - $ AGE : int 24 26 34 37 57 37 29 23 28 35 ... - $ PAY_0 : int 2 0 0 0 0 0 0 0 0 0 ... - $ PAY_2 : int 2 2 0 0 0 0 0 0 0 0 ... - $ PAY_3 : int 0 0 0 0 0 0 0 0 2 0 ... - $ PAY_4 : int 0 0 0 0 0 0 0 0 0 0 ... - $ PAY_5 : int 0 0 0 0 0 0 0 0 0 0 ... - $ PAY_6 : int 0 2 0 0 0 0 0 0 0 0 ... - $ BILL_AMT1 : int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ... - $ BILL_AMT2 : int 3102 1725 14027 48233 5670 57069 412023 380 14096 0 ... - $ BILL_AMT3 : int 689 2682 13559 49291 35835 57608 445007 601 12108 0 ... - $ BILL_AMT4 : int 0 3272 14331 28314 20940 19394 542653 221 12211 0 ... - $ BILL_AMT5 : int 0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ... - $ BILL_AMT6 : int 0 3261 15549 29547 19131 20024 473944 567 3719 13912 ... - $ PAY_AMT1 : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ... - $ PAY_AMT2 : int 689 1000 1500 2019 36681 1815 40000 601 0 0 ... - $ PAY_AMT3 : int 0 1000 1000 1200 10000 657 38000 0 432 0 ... - $ PAY_AMT4 : int 0 1000 1000 1100 9000 1000 20239 581 1000 13007 ... - $ PAY_AMT5 : int 0 0 1000 1069 689 1000 13750 1687 1000 1122 ... - $ PAY_AMT6 : int 0 2000 5000 1000 679 800 13770 1542 1000 0 ... - $ default payment next month: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ... + $ limit.bal : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ... + $ sex : Factor w/ 2 levels "x1","x2": 2 2 2 2 1 1 1 2 2 1 ... + $ education : Factor w/ 7 levels "x0","x1","x2",..: 3 3 3 3 3 2 2 3 4 4 ... + $ marriage : Factor w/ 4 levels "x0","x1","x2",..: 2 3 3 2 2 3 3 3 2 3 ... + $ age : int 24 26 34 37 57 37 29 23 28 35 ... + $ pay.0 : int 2 0 0 0 0 0 0 0 0 0 ... + $ pay.2 : int 2 2 0 0 0 0 0 0 0 0 ... + $ pay.3 : int 0 0 0 0 0 0 0 0 2 0 ... + $ pay.4 : int 0 0 0 0 0 0 0 0 0 0 ... + $ pay.5 : int 0 0 0 0 0 0 0 0 0 0 ... + $ pay.6 : int 0 2 0 0 0 0 0 0 0 0 ... + $ bill.amt1 : int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ... + $ bill.amt2 : int 3102 1725 14027 48233 5670 57069 412023 380 14096 0 ... + $ bill.amt3 : int 689 2682 13559 49291 35835 57608 445007 601 12108 0 ... + $ bill.amt4 : int 0 3272 14331 28314 20940 19394 542653 221 12211 0 ... + $ bill.amt5 : int 0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ... + $ bill.amt6 : int 0 3261 15549 29547 19131 20024 473944 567 3719 13912 ... + $ pay.amt1 : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ... + $ pay.amt2 : int 689 1000 1500 2019 36681 1815 40000 601 0 0 ... + $ pay.amt3 : int 0 1000 1000 1200 10000 657 38000 0 432 0 ... + $ pay.amt4 : int 0 1000 1000 1100 9000 1000 20239 581 1000 13007 ... + $ pay.amt5 : int 0 0 1000 1069 689 1000 13750 1687 1000 1122 ... + $ pay.amt6 : int 0 2000 5000 1000 679 800 13770 1542 1000 0 ... + $ default.payment.next.month: Factor w/ 2 levels "x0","x1": 2 2 1 1 1 1 1 1 1 1 ... ``` @@ -350,17 +350,17 @@ https://archive.ics.uci.edu/ml/citation_policy.html ```nohighlight 'data.frame': 583 obs. of 11 variables: - $ Age : int 65 62 62 58 72 46 26 29 17 55 ... - $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 1 1 2 2 ... - $ TB : num 0.7 10.9 7.3 1 3.9 1.8 0.9 0.9 0.9 0.7 ... - $ DB : num 0.1 5.5 4.1 0.4 2 0.7 0.2 0.3 0.3 0.2 ... - $ Alkphos : int 187 699 490 182 195 208 154 202 202 290 ... - $ Sgpt : int 16 64 60 14 27 19 16 14 22 53 ... - $ Sgot : int 18 100 68 20 59 14 12 11 19 58 ... - $ TP : num 6.8 7.5 7 6.8 7.3 7.6 7 6.7 7.4 6.8 ... - $ ALB : num 3.3 3.2 3.3 3.4 2.4 4.4 3.5 3.6 4.1 3.4 ... - $ A/G Ratio: num 0.9 0.74 0.89 1 0.4 1.3 1 1.1 1.2 1 ... - $ Selector : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ... + $ age : int 65 62 62 58 72 46 26 29 17 55 ... + $ gender : Factor w/ 2 levels "female","male": 1 2 2 2 2 2 1 1 2 2 ... + $ tb : num 0.7 10.9 7.3 1 3.9 1.8 0.9 0.9 0.9 0.7 ... + $ db : num 0.1 5.5 4.1 0.4 2 0.7 0.2 0.3 0.3 0.2 ... + $ alkphos : int 187 699 490 182 195 208 154 202 202 290 ... + $ sgpt : int 16 64 60 14 27 19 16 14 22 53 ... + $ sgot : int 18 100 68 20 59 14 12 11 19 58 ... + $ tp : num 6.8 7.5 7 6.8 7.3 7.6 7 6.7 7.4 6.8 ... + $ alb : num 3.3 3.2 3.3 3.4 2.4 4.4 3.5 3.6 4.1 3.4 ... + $ a.g.ratio: num 0.9 0.74 0.89 1 0.4 1.3 1 1.1 1.2 1 ... + $ selector : Factor w/ 2 levels "x1","x2": 1 1 1 1 1 1 1 1 2 1 ... ``` @@ -402,16 +402,16 @@ https://archive.ics.uci.edu/ml/citation_policy.html ```nohighlight 'data.frame': 19020 obs. of 11 variables: - $ fLength : num 28.8 31.6 162.1 23.8 75.1 ... - $ fWidth : num 16 11.72 136.03 9.57 30.92 ... - $ fSize : num 2.64 2.52 4.06 2.34 3.16 ... - $ fConc : num 0.3918 0.5303 0.0374 0.6147 0.3168 ... - $ fConc1 : num 0.1982 0.3773 0.0187 0.3922 0.1832 ... - $ fAsym : num 27.7 26.27 116.74 27.21 -5.53 ... - $ fM3Long : num 22.01 23.82 -64.86 -6.46 28.55 ... - $ fM3Trans: num -8.2 -9.96 -45.22 -7.15 21.84 ... - $ fAlpha : num 40.09 6.36 76.96 10.45 4.65 ... - $ fDist : num 81.9 205.3 256.8 116.7 356.5 ... + $ flength : num 28.8 31.6 162.1 23.8 75.1 ... + $ fwidth : num 16 11.72 136.03 9.57 30.92 ... + $ fsize : num 2.64 2.52 4.06 2.34 3.16 ... + $ fconc : num 0.3918 0.5303 0.0374 0.6147 0.3168 ... + $ fconc1 : num 0.1982 0.3773 0.0187 0.3922 0.1832 ... + $ fasym : num 27.7 26.27 116.74 27.21 -5.53 ... + $ fm3long : num 22.01 23.82 -64.86 -6.46 28.55 ... + $ fm3trans: num -8.2 -9.96 -45.22 -7.15 21.84 ... + $ falpha : num 40.09 6.36 76.96 10.45 4.65 ... + $ fdist : num 81.9 205.3 256.8 116.7 356.5 ... $ class : Factor w/ 2 levels "g","h": 1 1 1 1 1 1 1 1 1 1 ... ``` @@ -452,7 +452,7 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d 'data.frame': 2584 obs. of 16 variables: $ seismic : Factor w/ 2 levels "a","b": 1 1 1 1 1 1 1 1 1 1 ... $ seismoacoustic: Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ... - $ shift : Factor w/ 2 levels "N","W": 1 1 1 1 1 2 2 1 1 2 ... + $ shift : Factor w/ 2 levels "n","w": 1 1 1 1 1 2 2 1 1 2 ... $ genergy : int 15180 14720 8050 28820 12640 63760 207930 48990 100190 247620 ... $ gpuls : int 48 33 30 171 57 195 614 194 303 675 ... $ gdenergy : int -72 -70 -81 -23 -63 -73 -6 -27 54 4 ... @@ -465,7 +465,7 @@ Sikora M., Wrobel L.: Application of rule induction algorithms for analysis of d $ nbumps5 : int 0 0 0 0 0 0 0 0 0 0 ... $ energy : int 0 2000 0 3000 0 0 1000 4000 0 500 ... $ maxenergy : int 0 2000 0 3000 0 0 700 4000 0 500 ... - $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... + $ class : Factor w/ 2 levels "x0","x1": 1 1 1 1 1 1 1 1 1 1 ... ``` @@ -507,64 +507,64 @@ https://archive.ics.uci.edu/ml/citation_policy.html ```nohighlight 'data.frame': 4601 obs. of 58 variables: - $ word_freq_make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ... - $ word_freq_address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ... - $ word_freq_all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ... - $ word_freq_3d : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ... - $ word_freq_over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ... - $ word_freq_remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ... - $ word_freq_internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ... - $ word_freq_order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ... - $ word_freq_mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ... - $ word_freq_receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ... - $ word_freq_will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ... - $ word_freq_people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ... - $ word_freq_report : num 0 0.21 0 0 0 0 0 0 0 0 ... - $ word_freq_addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ... - $ word_freq_free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ... - $ word_freq_business : num 0 0.07 0.06 0 0 0 0 0 0 0 ... - $ word_freq_email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ... - $ word_freq_you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ... - $ word_freq_credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ... - $ word_freq_your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ... - $ word_freq_font : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ... - $ word_freq_money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ... - $ word_freq_hp : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_hpl : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_george : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_650 : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_lab : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_labs : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_telnet : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_857 : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_data : num 0 0 0 0 0 0 0 0 0.15 0 ... - $ word_freq_415 : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_85 : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_technology : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_1999 : num 0 0.07 0 0 0 0 0 0 0 0 ... - $ word_freq_parts : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_pm : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_direct : num 0 0 0.06 0 0 0 0 0 0 0 ... - $ word_freq_cs : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_meeting : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_original : num 0 0 0.12 0 0 0 0 0 0.3 0 ... - $ word_freq_project : num 0 0 0 0 0 0 0 0 0 0.06 ... - $ word_freq_re : num 0 0 0.06 0 0 0 0 0 0 0 ... - $ word_freq_edu : num 0 0 0.06 0 0 0 0 0 0 0 ... - $ word_freq_table : num 0 0 0 0 0 0 0 0 0 0 ... - $ word_freq_conference : num 0 0 0 0 0 0 0 0 0 0 ... - $ char_freq_; : num 0 0 0.01 0 0 0 0 0 0 0.04 ... - $ char_freq_( : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ... - $ char_freq_[ : num 0 0 0 0 0 0 0 0 0 0 ... - $ char_freq_! : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ... - $ char_freq_$ : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ... - $ char_freq_# : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ... - $ capital_run_length_average: num 3.76 5.11 9.82 3.54 3.54 ... - $ capital_run_length_longest: int 61 101 485 40 40 15 4 11 445 43 ... - $ capital_run_length_total : int 278 1028 2259 191 191 54 112 49 1257 749 ... - $ class : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ... + $ word.freq.make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ... + $ word.freq.address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ... + $ word.freq.all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ... + $ word.freq.3d : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ... + $ word.freq.over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ... + $ word.freq.remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ... + $ word.freq.internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ... + $ word.freq.order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ... + $ word.freq.mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ... + $ word.freq.receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ... + $ word.freq.will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ... + $ word.freq.people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ... + $ word.freq.report : num 0 0.21 0 0 0 0 0 0 0 0 ... + $ word.freq.addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ... + $ word.freq.free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ... + $ word.freq.business : num 0 0.07 0.06 0 0 0 0 0 0 0 ... + $ word.freq.email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ... + $ word.freq.you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ... + $ word.freq.credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ... + $ word.freq.your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ... + $ word.freq.font : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ... + $ word.freq.money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ... + $ word.freq.hp : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.hpl : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.george : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.650 : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.lab : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.labs : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.telnet : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.857 : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.data : num 0 0 0 0 0 0 0 0 0.15 0 ... + $ word.freq.415 : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.85 : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.technology : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.1999 : num 0 0.07 0 0 0 0 0 0 0 0 ... + $ word.freq.parts : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.pm : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.direct : num 0 0 0.06 0 0 0 0 0 0 0 ... + $ word.freq.cs : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.meeting : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.original : num 0 0 0.12 0 0 0 0 0 0.3 0 ... + $ word.freq.project : num 0 0 0 0 0 0 0 0 0 0.06 ... + $ word.freq.re : num 0 0 0.06 0 0 0 0 0 0 0 ... + $ word.freq.edu : num 0 0 0.06 0 0 0 0 0 0 0 ... + $ word.freq.table : num 0 0 0 0 0 0 0 0 0 0 ... + $ word.freq.conference : num 0 0 0 0 0 0 0 0 0 0 ... + $ char.freq.. : num 0 0 0.01 0 0 0 0 0 0 0.04 ... + $ char.freq...1 : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ... + $ char.freq...2 : num 0 0 0 0 0 0 0 0 0 0 ... + $ char.freq...3 : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ... + $ char.freq...4 : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ... + $ char.freq...5 : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ... + $ capital.run.length.average: num 3.76 5.11 9.82 3.54 3.54 ... + $ capital.run.length.longest: int 61 101 485 40 40 15 4 11 445 43 ... + $ capital.run.length.total : int 278 1028 2259 191 191 54 112 49 1257 749 ... + $ class : Factor w/ 2 levels "x0","x1": 2 2 2 2 2 2 2 2 2 2 ... ``` @@ -605,19 +605,19 @@ P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferen ```nohighlight 'data.frame': 6497 obs. of 13 variables: - $ fixed acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ... - $ volatile acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ... - $ citric acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ... - $ residual sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ... + $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ... + $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ... + $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ... + $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ... $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ... - $ free sulfur dioxide : num 45 14 30 47 47 30 30 45 14 28 ... - $ total sulfur dioxide: num 170 132 97 186 186 97 136 170 132 129 ... + $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ... + $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ... $ density : num 1.001 0.994 0.995 0.996 0.996 ... - $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ... + $ ph : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ... $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ... $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ... $ color : Factor w/ 2 levels "red","white": 2 2 2 2 2 2 2 2 2 2 ... - $ quality : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ... + $ quality : Factor w/ 2 levels "x0","x1": 2 2 2 2 2 2 2 2 2 2 ... ``` diff --git a/data-preprocess.R b/data-preprocess.R index 3eb2dc0..2dd9cf3 100644 --- a/data-preprocess.R +++ b/data-preprocess.R @@ -28,7 +28,27 @@ for (dir.name in dir(PATH_DATASETS)) r.src.file = paste0(PATH_DATASETS, dir.name, "/", FILE_PREPROCESSING_SCRIPT) source(r.src.file) - dataset = preprocessDataset() + dataset = preprocessDataset() # custom per-dataset preprocessing + + # change column names + colnames(dataset) = tolower( + make.names( + gsub("^\\.|\\.$", "", colnames(dataset)), + unique=TRUE, allow_=FALSE)) + + # change factor levels + for (name in colnames(dataset)) + { + if (any(class(dataset[[name]]) == "factor")) + { + levels(dataset[[name]]) = tolower( + make.names( + gsub("^\\.|\\.$", "", + levels(dataset[[name]])), + unique=TRUE, allow_=FALSE)) + + } + } printDatasetStatistics(dataset)