projekt_rozpoznywanie_grzybow/mushrooms.ipynb

import pandas as pd

# Load the training set and the dev (verification) set; the TSV files have no
# header row, so pandas assigns integer column labels.
df = pd.read_csv('mushrooms/train/train.tsv', sep='\t', header=None)
X_ver = pd.read_csv('mushrooms/dev-0/in.tsv', sep='\t', header=None)
y_ver = pd.read_csv('mushrooms/dev-0/expected.tsv', sep='\t', header=None)

# First rows, missing-value counts, shape.
print(df.head())
print(df.isna().sum())
print(df.shape)
  0  1  2  3  4  5  6  7  8  9   ... 13 14 15 16 17 18 19 20 21 22
0  p  x  s  n  t  p  f  c  n  k  ...  s  w  w  p  w  o  p  k  s  u
1  e  x  s  y  t  a  f  c  b  k  ...  s  w  w  p  w  o  p  n  n  g
2  p  x  y  w  t  p  f  c  n  n  ...  s  w  w  p  w  o  p  k  s  u
3  e  x  s  g  f  n  f  w  b  k  ...  s  w  w  p  w  o  e  n  a  g
4  e  x  y  y  t  a  f  c  b  n  ...  s  w  w  p  w  o  p  k  n  g

[5 rows x 23 columns]
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
dtype: int64
(6465, 23)
# Number of distinct values per column of the training set.
df.nunique()
0      2
1      6
2      4
3     10
4      2
5      9
6      2
7      2
8      2
9     12
10     2
11     5
12     4
13     4
14     9
15     9
16     1
17     4
18     3
19     5
20     9
21     6
22     7
dtype: int64
# Same check for the dev set. Its columns are shifted by one relative to train
# (the label lives in a separate file): dev column 1 has only 3 distinct
# values, while the matching train column 2 has 4.
X_ver.nunique()
0      6
1      3
2     10
3      2
4      9
5      2
6      2
7      2
8     12
9      2
10     5
11     4
12     4
13     9
14     9
15     1
16     4
17     3
18     5
19     9
20     6
21     7
dtype: int64
# Binarize the dev labels: after drop_first, column '0_p' is 1 for poisonous, 0 for edible.
y_ver = pd.get_dummies(y_ver, columns=[0], drop_first=True)

# Dev column 1 lacks one category seen in train column 2; declaring the full
# training category set makes get_dummies emit the same dummies for both frames.
df2_unique = df[2].unique()
X_ver[1] = pd.Categorical(X_ver[1], categories=df2_unique)
X_ver = pd.get_dummies(X_ver)
X_ver
0_b 0_c 0_f 0_k 0_s 0_x 1_s 1_y 1_f 1_g ... 20_s 20_v 20_y 21_d 21_g 21_l 21_m 21_p 21_u 21_w
0 1 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
1 0 0 0 0 0 1 0 1 0 0 ... 0 1 0 0 1 0 0 0 0 0
2 1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 1 0 0 0
3 1 0 0 0 0 0 1 0 0 0 ... 1 0 0 0 0 0 1 0 0 0
4 0 0 0 0 0 1 0 1 0 0 ... 0 0 0 0 0 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
787 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 1 0 0 0 0 0 0
788 0 0 1 0 0 0 1 0 0 0 ... 0 1 0 0 0 1 0 0 0 0
789 0 0 0 1 0 0 1 0 0 0 ... 0 1 0 0 0 1 0 0 0 0
790 0 0 0 1 0 0 0 1 0 0 ... 0 1 0 0 0 1 0 0 0 0
791 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 0 0 1 0 0 0 0

792 rows × 117 columns

# One-hot encode the training set; drop_first on the label column leaves a
# single binary target '0_p' (1 = poisonous).
df = pd.get_dummies(df, columns=[0], drop_first=True)
df = pd.get_dummies(df)
df
0_p 1_b 1_c 1_f 1_k 1_s 1_x 2_f 2_g 2_s ... 21_s 21_v 21_y 22_d 22_g 22_l 22_m 22_p 22_u 22_w
0 1 0 0 0 0 0 1 0 0 1 ... 1 0 0 0 0 0 0 0 1 0
1 0 0 0 0 0 0 1 0 0 1 ... 0 0 0 0 1 0 0 0 0 0
2 1 0 0 0 0 0 1 0 0 0 ... 1 0 0 0 0 0 0 0 1 0
3 0 0 0 0 0 0 1 0 0 1 ... 0 0 0 0 1 0 0 0 0 0
4 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6460 1 0 0 0 1 0 0 0 0 0 ... 0 1 0 1 0 0 0 0 0 0
6461 0 0 0 0 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 0 0 0
6462 0 0 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 1 0 0 0 0
6463 1 0 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 1 0 0 0 0
6464 0 0 0 0 0 0 1 0 0 1 ... 0 0 0 0 0 1 0 0 0 0

6465 rows × 118 columns
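
A more robust route, sketched below, is to skip the manual category alignment and let scikit-learn's OneHotEncoder learn the categories from the training set alone. The names train_raw and dev_raw are hypothetical: they stand for the two frames exactly as loaded from disk, before any get_dummies call.

from sklearn.preprocessing import OneHotEncoder

# Sketch only: train_raw / dev_raw are the frames as read from the TSV files.
train_feats = train_raw.loc[:, 1:].values  # drop the label column 0
dev_feats = dev_raw.values

enc = OneHotEncoder(handle_unknown='ignore')  # a category unseen in train encodes as all zeros
X_train_enc = enc.fit_transform(train_feats)  # categories learned from train only
X_dev_enc = enc.transform(dev_feats)          # guaranteed same columns, same order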

from sklearn.model_selection import train_test_split

# Separate features from the binary target. No random_state is set, so the
# split (and the scores below) will vary from run to run.
X = df.loc[:, df.columns != '0_p']
y = df['0_p']
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(X_test.shape)
(4848, 117)
(1617, 117)
# Rename the dev dummy columns positionally so they match the training naming
# (dev column k corresponds to train column k+1).
X_ver.columns = X_test.columns
X_ver
1_b 1_c 1_f 1_k 1_s 1_x 2_f 2_g 2_s 2_y ... 21_s 21_v 21_y 22_d 22_g 22_l 22_m 22_p 22_u 22_w
0 1 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
1 0 0 0 0 0 1 0 1 0 0 ... 0 1 0 0 1 0 0 0 0 0
2 1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 1 0 0 0
3 1 0 0 0 0 0 1 0 0 0 ... 1 0 0 0 0 0 1 0 0 0
4 0 0 0 0 0 1 0 1 0 0 ... 0 0 0 0 0 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
787 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 1 0 0 0 0 0 0
788 0 0 1 0 0 0 1 0 0 0 ... 0 1 0 0 0 1 0 0 0 0
789 0 0 0 1 0 0 1 0 0 0 ... 0 1 0 0 0 1 0 0 0 0
790 0 0 0 1 0 0 0 1 0 0 ... 0 1 0 0 0 1 0 0 0 0
791 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 0 0 1 0 0 0 0

792 rows × 117 columns
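
The positional rename silently assumes both frames produced exactly the same dummies in exactly the same order. A defensive alternative (hypothetical, applied to the dev dummies before the rename above) shifts each name's numeric prefix and then aligns by column name, so any dummy missing on the dev side becomes an all-zero column:

# '0_b' -> '1_b', etc., to match the training prefixes (train column 0 is the label)
shift = lambda c: '{}_{}'.format(int(c.split('_')[0]) + 1, c.split('_', 1)[1])
X_ver_aligned = X_ver.rename(columns=shift).reindex(columns=X_test.columns, fill_value=0)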

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial features (pairwise interactions of the dummies),
# then logistic regression with weak regularization (C=10).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_train)
lr = LogisticRegression(C=10).fit(X_poly, y_train)
print('{:.2f}'.format(lr.score(X_poly, y_train)))
# transform, not fit_transform: the expansion is fitted on training data only.
print('{:.2f}'.format(lr.score(poly.transform(X_test), y_test)))
1.00
1.00
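
Perfect train and test scores are plausible here, since the mushroom data is nearly separable on these features, but they come from a single random split. A quick sanity check, not in the notebook, is five-fold cross-validation with the expansion inside a pipeline so each fold fits the transformer on its own training portion:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(PolynomialFeatures(degree=2, include_bias=False),
                     LogisticRegression(C=10))
scores = cross_val_score(pipe, X, y, cv=5)
print('{:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))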
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes as a baseline. (BernoulliNB would match these binary
# dummy features more naturally, but GaussianNB still works.)
gnb = GaussianNB()
gnb.fit(X_train, y_train)

print('{:.2f}'.format(gnb.score(X_train, y_train)))
print('{:.2f}'.format(gnb.score(X_test, y_test)))
0.95
0.97
from sklearn.svm import SVC

# RBF-kernel SVM with hand-picked C and gamma.
svc = SVC(kernel='rbf', C=10, gamma=0.1).fit(X_train, y_train)

print('{:.2f}'.format(svc.score(X_train, y_train)))
print('{:.2f}'.format(svc.score(X_test, y_test)))
1.00
1.00
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbors on the one-hot features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print('{:.2f}'.format(knn.score(X_train, y_train)))
print('{:.2f}'.format(knn.score(X_test, y_test)))
1.00
1.00
from sklearn.neural_network import MLPClassifier

# Small MLP: one hidden layer of 10 ReLU units, trained with L-BFGS.
mlp = MLPClassifier(activation='relu', hidden_layer_sizes=[10], solver='lbfgs').fit(X_train, y_train)

print('{:.2f}'.format(mlp.score(X_train, y_train)))
print('{:.2f}'.format(mlp.score(X_test, y_test)))
1.00
1.00
from sklearn.metrics import classification_report, accuracy_score

# Evaluate each model on the held-out dev set.
pred_bayes = gnb.predict(X_ver)
print('Naive Bayes report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_bayes)))
print(classification_report(y_ver, pred_bayes, target_names=['edible', 'poisonous']))
Naive Bayes report:
Accuracy score 0.91
              precision    recall  f1-score   support

      edible       1.00      0.82      0.90       406
   poisonous       0.84      1.00      0.91       386

    accuracy                           0.91       792
   macro avg       0.92      0.91      0.91       792
weighted avg       0.92      0.91      0.91       792

# transform, not fit_transform: the polynomial expansion was fitted on train.
pred_log = lr.predict(poly.transform(X_ver))
print('Logistic Regression report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_log)))
print(classification_report(y_ver, pred_log, target_names=['edible', 'poisonous']))
Logistic Regression report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792

pred_svc = svc.predict(X_ver)
print('Support Vector Machine report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_svc)))
print(classification_report(y_ver, pred_svc, target_names=['edible', 'poisonous']))
Support Vector Machine report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792

pred_knn = knn.predict(X_ver)
print('K-nearest neighbors report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_knn)))
print(classification_report(y_ver, pred_knn, target_names=['edible', 'poisonous']))
K-nearest neighbors report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792

pred_mlp = mlp.predict(X_ver)
print('Neural network report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_mlp)))
print(classification_report(y_ver, pred_mlp, target_names=['edible', 'poisonous']))
Neural network report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792
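
Four of the five models reach perfect dev-set accuracy; naive Bayes trails at 0.91. A compact wrap-up, sketched below over the already-fitted models (equivalent to the five report cells above), prints one accuracy line per model; logistic regression is handled separately because it needs the polynomial expansion:

for name, model in [('Naive Bayes', gnb), ('SVM', svc), ('k-NN', knn), ('MLP', mlp)]:
    pred = model.predict(X_ver)
    print('{}: accuracy {:.2f}'.format(name, accuracy_score(y_ver, pred)))

# Logistic regression takes the expanded features.
pred = lr.predict(poly.transform(X_ver))
print('Logistic Regression: accuracy {:.2f}'.format(accuracy_score(y_ver, pred)))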