import pandas as pd

# The TSV files have no header row; column 0 of train is the e/p (edible/poisonous) label
df = pd.read_csv('mushrooms/train/train.tsv', sep='\t', header=None)
X_ver = pd.read_csv('mushrooms/dev-0/in.tsv', sep='\t', header=None)
y_ver = pd.read_csv('mushrooms/dev-0/expected.tsv', sep='\t', header=None)
print(df.head())
print(df.isna().sum())
print(df.shape)
   0  1  2  3  4  5  6  7  8  9  ... 13 14 15 16 17 18 19 20 21 22
0  p  x  s  n  t  p  f  c  n  k  ...  s  w  w  p  w  o  p  k  s  u
1  e  x  s  y  t  a  f  c  b  k  ...  s  w  w  p  w  o  p  n  n  g
2  p  x  y  w  t  p  f  c  n  n  ...  s  w  w  p  w  o  p  k  s  u
3  e  x  s  g  f  n  f  w  b  k  ...  s  w  w  p  w  o  e  n  a  g
4  e  x  y  y  t  a  f  c  b  n  ...  s  w  w  p  w  o  p  k  n  g

[5 rows x 23 columns]

isna().sum() is 0 for every column and df.shape is (6465, 23): no missing values anywhere in the training set.
df.nunique()
column:   0  1  2   3  4  5  6  7  8   9  10  11  12  13  14  15  16  17  18  19  20  21  22
nunique:  2  6  4  10  2  9  2  2  2  12   2   5   4   4   9   9   1   4   3   5   9   6   7
X_ver.nunique()
column:   0  1   2  3  4  5  6  7   8  9  10  11  12  13  14  15  16  17  18  19  20  21
nunique:  6  3  10  2  9  2  2  2  12  2   5   4   4   9   9   1   4   3   5   9   6   7
# Binarize the label: after drop_first the single remaining column is 1 for 'p' (poisonous)
y_ver = pd.get_dummies(y_ver, columns=[0], drop_first=True)
# The dev file has no label column, so dev column 1 corresponds to train column 2.
# It contains only 3 of the 4 categories seen in training (see nunique above), so
# pin the full training category set before one-hot encoding to keep columns aligned.
df2_unique = df[2].unique()
X_ver[1] = pd.Categorical(X_ver[1], categories=df2_unique)
X_ver = pd.get_dummies(X_ver)
X_ver
X_ver
0_b | 0_c | 0_f | 0_k | 0_s | 0_x | 1_s | 1_y | 1_f | 1_g | ... | 20_s | 20_v | 20_y | 21_d | 21_g | 21_l | 21_m | 21_p | 21_u | 21_w | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
787 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
788 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
789 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
790 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
791 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
792 rows × 117 columns
# Encode the label (column 0) as a single 0/1 column, then one-hot encode the remaining features
df = pd.get_dummies(df, columns=[0], drop_first=True)
df = pd.get_dummies(df)
df
0_p | 1_b | 1_c | 1_f | 1_k | 1_s | 1_x | 2_f | 2_g | 2_s | ... | 21_s | 21_v | 21_y | 22_d | 22_g | 22_l | 22_m | 22_p | 22_u | 22_w | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6460 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
6461 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
6462 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
6463 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
6464 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
6465 rows × 118 columns
from sklearn.model_selection import train_test_split

# '0_p' is the binarized label (1 = poisonous); everything else is a feature
X = df.loc[:, df.columns != '0_p']
y = df['0_p']
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(X_test.shape)
(4848, 117)
(1617, 117)
# The dev dummies line up positionally with the training features (thanks to the
# Categorical fix above), so rename them to the training column names
X_ver.columns = X_test.columns
X_ver
1_b | 1_c | 1_f | 1_k | 1_s | 1_x | 2_f | 2_g | 2_s | 2_y | ... | 21_s | 21_v | 21_y | 22_d | 22_g | 22_l | 22_m | 22_p | 22_u | 22_w | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
787 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
788 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
789 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
790 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
791 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
792 rows × 117 columns
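The positional rename works here only because the category sets were forced to match. A more defensive pattern is to reindex the dev dummies against the training feature columns; a minimal sketch under the same file layout (dev_raw and X_ver_safe are hypothetical names):

# Shift the dev feature indices by one so they match the training numbering
# (the dev file lacks label column 0), then align on the training columns:
# dummies missing in dev are filled with 0, spurious ones are dropped.
dev_raw = pd.read_csv('mushrooms/dev-0/in.tsv', sep='\t', header=None)
dev_raw.columns = dev_raw.columns + 1
X_ver_safe = pd.get_dummies(dev_raw).reindex(columns=X.columns, fill_value=0)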
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the one-hot features with pairwise interaction terms
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_train)
lr = LogisticRegression(C=10).fit(X_poly, y_train)
print('{:.2f}'.format(lr.score(X_poly, y_train)))
# transform, not fit_transform: the expansion is defined by the training fit
print('{:.2f}'.format(lr.score(poly.transform(X_test), y_test)))
1.00
1.00
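The manual transform bookkeeping can be avoided by bundling the expansion and the classifier, so the feature step is always fitted on the training data only. A minimal sketch using sklearn's Pipeline (pipe is a hypothetical name):

from sklearn.pipeline import make_pipeline

# The pipeline fits PolynomialFeatures during fit() and reuses it in score()
pipe = make_pipeline(PolynomialFeatures(degree=2, include_bias=False),
                     LogisticRegression(C=10))
pipe.fit(X_train, y_train)
print('{:.2f}'.format(pipe.score(X_test, y_test)))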
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, applied directly to the 0/1 dummy features
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('{:.2f}'.format(gnb.score(X_train,y_train)))
print('{:.2f}'.format(gnb.score(X_test,y_test)))
0.95
0.97
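GaussianNB models each feature with a continuous Gaussian likelihood, which is a poor match for 0/1 dummies; BernoulliNB is the naive Bayes variant designed for binary features and may close part of the gap. A hedged sketch (bnb is a hypothetical name):

from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes treats each one-hot column as a biased coin flip
bnb = BernoulliNB().fit(X_train, y_train)
print('{:.2f}'.format(bnb.score(X_test, y_test)))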
from sklearn.svm import SVC

# RBF-kernel SVM with hand-picked C and gamma
svc = SVC(kernel='rbf', C=10, gamma=0.1).fit(X_train, y_train)
print('{:.2f}'.format(svc.score(X_train,y_train)))
print('{:.2f}'.format(svc.score(X_test,y_test)))
1.00
1.00
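C=10 and gamma=0.1 were chosen by hand; a small cross-validated grid search would confirm the choice. A minimal sketch with an illustrative grid (grid is a hypothetical name):

from sklearn.model_selection import GridSearchCV

# 5-fold CV over a small grid around the hand-picked values
grid = GridSearchCV(SVC(kernel='rbf'),
                    {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1]}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, '{:.2f}'.format(grid.best_score_))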
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier on the binary features
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print('{:.2f}'.format(knn.score(X_train,y_train)))
print('{:.2f}'.format(knn.score(X_test,y_test)))
1.00
1.00
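n_neighbors=3 is arbitrary; cross-validation over a few neighbourhood sizes is a cheap sanity check. A sketch:

from sklearn.model_selection import cross_val_score

# Mean 5-fold CV accuracy for a few values of k
for k in (1, 3, 5, 7):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_train, y_train, cv=5)
    print(k, '{:.2f}'.format(scores.mean()))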
from sklearn.neural_network import MLPClassifier

# A single hidden layer of 10 ReLU units, trained with the L-BFGS solver
mlp = MLPClassifier(activation='relu', hidden_layer_sizes=[10],
                    solver='lbfgs').fit(X_train, y_train)
print('{:.2f}'.format(mlp.score(X_train,y_train)))
print('{:.2f}'.format(mlp.score(X_test,y_test)))
1.00
1.00
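lbfgs starts from a random initialization, so the score can vary run to run; pinning random_state (and raising max_iter if a convergence warning appears) makes it reproducible. A sketch (mlp_seeded is a hypothetical name):

# Same architecture with a fixed seed and a larger iteration budget
mlp_seeded = MLPClassifier(activation='relu', hidden_layer_sizes=[10],
                           solver='lbfgs', max_iter=1000, random_state=0)
mlp_seeded.fit(X_train, y_train)
print('{:.2f}'.format(mlp_seeded.score(X_test, y_test)))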
from sklearn.metrics import classification_report,accuracy_score
pred_bayes = gnb.predict(X_ver)
print('Bayes report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_bayes)))
print(classification_report(y_ver, pred_bayes, target_names=['edible', 'poisonous']))
Bayes report:
Accuracy score 0.91
              precision    recall  f1-score   support

      edible       1.00      0.82      0.90       406
   poisonous       0.84      1.00      0.91       386

    accuracy                           0.91       792
   macro avg       0.92      0.91      0.91       792
weighted avg       0.92      0.91      0.91       792
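The 0.82 recall on 'edible' means naive Bayes pushes some edible mushrooms into the poisonous class, which is the safe direction of error for this task; a confusion matrix makes the distribution explicit. A sketch:

from sklearn.metrics import confusion_matrix

# Rows are true classes (edible, poisonous), columns are predictions
print(confusion_matrix(y_ver, pred_bayes))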
# transform, not fit_transform: reuse the expansion fitted on the training set
pred_log = lr.predict(poly.transform(X_ver))
print('Logistic Regression report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_log)))
print(classification_report(y_ver, pred_log, target_names=['edible', 'poisonous']))
Logistic Regression report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792
pred_svc = svc.predict(X_ver)
print('Support Vector Machines report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_svc)))
print(classification_report(y_ver, pred_svc, target_names=['edible', 'poisonous']))
Support Vector Machines report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792
pred_knn = knn.predict(X_ver)
print('K-nearest neighbors report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_knn)))
print(classification_report(y_ver, pred_knn, target_names=['edible', 'poisonous']))
K-nearest neighbors report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792
pred_mlp = mlp.predict(X_ver)
print('Neural network report:')
print('Accuracy score {:.2f}'.format(accuracy_score(y_ver, pred_mlp)))
print(classification_report(y_ver, pred_mlp, target_names=['edible', 'poisonous']))
Neural network report:
Accuracy score 1.00
              precision    recall  f1-score   support

      edible       1.00      1.00      1.00       406
   poisonous       1.00      1.00      1.00       386

    accuracy                           1.00       792
   macro avg       1.00      1.00      1.00       792
weighted avg       1.00      1.00      1.00       792
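For a compact comparison, the fitted models can be scored on the dev set in one loop; a sketch reusing the objects above (the polynomial expansion is applied only for the logistic model):

# Dev-set accuracy for every fitted model in one pass
for name, model in [('Bayes', gnb), ('SVC', svc), ('kNN', knn), ('MLP', mlp)]:
    print('{:<7}{:.2f}'.format(name, accuracy_score(y_ver, model.predict(X_ver))))
print('{:<7}{:.2f}'.format('LogReg', accuracy_score(y_ver, lr.predict(poly.transform(X_ver)))))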