uczenie_maszynowe_zadania/cw_5/main.ipynb
2023-07-04 20:42:14 +02:00

9.2 KiB

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
def tfidf_row_sums(texts):
    """Collapse each text to a single number: the sum of its TF-IDF weights.

    A fresh ``TfidfVectorizer`` (default settings) is fitted on ``texts`` and
    every resulting document vector is summed across its terms.

    Args:
        texts: iterable of strings without missing values.

    Returns:
        1-D NumPy array with one float per input text, in input order.
    """
    weights = TfidfVectorizer().fit_transform(texts).toarray()
    return weights.sum(axis=1)


data = pd.read_csv('titanic.tsv', sep='\t')

# Data formatting
# Sex: 1 for 'male', 0 for anything else.
data['Sex'] = data['Sex'].apply(lambda sex: 1 if sex == 'male' else 0)

# Name is reduced to a single flag: 1 when it contains the title 'Mr.'.
data['Name_to_num'] = data['Name'].apply(lambda name: 1 if 'Mr.' in name else 0)
del data['Name']

# Cabin: missing values become the literal token 'Undefined', then the text
# is replaced by the sum of its TF-IDF weights.
data['Cabin'] = tfidf_row_sums(data['Cabin'].fillna('Undefined'))

# Embarked: missing values get their own 'Undefined' category before
# one-hot encoding.
data['Embarked'] = data['Embarked'].fillna('Undefined')
data = pd.get_dummies(data, columns=['Embarked'])

# Age: fill missing values with the column median.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
data[['Age']] = imputer.fit_transform(data[['Age']])

# Ticket: same treatment as Cabin.
data['Ticket'] = tfidf_row_sums(data['Ticket'].fillna('Undefined'))

data.head()
Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined
0 0 530 2 1 23.0 2 1 1.000000 11.5000 1.0 1 0 0 1 0
1 0 466 3 1 38.0 0 0 1.391284 7.0500 1.0 1 0 0 1 0
2 0 753 3 1 33.0 0 0 1.000000 9.5000 1.0 1 0 0 1 0
3 0 855 2 0 44.0 1 0 1.000000 26.0000 1.0 0 0 0 1 0
4 0 333 1 1 38.0 0 1 1.365721 153.4625 1.0 1 0 0 1 0
# Split the data into a training set and a test set.
# A fixed seed keeps the split, and therefore the reported metrics,
# reproducible between runs.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

# Features used by the model
FEATURES = ['Sex', 'Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
x_train = data_train[FEATURES]
y_train = data_train['Survived']

# Train the model
model = LogisticRegression()
model.fit(x_train, y_train)

# Predict on the held-out set
x_test = data_test[FEATURES]
y_expected = data_test['Survived']
y_predicted = model.predict(x_test)

# Evaluate the predictions.
# average='binary' reports precision/recall/F-score for the positive class
# (Survived == 1). With average='micro' on a single-label problem all three
# values are identical to accuracy, so they carried no extra information
# beyond the model score printed below.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_expected, y_predicted, average='binary'
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {fscore}")

# Mean accuracy on the test set
score = model.score(x_test, y_expected)

print(f"Model score: {score}")
Precision: 0.832
Recall: 0.832
F-score: 0.832
Model score: 0.832