9.2 KiB
9.2 KiB
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
# Load the tab-separated Titanic data set.
data = pd.read_csv('titanic.tsv', sep='\t')

# Data formatting.
# Sex becomes a binary flag: 1 for 'male', 0 for every other value.
data['Sex'] = data['Sex'].eq('male').astype('int64')

# Name is reduced to a single flag: 1 when it contains the literal 'Mr.',
# otherwise 0. The raw text column is dropped afterwards.
data['Name_to_num'] = [int('Mr.' in full_name) for full_name in data['Name']]
data.drop(columns=['Name'], inplace=True)
# Cabin: give missing entries an explicit placeholder, then collapse each
# cabin string into one number -- the sum of its TF-IDF term weights.
data['Cabin'] = data['Cabin'].fillna('Undefined')
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(data['Cabin']).toarray()
# One vectorised row-wise sum replaces the manual append loop.
# NOTE(review): if the vectorizer L2-normalises each row (believed to be its
# default), this sum mostly reflects how many tokens a cell holds -- confirm
# that is the intended feature.
vector_sum = vector.sum(axis=1)
data['Cabin'] = vector_sum
# Embarked: label missing ports as 'Undefined', then one-hot encode the column
# (this yields the Embarked_* indicator columns).
data = pd.get_dummies(
    data.fillna({'Embarked': 'Undefined'}),
    columns=['Embarked'],
)

# Age: replace missing values with the median of the observed ages.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# fit_transform returns an (n_rows, 1) array; take its only column.
data['Age'] = imputer.fit_transform(data[['Age']])[:, 0]
# Ticket: give missing entries an explicit placeholder, then collapse each
# ticket string into one number -- the sum of its TF-IDF term weights.
data['Ticket'] = data['Ticket'].fillna('Undefined')
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(data['Ticket']).toarray()
# One vectorised row-wise sum replaces the manual append loop.
# NOTE(review): if the vectorizer L2-normalises each row (believed to be its
# default), this sum mostly reflects how many tokens a cell holds -- confirm
# that is the intended feature.
vector_sum = vector.sum(axis=1)
data['Ticket'] = vector_sum
# Preview the first rows of the prepared frame (notebook display expression).
data.head()
 | Survived | PassengerId | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Name_to_num | Embarked_C | Embarked_Q | Embarked_S | Embarked_Undefined |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 530 | 2 | 1 | 23.0 | 2 | 1 | 1.000000 | 11.5000 | 1.0 | 1 | 0 | 0 | 1 | 0 |
1 | 0 | 466 | 3 | 1 | 38.0 | 0 | 0 | 1.391284 | 7.0500 | 1.0 | 1 | 0 | 0 | 1 | 0 |
2 | 0 | 753 | 3 | 1 | 33.0 | 0 | 0 | 1.000000 | 9.5000 | 1.0 | 1 | 0 | 0 | 1 | 0 |
3 | 0 | 855 | 2 | 0 | 44.0 | 1 | 0 | 1.000000 | 26.0000 | 1.0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 333 | 1 | 1 | 38.0 | 0 | 1 | 1.365721 | 153.4625 | 1.0 | 1 | 0 | 0 | 1 | 0 |
# Split the data into a training set and a test set (80% / 20%).
# A fixed random_state makes the split -- and every score computed from it --
# reproducible from run to run.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)
# Feature columns fed to the classifier.
FEATURES = ['Sex', 'Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

# Inputs and targets for both partitions; the targets are kept as
# one-column frames holding 'Survived'.
x_train = data_train[FEATURES]
y_train = data_train[['Survived']]
x_test = data_test[FEATURES]
y_expected = data_test[['Survived']]

# Train the model; the target frame is flattened to a 1-D array for fit().
model = LogisticRegression()
model.fit(x_train, y_train.to_numpy().ravel())

# Predict the outcome for the held-out rows.
y_predicted = model.predict(x_test)
# Evaluate the predictions.
# average='binary' reports precision, recall and F-score for the positive
# class (Survived == 1). With average='micro' on a single-label problem all
# three collapse to plain accuracy and only repeat model.score() below.
# `support` is None whenever an average is requested; the name is kept so the
# tuple unpacking stays unchanged.
precision, recall, fscore, support = precision_recall_fscore_support(
    y_expected, y_predicted, average='binary'
)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {fscore}")

# Mean accuracy of the classifier on the test partition.
score = model.score(x_test, y_expected)
print(f"Model score: {score}")
Precision: 0.832 Recall: 0.832 F-score: 0.832 Model score: 0.832