81 lines
2.2 KiB
Python
81 lines
2.2 KiB
Python
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
|
||
|
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||
|
from sklearn.metrics import precision_recall_fscore_support
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
|
||
|
# Columns requested from the tab-separated Titanic data file.
RAW_COLUMNS = [
    "Survived",
    "PassengerId",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
    "Embarked",
]

# Load the data set; row 0 of the file supplies the column names.
alldata = pd.read_csv("titanic.tsv", sep="\t", header=0, usecols=RAW_COLUMNS)
|
||
|
# Ticket and Cabin are discarded first (original author's note: too many
# missing values), so their gaps do not cause rows to be dropped below.
alldata = alldata.drop(columns=["Ticket", "Cabin"])

# Remove every row that still has a missing value in a remaining column.
# This has to happen BEFORE the categorical encodings: both encoders send any
# unrecognised value (NaN included) to their fallback code, so encoding first
# would silently turn a missing "Sex" into 1 and keep the incomplete row.
alldata = alldata.dropna()

# Replace the two text columns with numeric codes:
#   Sex:      0 for "female", 1 otherwise (male)
#   Embarked: 1 for "C" (Cherbourg), 2 for "Q" (Queenstown),
#             3 otherwise (Southampton)
alldata = alldata.assign(
    Sex=alldata["Sex"].apply(lambda sex: 0 if sex == "female" else 1),
    Embarked=alldata["Embarked"].apply(
        lambda port: 1 if port == "C" else 2 if port == "Q" else 3
    ),
)

# The passenger name is dropped because it gives the model no information.
alldata = alldata.drop(columns=["Name"])

print(alldata)
|
||
|
|
||
|
# Input columns for the model; the target column "Survived" is appended
# separately when the frame is narrowed below.
# NOTE(review): "PassengerId" looks like a row identifier rather than a
# predictive attribute - confirm it is really meant to be a feature.
FEATURES = [
    "PassengerId",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# Keep only the feature columns, followed by the target column.
alldata = alldata[FEATURES + ["Survived"]]
|
||
|
|
||
|
# Split the data into a training set (70%) and a test set (30%).
# The fixed seed makes the shuffle reproducible, so successive runs train and
# evaluate on the same rows instead of a fresh random split each time.
data_train, data_test = train_test_split(alldata, test_size=0.3, random_state=42)
|
||
|
|
||
|
# Train the model.
x_train = data_train.loc[:, FEATURES]
y_train = data_train.loc[:, ["Survived"]]

# Model definition: logistic regression with the L-BFGS solver and an
# iteration cap of 10000.
model = LogisticRegression(max_iter=10000, solver="lbfgs")

# Fit the model; the single-column target frame is flattened to a 1-D array.
model.fit(x_train, y_train.to_numpy().ravel())
|
||
|
|
||
|
# Predict outcomes for the test data.
x_test = data_test.loc[:, FEATURES]
y_expected = data_test.loc[:, ["Survived"]]

# Labels predicted by the fitted model for the test rows.
y_predicted = model.predict(x_test)

# Show the first ten predicted labels.
print(y_predicted[:10])
|
||
|
|
||
|
# Precision, recall and F-score of the positive class (Survived == 1).
# average="binary" is used rather than "micro": on a two-class problem micro
# averaging pools the counts of both classes, which makes all three values
# equal to plain accuracy and merely repeats the model score printed below.
# The fourth return value (support) is None whenever an average is requested.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_expected.values.ravel(), y_predicted, average="binary", pos_label=1
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {fscore}")

# Mean accuracy of the classifier on the test set.
score = model.score(x_test, y_expected)

print(f"Model score: {score}")
|
||
|
|