ml-2023SZ/zad5.py

81 lines
2.2 KiB
Python
Raw Normal View History

2024-01-04 21:36:25 +01:00
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
# Columns requested from the tab-separated passenger file.
LOADED_COLUMNS = [
    "Survived",
    "PassengerId",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
    "Embarked",
]

# Read the data set; the first row of the file is used as the header.
alldata = pd.read_csv("titanic.tsv", sep="\t", header=0, usecols=LOADED_COLUMNS)
# Clean the raw frame and turn the categorical columns into numbers.
#
# Ticket and Cabin were discarded by the original author because of their
# many missing values; rows that still contain a NaN afterwards are removed.
# The NaN filter runs BEFORE any encoding so that a missing "Sex" drops the
# row instead of being silently encoded as 1 (male).
alldata = alldata.drop(columns=["Ticket", "Cabin"]).dropna()

alldata = alldata.assign(
    # 0 for a woman ("female"), 1 for every other value.
    Sex=(alldata["Sex"] != "female").astype(int),
    # C (Cherbourg) -> 1, Q (Queenstown) -> 2, anything else
    # (S = Southampton) -> 3.
    Embarked=alldata["Embarked"].map({"C": 1, "Q": 2}).fillna(3).astype(int),
)

# Name is dropped because, per the original author, it carries no useful
# information for the model.
alldata = alldata.drop(columns=["Name"])
print(alldata)
# Input columns handed to the classifier, in this order.
# NOTE(review): "PassengerId" looks like a row identifier rather than a
# predictive signal - confirm it is meant to be a model input.
FEATURES = [
    "PassengerId",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# Every column label except the first, captured before the frame is
# reordered on the next statement.
columns = alldata.columns[1:]

# Keep only the feature columns followed by the "Survived" target column.
alldata = alldata.loc[:, [*FEATURES, "Survived"]]
# Split the rows into a training part (70%) and a test part (30%).
# The fixed random_state makes the shuffle identical on every run, so the
# metrics printed later can be compared between runs.
data_train, data_test = train_test_split(alldata, test_size=0.3, random_state=42)

# Training data: single-column target frame and the feature matrix.
y_train = data_train[["Survived"]]
x_train = data_train[FEATURES]

# Model definition; max_iter is set high (10000) for the lbfgs solver.
model = LogisticRegression(solver="lbfgs", max_iter=10000)
# Fit the model; the target is flattened to 1-D before being passed to fit().
model.fit(x_train, y_train.values.ravel())
# Predict the outcome for the test rows.
y_expected = pd.DataFrame(data_test["Survived"])
x_test = pd.DataFrame(data_test[FEATURES])
y_predicted = model.predict(x_test)  # predictions produced by the fitted model
print(y_predicted[:10])

# average="binary" reports precision/recall/F-score for the positive class
# (label 1 by default). With average="micro" on a two-class problem all three
# numbers are identical to the accuracy, so they would only duplicate the
# "Model score" line below.
# NOTE(review): assumes "Survived" holds exactly two classes (0/1) - confirm.
precision, recall, fscore, support = precision_recall_fscore_support(
    y_expected.values.ravel(),  # 1-D target, same shape as used for fitting
    y_predicted,
    average="binary",
)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {fscore}")

# Mean accuracy of the classifier on the test rows.
score = model.score(x_test, y_expected)
print(f"Model score: {score}")