"""Train and evaluate a logistic-regression survival model on the Titanic data.

Reads ``titanic.tsv``, cleans and numerically encodes the columns, fits a
``LogisticRegression`` on a 70/30 train/test split, and prints micro-averaged
precision/recall/F-score plus the model's mean accuracy on the test set.
"""

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Model inputs, in the order they are fed to the estimator.
FEATURES = [
    "PassengerId",  # NOTE(review): a row ID carries no signal — consider dropping
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

alldata = pd.read_csv(
    "titanic.tsv",
    header=0,
    sep="\t",
    usecols=[
        "Survived", "PassengerId", "Pclass", "Name", "Sex", "Age",
        "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
    ],
)

# Encode sex numerically: 0 for female, 1 for anything else (male).
alldata["Sex"] = alldata["Sex"].apply(lambda sex: 0 if sex == "female" else 1)

# Drop columns dominated by NaNs.
alldata = alldata.drop(["Ticket", "Cabin"], axis=1)

# Drop any remaining rows with NaNs in the kept columns.
alldata = alldata.dropna()

# Encode port of embarkation: C(herbourg)=1, Q(ueenstown)=2, anything else
# (S = Southampton in this dataset) = 3 — mirrors the original else-branch.
alldata["Embarked"] = alldata["Embarked"].apply(
    lambda port: {"C": 1, "Q": 2}.get(port, 3)
)

# "Name" is free text and provides no usable signal for this model.
alldata = alldata.drop(["Name"], axis=1)
print(alldata)

# Keep only the model inputs plus the target column.
alldata = alldata[FEATURES + ["Survived"]]

# Split into training and test sets; fixed seed makes runs reproducible.
data_train, data_test = train_test_split(alldata, test_size=0.3, random_state=42)

# Fit the model. The target Series and feature frame are passed directly —
# no need to re-wrap them in DataFrames and ravel.
y_train = data_train["Survived"]
x_train = data_train[FEATURES]
model = LogisticRegression(solver="lbfgs", max_iter=10000)
model.fit(x_train, y_train)

# Predict on the held-out test data.
y_expected = data_test["Survived"]
x_test = data_test[FEATURES]
y_predicted = model.predict(x_test)
print(y_predicted[:10])

precision, recall, fscore, _support = precision_recall_fscore_support(
    y_expected, y_predicted, average="micro"
)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {fscore}")

# Mean accuracy of the classifier on the test split.
score = model.score(x_test, y_expected)
print(f"Model score: {score}")