
Importing the required libraries

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch

Data preprocessing

def preprocess(data):
    # rename columns to shorter, snake_case names
    data.rename(columns={'CODE_GENDER': 'if_man', 'FLAG_OWN_CAR': 'if_own_car', 'FLAG_OWN_REALTY': 'if_own_realty',
                         'CNT_CHILDREN': 'cnt_children', 'AMT_INCOME_TOTAL': 'amt_income',
                         'NAME_EDUCATION_TYPE': 'name_edu_type', 'CNT_FAM_MEMBERS': 'cnt_fam_members',
                         'NAME_INCOME_TYPE': 'name_income_type', 'NAME_FAMILY_STATUS': 'name_fam_status'},
                inplace=True)

    # encode binary categorical columns as 0/1
    data['if_man'] = data['if_man'].apply(lambda x: 1 if x == 'M' else 0)
    data['if_own_car'] = data['if_own_car'].apply(lambda x: 1 if x == 'Y' else 0)
    data['if_own_realty'] = data['if_own_realty'].apply(lambda x: 1 if x == 'Y' else 0)

    # coerce counts to numeric, and treat implausibly large values as missing
    data['cnt_children'] = pd.to_numeric(data['cnt_children'], errors='coerce')
    data['cnt_fam_members'] = pd.to_numeric(data['cnt_fam_members'], errors='coerce')
    data['cnt_children'] = data['cnt_children'].mask(data['cnt_children'] > 5)
    data['cnt_fam_members'] = data['cnt_fam_members'].mask(data['cnt_fam_members'] > 8)

    # one-hot encode the remaining categorical columns
    data = pd.get_dummies(data, columns=['name_income_type', 'name_fam_status'])

    # drop rows with missing values
    print("Length of dataset before dropna: " + str(len(data)))
    data = data.dropna()
    print("Length of dataset after dropna: " + str(len(data)))

    return data
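A toy illustration of what preprocess produces, on a hypothetical two-row frame (all values here are invented for the example):

toy = pd.DataFrame({
    'CODE_GENDER': ['M', 'F'],
    'FLAG_OWN_CAR': ['Y', 'N'],
    'FLAG_OWN_REALTY': ['N', 'Y'],
    'CNT_CHILDREN': [1, 0],
    'AMT_INCOME_TOTAL': [200000.0, 150000.0],
    'NAME_EDUCATION_TYPE': ['Higher education', 'Lower secondary'],
    'CNT_FAM_MEMBERS': [3, 2],
    'NAME_INCOME_TYPE': ['Working', 'Pensioner'],
    'NAME_FAMILY_STATUS': ['Married', 'Widow'],
})
# binary columns become 0/1; income type and family status become dummy columns
print(preprocess(toy).columns.tolist())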

Splitting into training and test sets

def split(data):
    # positional 80/20 split; rows are not shuffled first
    split_point = int(0.8 * len(data))
    data_train = data[:split_point]
    data_test = data[split_point:]
    print("Length of whole dataset: " + str(len(data)))
    print("Length of train dataset: " + str(len(data_train)))
    print("Length of test dataset: " + str(len(data_test)))
    return data_train, data_test
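Because the split above is purely positional, any ordering in the CSV leaks into the train/test composition. A minimal sketch of a shuffled variant (the random_state value is an arbitrary choice, not from the original notebook):

def split_shuffled(data, train_frac=0.8, seed=42):
    # shuffle rows with a fixed seed so the split is reproducible
    shuffled = data.sample(frac=1, random_state=seed).reset_index(drop=True)
    split_point = int(train_frac * len(shuffled))
    return shuffled[:split_point], shuffled[split_point:]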

Evaluation

def evaluation(y_expected, y_predicted):
    # zero_division=0 silences the UndefinedMetricWarning for classes that are
    # never predicted; the reported values are unchanged (they were already 0.0)
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_expected, y_predicted, average="weighted", zero_division=0)
    accuracy = accuracy_score(y_expected, y_predicted)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-score: {fscore}")

Loading the data from file

alldata = pd.read_csv('application_record.csv', header=0, sep=',',
                      usecols=['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
                               'AMT_INCOME_TOTAL', 'NAME_EDUCATION_TYPE', 'CNT_FAM_MEMBERS',
                               'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS'])
# print(alldata[:5])

Selecting features for training

FEATURES = [
    'if_man', 
    'if_own_car', 
    'if_own_realty', 
    'cnt_children', 
    'amt_income',  
    'cnt_fam_members', 
    'name_income_type_Commercial associate',                                  
    'name_income_type_Pensioner', 
    'name_income_type_State servant', 
    'name_income_type_Student', 
    'name_income_type_Working',
    'name_fam_status_Civil marriage',
    'name_fam_status_Married',
    'name_fam_status_Separated',
    'name_fam_status_Single / not married',
    'name_fam_status_Widow'
]
# this cell runs before preprocess(), so the original CSV column names apply
print(pd.unique(alldata['CODE_GENDER']))
print(pd.unique(alldata['FLAG_OWN_CAR']))
print(pd.unique(alldata['FLAG_OWN_REALTY']))
print(pd.unique(alldata['CNT_CHILDREN']))
# print(pd.unique(alldata['NAME_INCOME_TYPE']))
print(pd.unique(alldata['NAME_EDUCATION_TYPE']))
# print(pd.unique(alldata['NAME_FAMILY_STATUS']))
print(pd.unique(alldata['CNT_FAM_MEMBERS']))

Data preparation

alldata = preprocess(alldata)
data_train, data_test = split(alldata)
Length of dataset before dropna: 438557
Length of dataset after dropna: 438531
Length of whole dataset: 438531
Length of train dataset: 350824
Length of test dataset: 87707
# use Series (1-D) targets so sklearn does not warn about a column-vector y
y_train = data_train['name_edu_type']
x_train = data_train[FEATURES]
# fit the scaler on the training data only, then apply it to both splits
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(data_test[FEATURES])
y_expected = data_test['name_edu_type']
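The same scale-then-fit pattern can be expressed as a sklearn Pipeline, which guarantees the scaler is fit only on whatever data the pipeline itself is fit on. A sketch, not part of the original notebook:

from sklearn.pipeline import make_pipeline

# bundles scaling and classification into a single estimator
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipe.fit(data_train[FEATURES], data_train['name_edu_type'])
y_predicted_pipe = pipe.predict(data_test[FEATURES])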

Logistic regression

model_logreg = LogisticRegression(max_iter=1000) 
model_logreg.fit(x_train, y_train)

y_predicted_logreg = model_logreg.predict(x_test) 

evaluation(y_expected, y_predicted_logreg)
Accuracy: 0.7031023749529685
Precision: 0.6408866393822401
Recall: 0.7031023749529685
F-score: 0.6268976358430636

SGD

model_sgd = SGDClassifier() 
model_sgd.fit(x_train, y_train)

y_predicted_sgd = model_sgd.predict(x_test) 

evaluation(y_expected, y_predicted_sgd)
Accuracy: 0.6921340371920143
Precision: 0.7333126153525842
Recall: 0.6921340371920143
F-score: 0.5666044690008586

Gaussian Naive Bayes

# model_gnb = GaussianNB()
# model_gnb.fit(x_train, y_train)

# y_predicted_gnb = model_gnb.predict(x_test)

# evaluation(y_expected, y_predicted_gnb)
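The cell above is left commented out in the notebook (the original version also reused the y_predicted_sgd name, which would have clobbered the SGD results). A runnable sketch of the same experiment:

model_gnb = GaussianNB()
model_gnb.fit(x_train, y_train)

# use a distinct name so the SGD predictions are not overwritten
y_predicted_gnb = model_gnb.predict(x_test)

evaluation(y_expected, y_predicted_gnb)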

PyTorch

Data preparation

# select columns via FEATURES so the order matches what the scaler was fit on
X_numpy = alldata[FEATURES].values
X_numpy = scaler.transform(X_numpy)
# map each education level to an integer class index
target_map = {
    val: index for index, val in enumerate(alldata.name_edu_type.unique())
}
y_numpy = alldata.name_edu_type.map(target_map).values
X = torch.tensor(X_numpy, dtype=torch.float32)
y = torch.tensor(y_numpy)

target_map
{'Higher education': 0,
 'Secondary / secondary special': 1,
 'Incomplete higher': 2,
 'Lower secondary': 3,
 'Academic degree': 4}
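To turn predicted class indices back into education-level labels later, the mapping can simply be inverted; a small sketch:

# invert target_map so class indices decode back to label strings
inverse_target_map = {index: val for val, index in target_map.items()}
# e.g. inverse_target_map[0] == 'Higher education'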

One-hot vectors

def one_hot_encode(vector):
    # build an (n_samples, n_classes) matrix of zeros, then set a 1 in each
    # row at the column given by that sample's class index
    n_classes = len(vector.unique())
    one_hot = torch.zeros((vector.shape[0], n_classes)).type(torch.LongTensor)
    return one_hot.scatter(1, vector.type(torch.LongTensor).unsqueeze(1), 1)
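PyTorch ships an equivalent built-in; assuming the class indices form the contiguous range 0..n_classes-1 (as they do here), it should match the hand-rolled version:

import torch.nn.functional as F

# built-in one-hot encoding; num_classes is inferred from the largest index
y_builtin = F.one_hot(y)
assert torch.equal(y_builtin, one_hot_encode(y))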

y_one_hot = one_hot_encode(y)
# shuffle indices once, then take the first 80% for training
random_indices = torch.randperm(X.shape[0])
print(X.shape[0])
n_train = int(0.8 * X.shape[0])
X_train = X[random_indices[:n_train]]
y_train = y[random_indices[:n_train]]  # note: shadows the pandas y_train above
y_train_one_hot = y_one_hot[random_indices[:n_train]]

X_test = X[random_indices[n_train:]]
y_test = y[random_indices[n_train:]]
y_test_one_hot = y_one_hot[random_indices[n_train:]]
438531

Model

# a single linear layer mapping 16 features to 5 classes is exactly
# multinomial logistic regression
model_pytorch = torch.nn.Sequential(
    torch.nn.Linear(len(FEATURES), len(target_map))  # 16 -> 5
)

Optimizer

learning_rate = 0.1
lambda_param = 0.01  # weight_decay acts as L2 regularization strength
optimizer = torch.optim.SGD(
    model_pytorch.parameters(),
    lr=learning_rate,
    weight_decay=lambda_param
)

Loss function

loss_function = torch.nn.CrossEntropyLoss()
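CrossEntropyLoss applies log-softmax to the logits and then negative log-likelihood in one step, which is why the model outputs raw logits and the targets are class indices rather than one-hot vectors. A quick sanity-check sketch of that equivalence (the random inputs are illustrative only):

logits = torch.randn(4, 5)            # 4 samples, 5 classes
targets = torch.tensor([0, 2, 1, 4])  # class indices, not one-hot
ce = torch.nn.CrossEntropyLoss()(logits, targets)
manual = torch.nn.NLLLoss()(torch.log_softmax(logits, dim=1), targets)
assert torch.allclose(ce, manual)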

Training

n_iterations = 1000
for i in range(1, n_iterations + 1):
    Z = model_pytorch(X_train)        # forward pass: raw logits
    loss = loss_function(Z, y_train)  # cross-entropy against class indices
    optimizer.zero_grad()             # clear gradients from the previous step
    loss.backward()                   # backpropagate
    optimizer.step()                  # SGD update (with weight decay)

    if i == 1 or i % 100 == 0:
        print("Loss at iteration {}: {}".format(i, loss))

# softmax is monotonic, so argmax over probabilities equals argmax over logits
test_predictions = torch.argmax(
    torch.softmax(model_pytorch(X_test), 1), axis=1
)
test_accuracy = float(sum(test_predictions == y_test)) / y_test.shape[0]
print("\nFinal Test Accuracy: {}".format(test_accuracy))
Loss at iteration 1: 1.6335363388061523
Loss at iteration 100: 0.8125142455101013
Loss at iteration 200: 0.7701064944267273
Loss at iteration 300: 0.75752854347229
Loss at iteration 400: 0.7520564198493958
Loss at iteration 500: 0.7492005228996277
Loss at iteration 600: 0.7475774884223938
Loss at iteration 700: 0.7465949058532715
Loss at iteration 800: 0.7459684014320374
Loss at iteration 900: 0.7455567717552185
Loss at iteration 1000: 0.7452787160873413

Final Test Accuracy: 0.7014605447683765
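The loop above does full-batch gradient descent over all ~350k training rows at every step. A mini-batch variant using a DataLoader, as a sketch (the batch size and epoch count are arbitrary choices, not from the original notebook):

from torch.utils.data import TensorDataset, DataLoader

loader = DataLoader(TensorDataset(X_train, y_train), batch_size=1024, shuffle=True)
for epoch in range(5):
    for X_batch, y_batch in loader:
        # one optimizer step per mini-batch instead of per full pass
        loss = loss_function(model_pytorch(X_batch), y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()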

Evaluation

evaluation(y_test, test_predictions)
Accuracy: 0.7014605447683765
Precision: 0.6413920853293256
Recall: 0.7014605447683765
F-score: 0.6264027678306182