First logistic regression model testing

This commit is contained in:
Krzysztof Szubiczuk 2022-01-11 12:36:15 +01:00
parent eea6f1b259
commit 8a94bb7f1f

View File

@ -2,6 +2,11 @@
import pandas as pd import pandas as pd
import os import os
import re import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from copy import deepcopy
# %% [markdown] # %% [markdown]
### Reading data - this part needs changing when data ### Reading data - this part needs changing when data
# %% # %%
@ -14,7 +19,7 @@ data_all = pd.read_csv(filepath, header=0,
) )
# data.columns = ['index', 'id','date', 'query', 'user', 'text'] # data.columns = ['index', 'id','date', 'query', 'user', 'text']
# %% # %%
data = data_all.loc[:,['Tweet', 'Sentiment']] # data = data_all.loc[:,['Tweet', 'Sentiment']]
# %% [markdown] # %% [markdown]
### Function definitions ### Function definitions
# %% # %%
@ -23,14 +28,17 @@ change_dict = {
" username ": ['@\w+|@'], " username ": ['@\w+|@'],
" url ": ['http\S*'], " url ": ['http\S*'],
" emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"], " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
" number ": ["[\+\-\$]?[\d]+[,\.]?[\d]+[%]?"], " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
# standardization # standardization
', ': ['\s,'], ', ': ['\s,'],
'. ': ['\s\.'], '. ': ['\s\.'],
' ': ['\s{2,}', '\n'], ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
"'": ["<EFBFBD>"], "'": ["<EFBFBD>"],
'?': ["\s\?"], '?': ["\s\?"],
'!': ["\s\!"], '!': ["\s\!"],
'".': ["\s\"\."],
'",': ["\s\"\,"],
'" ': ["\s\"\s"]
} }
def clean_lines(line, change_dict): def clean_lines(line, change_dict):
@ -60,6 +68,45 @@ def truncate_duplicated_letters_to_two(line):
# %% [markdown] # %% [markdown]
### Cleaning ### Cleaning
# %% # %%
text = [clean_lines(x, change_dict) for x in data.loc[:, 'Tweet'].values.tolist()] text = [clean_lines(x, change_dict) for x in data_all.loc[:, 'Tweet'].values.tolist()]
text = [truncate_duplicated_letters_to_two(x).strip() for x in text] text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
data.Tweet = text data_all_clean = deepcopy(data_all)
data_all_clean.Tweet = text
data_all_clean = data_all_clean.dropna(subset = ["sent_score"], inplace=False)
# %% [markdown]
### Testing models
# %%
# Keep only the cleaned tweet text and its numeric sentiment label.
data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']]
idx = data_model.index
# BUG FIX: np.random.randn samples a *standard normal*, so thresholding at
# 0.8 kept ~78.8% of rows (P(Z <= 0.8) ≈ 0.788), not the intended 80%.
# np.random.rand samples uniformly from [0, 1), giving a true 80/20 split
# in expectation.
# NOTE(review): no seed is set, so the split is not reproducible across
# runs — consider np.random.seed(...) or np.random.default_rng(seed).
data_model['random_number'] = np.random.rand(len(idx))
train_set = data_model[data_model['random_number'] <= 0.8]
test_set = data_model[data_model['random_number'] > 0.8]
# %%
# Bag-of-words features. token_pattern r'\b\w+\b' also keeps
# single-character tokens (the sklearn default requires 2+ word chars).
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Fit the vocabulary on the training split only, then reuse it for the
# test split so both matrices share the same feature columns.
train_matrix = vectorizer.fit_transform(train_set['Tweet'])
test_matrix = vectorizer.transform(test_set['Tweet'])
# %%
# Logistic regression on the sparse bag-of-words counts.
# NOTE(review): the default max_iter=100 often triggers a
# ConvergenceWarning on sparse text features — confirm convergence or
# raise max_iter.
lr = LogisticRegression()
# Aliases used by the evaluation cells below.
X_train = train_matrix
X_test = test_matrix
y_train = train_set['sent_score']
y_test = test_set['sent_score']
lr.fit(X_train,y_train)
# %%
predictions = lr.predict(X_test)
# %%
y_test_arr = np.asarray(y_test)
# BUG FIX: sklearn's metric functions take (y_true, y_pred) in that order.
# The original passed predictions first, which transposes the confusion
# matrix and swaps per-class precision/recall in the report (so the
# recorded numbers below may shift slightly after this fix).
confusion_matrix(y_test_arr, predictions)
# %%
print(classification_report(y_test, predictions))
# %% [markdown]
# precision recall f1-score support
# -1.0 0.91 0.96 0.94 1188
# 0.0 0.99 0.97 0.98 4733
# 1.0 0.97 0.98 0.98 4799
# accuracy 0.97 10720
# macro avg 0.96 0.97 0.96 10720
# weighted avg 0.97 0.97 0.97 10720
# %%