First logistic regression model testing
parent: eea6f1b259
commit: 8a94bb7f1f

twitter.py (57 changed lines)
@@ -2,6 +2,11 @@
 import pandas as pd
 import os
 import re
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import confusion_matrix,classification_report
+from copy import deepcopy
 # %% [markdown]
 ### Reading data - this part need changing when data
 # %%
@@ -14,7 +19,7 @@ data_all = pd.read_csv(filepath, header=0,
 )
 # data.columns = ['index', 'id','date', 'query', 'user', 'text']
 # %%
-data = data_all.loc[:,['Tweet', 'Sentiment']]
+# data = data_all.loc[:,['Tweet', 'Sentiment']]
 # %% [markdown]
 ### Function definitions
 # %%
@@ -23,14 +28,17 @@ change_dict = {
     " username ": ['@\w+|@'],
     " url ": ['http\S*'],
     " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
-    " number ": ["[\+\-\$]?[\d]+[,\.]?[\d]+[%]?"],
+    " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
     # standardization
     ', ': ['\s,'],
     '. ': ['\s\.'],
-    ' ': ['\s{2,}', '\n'],
+    ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
     "'": ["<EFBFBD>"],
     '?': ["\s\?"],
     '!': ["\s\!"],
+    '".': ["\s\"\."],
+    '",': ["\s\"\,"],
+    '" ': ["\s\"\s"]
 }
 
 def clean_lines(line, change_dict):
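To see what the tightened " number " pattern and the extra whitespace rules do, here is a minimal sketch. The excerpted rules come from change_dict above, but the sample tweet and the assumption that clean_lines lower-cases its input before matching '^rt[\s]+' are illustrative, not part of the commit:

import re

# Excerpt of the replacement rules from change_dict above (not the full dict).
rules = {
    " username ": [r'@\w+|@'],
    " url ": [r'http\S*'],
    " number ": [r"[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
    ' ': [r'\s{2,}', r'\n', r'^rt[\s]+', r'\s\:\s'],
}

# Hypothetical sample tweet; lower-cased first, which the '^rt[\s]+' rule suggests.
sample = "RT @someone : stocks up +15k today, see http://t.co/abc"
line = sample.lower()
for replacement, patterns in rules.items():
    for pattern in patterns:
        line = re.sub(pattern, replacement, line)
print(line)  # the handle, the URL and '+15k' collapse to placeholder tokens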
@@ -60,6 +68,45 @@ def truncate_duplicated_letters_to_two(line):
 # %% [markdown]
 ### Cleaning
 # %%
-text = [clean_lines(x, change_dict) for x in data.loc[:, 'Tweet'].values.tolist()]
+text = [clean_lines(x, change_dict) for x in data_all.loc[:, 'Tweet'].values.tolist()]
 text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
-data.Tweet = text
+data_all_clean = deepcopy(data_all)
+data_all_clean.Tweet = text
+data_all_clean = data_all_clean.dropna(subset = ["sent_score"], inplace=False)
+# %% [markdown]
+### Testing models
+# %%
+data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']]
+idx = data_model.index
+data_model['random_number'] = np.random.randn(len(idx))
+train_set = data_model[data_model['random_number'] <= 0.8]
+test_set = data_model[data_model['random_number'] > 0.8]
+# %%
+vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
+train_matrix = vectorizer.fit_transform(train_set['Tweet'])
+test_matrix = vectorizer.transform(test_set['Tweet'])
+# %%
+lr = LogisticRegression()
+X_train = train_matrix
+X_test = test_matrix
+y_train = train_set['sent_score']
+y_test = test_set['sent_score']
+lr.fit(X_train,y_train)
+# %%
+predictions = lr.predict(X_test)
+# %%
+y_test_arr = np.asarray(y_test)
+confusion_matrix(predictions,y_test_arr)
+# %%
+print(classification_report(predictions,y_test))
+# %% [markdown]
+#               precision    recall  f1-score   support
+
+#         -1.0       0.91      0.96      0.94      1188
+#          0.0       0.99      0.97      0.98      4733
+#          1.0       0.97      0.98      0.98      4799
+
+#     accuracy                           0.97     10720
+#    macro avg       0.96      0.97      0.96     10720
+# weighted avg       0.97      0.97      0.97     10720
+# %%
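The model cells added above can also be run as a standalone sketch. This is illustrative, not the committed code: it replaces the random_number column (np.random.randn draws from a standard normal, so the <= 0.8 cutoff keeps roughly 79% of rows rather than a fixed 80%) with sklearn's train_test_split, and it passes the true labels first, since confusion_matrix and classification_report expect (y_true, y_pred). The data_all_clean frame and the Tweet / sent_score columns are assumed from the diff above.

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Sketch only: assumes data_all_clean from the cells above
# (columns 'Tweet' and 'sent_score', NaN scores already dropped).
X_text = data_all_clean['Tweet']
y = data_all_clean['sent_score']

# Reproducible 80/20 split instead of the random_number column.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

lr = LogisticRegression(max_iter=1000)  # higher max_iter avoids convergence warnings
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# sklearn's metrics take (y_true, y_pred) in that order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))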