Testing text vectorizers: Count, Hashing and Tfidf Vectorizers

This commit is contained in:
Krzysztof Szubiczuk 2022-01-18 13:00:59 +01:00
parent 8a94bb7f1f
commit 4b4ede27c5

View File

@ -1,9 +1,12 @@
# %% # %%
# from platform import java_ver
import pandas as pd import pandas as pd
import os import os
import re import re
import numpy as np import numpy as np
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text \
import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report from sklearn.metrics import confusion_matrix,classification_report
from copy import deepcopy from copy import deepcopy
@ -17,9 +20,6 @@ data_all = pd.read_csv(filepath, header=0,
delimiter=',', delimiter=',',
# encoding_errors='surrogateescape' # encoding_errors='surrogateescape'
) )
# data.columns = ['index', 'id','date', 'query', 'user', 'text']
# %%
# data = data_all.loc[:,['Tweet', 'Sentiment']]
# %% [markdown] # %% [markdown]
### Function definitions ### Function definitions
# %% # %%
@ -82,31 +82,41 @@ data_model['random_number'] = np.random.randn(len(idx))
# Random 80/20 split driven by the precomputed 'random_number' column
# (standard normal draws): <= 0.8 goes to training, the rest to testing.
train_set = data_model[data_model['random_number'] <= 0.8]
test_set = data_model[data_model['random_number'] > 0.8]
# %%
def train_model_and_predict(train_set, test_set,
                            vectorizer, vectorizer_name,
                            model,
                            colname_text='Tweet',
                            colname_sent='sent_score'):
    """Vectorize the text column, fit the classifier and print its report.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames holding the text column and the sentiment-label column.
    vectorizer : object
        Sklearn-style text vectorizer exposing ``fit_transform`` /
        ``transform`` (e.g. CountVectorizer, HashingVectorizer).
    vectorizer_name : str
        Label printed above the classification report.
    model : object
        Sklearn-style classifier exposing ``fit`` / ``predict``.
    colname_text : str, default 'Tweet'
        Name of the text column in both frames.
    colname_sent : str, default 'sent_score'
        Name of the sentiment-label column in both frames.

    Returns
    -------
    tuple
        ``(model, predictions)`` — the fitted classifier and its
        predictions on ``test_set`` — so results remain usable after the
        call (previously the function returned None; callers that ignore
        the return value are unaffected).
    """
    # Vectorizer must be fit on the training text only; the test text is
    # transformed with the vocabulary/weights learned from training.
    X_train = vectorizer.fit_transform(train_set[colname_text])
    X_test = vectorizer.transform(test_set[colname_text])
    y_train = train_set[colname_sent]
    y_test = test_set[colname_sent]
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    y_test_arr = np.asarray(y_test)
    print(f"{vectorizer_name}")
    # print("Confusion matrix")
    # print(confusion_matrix(predictions, y_test_arr))
    print("Classification report")
    print(classification_report(predictions,y_test_arr))
    return model, predictions
# %%
# Vectorizer configurations to benchmark: plain bag-of-words, hashing at
# two feature-space sizes, and several TF-IDF weighting variants.
vectorizers = [
    ("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
    ("HashingVectorizer, n_features=2**15",
     HashingVectorizer(n_features=2**15, analyzer='word', token_pattern=r'\b\w+\b')),
    ("HashingVectorizer, n_features=2**20",
     HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
    ("TfidfVectorizer", TfidfVectorizer()),
    ("TfidfVectorizer, smooth_idf=False", TfidfVectorizer(smooth_idf=False)),
    ("TfidfVectorizer, sublinear_tf=True", TfidfVectorizer(sublinear_tf=True)),
    ("TfidfVectorizer, norm=None", TfidfVectorizer(norm=None)),
]
# %%
# Run every vectorizer with a fresh logistic-regression classifier so no
# fitted state leaks between configurations.
for name, vec in vectorizers:
    train_model_and_predict(train_set, test_set,
                            vectorizer=vec,
                            vectorizer_name=name,
                            model=LogisticRegression(max_iter=1000),
                            colname_text='Tweet',
                            colname_sent='sent_score')
# %%