Testing text vectorizers: Count, Hashing and Tfidf Vectorizers

This commit is contained in:
Krzysztof Szubiczuk 2022-01-18 13:00:59 +01:00
parent 8a94bb7f1f
commit 4b4ede27c5

View File

@ -1,9 +1,12 @@
# %%
# from platform import java_ver
import pandas as pd
import os
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text \
import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from copy import deepcopy
@ -17,9 +20,6 @@ data_all = pd.read_csv(filepath, header=0,
delimiter=',',
# encoding_errors='surrogateescape'
)
# data.columns = ['index', 'id','date', 'query', 'user', 'text']
# %%
# data = data_all.loc[:,['Tweet', 'Sentiment']]
# %% [markdown]
### Function definitions
# %%
@ -82,31 +82,41 @@ data_model['random_number'] = np.random.randn(len(idx))
# Rows whose 'random_number' is at most 0.8 form the training set; the rest form the test set.
# NOTE(review): 'random_number' is filled with np.random.randn (standard normal), not a
# uniform [0, 1) draw, so this cut-off puts roughly 79% of rows in training — confirm that
# np.random.rand was not intended.
train_set = data_model.loc[data_model['random_number'].le(0.8)]
test_set = data_model.loc[data_model['random_number'].gt(0.8)]
# %%
# Token-count features: the vectorizer is fitted on the training tweets only and the
# fitted vectorizer is then applied to the test tweets.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix, test_matrix = (
    vectorizer.fit_transform(train_set['Tweet']),
    vectorizer.transform(test_set['Tweet']),
)
def train_model_and_predict(train_set, test_set,
                            vectorizer, vectorizer_name,
                            model,
                            colname_text='Tweet',
                            colname_sent='sent_score'):
    """Vectorize the text, fit ``model`` and print a classification report.

    The vectorizer is fitted on the training text only and then applied to
    the test text; the model is fitted on the training features and
    evaluated on the test features.

    Parameters
    ----------
    train_set, test_set
        Tables indexable by column name (``table[colname]``) that hold the
        text and label columns.
    vectorizer
        Object exposing ``fit_transform`` and ``transform``.
    vectorizer_name
        Label printed above the report to identify the run.
    model
        Estimator exposing ``fit`` and ``predict``.
    colname_text
        Name of the column containing the raw text.
    colname_sent
        Name of the column containing the target labels.

    Returns
    -------
    The predictions produced by ``model.predict`` for the test set.
    """
    X_train = vectorizer.fit_transform(train_set[colname_text])
    X_test = vectorizer.transform(test_set[colname_text])
    y_train = train_set[colname_sent]
    y_test = np.asarray(test_set[colname_sent])

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"{vectorizer_name}")
    print("Classification report")
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall for every class and reports the
    # support of the predictions instead of the true labels.
    print(classification_report(y_test, predictions))
    return predictions
# %%
# Baseline: logistic regression with default settings, fitted on the training features.
X_train, X_test = train_matrix, test_matrix
y_train, y_test = train_set['sent_score'], test_set['sent_score']
lr = LogisticRegression()
lr.fit(X_train, y_train)
# (label, vectorizer) pairs to compare; the label is passed on as the run's display name.
# NOTE(review): the Tfidf entries keep the library's default token_pattern while the
# Count/Hashing entries use r'\b\w+\b' — confirm the difference is intentional.
vectorizers = [
    (
        "CountVectorizer",
        CountVectorizer(token_pattern=r'\b\w+\b'),
    ),
    (
        "HashingVectorizer, n_features=2**15",
        HashingVectorizer(
            analyzer='word',
            token_pattern=r'\b\w+\b',
            n_features=2**15,
        ),
    ),
    # Larger hash space, currently disabled:
    # ("HashingVectorizer, n_features=2**20", HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
    (
        "TfidfVectorizer",
        TfidfVectorizer(),
    ),
    (
        "TfidfVectorizer, smooth_idf=False",
        TfidfVectorizer(smooth_idf=False),
    ),
    (
        "TfidfVectorizer, sublinear_tf=True",
        TfidfVectorizer(sublinear_tf=True),
    ),
    (
        "TfidfVectorizer, norm=None",
        TfidfVectorizer(norm=None),
    ),
]
# %%
predictions = lr.predict(X_test)
# %%
y_test_arr = np.asarray(y_test)
# scikit-learn metrics take (y_true, y_pred): with the true labels first, the rows of the
# matrix are the true classes and the columns are the predicted classes.
confusion_matrix(y_test_arr, predictions)
# %%
# True labels go first here as well; with the arguments reversed, precision and recall are
# swapped for every class and the support column counts predictions instead of true labels.
# NOTE(review): any report recorded from the earlier reversed call presumably has its
# precision/recall columns interchanged — re-run before quoting those numbers.
print(classification_report(y_test, predictions))
# %% [markdown]
# precision recall f1-score support
# -1.0 0.91 0.96 0.94 1188
# 0.0 0.99 0.97 0.98 4733
# 1.0 0.97 0.98 0.98 4799
# accuracy 0.97 10720
# macro avg 0.96 0.97 0.96 10720
# weighted avg 0.97 0.97 0.97 10720
# Evaluate every configured vectorizer, each with a freshly constructed logistic regression.
for label, candidate in vectorizers:
    train_model_and_predict(
        train_set, test_set,
        vectorizer=candidate,
        vectorizer_name=label,
        model=LogisticRegression(max_iter=1000),
        colname_text='Tweet',
        colname_sent='sent_score',
    )
# %%