Testing text vectorizers: Count, Hashing and Tfidf Vectorizers
parent 8a94bb7f1f
commit 4b4ede27c5

twitter.py · 64 lines changed
@@ -1,9 +1,12 @@
 # %%
 # from platform import java_ver
 import pandas as pd
 import os
 import re
 import numpy as np
-from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text \
+    import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix, classification_report
 from copy import deepcopy
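Editorial aside, not part of the commit: `Pipeline` is imported above but never used in the hunks shown. A minimal sketch of how a vectorizer/classifier pair could be chained with it, assuming the `train_set`/`test_set` frames defined further down in this file:

```python
# Editorial sketch, not part of the commit: chaining a vectorizer and a
# classifier in one sklearn Pipeline so fit_transform/transform
# bookkeeping happens automatically. Assumes train_set/test_set exist.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("vec", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000)),
])
pipe.fit(train_set["Tweet"], train_set["sent_score"])
print(pipe.score(test_set["Tweet"], test_set["sent_score"]))
```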
@@ -17,9 +20,6 @@ data_all = pd.read_csv(filepath, header=0,
                        delimiter=',',
                        # encoding_errors='surrogateescape'
                        )
 # data.columns = ['index', 'id','date', 'query', 'user', 'text']
 # %%
 # data = data_all.loc[:,['Tweet', 'Sentiment']]
 # %% [markdown]
 ### Function definitions
 # %%
@@ -82,31 +82,41 @@ data_model['random_number'] = np.random.randn(len(idx))
 train_set = data_model[data_model['random_number'] <= 0.8]
 test_set = data_model[data_model['random_number'] > 0.8]
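Editorial note, not a change in the commit: `np.random.randn` draws from a standard normal, so the `<= 0.8` cutoff keeps about 78.8% of rows (the normal CDF at 0.8), not 80%, and the split reshuffles on every run. For an exact, reproducible 80/20 split, sklearn's `train_test_split` is the usual tool:

```python
# Editorial sketch, not in the commit: an exact, seeded 80/20 split
# instead of thresholding a standard-normal column at 0.8 (which keeps
# ~78.8% of rows and changes on each run).
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data_model, test_size=0.2,
                                       random_state=42)
```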
 # %%
 vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
 train_matrix = vectorizer.fit_transform(train_set['Tweet'])
 test_matrix = vectorizer.transform(test_set['Tweet'])
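Why `token_pattern` is overridden (editorial note): sklearn's default pattern `r"(?u)\b\w\w+\b"` only matches tokens of two or more characters, so `r'\b\w+\b'` keeps one-letter tokens, which are common in tweets. A quick sketch of the difference:

```python
# Illustration, not in the commit: the default token_pattern drops
# single-character tokens; the override r'\b\w+\b' keeps them.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["i m happy", "u r great"]
print(CountVectorizer().fit(docs).get_feature_names_out())
# ['great' 'happy']
print(CountVectorizer(token_pattern=r'\b\w+\b').fit(docs).get_feature_names_out())
# ['great' 'happy' 'i' 'm' 'r' 'u']
```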
 # %%
-lr = LogisticRegression()
+def train_model_and_predict(train_set, test_set,
+                            vectorizer, vectorizer_name,
+                            model,
+                            colname_text='Tweet',
+                            colname_sent='sent_score'):
+    train_matrix = vectorizer.fit_transform(train_set[colname_text])
+    test_matrix = vectorizer.transform(test_set[colname_text])
     X_train = train_matrix
     X_test = test_matrix
-y_train = train_set['sent_score']
-y_test = test_set['sent_score']
-lr.fit(X_train,y_train)
-# %%
-predictions = lr.predict(X_test)
-# %%
+    y_train = train_set[colname_sent]
+    y_test = test_set[colname_sent]
+    model.fit(X_train, y_train)
+    predictions = model.predict(X_test)
     y_test_arr = np.asarray(y_test)
     confusion_matrix(predictions, y_test_arr)
+    print(f"{vectorizer_name}")
     # print("Confusion matrix")
     # print(confusion_matrix(predictions, y_test_arr))
     print("Classification report")
     print(classification_report(predictions, y_test_arr))
-# %%
-print(classification_report(predictions, y_test))
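One caveat worth flagging (editorial, not a change in the commit): sklearn's signature is `classification_report(y_true, y_pred)`. The calls above pass `predictions` first, which transposes the precision and recall columns for every class:

```python
# Editorial sketch: sklearn expects the ground truth as the first
# argument; swapping the arguments swaps precision and recall.
from sklearn.metrics import classification_report

y_true = [1, 1, 0, 0]
y_pred = [1, 0, 0, 0]
print(classification_report(y_true, y_pred))  # conventional order
print(classification_report(y_pred, y_true))  # precision/recall transposed
```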
 # %% [markdown]
 #               precision    recall  f1-score   support
 #
 #         -1.0       0.91      0.96      0.94      1188
 #          0.0       0.99      0.97      0.98      4733
 #          1.0       0.97      0.98      0.98      4799
 #
 #     accuracy                           0.97     10720
 #    macro avg       0.96      0.97      0.96     10720
 # weighted avg       0.97      0.97      0.97     10720
+vectorizers = [
+    ("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
+    ("HashingVectorizer, n_features=2**15", HashingVectorizer(n_features=2**15, analyzer='word', token_pattern=r'\b\w+\b')),
+    # ("HashingVectorizer, n_features=2**20", HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
+    ("TfidfVectorizer", TfidfVectorizer()),
+    ("TfidfVectorizer, smooth_idf=False", TfidfVectorizer(smooth_idf=False)),
+    ("TfidfVectorizer, sublinear_tf=True", TfidfVectorizer(sublinear_tf=True)),
+    ("TfidfVectorizer, norm=None", TfidfVectorizer(norm=None)),
+]
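What the configurations in this list actually vary (editorial summary with a small illustration, not part of the commit): `smooth_idf=False` uses `idf = ln(n/df) + 1` instead of the smoothed `ln((1+n)/(1+df)) + 1`; `sublinear_tf=True` replaces raw counts `tf` with `1 + ln(tf)`; `norm=None` skips the default L2 row normalization; and `HashingVectorizer` stores no vocabulary at all, hashing tokens into `n_features` buckets, so memory is bounded but collisions can merge distinct terms.

```python
# Illustration, not in the commit, of the vectorizer knobs under test.
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

docs = ["good good movie", "bad movie"]
for name, vec in [("tfidf default", TfidfVectorizer()),
                  ("tfidf norm=None", TfidfVectorizer(norm=None)),
                  ("hashing 8 buckets", HashingVectorizer(n_features=8))]:
    print(name, vec.fit_transform(docs).toarray().round(2))
```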
+# %%
+for vec in vectorizers:
+    train_model_and_predict(train_set, test_set,
+                            vectorizer=vec[1],
+                            vectorizer_name=vec[0],
+                            model=LogisticRegression(max_iter=1000),
+                            colname_text='Tweet',
+                            colname_sent='sent_score')
 # %%
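A possible follow-up (hypothetical extension, not in the commit): since the loop only prints one classification report per vectorizer, collecting a single comparable number per configuration would make the comparison easier to rank. A sketch using macro-F1:

```python
# Hypothetical extension, not in the commit: keep one comparable score
# (macro-F1 on the test set) per vectorizer instead of printed reports.
from sklearn.metrics import f1_score

scores = {}
for name, vec in vectorizers:
    X_tr = vec.fit_transform(train_set['Tweet'])
    X_te = vec.transform(test_set['Tweet'])
    clf = LogisticRegression(max_iter=1000).fit(X_tr, train_set['sent_score'])
    scores[name] = f1_score(test_set['sent_score'], clf.predict(X_te),
                            average='macro')
for name, score in sorted(scores.items(), key=lambda kv: -kv[1]):
    print(f"{score:.3f}  {name}")
```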