Testing text vectorizers: Count, Hashing and Tfidf Vectorizers
This commit is contained in:
parent
8a94bb7f1f
commit
4b4ede27c5
64
twitter.py
64
twitter.py
@ -1,9 +1,12 @@
|
|||||||
# %%
|
# %%
|
||||||
|
# from platform import java_ver
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text \
|
||||||
|
import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.metrics import confusion_matrix,classification_report
|
from sklearn.metrics import confusion_matrix,classification_report
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
@ -17,9 +20,6 @@ data_all = pd.read_csv(filepath, header=0,
|
|||||||
delimiter=',',
|
delimiter=',',
|
||||||
# encoding_errors='surrogateescape'
|
# encoding_errors='surrogateescape'
|
||||||
)
|
)
|
||||||
# data.columns = ['index', 'id','date', 'query', 'user', 'text']
|
|
||||||
# %%
|
|
||||||
# data = data_all.loc[:,['Tweet', 'Sentiment']]
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
### Function definitions
|
### Function definitions
|
||||||
# %%
|
# %%
|
||||||
@ -82,31 +82,41 @@ data_model['random_number'] = np.random.randn(len(idx))
|
|||||||
train_set = data_model[data_model['random_number'] <= 0.8]
|
train_set = data_model[data_model['random_number'] <= 0.8]
|
||||||
test_set = data_model[data_model['random_number'] > 0.8]
|
test_set = data_model[data_model['random_number'] > 0.8]
|
||||||
# %%
|
# %%
|
||||||
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
|
def train_model_and_predict(train_set, test_set,
|
||||||
train_matrix = vectorizer.fit_transform(train_set['Tweet'])
|
vectorizer, vectorizer_name,
|
||||||
test_matrix = vectorizer.transform(test_set['Tweet'])
|
model,
|
||||||
# %%
|
colname_text = 'Tweet',
|
||||||
lr = LogisticRegression()
|
colname_sent = 'sent_score'):
|
||||||
|
train_matrix = vectorizer.fit_transform(train_set[colname_text])
|
||||||
|
test_matrix = vectorizer.transform(test_set[colname_text])
|
||||||
X_train = train_matrix
|
X_train = train_matrix
|
||||||
X_test = test_matrix
|
X_test = test_matrix
|
||||||
y_train = train_set['sent_score']
|
y_train = train_set[colname_sent]
|
||||||
y_test = test_set['sent_score']
|
y_test = test_set[colname_sent]
|
||||||
lr.fit(X_train,y_train)
|
model.fit(X_train,y_train)
|
||||||
# %%
|
predictions = model.predict(X_test)
|
||||||
predictions = lr.predict(X_test)
|
|
||||||
# %%
|
|
||||||
y_test_arr = np.asarray(y_test)
|
y_test_arr = np.asarray(y_test)
|
||||||
confusion_matrix(predictions,y_test_arr)
|
print(f"{vectorizer_name}")
|
||||||
|
# print("Confusion matrix")
|
||||||
|
# print(confusion_matrix(predictions,y_test_arr))
|
||||||
|
print("Classification report")
|
||||||
|
print(classification_report(predictions,y_test_arr))
|
||||||
# %%
|
# %%
|
||||||
print(classification_report(predictions,y_test))
|
vectorizers = [
|
||||||
# %% [markdown]
|
("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
|
||||||
# precision recall f1-score support
|
("HashingVectorizer, n_features=2**15", HashingVectorizer(n_features=2**15, analyzer='word', token_pattern=r'\b\w+\b')),
|
||||||
|
# ("HashingVectorizer, n_features=2**20", HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
|
||||||
# -1.0 0.91 0.96 0.94 1188
|
("TfidfVectorizer", TfidfVectorizer()),
|
||||||
# 0.0 0.99 0.97 0.98 4733
|
("TfidfVectorizer, smooth_idf=False", TfidfVectorizer(smooth_idf=False)),
|
||||||
# 1.0 0.97 0.98 0.98 4799
|
("TfidfVectorizer, sublinear_tf=True", TfidfVectorizer(sublinear_tf=True)),
|
||||||
|
("TfidfVectorizer, norm=None", TfidfVectorizer(norm=None)),
|
||||||
# accuracy 0.97 10720
|
]
|
||||||
# macro avg 0.96 0.97 0.96 10720
|
# %%
|
||||||
# weighted avg 0.97 0.97 0.97 10720
|
for vec in vectorizers:
|
||||||
|
train_model_and_predict(train_set, test_set,
|
||||||
|
vectorizer = vec[1],
|
||||||
|
vectorizer_name = vec[0],
|
||||||
|
model = LogisticRegression(max_iter=1000),
|
||||||
|
colname_text = 'Tweet',
|
||||||
|
colname_sent = 'sent_score')
|
||||||
# %%
|
# %%
|
||||||
|
Loading…
Reference in New Issue
Block a user