From 4b4ede27c5f593ac6439736c8b41b988ee3f6f0b Mon Sep 17 00:00:00 2001
From: Krzysztof Szubiczuk
Date: Tue, 18 Jan 2022 13:00:59 +0100
Subject: [PATCH] Testing text vectorizers: Count, Hashing and Tfidf Vectorizers

---
 twitter.py | 68 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/twitter.py b/twitter.py
index 52a64e0..d340896 100644
--- a/twitter.py
+++ b/twitter.py
@@ -1,9 +1,12 @@
 # %%
+# from platform import java_ver
 import pandas as pd
 import os
 import re
 import numpy as np
-from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text \
+    import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
+from sklearn.pipeline import Pipeline
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix,classification_report
 from copy import deepcopy
@@ -17,9 +20,6 @@ data_all = pd.read_csv(filepath, header=0, delimiter=',',
                        # encoding_errors='surrogateescape'
                        )
 
-# data.columns = ['index', 'id','date', 'query', 'user', 'text']
-# %%
-# data = data_all.loc[:,['Tweet', 'Sentiment']]
 # %% [markdown]
 ### Function definitions
 # %%
@@ -82,31 +82,41 @@ data_model['random_number'] = np.random.randn(len(idx))
 train_set = data_model[data_model['random_number'] <= 0.8]
 test_set = data_model[data_model['random_number'] > 0.8]
 # %%
-vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
-train_matrix = vectorizer.fit_transform(train_set['Tweet'])
-test_matrix = vectorizer.transform(test_set['Tweet'])
+def train_model_and_predict(train_set, test_set,
+                            vectorizer, vectorizer_name,
+                            model,
+                            colname_text = 'Tweet',
+                            colname_sent = 'sent_score'):
+    train_matrix = vectorizer.fit_transform(train_set[colname_text])
+    test_matrix = vectorizer.transform(test_set[colname_text])
+    X_train = train_matrix
+    X_test = test_matrix
+    y_train = train_set[colname_sent]
+    y_test = test_set[colname_sent]
+    model.fit(X_train,y_train)
+    predictions = model.predict(X_test)
+    y_test_arr = np.asarray(y_test)
+    print(f"{vectorizer_name}")
+    # print("Confussion matrix")
+    # print(confusion_matrix(predictions,y_test_arr))
+    print("Classification report")
+    print(classification_report(predictions,y_test_arr))
 # %%
-lr = LogisticRegression()
-X_train = train_matrix
-X_test = test_matrix
-y_train = train_set['sent_score']
-y_test = test_set['sent_score']
-lr.fit(X_train,y_train)
+vectorizers = [
+    ("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
+    ("HashingVectorizer, n_features=2**15", HashingVectorizer(n_features=2**15, analyzer='word', token_pattern=r'\b\w+\b')),
+    # ("HashingVectorizer, n_features=2**20", HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
+    ("TfidfVectorizer", TfidfVectorizer()),
+    ("TfidfVectorizer, smooth_idf=False", TfidfVectorizer(smooth_idf=False)),
+    ("TfidfVectorizer, sublinear_tf=True", TfidfVectorizer(sublinear_tf=True)),
+    ("TfidfVectorizer, norm=None", TfidfVectorizer(norm=None)),
+    ]
 # %%
-predictions = lr.predict(X_test)
-# %%
-y_test_arr = np.asarray(y_test)
-confusion_matrix(predictions,y_test_arr)
-# %%
-print(classification_report(predictions,y_test))
-# %% [markdown]
-#               precision    recall  f1-score   support
-
-#         -1.0       0.91      0.96      0.94      1188
-#          0.0       0.99      0.97      0.98      4733
-#          1.0       0.97      0.98      0.98      4799
-
-#     accuracy                           0.97     10720
-#    macro avg       0.96      0.97      0.96     10720
-# weighted avg       0.97      0.97      0.97     10720
+for vec in vectorizers:
+    train_model_and_predict(train_set, test_set,
+                            vectorizer = vec[1],
+                            vectorizer_name = vec[0],
+                            model = LogisticRegression(max_iter=1000),
+                            colname_text = 'Tweet',
+                            colname_sent = 'sent_score')
 # %%
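
The patch imports Pipeline from sklearn.pipeline but does not use it yet. A minimal sketch of how the same vectorizer comparison could be wrapped in a Pipeline is shown below; it assumes the train_set/test_set frames and column names already defined in twitter.py, and the helper name train_pipeline_and_predict is illustrative rather than part of the patch.

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report
    from sklearn.pipeline import Pipeline

    def train_pipeline_and_predict(train_set, test_set, vectorizer, vectorizer_name,
                                   colname_text='Tweet', colname_sent='sent_score'):
        # Bundle the text vectorizer and the classifier into one estimator,
        # so fit/predict work directly on the raw tweet column.
        pipe = Pipeline([
            ('vect', vectorizer),
            ('clf', LogisticRegression(max_iter=1000)),
        ])
        pipe.fit(train_set[colname_text], train_set[colname_sent])
        predictions = pipe.predict(test_set[colname_text])
        print(vectorizer_name)
        print(classification_report(test_set[colname_sent], predictions))

    # Example comparison over two of the vectorizers from the patch.
    for name, vect in [("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
                       ("TfidfVectorizer", TfidfVectorizer())]:
        train_pipeline_and_predict(train_set, test_set, vect, name)

One advantage of this variant is that a fitted pipe can later be applied to new text with a single predict call, without keeping the fitted vectorizer and model in sync by hand.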