Patch summary (okapi.py):
  * Move `import pandas as pd` below `import numpy as np`.
  * Print the number of processed documents before vectorizing.
  * Fall back to `cv.get_feature_names()` when the vectorizer has no
    `get_feature_names_out()` method. Only AttributeError is caught, so
    interrupts and genuine vectorizer errors still propagate.
  * Build the term-frequency DataFrame with dtype='int8'.
NOTE(review): the hunk headers declare 5/5 and 14/18 lines, which is more
than the rows visible below. The difference is presumably blank context
lines lost when this patch was flattened; regenerate the patch from the
working tree before applying it. The `index` line is kept from the original
patch and does not reflect the narrowed `except` clause.

diff --git a/okapi.py b/okapi.py
index 1fa16b0..3bf491a 100644
--- a/okapi.py
+++ b/okapi.py
@@ -1,5 +1,5 @@
-import pandas as pd
 import numpy as np
+import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from statistics import mean
@@ -33,14 +33,18 @@ retweets = processed_documents['retweets']
 dates = processed_documents['date']
 # Vectorization
+print(f"{len(processed_documents)} documents ready!")
 print("Vectorizing...")
 cv = CountVectorizer()
 transformer = TfidfTransformer()
 word_count_vector = cv.fit_transform(tweets)
-words = cv.get_feature_names_out()
+try:
+    words = cv.get_feature_names_out()
+except AttributeError:
+    words = cv.get_feature_names()
-tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
+tf = pd.DataFrame(word_count_vector.toarray(), columns=words, dtype='int8')
 transformer.fit_transform(word_count_vector)
 tfidf_dict = {}