test
This commit is contained in:
parent
2ff0538dc3
commit
7947b532cb
10
okapi.py
10
okapi.py
@ -1,5 +1,5 @@
|
|||||||
import pandas as pd
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||||||
from statistics import mean
|
from statistics import mean
|
||||||
|
|
||||||
@ -33,14 +33,18 @@ retweets = processed_documents['retweets']
|
|||||||
dates = processed_documents['date']
|
dates = processed_documents['date']
|
||||||
|
|
||||||
# Vectorization
|
# Vectorization
|
||||||
|
print(f"{len(processed_documents)} documents ready!")
|
||||||
print("Vectorizing...")
|
print("Vectorizing...")
|
||||||
cv = CountVectorizer()
|
cv = CountVectorizer()
|
||||||
transformer = TfidfTransformer()
|
transformer = TfidfTransformer()
|
||||||
|
|
||||||
word_count_vector = cv.fit_transform(tweets)
|
word_count_vector = cv.fit_transform(tweets)
|
||||||
words = cv.get_feature_names_out()
|
try:
|
||||||
|
words = cv.get_feature_names_out()
|
||||||
|
except:
|
||||||
|
words = cv.get_feature_names()
|
||||||
|
|
||||||
tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
|
tf = pd.DataFrame(word_count_vector.toarray(), columns=words, dtype='int8')
|
||||||
transformer.fit_transform(word_count_vector)
|
transformer.fit_transform(word_count_vector)
|
||||||
|
|
||||||
tfidf_dict = {}
|
tfidf_dict = {}
|
||||||
|
Loading…
Reference in New Issue
Block a user