14 lines
448 B
Python
14 lines
448 B
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
|
|
|
|
|
|
class VectorizerIdf:
    """Fit a TF-IDF model on a corpus and expose per-term IDF weights.

    Attributes:
        matrix: The value returned by ``TfidfVectorizer.fit_transform`` for
            the corpus passed to the constructor.
        vectorizer: The fitted ``TfidfVectorizer`` instance.
    """

    def __init__(self, corpus):
        """Fit a ``TfidfVectorizer`` on *corpus* using spaCy's English stop words.

        Args:
            corpus: The raw documents, passed unchanged to
                ``TfidfVectorizer.fit_transform``.
        """
        # spaCy ships STOP_WORDS as a set, but scikit-learn documents
        # ``stop_words`` as 'english', a list, or None; releases that validate
        # constructor parameters reject a set. Convert to a (sorted, hence
        # deterministic) list so the same code works across versions.
        vectorizer = TfidfVectorizer(use_idf=True, stop_words=sorted(en_stop))
        self.matrix = vectorizer.fit_transform(corpus)
        self.vectorizer = vectorizer

    def get_idf_for_word(self, term: str) -> float:
        """Return the learned IDF weight for *term*.

        Args:
            term: A vocabulary token exactly as stored in the fitted
                vectorizer's ``vocabulary_`` mapping.

        Returns:
            The entry of ``idf_`` at the column index assigned to *term*.

        Raises:
            KeyError: If *term* is not a key of ``vocabulary_``.
        """
        column = self.vectorizer.vocabulary_[term]
        return self.vectorizer.idf_[column]
|