2.2 KiB
2.2 KiB
Ekstrakcja informacji
3. tfidf (1) [ćwiczenia]
Jakub Pokrywka (2021)
def word_to_index(word):
    """Return a one-hot vector for *word* over the module-level ``vocabulary``.

    The vector has ``len(vocabulary)`` entries, all zero except a single 1 at
    the position of *word* in ``vocabulary``. A word that is not in
    ``vocabulary`` sets the last position instead.
    """
    # NOTE(review): presumably the last vocabulary slot is reserved for
    # out-of-vocabulary words -- confirm where ``vocabulary`` is built.
    one_hot = np.zeros(len(vocabulary))
    position = vocabulary.index(word) if word in vocabulary else -1
    one_hot[position] = 1
    return one_hot
def tf(document):
    """Return the term-frequency vector of *document*.

    Each word is mapped to a one-hot vector by ``word_to_index`` and the
    vectors are summed, so entry ``i`` counts how often vocabulary word ``i``
    was produced for the document.

    Args:
        document: iterable of words (tokens).

    Returns:
        A NumPy vector of length ``len(vocabulary)``. An empty document
        yields the all-zero vector rather than ``None``, so the result can
        always be used in vector arithmetic.
    """
    # Start from zeros (the same shape word_to_index produces) so that the
    # empty-document case needs no special handling.
    document_vector = np.zeros(len(vocabulary))
    for word in document:
        document_vector += word_to_index(word)
    return document_vector
def similarity(query, document):
    """Return the cosine similarity between two vectors.

    Computes ``sum(query * document)`` divided by the product of the
    Euclidean norms of the two vectors.

    Args:
        query: NumPy vector.
        document: NumPy vector that can be multiplied element-wise with
            *query*.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm
        (the ratio would otherwise be 0/0 and evaluate to ``nan``).
    """
    numerator = np.sum(query * document)
    denominator = np.sqrt(np.sum(query * query)) * np.sqrt(np.sum(document * document))
    # A zero-norm vector shares no terms with anything; treat it as
    # "not similar" instead of propagating nan and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return numerator / denominator