aitech-eks-pub-22/cw/03a_tfidf_ODPOWIEDZI.ipynb

2.2 KiB

Logo 1

Ekstrakcja informacji

3. tfidf (1) [ćwiczenia]

Jakub Pokrywka (2021)

Logo 2

def word_to_index(word):
    """Return a one-hot NumPy vector for *word* over the global ``vocabulary``.

    The vector has ``len(vocabulary)`` entries, all zero except a single 1
    at the position of *word* in ``vocabulary``. A word that does not occur
    in ``vocabulary`` sets the last entry instead.

    NOTE(review): presumably the final vocabulary entry is a dedicated
    "unknown word" token -- confirm against where ``vocabulary`` is built.
    """
    one_hot = np.zeros(len(vocabulary))
    try:
        position = vocabulary.index(word)
    except ValueError:
        # Word not found: fall back to the last slot of the vector.
        position = -1
    one_hot[position] = 1
    return one_hot
def tf(document):
    """Return the term-frequency vector of *document*.

    Sums the one-hot vectors returned by ``word_to_index`` for every word,
    so each entry holds the number of occurrences of the corresponding
    vocabulary slot.

    Args:
        document: iterable of words (tokens).

    Returns:
        NumPy array of length ``len(vocabulary)``. An empty document yields
        an all-zero vector rather than ``None``, so the result can always be
        used in vector arithmetic.
    """
    # Start from zeros so the empty-document case still produces a vector.
    document_vector = np.zeros(len(vocabulary))
    for word in document:
        document_vector += word_to_index(word)
    return document_vector
def similarity(query, document):
    """Return the cosine similarity between *query* and *document*.

    Computed as the sum of the element-wise product divided by the product
    of the two Euclidean norms.

    Args:
        query: NumPy array (presumably a vector produced by ``tf`` -- verify
            against the caller).
        document: NumPy array that can be multiplied element-wise with
            *query*.

    Returns:
        The cosine similarity, or ``0.0`` when either input has zero norm.
    """
    numerator = np.sum(query * document)
    query_norm = np.sqrt(np.sum(query * query))
    document_norm = np.sqrt(np.sum(document * document))
    denominator = query_norm * document_norm
    # A zero-norm vector has no direction; dividing would give NaN and a
    # RuntimeWarning, so report "no similarity" instead.
    if denominator == 0:
        return 0.0
    return numerator / denominator