okapi/main.py

58 lines
1.8 KiB
Python
Raw Normal View History

2022-04-10 18:18:05 +02:00
# Amazon reviews search engine
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizer_idf import VectorizerIdf
from vectorizer_tf import VectorizerTf
def get_answers_array():
    """Load the answer texts from ``answers.csv`` as a NumPy array.

    Reads the ``AnswerText`` column of ``answers.csv`` (resolved against the
    current working directory), discards missing entries and returns the
    remaining values.
    """
    answer_column = pd.read_csv('answers.csv')["AnswerText"]
    return np.array(answer_column.dropna())
def okapi_mb25(query, tf, idf, a_len, documents, k=1.6, b=0.75):
    """Score every document against ``query`` with the Okapi BM25 formula.

    For each document a fresh ``VectorizerTf`` is built over that single
    document and asked for the term frequencies of ``query``; every feature
    then contributes ``idf * tf * (k + 1) / (tf + k * (1 - b + b * |D| / a_len))``
    to the document's score, where ``|D|`` is the document's token count.

    Args:
        query: Query text passed to ``VectorizerTf.get_tf_for_document``.
        tf: Unused by this function; kept so existing callers keep working.
        idf: Object exposing ``get_idf_for_word(word)``.
        a_len: Average document length, in whitespace-separated tokens.
            Must be non-zero.
        documents: Iterable of document strings to score.
        k: Term-frequency saturation parameter (BM25 ``k1``).
        b: Strength of the document-length normalisation (BM25 ``b``).

    Returns:
        A list with one score per document, in input order.
    """
    scores = []
    for document in documents:
        v_tf = VectorizerTf([document])
        # Only the first row is used; presumably one row per query -- confirm
        # against VectorizerTf.get_tf_for_document.
        query_tf = v_tf.get_tf_for_document(query)[0]
        # |D| must be the length of the document itself. The previous code
        # used the row count of the TF result, which is not a document length.
        doc_len = len(document.split())
        # Constant for the whole document, so computed once outside the loop.
        length_norm = k * (1 - b + b * (doc_len / a_len))
        score = 0
        for idx, val in enumerate(query_tf):
            numerator = val * (k + 1)
            denominator = val + length_norm
            idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
            score += idf_for_word * (numerator / denominator)
        scores.append(score)
    return scores
if __name__ == "__main__":
    # Small in-memory demo corpus; swap in get_answers_array() to run the
    # search against the answers stored in answers.csv.
    data = ['Ala ma kota', 'Maciej i Ala ma psa i Ala Ala Ala', 'Ala ma żółwia', 'Maciej ma psa, żółwia i kota',
            'Ola ma psa, żółwia i kota, masło']
    # BM25 needs the average document length measured in tokens. The previous
    # code averaged the number of characters per word instead.
    average_doc_len = sum(len(doc.split()) for doc in data) / len(data)
    vectorizer_tf = VectorizerTf(data)
    vectorizer_idf = VectorizerIdf(data)
    score = okapi_mb25('Ala ma kota', vectorizer_tf, vectorizer_idf, average_doc_len, data)
    print('Score ', score)
    score = okapi_mb25('Ala', vectorizer_tf, vectorizer_idf, average_doc_len, data)
    print('Score 2', score)