# Amazon reviews search engine
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizer_idf import VectorizerIdf
from vectorizer_tf import VectorizerTf


def get_answers_array():
    """Load 'answers.csv' and return its non-null AnswerText column as a numpy array."""
    frame = pd.read_csv('answers.csv')
    answers = frame["AnswerText"].dropna()
    return np.array(answers)


def okapi_mb25(query, tf, idf, a_len, documents):
    """Score every document in *documents* against *query* with a BM25-style formula.

    Parameters
    ----------
    query : str
        Query whose terms are scored against each document.
    tf : VectorizerTf
        Unused; kept so existing call sites remain valid.
    idf : VectorizerIdf
        Supplies per-word inverse document frequencies via get_idf_for_word().
    a_len : float
        Average document length used in the BM25 length-normalisation term.
    documents : iterable of str
        Corpus to score.

    Returns
    -------
    list of float
        One score per document, in input order.
    """
    k = 1.6   # BM25 term-frequency saturation parameter (typical range 1.2-2.0)
    b = 0.75  # BM25 length-normalisation parameter
    scores = []
    for document in documents:
        v_tf = VectorizerTf([document])
        tf_for_doc = v_tf.get_tf_for_document(query)
        s = 0
        for idx, val in enumerate(tf_for_doc[0]):
            numerator = val * (k + 1)
            # NOTE(review): len(tf_for_doc) is the number of matrix rows (always 1
            # for a single query), not the document length — textbook BM25 uses
            # |d| / avgdl here. Preserved as-is pending confirmation of the
            # VectorizerTf return shape.
            denominator = val + k * (1 - b + b * (len(tf_for_doc) / a_len))
            idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
            s += idf_for_word * (numerator / denominator)
        scores.append(s)
    return scores


if __name__ == "__main__":
    # Swap in the real corpus with: data = get_answers_array()
    data = ['Ala ma kota',
            'Maciej i Ala ma psa i Ala Ala Ala',
            'Ala ma żółwia',
            'Maciej ma psa, żółwia i kota',
            'Ola ma psa, żółwia i kota, masło']

    # Average of the per-document mean word lengths (characters per word).
    average_lens = []
    for doc in data:
        words = doc.split()
        average_lens.append(sum(len(word) for word in words) / len(words))
    average_doc_len = sum(average_lens) / len(average_lens)

    # BUG FIX: this instantiation was commented out while vectorizer_tf was
    # still passed to okapi_mb25 below, raising NameError at runtime.
    vectorizer_tf = VectorizerTf(data)
    vectorizer_idf = VectorizerIdf(data)

    score = okapi_mb25('Ala ma kota', vectorizer_tf, vectorizer_idf, average_doc_len, data)
    print('Score ', score)
    score = okapi_mb25('Ala', vectorizer_tf, vectorizer_idf, average_doc_len, data)
    print('Score 2', score)