wip

2022-04-10 18:18:05 +02:00 · 2022-04-10 18:18:05 +02:00 · 18e254888b
commit 18e254888b
3 changed files with 84 additions and 0 deletions
--- a/main.py
+++ b/main.py
@ -0,0 +1,57 @@
+# Amazon reviews search engine
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from vectorizer_idf import VectorizerIdf
+from vectorizer_tf import VectorizerTf
+
+
+def get_answers_array():
+    """Load the non-missing answer texts from ``answers.csv``.
+
+    Reads the CSV file at the relative path ``answers.csv``, selects its
+    ``AnswerText`` column, drops missing values and returns the remaining
+    entries as a NumPy array.
+    """
+    frame = pd.read_csv('answers.csv')
+    answer_texts = frame["AnswerText"].dropna()
+    return np.array(answer_texts)
+
+
+def okapi_mb25(query, tf, idf, a_len, documents):
+    """Score every document against *query* with the Okapi BM25 formula.
+
+    For each document D the score is the sum, over the query terms q that
+    occur in D, of::
+
+        idf(q) * f(q, D) * (k + 1) / (f(q, D) + k * (1 - b + b * |D| / a_len))
+
+    where f(q, D) is the number of occurrences of q in D and |D| is the
+    number of tokens in D, both as counted by ``VectorizerTf``.
+
+    Args:
+        query: Query text.
+        tf: Unused; kept so existing callers keep working.
+        idf: Object exposing ``get_idf_for_word(term)``.
+            NOTE(review): it is queried with terms taken from the documents,
+            so it presumably has to be fitted on a corpus containing them.
+        a_len: Average document length. It should be measured in the same
+            unit as |D| above (tokens) for the normalisation to be
+            meaningful.
+        documents: Iterable of document strings.
+
+    Returns:
+        A list with one score per document, in input order.
+    """
+    k = 1.6
+    b = 0.75
+    scores = []
+    for document in documents:
+        # Fitting on the single document makes the vocabulary exactly the
+        # document's own terms.
+        v_tf = VectorizerTf([document])
+        # How often each document term occurs in the query / in the document.
+        query_counts = v_tf.get_tf_for_document(query)[0]
+        doc_counts = v_tf.tf_matrix.toarray()[0]
+        # |D|: the previous len() of the 1 x V count array was always 1.
+        doc_len = doc_counts.sum()
+        length_norm = k * (1 - b + b * (doc_len / a_len))
+
+        s = 0.0
+        for idx, in_query in enumerate(query_counts):
+            if not in_query:
+                continue  # document term not asked for by the query
+            term_freq = doc_counts[idx]
+            numerator = term_freq * (k + 1)
+            denominator = term_freq + length_norm
+            idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
+            # BM25 sums over query terms, so a term repeated in the query
+            # contributes once per repetition.
+            s += in_query * idf_for_word * (numerator / denominator)
+        scores.append(s)
+    return scores
+
+
+if __name__ == "__main__":
+    # Small hand-written corpus for experimenting; use get_answers_array()
+    # instead to run against the answers stored in answers.csv.
+    data = ['Ala ma kota', 'Maciej i Ala ma psa i Ala Ala Ala', 'Ala ma żółwia', 'Maciej ma psa, żółwia i kota',
+            'Ola ma psa, żółwia i kota, masło']
+
+    vectorizer_tf = VectorizerTf(data)
+    vectorizer_idf = VectorizerIdf(data)
+
+    # BM25 normalises by the average document length in tokens (not by the
+    # average number of characters per word). Counting through VectorizerTf
+    # keeps the length on the same tokenisation as the term counts.
+    token_counts = vectorizer_tf.tf_matrix
+    average_doc_len = token_counts.sum() / token_counts.shape[0]
+
+    score = okapi_mb25('Ala ma kota', vectorizer_tf, vectorizer_idf, average_doc_len, data)
+    print('Score ', score)
+    score = okapi_mb25('Ala', vectorizer_tf, vectorizer_idf, average_doc_len, data)
+    print('Score 2', score)
--- a/vectorizer_idf.py
+++ b/vectorizer_idf.py
@ -0,0 +1,12 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+class VectorizerIdf:
+    """Inverse-document-frequency lookup backed by a fitted ``TfidfVectorizer``."""
+
+    def __init__(self, corpus):
+        """Fit a ``TfidfVectorizer`` on *corpus*, an iterable of text documents."""
+        # Only the learned vocabulary and idf weights are read later, so
+        # fit() suffices; fit_transform() would also build a tf-idf matrix
+        # that was thrown away.
+        self.vectorizer = TfidfVectorizer(use_idf=True).fit(corpus)
+
+    def get_idf_for_word(self, term):
+        """Return the idf weight learned for *term*.
+
+        *term* must be a key of the fitted vocabulary (tokens as produced by
+        the vectorizer, which lower-cases text by default).
+
+        Raises:
+            KeyError: if *term* is not in the fitted vocabulary.
+        """
+        return self.vectorizer.idf_[self.vectorizer.vocabulary_[term]]
--- a/vectorizer_tf.py
+++ b/vectorizer_tf.py
@ -0,0 +1,15 @@
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+import pandas as pd
+
+
+class VectorizerTf:
+    """Raw term counts for a corpus, backed by a fitted ``CountVectorizer``.
+
+    Attributes set by ``__init__``:
+      tf_matrix      sparse document-by-term count matrix for the corpus
+      vectorizer     the fitted ``CountVectorizer``
+      feature_names  list of vocabulary terms, indexed like tf_matrix columns
+    """
+
+    def __init__(self, corpus):
+        """Fit a ``CountVectorizer`` on *corpus*, an iterable of text documents."""
+        vectorizer = CountVectorizer()
+        self.tf_matrix = vectorizer.fit_transform(corpus)
+        self.vectorizer = vectorizer
+        # get_feature_names() was deprecated in scikit-learn 1.0 and removed
+        # in 1.2; prefer its replacement and fall back on older releases.
+        names_getter = getattr(vectorizer, "get_feature_names_out", None)
+        if names_getter is None:
+            names_getter = vectorizer.get_feature_names
+        # Keep a plain list, as the old API returned.
+        self.feature_names = list(names_getter())
+
+    def get_tf_for_document(self, term):
+        """Count the fitted vocabulary terms occurring in the text *term*.
+
+        Despite the parameter name, *term* may be any text (e.g. a whole
+        query); it is tokenised by the fitted vectorizer.
+
+        Returns:
+            A dense array of shape (1, vocabulary size) with the counts.
+        """
+        return self.vectorizer.transform([term]).toarray()
+