to present

2022-04-12 22:46:50 +02:00 · 2022-04-12 22:46:50 +02:00 · d70c623066
commit d70c623066
parent f900e16baa
3 changed files with 79 additions and 23 deletions
--- a/main.py
+++ b/main.py
@@ -1,11 +1,17 @@
 # Amazon revievs search engine
+from typing import List
+import string
 import pandas as pd
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from vectorizer_idf import VectorizerIdf
 from vectorizer_tf import VectorizerTf
+import spacy
+import re


+nlp = spacy.load("en_core_web_sm")
+
 def get_answers_array():
    d = pd.read_csv('data.csv',  engine='python', error_bad_lines=False)
    answers = d["ad"]
@@ -13,45 +19,93 @@ def get_answers_array():

    return np.array(answers)

+def has_numbers(inputString):
+    return any(char.isdigit() for char in inputString)

-def okapi_mb25(query, tf, idf, a_len, documents):
+def okapi_mb25(query, tf, idf, a_len, documents, d_idf):
    k = 1.6
    b = 0.75
    scores = []
+    q_tf = tf.get_tf_for_document(query)
+    docs_v = tf.tf_matrix.toarray()
    for index, document in enumerate(documents):
        s = 0
-        try:
-            v_tf = VectorizerTf([document])
-            tf_for_doc = v_tf.get_tf_for_document(query)
-            tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
-            for idx, val in enumerate(tf_for_document[0]):
-                licznik = val * (k + 1)
-                mianownik = val + k * (1 - b + b * (len(tf_for_doc) / a_len))
-                idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
+        for re_p in re.findall('[0-9a-z.,-/]+', document):
+            for q_ in query.split():
+                if re_p == q_:
+                    s +=100
+        for word in document.split():
+            val_tf = 0
+            for d in query.split():
+                if d == word:
+                    val_tf += 1
+            if not d_idf.get(word):
+                continue
+            idf_for_word = d_idf[word]
+            licznik = val_tf * (k + 1)
+            mianownik = val_tf + k * (1 - b + b * (len(document.split()) / a_len))
            s += idf_for_word * (licznik / mianownik)
        scores.append(s)
-        except Exception as e:
-            scores.append(0)
    return scores
+        # try:
+        #     # v_tf = VectorizerTf([document])
+        #     # tf_for_doc = v_tf.get_tf_for_document(query)
+        #     # tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
+        #     doc_v = docs_v[index]
+        #     a=1
+        #     for idx, val in enumerate(doc_v):
+        #         pass
+        #     #     idf_for_word = idf.matrix.toarray()[index][idx]
+        #     #     licznik = val * (k + 1)
+        #     #     mianownik = val + k * (1 - b + b * (len(document.split()) / a_len))
+        #     #     s += idf_for_word * (licznik / mianownik)
+        #     #     a=1
+        #     # scores.append(s)
+        # except Exception as e:
+        #     scores.append(0)
+        #     print('error', e)
+    # return scores


+def preprocess(d_list: List):
+    result = []
+    for d in d_list:
+        words = []
+        d = d.translate(str.maketrans('', '', string.punctuation))
+        for token in nlp(d):
+            words.append(token.lemma_)
+        result.append(" ".join(words))
+    return result
+
 if __name__ == "__main__":
-    # data = get_answers_array()
-    data = ['Ala ma kota', 'Maciej i Ala ma psa i Ala Ala Ala', 'Ala ma żółwia', 'Maciej ma psa, żółwia i kota',
-            'Ola ma psa, żółwia i kota, masło']
+    data = get_answers_array()
+    # data = ['Ala has a cat', 'Maciej and Ala have dog and Ala Ala Ala', 'Ala has a turtle', 'Maciej has a dog, turtle and a lot of cats',
+    #         'Ola has a dog, turtle and cat, butters']
+    data = [d.lower() for d in data]
+    data = np.array(data)
+    # data = preprocess(data)
+

    average_lens = []
    for doc in data:
        words = doc.split()
+        if words:
            average_lens.append(sum(len(word) for word in words) / len(words))
    average_doc_len = sum(average_lens) / len(average_lens)

    vectorizer_tf = VectorizerTf(data)
-
    vectorizer_idf = VectorizerIdf(data)
+    vocab = vectorizer_tf.feature_names
+    data_idf = {}
+    for idx, v in enumerate(vocab):
+        data_idf[v] = vectorizer_idf.get_idf_for_word(v)
+
    while True:
        q = input('Wpisz fraze: ')
-        score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data)
+        q = q.lower()
+        # q = preprocess([q])[0]
+        score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data, data_idf)
+        print('loading ended')
        list1, list2 = zip(*sorted(zip(score, data)))
        i = 0
        for sc, sent in zip(reversed(list1), reversed(list2)):
--- a/vectorizer_idf.py
+++ b/vectorizer_idf.py
@@ -1,11 +1,12 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
+from spacy.lang.en.stop_words import STOP_WORDS as en_stop


 class VectorizerIdf:

    def __init__(self, corpus):
-        vectorizer = TfidfVectorizer(use_idf=True)
-        vectorizer.fit_transform(corpus)
+        vectorizer = TfidfVectorizer(use_idf=True, stop_words=en_stop)
+        self.matrix = vectorizer.fit_transform(corpus)
        self.vectorizer = vectorizer

    def get_idf_for_word(self, term):
--- a/vectorizer_tf.py
+++ b/vectorizer_tf.py
@@ -1,15 +1,16 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 import pandas as pd

+from spacy.lang.en.stop_words import STOP_WORDS as en_stop
+

 class VectorizerTf:

    def __init__(self, corpus):
-        vectorizer = CountVectorizer()
+        vectorizer = CountVectorizer(stop_words=en_stop)
        self.tf_matrix = vectorizer.fit_transform(corpus)
        self.vectorizer = vectorizer
        self.feature_names = self.vectorizer.get_feature_names()

    def get_tf_for_document(self, term):
        return self.vectorizer.transform([term]).toarray()
-