# Amazon reviews search engine — ranks review answers against a query with a
# BM25 (Okapi)-style score plus an exact-token-match bonus.
#
# NOTE(review): reconstructed from a whitespace-mangled git patch; this is the
# post-patch state of main.py with vectorizer_tf.py / vectorizer_idf.py
# inlined so the program is self-contained.
from typing import List

import re
import string

import numpy as np
import pandas as pd

# spaCy pipeline is loaded lazily (see _get_nlp) instead of at import time:
# the original module-level ``spacy.load("en_core_web_sm")`` made importing
# this file slow and made it crash whenever the model is not installed.
_NLP = None


def _get_nlp():
    """Return the shared spaCy pipeline, loading it on first use."""
    global _NLP
    if _NLP is None:
        import spacy  # heavy optional dependency — imported only when needed
        _NLP = spacy.load("en_core_web_sm")
    return _NLP


def get_answers_array():
    """Read data.csv and return the answer column as a numpy array.

    Returns:
        np.ndarray of answer strings.
    """
    # NOTE(review): column "ad" is assumed to hold the answer text — confirm
    # against the actual data.csv schema.
    # on_bad_lines="skip" replaces the removed error_bad_lines=False flag.
    d = pd.read_csv('data.csv', engine='python', on_bad_lines='skip')
    answers = d["ad"]
    return np.array(answers)


def has_numbers(input_string):
    """Return True if *input_string* contains at least one decimal digit."""
    return any(char.isdigit() for char in input_string)


class VectorizerTf:
    """Term-frequency (count) matrix over a corpus, English stop words removed."""

    def __init__(self, corpus):
        # Local imports keep sklearn/spaCy optional for callers that only use
        # the pure-Python scoring path (okapi_mb25).
        from sklearn.feature_extraction.text import CountVectorizer
        from spacy.lang.en.stop_words import STOP_WORDS as en_stop

        # sklearn expects a list of stop words, not a set.
        vectorizer = CountVectorizer(stop_words=list(en_stop))
        self.tf_matrix = vectorizer.fit_transform(corpus)
        self.vectorizer = vectorizer
        # get_feature_names() was removed in scikit-learn 1.2.
        if hasattr(vectorizer, "get_feature_names_out"):
            self.feature_names = list(vectorizer.get_feature_names_out())
        else:
            self.feature_names = vectorizer.get_feature_names()

    def get_tf_for_document(self, term):
        """Return the count vector (1 x vocab) for *term* as a dense array."""
        return self.vectorizer.transform([term]).toarray()


class VectorizerIdf:
    """Inverse-document-frequency model over a corpus, stop words removed."""

    def __init__(self, corpus):
        from sklearn.feature_extraction.text import TfidfVectorizer
        from spacy.lang.en.stop_words import STOP_WORDS as en_stop

        vectorizer = TfidfVectorizer(use_idf=True, stop_words=list(en_stop))
        self.matrix = vectorizer.fit_transform(corpus)
        self.vectorizer = vectorizer

    def get_idf_for_word(self, term):
        """Return the idf weight learned for *term*.

        NOTE(review): the original method body was outside the visible diff
        context; a standard vocabulary_ -> idf_ lookup is assumed — confirm
        against the original vectorizer_idf.py.
        """
        return self.vectorizer.idf_[self.vectorizer.vocabulary_[term]]


def okapi_mb25(query, tf, idf, a_len, documents, d_idf):
    """Score each document against *query* with a BM25-style formula.

    Args:
        query: whitespace-tokenised query string (expected lower-case).
        tf, idf: unused; kept so the call signature stays backward compatible
            (the original computed vectorizer matrices here but never read them).
        a_len: average document length, in the same unit as ``len(doc.split())``.
        documents: iterable of document strings.
        d_idf: mapping word -> idf weight; words missing from it are ignored.

    Returns:
        list of float scores, one per document, in input order.
    """
    k = 1.6   # BM25 k1: term-frequency saturation
    b = 0.75  # BM25 b: document-length normalisation strength
    if not a_len:
        a_len = 1.0  # guard the division below against an empty corpus

    query_terms = query.split()
    scores = []
    for document in documents:
        score = 0.0
        # Exact-token bonus: +100 for every document token equal to a query
        # token (tokens per this regex: lower-case words / numbers).
        for token in re.findall(r'[0-9a-z.,-/]+', document):
            for term in query_terms:
                if token == term:
                    score += 100
        words = document.split()
        doc_len = len(words)
        for word in words:
            # tf here counts occurrences of this document word in the QUERY;
            # a word repeated m times in the document contributes m times.
            val_tf = sum(1 for term in query_terms if term == word)
            idf_for_word = d_idf.get(word)
            if not idf_for_word:
                continue  # unknown word, or idf == 0 → contributes nothing
            numerator = val_tf * (k + 1)
            denominator = val_tf + k * (1 - b + b * (doc_len / a_len))
            score += idf_for_word * (numerator / denominator)
        scores.append(score)
    return scores


def preprocess(d_list: List) -> List[str]:
    """Strip punctuation and lemmatise each document with spaCy."""
    result = []
    for text in d_list:
        text = text.translate(str.maketrans('', '', string.punctuation))
        result.append(" ".join(token.lemma_ for token in _get_nlp()(text)))
    return result


if __name__ == "__main__":
    data = get_answers_array()
    data = np.array([d.lower() for d in data])
    # data = preprocess(data)  # optional lemmatisation — slow on large corpora

    # Average document length in *words* (BM25's avgdl).
    # Fix: the patched code averaged per-document mean word *character*
    # length, then divided a word count by it inside okapi_mb25 — the units
    # did not match.
    doc_lens = [len(doc.split()) for doc in data]
    average_doc_len = sum(doc_lens) / len(doc_lens) if doc_lens else 1.0

    vectorizer_tf = VectorizerTf(data)
    # Fix: the patch deleted this constructor call but still used the name
    # below and in the okapi_mb25 call, which raised NameError at runtime.
    vectorizer_idf = VectorizerIdf(data)
    data_idf = {}
    for word in vectorizer_tf.feature_names:
        data_idf[word] = vectorizer_idf.get_idf_for_word(word)

    while True:
        q = input('Wpisz fraze: ').lower()
        # q = preprocess([q])[0]  # enable together with preprocess(data)
        score = okapi_mb25(q, vectorizer_tf, vectorizer_idf,
                           average_doc_len, data, data_idf)
        print('loading ended')
        # Highest score first; sort on the score alone so equal scores never
        # fall through to comparing the document payloads.
        ranked = sorted(zip(score, data), key=lambda pair: pair[0], reverse=True)
        # NOTE(review): the original result-printing loop body was cut off in
        # the patch; printing the top 10 hits is assumed.
        for rank, (sc, sent) in enumerate(ranked[:10]):
            print(rank, round(sc, 2), sent)