to present

This commit is contained in:
Mikołaj Pokrywka 2022-04-12 22:46:50 +02:00
parent f900e16baa
commit d70c623066
3 changed files with 79 additions and 23 deletions

86
main.py
View File

@@ -1,11 +1,17 @@
# Amazon revievs search engine # Amazon revievs search engine
from typing import List
import string
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizer_idf import VectorizerIdf from vectorizer_idf import VectorizerIdf
from vectorizer_tf import VectorizerTf from vectorizer_tf import VectorizerTf
import spacy
import re
nlp = spacy.load("en_core_web_sm")
def get_answers_array(): def get_answers_array():
d = pd.read_csv('data.csv', engine='python', error_bad_lines=False) d = pd.read_csv('data.csv', engine='python', error_bad_lines=False)
answers = d["ad"] answers = d["ad"]
@@ -13,45 +19,93 @@ def get_answers_array():
return np.array(answers) return np.array(answers)
def has_numbers(inputString):
return any(char.isdigit() for char in inputString)
def okapi_mb25(query, tf, idf, a_len, documents): def okapi_mb25(query, tf, idf, a_len, documents, d_idf):
k = 1.6 k = 1.6
b = 0.75 b = 0.75
scores = [] scores = []
q_tf = tf.get_tf_for_document(query)
docs_v = tf.tf_matrix.toarray()
for index, document in enumerate(documents): for index, document in enumerate(documents):
s = 0 s = 0
try: for re_p in re.findall('[0-9a-z.,-/]+', document):
v_tf = VectorizerTf([document]) for q_ in query.split():
tf_for_doc = v_tf.get_tf_for_document(query) if re_p == q_:
tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0] s +=100
for idx, val in enumerate(tf_for_document[0]): for word in document.split():
licznik = val * (k + 1) val_tf = 0
mianownik = val + k * (1 - b + b * (len(tf_for_doc) / a_len)) for d in query.split():
idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx]) if d == word:
val_tf += 1
if not d_idf.get(word):
continue
idf_for_word = d_idf[word]
licznik = val_tf * (k + 1)
mianownik = val_tf + k * (1 - b + b * (len(document.split()) / a_len))
s += idf_for_word * (licznik / mianownik) s += idf_for_word * (licznik / mianownik)
scores.append(s) scores.append(s)
except Exception as e:
scores.append(0)
return scores return scores
# try:
# # v_tf = VectorizerTf([document])
# # tf_for_doc = v_tf.get_tf_for_document(query)
# # tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
# doc_v = docs_v[index]
# a=1
# for idx, val in enumerate(doc_v):
# pass
# # idf_for_word = idf.matrix.toarray()[index][idx]
# # licznik = val * (k + 1)
# # mianownik = val + k * (1 - b + b * (len(document.split()) / a_len))
# # s += idf_for_word * (licznik / mianownik)
# # a=1
# # scores.append(s)
# except Exception as e:
# scores.append(0)
# print('error', e)
# return scores
def preprocess(d_list: List):
result = []
for d in d_list:
words = []
d = d.translate(str.maketrans('', '', string.punctuation))
for token in nlp(d):
words.append(token.lemma_)
result.append(" ".join(words))
return result
if __name__ == "__main__": if __name__ == "__main__":
# data = get_answers_array() data = get_answers_array()
data = ['Ala ma kota', 'Maciej i Ala ma psa i Ala Ala Ala', 'Ala ma żółwia', 'Maciej ma psa, żółwia i kota', # data = ['Ala has a cat', 'Maciej and Ala have dog and Ala Ala Ala', 'Ala has a turtle', 'Maciej has a dog, turtle and a lot of cats',
'Ola ma psa, żółwia i kota, masło'] # 'Ola has a dog, turtle and cat, butters']
data = [d.lower() for d in data]
data = np.array(data)
# data = preprocess(data)
average_lens = [] average_lens = []
for doc in data: for doc in data:
words = doc.split() words = doc.split()
if words:
average_lens.append(sum(len(word) for word in words) / len(words)) average_lens.append(sum(len(word) for word in words) / len(words))
average_doc_len = sum(average_lens) / len(average_lens) average_doc_len = sum(average_lens) / len(average_lens)
vectorizer_tf = VectorizerTf(data) vectorizer_tf = VectorizerTf(data)
vectorizer_idf = VectorizerIdf(data) vectorizer_idf = VectorizerIdf(data)
vocab = vectorizer_tf.feature_names
data_idf = {}
for idx, v in enumerate(vocab):
data_idf[v] = vectorizer_idf.get_idf_for_word(v)
while True: while True:
q = input('Wpisz fraze: ') q = input('Wpisz fraze: ')
score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data) q = q.lower()
# q = preprocess([q])[0]
score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data, data_idf)
print('loading ended')
list1, list2 = zip(*sorted(zip(score, data))) list1, list2 = zip(*sorted(zip(score, data)))
i = 0 i = 0
for sc, sent in zip(reversed(list1), reversed(list2)): for sc, sent in zip(reversed(list1), reversed(list2)):

View File

@@ -1,11 +1,12 @@
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
class VectorizerIdf: class VectorizerIdf:
def __init__(self, corpus): def __init__(self, corpus):
vectorizer = TfidfVectorizer(use_idf=True) vectorizer = TfidfVectorizer(use_idf=True, stop_words=en_stop)
vectorizer.fit_transform(corpus) self.matrix = vectorizer.fit_transform(corpus)
self.vectorizer = vectorizer self.vectorizer = vectorizer
def get_idf_for_word(self, term): def get_idf_for_word(self, term):

View File

@@ -1,15 +1,16 @@
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
class VectorizerTf: class VectorizerTf:
def __init__(self, corpus): def __init__(self, corpus):
vectorizer = CountVectorizer() vectorizer = CountVectorizer(stop_words=en_stop)
self.tf_matrix = vectorizer.fit_transform(corpus) self.tf_matrix = vectorizer.fit_transform(corpus)
self.vectorizer = vectorizer self.vectorizer = vectorizer
self.feature_names = self.vectorizer.get_feature_names() self.feature_names = self.vectorizer.get_feature_names()
def get_tf_for_document(self, term): def get_tf_for_document(self, term):
return self.vectorizer.transform([term]).toarray() return self.vectorizer.transform([term]).toarray()