Commit d70c623066 (parent f900e16baa)

main.py (86 changed lines)

@@ -1,11 +1,17 @@
 # Amazon reviews search engine
+from typing import List
+import string
 import pandas as pd
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from vectorizer_idf import VectorizerIdf
 from vectorizer_tf import VectorizerTf
+import spacy
+import re
 
 
+nlp = spacy.load("en_core_web_sm")
+
 def get_answers_array():
     d = pd.read_csv('data.csv', engine='python', error_bad_lines=False)
     answers = d["ad"]
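A side note on the `read_csv` call above: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0, where `on_bad_lines='skip'` is the equivalent. A minimal, version-tolerant sketch (the helper name `load_answers` is ours; the file name `data.csv` and the `ad` column come from the diff):

    import pandas as pd

    def load_answers(path='data.csv'):
        # pandas >= 1.3 spells the option on_bad_lines='skip';
        # older releases only accept error_bad_lines=False.
        try:
            d = pd.read_csv(path, engine='python', on_bad_lines='skip')
        except TypeError:
            d = pd.read_csv(path, engine='python', error_bad_lines=False)
        return d['ad'].to_numpy()
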
@@ -13,45 +19,93 @@ def get_answers_array():
 
     return np.array(answers)
 
 
+def has_numbers(inputString):
+    return any(char.isdigit() for char in inputString)
+
+
-def okapi_mb25(query, tf, idf, a_len, documents):
+def okapi_mb25(query, tf, idf, a_len, documents, d_idf):
     k = 1.6
     b = 0.75
     scores = []
+    q_tf = tf.get_tf_for_document(query)
+    docs_v = tf.tf_matrix.toarray()
     for index, document in enumerate(documents):
         s = 0
-        try:
-            v_tf = VectorizerTf([document])
-            tf_for_doc = v_tf.get_tf_for_document(query)
-            tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
-            for idx, val in enumerate(tf_for_document[0]):
-                licznik = val * (k + 1)
-                mianownik = val + k * (1 - b + b * (len(tf_for_doc) / a_len))
-                idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
-                s += idf_for_word * (licznik / mianownik)
-            scores.append(s)
-        except Exception as e:
-            scores.append(0)
+        for re_p in re.findall('[0-9a-z.,-/]+', document):
+            for q_ in query.split():
+                if re_p == q_:
+                    s += 100
+        for word in document.split():
+            val_tf = 0
+            for d in query.split():
+                if d == word:
+                    val_tf += 1
+            if not d_idf.get(word):
+                continue
+            idf_for_word = d_idf[word]
+            licznik = val_tf * (k + 1)
+            mianownik = val_tf + k * (1 - b + b * (len(document.split()) / a_len))
+            s += idf_for_word * (licznik / mianownik)
+        scores.append(s)
     return scores
+    # try:
+    #     # v_tf = VectorizerTf([document])
+    #     # tf_for_doc = v_tf.get_tf_for_document(query)
+    #     # tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
+    #     doc_v = docs_v[index]
+    #     a = 1
+    #     for idx, val in enumerate(doc_v):
+    #         pass
+    #         # idf_for_word = idf.matrix.toarray()[index][idx]
+    #         # licznik = val * (k + 1)
+    #         # mianownik = val + k * (1 - b + b * (len(document.split()) / a_len))
+    #         # s += idf_for_word * (licznik / mianownik)
+    #         # a = 1
+    #     # scores.append(s)
+    # except Exception as e:
+    #     scores.append(0)
+    #     print('error', e)
+    # return scores
 
 
+def preprocess(d_list: List):
+    result = []
+    for d in d_list:
+        words = []
+        d = d.translate(str.maketrans('', '', string.punctuation))
+        for token in nlp(d):
+            words.append(token.lemma_)
+        result.append(" ".join(words))
+    return result
+
+
 if __name__ == "__main__":
-    # data = get_answers_array()
-    data = ['Ala ma kota', 'Maciej i Ala ma psa i Ala Ala Ala', 'Ala ma żółwia', 'Maciej ma psa, żółwia i kota',
-            'Ola ma psa, żółwia i kota, masło']
+    data = get_answers_array()
+    # data = ['Ala has a cat', 'Maciej and Ala have dog and Ala Ala Ala', 'Ala has a turtle', 'Maciej has a dog, turtle and a lot of cats',
+    #         'Ola has a dog, turtle and cat, butters']
+    data = [d.lower() for d in data]
+    data = np.array(data)
+    # data = preprocess(data)
 
 
     average_lens = []
     for doc in data:
         words = doc.split()
-        average_lens.append(sum(len(word) for word in words) / len(words))
+        if words:
+            average_lens.append(sum(len(word) for word in words) / len(words))
     average_doc_len = sum(average_lens) / len(average_lens)
 
 
     vectorizer_tf = VectorizerTf(data)
 
     vectorizer_idf = VectorizerIdf(data)
+    vocab = vectorizer_tf.feature_names
+    data_idf = {}
+    for idx, v in enumerate(vocab):
+        data_idf[v] = vectorizer_idf.get_idf_for_word(v)
+
     while True:
         q = input('Wpisz fraze: ')
-        score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data)
+        q = q.lower()
+        # q = preprocess([q])[0]
+        score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data, data_idf)
+        print('loading ended')
         list1, list2 = zip(*sorted(zip(score, data)))
         i = 0
         for sc, sent in zip(reversed(list1), reversed(list2)):
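For reference, `okapi_mb25` (presumably a typo for BM25) is a loose variant of Okapi BM25 with k = 1.6 and b = 0.75; `licznik` and `mianownik` are Polish for numerator and denominator, and the prompt `Wpisz fraze` means "enter a phrase". Two quirks worth flagging: `val_tf` counts how often a document word occurs in the query rather than the document's own term frequency, and `average_doc_len` averages word lengths in characters, not document lengths in words, so it is not the avgdl the formula expects. For comparison, a textbook BM25 scorer might look like this (a sketch; `bm25_score` and `avgdl` are our names, not part of the commit):

    def bm25_score(query, document, idf, avgdl, k=1.6, b=0.75):
        # Okapi BM25 for a single document. idf is a dict of precomputed
        # idf weights (like data_idf above); avgdl is the average document
        # length in words across the corpus.
        words = document.split()
        score = 0.0
        for term in set(query.split()):
            tf = words.count(term)  # term frequency in this document
            if tf == 0 or term not in idf:
                continue
            numerator = tf * (k + 1)                                   # licznik
            denominator = tf + k * (1 - b + b * (len(words) / avgdl))  # mianownik
            score += idf[term] * (numerator / denominator)
        return score
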
vectorizer_idf.py

@@ -1,11 +1,12 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
+from spacy.lang.en.stop_words import STOP_WORDS as en_stop
 
 
 class VectorizerIdf:
 
     def __init__(self, corpus):
-        vectorizer = TfidfVectorizer(use_idf=True)
-        vectorizer.fit_transform(corpus)
+        vectorizer = TfidfVectorizer(use_idf=True, stop_words=en_stop)
+        self.matrix = vectorizer.fit_transform(corpus)
         self.vectorizer = vectorizer
 
     def get_idf_for_word(self, term):
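The hunk cuts off before the body of `get_idf_for_word`. A plausible implementation over scikit-learn's public attributes (`vocabulary_` maps a term to its column index, `idf_` holds the fitted idf per column); this is our guess, not the repository's code:

    from sklearn.feature_extraction.text import TfidfVectorizer

    class VectorizerIdfSketch:
        """Guessed stand-in for VectorizerIdf; the real body is not in the diff."""

        def __init__(self, corpus):
            self.vectorizer = TfidfVectorizer(use_idf=True)
            self.matrix = self.vectorizer.fit_transform(corpus)

        def get_idf_for_word(self, term):
            # vocabulary_ maps term -> column index; idf_ is indexed by column.
            idx = self.vectorizer.vocabulary_.get(term)
            return self.vectorizer.idf_[idx] if idx is not None else 0.0
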
vectorizer_tf.py

@@ -1,15 +1,16 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 import pandas as pd
 
+from spacy.lang.en.stop_words import STOP_WORDS as en_stop
 
 
 class VectorizerTf:
 
     def __init__(self, corpus):
-        vectorizer = CountVectorizer()
+        vectorizer = CountVectorizer(stop_words=en_stop)
         self.tf_matrix = vectorizer.fit_transform(corpus)
         self.vectorizer = vectorizer
         self.feature_names = self.vectorizer.get_feature_names()
 
     def get_tf_for_document(self, term):
         return self.vectorizer.transform([term]).toarray()
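A portability note on this class: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of `get_feature_names_out()`. A small shim that works on both (the helper name is ours):

    from sklearn.feature_extraction.text import CountVectorizer

    def feature_names(vectorizer: CountVectorizer):
        # scikit-learn 1.2 removed get_feature_names(); use the new
        # accessor when present and fall back on older installs.
        if hasattr(vectorizer, "get_feature_names_out"):
            return list(vectorizer.get_feature_names_out())
        return vectorizer.get_feature_names()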