wip
This commit is contained in:
commit
18e254888b
57
main.py
Normal file
57
main.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# Amazon reviews search engine
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from vectorizer_idf import VectorizerIdf
|
||||||
|
from vectorizer_tf import VectorizerTf
|
||||||
|
|
||||||
|
|
||||||
|
def get_answers_array():
    """Load answer texts from ``answers.csv`` as a NumPy array.

    Reads the file from the current working directory, keeps only the
    ``AnswerText`` column, and drops missing (NaN) entries.

    Returns:
        numpy.ndarray of answer strings.
    """
    frame = pd.read_csv('answers.csv')
    answers = frame["AnswerText"].dropna()
    return np.array(answers)
|
||||||
|
|
||||||
|
|
||||||
|
def okapi_mb25(query, tf, idf, a_len, documents):
    """Score each document in *documents* against *query* with an
    Okapi BM25-style formula.

    Args:
        query: the query string.
        tf: a term-frequency vectorizer. Currently unused -- each document
            gets its own VectorizerTf below; kept for interface compatibility.
        idf: object exposing ``get_idf_for_word(term)`` (IDF lookup).
        a_len: average document length used in the length-normalisation term.
        documents: iterable of document strings.

    Returns:
        list of float scores, one per document, in input order.
    """
    k = 1.6   # BM25 term-frequency saturation parameter (k1)
    b = 0.75  # BM25 length-normalisation parameter
    scores = []
    for document in documents:
        v_tf = VectorizerTf([document])
        # Row vector of counts of this document's vocabulary terms in the
        # query (shape (1, n_features)).
        tf_for_doc = v_tf.get_tf_for_document(query)
        # (Removed a dead computation of the masked document-tf matrix that
        # was never read.)
        s = 0
        for idx, val in enumerate(tf_for_doc[0]):
            numerator = val * (k + 1)
            # NOTE(review): len(tf_for_doc) is the number of matrix rows
            # (always 1 here), not the document's length in words -- confirm
            # this is the intended BM25 length normalisation.
            denominator = val + k * (1 - b + b * (len(tf_for_doc) / a_len))
            idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
            s += idf_for_word * (numerator / denominator)
        scores.append(s)
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Real-data path (kept for reference):
    # data = get_answers_array()
    data = ['Ala ma kota', 'Maciej i Ala ma psa i Ala Ala Ala', 'Ala ma żółwia', 'Maciej ma psa, żółwia i kota',
            'Ola ma psa, żółwia i kota, masło']

    # Mean word length (in characters) for each document, then averaged over
    # the corpus.  NOTE(review): BM25's normalisation usually expects the
    # average document length in *words* -- confirm this is intentional.
    average_lens = [
        sum(len(word) for word in doc.split()) / len(doc.split())
        for doc in data
    ]
    average_doc_len = sum(average_lens) / len(average_lens)

    vectorizer_tf = VectorizerTf(data)
    vectorizer_idf = VectorizerIdf(data)

    score = okapi_mb25('Ala ma kota', vectorizer_tf, vectorizer_idf, average_doc_len, data)
    print('Score ', score)

    score = okapi_mb25('Ala', vectorizer_tf, vectorizer_idf, average_doc_len, data)
    print('Score 2', score)
|
12
vectorizer_idf.py
Normal file
12
vectorizer_idf.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
|
|
||||||
|
class VectorizerIdf:
    """Wraps a fitted TfidfVectorizer to expose per-term IDF lookups."""

    def __init__(self, corpus):
        """Fit the IDF model on *corpus* (an iterable of document strings)."""
        self.vectorizer = TfidfVectorizer(use_idf=True)
        self.vectorizer.fit_transform(corpus)

    def get_idf_for_word(self, term):
        """Return the inverse-document-frequency weight of *term*.

        Raises KeyError if *term* is not in the fitted vocabulary.
        """
        index = self.vectorizer.vocabulary_[term]
        return self.vectorizer.idf_[index]
|
15
vectorizer_tf.py
Normal file
15
vectorizer_tf.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class VectorizerTf:
    """Wraps a CountVectorizer fitted on a corpus, exposing raw term counts."""

    def __init__(self, corpus):
        """Fit on *corpus*, keeping the count matrix and vocabulary order."""
        self.vectorizer = CountVectorizer()
        self.tf_matrix = self.vectorizer.fit_transform(corpus)
        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # (replaced by get_feature_names_out()) -- confirm the pinned version.
        self.feature_names = self.vectorizer.get_feature_names()

    def get_tf_for_document(self, term):
        """Return the dense term-count row vector of *term* (shape (1, n))."""
        counts = self.vectorizer.transform([term])
        return counts.toarray()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user