to do vectorize all things
This commit is contained in:
parent
18e254888b
commit
f900e16baa
51
main.py
51
main.py
@ -7,8 +7,8 @@ from vectorizer_tf import VectorizerTf
|
|||||||
|
|
||||||
|
|
||||||
def get_answers_array():
|
def get_answers_array():
|
||||||
d = pd.read_csv('answers.csv')
|
d = pd.read_csv('data.csv', engine='python', error_bad_lines=False)
|
||||||
answers = d["AnswerText"]
|
answers = d["ad"]
|
||||||
answers = answers.dropna()
|
answers = answers.dropna()
|
||||||
|
|
||||||
return np.array(answers)
|
return np.array(answers)
|
||||||
@ -18,18 +18,20 @@ def okapi_mb25(query, tf, idf, a_len, documents):
|
|||||||
k = 1.6
|
k = 1.6
|
||||||
b = 0.75
|
b = 0.75
|
||||||
scores = []
|
scores = []
|
||||||
for document in documents:
|
for index, document in enumerate(documents):
|
||||||
v_tf = VectorizerTf([document])
|
|
||||||
tf_for_doc = v_tf.get_tf_for_document(query)
|
|
||||||
s = 0
|
s = 0
|
||||||
tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
|
try:
|
||||||
for idx, val in enumerate(tf_for_doc[0]):
|
v_tf = VectorizerTf([document])
|
||||||
|
tf_for_doc = v_tf.get_tf_for_document(query)
|
||||||
licznik = val * (k + 1)
|
tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]
|
||||||
mianownik = val + k * (1 - b + b * (len(tf_for_doc) / a_len))
|
for idx, val in enumerate(tf_for_document[0]):
|
||||||
idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
|
licznik = val * (k + 1)
|
||||||
s += idf_for_word * (licznik / mianownik)
|
mianownik = val + k * (1 - b + b * (len(tf_for_doc) / a_len))
|
||||||
scores.append(s)
|
idf_for_word = idf.get_idf_for_word(v_tf.feature_names[idx])
|
||||||
|
s += idf_for_word * (licznik / mianownik)
|
||||||
|
scores.append(s)
|
||||||
|
except Exception as e:
|
||||||
|
scores.append(0)
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
|
|
||||||
@ -43,15 +45,20 @@ if __name__ == "__main__":
|
|||||||
words = doc.split()
|
words = doc.split()
|
||||||
average_lens.append(sum(len(word) for word in words) / len(words))
|
average_lens.append(sum(len(word) for word in words) / len(words))
|
||||||
average_doc_len = sum(average_lens) / len(average_lens)
|
average_doc_len = sum(average_lens) / len(average_lens)
|
||||||
# print('Doc len', average_doc_len)
|
|
||||||
#
|
|
||||||
vectorizer_tf = VectorizerTf(data)
|
vectorizer_tf = VectorizerTf(data)
|
||||||
# print('tf', vectorizer_tf.get_tf_for_document('Ala ma psa'))
|
|
||||||
|
|
||||||
vectorizer_idf = VectorizerIdf(data)
|
vectorizer_idf = VectorizerIdf(data)
|
||||||
|
while True:
|
||||||
|
q = input('Wpisz fraze: ')
|
||||||
score = okapi_mb25('Ala ma kota', vectorizer_tf, vectorizer_idf, average_doc_len, data)
|
score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data)
|
||||||
print('Score ', score)
|
list1, list2 = zip(*sorted(zip(score, data)))
|
||||||
score = okapi_mb25('Ala', vectorizer_tf, vectorizer_idf, average_doc_len, data)
|
i = 0
|
||||||
print('Score 2', score)
|
for sc, sent in zip(reversed(list1), reversed(list2)):
|
||||||
|
if sc:
|
||||||
|
print(sent, sc)
|
||||||
|
i += 1
|
||||||
|
if i == 5:
|
||||||
|
break
|
||||||
|
X = [i for i in score if i != 0]
|
||||||
|
print('Znaleziono ' + str(len(X)) + ' wyniki')
|
Loading…
Reference in New Issue
Block a user