okapi/main.py

# Amazon reviews search engine
import re
import string
from collections import Counter
from typing import List

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

from vectorizer_idf import VectorizerIdf
from vectorizer_tf import VectorizerTf


nlp = spacy.load("en_core_web_sm")

def get_answers_array(path='data.csv', column='ad'):
    """Load one text column of a CSV file as a NumPy array.

    Malformed CSV rows are skipped instead of aborting the load, and rows
    whose value in `column` is missing are dropped.

    Args:
        path: CSV file to read. Defaults to ``'data.csv'``.
        column: Name of the column to return. Defaults to ``'ad'``.

    Returns:
        A NumPy array holding the non-missing values of `column`.
    """
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines='skip'` is its replacement.
        frame = pd.read_csv(path, engine='python', on_bad_lines='skip')
    except TypeError:
        # pandas < 1.3 only understands the legacy keyword.
        frame = pd.read_csv(path, engine='python', error_bad_lines=False)

    answers = frame[column].dropna()
    return np.array(answers)

def has_numbers(inputString):
    """Return True if at least one character of `inputString` is a digit."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

def okapi_mb25(query, tf, idf, a_len, documents, d_idf):
    """Score every document against `query` with Okapi BM25 plus a match bonus.

    For each document the score is the sum of two parts:

    * an exact-match bonus of 100 for every (document token, query term)
      pair that is equal, where document tokens are the runs matched by
      ``[0-9a-z.,-/]+``;
    * the BM25 sum over the query terms,
      ``idf(t) * f(t, D) * (k + 1) / (f(t, D) + k * (1 - b + b * |D| / a_len))``,
      where ``f(t, D)`` is the number of times the term occurs among the
      whitespace-separated words of the document and ``|D|`` is the number
      of such words.

    Args:
        query: Query string; it is split on whitespace into terms.
        tf: Not used by the scoring; kept so existing call sites keep working.
        idf: Not used by the scoring; kept so existing call sites keep working.
        a_len: Average document length, in words, over the corpus.
        documents: Iterable of document strings.
        d_idf: Mapping from term to its IDF weight. Terms that are missing
            or whose weight is falsy contribute nothing to the BM25 part.

    Returns:
        A list with one numeric score per document, in input order.
    """
    k = 1.6
    b = 0.75
    exact_match_bonus = 100
    # Inside the character class `,-/` is a range covering `,`, `-`, `.`, `/`.
    token_pattern = re.compile(r'[0-9a-z.,-/]+')

    # The query does not change between documents, so tokenise it once.
    query_terms = query.split()
    query_counts = Counter(query_terms)

    scores = []
    for document in documents:
        s = 0.0

        # Every equal (document token, query term) pair earns the bonus, so a
        # token is worth the bonus times its multiplicity in the query.
        for token in token_pattern.findall(document):
            s += exact_match_bonus * query_counts.get(token, 0)

        doc_words = document.split()
        doc_counts = Counter(doc_words)
        # Without a usable average length, fall back to "average-sized".
        length_ratio = len(doc_words) / a_len if a_len else 1.0
        length_norm = k * (1 - b + b * length_ratio)

        for term in query_terms:
            term_freq = doc_counts.get(term, 0)
            term_idf = d_idf.get(term)
            if not term_freq or not term_idf:
                continue
            s += term_idf * (term_freq * (k + 1)) / (term_freq + length_norm)

        scores.append(s)
    return scores


def preprocess(d_list: List):
    """Strip punctuation from each text and replace every token by its lemma.

    Each input string has all `string.punctuation` characters removed, is run
    through the module-level spaCy pipeline `nlp`, and is rebuilt as the
    space-joined lemmas of its tokens.

    Args:
        d_list: Iterable of strings to normalise.

    Returns:
        A list of normalised strings, one per input, in input order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return [
        " ".join(token.lemma_ for token in nlp(text.translate(strip_punctuation)))
        for text in d_list
    ]

if __name__ == "__main__":
    # Lower-case the corpus; queries are lower-cased too, so matching is
    # case-insensitive.
    data = np.array([d.lower() for d in get_answers_array()])
    # NOTE: lemmatisation through preprocess() is currently switched off. If it
    # is re-enabled it has to be applied to both the corpus and every query.

    # okapi_mb25 divides each document's word count by this value, so it must
    # be the average number of words per document. Empty documents are ignored.
    doc_lengths = [len(doc.split()) for doc in data]
    doc_lengths = [length for length in doc_lengths if length]
    if not doc_lengths:
        raise SystemExit('data.csv contains no non-empty documents')
    average_doc_len = sum(doc_lengths) / len(doc_lengths)

    vectorizer_tf = VectorizerTf(data)
    vectorizer_idf = VectorizerIdf(data)
    # Look the IDF of every vocabulary term up once, before the query loop.
    data_idf = {
        term: vectorizer_idf.get_idf_for_word(term)
        for term in vectorizer_tf.feature_names
    }

    top_n = 5
    while True:
        try:
            q = input('Wpisz fraze: ')
        except (EOFError, KeyboardInterrupt):
            # Leave the prompt cleanly on Ctrl-D / Ctrl-C instead of a traceback.
            print()
            break
        q = q.lower()
        score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data, data_idf)
        print('loading ended')

        # Best score first; only documents with a non-zero score are hits.
        ranked = sorted(zip(score, data), reverse=True)
        hits = [(sc, sent) for sc, sent in ranked if sc]
        for sc, sent in hits[:top_n]:
            print(sent, sc)
        print('Znaleziono ' + str(len(hits)) + ' wyniki')
wip 2022-04-10 18:18:05 +02:00			`# Amazon revievs search engine`
to present 2022-04-12 22:46:50 +02:00			`from typing import List`
			`import string`
wip 2022-04-10 18:18:05 +02:00			`import pandas as pd`
			`import numpy as np`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from vectorizer_idf import VectorizerIdf`
			`from vectorizer_tf import VectorizerTf`
to present 2022-04-12 22:46:50 +02:00			`import spacy`
			`import re`
wip 2022-04-10 18:18:05 +02:00

to present 2022-04-12 22:46:50 +02:00			`nlp = spacy.load("en_core_web_sm")`

wip 2022-04-10 18:18:05 +02:00			`def get_answers_array():`
to do vectorize all things 2022-04-11 10:09:45 +02:00			`d = pd.read_csv('data.csv', engine='python', error_bad_lines=False)`
			`answers = d["ad"]`
wip 2022-04-10 18:18:05 +02:00			`answers = answers.dropna()`

			`return np.array(answers)`

to present 2022-04-12 22:46:50 +02:00			`def has_numbers(inputString):`
			`return any(char.isdigit() for char in inputString)`
wip 2022-04-10 18:18:05 +02:00
to present 2022-04-12 22:46:50 +02:00			`def okapi_mb25(query, tf, idf, a_len, documents, d_idf):`
wip 2022-04-10 18:18:05 +02:00			`k = 1.6`
			`b = 0.75`
			`scores = []`
to present 2022-04-12 22:46:50 +02:00			`q_tf = tf.get_tf_for_document(query)`
			`docs_v = tf.tf_matrix.toarray()`
to do vectorize all things 2022-04-11 10:09:45 +02:00			`for index, document in enumerate(documents):`
wip 2022-04-10 18:18:05 +02:00			`s = 0`
to present 2022-04-12 22:46:50 +02:00			`for re_p in re.findall('[0-9a-z.,-/]+', document):`
			`for q_ in query.split():`
			`if re_p == q_:`
			`s +=100`
			`for word in document.split():`
			`val_tf = 0`
			`for d in query.split():`
			`if d == word:`
			`val_tf += 1`
			`if not d_idf.get(word):`
			`continue`
			`idf_for_word = d_idf[word]`
			`licznik = val_tf * (k + 1)`
			`mianownik = val_tf + k * (1 - b + b * (len(document.split()) / a_len))`
			`s += idf_for_word * (licznik / mianownik)`
			`scores.append(s)`
wip 2022-04-10 18:18:05 +02:00			`return scores`
to present 2022-04-12 22:46:50 +02:00			`# try:`
			`# # v_tf = VectorizerTf([document])`
			`# # tf_for_doc = v_tf.get_tf_for_document(query)`
			`# # tf_for_document = v_tf.tf_matrix.toarray() * tf_for_doc[0]`
			`# doc_v = docs_v[index]`
			`# a=1`
			`# for idx, val in enumerate(doc_v):`
			`# pass`
			`# # idf_for_word = idf.matrix.toarray()[index][idx]`
			`# # licznik = val * (k + 1)`
			`# # mianownik = val + k * (1 - b + b * (len(document.split()) / a_len))`
			`# # s += idf_for_word * (licznik / mianownik)`
			`# # a=1`
			`# # scores.append(s)`
			`# except Exception as e:`
			`# scores.append(0)`
			`# print('error', e)`
			`# return scores`

wip 2022-04-10 18:18:05 +02:00
to present 2022-04-12 22:46:50 +02:00			`def preprocess(d_list: List):`
			`result = []`
			`for d in d_list:`
			`words = []`
			`d = d.translate(str.maketrans('', '', string.punctuation))`
			`for token in nlp(d):`
			`words.append(token.lemma_)`
			`result.append(" ".join(words))`
			`return result`
wip 2022-04-10 18:18:05 +02:00
			`if __name__ == "__main__":`
to present 2022-04-12 22:46:50 +02:00			`data = get_answers_array()`
			`# data = ['Ala has a cat', 'Maciej and Ala have dog and Ala Ala Ala', 'Ala has a turtle', 'Maciej has a dog, turtle and a lot of cats',`
			`# 'Ola has a dog, turtle and cat, butters']`
			`data = [d.lower() for d in data]`
			`data = np.array(data)`
			`# data = preprocess(data)`

wip 2022-04-10 18:18:05 +02:00
			`average_lens = []`
			`for doc in data:`
			`words = doc.split()`
to present 2022-04-12 22:46:50 +02:00			`if words:`
			`average_lens.append(sum(len(word) for word in words) / len(words))`
wip 2022-04-10 18:18:05 +02:00			`average_doc_len = sum(average_lens) / len(average_lens)`
to do vectorize all things 2022-04-11 10:09:45 +02:00
wip 2022-04-10 18:18:05 +02:00			`vectorizer_tf = VectorizerTf(data)`
			`vectorizer_idf = VectorizerIdf(data)`
to present 2022-04-12 22:46:50 +02:00			`vocab = vectorizer_tf.feature_names`
			`data_idf = {}`
			`for idx, v in enumerate(vocab):`
			`data_idf[v] = vectorizer_idf.get_idf_for_word(v)`

to do vectorize all things 2022-04-11 10:09:45 +02:00			`while True:`
			`q = input('Wpisz fraze: ')`
to present 2022-04-12 22:46:50 +02:00			`q = q.lower()`
			`# q = preprocess([q])[0]`
			`score = okapi_mb25(q, vectorizer_tf, vectorizer_idf, average_doc_len, data, data_idf)`
			`print('loading ended')`
to do vectorize all things 2022-04-11 10:09:45 +02:00			`list1, list2 = zip(*sorted(zip(score, data)))`
			`i = 0`
			`for sc, sent in zip(reversed(list1), reversed(list2)):`
			`if sc:`
			`print(sent, sc)`
			`i += 1`
			`if i == 5:`
			`break`
			`X = [i for i in score if i != 0]`
			`print('Znaleziono ' + str(len(X)) + ' wyniki')`