2022-04-10 18:58:51 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
|
|
|
from statistics import mean
|
|
|
|
|
2022-04-12 23:00:11 +02:00
|
|
|
"""
|
|
|
|
Niesatysfakcjonujące quert: "vote"
|
|
|
|
Zapytanie zwraca tweety nie pisane przez Trumpa
|
|
|
|
|
|
|
|
Poprawka: Usunięcie w preprocessingu tweetów zawierających @realDonaldTrump
|
|
|
|
"""
|
|
|
|
|
2022-04-10 18:58:51 +02:00
|
|
|
# Options
# Show every column when printing DataFrames (useful for inspecting results).
pd.set_option("display.max_columns", None)

# Load documents
print("Loading documents..")
# Corpus of tweets; the code below reads the 'content', 'retweets' and 'date'
# columns (and the disabled filter also uses 'mentions').
raw_documents = pd.read_csv('tweets.csv')
|
2022-04-12 23:00:11 +02:00
|
|
|
|
|
|
|
# Process A: no filtering — the raw documents are used as-is.
processed_documents = raw_documents

# Process B (disabled): would drop tweets whose 'mentions' column contains
# @realDonaldTrump (the `x != x` test is a NaN check: NaN rows pass the filter)
# and keep only tweets with more than 5 retweets.
# NOTE(review): the module-level note says this fix was applied, but the filter
# is commented out, so Process A is what actually runs — confirm intent.
# mention_filter = list(map(lambda x: x != x or '@realDonaldTrump' not in x.split(','), raw_documents.mentions))
# retweet_filter = list(map(lambda x: x > 5, raw_documents.retweets))
# doc_filter = np.array(mention_filter) & np.array(retweet_filter)
# processed_documents = raw_documents[doc_filter]
# processed_documents.reset_index(inplace=True)
|
|
|
|
|
|
|
|
# Expose the working columns as module-level Series for the search code below.
tweets = processed_documents.content      # tweet text
retweets = processed_documents.retweets   # retweet counts
dates = processed_documents.date          # timestamps
|
2022-04-10 18:58:51 +02:00
|
|
|
|
|
|
|
# Vectorization
print("Vectorizing...")

cv = CountVectorizer()
transformer = TfidfTransformer()

# Term-count matrix: one row per tweet, one column per vocabulary term.
word_count_vector = cv.fit_transform(tweets)
words = cv.get_feature_names_out()

# Dense term-frequency table keyed by word.
# NOTE(review): densifies the whole matrix — fine for a tweet-sized corpus,
# memory-heavy for anything larger.
tf = pd.DataFrame(word_count_vector.toarray(), columns=words)

# Only the learned idf_ weights are used below, so fit() suffices; the
# original fit_transform() computed a transformed matrix that was discarded.
transformer.fit(word_count_vector)

# Per-word lookup used by OkapiBM25: the idf scalar and the per-document
# term-count column.
tfidf_dict = {
    wrd: {'idf': transformer.idf_[idx], 'tf': tf[wrd]}
    for idx, wrd in enumerate(words)
}
|
|
|
|
|
|
|
|
# Constants (Okapi BM25 parameters)
k = 1.5   # term-frequency saturation parameter (usually called k1 in the literature)
b = 0.75  # document-length normalization strength
# Average document length over the corpus, in whitespace-separated tokens.
avgdl = mean([len(x.split()) for x in tweets])
|
2022-04-10 18:58:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
def OkapiBM25(query, limit=5):
    """Score every tweet against *query* with Okapi BM25 and print the top hits.

    Parameters
    ----------
    query : str
        Free-text query; lower-cased and split on whitespace into keywords.
    limit : int, optional
        Maximum number of results to print (default 5).

    Relies on the module-level globals ``tweets``, ``retweets``, ``dates``,
    ``tfidf_dict``, ``k``, ``b`` and ``avgdl`` built at import time.
    """
    keywords = query.lower().split()

    # BM25 score of document d:
    #   sum over query terms of idf * tf*(k+1) / (tf + k*(1 - b + b*|d|/avgdl))
    scores = []
    for d in range(len(tweets)):
        s = 0
        # Document length is loop-invariant per document; hoisted out of the
        # keyword loop (the original recomputed it for every matching keyword).
        doclen = len(tweets[d].split())
        for keyword in keywords:
            # Renamed from `tf`: the original local shadowed the module-level
            # `tf` DataFrame, which invited confusion.
            entry = tfidf_dict.get(keyword)
            if entry is None:  # term not in the vocabulary -> contributes nothing
                continue
            tf_d = entry['tf'][d]
            idf = entry['idf']
            s += idf * (tf_d * (k + 1)) / (tf_d + k * (1 - b + b * doclen / avgdl))
        scores.append(s)

    # Keep only documents with a non-zero score (the original used the
    # `append(...) if x else None` expression-for-side-effect anti-idiom).
    results = [
        {'score': x, 'content': tweets[i], 'retweets': retweets[i], 'date': dates[i]}
        for i, x in enumerate(scores)
        if x
    ]
    results.sort(key=lambda r: r['score'], reverse=True)

    print('-' * 10)
    print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
    print('-' * 10)
    for r in results[:limit]:
        print(f"Score: {r['score']}")
        print(f"Date: {r['date']}")
        print(f"Retweets: {r['retweets']}")
        print(r['content'])
        print('-' * 10)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Minimal interactive loop: run BM25 queries until the user enters 'q'.
    print("'q' to quit")
    running = True
    while running:
        user_query = input("Your query: ")
        if user_query == 'q':
            running = False
        else:
            OkapiBM25(user_query)
|