"""Interactive Okapi BM25 search over the unique `content` values of tweets.csv."""
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||
|
from statistics import mean
|
||
|
|
||
|
# Options
# Show every column when a DataFrame is printed (no column truncation).
pd.set_option("display.max_columns", None)

# Load documents
print("Loading documents..")
raw_documents = pd.read_csv('tweets.csv')

# Alternative corpus kept for reference: only tweets without mentions.
# processed_documents = raw_documents[raw_documents.mentions.isnull()]
# processed_documents = np.unique(processed_documents['content'])

# The corpus is the sorted set of distinct tweet texts.  Missing cells are
# read as float NaN; np.unique sorts its input and sorting a mix of str and
# float raises TypeError, so rows without content are dropped first.
processed_documents = np.unique(raw_documents['content'].dropna())
|
||
|
|
||
|
# Vectorization
print("Vectorizing...")
cv = CountVectorizer()
transformer = TfidfTransformer()

# Raw term counts: one row per document, one column per vocabulary word.
word_count_vector = cv.fit_transform(processed_documents)
words = cv.get_feature_names_out()

# Dense per-word count table (documents x vocabulary).
# NOTE(review): toarray() materialises the whole matrix in memory; this can be
# very large for a big corpus.
tf = pd.DataFrame(word_count_vector.toarray(), columns=words)

# Only the learned idf_ weights are used below, so fit() is enough; the
# transformed TF-IDF matrix itself is never needed.
transformer.fit(word_count_vector)

# word -> {'idf': inverse document frequency, 'tf': per-document count column}
tfidf_dict = {
    wrd: {'idf': idf, 'tf': tf[wrd]}
    for wrd, idf in zip(words, transformer.idf_)
}
|
||
|
|
||
|
# Constants
# Free parameters of the BM25 scoring formula used by OkapiBM25.
k = 1.5
b = 0.75
# Average document length, measured in whitespace-separated tokens.
avgdl = mean(len(document.split()) for document in processed_documents)
|
||
|
|
||
|
|
||
|
def OkapiBM25(query, limit=5):
    """Rank the corpus against *query* with Okapi BM25 and print the best hits.

    The query is split on whitespace; terms that are not in ``tfidf_dict``
    are ignored.  Every document with a non-zero score counts as a result.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to print.

    Returns:
        The full list of ``(score, document)`` tuples with a non-zero score,
        sorted by descending score (not truncated to ``limit``).
    """
    # The vocabulary comes from CountVectorizer(), which lowercases documents
    # by default; fold the query the same way or capitalised terms never match.
    terms = [term for term in query.lower().split() if term in tfidf_dict]

    # Document lengths (whitespace tokens) and the length-normalisation part of
    # the BM25 denominator do not depend on the query term: compute them once.
    doc_lengths = np.array(
        [len(doc.split()) for doc in processed_documents], dtype=float
    )
    length_norm = k * (1 - b + b * doc_lengths / avgdl)

    # Accumulate each term's contribution for all documents at once.
    scores = np.zeros(len(processed_documents))
    for term in terms:
        entry = tfidf_dict[term]
        term_freq = np.asarray(entry['tf'], dtype=float)
        scores += entry['idf'] * (term_freq * (k + 1)) / (term_freq + length_norm)

    # Keep only documents that matched at least one term, best score first.
    results = [
        (scores[i], processed_documents[i]) for i in np.flatnonzero(scores)
    ]
    results.sort(key=lambda hit: hit[0], reverse=True)

    print('-' * 10)
    print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
    print('-' * 10)
    for score, document in results[:limit]:
        print(f"Score: {score}")
        print(document)
        print('-' * 10)

    return results
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Simple interactive prompt: every line is run as a BM25 query until the
    # user enters 'q'.
    print("'q' to quit")
    while True:
        try:
            q = input("Your query: ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user pressed Ctrl-C: leave the loop
            # quietly instead of ending with a traceback.
            print()
            break
        if q == 'q':
            break
        OkapiBM25(q)
|