test

2022-04-10 18:58:51 +02:00 · 2022-04-10 18:58:51 +02:00 · 62c92ef71e
commit 62c92ef71e
2 changed files with 43424 additions and 0 deletions
--- a/okapi.py
+++ b/okapi.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from statistics import mean
+
+# Options: show every column when a DataFrame is printed.
+pd.set_option("display.max_columns", None)
+
+# Load documents
+print("Loading documents..")
+raw_documents = pd.read_csv('tweets.csv')
+# Rows without text are dropped first: a NaN mixed in with the strings can
+# neither be sorted by np.unique nor tokenized by CountVectorizer.
+# np.unique then removes duplicate tweets and returns them sorted.
+# (Optional filter, currently disabled: keep only tweets without mentions
+# via raw_documents[raw_documents.mentions.isnull()].)
+processed_documents = np.unique(raw_documents['content'].dropna())
+
+# Vectorization
+print("Vectorizing...")
+cv = CountVectorizer()
+transformer = TfidfTransformer()
+
+# Raw term counts: one row per document, one column per vocabulary word.
+word_count_vector = cv.fit_transform(processed_documents)
+words = cv.get_feature_names_out()
+
+# NOTE: toarray() materializes the full dense document-term matrix, so
+# memory use grows with (number of documents) x (vocabulary size).
+tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
+# fit() is enough here: only the learned idf_ weights are read below, the
+# transformed tf-idf matrix itself is never used.
+transformer.fit(word_count_vector)
+
+# Per-word lookup table: the word's idf weight plus its count in every
+# document (a column of the `tf` frame, indexed by document position).
+tfidf_dict = {
+    wrd: {'idf': idf, 'tf': tf[wrd]}
+    for wrd, idf in zip(words, transformer.idf_)
+}
+
+# Constants used by the BM25 formula in OkapiBM25()
+k = 1.5   # scales how quickly repeated occurrences of a term saturate
+b = 0.75  # weight of the document-length normalization (0 disables it)
+# Average document length, measured in whitespace-separated tokens
+avgdl = mean([len(x.split()) for x in processed_documents])
+
+
+def OkapiBM25(query, limit=5):
+    """Rank the loaded documents against *query* with Okapi BM25 and print the best hits.
+
+    The query is tokenized with the vectorizer's own analyzer, so it is
+    normalized exactly like the indexed documents.  Query terms that are not
+    in the vocabulary are skipped, and documents whose total score is zero
+    are not reported.
+
+    Each matching term contributes
+    ``idf * tf * (k + 1) / (tf + k * (1 - b + b * doclen / avgdl))``
+    where ``tf`` is the term's count in the document and ``doclen`` is the
+    document's length in whitespace-separated tokens.
+
+    Args:
+        query: Free-text search string.
+        limit: Maximum number of results to print.
+
+    Returns:
+        None.  The ranked results are written to stdout.
+    """
+    # A plain str.split() would keep case and punctuation, while the
+    # vocabulary was built by CountVectorizer's default analyzer (lowercased,
+    # punctuation stripped) -- "Python" or "python," would never match.
+    analyze = cv.build_analyzer()
+    query_terms = [term for term in analyze(query) if term in tfidf_dict]
+
+    scores = np.zeros(len(processed_documents))
+    if query_terms:
+        # Document lengths do not depend on the term, so the length
+        # normalization is computed once per query for all documents.
+        doc_lens = np.array([len(doc.split()) for doc in processed_documents])
+        length_norm = k * (1 - b + b * doc_lens / avgdl)
+        for term in query_terms:
+            entry = tfidf_dict[term]
+            term_freq = np.asarray(entry['tf'])
+            scores += entry['idf'] * (term_freq * (k + 1)) / (term_freq + length_norm)
+
+    # Keep only documents that matched at least one term; sorted() is stable,
+    # so equally scored documents stay in their original order.
+    ranked = sorted(np.flatnonzero(scores), key=lambda i: scores[i], reverse=True)
+
+    print('-' * 10)
+    print(f"Total results: {len(ranked)}; Showing {min(limit, len(ranked))}:")
+    print('-' * 10)
+    for i in ranked[:limit]:
+        print(f"Score: {scores[i]}")
+        print(processed_documents[i])
+        print('-' * 10)
+
+
+if __name__ == '__main__':
+    # Interactive prompt: every line entered is run as a search query until
+    # the user types 'q' or closes stdin.
+    print("'q' to quit")
+    while True:
+        try:
+            q = input("Your query: ")
+        except EOFError:
+            # stdin was closed (Ctrl-D or end of piped input): leave the
+            # loop quietly instead of ending with a traceback.
+            print()
+            break
+        if q.strip() == 'q':
+            break
+        OkapiBM25(q)
--- a/tweets.csv
+++ b/tweets.csv