From 2ff0538dc34d23ff09da2462d9ccd6d3d3fe8bbd Mon Sep 17 00:00:00 2001
From: s444501
Date: Tue, 12 Apr 2022 23:00:11 +0200
Subject: [PATCH] test

---
 okapi.py | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/okapi.py b/okapi.py
index c39b1d8..1fa16b0 100644
--- a/okapi.py
+++ b/okapi.py
@@ -3,22 +3,41 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from statistics import mean
 
+"""
+Niesatysfakcjonujące quert: "vote"
+Zapytanie zwraca tweety nie pisane przez Trumpa
+
+Poprawka: Usunięcie w preprocessingu tweetów zawierających @realDonaldTrump
+"""
+
 # Options
 pd.set_option("display.max_columns", None)
 
 # Load documents
 print("Loading documents..")
 raw_documents = pd.read_csv('tweets.csv')
-# processed_documents = raw_documents[raw_documents.mentions.isnull()]
-# processed_documents = np.unique(processed_documents['content'])
-processed_documents = np.unique(raw_documents['content'])
+
+# Process A
+processed_documents = raw_documents
+
+# Process B
+# mention_filter = list(map(lambda x: x != x or '@realDonaldTrump' not in x.split(','), raw_documents.mentions))
+# retweet_filter = list(map(lambda x: x > 5, raw_documents.retweets))
+# doc_filter = np.array(mention_filter) & np.array(retweet_filter)
+# processed_documents = raw_documents[doc_filter]
+# processed_documents.reset_index(inplace=True)
+
+# Columns to variables
+tweets = processed_documents['content']
+retweets = processed_documents['retweets']
+dates = processed_documents['date']
 
 # Vectorization
 print("Vectorizing...")
 cv = CountVectorizer()
 transformer = TfidfTransformer()
-word_count_vector = cv.fit_transform(processed_documents)
+word_count_vector = cv.fit_transform(tweets)
 words = cv.get_feature_names_out()
 
 tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
 
@@ -31,13 +50,13 @@ for idx, wrd in enumerate(words):
 # Constants
 k = 1.5
 b = 0.75
-avgdl = mean([len(x.split()) for x in processed_documents])
+avgdl = mean([len(x.split()) for x in tweets])
 
 
 def OkapiBM25(query, limit=5):
-    query_str = query.split()
+    query_str = query.lower().split()
     scores = []
-    for d in range(len(processed_documents)):
+    for d in range(len(tweets)):
         s = 0
         for keyword in query_str:
             tf = tfidf_dict.get(keyword, None)
@@ -45,21 +64,23 @@ def OkapiBM25(query, limit=5):
                 continue
             tf = tf['tf'][d]
             idf = tfidf_dict[keyword]['idf']
-            doclen = len(processed_documents[d].split())
+            doclen = len(tweets[d].split())
             s += idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doclen / avgdl))
         scores.append(s)
 
     results = []
     for i, x in enumerate(scores):
-        results.append((x, processed_documents[i])) if x else None
-    results = sorted(results, key=lambda x: x[0], reverse=True)
+        results.append({'score': x, 'content': tweets[i], 'retweets': retweets[i], 'date': dates[i]}) if x else None
+    results = sorted(results, key=lambda x: x['score'], reverse=True)
 
     print('-' * 10)
     print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
     print('-' * 10)
 
     for r in results[:limit]:
-        print(f"Score: {r[0]}")
-        print(r[1])
+        print(f"Score: {r['score']}")
+        print(f"Date: {r['date']}")
+        print(f"Retweets: {r['retweets']}")
+        print(r['content'])
     print('-' * 10)