From 2ff0538dc34d23ff09da2462d9ccd6d3d3fe8bbd Mon Sep 17 00:00:00 2001
From: s444501
Date: Tue, 12 Apr 2022 23:00:11 +0200
Subject: [PATCH] test

---
 okapi.py | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/okapi.py b/okapi.py
index c39b1d8..1fa16b0 100644
--- a/okapi.py
+++ b/okapi.py
@@ -3,22 +3,41 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from statistics import mean
 
+"""
+Niesatysfakcjonujące quert: "vote"
+Zapytanie zwraca tweety nie pisane przez Trumpa
+
+Poprawka: Usunięcie w preprocessingu tweetów zawierających @realDonaldTrump
+"""
+
 # Options
 pd.set_option("display.max_columns", None)
 
 # Load documents
 print("Loading documents..")
 raw_documents = pd.read_csv('tweets.csv')
-# processed_documents = raw_documents[raw_documents.mentions.isnull()]
-# processed_documents = np.unique(processed_documents['content'])
-processed_documents = np.unique(raw_documents['content'])
+
+# Process A
+processed_documents = raw_documents
+
+# Process B
+# mention_filter = list(map(lambda x: x != x or '@realDonaldTrump' not in x.split(','), raw_documents.mentions))
+# retweet_filter = list(map(lambda x: x > 5, raw_documents.retweets))
+# doc_filter = np.array(mention_filter) & np.array(retweet_filter)
+# processed_documents = raw_documents[doc_filter]
+# processed_documents.reset_index(inplace=True)
+
+# Columns to variables
+tweets = processed_documents['content']
+retweets = processed_documents['retweets']
+dates = processed_documents['date']
 
 # Vectorization
 print("Vectorizing...")
 cv = CountVectorizer()
 transformer = TfidfTransformer()
-word_count_vector = cv.fit_transform(processed_documents)
+word_count_vector = cv.fit_transform(tweets)
 words = cv.get_feature_names_out()
 
 tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
 
@@ -31,13 +50,13 @@ for idx, wrd in enumerate(words):
 # Constants
 k = 1.5
 b = 0.75
-avgdl = mean([len(x.split()) for x in processed_documents])
+avgdl = mean([len(x.split()) for x in tweets])
 
 
 def OkapiBM25(query, limit=5):
-    query_str = query.split()
+    query_str = query.lower().split()
     scores = []
-    for d in range(len(processed_documents)):
+    for d in range(len(tweets)):
         s = 0
         for keyword in query_str:
             tf = tfidf_dict.get(keyword, None)
@@ -45,21 +64,23 @@ def OkapiBM25(query, limit=5):
                 continue
             tf = tf['tf'][d]
             idf = tfidf_dict[keyword]['idf']
-            doclen = len(processed_documents[d].split())
+            doclen = len(tweets[d].split())
             s += idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doclen / avgdl))
         scores.append(s)
 
     results = []
     for i, x in enumerate(scores):
-        results.append((x, processed_documents[i])) if x else None
-    results = sorted(results, key=lambda x: x[0], reverse=True)
+        results.append({'score': x, 'content': tweets[i], 'retweets': retweets[i], 'date': dates[i]}) if x else None
+    results = sorted(results, key=lambda x: x['score'], reverse=True)
 
     print('-' * 10)
     print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
     print('-' * 10)
 
     for r in results[:limit]:
-        print(f"Score: {r[0]}")
-        print(r[1])
+        print(f"Score: {r['score']}")
+        print(f"Date: {r['date']}")
+        print(f"Retweets: {r['retweets']}")
+        print(r['content'])
     print('-' * 10)