This commit is contained in:
s444501 2022-04-12 23:00:11 +02:00
parent 62c92ef71e
commit 2ff0538dc3

View File

@ -3,22 +3,41 @@ import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from statistics import mean from statistics import mean
"""
Niesatysfakcjonujące query: "vote"
Zapytanie zwraca tweety nie pisane przez Trumpa
Poprawka: Usunięcie w preprocessingu tweetów zawierających @realDonaldTrump
"""
# Options # Options
pd.set_option("display.max_columns", None) pd.set_option("display.max_columns", None)
# Load documents # Load documents
print("Loading documents..") print("Loading documents..")
raw_documents = pd.read_csv('tweets.csv') raw_documents = pd.read_csv('tweets.csv')
# processed_documents = raw_documents[raw_documents.mentions.isnull()]
# processed_documents = np.unique(processed_documents['content']) # Process A
processed_documents = np.unique(raw_documents['content']) processed_documents = raw_documents
# Process B
# mention_filter = list(map(lambda x: x != x or '@realDonaldTrump' not in x.split(','), raw_documents.mentions))
# retweet_filter = list(map(lambda x: x > 5, raw_documents.retweets))
# doc_filter = np.array(mention_filter) & np.array(retweet_filter)
# processed_documents = raw_documents[doc_filter]
# processed_documents.reset_index(inplace=True)
# Columns to variables
tweets = processed_documents['content']
retweets = processed_documents['retweets']
dates = processed_documents['date']
# Vectorization # Vectorization
print("Vectorizing...") print("Vectorizing...")
cv = CountVectorizer() cv = CountVectorizer()
transformer = TfidfTransformer() transformer = TfidfTransformer()
word_count_vector = cv.fit_transform(processed_documents) word_count_vector = cv.fit_transform(tweets)
words = cv.get_feature_names_out() words = cv.get_feature_names_out()
tf = pd.DataFrame(word_count_vector.toarray(), columns=words) tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
@ -31,13 +50,13 @@ for idx, wrd in enumerate(words):
# Constants # Constants
k = 1.5 k = 1.5
b = 0.75 b = 0.75
avgdl = mean([len(x.split()) for x in processed_documents]) avgdl = mean([len(x.split()) for x in tweets])
def OkapiBM25(query, limit=5): def OkapiBM25(query, limit=5):
query_str = query.split() query_str = query.lower().split()
scores = [] scores = []
for d in range(len(processed_documents)): for d in range(len(tweets)):
s = 0 s = 0
for keyword in query_str: for keyword in query_str:
tf = tfidf_dict.get(keyword, None) tf = tfidf_dict.get(keyword, None)
@ -45,21 +64,23 @@ def OkapiBM25(query, limit=5):
continue continue
tf = tf['tf'][d] tf = tf['tf'][d]
idf = tfidf_dict[keyword]['idf'] idf = tfidf_dict[keyword]['idf']
doclen = len(processed_documents[d].split()) doclen = len(tweets[d].split())
s += idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doclen / avgdl)) s += idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doclen / avgdl))
scores.append(s) scores.append(s)
results = [] results = []
for i, x in enumerate(scores): for i, x in enumerate(scores):
results.append((x, processed_documents[i])) if x else None results.append({'score': x, 'content': tweets[i], 'retweets': retweets[i], 'date': dates[i]}) if x else None
results = sorted(results, key=lambda x: x[0], reverse=True) results = sorted(results, key=lambda x: x['score'], reverse=True)
print('-' * 10) print('-' * 10)
print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:") print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
print('-' * 10) print('-' * 10)
for r in results[:limit]: for r in results[:limit]:
print(f"Score: {r[0]}") print(f"Score: {r['score']}")
print(r[1]) print(f"Date: {r['date']}")
print(f"Retweets: {r['retweets']}")
print(r['content'])
print('-' * 10) print('-' * 10)