"""Interactive Okapi BM25 search over the tweets stored in ``tweets.csv``.

Unsatisfactory query: "vote"
    The query returns tweets that were not written by Trump.
    Fix: during preprocessing, remove the tweets that mention
    @realDonaldTrump (see "Process B" below).
"""
from statistics import mean

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Options
pd.set_option("display.max_columns", None)

# Load documents
print("Loading documents..")
raw_documents = pd.read_csv('tweets.csv')

# Process A: use every row of the CSV unchanged.
processed_documents = raw_documents

# Process B: keep only rows whose `mentions` cell is NaN or does not list
# '@realDonaldTrump', and whose `retweets` value is greater than 5.
# mention_filter = list(map(lambda x: x != x or '@realDonaldTrump' not in x.split(','), raw_documents.mentions))
# retweet_filter = list(map(lambda x: x > 5, raw_documents.retweets))
# doc_filter = np.array(mention_filter) & np.array(retweet_filter)
# processed_documents = raw_documents[doc_filter]
# processed_documents.reset_index(inplace=True)

# Columns to variables
tweets = processed_documents['content']
retweets = processed_documents['retweets']
dates = processed_documents['date']

# Vectorization
print(f"{len(processed_documents)} documents ready!")
print("Vectorizing...")
cv = CountVectorizer(dtype='uint8')
transformer = TfidfTransformer()
word_count_vector = cv.fit_transform(tweets)
try:
    words = cv.get_feature_names_out()
except AttributeError:
    # Fall back to the older accessor when the vectorizer does not provide
    # get_feature_names_out(); any other error is allowed to propagate.
    words = cv.get_feature_names()
# NOTE: toarray() materialises a dense (documents x terms) matrix of uint8.
tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
# Only the fitted idf_ vector is used below, so fitting is sufficient.
transformer.fit(word_count_vector)
# term -> {'idf': idf weight of the term, 'tf': per-document count column}
tfidf_dict = {
    wrd: {'idf': transformer.idf_[idx], 'tf': tf[wrd]}
    for idx, wrd in enumerate(words)
}

# Constants
k = 1.5
b = 0.75
# Document length = number of whitespace-separated tokens of each tweet.
_doc_length_list = [len(x.split()) for x in tweets]
avgdl = mean(_doc_length_list)
_doc_lengths = np.array(_doc_length_list)


def _bm25_scores(terms):
    """Return an array holding one BM25 score per document.

    Args:
        terms: Query terms, already lower-cased. Terms missing from
            ``tfidf_dict`` are skipped; a repeated term contributes once
            per occurrence.

    Returns:
        A float array aligned positionally with ``tweets``.
    """
    # Length normalisation depends only on the document, not on the term.
    length_norm = k * (1 - b + b * _doc_lengths / avgdl)
    scores = np.zeros(len(_doc_lengths))
    for term in terms:
        entry = tfidf_dict.get(term)
        if entry is None:
            continue
        term_counts = entry['tf'].to_numpy()
        scores += entry['idf'] * (term_counts * (k + 1)) / (term_counts + length_norm)
    return scores


def OkapiBM25(query, limit=5):
    """Rank the loaded tweets against ``query`` and print the best matches.

    The query is lower-cased and split on whitespace. Documents whose score
    is zero are left out of the results. The remaining documents are sorted
    by descending score and the first ``limit`` are printed together with
    their date and retweet count.

    Args:
        query: Free-text query string.
        limit: Maximum number of results to print.

    Returns:
        None. Results are written to standard output.
    """
    scores = _bm25_scores(query.lower().split())

    # Positional access keeps the rows aligned with the score array.
    results = [
        {
            'score': scores[i],
            'content': tweets.iloc[i],
            'retweets': retweets.iloc[i],
            'date': dates.iloc[i],
        }
        for i in np.flatnonzero(scores)
    ]
    # The sort is stable, so equally scored documents keep file order.
    results.sort(key=lambda r: r['score'], reverse=True)

    print('-' * 10)
    print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
    print('-' * 10)
    for r in results[:limit]:
        print(f"Score: {r['score']}")
        print(f"Date: {r['date']}")
        print(f"Retweets: {r['retweets']}")
        print(r['content'])
        print('-' * 10)


if __name__ == '__main__':
    print("'q' to quit")
    while True:
        q = input("Your query: ")
        if q == 'q':
            break
        OkapiBM25(q)