test
This commit is contained in:
parent
62c92ef71e
commit
2ff0538dc3
45
okapi.py
45
okapi.py
@ -3,22 +3,41 @@ import numpy as np
|
|||||||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||||||
from statistics import mean
|
from statistics import mean
|
||||||
|
|
||||||
|
"""
|
||||||
|
Niesatysfakcjonujące query: "vote"
|
||||||
|
Zapytanie zwraca tweety nie pisane przez Trumpa
|
||||||
|
|
||||||
|
Poprawka: Usunięcie w preprocessingu tweetów zawierających @realDonaldTrump
|
||||||
|
"""
|
||||||
|
|
||||||
# Options
|
# Options
|
||||||
pd.set_option("display.max_columns", None)
|
pd.set_option("display.max_columns", None)
|
||||||
|
|
||||||
# Load documents
|
# Load documents
|
||||||
print("Loading documents..")
|
print("Loading documents..")
|
||||||
raw_documents = pd.read_csv('tweets.csv')
|
raw_documents = pd.read_csv('tweets.csv')
|
||||||
# processed_documents = raw_documents[raw_documents.mentions.isnull()]
|
|
||||||
# processed_documents = np.unique(processed_documents['content'])
|
# Process A
|
||||||
processed_documents = np.unique(raw_documents['content'])
|
processed_documents = raw_documents
|
||||||
|
|
||||||
|
# Process B
|
||||||
|
# mention_filter = list(map(lambda x: x != x or '@realDonaldTrump' not in x.split(','), raw_documents.mentions))
|
||||||
|
# retweet_filter = list(map(lambda x: x > 5, raw_documents.retweets))
|
||||||
|
# doc_filter = np.array(mention_filter) & np.array(retweet_filter)
|
||||||
|
# processed_documents = raw_documents[doc_filter]
|
||||||
|
# processed_documents.reset_index(inplace=True)
|
||||||
|
|
||||||
|
# Columns to variables
|
||||||
|
tweets = processed_documents['content']
|
||||||
|
retweets = processed_documents['retweets']
|
||||||
|
dates = processed_documents['date']
|
||||||
|
|
||||||
# Vectorization
|
# Vectorization
|
||||||
print("Vectorizing...")
|
print("Vectorizing...")
|
||||||
cv = CountVectorizer()
|
cv = CountVectorizer()
|
||||||
transformer = TfidfTransformer()
|
transformer = TfidfTransformer()
|
||||||
|
|
||||||
word_count_vector = cv.fit_transform(processed_documents)
|
word_count_vector = cv.fit_transform(tweets)
|
||||||
words = cv.get_feature_names_out()
|
words = cv.get_feature_names_out()
|
||||||
|
|
||||||
tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
|
tf = pd.DataFrame(word_count_vector.toarray(), columns=words)
|
||||||
@ -31,13 +50,13 @@ for idx, wrd in enumerate(words):
|
|||||||
# Constants
|
# Constants
|
||||||
k = 1.5
|
k = 1.5
|
||||||
b = 0.75
|
b = 0.75
|
||||||
avgdl = mean([len(x.split()) for x in processed_documents])
|
avgdl = mean([len(x.split()) for x in tweets])
|
||||||
|
|
||||||
|
|
||||||
def OkapiBM25(query, limit=5):
|
def OkapiBM25(query, limit=5):
|
||||||
query_str = query.split()
|
query_str = query.lower().split()
|
||||||
scores = []
|
scores = []
|
||||||
for d in range(len(processed_documents)):
|
for d in range(len(tweets)):
|
||||||
s = 0
|
s = 0
|
||||||
for keyword in query_str:
|
for keyword in query_str:
|
||||||
tf = tfidf_dict.get(keyword, None)
|
tf = tfidf_dict.get(keyword, None)
|
||||||
@ -45,21 +64,23 @@ def OkapiBM25(query, limit=5):
|
|||||||
continue
|
continue
|
||||||
tf = tf['tf'][d]
|
tf = tf['tf'][d]
|
||||||
idf = tfidf_dict[keyword]['idf']
|
idf = tfidf_dict[keyword]['idf']
|
||||||
doclen = len(processed_documents[d].split())
|
doclen = len(tweets[d].split())
|
||||||
s += idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doclen / avgdl))
|
s += idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doclen / avgdl))
|
||||||
scores.append(s)
|
scores.append(s)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for i, x in enumerate(scores):
|
for i, x in enumerate(scores):
|
||||||
results.append((x, processed_documents[i])) if x else None
|
results.append({'score': x, 'content': tweets[i], 'retweets': retweets[i], 'date': dates[i]}) if x else None
|
||||||
results = sorted(results, key=lambda x: x[0], reverse=True)
|
results = sorted(results, key=lambda x: x['score'], reverse=True)
|
||||||
|
|
||||||
print('-' * 10)
|
print('-' * 10)
|
||||||
print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
|
print(f"Total results: {len(results)}; Showing {min(limit, len(results))}:")
|
||||||
print('-' * 10)
|
print('-' * 10)
|
||||||
for r in results[:limit]:
|
for r in results[:limit]:
|
||||||
print(f"Score: {r[0]}")
|
print(f"Score: {r['score']}")
|
||||||
print(r[1])
|
print(f"Date: {r['date']}")
|
||||||
|
print(f"Retweets: {r['retweets']}")
|
||||||
|
print(r['content'])
|
||||||
print('-' * 10)
|
print('-' * 10)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user