Baseline: order candidates by TF-IDF cosine similarity to the query
This commit is contained in:
parent
f2ab48d3ca
commit
98b0b55131
22
baseline-tfidf.py
Normal file
22
baseline-tfidf.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import gzip
|
||||||
|
import random
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
random.seed(42)
|
||||||
|
|
||||||
|
def _rank_candidates(query, candidates):
    """Return *candidates* ordered by descending cosine similarity to *query*.

    The vectorizer is fitted on the query alone, so the vocabulary is limited
    to the query's tokens and, with a single fitted document, every IDF
    weight is the same; the score is effectively a normalised term-frequency
    overlap. Candidates with equal scores keep their input order because
    ``sorted`` is stable.

    Args:
        query: Query text; any ``[SEP]`` marker is replaced by a space
            before vectorizing.
        candidates: List of candidate strings to rank.

    Returns:
        A new list with the same candidate strings, best match first.
    """
    if not candidates:
        # Nothing to rank; avoids handing an empty matrix to cosine_similarity.
        return []

    vectorizer = TfidfVectorizer()
    try:
        query_v = vectorizer.fit_transform([query.replace('[SEP]', ' ')])
    except ValueError:
        # The query produced no tokens (empty vocabulary), so no candidate
        # can be scored; fall back to the input order instead of crashing.
        return list(candidates)

    candidates_v = vectorizer.transform(candidates)
    similarities = cosine_similarity(query_v, candidates_v)[0]
    ranked = sorted(zip(candidates, similarities), key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in ranked]


# Each input row is "<query>\t<candidate>\t<candidate>..."; the matching output
# row holds the same candidates, tab-separated, re-ordered by similarity.
for dataset in 'dev-0', 'test-A':
    # Pin the encoding so results do not depend on the platform's locale.
    with gzip.open(f'{dataset}/in.tsv.gz', 'rt', encoding='utf-8') as f_in, \
            open(f'{dataset}/out.tsv', 'w', encoding='utf-8') as f_out:
        for line in f_in:
            query, *candidates = line.rstrip('\n').split('\t')
            f_out.write('\t'.join(_rank_candidates(query, candidates)) + '\n')
|
8696
dev-0/out.tsv
8696
dev-0/out.tsv
File diff suppressed because one or more lines are too long
8316
test-A/out.tsv
8316
test-A/out.tsv
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user