polish-urban-legends-public/solution-testA.py

33 lines
979 B
Python
Raw Normal View History

2021-04-15 18:16:41 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
def preprocess(document, stopwords):
    """Return *document* with every stopword removed.

    The document is split on runs of whitespace, tokens that appear in
    *stopwords* are dropped, and the surviving tokens are re-joined with
    single spaces (so leading/trailing whitespace and newlines disappear).
    Matching is exact and case-sensitive.

    Args:
        document: Text to filter.
        stopwords: Collection of tokens to drop.

    Returns:
        The filtered text as one space-separated string.
    """
    # Membership in a list/tuple is a linear scan per token; hash it once
    # per call instead. Any other container (e.g. a set) is used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return " ".join([word for word in document.split() if word not in stopwords])
2021-04-15 18:16:41 +02:00
def main():
    """Cluster the test-A documents and write one cluster id per line.

    Reads stopwords from ``stopwords.txt`` (one per line) and documents from
    ``test-A/in.tsv`` (one per line), removes stopwords from each document,
    vectorizes the result with TF-IDF, runs K-means with 25 clusters, and
    writes the predicted cluster index of every document, in input order,
    to ``test-A/out.tsv``.
    """
    # Pass the encoding explicitly so decoding does not depend on the
    # platform's locale default.
    # NOTE(review): assumes the data files are UTF-8 -- confirm.
    with open('stopwords.txt', encoding='utf-8') as stopwords_file:
        # A set gives constant-time membership tests in preprocess().
        stopwords = {stopword.strip() for stopword in stopwords_file}

    with open("test-A/in.tsv", encoding='utf-8') as in_file:
        documents = [preprocess(document, stopwords) for document in in_file]

    vectorizer = TfidfVectorizer()
    document_vectors = vectorizer.fit_transform(documents)
    # NOTE(review): no random_state is passed, so cluster assignments are
    # presumably not reproducible between runs -- confirm this is intended.
    predictions = KMeans(
        n_clusters=25, max_iter=1000).fit_predict(document_vectors)

    with open("test-A/out.tsv", "w", encoding='utf-8') as out_file:
        out_file.writelines(
            str(prediction) + '\n' for prediction in predictions)
# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()