polish-urban-legends-public/solution.py

43 lines
1.6 KiB
Python
Raw Permalink Normal View History

2021-04-26 15:39:52 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
def calcDevZero(tfidfVectorizer, stopwords,
                in_path="dev-0/in.tsv", out_path="dev-0/out.tsv",
                n_clusters=20):
    """Cluster the dev-0 documents with KMeans over TF-IDF features.

    Reads one document per line from *in_path*, drops every token found in
    *stopwords*, fits *tfidfVectorizer* on the filtered documents, clusters
    the resulting vectors into *n_clusters* groups, and writes one cluster
    label per line to *out_path*.

    Args:
        tfidfVectorizer: a (possibly unfitted) sklearn TfidfVectorizer.
        stopwords: iterable of tokens to remove before vectorizing.
        in_path: input TSV, one document per line (default: dev-0 set).
        out_path: output file receiving one integer label per line.
        n_clusters: number of KMeans clusters (default 20, as in the task).
    """
    # Set membership is O(1) per token; the original list scan was O(n).
    stopset = set(stopwords)
    with open(in_path, encoding='utf-8') as input_file:
        docs = [" ".join(tok for tok in line.split() if tok not in stopset)
                for line in input_file]
    vectors = tfidfVectorizer.fit_transform(docs)
    # random_state pins centroid initialisation so repeated runs are
    # reproducible; without it each run produced different labelings.
    predictions = KMeans(n_clusters=n_clusters, max_iter=1000,
                         random_state=0).fit_predict(vectors)
    with open(out_path, "w", encoding='utf-8') as output_file:
        output_file.writelines(f"{label}\n" for label in predictions)
def calcTestA(tfidfVectorizer, stopwords,
              in_path="test-A/in.tsv", out_path="test-A/out.tsv",
              n_clusters=20):
    """Cluster the test-A documents with KMeans over TF-IDF features.

    Reads one document per line from *in_path*, drops every token found in
    *stopwords*, fits *tfidfVectorizer* on the filtered documents, clusters
    the resulting vectors into *n_clusters* groups, and writes one cluster
    label per line to *out_path*.

    Args:
        tfidfVectorizer: a (possibly unfitted) sklearn TfidfVectorizer.
        stopwords: iterable of tokens to remove before vectorizing.
        in_path: input TSV, one document per line (default: test-A set).
        out_path: output file receiving one integer label per line.
        n_clusters: number of KMeans clusters (default 20, as in the task).
    """
    # Set membership is O(1) per token; the original list scan was O(n).
    stopset = set(stopwords)
    with open(in_path, encoding='utf-8') as input_file:
        docs = [" ".join(tok for tok in line.split() if tok not in stopset)
                for line in input_file]
    vectors = tfidfVectorizer.fit_transform(docs)
    # random_state pins centroid initialisation so repeated runs are
    # reproducible; without it each run produced different labelings.
    predictions = KMeans(n_clusters=n_clusters, max_iter=1000,
                         random_state=0).fit_predict(vectors)
    with open(out_path, "w", encoding='utf-8') as output_file:
        output_file.writelines(f"{label}\n" for label in predictions)
def setStopWords(filename):
    """Return the stopword list read from *filename* (one word per line).

    Each line is stripped of surrounding whitespace (including the
    trailing newline) before being added to the returned list.
    """
    with open(filename, encoding='utf-8') as handle:
        return [line.strip() for line in handle]
def main():
    """Entry point: vectorize, then cluster both the dev-0 and test-A sets.

    Builds a fresh TF-IDF vectorizer, loads the stopword list from
    ``stopwords.txt``, and runs the clustering pipeline for each split.
    """
    vectorizer = TfidfVectorizer()
    stop_list = setStopWords('stopwords.txt')
    calcDevZero(vectorizer, stop_list)
    calcTestA(vectorizer, stop_list)


if __name__ == '__main__':
    main()