# polish-urban-legends-public/script.py

# 33 lines
# 1.2 KiB
# Python
# Raw Normal View History

# 2021-04-26 23:38:04 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
def openFileWithStopwords(filename):
    """Load a stop-word list from a UTF-8 text file.

    Every line of the file becomes one list entry with surrounding
    whitespace (including the trailing newline) stripped. Blank lines are
    kept and show up as empty strings.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of stripped lines, in file order.
    """
    with open(filename, encoding='utf-8') as handle:
        # Iterating the handle yields the same lines as readlines().
        return [line.strip() for line in handle]
def calculate(clusters, maxiter, tfidfVectorizer, stopwords, fileIn, fileOut):
    """Cluster the lines of ``fileIn`` with k-means and write the labels.

    Each line of ``fileIn`` is treated as one document: it is split on
    whitespace, every token that appears in ``stopwords`` is dropped (exact,
    case-sensitive match) and the remaining tokens are re-joined with single
    spaces. The cleaned documents are vectorized with
    ``tfidfVectorizer.fit_transform`` and clustered with ``KMeans``; the
    predicted label of every document is written to ``fileOut``, one per
    line.

    Args:
        clusters: Number of clusters, passed to ``KMeans`` as ``n_clusters``.
        maxiter: Iteration limit, passed to ``KMeans`` as ``max_iter``.
        tfidfVectorizer: Object exposing ``fit_transform(documents)``; it is
            re-fitted on the documents of ``fileIn`` on every call.
        stopwords: Iterable of tokens to remove from the documents.
        fileIn: Path of the UTF-8 input file, one document per line.
        fileOut: Path of the UTF-8 output file; overwritten if it exists.
    """
    # Membership is tested once per token of every document, so build a set
    # up front instead of scanning the stop-word list each time.
    stopword_set = set(stopwords)

    with open(fileIn, encoding='utf-8') as infile:
        documents = [
            " ".join(token for token in line.split() if token not in stopword_set)
            for line in infile
        ]

    features = tfidfVectorizer.fit_transform(documents)
    # NOTE(review): no random_state is passed to KMeans, so labels are
    # presumably not reproducible between runs -- confirm whether that matters.
    labels = KMeans(n_clusters=clusters, max_iter=maxiter).fit_predict(features)

    with open(fileOut, "w", encoding='utf-8') as output:
        output.writelines(str(label) + '\n' for label in labels)
def main():
    """Cluster the dev-0 and test-A inputs and write their label files.

    Uses 20 clusters, at most 1000 k-means iterations, the stop words from
    ``stopwords.txt`` and a single ``TfidfVectorizer`` instance that is
    passed to both ``calculate`` calls.
    """
    vectorizer = TfidfVectorizer()
    clusters, maxiter = 20, 1000
    stopwords = openFileWithStopwords('stopwords.txt')

    datasets = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source, target in datasets:
        calculate(clusters, maxiter, vectorizer, stopwords, source, target)
# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()