# File-viewer header (scrape residue): 33 lines, 1.2 KiB, Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.cluster import KMeans
|
|
|
|
|
|
def openFileWithStopwords(filename):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Every line is stripped of leading and trailing whitespace (including the
    line terminator). Blank lines are kept and show up as empty strings.

    Args:
        filename: Path of the stop-word file.

    Returns:
        A list of the stripped lines, in file order.
    """
    with open(filename, encoding='utf-8') as handle:
        # Materialise inside the ``with`` so the file is still open while read.
        return list(map(str.strip, handle))
|
|
|
|
|
|
def calculate(clusters, maxiter, tfidfVectorizer, stopwords, fileIn, fileOut):
    """Cluster the lines of ``fileIn`` with k-means and write labels to ``fileOut``.

    Each input line is one document. A line is split on whitespace, tokens
    that appear in ``stopwords`` are dropped (exact, case-sensitive match) and
    the remaining tokens are re-joined with single spaces. The cleaned
    documents are vectorized with ``tfidfVectorizer.fit_transform`` and
    clustered with ``KMeans(...).fit_predict``.

    Args:
        clusters: Forwarded to ``KMeans`` as ``n_clusters``.
        maxiter: Forwarded to ``KMeans`` as ``max_iter``.
        tfidfVectorizer: Object exposing ``fit_transform(documents)``; it is
            re-fitted on every call.
        stopwords: Iterable of tokens to remove from each document.
        fileIn: Path of the UTF-8 input file, one document per line.
        fileOut: Path of the UTF-8 output file. It is overwritten and receives
            one cluster label per line, in input order.

    Returns:
        None. The result is written to ``fileOut``.
    """
    # Build the lookup once: testing membership in a list is a linear scan,
    # repeated for every token of every document; a set makes it constant time.
    stopword_set = frozenset(stopwords)

    with open(fileIn, encoding='utf-8') as infile:
        documents = [
            " ".join(token for token in line.split() if token not in stopword_set)
            for line in infile
        ]

    features = tfidfVectorizer.fit_transform(documents)
    # NOTE(review): no random_state is passed, so labels may differ between
    # runs -- confirm whether reproducible output is required.
    labels = KMeans(n_clusters=clusters, max_iter=maxiter).fit_predict(features)

    with open(fileOut, "w", encoding='utf-8') as output:
        output.writelines(str(label) + '\n' for label in labels)
|
|
|
|
|
|
def main():
    """Cluster the ``dev-0`` and ``test-A`` inputs using shared settings.

    One vectorizer instance, one stop-word list (read from ``stopwords.txt``)
    and the same k-means parameters (20 clusters, at most 1000 iterations)
    are used for both data sets.
    """
    vectorizer = TfidfVectorizer()
    n_clusters, iteration_cap = 20, 1000
    stopwords = openFileWithStopwords('stopwords.txt')

    # (input path, output path) pairs, processed in this order.
    jobs = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, target_path in jobs:
        calculate(n_clusters, iteration_cap, vectorizer, stopwords, source_path, target_path)
|
|
|
|
|
|
# Run the clustering jobs only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|