43 lines
1.6 KiB
Python
43 lines
1.6 KiB
Python
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
from sklearn.cluster import KMeans
|
||
|
|
||
|
|
||
|
def calcDevZero(tfidfVectorizer, stopwords):
    """Cluster the dev-0 documents into 20 groups and write one label per line.

    Reads "dev-0/in.tsv" (one document per line), strips stop words from each
    document, fits *tfidfVectorizer* on the cleaned corpus, clusters the
    resulting vectors with KMeans (20 clusters), and writes each predicted
    cluster id to "dev-0/out.tsv", one id per line.

    Args:
        tfidfVectorizer: sklearn-style vectorizer exposing ``fit_transform``.
        stopwords: iterable of words to remove from every document.
    """
    # Membership tests on a set are O(1); against the original list they were
    # O(len(stopwords)) per word across the whole corpus.
    stopword_set = set(stopwords)
    with open("dev-0/in.tsv", encoding='utf-8') as input_file:
        # Iterate the file object directly; readlines() would build a
        # throwaway list of the entire file first.
        docs = [" ".join(word for word in document.split()
                         if word not in stopword_set)
                for document in input_file]
    vectors = tfidfVectorizer.fit_transform(docs)
    predictions = KMeans(n_clusters=20, max_iter=1000).fit_predict(vectors)
    with open("dev-0/out.tsv", "w", encoding='utf-8') as output_file:
        # One batched writelines call instead of many tiny write calls.
        output_file.writelines(str(prediction) + '\n'
                               for prediction in predictions)
|
def calcTestA(tfidfVectorizer, stopwords):
    """Cluster the test-A documents into 20 groups and write one label per line.

    Reads "test-A/in.tsv" (one document per line), strips stop words from each
    document, fits *tfidfVectorizer* on the cleaned corpus, clusters the
    resulting vectors with KMeans (20 clusters), and writes each predicted
    cluster id to "test-A/out.tsv", one id per line.

    Args:
        tfidfVectorizer: sklearn-style vectorizer exposing ``fit_transform``.
        stopwords: iterable of words to remove from every document.
    """
    # Membership tests on a set are O(1); against the original list they were
    # O(len(stopwords)) per word across the whole corpus.
    stopword_set = set(stopwords)
    with open("test-A/in.tsv", encoding='utf-8') as input_file:
        # Iterate the file object directly; readlines() would build a
        # throwaway list of the entire file first.
        docs = [" ".join(word for word in document.split()
                         if word not in stopword_set)
                for document in input_file]
    vectors = tfidfVectorizer.fit_transform(docs)
    predictions = KMeans(n_clusters=20, max_iter=1000).fit_predict(vectors)
    with open("test-A/out.tsv", "w", encoding='utf-8') as output_file:
        # One batched writelines call instead of many tiny write calls.
        output_file.writelines(str(prediction) + '\n'
                               for prediction in predictions)
|
def setStopWords(filename):
    """Return the stop words read from *filename*, one word per line.

    Leading/trailing whitespace (including the newline) is stripped from
    each line.

    Args:
        filename: path to a UTF-8 text file containing one stop word per line.

    Returns:
        list[str]: the stripped stop words, in file order.
    """
    # The original pre-initialized ``stopwords = []`` (dead assignment) and
    # called readlines(); iterating the file directly avoids both.
    with open(filename, encoding='utf-8') as stopwords_file:
        return [word.strip() for word in stopwords_file]
|
def main():
    """Run the full pipeline: build a TF-IDF vectorizer and the stop-word
    list, then emit cluster predictions for the dev-0 and test-A data sets."""
    vectorizer = TfidfVectorizer()
    stop_words = setStopWords('stopwords.txt')
    # Both data sets go through the same vectorizer + stop-word list.
    for process in (calcDevZero, calcTestA):
        process(vectorizer, stop_words)
|
# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()