diff --git a/dev-0/out.tsv b/dev-0/out.tsv index 311a453..26c3a88 100644 --- a/dev-0/out.tsv +++ b/dev-0/out.tsv @@ -1,87 +1,87 @@ -14 -0 -4 -7 -2 -5 -22 -12 -5 -4 -15 -5 -0 -2 -8 -8 -8 -2 -9 -23 -24 -6 -13 -10 -15 -6 -11 -13 -23 -0 -22 -22 -3 -20 -8 -3 -20 -14 -11 -7 -5 18 +11 +10 +12 +22 +5 +19 +8 +23 1 +8 +5 +11 +16 +9 +9 +13 +22 7 16 -21 -22 -0 -22 -3 -15 -4 -22 -6 -17 10 0 -0 -22 -20 -3 -0 -1 -21 -5 6 -22 -16 +15 +8 +0 +14 +6 +24 +11 19 -22 2 +2 +21 9 -16 -1 -0 +2 +11 +18 +14 12 -15 +23 +3 +2 17 -7 +3 +4 +2 +11 +14 +2 +8 +10 +2 +0 +3 +15 +11 +11 +19 +21 +2 +11 2 4 -15 -4 +5 +21 2 3 -6 +10 +2 +22 7 +3 +19 +11 +23 +8 +3 +17 +20 +1 +8 +1 +16 +2 +21 +22 diff --git a/solution.py b/solution.py new file mode 100644 index 0000000..2730fd1 --- /dev/null +++ b/solution.py @@ -0,0 +1,37 @@ +from sklearn.feature_extraction.text import TfidfVectorizer +import numpy as np +import sklearn.metrics +from sklearn.cluster import KMeans + + +def preprocess(document, stopwords): + return " ".join([word for word in document.split() if word not in stopwords]) + +def predict(in_file, out_file, stopwords): + + with open(in_file) as in_file: + documents = [preprocess(document, stopwords) + for document in in_file.readlines()] + + vectorizer = TfidfVectorizer() + + document_vectors = vectorizer.fit_transform(documents) + predictions = KMeans( + n_clusters=25, max_iter=1000).fit_predict(document_vectors) + + with open(out_file, "w") as out_file: + for prediction in predictions: + out_file.write(str(prediction) + '\n') + + +def main(): + with open('stopwords.txt') as stopwords_file: + stopwords = [stopword.strip() + for stopword in stopwords_file.readlines()] + + predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords) + predict("test-A/in.tsv", "test-A/out.tsv", stopwords) + + +if __name__ == '__main__': + main() diff --git a/test-A/out.tsv b/test-A/out.tsv index fe8b5ad..57a04f5 100644 --- a/test-A/out.tsv +++ b/test-A/out.tsv @@ -1,691 +1,691 @@ -18 -3 -1 -5 -3 -11 4 -6 -18 -3 -14 -17 -6 -20 -22 -3 -9 -5 -9 -22 -13 -0 -11 -20 -20 -18 -5 4 -2 -18 -10 -6 -11 -1 -19 24 -17 -4 -1 -6 14 -4 -9 -6 -23 -23 -15 -21 -1 +3 22 -1 +14 +5 4 -15 -20 -13 -10 3 -1 -18 -18 -15 -9 -15 -9 -13 -1 -20 -11 -10 -3 -10 -19 -9 -4 -18 -4 -10 -18 -20 -13 -13 +12 +23 7 -8 -13 -3 -17 -18 -14 -3 -18 -20 -10 19 -3 -9 -16 -18 -18 -18 -12 -12 -0 -9 -4 -2 -18 -13 -2 -13 -24 -6 -1 -22 -1 -17 -3 -3 -24 -18 -17 -21 -6 -4 -10 -3 -1 -9 -10 -6 -18 -10 -19 -15 -10 -18 -19 -5 -14 -8 -5 -4 -17 -21 -5 11 -5 -15 -5 -15 -10 9 -9 -3 -22 -20 -17 -9 -6 -10 -0 -12 -5 -15 -21 -10 -15 -8 -20 -19 -19 -24 23 -16 -4 -9 -8 10 -15 -10 -24 +23 +11 +23 7 -21 -18 -17 -19 -18 -6 -8 -6 -13 -18 -11 -17 -3 -18 -19 -5 -1 -18 -18 -0 -16 -18 -23 -9 -9 -1 -21 -3 -18 -2 -17 -18 -15 -14 -17 -6 -11 -2 -20 22 -15 -10 -12 -8 -19 -12 -13 -8 -17 -5 18 -10 -13 -11 +6 0 -19 -20 -16 -5 -18 -22 -1 -14 -18 -18 -13 -13 -5 -21 -18 -24 +6 +4 7 -6 +5 +14 +7 +22 +24 +2 +10 +23 +21 +24 +4 +10 +17 +23 +7 +12 +19 16 8 -6 -20 -12 -21 -10 -6 -9 -15 -5 -2 -5 24 +11 +24 +3 16 18 -5 -18 -10 -18 -4 -4 -10 -6 +13 21 -9 -4 -10 -23 -24 -6 -5 -5 -1 -9 11 -18 -18 -20 24 -22 -13 -9 -5 -9 -23 -17 -18 4 -13 -13 -15 -15 -5 -3 -18 -3 -18 -3 +4 6 -8 -24 -13 -18 -18 -18 -20 -8 -0 +9 +9 3 -5 -15 -3 -23 -18 -20 -18 -14 -15 -10 -10 -1 -24 -3 -14 -18 -2 -15 -6 -22 -20 -21 -18 -17 -10 -22 -18 11 -10 -2 -9 +24 18 -13 -9 -18 -9 -23 -4 -17 -3 -13 -18 -1 -6 -21 -12 -3 -4 -17 -17 -6 -2 -10 -9 -20 -3 +22 +16 19 13 -4 -18 -18 -6 -0 -17 -20 -11 -18 -1 -15 -1 -18 -2 -18 -3 -9 -2 -18 -18 -10 -6 -20 -22 -18 -6 -15 -6 -18 -5 -6 -18 -10 -13 -21 -13 -5 -15 -18 -18 -18 -11 -20 -16 -1 -3 -16 -22 -14 -20 -21 -18 -7 -15 -4 -6 -20 -11 -18 -1 -20 2 9 -24 -3 -7 -2 -17 -16 +23 4 -18 +23 +4 +23 6 -18 -10 -17 -21 -15 -13 -16 -17 4 -24 -6 -9 -1 -18 -4 -14 -15 -14 -18 -18 -9 -14 -16 -10 19 -2 -12 -8 -4 -24 -4 -22 -15 -13 -16 -3 -18 -1 -21 20 +1 +10 +3 6 -18 -6 +23 5 -3 -21 -18 -1 -24 -18 -14 -3 -0 -18 6 4 -11 -20 18 -24 -16 -14 -24 +13 +2 +3 +9 9 -17 -21 -6 -3 -12 -18 -3 4 -17 +12 +4 +10 +12 +7 +23 8 -6 +4 +14 +0 +4 +0 6 7 -18 -14 1 11 -0 +24 23 -22 -13 -14 -1 -5 +3 9 -18 -2 -11 -3 -6 -13 -6 -3 -20 -16 -18 -19 -7 -18 -18 -19 -1 -11 -1 -23 -6 -18 -18 10 -18 -6 -19 -1 -1 -13 -24 -16 -17 -6 -18 -17 -18 -10 -18 -18 -11 -16 -18 -7 -10 -18 -13 -24 -18 -20 -4 -0 -2 -18 -15 -15 -4 -1 -22 -13 -22 -10 -23 -18 -18 -18 +21 +21 +8 5 -17 6 -18 -4 -17 -4 -18 +16 +6 +24 +23 +21 +5 +21 +13 +2 +21 21 4 -3 -10 -0 -13 +2 +15 +4 1 15 +4 +10 8 -23 -13 -9 -18 -17 +15 +0 +16 +6 +15 +5 21 -14 -3 -18 -18 -8 -20 -2 -18 -22 +9 +9 +4 +11 18 5 +23 +3 +9 +7 +17 +1 +14 +8 +13 +3 +4 +17 +2 +2 +10 +19 +18 +23 +9 +1 +3 +19 +21 +10 +20 +10 +21 +21 +0 +4 +5 +4 +5 +13 +4 +0 +23 +14 +5 +3 +14 +24 +10 +4 +6 +4 +4 +19 +9 +9 +24 +8 +1 +4 +7 +4 +11 +6 +4 +16 +5 +22 +11 +12 +23 +16 +13 +6 +11 +2 +17 +0 +4 +10 +16 +4 +21 +5 +22 +12 +2 +18 +0 +21 +4 +11 +24 +18 +4 +4 +13 +13 +6 +8 +16 +10 +20 +7 +4 +11 +7 +12 +17 +8 +21 +7 +23 +11 +15 +5 +14 +6 +4 +10 +21 +4 +13 +4 +22 +23 +19 +7 +8 +17 +11 +13 +19 +5 +7 +4 +21 +24 +9 +22 +4 +16 +0 +6 +10 +13 +23 +4 +9 +6 +18 +0 +11 +13 +14 +19 10 6 -20 +6 +4 +9 +4 +1 +23 +17 +10 13 -20 -20 +4 +4 +4 +6 +1 +7 +6 +18 +19 +6 +4 +21 +19 +23 +4 +10 +21 +14 24 10 +1 +11 +4 +11 +21 +7 +11 +18 +8 +3 +23 +16 +16 +4 +0 +13 +7 +23 +19 +13 +3 +4 +9 +4 +4 +9 +11 +13 +12 +24 +7 +8 +17 +23 +10 +23 +19 +5 +6 +21 +9 +17 +11 +11 +19 +4 +12 +4 +7 +12 +23 +19 +22 +9 +24 +4 +24 +23 +11 +4 +3 +23 +21 +19 +14 +21 +5 +18 +11 +4 +5 +13 +7 +12 +4 +5 +16 +21 +13 +8 +13 +15 +6 +4 +5 +21 +22 +17 +4 +24 +3 +10 +11 +4 +18 +8 +4 +20 +6 +23 +5 +18 +22 +11 +24 +21 +17 +23 +10 +23 +20 +7 +19 +4 +10 +4 +5 +19 +21 +4 +8 +6 +13 +19 +16 +4 +6 +4 +9 +24 +21 +8 +4 +10 +4 +4 +4 +6 +19 +6 +19 +2 +11 +17 +11 +9 +16 +4 +12 +14 +13 +5 +6 +4 +24 +8 +4 +7 +12 +7 +15 +3 +8 +12 +24 +6 +21 +14 +1 +7 +4 +7 +12 +22 +17 +4 +6 +6 +10 +10 +23 +4 +8 +7 +21 +17 +4 +3 +23 +5 +10 4 7 -17 -18 -10 -9 +20 +4 +6 +24 +0 +7 +23 +11 +13 16 +4 +15 +3 +8 +7 +22 +4 +7 +19 +7 +1 +3 +4 +23 +2 +20 +4 +4 +2 +24 +22 +24 +4 +5 +9 +4 +21 +4 +5 +2 +24 +24 +13 +10 +21 +23 +7 +4 +23 +6 +21 +12 +4 +22 +4 +4 +20 +21 +3 +13 +10 +4 +5 +0 +7 +6 +4 +13 +4 +4 +24 +11 +13 +11 +21 +4 +12 +4 +19 +19 +23 +7 +4 +8 +23 +0 +8 +8 +6 +6 +19 +7 +13 +24 +13 +9 +12 +19 +9 +19 +23 +8 +4 +5 +19 +4 +4 +18 +11 +3 +11 +4 +4 +21 +5 17 +0 +18 +12 +10 +13 +12 20 23 11 -14 -24 -3 -6 -6 +16 +23 +5 +23 4 -11 -15 -6 -9 -18 -12 -14 -8 -3 -4 -18 -11 -18 -19 -11 -12 -6 -1 +5 22 0 -18 +6 4 -12 +7 +5 +4 +0 +0 +7 +23 +14 +17 +4 +4 +6 +3 +21 +22 +6 +2 +22 +17 +5 +24 +11 +4 +4 +18 +17