#!/usr/bin/env python
# coding: utf-8
"""Cluster the documents of each dataset with TF-IDF features and KMeans.

For every dataset directory listed in ``DATASETS`` the script reads
``in.tsv`` (one document per line), drops commas and Polish stop words,
clusters the documents and writes one cluster id per line to ``out.tsv``
in the same directory.  The predicted labels are also printed.
"""

import string  # not used by the script itself; kept from the original import list

# Directories that each hold an ``in.tsv`` to cluster and receive an ``out.tsv``.
DATASETS = ("dev-0", "test-A")
N_CLUSTERS = 25
MAX_ITER = 1000


def preprocess_lines(lines, stop_words):
    """Return one cleaned document string per input line.

    Each line is split on whitespace, commas are removed from every token,
    and tokens found in *stop_words* are dropped.  Every kept token is
    prefixed with a single space, so a non-empty result starts with a space
    (the vectorizer ignores the extra whitespace).

    Args:
        lines: Iterable of raw text lines.
        stop_words: Container of tokens to discard (a set is fastest).

    Returns:
        A list of cleaned strings, in the same order as *lines*.
    """
    cleaned = []
    for line in lines:
        tokens = (token.replace(",", "") for token in line.split())
        cleaned.append(
            "".join(" " + token for token in tokens if token not in stop_words)
        )
    return cleaned


def read_documents(path, stop_words):
    """Read the UTF-8 file at *path* and return its preprocessed lines."""
    with open(path, "r", encoding="utf-8") as handle:
        return preprocess_lines(handle, stop_words)


def cluster_documents(documents, n_clusters=N_CLUSTERS, max_iter=MAX_ITER):
    """Vectorize *documents* with TF-IDF and return their KMeans labels.

    Args:
        documents: List of document strings.
        n_clusters: Number of clusters passed to ``KMeans``.
        max_iter: Iteration limit passed to ``KMeans``.

    Returns:
        The array of cluster ids produced by ``KMeans.fit_predict``.
    """
    # Imported lazily so the text helpers above stay usable (and importable)
    # on machines without scikit-learn.
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    features = TfidfVectorizer().fit_transform(documents)
    return KMeans(n_clusters=n_clusters, max_iter=max_iter).fit_predict(features)


def write_labels(path, labels):
    """Write every label in *labels* to *path*, one per line."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(str(label) + "\n" for label in labels)


def cluster_file(in_path, out_path, stop_words):
    """Cluster the documents in *in_path* and store the labels in *out_path*.

    The output file is opened only after clustering has succeeded, so a
    failure while reading or clustering does not truncate an existing
    result file.

    Returns:
        The predicted labels, which are also printed.
    """
    documents = read_documents(in_path, stop_words)
    labels = cluster_documents(documents)
    print(labels)
    write_labels(out_path, labels)
    return labels


def main():
    """Cluster every dataset in ``DATASETS`` from its own ``in.tsv``."""
    from stop_words import get_stop_words

    # A frozenset gives O(1) membership tests for the per-token filter.
    polish_stop_words = frozenset(get_stop_words("polish"))
    for dataset in DATASETS:
        cluster_file(
            dataset + "/in.tsv",
            dataset + "/out.tsv",
            polish_stop_words,
        )


if __name__ == "__main__":
    main()
index 0000000..6bd13c5 --- /dev/null +++ b/dev-0/out.tsv @@ -0,0 +1,87 @@ +7 +0 +1 +6 +22 +20 +3 +4 +8 +5 +4 +20 +0 +7 +21 +11 +7 +22 +12 +17 +4 +24 +9 +2 +18 +15 +10 +9 +24 +0 +3 +3 +3 +7 +21 +23 +7 +7 +10 +6 +8 +14 +3 +6 +14 +7 +23 +0 +16 +3 +18 +1 +3 +15 +13 +2 +0 +0 +3 +19 +3 +7 +3 +0 +20 +19 +3 +14 +7 +23 +22 +12 +14 +3 +0 +4 +18 +13 +6 +7 +5 +4 +5 +17 +3 +15 +0 diff --git a/test-A/out.tsv b/test-A/out.tsv new file mode 100644 index 0000000..be5cc6f --- /dev/null +++ b/test-A/out.tsv @@ -0,0 +1,87 @@ +9 +0 +2 +16 +10 +18 +13 +23 +18 +3 +6 +18 +0 +9 +12 +12 +21 +10 +14 +0 +11 +24 +20 +22 +23 +4 +15 +20 +20 +0 +7 +7 +7 +8 +12 +1 +8 +9 +15 +16 +18 +17 +13 +19 +17 +1 +1 +0 +18 +7 +23 +2 +7 +4 +5 +22 +0 +0 +7 +24 +1 +21 +13 +0 +18 +24 +7 +17 +8 +1 +10 +14 +17 +7 +0 +0 +23 +5 +19 +0 +3 +6 +3 +10 +7 +9 +0