From 08abda0e74398e7fade2afc3d254d4e8ce1e3cea Mon Sep 17 00:00:00 2001 From: Vojtaz Date: Sat, 24 Apr 2021 21:08:04 +0200 Subject: [PATCH] =?UTF-8?q?zadanie-k-=C5=9Brednie?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clusters.py | 32 +++ dev-0/out.tsv | 87 ++++++ skrypt-dev-0.py | 20 ++ skrypt-test-A.py | 20 ++ test-A/out.tsv | 691 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 850 insertions(+) create mode 100644 clusters.py create mode 100644 dev-0/out.tsv create mode 100644 skrypt-dev-0.py create mode 100644 skrypt-test-A.py create mode 100644 test-A/out.tsv diff --git a/clusters.py b/clusters.py new file mode 100644 index 0000000..b6259a1 --- /dev/null +++ b/clusters.py @@ -0,0 +1,32 @@
+"""Plot KMeans inertia against k for test-A/in.tsv (elbow method)."""
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import MaxAbsScaler
+
+K = range(1, 87)  # not used by this script
+W = range(1, 683)  # cluster counts to try; KMeans needs k <= number of rows
+
+mms = MaxAbsScaler()
+
+# Only the read needs the file; close it before the long-running fits start.
+with open("test-A/in.tsv") as file:
+    corpus = file.readlines()
+
+# A bigram TfidfVectorizer used to be created here and was overwritten on the
+# very next line; this default vectorizer is the only one that ever took effect.
+vectorizer = TfidfVectorizer()
+vectors = vectorizer.fit_transform(corpus)
+transformed = mms.fit_transform(vectors)
+
+# random_state=0 (as in the skrypt-*.py scripts) makes the curve reproducible.
+sum_of_squared_distances = [
+    KMeans(n_clusters=k, random_state=0).fit(transformed).inertia_
+    for k in W
+]
+
+plt.plot(W, sum_of_squared_distances, 'bx-')
+plt.xlabel('k')
+plt.ylabel('Sum_of_squered_distances')
+plt.show()
\ No newline at end of file diff --git a/dev-0/out.tsv b/dev-0/out.tsv new file mode 100644 index 0000000..6915e4f --- /dev/null +++ b/dev-0/out.tsv @@ -0,0 +1,87 @@ +18 + 4 + 6 +12 +24 +22 +27 + 3 + 4 + 8 +26 +22 + 0 +13 +25 +15 +33 +24 +23 +24 +16 +11 + 7 +10 + 20 +11 +21 + 7 +34 +32 +27 +27 +17 + 5 +25 +17 +13 + 1 +21 +12 + 4 +30 +17 +12 +18 +29 +31 +24 + 3 +17 +20 + 6 +27 +11 +14 +10 +24 +24 +27 +28 +17 +19 +30 +24 
+22 +11 + 2 +18 + 5 +31 +16 +23 + 18 + 9 +24 + 3 +20 +14 +12 + 3 + 8 + 3 + 8 +24 + 9 + 1 +24 \ No newline at end of file diff --git a/skrypt-dev-0.py b/skrypt-dev-0.py new file mode 100644 index 0000000..7196f44 --- /dev/null +++ b/skrypt-dev-0.py @@ -0,0 +1,20 @@
+"""Cluster dev-0/in.tsv (TF-IDF + KMeans) and write one label per line."""
+from sklearn.cluster import KMeans
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import matplotlib.pyplot as plt
+
+with open("dev-0/in.tsv") as file:
+    corpus = file.readlines()
+
+vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = True)
+vectors = vectorizer.fit_transform(corpus)
+
+# fit_predict() fits the model itself; calling fit() first clustered twice.
+labels = KMeans(n_clusters=35, random_state = 0).fit_predict(vectors)
+
+# np.array2string pads and line-wraps its output (stray spaces in out.tsv) and
+# abbreviates arrays above its size threshold with "..."; emit plain integers.
+# The with-block closes the handle, which the previous bare open() never did.
+with open("dev-0/out.tsv", "w") as out:
+    out.write("\n".join(str(label) for label in labels))
\ No newline at end of file diff --git a/skrypt-test-A.py b/skrypt-test-A.py new file mode 100644 index 0000000..c8a0bab --- /dev/null +++ b/skrypt-test-A.py @@ -0,0 +1,20 @@
+"""Cluster test-A/in.tsv (TF-IDF + KMeans) and write one label per line."""
+from sklearn.cluster import KMeans
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import matplotlib.pyplot as plt
+
+with open("test-A/in.tsv") as file:
+    corpus = file.readlines()
+
+vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = True)
+vectors = vectorizer.fit_transform(corpus)
+
+# fit_predict() fits the model itself; calling fit() first clustered twice.
+labels = KMeans(n_clusters=255, random_state = 0).fit_predict(vectors)
+
+# np.array2string pads and line-wraps its output (stray spaces in out.tsv) and
+# abbreviates arrays above its size threshold with "..."; emit plain integers.
+# The with-block closes the handle, which the previous bare open() never did.
+with open("test-A/out.tsv", "w") as out:
+    out.write("\n".join(str(label) for label in labels))
\ No newline at end of file diff --git a/test-A/out.tsv b/test-A/out.tsv new file mode 100644 index 0000000..0c61e22 --- /dev/null +++ b/test-A/out.tsv @@ -0,0 +1,691 @@ + 63 +130 +223 +128 +113 +107 +227 + 27 + 70 +129 +203 + 73 +174 +160 + 17 +244 + 93 +205 + 93 + 17 +185 + 38 + 20 +117 +165 + 53 +111 + 78 +194 + 98 
+147 +254 + 20 + 97 + 68 + 58 + 73 + 77 +143 +125 + 51 + 14 +186 + 4 + 48 +204 +164 + 9 +223 + 17 +223 +155 +209 +117 + 46 + 95 +131 +156 +218 +240 +105 +166 +111 + 52 + 17 + 43 + 80 +107 +108 +139 + 35 + 68 + 140 +145 +132 +227 +252 +235 +211 + 8 +184 + 13 + 44 +136 +110 + 57 +209 +235 +228 + 51 + 119 +106 + 68 + 92 +103 +166 + 63 + 48 +193 +200 +151 + 86 + 47 + 26 +218 +172 +135 + 39 + 74 + 65 + 50 +156 + 70 +142 + 69 + 92 +177 + 33 + 79 +176 + 12 + 82 + 26 +144 +157 +143 + 247 + 15 + 84 + 9 +106 + 67 +105 + 77 +224 + 68 +127 + 8 + 44 + 29 + 89 + 59 + 12 + 29 + 53 +131 + 65 + 29 +170 + 95 + 93 +103 +177 + 17 +137 + 73 + 18 + 30 +145 + 38 + 41 +104 + 71 + 21 + 35 + 92 + 44 +220 + 68 + 68 +146 + 85 +217 + 14 +140 + 44 +243 + 83 +176 +149 + 10 + 9 + 88 +171 + 53 +138 + 82 + 22 + 27 +210 + 63 + 5 + 73 +133 +196 + 30 +209 +223 + 176 + 63 +175 +112 + 63 + 40 +129 +103 +223 + 9 +130 + 6 +106 + 91 +104 + 6 + 30 + 73 + 101 + 20 + 36 + 70 + 17 +126 +106 +193 + 23 + 67 + 41 +234 + 96 + 73 +131 + 87 + 95 +173 + 16 + 70 + 67 +117 +169 +131 + 57 + 70 + 43 + 87 +207 + 32 +102 +102 + 40 + 21 + 62 + 33 + 13 + 64 +112 + 23 + 57 +204 + 41 + 12 + 95 +101 +242 +168 +127 + 75 +131 +146 +112 + 91 + 206 +185 +210 +152 +187 +180 + 40 + 50 + 12 +215 +148 + 61 +132 + 8 + 64 + 11 +114 +156 + 161 + 20 + 74 +164 +233 + 79 +159 +102 +103 + 11 +186 + 78 + 40 + 74 + 2 +210 +126 + 83 + 211 +181 + 19 + 57 +130 + 54 + 76 +174 + 96 +146 +135 +150 +179 + 98 +166 + 44 + 38 +177 + 231 +122 +177 +132 +158 + 40 + 85 +167 +213 + 15 +182 +223 + 8 + 76 + 17 + 56 +104 +105 + 64 + 17 +137 + 12 +222 + 31 + 62 +158 + 72 + 66 + 54 +241 +129 +251 +102 +161 +162 +166 + 229 +195 + 79 +152 +102 +212 +223 +174 + 21 + 41 +130 +232 + 73 +154 +101 + 56 + 15 + 28 + 45 +251 + 53 +201 +239 +249 + 6 + 50 +124 + 73 +123 +107 +163 + 43 +111 +223 +219 +157 + 57 +113 + 18 +120 + 40 +209 + 95 + 81 + 45 + 17 +246 + 86 + 24 + 34 + 99 + 11 +101 + 8 + 15 + 61 + 21 +210 +139 +198 + 39 +114 + 72 + 20 + 41 +112 +142 +113 +154 + 17 +133 
+ 80 + 21 + 14 + 10 +170 + 14 + 57 +117 + 20 + 32 +223 + 99 +191 + 93 + 33 +244 + 10 +194 + 40 + 7 + 14 +185 + 75 + 25 + 77 +253 + 21 +175 +210 + 79 +144 +237 +226 +125 +103 +142 +225 + 192 + 51 +122 +118 + 51 +134 +215 + 40 +112 +104 +208 + 36 + 41 + 23 + 99 +245 +178 +171 + 182 +102 +112 + 19 +123 +223 + 9 +197 + 84 + 77 + 64 + 29 +110 + 7 +230 + 1 + 57 + 77 + 71 + 76 + 42 + 76 + 86 + 48 + 16 +119 + 20 + 2 +169 + 94 +149 + 93 + 2 + 9 + 4 +135 + 41 + 49 +228 +216 +183 +199 +236 +174 + 10 +214 + 51 + 97 +100 + 34 +184 +238 + 61 +172 + 142 + 29 +113 + 25 + 81 + 16 + 0 +254 + 39 + 64 +215 +175 + 37 + 78 + 67 + 13 + 39 + 57 + 68 + 43 + 16 +223 +132 + 27 +141 +161 + 77 + 85 + 27 + 3 +223 +223 +135 + 58 +211 + 31 + 64 +162 + 31 +136 + 95 + 87 +189 + 16 +169 + 57 + 13 + 95 +202 +102 + 33 + 69 + 72 +100 + 42 +250 + 63 +105 + 24 +192 +223 + 17 + 61 + 17 + 95 +132 + 90 +109 + 25 +160 + 73 + 86 + 63 + 26 + 73 + 5 + 7 + 21 + 2 + 19 +116 + 38 +188 +142 +105 + 22 +195 +135 + 47 +153 + 94 + 21 + 60 +115 + 77 + 39 + 44 + 80 + 36 +121 + 17 + 57 +240 + 95 +229 +220 +100 + 45 + 78 + 58 +210 +171 + 13 + 31 + 70 +126 + 55 +112 + 31 +221 +132 + 16 +190 + 65 + 77 +248 + 81 + 14 + 53 +150 + 86 +103 +165 + 41 +118 +199 +183 +133 + 79 + 20 + 32 + 3 +202 + 41 + 254 +223 + 17 + 26 + 63 +134 + 41 \ No newline at end of file