4.5 KiB
4.5 KiB
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
# FUNCTIONS
def inertia_list(all_doc, doc_vectors=None):
    """Return the K-means inertia for every candidate number of clusters.

    The upper bound on the cluster count starts at half the number of
    documents and is halved repeatedly until it is at most 100. K-means is
    then fitted for each k in ``1 .. bound - 1``.

    Args:
        all_doc: Sequence of raw text documents.
        doc_vectors: Optional precomputed feature matrix for ``all_doc``.
            When omitted, TF-IDF vectors are built from ``all_doc`` here, so
            the function no longer depends on a module-level ``doc_vectors``
            variable being defined (and matching ``all_doc``) at call time.

    Returns:
        A list of ``inertia_`` values; the element at index ``i`` belongs to
        ``k = i + 1``. The list is empty when ``all_doc`` holds fewer than
        four documents.
    """
    K_max = len(all_doc) // 2
    while K_max > 100:
        K_max //= 2
    candidate_ks = range(1, K_max)

    # Vectorize only when there is something to fit, so a tiny or empty
    # corpus still yields an empty list instead of a vectorizer error.
    if doc_vectors is None and candidate_ks:
        doc_vectors = TfidfVectorizer().fit_transform(all_doc)

    return [
        KMeans(n_clusters=k).fit(doc_vectors).inertia_
        for k in candidate_ks
    ]
def BestK(list_inter):
    """Pick an index into a list of inertia values.

    Returns the last index ``i`` for which the integer parts of
    ``list_inter[i]`` and ``list_inter[i + 1]`` are equal. When no
    neighbouring pair matches, the index of the final element
    (``len(list_inter) - 1``) is returned instead.
    """
    last_index = len(list_inter) - 1
    # Indices whose value and successor truncate to the same integer.
    flat_spots = [
        idx
        for idx in range(last_index)
        if int(list_inter[idx]) == int(list_inter[idx + 1])
    ]
    if flat_spots:
        return flat_spots[-1]
    return last_index
# FILE DEV-0
# Cluster the dev-0 documents: one input line is one document, and one
# cluster label per document is written to the output file.
with open('dev-0/in.tsv', 'r', encoding="utf-8") as infile:
    all_doc = infile.readlines()
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(all_doc)
list_inter = inertia_list(all_doc)
position = BestK(list_inter)
# BestK can return 0 (or -1 for an empty inertia list); KMeans needs at
# least one cluster, so clamp the value before fitting.
FitMean = KMeans(n_clusters=max(position, 1)).fit_predict(doc_vectors)
# Open the output only after clustering succeeded, so a failure above does
# not leave a truncated, empty result file behind.
with open("dev-0/out.tsv", "w") as outfile:
    outfile.writelines(str(x) + '\n' for x in FitMean)
# FILE TEST-A
# Cluster the test-A documents: one input line is one document, and one
# cluster label per document is written to the output file.
with open('test-A/in.tsv', 'r', encoding="utf-8") as infile:
    all_doc = infile.readlines()
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(all_doc)
list_inter = inertia_list(all_doc)
position = BestK(list_inter)
# BestK can return 0 (or -1 for an empty inertia list); KMeans needs at
# least one cluster, so clamp the value before fitting.
FitMean = KMeans(n_clusters=max(position, 1)).fit_predict(doc_vectors)
# Open the output only after clustering succeeded, so a failure above does
# not leave a truncated, empty result file behind.
with open("test-A/out.tsv", "w") as outfile:
    outfile.writelines(str(x) + '\n' for x in FitMean)