polish-urban-legends-public.../k-mean_script.ipynb
2021-04-25 17:52:29 +02:00

4.5 KiB

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

FUNKCJE

def inertia_list(all_doc, vectors=None):
    """Return the K-means inertia for each candidate cluster count.

    The candidate counts are ``1 .. K_max - 1``, where ``K_max`` starts at
    half the number of documents and is halved until it is at most 100.

    Args:
        all_doc: Sized collection of documents. Only its length is used, to
            bound the number of clusters that is tried.
        vectors: Feature matrix to cluster. When omitted, the module-level
            ``doc_vectors`` is used, which keeps existing one-argument calls
            working exactly as before.

    Returns:
        A list of inertia values; element ``i`` belongs to ``k = i + 1``.
        The list is empty when fewer than four documents are given.
    """
    k_max = len(all_doc) // 2
    while k_max > 100:
        k_max //= 2
    candidate_ks = range(1, k_max)
    if not candidate_ks:
        # Nothing to fit, so the feature matrix is never needed.
        return []
    if vectors is None:
        # Backward-compatible fallback: callers that pass only `all_doc`
        # rely on the global feature matrix.
        vectors = doc_vectors
    return [KMeans(n_clusters=k).fit(vectors).inertia_ for k in candidate_ks]
def BestK(list_inter):
    """Return the index of the last plateau in a list of inertia values.

    A plateau is a position ``i`` where ``list_inter[i]`` and
    ``list_inter[i + 1]`` have the same integer part. When several exist the
    highest such ``i`` is returned; when none exist the index of the last
    element (``len(list_inter) - 1``) is returned, which is ``-1`` for an
    empty list.
    """
    neighbours = zip(list_inter, list_inter[1:])
    plateaus = [
        idx
        for idx, (current, following) in enumerate(neighbours)
        if int(current) == int(following)
    ]
    if plateaus:
        return plateaus[-1]
    return len(list_inter) - 1

PLIK DEV-0

# Cluster the dev-0 documents: every line of in.tsv is one document and
# every line of out.tsv receives the cluster label of the matching document.
with open('dev-0/in.tsv', 'r', encoding="utf-8") as infile:
    all_doc = infile.readlines()

# TF-IDF features of the documents. `doc_vectors` has to stay a module-level
# name because inertia_list(all_doc) reads it.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(all_doc)

list_inter = inertia_list(all_doc)
# BestK() returns an index into list_inter, which is 0 or -1 for very small
# inputs; KMeans needs at least one cluster, so clamp the value.
# NOTE(review): list_inter[i] holds the inertia for k = i + 1, so this index
# is one below the k it was measured at -- confirm the offset is intended.
position = max(BestK(list_inter), 1)
FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)

# The output file is opened only after clustering succeeded, so a failure
# above no longer leaves a truncated out.tsv behind.
with open("dev-0/out.tsv", "w", encoding="utf-8") as outfile:
    outfile.writelines(str(x) + '\n' for x in FitMean)

PLIK TEST-A

# Cluster the test-A documents: every line of in.tsv is one document and
# every line of out.tsv receives the cluster label of the matching document.
with open('test-A/in.tsv', 'r', encoding="utf-8") as infile:
    all_doc = infile.readlines()

# TF-IDF features of the documents. `doc_vectors` has to stay a module-level
# name because inertia_list(all_doc) reads it.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(all_doc)

list_inter = inertia_list(all_doc)
# BestK() returns an index into list_inter, which is 0 or -1 for very small
# inputs; KMeans needs at least one cluster, so clamp the value.
# NOTE(review): list_inter[i] holds the inertia for k = i + 1, so this index
# is one below the k it was measured at -- confirm the offset is intended.
position = max(BestK(list_inter), 1)
FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)

# The output file is opened only after clustering succeeded, so a failure
# above no longer leaves a truncated out.tsv behind.
with open("test-A/out.tsv", "w", encoding="utf-8") as outfile:
    outfile.writelines(str(x) + '\n' for x in FitMean)