polish-urban-legends-public/.ipynb_checkpoints/k-mean_script-checkpoint.ipynb
2021-04-25 17:52:29 +02:00

229 lines
4.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## FUNKCJE"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def inertia_list(all_doc, vectors=None, random_state=None):\n",
"    \"\"\"Collect the KMeans inertia for every candidate cluster count.\n",
"\n",
"    The upper bound K_max starts at half the number of documents and is\n",
"    halved until it is at most 100; KMeans is then fitted once for each\n",
"    k in range(1, K_max).\n",
"\n",
"    Args:\n",
"        all_doc: Sized collection of documents; only its length is used.\n",
"        vectors: Feature matrix to cluster. When None, the notebook-level\n",
"            global `doc_vectors` is used, which keeps the original\n",
"            one-argument call working.\n",
"        random_state: Passed through to KMeans; set it for repeatable fits.\n",
"\n",
"    Returns:\n",
"        List whose element i is the inertia of the fit with k = i + 1.\n",
"        Empty when `all_doc` is empty.\n",
"    \"\"\"\n",
"    n_docs = len(all_doc)\n",
"    if n_docs == 0:\n",
"        return []\n",
"    if vectors is None:\n",
"        # Legacy behaviour: the original body read this global directly.\n",
"        vectors = doc_vectors\n",
"    K_max = n_docs // 2\n",
"    while K_max > 100:\n",
"        K_max //= 2\n",
"    # Fewer than four documents would give an empty range; still fit k=1\n",
"    # so the caller receives at least one inertia value.\n",
"    K_max = max(K_max, 2)\n",
"    list_inter = []\n",
"    for k in range(1, K_max):\n",
"        model = KMeans(n_clusters=k, random_state=random_state).fit(vectors)\n",
"        list_inter.append(model.inertia_)\n",
"    return list_inter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def BestK(list_inter):\n",
"    \"\"\"Pick a cluster count from a list of KMeans inertias.\n",
"\n",
"    `list_inter[i]` is expected to be the inertia for k = i + 1 (the layout\n",
"    produced by inertia_list). The chosen k is the last one after which the\n",
"    integer part of the inertia stops changing; if that never happens, the\n",
"    largest evaluated k is returned.\n",
"\n",
"    Args:\n",
"        list_inter: Sequence of inertia values, ordered by increasing k.\n",
"\n",
"    Returns:\n",
"        A cluster count >= 1, usable directly as KMeans(n_clusters=...).\n",
"    \"\"\"\n",
"    if len(list_inter) == 0:\n",
"        # Nothing was evaluated; a single cluster is the only safe choice.\n",
"        return 1\n",
"    position = None\n",
"    for i in range(len(list_inter) - 1):\n",
"        if int(list_inter[i]) == int(list_inter[i + 1]):\n",
"            # No break: a later plateau overrides an earlier one.\n",
"            position = i\n",
"    if position is None:\n",
"        position = len(list_inter) - 1\n",
"    # Index i holds the inertia for k = i + 1; returning the raw index was\n",
"    # off by one and could yield the invalid n_clusters=0.\n",
"    return position + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PLIK DEV-0"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Input documents (read line by line) and the file that receives the labels.\n",
"in_path = 'dev-0/in.tsv'\n",
"out_path = \"dev-0/out.tsv\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# The context manager closes the input file even if reading fails.\n",
"with open(in_path, 'r', encoding=\"utf-8\") as infile:\n",
"    all_doc = infile.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Turn every line into a TF-IDF vector.\n",
"# NOTE: inertia_list reads `doc_vectors` as a global, so keep this name.\n",
"vectorizer = TfidfVectorizer()\n",
"doc_vectors = vectorizer.fit_transform(all_doc)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# One KMeans fit per candidate k; this is the expensive step.\n",
"list_inter = inertia_list(all_doc)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"position = BestK(list_inter)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Despite the name, FitMean holds the predicted cluster label of each line.\n",
"FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Open the output only now, so a failure in an earlier cell no longer\n",
"# leaves a truncated out.tsv or an unclosed handle behind.\n",
"with open(out_path, \"w\") as outfile:\n",
"    outfile.writelines(str(x) + '\\n' for x in FitMean)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PLIK TEST-A"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Input documents (read line by line) and the file that receives the labels.\n",
"in_path = 'test-A/in.tsv'\n",
"out_path = \"test-A/out.tsv\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# The context manager closes the input file even if reading fails.\n",
"with open(in_path, 'r', encoding=\"utf-8\") as infile:\n",
"    all_doc = infile.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Turn every line into a TF-IDF vector.\n",
"# NOTE: inertia_list reads `doc_vectors` as a global, so keep this name.\n",
"vectorizer = TfidfVectorizer()\n",
"doc_vectors = vectorizer.fit_transform(all_doc)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# One KMeans fit per candidate k; this is the expensive step.\n",
"list_inter = inertia_list(all_doc)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"position = BestK(list_inter)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Despite the name, FitMean holds the predicted cluster label of each line.\n",
"FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Open the output only now, so a failure in an earlier cell no longer\n",
"# leaves a truncated out.tsv or an unclosed handle behind.\n",
"with open(out_path, \"w\") as outfile:\n",
"    outfile.writelines(str(x) + '\\n' for x in FitMean)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}