diff --git a/.ipynb_checkpoints/k-mean_script-checkpoint.ipynb b/.ipynb_checkpoints/k-mean_script-checkpoint.ipynb new file mode 100644 index 0000000..cd9057b --- /dev/null +++ b/.ipynb_checkpoints/k-mean_script-checkpoint.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FUNKCJE" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def inertia_list(all_doc):\n", + " list_inter = []\n", + " K_max = int(len(all_doc)/2)\n", + " while K_max > 100:\n", + " K_max = int(K_max/2)\n", + " K = range(1,K_max)\n", + " for k in K:\n", + " FitMean = KMeans(n_clusters=k).fit(doc_vectors)\n", + " list_inter.append(FitMean.inertia_)\n", + " return list_inter" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def BestK(list_inter):\n", + " position = -10\n", + " for i in range(0, len(list_inter)-1):\n", + " if (int(list_inter[i]) == (int(list_inter[i+1]))):\n", + " position = i\n", + " if position == -10 :\n", + " position = len(list_inter)-1\n", + " return position" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PLIK DEV-0" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "infile = open('dev-0/in.tsv', 'r', encoding=\"utf-8\")\n", + "outfile = open(\"dev-0/out.tsv\", \"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "all_doc = infile.readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "doc_vectors = vectorizer.fit_transform(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "list_inter = inertia_list(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "position = BestK(list_inter)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "for x in FitMean:\n", + " outfile.write(str(x) + '\\n')\n", + "infile.close()\n", + "outfile.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PLIK TEST-A" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "infile = open('test-A/in.tsv', 'r', encoding=\"utf-8\")\n", + "outfile = open(\"test-A/out.tsv\", \"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "all_doc = infile.readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "doc_vectors = vectorizer.fit_transform(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "list_inter = inertia_list(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "position = BestK(list_inter)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "for x in FitMean:\n", + " outfile.write(str(x) + '\\n')\n", + "infile.close()\n", + "outfile.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dev-0/out.tsv b/dev-0/out.tsv new file mode 100644 index 0000000..743647e --- /dev/null +++ b/dev-0/out.tsv @@ -0,0 +1,87 @@ +30 +9 +29 +4 +7 +36 +2 +18 +14 +3 +18 +36 +11 +6 +17 +20 +35 +24 +5 +12 +22 +15 +8 +16 +23 +12 +10 +8 +12 +11 +1 +39 +1 +6 +20 +1 +33 +6 +10 +4 +14 +18 +1 +31 +25 +27 +26 +11 +18 +1 +19 +29 +1 +21 +13 +16 +11 +18 +1 +0 +1 +37 +2 +38 +36 +0 +28 +25 +32 +26 +7 +5 +25 +2 +11 +18 +23 +13 +31 +18 +3 +18 +3 +12 +2 +34 +12 diff --git a/k-mean_script.ipynb b/k-mean_script.ipynb new file mode 100644 index 0000000..cd9057b --- /dev/null +++ b/k-mean_script.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FUNKCJE" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def inertia_list(all_doc):\n", + " list_inter = []\n", + " K_max = int(len(all_doc)/2)\n", + " while K_max > 100:\n", + " K_max = int(K_max/2)\n", + " K = range(1,K_max)\n", + " for k in K:\n", + " FitMean = KMeans(n_clusters=k).fit(doc_vectors)\n", + " list_inter.append(FitMean.inertia_)\n", + " return list_inter" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def BestK(list_inter):\n", + " position = -10\n", + " for i in range(0, len(list_inter)-1):\n", + " if (int(list_inter[i]) == (int(list_inter[i+1]))):\n", + " position = i\n", + " if position == -10 :\n", + " position = len(list_inter)-1\n", + " return position" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PLIK DEV-0" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "infile = open('dev-0/in.tsv', 'r', encoding=\"utf-8\")\n", + "outfile = open(\"dev-0/out.tsv\", \"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "all_doc = infile.readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "doc_vectors = vectorizer.fit_transform(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "list_inter = inertia_list(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "position = BestK(list_inter)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "for x in FitMean:\n", + " outfile.write(str(x) + '\\n')\n", + "infile.close()\n", + "outfile.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PLIK TEST-A" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "infile = open('test-A/in.tsv', 'r', encoding=\"utf-8\")\n", + "outfile = open(\"test-A/out.tsv\", \"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "all_doc = infile.readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "doc_vectors = vectorizer.fit_transform(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "list_inter = inertia_list(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "position = BestK(list_inter)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "for x in FitMean:\n", + " outfile.write(str(x) + '\\n')\n", + "infile.close()\n", + "outfile.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/k-mean_script.py b/k-mean_script.py new file mode 100644 index 0000000..75903a3 --- /dev/null +++ b/k-mean_script.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +from sklearn.cluster import KMeans +from sklearn.feature_extraction.text import TfidfVectorizer +import numpy as np +import pandas as pd + + +# ## FUNKCJE + +# In[2]: + + +def inertia_list(all_doc): + list_inter = [] + K_max = int(len(all_doc)/2) + while K_max > 100: + K_max = int(K_max/2) + K = range(1,K_max) + for k in K: + FitMean = KMeans(n_clusters=k).fit(doc_vectors) + list_inter.append(FitMean.inertia_) + return list_inter + + +# In[3]: + + +def BestK(list_inter): + position = -10 + for i in range(0, len(list_inter)-1): + if (int(list_inter[i]) == (int(list_inter[i+1]))): + position = i + if position == -10 : + position = len(list_inter)-1 + return position + + +# ## PLIK DEV-0 + +# In[4]: + + +infile = open('dev-0/in.tsv', 'r', encoding="utf-8") +outfile = open("dev-0/out.tsv", "w") + + +# In[5]: + + +all_doc = infile.readlines() + + +# In[6]: + + +vectorizer = TfidfVectorizer() +doc_vectors = vectorizer.fit_transform(all_doc) + + +# In[7]: + + +list_inter = inertia_list(all_doc) + + +# In[8]: + + +position = BestK(list_inter) + + +# In[9]: + + +FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors) + + +# In[10]: + + +for x in FitMean: + outfile.write(str(x) + '\n') +infile.close() +outfile.close() + + +# ## PLIK TEST-A + +# In[11]: + + +infile = open('test-A/in.tsv', 'r', encoding="utf-8") +outfile = open("test-A/out.tsv", "w") + + +# In[12]: + + +all_doc = infile.readlines() + + +# In[13]: + + +vectorizer = TfidfVectorizer() +doc_vectors = vectorizer.fit_transform(all_doc) + + +# In[14]: + + +list_inter = inertia_list(all_doc) + + +# In[15]: + + +position = BestK(list_inter) + + +# In[16]: + + +FitMean = KMeans(n_clusters=position).fit_predict(doc_vectors) + + +# In[17]: + + +for x in FitMean: + outfile.write(str(x) + '\n') +infile.close() +outfile.close() + diff --git a/porba1.ipynb b/porba1.ipynb new file mode 100644 index 0000000..5f6105e --- /dev/null +++ b/porba1.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: numpy in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (1.19.2)\n", + "Requirement already satisfied: seaborn in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.11.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.23.2)\n", + "Requirement already satisfied: matplotlib in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (3.3.2)\n", + "Requirement already satisfied: fasttext in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (0.9.2)\n", + "Requirement already satisfied: pandas>=0.23 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from seaborn) (1.1.3)\n", + "Requirement already satisfied: scipy>=1.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from seaborn) (1.5.2)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from scikit-learn) (0.17.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from scikit-learn) (2.1.0)\n", + "Requirement already satisfied: certifi>=2020.06.20 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2020.6.20)\n", + "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (8.0.1)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (0.10.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2.8.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (1.3.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from matplotlib) (2.4.7)\n", + "Requirement already satisfied: setuptools>=0.7.0 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from fasttext) (50.3.1.post20201107)\n", + "Requirement already satisfied: pybind11>=2.2 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from fasttext) (2.6.2)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from pandas>=0.23->seaborn) (2020.1)\n", + "Requirement already satisfied: six in c:\\users\\jedpc\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install numpy seaborn scikit-learn matplotlib fasttext" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import seaborn as sns\n", + "import copy\n", + "from scipy.cluster.hierarchy import dendrogram\n", + "from scipy.cluster import hierarchy\n", + "import matplotlib.pyplot as plt\n", + "from scipy.spatial import distance_matrix\n", + "import fasttext\n", + "import fasttext.util\n", + "from sklearn.feature_extraction.text import TfidfVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "file = open(\"C:/Users/JedPC/Desktop/ISI/polish-urban-legends-public/dev-0/in.tsv\", encoding=\"utf-8\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "all_doc = []\n", + "for line in file:\n", + " all_doc.append(line)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "document_vectors = vectorizer.fit_transform(all_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5937322507759797" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.max(document_vectors)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ILOSC K" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "K = 40" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OBLICZANIE ILOSCI ZMIENNYCH" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "ELEMENTS = document_vectors.shape[0]\n", + "SIZE = document_vectors.shape[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ALGORYTM K SREDNICH" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def get_random_centroids():\n", + " CENTROIDS = np.zeros((K, SIZE))\n", + " for i in range(K):\n", + " for j in range(SIZE):\n", + " CENTROIDS[i,j] = np.random.uniform(0,2)\n", + " if CENTROIDS[i,j] > 1:\n", + " CENTROIDS[i,j] = 0\n", + " return CENTROIDS" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "CENTROIDS = get_random_centroids()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def assign_data_to_labels(document_vectors, CENTROIDS):\n", + " LABELS = []\n", + " for POINT in document_vectors:\n", + " DISTANCES = [np.linalg.norm(POINT - CEN) for CEN in CENTROIDS]\n", + " \n", + " LABELS.append(np.argmin(DISTANCES))\n", + " return np.array(LABELS)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "LABELS = assign_data_to_labels(document_vectors, CENTROIDS)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_new_centroids(document_vectors, LABELS, CENTROIDS):\n", + " NEW_CENTROIDS = np.zeros_like(CENTROIDS)\n", + " for centroid_label in range(K):\n", + " CENT_DATA = document_vectors[LABELS == centroid_label]\n", + " NEW_CENTROIDS[centroid_label] = np.mean(CENT_DATA) \n", + " return NEW_CENTROIDS" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\JedPC\\anaconda3\\lib\\site-packages\\scipy\\sparse\\base.py:581: RuntimeWarning: divide by zero encountered in true_divide\n", + " return self.astype(np.float_)._mul_scalar(1./other)\n" + ] + } + ], + "source": [ + "NEW_CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "CENTROIDS = NEW_CENTROIDS" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 0., 0., ..., 0., 0., 0.])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NEW_CENTROIDS[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LITERACJE" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "NUMBER = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(NUMBER):\n", + " LABELS = assign_data_to_labels(document_vectors, CENTROIDS)\n", + " CENTROIDS = get_new_centroids(document_vectors, LABELS, CENTROIDS)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "30\n", + "0\n", + "1\n", + "1\n", + "30\n", + "1\n", + "0\n", + "30\n", + "30\n", + "1\n", + "0\n", + "1\n", + "30\n", + "1\n", + "1\n", + "1\n", + "1\n", + "1\n", + "0\n", + "0\n", + "1\n", + "1\n", + "0\n", + "1\n", + "1\n", + "30\n", + "30\n", + "0\n", + "0\n", + "1\n", + "30\n", + "0\n", + "0\n", + "1\n", + "1\n", + "0\n", + "1\n", + "1\n", + "30\n", + "1\n", + "0\n", + "1\n", + "1\n", + "0\n", + "0\n", + "0\n", + "1\n", + "0\n", + "30\n", + "1\n", + "0\n", + "1\n", + "0\n", + "1\n", + "0\n", + "0\n", + "0\n", + "30\n", + "0\n", + "1\n", + "0\n", + "0\n", + "0\n", + "0\n", + "0\n", + "1\n", + "0\n", + "1\n", + "1\n", + "1\n", + "0\n", + "0\n", + "0\n", + "0\n", + "30\n", + "30\n", + "30\n", + "0\n", + "0\n", + "30\n", + "1\n", + "30\n", + "1\n", + "0\n", + "30\n", + "1\n", + "30\n" + ] + } + ], + "source": [ + "LABELS.shape[0]\n", + "for i in range(LABELS.shape[0]):\n", + " print(LABELS[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ???" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([30, 0, 1, 1, 30, 1, 0, 30, 30, 1, 0, 1, 30, 1, 1, 1, 1,\n", + " 1, 0, 0, 1, 1, 0, 1, 1, 30, 30, 0, 0, 1, 30, 0, 0, 1,\n", + " 1, 0, 1, 1, 30, 1, 0, 1, 1, 0, 0, 0, 1, 0, 30, 1, 0,\n", + " 1, 0, 1, 0, 0, 0, 30, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,\n", + " 1, 1, 0, 0, 0, 0, 30, 30, 30, 0, 0, 30, 1, 30, 1, 0, 30,\n", + " 1, 30], dtype=int64)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "LABELS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/test-A/out.tsv b/test-A/out.tsv new file mode 100644 index 0000000..3f74262 --- /dev/null +++ b/test-A/out.tsv @@ -0,0 +1,691 @@ +2 +36 +4 +17 +25 +12 +39 +16 +11 +36 +46 +30 +68 +11 +1 +62 +34 +50 +34 +1 +44 +8 +12 +53 +37 +38 +61 +73 +16 +56 +5 +72 +14 +62 +60 +66 +30 +62 +4 +72 +43 +66 +34 +42 +65 +75 +17 +3 +4 +1 +4 +52 +62 +53 +28 +22 +67 +4 +56 +39 +66 +0 +61 +25 +1 +31 +53 +14 +1 +11 +5 +60 +34 +62 +41 +62 +39 +69 +62 +39 +45 +6 +24 +67 +25 +55 +62 +69 +11 +43 +10 +5 +64 +25 +0 +29 +2 +40 +70 +5 +60 +11 +34 +63 +56 +17 +39 +39 +50 +66 +9 +4 +11 +4 +27 +25 +36 +66 +11 +17 +13 +58 +66 +17 +62 +4 +0 +22 +51 +62 +5 +64 +15 +22 +75 +60 +18 +39 +24 +19 +63 +45 +13 +59 +54 +62 +66 +59 +33 +22 +34 +34 +62 +1 +10 +55 +70 +48 +49 +8 +40 +52 +35 +3 +5 +25 +24 +40 +60 +64 +66 +28 +27 +55 +34 +24 +17 +46 +62 +52 +71 +3 +66 +34 +54 +54 +58 +57 +16 +28 +7 +54 +30 +17 +68 +60 +62 +4 +62 +7 +39 +27 +7 +77 +29 +0 +4 +3 +36 +7 +5 +30 +1 +62 +3 +30 +1 +12 +32 +11 +1 +62 +5 +45 +24 +60 +40 +50 +24 +30 +62 +73 +22 +3 +14 +11 +64 +53 +27 +62 +25 +11 +31 +75 +76 +32 +44 +44 +75 +3 +39 +47 +6 +68 +27 +24 +9 +62 +40 +13 +22 +9 +70 +39 +18 +73 +17 +50 +27 +1 +14 +62 +28 +38 +14 +62 +77 +9 +13 +49 +42 +44 +41 +11 +68 +21 +13 +4 +34 +12 +50 +17 +39 +11 +65 +44 +0 +21 +34 +75 +26 +50 +66 +28 +62 +46 +14 +62 +36 +14 +36 +28 +49 +72 +24 +47 +28 +13 +3 +62 +66 +24 +8 +36 +43 +43 +36 +41 +76 +77 +62 +50 +33 +22 +74 +4 +39 +49 +1 +5 +66 +33 +68 +1 +10 +13 +63 +55 +17 +76 +39 +5 +5 +16 +29 +50 +44 +34 +63 +0 +1 +55 +11 +38 +44 +39 +4 +72 +3 +40 +36 +62 +30 +73 +5 +28 +22 +0 +10 +50 +1 +11 +45 +39 +2 +9 +8 +30 +63 +14 +76 +31 +61 +4 +1 +25 +7 +25 +70 +62 +77 +62 +22 +48 +10 +1 +28 +16 +33 +68 +43 +21 +16 +39 +22 +28 +3 +28 +59 +70 +39 +43 +34 +12 +40 +27 +4 +62 +45 +1 +34 +53 +3 +16 +71 +33 +62 +42 +53 +12 +32 +4 +43 +48 +34 +52 +49 +71 +16 +26 +11 +28 +3 +75 +23 +62 +11 +3 +43 +20 +76 +17 +29 +52 +72 +0 +4 +62 +57 +43 +43 +2 +43 +7 +34 +26 +27 +66 +62 +32 +40 +24 +43 +39 +11 +23 +74 +44 +27 +36 +63 +4 +3 +11 +51 +50 +9 +59 +25 +13 +67 +4 +50 +66 +35 +49 +8 +39 +9 +73 +12 +40 +5 +66 +27 +55 +52 +34 +66 +3 +42 +11 +40 +50 +39 +62 +66 +24 +9 +72 +71 +50 +43 +4 +54 +8 +45 +1 +28 +17 +4 +19 +25 +23 +48 +14 +38 +72 +39 +68 +49 +43 +27 +75 +64 +6 +39 +7 +64 +31 +12 +4 +41 +16 +39 +39 +22 +56 +16 +64 +4 +4 +44 +36 +49 +55 +9 +63 +55 +67 +22 +39 +50 +14 +27 +50 +6 +22 +13 +44 +52 +39 +73 +54 +11 +66 +7 +33 +33 +57 +4 +1 +28 +1 +22 +41 +62 +48 +23 +26 +30 +72 +2 +63 +30 +50 +13 +3 +66 +36 +77 +8 +28 +4 +15 +57 +77 +44 +0 +77 +55 +3 +76 +37 +1 +39 +24 +10 +32 +76 +1 +7 +28 +22 +16 +40 +44 +10 +73 +66 +28 +75 +6 +55 +11 +62 +34 +27 +55 +10 +41 +12 +50 +66 +17 +72 +32 +44 +70 +54 +9 +29 +37 +40 +7 +24 +62 +28 +11 +12 +32 +64 +39 +40 +9 +4 +1 +63 +7 +16 +40