diff --git a/seq_labeling.py.ipynb b/seq_labeling.py.ipynb
index 2c40ba4..60ea006 100644
--- a/seq_labeling.py.ipynb
+++ b/seq_labeling.py.ipynb
@@ -11,7 +11,17 @@
"import os.path\n",
"import gzip\n",
"import shutil\n",
- "import torch"
+ "import torch\n",
+ "import gensim\n",
+ "import torch\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from datasets import load_dataset\n",
+ "from torchtext.vocab import Vocab\n",
+ "from collections import Counter\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics import accuracy_score"
]
},
{
@@ -31,15 +41,6 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
- "outputs": [],
- "source": [
- "raw_data = pd.read_csv('train/train.tsv', sep='\\t', names=['labels', 'text'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
"outputs": [
{
"data": {
@@ -62,42 +63,1094 @@
" \n",
" \n",
" \n",
" \n",
" \n",
- " Label \n",
- " Word \n",
- " WordLen \n",
- " WordHasDigit \n",
- " CapitalFirst \n",
+ " iob \n",
+ " tokens \n",
"
945 rows × 2 columns
\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [Label, Word, WordLen, WordHasDigit, CapitalFirst]\n", - "Index: []" + " iob \\\n", + "0 [5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ... \n", + "1 [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... \n", + "2 [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... \n", + "4 [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... \n", + ".. ... \n", + "940 [0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ... \n", + "941 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ... \n", + "942 [0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ... \n", + "943 [0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ... \n", + "944 [0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ... \n", + "\n", + " tokens \n", + "0 [EU, rejects, German, call, to, boycott, Briti... \n", + "1 [Rare, Hendrix, song, draft, sells, for, almos... \n", + "2 [China, says, Taiwan, spoils, atmosphere, for,... \n", + "3 [China, says, time, right, for, Taiwan, talks,... \n", + "4 [German, July, car, registrations, up, 14.2, p... \n", + ".. ... \n", + "940 [CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ... \n", + "941 [CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL... \n", + "942 [SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS... \n", + "943 [MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR... \n", + "944 [GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR... \n", + "\n", + "[945 rows x 2 columns]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = []\n", - "for sentence in raw_data.to_numpy():\n", - " for label, word in zip(sentence[0].split(), sentence[1].split()):\n", - " data.append([label,word,len(word), any(c.isdigit() for c in word), word.isupper()])\n", - "df = pd.DataFrame(data, columns=['Label', 'Word', 'WordLen', 'WordHasDigit', 'CapitalFirst'], index=None)\n", - "df[df[\"Label\"]==None]" + "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']\n", + "\n", + "data = pd.read_csv('train/train.tsv', sep='\\t', names=['iob', 'tokens'])\n", + "data[\"iob\"]=data[\"iob\"].apply(lambda x: [labels.index(y) for y in x.split()])\n", + "data[\"tokens\"]=data[\"tokens\"].apply(lambda x: x.split())\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def build_vocab(dataset):\n", + " counter = Counter()\n", + " for document in dataset:\n", + " print(document)\n", + " counter.update(document)\n", + " return Vocab(counter, specials=['