{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os.path\n",
    "import shutil\n",
    "import lzma\n",
    "import torch\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from torchtext.vocab import Vocab\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File name for the model; presumably used by later cells - TODO confirm.\n",
    "model_path = \"seq_labeling.model\"\n",
    "\n",
    "# Unpack the xz-compressed training set only when the plain file is missing.\n",
    "# The data is streamed into a temporary '.part' file and renamed afterwards,\n",
    "# so an interrupted run cannot leave a truncated train.tsv that the isfile()\n",
    "# check would accept on the next run.\n",
    "if not os.path.isfile('train/train.tsv'):\n",
    "    with lzma.open('train/train.tsv.xz', 'rb') as f_in:\n",
    "        with open('train/train.tsv.part', 'wb') as f_out:\n",
    "            shutil.copyfileobj(f_in, f_out)\n",
    "    # Same-directory rename: replaces the target in a single step.\n",
    "    os.replace('train/train.tsv.part', 'train/train.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | iob | \n", "tokens | \n", "
---|---|---|
0 | \n", "[5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ... | \n", "[EU, rejects, German, call, to, boycott, Briti... | \n", "
1 | \n", "[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... | \n", "[Rare, Hendrix, song, draft, sells, for, almos... | \n", "
2 | \n", "[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... | \n", "[China, says, Taiwan, spoils, atmosphere, for,... | \n", "
3 | \n", "[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... | \n", "[China, says, time, right, for, Taiwan, talks,... | \n", "
4 | \n", "[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... | \n", "[German, July, car, registrations, up, 14.2, p... | \n", "
... | \n", "... | \n", "... | \n", "
940 | \n", "[0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ... | \n", "[CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ... | \n", "
941 | \n", "[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ... | \n", "[CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL... | \n", "
942 | \n", "[0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ... | \n", "[SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS... | \n", "
943 | \n", "[0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ... | \n", "[MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR... | \n", "
944 | \n", "[0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ... | \n", "[GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR... | \n", "
945 rows × 2 columns
\n", "