{ "cells": [
 { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [
  "import csv\n",
  "from collections import Counter\n",
  "\n",
  "import torch\n",
  "import pandas as pd\n",
  "from torchtext.vocab import vocab\n",
  "from sklearn.metrics import accuracy_score\n",
  "from tqdm import tqdm"
 ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
  "# Load the dataset.\n",
  "#\n",
  "# The files are header-less TSVs. Quote handling is switched off so that a\n",
  "# double-quote character in the text is kept as data instead of being\n",
  "# interpreted as a CSV quote (which can merge several lines into one field).\n",
  "# NA detection is switched off and dtype=str is forced so every cell stays a\n",
  "# plain string (e.g. a sentence consisting only of 'NA' is not turned into NaN).\n",
  "def read_tsv(path, names):\n",
  "    \"\"\"Read a header-less, tab-separated file into a DataFrame of strings.\n",
  "\n",
  "    path  -- location of the TSV file\n",
  "    names -- column names to assign, in file order\n",
  "    \"\"\"\n",
  "    return pd.read_csv(\n",
  "        path,\n",
  "        sep='\\t',\n",
  "        header=None,\n",
  "        names=names,\n",
  "        dtype=str,\n",
  "        quoting=csv.QUOTE_NONE,\n",
  "        keep_default_na=False,\n",
  "    )\n",
  "\n",
  "\n",
  "train_raw = read_tsv('./train/train.tsv', ['labels', 'text'])\n",
  "\n",
  "# Validation labels and inputs live in separate files and are paired by row position.\n",
  "val_labels = read_tsv('./dev-0/expected.tsv', ['labels'])\n",
  "val_text = read_tsv('./dev-0/in.tsv', ['text'])\n",
  "assert len(val_labels) == len(val_text), 'dev-0: expected.tsv and in.tsv differ in length'\n",
  "val_raw = val_labels.assign(text=val_text['text'])\n",
  "\n",
  "test_raw = read_tsv('./test-A/in.tsv', ['text'])"
 ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
  "# Tokenize the data: split every text and label string on whitespace.\n",
  "# The tokenized frames are built from the *_raw frames instead of overwriting\n",
  "# their own input, so this cell can be re-run without failing on lists.\n",
  "train_set = train_raw.assign(\n",
  "    text=train_raw['text'].str.split(),\n",
  "    labels=train_raw['labels'].str.split(),\n",
  ")\n",
  "\n",
  "val_set = val_raw.assign(\n",
  "    text=val_raw['text'].str.split(),\n",
  "    labels=val_raw['labels'].str.split(),\n",
  ")\n",
  "\n",
  "test_set = test_raw.assign(text=test_raw['text'].str.split())"
 ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | labels | \n", "text | \n", "
---|---|---|
0 | \n", "[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B... | \n", "[EU, rejects, German, call, to, boycott, Briti... | \n", "
1 | \n", "[O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O... | \n", "[Rare, Hendrix, song, draft, sells, for, almos... | \n", "
2 | \n", "[B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ... | \n", "[China, says, Taiwan, spoils, atmosphere, for,... | \n", "
3 | \n", "[B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ... | \n", "[China, says, time, right, for, Taiwan, talks,... | \n", "
4 | \n", "[B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO... | \n", "[German, July, car, registrations, up, 14.2, p... | \n", "
5 | \n", "[B-MISC, O, O, O, O, O, O, O, O, O, O, B-LOC, ... | \n", "[GREEK, SOCIALISTS, GIVE, GREEN, LIGHT, TO, PM... | \n", "
6 | \n", "[B-ORG, O, B-MISC, O, O, O, O, O, O, B-LOC, O,... | \n", "[BayerVB, sets, C$, 100, million, six-year, bo... | \n", "
7 | \n", "[B-ORG, O, O, O, O, O, O, O, O, O, B-LOC, O, O... | \n", "[Venantius, sets, $, 300, million, January, 19... | \n", "
8 | \n", "[O, O, O, O, B-LOC, O, B-ORG, I-ORG, O, O, O, ... | \n", "[Port, conditions, update, -, Syria, -, Lloyds... | \n", "
9 | \n", "[B-LOC, O, O, O, O, O, O, B-LOC, O, O, B-PER, ... | \n", "[Israel, plays, down, fears, of, war, with, Sy... | \n", "