From ef0e228ed873774dc7cdcde53a7ef08ce4c4d6d5 Mon Sep 17 00:00:00 2001 From: Alicja Szulecka <73056579+AliSzu@users.noreply.github.com> Date: Sun, 2 Jun 2024 19:45:06 +0200 Subject: [PATCH] lab8 --- lab8.ipynb | 668 + test-A/out.tsv | 219516 +++++++++++++++++++++++++++++++++++++++++++++ train/train.tsv | 945 + 3 files changed, 221129 insertions(+) create mode 100644 lab8.ipynb create mode 100644 test-A/out.tsv create mode 100644 train/train.tsv diff --git a/lab8.ipynb b/lab8.ipynb new file mode 100644 index 0000000..8fee386 --- /dev/null +++ b/lab8.ipynb @@ -0,0 +1,668 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import pandas as pd\n", + "\n", + "from collections import Counter\n", + "from torchtext.vocab import vocab\n", + "from sklearn.metrics import accuracy_score\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Wczytanie zbioru danych\n", + "\n", + "train_set = pd.read_csv('./train/train.tsv', sep='\\t', header=None, names=['labels', 'text'])\n", + "val_set = pd.read_csv('./dev-0/expected.tsv', sep='\\t', header=None, names=['labels'])\n", + "val_set['text'] = pd.read_csv('./dev-0/in.tsv', sep='\\t', header=None, names=['text'])\n", + "test_set = pd.read_csv('./test-A/in.tsv', sep='\\t', header=None, names=['text'])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "#Tokenizacja danych\n", + "train_set['text'] = train_set[\"text\"].apply(lambda x : x.split())\n", + "train_set['labels'] = train_set[\"labels\"].apply(lambda x : x.split())\n", + "\n", + "val_set['text'] = val_set[\"text\"].apply(lambda x : x.split())\n", + "val_set['labels'] = val_set[\"labels\"].apply(lambda x : x.split())\n", + "\n", + "test_set['text'] = test_set[\"text\"].apply(lambda x : x.split())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | labels | \n", + "text | \n", + "
---|---|---|
0 | \n", + "[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B... | \n", + "[EU, rejects, German, call, to, boycott, Briti... | \n", + "
1 | \n", + "[O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O... | \n", + "[Rare, Hendrix, song, draft, sells, for, almos... | \n", + "
2 | \n", + "[B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ... | \n", + "[China, says, Taiwan, spoils, atmosphere, for,... | \n", + "
3 | \n", + "[B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ... | \n", + "[China, says, time, right, for, Taiwan, talks,... | \n", + "
4 | \n", + "[B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO... | \n", + "[German, July, car, registrations, up, 14.2, p... | \n", + "
5 | \n", + "[B-MISC, O, O, O, O, O, O, O, O, O, O, B-LOC, ... | \n", + "[GREEK, SOCIALISTS, GIVE, GREEN, LIGHT, TO, PM... | \n", + "
6 | \n", + "[B-ORG, O, B-MISC, O, O, O, O, O, O, B-LOC, O,... | \n", + "[BayerVB, sets, C$, 100, million, six-year, bo... | \n", + "
7 | \n", + "[B-ORG, O, O, O, O, O, O, O, O, O, B-LOC, O, O... | \n", + "[Venantius, sets, $, 300, million, January, 19... | \n", + "
8 | \n", + "[O, O, O, O, B-LOC, O, B-ORG, I-ORG, O, O, O, ... | \n", + "[Port, conditions, update, -, Syria, -, Lloyds... | \n", + "
9 | \n", + "[B-LOC, O, O, O, O, O, O, B-LOC, O, O, B-PER, ... | \n", + "[Israel, plays, down, fears, of, war, with, Sy... | \n", + "