diff --git a/seq_labeling.py.ipynb b/seq_labeling.py.ipynb
new file mode 100644
index 0000000..2c40ba4
--- /dev/null
+++ b/seq_labeling.py.ipynb
@@ -0,0 +1,132 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os.path\n",
+ "import gzip\n",
+ "import shutil\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.isfile('train/train.tsv'):\n",
+ " import lzma\n",
+ " with lzma.open('train/train.tsv.xz', 'rb') as f_in:\n",
+ " with open('train/train.tsv', 'wb') as f_out:\n",
+ " shutil.copyfileobj(f_in, f_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "raw_data = pd.read_csv('train/train.tsv', sep='\\t', names=['labels', 'text'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Label | \n",
+ " Word | \n",
+ " WordLen | \n",
+ " WordHasDigit | \n",
+ " CapitalFirst | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [Label, Word, WordLen, WordHasDigit, CapitalFirst]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = []\n",
+ "for sentence in raw_data.to_numpy():\n",
+ " for label, word in zip(sentence[0].split(), sentence[1].split()):\n",
+ " data.append([label,word,len(word), any(c.isdigit() for c in word), word.isupper()])\n",
+ "df = pd.DataFrame(data, columns=['Label', 'Word', 'WordLen', 'WordHasDigit', 'CapitalFirst'], index=None)\n",
+ "df[df[\"Label\"]==None]"
+ ]
+ },
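+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch (not part of the original notebook): `vocab` is used by the\n",
+ "# data_process helper defined below but is never built here. One plausible\n",
+ "# construction, assuming torchtext is installed, is build_vocab_from_iterator\n",
+ "# over the tokenised training sentences, registering the special tokens the\n",
+ "# processing helpers expect.\n",
+ "from torchtext.vocab import build_vocab_from_iterator\n",
+ "\n",
+ "def build_vocab(documents):\n",
+ "    vocab = build_vocab_from_iterator(\n",
+ "        documents,\n",
+ "        specials=['<unk>', '<pad>', '<bos>', '<eos>'])\n",
+ "    vocab.set_default_index(vocab['<unk>'])\n",
+ "    return vocab\n",
+ "\n",
+ "# Example usage (column layout assumed from the cells above):\n",
+ "# tokenized = [sentence[1].split() for sentence in raw_data.to_numpy()]\n",
+ "# vocab = build_vocab(tokenized)"
+ ]
+ },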
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def labels_process(dt):\n",
+ " return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n",
+ "\n",
+ "def data_process(dt):\n",
+ " return [ torch.tensor([vocab['']] +[vocab[token] for token in document ] + [vocab['']], dtype = torch.long) for document in dt]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}