{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this cell so a fresh kernel shows every dependency up front.\n",
    "import gzip\n",
    "import lzma\n",
    "import os.path\n",
    "import shutil\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decompress the training set once; later runs reuse the extracted file.\n",
    "# The data is written to a temporary name and renamed only after the copy\n",
    "# finishes, so an interrupted run cannot leave a truncated train.tsv that\n",
    "# would pass the existence check (and be loaded silently) on the next run.\n",
    "if not os.path.isfile('train/train.tsv'):\n",
    "    with lzma.open('train/train.tsv.xz', 'rb') as f_in:\n",
    "        with open('train/train.tsv.part', 'wb') as f_out:\n",
    "            shutil.copyfileobj(f_in, f_out)\n",
    "    os.replace('train/train.tsv.part', 'train/train.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tab-separated file; it is read without a header row and the two\n",
    "# columns are named 'labels' and 'text'.\n",
    "# NOTE(review): read_csv uses its default quote handling here, so a field that\n",
    "# begins with a double-quote character is parsed as a quoted field. Confirm the\n",
    "# data has no such fields, or pass quoting=csv.QUOTE_NONE.\n",
    "raw_data = pd.read_csv('train/train.tsv', sep='\\t', names=['labels', 'text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | Label | \n", "Word | \n", "WordLen | \n", "WordHasDigit | \n", "CapitalFirst | \n", "
---|