{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import lzma\n",
    "import sys\n",
    "from io import StringIO\n",
    "\n",
    "import numpy\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Input TSV (xz-compressed) and the expected-output TSV that is read alongside it.\n",
    "pathX = \"./train/in.tsv.xz\"\n",
    "# pathX = \"./train/in.tsv\"  # uncompressed alternative\n",
    "pathY = \"./train/expected.tsv\"\n",
    "# Number of leading rows to read from each of the two files.\n",
    "nrows = 10000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas infers xz compression from the .xz suffix, so no manual lzma/StringIO step is needed.\n",
    "# QUOTE_NONE keeps exactly one row per physical line: with the default quoting, a stray\n",
    "# double-quote character inside a text field makes the parser merge several lines into one\n",
    "# row, which silently shifts the input rows against the expected rows.\n",
    "read_opts = dict(sep='\\t', nrows=nrows, header=None, quoting=csv.QUOTE_NONE)\n",
    "\n",
    "df = pd.read_csv(pathX, **read_opts)\n",
    "# Drop the second column of the input file.\n",
    "df = df.drop(columns=df.columns[1])\n",
    "topics = pd.read_csv(pathY, **read_opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000\n",
      "10000\n"
     ]
    }
   ],
   "source": [
    "print(len(df.index))\n",
    "\n",
    "print(len(topics.index))\n",
    "\n",
    "# The two files are read independently, so fail loudly if their row counts diverge.\n",
    "assert len(df.index) == len(topics.index), \"input and expected row counts differ\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | 0 | \n", "
---|---|
8910 | \n", "What? It isn't a fake memo. It's a real memo. ... | \n", "