{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\akida\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\akida\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\akida\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import csv\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import torch\n", "\n", "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords # used for preprocessing\n", "from nltk.stem import WordNetLemmatizer # used for preprocessing\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "nltk.download('punkt')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "train_in = pd.read_csv(\"./train/in.tsv\", delimiter=\"\\t\", names=[\"text\", \"date\"], header=None)\n", "train_exp = pd.read_csv(\"./train/expected.tsv\", delimiter=\"\\t\", header=None)\n", "dev_in = pd.read_csv(\"./dev-0/in.tsv\", delimiter=\"\\t\", names=[\"text\", \"date\"],header=None)\n", "dev_exp = pd.read_csv(\"./dev-0/expected.tsv\", delimiter=\"\\t\", header=None)\n", "test_in = pd.read_csv(\"./test-A/in.tsv\", delimiter=\"\\t\", names=[\"text\", \"date\"], header=None)\n", "\n", "train_in.drop('date', axis=1, inplace=True)\n", "dev_in.drop('date', axis=1, inplace=True)\n", "test_in.drop('date', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "train_set = train_in\n", "train_set['result'] = train_exp\n", "# train_set = train_set[:1000]\n", "\n", "dev_set = dev_in\n", "dev_set['result'] = dev_exp\n", "\n", "test_set = test_in\n", "test_set['result'] = pd.DataFrame(np.zeros(len(test_in), dtype=int))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "289541" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train_set)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def remove_urls(text):\n", " return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \",str(text)).split())\n", "\n", "def text_lowercase(text): \n", " return text.lower()\n", "\n", "def remove_numbers(text):\n", " return re.sub(r'\\d+', '', text)\n", "\n", "def remove_punctuation(text):\n", " return text.translate(str.maketrans('', '', string.punctuation))\n", "\n", "def remove_stopwords(text):\n", " stop_words = set(stopwords.words('english'))\n", " return [i for i in text if not i in stop_words]\n", "\n", "def tokenize(text):\n", " return word_tokenize(text)\n", "\n", "def lemmatize(text):\n", " lemmatizer = WordNetLemmatizer() \n", " return [lemmatizer.lemmatize(token) for token in text]\n", "\n", "def preprocess(dataset):\n", " texts_column = []\n", " for num, text in enumerate(dataset['text']):\n", " if num % 10000 == 0:\n", " print(num)\n", " prep_text = remove_urls(text)\n", " prep_text = text_lowercase(prep_text)\n", " prep_text = remove_numbers(prep_text)\n", " prep_text = remove_punctuation(prep_text)\n", " prep_text = tokenize(prep_text)\n", " prep_text = remove_stopwords(prep_text)\n", " prep_text = lemmatize(prep_text)\n", " pre_text = ' '.join(prep_text)\n", " texts_column.append(pre_text)\n", " dataset['text'] = texts_column\n", " return dataset" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "0\n", "0\n" ] } ], "source": [ "train_set = train_set[:12000]\n", "train_set = preprocess(train_set)\n", "dev_set = preprocess(dev_set)\n", "test_set = preprocess(dev_set)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "train_set_copy = train_set\n", "dev_set_copy = dev_set\n", "test_set_copy = test_set" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "5000\n" ] } ], "source": [ "dictionary = set()\n", "\n", "for i, text_line in enumerate(train_set_copy['text']):\n", " if i % 5000 == 0:\n", " print(i)\n", " for word in text_line.split():\n", " dictionary.add(word)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "word_index_map = {}\n", "i = 0\n", "for w in dictionary:\n", " word_index_map[w] = i\n", " i += 1" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "21120" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(word_index_map)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] } ], "source": [ "prep_x = []\n", "for num, w in enumerate(train_set_copy['text']):\n", " if num % 10000 == 0:\n", " print(num)\n", " a = np.zeros(len(word_index_map))\n", " for word in w.split():\n", " index = word_index_map[word]\n", " a[index] = 1.\n", " prep_x.append(a)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "train_y = train_set_copy['result'].astype(float).tolist()\n", "x = torch.tensor(prep_x, dtype=torch.float)\n", "y = torch.tensor(train_y, dtype=torch.float)\n", "k = torch.randn(len(dictionary), requires_grad=True)\n", "rate = torch.tensor(0.001)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "200\n", "400\n", "600\n", "800\n", "1000\n", "1200\n", "1400\n" ] } ], "source": [ "for i in range(1500):\n", " y_predicted = torch.sigmoid(x @ k)\n", " price = (-1 / y.size()[0]) * torch.sum(y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))\n", " price.backward()\n", " with torch.no_grad():\n", " k -= rate * k.grad\n", " k.requires_grad = True\n", " if i % 200 == 0:\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def prepare_data(dataset):\n", " prep_x = []\n", " for num, w in enumerate(dataset['text']):\n", " if num % 1000 == 0:\n", " print(num)\n", " a = np.zeros(len(word_index_map))\n", " for word in w.split():\n", " if word in word_index_map:\n", " index = word_index_map[word]\n", " a[index] = 1.\n", " prep_x.append(a)\n", " return torch.tensor(prep_x, dtype=torch.float)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def predict(x, weights, save_path):\n", " with open(save_path + '/out.tsv', 'wt', newline='') as f:\n", " writer = csv.writer(f, delimiter='\\t')\n", " y = torch.sigmoid(x @ weights)\n", " for value in y:\n", " if value > 0.90:\n", " value = torch.tensor([0.90])\n", " elif value < 0.10:\n", " value = torch.tensor([0.10])\n", " writer.writerow([str(value.item())])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "1000\n", "2000\n", "3000\n", "4000\n", "5000\n", "0\n", "1000\n", "2000\n", "3000\n", "4000\n", "5000\n" ] } ], "source": [ "x_dev = prepare_data(dev_set_copy)\n", "x_test = prepare_data(test_set_copy)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "scrolled": true }, "outputs": [], "source": [ "predict(x_dev, k, './dev-0')" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "predict(x_test, k, './test-A')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }