Marcin Stasiak 2020-12-17 17:57:29 +01:00
parent ecfafbf86c
commit 94247bd181
6 changed files with 310968 additions and 0 deletions

5272 dev-0/in.tsv (Normal file; diff suppressed because one or more lines are too long)
5272 dev-0/out.tsv (Normal file; diff suppressed because it is too large)
5152 test-A/in.tsv (Normal file; diff suppressed because one or more lines are too long)
5272 test-A/out.tsv (Normal file; diff suppressed because it is too large)
289579 train/in.tsv (Normal file; diff suppressed because one or more lines are too long)
421 zadanie.ipynb (Normal file)

@@ -0,0 +1,421 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\akida\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] C:\\Users\\akida\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\akida\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import csv\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import string\n",
"import torch\n",
"\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords # used for preprocessing\n",
"from nltk.stem import WordNetLemmatizer # used for preprocessing\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train_in = pd.read_csv(\"./train/in.tsv\", delimiter=\"\\t\", names=[\"text\", \"date\"], header=None)\n",
"train_exp = pd.read_csv(\"./train/expected.tsv\", delimiter=\"\\t\", header=None)\n",
"dev_in = pd.read_csv(\"./dev-0/in.tsv\", delimiter=\"\\t\", names=[\"text\", \"date\"],header=None)\n",
"dev_exp = pd.read_csv(\"./dev-0/expected.tsv\", delimiter=\"\\t\", header=None)\n",
"test_in = pd.read_csv(\"./test-A/in.tsv\", delimiter=\"\\t\", names=[\"text\", \"date\"], header=None)\n",
"\n",
"train_in.drop('date', axis=1, inplace=True)\n",
"dev_in.drop('date', axis=1, inplace=True)\n",
"test_in.drop('date', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_set = train_in\n",
"train_set['result'] = train_exp\n",
"# train_set = train_set[:1000]\n",
"\n",
"dev_set = dev_in\n",
"dev_set['result'] = dev_exp\n",
"\n",
"test_set = test_in\n",
"test_set['result'] = pd.DataFrame(np.zeros(len(test_in), dtype=int))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"289541"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(train_set)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def remove_urls(text):\n",
" return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \",str(text)).split())\n",
"\n",
"def text_lowercase(text): \n",
" return text.lower()\n",
"\n",
"def remove_numbers(text):\n",
" return re.sub(r'\\d+', '', text)\n",
"\n",
"def remove_punctuation(text):\n",
" return text.translate(str.maketrans('', '', string.punctuation))\n",
"\n",
"def remove_stopwords(text):\n",
" stop_words = set(stopwords.words('english'))\n",
" return [i for i in text if not i in stop_words]\n",
"\n",
"def tokenize(text):\n",
" return word_tokenize(text)\n",
"\n",
"def lemmatize(text):\n",
" lemmatizer = WordNetLemmatizer() \n",
" return [lemmatizer.lemmatize(token) for token in text]\n",
"\n",
"def preprocess(dataset):\n",
" texts_column = []\n",
" for num, text in enumerate(dataset['text']):\n",
" if num % 10000 == 0:\n",
" print(num)\n",
" prep_text = remove_urls(text)\n",
" prep_text = text_lowercase(prep_text)\n",
" prep_text = remove_numbers(prep_text)\n",
" prep_text = remove_punctuation(prep_text)\n",
" prep_text = tokenize(prep_text)\n",
" prep_text = remove_stopwords(prep_text)\n",
" prep_text = lemmatize(prep_text)\n",
" pre_text = ' '.join(prep_text)\n",
" texts_column.append(pre_text)\n",
" dataset['text'] = texts_column\n",
" return dataset"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"0\n",
"0\n"
]
}
],
"source": [
"train_set = train_set[:12000]\n",
"train_set = preprocess(train_set)\n",
"dev_set = preprocess(dev_set)\n",
"test_set = preprocess(dev_set)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"train_set_copy = train_set\n",
"dev_set_copy = dev_set\n",
"test_set_copy = test_set"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"5000\n"
]
}
],
"source": [
"dictionary = set()\n",
"\n",
"for i, text_line in enumerate(train_set_copy['text']):\n",
" if i % 5000 == 0:\n",
" print(i)\n",
" for word in text_line.split():\n",
" dictionary.add(word)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"word_index_map = {}\n",
"i = 0\n",
"for w in dictionary:\n",
" word_index_map[w] = i\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"21120"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(word_index_map)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [
"prep_x = []\n",
"for num, w in enumerate(train_set_copy['text']):\n",
" if num % 10000 == 0:\n",
" print(num)\n",
" a = np.zeros(len(word_index_map))\n",
" for word in w.split():\n",
" index = word_index_map[word]\n",
" a[index] = 1.\n",
" prep_x.append(a)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"train_y = train_set_copy['result'].astype(float).tolist()\n",
"x = torch.tensor(prep_x, dtype=torch.float)\n",
"y = torch.tensor(train_y, dtype=torch.float)\n",
"k = torch.randn(len(dictionary), requires_grad=True)\n",
"rate = torch.tensor(0.001)"
]
},
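{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop below fits the weight vector `k` by plain gradient descent on the binary cross-entropy loss; the `1e-10` terms guard against `log(0)`:\n",
"\n",
"$L(k) = -\\frac{1}{N}\\sum_{i=1}^{N}\\big[y_i\\log\\sigma(x_i \\cdot k) + (1-y_i)\\log(1-\\sigma(x_i \\cdot k))\\big]$"
]
},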
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"200\n",
"400\n",
"600\n",
"800\n",
"1000\n",
"1200\n",
"1400\n"
]
}
],
"source": [
"for i in range(1500):\n",
" y_predicted = torch.sigmoid(x @ k)\n",
" price = (-1 / y.size()[0]) * torch.sum(y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))\n",
" price.backward()\n",
" with torch.no_grad():\n",
" k -= rate * k.grad\n",
" k.requires_grad = True\n",
" if i % 200 == 0:\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def prepare_data(dataset):\n",
" prep_x = []\n",
" for num, w in enumerate(dataset['text']):\n",
" if num % 1000 == 0:\n",
" print(num)\n",
" a = np.zeros(len(word_index_map))\n",
" for word in w.split():\n",
" if word in word_index_map:\n",
" index = word_index_map[word]\n",
" a[index] = 1.\n",
" prep_x.append(a)\n",
" return torch.tensor(prep_x, dtype=torch.float)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def predict(x, weights, save_path):\n",
" with open(save_path + '/out.tsv', 'wt', newline='') as f:\n",
" writer = csv.writer(f, delimiter='\\t')\n",
" y = torch.sigmoid(x @ weights)\n",
" for value in y:\n",
" if value > 0.90:\n",
" value = torch.tensor([0.90])\n",
" elif value < 0.10:\n",
" value = torch.tensor([0.10])\n",
" writer.writerow([str(value.item())])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1000\n",
"2000\n",
"3000\n",
"4000\n",
"5000\n",
"0\n",
"1000\n",
"2000\n",
"3000\n",
"4000\n",
"5000\n"
]
}
],
"source": [
"x_dev = prepare_data(dev_set_copy)\n",
"x_test = prepare_data(test_set_copy)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"predict(x_dev, k, './dev-0')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"predict(x_test, k, './test-A')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}