Compare commits
4 Commits
| Author | SHA1 | Date |
| --- | --- | --- |
|  | db662285c4 |  |
|  | f140a121a2 |  |
|  | db6d196edb |  |
|  | 8a69cabc52 |  |
262 .ipynb_checkpoints/run-checkpoint.ipynb (new file)
@@ -0,0 +1,262 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "74100403-147c-42cd-8285-e30778c0fb66",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import gensim\n",
    "import torch\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_year(x, path_out, model):\n",
    "    results = model.predict(x)\n",
    "    with open(path_out, 'wt') as file:\n",
    "        for r in results:\n",
    "            file.write(str(r) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(filename):\n",
    "    result = []\n",
    "    with open(filename, 'r', encoding=\"utf-8\") as file:\n",
    "        for line in file:\n",
    "            text = line.split(\"\\t\")[0].strip()\n",
    "            result.append(text)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train/in.tsv', 'r', encoding='utf8') as file:\n",
    "    train = pd.read_csv(file, sep='\\t', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
    "    train_y = pd.read_csv(file, sep='\\t', header=None)\n",
    "train_y = train_y[0:10000]\n",
    "train_y = train_y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "07ae7b22-e95d-4614-9757-15660a9834b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = train[0:10000]\n",
    "train_x = train[0]\n",
    "train_x = [gensim.utils.simple_preprocess(x) for x in train_x]\n",
    "#train_x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fde71cd8-f682-4793-bce9-0f9a9d8c176c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.test.utils import common_texts\n",
    "from gensim.models import Word2Vec\n",
    "\n",
    "model = Word2Vec(sentences=train_x, vector_size=100, window=5, min_count=1, workers=4)\n",
    "#data, min_count = 1, vector_size = 100, window = 5, sg = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9a4c8066-f985-478e-8944-dd45b73d9795",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\korne\\AppData\\Local\\Temp\\ipykernel_3520\\3800840358.py:2: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])\n"
     ]
    }
   ],
   "source": [
    "words = set(model.wv.index_to_key)\n",
    "train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b52269f9-f143-483d-9669-ce8f5972d6bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "FEATURES = 100\n",
    "\n",
    "class NeuralNetworkModel(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNetworkModel, self).__init__()\n",
    "        self.fc1 = torch.nn.Linear(FEATURES,500)\n",
    "        self.fc2 = torch.nn.Linear(500,1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.fc1(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.fc2(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x\n",
    "\n",
    "nn_model = NeuralNetworkModel()\n",
    "BATCH_SIZE = 40\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)\n",
    "\n",
    "def get_loss_acc(model, data_x, data_y):\n",
    "    loss_score = 0\n",
    "    acc_score = 0\n",
    "    items_total = 0\n",
    "    model.eval()\n",
    "    for i in range(0, data_y.shape[0], BATCH_SIZE):\n",
    "        X = data_x[i:i+BATCH_SIZE]\n",
    "        X = torch.tensor(X.astype(np.float32))\n",
    "        Y = data_y[i:i+BATCH_SIZE]\n",
    "        Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
    "        Y_predictions = model(X)\n",
    "        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
    "        items_total += Y.shape[0]\n",
    "\n",
    "        loss = criterion(Y_predictions, Y)\n",
    "\n",
    "        loss_score += loss.item() * Y.shape[0]\n",
    "    return (loss_score / items_total), (acc_score / items_total)\n",
    "\n",
    "\n",
    "for epoch in range(5):\n",
    "    loss_score = 0\n",
    "    acc_score = 0\n",
    "    items_total = 0\n",
    "    nn_model.train()\n",
    "    for i in range(0, train_y.shape[0] - 42, BATCH_SIZE):\n",
    "        X = train_x_vec[i:i+BATCH_SIZE]\n",
    "        X = torch.tensor(X.astype(np.float32))\n",
    "        Y = train_y[i:i+BATCH_SIZE]\n",
    "        Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
    "        Y_predictions = nn_model(X)\n",
    "        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
    "        items_total += Y.shape[0]\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss = criterion(Y_predictions, Y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "\n",
    "        loss_score += loss.item() * Y.shape[0]\n",
    "\n",
    "    display(epoch)\n",
    "    display(get_loss_acc(nn_model, train_x_vec, train_y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1482f342-f2ea-4c9d-b221-5ef451e3a6b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('training the model')\n",
    "model = NeuralNetworkModel()\n",
    "BATCH_SIZE = 5\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
    "\n",
    "for epoch in range(BATCH_SIZE):\n",
    "    model.train()\n",
    "    for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
    "        X = x_train[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(X)\n",
    "        y = y_train[i:i + BATCH_SIZE]\n",
    "        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
    "        optimizer.zero_grad()\n",
    "        outputs = model(X.float())\n",
    "        loss = criterion(outputs, y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "#print('predicting the results')\n",
    "y_dev = []\n",
    "y_test = []\n",
    "model.eval()\n",
    "\n",
    "with torch.no_grad():\n",
    "    for i in range(0, len(x_dev), BATCH_SIZE):\n",
    "        X = x_dev[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(X)\n",
    "        outputs = model(X.float())\n",
    "        prediction = (outputs > 0.5)\n",
    "        y_dev += prediction.tolist()\n",
    "\n",
    "    for i in range(0, len(x_test), BATCH_SIZE):\n",
    "        X = x_test[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(X)\n",
    "        outputs = model(X.float())\n",
    "        prediction = (outputs >= 0.5)\n",
    "        y_test += prediction.tolist()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
116 .ipynb_checkpoints/run_transformer-checkpoint.ipynb (new file)
@@ -0,0 +1,116 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "promotional-stage",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import csv\n",
    "import lzma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gothic-olympus",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = pd.read_table('train/in.tsv', sep='\\t', header=None, quoting=3)\n",
    "#x_train = x_train[0:200000]\n",
    "#x_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "respiratory-train",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
    "    y_train = pd.read_csv(file, sep='\\t', header=None)\n",
    "#y_train = y_train[0:200000]\n",
    "#y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "loving-sewing",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
    "    x_dev = pd.read_csv(file, sep='\\t', header=None)\n",
    "#x_dev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aware-applicant",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
    "    x_test = pd.read_csv(file, sep='\\t', header=None)\n",
    "#x_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "lovely-density",
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/facebookresearch/fairseq/issues/2666"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "occasional-banks",
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/huggingface/hf_gpt2.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "human-portal",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
223 .ipynb_checkpoints/sceptic-checkpoint.ipynb (new file)
@@ -0,0 +1,223 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "equal-singles",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/lib/python3/dist-packages/sklearn/utils/validation.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
      "  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'\n",
      "/usr/lib/python3/dist-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  dtype=np.int):\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=np.finfo(np.float).eps,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=np.finfo(np.float).eps, positive=False):\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1097: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1344: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1480: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  precompute=False, eps=np.finfo(np.float).eps,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:320: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=np.finfo(np.float).eps, random_state=None,\n",
      "/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:580: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  eps=4 * np.finfo(np.float).eps, n_jobs=None,\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import csv\n",
    "import lzma\n",
    "import gensim.downloader\n",
    "from nltk import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "involved-understanding",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = pd.read_table('in.tsv', sep='\\t', header=None, quoting=3)\n",
    "y_train = pd.read_table('expected.tsv', sep='\\t', header=None, quoting=3)\n",
    "#x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n",
    "#x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "collaborative-cincinnati",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "module 'torch' has no attribute 'nn'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-11c9482004ae>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#print('inicjalizacja modelu')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mclass\u001b[0m \u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mModule\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ml01\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m300\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: module 'torch' has no attribute 'nn'"
     ]
    }
   ],
   "source": [
    "#print('initializing the model')\n",
    "class NeuralNetworkModel(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNetworkModel, self).__init__()\n",
    "        self.l01 = torch.nn.Linear(300, 300)\n",
    "        self.l02 = torch.nn.Linear(300, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.l01(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.l02(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hydraulic-business",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('preparing the data')\n",
    "\n",
    "x_train = x_train[0].str.lower()\n",
    "y_train = y_train[0]\n",
    "x_dev = x_dev[0].str.lower()\n",
    "x_test = x_test[0].str.lower()\n",
    "\n",
    "x_train = [word_tokenize(x) for x in x_train]\n",
    "x_dev = [word_tokenize(x) for x in x_dev]\n",
    "x_test = [word_tokenize(x) for x in x_test]\n",
    "\n",
    "word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
    "x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
    "x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
    "x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "heavy-sandwich",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('training the model')\n",
    "model = NeuralNetworkModel()\n",
    "BATCH_SIZE = 5\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
    "\n",
    "for epoch in range(BATCH_SIZE):\n",
    "    model.train()\n",
    "    for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
    "        X = x_train[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(X)\n",
    "        y = y_train[i:i + BATCH_SIZE]\n",
    "        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
    "        optimizer.zero_grad()\n",
    "        outputs = model(X.float())\n",
    "        loss = criterion(outputs, y)\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "small-pavilion",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('predicting the results')\n",
    "y_dev = []\n",
    "y_test = []\n",
    "model.eval()\n",
    "\n",
    "with torch.no_grad():\n",
    "    for i in range(0, len(x_dev), BATCH_SIZE):\n",
    "        X = x_dev[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(X)\n",
    "        outputs = model(X.float())\n",
    "        prediction = (outputs > 0.5)\n",
    "        y_dev += prediction.tolist()\n",
    "\n",
    "    for i in range(0, len(x_test), BATCH_SIZE):\n",
    "        X = x_test[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(X)\n",
    "        outputs = model(X.float())\n",
    "        prediction = (outputs >= 0.5)\n",
    "        y_test += prediction.tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "toxic-pendant",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print('exporting to files')\n",
    "y_dev = np.asarray(y_dev, dtype=np.int32)\n",
    "y_test = np.asarray(y_test, dtype=np.int32)\n",
    "y_dev.tofile('./dev-0/out.tsv', sep='\\n')\n",
    "y_test.tofile('./test-A/out.tsv', sep='\\n')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
5272 dev-0/.ipynb_checkpoints/out-checkpoint.tsv (new file)
File diff suppressed because it is too large
5272 dev-0/in.tsv (new file)
File diff suppressed because one or more lines are too long
5272 dev-0/out.tsv (new file)
File diff suppressed because it is too large
731 run.ipynb (new file)
@@ -0,0 +1,731 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "retired-freeze",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import csv\n",
    "import lzma\n",
    "import gensim.downloader\n",
    "from nltk import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "colored-calculation",
   "metadata": {},
   "outputs": [],
   "source": [
    "#def read_file(filename):\n",
    "#    result = []\n",
    "#    with open(filename, 'r', encoding=\"utf-8\") as file:\n",
    "#        for line in file:\n",
    "#            text = line.split(\"\\t\")[0].strip()\n",
    "#            result.append(text)\n",
    "#    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "secondary-worse",
   "metadata": {},
   "outputs": [
    {
     "data": {
"text/plain": [
|
||||||
|
" 0 1\n",
|
||||||
|
"0 have you had an medical issues recently? 1335187994\n",
|
||||||
|
"1 It's supposedly aluminum, barium, and strontiu... 1346187161\n",
|
||||||
|
"2 Nobel prizes don't make you rich. 1337160218\n",
|
||||||
|
"3 I came for the article, I stayed for the doctor. 1277674344\n",
|
||||||
|
"4 you resorted to insults AND got owned directly... 1348538535\n",
|
||||||
|
"... ... ...\n",
|
||||||
|
"199995 It's really sad. My sister used to believe tha... 1334111989\n",
|
||||||
|
"199996 I don't mean it in a dickish way, I'm being se... 1322700456\n",
|
||||||
|
"199997 Fair enough, I stand corrected. 1354646212\n",
|
||||||
|
"199998 Right. Scientists tend to think and conclude l... 1348777201\n",
|
||||||
|
"199999 Because they are illiterate 1249579722\n",
|
||||||
|
"\n",
|
||||||
|
"[200000 rows x 2 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"x_train = pd.read_table('train/in.tsv', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"x_train = x_train[0:200000]\n",
|
||||||
|
"x_train"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "royal-roots",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
"text/plain": [
|
||||||
|
" 0\n",
|
||||||
|
"0 1\n",
|
||||||
|
"1 0\n",
|
||||||
|
"2 0\n",
|
||||||
|
"3 0\n",
|
||||||
|
"4 0\n",
|
||||||
|
"... ..\n",
|
||||||
|
"199995 0\n",
|
||||||
|
"199996 0\n",
|
||||||
|
"199997 1\n",
|
||||||
|
"199998 1\n",
|
||||||
|
"199999 0\n",
|
||||||
|
"\n",
|
||||||
|
"[200000 rows x 1 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" y_train = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"y_train = y_train[0:200000]\n",
|
||||||
|
"y_train"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "protective-hometown",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
"text/plain": [
|
||||||
|
" 0 1\n",
|
||||||
|
"0 In which case, tell them I'm in work, or dead,... 1328302967\n",
|
||||||
|
"1 Put me down as another for Mysterious Universe... 1347836881\n",
|
||||||
|
"2 The military of any country would never admit ... 1331905826\n",
|
||||||
|
"3 An example would have been more productive tha... 1315584834\n",
|
||||||
|
"4 sorry, but the authors of this article admit t... 1347389166\n",
|
||||||
|
"... ... ...\n",
|
||||||
|
"5267 Your fault for going at all. That's how we get... 1308176634\n",
|
||||||
|
"5268 EVP....that's a shot in the GH drinking game. 1354408646\n",
|
||||||
|
"5269 i think a good hard massage is good for you. t... 1305726318\n",
|
||||||
|
"5270 Interesting theory. Makes my imagination run w... 1339839088\n",
|
||||||
|
"5271 Tampering of candy? More like cooking somethin... 1320262659\n",
|
||||||
|
"\n",
|
||||||
|
"[5272 rows x 2 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" x_dev = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"x_dev"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "attractive-banana",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
"text/plain": [
|
||||||
|
" 0 1\n",
|
||||||
|
"0 Gentleman, I believe we can agree that this is... 1304170330\n",
|
||||||
|
"1 The problem is that it will just turn it r/nos... 1353763204\n",
|
||||||
|
"2 Well, according to some Christian apologists, ... 1336314173\n",
|
||||||
|
"3 Don't know if this is what you are looking for... 1348860314\n",
|
||||||
|
"4 I respect what you're saying completely. I jus... 1341285952\n",
|
||||||
|
"... ... ...\n",
|
||||||
|
"5147 GAMBIT 1326441107\n",
|
||||||
|
"5148 >Joe Rogan is no snake oil salesman.\\n\\nHe ... 1319464245\n",
|
||||||
|
"5149 Reading further, Sagan does seem to agree with... 1322126150\n",
|
||||||
|
"5150 Notice that they never invoke god, or any othe... 1307679295\n",
|
||||||
|
"5151 They might co-ordinate an anniversary attack o... 1342409261\n",
|
||||||
|
"\n",
|
||||||
|
"[5152 rows x 2 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" x_test = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"x_test"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "realistic-television",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||||
|
" def __init__(self):\n",
|
||||||
|
" super(NeuralNetworkModel, self).__init__()\n",
|
||||||
|
" self.l01 = torch.nn.Linear(300, 300)\n",
|
||||||
|
" self.l02 = torch.nn.Linear(300, 1)\n",
|
||||||
|
"\n",
|
||||||
|
" def forward(self, x):\n",
|
||||||
|
" x = self.l01(x)\n",
|
||||||
|
" x = torch.relu(x)\n",
|
||||||
|
" x = self.l02(x)\n",
|
||||||
|
" x = torch.sigmoid(x)\n",
|
||||||
|
" return x\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "prescription-throat",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"x_train = x_train[0].str.lower()\n",
|
||||||
|
"y_train = y_train[0]\n",
|
||||||
|
"x_dev = x_dev[0].str.lower()\n",
|
||||||
|
"x_test = x_test[0].str.lower()\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = [word_tokenize(x) for x in x_train]\n",
|
||||||
|
"x_dev = [word_tokenize(x) for x in x_dev]\n",
|
||||||
|
"x_test = [word_tokenize(x) for x in x_test]\n",
|
||||||
|
"#x_test"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "distinguished-french",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5152"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(x_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "shared-divorce",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5152"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from gensim.test.utils import common_texts\n",
|
||||||
|
"from gensim.models import Word2Vec\n",
|
||||||
|
"\n",
|
||||||
|
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
|
||||||
|
"x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
|
||||||
|
"x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
|
||||||
|
"x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]\n",
|
||||||
|
"len(x_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"id": "japanese-broad",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"model = NeuralNetworkModel()\n",
|
||||||
|
"BATCH_SIZE = 5\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
|
||||||
|
"\n",
|
||||||
|
"for epoch in range(BATCH_SIZE):\n",
|
||||||
|
" model.train()\n",
|
||||||
|
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
|
||||||
|
" X = x_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" y = y_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
|
||||||
|
" optimizer.zero_grad()\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" loss = criterion(outputs, y)\n",
|
||||||
|
" loss.backward()\n",
|
||||||
|
" optimizer.step()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"id": "decent-initial",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5152"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"y_dev = []\n",
|
||||||
|
"y_test = []\n",
|
||||||
|
"model.eval()\n",
|
||||||
|
"\n",
|
||||||
|
"with torch.no_grad():\n",
|
||||||
|
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
|
||||||
|
" X = x_dev[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" prediction = (outputs > 0.5)\n",
|
||||||
|
" y_dev += prediction.tolist()\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(0, len(x_test), BATCH_SIZE):\n",
|
||||||
|
" X = x_test[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" prediction = (outputs >= 0.5)\n",
|
||||||
|
" y_test += prediction.tolist()\n",
|
||||||
|
"len(y_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "guilty-auditor",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"2062"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
|
||||||
|
"y_test = np.asarray(y_test, dtype=np.int32)\n",
|
||||||
|
"len(y_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "unavailable-morrison",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y_dev.tofile('./dev-0/out.tsv', sep='\\n')\n",
|
||||||
|
"y_test.tofile('./test-A/out.tsv', sep='\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "polished-france",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!jupyter nbconvert --to script run.ipynb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "underlying-lightning",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
182 run.py (new file)
@@ -0,0 +1,182 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
# In[1]:
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import torch
|
||||||
|
import csv
|
||||||
|
import lzma
|
||||||
|
import gensim.downloader
|
||||||
|
from nltk import word_tokenize
|
||||||
|
|
||||||
|
|
||||||
|
# In[2]:
|
||||||
|
|
||||||
|
|
||||||
|
#def read_file(filename):
|
||||||
|
# result = []
|
||||||
|
# with open(filename, 'r', encoding="utf-8") as file:
|
||||||
|
# for line in file:
|
||||||
|
# text = line.split("\t")[0].strip()
|
||||||
|
# result.append(text)
|
||||||
|
# return result
|
||||||
|
|
||||||
|
|
||||||
|
# In[3]:
|
||||||
|
|
||||||
|
|
||||||
|
x_train = pd.read_table('train/in.tsv', sep='\t', header=None, quoting=3)
|
||||||
|
x_train = x_train[0:200000]
|
||||||
|
x_train
|
||||||
|
|
||||||
|
|
||||||
|
# In[4]:
|
||||||
|
|
||||||
|
|
||||||
|
with open('train/expected.tsv', 'r', encoding='utf8') as file:
|
||||||
|
y_train = pd.read_csv(file, sep='\t', header=None)
|
||||||
|
y_train = y_train[0:200000]
|
||||||
|
y_train
|
||||||
|
|
||||||
|
|
||||||
|
# In[5]:
|
||||||
|
|
||||||
|
|
||||||
|
with open('dev-0/in.tsv', 'r', encoding='utf8') as file:
|
||||||
|
x_dev = pd.read_csv(file, sep='\t', header=None)
|
||||||
|
x_dev
|
||||||
|
|
||||||
|
|
||||||
|
# In[6]:
|
||||||
|
|
||||||
|
|
||||||
|
with open('test-A/in.tsv', 'r', encoding='utf8') as file:
|
||||||
|
x_test = pd.read_csv(file, sep='\t', header=None)
|
||||||
|
x_test
|
||||||
|
|
||||||
|
|
||||||
|
# In[7]:
|
||||||
|
|
||||||
|
|
||||||
|
class NeuralNetworkModel(torch.nn.Module):
|
||||||
|
def __init__(self):
|
||||||
|
super(NeuralNetworkModel, self).__init__()
|
||||||
|
self.l01 = torch.nn.Linear(300, 300)
|
||||||
|
self.l02 = torch.nn.Linear(300, 1)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.l01(x)
|
||||||
|
x = torch.relu(x)
|
||||||
|
x = self.l02(x)
|
||||||
|
x = torch.sigmoid(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
# In[8]:
|
||||||
|
|
||||||
|
|
||||||
|
x_train = x_train[0].str.lower()
|
||||||
|
y_train = y_train[0]
|
||||||
|
x_dev = x_dev[0].str.lower()
|
||||||
|
x_test = x_test[0].str.lower()
|
||||||
|
|
||||||
|
x_train = [word_tokenize(x) for x in x_train]
|
||||||
|
x_dev = [word_tokenize(x) for x in x_dev]
|
||||||
|
x_test = [word_tokenize(x) for x in x_test]
|
||||||
|
#x_test
|
||||||
|
|
||||||
|
|
||||||
|
# In[9]:
|
||||||
|
|
||||||
|
|
||||||
|
len(x_test)
|
||||||
|
|
||||||
|
|
||||||
|
# In[10]:
|
||||||
|
|
||||||
|
|
||||||
|
from gensim.test.utils import common_texts
|
||||||
|
from gensim.models import Word2Vec
|
||||||
|
|
||||||
|
word2vec = gensim.downloader.load('word2vec-google-news-300')
|
||||||
|
x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]
|
||||||
|
x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]
|
||||||
|
x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]
|
||||||
|
len(x_test)
|
||||||
|
|
||||||
|
|
||||||
|
# In[15]:


model = NeuralNetworkModel()
BATCH_SIZE = 5
EPOCHS = 5  # passes over the training data, kept distinct from the batch size
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(EPOCHS):
    model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        X = x_train[i:i + BATCH_SIZE]
        X = torch.tensor(np.array(X))  # stack the list of 300-dim vectors first
        y = y_train[i:i + BATCH_SIZE]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        optimizer.zero_grad()
        outputs = model(X.float())
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

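# A cheap convergence check (editor's sketch): after training, mean BCE on a
# slice of the training data should sit well below log(2) ~= 0.693, the loss
# of an uninformed 0.5 predictor.
with torch.no_grad():
    _sX = torch.tensor(np.array(x_train[:100])).float()
    _sy = torch.tensor(y_train[:100].astype(np.float32).to_numpy()).reshape(-1, 1)
    print('train BCE on first 100 docs:', criterion(model(_sX), _sy).item())
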
# In[17]:


y_dev = []
y_test = []
model.eval()

with torch.no_grad():
    for i in range(0, len(x_dev), BATCH_SIZE):
        X = x_dev[i:i + BATCH_SIZE]
        X = torch.tensor(np.array(X))
        outputs = model(X.float())
        prediction = (outputs > 0.5)
        y_dev += prediction.tolist()

    for i in range(0, len(x_test), BATCH_SIZE):
        X = x_test[i:i + BATCH_SIZE]
        X = torch.tensor(np.array(X))
        outputs = model(X.float())
        prediction = (outputs > 0.5)  # same 0.5 threshold as for dev
        y_test += prediction.tolist()

len(y_test)

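# If the dev gold labels are available locally (the path below is an
# assumption about the challenge layout), dev accuracy is one call:
#
#     from sklearn.metrics import accuracy_score
#     dev_gold = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)[0]
#     print(accuracy_score(dev_gold, np.asarray(y_dev, dtype=np.int32)))
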
# In[13]:


y_dev = np.asarray(y_dev, dtype=np.int32)
y_test = np.asarray(y_test, dtype=np.int32)
len(y_test)

# In[ ]:


# One prediction per line.
y_dev.tofile('./dev-0/out.tsv', sep='\n')
y_test.tofile('./test-A/out.tsv', sep='\n')

# In[ ]:


get_ipython().system('jupyter nbconvert --to script run.ipynb')


# In[ ]:

116
run_transformer.ipynb
Normal file
@ -0,0 +1,116 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "promotional-stage",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import csv\n",
    "import lzma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gothic-olympus",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = pd.read_table('train/in.tsv', sep='\\t', header=None, quoting=3)\n",
    "#x_train = x_train[0:200000]\n",
    "#x_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "respiratory-train",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
    "    y_train = pd.read_csv(file, sep='\\t', header=None)\n",
    "#y_train = y_train[0:200000]\n",
    "#y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "loving-sewing",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
    "    x_dev = pd.read_csv(file, sep='\\t', header=None)\n",
    "#x_dev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aware-applicant",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
    "    x_test = pd.read_csv(file, sep='\\t', header=None)\n",
    "#x_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "lovely-density",
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/facebookresearch/fairseq/issues/2666"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "occasional-banks",
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/huggingface/hf_gpt2.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "human-portal",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
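The notebook above loads the data and then stops at two fairseq/GPT-2 reference
links without building a model. A minimal sketch of the direction those links
point in, using the Hugging Face transformers package rather than fairseq (the
package choice, the 'gpt2' checkpoint, and num_labels=2 are assumptions, not
code from this repository):

    import torch
    from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
    model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
    model.config.pad_token_id = tokenizer.pad_token_id

    batch = tokenizer(['first document', 'second document'],
                      truncation=True, padding=True, return_tensors='pt')
    with torch.no_grad():
        logits = model(**batch).logits  # shape (2, 2): one row per document
    print(logits.argmax(dim=-1))        # predicted class per document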
180
sceptic.ipynb
Normal file
@ -0,0 +1,180 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "equal-singles",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import csv\n",
    "import lzma\n",
    "import gensim.downloader\n",
    "from nltk import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "involved-understanding",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = pd.read_table('in.tsv', sep='\\t', header=None, quoting=3)\n",
    "y_train = pd.read_table('expected.tsv', sep='\\t', header=None, quoting=3)\n",
    "#x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n",
    "#x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "collaborative-cincinnati",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "module 'torch' has no attribute 'nn'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-11c9482004ae>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#print('inicjalizacja modelu')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mclass\u001b[0m \u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mModule\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ml01\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m300\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: module 'torch' has no attribute 'nn'"
     ]
    }
   ],
   "source": [
    "#print('model initialization')\n",
    "class NeuralNetworkModel(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNetworkModel, self).__init__()\n",
    "        self.l01 = torch.nn.Linear(300, 300)\n",
    "        self.l02 = torch.nn.Linear(300, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.l01(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.l02(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hydraulic-business",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('data preparation')\n",
    "\n",
    "x_train = x_train[0].str.lower()\n",
    "x_dev = x_dev[0].str.lower()\n",
    "x_test = x_test[0].str.lower()\n",
    "\n",
    "x_train = [word_tokenize(x) for x in x_train]\n",
    "x_dev = [word_tokenize(x) for x in x_dev]\n",
    "x_test = [word_tokenize(x) for x in x_test]\n",
    "\n",
    "word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
    "x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
    "x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
    "x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "heavy-sandwich",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('model training')\n",
    "model = NeuralNetworkModel()\n",
    "BATCH_SIZE = 5\n",
    "EPOCHS = 5\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
    "\n",
    "for epoch in range(EPOCHS):\n",
    "    model.train()\n",
    "    for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
    "        X = x_train[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(np.array(X))\n",
    "        y = y_train[i:i + BATCH_SIZE]\n",
    "        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
    "        optimizer.zero_grad()\n",
    "        outputs = model(X.float())\n",
    "        loss = criterion(outputs, y)\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "small-pavilion",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print('predicting results')\n",
    "y_dev = []\n",
    "y_test = []\n",
    "model.eval()\n",
    "\n",
    "with torch.no_grad():\n",
    "    for i in range(0, len(x_dev), BATCH_SIZE):\n",
    "        X = x_dev[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(np.array(X))\n",
    "        outputs = model(X.float())\n",
    "        prediction = (outputs > 0.5)\n",
    "        y_dev += prediction.tolist()\n",
    "\n",
    "    for i in range(0, len(x_test), BATCH_SIZE):\n",
    "        X = x_test[i:i + BATCH_SIZE]\n",
    "        X = torch.tensor(np.array(X))\n",
    "        outputs = model(X.float())\n",
    "        prediction = (outputs > 0.5)\n",
    "        y_test += prediction.tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "toxic-pendant",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print('exporting to files')\n",
    "y_dev = np.asarray(y_dev, dtype=np.int32)\n",
    "y_test = np.asarray(y_test, dtype=np.int32)\n",
    "y_dev.tofile('./dev-0/out.tsv', sep='\\n')\n",
    "y_test.tofile('./test-A/out.tsv', sep='\\n')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
2062
test-A/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
File diff suppressed because it is too large
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
289579
train/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
289579
train/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because one or more lines are too long
289579
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
BIN
word2vec.model
Normal file
Binary file not shown.