{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "74100403-147c-42cd-8285-e30778c0fb66", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import torch\n", "import csv\n", "import lzma\n", "import gensim.downloader\n", "from nltk import word_tokenize" ] }, { "cell_type": "code", "execution_count": null, "id": "cbe60d7b-850e-4838-b4ce-672f13bf2bb2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46", "metadata": {}, "outputs": [], "source": [ "def predict_year(x, path_out, model):\n", " results = model.predict(x)\n", " with open(path_out, 'wt') as file:\n", " for r in results:\n", " file.write(str(r) + '\\n') " ] }, { "cell_type": "code", "execution_count": 3, "id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6", "metadata": {}, "outputs": [], "source": [ "def read_file(filename):\n", " result = []\n", " with open(filename, 'r', encoding=\"utf-8\") as file:\n", " for line in file:\n", " text = line.split(\"\\t\")[0].strip()\n", " result.append(text)\n", " return result" ] }, { "cell_type": "code", "execution_count": 4, "id": "86fbfb79-76e7-49f5-b722-2827f93cb03f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0have you had an medical issues recently?1335187994
1It's supposedly aluminum, barium, and strontiu...1346187161
2Nobel prizes don't make you rich.1337160218
3I came for the article, I stayed for the doctor.1277674344
4you resorted to insults AND got owned directly...1348538535
.........
199995It's really sad. My sister used to believe tha...1334111989
199996I don't mean it in a dickish way, I'm being se...1322700456
199997Fair enough, I stand corrected.1354646212
199998Right. Scientists tend to think and conclude l...1348777201
199999Because they are illiterate1249579722
\n", "

200000 rows × 2 columns

\n", "
" ], "text/plain": [ " 0 1\n", "0 have you had an medical issues recently? 1335187994\n", "1 It's supposedly aluminum, barium, and strontiu... 1346187161\n", "2 Nobel prizes don't make you rich. 1337160218\n", "3 I came for the article, I stayed for the doctor. 1277674344\n", "4 you resorted to insults AND got owned directly... 1348538535\n", "... ... ...\n", "199995 It's really sad. My sister used to believe tha... 1334111989\n", "199996 I don't mean it in a dickish way, I'm being se... 1322700456\n", "199997 Fair enough, I stand corrected. 1354646212\n", "199998 Right. Scientists tend to think and conclude l... 1348777201\n", "199999 Because they are illiterate 1249579722\n", "\n", "[200000 rows x 2 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train = pd.read_table('train/in.tsv', sep='\\t', header=None, quoting=3)\n", "x_train = x_train[0:200000]\n", "x_train" ] }, { "cell_type": "code", "execution_count": 5, "id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
01
10
20
30
40
......
1999950
1999960
1999971
1999981
1999990
\n", "

200000 rows × 1 columns

\n", "
" ], "text/plain": [ " 0\n", "0 1\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", "... ..\n", "199995 0\n", "199996 0\n", "199997 1\n", "199998 1\n", "199999 0\n", "\n", "[200000 rows x 1 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open('train/expected.tsv', 'r', encoding='utf8') as file:\n", " y_train = pd.read_csv(file, sep='\\t', header=None)\n", "y_train = y_train[0:200000]\n", "y_train" ] }, { "cell_type": "code", "execution_count": 6, "id": "6b27e6ce-e9fd-41a1-aacf-53a5fde0a7c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0In which case, tell them I'm in work, or dead,...1328302967
1Put me down as another for Mysterious Universe...1347836881
2The military of any country would never admit ...1331905826
3An example would have been more productive tha...1315584834
4sorry, but the authors of this article admit t...1347389166
.........
5267Your fault for going at all. That's how we get...1308176634
5268EVP....that's a shot in the GH drinking game.1354408646
5269i think a good hard massage is good for you. t...1305726318
5270Interesting theory. Makes my imagination run w...1339839088
5271Tampering of candy? More like cooking somethin...1320262659
\n", "

5272 rows × 2 columns

\n", "
" ], "text/plain": [ " 0 1\n", "0 In which case, tell them I'm in work, or dead,... 1328302967\n", "1 Put me down as another for Mysterious Universe... 1347836881\n", "2 The military of any country would never admit ... 1331905826\n", "3 An example would have been more productive tha... 1315584834\n", "4 sorry, but the authors of this article admit t... 1347389166\n", "... ... ...\n", "5267 Your fault for going at all. That's how we get... 1308176634\n", "5268 EVP....that's a shot in the GH drinking game. 1354408646\n", "5269 i think a good hard massage is good for you. t... 1305726318\n", "5270 Interesting theory. Makes my imagination run w... 1339839088\n", "5271 Tampering of candy? More like cooking somethin... 1320262659\n", "\n", "[5272 rows x 2 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n", " x_dev = pd.read_csv(file, sep='\\t', header=None)\n", "x_dev" ] }, { "cell_type": "code", "execution_count": 7, "id": "99ae526d-9b7c-493f-be4f-f95b1c8f4b81", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0Gentleman, I believe we can agree that this is...1304170330
1The problem is that it will just turn it r/nos...1353763204
2Well, according to some Christian apologists, ...1336314173
3Don't know if this is what you are looking for...1348860314
4I respect what you're saying completely. I jus...1341285952
.........
5147GAMBIT1326441107
5148>Joe Rogan is no snake oil salesman.\\n\\nHe ...1319464245
5149Reading further, Sagan does seem to agree with...1322126150
5150Notice that they never invoke god, or any othe...1307679295
5151They might co-ordinate an anniversary attack o...1342409261
\n", "

5152 rows × 2 columns

\n", "
" ], "text/plain": [ " 0 1\n", "0 Gentleman, I believe we can agree that this is... 1304170330\n", "1 The problem is that it will just turn it r/nos... 1353763204\n", "2 Well, according to some Christian apologists, ... 1336314173\n", "3 Don't know if this is what you are looking for... 1348860314\n", "4 I respect what you're saying completely. I jus... 1341285952\n", "... ... ...\n", "5147 GAMBIT 1326441107\n", "5148 >Joe Rogan is no snake oil salesman.\\n\\nHe ... 1319464245\n", "5149 Reading further, Sagan does seem to agree with... 1322126150\n", "5150 Notice that they never invoke god, or any othe... 1307679295\n", "5151 They might co-ordinate an anniversary attack o... 1342409261\n", "\n", "[5152 rows x 2 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n", " x_test = pd.read_csv(file, sep='\\t', header=None)\n", "x_test" ] }, { "cell_type": "code", "execution_count": 8, "id": "dba17668-971f-47f8-99ce-fc840b5cb74a", "metadata": {}, "outputs": [], "source": [ "class NeuralNetworkModel(torch.nn.Module):\n", " def __init__(self):\n", " super(NeuralNetworkModel, self).__init__()\n", " self.l01 = torch.nn.Linear(300, 300)\n", " self.l02 = torch.nn.Linear(300, 1)\n", "\n", " def forward(self, x):\n", " x = self.l01(x)\n", " x = torch.relu(x)\n", " x = self.l02(x)\n", " x = torch.sigmoid(x)\n", " return x\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "1a275c1d-75bc-4290-9332-56396d16a0f2", "metadata": {}, "outputs": [], "source": [ "x_train = x_train[0].str.lower()\n", "y_train = y_train[0]\n", "x_dev = x_dev[0].str.lower()\n", "x_test = x_test[0].str.lower()\n", "\n", "x_train = [word_tokenize(x) for x in x_train]\n", "x_dev = [word_tokenize(x) for x in x_dev]\n", "x_test = [word_tokenize(x) for x in x_test]" ] }, { "cell_type": "code", "execution_count": 11, "id": "031a3670-3be7-4146-97b4-0dacd4f9ae58", "metadata": {}, "outputs": [], "source": [ "from gensim.test.utils import common_texts\n", "from gensim.models import Word2Vec\n", "\n", "word2vec = gensim.downloader.load('word2vec-google-news-300')\n", "x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n", "x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n", "x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]" ] }, { "cell_type": "code", "execution_count": 12, "id": "b7defd18-e281-4cf6-9941-cee560749677", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\korne\\AppData\\Local\\Temp\\ipykernel_22024\\3484013121.py:10: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\torch\\csrc\\utils\\tensor_new.cpp:210.)\n", " X = torch.tensor(X)\n" ] } ], "source": [ "model = NeuralNetworkModel()\n", "BATCH_SIZE = 5\n", "criterion = torch.nn.BCELoss()\n", "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n", "\n", "for epoch in range(BATCH_SIZE):\n", " model.train()\n", " for i in range(0, y_train.shape[0], BATCH_SIZE):\n", " X = x_train[i:i + BATCH_SIZE]\n", " X = torch.tensor(X)\n", " y = y_train[i:i + BATCH_SIZE]\n", " y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n", " optimizer.zero_grad()\n", " outputs = model(X.float())\n", " loss = criterion(outputs, y)\n", " loss.backward()\n", " optimizer.step()" ] }, { "cell_type": "code", "execution_count": 13, "id": "92c69ddd-fe58-477f-b2c2-06324a983bcc", "metadata": {}, "outputs": [], "source": [ "y_dev = []\n", "y_test = []\n", "model.eval()\n", "\n", "with torch.no_grad():\n", " for i in range(0, len(x_dev), BATCH_SIZE):\n", " X = x_dev[i:i + BATCH_SIZE]\n", " X = torch.tensor(X)\n", " outputs = model(X.float())\n", " prediction = (outputs > 0.5)\n", " y_dev += prediction.tolist()\n", "\n", " for i in range(0, len(x_test), BATCH_SIZE):\n", " X = x_test[i:i + BATCH_SIZE]\n", " X = torch.tensor(X)\n", " outputs = model(X.float())\n", " y = (outputs >= 0.5)\n", " y_test += prediction.tolist()" ] }, { "cell_type": "code", "execution_count": 14, "id": "caff921c-d0ab-4fce-a17f-6610266b404d", "metadata": {}, "outputs": [], "source": [ "y_dev = np.asarray(y_dev, dtype=np.int32)\n", "y_test = np.asarray(y_test, dtype=np.int32)" ] }, { "cell_type": "code", "execution_count": 15, "id": "73076eb2-810f-4f85-aa3f-05ee884c413b", "metadata": {}, "outputs": [], "source": [ "with open('./dev-0/out.tsv', 'wt') as file:\n", " for r in y_dev:\n", " file.write(str(r) + '\\n') " ] }, { "cell_type": "code", "execution_count": 16, "id": "ddda251c-cafa-40f8-a020-48310a9f23b6", "metadata": {}, "outputs": [], "source": [ "with open('./test-A/out.tsv', 'wt') as file:\n", " for r in y_test:\n", " file.write(str(r) + '\\n') " ] }, { "cell_type": "code", "execution_count": 17, "id": "5730562a-0200-4c8f-8f73-992fa2b36133", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[NbConvertApp] Converting notebook run.ipynb to script\n", "[NbConvertApp] Writing 3816 bytes to run.py\n" ] } ], "source": [ "!jupyter nbconvert --to script run.ipynb" ] }, { "cell_type": "code", "execution_count": null, "id": "07a09298-204c-4905-90a8-5dcb87877368", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }