paranormal-or-skeptic-ISI-p.../run.ipynb

727 lines
22 KiB
Plaintext
Raw Normal View History

2022-06-14 23:36:56 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "74100403-147c-42cd-8285-e30778c0fb66",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import csv\n",
"import lzma\n",
"import gensim.downloader\n",
"from nltk import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbe60d7b-850e-4838-b4ce-672f13bf2bb2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
"    \"\"\"Write the predictions of `model` for `x` to `path_out`, one per line.\n",
"\n",
"    `model` must expose a `predict(x)` method returning an iterable; every\n",
"    item is converted with `str()` and followed by a newline.\n",
"    \"\"\"\n",
"    predictions = model.predict(x)\n",
"    with open(path_out, 'wt') as out_file:\n",
"        out_file.writelines(str(value) + '\\n' for value in predictions)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
"metadata": {},
"outputs": [],
"source": [
"def read_file(filename):\n",
"    \"\"\"Return the first tab-separated field of each line of a UTF-8 file.\n",
"\n",
"    Surrounding whitespace is stripped from every field; the result keeps the\n",
"    order of the lines in `filename`.\n",
"    \"\"\"\n",
"    with open(filename, 'r', encoding=\"utf-8\") as handle:\n",
"        return [row.split(\"\\t\")[0].strip() for row in handle]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>have you had an medical issues recently?</td>\n",
" <td>1335187994</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>It's supposedly aluminum, barium, and strontiu...</td>\n",
" <td>1346187161</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Nobel prizes don't make you rich.</td>\n",
" <td>1337160218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I came for the article, I stayed for the doctor.</td>\n",
" <td>1277674344</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>you resorted to insults AND got owned directly...</td>\n",
" <td>1348538535</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199995</th>\n",
" <td>It's really sad. My sister used to believe tha...</td>\n",
" <td>1334111989</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199996</th>\n",
" <td>I don't mean it in a dickish way, I'm being se...</td>\n",
" <td>1322700456</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199997</th>\n",
" <td>Fair enough, I stand corrected.</td>\n",
" <td>1354646212</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199998</th>\n",
" <td>Right. Scientists tend to think and conclude l...</td>\n",
" <td>1348777201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199999</th>\n",
" <td>Because they are illiterate</td>\n",
" <td>1249579722</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 have you had an medical issues recently? 1335187994\n",
"1 It's supposedly aluminum, barium, and strontiu... 1346187161\n",
"2 Nobel prizes don't make you rich. 1337160218\n",
"3 I came for the article, I stayed for the doctor. 1277674344\n",
"4 you resorted to insults AND got owned directly... 1348538535\n",
"... ... ...\n",
"199995 It's really sad. My sister used to believe tha... 1334111989\n",
"199996 I don't mean it in a dickish way, I'm being se... 1322700456\n",
"199997 Fair enough, I stand corrected. 1354646212\n",
"199998 Right. Scientists tend to think and conclude l... 1348777201\n",
"199999 Because they are illiterate 1249579722\n",
"\n",
"[200000 rows x 2 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the training inputs and keep the first 200000 rows.\n",
"# csv.QUOTE_NONE (== 3) makes pandas treat double quotes as ordinary characters.\n",
"x_train = pd.read_table('train/in.tsv', sep='\\t', header=None, quoting=csv.QUOTE_NONE).head(200000)\n",
"x_train"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199995</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199996</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199997</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199998</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199999</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200000 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 1\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"... ..\n",
"199995 0\n",
"199996 0\n",
"199997 1\n",
"199998 1\n",
"199999 0\n",
"\n",
"[200000 rows x 1 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the training labels and keep the first 200000 rows.\n",
"with open('train/expected.tsv', 'r', encoding='utf8') as labels_file:\n",
"    y_train = pd.read_csv(labels_file, sep='\\t', header=None).head(200000)\n",
"y_train"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6b27e6ce-e9fd-41a1-aacf-53a5fde0a7c1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>In which case, tell them I'm in work, or dead,...</td>\n",
" <td>1328302967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Put me down as another for Mysterious Universe...</td>\n",
" <td>1347836881</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The military of any country would never admit ...</td>\n",
" <td>1331905826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>An example would have been more productive tha...</td>\n",
" <td>1315584834</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>sorry, but the authors of this article admit t...</td>\n",
" <td>1347389166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5267</th>\n",
" <td>Your fault for going at all. That's how we get...</td>\n",
" <td>1308176634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5268</th>\n",
" <td>EVP....that's a shot in the GH drinking game.</td>\n",
" <td>1354408646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5269</th>\n",
" <td>i think a good hard massage is good for you. t...</td>\n",
" <td>1305726318</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5270</th>\n",
" <td>Interesting theory. Makes my imagination run w...</td>\n",
" <td>1339839088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5271</th>\n",
" <td>Tampering of candy? More like cooking somethin...</td>\n",
" <td>1320262659</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5272 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 In which case, tell them I'm in work, or dead,... 1328302967\n",
"1 Put me down as another for Mysterious Universe... 1347836881\n",
"2 The military of any country would never admit ... 1331905826\n",
"3 An example would have been more productive tha... 1315584834\n",
"4 sorry, but the authors of this article admit t... 1347389166\n",
"... ... ...\n",
"5267 Your fault for going at all. That's how we get... 1308176634\n",
"5268 EVP....that's a shot in the GH drinking game. 1354408646\n",
"5269 i think a good hard massage is good for you. t... 1305726318\n",
"5270 Interesting theory. Makes my imagination run w... 1339839088\n",
"5271 Tampering of candy? More like cooking somethin... 1320262659\n",
"\n",
"[5272 rows x 2 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read dev-0 with quoting disabled, exactly like the training inputs: the\n",
"# comments contain literal double quotes, and pandas' default quoting would\n",
"# strip them or merge rows whenever a quote is left unbalanced, which would\n",
"# misalign the predictions with the input lines.\n",
"with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
"    x_dev = pd.read_csv(file, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"x_dev"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "99ae526d-9b7c-493f-be4f-f95b1c8f4b81",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Gentleman, I believe we can agree that this is...</td>\n",
" <td>1304170330</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>The problem is that it will just turn it r/nos...</td>\n",
" <td>1353763204</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Well, according to some Christian apologists, ...</td>\n",
" <td>1336314173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Don't know if this is what you are looking for...</td>\n",
" <td>1348860314</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>I respect what you're saying completely. I jus...</td>\n",
" <td>1341285952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5147</th>\n",
" <td>GAMBIT</td>\n",
" <td>1326441107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5148</th>\n",
" <td>&amp;gt;Joe Rogan is no snake oil salesman.\\n\\nHe ...</td>\n",
" <td>1319464245</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5149</th>\n",
" <td>Reading further, Sagan does seem to agree with...</td>\n",
" <td>1322126150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5150</th>\n",
" <td>Notice that they never invoke god, or any othe...</td>\n",
" <td>1307679295</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5151</th>\n",
" <td>They might co-ordinate an anniversary attack o...</td>\n",
" <td>1342409261</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5152 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 Gentleman, I believe we can agree that this is... 1304170330\n",
"1 The problem is that it will just turn it r/nos... 1353763204\n",
"2 Well, according to some Christian apologists, ... 1336314173\n",
"3 Don't know if this is what you are looking for... 1348860314\n",
"4 I respect what you're saying completely. I jus... 1341285952\n",
"... ... ...\n",
"5147 GAMBIT 1326441107\n",
"5148 &gt;Joe Rogan is no snake oil salesman.\\n\\nHe ... 1319464245\n",
"5149 Reading further, Sagan does seem to agree with... 1322126150\n",
"5150 Notice that they never invoke god, or any othe... 1307679295\n",
"5151 They might co-ordinate an anniversary attack o... 1342409261\n",
"\n",
"[5152 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read test-A with quoting disabled, exactly like the training inputs: the\n",
"# comments contain literal double quotes, and pandas' default quoting would\n",
"# strip them or merge rows whenever a quote is left unbalanced, which would\n",
"# misalign the predictions with the input lines.\n",
"with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
"    x_test = pd.read_csv(file, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"x_test"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "dba17668-971f-47f8-99ce-fc840b5cb74a",
"metadata": {},
"outputs": [],
"source": [
"class NeuralNetworkModel(torch.nn.Module):\n",
"    \"\"\"Two-layer feed-forward network with a single sigmoid output.\n",
"\n",
"    Layout: Linear(300 -> 300), ReLU, Linear(300 -> 1), sigmoid.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        self.l01 = torch.nn.Linear(300, 300)\n",
"        self.l02 = torch.nn.Linear(300, 1)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the sigmoid-activated output of the network for input `x`.\"\"\"\n",
"        hidden = torch.relu(self.l01(x))\n",
"        return torch.sigmoid(self.l02(hidden))\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1a275c1d-75bc-4290-9332-56396d16a0f2",
"metadata": {},
"outputs": [],
"source": [
"# Reduce the labels to column 0, then lower-case column 0 (the text) of every\n",
"# split and tokenize it with NLTK's word_tokenize.\n",
"y_train = y_train[0]\n",
"\n",
"x_train = [word_tokenize(text) for text in x_train[0].str.lower()]\n",
"x_dev = [word_tokenize(text) for text in x_dev[0].str.lower()]\n",
"x_test = [word_tokenize(text) for text in x_test[0].str.lower()]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "031a3670-3be7-4146-97b4-0dacd4f9ae58",
"metadata": {},
"outputs": [],
"source": [
"from gensim.test.utils import common_texts\n",
"from gensim.models import Word2Vec\n",
"\n",
"\n",
"def embed_documents(documents, vectors):\n",
"    \"\"\"Turn tokenized documents into one averaged word vector each.\n",
"\n",
"    Parameters:\n",
"        documents: iterable of token lists.\n",
"        vectors: word-vector lookup supporting `token in vectors`,\n",
"            `vectors[token]` and a `vector_size` attribute.\n",
"\n",
"    Returns:\n",
"        A list with one array per document: the mean of the vectors of all\n",
"        tokens found in `vectors`, or a zero vector of length\n",
"        `vectors.vector_size` when none of the tokens is known.\n",
"    \"\"\"\n",
"    embedded = []\n",
"    for tokens in documents:\n",
"        known = [vectors[token] for token in tokens if token in vectors]\n",
"        # np.mean of an empty list would yield NaN, hence the explicit fallback.\n",
"        embedded.append(np.mean(known, axis=0) if known else np.zeros(vectors.vector_size))\n",
"    return embedded\n",
"\n",
"\n",
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
"x_train = embed_documents(x_train, word2vec)\n",
"x_dev = embed_documents(x_dev, word2vec)\n",
"x_test = embed_documents(x_test, word2vec)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b7defd18-e281-4cf6-9941-cee560749677",
"metadata": {},
"outputs": [],
"source": [
"BATCH_SIZE = 5\n",
"# Number of passes over the training data. Kept separate from BATCH_SIZE so\n",
"# that changing the batch size does not silently change the training length.\n",
"EPOCHS = 5\n",
"LEARNING_RATE = 0.01\n",
"SEED = 42\n",
"\n",
"torch.manual_seed(SEED)  # reproducible weight initialisation\n",
"model = NeuralNetworkModel()\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)\n",
"\n",
"# Convert the data once, through a single ndarray: building a tensor from a\n",
"# list of ndarrays for every batch is what PyTorch warns about as 'extremely slow'.\n",
"train_features = torch.tensor(np.asarray(x_train, dtype=np.float32))\n",
"# BCELoss needs float targets with the same (batch, 1) shape as the model output.\n",
"train_labels = torch.tensor(np.asarray(y_train, dtype=np.float32)).reshape(-1, 1)\n",
"\n",
"for epoch in range(EPOCHS):\n",
"    model.train()\n",
"    for start in range(0, len(train_labels), BATCH_SIZE):\n",
"        batch_x = train_features[start:start + BATCH_SIZE]\n",
"        batch_y = train_labels[start:start + BATCH_SIZE]\n",
"        optimizer.zero_grad()\n",
"        loss = criterion(model(batch_x), batch_y)\n",
"        loss.backward()\n",
"        optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "92c69ddd-fe58-477f-b2c2-06324a983bcc",
"metadata": {},
"outputs": [],
"source": [
"def predict_labels(model, vectors, batch_size):\n",
"    \"\"\"Run `model` over `vectors` in batches and threshold its outputs at 0.5.\n",
"\n",
"    Parameters:\n",
"        model: callable mapping a float tensor of stacked vectors to a tensor of scores.\n",
"        vectors: list of equally sized 1-D arrays.\n",
"        batch_size: number of vectors passed to the model at once.\n",
"\n",
"    Returns:\n",
"        The concatenated `.tolist()` of the boolean mask `model(batch) > 0.5`\n",
"        of every batch, in input order.\n",
"    \"\"\"\n",
"    labels = []\n",
"    with torch.no_grad():\n",
"        for start in range(0, len(vectors), batch_size):\n",
"            batch = np.asarray(vectors[start:start + batch_size], dtype=np.float32)\n",
"            labels += (model(torch.tensor(batch)) > 0.5).tolist()\n",
"    return labels\n",
"\n",
"\n",
"model.eval()\n",
"\n",
"# Both splits go through the same helper so that each one is labelled from its\n",
"# own model outputs and with the same threshold.\n",
"y_dev = predict_labels(model, x_dev, BATCH_SIZE)\n",
"y_test = predict_labels(model, x_test, BATCH_SIZE)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "caff921c-d0ab-4fce-a17f-6610266b404d",
"metadata": {},
"outputs": [],
"source": [
"# Convert the boolean predictions to integers and flatten them to 1-D.\n",
"# Without the flattening each element is a one-element array, so writing\n",
"# str(element) produces '[0]' / '[1]' instead of '0' / '1'.\n",
"y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)\n",
"y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "73076eb2-810f-4f85-aa3f-05ee884c413b",
"metadata": {},
"outputs": [],
"source": [
"# Write the dev-0 predictions, one per line, in input order.\n",
"with open('./dev-0/out.tsv', 'wt') as out_file:\n",
"    out_file.writelines(str(label) + '\\n' for label in y_dev)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ddda251c-cafa-40f8-a020-48310a9f23b6",
"metadata": {},
"outputs": [],
"source": [
"# Write the test-A predictions, one per line, in input order.\n",
"with open('./test-A/out.tsv', 'wt') as out_file:\n",
"    out_file.writelines(str(label) + '\\n' for label in y_test)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5730562a-0200-4c8f-8f73-992fa2b36133",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 3816 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07a09298-204c-4905-90a8-5dcb87877368",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}