s478855
commit 4a6e13712b
.gitignore (vendored): Normal file, 8 lines added
@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

README.md: Normal file, 13 lines added
@@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================

Classify a Reddit post as coming either from the Skeptic subreddit or from
one of the "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).

The output label is the probability that the post comes from a paranormal
subreddit.

Sources
-------

Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
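For illustration only (hypothetical values; the real column names are defined
in in-header.tsv and out-header.tsv below), an input row pairs a post text
with a Unix timestamp, and the matching output row is a single probability:

    PostText: "saw three lights hovering over the field"    Timestamp: 1433109600
    Label: 0.8731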

config.txt: Normal file, 1 line added
@@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
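Editor's gloss (not part of the committed config): the F0 and F9999999 metrics
appear to be the usual F-beta score driven to its two limits, since

\[ F_\beta = \frac{(1 + \beta^2)\,P\,R}{\beta^2 P + R}, \qquad
   F_\beta \xrightarrow{\beta \to 0} P \ \text{(precision)}, \qquad
   F_\beta \xrightarrow{\beta \to \infty} R \ \text{(recall)}, \]

which would explain why they are renamed Precision and Recall via the N<...> suffix.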

dev-0/out.tsv: Normal file, 5272 lines added
File diff suppressed because it is too large.

in-header.tsv: Normal file, 1 line added
@@ -0,0 +1 @@
PostText	Timestamp

out-header.tsv: Normal file, 1 line added
@@ -0,0 +1 @@
Label

run.ipynb: Normal file, 501 lines added
@@ -0,0 +1,501 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import gensim\n",
    "import re\n",
    "import torch\n",
    "import pandas as pd\n",
    "from gensim.models import Word2Vec\n",
    "from gensim import downloader\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 64\n",
    "EPOCHS = 100\n",
    "FEATURES = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train/in.tsv', 'r', encoding='utf8') as f:\n",
    "    X_train = f.readlines()\n",
    "with open('train/expected.tsv', 'r', encoding='utf8') as f:\n",
    "    y_train = f.readlines()\n",
    "\n",
    "with open('dev-0/in.tsv', 'r', encoding='utf8') as f:\n",
    "    X_dev = f.readlines()\n",
    "with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:\n",
    "    y_dev = f.readlines()\n",
    "\n",
    "with open('test-A/in.tsv', 'r', encoding='utf8') as f:\n",
    "    X_test = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, line in enumerate(X_train):\n",
    "    X_train[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(X_dev):\n",
    "    X_dev[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(X_test):\n",
    "    X_test[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(y_train):\n",
    "    y_train[i] = re.sub(r'\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(y_dev):\n",
    "    y_dev[i] = re.sub(r'\\n', '', line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def readData(fileName):\n",
    "    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:\n",
    "        X = np.array([x.strip().lower() for x in f.readlines()])\n",
    "    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:\n",
    "        y = np.array([int(x.strip()) for x in f.readlines()])\n",
    "    return X, y\n",
    "\n",
    "X_file, y_file = readData('dev-0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNetworkModel(torch.nn.Module):\n",
    "\n",
    "    def __init__(self):\n",
    "        super(NeuralNetworkModel, self).__init__()\n",
    "        self.fc1 = torch.nn.Linear(FEATURES, 500)\n",
    "        self.fc2 = torch.nn.Linear(500, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.fc1(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.fc2(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2vec = downloader.load(\"glove-twitter-200\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
    "               or [np.zeros(FEATURES)], axis=0) for doc in X_train]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = np.array(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_model(X_train, y_train):\n",
    "    model = NeuralNetworkModel()\n",
    "\n",
    "    criterion = torch.nn.BCELoss()\n",
    "    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)\n",
    "\n",
    "    for epoch in range(EPOCHS):\n",
    "\n",
    "        print(epoch)\n",
    "        loss_score = 0\n",
    "        acc_score = 0\n",
    "        items_total = 0\n",
    "\n",
    "        for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
    "            x = X_train[i:i+BATCH_SIZE]\n",
    "            x = torch.tensor(np.array(x).astype(np.float32))\n",
    "            y = y_train[i:i+BATCH_SIZE]\n",
    "            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)\n",
    "            y_pred = model(x)\n",
    "            acc_score += torch.sum((y_pred > 0.5) == y).item()\n",
    "            items_total += y.shape[0]\n",
    "\n",
    "            optimizer.zero_grad()\n",
    "            loss = criterion(y_pred, y)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            loss_score += loss.item() * y.shape[0]\n",
    "\n",
    "        print((loss_score / items_total), (acc_score / items_total))\n",
    "\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(model, x_test):\n",
    "    y_dev = []\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for i in range(0, len(x_test), BATCH_SIZE):\n",
    "            x = x_test[i:i+BATCH_SIZE]\n",
    "            x = torch.tensor(np.array(x).astype(np.float32))\n",
    "            outputs = model(x)\n",
    "            y = (outputs > 0.5)\n",
    "            y_dev.extend(y)\n",
    "\n",
    "    return y_dev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "0.5714333134919922 0.6966561801788113\n",
      "1\n",
      "0.5395073619374668 0.7242514132585581\n",
      "2\n",
      "0.5322582519146749 0.7296247310751125\n",
      "3\n",
      "0.5277940251241121 0.7327292379626976\n",
      "4\n",
      "0.5243827499623 0.7345525745996775\n",
      "5\n",
      "0.521483356086283 0.7361825270478868\n",
      "6\n",
      "0.5188610753636298 0.7376052821509848\n",
      "7\n",
      "0.5164497832484463 0.7390211306759122\n",
      "8\n",
      "0.5142272224311959 0.7402332351448137\n",
      "9\n",
      "0.5121725415654607 0.7413451942302446\n",
      "10\n",
      "0.510225843876934 0.742412260557568\n",
      "11\n",
      "0.5084293723366556 0.7430476657492429\n",
      "12\n",
      "0.5067300511753501 0.7440560261621181\n",
      "13\n",
      "0.5051866206455035 0.7450609332859082\n",
      "14\n",
      "0.503752063642534 0.7458586430645868\n",
      "15\n",
      "0.5024285955103476 0.7466943390232027\n",
      "16\n",
      "0.5011020173689057 0.7476439935216297\n",
      "17\n",
      "0.49986460605734995 0.7483691842295194\n",
      "18\n",
      "0.498722317965918 0.7489527900849163\n",
      "19\n",
      "0.4976401474074949 0.749584741987506\n",
      "20\n",
      "0.4966364578740479 0.7502788530936291\n",
      "21\n",
      "0.4956408892432799 0.7507208740965332\n",
      "22\n",
      "0.4946911594690806 0.7513459194209525\n",
      "23\n",
      "0.4938261365074296 0.7519433384326902\n",
      "24\n",
      "0.49291108882053136 0.7526996087423466\n",
      "25\n",
      "0.49207683927175633 0.752979325158247\n",
      "26\n",
      "0.4912937934254017 0.7534524257629179\n",
      "27\n",
      "0.49052768458365964 0.7539186197894184\n",
      "28\n",
      "0.48980189713607974 0.7542535888306818\n",
      "29\n",
      "0.48902049401931186 0.7547819420607157\n",
      "30\n",
      "0.48832297395034846 0.7553828143615386\n",
      "31\n",
      "0.48764632061179475 0.7556832505119501\n",
      "32\n",
      "0.4869866096390585 0.7563359221490509\n",
      "33\n",
      "0.48635514366306837 0.7567813964410403\n",
      "34\n",
      "0.48572428783405186 0.7574616943908226\n",
      "35\n",
      "0.4851059672855987 0.7577897568539155\n",
      "36\n",
      "0.4844747067054167 0.7581350857624344\n",
      "37\n",
      "0.4838937349887044 0.7585080409836349\n",
      "38\n",
      "0.48333403454228063 0.7584769613818682\n",
      "39\n",
      "0.4827657912931136 0.7590916468390319\n",
      "40\n",
      "0.48225590195293194 0.7592435915587802\n",
      "41\n",
      "0.48163791058193006 0.7597857579451549\n",
      "42\n",
      "0.4811314198011156 0.7601414467209293\n",
      "43\n",
      "0.4806143895582873 0.7607181459981559\n",
      "44\n",
      "0.4800953709221985 0.7609598762341192\n",
      "45\n",
      "0.47956847999038854 0.7612913919862974\n",
      "46\n",
      "0.4790844480555675 0.7616470807620719\n",
      "47\n",
      "0.47860829903493235 0.761795572192735\n",
      "48\n",
      "0.4781695258369003 0.762089101764976\n",
      "49\n",
      "0.4776893918277479 0.7624827767206876\n",
      "50\n",
      "0.47722041533606274 0.7628246523401213\n",
      "51\n",
      "0.4767699545351635 0.7631596213813847\n",
      "52\n",
      "0.47637271544187293 0.7633253792574738\n",
      "53\n",
      "0.47592309171862696 0.7635705627825222\n",
      "54\n",
      "0.47549356202221993 0.7638744522220189\n",
      "55\n",
      "0.47508612961542673 0.7642370475759638\n",
      "56\n",
      "0.47468646391106234 0.764351006115775\n",
      "57\n",
      "0.4742474519497854 0.7646790685788679\n",
      "58\n",
      "0.4737666401496256 0.7650623836673239\n",
      "59\n",
      "0.47335995538274667 0.7652972073251169\n",
      "60\n",
      "0.4729701449600526 0.7654422454666947\n",
      "61\n",
      "0.4725969795466422 0.7656252697882098\n",
      "62\n",
      "0.47221369839845356 0.7661121835492215\n",
      "63\n",
      "0.4718388513139844 0.7663021144489068\n",
      "64\n",
      "0.47147053143633466 0.7664575124577404\n",
      "65\n",
      "0.4711233925314738 0.7666543499355961\n",
      "66\n",
      "0.47074752713287643 0.7669340663514965\n",
      "67\n",
      "0.4703749315941604 0.7673242880181229\n",
      "68\n",
      "0.470022628463849 0.7672828485491006\n",
      "69\n",
      "0.4696828857076031 0.7677559491537715\n",
      "70\n",
      "0.4693190624670805 0.7678491879590716\n",
      "71\n",
      "0.4689852795644025 0.7683257418528278\n",
      "72\n",
      "0.46865665018555414 0.7687194168085393\n",
      "73\n",
      "0.468258934943202 0.7687297766757949\n",
      "74\n",
      "0.46797715189850664 0.7687608562775615\n",
      "75\n",
      "0.46764439033620286 0.7690716522952286\n",
      "76\n",
      "0.46732620352289256 0.769351368711129\n",
      "77\n",
      "0.4670077633846447 0.769700150908733\n",
      "78\n",
      "0.4667117469477995 0.7697692166904369\n",
      "79\n",
      "0.4664313273439932 0.7700420265281668\n",
      "80\n",
      "0.4661624620708029 0.7704426080620487\n",
      "81\n",
      "0.46585000600566223 0.7703148363658967\n",
      "82\n",
      "0.4655422194174101 0.7706739784307564\n",
      "83\n",
      "0.4652497145337105 0.7708708159086122\n",
      "84\n",
      "0.46495632112782237 0.7708673626195269\n",
      "85\n",
      "0.46467082155335016 0.7712023316607903\n",
      "86\n",
      "0.46439953297526376 0.7715269408347981\n",
      "87\n",
      "0.4640616501378699 0.7718032039616133\n",
      "88\n",
      "0.46377603995408073 0.7721139999792803\n",
      "89\n",
      "0.46352646427627725 0.7722072387845804\n",
      "90\n",
      "0.46323162764281506 0.7723971696842657\n",
      "91\n",
      "0.4629823635760337 0.7724765953332251\n",
      "92\n",
      "0.46268333841052883 0.7727770314836366\n",
      "93\n",
      "0.4624373474653466 0.7728978966016182\n",
      "94\n",
      "0.4621637105605031 0.7731396268375814\n",
      "95\n",
      "0.4618823675153035 0.7730463880322813\n",
      "96\n",
      "0.4615598618066211 0.7733571840499484\n",
      "97\n",
      "0.4613917053205442 0.7734089833862262\n",
      "98\n",
      "0.4610787309787952 0.7734642360115892\n",
      "99\n",
      "0.4608159763176817 0.7737197794038932\n"
     ]
    }
   ],
   "source": [
    "model = train_model(X_train_w2v, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
    "             or [np.zeros(FEATURES)], axis=0) for doc in X_dev]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_dev = predict(model, X_dev_w2v)\n",
    "y_dev = ['1' if bool(item) else '0' for item in y_dev]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/out.tsv', 'wt') as f:\n",
    "    for pred in y_dev:\n",
    "        f.write(str(pred)+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
    "              or [np.zeros(FEATURES)], axis=0) for doc in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test = predict(model, X_test_w2v)\n",
    "y_test = ['1' if bool(item) else '0' for item in y_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test-A/out.tsv', 'wt') as f:\n",
    "    for pred in y_test:\n",
    "        f.write(str(pred)+'\\n')"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "3ecbe772e0e869a386d256c10cc6d948e50cd4df13a3f02e58ab4f2a666d7bf0"
  },
  "kernelspec": {
   "display_name": "Python 3.8.13 ('eks')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

run.py: Normal file, 158 lines added
@@ -0,0 +1,158 @@
# %%
import numpy as np
import gensim
import re
import torch
import pandas as pd
from gensim.models import Word2Vec
from gensim import downloader
from sklearn.feature_extraction.text import TfidfVectorizer
# (gensim, pandas, Word2Vec and TfidfVectorizer are not used further below)

# %%
# Hyperparameters: minibatch size, training epochs, and embedding dimensionality
# (200 matches the glove-twitter-200 vectors loaded below).
BATCH_SIZE = 64
EPOCHS = 100
FEATURES = 200

# %%
# Load raw TSV lines for the train/dev/test splits; expected.tsv holds the 0/1 labels.
with open('train/in.tsv', 'r', encoding='utf8') as f:
    X_train = f.readlines()
with open('train/expected.tsv', 'r', encoding='utf8') as f:
    y_train = f.readlines()

with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev = f.readlines()

with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = f.readlines()

# %%
# Strip the tab-separated Timestamp column (plus newline) from the inputs, and
# the trailing newline from the labels; e.g. a hypothetical line
# 'saw a ghost in my attic\t1420070400\n' becomes 'saw a ghost in my attic'.
for i, line in enumerate(X_train):
    X_train[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_dev):
    X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_test):
    X_test[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(y_train):
    y_train[i] = re.sub(r'\n', '', line)

for i, line in enumerate(y_dev):
    y_dev[i] = re.sub(r'\n', '', line)

# %%
# Alternative loader (defined and called once, but its result is unused below):
# returns lowercased posts and integer labels for a given split directory.
def readData(fileName):
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:
        X = np.array([x.strip().lower() for x in f.readlines()])
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(x.strip()) for x in f.readlines()])
    return X, y

X_file, y_file = readData('dev-0')

# %%
class NeuralNetworkModel(torch.nn.Module):
    """Single-hidden-layer binary classifier over averaged word embeddings."""

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

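# %%
# Editor's sketch (not in the original script): a quick shape check for the
# model above; a batch of 4 averaged-embedding rows maps to 4 probabilities.
#
#   batch = torch.zeros(4, FEATURES)       # 4 documents, FEATURES=200 dims
#   out = NeuralNetworkModel()(batch)      # shape (4, 1), values in (0, 1)
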
# %%
# Pretrained 200-dimensional GloVe vectors trained on Twitter (downloads on first run).
word2vec = downloader.load("glove-twitter-200")

# %%
# Represent each document as the mean of its word vectors (zero vector if no word is known).
X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
               or [np.zeros(FEATURES)], axis=0) for doc in X_train]

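# %%
# Editor's sketch (not in the original script) of what the mean-pooling above
# computes; gensim's KeyedVectors support `in` and dict-style indexing, and the
# example tokens are hypothetical:
#
#   vecs = [word2vec['ghost'], word2vec['sighting']]   # each of shape (FEATURES,)
#   doc_vec = np.mean(vecs, axis=0)                    # shape (FEATURES,)
#
# The `or [np.zeros(FEATURES)]` fallback fires when no token of a document is
# in the vocabulary, so every document still yields a FEATURES-length vector.
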
# %%
# Labels are still the strings '0'/'1' here; astype(np.float32) in the training
# loop converts each batch to floats.
y_train = np.array(y_train)

# %%
def train_model(X_train, y_train):
    # Train with binary cross-entropy and averaged SGD, printing the mean loss
    # and training accuracy after each epoch.
    model = NeuralNetworkModel()

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)

    for epoch in range(EPOCHS):

        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0

        for i in range(0, y_train.shape[0], BATCH_SIZE):
            x = X_train[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            y = y_train[i:i+BATCH_SIZE]
            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)
            y_pred = model(x)
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]

            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            loss_score += loss.item() * y.shape[0]

        print((loss_score / items_total), (acc_score / items_total))

    return model

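# %%
# Editor's note (suggestion, not in the original): because forward() ends in
# torch.sigmoid, BCELoss is the right criterion here; the numerically safer
# PyTorch idiom is to return raw logits and use BCEWithLogitsLoss instead:
#
#   criterion = torch.nn.BCEWithLogitsLoss()   # fuses sigmoid + BCE stably
#   loss = criterion(logits, y)                # logits = fc2(relu(fc1(x)))
#
# Predictions would then threshold torch.sigmoid(logits) > 0.5 as before.
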
# %%
def predict(model, x_test):
    # Batched inference under no_grad; returns boolean predictions (p > 0.5).
    y_dev = []

    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = x_test[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            outputs = model(x)
            y = (outputs > 0.5)
            y_dev.extend(y)

    return y_dev

# %%
model = train_model(X_train_w2v, y_train)

# %%
X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
             or [np.zeros(FEATURES)], axis=0) for doc in X_dev]

# %%
y_dev = predict(model, X_dev_w2v)
y_dev = ['1' if bool(item) else '0' for item in y_dev]

# %%
with open('dev-0/out.tsv', 'wt') as f:
    for pred in y_dev:
        f.write(str(pred)+'\n')

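# %%
# Editor's note: the README promises a *probability* of a paranormal subreddit,
# while the lines above write hard 0/1 labels. A minimal sketch (assumption,
# not the original behaviour) of emitting probabilities instead:
#
#   with torch.no_grad():
#       probs = model(torch.tensor(np.array(X_dev_w2v).astype(np.float32)))
#   with open('dev-0/out.tsv', 'wt') as f:
#       for p in probs:
#           f.write(f'{p.item():.4f}\n')
#
# The Likelihood metric in config.txt could then score the soft predictions.
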
# %%
X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
              or [np.zeros(FEATURES)], axis=0) for doc in X_test]

# %%
y_test = predict(model, X_test_w2v)
y_test = ['1' if bool(item) else '0' for item in y_test]

# %%
with open('test-A/out.tsv', 'wt') as f:
    for pred in y_test:
        f.write(str(pred)+'\n')

test-A/out.tsv: Normal file, 5152 lines added
File diff suppressed because it is too large.