ulaniuk 2022-06-01 22:55:27 +02:00
commit 4a6e13712b
9 changed files with 11107 additions and 0 deletions

.gitignore vendored Normal file (+8)

@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

README.md Normal file (+13)

@@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================
Classify a Reddit comment as coming either from the Skeptic subreddit
or from one of the "paranormal" subreddits (Paranormal, UFOs,
TheTruthIsHere, Ghosts, Glitch-in-the-Matrix, conspiracytheories).
The output label is the probability that the comment comes from a
paranormal subreddit.
Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

config.txt Normal file (+1)

@@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
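(This file is a GEval configuration; assuming the standard GEval CLI from Gonito, predictions would typically be scored by running something like "geval -t dev-0" in the challenge root.)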

dev-0/out.tsv Normal file (+5272)

File diff suppressed because it is too large.

in-header.tsv Normal file (+1)

@@ -0,0 +1 @@
PostText Timestamp

out-header.tsv Normal file (+1)

@@ -0,0 +1 @@
Label

run.ipynb Normal file (+501)

@@ -0,0 +1,501 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import gensim\n",
"import re\n",
"import torch\n",
"import pandas as pd\n",
"from gensim.models import Word2Vec\n",
"from gensim import downloader\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"BATCH_SIZE = 64\n",
"EPOCHS = 100\n",
"FEATURES = 200"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"with open('train/in.tsv', 'r', encoding='utf8') as f:\n",
" X_train = f.readlines()\n",
"with open('train/expected.tsv', 'r', encoding='utf8') as f:\n",
" y_train = f.readlines()\n",
"\n",
"with open('dev-0/in.tsv', 'r', encoding='utf8') as f:\n",
" X_dev = f.readlines()\n",
"with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:\n",
" y_dev = f.readlines()\n",
"\n",
"with open('test-A/in.tsv', 'r', encoding='utf8') as f:\n",
" X_test = f.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"for i, line in enumerate(X_train):\n",
" X_train[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
"\n",
"for i, line in enumerate(X_dev):\n",
" X_dev[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
"\n",
"for i, line in enumerate(X_test):\n",
" X_test[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
"\n",
"for i, line in enumerate(y_train):\n",
" y_train[i] = re.sub(r'\\n', '', line)\n",
"\n",
"for i, line in enumerate(y_dev):\n",
" y_dev[i] = re.sub(r'\\n', '', line)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def readData(fileName): \n",
" with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:\n",
" X = np.array([x.strip().lower() for x in f.readlines()])\n",
" with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:\n",
" y = np.array([int(x.strip()) for x in f.readlines()])\n",
" return X,y\n",
"\n",
"X_file,y_file = readData('dev-0')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"class NeuralNetworkModel(torch.nn.Module):\n",
" \n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(FEATURES, 500)\n",
" self.fc2 = torch.nn.Linear(500, 1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"word2vec = downloader.load(\"glove-twitter-200\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
" or [np.zeros(FEATURES)], axis=0) for doc in X_train]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"y_train = np.array(y_train)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def train_model(X_train, y_train):\n",
" model = NeuralNetworkModel()\n",
"\n",
" criterion = torch.nn.BCELoss()\n",
" optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)\n",
"\n",
" for epoch in range(EPOCHS):\n",
"\n",
" print(epoch)\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
"\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" x = X_train[i:i+BATCH_SIZE]\n",
" x = torch.tensor(np.array(x).astype(np.float32))\n",
" y = y_train[i:i+BATCH_SIZE]\n",
" y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)\n",
" y_pred = model(x)\n",
" acc_score += torch.sum((y_pred > 0.5) == y).item()\n",
" items_total += y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(y_pred, y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss_score += loss.item() * y.shape[0]\n",
" \n",
" print((loss_score / items_total), (acc_score / items_total))\n",
" \n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def predict(model, x_test):\n",
" y_dev = []\n",
" \n",
" with torch.no_grad():\n",
" for i in range(0, len(x_test), BATCH_SIZE):\n",
" x = x_test[i:i+BATCH_SIZE]\n",
" x = torch.tensor(np.array(x).astype(np.float32))\n",
" outputs = model(x)\n",
" y = (outputs > 0.5)\n",
" y_dev.extend(y)\n",
"\n",
" return y_dev"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"0.5714333134919922 0.6966561801788113\n",
"1\n",
"0.5395073619374668 0.7242514132585581\n",
"2\n",
"0.5322582519146749 0.7296247310751125\n",
"3\n",
"0.5277940251241121 0.7327292379626976\n",
"4\n",
"0.5243827499623 0.7345525745996775\n",
"5\n",
"0.521483356086283 0.7361825270478868\n",
"6\n",
"0.5188610753636298 0.7376052821509848\n",
"7\n",
"0.5164497832484463 0.7390211306759122\n",
"8\n",
"0.5142272224311959 0.7402332351448137\n",
"9\n",
"0.5121725415654607 0.7413451942302446\n",
"10\n",
"0.510225843876934 0.742412260557568\n",
"11\n",
"0.5084293723366556 0.7430476657492429\n",
"12\n",
"0.5067300511753501 0.7440560261621181\n",
"13\n",
"0.5051866206455035 0.7450609332859082\n",
"14\n",
"0.503752063642534 0.7458586430645868\n",
"15\n",
"0.5024285955103476 0.7466943390232027\n",
"16\n",
"0.5011020173689057 0.7476439935216297\n",
"17\n",
"0.49986460605734995 0.7483691842295194\n",
"18\n",
"0.498722317965918 0.7489527900849163\n",
"19\n",
"0.4976401474074949 0.749584741987506\n",
"20\n",
"0.4966364578740479 0.7502788530936291\n",
"21\n",
"0.4956408892432799 0.7507208740965332\n",
"22\n",
"0.4946911594690806 0.7513459194209525\n",
"23\n",
"0.4938261365074296 0.7519433384326902\n",
"24\n",
"0.49291108882053136 0.7526996087423466\n",
"25\n",
"0.49207683927175633 0.752979325158247\n",
"26\n",
"0.4912937934254017 0.7534524257629179\n",
"27\n",
"0.49052768458365964 0.7539186197894184\n",
"28\n",
"0.48980189713607974 0.7542535888306818\n",
"29\n",
"0.48902049401931186 0.7547819420607157\n",
"30\n",
"0.48832297395034846 0.7553828143615386\n",
"31\n",
"0.48764632061179475 0.7556832505119501\n",
"32\n",
"0.4869866096390585 0.7563359221490509\n",
"33\n",
"0.48635514366306837 0.7567813964410403\n",
"34\n",
"0.48572428783405186 0.7574616943908226\n",
"35\n",
"0.4851059672855987 0.7577897568539155\n",
"36\n",
"0.4844747067054167 0.7581350857624344\n",
"37\n",
"0.4838937349887044 0.7585080409836349\n",
"38\n",
"0.48333403454228063 0.7584769613818682\n",
"39\n",
"0.4827657912931136 0.7590916468390319\n",
"40\n",
"0.48225590195293194 0.7592435915587802\n",
"41\n",
"0.48163791058193006 0.7597857579451549\n",
"42\n",
"0.4811314198011156 0.7601414467209293\n",
"43\n",
"0.4806143895582873 0.7607181459981559\n",
"44\n",
"0.4800953709221985 0.7609598762341192\n",
"45\n",
"0.47956847999038854 0.7612913919862974\n",
"46\n",
"0.4790844480555675 0.7616470807620719\n",
"47\n",
"0.47860829903493235 0.761795572192735\n",
"48\n",
"0.4781695258369003 0.762089101764976\n",
"49\n",
"0.4776893918277479 0.7624827767206876\n",
"50\n",
"0.47722041533606274 0.7628246523401213\n",
"51\n",
"0.4767699545351635 0.7631596213813847\n",
"52\n",
"0.47637271544187293 0.7633253792574738\n",
"53\n",
"0.47592309171862696 0.7635705627825222\n",
"54\n",
"0.47549356202221993 0.7638744522220189\n",
"55\n",
"0.47508612961542673 0.7642370475759638\n",
"56\n",
"0.47468646391106234 0.764351006115775\n",
"57\n",
"0.4742474519497854 0.7646790685788679\n",
"58\n",
"0.4737666401496256 0.7650623836673239\n",
"59\n",
"0.47335995538274667 0.7652972073251169\n",
"60\n",
"0.4729701449600526 0.7654422454666947\n",
"61\n",
"0.4725969795466422 0.7656252697882098\n",
"62\n",
"0.47221369839845356 0.7661121835492215\n",
"63\n",
"0.4718388513139844 0.7663021144489068\n",
"64\n",
"0.47147053143633466 0.7664575124577404\n",
"65\n",
"0.4711233925314738 0.7666543499355961\n",
"66\n",
"0.47074752713287643 0.7669340663514965\n",
"67\n",
"0.4703749315941604 0.7673242880181229\n",
"68\n",
"0.470022628463849 0.7672828485491006\n",
"69\n",
"0.4696828857076031 0.7677559491537715\n",
"70\n",
"0.4693190624670805 0.7678491879590716\n",
"71\n",
"0.4689852795644025 0.7683257418528278\n",
"72\n",
"0.46865665018555414 0.7687194168085393\n",
"73\n",
"0.468258934943202 0.7687297766757949\n",
"74\n",
"0.46797715189850664 0.7687608562775615\n",
"75\n",
"0.46764439033620286 0.7690716522952286\n",
"76\n",
"0.46732620352289256 0.769351368711129\n",
"77\n",
"0.4670077633846447 0.769700150908733\n",
"78\n",
"0.4667117469477995 0.7697692166904369\n",
"79\n",
"0.4664313273439932 0.7700420265281668\n",
"80\n",
"0.4661624620708029 0.7704426080620487\n",
"81\n",
"0.46585000600566223 0.7703148363658967\n",
"82\n",
"0.4655422194174101 0.7706739784307564\n",
"83\n",
"0.4652497145337105 0.7708708159086122\n",
"84\n",
"0.46495632112782237 0.7708673626195269\n",
"85\n",
"0.46467082155335016 0.7712023316607903\n",
"86\n",
"0.46439953297526376 0.7715269408347981\n",
"87\n",
"0.4640616501378699 0.7718032039616133\n",
"88\n",
"0.46377603995408073 0.7721139999792803\n",
"89\n",
"0.46352646427627725 0.7722072387845804\n",
"90\n",
"0.46323162764281506 0.7723971696842657\n",
"91\n",
"0.4629823635760337 0.7724765953332251\n",
"92\n",
"0.46268333841052883 0.7727770314836366\n",
"93\n",
"0.4624373474653466 0.7728978966016182\n",
"94\n",
"0.4621637105605031 0.7731396268375814\n",
"95\n",
"0.4618823675153035 0.7730463880322813\n",
"96\n",
"0.4615598618066211 0.7733571840499484\n",
"97\n",
"0.4613917053205442 0.7734089833862262\n",
"98\n",
"0.4610787309787952 0.7734642360115892\n",
"99\n",
"0.4608159763176817 0.7737197794038932\n"
]
}
],
"source": [
"model = train_model(X_train_w2v, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
" or [np.zeros(FEATURES)], axis=0) for doc in X_dev]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"y_dev=predict(model, X_dev_w2v)\n",
"y_dev = ['1' if bool(item) else '0' for item in y_dev]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/out.tsv', 'wt') as f:\n",
" for pred in y_dev:\n",
" f.write(str(pred)+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
" or [np.zeros(FEATURES)], axis=0) for doc in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"y_test=predict(model, X_test_w2v)\n",
"y_test = ['1' if bool(item) else '0' for item in y_test]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"with open('test-A/out.tsv', 'wt') as f:\n",
" for pred in y_test:\n",
" f.write(str(pred)+'\\n')"
]
}
],
"metadata": {
"interpreter": {
"hash": "3ecbe772e0e869a386d256c10cc6d948e50cd4df13a3f02e58ab4f2a666d7bf0"
},
"kernelspec": {
"display_name": "Python 3.8.13 ('eks')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

run.py Normal file (+158)

@@ -0,0 +1,158 @@
# %%
import numpy as np
import gensim
import re
import torch
import pandas as pd
from gensim.models import Word2Vec
from gensim import downloader
from sklearn.feature_extraction.text import TfidfVectorizer
# %%
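# Hyperparameters: minibatch size, number of training passes, and the
# dimensionality of the word vectors (must match the GloVe model loaded below).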
BATCH_SIZE = 64
EPOCHS = 100
FEATURES = 200
# %%
with open('train/in.tsv', 'r', encoding='utf8') as f:
X_train = f.readlines()
with open('train/expected.tsv', 'r', encoding='utf8') as f:
y_train = f.readlines()
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
X_dev = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
y_dev = f.readlines()
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
X_test = f.readlines()
# %%
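# Strip the trailing tab-separated timestamp column from the inputs
# and trailing newlines from the labels.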
for i, line in enumerate(X_train):
X_train[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(X_dev):
X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(X_test):
X_test[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(y_train):
y_train[i] = re.sub(r'\n', '', line)
for i, line in enumerate(y_dev):
y_dev[i] = re.sub(r'\n', '', line)
# %%
def readData(fileName):
    # Read one challenge split: lowercased input texts and integer labels.
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:
        X = np.array([x.strip().lower() for x in f.readlines()])
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(x.strip()) for x in f.readlines()])
    return X, y
X_file, y_file = readData('dev-0')  # loaded but not used below
# %%
class NeuralNetworkModel(torch.nn.Module):
    # Binary classifier: FEATURES inputs -> 500 ReLU units -> sigmoid probability.
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)
    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
# %%
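# Pretrained 200-dimensional GloVe Twitter embeddings; gensim downloads
# and caches them on first use.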
word2vec = downloader.load("glove-twitter-200")
# %%
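# Mean-pool the word vectors of each document; a zero vector stands in
# when none of a document's tokens is in the vocabulary.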
X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
or [np.zeros(FEATURES)], axis=0) for doc in X_train]
# %%
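# Labels stay as an array of '0'/'1' strings here; train_model casts each batch to float32.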
y_train = np.array(y_train)
# %%
def train_model(X_train, y_train):
    model = NeuralNetworkModel()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)
    for epoch in range(EPOCHS):
        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0
        for i in range(0, y_train.shape[0], BATCH_SIZE):
            # Build one minibatch of document vectors and 0/1 labels.
            x = X_train[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            y = y_train[i:i+BATCH_SIZE]
            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)
            y_pred = model(x)
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]
            # Standard backprop step on binary cross-entropy.
            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            loss_score += loss.item() * y.shape[0]
        # Report mean loss and accuracy over the epoch.
        print((loss_score / items_total), (acc_score / items_total))
    return model
# %%
def predict(model, x_test):
    # Batched inference; returns boolean predictions at a 0.5 threshold.
    preds = []
    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = x_test[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            outputs = model(x)
            preds.extend(outputs > 0.5)
    return preds
# %%
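# Train on the mean-pooled vectors; per-epoch loss/accuracy output is captured in run.ipynb.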
model = train_model(X_train_w2v, y_train)
# %%
X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
or [np.zeros(FEATURES)], axis=0) for doc in X_dev]
# %%
y_dev = predict(model, X_dev_w2v)  # note: rebinds y_dev, replacing the gold dev labels read above
y_dev = ['1' if bool(item) else '0' for item in y_dev]
# %%
with open('dev-0/out.tsv', 'wt') as f:
for pred in y_dev:
f.write(str(pred)+'\n')
# %%
X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
or [np.zeros(FEATURES)], axis=0) for doc in X_test]
# %%
y_test = predict(model, X_test_w2v)
y_test = ['1' if bool(item) else '0' for item in y_test]
# %%
with open('test-A/out.tsv', 'wt') as f:
for pred in y_test:
f.write(str(pred)+'\n')
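# %%
# Hypothetical variant (not in the original script): the README defines the
# label as a probability and config.txt scores a Likelihood metric, so one
# could write the raw sigmoid outputs instead of thresholded 0/1 labels.
# Reuses model, X_dev_w2v and BATCH_SIZE from above.
def predict_proba(model, x_test):
    probs = []
    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = torch.tensor(np.array(x_test[i:i+BATCH_SIZE]).astype(np.float32))
            probs.extend(model(x).reshape(-1).tolist())
    return probs
# Example usage (commented out so the files written above stay unchanged):
# with open('dev-0/out.tsv', 'wt') as f:
#     for p in predict_proba(model, X_dev_w2v):
#         f.write(f'{p:.4f}\n')  # four decimals, matching --precision 4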

test-A/out.tsv Normal file (+5152)

File diff suppressed because it is too large.