s478855
commit 4a6e13712b
.gitignore (vendored): Normal file, 8 lines added
@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

README.md: Normal file, 13 lines added
@@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================

Classify a Reddit post as coming either from the Skeptic subreddit or from
one of the "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).

The output label is the probability that the post comes from a paranormal
subreddit.

Sources
-------

Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
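For illustration only (hypothetical values; the real column names are defined
in in-header.tsv and out-header.tsv below), an input row pairs a post text
with a Unix timestamp, and the matching output row is a single probability:

    PostText: "saw three lights hovering over the field"    Timestamp: 1433109600
    Label: 0.8731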

config.txt: Normal file, 1 line added
@@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
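Editor's gloss (not part of the committed config): the F0 and F9999999 metrics
appear to be the usual F-beta score driven to its two limits, since

\[ F_\beta = \frac{(1 + \beta^2)\,P\,R}{\beta^2 P + R}, \qquad
   F_\beta \xrightarrow{\beta \to 0} P \ \text{(precision)}, \qquad
   F_\beta \xrightarrow{\beta \to \infty} R \ \text{(recall)}, \]

which would explain why they are renamed Precision and Recall via the N<...> suffix.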

dev-0/out.tsv: Normal file, 5272 lines added
File diff suppressed because it is too large.

in-header.tsv: Normal file, 1 line added
@@ -0,0 +1 @@
PostText	Timestamp

out-header.tsv: Normal file, 1 line added
@@ -0,0 +1 @@
Label

run.ipynb: Normal file, 501 lines added
@@ -0,0 +1,501 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import gensim\n",
    "import re\n",
    "import torch\n",
    "import pandas as pd\n",
    "from gensim.models import Word2Vec\n",
    "from gensim import downloader\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 64\n",
    "EPOCHS = 100\n",
    "FEATURES = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train/in.tsv', 'r', encoding='utf8') as f:\n",
    "    X_train = f.readlines()\n",
    "with open('train/expected.tsv', 'r', encoding='utf8') as f:\n",
    "    y_train = f.readlines()\n",
    "\n",
    "with open('dev-0/in.tsv', 'r', encoding='utf8') as f:\n",
    "    X_dev = f.readlines()\n",
    "with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:\n",
    "    y_dev = f.readlines()\n",
    "\n",
    "with open('test-A/in.tsv', 'r', encoding='utf8') as f:\n",
    "    X_test = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, line in enumerate(X_train):\n",
    "    X_train[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(X_dev):\n",
    "    X_dev[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(X_test):\n",
    "    X_test[i] = re.sub(r'\\t[0-9]+\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(y_train):\n",
    "    y_train[i] = re.sub(r'\\n', '', line)\n",
    "\n",
    "for i, line in enumerate(y_dev):\n",
    "    y_dev[i] = re.sub(r'\\n', '', line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def readData(fileName):\n",
    "    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:\n",
    "        X = np.array([x.strip().lower() for x in f.readlines()])\n",
    "    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:\n",
    "        y = np.array([int(x.strip()) for x in f.readlines()])\n",
    "    return X, y\n",
    "\n",
    "X_file, y_file = readData('dev-0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNetworkModel(torch.nn.Module):\n",
    "\n",
    "    def __init__(self):\n",
    "        super(NeuralNetworkModel, self).__init__()\n",
    "        self.fc1 = torch.nn.Linear(FEATURES, 500)\n",
    "        self.fc2 = torch.nn.Linear(500, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.fc1(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.fc2(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2vec = downloader.load(\"glove-twitter-200\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
    "               or [np.zeros(FEATURES)], axis=0) for doc in X_train]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = np.array(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_model(X_train, y_train):\n",
    "    model = NeuralNetworkModel()\n",
    "\n",
    "    criterion = torch.nn.BCELoss()\n",
    "    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)\n",
    "\n",
    "    for epoch in range(EPOCHS):\n",
    "\n",
    "        print(epoch)\n",
    "        loss_score = 0\n",
    "        acc_score = 0\n",
    "        items_total = 0\n",
    "\n",
    "        for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
    "            x = X_train[i:i+BATCH_SIZE]\n",
    "            x = torch.tensor(np.array(x).astype(np.float32))\n",
    "            y = y_train[i:i+BATCH_SIZE]\n",
    "            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)\n",
    "            y_pred = model(x)\n",
    "            acc_score += torch.sum((y_pred > 0.5) == y).item()\n",
    "            items_total += y.shape[0]\n",
    "\n",
    "            optimizer.zero_grad()\n",
    "            loss = criterion(y_pred, y)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            loss_score += loss.item() * y.shape[0]\n",
    "\n",
    "        print((loss_score / items_total), (acc_score / items_total))\n",
    "\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(model, x_test):\n",
    "    y_dev = []\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for i in range(0, len(x_test), BATCH_SIZE):\n",
    "            x = x_test[i:i+BATCH_SIZE]\n",
    "            x = torch.tensor(np.array(x).astype(np.float32))\n",
    "            outputs = model(x)\n",
    "            y = (outputs > 0.5)\n",
    "            y_dev.extend(y)\n",
    "\n",
    "    return y_dev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "0.5714333134919922 0.6966561801788113\n",
      "1\n",
      "0.5395073619374668 0.7242514132585581\n",
      "2\n",
      "0.5322582519146749 0.7296247310751125\n",
      "3\n",
      "0.5277940251241121 0.7327292379626976\n",
      "4\n",
      "0.5243827499623 0.7345525745996775\n",
      "5\n",
      "0.521483356086283 0.7361825270478868\n",
      "6\n",
      "0.5188610753636298 0.7376052821509848\n",
      "7\n",
      "0.5164497832484463 0.7390211306759122\n",
      "8\n",
      "0.5142272224311959 0.7402332351448137\n",
      "9\n",
      "0.5121725415654607 0.7413451942302446\n",
      "10\n",
      "0.510225843876934 0.742412260557568\n",
      "11\n",
      "0.5084293723366556 0.7430476657492429\n",
      "12\n",
      "0.5067300511753501 0.7440560261621181\n",
      "13\n",
      "0.5051866206455035 0.7450609332859082\n",
      "14\n",
      "0.503752063642534 0.7458586430645868\n",
      "15\n",
      "0.5024285955103476 0.7466943390232027\n",
      "16\n",
      "0.5011020173689057 0.7476439935216297\n",
      "17\n",
      "0.49986460605734995 0.7483691842295194\n",
      "18\n",
      "0.498722317965918 0.7489527900849163\n",
      "19\n",
      "0.4976401474074949 0.749584741987506\n",
      "20\n",
      "0.4966364578740479 0.7502788530936291\n",
      "21\n",
      "0.4956408892432799 0.7507208740965332\n",
      "22\n",
      "0.4946911594690806 0.7513459194209525\n",
      "23\n",
      "0.4938261365074296 0.7519433384326902\n",
      "24\n",
      "0.49291108882053136 0.7526996087423466\n",
      "25\n",
      "0.49207683927175633 0.752979325158247\n",
      "26\n",
      "0.4912937934254017 0.7534524257629179\n",
      "27\n",
      "0.49052768458365964 0.7539186197894184\n",
      "28\n",
      "0.48980189713607974 0.7542535888306818\n",
      "29\n",
      "0.48902049401931186 0.7547819420607157\n",
      "30\n",
      "0.48832297395034846 0.7553828143615386\n",
      "31\n",
      "0.48764632061179475 0.7556832505119501\n",
      "32\n",
      "0.4869866096390585 0.7563359221490509\n",
      "33\n",
      "0.48635514366306837 0.7567813964410403\n",
      "34\n",
      "0.48572428783405186 0.7574616943908226\n",
      "35\n",
      "0.4851059672855987 0.7577897568539155\n",
      "36\n",
      "0.4844747067054167 0.7581350857624344\n",
      "37\n",
      "0.4838937349887044 0.7585080409836349\n",
      "38\n",
      "0.48333403454228063 0.7584769613818682\n",
      "39\n",
      "0.4827657912931136 0.7590916468390319\n",
      "40\n",
      "0.48225590195293194 0.7592435915587802\n",
      "41\n",
      "0.48163791058193006 0.7597857579451549\n",
      "42\n",
      "0.4811314198011156 0.7601414467209293\n",
      "43\n",
      "0.4806143895582873 0.7607181459981559\n",
      "44\n",
      "0.4800953709221985 0.7609598762341192\n",
      "45\n",
      "0.47956847999038854 0.7612913919862974\n",
      "46\n",
      "0.4790844480555675 0.7616470807620719\n",
      "47\n",
      "0.47860829903493235 0.761795572192735\n",
      "48\n",
      "0.4781695258369003 0.762089101764976\n",
      "49\n",
      "0.4776893918277479 0.7624827767206876\n",
      "50\n",
      "0.47722041533606274 0.7628246523401213\n",
      "51\n",
      "0.4767699545351635 0.7631596213813847\n",
      "52\n",
      "0.47637271544187293 0.7633253792574738\n",
      "53\n",
      "0.47592309171862696 0.7635705627825222\n",
      "54\n",
      "0.47549356202221993 0.7638744522220189\n",
      "55\n",
      "0.47508612961542673 0.7642370475759638\n",
      "56\n",
      "0.47468646391106234 0.764351006115775\n",
      "57\n",
      "0.4742474519497854 0.7646790685788679\n",
      "58\n",
      "0.4737666401496256 0.7650623836673239\n",
      "59\n",
      "0.47335995538274667 0.7652972073251169\n",
      "60\n",
      "0.4729701449600526 0.7654422454666947\n",
      "61\n",
      "0.4725969795466422 0.7656252697882098\n",
      "62\n",
      "0.47221369839845356 0.7661121835492215\n",
      "63\n",
      "0.4718388513139844 0.7663021144489068\n",
      "64\n",
      "0.47147053143633466 0.7664575124577404\n",
      "65\n",
      "0.4711233925314738 0.7666543499355961\n",
      "66\n",
      "0.47074752713287643 0.7669340663514965\n",
      "67\n",
      "0.4703749315941604 0.7673242880181229\n",
      "68\n",
      "0.470022628463849 0.7672828485491006\n",
      "69\n",
      "0.4696828857076031 0.7677559491537715\n",
      "70\n",
      "0.4693190624670805 0.7678491879590716\n",
      "71\n",
      "0.4689852795644025 0.7683257418528278\n",
      "72\n",
      "0.46865665018555414 0.7687194168085393\n",
      "73\n",
      "0.468258934943202 0.7687297766757949\n",
      "74\n",
      "0.46797715189850664 0.7687608562775615\n",
      "75\n",
      "0.46764439033620286 0.7690716522952286\n",
      "76\n",
      "0.46732620352289256 0.769351368711129\n",
      "77\n",
      "0.4670077633846447 0.769700150908733\n",
      "78\n",
      "0.4667117469477995 0.7697692166904369\n",
      "79\n",
      "0.4664313273439932 0.7700420265281668\n",
      "80\n",
      "0.4661624620708029 0.7704426080620487\n",
      "81\n",
      "0.46585000600566223 0.7703148363658967\n",
      "82\n",
      "0.4655422194174101 0.7706739784307564\n",
      "83\n",
      "0.4652497145337105 0.7708708159086122\n",
      "84\n",
      "0.46495632112782237 0.7708673626195269\n",
      "85\n",
      "0.46467082155335016 0.7712023316607903\n",
      "86\n",
      "0.46439953297526376 0.7715269408347981\n",
      "87\n",
      "0.4640616501378699 0.7718032039616133\n",
      "88\n",
      "0.46377603995408073 0.7721139999792803\n",
      "89\n",
      "0.46352646427627725 0.7722072387845804\n",
      "90\n",
      "0.46323162764281506 0.7723971696842657\n",
      "91\n",
      "0.4629823635760337 0.7724765953332251\n",
      "92\n",
      "0.46268333841052883 0.7727770314836366\n",
      "93\n",
      "0.4624373474653466 0.7728978966016182\n",
      "94\n",
      "0.4621637105605031 0.7731396268375814\n",
      "95\n",
      "0.4618823675153035 0.7730463880322813\n",
      "96\n",
      "0.4615598618066211 0.7733571840499484\n",
      "97\n",
      "0.4613917053205442 0.7734089833862262\n",
      "98\n",
      "0.4610787309787952 0.7734642360115892\n",
      "99\n",
      "0.4608159763176817 0.7737197794038932\n"
     ]
    }
   ],
   "source": [
    "model = train_model(X_train_w2v, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
    "             or [np.zeros(FEATURES)], axis=0) for doc in X_dev]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_dev = predict(model, X_dev_w2v)\n",
    "y_dev = ['1' if bool(item) else '0' for item in y_dev]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dev-0/out.tsv', 'wt') as f:\n",
    "    for pred in y_dev:\n",
    "        f.write(str(pred)+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
    "              or [np.zeros(FEATURES)], axis=0) for doc in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test = predict(model, X_test_w2v)\n",
    "y_test = ['1' if bool(item) else '0' for item in y_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test-A/out.tsv', 'wt') as f:\n",
    "    for pred in y_test:\n",
    "        f.write(str(pred)+'\\n')"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "3ecbe772e0e869a386d256c10cc6d948e50cd4df13a3f02e58ab4f2a666d7bf0"
  },
  "kernelspec": {
   "display_name": "Python 3.8.13 ('eks')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

run.py: Normal file, 158 lines added
@@ -0,0 +1,158 @@
# %%
import numpy as np
import gensim
import re
import torch
import pandas as pd
from gensim.models import Word2Vec
from gensim import downloader
from sklearn.feature_extraction.text import TfidfVectorizer
# (gensim, pandas, Word2Vec and TfidfVectorizer are not used further below)

# %%
# Hyperparameters: minibatch size, training epochs, and embedding dimensionality
# (200 matches the glove-twitter-200 vectors loaded below).
BATCH_SIZE = 64
EPOCHS = 100
FEATURES = 200

# %%
# Load raw TSV lines for the train/dev/test splits; expected.tsv holds the 0/1 labels.
with open('train/in.tsv', 'r', encoding='utf8') as f:
    X_train = f.readlines()
with open('train/expected.tsv', 'r', encoding='utf8') as f:
    y_train = f.readlines()

with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev = f.readlines()

with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = f.readlines()

# %%
# Strip the tab-separated Timestamp column (plus newline) from the inputs, and
# the trailing newline from the labels; e.g. a hypothetical line
# 'saw a ghost in my attic\t1420070400\n' becomes 'saw a ghost in my attic'.
for i, line in enumerate(X_train):
    X_train[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_dev):
    X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_test):
    X_test[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(y_train):
    y_train[i] = re.sub(r'\n', '', line)

for i, line in enumerate(y_dev):
    y_dev[i] = re.sub(r'\n', '', line)

# %%
# Alternative loader (defined and called once, but its result is unused below):
# returns lowercased posts and integer labels for a given split directory.
def readData(fileName):
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:
        X = np.array([x.strip().lower() for x in f.readlines()])
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(x.strip()) for x in f.readlines()])
    return X, y

X_file, y_file = readData('dev-0')

# %%
class NeuralNetworkModel(torch.nn.Module):
    """Single-hidden-layer binary classifier over averaged word embeddings."""

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

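# %%
# Editor's sketch (not in the original script): a quick shape check for the
# model above; a batch of 4 averaged-embedding rows maps to 4 probabilities.
#
#   batch = torch.zeros(4, FEATURES)       # 4 documents, FEATURES=200 dims
#   out = NeuralNetworkModel()(batch)      # shape (4, 1), values in (0, 1)
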
# %%
# Pretrained 200-dimensional GloVe vectors trained on Twitter (downloads on first run).
word2vec = downloader.load("glove-twitter-200")

# %%
# Represent each document as the mean of its word vectors (zero vector if no word is known).
X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
               or [np.zeros(FEATURES)], axis=0) for doc in X_train]

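# %%
# Editor's sketch (not in the original script) of what the mean-pooling above
# computes; gensim's KeyedVectors support `in` and dict-style indexing, and the
# example tokens are hypothetical:
#
#   vecs = [word2vec['ghost'], word2vec['sighting']]   # each of shape (FEATURES,)
#   doc_vec = np.mean(vecs, axis=0)                    # shape (FEATURES,)
#
# The `or [np.zeros(FEATURES)]` fallback fires when no token of a document is
# in the vocabulary, so every document still yields a FEATURES-length vector.
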
# %%
# Labels are still the strings '0'/'1' here; astype(np.float32) in the training
# loop converts each batch to floats.
y_train = np.array(y_train)

# %%
def train_model(X_train, y_train):
    # Train with binary cross-entropy and averaged SGD, printing the mean loss
    # and training accuracy after each epoch.
    model = NeuralNetworkModel()

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)

    for epoch in range(EPOCHS):

        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0

        for i in range(0, y_train.shape[0], BATCH_SIZE):
            x = X_train[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            y = y_train[i:i+BATCH_SIZE]
            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)
            y_pred = model(x)
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]

            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            loss_score += loss.item() * y.shape[0]

        print((loss_score / items_total), (acc_score / items_total))

    return model

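# %%
# Editor's note (suggestion, not in the original): because forward() ends in
# torch.sigmoid, BCELoss is the right criterion here; the numerically safer
# PyTorch idiom is to return raw logits and use BCEWithLogitsLoss instead:
#
#   criterion = torch.nn.BCEWithLogitsLoss()   # fuses sigmoid + BCE stably
#   loss = criterion(logits, y)                # logits = fc2(relu(fc1(x)))
#
# Predictions would then threshold torch.sigmoid(logits) > 0.5 as before.
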
# %%
def predict(model, x_test):
    # Batched inference under no_grad; returns boolean predictions (p > 0.5).
    y_dev = []

    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = x_test[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            outputs = model(x)
            y = (outputs > 0.5)
            y_dev.extend(y)

    return y_dev

# %%
model = train_model(X_train_w2v, y_train)

# %%
X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
             or [np.zeros(FEATURES)], axis=0) for doc in X_dev]

# %%
y_dev = predict(model, X_dev_w2v)
y_dev = ['1' if bool(item) else '0' for item in y_dev]

# %%
with open('dev-0/out.tsv', 'wt') as f:
    for pred in y_dev:
        f.write(str(pred)+'\n')

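# %%
# Editor's note: the README promises a *probability* of a paranormal subreddit,
# while the lines above write hard 0/1 labels. A minimal sketch (assumption,
# not the original behaviour) of emitting probabilities instead:
#
#   with torch.no_grad():
#       probs = model(torch.tensor(np.array(X_dev_w2v).astype(np.float32)))
#   with open('dev-0/out.tsv', 'wt') as f:
#       for p in probs:
#           f.write(f'{p.item():.4f}\n')
#
# The Likelihood metric in config.txt could then score the soft predictions.
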
# %%
X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
              or [np.zeros(FEATURES)], axis=0) for doc in X_test]

# %%
y_test = predict(model, X_test_w2v)
y_test = ['1' if bool(item) else '0' for item in y_test]

# %%
with open('test-A/out.tsv', 'wt') as f:
    for pred in y_test:
        f.write(str(pred)+'\n')

test-A/out.tsv: Normal file, 5152 lines added
File diff suppressed because it is too large.