2020-03-22 12:30:10 +01:00
|
|
|
{
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 0,
|
|
|
|
"metadata": {
|
|
|
|
"colab": {
|
|
|
|
"name": "Paranormal or skeptic.ipynb",
|
|
|
|
"provenance": [],
|
|
|
|
"collapsed_sections": [],
|
|
|
|
"toc_visible": true
|
|
|
|
},
|
|
|
|
"kernelspec": {
|
|
|
|
"name": "python3",
|
|
|
|
"display_name": "Python 3"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"cells": [
|
2020-04-23 01:09:38 +02:00
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "dZUIeB9Q8rv3",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
"height": 136
|
|
|
|
},
|
|
|
|
"outputId": "95512ec2-2ea3-4ff5-bc08-09ee3b99c39c"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"!git clone git://gonito.net/paranormal-or-skeptic "
|
|
|
|
],
|
|
|
|
"execution_count": 5,
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"Cloning into 'paranormal-or-skeptic'...\n",
|
|
|
|
"remote: Enumerating objects: 3583, done.\u001b[K\n",
|
|
|
|
"remote: Counting objects: 100% (3583/3583), done.\u001b[K\n",
|
|
|
|
"remote: Compressing objects: 100% (3188/3188), done.\u001b[K\n",
|
|
|
|
"remote: Total 3583 (delta 789), reused 2704 (delta 338)\n",
|
|
|
|
"Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done.\n",
|
|
|
|
"Resolving deltas: 100% (789/789), done.\n"
|
|
|
|
],
|
|
|
|
"name": "stdout"
|
|
|
|
}
|
|
|
|
]
|
|
|
|
},
|
2020-03-22 12:30:10 +01:00
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "x8uZz8__5sXr",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"\n",
|
|
|
|
"# Loading Data\n"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "NQFKg_czGeRA",
|
|
|
|
"colab_type": "code",
|
2020-04-23 01:09:38 +02:00
|
|
|
"outputId": "4cf38154-be9f-48b4-e0ea-cfac084e795a",
|
2020-03-22 12:30:10 +01:00
|
|
|
"colab": {
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
"height": 34
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"!xzcat train/in.tsv.xz | wc -l"
|
|
|
|
],
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 11,
|
2020-03-22 12:30:10 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"289579\n"
|
|
|
|
],
|
|
|
|
"name": "stdout"
|
|
|
|
}
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "GxUYlO5M6SOJ",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"import matplotlib.pyplot as plt\n",
|
|
|
|
"import seaborn as sns\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"import numpy as np\n",
|
|
|
|
"from scipy.sparse import hstack\n",
|
|
|
|
"import csv\n",
|
|
|
|
"import datetime"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "MWDzekYY6S9f",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
|
|
|
|
"from sklearn.pipeline import Pipeline\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
"from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
|
|
|
|
"from sklearn.svm import LinearSVC\n",
|
|
|
|
"from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB\n",
|
|
|
|
"from sklearn.neural_network import MLPClassifier"
|
2020-03-22 12:30:10 +01:00
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "SrLtGV3p4pKW",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"def load_set(path, isTest):\n",
|
|
|
|
" dataset = pd.read_csv(path+\"/in.tsv.xz\", delimiter=\"\\t\",header=None,names=[\"text\",\"date\"],quoting=csv.QUOTE_NONE)\n",
|
|
|
|
" dataset[\"date\"] = pd.to_datetime(dataset[\"date\"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))\n",
|
|
|
|
" if not isTest:\n",
|
|
|
|
" expected = pd.read_csv(path+\"/expected.tsv\",header=None,names=[\"class\"],dtype=\"category\")\n",
|
|
|
|
" return dataset, expected\n",
|
|
|
|
" return dataset"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "wH70ClgjBeCO",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"**Load all sets**"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "huOmuCrE6yCR",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"train_set, expected_train = load_set(\"train\", False)\n",
|
|
|
|
"dev_set, expected_dev = load_set(\"dev-0\", False)\n",
|
|
|
|
"test_set = load_set(\"test-A\", True)"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "mWO1IroV6cmm",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"# Prepare data"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "VVd7DJ1E6cOO",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"def prepare_data(data):\n",
|
|
|
|
" data[\"day\"] = data[\"date\"].dt.day\n",
|
|
|
|
" data[\"month\"] = data[\"date\"].dt.month\n",
|
|
|
|
" data[\"year\"] = data[\"date\"].dt.year\n",
|
|
|
|
" return data"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "i_k63XB5642m",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"train_set = prepare_data(train_set)"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "dcjUSa7f7Wex",
|
|
|
|
"colab_type": "code",
|
2020-04-23 01:09:38 +02:00
|
|
|
"outputId": "f9ade29f-f746-4dd2-eb79-25845095a9f6",
|
2020-03-22 12:30:10 +01:00
|
|
|
"colab": {
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
"height": 204
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"train_set.sample(5)"
|
|
|
|
],
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 18,
|
2020-03-22 12:30:10 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"output_type": "execute_result",
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>text</th>\n",
|
|
|
|
" <th>date</th>\n",
|
|
|
|
" <th>day</th>\n",
|
|
|
|
" <th>month</th>\n",
|
|
|
|
" <th>year</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
" <th>112652</th>\n",
|
|
|
|
" <td>As i hovered over that link I was expecting r/...</td>\n",
|
|
|
|
" <td>2012-03-23 13:34:29</td>\n",
|
|
|
|
" <td>23</td>\n",
|
|
|
|
" <td>3</td>\n",
|
|
|
|
" <td>2012</td>\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
" <th>172265</th>\n",
|
|
|
|
" <td>Caesarean section is now the new natural child...</td>\n",
|
|
|
|
" <td>2012-04-19 14:28:59</td>\n",
|
|
|
|
" <td>19</td>\n",
|
|
|
|
" <td>4</td>\n",
|
|
|
|
" <td>2012</td>\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
" <th>150100</th>\n",
|
|
|
|
" <td>The Somerton Man reminds me of the [Lead Masks...</td>\n",
|
|
|
|
" <td>2012-08-04 21:21:56</td>\n",
|
|
|
|
" <td>4</td>\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
" <td>8</td>\n",
|
|
|
|
" <td>2012</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
" <th>153335</th>\n",
|
|
|
|
" <td>As a skeptic, I demand this man provide eviden...</td>\n",
|
|
|
|
" <td>2012-06-20 04:44:02</td>\n",
|
|
|
|
" <td>20</td>\n",
|
|
|
|
" <td>6</td>\n",
|
|
|
|
" <td>2012</td>\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
" <th>149621</th>\n",
|
|
|
|
" <td>It's a fucking bug.</td>\n",
|
|
|
|
" <td>2012-11-15 02:29:24</td>\n",
|
|
|
|
" <td>15</td>\n",
|
|
|
|
" <td>11</td>\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
" <td>2012</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
|
|
|
" text ... year\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
"112652 As i hovered over that link I was expecting r/... ... 2012\n",
|
|
|
|
"172265 Caesarean section is now the new natural child... ... 2012\n",
|
|
|
|
"150100 The Somerton Man reminds me of the [Lead Masks... ... 2012\n",
|
|
|
|
"153335 As a skeptic, I demand this man provide eviden... ... 2012\n",
|
|
|
|
"149621 It's a fucking bug. ... 2012\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
"\n",
|
|
|
|
"[5 rows x 5 columns]"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 18
|
2020-03-22 12:30:10 +01:00
|
|
|
}
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "hIZZ9vcu5Xx7",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"# Train"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "yqHuHTyI8Kfz",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n",
|
|
|
|
"vectorized = vectorize.fit_transform(train_set[\"text\"])"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "ZaLsOdPe9aFu",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"X = vectorized\n",
|
|
|
|
"y = expected_train[\"class\"]"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "CeYlhwda9Sa7",
|
|
|
|
"colab_type": "code",
|
2020-04-23 01:09:38 +02:00
|
|
|
"outputId": "61a66f28-85b6-452d-bdd0-180772498e34",
|
2020-03-22 12:30:10 +01:00
|
|
|
"colab": {
|
|
|
|
"base_uri": "https://localhost:8080/",
|
2020-04-23 01:09:38 +02:00
|
|
|
"height": 102
|
2020-03-22 12:30:10 +01:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"source": [
|
2020-04-23 01:09:38 +02:00
|
|
|
"bayes = LogisticRegression(max_iter=1000)\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
"bayes.fit(X,y)"
|
|
|
|
],
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 176,
|
2020-03-22 12:30:10 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"output_type": "execute_result",
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2020-04-23 01:09:38 +02:00
|
|
|
"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
|
|
|
|
" intercept_scaling=1, l1_ratio=None, max_iter=1000,\n",
|
|
|
|
" multi_class='auto', n_jobs=None, penalty='l2',\n",
|
|
|
|
" random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
|
|
|
|
" warm_start=False)"
|
2020-03-22 12:30:10 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 176
|
2020-03-22 12:30:10 +01:00
|
|
|
}
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "SaIcL28I-JCK",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"# Predict and evaluate"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "q34dlX_43ZoV",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"def predict_data(data):\n",
|
|
|
|
" prepared = prepare_data(data)\n",
|
|
|
|
" vectorized = vectorize.transform(data[\"text\"])\n",
|
2020-04-23 01:09:38 +02:00
|
|
|
" predicted = bayes.predict_proba(vectorized)[:,1]\n",
|
|
|
|
" predicted[predicted < 0.05] = 0.05\n",
|
|
|
|
" predicted[predicted > 0.95] = 0.95\n",
|
2020-03-22 12:30:10 +01:00
|
|
|
" return predicted"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "9sLnLLEUHgoM",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"dev_predicted = predict_data(dev_set)"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "yigVrrVJHkob",
|
|
|
|
"colab_type": "code",
|
2020-04-23 01:09:38 +02:00
|
|
|
"outputId": "42a53652-60ed-4a11-85cf-683ba4b91d23",
|
2020-03-22 12:30:10 +01:00
|
|
|
"colab": {
|
|
|
|
"base_uri": "https://localhost:8080/",
|
2020-04-23 01:09:38 +02:00
|
|
|
"height": 51
|
2020-03-22 12:30:10 +01:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"source": [
|
2020-04-23 01:09:38 +02:00
|
|
|
"dev_predicted"
|
2020-03-22 12:30:10 +01:00
|
|
|
],
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 195,
|
2020-03-22 12:30:10 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"output_type": "execute_result",
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2020-04-23 01:09:38 +02:00
|
|
|
"array([0.05 , 0.75847969, 0.86484399, ..., 0.0650311 , 0.95 ,\n",
|
|
|
|
" 0.37791457])"
|
2020-03-22 12:30:10 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 195
|
2020-03-22 12:30:10 +01:00
|
|
|
}
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "gPdE2HK64aRZ",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"test_predicted = predict_data(test_set)"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "QFxuvfUJ8AhJ",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"**Clean output for saving**"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "zjypBm1260h1",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"test_predicted = np.array([item.strip() for item in test_predicted])\n",
|
|
|
|
"dev_predicted = np.array([item.strip() for item in dev_predicted])"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "baJydHEl4H7N",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"**Save to file**\n"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "O6gyoEJf4KhS",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
2020-04-23 01:09:38 +02:00
|
|
|
"np.savetxt('test-A/out.tsv', test_predicted, '%f')\n",
|
|
|
|
"np.savetxt('dev-0/out.tsv', dev_predicted, '%f')"
|
2020-03-22 12:30:10 +01:00
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {
|
|
|
|
"id": "jIG2Fxrm89D7",
|
|
|
|
"colab_type": "text"
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"**Check geval output**"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "mnch9uLE8vkK",
|
|
|
|
"colab_type": "code",
|
|
|
|
"colab": {}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"!wget https://gonito.net/get/bin/geval\n",
|
|
|
|
"!chmod u+x geval"
|
|
|
|
],
|
|
|
|
"execution_count": 0,
|
|
|
|
"outputs": []
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"metadata": {
|
|
|
|
"id": "oEkjIcwe8zef",
|
|
|
|
"colab_type": "code",
|
2020-04-23 01:09:38 +02:00
|
|
|
"outputId": "16433b8f-9e3a-4e49-db5d-dc7373c3c675",
|
2020-03-22 12:30:10 +01:00
|
|
|
"colab": {
|
|
|
|
"base_uri": "https://localhost:8080/",
|
2020-04-23 01:09:38 +02:00
|
|
|
"height": 102
|
2020-03-22 12:30:10 +01:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"!./geval -t \"dev-0\""
|
|
|
|
],
|
2020-04-23 01:09:38 +02:00
|
|
|
"execution_count": 214,
|
2020-03-22 12:30:10 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2020-04-23 01:09:38 +02:00
|
|
|
"Likelihood\t0.6707\n",
|
|
|
|
"Accuracy\t0.8151\n",
|
|
|
|
"F1.0\t0.7197\n",
|
|
|
|
"Precision\t0.7762\n",
|
|
|
|
"Recall\t0.6710\n"
|
2020-03-22 12:30:10 +01:00
|
|
|
],
|
|
|
|
"name": "stdout"
|
|
|
|
}
|
|
|
|
]
|
|
|
|
}
|
|
|
|
]
|
|
|
|
}
|