{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Paranormal or skeptic.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "dZUIeB9Q8rv3", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "outputId": "95512ec2-2ea3-4ff5-bc08-09ee3b99c39c" }, "source": [ "!git clone git://gonito.net/paranormal-or-skeptic " ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "Cloning into 'paranormal-or-skeptic'...\n", "remote: Enumerating objects: 3583, done.\u001b[K\n", "remote: Counting objects: 100% (3583/3583), done.\u001b[K\n", "remote: Compressing objects: 100% (3188/3188), done.\u001b[K\n", "remote: Total 3583 (delta 789), reused 2704 (delta 338)\n", "Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done.\n", "Resolving deltas: 100% (789/789), done.\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "x8uZz8__5sXr", "colab_type": "text" }, "source": [ "\n", "# Loading Data\n" ] }, { "cell_type": "code", "metadata": { "id": "NQFKg_czGeRA", "colab_type": "code", "outputId": "4cf38154-be9f-48b4-e0ea-cfac084e795a", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "!xzcat train/in.tsv.xz | wc -l" ], "execution_count": 11, "outputs": [ { "output_type": "stream", "text": [ "289579\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "GxUYlO5M6SOJ", "colab_type": "code", "colab": {} }, "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import numpy as np\n", "from scipy.sparse import hstack\n", "import csv\n", "import datetime" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "MWDzekYY6S9f", "colab_type": "code", "colab": {} }, "source": [ "from sklearn.feature_extraction.text import 
CountVectorizer, TfidfTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import SGDClassifier, LogisticRegression\n", "from sklearn.svm import LinearSVC\n", "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB\n", "from sklearn.neural_network import MLPClassifier" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SrLtGV3p4pKW", "colab_type": "code", "colab": {} }, "source": [ "def load_set(path, isTest):\n", "    \"\"\"Load one split of the challenge from the directory `path`.\n", "\n", "    Reads `path`/in.tsv.xz as a headerless, tab-separated table with the\n", "    columns \"text\" and \"date\", and converts \"date\" from Unix epoch\n", "    seconds to pandas timestamps.\n", "\n", "    Args:\n", "        path: directory of the split, e.g. \"train\", \"dev-0\" or \"test-A\".\n", "        isTest: when true only the inputs are read; otherwise\n", "            `path`/expected.tsv is read as well.\n", "\n", "    Returns:\n", "        The input frame when `isTest` is true, else the tuple\n", "        `(dataset, expected)` where `expected` has one categorical\n", "        column named \"class\".\n", "    \"\"\"\n", "    dataset = pd.read_csv(\n", "        path + \"/in.tsv.xz\",\n", "        delimiter=\"\\t\",\n", "        header=None,\n", "        names=[\"text\", \"date\"],\n", "        quoting=csv.QUOTE_NONE,  # quote characters in the text are ordinary data\n", "    )\n", "    # Vectorised conversion of epoch seconds. Unlike a per-row\n", "    # datetime.fromtimestamp() call it does not depend on the local time zone\n", "    # of the machine (values are read as UTC) and it tolerates missing values.\n", "    dataset[\"date\"] = pd.to_datetime(dataset[\"date\"], unit=\"s\")\n", "    if isTest:\n", "        return dataset\n", "    expected = pd.read_csv(\n", "        path + \"/expected.tsv\",\n", "        header=None,\n", "        names=[\"class\"],\n", "        dtype=\"category\",\n", "    )\n", "    return dataset, expected" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "wH70ClgjBeCO", "colab_type": "text" }, "source": [ "**Load all sets**" ] }, { "cell_type": "code", "metadata": { "id": "huOmuCrE6yCR", "colab_type": "code", "colab": {} }, "source": [ "train_set, expected_train = load_set(\"train\", False)\n", "dev_set, expected_dev = load_set(\"dev-0\", False)\n", "test_set = load_set(\"test-A\", True)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "mWO1IroV6cmm", "colab_type": "text" }, "source": [ "# Prepare data" ] }, { "cell_type": "code", "metadata": { "id": "VVd7DJ1E6cOO", "colab_type": "code", "colab": {} }, "source": [ "def prepare_data(data):\n", "    \"\"\"Return a copy of `data` extended with calendar columns.\n", "\n", "    The columns \"day\", \"month\" and \"year\" are derived from the datetime\n", "    column \"date\". The input frame itself is not modified, so re-running\n", "    a cell never changes a frame that an earlier cell displayed.\n", "    \"\"\"\n", "    dates = data[\"date\"].dt\n", "    return data.assign(day=dates.day, month=dates.month, year=dates.year)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "i_k63XB5642m", "colab_type": "code", "colab": {} }, "source": [ "train_set = 
prepare_data(train_set)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dcjUSa7f7Wex", "colab_type": "code", "outputId": "f9ade29f-f746-4dd2-eb79-25845095a9f6", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } }, "source": [ "train_set.sample(5)" ], "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textdatedaymonthyear
112652As i hovered over that link I was expecting r/...2012-03-23 13:34:292332012
172265Caesarean section is now the new natural child...2012-04-19 14:28:591942012
150100The Somerton Man reminds me of the [Lead Masks...2012-08-04 21:21:56482012
153335As a skeptic, I demand this man provide eviden...2012-06-20 04:44:022062012
149621It's a fucking bug.2012-11-15 02:29:2415112012
\n", "
" ], "text/plain": [ " text ... year\n", "112652 As i hovered over that link I was expecting r/... ... 2012\n", "172265 Caesarean section is now the new natural child... ... 2012\n", "150100 The Somerton Man reminds me of the [Lead Masks... ... 2012\n", "153335 As a skeptic, I demand this man provide eviden... ... 2012\n", "149621 It's a fucking bug. ... 2012\n", "\n", "[5 rows x 5 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "markdown", "metadata": { "id": "hIZZ9vcu5Xx7", "colab_type": "text" }, "source": [ "# Train" ] }, { "cell_type": "code", "metadata": { "id": "yqHuHTyI8Kfz", "colab_type": "code", "colab": {} }, "source": [ "vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n", "vectorized = vectorize.fit_transform(train_set[\"text\"])" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ZaLsOdPe9aFu", "colab_type": "code", "colab": {} }, "source": [ "X = vectorized\n", "y = expected_train[\"class\"]" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "CeYlhwda9Sa7", "colab_type": "code", "outputId": "61a66f28-85b6-452d-bdd0-180772498e34", "colab": { "base_uri": "https://localhost:8080/", "height": 102 } }, "source": [ "bayes = LogisticRegression(max_iter=1000)\n", "bayes.fit(X,y)" ], "execution_count": 176, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, l1_ratio=None, max_iter=1000,\n", " multi_class='auto', n_jobs=None, penalty='l2',\n", " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)" ] }, "metadata": { "tags": [] }, "execution_count": 176 } ] }, { "cell_type": "markdown", "metadata": { "id": "SaIcL28I-JCK", "colab_type": "text" }, "source": [ "# Predict and evaluate" ] }, { "cell_type": "code", "metadata": { "id": "q34dlX_43ZoV", "colab_type": 
"code", "colab": {} }, "source": [ "def predict_data(data):\n", " prepared = prepare_data(data)\n", " vectorized = vectorize.transform(data[\"text\"])\n", " predicted = bayes.predict_proba(vectorized)[:,1]\n", " predicted[predicted < 0.05] = 0.05\n", " predicted[predicted > 0.95] = 0.95\n", " return predicted" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "9sLnLLEUHgoM", "colab_type": "code", "colab": {} }, "source": [ "dev_predicted = predict_data(dev_set)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "yigVrrVJHkob", "colab_type": "code", "outputId": "42a53652-60ed-4a11-85cf-683ba4b91d23", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "dev_predicted" ], "execution_count": 195, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([0.05 , 0.75847969, 0.86484399, ..., 0.0650311 , 0.95 ,\n", " 0.37791457])" ] }, "metadata": { "tags": [] }, "execution_count": 195 } ] }, { "cell_type": "code", "metadata": { "id": "gPdE2HK64aRZ", "colab_type": "code", "colab": {} }, "source": [ "test_predicted = predict_data(test_set)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "QFxuvfUJ8AhJ", "colab_type": "text" }, "source": [ "**Clean output for saving**" ] }, { "cell_type": "code", "metadata": { "id": "zjypBm1260h1", "colab_type": "code", "colab": {} }, "source": [ "test_predicted = np.array([item.strip() for item in test_predicted])\n", "dev_predicted = np.array([item.strip() for item in dev_predicted])" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "baJydHEl4H7N", "colab_type": "text" }, "source": [ "**Save to file**\n" ] }, { "cell_type": "code", "metadata": { "id": "O6gyoEJf4KhS", "colab_type": "code", "colab": {} }, "source": [ "np.savetxt('test-A/out.tsv', test_predicted, '%f')\n", "np.savetxt('dev-0/out.tsv', dev_predicted, '%f')" ], 
"execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "jIG2Fxrm89D7", "colab_type": "text" }, "source": [ "**Check geval output**" ] }, { "cell_type": "code", "metadata": { "id": "mnch9uLE8vkK", "colab_type": "code", "colab": {} }, "source": [ "!wget https://gonito.net/get/bin/geval\n", "!chmod u+x geval" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oEkjIcwe8zef", "colab_type": "code", "outputId": "16433b8f-9e3a-4e49-db5d-dc7373c3c675", "colab": { "base_uri": "https://localhost:8080/", "height": 102 } }, "source": [ "!./geval -t \"dev-0\"" ], "execution_count": 214, "outputs": [ { "output_type": "stream", "text": [ "Likelihood\t0.6707\n", "Accuracy\t0.8151\n", "F1.0\t0.7197\n", "Precision\t0.7762\n", "Recall\t0.6710\n" ], "name": "stdout" } ] } ] }