{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Paranormal or skeptic.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [
{ "cell_type": "markdown", "metadata": { "id": "x8uZz8__5sXr", "colab_type": "text" }, "source": [ "\n", "# Loading Data\n" ] },
{ "cell_type": "code", "metadata": { "id": "NQFKg_czGeRA", "colab_type": "code", "outputId": "60d1c52a-8b42-4a26-d878-67f284589917", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "!xzcat train/in.tsv.xz | wc -l" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "289579\n" ], "name": "stdout" } ] },
{ "cell_type": "code", "metadata": { "id": "GxUYlO5M6SOJ", "colab_type": "code", "colab": {} }, "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import numpy as np\n", "from scipy.sparse import hstack\n", "import csv\n", "import datetime" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "MWDzekYY6S9f", "colab_type": "code", "colab": {} }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "SrLtGV3p4pKW", "colab_type": "code", "colab": {} }, "source": [
"def load_set(path, isTest):\n",
"    \"\"\"Load one data split from the directory `path`.\n",
"\n",
"    `in.tsv.xz` is read as a headerless, unquoted TSV with the columns `text`\n",
"    and `date`; `date` (Unix time in seconds) is converted to datetimes.\n",
"\n",
"    Returns only the frame when `isTest` is true; otherwise returns\n",
"    `(dataset, expected)`, where `expected` holds the categorical `class`\n",
"    column read from `expected.tsv`.\n",
"    \"\"\"\n",
"    dataset = pd.read_csv(\n",
"        path + \"/in.tsv.xz\",\n",
"        delimiter=\"\\t\",\n",
"        header=None,\n",
"        names=[\"text\", \"date\"],\n",
"        quoting=csv.QUOTE_NONE,\n",
"    )\n",
"    # Vectorised and independent of the machine's timezone (always UTC),\n",
"    # unlike a per-row datetime.datetime.fromtimestamp() conversion.\n",
"    dataset[\"date\"] = pd.to_datetime(dataset[\"date\"], unit=\"s\")\n",
"    if isTest:\n",
"        return dataset\n",
"    expected = pd.read_csv(\n",
"        path + \"/expected.tsv\", header=None, names=[\"class\"], dtype=\"category\"\n",
"    )\n",
"    return dataset, expected"
], "execution_count": 0, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "wH70ClgjBeCO", "colab_type": "text" }, "source": [ "**Load all sets**" ] },
{ "cell_type": "code", "metadata": { "id": "huOmuCrE6yCR", "colab_type": "code", "colab": {} }, "source": [ "train_set, expected_train = load_set(\"train\", False)\n", "dev_set, expected_dev = load_set(\"dev-0\", False)\n", "test_set = load_set(\"test-A\", True)" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "mWO1IroV6cmm", "colab_type": "text" }, "source": [ "# Prepare data" ] },
{ "cell_type": "code", "metadata": { "id": "VVd7DJ1E6cOO", "colab_type": "code", "colab": {} }, "source": [
"def prepare_data(data):\n",
"    \"\"\"Return a copy of `data` with `day`, `month` and `year` columns added.\n",
"\n",
"    The columns are derived from the datetime column `date`. The input frame\n",
"    is left unmodified, so re-running a cell that calls this is harmless.\n",
"    \"\"\"\n",
"    dates = data[\"date\"].dt\n",
"    return data.assign(day=dates.day, month=dates.month, year=dates.year)"
], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "i_k63XB5642m", "colab_type": "code", "colab": {} }, "source": [ "train_set = prepare_data(train_set)" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "dcjUSa7f7Wex", "colab_type": "code", "outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } }, "source": [ "train_set.sample(5)" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textdatedaymonthyear
103770Holy crap. I don't think I've seen or heard o...2010-07-16 19:27:081672010
240391You lost all pretense of civility with your ar...2010-09-30 12:18:363092010
220910What do people think of ghost adventures? Cur...2012-08-21 19:59:562182012
39644Congrats on getting the joke.2011-07-29 18:19:462972011
220867We live in a world where any media can be copi...2012-07-18 08:53:241872012
\n", "
" ], "text/plain": [ " text ... year\n", "103770 Holy crap. I don't think I've seen or heard o... ... 2010\n", "240391 You lost all pretense of civility with your ar... ... 2010\n", "220910 What do people think of ghost adventures? Cur... ... 2012\n", "39644 Congrats on getting the joke. ... 2011\n", "220867 We live in a world where any media can be copi... ... 2012\n", "\n", "[5 rows x 5 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] },
{ "cell_type": "markdown", "metadata": { "id": "hIZZ9vcu5Xx7", "colab_type": "text" }, "source": [ "# Train" ] },
{ "cell_type": "code", "metadata": { "id": "yqHuHTyI8Kfz", "colab_type": "code", "colab": {} }, "source": [ "vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n", "vectorized = vectorize.fit_transform(train_set[\"text\"])" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "ZaLsOdPe9aFu", "colab_type": "code", "colab": {} }, "source": [ "X = vectorized\n", "y = expected_train[\"class\"]" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "CeYlhwda9Sa7", "colab_type": "code", "outputId": "607d4f8f-f632-4d41-a1ab-e5d020cc00ae", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "bayes = MultinomialNB(alpha=0.4)\n", "bayes.fit(X,y)" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)" ] }, "metadata": { "tags": [] }, "execution_count": 15 } ] },
{ "cell_type": "markdown", "metadata": { "id": "SaIcL28I-JCK", "colab_type": "text" }, "source": [ "# Predict and evaluate" ] },
{ "cell_type": "code", "metadata": { "id": "q34dlX_43ZoV", "colab_type": "code", "colab": {} }, "source": [
"def predict_data(data):\n",
"    \"\"\"Predict a class label for every row of `data`.\n",
"\n",
"    `data[\"text\"]` is turned into features with the fitted `vectorize` and\n",
"    classified with the trained `bayes` model; the array returned by\n",
"    `bayes.predict` is passed back unchanged.\n",
"    \"\"\"\n",
"    # Only the text column feeds the model, so the input frame is left alone:\n",
"    # deriving date columns here would merely mutate the caller's frame.\n",
"    features = vectorize.transform(data[\"text\"])\n",
"    return bayes.predict(features)"
], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "9sLnLLEUHgoM", "colab_type": "code", "colab": {} }, "source": [ "dev_predicted = predict_data(dev_set)" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "yigVrrVJHkob", "colab_type": "code", "outputId": "9491f926-94a3-4310-9f63-be937663489d", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "np.mean(dev_predicted == expected_dev[\"class\"])" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8201820940819423" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] },
{ "cell_type": "code", "metadata": { "id": "gPdE2HK64aRZ", "colab_type": "code", "colab": {} }, "source": [ "test_predicted = predict_data(test_set)" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "QFxuvfUJ8AhJ", "colab_type": "text" }, "source": [ "**Clean output for saving**" ] },
{ "cell_type": "code", "metadata": { "id": "zjypBm1260h1", "colab_type": "code", "colab": {} }, "source": [ "test_predicted = np.array([item.strip() for item in test_predicted])\n", "dev_predicted = np.array([item.strip() for item in dev_predicted])" ], "execution_count": 0, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "baJydHEl4H7N", "colab_type": "text" }, "source": [ "**Save to file**\n" ] },
{ "cell_type": "code", "metadata": { "id": "O6gyoEJf4KhS", "colab_type": "code", "colab": {} }, "source": [
"# '%s' writes each label as-is; '%c' only accepts single-character values.\n",
"np.savetxt('test-A/out.tsv', test_predicted, fmt='%s')\n",
"np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%s')"
], "execution_count": 0, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "jIG2Fxrm89D7", "colab_type": "text" }, "source": [ "**Check geval output**" ] },
{ "cell_type": "code", "metadata": { "id": "mnch9uLE8vkK", "colab_type": "code", "colab": {} }, "source": [
"# -nc: skip the download when ./geval already exists, so a re-run does not\n",
"# leave a stray geval.1 next to the binary that is actually executed.\n",
"!wget -nc https://gonito.net/get/bin/geval\n",
"!chmod u+x geval"
], "execution_count": 0, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "oEkjIcwe8zef", "colab_type": "code", "outputId": "cdb6473e-4eb9-48a7-cc25-25a193cc9194", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "!./geval -t \"dev-0\"" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "0.8202\n" ], "name": "stdout" } ] } ] }