{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Paranormal or skeptic.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "dZUIeB9Q8rv3", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "outputId": "95512ec2-2ea3-4ff5-bc08-09ee3b99c39c" }, "source": [ "# Fetch the dataset, then work from the repository root: every path used below\n", "# (train/, dev-0/, test-A/) is relative to it.\n", "!git clone git://gonito.net/paranormal-or-skeptic \n", "%cd paranormal-or-skeptic" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "Cloning into 'paranormal-or-skeptic'...\n", "remote: Enumerating objects: 3583, done.\u001b[K\n", "remote: Counting objects: 100% (3583/3583), done.\u001b[K\n", "remote: Compressing objects: 100% (3188/3188), done.\u001b[K\n", "remote: Total 3583 (delta 789), reused 2704 (delta 338)\n", "Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done.\n", "Resolving deltas: 100% (789/789), done.\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "x8uZz8__5sXr", "colab_type": "text" }, "source": [ "\n", "# Loading Data\n" ] }, { "cell_type": "code", "metadata": { "id": "NQFKg_czGeRA", "colab_type": "code", "outputId": "4cf38154-be9f-48b4-e0ea-cfac084e795a", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "!xzcat train/in.tsv.xz | wc -l" ], "execution_count": 11, "outputs": [ { "output_type": "stream", "text": [ "289579\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "GxUYlO5M6SOJ", "colab_type": "code", "colab": {} }, "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import numpy as np\n", "from scipy.sparse import hstack\n", "import csv\n", "import datetime" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "MWDzekYY6S9f", "colab_type": "code", "colab": {} }, "source": [ "from sklearn.feature_extraction.text import 
CountVectorizer, TfidfTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import SGDClassifier, LogisticRegression\n", "from sklearn.svm import LinearSVC\n", "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB\n", "from sklearn.neural_network import MLPClassifier" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SrLtGV3p4pKW", "colab_type": "code", "colab": {} }, "source": [ "def load_set(path, isTest):\n", "    \"\"\"Load one split of the dataset from the directory `path`.\n", "\n", "    Reads `path/in.tsv.xz` (tab-separated, no header row, quoting disabled)\n", "    into a DataFrame with the columns `text` and `date`. `date` is read as a\n", "    Unix timestamp in seconds and converted to datetime64.\n", "\n", "    Returns only the DataFrame when `isTest` is true; otherwise returns\n", "    `(dataset, expected)`, where `expected` is read from `path/expected.tsv`\n", "    into a single categorical column named `class`.\n", "    \"\"\"\n", "    dataset = pd.read_csv(path + \"/in.tsv.xz\", delimiter=\"\\t\", header=None,\n", "                          names=[\"text\", \"date\"], quoting=csv.QUOTE_NONE)\n", "    # Vectorised conversion of epoch seconds. Unlike datetime.fromtimestamp(),\n", "    # the result does not depend on the local timezone of the machine.\n", "    dataset[\"date\"] = pd.to_datetime(dataset[\"date\"], unit=\"s\")\n", "    if isTest:\n", "        return dataset\n", "    # Parse with an explicit tab separator instead of the comma default.\n", "    expected = pd.read_csv(path + \"/expected.tsv\", delimiter=\"\\t\", header=None,\n", "                           names=[\"class\"], dtype=\"category\")\n", "    return dataset, expected" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "wH70ClgjBeCO", "colab_type": "text" }, "source": [ "**Load all sets**" ] }, { "cell_type": "code", "metadata": { "id": "huOmuCrE6yCR", "colab_type": "code", "colab": {} }, "source": [ "train_set, expected_train = load_set(\"train\", False)\n", "dev_set, expected_dev = load_set(\"dev-0\", False)\n", "test_set = load_set(\"test-A\", True)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "mWO1IroV6cmm", "colab_type": "text" }, "source": [ "# Prepare data" ] }, { "cell_type": "code", "metadata": { "id": "VVd7DJ1E6cOO", "colab_type": "code", "colab": {} }, "source": [ "def prepare_data(data):\n", "    \"\"\"Add `day`, `month` and `year` columns taken from the `date` column.\n", "\n", "    `data` is modified in place and also returned.\n", "    \"\"\"\n", "    data[\"day\"] = data[\"date\"].dt.day\n", "    data[\"month\"] = data[\"date\"].dt.month\n", "    data[\"year\"] = data[\"date\"].dt.year\n", "    return data" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "i_k63XB5642m", "colab_type": "code", "colab": {} }, "source": [ "train_set = 
prepare_data(train_set)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dcjUSa7f7Wex", "colab_type": "code", "outputId": "f9ade29f-f746-4dd2-eb79-25845095a9f6", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } }, "source": [ "train_set.sample(5)" ], "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", " | text | \n", "date | \n", "day | \n", "month | \n", "year | \n", "
---|---|---|---|---|---|
112652 | \n", "As i hovered over that link I was expecting r/... | \n", "2012-03-23 13:34:29 | \n", "23 | \n", "3 | \n", "2012 | \n", "
172265 | \n", "Caesarean section is now the new natural child... | \n", "2012-04-19 14:28:59 | \n", "19 | \n", "4 | \n", "2012 | \n", "
150100 | \n", "The Somerton Man reminds me of the [Lead Masks... | \n", "2012-08-04 21:21:56 | \n", "4 | \n", "8 | \n", "2012 | \n", "
153335 | \n", "As a skeptic, I demand this man provide eviden... | \n", "2012-06-20 04:44:02 | \n", "20 | \n", "6 | \n", "2012 | \n", "
149621 | \n", "It's a fucking bug. | \n", "2012-11-15 02:29:24 | \n", "15 | \n", "11 | \n", "2012 | \n", "