{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": { "name": "Paranormal or skeptic.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true },
  "kernelspec": { "name": "python3", "display_name": "Python 3" }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": { "id": "x8uZz8__5sXr", "colab_type": "text" },
   "source": [ "\n", "# Loading Data\n" ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "NQFKg_czGeRA", "colab_type": "code", "outputId": "60d1c52a-8b42-4a26-d878-67f284589917", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } },
   "source": [ "!xzcat train/in.tsv.xz | wc -l" ],
   "execution_count": 0,
   "outputs": [ { "output_type": "stream", "text": [ "289579\n" ], "name": "stdout" } ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "GxUYlO5M6SOJ", "colab_type": "code", "colab": {} },
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.sparse import hstack\n",
    "import csv\n",
    "import datetime"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "MWDzekYY6S9f", "colab_type": "code", "colab": {} },
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "SrLtGV3p4pKW", "colab_type": "code", "colab": {} },
   "source": [
    "def load_set(path, isTest):\n",
    "    \"\"\"Load one data split stored under the directory `path`.\n",
    "\n",
    "    Reads `<path>/in.tsv.xz` as a header-less, tab-separated file with the\n",
    "    columns `text` and `date` (quoting is disabled, so quote characters in\n",
    "    the text are kept literally) and converts `date` from Unix seconds to\n",
    "    timezone-naive UTC timestamps.\n",
    "\n",
    "    Args:\n",
    "        path: Directory of the split, e.g. \"train\".\n",
    "        isTest: If true, `expected.tsv` is not read and only the inputs\n",
    "            are returned.\n",
    "\n",
    "    Returns:\n",
    "        `dataset` when `isTest` is true, otherwise the tuple\n",
    "        `(dataset, expected)`, where `expected` holds the single\n",
    "        categorical column `class` read from `<path>/expected.tsv`.\n",
    "    \"\"\"\n",
    "    dataset = pd.read_csv(\n",
    "        path + \"/in.tsv.xz\",\n",
    "        delimiter=\"\\t\",\n",
    "        header=None,\n",
    "        names=[\"text\", \"date\"],\n",
    "        quoting=csv.QUOTE_NONE,\n",
    "    )\n",
    "    # Vectorised conversion of Unix seconds. The result is always UTC, whereas a\n",
    "    # per-row datetime.fromtimestamp() would follow the kernel's local time zone\n",
    "    # and make the derived day/month/year features machine-dependent.\n",
    "    dataset[\"date\"] = pd.to_datetime(dataset[\"date\"], unit=\"s\")\n",
    "    if isTest:\n",
    "        return dataset\n",
    "    expected = pd.read_csv(path + \"/expected.tsv\", header=None, names=[\"class\"], dtype=\"category\")\n",
    "    return dataset, expected"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "wH70ClgjBeCO", "colab_type": "text" },
   "source": [ "**Load all sets**" ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "huOmuCrE6yCR", "colab_type": "code", "colab": {} },
   "source": [
    "train_set, expected_train = load_set(\"train\", False)\n",
    "dev_set, expected_dev = load_set(\"dev-0\", False)\n",
    "test_set = load_set(\"test-A\", True)"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "mWO1IroV6cmm", "colab_type": "text" },
   "source": [ "# Prepare data" ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "VVd7DJ1E6cOO", "colab_type": "code", "colab": {} },
   "source": [
    "def prepare_data(data):\n",
    "    \"\"\"Add `day`, `month` and `year` columns derived from the `date` column.\n",
    "\n",
    "    The frame is modified in place and is also returned, so both\n",
    "    `prepare_data(df)` and `df = prepare_data(df)` leave `df` with the\n",
    "    three extra columns. Running it again simply overwrites them.\n",
    "    \"\"\"\n",
    "    data[\"day\"] = data[\"date\"].dt.day\n",
    "    data[\"month\"] = data[\"date\"].dt.month\n",
    "    data[\"year\"] = data[\"date\"].dt.year\n",
    "    return data"
   ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "i_k63XB5642m", "colab_type": "code", "colab": {} },
   "source": [ "train_set = prepare_data(train_set)" ],
   "execution_count": 0,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "dcjUSa7f7Wex", "colab_type": "code", "outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } },
   "source": [ "train_set.sample(5)" ],
   "execution_count": 0,
   "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", " | text | \n", "date | \n", "day | \n", "month | \n", "year | \n", "
---|---|---|---|---|---|
103770 | \n", "Holy crap. I don't think I've seen or heard o... | \n", "2010-07-16 19:27:08 | \n", "16 | \n", "7 | \n", "2010 | \n", "
240391 | \n", "You lost all pretense of civility with your ar... | \n", "2010-09-30 12:18:36 | \n", "30 | \n", "9 | \n", "2010 | \n", "
220910 | \n", "What do people think of ghost adventures? Cur... | \n", "2012-08-21 19:59:56 | \n", "21 | \n", "8 | \n", "2012 | \n", "
39644 | \n", "Congrats on getting the joke. | \n", "2011-07-29 18:19:46 | \n", "29 | \n", "7 | \n", "2011 | \n", "
220867 | \n", "We live in a world where any media can be copi... | \n", "2012-07-18 08:53:24 | \n", "18 | \n", "7 | \n", "2012 | \n", "