paranormal-or-skeptic/Paranormal_or_skeptic.ipynb

530 lines
14 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Paranormal or skeptic.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "x8uZz8__5sXr",
"colab_type": "text"
},
"source": [
"\n",
"# Loading Data\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "NQFKg_czGeRA",
"colab_type": "code",
"outputId": "60d1c52a-8b42-4a26-d878-67f284589917",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"!xzcat train/in.tsv.xz | wc -l"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"289579\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "GxUYlO5M6SOJ",
"colab_type": "code",
"colab": {}
},
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.sparse import hstack\n",
"import csv\n",
"import datetime"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "MWDzekYY6S9f",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "SrLtGV3p4pKW",
"colab_type": "code",
"colab": {}
},
"source": [
"def load_set(path, isTest):\n",
" dataset = pd.read_csv(path+\"/in.tsv.xz\", delimiter=\"\\t\",header=None,names=[\"text\",\"date\"],quoting=csv.QUOTE_NONE)\n",
" dataset[\"date\"] = pd.to_datetime(dataset[\"date\"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))\n",
" if not isTest:\n",
" expected = pd.read_csv(path+\"/expected.tsv\",header=None,names=[\"class\"],dtype=\"category\")\n",
" return dataset, expected\n",
" return dataset"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "wH70ClgjBeCO",
"colab_type": "text"
},
"source": [
"**Load all sets**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "huOmuCrE6yCR",
"colab_type": "code",
"colab": {}
},
"source": [
"train_set, expected_train = load_set(\"train\", False)\n",
"dev_set, expected_dev = load_set(\"dev-0\", False)\n",
"test_set = load_set(\"test-A\", True)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "mWO1IroV6cmm",
"colab_type": "text"
},
"source": [
"# Prepare data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "VVd7DJ1E6cOO",
"colab_type": "code",
"colab": {}
},
"source": [
"def prepare_data(data):\n",
" data[\"day\"] = data[\"date\"].dt.day\n",
" data[\"month\"] = data[\"date\"].dt.month\n",
" data[\"year\"] = data[\"date\"].dt.year\n",
" return data"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "i_k63XB5642m",
"colab_type": "code",
"colab": {}
},
"source": [
"train_set = prepare_data(train_set)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dcjUSa7f7Wex",
"colab_type": "code",
"outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"train_set.sample(5)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>date</th>\n",
" <th>day</th>\n",
" <th>month</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>103770</th>\n",
" <td>Holy crap. I don't think I've seen or heard o...</td>\n",
" <td>2010-07-16 19:27:08</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>2010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240391</th>\n",
" <td>You lost all pretense of civility with your ar...</td>\n",
" <td>2010-09-30 12:18:36</td>\n",
" <td>30</td>\n",
" <td>9</td>\n",
" <td>2010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220910</th>\n",
" <td>What do people think of ghost adventures? Cur...</td>\n",
" <td>2012-08-21 19:59:56</td>\n",
" <td>21</td>\n",
" <td>8</td>\n",
" <td>2012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39644</th>\n",
" <td>Congrats on getting the joke.</td>\n",
" <td>2011-07-29 18:19:46</td>\n",
" <td>29</td>\n",
" <td>7</td>\n",
" <td>2011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220867</th>\n",
" <td>We live in a world where any media can be copi...</td>\n",
" <td>2012-07-18 08:53:24</td>\n",
" <td>18</td>\n",
" <td>7</td>\n",
" <td>2012</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text ... year\n",
"103770 Holy crap. I don't think I've seen or heard o... ... 2010\n",
"240391 You lost all pretense of civility with your ar... ... 2010\n",
"220910 What do people think of ghost adventures? Cur... ... 2012\n",
"39644 Congrats on getting the joke. ... 2011\n",
"220867 We live in a world where any media can be copi... ... 2012\n",
"\n",
"[5 rows x 5 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hIZZ9vcu5Xx7",
"colab_type": "text"
},
"source": [
"# Train"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yqHuHTyI8Kfz",
"colab_type": "code",
"colab": {}
},
"source": [
"vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n",
"vectorized = vectorize.fit_transform(train_set[\"text\"])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ZaLsOdPe9aFu",
"colab_type": "code",
"colab": {}
},
"source": [
"X = vectorized\n",
"y = expected_train[\"class\"]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CeYlhwda9Sa7",
"colab_type": "code",
"outputId": "607d4f8f-f632-4d41-a1ab-e5d020cc00ae",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"bayes = MultinomialNB(alpha=0.4)\n",
"bayes.fit(X,y)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SaIcL28I-JCK",
"colab_type": "text"
},
"source": [
"# Predict and evaluate"
]
},
{
"cell_type": "code",
"metadata": {
"id": "q34dlX_43ZoV",
"colab_type": "code",
"colab": {}
},
"source": [
"def predict_data(data):\n",
" prepared = prepare_data(data)\n",
" vectorized = vectorize.transform(data[\"text\"])\n",
" predicted = bayes.predict(vectorized)\n",
" return predicted"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9sLnLLEUHgoM",
"colab_type": "code",
"colab": {}
},
"source": [
"dev_predicted = predict_data(dev_set)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yigVrrVJHkob",
"colab_type": "code",
"outputId": "9491f926-94a3-4310-9f63-be937663489d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"np.mean(dev_predicted == expected_dev[\"class\"])"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8201820940819423"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "gPdE2HK64aRZ",
"colab_type": "code",
"colab": {}
},
"source": [
"test_predicted = predict_data(test_set)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "QFxuvfUJ8AhJ",
"colab_type": "text"
},
"source": [
"**Clean output for saving**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "zjypBm1260h1",
"colab_type": "code",
"colab": {}
},
"source": [
"test_predicted = np.array([item.strip() for item in test_predicted])\n",
"dev_predicted = np.array([item.strip() for item in dev_predicted])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "baJydHEl4H7N",
"colab_type": "text"
},
"source": [
"**Save to file**\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "O6gyoEJf4KhS",
"colab_type": "code",
"colab": {}
},
"source": [
"np.savetxt('test-A/out.tsv', test_predicted, '%c')\n",
"np.savetxt('dev-0/out.tsv', dev_predicted, '%c')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "jIG2Fxrm89D7",
"colab_type": "text"
},
"source": [
"**Check geval output**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mnch9uLE8vkK",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget https://gonito.net/get/bin/geval\n",
"!chmod u+x geval"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "oEkjIcwe8zef",
"colab_type": "code",
"outputId": "cdb6473e-4eb9-48a7-cc25-25a193cc9194",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"!./geval -t \"dev-0\""
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"0.8202\n"
],
"name": "stdout"
}
]
}
]
}