Sklearn multinomial bayes

2020-03-22 12:30:10 +01:00 · 2020-03-22 12:30:10 +01:00 · 7423df901f
commit 7423df901f
parent 772b516776
5 changed files with 11048 additions and 0 deletions
--- a/Paranormal_or_skeptic.ipynb
+++ b/Paranormal_or_skeptic.ipynb
@@ -0,0 +1,530 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Paranormal or skeptic.ipynb",
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "x8uZz8__5sXr",
+        "colab_type": "text"
+      },
+      "source": [
+        "\n",
+        "# Loading Data\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "NQFKg_czGeRA",
+        "colab_type": "code",
+        "outputId": "60d1c52a-8b42-4a26-d878-67f284589917",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "source": [
+        "!xzcat train/in.tsv.xz | wc -l"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "289579\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "GxUYlO5M6SOJ",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import seaborn as sns\n",
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "from scipy.sparse import hstack\n",
+        "import csv\n",
+        "import datetime"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "MWDzekYY6S9f",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+        "from sklearn.pipeline import Pipeline\n",
+        "from sklearn.linear_model import SGDClassifier\n",
+        "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "SrLtGV3p4pKW",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def load_set(path, isTest):\n",
+        "  dataset = pd.read_csv(path+\"/in.tsv.xz\", delimiter=\"\\t\",header=None,names=[\"text\",\"date\"],quoting=csv.QUOTE_NONE)\n",
+        "  dataset[\"date\"] = pd.to_datetime(dataset[\"date\"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))\n",
+        "  if not isTest:\n",
+        "    expected = pd.read_csv(path+\"/expected.tsv\",header=None,names=[\"class\"],dtype=\"category\")\n",
+        "    return dataset, expected\n",
+        "  return dataset"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "wH70ClgjBeCO",
+        "colab_type": "text"
+      },
+      "source": [
+        "**Load all sets**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "huOmuCrE6yCR",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "train_set, expected_train = load_set(\"train\", False)\n",
+        "dev_set, expected_dev = load_set(\"dev-0\", False)\n",
+        "test_set = load_set(\"test-A\", True)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mWO1IroV6cmm",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Prepare data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "VVd7DJ1E6cOO",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def prepare_data(data):\n",
+        "  data[\"day\"] = data[\"date\"].dt.day\n",
+        "  data[\"month\"] = data[\"date\"].dt.month\n",
+        "  data[\"year\"] = data[\"date\"].dt.year\n",
+        "  return data"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "i_k63XB5642m",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "train_set = prepare_data(train_set)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dcjUSa7f7Wex",
+        "colab_type": "code",
+        "outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        }
+      },
+      "source": [
+        "train_set.sample(5)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>text</th>\n",
+              "      <th>date</th>\n",
+              "      <th>day</th>\n",
+              "      <th>month</th>\n",
+              "      <th>year</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>103770</th>\n",
+              "      <td>Holy crap.  I don't think I've seen or heard o...</td>\n",
+              "      <td>2010-07-16 19:27:08</td>\n",
+              "      <td>16</td>\n",
+              "      <td>7</td>\n",
+              "      <td>2010</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>240391</th>\n",
+              "      <td>You lost all pretense of civility with your ar...</td>\n",
+              "      <td>2010-09-30 12:18:36</td>\n",
+              "      <td>30</td>\n",
+              "      <td>9</td>\n",
+              "      <td>2010</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>220910</th>\n",
+              "      <td>What do people think of ghost adventures?  Cur...</td>\n",
+              "      <td>2012-08-21 19:59:56</td>\n",
+              "      <td>21</td>\n",
+              "      <td>8</td>\n",
+              "      <td>2012</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>39644</th>\n",
+              "      <td>Congrats on getting the joke.</td>\n",
+              "      <td>2011-07-29 18:19:46</td>\n",
+              "      <td>29</td>\n",
+              "      <td>7</td>\n",
+              "      <td>2011</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>220867</th>\n",
+              "      <td>We live in a world where any media can be copi...</td>\n",
+              "      <td>2012-07-18 08:53:24</td>\n",
+              "      <td>18</td>\n",
+              "      <td>7</td>\n",
+              "      <td>2012</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "                                                     text  ...  year\n",
+              "103770  Holy crap.  I don't think I've seen or heard o...  ...  2010\n",
+              "240391  You lost all pretense of civility with your ar...  ...  2010\n",
+              "220910  What do people think of ghost adventures?  Cur...  ...  2012\n",
+              "39644                       Congrats on getting the joke.  ...  2011\n",
+              "220867  We live in a world where any media can be copi...  ...  2012\n",
+              "\n",
+              "[5 rows x 5 columns]"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 12
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hIZZ9vcu5Xx7",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Train"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "yqHuHTyI8Kfz",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n",
+        "vectorized = vectorize.fit_transform(train_set[\"text\"])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ZaLsOdPe9aFu",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "X = vectorized\n",
+        "y = expected_train[\"class\"]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "CeYlhwda9Sa7",
+        "colab_type": "code",
+        "outputId": "607d4f8f-f632-4d41-a1ab-e5d020cc00ae",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "source": [
+        "bayes = MultinomialNB(alpha=0.4)\n",
+        "bayes.fit(X,y)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 15
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "SaIcL28I-JCK",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Predict and evaluate"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "q34dlX_43ZoV",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def predict_data(data):\n",
+        "  prepared = prepare_data(data)\n",
+        "  vectorized = vectorize.transform(data[\"text\"])\n",
+        "  predicted = bayes.predict(vectorized)\n",
+        "  return predicted"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9sLnLLEUHgoM",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "dev_predicted = predict_data(dev_set)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "yigVrrVJHkob",
+        "colab_type": "code",
+        "outputId": "9491f926-94a3-4310-9f63-be937663489d",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "source": [
+        "np.mean(dev_predicted == expected_dev[\"class\"])"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "0.8201820940819423"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 19
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "gPdE2HK64aRZ",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "test_predicted = predict_data(test_set)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "QFxuvfUJ8AhJ",
+        "colab_type": "text"
+      },
+      "source": [
+        "**Clean output for saving**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "zjypBm1260h1",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "test_predicted = np.array([item.strip() for item in test_predicted])\n",
+        "dev_predicted = np.array([item.strip() for item in dev_predicted])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "baJydHEl4H7N",
+        "colab_type": "text"
+      },
+      "source": [
+        "**Save to file**\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "O6gyoEJf4KhS",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "np.savetxt('test-A/out.tsv', test_predicted, '%c')\n",
+        "np.savetxt('dev-0/out.tsv', dev_predicted, '%c')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "jIG2Fxrm89D7",
+        "colab_type": "text"
+      },
+      "source": [
+        "**Check geval output**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "mnch9uLE8vkK",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "!wget https://gonito.net/get/bin/geval\n",
+        "!chmod u+x geval"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "oEkjIcwe8zef",
+        "colab_type": "code",
+        "outputId": "cdb6473e-4eb9-48a7-cc25-25a193cc9194",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "source": [
+        "!./geval -t \"dev-0\""
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "0.8202\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    }
+  ]
+}
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/link_to_collab.txt
+++ b/link_to_collab.txt
@@ -0,0 +1 @@
+https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
--- a/paranormal_or_skeptic.py
+++ b/paranormal_or_skeptic.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""Paranormal or skeptic.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
+
+# Loading Data
+"""
+
+!xzcat train/in.tsv.xz | wc -l
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+from scipy.sparse import hstack
+import csv
+import datetime
+
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import SGDClassifier
+from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
+
+def load_set(path, isTest):
+  dataset = pd.read_csv(path+"/in.tsv.xz", delimiter="\t",header=None,names=["text","date"],quoting=csv.QUOTE_NONE)
+  dataset["date"] = pd.to_datetime(dataset["date"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))
+  if not isTest:
+    expected = pd.read_csv(path+"/expected.tsv",header=None,names=["class"],dtype="category")
+    return dataset, expected
+  return dataset
+
+"""**Load all sets**"""
+
+train_set, expected_train = load_set("train", False)  # labelled split: inputs plus expected classes
+dev_set, expected_dev = load_set("dev-0", False)  # labelled split: inputs plus expected classes
+test_set = load_set("test-A", True)  # isTest=True: inputs only, no expected.tsv is read
+
+"""# Prepare data"""
+
+def prepare_data(data):
+  data["day"] = data["date"].dt.day
+  data["month"] = data["date"].dt.month
+  data["year"] = data["date"].dt.year
+  return data
+
+train_set = prepare_data(train_set)  # adds day/month/year columns taken from "date"
+
+train_set.sample(5)  # notebook preview of 5 random rows; the result is discarded when run as a script
+
+"""# Train"""
+
+vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')  # counts of 1- to 3-grams, English stop words removed, accents folded to ASCII
+vectorized = vectorize.fit_transform(train_set["text"])  # learns the vocabulary on the training text and returns its count matrix
+
+X = vectorized  # features come from the text only; the day/month/year columns are not used
+y = expected_train["class"]  # training labels
+
+bayes = MultinomialNB(alpha=0.4)  # alpha is the additive smoothing parameter
+bayes.fit(X,y)
+
+"""# Predict and evaluate"""
+
+def predict_data(data):
+  prepared = prepare_data(data)
+  vectorized = vectorize.transform(data["text"])
+  predicted = bayes.predict(vectorized)
+  return predicted
+
+dev_predicted = predict_data(dev_set)
+
+np.mean(dev_predicted == expected_dev["class"])  # fraction of dev predictions equal to the expected class (accuracy); only displayed when run in a notebook
+
+test_predicted = predict_data(test_set)  # test-A was loaded without expected labels, so it is predicted but not scored here
+
+"""**Clean output for saving**"""
+
+test_predicted = np.array([item.strip() for item in test_predicted])  # strip surrounding whitespace from every label; presumably the labels read from expected.tsv carry padding - TODO confirm
+dev_predicted = np.array([item.strip() for item in dev_predicted])  # same clean-up for the dev predictions
+
+"""**Save to file**"""
+
+np.savetxt('test-A/out.tsv', test_predicted, '%c')  # third positional argument is fmt; '%c' formats a single character, so every label must be exactly one character long
+np.savetxt('dev-0/out.tsv', dev_predicted, '%c')  # dev predictions, same one-label-per-line format
+
+"""**Check geval output**"""
+
+!wget https://gonito.net/get/bin/geval
+!chmod u+x geval
+
+!./geval -t "dev-0"
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
				`@ -0,0 +1 @@`
				`https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58`