Sklearn Logistic regression 1-3gram 1000iter

Sklearn naive bayes 1-3 gram version 2.0.0
Update README.md
2020-04-23 01:09:38 +02:00 · 2020-04-04 19:12:25 +02:00 · 2020-03-30 18:29:13 +02:00 · 2020-03-30 18:28:23 +02:00 · 2020-03-30 12:30:04 +02:00 · 2020-03-22 12:30:10 +01:00
8 changed files with 305962 additions and 294864 deletions
--- a/Paranormal_or_skeptic.ipynb
+++ b/Paranormal_or_skeptic.ipynb
@ -0,0 +1,574 @@
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Paranormal or skeptic.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "dZUIeB9Q8rv3",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 136
        },
        "outputId": "95512ec2-2ea3-4ff5-bc08-09ee3b99c39c"
      },
      "source": [
        "!git clone git://gonito.net/paranormal-or-skeptic "
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cloning into 'paranormal-or-skeptic'...\n",
            "remote: Enumerating objects: 3583, done.\u001b[K\n",
            "remote: Counting objects: 100% (3583/3583), done.\u001b[K\n",
            "remote: Compressing objects: 100% (3188/3188), done.\u001b[K\n",
            "remote: Total 3583 (delta 789), reused 2704 (delta 338)\n",
            "Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done.\n",
            "Resolving deltas: 100% (789/789), done.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "x8uZz8__5sXr",
        "colab_type": "text"
      },
      "source": [
        "\n",
        "# Loading Data\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NQFKg_czGeRA",
        "colab_type": "code",
        "outputId": "4cf38154-be9f-48b4-e0ea-cfac084e795a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "!xzcat train/in.tsv.xz | wc -l"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "289579\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GxUYlO5M6SOJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "from scipy.sparse import hstack\n",
        "import csv\n",
        "import datetime"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MWDzekYY6S9f",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
        "from sklearn.pipeline import Pipeline\n",
        "from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
        "from sklearn.svm import LinearSVC\n",
        "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB\n",
        "from sklearn.neural_network import MLPClassifier"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SrLtGV3p4pKW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def load_set(path, isTest):\n",
        "  dataset = pd.read_csv(path+\"/in.tsv.xz\", delimiter=\"\\t\",header=None,names=[\"text\",\"date\"],quoting=csv.QUOTE_NONE)\n",
        "  dataset[\"date\"] = pd.to_datetime(dataset[\"date\"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))\n",
        "  if not isTest:\n",
        "    expected = pd.read_csv(path+\"/expected.tsv\",header=None,names=[\"class\"],dtype=\"category\")\n",
        "    return dataset, expected\n",
        "  return dataset"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wH70ClgjBeCO",
        "colab_type": "text"
      },
      "source": [
        "**Load all sets**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "huOmuCrE6yCR",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_set, expected_train = load_set(\"train\", False)\n",
        "dev_set, expected_dev = load_set(\"dev-0\", False)\n",
        "test_set = load_set(\"test-A\", True)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mWO1IroV6cmm",
        "colab_type": "text"
      },
      "source": [
        "# Prepare data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VVd7DJ1E6cOO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def prepare_data(data):\n",
        "  data[\"day\"] = data[\"date\"].dt.day\n",
        "  data[\"month\"] = data[\"date\"].dt.month\n",
        "  data[\"year\"] = data[\"date\"].dt.year\n",
        "  return data"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i_k63XB5642m",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_set = prepare_data(train_set)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dcjUSa7f7Wex",
        "colab_type": "code",
        "outputId": "f9ade29f-f746-4dd2-eb79-25845095a9f6",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "train_set.sample(5)"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>text</th>\n",
              "      <th>date</th>\n",
              "      <th>day</th>\n",
              "      <th>month</th>\n",
              "      <th>year</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>112652</th>\n",
              "      <td>As i hovered over that link I was expecting r/...</td>\n",
              "      <td>2012-03-23 13:34:29</td>\n",
              "      <td>23</td>\n",
              "      <td>3</td>\n",
              "      <td>2012</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>172265</th>\n",
              "      <td>Caesarean section is now the new natural child...</td>\n",
              "      <td>2012-04-19 14:28:59</td>\n",
              "      <td>19</td>\n",
              "      <td>4</td>\n",
              "      <td>2012</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>150100</th>\n",
              "      <td>The Somerton Man reminds me of the [Lead Masks...</td>\n",
              "      <td>2012-08-04 21:21:56</td>\n",
              "      <td>4</td>\n",
              "      <td>8</td>\n",
              "      <td>2012</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>153335</th>\n",
              "      <td>As a skeptic, I demand this man provide eviden...</td>\n",
              "      <td>2012-06-20 04:44:02</td>\n",
              "      <td>20</td>\n",
              "      <td>6</td>\n",
              "      <td>2012</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>149621</th>\n",
              "      <td>It's a fucking bug.</td>\n",
              "      <td>2012-11-15 02:29:24</td>\n",
              "      <td>15</td>\n",
              "      <td>11</td>\n",
              "      <td>2012</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                     text  ...  year\n",
              "112652  As i hovered over that link I was expecting r/...  ...  2012\n",
              "172265  Caesarean section is now the new natural child...  ...  2012\n",
              "150100  The Somerton Man reminds me of the [Lead Masks...  ...  2012\n",
              "153335  As a skeptic, I demand this man provide eviden...  ...  2012\n",
              "149621                                It's a fucking bug.  ...  2012\n",
              "\n",
              "[5 rows x 5 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hIZZ9vcu5Xx7",
        "colab_type": "text"
      },
      "source": [
        "# Train"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yqHuHTyI8Kfz",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n",
        "vectorized = vectorize.fit_transform(train_set[\"text\"])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZaLsOdPe9aFu",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "X = vectorized\n",
        "y = expected_train[\"class\"]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CeYlhwda9Sa7",
        "colab_type": "code",
        "outputId": "61a66f28-85b6-452d-bdd0-180772498e34",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        }
      },
      "source": [
        "bayes = LogisticRegression(max_iter=1000)\n",
        "bayes.fit(X,y)"
      ],
      "execution_count": 176,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
              "                   intercept_scaling=1, l1_ratio=None, max_iter=1000,\n",
              "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
              "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
              "                   warm_start=False)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 176
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SaIcL28I-JCK",
        "colab_type": "text"
      },
      "source": [
        "# Predict and evaluate"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "q34dlX_43ZoV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def predict_data(data):\n",
        "  prepared = prepare_data(data)\n",
        "  vectorized = vectorize.transform(data[\"text\"])\n",
        "  predicted = bayes.predict_proba(vectorized)[:,1]\n",
        "  predicted[predicted < 0.05] = 0.05\n",
        "  predicted[predicted > 0.95] = 0.95\n",
        "  return predicted"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9sLnLLEUHgoM",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "dev_predicted = predict_data(dev_set)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yigVrrVJHkob",
        "colab_type": "code",
        "outputId": "42a53652-60ed-4a11-85cf-683ba4b91d23",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "dev_predicted"
      ],
      "execution_count": 195,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([0.05      , 0.75847969, 0.86484399, ..., 0.0650311 , 0.95      ,\n",
              "       0.37791457])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 195
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gPdE2HK64aRZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "test_predicted = predict_data(test_set)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QFxuvfUJ8AhJ",
        "colab_type": "text"
      },
      "source": [
        "**Clean output for saving**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zjypBm1260h1",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "test_predicted = np.array([item.strip() for item in test_predicted])\n",
        "dev_predicted = np.array([item.strip() for item in dev_predicted])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "baJydHEl4H7N",
        "colab_type": "text"
      },
      "source": [
        "**Save to file**\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "O6gyoEJf4KhS",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "np.savetxt('test-A/out.tsv', test_predicted, '%f')\n",
        "np.savetxt('dev-0/out.tsv', dev_predicted, '%f')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jIG2Fxrm89D7",
        "colab_type": "text"
      },
      "source": [
        "**Check geval output**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mnch9uLE8vkK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!wget https://gonito.net/get/bin/geval\n",
        "!chmod u+x geval"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oEkjIcwe8zef",
        "colab_type": "code",
        "outputId": "16433b8f-9e3a-4e49-db5d-dc7373c3c675",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        }
      },
      "source": [
        "!./geval -t \"dev-0\""
      ],
      "execution_count": 214,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Likelihood\t0.6707\n",
            "Accuracy\t0.8151\n",
            "F1.0\t0.7197\n",
            "Precision\t0.7762\n",
            "Recall\t0.6710\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
 }
--- a/README.md
+++ b/README.md
@ -1,13 +1,13 @@
-Skeptic vs paranormal subreddits
+Skeptic vs paranormal subreddits
-================================
+================================
-
+
-Classify a reddit as either from Skeptic subreddit or one of the
+Classify a Reddit post as coming either from the Skeptic subreddit or one of the
-"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
+"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
-,Glitch-in-the-Matrix, conspiracytheories).
+Glitch-in-the-Matrix, conspiracytheories).
-
+
-Output label is `S` and `P`.
+The output label is 0 (for skeptic) or 1 (for paranormal).
-
+
-Sources
+Sources
-------
+-------
-
+
-Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
+Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/link_to_collab.txt
+++ b/link_to_collab.txt
@ -0,0 +1 @@
 https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
--- a/paranormal_or_skeptic.py
+++ b/paranormal_or_skeptic.py
@ -0,0 +1,99 @@
 # -*- coding: utf-8 -*-
 """Paranormal or skeptic.ipynb
 Automatically generated by Colaboratory.
 Original file is located at
    https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
 """
 !git clone git://gonito.net/paranormal-or-skeptic
 """# Loading Data"""
 !xzcat train/in.tsv.xz | wc -l
 import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
 import numpy as np
 from scipy.sparse import hstack
 import csv
 import datetime
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import SGDClassifier, LogisticRegression
 from sklearn.svm import LinearSVC
 from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
 from sklearn.neural_network import MLPClassifier
 def load_set(path, isTest):
   """Load one split of the challenge from the directory ``path``.

   Reads ``<path>/in.tsv.xz`` as a headerless, tab-separated file with two
   columns, ``text`` and ``date``. Quoting is disabled (``csv.QUOTE_NONE``)
   so quote characters inside the text are kept literally. ``date`` holds
   Unix timestamps in seconds and is converted to ``datetime64`` values.

   Args:
     path: Directory of the split, e.g. ``"train"``, ``"dev-0"``, ``"test-A"``.
     isTest: When true, only the inputs are loaded; otherwise
       ``<path>/expected.tsv`` is read as well.

   Returns:
     ``dataset`` when ``isTest`` is true, else the tuple
     ``(dataset, expected)`` where ``expected`` has a single categorical
     column named ``class``.
   """
   dataset = pd.read_csv(
       path + "/in.tsv.xz",
       delimiter="\t",
       header=None,
       names=["text", "date"],
       quoting=csv.QUOTE_NONE,
   )
   # Vectorised conversion that always interprets the epoch seconds as UTC.
   # The previous per-row datetime.fromtimestamp() call used the machine's
   # local time zone, so the derived day/month/year values depended on where
   # the code happened to run.
   dataset["date"] = pd.to_datetime(dataset["date"], unit="s")
   if isTest:
     return dataset
   expected = pd.read_csv(
       path + "/expected.tsv", header=None, names=["class"], dtype="category"
   )
   return dataset, expected
 """**Load all sets**"""
 train_set, expected_train = load_set("train", False)
 dev_set, expected_dev = load_set("dev-0", False)
 test_set = load_set("test-A", True)
 """# Prepare data"""
 def prepare_data(data):
   """Return ``data`` extended with calendar features taken from ``date``.

   Args:
     data: DataFrame with a datetime-typed ``date`` column.

   Returns:
     A new DataFrame holding the original columns plus ``day``, ``month``
     and ``year`` columns (appended in that order). The frame passed in is
     left untouched, so re-running a cell never compounds changes on shared
     data.
   """
   timestamps = data["date"].dt
   # assign() copies instead of writing into the caller's frame.
   return data.assign(
       day=timestamps.day,
       month=timestamps.month,
       year=timestamps.year,
   )
 train_set = prepare_data(train_set)
 train_set.sample(5)
 """# Train"""
 # Bag-of-words counts over word 1- to 3-grams, with scikit-learn's built-in
 # English stop-word list and ASCII accent stripping enabled.
 vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')
 # Fit the vocabulary on the training texts and build the training matrix.
 vectorized = vectorize.fit_transform(train_set["text"])
 X = vectorized
 y = expected_train["class"]
 # NOTE(review): despite its name, `bayes` is a LogisticRegression model; the
 # name is kept because predict_data() looks it up as a module-level global.
 bayes = LogisticRegression(max_iter=1000)
 bayes.fit(X,y)
 """# Predict and evaluate"""
 def predict_data(data, vectorizer=None, model=None, low=0.05, high=0.95):
   """Return clipped second-class probabilities for every row of ``data``.

   Args:
     data: DataFrame (or mapping) whose ``text`` entry holds the documents.
       It is only read, never modified.
     vectorizer: Fitted object exposing ``transform``; defaults to the
       module-level ``vectorize``.
     model: Fitted classifier exposing ``predict_proba``; defaults to the
       module-level ``bayes``.
     low: Smallest probability that is emitted (default 0.05).
     high: Largest probability that is emitted (default 0.95).

   Returns:
     1-D NumPy array with column 1 of ``predict_proba`` clipped to
     ``[low, high]``.
   """
   if vectorizer is None:
     vectorizer = vectorize
   if model is None:
     model = bayes
   features = vectorizer.transform(data["text"])
   # Column 1 of predict_proba is the probability of the second class.
   positive = model.predict_proba(features)[:, 1]
   # Presumably the bounds keep a confidently wrong prediction from wrecking
   # the likelihood-based score -- TODO confirm against the geval metric.
   return np.clip(positive, low, high)
 dev_predicted = predict_data(dev_set)
 dev_predicted
 test_predicted = predict_data(test_set)
 """**Clean output for saving**"""
 # predict_data() returns numeric probabilities, so there is nothing to strip:
 # calling .strip() on the float items raised AttributeError.
 # Just make sure np.savetxt() receives plain float arrays.
 test_predicted = np.asarray(test_predicted, dtype=float)
 dev_predicted = np.asarray(dev_predicted, dtype=float)
 """**Save to file**"""
 np.savetxt('test-A/out.tsv', test_predicted, '%f')
 np.savetxt('dev-0/out.tsv', dev_predicted, '%f')
 """**Check geval output**"""
 !wget https://gonito.net/get/bin/geval
 !chmod u+x geval
 !./geval -t "dev-0"
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/expected.tsv
+++ b/train/expected.tsv
Author	SHA1	Message	Date
unknown	7d648f41ad	Sklearn Logistic regression 1-3gram 1000iter	2020-04-23 01:09:38 +02:00
Th3NiKo	b90efefa94	Sklearn naive bayes 1-3 gram version 2.0.0	2020-04-04 19:12:25 +02:00
Filip Gralinski	abba594b01	Update README.md	2020-03-30 18:29:13 +02:00
Filip Gralinski	73a1b8862f	Switching to O/1	2020-03-30 18:28:23 +02:00
Filip Gralinski	f17f86149c	Fix unwanted spaces	2020-03-30 12:30:04 +02:00
Th3NiKo	7423df901f	Sklearn multinomial bayes	2020-03-22 12:30:10 +01:00
		`@ -0,0 +1 @@`
							`https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58`