Sklearn multinomial Bayes

This commit is contained in:
Th3NiKo 2020-03-22 12:30:10 +01:00
parent 772b516776
commit 7423df901f
5 changed files with 11048 additions and 0 deletions

530
Paranormal_or_skeptic.ipynb Normal file

@@ -0,0 +1,530 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Paranormal or skeptic.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "x8uZz8__5sXr",
"colab_type": "text"
},
"source": [
"\n",
"# Loading Data\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "NQFKg_czGeRA",
"colab_type": "code",
"outputId": "60d1c52a-8b42-4a26-d878-67f284589917",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"!xzcat train/in.tsv.xz | wc -l"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"289579\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "GxUYlO5M6SOJ",
"colab_type": "code",
"colab": {}
},
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.sparse import hstack\n",
"import csv\n",
"import datetime"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "MWDzekYY6S9f",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "SrLtGV3p4pKW",
"colab_type": "code",
"colab": {}
},
"source": [
"def load_set(path, isTest):\n",
" dataset = pd.read_csv(path+\"/in.tsv.xz\", delimiter=\"\\t\",header=None,names=[\"text\",\"date\"],quoting=csv.QUOTE_NONE)\n",
" dataset[\"date\"] = pd.to_datetime(dataset[\"date\"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))\n",
" if not isTest:\n",
" expected = pd.read_csv(path+\"/expected.tsv\",header=None,names=[\"class\"],dtype=\"category\")\n",
" return dataset, expected\n",
" return dataset"
],
"execution_count": 0,
"outputs": []
},
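{
"cell_type": "markdown",
"metadata": {},
"source": [
"A side note on the timestamp conversion above: `pd.to_datetime` can parse Unix epochs directly with `unit=\"s\"`, which avoids the per-row `fromtimestamp` lambda. A minimal sketch with illustrative epoch values (note that `fromtimestamp` uses local time, while `unit=\"s\"` yields UTC):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"sample_epochs = pd.Series([1279301228, 1285849116])  # illustrative values\n",
"pd.to_datetime(sample_epochs, unit=\"s\")  # vectorized equivalent of the lambda (UTC)"
],
"execution_count": 0,
"outputs": []
},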
{
"cell_type": "markdown",
"metadata": {
"id": "wH70ClgjBeCO",
"colab_type": "text"
},
"source": [
"**Load all sets**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "huOmuCrE6yCR",
"colab_type": "code",
"colab": {}
},
"source": [
"train_set, expected_train = load_set(\"train\", False)\n",
"dev_set, expected_dev = load_set(\"dev-0\", False)\n",
"test_set = load_set(\"test-A\", True)"
],
"execution_count": 0,
"outputs": []
},
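{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick shape check to confirm the three splits loaded (a sketch; output omitted):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"print(train_set.shape, dev_set.shape, test_set.shape)  # (rows, columns) per split"
],
"execution_count": 0,
"outputs": []
},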
{
"cell_type": "markdown",
"metadata": {
"id": "mWO1IroV6cmm",
"colab_type": "text"
},
"source": [
"# Prepare data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "VVd7DJ1E6cOO",
"colab_type": "code",
"colab": {}
},
"source": [
"def prepare_data(data):\n",
" data[\"day\"] = data[\"date\"].dt.day\n",
" data[\"month\"] = data[\"date\"].dt.month\n",
" data[\"year\"] = data[\"date\"].dt.year\n",
" return data"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "i_k63XB5642m",
"colab_type": "code",
"colab": {}
},
"source": [
"train_set = prepare_data(train_set)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dcjUSa7f7Wex",
"colab_type": "code",
"outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"train_set.sample(5)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>date</th>\n",
" <th>day</th>\n",
" <th>month</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>103770</th>\n",
" <td>Holy crap. I don't think I've seen or heard o...</td>\n",
" <td>2010-07-16 19:27:08</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>2010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240391</th>\n",
" <td>You lost all pretense of civility with your ar...</td>\n",
" <td>2010-09-30 12:18:36</td>\n",
" <td>30</td>\n",
" <td>9</td>\n",
" <td>2010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220910</th>\n",
" <td>What do people think of ghost adventures? Cur...</td>\n",
" <td>2012-08-21 19:59:56</td>\n",
" <td>21</td>\n",
" <td>8</td>\n",
" <td>2012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39644</th>\n",
" <td>Congrats on getting the joke.</td>\n",
" <td>2011-07-29 18:19:46</td>\n",
" <td>29</td>\n",
" <td>7</td>\n",
" <td>2011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220867</th>\n",
" <td>We live in a world where any media can be copi...</td>\n",
" <td>2012-07-18 08:53:24</td>\n",
" <td>18</td>\n",
" <td>7</td>\n",
" <td>2012</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text ... year\n",
"103770 Holy crap. I don't think I've seen or heard o... ... 2010\n",
"240391 You lost all pretense of civility with your ar... ... 2010\n",
"220910 What do people think of ghost adventures? Cur... ... 2012\n",
"39644 Congrats on getting the joke. ... 2011\n",
"220867 We live in a world where any media can be copi... ... 2012\n",
"\n",
"[5 rows x 5 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hIZZ9vcu5Xx7",
"colab_type": "text"
},
"source": [
"# Train"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yqHuHTyI8Kfz",
"colab_type": "code",
"colab": {}
},
"source": [
"vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')\n",
"vectorized = vectorize.fit_transform(train_set[\"text\"])"
],
"execution_count": 0,
"outputs": []
},
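{
"cell_type": "markdown",
"metadata": {},
"source": [
"With unigrams through trigrams over roughly 290k documents the feature space is large; a quick sanity check on its size (a sketch using standard scikit-learn attributes):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"print(len(vectorize.vocabulary_))  # number of distinct 1-3-gram features\n",
"print(vectorized.shape)  # (n_documents, n_features) sparse count matrix"
],
"execution_count": 0,
"outputs": []
},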
{
"cell_type": "code",
"metadata": {
"id": "ZaLsOdPe9aFu",
"colab_type": "code",
"colab": {}
},
"source": [
"X = vectorized\n",
"y = expected_train[\"class\"]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CeYlhwda9Sa7",
"colab_type": "code",
"outputId": "607d4f8f-f632-4d41-a1ab-e5d020cc00ae",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"bayes = MultinomialNB(alpha=0.4)\n",
"bayes.fit(X,y)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
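{
"cell_type": "markdown",
"metadata": {},
"source": [
"`alpha` is the additive (Lidstone) smoothing parameter of `MultinomialNB`. A minimal sketch of comparing a few values against the dev set (the candidate grid is illustrative):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"for a in [0.1, 0.4, 1.0]:  # illustrative grid\n",
"    clf = MultinomialNB(alpha=a).fit(X, y)\n",
"    print(a, clf.score(vectorize.transform(dev_set[\"text\"]), expected_dev[\"class\"]))"
],
"execution_count": 0,
"outputs": []
},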
{
"cell_type": "markdown",
"metadata": {
"id": "SaIcL28I-JCK",
"colab_type": "text"
},
"source": [
"# Predict and evaluate"
]
},
{
"cell_type": "code",
"metadata": {
"id": "q34dlX_43ZoV",
"colab_type": "code",
"colab": {}
},
"source": [
"def predict_data(data):\n",
" prepared = prepare_data(data)\n",
" vectorized = vectorize.transform(data[\"text\"])\n",
" predicted = bayes.predict(vectorized)\n",
" return predicted"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9sLnLLEUHgoM",
"colab_type": "code",
"colab": {}
},
"source": [
"dev_predicted = predict_data(dev_set)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yigVrrVJHkob",
"colab_type": "code",
"outputId": "9491f926-94a3-4310-9f63-be937663489d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"np.mean(dev_predicted == expected_dev[\"class\"])"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8201820940819423"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
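{
"cell_type": "markdown",
"metadata": {},
"source": [
"The mean above is plain accuracy; scikit-learn's metrics return the same number plus a per-class breakdown (a sketch):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"from sklearn.metrics import accuracy_score, classification_report\n",
"print(accuracy_score(expected_dev[\"class\"], dev_predicted))\n",
"print(classification_report(expected_dev[\"class\"], dev_predicted))"
],
"execution_count": 0,
"outputs": []
},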
{
"cell_type": "code",
"metadata": {
"id": "gPdE2HK64aRZ",
"colab_type": "code",
"colab": {}
},
"source": [
"test_predicted = predict_data(test_set)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "QFxuvfUJ8AhJ",
"colab_type": "text"
},
"source": [
"**Clean output for saving**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "zjypBm1260h1",
"colab_type": "code",
"colab": {}
},
"source": [
"test_predicted = np.array([item.strip() for item in test_predicted])\n",
"dev_predicted = np.array([item.strip() for item in dev_predicted])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "baJydHEl4H7N",
"colab_type": "text"
},
"source": [
"**Save to file**\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "O6gyoEJf4KhS",
"colab_type": "code",
"colab": {}
},
"source": [
"np.savetxt('test-A/out.tsv', test_predicted, '%c')\n",
"np.savetxt('dev-0/out.tsv', dev_predicted, '%c')"
],
"execution_count": 0,
"outputs": []
},
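{
"cell_type": "markdown",
"metadata": {},
"source": [
"`np.savetxt` with the `'%c'` format writes exactly one character per row, so it relies on the class labels being single characters; `fmt='%s'` would also handle longer labels (a sketch, left commented so the files above are not rewritten):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%s')  # length-agnostic alternative"
],
"execution_count": 0,
"outputs": []
},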
{
"cell_type": "markdown",
"metadata": {
"id": "jIG2Fxrm89D7",
"colab_type": "text"
},
"source": [
"**Check geval output**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mnch9uLE8vkK",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget https://gonito.net/get/bin/geval\n",
"!chmod u+x geval"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "oEkjIcwe8zef",
"colab_type": "code",
"outputId": "cdb6473e-4eb9-48a7-cc25-25a193cc9194",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"!./geval -t \"dev-0\""
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"0.8202\n"
],
"name": "stdout"
}
]
}
]
}

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large

1
link_to_collab.txt Normal file

@@ -0,0 +1 @@
https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58

93
paranormal_or_skeptic.py Normal file

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
"""Paranormal or skeptic.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58

# Loading Data
"""

!xzcat train/in.tsv.xz | wc -l

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import csv
import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB, GaussianNB

def load_set(path, isTest):
    # Read the xz-compressed TSV; QUOTE_NONE keeps the raw text intact.
    dataset = pd.read_csv(path + "/in.tsv.xz", delimiter="\t", header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
    # The date column holds Unix timestamps; convert them to datetimes.
    dataset["date"] = pd.to_datetime(dataset["date"].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))
    if not isTest:
        expected = pd.read_csv(path + "/expected.tsv", header=None, names=["class"], dtype="category")
        return dataset, expected
    return dataset

"""**Load all sets**"""

train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
test_set = load_set("test-A", True)

"""# Prepare data"""

def prepare_data(data):
    # Split the datetime into calendar features (used for inspection;
    # the classifier below only consumes the text column).
    data["day"] = data["date"].dt.day
    data["month"] = data["date"].dt.month
    data["year"] = data["date"].dt.year
    return data

train_set = prepare_data(train_set)

train_set.sample(5)

"""# Train"""

# Bag of 1-3-grams with English stop words removed and accents folded to ASCII
vectorize = CountVectorizer(stop_words='english', ngram_range=(1, 3), strip_accents='ascii')
vectorized = vectorize.fit_transform(train_set["text"])

X = vectorized
y = expected_train["class"]

bayes = MultinomialNB(alpha=0.4)  # additive (Lidstone) smoothing
bayes.fit(X, y)

"""# Predict and evaluate"""

def predict_data(data):
    prepare_data(data)  # adds calendar columns in place; the model only uses text
    vectorized = vectorize.transform(data["text"])
    predicted = bayes.predict(vectorized)
    return predicted

dev_predicted = predict_data(dev_set)

np.mean(dev_predicted == expected_dev["class"])

test_predicted = predict_data(test_set)

"""**Clean output for saving**"""

test_predicted = np.array([item.strip() for item in test_predicted])
dev_predicted = np.array([item.strip() for item in dev_predicted])

"""**Save to file**"""

np.savetxt('test-A/out.tsv', test_predicted, '%c')
np.savetxt('dev-0/out.tsv', dev_predicted, '%c')

"""**Check geval output**"""

!wget https://gonito.net/get/bin/geval
!chmod u+x geval

!./geval -t "dev-0"
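
# Note: Pipeline and TfidfTransformer are imported above but never used. A
# sketch of how they could be folded into a single estimator (illustrative,
# not what this script actually ran; TF-IDF weighting may change the score):
#
# pipeline = Pipeline([
#     ("vect", CountVectorizer(stop_words='english', ngram_range=(1, 3))),
#     ("tfidf", TfidfTransformer()),
#     ("clf", MultinomialNB(alpha=0.4)),
# ])
# pipeline.fit(train_set["text"], expected_train["class"])
# dev_accuracy = pipeline.score(dev_set["text"], expected_dev["class"])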

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large