Sklearn logistic regression, 1-3-gram features, 1000 iterations

Author: unknown
Date: 2020-04-23 01:09:38 +02:00
Parent: b90efefa94
Commit: 7d648f41ad

4 changed files with 10540 additions and 10490 deletions

Changed file: the Colab notebook (.ipynb)

@@ -14,6 +14,37 @@
     }
   },
   "cells": [
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dZUIeB9Q8rv3",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 136
+        },
+        "outputId": "95512ec2-2ea3-4ff5-bc08-09ee3b99c39c"
+      },
+      "source": [
+        "!git clone git://gonito.net/paranormal-or-skeptic "
+      ],
+      "execution_count": 5,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Cloning into 'paranormal-or-skeptic'...\n",
+            "remote: Enumerating objects: 3583, done.\u001b[K\n",
+            "remote: Counting objects: 100% (3583/3583), done.\u001b[K\n",
+            "remote: Compressing objects: 100% (3188/3188), done.\u001b[K\n",
+            "remote: Total 3583 (delta 789), reused 2704 (delta 338)\n",
+            "Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done.\n",
+            "Resolving deltas: 100% (789/789), done.\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -30,7 +61,7 @@
       "metadata": {
         "id": "NQFKg_czGeRA",
         "colab_type": "code",
-        "outputId": "60d1c52a-8b42-4a26-d878-67f284589917",
+        "outputId": "4cf38154-be9f-48b4-e0ea-cfac084e795a",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 34
@@ -39,7 +70,7 @@
       "source": [
         "!xzcat train/in.tsv.xz | wc -l"
       ],
-      "execution_count": 0,
+      "execution_count": 11,
       "outputs": [
         {
           "output_type": "stream",
@@ -79,8 +110,10 @@
       "source": [
         "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
         "from sklearn.pipeline import Pipeline\n",
-        "from sklearn.linear_model import SGDClassifier\n",
-        "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB"
+        "from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
+        "from sklearn.svm import LinearSVC\n",
+        "from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB\n",
+        "from sklearn.neural_network import MLPClassifier"
       ],
       "execution_count": 0,
       "outputs": []
@@ -174,7 +207,7 @@
       "metadata": {
         "id": "dcjUSa7f7Wex",
         "colab_type": "code",
-        "outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff",
+        "outputId": "f9ade29f-f746-4dd2-eb79-25845095a9f6",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 204
@@ -183,7 +216,7 @@
       "source": [
         "train_set.sample(5)"
       ],
-      "execution_count": 0,
+      "execution_count": 18,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -216,43 +249,43 @@
             "  </thead>\n",
             "  <tbody>\n",
             "    <tr>\n",
-            "      <th>103770</th>\n",
-            "      <td>Holy crap. I don't think I've seen or heard o...</td>\n",
-            "      <td>2010-07-16 19:27:08</td>\n",
-            "      <td>16</td>\n",
-            "      <td>7</td>\n",
-            "      <td>2010</td>\n",
+            "      <th>112652</th>\n",
+            "      <td>As i hovered over that link I was expecting r/...</td>\n",
+            "      <td>2012-03-23 13:34:29</td>\n",
+            "      <td>23</td>\n",
+            "      <td>3</td>\n",
+            "      <td>2012</td>\n",
             "    </tr>\n",
             "    <tr>\n",
-            "      <th>240391</th>\n",
-            "      <td>You lost all pretense of civility with your ar...</td>\n",
-            "      <td>2010-09-30 12:18:36</td>\n",
-            "      <td>30</td>\n",
-            "      <td>9</td>\n",
-            "      <td>2010</td>\n",
+            "      <th>172265</th>\n",
+            "      <td>Caesarean section is now the new natural child...</td>\n",
+            "      <td>2012-04-19 14:28:59</td>\n",
+            "      <td>19</td>\n",
+            "      <td>4</td>\n",
+            "      <td>2012</td>\n",
             "    </tr>\n",
             "    <tr>\n",
-            "      <th>220910</th>\n",
-            "      <td>What do people think of ghost adventures? Cur...</td>\n",
-            "      <td>2012-08-21 19:59:56</td>\n",
-            "      <td>21</td>\n",
-            "      <td>8</td>\n",
-            "      <td>2012</td>\n",
+            "      <th>150100</th>\n",
+            "      <td>The Somerton Man reminds me of the [Lead Masks...</td>\n",
+            "      <td>2012-08-04 21:21:56</td>\n",
+            "      <td>4</td>\n",
+            "      <td>8</td>\n",
+            "      <td>2012</td>\n",
             "    </tr>\n",
             "    <tr>\n",
-            "      <th>39644</th>\n",
-            "      <td>Congrats on getting the joke.</td>\n",
-            "      <td>2011-07-29 18:19:46</td>\n",
-            "      <td>29</td>\n",
-            "      <td>7</td>\n",
-            "      <td>2011</td>\n",
+            "      <th>153335</th>\n",
+            "      <td>As a skeptic, I demand this man provide eviden...</td>\n",
+            "      <td>2012-06-20 04:44:02</td>\n",
+            "      <td>20</td>\n",
+            "      <td>6</td>\n",
+            "      <td>2012</td>\n",
             "    </tr>\n",
             "    <tr>\n",
-            "      <th>220867</th>\n",
-            "      <td>We live in a world where any media can be copi...</td>\n",
-            "      <td>2012-07-18 08:53:24</td>\n",
-            "      <td>18</td>\n",
-            "      <td>7</td>\n",
-            "      <td>2012</td>\n",
+            "      <th>149621</th>\n",
+            "      <td>It's a fucking bug.</td>\n",
+            "      <td>2012-11-15 02:29:24</td>\n",
+            "      <td>15</td>\n",
+            "      <td>11</td>\n",
+            "      <td>2012</td>\n",
             "    </tr>\n",
             "  </tbody>\n",
@@ -261,11 +294,11 @@
           ],
           "text/plain": [
             "        text  ... year\n",
-            "103770  Holy crap. I don't think I've seen or heard o...  ... 2010\n",
-            "240391  You lost all pretense of civility with your ar...  ... 2010\n",
-            "220910  What do people think of ghost adventures? Cur...  ... 2012\n",
-            "39644   Congrats on getting the joke.  ... 2011\n",
-            "220867  We live in a world where any media can be copi...  ... 2012\n",
+            "112652  As i hovered over that link I was expecting r/...  ... 2012\n",
+            "172265  Caesarean section is now the new natural child...  ... 2012\n",
+            "150100  The Somerton Man reminds me of the [Lead Masks...  ... 2012\n",
+            "153335  As a skeptic, I demand this man provide eviden...  ... 2012\n",
+            "149621  It's a fucking bug.  ... 2012\n",
             "\n",
             "[5 rows x 5 columns]"
           ]
@@ -273,7 +306,7 @@
           "metadata": {
             "tags": []
           },
-          "execution_count": 12
+          "execution_count": 18
         }
       ]
     },
@@ -320,29 +353,33 @@
       "metadata": {
         "id": "CeYlhwda9Sa7",
         "colab_type": "code",
-        "outputId": "607d4f8f-f632-4d41-a1ab-e5d020cc00ae",
+        "outputId": "61a66f28-85b6-452d-bdd0-180772498e34",
         "colab": {
           "base_uri": "https://localhost:8080/",
-          "height": 34
+          "height": 102
         }
       },
       "source": [
-        "bayes = MultinomialNB(alpha=0.4)\n",
+        "bayes = LogisticRegression(max_iter=1000)\n",
        "bayes.fit(X,y)"
       ],
-      "execution_count": 0,
+      "execution_count": 176,
       "outputs": [
         {
           "output_type": "execute_result",
           "data": {
             "text/plain": [
-              "MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)"
+              "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+              "                   intercept_scaling=1, l1_ratio=None, max_iter=1000,\n",
+              "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
+              "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
+              "                   warm_start=False)"
             ]
           },
           "metadata": {
             "tags": []
           },
-          "execution_count": 15
+          "execution_count": 176
         }
       ]
     },
@@ -367,7 +404,9 @@
         "def predict_data(data):\n",
         "  prepared = prepare_data(data)\n",
         "  vectorized = vectorize.transform(data[\"text\"])\n",
-        "  predicted = bayes.predict(vectorized)\n",
+        "  predicted = bayes.predict_proba(vectorized)[:,1]\n",
+        "  predicted[predicted < 0.05] = 0.05\n",
+        "  predicted[predicted > 0.95] = 0.95\n",
         "  return predicted"
       ],
       "execution_count": 0,
@@ -391,28 +430,29 @@
       "metadata": {
         "id": "yigVrrVJHkob",
         "colab_type": "code",
-        "outputId": "9491f926-94a3-4310-9f63-be937663489d",
+        "outputId": "42a53652-60ed-4a11-85cf-683ba4b91d23",
         "colab": {
           "base_uri": "https://localhost:8080/",
-          "height": 34
+          "height": 51
         }
       },
       "source": [
-        "np.mean(dev_predicted == expected_dev[\"class\"])"
+        "dev_predicted"
       ],
-      "execution_count": 0,
+      "execution_count": 195,
       "outputs": [
         {
           "output_type": "execute_result",
           "data": {
             "text/plain": [
-              "0.8201820940819423"
+              "array([0.05      , 0.75847969, 0.86484399, ..., 0.0650311 , 0.95      ,\n",
+              "       0.37791457])"
             ]
           },
           "metadata": {
             "tags": []
           },
-          "execution_count": 19
+          "execution_count": 195
         }
       ]
     },
@@ -471,8 +511,8 @@
         "colab": {}
       },
       "source": [
-        "np.savetxt('test-A/out.tsv', test_predicted, '%c')\n",
-        "np.savetxt('dev-0/out.tsv', dev_predicted, '%c')"
+        "np.savetxt('test-A/out.tsv', test_predicted, '%f')\n",
+        "np.savetxt('dev-0/out.tsv', dev_predicted, '%f')"
       ],
       "execution_count": 0,
       "outputs": []
@@ -506,21 +546,25 @@
       "metadata": {
         "id": "oEkjIcwe8zef",
         "colab_type": "code",
-        "outputId": "cdb6473e-4eb9-48a7-cc25-25a193cc9194",
+        "outputId": "16433b8f-9e3a-4e49-db5d-dc7373c3c675",
         "colab": {
           "base_uri": "https://localhost:8080/",
-          "height": 34
+          "height": 102
         }
       },
       "source": [
         "!./geval -t \"dev-0\""
       ],
-      "execution_count": 0,
+      "execution_count": 214,
       "outputs": [
         {
           "output_type": "stream",
           "text": [
-            "0.8202\n"
+            "Likelihood\t0.6707\n",
+            "Accuracy\t0.8151\n",
+            "F1.0\t0.7197\n",
+            "Precision\t0.7762\n",
+            "Recall\t0.6710\n"
           ],
           "name": "stdout"
         }
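
Not part of the commit, but useful context for the new geval output: the Likelihood row is only meaningful because the model now emits clipped probabilities instead of hard labels. To my understanding (treat the exact definition as an assumption), GEval's Likelihood is the geometric mean of the probabilities assigned to the true classes, which equals exp(-log-loss). A minimal sketch for checking the dev score locally, reusing the notebook's dev_predicted and expected_dev:

    import numpy as np
    from sklearn.metrics import log_loss

    # dev_predicted: clipped P(class = 1) per document, as computed above.
    # expected_dev["class"]: true labels from dev-0 (assumed 0/1 here).
    # Geometric mean of per-item likelihoods == exp(-mean log-loss);
    # under the assumed definition this should land near geval's 0.6707.
    likelihood = np.exp(-log_loss(expected_dev["class"], dev_predicted))
    print(round(likelihood, 4))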

File diff suppressed because it is too large.

Changed file: the Colab-exported script (.py)

@@ -5,10 +5,12 @@ Automatically generated by Colaboratory.
 Original file is located at
     https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
+# Loading Data
 """
 
-"""# Loading Data"""
+!git clone git://gonito.net/paranormal-or-skeptic
 
 !xzcat train/in.tsv.xz | wc -l
 
 import matplotlib.pyplot as plt
@@ -21,8 +23,10 @@ import datetime
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.pipeline import Pipeline
-from sklearn.linear_model import SGDClassifier
+from sklearn.linear_model import SGDClassifier, LogisticRegression
+from sklearn.svm import LinearSVC
 from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
+from sklearn.neural_network import MLPClassifier
 
 def load_set(path, isTest):
     dataset = pd.read_csv(path+"/in.tsv.xz", delimiter="\t",header=None,names=["text","date"],quoting=csv.QUOTE_NONE)
@@ -58,7 +62,7 @@ vectorized = vectorize.fit_transform(train_set["text"])
 X = vectorized
 y = expected_train["class"]
 
-bayes = MultinomialNB(alpha=0.4)
+bayes = LogisticRegression(max_iter=1000)
 bayes.fit(X,y)
 
 """# Predict and evaluate"""
@@ -66,12 +70,14 @@ bayes.fit(X,y)
 def predict_data(data):
     prepared = prepare_data(data)
     vectorized = vectorize.transform(data["text"])
-    predicted = bayes.predict(vectorized)
+    predicted = bayes.predict_proba(vectorized)[:,1]
+    predicted[predicted < 0.05] = 0.05
+    predicted[predicted > 0.95] = 0.95
     return predicted
 
 dev_predicted = predict_data(dev_set)
 
-np.mean(dev_predicted == expected_dev["class"])
+dev_predicted
 
 test_predicted = predict_data(test_set)
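
The clamp to [0.05, 0.95] bounds the worst-case penalty of a confident mistake: under a log-loss-style metric, a wrong prediction at probability 0 or 1 would contribute -log(0) and wipe out a geometric-mean score, while after clipping no single document can cost more than -log(0.05) ≈ 3.0 nats. The two masked assignments are equivalent to one np.clip call:

    import numpy as np

    # Same effect as the two masked assignments in the diff: keep every
    # probability inside [0.05, 0.95] so one confident error cannot
    # zero out a geometric-mean likelihood.
    predicted = np.clip(predicted, 0.05, 0.95)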
@@ -82,8 +88,8 @@ dev_predicted = np.array([item.strip() for item in dev_predicted])
 """**Save to file**"""
 
-np.savetxt('test-A/out.tsv', test_predicted, '%c')
-np.savetxt('dev-0/out.tsv', dev_predicted, '%c')
+np.savetxt('test-A/out.tsv', test_predicted, '%f')
+np.savetxt('dev-0/out.tsv', dev_predicted, '%f')
 
 """**Check geval output**"""

File diff suppressed because it is too large.