Sklearn logistic regression, 1-3 gram features, 1000 iterations
This commit is contained in:
parent b90efefa94
commit 7d648f41ad
@@ -14,6 +14,37 @@
  }
 },
 "cells": [
+{
+"cell_type": "code",
+"metadata": {
+"id": "dZUIeB9Q8rv3",
+"colab_type": "code",
+"colab": {
+"base_uri": "https://localhost:8080/",
+"height": 136
+},
+"outputId": "95512ec2-2ea3-4ff5-bc08-09ee3b99c39c"
+},
+"source": [
+"!git clone git://gonito.net/paranormal-or-skeptic "
+],
+"execution_count": 5,
+"outputs": [
+{
+"output_type": "stream",
+"text": [
+"Cloning into 'paranormal-or-skeptic'...\n",
+"remote: Enumerating objects: 3583, done.\u001b[K\n",
+"remote: Counting objects: 100% (3583/3583), done.\u001b[K\n",
+"remote: Compressing objects: 100% (3188/3188), done.\u001b[K\n",
+"remote: Total 3583 (delta 789), reused 2704 (delta 338)\n",
+"Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done.\n",
+"Resolving deltas: 100% (789/789), done.\n"
+],
+"name": "stdout"
+}
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {
@@ -30,7 +61,7 @@
 "metadata": {
 "id": "NQFKg_czGeRA",
 "colab_type": "code",
-"outputId": "60d1c52a-8b42-4a26-d878-67f284589917",
+"outputId": "4cf38154-be9f-48b4-e0ea-cfac084e795a",
 "colab": {
 "base_uri": "https://localhost:8080/",
 "height": 34
@@ -39,7 +70,7 @@
 "source": [
 "!xzcat train/in.tsv.xz | wc -l"
 ],
-"execution_count": 0,
+"execution_count": 11,
 "outputs": [
 {
 "output_type": "stream",
@@ -79,8 +110,10 @@
 "source": [
 "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
 "from sklearn.pipeline import Pipeline\n",
-"from sklearn.linear_model import SGDClassifier\n",
-"from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB"
+"from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
+"from sklearn.svm import LinearSVC\n",
+"from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB\n",
+"from sklearn.neural_network import MLPClassifier"
 ],
 "execution_count": 0,
 "outputs": []
@@ -174,7 +207,7 @@
 "metadata": {
 "id": "dcjUSa7f7Wex",
 "colab_type": "code",
-"outputId": "9fa0ca70-0516-4656-a1d5-641e5b0f41ff",
+"outputId": "f9ade29f-f746-4dd2-eb79-25845095a9f6",
 "colab": {
 "base_uri": "https://localhost:8080/",
 "height": 204
@@ -183,7 +216,7 @@
 "source": [
 "train_set.sample(5)"
 ],
-"execution_count": 0,
+"execution_count": 18,
 "outputs": [
 {
 "output_type": "execute_result",
@@ -216,43 +249,43 @@
 " </thead>\n",
 " <tbody>\n",
 " <tr>\n",
-" <th>103770</th>\n",
-" <td>Holy crap. I don't think I've seen or heard o...</td>\n",
-" <td>2010-07-16 19:27:08</td>\n",
-" <td>16</td>\n",
-" <td>7</td>\n",
-" <td>2010</td>\n",
+" <th>112652</th>\n",
+" <td>As i hovered over that link I was expecting r/...</td>\n",
+" <td>2012-03-23 13:34:29</td>\n",
+" <td>23</td>\n",
+" <td>3</td>\n",
+" <td>2012</td>\n",
 " </tr>\n",
 " <tr>\n",
-" <th>240391</th>\n",
-" <td>You lost all pretense of civility with your ar...</td>\n",
-" <td>2010-09-30 12:18:36</td>\n",
-" <td>30</td>\n",
-" <td>9</td>\n",
-" <td>2010</td>\n",
+" <th>172265</th>\n",
+" <td>Caesarean section is now the new natural child...</td>\n",
+" <td>2012-04-19 14:28:59</td>\n",
+" <td>19</td>\n",
+" <td>4</td>\n",
+" <td>2012</td>\n",
 " </tr>\n",
 " <tr>\n",
-" <th>220910</th>\n",
-" <td>What do people think of ghost adventures? Cur...</td>\n",
-" <td>2012-08-21 19:59:56</td>\n",
-" <td>21</td>\n",
-" <td>8</td>\n",
-" <td>2012</td>\n",
+" <th>150100</th>\n",
+" <td>The Somerton Man reminds me of the [Lead Masks...</td>\n",
+" <td>2012-08-04 21:21:56</td>\n",
+" <td>4</td>\n",
+" <td>8</td>\n",
+" <td>2012</td>\n",
 " </tr>\n",
 " <tr>\n",
-" <th>39644</th>\n",
-" <td>Congrats on getting the joke.</td>\n",
-" <td>2011-07-29 18:19:46</td>\n",
-" <td>29</td>\n",
-" <td>7</td>\n",
-" <td>2011</td>\n",
+" <th>153335</th>\n",
+" <td>As a skeptic, I demand this man provide eviden...</td>\n",
+" <td>2012-06-20 04:44:02</td>\n",
+" <td>20</td>\n",
+" <td>6</td>\n",
+" <td>2012</td>\n",
 " </tr>\n",
 " <tr>\n",
-" <th>220867</th>\n",
-" <td>We live in a world where any media can be copi...</td>\n",
-" <td>2012-07-18 08:53:24</td>\n",
-" <td>18</td>\n",
-" <td>7</td>\n",
-" <td>2012</td>\n",
+" <th>149621</th>\n",
+" <td>It's a fucking bug.</td>\n",
+" <td>2012-11-15 02:29:24</td>\n",
+" <td>15</td>\n",
+" <td>11</td>\n",
+" <td>2012</td>\n",
 " </tr>\n",
 " </tbody>\n",
@@ -261,11 +294,11 @@
 ],
 "text/plain": [
 " text ... year\n",
-"103770 Holy crap. I don't think I've seen or heard o... ... 2010\n",
-"240391 You lost all pretense of civility with your ar... ... 2010\n",
-"220910 What do people think of ghost adventures? Cur... ... 2012\n",
-"39644 Congrats on getting the joke. ... 2011\n",
-"220867 We live in a world where any media can be copi... ... 2012\n",
+"112652 As i hovered over that link I was expecting r/... ... 2012\n",
+"172265 Caesarean section is now the new natural child... ... 2012\n",
+"150100 The Somerton Man reminds me of the [Lead Masks... ... 2012\n",
+"153335 As a skeptic, I demand this man provide eviden... ... 2012\n",
+"149621 It's a fucking bug. ... 2012\n",
 "\n",
 "[5 rows x 5 columns]"
 ]
@@ -273,7 +306,7 @@
 "metadata": {
 "tags": []
 },
-"execution_count": 12
+"execution_count": 18
 }
 ]
 },
@@ -320,29 +353,33 @@
 "metadata": {
 "id": "CeYlhwda9Sa7",
 "colab_type": "code",
-"outputId": "607d4f8f-f632-4d41-a1ab-e5d020cc00ae",
+"outputId": "61a66f28-85b6-452d-bdd0-180772498e34",
 "colab": {
 "base_uri": "https://localhost:8080/",
-"height": 34
+"height": 102
 }
 },
 "source": [
-"bayes = MultinomialNB(alpha=0.4)\n",
+"bayes = LogisticRegression(max_iter=1000)\n",
 "bayes.fit(X,y)"
 ],
-"execution_count": 0,
+"execution_count": 176,
 "outputs": [
 {
 "output_type": "execute_result",
 "data": {
 "text/plain": [
-"MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)"
+"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+" intercept_scaling=1, l1_ratio=None, max_iter=1000,\n",
+" multi_class='auto', n_jobs=None, penalty='l2',\n",
+" random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
+" warm_start=False)"
 ]
 },
 "metadata": {
 "tags": []
 },
-"execution_count": 15
+"execution_count": 176
 }
 ]
 },
@@ -367,7 +404,9 @@
 "def predict_data(data):\n",
 " prepared = prepare_data(data)\n",
 " vectorized = vectorize.transform(data[\"text\"])\n",
-" predicted = bayes.predict(vectorized)\n",
+" predicted = bayes.predict_proba(vectorized)[:,1]\n",
+" predicted[predicted < 0.05] = 0.05\n",
+" predicted[predicted > 0.95] = 0.95\n",
 " return predicted"
 ],
 "execution_count": 0,
@@ -391,28 +430,29 @@
 "metadata": {
 "id": "yigVrrVJHkob",
 "colab_type": "code",
-"outputId": "9491f926-94a3-4310-9f63-be937663489d",
+"outputId": "42a53652-60ed-4a11-85cf-683ba4b91d23",
 "colab": {
 "base_uri": "https://localhost:8080/",
-"height": 34
+"height": 51
 }
 },
 "source": [
-"np.mean(dev_predicted == expected_dev[\"class\"])"
+"dev_predicted"
 ],
-"execution_count": 0,
+"execution_count": 195,
 "outputs": [
 {
 "output_type": "execute_result",
 "data": {
 "text/plain": [
-"0.8201820940819423"
+"array([0.05 , 0.75847969, 0.86484399, ..., 0.0650311 , 0.95 ,\n",
+" 0.37791457])"
 ]
 },
 "metadata": {
 "tags": []
 },
-"execution_count": 19
+"execution_count": 195
 }
 ]
 },
@@ -471,8 +511,8 @@
 "colab": {}
 },
 "source": [
-"np.savetxt('test-A/out.tsv', test_predicted, '%c')\n",
-"np.savetxt('dev-0/out.tsv', dev_predicted, '%c')"
+"np.savetxt('test-A/out.tsv', test_predicted, '%f')\n",
+"np.savetxt('dev-0/out.tsv', dev_predicted, '%f')"
 ],
 "execution_count": 0,
 "outputs": []
@@ -506,21 +546,25 @@
 "metadata": {
 "id": "oEkjIcwe8zef",
 "colab_type": "code",
-"outputId": "cdb6473e-4eb9-48a7-cc25-25a193cc9194",
+"outputId": "16433b8f-9e3a-4e49-db5d-dc7373c3c675",
 "colab": {
 "base_uri": "https://localhost:8080/",
-"height": 34
+"height": 102
 }
 },
 "source": [
 "!./geval -t \"dev-0\""
 ],
-"execution_count": 0,
+"execution_count": 214,
 "outputs": [
 {
 "output_type": "stream",
 "text": [
-"0.8202\n"
+"Likelihood\t0.6707\n",
+"Accuracy\t0.8151\n",
+"F1.0\t0.7197\n",
+"Precision\t0.7762\n",
+"Recall\t0.6710\n"
 ],
 "name": "stdout"
 }
10544  dev-0/out.tsv
File diff suppressed because it is too large
@@ -5,10 +5,12 @@ Automatically generated by Colaboratory.
 
 Original file is located at
     https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
-
-# Loading Data
 """
+
+!git clone git://gonito.net/paranormal-or-skeptic
+
+"""# Loading Data"""
 
 !xzcat train/in.tsv.xz | wc -l
 
 import matplotlib.pyplot as plt
@@ -21,8 +23,10 @@ import datetime
 
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.pipeline import Pipeline
-from sklearn.linear_model import SGDClassifier
+from sklearn.linear_model import SGDClassifier, LogisticRegression
+from sklearn.svm import LinearSVC
 from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
+from sklearn.neural_network import MLPClassifier
 
 def load_set(path, isTest):
   dataset = pd.read_csv(path+"/in.tsv.xz", delimiter="\t",header=None,names=["text","date"],quoting=csv.QUOTE_NONE)
@@ -58,7 +62,7 @@ vectorized = vectorize.fit_transform(train_set["text"])
 X = vectorized
 y = expected_train["class"]
 
-bayes = MultinomialNB(alpha=0.4)
+bayes = LogisticRegression(max_iter=1000)
 bayes.fit(X,y)
 
 """# Predict and evaluate"""
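
For reference, a minimal runnable sketch of the training step in the hunk above, with toy strings standing in for train_set["text"] and expected_train["class"]; the ngram_range=(1, 3) setting is an assumption taken from the commit title, since the vectorizer definition sits outside this diff:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Toy stand-ins for the challenge data (hypothetical examples).
texts = ["I saw a ghost in the attic last night", "There is a perfectly rational explanation"]
labels = [1, 0]

vectorize = CountVectorizer(ngram_range=(1, 3))  # assumed 1-3 gram features, per the commit title
X = vectorize.fit_transform(texts)               # sparse document-term matrix
bayes = LogisticRegression(max_iter=1000)        # higher max_iter so lbfgs converges on wide n-gram features
bayes.fit(X, labels)
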
@@ -66,12 +70,14 @@ bayes.fit(X,y)
 def predict_data(data):
   prepared = prepare_data(data)
   vectorized = vectorize.transform(data["text"])
-  predicted = bayes.predict(vectorized)
+  predicted = bayes.predict_proba(vectorized)[:,1]
+  predicted[predicted < 0.05] = 0.05
+  predicted[predicted > 0.95] = 0.95
   return predicted
 
 dev_predicted = predict_data(dev_set)
 
-np.mean(dev_predicted == expected_dev["class"])
+dev_predicted
 
 test_predicted = predict_data(test_set)
 
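
The two threshold assignments above clamp the positive-class probability into [0.05, 0.95]; an equivalent sketch using np.clip (the function name here is hypothetical):

import numpy as np

def predict_data_clipped(model, vectorizer, texts):
    # Probability of the positive class, clamped to [0.05, 0.95]
    # just like the two assignments in predict_data above.
    proba = model.predict_proba(vectorizer.transform(texts))[:, 1]
    return np.clip(proba, 0.05, 0.95)
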
@@ -82,8 +88,8 @@ dev_predicted = np.array([item.strip() for item in dev_predicted])
 
 """**Save to file**"""
 
-np.savetxt('test-A/out.tsv', test_predicted, '%c')
-np.savetxt('dev-0/out.tsv', dev_predicted, '%c')
+np.savetxt('test-A/out.tsv', test_predicted, '%f')
+np.savetxt('dev-0/out.tsv', dev_predicted, '%f')
 
 """**Check geval output**"""
 
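
With probabilities instead of class labels in the output arrays, '%f' writes one float per line; a small sketch with hypothetical values (the path follows the cloned repo's layout):

import numpy as np

# Hypothetical probabilities; in the script these come from predict_data(...).
dev_predicted = np.array([0.05, 0.75847969, 0.86484399, 0.95])

# One float per line, e.g. "0.050000"; the previous '%c' format was for the
# label output of bayes.predict.
np.savetxt('dev-0/out.tsv', dev_predicted, '%f')
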
10304  test-A/out.tsv
File diff suppressed because it is too large