finally... after platform change solution works
This commit is contained in:
commit
2af8e67c62
|
@ -0,0 +1,13 @@
|
|||
Skeptic vs paranormal subreddits
|
||||
================================
|
||||
|
||||
Classify a Reddit post as coming either from the Skeptic subreddit or from one of the
|
||||
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
|
||||
Glitch-in-the-Matrix, conspiracytheories).
|
||||
|
||||
The output label is the probability that the post comes from a paranormal subreddit.
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
|
|
@ -0,0 +1,126 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "5fcb7312",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.pipeline import make_pipeline\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||
"import pandas as pd\n",
|
||||
"import csv\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.preprocessing import LabelEncoder"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "88ac1be8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"steps = make_pipeline(TfidfVectorizer(),MultinomialNB())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "4aa43416",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#training\n",
|
||||
"all_train_data_in = pd.read_csv('train/in.tsv.xz', compression='xz', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE, sep='\\t', nrows=3000)\n",
|
||||
"train_data_ex = pd.read_csv('train/expected.tsv', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE, sep='\\t', nrows=3000)\n",
|
||||
"train_data_in = []\n",
|
||||
"for value in all_train_data_in.values:\n",
|
||||
" temp = \"\"\n",
|
||||
" for el in value:\n",
|
||||
" if(temp == \"\"):\n",
|
||||
" temp = str(el)\n",
|
||||
" else:\n",
|
||||
" temp += '\\t' + str(el)\n",
|
||||
" train_data_in.append(temp)\n",
|
||||
" \n",
|
||||
"nb=steps.fit(train_data_in, LabelEncoder().fit_transform(train_data_ex.values))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "15c47c24",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#dev0\n",
|
||||
"all_dev0_data = pd.read_csv('dev-0/in.tsv.xz', compression='xz', header=None, quoting=csv.QUOTE_NONE, sep='\\t')\n",
|
||||
"dev0_data = []\n",
|
||||
"for value in all_dev0_data.values:\n",
|
||||
" temp = \"\"\n",
|
||||
" for el in value:\n",
|
||||
" if(temp == \"\"):\n",
|
||||
" temp = str(el)\n",
|
||||
" else:\n",
|
||||
" temp += '\\t' + str(el)\n",
|
||||
" dev0_data.append(temp)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"dev0_y = nb.predict(dev0_data)\n",
|
||||
"\n",
|
||||
"#zapis wyników\n",
|
||||
"dev0_y.tofile('dev-0/out.tsv', sep='\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "822b1e29",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#test-A\n",
|
||||
"all_testA_data = pd.read_csv('test-A/in.tsv.xz', compression='xz', header=None, quoting=csv.QUOTE_NONE, sep='\\t')\n",
|
||||
"testA_data = []\n",
|
||||
"for value in all_testA_data.values:\n",
|
||||
" temp = \"\"\n",
|
||||
" for el in value:\n",
|
||||
" if(temp == \"\"):\n",
|
||||
" temp = str(el)\n",
|
||||
" else:\n",
|
||||
" temp += '\\t' + str(el)\n",
|
||||
" testA_data.append(temp)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"testA_y = nb.predict(testA_data)\n",
|
||||
"\n",
|
||||
"#zapis wyników\n",
|
||||
"testA_y.tofile('test-A/out.tsv', sep='\\n')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[1]:
|
||||
|
||||
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
import pandas as pd
|
||||
import csv
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
|
||||
# In[2]:
|
||||
|
||||
|
||||
# Classification pipeline: TF-IDF text features feeding a multinomial naive Bayes model.
steps = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(),
)
|
||||
|
||||
|
||||
# In[14]:
|
||||
|
||||
|
||||
# Training: fit the pipeline on the first 3000 rows of the training set.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
# (replacement: on_bad_lines='skip'). It is kept so the script still runs on the
# pandas version it was written for. Any row it drops from in.tsv.xz would shift
# the inputs against expected.tsv -- confirm that no lines are actually skipped.
all_train_data_in = pd.read_csv('train/in.tsv.xz', compression='xz', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE, sep='\t', nrows=3000)
train_data_ex = pd.read_csv('train/expected.tsv', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE, sep='\t', nrows=3000)

# Re-join every parsed row into one tab-separated string so that each sample is
# a single text document for the vectorizer.
train_data_in = ['\t'.join(map(str, row)) for row in all_train_data_in.values]

# DataFrame.values is 2-D; flatten it so LabelEncoder receives a 1-D target
# instead of a column vector (presumably expected.tsv has a single label
# column -- verify).
train_labels = LabelEncoder().fit_transform(train_data_ex.values.ravel())

# fit() returns the fitted pipeline, which is what the prediction cells use as nb.
nb = steps.fit(train_data_in, train_labels)
|
||||
|
||||
|
||||
# In[17]:
|
||||
|
||||
|
||||
# dev-0: predict labels for the development set and write them to dev-0/out.tsv.
all_dev0_data = pd.read_csv('dev-0/in.tsv.xz', compression='xz', header=None, quoting=csv.QUOTE_NONE, sep='\t')

# One tab-joined string per parsed row (the same document format that the
# training cell feeds to the pipeline).
dev0_data = ['\t'.join(map(str, row)) for row in all_dev0_data.values]

# NOTE(review): predict() yields hard class labels; if the evaluation expects a
# probability per row, nb.predict_proba(...) would be needed instead -- confirm.
dev0_y = nb.predict(dev0_data)

# Save the results: predictions separated by newlines.
dev0_y.tofile('dev-0/out.tsv', sep='\n')
|
||||
|
||||
|
||||
# In[16]:
|
||||
|
||||
|
||||
# test-A: predict labels for the test set and write them to test-A/out.tsv.
all_testA_data = pd.read_csv('test-A/in.tsv.xz', compression='xz', header=None, quoting=csv.QUOTE_NONE, sep='\t')

# One tab-joined string per parsed row (the same document format that the
# training cell feeds to the pipeline).
testA_data = ['\t'.join(map(str, row)) for row in all_testA_data.values]

# NOTE(review): predict() yields hard class labels; if the evaluation expects a
# probability per row, nb.predict_proba(...) would be needed instead -- confirm.
testA_y = nb.predict(testA_data)

# Save the results: predictions separated by newlines.
testA_y.tofile('test-A/out.tsv', sep='\n')
|
||||
|
|
@ -0,0 +1 @@
|
|||
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1 @@
|
|||
PostText Timestamp
|
|
|
@ -0,0 +1 @@
|
|||
Label
|
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Loading…
Reference in New Issue