278 lines
5.5 KiB
Plaintext
278 lines
5.5 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 96,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import lzma"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 97,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"X = []\n",
|
||
|
"\n",
|
||
|
"with lzma.open('train/in.tsv.xz') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" X.append(line.decode('utf-8'))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 98,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"Y = []\n",
|
||
|
"\n",
|
||
|
"with open('train/expected.tsv') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" txt = line\n",
|
||
|
" txt = txt.replace('\\n', '')\n",
|
||
|
" Y.append(txt)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 99,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||
|
"\n",
|
||
|
"# TF-IDF vectorizer with default settings; it is fitted on the training texts in the next cell.\n",
"vectorizer = TfidfVectorizer()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 100,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Fit the vectorizer on the training texts X and transform them into the matrix used for training.\n",
"textVectors = vectorizer.fit_transform(X)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 101,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"matrix([[0., 0., 0., ..., 0., 0., 0.]])"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 101,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Display the first row of the training matrix in dense form.\n",
"textVectors[0].todense()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 102,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.naive_bayes import BernoulliNB\n",
|
||
|
"import numpy as np\n",
|
||
|
"\n",
|
||
|
"# Training labels as a NumPy array, the form handed to the classifier's fit() call.\n",
"trainY = np.array(Y)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 103,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"BernoulliNB()"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 103,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Train a Bernoulli naive Bayes classifier on the training vectors and labels.\n",
"# NOTE(review): BernoulliNB binarizes its input by default, so it presumably only uses\n",
"# whether a term occurs, not its TF-IDF weight - confirm this is intended.\n",
"bernoulli = BernoulliNB()\n",
|
||
|
"bernoulli.fit(textVectors, trainY)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 104,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import csv\n",
|
||
|
"\n",
|
||
|
"testX = []\n",
|
||
|
"\n",
|
||
|
"with open('dev-0/in.tsv', encoding='utf8') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" testX.append(line)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 105,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Vectorize the dev-0 lines with the vectorizer that was fitted on the training data.\n",
"# NOTE(review): this rebinds testX from the raw lines to the feature matrix, so running\n",
"# this cell a second time would feed the matrix back into transform().\n",
"testX = vectorizer.transform(testX)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 106,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"predictedY = bernoulli.predict(testX)\n",
|
||
|
"\n",
|
||
|
"with open('dev-0/out.tsv', 'w', newline='') as f:\n",
|
||
|
" writer = csv.writer(f)\n",
|
||
|
" writer.writerows(predictedY)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 107,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"expectedY = []\n",
|
||
|
"with open('dev-0/expected.tsv') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" txt = line\n",
|
||
|
" txt = txt.replace('\\n', '')\n",
|
||
|
" expectedY.append(txt)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 108,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.6577260876531162"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 108,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Score the classifier on the dev-0 features against the expected dev-0 labels.\n",
"bernoulli.score(testX, expectedY)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 109,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Score for dev-1: 0.6406778795193032\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# dev-1\n",
|
||
|
"testX = []\n",
|
||
|
"\n",
|
||
|
"with open('dev-1/in.tsv', encoding='utf8') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" testX.append(line)\n",
|
||
|
"\n",
|
||
|
"testX = vectorizer.transform(testX)\n",
|
||
|
"\n",
|
||
|
"predictedY = bernoulli.predict(testX)\n",
|
||
|
"\n",
|
||
|
"expectedY = []\n",
|
||
|
"\n",
|
||
|
"with open('dev-1/expected.tsv') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" expectedY.append(line.replace('\\n', ''))\n",
|
||
|
"\n",
|
||
|
"print('Score for dev-1:', bernoulli.score(testX, expectedY))\n",
|
||
|
"\n",
|
||
|
"with open('dev-1/out.tsv', 'w', newline='') as f:\n",
|
||
|
" writer = csv.writer(f)\n",
|
||
|
" writer.writerows(predictedY)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 110,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# test-A\n",
|
||
|
"testX = []\n",
|
||
|
"\n",
|
||
|
"with open('test-A/in.tsv', encoding='utf8') as f:\n",
|
||
|
" for line in f:\n",
|
||
|
" testX.append(line)\n",
|
||
|
"\n",
|
||
|
"testX = vectorizer.transform(testX)\n",
|
||
|
"\n",
|
||
|
"predictedY = bernoulli.predict(testX)\n",
|
||
|
"\n",
|
||
|
"with open('test-A/out.tsv', 'w', newline='') as f:\n",
|
||
|
" writer = csv.writer(f)\n",
|
||
|
" writer.writerows(predictedY)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"interpreter": {
|
||
|
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
|
||
|
},
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3.8.10 64-bit",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.10"
|
||
|
},
|
||
|
"orig_nbformat": 4
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|