s443930
This commit is contained in:
parent
61a9a4632a
commit
d9d70b1335
137314
dev-0/out.tsv
Normal file
137314
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out.tsv
Normal file
156606
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
80
main.py
Normal file
80
main.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import lzma
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.naive_bayes import BernoulliNB
|
||||||
|
import numpy as np
|
||||||
|
import csv
|
||||||
|
|
||||||
|
|
||||||
|
def load_training_documents(path):
    """Return the lines of the xz-compressed UTF-8 file at `path`.

    The archive is read in binary mode and every line is decoded on its
    own, so lines are split on '\n' only and keep their line ending.
    """
    with lzma.open(path) as f:
        return [raw_line.decode('utf-8') for raw_line in f]


def load_documents(path):
    """Return the lines of the plain UTF-8 text file at `path`.

    Line endings are kept, one list element per input line.
    """
    with open(path, encoding='utf8') as f:
        return f.readlines()


def load_labels(path):
    """Return one label per line of `path`, without the trailing newline."""
    with open(path, encoding='utf8') as f:
        return [line.rstrip('\n') for line in f]


def write_predictions(path, labels):
    """Write `labels` to `path`, one label per row.

    Each label is wrapped in a one-element row: the csv writer iterates
    whatever it is given as a row, so a bare string would be split into
    one column per character.
    """
    with open(path, 'w', newline='', encoding='utf8') as f:
        csv.writer(f).writerows([label] for label in labels)


def predict_split(split_dir, fitted_vectorizer, model):
    """Predict labels for `<split_dir>/in.tsv` and save them as `out.tsv`.

    The documents are transformed with the already fitted vectorizer, so
    the features line up with the ones the model was trained on. Gold
    labels are not needed to produce predictions and are not read here.

    Returns the array of predicted labels.
    """
    features = fitted_vectorizer.transform(
        load_documents(split_dir + '/in.tsv'))
    predictions = model.predict(features)
    write_predictions(split_dir + '/out.tsv', predictions)
    return predictions


# Training data: one document per line, one label per line.
X = load_training_documents('train/in.tsv.xz')
Y = load_labels('train/expected.tsv')

# Fit the vocabulary and IDF weights on the training documents only.
vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)

trainY = np.array(Y)

# NOTE: BernoulliNB binarizes its input (documented default threshold 0.0),
# so the model effectively sees term presence rather than TF-IDF weights.
bernoulli = BernoulliNB()
bernoulli.fit(textVectors, trainY)

# Produce out.tsv for every evaluation split with the same fitted pipeline.
for split_dir in ('dev-0', 'dev-1', 'test-A'):
    predict_split(split_dir, vectorizer, bernoulli)
|
277
sheSaid.ipynb
Normal file
277
sheSaid.ipynb
Normal file
@ -0,0 +1,277 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 96,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import lzma"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 97,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X = []\n",
|
||||||
|
"\n",
|
||||||
|
"with lzma.open('train/in.tsv.xz') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" X.append(line.decode('utf-8'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 98,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"Y = []\n",
|
||||||
|
"\n",
|
||||||
|
"with open('train/expected.tsv') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" txt = line\n",
|
||||||
|
" txt = txt.replace('\\n', '')\n",
|
||||||
|
" Y.append(txt)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 99,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"\n",
|
||||||
|
"vectorizer = TfidfVectorizer()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 100,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"textVectors = vectorizer.fit_transform(X)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 101,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"matrix([[0., 0., 0., ..., 0., 0., 0.]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 101,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"textVectors[0].todense()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 102,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.naive_bayes import BernoulliNB\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"trainY = np.array(Y)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 103,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"BernoulliNB()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 103,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"bernoulli = BernoulliNB()\n",
|
||||||
|
"bernoulli.fit(textVectors, trainY)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 104,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import csv\n",
|
||||||
|
"\n",
|
||||||
|
"testX = []\n",
|
||||||
|
"\n",
|
||||||
|
"with open('dev-0/in.tsv', encoding='utf8') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" testX.append(line)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 105,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"testX = vectorizer.transform(testX)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 106,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"predictedY = bernoulli.predict(testX)\n",
|
||||||
|
"\n",
|
||||||
|
"with open('dev-0/out.tsv', 'w', newline='') as f:\n",
|
||||||
|
" writer = csv.writer(f)\n",
|
||||||
|
" writer.writerows(predictedY)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 107,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"expectedY = []\n",
|
||||||
|
"with open('dev-0/expected.tsv') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" txt = line\n",
|
||||||
|
" txt = txt.replace('\\n', '')\n",
|
||||||
|
" expectedY.append(txt)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 108,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.6577260876531162"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 108,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"bernoulli.score(testX, expectedY)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 109,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Score for dev-1: 0.6406778795193032\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# dev-1\n",
|
||||||
|
"testX = []\n",
|
||||||
|
"\n",
|
||||||
|
"with open('dev-1/in.tsv', encoding='utf8') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" testX.append(line)\n",
|
||||||
|
"\n",
|
||||||
|
"testX = vectorizer.transform(testX)\n",
|
||||||
|
"\n",
|
||||||
|
"predictedY = bernoulli.predict(testX)\n",
|
||||||
|
"\n",
|
||||||
|
"expectedY = []\n",
|
||||||
|
"\n",
|
||||||
|
"with open('dev-1/expected.tsv') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" expectedY.append(line.replace('\\n', ''))\n",
|
||||||
|
"\n",
|
||||||
|
"print('Score for dev-1:', bernoulli.score(testX, expectedY))\n",
|
||||||
|
"\n",
|
||||||
|
"with open('dev-1/out.tsv', 'w', newline='') as f:\n",
|
||||||
|
" writer = csv.writer(f)\n",
|
||||||
|
" writer.writerows(predictedY)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 110,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# test-A\n",
|
||||||
|
"testX = []\n",
|
||||||
|
"\n",
|
||||||
|
"with open('test-A/in.tsv', encoding='utf8') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" testX.append(line)\n",
|
||||||
|
"\n",
|
||||||
|
"testX = vectorizer.transform(testX)\n",
|
||||||
|
"\n",
|
||||||
|
"predictedY = bernoulli.predict(testX)\n",
|
||||||
|
"\n",
|
||||||
|
"with open('test-A/out.tsv', 'w', newline='') as f:\n",
|
||||||
|
" writer = csv.writer(f)\n",
|
||||||
|
" writer.writerows(predictedY)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.8.10 64-bit",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.10"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
134618
test-A/out.tsv
Normal file
134618
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user