s443930
This commit is contained in:
parent
61a9a4632a
commit
d9d70b1335
137314
dev-0/out.tsv
Normal file
137314
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out.tsv
Normal file
156606
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
80
main.py
Normal file
80
main.py
Normal file
@ -0,0 +1,80 @@
|
||||
import lzma
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
import numpy as np
|
||||
import csv
|
||||
|
||||
|
||||
# Training documents: the xz archive is read as bytes and every line is
# decoded from UTF-8 (the trailing newline stays part of each document).
with lzma.open('train/in.tsv.xz') as train_file:
    X = [raw_line.decode('utf-8') for raw_line in train_file]

# Training labels: one per line, with the newline character removed.
with open('train/expected.tsv') as label_file:
    Y = [label_line.replace('\n', '') for label_line in label_file]

# Fit the TF-IDF vocabulary/weights on the training text and vectorise it.
vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)

trainY = np.array(Y)

# Train a Bernoulli naive Bayes classifier on the TF-IDF matrix.
bernoulli = BernoulliNB()
bernoulli.fit(textVectors, trainY)
|
||||
|
||||
# dev-0
# Read the dev-0 documents, one document per line.
with open('dev-0/in.tsv', encoding='utf8') as f:
    testX = list(f)

# Vectorise with the already-fitted TF-IDF vectorizer and classify.
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Gold labels for dev-0, one per line with the newline removed.
# NOTE(review): expectedY is loaded but not used anywhere in this script;
# kept so the script's file reads and module-level names stay unchanged.
with open('dev-0/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

# Write one predicted label per output row.
# csv.writer.writerows() expects an iterable of *rows*; handing it the bare
# array of label strings makes every label a row whose "fields" are its
# individual characters (e.g. "10" -> "1,0"). Wrapping each label in a
# one-element list keeps the label intact as a single field.
with open('dev-0/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
|
||||
|
||||
|
||||
# dev-1
# Read the dev-1 documents, one document per line.
with open('dev-1/in.tsv', encoding='utf8') as f:
    testX = list(f)

# Vectorise with the already-fitted TF-IDF vectorizer and classify.
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Gold labels for dev-1, one per line with the newline removed.
# NOTE(review): expectedY is loaded but not used anywhere in this script;
# kept so the script's file reads and module-level names stay unchanged.
with open('dev-1/expected.tsv') as f:
    expectedY = [line.replace('\n', '') for line in f]

# Write one predicted label per output row.
# csv.writer.writerows() expects an iterable of *rows*; handing it the bare
# array of label strings makes every label a row whose "fields" are its
# individual characters (e.g. "10" -> "1,0"). Wrapping each label in a
# one-element list keeps the label intact as a single field.
with open('dev-1/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
|
||||
|
||||
|
||||
# test-A
# Read the test-A documents, one document per line.
with open('test-A/in.tsv', encoding='utf8') as f:
    testX = list(f)

# Vectorise with the already-fitted TF-IDF vectorizer and classify.
testX = vectorizer.transform(testX)
predictedY = bernoulli.predict(testX)

# Write one predicted label per output row.
# csv.writer.writerows() expects an iterable of *rows*; handing it the bare
# array of label strings makes every label a row whose "fields" are its
# individual characters (e.g. "10" -> "1,0"). Wrapping each label in a
# one-element list keeps the label intact as a single field.
with open('test-A/out.tsv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([label] for label in predictedY)
|
277
sheSaid.ipynb
Normal file
277
sheSaid.ipynb
Normal file
@ -0,0 +1,277 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 96,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import lzma"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 97,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X = []\n",
|
||||
"\n",
|
||||
"with lzma.open('train/in.tsv.xz') as f:\n",
|
||||
" for line in f:\n",
|
||||
" X.append(line.decode('utf-8'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 98,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y = []\n",
|
||||
"\n",
|
||||
"with open('train/expected.tsv') as f:\n",
|
||||
" for line in f:\n",
|
||||
" txt = line\n",
|
||||
" txt = txt.replace('\\n', '')\n",
|
||||
" Y.append(txt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 99,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"vectorizer = TfidfVectorizer()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 100,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"textVectors = vectorizer.fit_transform(X)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 101,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"matrix([[0., 0., 0., ..., 0., 0., 0.]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 101,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"textVectors[0].todense()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 102,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.naive_bayes import BernoulliNB\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"trainY = np.array(Y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 103,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"BernoulliNB()"
|
||||
]
|
||||
},
|
||||
"execution_count": 103,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"bernoulli = BernoulliNB()\n",
|
||||
"bernoulli.fit(textVectors, trainY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import csv\n",
|
||||
"\n",
|
||||
"testX = []\n",
|
||||
"\n",
|
||||
"with open('dev-0/in.tsv', encoding='utf8') as f:\n",
|
||||
" for line in f:\n",
|
||||
" testX.append(line)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"testX = vectorizer.transform(testX)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 106,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictedY = bernoulli.predict(testX)\n",
|
||||
"\n",
|
||||
"with open('dev-0/out.tsv', 'w', newline='') as f:\n",
|
||||
" writer = csv.writer(f)\n",
|
||||
" writer.writerows(predictedY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 107,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"expectedY = []\n",
|
||||
"with open('dev-0/expected.tsv') as f:\n",
|
||||
" for line in f:\n",
|
||||
" txt = line\n",
|
||||
" txt = txt.replace('\\n', '')\n",
|
||||
" expectedY.append(txt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 108,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.6577260876531162"
|
||||
]
|
||||
},
|
||||
"execution_count": 108,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"bernoulli.score(testX, expectedY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 109,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Score for dev-1: 0.6406778795193032\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# dev-1\n",
|
||||
"testX = []\n",
|
||||
"\n",
|
||||
"with open('dev-1/in.tsv', encoding='utf8') as f:\n",
|
||||
" for line in f:\n",
|
||||
" testX.append(line)\n",
|
||||
"\n",
|
||||
"testX = vectorizer.transform(testX)\n",
|
||||
"\n",
|
||||
"predictedY = bernoulli.predict(testX)\n",
|
||||
"\n",
|
||||
"expectedY = []\n",
|
||||
"\n",
|
||||
"with open('dev-1/expected.tsv') as f:\n",
|
||||
" for line in f:\n",
|
||||
" expectedY.append(line.replace('\\n', ''))\n",
|
||||
"\n",
|
||||
"print('Score for dev-1:', bernoulli.score(testX, expectedY))\n",
|
||||
"\n",
|
||||
"with open('dev-1/out.tsv', 'w', newline='') as f:\n",
|
||||
" writer = csv.writer(f)\n",
|
||||
" writer.writerows(predictedY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 110,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# test-A\n",
|
||||
"testX = []\n",
|
||||
"\n",
|
||||
"with open('test-A/in.tsv', encoding='utf8') as f:\n",
|
||||
" for line in f:\n",
|
||||
" testX.append(line)\n",
|
||||
"\n",
|
||||
"testX = vectorizer.transform(testX)\n",
|
||||
"\n",
|
||||
"predictedY = bernoulli.predict(testX)\n",
|
||||
"\n",
|
||||
"with open('test-A/out.tsv', 'w', newline='') as f:\n",
|
||||
" writer = csv.writer(f)\n",
|
||||
" writer.writerows(predictedY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8.10 64-bit",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
134618
test-A/out.tsv
Normal file
134618
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user