This commit is contained in:
Iwona Christop 2022-04-26 23:36:17 +02:00
parent 61a9a4632a
commit d9d70b1335
5 changed files with 428895 additions and 0 deletions

137314
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

156606
dev-1/out.tsv Normal file

File diff suppressed because it is too large Load Diff

80
main.py Normal file
View File

@@ -0,0 +1,80 @@
import lzma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
import numpy as np
import csv
# Train a gender classifier on the training split and write one predicted
# label per input line to <split>/out.tsv for dev-0, dev-1 and test-A.


def read_lines(path):
    """Return every line of the UTF-8 text file at *path*, newlines included."""
    with open(path, encoding='utf8') as f:
        return list(f)


def write_predictions(path, labels):
    """Write *labels* to *path*, one label per output row.

    Each label is wrapped in a one-element list because csv.writer treats a
    row as an iterable of fields: a bare string would be split into one
    field per character, corrupting any label longer than one character.
    """
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows([label] for label in labels)


# Training documents: one document per line of the xz-compressed file.
# The file is opened in binary mode and each line is decoded separately.
with lzma.open('train/in.tsv.xz') as f:
    X = [line.decode('utf-8') for line in f]

# Training labels: one per line, with the newline character removed.
with open('train/expected.tsv') as f:
    Y = [line.replace('\n', '') for line in f]

vectorizer = TfidfVectorizer()
textVectors = vectorizer.fit_transform(X)

trainY = np.array(Y)

# NOTE(review): per the scikit-learn docs BernoulliNB binarizes its input
# (binarize=0.0 by default), so the TF-IDF weights would only act as
# presence/absence flags here -- confirm this is intended.
bernoulli = BernoulliNB()
bernoulli.fit(textVectors, trainY)

# Reuse the vectorizer fitted on the training data for every split so the
# feature space matches the one the classifier was trained on.
for split in ('dev-0', 'dev-1', 'test-A'):
    testX = vectorizer.transform(read_lines(f'{split}/in.tsv'))
    predictedY = bernoulli.predict(testX)
    write_predictions(f'{split}/out.tsv', predictedY)

277
sheSaid.ipynb Normal file
View File

@@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"import lzma"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"X = []\n",
"\n",
"with lzma.open('train/in.tsv.xz') as f:\n",
" for line in f:\n",
" X.append(line.decode('utf-8'))"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"Y = []\n",
"\n",
"with open('train/expected.tsv') as f:\n",
" for line in f:\n",
" txt = line\n",
" txt = txt.replace('\\n', '')\n",
" Y.append(txt)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"textVectors = vectorizer.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textVectors[0].todense()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import BernoulliNB\n",
"import numpy as np\n",
"\n",
"trainY = np.array(Y)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BernoulliNB()"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bernoulli = BernoulliNB()\n",
"bernoulli.fit(textVectors, trainY)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"testX = []\n",
"\n",
"with open('dev-0/in.tsv', encoding='utf8') as f:\n",
" for line in f:\n",
" testX.append(line)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"testX = vectorizer.transform(testX)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"predictedY = bernoulli.predict(testX)\n",
"\n",
"with open('dev-0/out.tsv', 'w', newline='') as f:\n",
" writer = csv.writer(f)\n",
" writer.writerows(predictedY)"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"expectedY = []\n",
"with open('dev-0/expected.tsv') as f:\n",
" for line in f:\n",
" txt = line\n",
" txt = txt.replace('\\n', '')\n",
" expectedY.append(txt)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6577260876531162"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bernoulli.score(testX, expectedY)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Score for dev-1: 0.6406778795193032\n"
]
}
],
"source": [
"# dev-1\n",
"testX = []\n",
"\n",
"with open('dev-1/in.tsv', encoding='utf8') as f:\n",
" for line in f:\n",
" testX.append(line)\n",
"\n",
"testX = vectorizer.transform(testX)\n",
"\n",
"predictedY = bernoulli.predict(testX)\n",
"\n",
"expectedY = []\n",
"\n",
"with open('dev-1/expected.tsv') as f:\n",
" for line in f:\n",
" expectedY.append(line.replace('\\n', ''))\n",
"\n",
"print('Score for dev-1:', bernoulli.score(testX, expectedY))\n",
"\n",
"with open('dev-1/out.tsv', 'w', newline='') as f:\n",
" writer = csv.writer(f)\n",
" writer.writerows(predictedY)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"# test-A\n",
"testX = []\n",
"\n",
"with open('test-A/in.tsv', encoding='utf8') as f:\n",
" for line in f:\n",
" testX.append(line)\n",
"\n",
"testX = vectorizer.transform(testX)\n",
"\n",
"predictedY = bernoulli.predict(testX)\n",
"\n",
"with open('test-A/out.tsv', 'w', newline='') as f:\n",
" writer = csv.writer(f)\n",
" writer.writerows(predictedY)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

134618
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff