petite-difference-challenge2/sheSaid.ipynb

278 lines
5.5 KiB
Plaintext
Raw Permalink Normal View History

2022-04-26 23:36:17 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"import lzma"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"# Training documents: every line of the xz-compressed file, decoded as UTF-8\n",
"# (the trailing line break of each line is kept).\n",
"X = []\n",
"\n",
"with lzma.open('train/in.tsv.xz') as compressed:\n",
"    X.extend(raw_line.decode('utf-8') for raw_line in compressed)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"# Training labels: one per line of expected.tsv, with the line break removed.\n",
"Y = []\n",
"\n",
"with open('train/expected.tsv') as labels_file:\n",
"    Y.extend(row.replace('\\n', '') for row in labels_file)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"vectorizer = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"textVectors = vectorizer.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textVectors[0].todense()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import BernoulliNB\n",
"import numpy as np\n",
"\n",
"trainY = np.array(Y)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BernoulliNB()"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bernoulli = BernoulliNB()\n",
"bernoulli.fit(textVectors, trainY)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"# dev-0 input texts: one list entry per line of the file (line breaks kept).\n",
"testX = []\n",
"\n",
"with open('dev-0/in.tsv', encoding='utf8') as dev_file:\n",
"    testX.extend(dev_file)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"testX = vectorizer.transform(testX)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"predictedY = bernoulli.predict(testX)\n",
"\n",
"# Save the predictions, one label per line.\n",
"# csv.writer.writerows() is deliberately not used: given a sequence of strings it\n",
"# treats every label as a row of single-character fields (a multi-character label\n",
"# would come out comma-separated) and it ends rows with '\\r\\n' by default.\n",
"with open('dev-0/out.tsv', 'w', encoding='utf8', newline='') as f:\n",
"    f.writelines(f'{label}\\n' for label in predictedY)"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"# Expected dev-0 labels: one per line of expected.tsv, with the line break removed.\n",
"expectedY = []\n",
"with open('dev-0/expected.tsv') as labels_file:\n",
"    expectedY.extend(row.replace('\\n', '') for row in labels_file)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6577260876531162"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bernoulli.score(testX, expectedY)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Score for dev-1: 0.6406778795193032\n"
]
}
],
"source": [
"# dev-1\n",
"# Read the dev-1 texts, one entry per line (line breaks kept).\n",
"with open('dev-1/in.tsv', encoding='utf8') as f:\n",
"    testX = list(f)\n",
"\n",
"# Vectorize with the already-fitted vectorizer (transform only, no refit).\n",
"testX = vectorizer.transform(testX)\n",
"\n",
"predictedY = bernoulli.predict(testX)\n",
"\n",
"# Expected labels, one per line with the line break removed.\n",
"with open('dev-1/expected.tsv') as f:\n",
"    expectedY = [line.replace('\\n', '') for line in f]\n",
"\n",
"print('Score for dev-1:', bernoulli.score(testX, expectedY))\n",
"\n",
"# Save the predictions, one label per line.\n",
"# csv.writer.writerows() is deliberately not used: given a sequence of strings it\n",
"# treats every label as a row of single-character fields (a multi-character label\n",
"# would come out comma-separated) and it ends rows with '\\r\\n' by default.\n",
"with open('dev-1/out.tsv', 'w', encoding='utf8', newline='') as f:\n",
"    f.writelines(f'{label}\\n' for label in predictedY)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"# test-A\n",
"# Read the test-A texts, one entry per line (line breaks kept).\n",
"with open('test-A/in.tsv', encoding='utf8') as f:\n",
"    testX = list(f)\n",
"\n",
"# Vectorize with the already-fitted vectorizer (transform only, no refit).\n",
"testX = vectorizer.transform(testX)\n",
"\n",
"predictedY = bernoulli.predict(testX)\n",
"\n",
"# Save the predictions, one label per line.\n",
"# csv.writer.writerows() is deliberately not used: given a sequence of strings it\n",
"# treats every label as a row of single-character fields (a multi-character label\n",
"# would come out comma-separated) and it ends rows with '\\r\\n' by default.\n",
"with open('test-A/out.tsv', 'w', encoding='utf8', newline='') as f:\n",
"    f.writelines(f'{label}\\n' for label in predictedY)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}