{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text classification with TF-IDF and Bernoulli naive Bayes\n",
    "\n",
    "Fits a `TfidfVectorizer` + `BernoulliNB` model on `train/in.tsv.xz` / `train/expected.tsv`, then writes one predicted label per line to `dev-0/out.tsv`, `dev-1/out.tsv` and `test-A/out.tsv`.\n",
    "\n",
    "Accuracy recorded in the saved outputs of the previous version of this notebook: dev-0 `0.6577`, dev-1 `0.6407`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import lzma\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.naive_bayes import BernoulliNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory layout: every split holds in.tsv (documents) and, when labels\n",
    "# are available, expected.tsv; predictions are written to out.tsv.\n",
    "TRAIN_DIR = Path('train')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## I/O helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_documents(path):\n",
    "    \"\"\"Return the lines of a UTF-8 text file, one document per line.\n",
    "\n",
    "    Files whose name ends in ``.xz`` are decompressed on the fly.\n",
    "    Line terminators are kept on the returned strings.\n",
    "    \"\"\"\n",
    "    path = Path(path)\n",
    "    if path.suffix == '.xz':\n",
    "        # Iterate the binary stream and decode per line, so the split into\n",
    "        # documents is made on the raw bytes before any text decoding.\n",
    "        with lzma.open(path) as f:\n",
    "            return [raw.decode('utf-8') for raw in f]\n",
    "    with open(path, encoding='utf-8') as f:\n",
    "        return list(f)\n",
    "\n",
    "\n",
    "def read_labels(path):\n",
    "    \"\"\"Return the labels stored one per line in ``path``, newline removed.\"\"\"\n",
    "    with open(path, encoding='utf-8') as f:\n",
    "        return [line.replace('\\n', '') for line in f]\n",
    "\n",
    "\n",
    "def write_predictions(path, labels):\n",
    "    \"\"\"Write one label per line to ``path``.\n",
    "\n",
    "    Each label is wrapped in a one-field row. Handing a bare string to the\n",
    "    csv writer makes it iterate over the characters, so a label such as\n",
    "    ``'-1'`` would be written as ``-,1``.\n",
    "    \"\"\"\n",
    "    with open(path, 'w', newline='', encoding='utf-8') as f:\n",
    "        csv.writer(f).writerows([label] for label in labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = read_documents(TRAIN_DIR / 'in.tsv.xz')\n",
    "Y = read_labels(TRAIN_DIR / 'expected.tsv')\n",
    "\n",
    "assert len(X) == len(Y), 'training documents and labels must line up one-to-one'\n",
    "len(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vectorize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TfidfVectorizer()\n",
    "textVectors = vectorizer.fit_transform(X)\n",
    "\n",
    "# (number of documents, vocabulary size)\n",
    "textVectors.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train\n",
    "\n",
    "Note: per the scikit-learn documentation `BernoulliNB` binarizes its input (default threshold `0.0`), so the model uses term presence rather than the TF-IDF weight."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainY = np.array(Y)\n",
    "\n",
    "bernoulli = BernoulliNB()\n",
    "bernoulli.fit(textVectors, trainY)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict and evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_split(split, has_expected=True):\n",
    "    \"\"\"Predict labels for ``<split>/in.tsv`` and save them to ``<split>/out.tsv``.\n",
    "\n",
    "    Uses the already fitted ``vectorizer`` and ``bernoulli`` objects.\n",
    "\n",
    "    Returns the accuracy against ``<split>/expected.tsv`` when\n",
    "    ``has_expected`` is true, otherwise ``None``.\n",
    "    \"\"\"\n",
    "    split_dir = Path(split)\n",
    "    features = vectorizer.transform(read_documents(split_dir / 'in.tsv'))\n",
    "    predicted = bernoulli.predict(features)\n",
    "    write_predictions(split_dir / 'out.tsv', predicted)\n",
    "    if not has_expected:\n",
    "        return None\n",
    "    expected = read_labels(split_dir / 'expected.tsv')\n",
    "    # Same metric as bernoulli.score(), without predicting a second time.\n",
    "    return accuracy_score(expected, predicted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev-0\n",
    "predict_split('dev-0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev-1\n",
    "print('Score for dev-1:', predict_split('dev-1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test-A (no expected.tsv is read for this split, so only predictions are written)\n",
    "predict_split('test-A', has_expected=False)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
  },
  "kernelspec": {
   "display_name": "Python 3.8.10 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}