{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import patoolib\n", "import os\n", "import patoolib\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TRENING" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### ROZPAKOWANIE I WCZYTANIE" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "patool: Extracting train/in.tsv.xz ...\n", "patool: running \"\"C:\\Program Files\\Git\\mingw64\\bin\\xz.EXE\"\" -c -d -- train/in.tsv.xz > train/in.tsv\n", "patool: with shell=True\n", "patool: ... train/in.tsv.xz extracted to `train/'.\n" ] } ], "source": [ "# Unpack the training inputs; the extracted TSV is read in the cells below.\n", "patoolib.extract_archive(\"train/in.tsv.xz\", outdir=\"train/\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### WRZUCENIE DO ZMIENNYCH" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Read the labels inside a `with` block so the handle is always released and\n", "# this cell can be re-run on its own without hitting an exhausted or closed file.\n", "with open('train/expected.tsv', 'r', encoding=\"utf-8\") as EXPECTED_FILE:\n", "    EXPECTED = list(EXPECTED_FILE)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Same pattern for the training text: every line becomes one element of TRAIN_DATA.\n", "with open('train/in.tsv', 'r', encoding=\"utf-8\") as TRAIN:\n", "    TRAIN_DATA = list(TRAIN)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### ZAMKNIECIE ZMIENNYCH PLIKOW I USUNIECIE ROZPAKOWANIA" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Both files were already closed by their `with` blocks; close() on a closed\n", "# file is a no-op, so these calls remain only as a harmless safeguard.\n", "EXPECTED_FILE.close()\n", "TRAIN.close()\n", "#os.remove(\"train/in.tsv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### MODEL TRENINGOWY" ] }, { 
"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Keep the fitted encoder in LABEL_ENCODER so predicted class ids can be mapped\n", "# back to the original labels with LABEL_ENCODER.inverse_transform(...).\n", "LABEL_ENCODER = LabelEncoder()\n", "# Despite its name, EXPECTED_ENCODER holds the encoded labels, not the encoder.\n", "EXPECTED_ENCODER = LABEL_ENCODER.fit_transform(EXPECTED)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Text classifier: TF-IDF features feeding a multinomial Naive Bayes model.\n", "PIPE = Pipeline(steps=[(\"TF-IDF\", TfidfVectorizer()), (\"BAYES\", MultinomialNB())])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Train the pipeline on the raw text lines and their encoded labels.\n", "TRAIN_MODEL = PIPE.fit(TRAIN_DATA, EXPECTED_ENCODER)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## FUNKCJE" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def BayesFit(MODEL, DOC):\n", "    \"\"\"Return MODEL's predictions for DOC.\n", "\n", "    Despite the name, nothing is fitted here: DOC is passed straight to\n", "    ``MODEL.predict`` and the result is returned unchanged.\n", "\n", "    Args:\n", "        MODEL: Object exposing a ``predict`` method.\n", "        DOC: Input handed to ``MODEL.predict`` as-is.\n", "\n", "    Returns:\n", "        Whatever ``MODEL.predict(DOC)`` returns.\n", "    \"\"\"\n", "    return MODEL.predict(DOC)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## PLIK DEV-0" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "patool: Extracting dev-0/in.tsv.xz ...\n", "patool: running \"\"C:\\Program Files\\Git\\mingw64\\bin\\xz.EXE\"\" -c -d -- dev-0/in.tsv.xz > dev-0/in.tsv\n", "patool: with shell=True\n", "patool: ... 
dev-0/in.tsv.xz extracted to `dev-0/'.\n" ] } ], "source": [ "# Unpack the dev-0 inputs; reading and writing happen in the cells below.\n", "patoolib.extract_archive(\"dev-0/in.tsv.xz\", outdir=\"dev-0/\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Read inside a `with` block so the handle is released immediately and this\n", "# cell can be re-run without reading from an exhausted or closed file.\n", "with open('dev-0/in.tsv', 'r', encoding=\"utf-8\") as INFILE:\n", "    ALL_DOC = INFILE.readlines()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "RESULT = BayesFit(TRAIN_MODEL, ALL_DOC)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Open the output only once predictions exist, so a failure during extraction\n", "# or prediction does not leave an empty out.tsv behind. One value per line.\n", "with open(\"dev-0/out.tsv\", \"w\", encoding=\"utf-8\") as OUTFILE:\n", "    OUTFILE.writelines(str(x) + '\\n' for x in RESULT)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Both files were already closed by their `with` blocks; close() on a closed\n", "# file is a no-op, so these calls remain only as a harmless safeguard.\n", "INFILE.close()\n", "OUTFILE.close()\n", "#os.remove(\"dev-0/in.tsv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## PLIK TEST-A" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "patool: Extracting test-A/in.tsv.xz ...\n", "patool: running \"\"C:\\Program Files\\Git\\mingw64\\bin\\xz.EXE\"\" -c -d -- test-A/in.tsv.xz > test-A/in.tsv\n", "patool: with shell=True\n", "patool: ... 
test-A/in.tsv.xz extracted to `test-A/'.\n" ] } ], "source": [ "# Unpack the test-A inputs; reading and writing happen in the cells below.\n", "patoolib.extract_archive(\"test-A/in.tsv.xz\", outdir=\"test-A/\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Read inside a `with` block so the handle is released immediately and this\n", "# cell can be re-run without reading from an exhausted or closed file.\n", "with open('test-A/in.tsv', 'r', encoding=\"utf-8\") as INFILE:\n", "    ALL_DOC = INFILE.readlines()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "RESULT = BayesFit(TRAIN_MODEL, ALL_DOC)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Open the output only once predictions exist, so a failure during extraction\n", "# or prediction does not leave an empty out.tsv behind. One value per line.\n", "with open(\"test-A/out.tsv\", \"w\", encoding=\"utf-8\") as OUTFILE:\n", "    OUTFILE.writelines(str(x) + '\\n' for x in RESULT)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Both files were already closed by their `with` blocks; close() on a closed\n", "# file is a no-op, so these calls remain only as a harmless safeguard.\n", "INFILE.close()\n", "OUTFILE.close()\n", "#os.remove(\"test-A/in.tsv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }