{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# fastText text classification\n",
    "\n",
    "Trains a supervised fastText model on `train/`, evaluates it on `dev-0/`, and writes the top predicted label for every row of `dev-0/`, `test-A/` and `test-B/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import fasttext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix written in front of every label in the training files and\n",
    "# stripped from every label returned by the model.\n",
    "LABEL_PREFIX = '__label__'\n",
    "# Column of the headerless in.tsv files that is used as the input text.\n",
    "TEXT_COLUMN = 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_texts(path, column=TEXT_COLUMN):\n",
    "    \"\"\"Read a headerless TSV file and return one of its columns.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the tab-separated file.\n",
    "        column: Position of the column to return.\n",
    "\n",
    "    Returns:\n",
    "        pandas.Series with the values of that column.\n",
    "    \"\"\"\n",
    "    return pd.read_csv(path, sep='\\t', header=None)[column]\n",
    "\n",
    "\n",
    "def as_single_line(text):\n",
    "    \"\"\"Return `text` as a string without newline characters.\n",
    "\n",
    "    fastText treats every line of a training file as one example, and\n",
    "    `model.predict` raises ValueError for input containing a newline, so\n",
    "    newlines are replaced by spaces. Non-string values are converted\n",
    "    with str().\n",
    "    \"\"\"\n",
    "    return str(text).replace('\\n', ' ')\n",
    "\n",
    "\n",
    "def write_fasttext_file(path, texts, labels):\n",
    "    \"\"\"Write labelled examples as `__label__<label> <text>` lines.\n",
    "\n",
    "    Args:\n",
    "        path: Output file, overwritten if it exists.\n",
    "        texts: Sequence of input texts.\n",
    "        labels: Sequence of labels, aligned with `texts`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If `texts` and `labels` differ in length.\n",
    "    \"\"\"\n",
    "    # Pairing rows of unequal-length inputs would attach wrong labels silently.\n",
    "    if len(texts) != len(labels):\n",
    "        raise ValueError(f'{path}: {len(texts)} texts but {len(labels)} labels')\n",
    "    with open(path, 'w', encoding='utf-8') as f:\n",
    "        for label, text in zip(labels, texts):\n",
    "            f.write(f'{LABEL_PREFIX}{label} {as_single_line(text)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_predictions(model, texts, path):\n",
    "    \"\"\"Write the top predicted label for each text, one label per line.\n",
    "\n",
    "    Args:\n",
    "        model: Trained fastText model.\n",
    "        texts: Iterable of input texts.\n",
    "        path: Output file, overwritten if it exists.\n",
    "    \"\"\"\n",
    "    with open(path, 'w', encoding='utf-8') as f:\n",
    "        for text in texts:\n",
    "            top_label = model.predict(as_single_line(text))[0][0]\n",
    "            # Predictions come back as `__label__<label>`; keep only <label>.\n",
    "            f.write(f'{top_label[len(LABEL_PREFIX):]}\\n')\n",
    "\n",
    "\n",
    "def print_results(N, p, r, k=1):\n",
    "    \"\"\"Print the values returned by `model.test`.\n",
    "\n",
    "    Args:\n",
    "        N: Number of evaluated examples.\n",
    "        p: Precision at `k`.\n",
    "        r: Recall at `k`.\n",
    "        k: Cut-off shown in the `P@`/`R@` labels.\n",
    "    \"\"\"\n",
    "    print(f'N\\t{N}')\n",
    "    print(f'P@{k}\\t{p:.3f}')\n",
    "    print(f'R@{k}\\t{r:.3f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = load_texts('train/in.tsv')\n",
    "y_train = pd.read_csv('train/expected.tsv', sep='\\t', header=None)\n",
    "\n",
    "X_dev = load_texts('dev-0/in.tsv')\n",
    "y_dev = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)\n",
    "\n",
    "X_test_A = load_texts('test-A/in.tsv')\n",
    "X_test_B = load_texts('test-B/in.tsv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write fastText input files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_fasttext_file('train.txt', X_train, y_train[0])\n",
    "write_fasttext_file('dev.txt', X_dev, y_dev[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = fasttext.train_supervised('train.txt')\n",
    "model.save_model('model_fasttext.bin')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on dev-0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "N\t149134\n",
      "P@1\t0.762\n",
      "R@1\t0.762\n"
     ]
    }
   ],
   "source": [
    "print_results(*model.test('dev.txt'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_predictions(model, X_dev, 'dev-0/out.txt')\n",
    "write_predictions(model, X_test_A, 'test-A/out.txt')\n",
    "write_predictions(model, X_test_B, 'test-B/out.txt')"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "3ecbe772e0e869a386d256c10cc6d948e50cd4df13a3f02e58ab4f2a666d7bf0"
  },
  "kernelspec": {
   "display_name": "Python 3.8.13 ('eks')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}