From 393630083f91fb92ab315d590502e57bf6ea27dc Mon Sep 17 00:00:00 2001
From: Szymon Polak
Date: Sun, 16 May 2021 23:51:24 +0200
Subject: [PATCH] solution

---
 retroc.ipynb | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 retroc.ipynb

diff --git a/retroc.ipynb b/retroc.ipynb
new file mode 100644
index 0000000..2446e7b
--- /dev/null
+++ b/retroc.ipynb
@@ -0,0 +1,124 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python38332bit715560a51b8a44948ee59d26a58cf272",
+   "display_name": "Python 3.8.3 32-bit"
+  },
+  "metadata": {
+   "interpreter": {
+    "hash": "d4bdc0d8028da516e3b937f3ab23da3f18f7264589053952c883afefa2219368"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from stop_words import get_stop_words\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Training: fit a TF-IDF + linear regression model\n",
+    "\n",
+    "# Training data; header=None because the columns are addressed by integer position below\n",
+    "train_data = pd.read_csv('train/train.tsv.xz', compression='xz', sep='\\t', header=None)\n",
+    "\n",
+    "# Linear regression model\n",
+    "LR = LinearRegression()\n",
+    "# TF-IDF vectorizer that drops Polish stop words\n",
+    "VEC = TfidfVectorizer(stop_words=get_stop_words('polish'))\n",
+    "# Fit the vocabulary on training column 2 and vectorize it\n",
+    "train_x = VEC.fit_transform(train_data[2])\n",
+    "# Regression target: per-row mean of the two date columns (0 and 1)\n",
+    "dm = (train_data[0] + train_data[1]) / 2\n",
+    "# Fit the model\n",
+    "LR.fit(train_x, dm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# dev-0 prediction\n",
+    "\n",
+    "# dev-0 input data; header=None so the first row is kept as data and column 0 exists\n",
+    "dev0_data = pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n",
+    "\n",
+    "# Vectorize with the vocabulary already fitted on the training data\n",
+    "dev0_x = VEC.transform(dev0_data[0])\n",
+    "# Predict\n",
+    "dev0_y = LR.predict(dev0_x)\n",
+    "# Save the predictions as text, separated by newlines\n",
+    "dev0_y.tofile('dev-0/out.tsv', sep='\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# dev-1 prediction\n",
+    "\n",
+    "# dev-1 input data; header=None so the first row is kept as data and column 0 exists\n",
+    "dev1_data = pd.read_csv('dev-1/in.tsv', sep='\\t', header=None)\n",
+    "\n",
+    "# Vectorize with the vocabulary already fitted on the training data\n",
+    "dev1_x = VEC.transform(dev1_data[0])\n",
+    "# Predict\n",
+    "dev1_y = LR.predict(dev1_x)\n",
+    "# Save the predictions as text, separated by newlines\n",
+    "dev1_y.tofile('dev-1/out.tsv', sep='\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test-A prediction (NOTE(review): paths below use 'testA/', not 'test-A/' -- confirm the directory name)\n",
+    "\n",
+    "# test-A input data; header=None so the first row is kept as data and column 0 exists\n",
+    "testA_data = pd.read_csv('testA/in.tsv', sep='\\t', header=None)\n",
+    "\n",
+    "# Vectorize with the vocabulary already fitted on the training data\n",
+    "testA_x = VEC.transform(testA_data[0])\n",
+    "# Predict\n",
+    "testA_y = LR.predict(testA_x)\n",
+    "# Save the predictions as text, separated by newlines\n",
+    "testA_y.tofile('testA/out.tsv', sep='\\n')"
+   ]
+  }
+ ]
+}
\ No newline at end of file