{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "overview",
   "metadata": {},
   "source": [
    "# TF-IDF + Multinomial Naive Bayes text classifier\n",
    "\n",
    "Fits a `TfidfVectorizer` + `MultinomialNB` pipeline on `train/train.tsv` (column 0 = label, column 1 = text) and writes one predicted label per line to `dev-0/out.tsv` and `test-A/out.tsv`.\n",
    "\n",
    "All paths are relative to the working directory; the notebook is meant to be run top to bottom on a fresh kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input / output locations, relative to the working directory.\n",
    "TRAIN_PATH = \"train/train.tsv\"\n",
    "DEV_IN = \"dev-0/in.tsv\"\n",
    "DEV_OUT = \"dev-0/out.tsv\"\n",
    "TEST_IN = \"test-A/in.tsv\"\n",
    "TEST_OUT = \"test-A/out.tsv\"\n",
    "\n",
    "# Number of leading training rows used for fitting; set to None to use every row.\n",
    "TRAIN_ROWS = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_lines(path):\n",
    "    \"\"\"Return every line of the UTF-8 text file at `path` as a Series.\n",
    "\n",
    "    Lines are read verbatim (trailing newline included) instead of being\n",
    "    parsed with `pd.read_csv`, so the result has exactly one element per\n",
    "    input line and predictions stay aligned with the input file.\n",
    "    \"\"\"\n",
    "    with open(path, \"r\", encoding=\"utf8\") as file:\n",
    "        return pd.Series(file.readlines(), dtype=\"object\")\n",
    "\n",
    "\n",
    "def write_predictions(path, predictions):\n",
    "    \"\"\"Write `predictions` to `path`, one integer label per line.\"\"\"\n",
    "    labels = pd.Series(predictions).astype(\"int\")\n",
    "    with open(path, \"wt\", encoding=\"utf8\") as file:\n",
    "        file.writelines(f\"{label}\\n\" for label in labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-data-header",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce420679-f5aa-4c83-a912-3c4afa982d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# on_bad_lines=\"warn\" skips malformed rows and reports them; it replaces the\n",
    "# deprecated error_bad_lines=False.\n",
    "train_raw = pd.read_csv(TRAIN_PATH, sep=\"\\t\", header=None, on_bad_lines=\"warn\")\n",
    "df = train_raw if TRAIN_ROWS is None else train_raw.head(TRAIN_ROWS)\n",
    "\n",
    "x = df[1]  # text passed to the vectorizer\n",
    "y = df[0]  # target label\n",
    "\n",
    "# Dev and test inputs are read line by line so that no input line is dropped.\n",
    "# NOTE(review): assumes dev-0/in.tsv holds one text per line, like test-A/in.tsv.\n",
    "# to_frame() keeps the dev text addressable as dev_x[0].\n",
    "dev_x = read_lines(DEV_IN).to_frame()\n",
    "test = read_lines(TEST_IN)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "train-header",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "train-model",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
    "model.fit(x, y);"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "predict-header",
   "metadata": {},
   "source": [
    "## Predict and write outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "write-predictions",
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_dev = pd.Series(model.predict(dev_x[0]))\n",
    "pred_test = pd.Series(model.predict(test)).astype(\"int\")\n",
    "\n",
    "# One prediction per input line, otherwise the out.tsv files are misaligned.\n",
    "assert len(pred_dev) == len(dev_x)\n",
    "assert len(pred_test) == len(test)\n",
    "\n",
    "write_predictions(DEV_OUT, pred_dev)\n",
    "write_predictions(TEST_OUT, pred_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e2a9ef0-6da0-4934-8099-378d859ae04e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick look at the loaded test texts.\n",
    "test.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}