{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TF-IDF + multinomial Naive Bayes text classifier\n",
    "\n",
    "Trains on `./train/train.tsv` (`label<TAB>text` per line) and writes one predicted label per input line for `dev-0` and `test-A`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "import pandas as pd\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "TRAIN_PATH = './train/train.tsv'\n",
    "DEV_IN_PATH = './dev-0/in.tsv'\n",
    "DEV_OUT_PATH = './dev-0/out.tsv'\n",
    "TEST_IN_PATH = './test-A/in.tsv'\n",
    "TEST_OUT_PATH = './test-A/out.tsv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers\n",
    "\n",
    "The files are read with plain Python instead of `pd.read_table(..., error_bad_lines=False)`: that call silently dropped input lines (extra tabs, blank lines, stray quotes), so the prediction files could end up shorter than the input files. `error_bad_lines` has also been removed from pandas 2.x."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_labeled_documents(path):\n",
    "    \"\"\"Read `label<TAB>text` lines and return `(labels, texts)` as two lists.\n",
    "\n",
    "    Each line is split on the first tab only, so tabs or quote characters\n",
    "    inside the text do not cause the line to be dropped. Lines without any\n",
    "    tab carry no label/text pair and are skipped. Labels are kept as the raw\n",
    "    strings found in the first column.\n",
    "    \"\"\"\n",
    "    labels, texts = [], []\n",
    "    with open(path, encoding='utf-8') as handle:\n",
    "        for line in handle:\n",
    "            label, tab, text = line.rstrip('\\n').partition('\\t')\n",
    "            if not tab:\n",
    "                continue\n",
    "            labels.append(label)\n",
    "            texts.append(text)\n",
    "    return labels, texts\n",
    "\n",
    "\n",
    "def read_documents(path):\n",
    "    \"\"\"Return every line of `path` as one document (newline stripped).\n",
    "\n",
    "    No line is skipped, not even blank or tab-containing ones, so the number\n",
    "    of documents always equals the number of lines in the file.\n",
    "    \"\"\"\n",
    "    with open(path, encoding='utf-8') as handle:\n",
    "        return [line.rstrip('\\n') for line in handle]\n",
    "\n",
    "\n",
    "def predict_to_file(in_path, out_path, vectorizer, classifier):\n",
    "    \"\"\"Classify each line of `in_path` and write one label per line to `out_path`.\n",
    "\n",
    "    Returns the array of predictions produced by `classifier.predict`.\n",
    "    \"\"\"\n",
    "    documents = read_documents(in_path)\n",
    "    predictions = classifier.predict(vectorizer.transform(documents))\n",
    "    assert len(predictions) == len(documents), 'predictions are not aligned with input lines'\n",
    "    # Labels are newline-separated with no trailing newline; newline='\\n'\n",
    "    # prevents platform-specific translation to '\\r\\n'.\n",
    "    with open(out_path, 'w', encoding='utf-8', newline='\\n') as handle:\n",
    "        handle.write('\\n'.join(str(label) for label in predictions))\n",
    "    return predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train, X_train = read_labeled_documents(TRAIN_PATH)\n",
    "\n",
    "vectorizer = TfidfVectorizer()\n",
    "train_features = vectorizer.fit_transform(X_train)\n",
    "\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(train_features, y_train);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = predict_to_file(DEV_IN_PATH, DEV_OUT_PATH, vectorizer, classifier)\n",
    "predictions_test_A = predict_to_file(TEST_IN_PATH, TEST_OUT_PATH, vectorizer, classifier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer":
"ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }