{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Date regression from text (TF-IDF + linear regression)\n",
    "\n",
    "Fits a linear regression on TF-IDF features of the `data` column of `train/train.tsv` to predict the midpoint of `start_date` and `end_date`, then writes one prediction per input line to `dev-0/out.tsv` and `test-A/out.tsv`.\n",
    "\n",
    "Run with **Restart Kernel -> Run All**; every cell depends only on the cells above it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from stop_words import get_stop_words\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TfidfVectorizer(stop_words = get_stop_words('polish'))\n",
    "linear = LinearRegression()\n",
    "\n",
    "train = pd.read_csv('train/train.tsv', sep = \"\\t\", names = ['start_date', 'end_date', 'title', 'sort_title', 'data'])\n",
    "\n",
    "# Regression target: midpoint of each row's start_date and end_date.\n",
    "mean = (train['start_date'] + train['end_date']) / 2\n",
    "tv = vectorizer.fit_transform(train['data'])\n",
    "linear.fit(tv, mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getData(directory):\n",
    "    \"\"\"Return the lines of the UTF-8 text file at `directory`.\n",
    "\n",
    "    Despite the parameter name, `directory` is a file path. Each returned\n",
    "    line keeps its trailing newline, as produced by `readlines()`.\n",
    "    \"\"\"\n",
    "    with open(directory, encoding=\"utf-8\") as file:\n",
    "        return file.readlines()\n",
    "\n",
    "\n",
    "def predict_to_file(in_path, out_path):\n",
    "    \"\"\"Predict one value per line of `in_path` and save them to `out_path`.\n",
    "\n",
    "    Uses the `vectorizer` and `linear` objects fitted in the training cell.\n",
    "    Each line of the input file is treated as one document. Predictions are\n",
    "    written one per line with `%f` formatting and are also returned.\n",
    "    \"\"\"\n",
    "    documents = getData(in_path)\n",
    "    predictions = linear.predict(vectorizer.transform(documents))\n",
    "    # Write the array that was just predicted. The earlier cells passed an\n",
    "    # undefined name (`evaluate`) here, which fails on a fresh kernel.\n",
    "    np.savetxt(out_path, predictions, fmt='%f', delimiter='\\n')\n",
    "    return predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_dev = predict_to_file('dev-0/in.tsv', 'dev-0/out.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_test = predict_to_file('test-A/in.tsv', 'test-A/out.tsv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}