retroc2/linear.ipynb
2021-05-18 00:37:50 +02:00

90 lines
2.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LinearRegression\n",
"from stop_words import get_stop_words\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer(stop_words = get_stop_words('polish'))\n",
"linear = LinearRegression()\n",
"\n",
"train = pd.read_csv('train/train.tsv', sep = \"\\t\", names = ['start_date', 'end_date', 'title', 'sort_title', 'data'])\n",
"\n",
"mean = (train['start_date'] + train['end_date']) / 2\n",
"tv = vectorizer.fit_transform(train['data'])\n",
"linear.fit(tv, mean)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def getData(directory):\n",
" with open(directory, encoding=\"utf-8\") as file:\n",
" return file.readlines()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tmp_dev = getData('dev-0/in.tsv')\n",
"dataFrame_dev = pd.DataFrame(data = tmp_dev)\n",
"evaluate_dev = linear.predict(vectorizer.transform(dataFrame_dev[0]))\n",
"np.savetxt('dev-0/out.tsv', evaluate, fmt='%f', delimiter='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"tmp_test = getData('test-A/in.tsv')\n",
"dataFrame_test = pd.DataFrame(data = tmp_test)\n",
"evaluate_test = linear.predict(vectorizer.transform(dataFrame_test[0]))\n",
"np.savetxt('test-A/out.tsv', evaluate, fmt='%f', delimiter='\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}