forked from kubapok/retroc2
final
This commit is contained in:
parent
647c099815
commit
deda838b5a
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
89
linear.ipynb
Normal file
89
linear.ipynb
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"from sklearn.linear_model import LinearRegression\n",
|
||||||
|
"from stop_words import get_stop_words\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"vectorizer = TfidfVectorizer(stop_words = get_stop_words('polish'))\n",
|
||||||
|
"linear = LinearRegression()\n",
|
||||||
|
"\n",
|
||||||
|
"train = pd.read_csv('train/train.tsv', sep = \"\\t\", names = ['start_date', 'end_date', 'title', 'sort_title', 'data'])\n",
|
||||||
|
"\n",
|
||||||
|
"mean = (train['start_date'] + train['end_date']) / 2\n",
|
||||||
|
"tv = vectorizer.fit_transform(train['data'])\n",
|
||||||
|
"linear.fit(tv, mean)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def getData(directory):\n",
|
||||||
|
" with open(directory, encoding=\"utf-8\") as file:\n",
|
||||||
|
" return file.readlines()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tmp_dev = getData('dev-0/in.tsv')\n",
|
||||||
|
"dataFrame_dev = pd.DataFrame(data = tmp_dev)\n",
|
||||||
|
"evaluate_dev = linear.predict(vectorizer.transform(dataFrame_dev[0]))\n",
|
||||||
|
"np.savetxt('dev-0/out.tsv', evaluate_dev, fmt='%f', delimiter='\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tmp_test = getData('test-A/in.tsv')\n",
|
||||||
|
"dataFrame_test = pd.DataFrame(data = tmp_test)\n",
|
||||||
|
"evaluate_test = linear.predict(vectorizer.transform(dataFrame_test[0]))\n",
|
||||||
|
"np.savetxt('test-A/out.tsv', evaluate_test, fmt='%f', delimiter='\\n')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user