405 lines
9.9 KiB
Plaintext
405 lines
9.9 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 110,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import math\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.pipeline import make_pipeline\n",
|
|||
|
"from sklearn.metrics import mean_squared_error"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 60,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data = pd.read_csv('train/train.tsv', sep='\\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 62,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data['Year'] = data.apply(lambda row: ((row['Begin'] + row['End'])/2), axis=1)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Text</th>\n",
|
|||
|
" <th>Year</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>nowią część kultury. U nas już nikt ich nie ch...</td>\n",
|
|||
|
" <td>1985.494521</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>hlstorja znana w okresie piramid, jak wlaśclcl...</td>\n",
|
|||
|
" <td>1926.475342</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>działek. Idąc dalej w swych hipotetycznych roz...</td>\n",
|
|||
|
" <td>2013.963014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>w Warszawie o stosunkach domowych dziatwy szko...</td>\n",
|
|||
|
" <td>1925.500000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>\\\\'iykład: \"Cywilizacyjna Koncepcja dziejów ¥e...</td>\n",
|
|||
|
" <td>1981.500000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>107458</th>\n",
|
|||
|
" <td>M. (2) na rzecz powoda M. S. kwotę 5003,66 zł ...</td>\n",
|
|||
|
" <td>2013.058904</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>107459</th>\n",
|
|||
|
" <td>Zintegrowanego Systemu Informatycznego (ZSI), ...</td>\n",
|
|||
|
" <td>2013.023288</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>107460</th>\n",
|
|||
|
" <td>prokurator. Wyrokowi temu powołując się na prz...</td>\n",
|
|||
|
" <td>2013.921918</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>107461</th>\n",
|
|||
|
" <td>07 lipca 2010 r. świadczą o tym, że nie wszyst...</td>\n",
|
|||
|
" <td>2013.083562</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>107462</th>\n",
|
|||
|
" <td>zatem niezdolności do pracy było schorzenie sa...</td>\n",
|
|||
|
" <td>2013.100000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>107463 rows × 2 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Text Year\n",
|
|||
|
"0 nowią część kultury. U nas już nikt ich nie ch... 1985.494521\n",
|
|||
|
"1 hlstorja znana w okresie piramid, jak wlaśclcl... 1926.475342\n",
|
|||
|
"2 działek. Idąc dalej w swych hipotetycznych roz... 2013.963014\n",
|
|||
|
"3 w Warszawie o stosunkach domowych dziatwy szko... 1925.500000\n",
|
|||
|
"4 \\\\'iykład: \"Cywilizacyjna Koncepcja dziejów ¥e... 1981.500000\n",
|
|||
|
"... ... ...\n",
|
|||
|
"107458 M. (2) na rzecz powoda M. S. kwotę 5003,66 zł ... 2013.058904\n",
|
|||
|
"107459 Zintegrowanego Systemu Informatycznego (ZSI), ... 2013.023288\n",
|
|||
|
"107460 prokurator. Wyrokowi temu powołując się na prz... 2013.921918\n",
|
|||
|
"107461 07 lipca 2010 r. świadczą o tym, że nie wszyst... 2013.083562\n",
|
|||
|
"107462 zatem niezdolności do pracy było schorzenie sa... 2013.100000\n",
|
|||
|
"\n",
|
|||
|
"[107463 rows x 2 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data = data[['Text', 'Year']]\n",
|
|||
|
"data"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 64,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"X = data['Text']\n",
|
|||
|
"y = data['Year']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 65,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"model = make_pipeline(TfidfVectorizer(), LinearRegression())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 66,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
|
|||
|
" ('linearregression', LinearRegression())])"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 66,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model.fit(X, y)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Dev0"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 157,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"20000\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"with open('dev-0/in.tsv', 'r', encoding='utf8') as f:\n",
|
|||
|
" X_dev0 = f.readlines()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 145,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:\n",
|
|||
|
" y_dev0 = f.readlines()\n",
|
|||
|
"y_dev0 = pd.Series(y_dev0)\n",
|
|||
|
"y_dev0 = y_dev0.apply(lambda row: row.replace('\\n', ''))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 159,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"predictions_dev0 = model.predict(X_dev0)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 160,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"21.66807634196494"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 160,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"math.sqrt(mean_squared_error(y_dev0, predictions_dev0))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 167,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('dev-0/out.tsv', 'wt') as f:\n",
|
|||
|
" for pred in predictions_dev0:\n",
|
|||
|
" f.write(str(pred)+'\\n')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Dev1"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 161,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('dev-1/in.tsv', 'r', encoding='utf8') as f:\n",
|
|||
|
" X_dev1 = f.readlines()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 162,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('dev-1/expected.tsv', 'r', encoding='utf8') as f:\n",
|
|||
|
" y_dev1 = f.readlines()\n",
|
|||
|
"y_dev1 = pd.Series(y_dev1)\n",
|
|||
|
"y_dev1 = y_dev1.apply(lambda row: row.replace('\\n', ''))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 163,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"predictions_dev1 = model.predict(X_dev1)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 164,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"21.943703116726265"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 164,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"math.sqrt(mean_squared_error(y_dev1, predictions_dev1))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 168,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('dev-1/out.tsv', 'wt') as f:\n",
|
|||
|
" for pred in predictions_dev1:\n",
|
|||
|
" f.write(str(pred)+'\\n')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Test"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 165,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('test-A/in.tsv', 'r', encoding='utf8') as f:\n",
|
|||
|
" X_test = f.readlines()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 166,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"predictions_test = model.predict(X_test)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 169,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"with open('test-A/out.tsv', 'wt') as f:\n",
|
|||
|
" for pred in predictions_test:\n",
|
|||
|
" f.write(str(pred)+'\\n')"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"interpreter": {
|
|||
|
"hash": "3ecbe772e0e869a386d256c10cc6d948e50cd4df13a3f02e58ab4f2a666d7bf0"
|
|||
|
},
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3.8.13 ('eks')",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.8.13"
|
|||
|
},
|
|||
|
"orig_nbformat": 4
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|