retroc2/run.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import math\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The training file is raw, unquoted TSV whose text contains literal double\n",
    "# quotes, so quote handling is disabled; otherwise a field that starts with a\n",
    "# quote could swallow the following tabs/newlines and merge rows.\n",
    "data = pd.read_csv(\n",
    "    'train/train.tsv',\n",
    "    sep='\\t',\n",
    "    names=['Begin', 'End', 'Title', 'Publisher', 'Text'],\n",
    "    quoting=csv.QUOTE_NONE,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regression target: midpoint of the Begin and End columns, computed\n",
    "# column-wise instead of through a per-row Python callback.\n",
    "data['Year'] = (data['Begin'] + data['End']) / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>Year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>nowią część kultury. U nas już nikt ich nie ch...</td>\n",
       "      <td>1985.494521</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hlstorja znana w okresie piramid, jak wlaśclcl...</td>\n",
       "      <td>1926.475342</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>działek. Idąc dalej w swych hipotetycznych roz...</td>\n",
       "      <td>2013.963014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>w Warszawie o stosunkach domowych dziatwy szko...</td>\n",
       "      <td>1925.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\\\'iykład: \"Cywilizacyjna Koncepcja dziejów ¥e...</td>\n",
       "      <td>1981.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107458</th>\n",
       "      <td>M. (2) na rzecz powoda M. S. kwotę 5003,66 zł ...</td>\n",
       "      <td>2013.058904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107459</th>\n",
       "      <td>Zintegrowanego Systemu Informatycznego (ZSI), ...</td>\n",
       "      <td>2013.023288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107460</th>\n",
       "      <td>prokurator. Wyrokowi temu powołując się na prz...</td>\n",
       "      <td>2013.921918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107461</th>\n",
       "      <td>07 lipca 2010 r. świadczą o tym, że nie wszyst...</td>\n",
       "      <td>2013.083562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107462</th>\n",
       "      <td>zatem niezdolności do pracy było schorzenie sa...</td>\n",
       "      <td>2013.100000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>107463 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     Text         Year\n",
       "0       nowią część kultury. U nas już nikt ich nie ch...  1985.494521\n",
       "1       hlstorja znana w okresie piramid, jak wlaśclcl...  1926.475342\n",
       "2       działek. Idąc dalej w swych hipotetycznych roz...  2013.963014\n",
       "3       w Warszawie o stosunkach domowych dziatwy szko...  1925.500000\n",
       "4       \\\\'iykład: \"Cywilizacyjna Koncepcja dziejów ¥e...  1981.500000\n",
       "...                                                   ...          ...\n",
       "107458  M. (2) na rzecz powoda M. S. kwotę 5003,66 zł ...  2013.058904\n",
       "107459  Zintegrowanego Systemu Informatycznego (ZSI), ...  2013.023288\n",
       "107460  prokurator. Wyrokowi temu powołując się na prz...  2013.921918\n",
       "107461  07 lipca 2010 r. świadczą o tym, że nie wszyst...  2013.083562\n",
       "107462  zatem niezdolności do pracy było schorzenie sa...  2013.100000\n",
       "\n",
       "[107463 rows x 2 columns]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the model input (Text) and the regression target (Year),\n",
    "# then display the resulting frame.\n",
    "data = data.loc[:, ['Text', 'Year']]\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features are the raw text; the target is the Year column.\n",
    "X, y = data['Text'], data['Year']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF-IDF text features feeding a linear regression.\n",
    "model = make_pipeline(\n",
    "    TfidfVectorizer(),\n",
    "    LinearRegression(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
       "                ('linearregression', LinearRegression())])"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the TF-IDF vectorizer and the regressor on the training texts and years.\n",
    "model.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dev0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20000\n"
     ]
    }
   ],
   "source": [
    "# Read dev-0 as one document per line; each entry keeps its trailing newline.\n",
    "with open('dev-0/in.tsv', mode='r', encoding='utf8') as src:\n",
    "    X_dev0 = list(src)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the expected years as floats so the metric receives numeric targets\n",
    "# instead of relying on implicit string-to-number coercion.\n",
    "# Blank lines (e.g. a trailing empty line) are skipped.\n",
    "with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:\n",
    "    y_dev0 = pd.Series([float(line) for line in f if line.strip()], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict a year for every dev-0 line with the fitted pipeline.\n",
    "predictions_dev0 = model.predict(X_dev0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21.66807634196494"
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# RMSE on dev-0: square root of the mean squared error.\n",
    "mse_dev0 = mean_squared_error(y_dev0, predictions_dev0)\n",
    "math.sqrt(mse_dev0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the dev-0 predictions, one value per line.\n",
    "with open('dev-0/out.tsv', 'wt') as out_file:\n",
    "    out_file.writelines(str(pred) + '\\n' for pred in predictions_dev0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dev1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read dev-1 as one document per line; each entry keeps its trailing newline.\n",
    "with open('dev-1/in.tsv', mode='r', encoding='utf8') as src:\n",
    "    X_dev1 = list(src)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the expected years as floats so the metric receives numeric targets\n",
    "# instead of relying on implicit string-to-number coercion.\n",
    "# Blank lines (e.g. a trailing empty line) are skipped.\n",
    "with open('dev-1/expected.tsv', 'r', encoding='utf8') as f:\n",
    "    y_dev1 = pd.Series([float(line) for line in f if line.strip()], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict a year for every dev-1 line with the fitted pipeline.\n",
    "predictions_dev1 = model.predict(X_dev1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21.943703116726265"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# RMSE on dev-1: square root of the mean squared error.\n",
    "mse_dev1 = mean_squared_error(y_dev1, predictions_dev1)\n",
    "math.sqrt(mse_dev1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the dev-1 predictions, one value per line.\n",
    "with open('dev-1/out.tsv', 'wt') as out_file:\n",
    "    out_file.writelines(str(pred) + '\\n' for pred in predictions_dev1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read test-A as one document per line; each entry keeps its trailing newline.\n",
    "with open('test-A/in.tsv', mode='r', encoding='utf8') as src:\n",
    "    X_test = list(src)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict a year for every test-A line with the fitted pipeline.\n",
    "predictions_test = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the test-A predictions, one value per line.\n",
    "with open('test-A/out.tsv', 'wt') as out_file:\n",
    "    out_file.writelines(str(pred) + '\\n' for pred in predictions_test)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "3ecbe772e0e869a386d256c10cc6d948e50cd4df13a3f02e58ab4f2a666d7bf0"
  },
  "kernelspec": {
   "display_name": "Python 3.8.13 ('eks')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}