{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "354bd187",
   "metadata": {},
   "outputs": [],
   "source": [
    "import regex as re\n",
    "import pandas as pd\n",
    "import us\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "64bf3f1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All three in.tsv splits are read with the same separator and the same\n",
    "# explicit column names.\n",
    "columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']\n",
    "tsv_options = {'sep': '\\t', 'names': columns_names}\n",
    "\n",
    "data_train = pd.read_csv('./train/in.tsv', **tsv_options)\n",
    "data_dev = pd.read_csv('./dev-0/in.tsv', **tsv_options)\n",
    "data_test = pd.read_csv('./test-A/in.tsv', **tsv_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "15fbf629",
   "metadata": {},
   "outputs": [],
   "source": [
    "months = {\n",
    "    'January': '01',\n",
    "    'February': '02',\n",
    "    'March': '03',\n",
    "    'April': '04',\n",
    "    'May': '05',\n",
    "    'June': '06',\n",
    "    'July': '07',\n",
    "    'August': '08',\n",
    "    'September': '09',\n",
    "    'October': '10',\n",
    "    'November': '11',\n",
    "    'December': '12'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "958a45aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_date_format(date):\n",
    "    if date != None:\n",
    "        if len(date) == 4:\n",
    "            # Check if month is string\n",
    "            try:\n",
    "                month = months[date[1]] \n",
    "            except(KeyError):\n",
    "                month = None\n",
    "            # If year has 4-digit\n",
    "            if len(date[3]) == 2:\n",
    "                if int(date[3][0]) < 5:\n",
    "                    if month != None:\n",
    "                        return \"20\"+str(date[3])+\"-\"+str(month)+\"-\" + str(date[2])\n",
    "                    else:\n",
    "                        return \"20\"+str(date[3])+\"-\"+str(date[1])+\"-\" + str(date[2])\n",
    "                else:\n",
    "                    if int(date[3][0]) < 5:\n",
    "                        if month != None:\n",
    "                            return \"19\"+str(date[3])+\"-\"+str(month)+\"-\" + str(date[2])\n",
    "                        else:\n",
    "                            return \"19\"+str(date[3])+\"-\"+str(date[1])+\"-\" + str(date[2])\n",
    "            elif len(date[3]) == 4:\n",
    "                if month != None:\n",
    "                    return str(date[3])+\"-\"+str(month)+\"-\" + str(date[2])\n",
    "                else:\n",
    "                    return str(date[3])+\"-\"+str(date[1])+\"-\" + str(date[2])\n",
    "        elif len(date) == 5:\n",
    "            return str(date[4])+\"-\"+str(months[date[3]])+\"-\"+re.findall(r'.*(\\d+).*', date[1])[0]\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b1cd2152",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_effective_date(text):\n",
    "\n",
    "    # Date format \"04/18/01\"\n",
    "    first_format = re.findall(r'((1[0-2]|0[1-9])/(0[1-9]|[1|2][0-9]|3[0-1])/(0[1-9]|[1-9][0-9]))', text)\n",
    "\n",
    "    # Date format \"01/21/2016\"\n",
    "    sec_format = re.findall(r'((0[1-9]|1[1-2])/(0[1-9]|1[1-9]|2[1-9]|3[0-1])/(19[0-9][0-9]|20[0-9][0-9]))', text)\n",
    "\n",
    "    # Date format \"January, 13 2021\", \"February 28, 2011\"\n",
    "    third_format = re.findall(r'(([j|J]anuary|[f|F]ebruary|[m|M]arch|[A|a]pril|[M|m]ay|[J|j]une|[J|j]uly|[A|a]ugust|[S|s]eptember|[O|o]ctober|[n|N]ovember|[d|D]ecember)[,\\s|\\s]+(0[1-9]|[1-2][0-9]|3[0-1])[,\\s|\\s|,]+(19[0-9][0-9]|20[0-9][0-9]))', text)\n",
    "\n",
    "    # Date format \"6th day of January, 2012\"\n",
    "    fourth_format = re.findall(r'(([1-9]+(th\\sday\\sof\\s|rd\\sday\\sof\\s|nd\\sday\\sof\\s))([j|J]anuary|[f|F]ebruary|[m|M]arch|[A|a]pril|[M|m]ay|[J|j]une|[J|j]uly|[A|a]ugust|[S|s]eptember|[O|o]ctober|[n|N]ovember|[d|D]ecember),\\s(19[0-9][0-9]|20[0-9][0-9]))', text)\n",
    "\n",
    "    dates = []\n",
    "\n",
    "    for format in [first_format, sec_format, third_format, fourth_format]:\n",
    "        if len(format) > 0:\n",
    "            dates = format\n",
    "            return dates[0]\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0d7f45bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_terms(text):\n",
    "    years = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=years)', text)\n",
    "    months = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=months)', text)\n",
    "    if len(years) > 0:\n",
    "        return years\n",
    "    if len(months) > 0:\n",
    "        return months"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "065526eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parties(text):\n",
    "    first_party = re.findall(r'(?<=between)[\\p{Latin}\\s.,]+(?=Inc.|INC.|LLC|llc|,|.)', text)\n",
    "    if len(first_party) > 0:\n",
    "        if \"Inc.\" in first_party[0]:\n",
    "            prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",
    "            return prepare[:prepare.index(\"Inc.\") + len(\"Inc.\")]\n",
    "        if \"inc.\" in first_party[0]:\n",
    "            prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",
    "            return prepare[:prepare.index(\"inc.\") + len(\"inc.\")]\n",
    "        if \"LLC\" in first_party[0]:\n",
    "            prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",
    "            return prepare[:prepare.index(\"LLC\") + len(\"LLC\")]\n",
    "        if \"llc\" in first_party[0]:\n",
    "            prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",
    "            return prepare[:prepare.index(\"llc\") + len(\"llc\")]\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "19c2f9e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jurisdiction(text):\n",
    "    for state in us.states.STATES:\n",
    "        if re.search(rf\"(?<=laws\\sof\\sthe)[\\w\\s]*{str(state)}\\s*(?=,|.)\", text):\n",
    "            return str(state).replace(\" \", \"_\")\n",
    "    for state in us.states.STATES:\n",
    "        if re.search(rf\"(.*{str(state).lower()}.*)\", text.lower()):\n",
    "            return str(state).replace(\" \", \"_\")            \n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "99778c65",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_parameters(params, text):\n",
    "    params_result = \"\"\n",
    "    for param in params.split(\" \"):\n",
    "        if param == \"effective_date\":\n",
    "            params_result += \" \" + \"effective_date=\" + str(transform_date_format(get_effective_date(text)))\n",
    "        elif param == \"jurisdiction\":\n",
    "            params_result += \" \" + \"jurisdiction=\" + str(get_jurisdiction(text))\n",
    "        elif param == \"party\":\n",
    "            params_result += \" \" + \"party=\" + str(get_parties(text))\n",
    "    return params_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c39ea65a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One output line per row of data_train, in row order.\n",
    "with open('train/out.tsv', 'w') as writer:\n",
    "    writer.writelines(\n",
    "        process_parameters(params, text) + \"\\n\"\n",
    "        for params, text in zip(data_train['params'], data_train['text1'])\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "741a34d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One output line per row of data_dev, in row order.\n",
    "with open('dev-0/out.tsv', 'w') as writer:\n",
    "    writer.writelines(\n",
    "        process_parameters(params, text) + \"\\n\"\n",
    "        for params, text in zip(data_dev['params'], data_dev['text1'])\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8e8973f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One output line per row of data_test, in row order.\n",
    "with open('test-A/out.tsv', 'w') as writer:\n",
    "    writer.writelines(\n",
    "        process_parameters(params, text) + \"\\n\"\n",
    "        for params, text in zip(data_test['params'], data_test['text1'])\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}