{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a3d91c5e",
   "metadata": {},
   "source": [
    "# Rule-based field extraction\n",
    "\n",
    "Extracts `effective_date`, `jurisdiction` and `party` values from document text with regular expressions and writes one prediction line per document to `out.tsv` for the `train`, `dev-0` and `test-A` splits."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "354bd187",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import regex as re\n",
    "import pandas as pd\n",
    "import us\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5be07f21",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64bf3f1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']\n",
    "\n",
    "\n",
    "def read_input(path):\n",
    "    \"\"\"Read a headerless tab-separated input file into a DataFrame.\n",
    "\n",
    "    Quoting is disabled so that double quotes inside the text columns are\n",
    "    read as ordinary characters instead of as field delimiters.\n",
    "    \"\"\"\n",
    "    return pd.read_csv(path, sep='\\t', names=columns_names, quoting=csv.QUOTE_NONE)\n",
    "\n",
    "\n",
    "data_train = read_input('./train/in.tsv')\n",
    "data_dev = read_input('./dev-0/in.tsv')\n",
    "data_test = read_input('./test-A/in.tsv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c84e1d6a",
   "metadata": {},
   "source": [
    "## Extraction helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15fbf629",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Month name -> two-digit month number; used to match and to format dates.\n",
    "months = {\n",
    "    'January': '01',\n",
    "    'February': '02',\n",
    "    'March': '03',\n",
    "    'April': '04',\n",
    "    'May': '05',\n",
    "    'June': '06',\n",
    "    'July': '07',\n",
    "    'August': '08',\n",
    "    'September': '09',\n",
    "    'October': '10',\n",
    "    'November': '11',\n",
    "    'December': '12'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "958a45aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_date_format(date):\n",
    "    \"\"\"Turn a match tuple from `get_effective_date` into a 'YYYY-MM-DD' string.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    date : tuple of str or None\n",
    "        Either `(full, month, day, year)`, where `month` is a two-digit number\n",
    "        or a month name and `year` has two or four digits, or\n",
    "        `(full, day_phrase, suffix, month_name, year)`, where `day_phrase`\n",
    "        looks like '16th day of '.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The formatted date, or '' when `date` is None or has another shape.\n",
    "    \"\"\"\n",
    "    if date is None:\n",
    "        return ''\n",
    "\n",
    "    if len(date) == 4:\n",
    "        month, day, year = date[1], date[2], date[3]\n",
    "    elif len(date) == 5:\n",
    "        # Take the whole leading number of the phrase ('16' from '16th day of ').\n",
    "        day_number = re.search(r'\\d+', date[1])\n",
    "        if day_number is None:\n",
    "            return ''\n",
    "        month, day, year = date[3], day_number.group(), date[4]\n",
    "    else:\n",
    "        return ''\n",
    "\n",
    "    # Month names become their number; numeric months are kept unchanged.\n",
    "    month = months.get(month.capitalize(), month)\n",
    "\n",
    "    if len(year) == 2:\n",
    "        # Two-digit years: 00-49 are read as 20xx, 50-99 as 19xx.\n",
    "        year = ('20' if int(year[0]) < 5 else '19') + year\n",
    "    elif len(year) != 4:\n",
    "        return ''\n",
    "\n",
    "    return year + '-' + month + '-' + day.zfill(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1cd2152",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_effective_date(text):\n",
    "    \"\"\"Find a date in `text` written in one of the supported formats.\n",
    "\n",
    "    The formats are tried in this order and the first match of the first\n",
    "    format that occurs anywhere in the text is returned:\n",
    "\n",
    "    1. '04/18/01'\n",
    "    2. '01/21/2016'\n",
    "    3. 'January, 13 2021', 'February 28, 2011'\n",
    "    4. '6th day of January, 2012'\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of str or None\n",
    "        `(full, month, day, year)` for formats 1-3,\n",
    "        `(full, day_phrase, suffix, month_name, year)` for format 4,\n",
    "        or None when nothing matches or `text` is not a string.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return None\n",
    "\n",
    "    # The first letter of a month name may be upper or lower case: '[Jj]anuary|...'.\n",
    "    month_name = '(' + '|'.join('[' + name[0] + name[0].lower() + ']' + name[1:] for name in months) + ')'\n",
    "    month_num = r'(0[1-9]|1[0-2])'\n",
    "    day_num = r'(0[1-9]|[12][0-9]|3[01])'\n",
    "    day_any = r'[12][0-9]|3[01]|0?[1-9]'\n",
    "    year_4 = r'(19[0-9]{2}|20[0-9]{2})'\n",
    "    ordinal = r'(st\\sday\\sof\\s|nd\\sday\\sof\\s|rd\\sday\\sof\\s|th\\sday\\sof\\s)'\n",
    "\n",
    "    patterns = [\n",
    "        # '04/18/01'; the digit lookarounds stop this from matching the front of '04/18/2001'.\n",
    "        r'((?<!\\d)' + month_num + '/' + day_num + r'/([0-9]{2})(?!\\d))',\n",
    "        # '01/21/2016'\n",
    "        r'((?<!\\d)' + month_num + '/' + day_num + '/' + year_4 + r'(?!\\d))',\n",
    "        # 'January, 13 2021', 'February 28, 2011'\n",
    "        '(' + month_name + r'[,\\s]+(' + day_any + r')[,\\s]+' + year_4 + ')',\n",
    "        # '6th day of January, 2012'\n",
    "        r'(((?<!\\d)(?:' + day_any + ')' + ordinal + ')' + month_name + r',\\s' + year_4 + ')',\n",
    "    ]\n",
    "\n",
    "    for pattern in patterns:\n",
    "        match = re.search(pattern, text)\n",
    "        if match:\n",
    "            return match.groups()\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d7f45bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_terms(text):\n",
    "    \"\"\"Return the raw number phrases that directly precede 'years' or 'months'.\n",
    "\n",
    "    Phrases before 'years' take priority; phrases before 'months' are used only\n",
    "    when there are none. Returns a list of str, or None when neither is found.\n",
    "    \"\"\"\n",
    "    year_terms = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=years)', text)\n",
    "    if year_terms:\n",
    "        return year_terms\n",
    "    month_terms = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=months)', text)\n",
    "    if month_terms:\n",
    "        return month_terms\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "065526eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parties(text):\n",
    "    \"\"\"Extract the first company named after the word 'between'.\n",
    "\n",
    "    The run of Latin letters, whitespace, dots and commas that follows the\n",
    "    first 'between' is cut right after its earliest 'Inc.', 'INC.', 'inc.',\n",
    "    'LLC' or 'llc'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The name with commas removed and whitespace runs replaced by '_',\n",
    "        or '' when there is no such name or `text` is not a string.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return ''\n",
    "\n",
    "    candidate = re.search(r'(?<=between)[\\p{Latin}\\s.,]+', text)\n",
    "    if candidate is None:\n",
    "        return ''\n",
    "\n",
    "    # Earliest suffix wins, so 'Foo LLC and Bar Inc.' yields only 'Foo LLC'.\n",
    "    suffix = re.search(r'Inc\\.|INC\\.|inc\\.|LLC|llc', candidate.group())\n",
    "    if suffix is None:\n",
    "        return ''\n",
    "\n",
    "    name = candidate.group()[:suffix.end()].replace(',', '')\n",
    "    return '_'.join(name.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19c2f9e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jurisdiction(text):\n",
    "    \"\"\"Find the US state that `text` names as its jurisdiction.\n",
    "\n",
    "    The state closest after the first usable 'laws of the' phrase is preferred\n",
    "    (only word characters and whitespace may stand in between). Otherwise the\n",
    "    first state name mentioned anywhere in the text, ignoring case, is used.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The state name with spaces replaced by '_', or '' when no state is\n",
    "        found or `text` is not a string.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return ''\n",
    "\n",
    "    # Longer names go first in the alternation so 'West Virginia' wins over 'Virginia'.\n",
    "    names = sorted((str(state) for state in us.states.STATES), key=len, reverse=True)\n",
    "\n",
    "    exact = '|'.join(re.escape(name) for name in names)\n",
    "    governed = re.search(r'(?<=laws\\sof\\sthe)[\\w\\s]*?\\b(' + exact + r')\\b', text)\n",
    "    if governed:\n",
    "        return governed.group(1).replace(' ', '_')\n",
    "\n",
    "    canonical = {name.lower(): name for name in names}\n",
    "    lowered = '|'.join(re.escape(name.lower()) for name in names)\n",
    "    mentioned = re.search(r'\\b(' + lowered + r')\\b', text.lower())\n",
    "    if mentioned:\n",
    "        return canonical[mentioned.group(1)].replace(' ', '_')\n",
    "    return ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99778c65",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_parameters(params, text):\n",
    "    \"\"\"Build the prediction line for one document.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    params : str\n",
    "        Whitespace-separated names of the requested fields. Only\n",
    "        'effective_date', 'jurisdiction' and 'party' are handled; other names\n",
    "        are ignored.\n",
    "    text : str\n",
    "        The document text. A non-string value is treated as empty text.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        'key=value' pairs joined by single spaces, in the order of `params`.\n",
    "        Fields for which nothing was found are left out.\n",
    "    \"\"\"\n",
    "    if not isinstance(params, str):\n",
    "        return ''\n",
    "    if not isinstance(text, str):\n",
    "        text = ''\n",
    "\n",
    "    extractors = {\n",
    "        'effective_date': lambda document: transform_date_format(get_effective_date(document)),\n",
    "        'jurisdiction': get_jurisdiction,\n",
    "        'party': get_parties,\n",
    "    }\n",
    "\n",
    "    pairs = []\n",
    "    for param in params.split():\n",
    "        extractor = extractors.get(param)\n",
    "        if extractor is None:\n",
    "            continue\n",
    "        value = extractor(text)\n",
    "        if value:\n",
    "            pairs.append(param + '=' + str(value))\n",
    "    return ' '.join(pairs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f6b90d7",
   "metadata": {},
   "source": [
    "## Write predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e17a4c3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_predictions(data, out_path):\n",
    "    \"\"\"Write one prediction line per row of `data` to `out_path`.\n",
    "\n",
    "    Each line is built by `process_parameters` from the row's 'params' and\n",
    "    'text1' columns. The file is overwritten and encoded as UTF-8.\n",
    "    \"\"\"\n",
    "    with open(out_path, 'w', encoding='utf-8') as writer:\n",
    "        for params, text in zip(data['params'], data['text1']):\n",
    "            writer.write(process_parameters(params, text) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c39ea65a",
   "metadata": {},
   "outputs": [],
   "source": [
    "write_predictions(data_train, 'train/out.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "741a34d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "write_predictions(data_dev, 'dev-0/out.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e8973f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "write_predictions(data_test, 'test-A/out.tsv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}