kleister-nda-clone/main.ipynb

259 lines
8.7 KiB
Plaintext
Raw Permalink Normal View History

2021-06-30 17:59:00 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "354bd187",
"metadata": {},
"outputs": [],
"source": [
"import regex as re\n",
"import pandas as pd\n",
"import us\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "64bf3f1e",
"metadata": {},
"outputs": [],
"source": [
"# Column names are supplied explicitly and shared by all three splits.\n",
"columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']\n",
"\n",
"# Read the tab-separated input of every split with one generator instead of\n",
"# three repeated calls.\n",
"data_train, data_dev, data_test = (\n",
"    pd.read_csv(path, sep='\\t', names=columns_names)\n",
"    for path in ('./train/in.tsv', './dev-0/in.tsv', './test-A/in.tsv')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "15fbf629",
"metadata": {},
"outputs": [],
"source": [
"# Maps a capitalised English month name to its zero-padded month number.\n",
"months = {\n",
"    name: f'{number:02d}'\n",
"    for number, name in enumerate(\n",
"        [\n",
"            'January', 'February', 'March', 'April', 'May', 'June',\n",
"            'July', 'August', 'September', 'October', 'November', 'December',\n",
"        ],\n",
"        start=1,\n",
"    )\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "958a45aa",
"metadata": {},
"outputs": [],
"source": [
"def transform_date_format(date):\n",
"    \"\"\"Convert a date tuple produced by get_effective_date into 'YYYY-MM-DD'.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    date : tuple of str or None\n",
"        Either a 4-tuple (full match, month, day, year), where month is a\n",
"        two-digit number or a month name and year has two or four digits, or\n",
"        a 5-tuple (full match, ordinal phrase, suffix phrase, month name,\n",
"        year) coming from the '6th day of January, 2012' notation.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The reformatted date, or '' when date is None or has a shape or year\n",
"        length this function does not know.\n",
"\n",
"    Notes\n",
"    -----\n",
"    Month names are resolved through the module-level `months` mapping.\n",
"    \"\"\"\n",
"    if date is None:\n",
"        return \"\"\n",
"\n",
"    if len(date) == 4:\n",
"        _, month, day, year = date\n",
"    elif len(date) == 5:\n",
"        ordinal, month, year = date[1], date[3], date[4]\n",
"        # Keep the whole day number ('16th day of ' -> '16'); a greedy\n",
"        # '.*(\\d+)' would only keep its last digit.\n",
"        digits = re.search(r'\\d+', ordinal)\n",
"        if digits is None:\n",
"            return \"\"\n",
"        day = digits.group()\n",
"    else:\n",
"        return \"\"\n",
"\n",
"    # Month names may arrive lower-cased ('january'); numeric months and\n",
"    # unknown values pass through unchanged.\n",
"    month = months.get(month.capitalize(), month)\n",
"\n",
"    if len(year) == 2:\n",
"        # Two-digit years: 00-49 are read as 20xx, 50-99 as 19xx.\n",
"        century = \"20\" if int(year[0]) < 5 else \"19\"\n",
"        year = century + year\n",
"    elif len(year) != 4:\n",
"        return \"\"\n",
"\n",
"    # Single-digit days from the ordinal notation are zero-padded.\n",
"    return year + \"-\" + month + \"-\" + day.zfill(2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b1cd2152",
"metadata": {},
"outputs": [],
"source": [
"def get_effective_date(text):\n",
"    \"\"\"Return the capture groups of the first date expression found in text.\n",
"\n",
"    Four notations are tried in a fixed priority order; the first notation\n",
"    that occurs anywhere in text wins, and its first occurrence is returned.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Document text to scan.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of str or None\n",
"        (full match, month, day, year) for the three plain notations, or\n",
"        (full match, ordinal phrase, suffix phrase, month name, year) for the\n",
"        'day of' notation. None when no notation matches.\n",
"    \"\"\"\n",
"    month_number = r'0[1-9]|1[0-2]'\n",
"    day_number = r'0[1-9]|[12][0-9]|3[01]'\n",
"    long_year = r'19[0-9][0-9]|20[0-9][0-9]'\n",
"    month_name = (\n",
"        r'[Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|'\n",
"        r'[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember'\n",
"    )\n",
"\n",
"    patterns = (\n",
"        # '04/18/01' -- the trailing (?!\\d) keeps this from matching the\n",
"        # first two digits of a four-digit year such as '01/21/2016'.\n",
"        rf'(?<!\\d)(({month_number})/({day_number})/([0-9][0-9]))(?!\\d)',\n",
"        # '01/21/2016'\n",
"        rf'(?<!\\d)(({month_number})/({day_number})/({long_year}))(?!\\d)',\n",
"        # 'January, 13 2021', 'February 28, 2011'\n",
"        rf'(({month_name})[,\\s]+({day_number})[,\\s]+({long_year}))',\n",
"        # '6th day of January, 2012', also '1st', '10th', '22nd', ...\n",
"        rf'(?<!\\d)(((?:3[01]|[12][0-9]|[1-9])((?:st|nd|rd|th)\\sday\\sof\\s))({month_name}),\\s({long_year}))',\n",
"    )\n",
"\n",
"    for pattern in patterns:\n",
"        found = re.search(pattern, text)\n",
"        if found is not None:\n",
"            return found.groups()\n",
"    return None"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0d7f45bb",
"metadata": {},
"outputs": [],
"source": [
"def get_terms(text):\n",
"    \"\"\"Collect the numeric fragments that directly precede 'years' or 'months'.\n",
"\n",
"    Fragments in front of 'years' take precedence; fragments in front of\n",
"    'months' are only returned when there is no 'years' fragment.\n",
"\n",
"    Returns a list of the raw matched strings, or None (implicitly) when\n",
"    neither unit is preceded by a matching fragment.\n",
"    \"\"\"\n",
"    unit_patterns = (\n",
"        r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=years)',\n",
"        r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=months)',\n",
"    )\n",
"    for unit_pattern in unit_patterns:\n",
"        fragments = re.findall(unit_pattern, text)\n",
"        if fragments:\n",
"            return fragments"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "065526eb",
"metadata": {},
"outputs": [],
"source": [
"def get_parties(text):\n",
"    \"\"\"Extract the first party named after the word 'between'.\n",
"\n",
"    The run of Latin letters, whitespace, dots and commas that follows the\n",
"    first usable 'between' is taken as the candidate. It is cut right after\n",
"    the earliest company suffix (Inc., INC., inc., LLC or llc) it contains.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Document text to scan.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The party name with commas removed and whitespace runs replaced by\n",
"        single underscores, or '' when no candidate or no suffix is found.\n",
"    \"\"\"\n",
"    company_suffixes = (\"Inc.\", \"INC.\", \"inc.\", \"LLC\", \"llc\")\n",
"\n",
"    # \\p{Latin} is a Unicode script class, so `re` must be the third-party\n",
"    # `regex` module (imported as `re` at the top of the notebook).\n",
"    # No lookahead is used: requiring a following character would drop the\n",
"    # last letter of a name that ends the text.\n",
"    candidates = re.findall(r'(?<=between)[\\p{Latin}\\s.,]+', text)\n",
"    if not candidates:\n",
"        return \"\"\n",
"\n",
"    prepared = \"_\".join(candidates[0].replace(\",\", \"\").split())\n",
"\n",
"    # Cut after whichever suffix appears first, so 'A LLC and B Inc.' yields\n",
"    # 'A_LLC' rather than everything up to 'Inc.'.\n",
"    cut_points = [\n",
"        prepared.index(suffix) + len(suffix)\n",
"        for suffix in company_suffixes\n",
"        if suffix in prepared\n",
"    ]\n",
"    if not cut_points:\n",
"        return \"\"\n",
"    return prepared[:min(cut_points)]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "19c2f9e9",
"metadata": {},
"outputs": [],
"source": [
"def get_jurisdiction(text):\n",
"    \"\"\"Guess which US state's law governs the document.\n",
"\n",
"    Two passes over the names in us.states.STATES are made:\n",
"\n",
"    1. a state name that follows the phrase 'laws of the' (separated only by\n",
"       word characters and whitespace), matched case-sensitively;\n",
"    2. if nothing was found, any whole-word mention of a state name, matched\n",
"       case-insensitively.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Document text to scan.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The state name with spaces replaced by underscores, or '' when no\n",
"        state is mentioned.\n",
"    \"\"\"\n",
"    state_names = [str(state) for state in us.states.STATES]\n",
"\n",
"    def pick(matched):\n",
"        # 'Virginia' also matches inside 'West Virginia'; skip a name that is\n",
"        # embedded in a longer name that matched as well. Otherwise the first\n",
"        # match in STATES order is kept.\n",
"        for name in matched:\n",
"            if not any(name != other and name in other for other in matched):\n",
"                return name.replace(\" \", \"_\")\n",
"        return \"\"\n",
"\n",
"    # \\b keeps 'Kansas' from matching inside 'Arkansas'.\n",
"    governing = [\n",
"        name for name in state_names\n",
"        if re.search(rf'(?<=laws\\sof\\sthe)[\\w\\s]*\\b{re.escape(name)}\\b', text)\n",
"    ]\n",
"    if governing:\n",
"        return pick(governing)\n",
"\n",
"    # Lower-case the text once instead of once per state.\n",
"    lowered = text.lower()\n",
"    mentioned = [\n",
"        name for name in state_names\n",
"        if re.search(rf'\\b{re.escape(name.lower())}\\b', lowered)\n",
"    ]\n",
"    return pick(mentioned)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "99778c65",
"metadata": {},
"outputs": [],
"source": [
"def process_parameters(params, text):\n",
"    \"\"\"Build the prediction line for one document.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    params : str\n",
"        Requested field names separated by single spaces.\n",
"    text : str\n",
"        Document text the values are extracted from.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        One ' name=value' entry per recognised name (effective_date,\n",
"        jurisdiction, party), in request order. Every entry, including the\n",
"        first, is preceded by a space; other names are skipped.\n",
"    \"\"\"\n",
"    # Extractors are wrapped in lambdas so that only requested fields are\n",
"    # computed.\n",
"    extractors = {\n",
"        'effective_date': lambda: transform_date_format(get_effective_date(text)),\n",
"        'jurisdiction': lambda: get_jurisdiction(text),\n",
"        'party': lambda: get_parties(text),\n",
"    }\n",
"\n",
"    entries = []\n",
"    for name in params.split(' '):\n",
"        extract = extractors.get(name)\n",
"        if extract is not None:\n",
"            entries.append(' ' + name + '=' + str(extract()))\n",
"    return ''.join(entries)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c39ea65a",
"metadata": {},
"outputs": [],
"source": [
"# Emit one prediction line per training document, in input order.\n",
"# UTF-8 is requested explicitly so the file does not depend on the platform's\n",
"# default text encoding.\n",
"with open('train/out.tsv', 'w', encoding='utf-8') as writer:\n",
"    for row_params, row_text in zip(data_train['params'], data_train['text1']):\n",
"        writer.write(process_parameters(row_params, row_text) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "741a34d6",
"metadata": {},
"outputs": [],
"source": [
"# Emit one prediction line per dev-0 document, in input order.\n",
"# UTF-8 is requested explicitly so the file does not depend on the platform's\n",
"# default text encoding.\n",
"with open('dev-0/out.tsv', 'w', encoding='utf-8') as writer:\n",
"    for row_params, row_text in zip(data_dev['params'], data_dev['text1']):\n",
"        writer.write(process_parameters(row_params, row_text) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8e8973f2",
"metadata": {},
"outputs": [],
"source": [
"# Emit one prediction line per test-A document, in input order.\n",
"# UTF-8 is requested explicitly so the file does not depend on the platform's\n",
"# default text encoding.\n",
"with open('test-A/out.tsv', 'w', encoding='utf-8') as writer:\n",
"    for row_params, row_text in zip(data_test['params'], data_test['text1']):\n",
"        writer.write(process_parameters(row_params, row_text) + \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}