259 lines
8.7 KiB
Plaintext
259 lines
8.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "354bd187",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
import csv
from collections import Counter

import pandas as pd
import regex as re
import us
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "64bf3f1e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Column layout of every in.tsv split (the files have no header row).
columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']

# quoting=csv.QUOTE_NONE: the text columns are free-form document text, so a
# double quote inside them must be read literally. With pandas' default
# quoting a stray '"' opens a quoted field that can swallow following tabs and
# newlines, which would merge rows and misalign them with the out.tsv lines.
read_options = dict(sep='\t', names=columns_names, quoting=csv.QUOTE_NONE)

data_train = pd.read_csv('./train/in.tsv', **read_options)
data_dev = pd.read_csv('./dev-0/in.tsv', **read_options)
data_test = pd.read_csv('./test-A/in.tsv', **read_options)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "15fbf629",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Month name -> zero-padded month number.
# Each month is registered under its capitalised AND its lower-case spelling,
# because the date patterns in get_effective_date accept both (e.g. "january"),
# and a name missing from this table ends up verbatim in the formatted date.
months = {
    spelling: f"{number:02d}"
    for number, name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
    for spelling in (name, name.lower())
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "958a45aa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def _month_number(token):
    """Return the two-digit month for a numeric token or an English month name, else None."""
    if token.isdigit():
        return token.zfill(2)
    # The date patterns also accept lower-case names; normalise before the lookup.
    return months.get(token.capitalize())


def transform_date_format(date):
    """Format a match tuple produced by get_effective_date as "YYYY-MM-DD".

    Parameters
    ----------
    date : tuple of str or None
        Either a 4-tuple ``(whole_match, month, day, year)`` where ``month`` is
        a number or a month name and ``year`` has two or four digits, or a
        5-tuple ``(whole_match, day_phrase, suffix_phrase, month_name, year)``
        where ``day_phrase`` looks like "6th day of ".

    Returns
    -------
    str
        The ISO-style date, or "" when ``date`` is None or cannot be
        interpreted (unexpected tuple length, unknown month, odd year length).
    """
    if date is None:
        return ""

    if len(date) == 4:
        month_token, day, year = (str(part) for part in date[1:])
    elif len(date) == 5:
        day_phrase, month_token, year = str(date[1]), str(date[3]), str(date[4])
        # Take the whole leading number ("16th day of " -> "16"); a greedy
        # ".*(\d+).*" would keep only its last digit.
        day_digits = re.search(r'\d+', day_phrase)
        if day_digits is None:
            return ""
        day = day_digits.group()
    else:
        return ""

    month = _month_number(month_token)
    if month is None:
        return ""

    if len(year) == 2:
        # Two-digit years: 00-49 are read as 20xx, 50-99 as 19xx.
        century = "20" if int(year[0]) < 5 else "19"
        year = century + year
    elif len(year) != 4:
        return ""

    return f"{year}-{month}-{day.zfill(2)}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "b1cd2152",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def get_effective_date(text):
    """Find the first date in ``text`` written in one of four known layouts.

    The layouts are tried in this order and the first layout that occurs at
    all wins; within that layout the first occurrence in the text is returned.

    1. "04/18/01"                  -> (whole, month, day, two_digit_year)
    2. "01/21/2016"                -> (whole, month, day, four_digit_year)
    3. "January, 13 2021" /
       "February 28, 2011"         -> (whole, month_name, day, four_digit_year)
    4. "6th day of January, 2012"  -> (whole, day_phrase, suffix_phrase,
                                       month_name, four_digit_year)

    Returns
    -------
    tuple of str or None
        The capture groups of the winning match (4 or 5 items as listed
        above), or None when no layout matches.
    """
    month_number = r'(0[1-9]|1[0-2])'
    day_number = r'(0[1-9]|[12][0-9]|3[01])'
    long_year = r'(19[0-9]{2}|20[0-9]{2})'
    month_name = (
        r'([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly|'
        r'[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)'
    )

    layouts = [
        # "04/18/01" - the trailing (?![\d/]) keeps this from matching the
        # first two digits of a four-digit year such as "01/21/2016".
        r'(?<![\d/])(' + month_number + '/' + day_number + r'/([0-9]{2}))(?![\d/])',
        # "01/21/2016"
        r'(?<![\d/])(' + month_number + '/' + day_number + '/' + long_year + r')(?!\d)',
        # "January, 13 2021", "February 28, 2011" (one- or two-digit day)
        '(' + month_name + r'[,\s]+(0[1-9]|[12][0-9]|3[01]|[1-9])[,\s]+' + long_year + r')(?!\d)',
        # "6th day of January, 2012" - the day digits are non-capturing so the
        # match keeps exactly five groups.
        r'(?<!\d)(((?:[12][0-9]|3[01]|0?[1-9])((?:st|nd|rd|th)\sday\sof\s))'
        + month_name + r',\s' + long_year + r')(?!\d)',
    ]

    for layout in layouts:
        found = re.findall(layout, text)
        if found:
            return found[0]
    return None
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "0d7f45bb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def get_terms(text):
    """Collect the numeric fragments that directly precede "year(s)" or "month(s)".

    A fragment is a run of digits, whitespace, dots, commas and parentheses
    that starts right after a whitespace character, e.g. "2 " in "of 2 years"
    or "(1) " in "one (1) year". Runs without any digit (such as the second
    blank of a double space) are discarded.

    Returns
    -------
    list of str or None
        The fragments found before "year"/"years" if there are any, otherwise
        those found before "month"/"months", otherwise None.
    """
    def numeric_runs(unit):
        runs = re.findall(r'(?<=\s)[0-9.\s,()]+(?=' + unit + r's?\b)', text)
        return [run for run in runs if any(ch.isdigit() for ch in run)]

    year_terms = numeric_runs('year')
    if year_terms:
        return year_terms

    month_terms = numeric_runs('month')
    if month_terms:
        return month_terms

    return None
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "065526eb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def get_parties(text):
    """Extract the first party named after the word "between".

    The run of Latin letters, whitespace, dots and commas that follows the
    first "between" is taken, commas are removed, surrounding whitespace is
    stripped and spaces become underscores. The result is cut right after the
    company suffix ("Inc.", "inc.", "INC.", "LLC" or "llc") that occurs
    earliest in that run, so "Foo LLC and Bar Inc." yields "Foo_LLC".

    Note: ``\\p{Latin}`` needs the third-party ``regex`` module, which this
    notebook imports under the name ``re``.

    Returns
    -------
    str
        The normalised party name, or "" when there is no "between" run or the
        run contains none of the known suffixes.
    """
    # No trailing lookahead here: one ending in a bare "." accepts any
    # following character and so only ever fails at end of text, where it
    # strips the final character and loses a trailing "Inc.".
    fragments = re.findall(r'(?<=between)[\p{Latin}\s.,]+', text)
    if not fragments:
        return ""

    prepared = fragments[0].replace(",", "").strip().replace(" ", "_")

    # (position, suffix) of every known suffix present; the smallest position
    # marks the end of the first party.
    hits = [
        (prepared.index(suffix), suffix)
        for suffix in ("Inc.", "inc.", "INC.", "LLC", "llc")
        if suffix in prepared
    ]
    if not hits:
        return ""

    position, suffix = min(hits)
    return prepared[:position + len(suffix)]
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "19c2f9e9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def get_jurisdiction(text):
    """Guess the US state whose law governs the document.

    First looks for a state name inside a "laws of the ..." phrase (only word
    characters and whitespace may sit between "the" and the state name). If
    none is found, falls back to the first state, in the order of
    ``us.states.STATES``, that is mentioned anywhere in the text
    (case-insensitive, whole words).

    A name that matched only because it is part of a longer matched name is
    skipped, so "West Virginia" is not reported as "Virginia".

    Returns
    -------
    str
        The state name with spaces replaced by underscores, or "" when no
        state is found.
    """
    state_names = [str(state) for state in us.states.STATES]

    def first_unshadowed(found):
        # Keep the original priority order, but ignore a name contained in
        # another name that was also found.
        for name in found:
            if not any(name != other and name in other for other in found):
                return name
        return None

    in_governing_clause = [
        name for name in state_names
        if re.search(rf"(?<=laws\sof\sthe)[\w\s]*\b{re.escape(name)}\b", text)
    ]
    chosen = first_unshadowed(in_governing_clause)

    if chosen is None:
        lowered = text.lower()  # hoisted: lower-case the text once, not once per state
        mentioned = [
            name for name in state_names
            if re.search(rf"\b{re.escape(name.lower())}\b", lowered)
        ]
        chosen = first_unshadowed(mentioned)

    if chosen is None:
        return ""
    return chosen.replace(" ", "_")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "99778c65",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def process_parameters(params, text):
    """Build the prediction line for one document.

    Parameters
    ----------
    params : str
        Whitespace-separated names of the attributes to predict. Only
        "effective_date", "jurisdiction" and "party" are handled; any other
        name is skipped.
    text : str
        The document text the attributes are extracted from.

    Returns
    -------
    str
        ``name=value`` items joined by single spaces, in the order requested
        and without a leading space. "" when nothing is requested or
        ``params`` is not a string (e.g. NaN from an empty TSV cell).
    """
    if not isinstance(params, str):
        return ""
    if not isinstance(text, str):
        # An empty text cell arrives as NaN; treat it as empty text so the
        # regex-based extractors do not raise.
        text = ""

    predictions = []
    for param in params.split():
        if param == "effective_date":
            value = transform_date_format(get_effective_date(text))
        elif param == "jurisdiction":
            value = get_jurisdiction(text)
        elif param == "party":
            value = get_parties(text)
        else:
            continue
        predictions.append(f"{param}={value}")
    return " ".join(predictions)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "c39ea65a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Write one prediction line per training document, in input order.
# The encoding is explicit so the file does not depend on the platform default
# (extracted party names may contain non-ASCII Latin letters).
with open('train/out.tsv', 'w', encoding='utf-8') as writer:
    for params, text in zip(data_train['params'], data_train['text1']):
        writer.write(process_parameters(params, text) + "\n")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "741a34d6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Write one prediction line per dev-0 document, in input order.
# The encoding is explicit so the file does not depend on the platform default
# (extracted party names may contain non-ASCII Latin letters).
with open('dev-0/out.tsv', 'w', encoding='utf-8') as writer:
    for params, text in zip(data_dev['params'], data_dev['text1']):
        writer.write(process_parameters(params, text) + "\n")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "8e8973f2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Write one prediction line per test-A document, in input order.
# The encoding is explicit so the file does not depend on the platform default
# (extracted party names may contain non-ASCII Latin letters).
with open('test-A/out.tsv', 'w', encoding='utf-8') as writer:
    for params, text in zip(data_test['params'], data_test['text1']):
        writer.write(process_parameters(params, text) + "\n")
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|