kleister-nda-clone/main.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "354bd187",
   "metadata": {},
   "outputs": [],
   "source": [
    "import regex as re\n",
    "import pandas as pd\n",
    "import us\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "64bf3f1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names supplied explicitly to read_csv for every in.tsv file.\n",
    "columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']\n",
    "\n",
    "# Read the three splits with identical parsing options.\n",
    "data_train, data_dev, data_test = [\n",
    "    pd.read_csv(split_path, sep='\\t', names=columns_names)\n",
    "    for split_path in ('./train/in.tsv', './dev-0/in.tsv', './test-A/in.tsv')\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "15fbf629",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup from capitalised English month name to its zero-padded number.\n",
    "months = {\n",
    "    name: f'{number:02d}'\n",
    "    for number, name in enumerate(\n",
    "        ['January', 'February', 'March', 'April', 'May', 'June', 'July',\n",
    "         'August', 'September', 'October', 'November', 'December'],\n",
    "        start=1,\n",
    "    )\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "958a45aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_date_format(date):\n",
    "    \"\"\"Turn a match tuple from get_effective_date into a 'YYYY-MM-DD' string.\n",
    "\n",
    "    Two tuple layouts are understood:\n",
    "      * 4 items: (full match, month, day, year), where month is numeric or\n",
    "        a month name and year has 2 or 4 digits;\n",
    "      * 5 items: (full match, 'Nth day of ', suffix, month name, year).\n",
    "\n",
    "    Returns an empty string for None and for tuples that fit neither layout.\n",
    "    \"\"\"\n",
    "    if date is None:\n",
    "        return \"\"\n",
    "    if len(date) == 4:\n",
    "        _, month, day, year = date\n",
    "    elif len(date) == 5:\n",
    "        _, day, _, month, year = date\n",
    "    else:\n",
    "        return \"\"\n",
    "\n",
    "    # Month names may arrive with a lower-case initial, so normalise before\n",
    "    # the lookup; numeric months are already final and pass through as-is.\n",
    "    if not month.isdigit():\n",
    "        month = months.get(month.capitalize(), month)\n",
    "\n",
    "    # Keep every digit of the day (the ordinal layout embeds it in text such\n",
    "    # as '16th day of ') and pad it to two characters.\n",
    "    day_digits = re.search(r'\\d+', day)\n",
    "    if day_digits:\n",
    "        day = day_digits.group().zfill(2)\n",
    "\n",
    "    if len(year) == 2:\n",
    "        # Two-digit years: a first digit below 5 means 20xx, otherwise 19xx.\n",
    "        year = (\"20\" if int(year[0]) < 5 else \"19\") + year\n",
    "    elif len(year) != 4:\n",
    "        return \"\"\n",
    "    return year + \"-\" + month + \"-\" + day"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b1cd2152",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_effective_date(text):\n",
    "    \"\"\"Return the first date-like match found in text, or None.\n",
    "\n",
    "    Four layouts are tried in a fixed order; the first layout that occurs\n",
    "    anywhere in the text wins and its first match is returned as a tuple:\n",
    "      1. '04/18/01'                 -> (full, month, day, 2-digit year)\n",
    "      2. '01/21/2016'               -> (full, month, day, 4-digit year)\n",
    "      3. 'February 28, 2011'        -> (full, month name, day, year)\n",
    "      4. '6th day of January, 2012' -> (full, 'Nth day of ', suffix, month name, year)\n",
    "    Day and month strings are returned exactly as written in the text.\n",
    "    \"\"\"\n",
    "    month_num = r'(1[0-2]|0[1-9])'\n",
    "    day_num = r'(0[1-9]|[12][0-9]|3[01])'\n",
    "    year_4 = r'((?:19|20)[0-9][0-9])'\n",
    "    month_name = (r'([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly'\n",
    "                  r'|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)')\n",
    "\n",
    "    patterns = [\n",
    "        # The trailing (?!\\d) stops '01/21/2016' from being read as '01/21/20'.\n",
    "        r'(?<!\\d)(' + month_num + '/' + day_num + r'/([0-9][0-9]))(?!\\d)',\n",
    "        r'(?<!\\d)(' + month_num + '/' + day_num + '/' + year_4 + r')(?!\\d)',\n",
    "        # Two-digit days are tried before the single-digit alternative.\n",
    "        r'\\b(' + month_name + r'[,\\s]+(0[1-9]|[12][0-9]|3[01]|[1-9])[,\\s]+' + year_4 + r')(?!\\d)',\n",
    "        # Any ordinal suffix is accepted and the comma after the month is optional.\n",
    "        r'(?<!\\d)(([0-9]{1,2}((?:st|nd|rd|th)\\s+day\\s+of\\s+))' + month_name + r',?\\s+' + year_4 + r')(?!\\d)',\n",
    "    ]\n",
    "\n",
    "    for pattern in patterns:\n",
    "        found = re.findall(pattern, text)\n",
    "        if found:\n",
    "            return found[0]\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0d7f45bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_terms(text):\n",
    "    \"\"\"Collect the numeric text that directly precedes 'years' or 'months'.\n",
    "\n",
    "    Matches before 'years' take precedence; only when there are none are the\n",
    "    matches before 'months' returned. With no match of either kind the\n",
    "    result is None.\n",
    "    \"\"\"\n",
    "    year_terms = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=years)', text)\n",
    "    if year_terms:\n",
    "        return year_terms\n",
    "    month_terms = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=months)', text)\n",
    "    return month_terms if month_terms else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "065526eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parties(text):\n",
    "    \"\"\"Extract the first company named after the word 'between'.\n",
    "\n",
    "    The run of Latin letters, whitespace, dots and commas that follows\n",
    "    'between' is cut right after the earliest company suffix (Inc., INC.,\n",
    "    inc., LLC or llc). Commas and whitespace runs become single underscores.\n",
    "    Returns an empty string when no such run or no known suffix is found.\n",
    "    \"\"\"\n",
    "    # \\p{Latin} needs the third-party `regex` package, imported here as `re`.\n",
    "    runs = re.findall(r'(?<=between)[\\p{Latin}\\s.,]+', text)\n",
    "    if not runs:\n",
    "        return \"\"\n",
    "    # Treat commas as separators, then collapse whitespace runs to one space.\n",
    "    candidate = \" \".join(runs[0].replace(\",\", \" \").split())\n",
    "    # Cut after the suffix that occurs first so that a second company later in\n",
    "    # the run is not swallowed; \\b keeps words like 'Hillcrest' from matching.\n",
    "    suffix = re.search(r'\\b(?:Inc\\.|INC\\.|inc\\.|LLC\\b|llc\\b)', candidate)\n",
    "    if suffix is None:\n",
    "        return \"\"\n",
    "    return candidate[:suffix.end()].replace(\" \", \"_\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "19c2f9e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jurisdiction(text):\n",
    "    \"\"\"Guess the US state whose law governs the document.\n",
    "\n",
    "    States named in the run of words that follows 'laws of the' are\n",
    "    preferred; failing that, any case-insensitive mention of a state name\n",
    "    counts. When several states qualify, the one listed first in\n",
    "    us.states.STATES is returned. Spaces in the name become '_'; an empty\n",
    "    string means no state was found.\n",
    "    \"\"\"\n",
    "    names = [str(state) for state in us.states.STATES]\n",
    "    # Longest names first so 'West Virginia' is consumed as a whole, and \\b\n",
    "    # so that a word such as 'remained' is not read as a mention of Maine.\n",
    "    any_state = r'\\b(?:' + '|'.join(\n",
    "        re.escape(name) for name in sorted(names, key=len, reverse=True)\n",
    "    ) + r')\\b'\n",
    "\n",
    "    in_clause = set()\n",
    "    for clause in re.findall(r'(?<=laws\\sof\\sthe)[\\w\\s]+', text):\n",
    "        in_clause.update(re.findall(any_state, clause))\n",
    "    for name in names:\n",
    "        if name in in_clause:\n",
    "            return name.replace(\" \", \"_\")\n",
    "\n",
    "    # Fallback: a state mentioned anywhere in the text, in any letter case.\n",
    "    anywhere = {hit.lower() for hit in re.findall(any_state, text, flags=re.IGNORECASE)}\n",
    "    for name in names:\n",
    "        if name.lower() in anywhere:\n",
    "            return name.replace(\" \", \"_\")\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "99778c65",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_parameters(params, text):\n",
    "    \"\"\"Build the space-separated 'key=value' prediction line for one document.\n",
    "\n",
    "    params: space-separated names of the requested fields.\n",
    "    text:   document text handed to the extractors.\n",
    "\n",
    "    Only effective_date, jurisdiction and party are handled; any other name\n",
    "    is skipped. Fields are joined by single spaces, with no leading space.\n",
    "    \"\"\"\n",
    "    # pandas may yield NaN (a float) for an empty cell; treat it as empty text\n",
    "    # so the regex-based extractors do not raise a TypeError.\n",
    "    if not isinstance(text, str):\n",
    "        text = \"\"\n",
    "\n",
    "    extractors = {\n",
    "        \"effective_date\": lambda: transform_date_format(get_effective_date(text)),\n",
    "        \"jurisdiction\": lambda: get_jurisdiction(text),\n",
    "        \"party\": lambda: get_parties(text),\n",
    "    }\n",
    "\n",
    "    fields = []\n",
    "    for param in params.split(\" \"):\n",
    "        extract = extractors.get(param)\n",
    "        if extract is not None:\n",
    "            fields.append(param + \"=\" + str(extract()))\n",
    "    return \" \".join(fields)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c39ea65a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one prediction line per row of data_train, in row order.\n",
    "# The encoding is pinned so that non-ASCII party names are written the same\n",
    "# way on every platform instead of relying on the locale default.\n",
    "with open('train/out.tsv', 'w', encoding='utf-8') as writer:\n",
    "    for params, text in zip(data_train['params'], data_train['text1']):\n",
    "        writer.write(process_parameters(params, text) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "741a34d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one prediction line per row of data_dev, in row order.\n",
    "# The encoding is pinned so that non-ASCII party names are written the same\n",
    "# way on every platform instead of relying on the locale default.\n",
    "with open('dev-0/out.tsv', 'w', encoding='utf-8') as writer:\n",
    "    for params, text in zip(data_dev['params'], data_dev['text1']):\n",
    "        writer.write(process_parameters(params, text) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8e8973f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one prediction line per row of data_test, in row order.\n",
    "# The encoding is pinned so that non-ASCII party names are written the same\n",
    "# way on every platform instead of relying on the locale default.\n",
    "with open('test-A/out.tsv', 'w', encoding='utf-8') as writer:\n",
    "    for params, text in zip(data_test['params'], data_test['text1']):\n",
    "        writer.write(process_parameters(params, text) + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
kleister nda 2021-06-30 17:59:00 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"id": "354bd187",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import regex as re\n",`
			`"import pandas as pd\n",`
			`"import us\n",`
			`"from collections import Counter"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"id": "64bf3f1e",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']\n",`
			`"data_train = pd.read_csv('./train/in.tsv', sep='\\t', names=columns_names)\n",`
			`"data_dev = pd.read_csv('./dev-0/in.tsv', sep='\\t', names=columns_names)\n",`
			`"data_test = pd.read_csv('./test-A/in.tsv', sep='\\t', names=columns_names)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"id": "15fbf629",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"months = {\n",`
			`" 'January': '01',\n",`
			`" 'February': '02',\n",`
			`" 'March': '03',\n",`
			`" 'April': '04',\n",`
			`" 'May': '05',\n",`
			`" 'June': '06',\n",`
			`" 'July': '07',\n",`
			`" 'August': '08',\n",`
			`" 'September': '09',\n",`
			`" 'October': '10',\n",`
			`" 'November': '11',\n",`
			`" 'December': '12'\n",`
			`"}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 8,`
			`"id": "958a45aa",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def transform_date_format(date):\n",`
			`" if date != None:\n",`
			`" if len(date) == 4:\n",`
			`" # Check if month is string\n",`
			`" try:\n",`
			`" month = months[date[1]] \n",`
			`" except(KeyError):\n",`
			`" month = None\n",`
			`" # If year has 4-digit\n",`
			`" if len(date[3]) == 2:\n",`
			`" if int(date[3][0]) < 5:\n",`
			`" if month != None:\n",`
			`" return \"20\"+str(date[3])+\"-\"+str(month)+\"-\" + str(date[2])\n",`
			`" else:\n",`
			`" return \"20\"+str(date[3])+\"-\"+str(date[1])+\"-\" + str(date[2])\n",`
			`" else:\n",`
			`" if int(date[3][0]) < 5:\n",`
			`" if month != None:\n",`
			`" return \"19\"+str(date[3])+\"-\"+str(month)+\"-\" + str(date[2])\n",`
			`" else:\n",`
			`" return \"19\"+str(date[3])+\"-\"+str(date[1])+\"-\" + str(date[2])\n",`
			`" elif len(date[3]) == 4:\n",`
			`" if month != None:\n",`
			`" return str(date[3])+\"-\"+str(month)+\"-\" + str(date[2])\n",`
			`" else:\n",`
			`" return str(date[3])+\"-\"+str(date[1])+\"-\" + str(date[2])\n",`
			`" elif len(date) == 5:\n",`
			`" return str(date[4])+\"-\"+str(months[date[3]])+\"-\"+re.findall(r'.(\\d+).', date[1])[0]\n",`
			`" return \"\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 9,`
			`"id": "b1cd2152",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_effective_date(text):\n",`
			`"\n",`
			`" # Date format \"04/18/01\"\n",`
			`" first_format = re.findall(r'((1[0-2]\|0[1-9])/(0[1-9]\|[1\|2][0-9]\|3[0-1])/(0[1-9]\|[1-9][0-9]))', text)\n",`
			`"\n",`
			`" # Date format \"01/21/2016\"\n",`
			`" sec_format = re.findall(r'((0[1-9]\|1[1-2])/(0[1-9]\|1[1-9]\|2[1-9]\|3[0-1])/(19[0-9][0-9]\|20[0-9][0-9]))', text)\n",`
			`"\n",`
			`" # Date format \"January, 13 2021\", \"February 28, 2011\"\n",`
			`" third_format = re.findall(r'(([j\|J]anuary\|[f\|F]ebruary\|[m\|M]arch\|[A\|a]pril\|[M\|m]ay\|[J\|j]une\|[J\|j]uly\|[A\|a]ugust\|[S\|s]eptember\|[O\|o]ctober\|[n\|N]ovember\|[d\|D]ecember)[,\\s\|\\s]+(0[1-9]\|[1-2][0-9]\|3[0-1])[,\\s\|\\s\|,]+(19[0-9][0-9]\|20[0-9][0-9]))', text)\n",`
			`"\n",`
			`" # Date format \"6th day of January, 2012\"\n",`
			`" fourth_format = re.findall(r'(([1-9]+(th\\sday\\sof\\s\|rd\\sday\\sof\\s\|nd\\sday\\sof\\s))([j\|J]anuary\|[f\|F]ebruary\|[m\|M]arch\|[A\|a]pril\|[M\|m]ay\|[J\|j]une\|[J\|j]uly\|[A\|a]ugust\|[S\|s]eptember\|[O\|o]ctober\|[n\|N]ovember\|[d\|D]ecember),\\s(19[0-9][0-9]\|20[0-9][0-9]))', text)\n",`
			`"\n",`
			`" dates = []\n",`
			`"\n",`
			`" for format in [first_format, sec_format, third_format, fourth_format]:\n",`
			`" if len(format) > 0:\n",`
			`" dates = format\n",`
			`" return dates[0]\n",`
			`" return None"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 10,`
			`"id": "0d7f45bb",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_terms(text):\n",`
			`" years = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=years)', text)\n",`
			`" months = re.findall(r'(?<=\\s)[0-9.\\s.,\\(\\)]+(?=months)', text)\n",`
			`" if len(years) > 0:\n",`
			`" return years\n",`
			`" if len(months) > 0:\n",`
			`" return months"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 11,`
			`"id": "065526eb",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_parties(text):\n",`
			`" first_party = re.findall(r'(?<=between)[\\p{Latin}\\s.,]+(?=Inc.\|INC.\|LLC\|llc\|,\|.)', text)\n",`
			`" if len(first_party) > 0:\n",`
			`" if \"Inc.\" in first_party[0]:\n",`
			`" prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",`
			`" return prepare[:prepare.index(\"Inc.\") + len(\"Inc.\")]\n",`
			`" if \"inc.\" in first_party[0]:\n",`
			`" prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",`
			`" return prepare[:prepare.index(\"inc.\") + len(\"inc.\")]\n",`
			`" if \"LLC\" in first_party[0]:\n",`
			`" prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",`
			`" return prepare[:prepare.index(\"LLC\") + len(\"LLC\")]\n",`
			`" if \"llc\" in first_party[0]:\n",`
			`" prepare = first_party[0].replace(\",\", \"\").strip().replace(\" \", \"_\")\n",`
			`" return prepare[:prepare.index(\"llc\") + len(\"llc\")]\n",`
			`" return \"\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 12,`
			`"id": "19c2f9e9",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_jurisdiction(text):\n",`
			`" for state in us.states.STATES:\n",`
			`" if re.search(rf\"(?<=laws\\sof\\sthe)[\\w\\s]{str(state)}\\s(?=,\|.)\", text):\n",`
			`" return str(state).replace(\" \", \"_\")\n",`
			`" for state in us.states.STATES:\n",`
			`" if re.search(rf\"(.{str(state).lower()}.)\", text.lower()):\n",`
			`" return str(state).replace(\" \", \"_\") \n",`
			`" return \"\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 13,`
			`"id": "99778c65",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def process_parameters(params, text):\n",`
			`" params_result = \"\"\n",`
			`" for param in params.split(\" \"):\n",`
			`" if param == \"effective_date\":\n",`
			`" params_result += \" \" + \"effective_date=\" + str(transform_date_format(get_effective_date(text)))\n",`
			`" elif param == \"jurisdiction\":\n",`
			`" params_result += \" \" + \"jurisdiction=\" + str(get_jurisdiction(text))\n",`
			`" elif param == \"party\":\n",`
			`" params_result += \" \" + \"party=\" + str(get_parties(text))\n",`
			`" return params_result"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 14,`
			`"id": "c39ea65a",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open('train/out.tsv', 'w') as writer:\n",`
			`" for idx, row in data_train.iterrows():\n",`
			`" params_result = process_parameters(row['params'], row['text1'])\n",`
			`" writer.write(params_result+\"\\n\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 15,`
			`"id": "741a34d6",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open('dev-0/out.tsv', 'w') as writer:\n",`
			`" for idx, row in data_dev.iterrows():\n",`
			`" params_result = process_parameters(row['params'], row['text1'])\n",`
			`" writer.write(params_result+\"\\n\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 16,`
			`"id": "8e8973f2",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open('test-A/out.tsv', 'w') as writer:\n",`
			`" for idx, row in data_test.iterrows():\n",`
			`" params_result = process_parameters(row['params'], row['text1'])\n",`
			`" writer.write(params_result+\"\\n\")"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.8.8"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`