{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0f4c2a10",
   "metadata": {},
   "source": [
    "# Rule-based information extraction\n",
    "\n",
    "For every row of the `in.tsv` files listed in the last cell, this notebook extracts the requested fields (`jurisdiction`, `effective_date`, `term`, `party`) from the document text using regular expressions and `datefinder`, and writes one line of `key=value` pairs per row to the matching `out.tsv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21af5e82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import re\n",
    "from collections import Counter\n",
    "from datetime import datetime\n",
    "\n",
    "import datefinder\n",
    "import word2number\n",
    "from word2number import w2n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b1d9e3c",
   "metadata": {},
   "source": [
    "## Extractors\n",
    "\n",
    "Each extractor returns `None` when it finds nothing, so `run` can simply leave that field out."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd7884b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_frequent(List, howmany=1):\n",
    "    \"\"\"Return the `howmany` most common items of `List` as (item, count) pairs.\n",
    "\n",
    "    An empty input gives an empty list. Items with equal counts keep the order\n",
    "    in which they were first seen.\n",
    "    \"\"\"\n",
    "    # `howmany` used to be ignored: the call was hard-coded to most_common(1).\n",
    "    return Counter(List).most_common(howmany)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de539456",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word boundaries stop state names from matching inside longer words\n",
    "# (e.g. \"maine\" inside \"remained\").\n",
    "US_STATE_REGEX = re.compile(\n",
    "    r\"\\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware\"\n",
    "    r\"|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana\"\n",
    "    r\"|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana\"\n",
    "    r\"|Nebraska|Nevada|New\\sHampshire|New\\sJersey|New\\sMexico|New\\sYork\"\n",
    "    r\"|North\\sCarolina|North\\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\\sIsland\"\n",
    "    r\"|South\\sCarolina|South\\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington\"\n",
    "    r\"|West\\sVirginia|Wisconsin|Wyoming)\\b\",\n",
    "    re.IGNORECASE,\n",
    ")\n",
    "\n",
    "\n",
    "def get_jurisdiction(text):\n",
    "    \"\"\"Return the US state named most often in `text`, or None if none is named.\n",
    "\n",
    "    Spaces in the state name are replaced by underscores (e.g. \"New_York\").\n",
    "    \"\"\"\n",
    "    # Matching ignores case, so normalise case and whitespace before counting;\n",
    "    # otherwise \"TEXAS\" and \"Texas\" would be tallied as two different states.\n",
    "    states = [\" \".join(m.split()).title() for m in US_STATE_REGEX.findall(text)]\n",
    "    result = most_frequent(states)\n",
    "\n",
    "    if not result:\n",
    "        return None\n",
    "    return result[0][0].replace(\" \", \"_\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51106ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_PARTIES = 2\n",
    "\n",
    "# One or more capitalised words followed by a company-style suffix.\n",
    "# The word class is [A-Za-z]; the earlier [A-za-z] also admitted [ \\ ] ^ _ and `.\n",
    "COMPANY_REGEX = re.compile(\n",
    "    r\"(([A-Z][A-Za-z]+,?\\s)+(Inc\\.|LLC|Ltd\\.|Company|Corporation|INC\\.|LTD\\.|COMPANY|CORPORATION|Bank|Com|Council|Technology|Systems))\"\n",
    ")\n",
    "# \"Firstname M. Lastname\"\n",
    "PERSON_REGEX = re.compile(r\"([A-Z][a-z]+\\s[A-Z]\\.\\s[A-Z][a-z]+)\")\n",
    "\n",
    "\n",
    "def get_parties(text):\n",
    "    \"\"\"Return up to two party names found in `text`, or None if none are found.\n",
    "\n",
    "    Company-like names are preferred, most frequent first. If fewer than two\n",
    "    are found, the remaining slots are filled with the most frequent\n",
    "    \"Firstname M. Lastname\" matches. Spaces in names become underscores.\n",
    "    \"\"\"\n",
    "    companies = []\n",
    "    for groups in COMPANY_REGEX.findall(text):\n",
    "        # findall yields one tuple per match; element 0 is the whole name.\n",
    "        # Capitalising each word makes \"ACME INC.\" and \"Acme Inc.\" count as one.\n",
    "        words = [w.capitalize().rstrip(\",\") for w in groups[0].split()]\n",
    "        companies.append(\" \".join(words))\n",
    "\n",
    "    result = most_frequent(companies, MAX_PARTIES)\n",
    "\n",
    "    if len(result) < MAX_PARTIES:\n",
    "        people = PERSON_REGEX.findall(text)\n",
    "        result.extend(most_frequent(people, MAX_PARTIES - len(result)))\n",
    "\n",
    "    if not result:\n",
    "        return None\n",
    "    return [name.replace(\" \", \"_\") for name, _count in result]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e15bf4c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_date(text, min_year=1950, max_year=2022):\n",
    "    \"\"\"Return the date mentioned most often in `text` as \"YYYY-MM-DD\", or None.\n",
    "\n",
    "    Only dates whose year lies strictly between `min_year` and `max_year`\n",
    "    are counted.\n",
    "    \"\"\"\n",
    "    dates = []\n",
    "    found = datefinder.find_dates(text)\n",
    "\n",
    "    while True:\n",
    "        try:\n",
    "            candidate = next(found)\n",
    "        except StopIteration:\n",
    "            break\n",
    "        except Exception:\n",
    "            # Best effort: a failure inside datefinder must not abort the whole\n",
    "            # document. A generator that has raised is finished, so the next\n",
    "            # call raises StopIteration and the loop ends with what was collected.\n",
    "            continue\n",
    "        if min_year < candidate.year < max_year:\n",
    "            dates.append(candidate.strftime(\"%Y-%m-%d\"))\n",
    "\n",
    "    result = most_frequent(dates)\n",
    "    return result[0][0] if result else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0279749",
   "metadata": {},
   "outputs": [],
   "source": [
    "TERM_REGEX = re.compile(r\"\\b([\\w()]*)\\s(months?|years?)\\b\")\n",
    "\n",
    "\n",
    "def get_term(text):\n",
    "    \"\"\"Return the first \"<number> month(s)/year(s)\" phrase as e.g. \"2_years\".\n",
    "\n",
    "    The number may be written in digits or as a single number word (\"two\").\n",
    "    Returns None when no such phrase exists or its number cannot be parsed.\n",
    "    \"\"\"\n",
    "    match = TERM_REGEX.search(text)\n",
    "    if match is None:\n",
    "        return None\n",
    "    number, unit = match.groups()\n",
    "\n",
    "    if digits := re.match(r\"\\d+\", number):\n",
    "        number = digits.group()\n",
    "    else:\n",
    "        word = re.match(r\"\\b\\w+\\b\", number)\n",
    "        if word is None:\n",
    "            return None\n",
    "        try:\n",
    "            number = w2n.word_to_num(word.group())\n",
    "        except Exception:\n",
    "            # Not a number word (e.g. \"the years\"): treat as no usable term.\n",
    "            return None\n",
    "\n",
    "    return f\"{number}_{unit}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27dad743",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run(text, needed_info):\n",
    "    \"\"\"Extract the fields named in `needed_info` from `text`.\n",
    "\n",
    "    Returns a space-separated string of `key=value` tokens in the order\n",
    "    effective_date, jurisdiction, term, then one `party=` token per party.\n",
    "    Fields that were not requested or not found are omitted; the result is\n",
    "    an empty string when nothing was extracted.\n",
    "    \"\"\"\n",
    "    extractors = (\n",
    "        (\"effective_date\", get_date),\n",
    "        (\"jurisdiction\", get_jurisdiction),\n",
    "        (\"term\", get_term),\n",
    "    )\n",
    "\n",
    "    tokens = []\n",
    "    for key, extract in extractors:\n",
    "        if key in needed_info:\n",
    "            value = extract(text)\n",
    "            if value:\n",
    "                tokens.append(f\"{key}={value}\")\n",
    "\n",
    "    if \"party\" in needed_info:\n",
    "        tokens.extend(f\"party={p}\" for p in get_parties(text) or [])\n",
    "\n",
    "    # Joining once avoids the leading space that appeared when only parties\n",
    "    # were found.\n",
    "    return \" \".join(tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c47a58d2",
   "metadata": {},
   "source": [
    "## Run over the datasets\n",
    "\n",
    "Column 1 of each input row holds the space-separated field names to extract and column 2 holds the document text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b0f1f52",
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames = [\n",
    "    (\"dev-0/in.tsv\", \"dev-0/out.tsv\"),\n",
    "    (\"train/in.tsv\", \"train/out.tsv\"),\n",
    "    (\"test-A/in.tsv\", \"test-A/out.tsv\"),\n",
    "]\n",
    "\n",
    "for in_path, out_path in filenames:\n",
    "    # Write the output as UTF-8 too, so names with non-ASCII characters do not\n",
    "    # depend on the platform's default encoding.\n",
    "    with open(in_path, \"r\", encoding=\"utf-8\") as in_file, \\\n",
    "         open(out_path, \"w\", encoding=\"utf-8\") as out_file:\n",
    "        reader = csv.reader(in_file, delimiter=\"\\t\", quoting=csv.QUOTE_NONE)\n",
    "        for item in reader:\n",
    "            needed_info = item[1].strip().split()\n",
    "            # NOTE(review): these replace real newline / form-feed / tab characters.\n",
    "            # If in.tsv stores them as two-character escapes (backslash + letter),\n",
    "            # they are left untouched - confirm against the data.\n",
    "            text = item[2].replace(\"\\n\", \" \").replace(\"\\f\", \" \").replace(\"\\t\", \" \").strip()\n",
    "            extracted = run(text, needed_info).replace(\":\", \"_\")\n",
    "            out_file.write(extracted + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}