kleister-nda/run.ipynb

217 lines
6.6 KiB
Plaintext
Raw Permalink Normal View History

2022-05-03 19:41:06 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 54,
"id": "21af5e82",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import re\n",
"from collections import Counter\n",
"from datetime import datetime\n",
"\n",
"import datefinder\n",
"import word2number\n",
"from word2number import w2n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fd7884b0",
"metadata": {},
"outputs": [],
"source": [
"def most_frequent(List, howmany=1):\n",
"    \"\"\"Return the `howmany` most common items of `List` with their counts.\n",
"\n",
"    Args:\n",
"        List: iterable of hashable items (may be empty).\n",
"        howmany: maximum number of (item, count) pairs to return.\n",
"\n",
"    Returns:\n",
"        A list of (item, count) tuples ordered from most to least common;\n",
"        an empty list when `List` is empty.\n",
"    \"\"\"\n",
"    # `howmany` must reach most_common(): callers ask for the top two parties.\n",
"    return Counter(List).most_common(howmany)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "de539456",
"metadata": {},
"outputs": [],
"source": [
"def get_jurisdiction(text):\n",
"    \"\"\"Guess the governing US state as the state named most often in `text`.\n",
"\n",
"    State names are matched case-insensitively and only as whole words, so\n",
"    'remained' is not counted as 'Maine'. Each hit is normalised to title\n",
"    case with single spaces before counting, so 'NEW YORK', 'new york' and a\n",
"    name broken across a line are tallied as the same state.\n",
"\n",
"    Returns:\n",
"        The most frequent state with spaces replaced by underscores (for\n",
"        example 'New_York'), or None when no state name occurs in `text`.\n",
"    \"\"\"\n",
"    us_states = r\"\\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\\sHampshire|New\\sJersey|New\\sMexico|New\\sYork|North\\sCarolina|North\\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\\sIsland|South\\sCarolina|South\\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\\sVirginia|Wisconsin|Wyoming)\\b\"\n",
"\n",
"    matches = re.findall(us_states, text, re.IGNORECASE)\n",
"    # Collapse case and whitespace variants into one canonical spelling so\n",
"    # they are counted together and the result never contains a newline/tab.\n",
"    normalized = [\" \".join(m.split()).title() for m in matches]\n",
"    result = most_frequent(normalized)\n",
"\n",
"    if result:\n",
"        return result[0][0].replace(\" \", \"_\")\n",
"    return None"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "51106ddf",
"metadata": {},
"outputs": [],
"source": [
"def get_parties(text):\n",
"    \"\"\"Extract up to two party names from `text`.\n",
"\n",
"    Company-like names (capitalised words followed by a corporate suffix such\n",
"    as 'Inc.' or 'LLC') are preferred. When fewer than two of them come back\n",
"    from the frequency count, the shortfall is filled with the most frequent\n",
"    personal names of the form 'First M. Last'.\n",
"\n",
"    Returns:\n",
"        A list of names with spaces replaced by underscores, or None when\n",
"        nothing matched.\n",
"    \"\"\"\n",
"    # [A-Za-z], not [A-za-z]: the latter range also admits punctuation such\n",
"    # as '_' and '^' that sits between 'Z' and 'a' in ASCII.\n",
"    company_regex = r\"(([A-Z][A-Za-z]+,?\\s)+(Inc\\.|LLC|Ltd\\.|Company|Corporation|INC\\.|LTD\\.|COMPANY|CORPORATION|Bank|Com|Council|Technology|Systems))\"\n",
"    person_regex = r\"([A-Z][a-z]+\\s[A-Z]\\.\\s[A-Z][a-z]+)\"\n",
"\n",
"    # findall yields one tuple per hit; element 0 is the whole company name.\n",
"    company_hits = [groups[0] for groups in re.findall(company_regex, text)]\n",
"    person_hits = re.findall(person_regex, text)\n",
"\n",
"    # Normalise casing and drop trailing commas so spelling variants of the\n",
"    # same company are counted together. Note that capitalize() lower-cases\n",
"    # the rest of each word, so 'LLC' is reported as 'Llc'.\n",
"    companies = [\n",
"        \" \".join(word.capitalize().rstrip(\",\") for word in hit.split())\n",
"        for hit in company_hits\n",
"    ]\n",
"\n",
"    result = most_frequent(companies, 2)\n",
"    if len(result) < 2:\n",
"        result.extend(most_frequent(person_hits, 2 - len(result)))\n",
"\n",
"    if result:\n",
"        return [name.replace(\" \", \"_\") for name, _count in result]\n",
"    return None\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e15bf4c1",
"metadata": {},
"outputs": [],
"source": [
"def get_date(text, min_year=1950, max_year=2022):\n",
"    \"\"\"Return the date mentioned most often in `text`, formatted as YYYY-MM-DD.\n",
"\n",
"    Args:\n",
"        text: document text handed to datefinder.\n",
"        min_year: exclusive lower bound on the year of accepted dates.\n",
"        max_year: exclusive upper bound on the year of accepted dates.\n",
"\n",
"    Returns:\n",
"        The most frequent accepted date as a 'YYYY-MM-DD' string, or None\n",
"        when no date inside the accepted year range was found.\n",
"    \"\"\"\n",
"    found = []\n",
"    candidates = datefinder.find_dates(text)\n",
"\n",
"    # Drain the iterator by hand so an error raised while producing one\n",
"    # candidate does not discard the dates collected so far.\n",
"    while True:\n",
"        try:\n",
"            candidate = next(candidates)\n",
"        except StopIteration:\n",
"            break\n",
"        except Exception:\n",
"            # Best effort: skip the failure. Only ordinary errors are caught,\n",
"            # so KeyboardInterrupt / SystemExit still propagate.\n",
"            continue\n",
"        found.append(candidate)\n",
"\n",
"    dates = [d.strftime(\"%Y-%m-%d\") for d in found if min_year < d.year < max_year]\n",
"\n",
"    result = most_frequent(dates)\n",
"    if not result:\n",
"        return None\n",
"    return result[0][0]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "f0279749",
"metadata": {},
"outputs": [],
"source": [
"def get_term(text):\n",
"    \"\"\"Extract the agreement term from the first '<number> months/years' phrase.\n",
"\n",
"    Only the first such phrase in `text` is considered. The number may be\n",
"    given in digits ('2 years', 'twelve (12) months') or as a single number\n",
"    word ('two years'), which is converted with word2number.\n",
"\n",
"    Returns:\n",
"        A string such as '2_years' or '12_months', or None when there is no\n",
"        such phrase or its number cannot be interpreted.\n",
"    \"\"\"\n",
"    term_regex = r\"\\b([\\w()]*)\\s(months?|years?)\\b\"\n",
"\n",
"    first = re.search(term_regex, text)\n",
"    if first is None:\n",
"        return None\n",
"    number, unit = first.groups()\n",
"\n",
"    if m := re.match(r\"\\d+\", number):\n",
"        number = m.group()\n",
"    else:\n",
"        word = re.match(r\"\\b\\w+\\b\", number)\n",
"        if word is None:\n",
"            # Nothing word-like in front of the unit (e.g. an empty capture).\n",
"            return None\n",
"        try:\n",
"            number = w2n.word_to_num(word.group())\n",
"        except Exception:\n",
"            # The word is not a number ('the years'); treat as no term found.\n",
"            return None\n",
"\n",
"    return str(number) + \"_\" + unit"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "27dad743",
"metadata": {},
"outputs": [],
"source": [
"def run(text, needed_info):\n",
"    \"\"\"Extract the requested fields from `text` and format them as one line.\n",
"\n",
"    Args:\n",
"        text: the document text.\n",
"        needed_info: collection of requested field names; the recognised\n",
"            values are 'jurisdiction', 'effective_date', 'term' and 'party'.\n",
"\n",
"    Returns:\n",
"        Space-separated 'key=value' pairs in the order effective_date,\n",
"        jurisdiction, term, followed by one 'party=...' pair per party.\n",
"        Fields that were not requested or not found are omitted; the result\n",
"        is an empty string when nothing was extracted.\n",
"    \"\"\"\n",
"    jurisdiction, date, term, parties = None, None, None, None\n",
"\n",
"    if \"jurisdiction\" in needed_info:\n",
"        jurisdiction = get_jurisdiction(text)\n",
"    if \"effective_date\" in needed_info:\n",
"        date = get_date(text)\n",
"    if \"term\" in needed_info:\n",
"        term = get_term(text)\n",
"    if \"party\" in needed_info:\n",
"        parties = get_parties(text)\n",
"\n",
"    pairs = []\n",
"    if date:\n",
"        pairs.append(f\"effective_date={date}\")\n",
"    if jurisdiction:\n",
"        pairs.append(f\"jurisdiction={jurisdiction}\")\n",
"    if term:\n",
"        pairs.append(f\"term={term}\")\n",
"\n",
"    # Parties go last, one pair each. Joining a single list keeps the line\n",
"    # free of a leading space when parties are the only thing extracted.\n",
"    pairs.extend(f\"party={p}\" for p in parties or [])\n",
"\n",
"    return \" \".join(pairs)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "7b0f1f52",
"metadata": {},
"outputs": [],
"source": [
"# (input TSV, output TSV) pairs; one output line is written per input row.\n",
"filenames = [('dev-0/in.tsv', \"dev-0/out.tsv\"), ('train/in.tsv', \"train/out.tsv\"), ('test-A/in.tsv', 'test-A/out.tsv')]\n",
"\n",
"# Whitespace to blank out before extraction. A field read line-by-line from a\n",
"# tab-delimited file cannot contain a real newline or tab, so the escaped\n",
"# two-character forms are handled as well as the raw control characters.\n",
"# NOTE(review): assumes the dataset escapes these as backslash sequences - confirm.\n",
"whitespace_tokens = (\"\\\\n\", \"\\\\f\", \"\\\\t\", \"\\n\", \"\\f\", \"\\t\")\n",
"\n",
"for in_path, out_path in filenames:\n",
"    # Write with the same explicit encoding used for reading, so the output\n",
"    # does not depend on the platform's default encoding.\n",
"    with open(in_path, 'r', encoding=\"utf-8\") as in_file, \\\n",
"            open(out_path, \"w\", encoding=\"utf-8\") as out_file:\n",
"        reader = csv.reader(in_file, delimiter='\\t', quoting=csv.QUOTE_NONE)\n",
"        for item in reader:\n",
"            # Column 1 lists the requested keys, column 2 holds the text.\n",
"            needed_info = item[1].strip().split()\n",
"            text = item[2]\n",
"            for token in whitespace_tokens:\n",
"                text = text.replace(token, \" \")\n",
"            text = text.strip()\n",
"            extracted = run(text, needed_info).replace(\":\", \"_\")\n",
"            out_file.write(extracted + \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}