217 lines
6.6 KiB
Plaintext
217 lines
6.6 KiB
Plaintext
![]() |
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 54,
|
||
|
"id": "21af5e82",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import csv\n",
|
||
|
"import re\n",
|
||
|
"from collections import Counter\n",
|
||
|
"from datetime import datetime\n",
|
||
|
"import datefinder\n",
|
||
|
"import word2number"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"id": "fd7884b0",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def most_frequent(List, howmany=1):
    """Return the ``howmany`` most common items of ``List`` with their counts.

    Args:
        List: Iterable of hashable items to tally.
        howmany: Maximum number of distinct items to return (default 1).

    Returns:
        A list of ``(item, count)`` tuples ordered from most to least
        frequent; an empty list when ``List`` is empty.
    """
    # The previous version always requested a single item and silently
    # ignored ``howmany``; honour the caller's request instead.
    return Counter(List).most_common(howmany)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 49,
|
||
|
"id": "de539456",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def get_jurisdiction(text):
    """Guess the jurisdiction as the US state named most often in ``text``.

    Matching is case-insensitive and limited to whole words. Every hit is
    normalised (whitespace collapsed, title-cased) before counting so that
    "NEW YORK", "new  york" and "New York" all vote for the same state.

    Args:
        text: Document text to search.

    Returns:
        The winning state name in title case with spaces replaced by
        underscores (e.g. ``"New_York"``), or ``None`` when no state name
        occurs in ``text``.
    """
    state_names = (
        "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
        "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
        "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
        "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
        "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
        "New Hampshire", "New Jersey", "New Mexico", "New York",
        "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
        "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
        "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
        "West Virginia", "Wisconsin", "Wyoming",
    )
    # \s+ lets two-word names survive repeated or non-space whitespace;
    # \b stops hits inside longer words such as "Indianapolis".
    us_states = (
        r"\b("
        + "|".join(name.replace(" ", r"\s+") for name in state_names)
        + r")\b"
    )

    matches = re.findall(us_states, text, re.IGNORECASE)
    # Canonicalise before counting so case/spacing variants do not split votes.
    normalised = [" ".join(match.split()).title() for match in matches]
    result = most_frequent(normalised)

    if not result:
        return None
    return result[0][0].replace(" ", "_")
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 47,
|
||
|
"id": "51106ddf",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def get_parties(text):
    """Extract up to two party names from ``text``.

    Company-like names (one or more capitalised words followed by a
    corporate suffix such as "Inc." or "Corporation") are preferred. If
    fewer than two distinct companies are found, the remaining slots are
    filled with the most frequent person-like names ("First M. Last").

    Args:
        text: Document text to search.

    Returns:
        A list of at most two names with spaces replaced by underscores,
        or ``None`` when nothing matched.
    """
    # [A-Za-z] replaces the former [A-za-z], whose A-z range also admitted
    # the punctuation between "Z" and "a". The trailing (?!\w) prevents a
    # suffix from matching inside a longer word ("Companies" -> "Com").
    company_regex = (
        r"((?:[A-Z][A-Za-z]+,?\s)+"
        r"(?:Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION"
        r"|Bank|Com|Council|Technology|Systems))(?!\w)"
    )
    person_regex = r"([A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+)"

    company_matches = re.findall(company_regex, text)
    person_matches = re.findall(person_regex, text)

    # Normalise casing word by word and drop trailing commas so that
    # "ACME, INC." and "Acme Inc." are counted as the same company.
    companies = [
        " ".join(word.capitalize().rstrip(",") for word in match.split())
        for match in company_matches
    ]

    result = most_frequent(companies, 2)

    # Top up with person names when fewer than two companies were found.
    if len(result) < 2:
        result.extend(most_frequent(person_matches, 2 - len(result)))

    if not result:
        return None
    return [name.replace(" ", "_") for name, _count in result]
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"id": "e15bf4c1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def get_date(text, min_year=1950, max_year=2022):
    """Return the date mentioned most often in ``text`` as ``YYYY-MM-DD``.

    Candidate dates come from ``datefinder.find_dates``. Only dates whose
    year lies strictly between ``min_year`` and ``max_year`` are counted.

    Args:
        text: Document text to search.
        min_year: Exclusive lower bound on the year (default 1950).
        max_year: Exclusive upper bound on the year (default 2022).

    Returns:
        The most frequent date formatted as ``"%Y-%m-%d"``, or ``None`` when
        no candidate falls inside the accepted year range.
    """
    found = []
    candidates = datefinder.find_dates(text)

    # Pull candidates one at a time so that a failure while parsing one
    # fragment does not discard the dates already collected.
    while True:
        try:
            candidate = next(candidates)
        except StopIteration:
            break
        except Exception:
            # Best-effort: skip the fragment that failed. Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit through.
            continue
        found.append(candidate)

    dates = [
        candidate.strftime("%Y-%m-%d")
        for candidate in found
        if min_year < candidate.year < max_year
    ]

    result = most_frequent(dates)

    if not result:
        return None
    return result[0][0]
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 51,
|
||
|
"id": "f0279749",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def get_term(text):
    """Extract the first duration expressed in months or years from ``text``.

    The token immediately before "month(s)"/"year(s)" is read as the amount:
    leading digits are used as-is (so "12" and "2)" both work), otherwise
    the token is treated as a spelled-out number ("two", "twelve").

    Args:
        text: Document text to search.

    Returns:
        A string ``"<number>_<unit>"`` such as ``"12_months"`` or
        ``"2_years"``, or ``None`` when no duration is found or the amount
        cannot be turned into a number.
    """
    term_regex = r"\b([\w()]*)\s(months?|years?)\b"

    first = re.search(term_regex, text)
    if first is None:
        return None
    number, unit = first.groups()

    if m := re.match(r"\d+", number):
        return m.group() + "_" + unit

    # No leading digits: the token must start with a plain word to be a
    # candidate for a spelled-out number (rejects "", "(" and similar).
    word = re.match(r"\b\w+\b", number)
    if word is None:
        return None

    # Imported here because the notebook's import cell only binds the
    # package name ``word2number``; the ``w2n`` name used below was never
    # defined, so spelled-out numbers previously always yielded None.
    from word2number import w2n

    try:
        value = w2n.word_to_num(word.group())
    except Exception:
        # Best-effort: a word that is not a number means no usable term.
        return None

    return str(value) + "_" + unit
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 52,
|
||
|
"id": "27dad743",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def run(text, needed_info):
    """Extract the requested fields from ``text`` as one output line.

    Args:
        text: Document text to analyse.
        needed_info: Collection of requested field names; the names acted
            on are "effective_date", "jurisdiction", "term" and "party".

    Returns:
        Space-separated ``key=value`` tokens in the order effective_date,
        jurisdiction, term, followed by one ``party=...`` token per party.
        Fields that were not requested or could not be extracted are left
        out; the result is ``""`` when nothing was produced.
    """
    single_valued = (
        ("effective_date", get_date),
        ("jurisdiction", get_jurisdiction),
        ("term", get_term),
    )

    tokens = []
    for key, extract in single_valued:
        if key in needed_info:
            value = extract(text)
            if value:
                tokens.append(f"{key}={value}")

    if "party" in needed_info:
        # get_parties returns None when nothing matched.
        for party in get_parties(text) or []:
            tokens.append(f"party={party}")

    # Joining one token list avoids the stray leading space that appeared
    # when only parties were found.
    return " ".join(tokens)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 53,
|
||
|
"id": "7b0f1f52",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# (input, output) path pairs: each input TSV is read row by row and one line
# of extracted fields is written to the paired output file per input row.
filenames = [
    ('dev-0/in.tsv', "dev-0/out.tsv"),
    ('train/in.tsv', "train/out.tsv"),
    ('test-A/in.tsv', 'test-A/out.tsv'),
]

for in_path, out_path in filenames:
    # Write with the same explicit encoding used for reading instead of the
    # platform default, so the output does not depend on the local locale.
    with open(in_path, 'r', encoding="utf-8") as in_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        reader = csv.reader(in_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for item in reader:
            # Column 1: space-separated names of the fields to extract.
            needed_info = item[1].strip().split()
            # Column 2: document text; literal backslash escapes
            # (\n, \f, \t written as two characters) become spaces.
            text = item[2].replace("\\n", " ").replace("\\f", " ").replace("\\t", " ").strip()
            # Colons in extracted values are replaced with underscores.
            extracted = run(text, needed_info).replace(":", "_")
            out_file.write(extracted + "\n")
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.9.7"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|