kleister-nda/run.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "21af5e82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import re\n",
    "from collections import Counter\n",
    "from datetime import datetime\n",
    "import datefinder\n",
    "import word2number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fd7884b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_frequent(List, howmany=1):\n",
    "    \"\"\"Return the `howmany` most common items of `List`.\n",
    "\n",
    "    Args:\n",
    "        List: iterable of hashable items to count.\n",
    "        howmany: maximum number of (item, count) pairs to return.\n",
    "\n",
    "    Returns:\n",
    "        A list of (item, count) tuples, most common first; empty when\n",
    "        `List` is empty.\n",
    "    \"\"\"\n",
    "    # Honour `howmany`; a hard-coded 1 here would make callers that ask\n",
    "    # for two results silently receive only one.\n",
    "    return Counter(List).most_common(howmany)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "de539456",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jurisdiction(text):\n",
    "    \"\"\"Return the US state mentioned most often in `text`, or None.\n",
    "\n",
    "    State names are matched case-insensitively and only as whole words, so\n",
    "    words such as 'remained' (contains 'maine') or 'Indianapolis' are not\n",
    "    counted. Matches are folded to title case before counting, so 'DELAWARE'\n",
    "    and 'Delaware' are tallied as the same state.\n",
    "\n",
    "    Returns:\n",
    "        The winning state name with spaces replaced by underscores\n",
    "        (e.g. 'New_York'), or None when no state is mentioned.\n",
    "    \"\"\"\n",
    "    us_states = r\"\\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\\sHampshire|New\\sJersey|New\\sMexico|New\\sYork|North\\sCarolina|North\\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\\sIsland|South\\sCarolina|South\\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\\sVirginia|Wisconsin|Wyoming)\\b\"\n",
    "\n",
    "    matches = re.findall(us_states, text, re.IGNORECASE)\n",
    "    # `\\s` may match any whitespace character and IGNORECASE admits any\n",
    "    # casing: collapse both so every spelling lands in one counter bucket.\n",
    "    normalized = [\" \".join(m.split()).title() for m in matches]\n",
    "    result = most_frequent(normalized)\n",
    "\n",
    "    if result:\n",
    "        return result[0][0].replace(\" \", \"_\")\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "51106ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parties(text):\n",
    "    \"\"\"Return up to two party names found in `text`, or None if none are found.\n",
    "\n",
    "    Company-like names (a run of capitalised words ending in a corporate\n",
    "    suffix such as 'Inc.' or 'LLC') are preferred. Person names of the form\n",
    "    'First M. Last' are used as a fallback when fewer than two results are\n",
    "    available.\n",
    "\n",
    "    Returns:\n",
    "        A list of names with spaces replaced by underscores, or None.\n",
    "    \"\"\"\n",
    "    # `[A-Za-z]` (not `[A-za-z]`, which also admits `[`, `\\`, `]`, `^`, `_`\n",
    "    # and a backtick). The trailing lookahead stops a suffix from matching\n",
    "    # as the prefix of a longer word ('Com' inside 'Communications').\n",
    "    company_regex = r\"(?:[A-Z][A-Za-z]+,?\\s)+(?:Inc\\.|LLC|Ltd\\.|Company|Corporation|INC\\.|LTD\\.|COMPANY|CORPORATION|Bank|Com|Council|Technology|Systems)(?![A-Za-z])\"\n",
    "    person_regex = r\"[A-Z][a-z]+\\s[A-Z]\\.\\s[A-Z][a-z]+\"\n",
    "\n",
    "    company_matches = re.findall(company_regex, text)\n",
    "    person_matches = re.findall(person_regex, text)\n",
    "\n",
    "    # Fold case and drop trailing commas so that e.g. 'ACME, INC.' and\n",
    "    # 'Acme Inc.' are counted as the same company.\n",
    "    companies = [\n",
    "        \" \".join(word.capitalize().rstrip(\",\") for word in match.split())\n",
    "        for match in company_matches\n",
    "    ]\n",
    "\n",
    "    result = most_frequent(companies, 2)\n",
    "\n",
    "    if len(result) < 2:\n",
    "        result.extend(most_frequent(person_matches, 2 - len(result)))\n",
    "\n",
    "    if result:\n",
    "        return [name.replace(\" \", \"_\") for name, _count in result]\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e15bf4c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_date(text):\n",
    "    \"\"\"Return the most frequent plausible date in `text` as `YYYY-MM-DD`, or None.\n",
    "\n",
    "    Candidate dates come from `datefinder.find_dates`; only those whose year\n",
    "    lies strictly between 1950 and 2022 are counted.\n",
    "    \"\"\"\n",
    "    matches = []\n",
    "    df_matches = datefinder.find_dates(text)\n",
    "\n",
    "    while True:\n",
    "        try:\n",
    "            m = next(df_matches)\n",
    "        except StopIteration:\n",
    "            break\n",
    "        except Exception:\n",
    "            # Best effort: skip a candidate that fails to parse. `Exception`\n",
    "            # rather than a bare `except` so that KeyboardInterrupt and\n",
    "            # SystemExit still propagate.\n",
    "            continue\n",
    "        matches.append(m)\n",
    "\n",
    "    dates = [d.strftime(\"%Y-%m-%d\") for d in matches if 1950 < d.year < 2022]\n",
    "    result = most_frequent(dates)\n",
    "\n",
    "    return result[0][0] if result else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "f0279749",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_term(text):\n",
    "    \"\"\"Return the first duration mentioned in `text` as `<number>_<unit>`, or None.\n",
    "\n",
    "    Only the first '<token> month(s)/year(s)' occurrence is considered. The\n",
    "    token may start with digits ('5', '5)') or be a number word ('two',\n",
    "    'two(2)'), which is converted with `word2number`.\n",
    "\n",
    "    Returns:\n",
    "        A string such as '5_years' or '2_months' (unit kept as written),\n",
    "        or None when no duration is found or the token is not a number.\n",
    "    \"\"\"\n",
    "    term_regex = r\"\\b([\\w()]*)\\s(months?|years?)\\b\"\n",
    "\n",
    "    first = re.search(term_regex, text)\n",
    "    if first is None:\n",
    "        return None\n",
    "    number, unit = first.groups()\n",
    "\n",
    "    if m := re.match(r\"\\d+\", number):\n",
    "        number = m.group()\n",
    "    else:\n",
    "        word = re.match(r\"\\b\\w+\\b\", number)\n",
    "        if word is None:\n",
    "            return None\n",
    "        # The notebook's import cell binds `word2number`, not `w2n`, so the\n",
    "        # converter is imported here. It stays outside the `try` so that a\n",
    "        # missing name or package is reported instead of being swallowed.\n",
    "        from word2number import w2n\n",
    "        try:\n",
    "            number = w2n.word_to_num(word.group())\n",
    "        except Exception:\n",
    "            # Best effort: the token before the unit is not a number word.\n",
    "            return None\n",
    "\n",
    "    return str(number) + \"_\" + unit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "27dad743",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run(text, needed_info):\n",
    "    \"\"\"Extract the requested fields from `text` and format them as one line.\n",
    "\n",
    "    Args:\n",
    "        text: document text to analyse.\n",
    "        needed_info: collection of field names to extract; the names\n",
    "            recognised are 'effective_date', 'jurisdiction', 'term', 'party'.\n",
    "\n",
    "    Returns:\n",
    "        Space-separated `key=value` pairs in the order effective_date,\n",
    "        jurisdiction, term, then one `party=...` entry per party. Fields\n",
    "        that were not requested or not found are omitted; the result is an\n",
    "        empty string when nothing was extracted.\n",
    "    \"\"\"\n",
    "    # Collect the pieces in a list and join once, so the line never starts\n",
    "    # with a stray space when only parties are found.\n",
    "    parts = []\n",
    "\n",
    "    if \"effective_date\" in needed_info:\n",
    "        date = get_date(text)\n",
    "        if date:\n",
    "            parts.append(f\"effective_date={date}\")\n",
    "    if \"jurisdiction\" in needed_info:\n",
    "        jurisdiction = get_jurisdiction(text)\n",
    "        if jurisdiction:\n",
    "            parts.append(f\"jurisdiction={jurisdiction}\")\n",
    "    if \"term\" in needed_info:\n",
    "        term = get_term(text)\n",
    "        if term:\n",
    "            parts.append(f\"term={term}\")\n",
    "    if \"party\" in needed_info:\n",
    "        parties = get_parties(text)\n",
    "        if parties:\n",
    "            parts.extend(f\"party={p}\" for p in parties)\n",
    "\n",
    "    return \" \".join(parts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "7b0f1f52",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (input, output) path pairs, one per dataset split.\n",
    "filenames=[('dev-0/in.tsv',\"dev-0/out.tsv\"), ('train/in.tsv', \"train/out.tsv\"), ('test-A/in.tsv', 'test-A/out.tsv')]\n",
    "for in_path, out_path in filenames:\n",
    "    # Write with the same explicit encoding used for reading; relying on the\n",
    "    # platform default can fail on non-ASCII party names.\n",
    "    with open(in_path, 'r', encoding=\"utf-8\") as in_file,\\\n",
    "            open(out_path, \"w\", encoding=\"utf-8\") as out_file:\n",
    "        reader = csv.reader(in_file, delimiter='\\t', quoting=csv.QUOTE_NONE)\n",
    "        for item in reader:\n",
    "            if len(item) < 3:\n",
    "                # Blank or short row: emit an empty line instead of raising\n",
    "                # IndexError. Presumably out.tsv must stay line-aligned with\n",
    "                # in.tsv -- TODO confirm.\n",
    "                out_file.write(\"\\n\")\n",
    "                continue\n",
    "            # Column 1 lists the requested field names, column 2 holds the text.\n",
    "            needed_info = item[1].strip().split()\n",
    "            # The text carries literal backslash escapes; turn them into spaces.\n",
    "            text = item[2].replace(\"\\\\n\", \" \").replace(\"\\\\f\", \" \").replace(\"\\\\t\", \" \").strip()\n",
    "            extracted = run(text, needed_info).replace(\":\", \"_\")\n",
    "            out_file.write(extracted + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
initial 2022-05-03 19:41:06 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 54,`
			`"id": "21af5e82",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import csv\n",`
			`"import re\n",`
			`"from collections import Counter\n",`
			`"from datetime import datetime\n",`
			`"import datefinder\n",`
			`"import word2number"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 12,`
			`"id": "fd7884b0",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def most_frequent(List, howmany=1):\n",`
			`" counter = Counter(List)\n",`
			`" return counter.most_common(1)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 49,`
			`"id": "de539456",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_jurisdiction(text):\n",`
			" us_states= r\"(Alabama\|Alaska\|Arizona\|Arkansas\|California\|Colorado\|Connecticut\|Delaware\|Florida\|Georgia\|Hawaii\|Idaho\|Illinois\|Indiana\|Iowa\|Kansas\|Kentucky\|Louisiana\|Maine\|Maryland\|Massachusetts\|Michigan\|Minnesota\|Mississippi\|Missouri\|Montana\|Nebraska\|Nevada\|New\\sHampshire\|New\\sJersey\|New\\sMexico\|New\\sYork\|North\\sCarolina\|North\\sDakota\|Ohio\|Oklahoma\|Oregon\|Pennsylvania\|Rhode\\sIsland\|South\\sCarolina\|South\\sDakota\|Tennessee\|Texas\|Utah\|Vermont\|Virginia\|Washington\|West\\sVirginia\|Wisconsin\|Wyoming)\"\n",
			`"\n",`
			`" matches = re.findall(us_states, text, re.MULTILINE \| re.IGNORECASE)\n",`
			`" result = most_frequent(matches)\n",`
			`"\n",`
			`" if result:\n",`
			`" return result[0][0].replace(\" \", \"_\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 47,`
			`"id": "51106ddf",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_parties(text):\n",`
			`" company_regex = r\"(([A-Z][A-za-z]+,?\\s)+(Inc\\.\|LLC\|Ltd\\.\|Company\|Corporation\|INC\\.\|LTD\\.\|COMPANY\|CORPORATION\|Bank\|Com\|Council\|Technology\|Systems))\"\n",`
			`" regex2 = r\"([A-Z][a-z]+\\s[A-Z]\\.\\s[A-Z][a-z]+)\"\n",`
			`"\n",`
			`" matches = re.findall(company_regex, text, re.MULTILINE)\n",`
			`" matches = [m[0] for m in matches]\n",`
			`" \n",`
			`" regex2 = re.findall(regex2, text, re.MULTILINE)\n",`
			`"\n",`
			`" companies = []\n",`
			`" for m in matches:\n",`
			`" splitted = [w.capitalize().rstrip(\",\") for w in m.split()]\n",`
			`" companies.append(\" \".join(splitted))\n",`
			`"\n",`
			`" result = most_frequent(companies,2)\n",`
			`"\n",`
			`" \n",`
			`" if len(result) < 2:\n",`
			`" result.extend(most_frequent(regex2, 2-len(result))) \n",`
			`"\n",`
			`" if result:\n",`
			`" return [x[0].replace(\" \", \"_\") for x in result]\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"id": "e15bf4c1",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_date(text):\n",`
			`" matches = []\n",`
			`" df_matches = datefinder.find_dates(text)\n",`
			`" \n",`
			`" while True:\n",`
			`" try:\n",`
			`" m = next(df_matches)\n",`
			`" except StopIteration:\n",`
			`" break\n",`
			`" except:\n",`
			`" continue\n",`
			`" matches.append(m)\n",`
			`"\n",`
			`" matches = filter(lambda x: 2022 > x.year > 1950, matches)\n",`
			`"\n",`
			`" dates = [x.strftime(\"%Y-%m-%d\") for x in matches]\n",`
			`"\n",`
			`" result = most_frequent(dates)\n",`
			`"\n",`
			`" if len(result) == 0:\n",`
			`" return None\n",`
			`" else:\n",`
			`" return result[0][0]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 51,`
			`"id": "f0279749",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def get_term(text):\n",`
			`" term_regex = r\"\\b([\\w()]*)\\s(months?\|years?)\\b\"\n",`
			`"\n",`
			`" match = list(re.finditer(term_regex, text, re.MULTILINE))\n",`
			`"\n",`
			`" if match:\n",`
			`" number, unit = match[0].groups()\n",`
			`" else:\n",`
			`" return None\n",`
			`"\n",`
			`" if m := re.match(r\"\\d+\", number):\n",`
			`" number = m.group()\n",`
			`" else:\n",`
			`" try:\n",`
			`" number = w2n.word_to_num(re.match(r\"\\b\\w+\\b\", number).group())\n",`
			`" except:\n",`
			`" return None\n",`
			`"\n",`
			`" return str(number) + \"_\" + unit"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 52,`
			`"id": "27dad743",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def run(text, needed_info):\n",`
			`" jurisdiction, date, term, parties = None, None, None, None\n",`
			`" \n",`
			`" if \"jurisdiction\" in needed_info:\n",`
			`" jurisdiction = get_jurisdiction(text)\n",`
			`" if \"effective_date\" in needed_info:\n",`
			`" date = get_date(text)\n",`
			`" if \"term\" in needed_info:\n",`
			`" term = get_term(text)\n",`
			`" if \"party\" in needed_info:\n",`
			`" parties = get_parties(text)\n",`
			`"\n",`
			`" result_dict = {}\n",`
			`"\n",`
			`" if date:\n",`
			`" result_dict[\"effective_date\"] = date\n",`
			`" if jurisdiction:\n",`
			`" result_dict[\"jurisdiction\"] = jurisdiction\n",`
			`" if term:\n",`
			`" result_dict[\"term\"] = term\n",`
			`"\n",`
			`" result_str = \" \".join([f\"{k}={v}\" for k,v in result_dict.items()])\n",`
			`"\n",`
			`" if parties:\n",`
			`" for p in parties :\n",`
			`" result_str += f\" party={p}\"\n",`
			`"\n",`
			`" return result_str"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 53,`
			`"id": "7b0f1f52",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"filenames=[('dev-0/in.tsv',\"dev-0/out.tsv\"), ('train/in.tsv', \"train/out.tsv\"), ('test-A/in.tsv', 'test-A/out.tsv')]\n",`
			`"for filename in filenames:\n",`
			`" with open(filename[0], 'r', encoding=\"utf-8\") as in_file,\\\n",`
			`" open(filename[1], \"w\") as out_file:\n",`
			`" reader = csv.reader(in_file, delimiter='\\t', quoting=csv.QUOTE_NONE)\n",`
			`" for item in reader:\n",`
			`" needed_info = item[1].strip().split()\n",`
			`" text = item[2].replace(\"\\\\n\", \" \").replace(\"\\\\f\", \" \").replace(\"\\\\t\", \" \").strip()\n",`
			`" extracted = run(text, needed_info).replace(\":\", \"_\")\n",`
			`" out_file.write(extracted + \"\\n\")"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.9.7"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`