{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "21af5e82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import re\n",
    "from collections import Counter\n",
    "from datetime import datetime\n",
    "\n",
    "import datefinder\n",
    "import word2number\n",
    "from word2number import w2n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fd7884b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_frequent(List, howmany=1):\n",
    "    \"\"\"Return the `howmany` most common items of `List` with their counts.\n",
    "\n",
    "    Args:\n",
    "        List: iterable of hashable items to tally.\n",
    "        howmany: maximum number of (item, count) pairs to return.\n",
    "\n",
    "    Returns:\n",
    "        A list of (item, count) tuples ordered from most to least common;\n",
    "        an empty list when `List` is empty.\n",
    "    \"\"\"\n",
    "    counter = Counter(List)\n",
    "    # Honour `howmany`; it used to be ignored in favour of a fixed 1.\n",
    "    return counter.most_common(howmany)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "de539456",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jurisdiction(text):\n",
    "    \"\"\"Return the US state named most often in `text`.\n",
    "\n",
    "    State names are matched case-insensitively and only as whole words, so\n",
    "    that e.g. \"remained\" is not counted as \"Maine\". Each match is normalised\n",
    "    (plain spaces, title case) before counting so that variants such as\n",
    "    \"NEW YORK\" and \"New York\" are tallied together.\n",
    "\n",
    "    Returns:\n",
    "        The state name with spaces replaced by underscores (e.g.\n",
    "        \"New_York\"), or None when no state name occurs in `text`.\n",
    "    \"\"\"\n",
    "    us_states = r\"\\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\\sHampshire|New\\sJersey|New\\sMexico|New\\sYork|North\\sCarolina|North\\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\\sIsland|South\\sCarolina|South\\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\\sVirginia|Wisconsin|Wyoming)\\b\"\n",
    "\n",
    "    matches = re.findall(us_states, text, re.MULTILINE | re.IGNORECASE)\n",
    "    # The whitespace class in the pattern also accepts tabs and newlines;\n",
    "    # map it to a plain space and unify the casing so the counter sees a\n",
    "    # single spelling per state.\n",
    "    normalized = [re.sub(r\"\\s\", \" \", m).title() for m in matches]\n",
    "    result = most_frequent(normalized)\n",
    "\n",
    "    if result:\n",
    "        return result[0][0].replace(\" \", \"_\")\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "51106ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parties(text):\n",
    "    \"\"\"Return up to two party names found in `text`.\n",
    "\n",
    "    Company-like names (capitalised words ending in a suffix such as\n",
    "    \"Inc.\" or \"LLC\") are tallied first; when that yields fewer than two\n",
    "    names, person names of the form \"First M. Last\" fill the rest.\n",
    "\n",
    "    Returns:\n",
    "        A list of names with spaces replaced by underscores, or None when\n",
    "        nothing was found.\n",
    "    \"\"\"\n",
    "    # The letter class is `A-Za-z`; the former `A-z` range also admitted the\n",
    "    # punctuation characters that sit between \"Z\" and \"a\" in ASCII.\n",
    "    company_regex = r\"(([A-Z][A-Za-z]+,?\\s)+(Inc\\.|LLC|Ltd\\.|Company|Corporation|INC\\.|LTD\\.|COMPANY|CORPORATION|Bank|Com|Council|Technology|Systems))\"\n",
    "    person_regex = r\"([A-Z][a-z]+\\s[A-Z]\\.\\s[A-Z][a-z]+)\"\n",
    "\n",
    "    # With nested groups findall yields tuples; element 0 is the whole name.\n",
    "    company_hits = [m[0] for m in re.findall(company_regex, text, re.MULTILINE)]\n",
    "    person_hits = re.findall(person_regex, text, re.MULTILINE)\n",
    "\n",
    "    # Unify casing and drop trailing commas so variants are counted together.\n",
    "    companies = [\n",
    "        \" \".join(w.capitalize().rstrip(\",\") for w in hit.split())\n",
    "        for hit in company_hits\n",
    "    ]\n",
    "\n",
    "    result = most_frequent(companies, 2)\n",
    "    if len(result) < 2:\n",
    "        result.extend(most_frequent(person_hits, 2 - len(result)))\n",
    "\n",
    "    if result:\n",
    "        return [x[0].replace(\" \", \"_\") for x in result]\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e15bf4c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_date(text, min_year=1950, max_year=2022):\n",
    "    \"\"\"Return the date mentioned most often in `text` as \"YYYY-MM-DD\".\n",
    "\n",
    "    Dates are extracted with `datefinder.find_dates`.\n",
    "\n",
    "    Args:\n",
    "        text: document text to scan.\n",
    "        min_year: exclusive lower bound on the year of accepted dates.\n",
    "        max_year: exclusive upper bound on the year of accepted dates.\n",
    "\n",
    "    Returns:\n",
    "        The most frequent accepted date, or None when there is none.\n",
    "    \"\"\"\n",
    "    matches = []\n",
    "    df_matches = datefinder.find_dates(text)\n",
    "\n",
    "    while True:\n",
    "        try:\n",
    "            m = next(df_matches)\n",
    "        except StopIteration:\n",
    "            break\n",
    "        except Exception:\n",
    "            # Presumably a candidate datefinder could not parse. Catching\n",
    "            # Exception instead of using a bare `except` lets\n",
    "            # KeyboardInterrupt and SystemExit propagate.\n",
    "            continue\n",
    "        matches.append(m)\n",
    "\n",
    "    dates = [\n",
    "        x.strftime(\"%Y-%m-%d\") for x in matches if max_year > x.year > min_year\n",
    "    ]\n",
    "    result = most_frequent(dates)\n",
    "\n",
    "    if not result:\n",
    "        return None\n",
    "    return result[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "f0279749",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_term(text):\n",
    "    \"\"\"Return the first parseable duration in `text` as \"<number>_<unit>\".\n",
    "\n",
    "    A candidate is a token directly followed by \"month(s)\" or \"year(s)\".\n",
    "    The token may contain digits (\"12\", \"two(2)\") or be an English number\n",
    "    word (\"two\"), which is converted with `w2n.word_to_num`. Candidates that\n",
    "    cannot be turned into a number (\"many years\") are skipped.\n",
    "\n",
    "    Returns:\n",
    "        A string such as \"2_years\" or \"6_months\" (unit spelled as in the\n",
    "        text), or None when no candidate could be converted.\n",
    "    \"\"\"\n",
    "    term_regex = r\"\\b([\\w()]*)\\s(months?|years?)\\b\"\n",
    "\n",
    "    for match in re.finditer(term_regex, text, re.MULTILINE):\n",
    "        token, unit = match.groups()\n",
    "\n",
    "        # Digits anywhere in the token win, so \"two(2)\" yields 2.\n",
    "        if digits := re.search(r\"\\d+\", token):\n",
    "            return digits.group() + \"_\" + unit\n",
    "\n",
    "        word = re.match(r\"\\w+\", token)\n",
    "        if word is None:\n",
    "            continue\n",
    "        try:\n",
    "            number = w2n.word_to_num(word.group())\n",
    "        except Exception:\n",
    "            # Not a usable number word; best effort, try the next candidate.\n",
    "            continue\n",
    "        return str(number) + \"_\" + unit\n",
    "\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "27dad743",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run(text, needed_info):\n",
    "    \"\"\"Extract the requested fields from `text` and format them as one line.\n",
    "\n",
    "    Args:\n",
    "        text: document text to analyse.\n",
    "        needed_info: collection of field names to extract; the recognised\n",
    "            values are \"jurisdiction\", \"effective_date\", \"term\" and \"party\".\n",
    "\n",
    "    Returns:\n",
    "        Space-separated \"key=value\" pairs in the order effective_date,\n",
    "        jurisdiction, term, followed by one \"party=<name>\" pair per party.\n",
    "        Fields that were not requested or not found are omitted, and the\n",
    "        result is an empty string when nothing was extracted.\n",
    "    \"\"\"\n",
    "    jurisdiction, date, term, parties = None, None, None, None\n",
    "\n",
    "    if \"jurisdiction\" in needed_info:\n",
    "        jurisdiction = get_jurisdiction(text)\n",
    "    if \"effective_date\" in needed_info:\n",
    "        date = get_date(text)\n",
    "    if \"term\" in needed_info:\n",
    "        term = get_term(text)\n",
    "    if \"party\" in needed_info:\n",
    "        parties = get_parties(text)\n",
    "\n",
    "    # Collect every pair first and join once, so the line never starts with\n",
    "    # a stray space when only parties were found.\n",
    "    pairs = []\n",
    "    if date:\n",
    "        pairs.append(f\"effective_date={date}\")\n",
    "    if jurisdiction:\n",
    "        pairs.append(f\"jurisdiction={jurisdiction}\")\n",
    "    if term:\n",
    "        pairs.append(f\"term={term}\")\n",
    "    if parties:\n",
    "        pairs.extend(f\"party={p}\" for p in parties)\n",
    "\n",
    "    return \" \".join(pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "7b0f1f52",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (input, output) path pairs; one output line is written per input row.\n",
    "filenames = [\n",
    "    (\"dev-0/in.tsv\", \"dev-0/out.tsv\"),\n",
    "    (\"train/in.tsv\", \"train/out.tsv\"),\n",
    "    (\"test-A/in.tsv\", \"test-A/out.tsv\"),\n",
    "]\n",
    "\n",
    "for in_path, out_path in filenames:\n",
    "    # newline=\"\" is what the csv module asks of files it reads; the output\n",
    "    # encoding is pinned so the result does not depend on the platform locale.\n",
    "    with open(in_path, \"r\", encoding=\"utf-8\", newline=\"\") as in_file, \\\n",
    "            open(out_path, \"w\", encoding=\"utf-8\") as out_file:\n",
    "        reader = csv.reader(in_file, delimiter=\"\\t\", quoting=csv.QUOTE_NONE)\n",
    "        for item in reader:\n",
    "            # Column 1 lists the requested fields; column 2 holds the document.\n",
    "            needed_info = item[1].strip().split()\n",
    "            # Breaks are stored as literal backslash escapes; turn them into spaces.\n",
    "            text = item[2].replace(\"\\\\n\", \" \").replace(\"\\\\f\", \" \").replace(\"\\\\t\", \" \").strip()\n",
    "            # Colons are replaced with underscores in the final line.\n",
    "            extracted = run(text, needed_info).replace(\":\", \"_\")\n",
    "            out_file.write(extracted + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}