From 5dc80126c077f55fc9925f39cffbf6fea5ca8b85 Mon Sep 17 00:00:00 2001
From: Iwona Christop <iwona.christop@gmail.com>
Date: Mon, 2 May 2022 14:33:16 +0200
Subject: [PATCH] Add main

---
 main.ipynb | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py    | 123 ++++++++++++++++++++++
 2 files changed, 426 insertions(+)
 create mode 100644 main.ipynb
 create mode 100644 main.py

diff --git a/main.ipynb b/main.ipynb
new file mode 100644
index 0000000..ad6b641
--- /dev/null
+++ b/main.ipynb
@@ -0,0 +1,303 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lzma\n",
+    "\n",
+    "# Read file with lzma\n",
+    "NDAs = []\n",
+    "\n",
+    "with lzma.open('train/in.tsv.xz') as f:\n",
+    "    for line in f:\n",
+    "        NDAs.append(line.decode('utf-8'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read expected information\n",
+    "expected = []\n",
+    "\n",
+    "with open('train/expected.tsv') as f:\n",
+    "    for line in f:\n",
+    "        expected.append(line.replace('\\n', '').split(' '))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "months = {'01': 'January', '02': 'February', '03': 'March', \n",
+    "    '04': 'April', '05': 'May', '06': 'June',\n",
+    "    '07': 'July', '08': 'August', '09': 'September',\n",
+    "    '10': 'October', '11': 'November', '12': 'December'}\n",
+    "\n",
+    "def dayToWord(day):\n",
+    "    day = int(day)\n",
+    "    if day > 3:\n",
+    "        return str(day) + 'th'\n",
+    "    elif day == 3:\n",
+    "        return str(day) + 'rd'\n",
+    "    elif day == 2 :\n",
+    "        return str(day) + 'nd'\n",
+    "    else: return str(day) + 'st'\n",
+    "\n",
+    "def numToWord(number):\n",
+    "    try:\n",
+    "        number = int(number)\n",
+    "        d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',\n",
+    "            6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',\n",
+    "            11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',\n",
+    "            15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',\n",
+    "            19 : 'nineteen', 20 : 'twenty',\n",
+    "            30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',\n",
+    "            70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }\n",
+    "        if number < 20:\n",
+    "            return d[number]\n",
+    "        else:\n",
+    "            if number % 10 == 0: return d[number]\n",
+    "            else: return d[number // 10 * 10] + '-' + d[number % 10]\n",
+    "    except:\n",
+    "        return number\n",
+    "\n",
+    "def labelJurisdiction(text, jurisdiction):\n",
+    "    jurisdictions = []\n",
+    "    jurisdiction = jurisdiction.replace('_', ' ')\n",
+    "    for match in re.finditer(jurisdiction, text):\n",
+    "        tup = (match.start(), match.end(), 'jurisdiction')\n",
+    "        jurisdictions.append(tup)\n",
+    "    return jurisdictions\n",
+    "\n",
+    "def labelEffectiveDate(text, date):\n",
+    "    dates = []\n",
+    "    year, month, day = date.split('-')\n",
+    "    \n",
+    "    dateFormats = [month + '/' + day + '/' + year,\n",
+    "        month + '/' + day + '/' + year[-2:], \n",
+    "        month[1] + '/' + day + '/' + year, \n",
+    "        month[1] + '/' + day[1] + '/' + year, \n",
+    "        month[1] + '/' + day + '/' + year[-2:], \n",
+    "        month[1] + '/' + day[1] + '/' + year[-2:],\n",
+    "        dayToWord(day) + ' of ' + months[month] + ', ' + year,\n",
+    "        dayToWord(day) + ' day of ' + months[month] + ', ' + year,\n",
+    "        months[month] + ' ' + day + ', ' + year ]\n",
+    "\n",
+    "    for format in dateFormats:\n",
+    "        for match in re.finditer(format, text, flags=re.IGNORECASE):\n",
+    "            tup = (match.start(), match.end(), 'effective_date')\n",
+    "            dates.append(tup)\n",
+    "\n",
+    "    return dates\n",
+    "\n",
+    "def labelParties(text, party):\n",
+    "    parties = []\n",
+    "    if 'Inc' in party:\n",
+    "        regular = ''\n",
+    "        for word in party.split('_'):\n",
+    "            regular += word + '(.*)'\n",
+    "        party = regular\n",
+    "    party = party.replace('_', ' ')\n",
+    "    for match in re.finditer(party, text, flags=re.IGNORECASE):\n",
+    "        tup = (match.start(), match.end(), 'party')\n",
+    "        parties.append(tup)\n",
+    "    return parties\n",
+    "\n",
+    "def labelTerms(text, term):\n",
+    "    terms = []\n",
+    "    term = term.split('_')\n",
+    "    number = numToWord(term[0])\n",
+    "    units = term[1]\n",
+    "    for match in re.finditer(number + ' ' + units, text, flags=re.IGNORECASE):\n",
+    "        tup = (match.start(), match.end(), 'term')\n",
+    "        terms.append(tup)\n",
+    "    return terms"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expectEntities = []\n",
+    "\n",
+    "for expect in expected:\n",
+    "    # expect = expect.split()\n",
+    "    entities = []\n",
+    "    for e in expect:\n",
+    "        label, entity = e.split('=')\n",
+    "        entities.append((label, entity))\n",
+    "    expectEntities.append(entities)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainData =[]\n",
+    "\n",
+    "for i in range(len(expectEntities)):\n",
+    "    listOfEntities = []\n",
+    "    for entity in expectEntities[i]:\n",
+    "        if entity[0] == 'effective_date':\n",
+    "            listOfEntities.append(labelEffectiveDate(NDAs[i], entity[1]))\n",
+    "        elif entity[0] == 'jurisdiction':\n",
+    "            listOfEntities.append(labelJurisdiction(NDAs[i], entity[1]))\n",
+    "        elif entity[0] == 'party':\n",
+    "            listOfEntities.append(labelParties(NDAs[i], entity[1]))\n",
+    "        else: listOfEntities.append(labelTerms(NDAs[i], entity[1]))\n",
+    "    listOfEntities = [item for sublist in listOfEntities for item in sublist]\n",
+    "    trainData.append((NDAs[i], {'entities': listOfEntities}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Created blank \"en\" model\n"
+     ]
+    }
+   ],
+   "source": [
+    "import spacy\n",
+    "from spacy.tokens import DocBin\n",
+    "\n",
+    "model = None\n",
+    "nIter = 100\n",
+    "\n",
+    "if model is not None:\n",
+    "    nlp = spacy.load(model)\n",
+    "    print('Loaded model')\n",
+    "else:\n",
+    "    nlp = spacy.blank('en')\n",
+    "    print('Created blank \"en\" model')\n",
+    "\n",
+    "if 'ner' not in nlp.pipe_names:\n",
+    "    # ner = nlp.create_pipe('ner')\n",
+    "    ner = nlp.add_pipe('ner', last=True)\n",
+    "else:\n",
+    "    ner = nlp.get_pipe('ner')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for data in trainData:\n",
+    "    for ent in data[1].get('entities'):\n",
+    "        ner.add_label(ent[2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "otherPipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  1%|          | 3/254 [00:00<01:11,  3.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"03efbda01358533c167ca9b1e6d72051.pdf\teffective_dat...\" with entities \"[(7513, 7521, 'effective_date'), (15032, 15040, 'e...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  2%|▏         | 4/254 [00:01<02:28,  1.68it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"03fd0e629b617da00c54794a8a78b24d.pdf\teffective_dat...\" with entities \"[(287, 300, 'effective_date'), (25276, 25289, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  2%|▏         | 6/254 [00:04<04:11,  1.01s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"04bf0791804e8487c91ab84eaa47a335.pdf\teffective_dat...\" with entities \"[(198, 216, 'effective_date'), (22663, 22681, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  3%|▎         | 8/254 [00:07<04:37,  1.13s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0587275477c6ad6d0d72419383e04b88.pdf\teffective_dat...\" with entities \"[(4528, 4536, 'jurisdiction'), (4604, 4612, 'juris...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  4%|▎         | 9/254 [00:12<09:04,  2.22s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"05947711a24a5b7ce401911d31e19c91.pdf\teffective_dat...\" with entities \"[(18271, 18279, 'jurisdiction'), (18507, 18515, 'j...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  6%|▌         | 14/254 [00:18<04:18,  1.08s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0859334b76224ff82c1312ae7b2b5da1.pdf\teffective_dat...\" with entities \"[(279, 296, 'effective_date'), (22981, 22998, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  7%|▋         | 17/254 [00:20<03:29,  1.13it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0c3ab1d0c8bb3b1c2f7a64f3ab584368.pdf\teffective_dat...\" with entities \"[(243, 259, 'effective_date'), (35225, 35241, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  7%|▋         | 18/254 [00:23<04:38,  1.18s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0c7b90701575b147c4ac245ca478ee7c.pdf\teffective_dat...\" with entities \"[(10058, 10065, 'jurisdiction'), (10252, 10259, 'j...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
+      "  warnings.warn(\n",
+      "  7%|▋         | 19/254 [00:25<05:25,  1.39s/it]"
+     ]
+    }
+   ],
+   "source": [
+    "# import random\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from spacy.training.example import Example\n",
+    "\n",
+    "with nlp.disable_pipes(*otherPipes):\n",
+    "    optimizer = nlp.begin_training()\n",
+    "    for itn in range(nIter):\n",
+    "        # random.shuffle(trainData)\n",
+    "        losses = {}\n",
+    "        for text, annotations in tqdm(trainData):\n",
+    "            try:\n",
+    "                doc = nlp.make_doc(text)\n",
+    "                example = Example.from_dict(doc, annotations)\n",
+    "                nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)\n",
+    "            except:\n",
+    "                pass\n",
+    "        print(losses)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.2 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..d689262
--- /dev/null
+++ b/main.py
@@ -0,0 +1,123 @@
+import lzma
+import re
+
+# Maps the zero-padded month number ('01'..'12') to its English name.
+months = {'%02d' % number: name for number, name in enumerate(
+    ('January', 'February', 'March', 'April', 'May', 'June', 'July',
+     'August', 'September', 'October', 'November', 'December'), 1)}
+
+
+def dayToWord(day):
+    """Return the day of the month with its English ordinal suffix ('21st')."""
+    day = int(day)
+    # 11, 12 and 13 take 'th'; otherwise the last digit decides the suffix.
+    if 11 <= day % 100 <= 13:
+        suffix = 'th'
+    else:
+        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
+    return str(day) + suffix
+
+def numToWord(number):
+    """Spell out an integer 1-99 in English ('forty-two'); else return str(number)."""
+    ones = ('one two three four five six seven eight nine ten eleven twelve '
+        'thirteen fourteen fifteen sixteen seventeen eighteen nineteen').split()
+    tens = 'twenty thirty forty fifty sixty seventy eighty ninety'.split()
+    try:
+        value = int(number)
+    except (TypeError, ValueError):
+        value = 0  # not a number: falls into the unsupported case below
+    if not 0 < value < 100:
+        return str(number)  # outside the spelled-out range: keep it verbatim
+    if value < 20:
+        return ones[value - 1]
+    return tens[value // 10 - 2] + ('-' + ones[value % 10 - 1] if value % 10 else '')
+
+def labelJurisdiction(text, jurisdiction):
+    """Return (start, end, 'JURISDICTION') spans for each occurrence in text."""
+    # As in labelParties, '_' stands for a space; escape so the name is literal.
+    pattern = re.escape(jurisdiction.replace('_', ' '))
+    return [(m.start(), m.end(), 'JURISDICTION')
+            for m in re.finditer(pattern, text)]
+
+def labelEffectiveDate(text, date):
+    """Return sorted unique (start, end, 'EFFECTIVE_DATE') spans of an ISO date in text."""
+    year, month, day = date.split('-')
+    # Leading zeros are optional; int() keeps both digits of '10'..'31'.
+    mm, dd = '0?' + str(int(month)), '0?' + str(int(day))
+    ordinal, name = re.escape(dayToWord(day)), months[month]
+    dateFormats = [mm + '/' + dd + '/' + year,
+        mm + '/' + dd + '/' + year[-2:],
+        ordinal + ' of ' + name + ', ' + year,
+        ordinal + ' day of ' + name + ', ' + year,
+        name + ' ' + dd + ', ' + year]
+
+    dates = set()
+    for pattern in dateFormats:
+        # Digit guards keep '1/05/2012' from matching inside '11/05/2012'.
+        guarded = r'(?<!\d)(?:' + pattern + r')(?!\d)'
+        for match in re.finditer(guarded, text, flags=re.IGNORECASE):
+            dates.add((match.start(), match.end(), 'EFFECTIVE_DATE'))
+
+    return sorted(dates)
+
+def labelParties(text, party):
+    """Return (start, end, 'PARTY') spans for each mention of party in text.
+
+    party uses '_' between words. Names containing 'Inc' tolerate punctuation
+    between words (e.g. 'Acme, Inc'); other names must match with single spaces.
+    """
+    words = [re.escape(word) for word in party.split('_') if word]
+    if not words:
+        return []  # an empty pattern would match at every position
+    # \W+ rather than a greedy wildcard, which would swallow the rest of the text.
+    pattern = (r'\W+' if 'Inc' in party else ' ').join(words)
+    return [(m.start(), m.end(), 'PARTY') for m in re.finditer(pattern, text, flags=re.IGNORECASE)]
+
+def labelTerms(text, term):
+    """Return (start, end, 'TERM') spans for a term like '2_years', as words or digits."""
+    parts = term.split('_')
+    if len(parts) < 2 or not all(parts[:2]):
+        return []  # malformed term (no '<number>_<units>'): nothing to search for
+    # Accept both the spelled-out amount ('two years') and the digits ('2 years').
+    alternatives = re.escape(str(numToWord(parts[0]))) + '|' + re.escape(parts[0])
+    pattern = r'\b(?:' + alternatives + ') ' + re.escape(parts[1])
+    return [(m.start(), m.end(), 'TERM') for m in re.finditer(pattern, text, flags=re.IGNORECASE)]
+
+
+if __name__ == '__main__':
+    # Read NDAs with lzma: one document per line, decoded as UTF-8.
+    with lzma.open('train/in.tsv.xz') as f:
+        NDAs = [line.decode('utf-8') for line in f]
+
+    # Read expected information: every line holds space-separated
+    # 'label=value' items. The line must be split into items here;
+    # iterating the raw string would yield single characters and the
+    # 'label=value' unpacking below would fail.
+    with open('train/expected.tsv', encoding='utf-8') as f:
+        expected = [line.replace('\n', '').split(' ') for line in f]
+
+    # Expected to labeled entities: a list of (label, value) per document.
+    expectEntities = []
+    for expect in expected:
+        entities = []
+        for e in expect:
+            if not e:
+                continue  # blank line or doubled space: nothing to parse
+            # Split on the first '=' only so a value may itself contain '='.
+            label, entity = e.split('=', 1)
+            entities.append((label, entity))
+        expectEntities.append(entities)
+
+    # Labeller per known label; any other label is handled as a term.
+    labellers = {'effective_date': labelEffectiveDate,
+        'jurisdiction': labelJurisdiction,
+        'party': labelParties}
+
+    # Training data for Spacy: (text, {'entities': [(start, end, label)]}).
+    trainData = []
+    for nda, entities in zip(NDAs, expectEntities):
+        listOfEntities = []
+        for label, entity in entities:
+            labeller = labellers.get(label, labelTerms)
+            listOfEntities.extend(labeller(nda, entity))
+        trainData.append((nda, {'entities': listOfEntities}))
\ No newline at end of file