From 5dc80126c077f55fc9925f39cffbf6fea5ca8b85 Mon Sep 17 00:00:00 2001 From: Iwona Christop Date: Mon, 2 May 2022 14:33:16 +0200 Subject: [PATCH] Add main --- main.ipynb | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 123 ++++++++++++++++++++++ 2 files changed, 426 insertions(+) create mode 100644 main.ipynb create mode 100644 main.py diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..ad6b641 --- /dev/null +++ b/main.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import lzma\n", + "\n", + "# RTead file with lzma\n", + "NDAs = []\n", + "\n", + "with lzma.open('train/in.tsv.xz') as f:\n", + " for line in f:\n", + " NDAs.append(line.decode('utf-8'))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Read expected information\n", + "expected = []\n", + "\n", + "with open('train/expected.tsv') as f:\n", + " for line in f:\n", + " expected.append(line.replace('\\n', '').split(' '))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "months = {'01': 'January', '02': 'February', '03': 'March', \n", + " '04': 'April', '05': 'May', '06': 'June',\n", + " '07': 'July', '08': 'August', '09': 'September',\n", + " '10': 'October', '11': 'November', '12': 'December'}\n", + "\n", + "def dayToWord(day):\n", + " day = int(day)\n", + " if day > 3:\n", + " return str(day) + 'th'\n", + " elif day == 3:\n", + " return str(day) + 'rd'\n", + " elif day == 2 :\n", + " return str(day) + 'nd'\n", + " else: return str(day) + 'st'\n", + "\n", + "def numToWord(number):\n", + " try:\n", + " number = int(number)\n", + " d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',\n", + " 6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',\n", + " 11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 
'fourteen',\n", + " 15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',\n", + " 19 : 'nineteen', 20 : 'twenty',\n", + " 30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',\n", + " 70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }\n", + " if number < 20:\n", + " return d[number]\n", + " else:\n", + " if number % 10 == 0: return d[number]\n", + " else: return d[number // 10 * 10] + '-' + d[number % 10]\n", + " except:\n", + " return number\n", + "\n", + "def labelJurisdiction(text, jurisdiction):\n", + " jurisdictions = []\n", + " jurisdiction = jurisdiction.replace('_', ' ')\n", + " for match in re.finditer(jurisdiction, text):\n", + " tup = (match.start(), match.end(), 'jurisdiction')\n", + " jurisdictions.append(tup)\n", + " return jurisdictions\n", + "\n", + "def labelEffectiveDate(text, date):\n", + " dates = []\n", + " year, month, day = date.split('-')\n", + " \n", + " dateFormats = [month + '/' + day + '/' + year,\n", + " month + '/' + day + '/' + year[-2:], \n", + " month[1] + '/' + day + '/' + year, \n", + " month[1] + '/' + day[1] + '/' + year, \n", + " month[1] + '/' + day + '/' + year[-2:], \n", + " month[1] + '/' + day[1] + '/' + year[-2:],\n", + " dayToWord(day) + ' of ' + months[month] + ', ' + year,\n", + " dayToWord(day) + ' day of ' + months[month] + ', ' + year,\n", + " months[month] + ' ' + day + ', ' + year ]\n", + "\n", + " for format in dateFormats:\n", + " for match in re.finditer(format, text, flags=re.IGNORECASE):\n", + " tup = (match.start(), match.end(), 'effective_date')\n", + " dates.append(tup)\n", + "\n", + " return dates\n", + "\n", + "def labelParties(text, party):\n", + " parties = []\n", + " if 'Inc' in party:\n", + " regular = ''\n", + " for word in party.split('_'):\n", + " regular += word + '(.*)'\n", + " party = regular\n", + " party = party.replace('_', ' ')\n", + " for match in re.finditer(party, text, flags=re.IGNORECASE):\n", + " tup = (match.start(), match.end(), 'party')\n", + " parties.append(tup)\n", + 
" return parties\n", + "\n", + "def labelTerms(text, term):\n", + " terms = []\n", + " term = term.split('_')\n", + " number = numToWord(term[0])\n", + " units = term[1]\n", + " for match in re.finditer(number + ' ' + units, text, flags=re.IGNORECASE):\n", + " tup = (match.start(), match.end(), 'term')\n", + " terms.append(tup)\n", + " return terms" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "expectEntities = []\n", + "\n", + "for expect in expected:\n", + " # expect = expect.split()\n", + " entities = []\n", + " for e in expect:\n", + " label, entity = e.split('=')\n", + " entities.append((label, entity))\n", + " expectEntities.append(entities)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "trainData =[]\n", + "\n", + "for i in range(len(expectEntities)):\n", + " listOfEntities = []\n", + " for entity in expectEntities[i]:\n", + " if entity[0] == 'effective_date':\n", + " listOfEntities.append(labelEffectiveDate(NDAs[i], entity[1]))\n", + " elif entity[0] == 'jurisdiction':\n", + " listOfEntities.append(labelJurisdiction(NDAs[i], entity[1]))\n", + " elif entity[0] == 'party':\n", + " listOfEntities.append(labelParties(NDAs[i], entity[1]))\n", + " else: listOfEntities.append(labelTerms(NDAs[i], entity[1]))\n", + " listOfEntities = [item for sublist in listOfEntities for item in sublist]\n", + " trainData.append((NDAs[i], {'entities': listOfEntities}))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created blank \"en\" model\n" + ] + } + ], + "source": [ + "import spacy\n", + "from spacy.tokens import DocBin\n", + "\n", + "model = None\n", + "nIter = 100\n", + "\n", + "if model is not None:\n", + " nlp = spacy.load(model)\n", + " print('Loaded model')\n", + "else:\n", + " nlp = spacy.blank('en')\n", + " print('Created 
blank \"en\" model')\n", + "\n", + "if 'ner' not in nlp.pipe_names:\n", + " # ner = nlp.create_pipe('ner')\n", + " ner = nlp.add_pipe('ner', last=True)\n", + "else:\n", + " ner = nlp.get_pipe('ner')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "for data in trainData:\n", + " for ent in data[1].get('entities'):\n", + " ner.add_label(ent[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "otherPipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%| | 3/254 [00:00<01:11, 3.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"03efbda01358533c167ca9b1e6d72051.pdf\teffective_dat...\" with entities \"[(7513, 7521, 'effective_date'), (15032, 15040, 'e...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 2%|▏ | 4/254 [00:01<02:28, 1.68it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"03fd0e629b617da00c54794a8a78b24d.pdf\teffective_dat...\" with entities \"[(287, 300, 'effective_date'), (25276, 25289, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. 
Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 2%|▏ | 6/254 [00:04<04:11, 1.01s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"04bf0791804e8487c91ab84eaa47a335.pdf\teffective_dat...\" with entities \"[(198, 216, 'effective_date'), (22663, 22681, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 3%|▎ | 8/254 [00:07<04:37, 1.13s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0587275477c6ad6d0d72419383e04b88.pdf\teffective_dat...\" with entities \"[(4528, 4536, 'jurisdiction'), (4604, 4612, 'juris...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 4%|▎ | 9/254 [00:12<09:04, 2.22s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"05947711a24a5b7ce401911d31e19c91.pdf\teffective_dat...\" with entities \"[(18271, 18279, 'jurisdiction'), (18507, 18515, 'j...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. 
Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 6%|▌ | 14/254 [00:18<04:18, 1.08s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0859334b76224ff82c1312ae7b2b5da1.pdf\teffective_dat...\" with entities \"[(279, 296, 'effective_date'), (22981, 22998, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 7%|▋ | 17/254 [00:20<03:29, 1.13it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0c3ab1d0c8bb3b1c2f7a64f3ab584368.pdf\teffective_dat...\" with entities \"[(243, 259, 'effective_date'), (35225, 35241, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", + " warnings.warn(\n", + " 7%|▋ | 18/254 [00:23<04:38, 1.18s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0c7b90701575b147c4ac245ca478ee7c.pdf\teffective_dat...\" with entities \"[(10058, 10065, 'jurisdiction'), (10252, 10259, 'j...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. 
"""Build spaCy NER training data from NDA texts and their expected key facts.

Each labelling function receives the full text of one document plus one
expected value (in the underscore-separated form used by ``expected.tsv``)
and returns a list of ``(start, end, label)`` character spans.
"""
import lzma
import re

# Zero-padded month number (as it appears in ISO dates) -> English month name.
months = {'01': 'January', '02': 'February', '03': 'March',
          '04': 'April', '05': 'May', '06': 'June',
          '07': 'July', '08': 'August', '09': 'September',
          '10': 'October', '11': 'November', '12': 'December'}

# Cardinal words for 1-20 and the round tens; 21-99 are composed from these.
_NUMBER_WORDS = {1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
                 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
                 11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
                 15: 'fifteen', 16: 'sixteen', 17: 'seventeen',
                 18: 'eighteen', 19: 'nineteen', 20: 'twenty',
                 30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
                 70: 'seventy', 80: 'eighty', 90: 'ninety'}


def _phrasePattern(phrase):
    """Return a regex matching `phrase` literally, not inside a longer token.

    The phrase is escaped so names such as ``D.C.`` or ``(B)`` are not
    interpreted as regex syntax.  The look-arounds stop ``5/15/2020`` from
    matching inside ``05/15/2020`` and ``two years`` inside
    ``twenty-two years``.
    """
    return r'(?<![\w-])' + re.escape(phrase) + r'(?!\w)'


def _findSpans(text, patterns, label, flags=0):
    """Return ``(start, end, label)`` for every match of every pattern."""
    spans = []
    for pattern in patterns:
        for match in re.finditer(pattern, text, flags=flags):
            spans.append((match.start(), match.end(), label))
    return spans


def dayToWord(day):
    """Return the day with its English ordinal suffix, e.g. ``'21'`` -> ``'21st'``.

    `day` may be an int or a (possibly zero-padded) numeric string.
    Raises ValueError if it cannot be converted to an int.
    """
    day = int(day)
    if 10 <= day % 100 <= 20:
        # 11th, 12th, 13th are irregular: always 'th'.
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix


def numToWord(number):
    """Spell out `number` in English for 1-99, e.g. ``'42'`` -> ``'forty-two'``.

    Values outside 1-99, and inputs that are not integers, are returned
    unchanged as a string so that callers can still build a search phrase
    from them instead of crashing.
    """
    try:
        value = int(number)
    except (TypeError, ValueError):
        return str(number)
    if value in _NUMBER_WORDS:
        return _NUMBER_WORDS[value]
    tens, ones = divmod(value, 10)
    if 2 <= tens <= 9 and ones:
        return _NUMBER_WORDS[tens * 10] + '-' + _NUMBER_WORDS[ones]
    return str(value)


def labelJurisdiction(text, jurisdiction):
    """Find case-sensitive mentions of `jurisdiction` in `text`.

    `jurisdiction` uses underscores for spaces (``'New_York'``).
    Returns a list of ``(start, end, 'JURISDICTION')`` tuples.
    """
    phrase = jurisdiction.replace('_', ' ')
    if not phrase:
        # An empty pattern would "match" at every position.
        return []
    return _findSpans(text, [_phrasePattern(phrase)], 'JURISDICTION')


def labelEffectiveDate(text, date):
    """Find mentions of the ISO date `date` (``YYYY-MM-DD``) in `text`.

    Numeric ``M/D/Y`` forms (with and without leading zeros, 2- and 4-digit
    year) and the written forms ``15th of May, 2020``,
    ``15th day of May, 2020`` and ``May 15, 2020`` are searched
    case-insensitively.

    Returns a list of ``(start, end, 'EFFECTIVE_DATE')`` tuples.
    Raises ValueError if `date` is not three '-'-separated numeric parts and
    KeyError if the month is not a zero-padded ``01``-``12``.
    """
    year, month, day = date.split('-')
    monthName = months[month]
    # int() strips the leading zero correctly; month[1] turned '10' into '0'.
    monthForms = (month, str(int(month)))
    dayForms = (day, str(int(day)))
    yearForms = (year, year[-2:])

    candidates = []
    for m in monthForms:
        for d in dayForms:
            for y in yearForms:
                candidates.append(m + '/' + d + '/' + y)
    candidates.append(dayToWord(day) + ' of ' + monthName + ', ' + year)
    candidates.append(dayToWord(day) + ' day of ' + monthName + ', ' + year)
    for d in dayForms:
        candidates.append(monthName + ' ' + d + ', ' + year)

    # Padded and unpadded forms coincide for e.g. day '15'; drop duplicates
    # (keeping order) so the same span is not reported twice.
    patterns = [_phrasePattern(c) for c in dict.fromkeys(candidates)]
    return _findSpans(text, patterns, 'EFFECTIVE_DATE', re.IGNORECASE)


def labelParties(text, party):
    """Find case-insensitive mentions of the party name `party` in `text`.

    `party` uses underscores for spaces.  For names containing ``Inc`` the
    punctuation between and after the words is allowed to vary, so
    ``'Acme_Inc.'`` matches ``Acme Inc``, ``Acme, Inc`` and ``Acme, Inc.``.
    Only separators are tolerated between words; the previous greedy
    ``(.*)`` made the span run to the end of the document.

    Returns a list of ``(start, end, 'PARTY')`` tuples.
    """
    if 'Inc' in party:
        cores = [word.rstrip('.,') for word in party.split('_')]
        cores = [re.escape(core) for core in cores if core]
        if not cores:
            return []
        pattern = (r'(?<![\w-])' + r'[ .,]+'.join(cores)
                   + r'\.?(?!\w)')
    else:
        phrase = party.replace('_', ' ')
        if not phrase:
            return []
        pattern = _phrasePattern(phrase)
    return _findSpans(text, [pattern], 'PARTY', re.IGNORECASE)


def labelTerms(text, term):
    """Find case-insensitive mentions of a duration such as ``'2_years'``.

    The number is spelled out (``two years``) before searching.  A `term`
    without a ``number_unit`` shape yields no spans.

    Returns a list of ``(start, end, 'TERM')`` tuples.
    """
    parts = term.split('_')
    if len(parts) < 2 or not parts[0] or not parts[1]:
        return []
    phrase = numToWord(parts[0]) + ' ' + parts[1]
    return _findSpans(text, [_phrasePattern(phrase)], 'TERM', re.IGNORECASE)


# Expected-file label -> labelling function; anything else is treated as a term.
_LABELERS = {'effective_date': labelEffectiveDate,
             'jurisdiction': labelJurisdiction,
             'party': labelParties}


def _parseExpected(line):
    """Split ``'label=value label=value'`` into ``(label, value)`` pairs."""
    entities = []
    for item in line.split():
        label, separator, value = item.partition('=')
        if separator:
            entities.append((label, value))
    return entities


def _buildTrainData(documents, expectEntities):
    """Pair each document with the spans found for its expected entities."""
    trainData = []
    for document, entities in zip(documents, expectEntities):
        spans = []
        for label, value in entities:
            labeler = _LABELERS.get(label, labelTerms)
            spans.extend(labeler(document, value))
        trainData.append((document, {'entities': spans}))
    return trainData


if __name__ == '__main__':
    # Read NDAs with lzma (one document per line, newline kept).
    with lzma.open('train/in.tsv.xz') as f:
        NDAs = [line.decode('utf-8') for line in f]

    # Read expected information
    with open('train/expected.tsv', encoding='utf-8') as f:
        expected = [line.replace('\n', '') for line in f]

    # Expected to labeled entities.  Each line must be split into its
    # 'label=value' items first; iterating the raw string yields characters.
    expectEntities = [_parseExpected(line) for line in expected]

    # Training data for Spacy
    trainData = _buildTrainData(NDAs, expectEntities)