Update main.py

2022-05-03 20:10:12 +02:00 · 2022-05-03 20:10:12 +02:00 · c2748dc657
commit c2748dc657
parent f28d4d4789
4 changed files with 330 additions and 123 deletions
--- a/creatingModel.py
+++ b/creatingModel.py
@ -0,0 +1,123 @@
+import lzma
+import re
+
+# Lookup from a zero-padded two-digit month number ('01'..'12') to the
+# English month name; used when building written-out date patterns.
+months = {'01': 'January', '02': 'February', '03': 'March', 
+    '04': 'April', '05': 'May', '06': 'June',
+    '07': 'July', '08': 'August', '09': 'September',
+    '10': 'October', '11': 'November', '12': 'December'}
+
+
+def dayToWord(day):
+    """Return a day of the month with its English ordinal suffix.
+
+    Args:
+        day: Day number as an int or a numeric string (leading zeros are
+            dropped by the int conversion, e.g. '03' -> 3).
+
+    Returns:
+        The number followed by 'st', 'nd', 'rd' or 'th', for example
+        '1st', '22nd', '23rd', '11th'.
+
+    Raises:
+        ValueError: If ``day`` cannot be converted to an int.
+    """
+    day = int(day)
+    # 11, 12 and 13 are irregular: they take 'th' even though they end in
+    # 1, 2 and 3. Every other number is decided by its last digit, which
+    # is what makes 21 -> '21st', 22 -> '22nd', 23 -> '23rd', 31 -> '31st'.
+    if 11 <= day % 100 <= 13:
+        suffix = 'th'
+    else:
+        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
+    return str(day) + suffix
+
+def numToWord(number):
+    """Spell out an integer from 1 to 99 in lowercase English.
+
+    Compound values are hyphenated ('twenty-one'). ``number`` may be an int
+    or a numeric string. Values with no entry in the lookup table (for
+    example 0 or 100) raise ``KeyError``.
+    """
+    value = int(number)
+    words = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
+        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
+        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
+        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
+        19 : 'nineteen', 20 : 'twenty',
+        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
+        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }
+    # Everything below twenty and every round ten has its own table entry.
+    if value < 20 or value % 10 == 0:
+        return words[value]
+    # Otherwise combine the round ten with the units word.
+    tens, ones = divmod(value, 10)
+    return words[tens * 10] + '-' + words[ones]
+
+def labelJurisdiction(text, jurisdiction):
+    """Locate every occurrence of a jurisdiction name in ``text``.
+
+    Args:
+        text: Document text to search.
+        jurisdiction: Jurisdiction name. Underscores are presumably the
+            expected file's stand-in for spaces (labelParties makes the same
+            substitution), so 'New_York' matches both 'New York' and
+            'New_York'.
+
+    Returns:
+        A list of ``(start, end, 'JURISDICTION')`` tuples, one per
+        case-sensitive match, in the order they appear in ``text``.
+    """
+    # An empty pattern would "match" at every position with empty spans.
+    if not jurisdiction:
+        return []
+    # The name is literal text, not a regular expression: escape each word
+    # and accept either a space or an underscore where the name has '_'.
+    pattern = '[ _]'.join(re.escape(word) for word in jurisdiction.split('_'))
+    return [(match.start(), match.end(), 'JURISDICTION')
+            for match in re.finditer(pattern, text)]
+
+def labelEffectiveDate(text, date):
+    """Locate written forms of an ISO date in ``text``.
+
+    Args:
+        text: Document text to search.
+        date: Date as 'YYYY-MM-DD' with a zero-padded month ('01'..'12').
+
+    Returns:
+        A list of ``(start, end, 'EFFECTIVE_DATE')`` tuples for every
+        case-insensitive match of the supported formats: numeric
+        month/day/year (padded or unpadded, four- or two-digit year),
+        '<ordinal> of <Month>, <year>', '<ordinal> day of <Month>, <year>'
+        and '<Month> <day>, <year>'.
+
+    Raises:
+        ValueError: If ``date`` does not have three '-'-separated numeric parts.
+        KeyError: If the month part is not a key of ``months``.
+    """
+    dates = []
+    year, month, day = date.split('-')
+    monthName = months[month]
+    ordinal = dayToWord(day)
+
+    # Unpadded variants. Taking only the second character (month[1]) would
+    # turn '10' into '0' and '25' into '5'; int() drops just the leading zero.
+    shortMonth = str(int(month))
+    shortDay = str(int(day))
+    shortYear = year[-2:]
+
+    dateFormats = []
+    for m in (month, shortMonth):
+        for d in (day, shortDay):
+            for y in (year, shortYear):
+                dateFormats.append(m + '/' + d + '/' + y)
+    dateFormats += [
+        ordinal + ' of ' + monthName + ', ' + year,
+        ordinal + ' day of ' + monthName + ', ' + year,
+        monthName + ' ' + day + ', ' + year,
+        monthName + ' ' + shortDay + ', ' + year,
+    ]
+
+    # dict.fromkeys drops duplicates (padded == unpadded for e.g. month '12')
+    # while keeping order, so the same span is never reported twice.
+    for dateFormat in dict.fromkeys(dateFormats):
+        # The digit guards stop a shorter variant from matching inside a
+        # longer one ('1/5/20' inside '11/5/2010'), which would otherwise
+        # yield overlapping entity spans.
+        pattern = r'(?<!\d)' + re.escape(dateFormat) + r'(?!\d)'
+        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
+            dates.append((match.start(), match.end(), 'EFFECTIVE_DATE'))
+
+    return dates
+
+def labelParties(text, party):
+    """Locate every occurrence of a party name in ``text``.
+
+    Args:
+        text: Document text to search.
+        party: Party name with underscores in place of spaces, for example
+            'CompuDyne_Corporation' or 'Acme_Inc.'.
+
+    Returns:
+        A list of ``(start, end, 'PARTY')`` tuples, one per
+        case-insensitive match, in the order they appear in ``text``.
+    """
+    # An empty pattern would "match" at every position with empty spans.
+    if not party:
+        return []
+    # Party names are literal text; escape them so characters such as
+    # '.', '(' or '&' are not interpreted as regex syntax.
+    words = [re.escape(word) for word in party.split('_')]
+    if 'Inc' in party:
+        # Names with 'Inc' are often punctuated in the document
+        # ('Acme, Inc.'), so allow any run of non-word characters between
+        # the words. A greedy '(.*)' here, and after the last word, would
+        # stretch the span to the end of the line.
+        pattern = r'\W+'.join(words)
+    else:
+        pattern = ' '.join(words)
+    return [(match.start(), match.end(), 'PARTY')
+            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
+
+def labelTerms(text, term):
+    """Locate a contract term such as 'two years' in ``text``.
+
+    ``term`` has the form '<number>_<units>' (e.g. '2_years'). The number is
+    spelled out with numToWord and searched for, followed by a single space
+    and the units, ignoring case.
+
+    Returns a list of ``(start, end, 'TERM')`` tuples, one per match.
+    """
+    parts = term.split('_')
+    pattern = numToWord(parts[0]) + ' ' + parts[1]
+    return [(found.start(), found.end(), 'TERM')
+            for found in re.finditer(pattern, text, flags=re.IGNORECASE)]
+
+
+if __name__ == '__main__':
+    # Read NDAs with lzma: one decoded document per line of the archive
+    with lzma.open('train/in.tsv.xz') as f:
+        NDAs = [line.decode('utf-8') for line in f]
+
+    # Read expected information, one line per NDA (same encoding as the NDAs)
+    with open('train/expected.tsv', encoding='utf-8') as f:
+        expected = [line.replace('\n', '') for line in f]
+
+    # Expected to labeled entities: each line becomes a list of
+    # (label, value) pairs.
+    expectEntities = []
+    for expect in expected:
+        entities = []
+        # Split the line into its whitespace-separated 'label=value' tokens;
+        # iterating the string itself would yield single characters and the
+        # unpacking below would fail on the first one.
+        for e in expect.split():
+            # Split on the first '=' only so values may contain '='.
+            label, entity = e.split('=', 1)
+            entities.append((label, entity))
+        expectEntities.append(entities)
+
+    # Labeling function per expected label; any other label is treated as
+    # a term.
+    labelers = {
+        'effective_date': labelEffectiveDate,
+        'jurisdiction': labelJurisdiction,
+        'party': labelParties,
+    }
+
+    # Training data for Spacy: (text, {'entities': [(start, end, label), ...]})
+    trainData = []
+    for i, entities in enumerate(expectEntities):
+        listOfEntities = []
+        for label, value in entities:
+            labeler = labelers.get(label, labelTerms)
+            # Each labeler returns a list of spans; extend keeps the result flat.
+            listOfEntities.extend(labeler(NDAs[i], value))
+        trainData.append((NDAs[i], {'entities': listOfEntities}))
--- a/heSaidEdgar.ipynb
+++ b/heSaidEdgar.ipynb
@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lzma\n",
+    "\n",
+    "NDAs = []\n",
+    "with lzma.open('train/in.tsv.xz') as f:\n",
+    "    for line in f:\n",
+    "        NDAs.append(line.decode('utf-8'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "from spacy import displacy\n",
+    "\n",
+    "nlp = spacy.load('NER')\n",
+    "\n",
+    "text = NDAs[9]\n",
+    "doc = nlp(text)\n",
+    "\n",
+    "effective_date = []\n",
+    "jurisdiction = []\n",
+    "party = []\n",
+    "term = []\n",
+    "\n",
+    "for word in doc.ents:\n",
+    "    if word.label_ == 'effective_date':\n",
+    "        effective_date.append(word.text)\n",
+    "    elif word.label_ == 'jurisdiction':\n",
+    "        jurisdiction.append(word.text)\n",
+    "    elif word.label_ == 'party':\n",
+    "        party.append(word.text)\n",
+    "    else:\n",
+    "        term.append(word.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "12"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "jurisdiction.count('New York')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "juris = { j : jurisdiction.count(j) for j in jurisdiction}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'New York': 12}"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "juris"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CompuDyne Corporation --> party\n",
+      "two years --> term\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "CompuDyne Corporation --> party\n",
+      "two years --> term\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "CompuDyne Corporation --> party\n",
+      "two years --> term\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "CompuDyne Corporation --> party\n",
+      "two years --> term\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n",
+      "New York --> jurisdiction\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = NDAs[9]\n",
+    "doc = nlp(text)\n",
+    "for word in doc.ents:\n",
+    "    print(word.text, '-->', word.label_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.10 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/main.ipynb
+++ b/main.ipynb
--- a/main.py
+++ b/main.py
@ -1,123 +1,19 @@
 import lzma
-import re
-
-months = {'01': 'January', '02': 'February', '03': 'March', 
-    '04': 'April', '05': 'May', '06': 'June',
-    '07': 'July', '08': 'August', '09': 'September',
-    '10': 'October', '11': 'November', '12': 'December'}
+import spacy


-def dayToWord(day):
-    day = int(day)
-    if day > 3:
-        return str(day) + 'th'
-    elif day == 3:
-        return str(day) + 'rd'
-    elif day == 2 :
-        return str(day) + 'nd'
-    else: return str(day) + 'st'
-
-def numToWord(number):
-    number = int(number)
-    d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
-        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
-        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
-        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
-        19 : 'nineteen', 20 : 'twenty',
-        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
-        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }
-    if number < 20:
-        return d[number]
-    else:
-        if number % 10 == 0: return d[number]
-        else: return d[number // 10 * 10] + '-' + d[number % 10]
-
-def labelJurisdiction(text, jurisdiction):
-    jurisdictions = []
-    for match in re.finditer(jurisdiction, text):
-        tup = (match.start(), match.end(), 'JURISDICTION')
-        jurisdictions.append(tup)
-    return jurisdictions
-
-def labelEffectiveDate(text, date):
-    dates = []
-    year, month, day = date.split('-')
-    
-    dateFormats = [month + '/' + day + '/' + year,
-        month + '/' + day + '/' + year[-2:], 
-        month[1] + '/' + day + '/' + year, 
-        month[1] + '/' + day[1] + '/' + year, 
-        month[1] + '/' + day + '/' + year[-2:], 
-        month[1] + '/' + day[1] + '/' + year[-2:],
-        dayToWord(day) + ' of ' + months[month] + ', ' + year,
-        dayToWord(day) + ' day of ' + months[month] + ', ' + year,
-        months[month] + ' ' + day + ', ' + year ]
-
-    for format in dateFormats:
-        for match in re.finditer(format, text, flags=re.IGNORECASE):
-            tup = (match.start(), match.end(), 'EFFECTIVE_DATE')
-            dates.append(tup)
-
-    return dates
-
-def labelParties(text, party):
-    parties = []
-    if 'Inc' in party:
-        regular = ''
-        for word in party.split('_'):
-            regular += word + '(.*)'
-        party = regular
-    party = party.replace('_', ' ')
-    for match in re.finditer(party, text, flags=re.IGNORECASE):
-        tup = (match.start(), match.end(), 'PARTY')
-        parties.append(tup)
-    return parties
-
-def labelTerms(text, term):
-    terms = []
-    term = term.split('_')
-    number = numToWord(term[0])
-    units = term[1]
-    for match in re.finditer(number + ' ' + units, text, flags=re.IGNORECASE):
-        tup = (match.start(), match.end(), 'TERM')
-        terms.append(tup)
-    return terms
+def readInput(dir):
+    """Read an lzma-compressed file and return its lines as UTF-8 strings.
+
+    ``dir`` is the path handed to ``lzma.open``. Each returned string is one
+    line of the file, decoded, with its line ending left in place.
+    """
+    with lzma.open(dir) as archive:
+        return [raw.decode('utf-8') for raw in archive]


 if __name__ == '__main__':
-    # Read NDAs with lzma
-    NDAs = []
-    with lzma.open('train/in.tsv.xz') as f:
-        for line in f:
-            NDAs.append(line.decode('utf-8'))
+    NDAs = readInput('train/in.tsv.xz')

-    # Read expected information
-    expected = []
-    with open('train/expected.tsv') as f:
-        for line in f:
-            expected.append(line.replace('\n', ''))
+    ner = spacy.load('NER')

-    # Expected to labeled entities
-    expectEntities = []
-    for expect in expected:
-        entities = []
-        for e in expect:
-            label, entity = e.split('=')
-            entities.append((label, entity))
-        expectEntities.append(entities)
-
-
-    # Training data for Spacy
-    trainData =[]
-    for i in range(len(expectEntities)):
-        listOfEntities = []
-        for entity in expectEntities[i]:
-            if entity[0] == 'effective_date':
-                listOfEntities.append(labelEffectiveDate(NDAs[i], entity[1]))
-            elif entity[0] == 'jurisdiction':
-                listOfEntities.append(labelJurisdiction(NDAs[i], entity[1]))
-            elif entity[0] == 'party':
-                listOfEntities.append(labelParties(NDAs[i], entity[1]))
-            else: listOfEntities.append(labelTerms(NDAs[i], entity[1]))
-        listOfEntities = [item for sublist in listOfEntities for item in sublist]
-        trainData.append((NDAs[i], {'entities': listOfEntities}))
+    for nda in NDAs:
+        print('pass')