Ready to go

2022-05-03 21:54:24 +02:00 · 2022-05-03 21:54:24 +02:00 · b9815844a4
commit b9815844a4
parent c2748dc657
3 changed files with 337 additions and 23 deletions
--- a/heSaidEdgar.ipynb
+++ b/heSaidEdgar.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -16,32 +16,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/util.py:833: UserWarning: [W095] Model 'en_pipeline' (0.0.0) was trained with spaCy v3.3 and may not be 100% compatible with the current version (3.2.4). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
      "  warnings.warn(warn_msg)\n"
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "from spacy import displacy\n",
    "\n",
-    "nlp = spacy.load('NER')\n",
+    "ner = spacy.load('NER')\n",
    "\n",
-    "text = NDAs[9]\n",
+    "# text = NDAs[9]\n",
-    "doc = nlp(text)\n",
+    "# doc = nlp(text)\n",
    "\n",
-    "effective_date = []\n",
+    "# effective_date = []\n",
-    "jurisdiction = []\n",
+    "# jurisdiction = []\n",
-    "party = []\n",
+    "# party = []\n",
-    "term = []\n",
+    "# term = []\n",
    "\n",
-    "for word in doc.ents:\n",
+    "# for word in doc.ents:\n",
-    "    if word.label_ == 'effective_date':\n",
+    "#     if word.label_ == 'effective_date':\n",
-    "        effective_date.append(word.text)\n",
+    "#         effective_date.append(word.text)\n",
-    "    elif word.label_ == 'jurisdiction':\n",
+    "#     elif word.label_ == 'jurisdiction':\n",
-    "        jurisdiction.append(word.text)\n",
+    "#         jurisdiction.append(word.text)\n",
-    "    elif word.label_ == 'party':\n",
+    "#     elif word.label_ == 'party':\n",
-    "        party.append(word.text)\n",
+    "#         party.append(word.text)\n",
-    "    else:\n",
+    "#     else:\n",
-    "        term.append(word.text)"
+    "#         term.append(word.text)"
   ]
  },
  {
@ -132,6 +141,189 @@
    "    print(word.text, '-->', word.label_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "months = {'01': 'January', '02': 'February', '03': 'March', \n",
    "    '04': 'April', '05': 'May', '06': 'June',\n",
    "    '07': 'July', '08': 'August', '09': 'September',\n",
    "    '10': 'October', '11': 'November', '12': 'December'}\n",
    "\n",
    "punctuation = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\\\\\]^_`{|}~'\n",
    "\n",
    "document = ner(NDAs[4])\n",
    "\n",
    "effectiveDate = []\n",
    "\n",
    "for word in document.ents:\n",
    "    if word.label_ == 'effective_date':\n",
    "        effectiveDate.append(word.text)\n",
    "\n",
    "try:\n",
    "    effectiveDate = { date : effectiveDate.count(date) for date in effectiveDate }\n",
    "    effectiveDate = max(effectiveDate, key=effectiveDate.get)\n",
    "    for char in punctuation: effectiveDate = effectiveDate.replace(char, '')\n",
    "    # Get month\n",
    "    for d in effectiveDate.split():\n",
    "        if d in list(months.values()):\n",
    "            month = list(months.keys())[list(months.values()).index(d)]\n",
    "        elif int(d) < 32:\n",
    "            day = d\n",
    "        elif int(d) > 1900 and int(d) < 2030:\n",
    "            year = d\n",
    "    effectiveDate = year + '-' + month + '-' + day\n",
    "except:\n",
    "    pass\n",
    "\n",
    "# effectiveDate = '2011-07-13'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "states = ['Alabama', 'New York']\n",
    "\n",
    "document = ner(NDAs[6])\n",
    "\n",
    "jurisdiction = []\n",
    "\n",
    "for word in document.ents:\n",
    "    if word.label_ == 'jurisdiction':\n",
    "        if word.text not in states:\n",
    "            for state in states:\n",
    "                if word.text in state:\n",
    "                    jurisdiction.append(state)\n",
    "        else:\n",
    "            jurisdiction.append(text)\n",
    "\n",
    "try:\n",
    "    jurisdiction = { state : jurisdiction.count(state) for state in jurisdiction }\n",
    "    jurisdiction = max(jurisdiction, key=jurisdiction.get).replace(' ', '_')\n",
    "except:\n",
    "    pass\n",
    "\n",
    "# jurisdiction = 'New_York'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'New_York'"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "jurisdiction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "document = ner(NDAs[9])\n",
    "\n",
    "party = []\n",
    "\n",
    "for word in document.ents:\n",
    "    if word.label_ == 'party':\n",
    "        party.append(word.text)\n",
    "\n",
    "party = list(dict.fromkeys(party))\n",
    "party = [ p.replace(' ', '_') for p in party]\n",
    "# party = ['CompuDyne_Corporation']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "wordToNumber = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',\n",
    "        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',\n",
    "        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',\n",
    "        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',\n",
    "        19 : 'nineteen', 20 : 'twenty',\n",
    "        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',\n",
    "        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }\n",
    "\n",
    "document = ner(NDAs[7])\n",
    "\n",
    "term = []\n",
    "\n",
    "for word in document.ents:\n",
    "    if word.label_ == 'term':\n",
    "        term.append(word.text)\n",
    "\n",
    "try:\n",
    "    term = { time : term.count(time) for time in term }\n",
    "    term = max(term, key=term.get)\n",
    "    term = term.split()\n",
    "    term[0] = str(list(wordToNumber.keys())[list(wordToNumber.values()).index(term[0])])\n",
    "    term = '_'.join(term)\n",
    "except:\n",
    "    pass\n",
    "\n",
    "# term = '3_years'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3_years'"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "term"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(wordToNumber.keys())[list(wordToNumber.values()).index(term[0])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -159,7 +351,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
--- a/main.ipynb
+++ b/main.ipynb
@ -1852,7 +1852,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.2"
  },
  "orig_nbformat": 4
 },
--- a/main.py
+++ b/main.py
@ -1,5 +1,34 @@
 import lzma
 from matplotlib.pyplot import getp
 import spacy
 import csv
 months = {'01': 'January', '02': 'February', '03': 'March', 
    '04': 'April', '05': 'May', '06': 'June',
    '07': 'July', '08': 'August', '09': 'September',
    '10': 'October', '11': 'November', '12': 'December'}
 punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
 states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
 wordToNumber = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
        19 : 'nineteen', 20 : 'twenty',
        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }
 def readInput(dir):
@ -9,11 +38,104 @@ def readInput(dir):
            NDAs.append(line.decode('utf-8'))
    return NDAs
 def getEffectiveDate(document):
    effectiveDate = []
    for word in document.ents:
        if word.label_ == 'effective_date':
            effectiveDate.append(word.text)
    #if len(effectiveDate) > 0:
    try:
        effectiveDate = { date : effectiveDate.count(date) for date in effectiveDate }
        effectiveDate = max(effectiveDate, key=effectiveDate.get)
        for char in punctuation: effectiveDate = effectiveDate.replace(char, '')
        for d in effectiveDate.split():
            if d in list(months.values()):
                month = list(months.keys())[list(months.values()).index(d)]
            elif int(d) < 32:
                day = d
            elif int(d) > 1900 and int(d) < 2030:
                year = d
        effectiveDate = year + '-' + month + '-' + day
    except:
        effectiveDate = ''
    return effectiveDate # effectiveDate = '2011-07-13'
 def getJurisdiction(document):
    jurisdiction = []
    for word in document.ents:
        if word.label_ == 'jurisdiction':
            if word.text not in states:
                for state in states:
                    if word.text in state:
                        jurisdiction.append(state)
            else:
                jurisdiction.append(word.text)
    if len(jurisdiction) > 0:
        jurisdiction = { state : jurisdiction.count(state) for state in jurisdiction }
        jurisdiction = max(jurisdiction, key=jurisdiction.get).replace(' ', '_')
    else:
        jurisdiction = ''
    return jurisdiction # jurisdiction = 'New_York'
 def getParties(document):
    party = []
    for word in document.ents:
        if word.label_ == 'party':
            party.append(word.text)
    party = list(dict.fromkeys(party))
    party = [ p.replace(' ', '_') for p in party]
    return party # party = ['CompuDyne_Corporation']
 def getTerm(document):
    term = []
    for word in document.ents:
        if word.label_ == 'term':
            term.append(word.text)
    if len(term) > 0:
        term = { time : term.count(time) for time in term }
        term = max(term, key=term.get)
        term = term.split()
        term[0] = str(list(wordToNumber.keys())[list(wordToNumber.values()).index(term[0])])
        term = '_'.join(term)
    else: term = ''
    return term # term = '3_years'
 if __name__ == '__main__':
    NDAs = readInput('train/in.tsv.xz')
    ner = spacy.load('NER')
-    for nda in NDAs:
+    predicted = [''] * len(NDAs)
-        print('pass')
+
    document = ner(NDAs[9])
    for i in range(len(NDAs)):
        document = ner(NDAs[i])
        ed = getEffectiveDate(document)
        j = getJurisdiction(document)
        p = getParties(document)
        t = getTerm(document)
        if len(ed) > 0: predicted[i] += 'effective_date=' + ed + ' '
        if len(j) > 0: predicted[i] += 'jurisdiction=' + j + ' '
        if len(p) > 0:
            for party in p: predicted[i] += 'party=' + party + ' '
        if len(t) > 0: predicted[i] += 'term=' + t
    with open('train/out.tsv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(predicted)