kleister-nda/heSaidEdgar.ipynb
Iwona Christop b9815844a4 Ready to go
2022-05-03 21:54:24 +02:00

8.9 KiB

import lzma

# Read the xz-compressed training input and keep every line as a decoded
# UTF-8 string (presumably one NDA document per line -- confirm against the
# dataset layout).
NDAs = []
with lzma.open('train/in.tsv.xz') as archive:
    NDAs.extend(raw.decode('utf-8') for raw in archive)
import spacy
from spacy import displacy

# Load the spaCy pipeline stored under the name/path 'NER'; it is applied to
# individual documents further down as `ner(...)`.
ner = spacy.load('NER')

# Disabled draft: collect the entity texts of a single document into one list
# per label.  NOTE(review): it calls `nlp`, whereas the pipeline loaded above
# is bound to `ner`.
# text = NDAs[9]
# doc = nlp(text)

# effective_date = []
# jurisdiction = []
# party = []
# term = []

# for word in doc.ents:
#     if word.label_ == 'effective_date':
#         effective_date.append(word.text)
#     elif word.label_ == 'jurisdiction':
#         jurisdiction.append(word.text)
#     elif word.label_ == 'party':
#         party.append(word.text)
#     else:
#         term.append(word.text)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/util.py:833: UserWarning: [W095] Model 'en_pipeline' (0.0.0) was trained with spaCy v3.3 and may not be 100% compatible with the current version (3.2.4). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate
  warnings.warn(warn_msg)
# Scratch check: number of occurrences of 'New York' in `jurisdiction`.
# Assumes `jurisdiction` is still the list of collected entity texts (the
# recorded output suggests so); on the final string value this would count
# substrings instead -- TODO confirm execution order.
jurisdiction.count('New York')
12
# Tally how often each distinct entry occurs in `jurisdiction`, then display
# the resulting mapping.
juris = dict((entry, jurisdiction.count(entry)) for entry in jurisdiction)
juris
{'New York': 12}
# Print every entity found in one sample document together with its label.
text = NDAs[9]
# Use `ner`, the pipeline this notebook loads with spacy.load('NER'); the
# previous name `nlp` is never defined here and raised NameError on a fresh run.
doc = ner(text)
for entity in doc.ents:
    print(entity.text, '-->', entity.label_)
CompuDyne Corporation --> party
two years --> term
New York --> jurisdiction
New York --> jurisdiction
New York --> jurisdiction
CompuDyne Corporation --> party
two years --> term
New York --> jurisdiction
New York --> jurisdiction
New York --> jurisdiction
CompuDyne Corporation --> party
two years --> term
New York --> jurisdiction
New York --> jurisdiction
New York --> jurisdiction
CompuDyne Corporation --> party
two years --> term
New York --> jurisdiction
New York --> jurisdiction
New York --> jurisdiction
from collections import Counter

# Zero-padded month number -> English month name.
months = {'01': 'January', '02': 'February', '03': 'March', 
    '04': 'April', '05': 'May', '06': 'June',
    '07': 'July', '08': 'August', '09': 'September',
    '10': 'October', '11': 'November', '12': 'December'}

# Characters stripped from the date text before it is split into tokens.
# The repeated backslashes are redundant but harmless.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\\\\\]^_`{|}~'


def _parse_date(raw):
    """Convert a free-text date such as 'July 13, 2011' to 'YYYY-MM-DD'.

    Tokens are classified independently: an English month name from
    ``months``, a number from 1 to 31 (the day, optionally written as an
    ordinal like '13th'), or a number strictly between 1900 and 2030 (the
    year).  Any other token is ignored instead of aborting the parse.  When
    several tokens qualify for the same part, the last one wins.

    Returns:
        The ISO-style date string with a zero-padded day, or ``None`` when
        the year, month or day cannot be found.
    """
    monthNumbers = {name: number for number, name in months.items()}
    cleaned = raw.translate(str.maketrans('', '', punctuation))

    year = month = day = None
    for token in cleaned.split():
        if token in monthNumbers:
            month = monthNumbers[token]
            continue
        # Accept ordinals ('1st', '22nd', '13th') by dropping the suffix.
        if token[-2:].lower() in ('st', 'nd', 'rd', 'th') and token[:-2].isdigit():
            token = token[:-2]
        if not (token.isascii() and token.isdigit()):
            continue  # words such as 'day' or 'of' carry no date information
        value = int(token)
        if 1 <= value <= 31:
            day = f'{value:02d}'
        elif 1900 < value < 2030:
            year = str(value)

    if year is None or month is None or day is None:
        return None
    return f'{year}-{month}-{day}'


document = ner(NDAs[4])

# Texts of all entities labelled as an effective date in this document.
candidates = [entity.text for entity in document.ents
              if entity.label_ == 'effective_date']

# Normalise the most frequent candidate (first seen wins a tie); the result is
# None when the document has no such entity or the text cannot be parsed.
effectiveDate = None
if candidates:
    counts = Counter(candidates)
    effectiveDate = _parse_date(max(counts, key=counts.get))

# effectiveDate = '2011-07-13'
from collections import Counter

# Jurisdiction names that a recognised entity may resolve to.
states = ['Alabama', 'New York']

document = ner(NDAs[6])

# Resolve every 'jurisdiction' entity to zero or more known state names.
matches = []
for word in document.ents:
    if word.label_ != 'jurisdiction':
        continue
    if word.text in states:
        # Exact match: record the entity's own text.  (This used to append
        # the unrelated variable `text`, i.e. a whole document.)
        matches.append(word.text)
    else:
        # Partial match: the entity text is a fragment of a state name.
        matches.extend(state for state in states if word.text in state)

# Keep the most frequent state (first seen wins a tie) with spaces replaced by
# underscores; None when no entity could be resolved.
if matches:
    counts = Counter(matches)
    jurisdiction = max(counts, key=counts.get).replace(' ', '_')
else:
    jurisdiction = None

# jurisdiction = 'New_York'
jurisdiction
'New_York'
document = ner(NDAs[9])

# Distinct texts of all 'party' entities, in order of first appearance
# (dict keys preserve insertion order and drop repeats).
partyNames = dict.fromkeys(
    entity.text for entity in document.ents if entity.label_ == 'party'
)

# Replace every space in each name with an underscore.
party = ['_'.join(name.split(' ')) for name in partyNames]
# party = ['CompuDyne_Corporation']
from collections import Counter

# Numeric value -> spelled-out English word (despite the name, the keys are
# the numbers).
wordToNumber = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
        19 : 'nineteen', 20 : 'twenty',
        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }

document = ner(NDAs[7])

# Reverse lookup (word -> number), built once instead of searching lists.
numberByWord = {name: value for value, name in wordToNumber.items()}

# Texts of all entities labelled as a term in this document.
candidates = [entity.text for entity in document.ents if entity.label_ == 'term']

# Take the most frequent candidate (first seen wins a tie), turn a leading
# number word into digits and join the tokens with underscores.  A first token
# that is not a known number word (e.g. already '2') is kept as it is, so the
# result is always a string; it is None when no usable entity was found.
term = None
if candidates:
    counts = Counter(candidates)
    tokens = max(counts, key=counts.get).split()
    if tokens:
        leading = tokens[0].lower()  # also accept 'Two' / 'THREE'
        if leading in numberByWord:
            tokens[0] = str(numberByWord[leading])
        term = '_'.join(tokens)

# term = '3_years'
term
'3_years'
# Scratch check: look up the numeric key of `wordToNumber` whose spelled-out
# word equals term[0].
numberWords = list(wordToNumber.values())
list(wordToNumber)[numberWords.index(term[0])]
3