# kleister-nda/creatingModel.py
# 2022-05-03 20:10:12 +02:00
# 123 lines, 4.1 KiB, Python

import lzma
import re
# Lookup table from a zero-padded month number ('01'..'12') to the English
# month name, built from the names in calendar order.
months = {
    str(position).zfill(2): name
    for position, name in enumerate(
        (
            'January', 'February', 'March',
            'April', 'May', 'June',
            'July', 'August', 'September',
            'October', 'November', 'December',
        ),
        start=1,
    )
}
def dayToWord(day):
    """Return *day* as an English ordinal numeral such as ``'1st'`` or ``'22nd'``.

    Args:
        day: Day of the month as an int or a numeric string. A leading zero
            (``'03'``) is dropped by the ``int`` conversion.

    Returns:
        The decimal number followed by its ordinal suffix
        (``'st'``, ``'nd'``, ``'rd'`` or ``'th'``).

    Raises:
        ValueError: If *day* is a string that is not a valid integer.
    """
    day = int(day)
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        # The suffix depends on the last digit only, so 21 -> 'st', 22 -> 'nd',
        # 23 -> 'rd' and 31 -> 'st'; every other digit takes 'th'.
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix
def numToWord(number):
    """Spell out an integer from 1 to 99 in lowercase English words.

    Compound numbers are hyphenated, e.g. ``42`` becomes ``'forty-two'``.

    Args:
        number: An int or a numeric string.

    Returns:
        The English words for *number*.

    Raises:
        ValueError: If *number* is a string that is not a valid integer.
        KeyError: If the value is outside 1..99 (there is no entry for zero,
            negatives or hundreds in the lookup table).
    """
    value = int(number)

    below_twenty = (
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
        'sixteen', 'seventeen', 'eighteen', 'nineteen',
    )
    round_tens = (
        'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety',
    )
    # 1..19 map to their own word; 20, 30, ..., 90 map to the tens word.
    words = {n: word for n, word in enumerate(below_twenty, start=1)}
    words.update(
        {10 * n: word for n, word in enumerate(round_tens, start=2)}
    )

    if value < 20:
        return words[value]

    tens_digit, ones_digit = divmod(value, 10)
    if ones_digit == 0:
        return words[value]
    return words[tens_digit * 10] + '-' + words[ones_digit]
def labelJurisdiction(text, jurisdiction):
    """Locate every occurrence of a jurisdiction name in *text*.

    The expected value separates words with underscores (``'New_York'``), the
    same convention the party and term labels use. Each word is matched
    literally, and consecutive words may be separated in *text* by whitespace
    or underscores, so ``'New_York'`` finds both ``'New York'`` and
    ``'New_York'``. Matching is case-sensitive.

    Args:
        text: The document to search.
        jurisdiction: The expected jurisdiction, words joined by ``'_'``.

    Returns:
        A list of ``(start, end, 'JURISDICTION')`` tuples with the character
        offsets of each match, in order of appearance. Empty when
        *jurisdiction* contains no words.
    """
    # Escape each word so characters such as '.' are taken literally instead
    # of being interpreted as regular-expression syntax.
    words = [re.escape(word) for word in jurisdiction.split('_') if word]
    if not words:
        # An empty pattern would match (with zero length) at every position.
        return []
    pattern = re.compile(r'[\s_]+'.join(words))
    return [
        (match.start(), match.end(), 'JURISDICTION')
        for match in pattern.finditer(text)
    ]
def labelEffectiveDate(text, date):
    """Locate textual renderings of an ISO date in *text*.

    The date is searched for, case-insensitively, in these spellings, where
    month and day are tried both zero-padded and without the leading zero and
    the year both in full and as its last two digits:

    * ``MM/DD/YYYY`` style numeric dates,
    * ``<ordinal> of <Month>, <year>`` and ``<ordinal> day of <Month>, <year>``
      (the ordinal comes from ``dayToWord``),
    * ``<Month> <day>, <year>``.

    Every pattern is guarded so that it cannot start or end in the middle of
    a longer run of digits. Without that, ``7/13/2011`` would also be reported
    inside ``07/13/2011``, producing two overlapping spans for one date.

    Args:
        text: The document to search.
        date: The expected date as ``'YYYY-MM-DD'``.

    Returns:
        A list of ``(start, end, 'EFFECTIVE_DATE')`` tuples with the character
        offsets of each match, grouped by spelling.

    Raises:
        ValueError: If *date* does not have exactly three ``'-'``-separated
            parts, or the month or day part is not numeric.
        KeyError: If the month part is not a key of the module-level
            ``months`` table.
    """
    year, month, day = date.split('-')
    month_name = months[month]

    # Strip a leading zero numerically. Taking the second character would turn
    # month '10' into '0' and day '15' into '5', i.e. a different date.
    short_month = str(int(month))
    short_day = str(int(day))
    short_year = year[-2:]
    month_forms = [month] if short_month == month else [month, short_month]
    day_forms = [day] if short_day == day else [day, short_day]
    year_forms = [year] if short_year == year else [year, short_year]

    spellings = []
    for month_form in month_forms:
        for day_form in day_forms:
            for year_form in year_forms:
                spellings.append(month_form + '/' + day_form + '/' + year_form)
    ordinal = dayToWord(day)
    spellings.append(ordinal + ' of ' + month_name + ', ' + year)
    spellings.append(ordinal + ' day of ' + month_name + ', ' + year)
    for day_form in day_forms:
        spellings.append(month_name + ' ' + day_form + ', ' + year)

    dates = []
    for spelling in spellings:
        # The digit guards keep a short form from matching inside a longer one.
        pattern = r'(?<!\d)' + re.escape(spelling) + r'(?!\d)'
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            dates.append((match.start(), match.end(), 'EFFECTIVE_DATE'))
    return dates
def labelParties(text, party):
    """Locate every occurrence of a party name in *text*.

    The expected name separates words with underscores
    (``'Abm_Industries_Inc.'``). Each word is matched literally and
    case-insensitively. Consecutive words may be separated in *text* by any
    run of whitespace and commas, so the example also finds
    ``'ABM Industries, Inc.'``. A match must not begin or end in the middle
    of a word.

    Args:
        text: The document to search.
        party: The expected party name, words joined by ``'_'``.

    Returns:
        A list of ``(start, end, 'PARTY')`` tuples with the character offsets
        of each match, in order of appearance. Empty when *party* contains no
        words.
    """
    # Escape each word so names containing '.', '&', '(' and similar are taken
    # literally rather than parsed as regular-expression syntax.
    words = [re.escape(word) for word in party.split('_') if word]
    if not words:
        # An empty pattern would match (with zero length) at every position.
        return []
    # A bounded separator is used instead of a greedy '(.*)' between and after
    # the words, which would stretch the span to the end of the line.
    pattern = re.compile(
        r'(?<!\w)' + r'[\s,]+'.join(words) + r'(?!\w)',
        flags=re.IGNORECASE,
    )
    return [
        (match.start(), match.end(), 'PARTY')
        for match in pattern.finditer(text)
    ]
def labelTerms(text, term):
    """Locate every occurrence of a contract term (e.g. ``'two years'``) in *text*.

    *term* is given as ``'<count>_<unit>'``. The count is spelled out with
    ``numToWord`` and the phrase ``'<count in words> <unit>'`` is searched for
    case-insensitively.

    Args:
        text: The document to search.
        term: The expected term, e.g. ``'2_years'``.

    Returns:
        A list of ``(start, end, 'TERM')`` tuples with the character offsets
        of each match, in order of appearance.
    """
    pieces = term.split('_')
    phrase = numToWord(pieces[0]) + ' ' + pieces[1]
    return [
        (found.start(), found.end(), 'TERM')
        for found in re.finditer(phrase, text, flags=re.IGNORECASE)
    ]
if __name__ == '__main__':
    # Read NDAs with lzma: every line of the compressed file is one document.
    with lzma.open('train/in.tsv.xz') as f:
        NDAs = [line.decode('utf-8') for line in f]

    # Read expected information: one line per document, in the same order.
    # The encoding is given explicitly so the result does not depend on the
    # platform's default locale.
    with open('train/expected.tsv', encoding='utf-8') as f:
        expected = [line.rstrip('\n') for line in f]

    # Expected to labeled entities: each line holds whitespace-separated
    # 'label=value' fields. The line has to be split into fields first;
    # iterating over the string itself would yield single characters.
    expectEntities = []
    for expect in expected:
        entities = []
        for field in expect.split():
            if '=' not in field:
                # Not a 'label=value' pair; nothing to label.
                continue
            # Split on the first '=' only so a value may itself contain '='.
            label, entity = field.split('=', 1)
            entities.append((label, entity))
        expectEntities.append(entities)

    # Training data for Spacy: (text, {'entities': [(start, end, LABEL), ...]})
    labelers = {
        'effective_date': labelEffectiveDate,
        'jurisdiction': labelJurisdiction,
        'party': labelParties,
    }
    trainData = []
    for i, entities in enumerate(expectEntities):
        listOfEntities = []
        for label, value in entities:
            # Any label other than the three above is handled as a term.
            labeler = labelers.get(label, labelTerms)
            listOfEntities.extend(labeler(NDAs[i], value))
        trainData.append((NDAs[i], {'entities': listOfEntities}))