import lzma
import re

# Zero-padded month number (as written in ISO dates) -> English month name.
months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}

# Spelled-out forms of 1-19 and of the multiples of ten up to 90.
_NUMBER_WORDS = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
    15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen',
    19: 'nineteen', 20: 'twenty', 30: 'thirty', 40: 'forty',
    50: 'fifty', 60: 'sixty', 70: 'seventy', 80: 'eighty', 90: 'ninety',
}

# Suffixes for ordinals whose last digit is 1, 2 or 3; everything else is 'th'.
_ORDINAL_SUFFIXES = {1: 'st', 2: 'nd', 3: 'rd'}


def _wordsToPattern(value, gap):
    """Build a regex matching the '_'-separated words of `value` literally.

    Every word is escaped so punctuation in the value (e.g. '.', '(') is
    matched as-is; consecutive words are joined by the regex `gap`.
    Returns an empty string when `value` contains no words.
    """
    words = [re.escape(word) for word in value.split('_') if word]
    return gap.join(words)


def _findSpans(pattern, text, label, flags=0):
    """Return a list of (start, end, label) for each match of `pattern` in `text`.

    An empty pattern yields no spans (it would otherwise match a zero-length
    string at every position).
    """
    if not pattern:
        return []
    return [
        (match.start(), match.end(), label)
        for match in re.finditer(pattern, text, flags=flags)
    ]


def dayToWord(day):
    """Return the day number followed by its English ordinal suffix.

    `day` may be an int or a numeric string (leading zeros are dropped),
    e.g. '01' -> '1st', 22 -> '22nd', 13 -> '13th'.
    """
    day = int(day)
    # 11th, 12th and 13th are irregular: they take 'th' despite ending in 1-3.
    if 10 <= day % 100 <= 20:
        suffix = 'th'
    else:
        suffix = _ORDINAL_SUFFIXES.get(day % 10, 'th')
    return str(day) + suffix


def numToWord(number):
    """Spell out an integer from 1 to 99 in English, e.g. 21 -> 'twenty-one'.

    `number` may be an int or a numeric string.
    Raises KeyError for values outside 1-99.
    """
    number = int(number)
    if not 1 <= number <= 99:
        raise KeyError(number)
    if number in _NUMBER_WORDS:
        return _NUMBER_WORDS[number]
    tens, ones = divmod(number, 10)
    return _NUMBER_WORDS[tens * 10] + '-' + _NUMBER_WORDS[ones]


def labelJurisdiction(text, jurisdiction):
    """Find every occurrence of `jurisdiction` in `text` (case-sensitive).

    The value is matched literally; each '_' in it matches a run of
    whitespace or underscores, so 'New_York' finds 'New York'.
    Returns a list of (start, end, 'JURISDICTION') tuples.
    """
    pattern = _wordsToPattern(jurisdiction, r'[\s_]+')
    return _findSpans(pattern, text, 'JURISDICTION')


def labelEffectiveDate(text, date):
    """Find textual renderings of an ISO date ('YYYY-MM-DD') in `text`.

    Numeric forms (with and without leading zeros, 4- and 2-digit year) and
    the worded forms '<ordinal> of <Month>, <year>',
    '<ordinal> day of <Month>, <year>' and '<Month> <day>, <year>' are tried,
    case-insensitively.
    Returns a list of (start, end, 'EFFECTIVE_DATE') tuples, grouped by format.
    Raises ValueError if `date` is not three '-'-separated numeric fields and
    KeyError if the month is not 01-12.
    """
    year, month, day = date.split('-')
    month = month.zfill(2)
    day = day.zfill(2)
    shortYear = year[-2:]
    # Strip leading zeros via int() rather than indexing: '10' must stay '10'.
    plainMonth = str(int(month))
    plainDay = str(int(day))
    ordinal = dayToWord(day)
    monthName = months[month]

    dateFormats = [
        month + '/' + day + '/' + year,
        month + '/' + day + '/' + shortYear,
        plainMonth + '/' + day + '/' + year,
        plainMonth + '/' + plainDay + '/' + year,
        plainMonth + '/' + day + '/' + shortYear,
        plainMonth + '/' + plainDay + '/' + shortYear,
        ordinal + ' of ' + monthName + ', ' + year,
        ordinal + ' day of ' + monthName + ', ' + year,
        monthName + ' ' + day + ', ' + year,
        monthName + ' ' + plainDay + ', ' + year,
    ]

    dates = []
    seen = set()
    for dateFormat in dateFormats:
        # Padded and unpadded forms coincide for two-digit days/months.
        if dateFormat in seen:
            continue
        seen.add(dateFormat)
        # The digit guards stop a short form from matching inside a longer
        # one (e.g. '2/23/2011' inside '02/23/2011'), which would otherwise
        # produce overlapping spans for the same date.
        pattern = r'(?<!\d)' + re.escape(dateFormat) + r'(?!\d)'
        dates.extend(
            _findSpans(pattern, text, 'EFFECTIVE_DATE', flags=re.IGNORECASE)
        )
    return dates


def labelParties(text, party):
    """Find every occurrence of `party` in `text`, ignoring case.

    `party` is an underscore-separated name matched literally. When it
    contains 'Inc', any run of punctuation/whitespace is allowed between its
    words (so 'ABC_Inc.' finds 'ABC, Inc.'); otherwise the words must be
    separated by whitespace.
    Returns a list of (start, end, 'PARTY') tuples.
    """
    # Bounded gap: the match ends at the last word of the name instead of
    # running on through the rest of the text.
    gap = r'[\W_]*' if 'Inc' in party else r'\s+'
    pattern = _wordsToPattern(party, gap)
    return _findSpans(pattern, text, 'PARTY', flags=re.IGNORECASE)


def labelTerms(text, term):
    """Find spelled-out occurrences of a '<number>_<units>' term in `text`.

    '2_years' is searched for as 'two years', ignoring case.
    Returns a list of (start, end, 'TERM') tuples.
    Raises IndexError if `term` has no '_', ValueError if the number is not
    numeric and KeyError if it is outside 1-99.
    """
    pieces = term.split('_')
    number = numToWord(pieces[0])
    units = pieces[1]
    # The leading guard keeps 'two years' from matching inside
    # 'twenty-two years'.
    pattern = (
        r'(?<![\w-])' + re.escape(number) + r'\s+' + re.escape(units)
    )
    return _findSpans(pattern, text, 'TERM', flags=re.IGNORECASE)


if __name__ == '__main__':
    # Read NDAs with lzma; every line of the archive is kept as one document.
    with lzma.open('train/in.tsv.xz') as f:
        NDAs = [line.decode('utf-8') for line in f]

    # Read expected information, one line per document.
    with open('train/expected.tsv', encoding='utf-8') as f:
        expected = [line.replace('\n', '') for line in f]

    # Expected to labeled entities: each line holds whitespace-separated
    # 'label=value' tokens.
    expectEntities = []
    for expect in expected:
        entities = []
        for e in expect.split():
            # Split on the first '=' only so values may contain '='.
            label, entity = e.split('=', 1)
            entities.append((label, entity))
        expectEntities.append(entities)

    # Training data for Spacy
    trainData = []
    for document, entities in zip(NDAs, expectEntities):
        listOfEntities = []
        for label, value in entities:
            if label == 'effective_date':
                listOfEntities.extend(labelEffectiveDate(document, value))
            elif label == 'jurisdiction':
                listOfEntities.extend(labelJurisdiction(document, value))
            elif label == 'party':
                listOfEntities.extend(labelParties(document, value))
            else:
                # Any other label is treated as a term, as before.
                listOfEntities.extend(labelTerms(document, value))
        trainData.append((document, {'entities': listOfEntities}))