kleister-nda-clone/kleister-nda.ipynb
2021-04-27 21:26:23 +02:00

14 KiB

Extract key information from Edgar NDA documents

import pathlib
from collections import Counter
from sklearn.metrics import *
KLEISTER_PATH = pathlib.Path('C:/Users/Fijka/Documents/kleister-nda-clone')
file_name = 'train'

Read expected train data

def get_expected_data(filepath, data_key):
    dataset_expected_key = []
    with open(filepath,'r') as train_expected_file:
        for line in train_expected_file:
            key_values = line.rstrip('\n').split(' ')
            data_value = None
            for key_value in key_values:
                key, value = key_value.split('=')
                if key == data_key:
                    data_value = value
            if data_value is None:
                data_value = 'NONE'
            dataset_expected_key.append(data_value)
    return dataset_expected_key
KEYS = ['effective_date', 'jurisdiction', 'party', 'term']
def read_expected_data(filepath):
    data = []
    for key in KEYS:
        data.append(get_expected_data(filepath, key))
    return data

if file_name != 'test-A':
    train_expected_data = read_expected_data(KLEISTER_PATH/file_name/'expected.tsv')
if file_name != 'test-A':
    [i[:1] for i in train_expected_data]

Read train dataset

import lzma
import csv

def read_data(filename):
    all_data = lzma.open(filename).read().decode('UTF-8').split('\n')
    return [line.split('\t') for line in all_data][:-1]

train_data = read_data(KLEISTER_PATH/file_name/'in.tsv.xz')

JURISDICTION

STATES = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware','Florida',
          'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
          'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
          'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
          'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
          'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
import spacy
nlp = spacy.load("en_core_web_sm")
from operator import itemgetter

jurisdiction = []

def normalize(text):
    return text.replace('\\\\n', ' ').lower()
    # nlp(text) -> tokenizacja

def check_jurisdiction(document):
    states = {}
    for text in document[2:]:
        text = normalize(text)
        for state in STATES:
            if state.lower() in text:
                if state in states:
                    states[state][0] += text.count(state.lower())
                else:
                    states[state] = [text.count(state.lower()), text.index(state.lower())]
    if states != {}:
        states = sorted(states.items(), key=itemgetter(1), reverse=True)
        jurisdiction.append(states[0][0].replace(' ', '_'))
        return states[0][0], states
    else:
        jurisdiction.append(None)
        return None
    
tmp = 0
for i in range(len(train_data)):
    tt = check_jurisdiction(train_data[i])
    if file_name != 'test-A':
        if tt == None:
            if train_expected_data[1][i] != None:
    #             print(i, train_expected_data[1][i], tt)
                tmp += 1
        else:
            if tt[0] != train_expected_data[1][i].replace('_', ' '):
    #             print(i, train_expected_data[1][i], tt[0])
                tmp += 1
print('false jurisdiction:', tmp)
false jurisdiction: 22

EFFECTIVE DATE

import re
import datetime
from datetime import date

effective_date = []

def parse_date(date):
    month = str(date.month)
    if len(month) == 1:
        month = '0' + str(date.month)
    day = str(date.day)
    if len(day) == 1:
        day = '0' + str(date.day)
    return str(date.year) + '-' + month + '-' + day

def find_dates(text):
    
    MONTHS = {'January' : 1, 'February' : 2, 'March' : 3, 'April' : 4, 'May' : 5, 'June' : 6,
              'July' : 7, 'August' : 8, 'September' : 9, 'October' : 10, 'November' : 11, 'December' : 12}
    
    all_dates = []
    
    text = text.replace('\\\\n', ' ')
    
    dic = {'\d{1,2}\/\d{1,2}\/\d{2}' : '%m/%d/%y',
           '[01]*[0-9]\/[01]*[0-9]\/\d{4}' : '%m/%d/%Y',
           '\w{3,9}?\s\d{1,2}?,\s\d{4}?' : '%B %d, %Y',
           '\w{3,9}?\s\d{1,2}?,\d{4}?' : '%B %d,%Y',
           '\d{1,2}?th\sday\sof\s\w{3,9}?\s\d{4}?' : '%dth day of %B %Y',
           '\d{1,2}?th\sday\sof\s\w{3,9}?,\s\d{4}?' : '%dth day of %B, %Y',
           '\d{1,2}?ND\sday\sof\s\w{3,9}?\s\d{4}?' : '%dND day of %B %Y',
           '\w{3,9}?\s\d{1,2}?th\s,\s\d{4}?' : '%B %dth , %Y',
           '\w{3,9}?\s\d{1,2}?th,\s\d{4}?' : '%B %dth, %Y',
           '\d{1,2}?\sday\sof\s\w{3,9}?,\s\d{4}?' : '%d day of %B, %Y',
           '\w{3,9}?\.\s\d{1,2}?,\s\d{4}?' : '%b. %d, %Y',
           '\d{1,2}?\s\w{3,9}?,\s\d{4}?' : '%d %B, %Y',
           '\d{1,2}?st\sday\sof\s\w{3,9}?\s,\s\d{4}?' : '%dst day of %B , %Y',
           '\d{1,2}?st\sday\sof\s\w{3,9}?,\s\d{4}?' : '%dst day of %B, %Y',
           '\d{1,2}?nd\sday\sof\s\w{3,9}?,\s\d{4}?' : '%dnd day of %B, %Y',
           '\d{1,2}\.\d{1,2}\.\d{2,4}' : '%m.%d.%y'
          }
    
    for d in dic:
        match = re.search(r'' + d, text)
        if match != None:
            try:
                date = datetime.datetime.strptime(match.group(), dic[d]).date()
                all_dates.append(parse_date(date))
            except:
                pass
                
    return all_dates

def check_effective_date(text):
    dates = []
    x = find_dates(text)
    if x != []:
        dates.append(x)
    return(dates)

test = 0
for i in range(len(train_data)):
    xx = check_effective_date(train_data[i][2])
    if file_name != 'test-A':
        if train_expected_data[0][i] == 'NONE':
            if xx != []:
    #             print(i, train_expected_data[0][i], xx[-1][0])
                test += 1
        else:
            if xx != []:
                if xx[0][-1] != train_expected_data[0][i]:
    #                 print(i, train_expected_data[0][i], xx[-1][0])
                    test +=1
            else:
    #             print(i, train_expected_data[0][i], xx)
                test += 1
    if xx != []:
        effective_date.append(xx[-1][0])
    else:
        effective_date.append(None)
print('false effective date', test)
false effective date 42

PARTY

party = []

def check_party(document):
    dic = {'And_' : 4,
           'From_' : 5,
           'For' : 4,
           'Between' : 8,
           'With' : 5,
           'Ceo' : 4,
           'To' : 3,
          }
    
    for text in document[2:]:
        text = text.replace('\\\\n', ' ')
        
        result = None
        match = re.search(r'\w*\s\w*\s\w*,\sInc\.', text)
        if match == None:
            match = re.search(r'\w*\s\w*\s\w*,\sINC\.', text)
        if match != None:
            result = match.group().title()
            result = result.replace(',', '').replace(' ', '_')
            for d in dic:
                if d in result:
                    result = result[result.index(d) + dic[d]:]
            if result.startswith('_'):
                result = result[1:]
    return result
    
tmp = 0
for i in range(len(train_data)):
    tt = check_party(train_data[i])
    party.append(tt)
    if file_name != 'test-A':
        if train_expected_data[2][i] != tt:
            tmp += 1
    #         print(i, train_expected_data[2][i], tt)
print('false party:', tmp)
false party: 202

TERM

term = []

def check_term(document):
    
    result = None
    for text in document[2:]:
        text = text.replace('\\\\n', ' ')
    
        
        match = re.search(r'\(\d*\)\syears', text)
        if match == None:
            match = re.search(r'\(\d*\)\smonths', text)
        if match != None:
            result = match.group().replace('(', '').replace(')', '').replace(' ', '_')
            return result
    return result
    
tmp = 0
for i in range(len(train_data)):
    tt = check_term(train_data[i])
    term.append(tt)
    if file_name != 'test-A':
        if train_expected_data[3][i] != tt:
            if train_expected_data[3][i] == 'NONE' and tt == None:
                pass
            else:
    #             print(i, train_expected_data[3][i], tt)
                tmp += 1
print('false term:', tmp)
false term: 144
import os

def write_output(effective_date, jurisdiction, party, term):
    if os.path.exists(KLEISTER_PATH/file_name/'out.tsv'):
        os.remove(KLEISTER_PATH/file_name/'out.tsv')
    file = open(KLEISTER_PATH/file_name/'out.tsv', 'w')
    for doc in range(len(effective_date)):
        result = ''
        if effective_date[doc] != None:
            result += 'effective_date=' + effective_date[doc] + '\t'
        if jurisdiction[doc] != None:
            result += 'jurisdiction=' + jurisdiction[doc] + '\t'
        if party[doc] != None:
            result += 'party=' + party[doc] + '\t'
        if term[doc] != None:
            result += 'term=' + term[doc] + '\t'
        if len(result) > 1:
            result = result[:-1]
        result += '\n'
        file.write(result)
    file.close()
    
write_output(effective_date, jurisdiction, party, term)