# kleister-nda/run.py

import re

# Names of the 50 U.S. states, in alphabetical order. find_state() matches
# these against the document text and breaks ties in favour of the earliest
# entry, so the order of this list is significant.
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida',
          'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
          'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
          'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
          'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
          'West Virginia', 'Wisconsin', 'Wyoming']

# Lower-case English month name -> two-digit month number (as a string),
# in calendar order.
months = {
    name: f'{number:02d}'
    for number, name in enumerate(
        ('january', 'february', 'march', 'april', 'may', 'june', 'july',
         'august', 'september', 'october', 'november', 'december'),
        start=1,
    )
}

def _build_numwords():
    """Return the default word -> (scale, increment) table used by text2int."""
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    # The two leading empty strings only keep index * 10 aligned with the word.
    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]

    table = {"and": (1, 0)}  # "and" is a filler word: it leaves the value unchanged
    for idx, word in enumerate(units):
        table[word] = (1, idx)
    for idx, word in enumerate(tens):
        if word:  # do not register the empty placeholders as a number word
            table[word] = (1, idx * 10)
    for idx, word in enumerate(scales):
        # idx 0 -> 10**2 ("hundred"); idx k >= 1 -> 10**(3k)
        table[word] = (10 ** (idx * 3 or 2), 0)
    return table


# Built once at import time; replaces the mutable default argument that the
# function previously (ab)used as a cache.
_NUMWORDS = _build_numwords()


def text2int(textnum, numwords=None):
    """Convert a spelled-out English number to an int.

    Words may be separated by whitespace or hyphens and are matched
    case-insensitively, e.g. ``"Twenty-One"`` -> 21 and
    ``"one hundred and five"`` -> 105.

    Args:
        textnum: The number written in words.
        numwords: Optional custom ``word -> (scale, increment)`` table. When
            omitted or empty, the built-in English table is used.

    Returns:
        The integer value of ``textnum`` (0 for an empty string).

    Raises:
        ValueError: If a word is not present in the table.
    """
    if not numwords:
        numwords = _NUMWORDS

    current = result = 0
    for token in textnum.replace('-', ' ').split():
        # Prefer the token exactly as written, then fall back to lower case.
        word = token if token in numwords else token.lower()
        if word not in numwords:
            raise ValueError("Illegal word: " + token)

        scale, increment = numwords[word]
        current = current * scale + increment
        if scale > 100:
            # A scale above "hundred" closes the current group
            # (e.g. "... thousand"), so bank it and start a new one.
            result += current
            current = 0

    return result + current

def find_state(line):
    """Return the U.S. state mentioned most often in ``line``.

    State names are matched case-insensitively and only as whole words, so
    "Arkansas" is not counted as "Kansas". Longer names are tried first, so
    "West Virginia" is counted once as itself and not additionally as
    "Virginia".

    Args:
        line: Text of one document.

    Returns:
        The winning state name with spaces replaced by underscores. Ties go
        to the state that appears first in ``states``. When no state is
        mentioned at all, the fallback guess ``'Alabama'`` is returned.
    """
    # Longest names first: regex alternation takes the first alternative that
    # matches, and multi-word names must win over the names they contain.
    longest_first = sorted(states, key=len, reverse=True)
    alternatives = '|'.join(
        r'\s+'.join(re.escape(part) for part in name.split())
        for name in longest_first
    )
    pattern = r'\b(?:' + alternatives + r')\b'
    canonical = {name.lower(): name for name in states}

    counts = {}
    for match in re.finditer(pattern, line, re.IGNORECASE):
        # Collapse internal whitespace so "New  York" maps back to "New York".
        key = ' '.join(match.group().lower().split())
        name = canonical[key]
        counts[name] = counts.get(name, 0) + 1

    if not counts:
        return 'Alabama'

    # max() keeps the first maximal element, i.e. ties follow ``states`` order.
    best = max(states, key=lambda name: counts.get(name, 0))
    return best.replace(' ', '_')

def position_and_date(regex, text):
    """Return the full text of every non-overlapping match of ``regex``.

    Despite the name, only the matched strings are returned (no positions),
    in the order in which they occur in ``text``.
    """
    hits = []
    for found in re.finditer(regex, text):
        hits.append(found.group())
    return hits

# Date shapes recognised below (the day may have one or two digits):
#   February 10, 2017
#   6th day of January, 2012
#   02/10/17 or 02/10/2017
_MONTH_NAME = (r'(?:January|February|March|April|May|June|July|August'
               r'|September|October|November|December)')
_DAY_NUMBER = r'(?:0?[1-9]|[12][0-9]|3[01])'
_YEAR_NUMBER = r'[0-2][0-9]{3}'
_DATE_PATTERNS = [
    # "February 10, 2017" - only real month names, not any capitalised word.
    re.compile(r'\b' + _MONTH_NAME + r' ' + _DAY_NUMBER + r', ' + _YEAR_NUMBER + r'\b',
               re.IGNORECASE),
    # "6th day of January, 2012"
    re.compile(r'(?<![0-9])' + _DAY_NUMBER + r'(?:th|rd|nd|st) day of '
               + _MONTH_NAME + r', ' + _YEAR_NUMBER + r'\b',
               re.IGNORECASE),
    # "02/10/17" or "02/10/2017". The look-arounds stop the pattern from
    # biting a piece out of a longer digit/slash run, e.g. reading
    # "01/02/1999" as "01/02/19".
    re.compile(r'(?<![0-9/])' + _DAY_NUMBER + r'/' + _DAY_NUMBER
               + r'/(?:[0-9]{4}|[0-9]{2})(?![0-9/])'),
]


def find_dates(text):
    """Return every date-like substring of ``text``.

    Results are grouped by pattern (all "Month D, YYYY" hits first, then
    "Dth day of Month, YYYY", then slash dates), and within a pattern they
    follow the order of appearance in ``text``.

    Args:
        text: Text to scan.

    Returns:
        A list of matched strings; empty when nothing matches.
    """
    found = []
    for pattern in _DATE_PATTERNS:
        found.extend(match.group() for match in pattern.finditer(text))
    return found


def clean_date(date):
    """Normalise the first entry of ``date`` to ``YYYY-MM-DD``.

    Args:
        date: Non-empty list of date strings in one of the shapes produced by
            ``find_dates``. Only ``date[0]`` is used.

    Returns:
        The date as a ``'YYYY-MM-DD'`` string with zero-padded month and day.

    Raises:
        IndexError: If ``date`` is empty or the string contains no numbers.
        ValueError: If a slash date does not have exactly three fields.
    """
    raw = date[0]

    if '/' in raw:
        first, second, year = raw.split('/')
        if len(year) == 2:
            year = '20' + year  # two-digit years are read as 20xx
        # Fields are read day-first, as this function always has done, unless
        # that would give an impossible month.
        # NOTE(review): U.S. documents usually write month/day/year; confirm
        # against the corpus whether ambiguous dates should be month-first.
        day, month = first, second
        if int(month) > 12 >= int(day):
            day, month = month, day
        return f'{year}-{month.zfill(2)}-{day.zfill(2)}'

    # Textual shapes: split into runs of letters and runs of digits so that
    # "6th" becomes "6", "th" and punctuation disappears.
    tokens = re.findall(r'[A-Za-z]+|[0-9]+', raw)
    numbers = [tok for tok in tokens if tok.isdigit()]
    words = [tok.lower() for tok in tokens if not tok.isdigit()]

    day, year = numbers[0], numbers[-1]
    # Whole-word lookup; filler words ("th", "day", "of") are simply skipped.
    # If no month name is present the first word is passed through unchanged
    # instead of raising, which keeps the function total for odd input.
    month = next((months[w] for w in words if w in months),
                 words[0] if words else '')
    return f'{year}-{month}-{day.zfill(2)}'

def find_term(line):
    """Find "<number> years" / "<number> months" expressions in ``line``.

    The number may be written as digits ("36"), digits in parentheses
    ("(3)"), or one of the words "three" / "nine". A number is only matched
    from its first character, so "36 months" is never read as "6 months".

    Args:
        line: Text to scan.

    Returns:
        A list of ``(number, multi_digit, unit)`` tuples in order of
        appearance. ``number`` is the text as written (parentheses included),
        ``multi_digit`` repeats ``number`` when it is a bare number of two or
        more digits and is ``''`` otherwise, and ``unit`` is ``'years'`` or
        ``'months'``. The three-field shape is what ``clean_term`` expects.
    """
    regex = (
        # Do not start in the middle of a number or word, nor on the second
        # half of a hyphenated number word ("thirty-three").
        r'(?<![0-9A-Za-z-])'
        r'([0-9]|([0-9]{2,})|three|nine|\([0-9]+\)) (years|months)\b'
    )
    return re.findall(regex, line)

def clean_term(t):
    """Format the first hit from ``find_term`` as ``'<number>_<unit>'``.

    Args:
        t: List of ``(number, inner_group, unit)`` tuples as returned by
            ``find_term``. Only ``t[0]`` is used.

    Returns:
        ``[]`` when ``t`` is empty; otherwise a string such as ``'3_years'``.
        Parentheses around the number are removed, and a number spelled in
        words is converted to digits with ``text2int``.
    """
    if t == []:
        return []

    first_hit = t[0]
    amount = re.sub(r'\(|\)', '', first_hit[0])
    # Index 1 is the regex's inner capture group and carries nothing new.
    remainder = list(first_hit[2:])

    if re.search(r'\d', amount) is None:
        # No digit at all, so the amount is spelled out ("three").
        amount = str(text2int(amount))

    return amount + '_' + remainder[0]

def find_part(l):
    """Extract party (company) names ending in "Inc" or "Holding" from ``l``.

    A candidate is one to three capitalised words followed by "inc" or
    "holding" (any case). Each candidate is then normalised word by word:
    the stand-alone words "and", "of" and "us" are dropped, commas are
    removed, "inc" / "ltd" receive a trailing dot, every word is title-cased
    and the words are joined with underscores.

    Filtering works on whole words, so names that merely contain one of those
    letter sequences are left intact ("Holland Inc" -> "Holland_Inc.",
    "Nexus Inc" -> "Nexus_Inc.", "Lincoln Holding" -> "Lincoln_Holding").

    Args:
        l: Text of one document.

    Returns:
        A list of normalised party names in order of appearance; duplicates
        are not removed.
    """
    regex = r'\b[A-Z]\w+(?:[ -]+?[A-Z]\w+?){0,2}[,\s]+(?i:inc|holding)\b'
    # Capitalised words the pattern tends to pick up in front of the real
    # name, e.g. a sentence-initial "And".
    noise_words = {'and', 'of', 'us'}
    dotted_suffixes = {'inc': 'inc.', 'ltd': 'ltd.'}

    parties = []
    for candidate in re.findall(regex, l):
        kept = []
        # Splitting on commas and whitespace also discards the commas.
        for word in re.split(r'[,\s]+', candidate):
            key = word.lower()
            if not word or key in noise_words:
                continue
            kept.append(dotted_suffixes.get(key, word))
        parties.append(' '.join(kept).title().replace(' ', '_'))
    return parties

def _predict_line(line):
    """Build the prediction for one input line (one document).

    Returns the space-separated ``key=value`` fields, without a trailing
    newline, in the order: effective_date, jurisdiction, party..., term.
    """
    fields = []

    # A date is only looked for when the line mentions 'effective_date'.
    if 'effective_date' in line:
        dates = find_dates(line)
        if dates != []:
            fields.append(f'effective_date={clean_date(dates)}')

    fields.append(f'jurisdiction={find_state(line)}')

    # dict.fromkeys removes duplicate parties while keeping first-seen order.
    for party in dict.fromkeys(find_part(line)):
        fields.append(f'party={party}')

    term = clean_term(find_term(line))
    if term != []:  # clean_term returns [] when no term was found
        fields.append(f'term={term}')

    return ' '.join(fields)


def _write_predictions(split):
    """Read ``<split>/in.tsv`` and write one prediction line per input line
    to ``<split>/out.tsv``."""
    with open(split + '/in.tsv', 'r', encoding='utf8') as source:
        documents = source.readlines()

    # Exactly one output line per input line: every field, parties included,
    # goes in front of the newline so the rows stay aligned with in.tsv.
    with open(split + '/out.tsv', 'wt', encoding='utf8') as target:
        target.writelines(_predict_line(document) + '\n' for document in documents)


for _split in ('dev-0', 'test-A', 'train'):
    _write_predictions(_split)
s444356 2022-04-28 23:12:04 +02:00			`import re`

			`states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida',`
			`'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',`
			`'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montan', 'Nebraska', 'Nevada', 'New Hampshire',`
			`'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',`
			`'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',`
			`'West Virginia', 'Wisconsin', 'Wyoming']`

			`months = {'january': '01', 'february': '02', 'march': '03', 'april': '04', 'may': '05', 'june': '06', 'july': '07', 'august': '08', 'september': '09',`
			`'october': '10', 'november': '11', 'december': '12'}`

			`def text2int(textnum, numwords={}):`
			`if not numwords:`
			`units = [`
			`"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",`
			`"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",`
			`"sixteen", "seventeen", "eighteen", "nineteen",`
			`]`

			`tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]`

			`scales = ["hundred", "thousand", "million", "billion", "trillion"]`

			`numwords["and"] = (1, 0)`
			`for idx, word in enumerate(units): numwords[word] = (1, idx)`
			`for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)`
			`for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)`

			`current = result = 0`
			`for word in textnum.split():`
			`if word not in numwords:`
			`raise Exception("Illegal word: " + word)`

			`scale, increment = numwords[word]`
			`current = current * scale + increment`
			`if scale > 100:`
			`result += current`
			`current = 0`

			`return result + current`

			`def find_state(line):`
			`ctr = {}`
			`for state in states:`
			`if state.lower() in line.lower():`
			`ctr[state] = len(re.findall(state, line))`
			`if ctr != {}:`
			`state = max(ctr, key=ctr.get)`
			`state = state.replace(' ', '_')`
			`return state`
			`else:`
			`return 'Alabama'`

			`def position_and_date(regex, text):`
			`dates = [date.group() for date in re.finditer(regex, text)]`
			`return dates`

			`# February 10, 2017`
			`# 6th day of January, 2012`
			`def find_dates(text):`
			`res = []`
			`dates_regex = [r'[A-Z][A-Za-z]+ ([0-2][0-9]\|3[0-1]), [0-2][0-9]{3}',`
			`r'([0-2][0-9]\|3[0-1])(th\|rd\|nd\|st) day of [A-Za-z]+, [0-2][0-9]{3}',`
			`r'([0-2][0-9]\|3[0-1])/([0-2][0-9]\|3[0-1])/[0-9][0-9]']`
			`for i in range(0, len(dates_regex)):`
			`d = position_and_date(dates_regex[i], text)`
			`if d != []:`
			`res += d`
			`return res`

			`def clean_date(date):`
			`if ',' in date[0]:`
			`date = date[0].replace(',', '')`
			`if 'day of' in date:`
			`date = re.sub(r'(th\|rd\|nd\|st) day of', '', date)`
			`if '/' in date[0]:`
			`date = date[0].split('/')`
			`date[2] = '20' + date[2]`
			`else:`
			`date = date.split()`
			`for m in months:`
			`if m in date[0].lower():`
			`s = date[0].lower()`
			`date[0] = date[1]`
			`date[1] = s`
			`if m in date[1].lower():`
			`date[1] = date[1].lower().replace(m, months[m])`
			`d = date[0]`
			`m = date[1]`
			`y = date[2]`
			`correct_date = str(f'{y}-{m}-{d}')`
			`return correct_date`

			`def find_term(line):`
			`regex = r'([0-9]\|([0-2][0-9]\|3[0-1])\|three\|nine\|\([0-9]\)) (years\|months)'`
			`s = re.findall(regex, line)`
			`return s`

			`def clean_term(t):`

			`term = []`
			`if t != []:`
			`s = t[0][0]`
			`s = re.sub(r'\(\|\)', '', s)`
			`term = list(t[0])`
			`term[0] = s`
			`term.pop(1)`
			`if not bool(re.search(r'\d', term[0])):`
			`term[0] = str(text2int(term[0]))`
			`term = term[0] + '_' + term[1]`
			`return term`

s444356 2022-04-29 01:41:25 +02:00			`def find_part(l):`
			`# regex = r'[A-Z][a-z]+\.*'`
			`regex = r'\b[A-Z]\w+(?:[ -]+?[A-Z]\w+?){0,2}[,\s]+(?i:inc\|holding)\b'`
			`s = re.findall(regex, l)`
			`for i in range(0,len(s)):`
			`if 'and' in s[i].lower():`
			`s[i] = s[i].lower().replace('and ', '')`
			`if 'of' in s[i].lower():`
			`s[i] = s[i].lower().replace('of ', '')`
			`if 'us' in s[i].lower():`
			`s[i] = s[i].lower().replace('us ', '')`
			`if ',' in s[i]:`
			`s[i] = s[i].replace(',', '')`
			`if 'inc' in s[i].lower():`
			`s[i] = s[i].lower().replace('inc', 'inc.')`
			`if 'Ltd' in s[i]:`
			`s[i] = s[i].replace('Ltd', 'Ltd.')`
			`s[i] = s[i].title().replace(' ', '_')`
			`return s`

s444356 2022-04-28 23:12:04 +02:00			`with open('dev-0/in.tsv', 'r', encoding='utf8') as f:`
			`dev0 = f.readlines()`

			`with open('dev-0/out.tsv', 'wt') as f:`
			`for l in dev0:`
			`res = ""`
s444356 2022-04-29 01:41:25 +02:00
s444356 2022-04-28 23:12:04 +02:00			`if 'effective_date' in l:`
			`date = find_dates(l)`
			`if date != []:`
			`date = clean_date(date)`
			`res = ('effective_date=' + str(date) + ' ')`

s444356 2022-04-29 01:41:25 +02:00			`p = find_part(l)`
			`p = list(dict.fromkeys(p))`
s444356 2022-04-28 23:12:04 +02:00			`t = find_term(l)`
			`t = clean_term(t)`
			`if t != []:`
			`res += ('jurisdiction=' + str(find_state(l)) + ' ')`
s444356 2022-04-29 01:41:25 +02:00			`for i in range(0, len(p)):`
			`res += ('party=' + str(p[i]) + ' ')`
s444356 2022-04-28 23:12:04 +02:00			`res += ('term=' + str(t) + '\n')`
			`else:`
			`res += ('jurisdiction=' + str(find_state(l)) + '\n')`
s444356 2022-04-29 01:41:25 +02:00			`for i in range(0, len(p)):`
			`res += ('party=' + str(p[i]) + ' ')`
s444356 2022-04-28 23:12:04 +02:00
			`f.write(res)`

			`f.close()`


			`with open('test-A/in.tsv', 'r', encoding='utf8') as f:`
			`test_A = f.readlines()`

			`with open('test-A/out.tsv', 'wt') as f:`
			`for l in test_A:`
			`res = ""`
			`if 'effective_date' in l:`
			`date = find_dates(l)`
			`if date != []:`
			`date = clean_date(date)`
			`res = ('effective_date=' + str(date) + ' ')`
s444356 2022-04-29 01:41:25 +02:00			`p = find_part(l)`
			`p = list(dict.fromkeys(p))`
s444356 2022-04-28 23:12:04 +02:00			`t = find_term(l)`
			`t = clean_term(t)`
			`if t != []:`
			`res += ('jurisdiction=' + str(find_state(l)) + ' ')`
s444356 2022-04-29 01:41:25 +02:00			`for i in range(0, len(p)):`
			`res += ('party=' + str(p[i]) + ' ')`
s444356 2022-04-28 23:12:04 +02:00			`res += ('term=' + str(t) + '\n')`
			`else:`
			`res += ('jurisdiction=' + str(find_state(l)) + '\n')`
			`f.write(res)`

			`f.close()`


			`with open('train/in.tsv', 'r', encoding='utf8') as f:`
			`train = f.readlines()`

			`with open('train/out.tsv', 'wt') as f:`
			`for l in train:`
			`res = ""`
			`if 'effective_date' in l:`
			`date = find_dates(l)`
			`if date != []:`
			`date = clean_date(date)`
			`res = ('effective_date=' + str(date) + ' ')`
s444356 2022-04-29 01:41:25 +02:00			`p = find_part(l)`
			`p = list(dict.fromkeys(p))`
s444356 2022-04-28 23:12:04 +02:00			`t = find_term(l)`
			`t = clean_term(t)`
			`if t != []:`
			`res += ('jurisdiction=' + str(find_state(l)) + ' ')`
s444356 2022-04-29 01:41:25 +02:00			`for i in range(0, len(p)):`
			`res += ('party=' + str(p[i]) + ' ')`
s444356 2022-04-28 23:12:04 +02:00			`res += ('term=' + str(t) + '\n')`
			`else:`
			`res += ('jurisdiction=' + str(find_state(l)) + '\n')`
			`f.write(res)`

			`f.close()`