kleister-nda-clone/main.ipynb
2021-06-30 18:00:13 +02:00

8.7 KiB

import regex as re
import pandas as pd
import us
from collections import Counter
# Column names are supplied explicitly, so pandas treats the first line of
# each file as data rather than as a header.
columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']

# Read the train / dev-0 / test-A inputs with identical parsing options.
data_train, data_dev, data_test = (
    pd.read_csv(split_path, sep='\t', names=columns_names)
    for split_path in ('./train/in.tsv', './dev-0/in.tsv', './test-A/in.tsv')
)
# English month name -> zero-padded two-digit month number.
months = dict(zip(
    ('January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'),
    ('01', '02', '03', '04', '05', '06',
     '07', '08', '09', '10', '11', '12'),
))
def transform_date_format(date):
    """Turn a date match tuple into a ``YYYY-MM-DD`` string.

    ``date`` is one tuple as returned by ``get_effective_date`` (or ``None``
    when no date was found):

    * 4 items ``(full, month, day, year)`` - ``month`` is either digits or a
      month name, ``year`` has two or four digits;
    * 5 items ``(full, day_phrase, suffix, month_name, year)`` for wording
      such as "6th day of January, 2012".

    Two-digit years whose first digit is below 5 are read as 20xx, all other
    two-digit years as 19xx.  Month names are resolved through ``months``
    regardless of letter case; a name that is not in ``months`` is passed
    through unchanged.

    Returns "" for ``None`` and for any tuple shape that is not understood.
    """
    if date is None:
        return ""

    def month_number(token):
        token = str(token)
        # Digits are already a month number; only names need the lookup.
        if token.isdigit():
            return token
        return months.get(token.capitalize(), token)

    if len(date) == 4:
        month = month_number(date[1])
        day = str(date[2])
        year = str(date[3])
        if len(year) == 2:
            # Pivot on the first digit: 00-49 -> 20xx, 50-99 -> 19xx.
            century = "20" if int(year[0]) < 5 else "19"
            return f"{century}{year}-{month}-{day}"
        if len(year) == 4:
            return f"{year}-{month}-{day}"
        return ""

    if len(date) == 5:
        # date[1] looks like "16th day of "; keep the whole number (not only
        # its last digit) and pad it to two digits.
        day = re.search(r'\d+', str(date[1]))
        if day is None:
            return ""
        return f"{date[4]}-{month_number(date[3])}-{day.group().zfill(2)}"

    return ""
def get_effective_date(text):
    """Find the first date in ``text`` and return its regex groups as a tuple.

    The wordings are tried in this order and the first one that occurs wins:

    1. "04/18/01"         -> ``(full, month, day, two_digit_year)``
    2. "01/21/2016"       -> ``(full, month, day, four_digit_year)``
    3. "February 28, 2011" / "January, 13 2021"
                          -> ``(full, month_name, day, year)``
    4. "6th day of January, 2012"
                          -> ``(full, "6th day of ", "th day of ", month_name, year)``

    Returns ``None`` when no wording matches.
    """
    month_name = (
        r'[jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|[jJ]uly'
        r'|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember'
    )
    numeric_month = r'(0[1-9]|1[0-2])'
    numeric_day = r'(0[1-9]|[12][0-9]|3[01])'
    full_year = r'(19[0-9][0-9]|20[0-9][0-9])'

    patterns = (
        # The trailing lookahead stops this two-digit-year form from matching
        # the front of a four-digit-year date ("01/21/20" out of "01/21/2016").
        '(' + numeric_month + '/' + numeric_day + r'/([0-9][0-9])(?!\d))',
        '(' + numeric_month + '/' + numeric_day + '/' + full_year + ')',
        '((' + month_name + r')[,\s]+' + numeric_day + r'[,\s]+' + full_year + ')',
        r'(([0-9]{1,2}(st\sday\sof\s|nd\sday\sof\s|rd\sday\sof\s|th\sday\sof\s))('
        + month_name + r'),\s' + full_year + ')',
    )

    for pattern in patterns:
        found = re.search(pattern, text)
        if found is not None:
            # Group 1 wraps the whole date, so groups() has the same shape
            # as one element of re.findall() for these patterns.
            return found.groups()
    return None
def get_terms(text):
    """Collect the numeric fragments that directly precede "years" or "months".

    A fragment is a run of digits, whitespace, '.', ',', '(' and ')' that
    follows a whitespace character.  All fragments before "years" are
    returned if there are any; otherwise all fragments before "months".
    Returns ``None`` when neither unit yields a fragment.
    """
    for unit_pattern in (
        r'(?<=\s)[0-9.\s.,\(\)]+(?=years)',
        r'(?<=\s)[0-9.\s.,\(\)]+(?=months)',
    ):
        fragments = re.findall(unit_pattern, text)
        if fragments:
            return fragments
    return None
def get_parties(text):
    """Extract the first company name that follows the word "between".

    The text after "between" is captured as the longest run of Latin
    letters, whitespace, '.' and ','.  (``\\p{Latin}`` needs the ``regex``
    module this file imports as ``re``; the unescaped '.' at the end of the
    lookahead accepts any following non-newline character, so the lookahead
    does not shorten the run.)

    Commas are dropped, surrounding whitespace is stripped and spaces become
    underscores.  The result is then cut right after the earliest company
    suffix - "Inc.", "INC.", "inc.", "LLC" or "llc" - so only the first
    party is returned even when a later party carries another suffix.

    Returns "" when "between" is not followed by such a run or the run
    contains none of the suffixes.
    """
    found = re.search(r'(?<=between)[\p{Latin}\s.,]+(?=Inc.|INC.|LLC|llc|,|.)', text)
    if found is None:
        return ""

    candidate = found.group(0).replace(",", "").strip().replace(" ", "_")
    suffix_ends = [
        candidate.index(suffix) + len(suffix)
        for suffix in ("Inc.", "INC.", "inc.", "LLC", "llc")
        if suffix in candidate
    ]
    if not suffix_ends:
        return ""
    # The smallest end offset belongs to the suffix that closes the first party.
    return candidate[:min(suffix_ends)]
def get_jurisdiction(text):
    """Return the US state named as jurisdiction in ``text``, or "".

    Every entry of ``us.states.STATES`` is considered by its ``str()`` form.
    Two passes are made over the states:

    1. a case-sensitive search for the state name in the words following
       "laws of the";
    2. if that finds nothing, a case-insensitive check whether the state
       name occurs anywhere in ``text``.

    When the state that matched is contained in a longer state name that
    matches too (e.g. "Virginia" inside "West Virginia"), the longer name is
    returned.  Spaces in the result are replaced with underscores.
    """
    names = [str(state) for state in us.states.STATES]

    def prefer_longer(name, matches):
        # Avoid reporting "Virginia" for a text that says "West Virginia".
        for other in names:
            if other != name and name.lower() in other.lower() and matches(other):
                return other
        return name

    def in_governing_law(name):
        # The lookahead's unescaped '.' only requires that some non-newline
        # character follows the state name.
        return re.search(rf"(?<=laws\sof\sthe)[\w\s]*{name}\s*(?=,|.)", text)

    for name in names:
        if in_governing_law(name):
            return prefer_longer(name, in_governing_law).replace(" ", "_")

    # Lower-case the text once; a plain substring test gives the same answer
    # as searching for ".*name.*" without the regex backtracking cost.
    lowered = text.lower()

    def mentioned(name):
        return name.lower() in lowered

    for name in names:
        if mentioned(name):
            return prefer_longer(name, mentioned).replace(" ", "_")
    return ""
def process_parameters(params, text):
    """Build the ``key=value`` output fragment for one document.

    ``params`` is a space-separated list of requested keys.  For each of
    "effective_date", "jurisdiction" and "party" that appears in it, a
    piece of the form `` key=value`` (with a leading space) is appended, in
    the order the keys appear in ``params``.  Any other key is ignored.
    """
    # Lambdas keep each extractor from running unless its key is requested.
    extractors = {
        "effective_date": lambda: transform_date_format(get_effective_date(text)),
        "jurisdiction": lambda: get_jurisdiction(text),
        "party": lambda: get_parties(text),
    }

    pieces = []
    for key in params.split(" "):
        extract = extractors.get(key)
        if extract is None:
            continue
        pieces.append(" " + key + "=" + str(extract()))
    return "".join(pieces)
# Write one prediction line per input row for each split.  The encoding is
# pinned to UTF-8 (pandas' default when reading the inputs) so the output
# does not depend on the platform's locale encoding.
for out_path, frame in (
    ('train/out.tsv', data_train),
    ('dev-0/out.tsv', data_dev),
    ('test-A/out.tsv', data_test),
):
    with open(out_path, 'w', encoding='utf-8') as writer:
        for idx, row in frame.iterrows():
            params_result = process_parameters(row['params'], row['text1'])
            writer.write(params_result+"\n")