kleister-nda/.ipynb_checkpoints/run-checkpoint.ipynb
2022-05-04 00:06:07 +02:00

3.5 KiB

import re
# Names of the 50 U.S. states, in alphabetical order.
states = [
    'Alabama',
    'Alaska',
    'Arizona',
    'Arkansas',
    'California',
    'Colorado',
    'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho',
    'Illinois',
    'Indiana',
    'Iowa',
    'Kansas',
    'Kentucky',
    'Louisiana',
    'Maine',
    'Maryland',
    'Massachusetts',
    'Michigan',
    'Minnesota',
    'Mississippi',
    'Missouri',
    'Montana',
    'Nebraska',
    'Nevada',
    'New Hampshire',
    'New Jersey',
    'New Mexico',
    'New York',
    'North Carolina',
    'North Dakota',
    'Ohio',
    'Oklahoma',
    'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina',
    'South Dakota',
    'Tennessee',
    'Texas',
    'Utah',
    'Vermont',
    'Virginia',
    'Washington',
    'West Virginia',
    'Wisconsin',
    'Wyoming',
]
def counter(text_in, query):
    """Count the non-overlapping, case-insensitive matches of ``query``.

    Args:
        text_in: Text to search.
        query: Regular-expression source. It is compiled as a pattern, so
            regex metacharacters keep their special meaning.

    Returns:
        The number of matches ``findall`` reports for ``query`` in
        ``text_in``.
    """
    # The flag has to be given to ``re.compile``: the second positional
    # argument of ``Pattern.findall`` is the start offset ``pos``, so passing
    # ``re.IGNORECASE`` there skips the first characters of the text and
    # leaves the match case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))
def state_prediction(text_in):
    """Return the state whose name occurs most often in ``text_in``.

    Every name in ``states`` is counted with ``counter`` as a whole word, so
    "Kansas" is not found inside "Arkansas".  Occurrences that belong to a
    longer state name containing the same words ("Virginia" inside
    "West Virginia") are subtracted from the shorter name's count.

    Args:
        text_in: Text to scan.

    Returns:
        The best-scoring state name with spaces replaced by underscores.
        Ties go to the state listed first in ``states``; that is also the
        result when no state name is found at all.
    """
    def whole_word(name):
        # ``counter`` compiles its query as a regex, so escape the name and
        # anchor it on word boundaries.
        return r'\b' + re.escape(name) + r'\b'

    hits = {state: counter(text_in, whole_word(state)) for state in states}

    state_dict = {}
    for state in states:
        padded = ' ' + state + ' '
        # Matches already accounted for by a longer name containing this one
        # as whole words.
        shadowed = sum(
            hits[other]
            for other in states
            if other != state and padded in ' ' + other + ' '
        )
        state_dict[state.replace(" ", "_")] = hits[state] - shadowed
    return max(state_dict, key=state_dict.get)
def jurisdiction(path_in, path_out):
    """Write one ``jurisdiction=<state>`` line per line of ``path_in``.

    Each input line has its '.' and ',' characters replaced by spaces and is
    then passed to ``state_prediction``; the result is written to
    ``path_out`` prefixed with ``jurisdiction=``.

    Args:
        path_in: Path of the UTF-8 text file to read, one document per line.
        path_out: Path of the file to write; it is created or overwritten.
    """
    # Maps '.' and ',' to a space in a single pass over each line.
    punctuation_to_space = str.maketrans('.,', '  ')

    # ``readlines()`` yields a list, so the clean-up must be applied to each
    # line rather than to the list.  Case is left untouched: the state names
    # are capitalised, so lowercasing here would hide them from a
    # case-sensitive match.
    with open(path_in, 'r', encoding='utf8') as file:
        lines = [line.translate(punctuation_to_space) for line in file]

    # The output is opened only after the input was read completely; the
    # ``with`` block closes it, no explicit ``close()`` is needed.
    with open(path_out, 'wt', encoding='utf8') as file_out:
        for line in lines:
            file_out.write("jurisdiction=" + str(state_prediction(line)) + '\n')
# Run the prediction for the dev-0, train and test-A inputs, in that order.
for split_in, split_out in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    jurisdiction(split_in, split_out)
!jupyter nbconvert --to script run.ipynb
[NbConvertApp] Converting notebook run.ipynb to script
[NbConvertApp] Writing 1697 bytes to run.py