kleister-nda/run.ipynb

3.4 KiB

import re
# Names of all 50 US states, alphabetical. Iowa, Montana, Oregon and Utah were
# missing from the earlier list, so they could never be predicted.
# Order matters: state_prediction() takes max() over a dict filled in this
# order, so ties (including "no state found") go to the earliest entry.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
]
def counter(text_in: str, query: str) -> int:
    """Count the non-overlapping, case-insensitive matches of ``query`` in ``text_in``.

    Args:
        text_in: Text to search.
        query: Regular-expression source. It is compiled as given, so a
            caller wanting a literal match must escape metacharacters, and
            matches are not restricted to whole words.

    Returns:
        The number of matches reported by ``findall``.
    """
    # The flag has to go to compile(): the second positional argument of a
    # compiled pattern's findall() is the start offset ``pos``. Passing
    # re.IGNORECASE (integer value 2) there made the search skip the first
    # two characters and remain case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))
def state_prediction(text_in):
    """Return the state named most often in ``text_in``, spaces as underscores.

    An occurrence that is only the tail of a longer state name (the
    "Virginia" inside "West Virginia") is not credited to the shorter state.
    Without that guard both names received the same count and the tie went
    to "Virginia", which is listed first in ``states``.

    Args:
        text_in: Text to scan for state names.

    Returns:
        The winning state name with spaces replaced by underscores. Ties go
        to the state that appears first in ``states``; when no state occurs
        at all, every count is zero and the first entry of ``states`` wins.
    """
    state_dict = {}
    for state in states:
        # One fixed-width negative lookbehind per longer state name that ends
        # with this one. The suffix test ignores case so that "Kansas" is
        # also guarded against "Arkansas".
        guards = "".join(
            "(?<!" + re.escape(other[: -len(state)]) + ")"
            for other in states
            if other != state and other.lower().endswith(state.lower())
        )
        query = guards + re.escape(state)
        state_dict[state.replace(" ", "_")] = counter(text_in, query)
    return max(state_dict, key=state_dict.get)
def jurisdiction(path_in, path_out):
    """Write one ``jurisdiction=<state>`` line to ``path_out`` per line of ``path_in``.

    Args:
        path_in: Path of the UTF-8 text file to read. Each line is passed to
            ``state_prediction`` on its own.
        path_out: Path of the file to create or overwrite with the results.
    """
    # All input is consumed before the output is opened, so path_out is only
    # truncated once every prediction has been computed.
    with open(path_in, 'r', encoding='utf8') as file:
        predictions = [state_prediction(line) for line in file]
    # Explicit encoding keeps the output independent of the platform default;
    # the ``with`` block closes the file, so no separate close() is needed.
    with open(path_out, 'wt', encoding='utf8') as file_out:
        file_out.writelines(
            "jurisdiction=" + str(prediction) + '\n' for prediction in predictions
        )
# Produce the prediction file for each dataset split, in the same order as before.
for source_path, target_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    jurisdiction(source_path, target_path)
!jupyter nbconvert --to script run.ipynb
[NbConvertApp] Converting notebook run.ipynb to script
[NbConvertApp] Writing 1605 bytes to run.py