kleister-nda/.ipynb_checkpoints/run-checkpoint.ipynb
2022-05-04 01:30:25 +02:00

3.4 KiB

import re
# Names of all 50 U.S. states, in alphabetical order. Each entry is used both
# as the search pattern and (with spaces replaced by underscores) as the key in
# the per-document count dictionary.
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
        'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
        'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
        'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
def counter(text_in, query):
    """Count case-insensitive, non-overlapping matches of ``query`` in ``text_in``.

    Args:
        text_in: Text to search.
        query: Regular-expression pattern to look for.

    Returns:
        The number of matches found (0 when there are none).
    """
    # The flag has to be supplied at compile time: the second positional
    # argument of a compiled pattern's findall() is the start position, so
    # passing re.IGNORECASE there skipped the first characters of the text and
    # left the search case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))
def state_prediction(text_in):
    """Build a mapping from state name to its occurrence count in ``text_in``.

    Keys are the entries of ``states`` with spaces replaced by underscores;
    values are whatever ``counter`` returns for that state name.
    """
    return {name.replace(" ", "_"): counter(text_in, name) for name in states}
def jurisdiction(path_in, path_out):
    """Write one ``jurisdiction=...`` line to ``path_out`` per line of ``path_in``.

    Args:
        path_in: Path of the UTF-8 input file; each line is scored separately.
        path_out: Path of the output file, overwritten if it exists.

    Each output line is ``jurisdiction=`` followed by the string form of the
    dictionary returned by ``state_prediction`` for the matching input line.
    """
    # Read everything before opening the output so the input is fully consumed
    # (and closed) before any writing starts.
    with open(path_in, 'r', encoding='utf8') as file:
        lines = file.readlines()
    # Use the same explicit encoding as the input instead of the locale
    # default; the with-block closes the file, so no extra close() is needed.
    with open(path_out, 'wt', encoding='utf8') as file_out:
        for line in lines:
            file_out.write(f"jurisdiction={state_prediction(line)}\n")
# Generate predictions for every dataset split, in the same order as before.
for _split in ('dev-0', 'train', 'test-A'):
    jurisdiction(f'{_split}/in.tsv', f'{_split}/out.tsv')
!jupyter nbconvert --to script run.ipynb
[NbConvertApp] Converting notebook run.ipynb to script
[NbConvertApp] Writing 2419 bytes to run.py