kleister-nda/run.py

52 lines
1.6 KiB
Python
Raw Normal View History

2022-04-22 22:51:25 +02:00
import re
# Names of the 50 U.S. states, in alphabetical order.
# The order is significant: predict_state() picks the highest-scoring entry
# and, on a tie, the one that appears first in this list.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
def count_strings(input, str_):
    """Count the non-overlapping, case-insensitive matches of a pattern.

    Args:
        input: Text to search. (The name shadows the builtin; it is kept so
            existing keyword callers continue to work.)
        str_: Regular-expression source to look for. It is compiled as-is,
            so regex metacharacters keep their special meaning.

    Returns:
        The number of non-overlapping matches of ``str_`` in ``input``.
    """
    # Flags must be given to re.compile(). The compiled pattern's findall()
    # signature is (string, pos, endpos), so passing re.IGNORECASE there was
    # interpreted as pos=2: the first two characters were skipped and the
    # match remained case-sensitive.
    pattern = re.compile(str_, re.IGNORECASE)
    return len(pattern.findall(input))
def predict_state(text):
    """Predict the state of jurisdiction mentioned most often in ``text``.

    Args:
        text: The document text to scan for state names.

    Returns:
        The name of the best-scoring state with spaces replaced by
        underscores (e.g. ``'New_York'``). On a tie -- including the case
        where no state is mentioned at all -- the state that comes first in
        ``states`` is returned.
    """
    # Count whole-word mentions only, so a state name embedded in a longer
    # word is not counted as a mention of that state.
    mentions = {}
    for state in states:
        mentions[state] = count_strings(text, r'\b' + state + r'\b')

    scores = {}
    for state in states:
        # A mention of a longer name that contains this one as whole words
        # ("West Virginia" contains "Virginia") was counted for both names;
        # credit it to the longer name only.
        padded = ' ' + state + ' '
        shadowed = sum(
            mentions[other]
            for other in states
            if other != state and padded in ' ' + other + ' '
        )
        scores[state.replace(" ", "_")] = mentions[state] - shadowed

    # max() returns the first maximal key in insertion order, i.e. the order
    # of ``states``.
    return max(scores, key=scores.get)
def _write_predictions(split_dir):
    """Write a jurisdiction prediction for every line of one data split.

    Reads ``<split_dir>/in.tsv`` (UTF-8) line by line and writes one
    ``jurisdiction=<state>`` line per input line to ``<split_dir>/out.tsv``,
    overwriting any existing output file.

    Args:
        split_dir: Directory of the split, relative to the working directory.
    """
    # The input is opened first, so a missing in.tsv fails before out.tsv is
    # created or truncated. Both files are closed by the with-statements; no
    # explicit close() is needed.
    with open(split_dir + '/in.tsv', 'r', encoding='utf8') as src:
        with open(split_dir + '/out.tsv', 'wt') as dst:
            for line in src:
                dst.write("jurisdiction="+str(predict_state(line))+'\n')


# Generate predictions for every split, in the same order as before.
for split_dir in ('dev-0', 'train', 'test-A'):
    _write_predictions(split_dir)