Prześlij pliki do ''

This commit is contained in:
Kornelia Girejko 2022-05-04 15:50:06 +02:00
parent ffcacfb1dc
commit b6ed93e25e
1 changed files with 74 additions and 0 deletions

74
run.py Normal file
View File

@ -0,0 +1,74 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import re
from datetime import datetime
import csv
# In[9]:
# Names of U.S. states and territories that may appear as a jurisdiction.
# "District of Columbia" is kept as a single entry so that the whole name is
# matched instead of the fragment "District ".
states = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
    "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota",
    "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
    "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
    "Wisconsin", "West Virginia", "Wyoming",
]
# In[84]:
# Whole-word match of any name in ``states``; names are escaped so that they
# are always treated as literal text inside the alternation.
rgx = re.compile(r'\b(' + '|'.join(map(re.escape, states)) + r')\b')
# One or more capitalised words (each optionally followed by a comma) ending
# in a company-type suffix.  The letter class is ``A-Za-z``: the range ``A-z``
# would also accept the punctuation ``[ \ ] ^ _ ``` that sits between "Z" and
# "a" in ASCII.
company_rgx = r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION))"
# In[93]:
def _extract_fields(line: str) -> list:
    """Return the ``key=value`` fields recognised in one line of text.

    Fields are produced in a fixed order (jurisdiction, effective_date,
    party); a field is omitted when its pattern does not match.  Spaces
    inside values are replaced by underscores so that the fields can be
    joined with single spaces.
    """
    fields = []

    jurisdiction = rgx.search(line)
    if jurisdiction:
        fields.append('jurisdiction=' + jurisdiction.group().replace(' ', '_'))

    date_match = re.search(r'\d{4}-\d{2}-\d{2}', line)
    if date_match:
        try:
            effective = datetime.strptime(date_match.group(), '%Y-%m-%d').date()
        except ValueError:
            # Digits shaped like a date but not a real calendar day
            # (e.g. "2020-13-45"): leave the field out instead of crashing.
            effective = None
        if effective is not None:
            fields.append('effective_date=' + effective.isoformat())

    party = re.search(company_rgx, line)
    if party:
        # ", " collapses to a single underscore (as before); any remaining
        # space is converted too, so the value never contains the separator.
        name = party.group().replace(', ', '_').replace(' ', '_')
        fields.append('party=' + name)

    return fields


def nda(path_in: str, path_out: str) -> None:
    """Extract NDA attributes from each line of ``path_in`` into ``path_out``.

    Every input line yields exactly one output line holding the recognised
    ``jurisdiction=``, ``effective_date=`` and ``party=`` fields separated by
    single spaces; the output line is empty when nothing is recognised.

    Args:
        path_in: Path of the UTF-8 text file to read.
        path_out: Path of the file to write (created or overwritten, UTF-8).

    Raises:
        OSError: If either file cannot be opened.
    """
    # The input is read completely before the output is opened, so the
    # function keeps working even if both paths name the same file.
    with open(path_in, 'r', encoding='utf-8') as in_file:
        lines = in_file.readlines()

    with open(path_out, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            ' '.join(_extract_fields(line)) + '\n' for line in lines
        )
# In[94]:
# Files: run the extraction over each dataset split (input path, output path).
for source_path, target_path in (
    ('train/in.tsv', 'train/out.tsv'),
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    nda(source_path, target_path)
# In[ ]: