120 lines
4.1 KiB
Plaintext
120 lines
4.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "52034f8c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"from datetime import datetime\n",
|
|
"import csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "f8526769",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Names of US states, the District of Columbia and US territories.\n",
"# They are joined into one alternation regex used to detect the jurisdiction,\n",
"# so every multi-word name must be a single list entry.\n",
"states = [\n",
"    \"Alaska\", \"Alabama\", \"Arkansas\", \"American Samoa\", \"Arizona\",\n",
"    \"California\", \"Colorado\", \"Connecticut\", \"District of Columbia\", \"Delaware\",\n",
"    \"Florida\", \"Georgia\", \"Guam\", \"Hawaii\", \"Iowa\",\n",
"    \"Idaho\", \"Illinois\", \"Indiana\", \"Kansas\", \"Kentucky\",\n",
"    \"Louisiana\", \"Massachusetts\", \"Maryland\", \"Maine\", \"Michigan\",\n",
"    \"Minnesota\", \"Missouri\", \"Mississippi\", \"Montana\", \"North Carolina\",\n",
"    \"North Dakota\", \"Nebraska\", \"New Hampshire\", \"New Jersey\", \"New Mexico\",\n",
"    \"Nevada\", \"New York\", \"Ohio\", \"Oklahoma\", \"Oregon\",\n",
"    \"Pennsylvania\", \"Puerto Rico\", \"Rhode Island\", \"South Carolina\", \"South Dakota\",\n",
"    \"Tennessee\", \"Texas\", \"Utah\", \"Virginia\", \"Virgin Islands\",\n",
"    \"Vermont\", \"Washington\", \"Wisconsin\", \"West Virginia\", \"Wyoming\",\n",
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 84,
|
|
"id": "e2195a35",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Matches any known state/territory name as a whole word.\n",
"# re.escape keeps the pattern valid even if a name ever contains a regex metacharacter.\n",
"rgx = re.compile(r'\\b(' + '|'.join(map(re.escape, states)) + r')\\b')\n",
"# One or more capitalised words (optionally followed by a comma) and a legal-form suffix.\n",
"# The character class is [A-Za-z]; the range A-z would also match [ \\\\ ] ^ _ and the backtick.\n",
"company_rgx = r\"(([A-Z][A-Za-z]+,?\\s)+(Inc\\.|LLC|Ltd\\.|Company|Corporation|INC\\.|LTD\\.|COMPANY|CORPORATION))\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 93,
|
|
"id": "edee6c83",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def _extract_fields(line):\n",
"    \"\"\"Return the list of 'key=value' tokens found in a single input line.\n",
"\n",
"    Searches for the first state name (module-level ``rgx``), the first\n",
"    YYYY-MM-DD date and the first company name (module-level ``company_rgx``).\n",
"    Spaces inside values are replaced with underscores so that every token\n",
"    stays a single space-separated field.\n",
"    \"\"\"\n",
"    fields = []\n",
"\n",
"    jur = rgx.search(line)\n",
"    if jur:\n",
"        fields.append('jurisdiction=' + jur.group().replace(' ', '_'))\n",
"\n",
"    res = re.search(r'\\d{4}-\\d{2}-\\d{2}', line)\n",
"    if res:\n",
"        try:\n",
"            date = datetime.strptime(res.group(), '%Y-%m-%d').date()\n",
"        except ValueError:\n",
"            # Digits shaped like a date that are not a real calendar date (e.g. month 13).\n",
"            date = None\n",
"        if date is not None:\n",
"            # A date object has no .group(); isoformat() yields YYYY-MM-DD.\n",
"            fields.append('effective_date=' + date.isoformat())\n",
"\n",
"    par = re.search(company_rgx, line)\n",
"    if par:\n",
"        # Collapse ', ' first, then any remaining spaces, so the value has no whitespace.\n",
"        party = par.group().replace(', ', '_').replace(' ', '_')\n",
"        fields.append('party=' + party)\n",
"\n",
"    return fields\n",
"\n",
"\n",
"def nda(path_in, path_out):\n",
"    \"\"\"Extract jurisdiction, effective date and party from each line of a file.\n",
"\n",
"    Args:\n",
"        path_in: path of the UTF-8 text file to read, one document per line.\n",
"        path_out: path of the file to write; it receives one line per input\n",
"            line holding the space-separated 'key=value' tokens that were\n",
"            found (an empty line when nothing matched).\n",
"    \"\"\"\n",
"    # Read everything before opening the output file.\n",
"    with open(path_in, 'r', encoding='utf-8') as in_file:\n",
"        lines = in_file.readlines()\n",
"    # Write with the same encoding as the input so non-ASCII names survive on any platform.\n",
"    with open(path_out, 'w', encoding='utf-8') as out_file:\n",
"        for line in lines:\n",
"            out_file.write(' '.join(_extract_fields(line)) + '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 94,
|
|
"id": "319f7898",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Input/output file pairs, one per dataset split.\n",
"dataset_paths = [\n",
"    ('train/in.tsv', 'train/out.tsv'),\n",
"    ('dev-0/in.tsv', 'dev-0/out.tsv'),\n",
"    ('test-A/in.tsv', 'test-A/out.tsv'),\n",
"]\n",
"for src_path, dst_path in dataset_paths:\n",
"    nda(src_path, dst_path)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0db43228",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|