Usuń 'run.py'

This commit is contained in:
Kornelia Girejko 2022-05-03 20:58:10 +02:00
parent 72a63348bd
commit e5dd8a1bd8

97
run.py
View File

@@ -1,97 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 43,
"id": "405da850",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "616f3992",
"metadata": {},
"outputs": [],
"source": [
"states = [\"Alaska\", \"Alabama\", \"Arkansas\", \"American Samoa\", \"Arizona\", \"California\", \"Colorado\", \"Connecticut\", \"District \", \"of Columbia\", \"Delaware\", \"Florida\", \"Georgia\", \"Guam\", \"Hawaii\", \"Iowa\", \"Idaho\", \"Illinois\", \"Indiana\", \"Kansas\", \"Kentucky\", \"Louisiana\", \"Massachusetts\", \"Maryland\", \"Maine\", \"Michigan\", \"Minnesota\", \"Missouri\", \"Mississippi\", \"Montana\", \"North Carolina\", \"North Dakota\", \"Nebraska\", \"New Hampshire\", \"New Jersey\", \"New Mexico\", \"Nevada\", \"New York\", \"Ohio\", \"Oklahoma\", \"Oregon\", \"Pennsylvania\", \"Puerto Rico\", \"Rhode Island\", \"South Carolina\", \"South Dakota\", \"Tennessee\", \"Texas\", \"Utah\", \"Virginia\", \"Virgin Islands\", \"Vermont\", \"Washington\", \"Wisconsin\", \"West Virginia\", \"Wyoming\"]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "dfa9b7c2",
"metadata": {},
"outputs": [],
"source": [
"rgx = re.compile(r'\\b(' + '|'.join(states) + r')\\b')"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "05c88c78",
"metadata": {},
"outputs": [],
"source": [
"def nda(path_in, path_out):\n",
" #path_in = lzma.open(path_in).read().decode()\n",
" results = []\n",
" with open(path_in, 'r', encoding='utf-8') as file:\n",
" #removeAccents(path_in)\n",
" for line in file.readlines():\n",
" line = line.replace('.', ' ').replace(',', ' ').lower()\n",
" words = line.split()\n",
" jur = rgx.search(line)\n",
" if jur:\n",
" results.append('jurisdiction=' + jur.group().replace(' ', '_'))\n",
" #else:\n",
" # results.append('\\n')\n",
" date = re.findall(r'(\\d+-\\d+-\\d+)',line)\n",
" if date:\n",
" results.append('effective_date=' + jur.group().replace(' ', '_'))\n",
" results.append('\\n')\n",
" with open(path_out, 'w') as file:\n",
" for r in results:\n",
" file.write(r + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71adc3b1",
"metadata": {},
"outputs": [],
"source": [
"#pliki\n",
"nda('dev-0/in.tsv', 'dev-0/out.tsv')\n",
"nda('train/in.tsv', 'train/out.tsv')\n",
"nda('test-A/in.tsv', 'test-A/out.tsv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}