Prześlij pliki do ''

This commit is contained in:
Kornelia Girejko 2022-05-03 22:37:04 +02:00
parent 7190bd8f38
commit 1636e22349
1 changed files with 119 additions and 0 deletions

119
run.ipynb Normal file
View File

@ -0,0 +1,119 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "52034f8c",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from datetime import datetime\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f8526769",
"metadata": {},
"outputs": [],
"source": [
"states = [\"Alaska\", \"Alabama\", \"Arkansas\", \"American Samoa\", \"Arizona\", \"California\", \"Colorado\", \"Connecticut\", \"District \", \"of Columbia\", \"Delaware\", \"Florida\", \"Georgia\", \"Guam\", \"Hawaii\", \"Iowa\", \"Idaho\", \"Illinois\", \"Indiana\", \"Kansas\", \"Kentucky\", \"Louisiana\", \"Massachusetts\", \"Maryland\", \"Maine\", \"Michigan\", \"Minnesota\", \"Missouri\", \"Mississippi\", \"Montana\", \"North Carolina\", \"North Dakota\", \"Nebraska\", \"New Hampshire\", \"New Jersey\", \"New Mexico\", \"Nevada\", \"New York\", \"Ohio\", \"Oklahoma\", \"Oregon\", \"Pennsylvania\", \"Puerto Rico\", \"Rhode Island\", \"South Carolina\", \"South Dakota\", \"Tennessee\", \"Texas\", \"Utah\", \"Virginia\", \"Virgin Islands\", \"Vermont\", \"Washington\", \"Wisconsin\", \"West Virginia\", \"Wyoming\"]"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "e2195a35",
"metadata": {},
"outputs": [],
"source": [
"rgx = re.compile(r'\\b(' + '|'.join(states) + r')\\b')\n",
"company_rgx = r\"(([A-Z][A-za-z]+,?\\s)+(Inc\\.|LLC|Ltd\\.|Company|Corporation|INC\\.|LTD\\.|COMPANY|CORPORATION))\""
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "edee6c83",
"metadata": {},
"outputs": [],
"source": [
"def nda(path_in, path_out):\n",
" with open(path_in, 'r', encoding='utf-8') as in_file:\n",
" lines = in_file.readlines()\n",
" with open(path_out, 'w') as out_file:\n",
" for line in lines:\n",
" #line = line.replace('.', ' ').replace(',', ' ').lower()\n",
" #words = line.split()\n",
" jur = rgx.search(line)\n",
" #print(jur)\n",
" if jur:\n",
" out_file.write('jurisdiction=' + jur.group().replace(' ', '_'))\n",
" \n",
" \n",
" res = re.search(r'\\d{4}-\\d{2}-\\d{2}',line)\n",
" #print(res)\n",
" if res:\n",
" date = datetime.strptime(res.group(), '%Y-%m-%d').date()\n",
" print(str(date))\n",
" out_file.write('effective_date=' + date.group().replace(' ', '_'))\n",
" \n",
" #party_results = []\n",
" #party = re.findall(company_rgx, line)\n",
" #party_score = len([w for w in party if w in party])\n",
" #if party_score > 2:\n",
" # party_results.append(party[0])\n",
" par = re.search(company_rgx,line)\n",
" #print(par)\n",
" if par:\n",
" out_file.write(' party=' + par.group().replace(','' ', '_'))\n",
" out_file.write('\\n')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "319f7898",
"metadata": {},
"outputs": [],
"source": [
"#pliki\n",
"nda('train/in.tsv', 'train/out.tsv')\n",
"nda('dev-0/in.tsv', 'dev-0/out.tsv')\n",
"nda('test-A/in.tsv', 'test-A/out.tsv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0db43228",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}