import csv
import re
from datetime import datetime

# Place names that are searched for (case-sensitively, as whole words) to
# fill the `jurisdiction` field.
states = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
    "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota",
    "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
    "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
    "Wisconsin", "West Virginia", "Wyoming",
]

# Alternation over every name in `states`; re.escape keeps each name literal.
rgx = re.compile(r'\b(' + '|'.join(re.escape(name) for name in states) + r')\b')

# One or more capitalised words (each optionally followed by a comma) and
# then a company-form suffix, e.g. "Acme Widgets, Inc.".
company_rgx = r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION))"

# Candidate ISO-style dates (YYYY-MM-DD); validity is checked with strptime.
_date_rgx = re.compile(r'\d{4}-\d{2}-\d{2}')


def _extract_fields(line):
    """Return the ``key=value`` fields found in one input line, in output order."""
    fields = []

    jurisdiction = rgx.search(line)
    if jurisdiction:
        fields.append('jurisdiction=' + jurisdiction.group().replace(' ', '_'))

    # Use the first candidate that is a real calendar date; strings such as
    # "2021-13-45" match the pattern but make strptime raise ValueError.
    for candidate in _date_rgx.finditer(line):
        try:
            effective = datetime.strptime(candidate.group(), '%Y-%m-%d').date()
        except ValueError:
            continue
        fields.append('effective_date=' + effective.isoformat())
        break

    party = re.search(company_rgx, line)
    if party:
        # Fields are separated by spaces, so the value itself must not contain
        # whitespace: drop commas and join the words with underscores.
        name = re.sub(r'\s+', '_', party.group().replace(',', ''))
        fields.append('party=' + name)

    return fields


def nda(path_in, path_out):
    """Extract NDA attributes from every line of ``path_in`` into ``path_out``.

    For each input line one output line is written, holding the
    space-separated fields that were found, always in this order:
    ``jurisdiction=<state>``, ``effective_date=<YYYY-MM-DD>``,
    ``party=<company>``. Whitespace inside a value is replaced by ``_``.
    A line in which nothing is found produces an empty output line, so the
    output keeps the same number of lines as the input.

    Args:
        path_in: Path of the UTF-8 text file to read.
        path_out: Path of the file to create or overwrite (written as UTF-8).

    Raises:
        OSError: If either file cannot be opened.
    """
    # Read everything before opening the output for writing.
    with open(path_in, 'r', encoding='utf-8') as in_file:
        lines = in_file.readlines()

    with open(path_out, 'w', encoding='utf-8') as out_file:
        for line in lines:
            out_file.write(' '.join(_extract_fields(line)) + '\n')


# Files to process. The guard is true inside a Jupyter kernel and when run as
# a script, but lets the definitions above be imported without touching disk.
if __name__ == "__main__":
    nda('train/in.tsv', 'train/out.tsv')
    nda('dev-0/in.tsv', 'dev-0/out.tsv')
    nda('test-A/in.tsv', 'test-A/out.tsv')