# Python code of the notebook run.ipynb, laid out as a plain script.
# It scans each input line for a US state name, an ISO date and a company
# name, and writes the findings as space-separated key=value fields.
import re
from datetime import datetime
import csv

# US states and territories recognised as a jurisdiction.
states = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
    "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota",
    "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
    "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
    "Wisconsin", "West Virginia", "Wyoming",
]

# Whole-word match of any state name; names are escaped so they are always
# treated as literal text.
rgx = re.compile(r'\b(' + '|'.join(re.escape(name) for name in states) + r')\b')

# One or more capitalised words (each optionally followed by a comma) and a
# company suffix. The word class is [A-Za-z]: the range "A-z" would also
# accept the punctuation characters that sit between 'Z' and 'a'.
company_rgx = r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION))"

# Compiled once here so the per-line loop does not repeat the pattern lookup.
_date_rgx = re.compile(r'\d{4}-\d{2}-\d{2}')
_party_rgx = re.compile(company_rgx)


def _extract_fields(line):
    """Return the ``key=value`` fields found in *line*, in output order."""
    fields = []

    jur = rgx.search(line)
    if jur:
        fields.append('jurisdiction=' + jur.group().replace(' ', '_'))

    res = _date_rgx.search(line)
    if res:
        try:
            date = datetime.strptime(res.group(), '%Y-%m-%d').date()
        except ValueError:
            # Looks like a date but is not a real one (e.g. month 13): skip
            # the field instead of aborting the whole file.
            date = None
        if date is not None:
            fields.append('effective_date=' + date.isoformat())

    par = _party_rgx.search(line)
    if par:
        # Fields are separated by spaces, so no whitespace may survive inside
        # the value; a comma in front of the whitespace is dropped with it.
        fields.append('party=' + re.sub(r',?\s', '_', par.group()))

    return fields


def nda(path_in, path_out):
    """Extract jurisdiction, effective date and party from every input line.

    Args:
        path_in: Path of the UTF-8 text file to read, one document per line.
        path_out: Path of the file to write. It receives exactly one line per
            input line holding the space-separated ``key=value`` fields that
            were found; the line is empty when nothing was found.

    Raises:
        OSError: If either file cannot be opened.
    """
    # The input is consumed completely before the output is opened, so the
    # output is only created or truncated after reading has succeeded.
    with open(path_in, 'r', encoding='utf-8') as in_file:
        rows = [' '.join(_extract_fields(line)) for line in in_file]

    with open(path_out, 'w', encoding='utf-8') as out_file:
        out_file.writelines(row + '\n' for row in rows)


if __name__ == "__main__":
    # files
    nda('train/in.tsv', 'train/out.tsv')
    nda('dev-0/in.tsv', 'dev-0/out.tsv')
    nda('test-A/in.tsv', 'test-A/out.tsv')