From e5dd8a1bd8ef156ebccc8bfc1a73c6998b9ab2b3 Mon Sep 17 00:00:00 2001 From: Kornelia Girejko Date: Tue, 3 May 2022 20:58:10 +0200 Subject: [PATCH] =?UTF-8?q?Usu=C5=84=20'run.py'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 97 ---------------------------------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 run.py diff --git a/run.py b/run.py deleted file mode 100644 index e6fc11b..0000000 --- a/run.py +++ /dev/null @@ -1,97 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 43, - "id": "405da850", - "metadata": {}, - "outputs": [], - "source": [ - "import re" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "616f3992", - "metadata": {}, - "outputs": [], - "source": [ - "states = [\"Alaska\", \"Alabama\", \"Arkansas\", \"American Samoa\", \"Arizona\", \"California\", \"Colorado\", \"Connecticut\", \"District \", \"of Columbia\", \"Delaware\", \"Florida\", \"Georgia\", \"Guam\", \"Hawaii\", \"Iowa\", \"Idaho\", \"Illinois\", \"Indiana\", \"Kansas\", \"Kentucky\", \"Louisiana\", \"Massachusetts\", \"Maryland\", \"Maine\", \"Michigan\", \"Minnesota\", \"Missouri\", \"Mississippi\", \"Montana\", \"North Carolina\", \"North Dakota\", \"Nebraska\", \"New Hampshire\", \"New Jersey\", \"New Mexico\", \"Nevada\", \"New York\", \"Ohio\", \"Oklahoma\", \"Oregon\", \"Pennsylvania\", \"Puerto Rico\", \"Rhode Island\", \"South Carolina\", \"South Dakota\", \"Tennessee\", \"Texas\", \"Utah\", \"Virginia\", \"Virgin Islands\", \"Vermont\", \"Washington\", \"Wisconsin\", \"West Virginia\", \"Wyoming\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "dfa9b7c2", - "metadata": {}, - "outputs": [], - "source": [ - "rgx = re.compile(r'\\b(' + '|'.join(states) + r')\\b')" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "05c88c78", - "metadata": {}, - "outputs": [], - "source": [ - "def nda(path_in, path_out):\n", - " #path_in = lzma.open(path_in).read().decode()\n", - " results = []\n", - " with open(path_in, 'r', encoding='utf-8') as file:\n", - " #removeAccents(path_in)\n", - " for line in file.readlines():\n", - " line = line.replace('.', ' ').replace(',', ' ').lower()\n", - " words = line.split()\n", - " jur = rgx.search(line)\n", - " if jur:\n", - " results.append('jurisdiction=' + jur.group().replace(' ', '_'))\n", - " #else:\n", - " # results.append('\\n')\n", - " date = re.findall(r'(\\d+-\\d+-\\d+)',line)\n", - " if date:\n", - " results.append('effective_date=' + jur.group().replace(' ', '_'))\n", - " results.append('\\n')\n", - " with open(path_out, 'w') as file:\n", - " for r in results:\n", - " file.write(r + '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71adc3b1", - "metadata": {}, - "outputs": [], - "source": [ - "#pliki\n", - "nda('dev-0/in.tsv', 'dev-0/out.tsv')\n", - "nda('train/in.tsv', 'train/out.tsv')\n", - "nda('test-A/in.tsv', 'test-A/out.tsv')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}