"""Extract jurisdiction and effective-date fields from text files.

For every line of an input file, `nda` looks for a US state or territory
name and for a dash-separated numeric date, and writes whatever it found
to an output file as ``key=value`` entries.
"""

import re

# Names recognised as jurisdictions. Multi-word names must be a single item,
# otherwise the alternation below can only ever match the fragments.
states = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
    "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota",
    "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
    "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
    "Wisconsin", "West Virginia", "Wyoming",
]

# `nda` lower-cases each line before searching, so the pattern has to ignore
# case; a case-sensitive pattern built from the capitalised names above can
# never match. Names are escaped so they are always treated literally.
rgx = re.compile(
    r'\b(' + '|'.join(re.escape(name) for name in states) + r')\b',
    re.IGNORECASE,
)

# Dash-separated groups of digits, e.g. 2011-07-13.
_DATE_RGX = re.compile(r'\d+-\d+-\d+')


def nda(path_in, path_out):
    """Write the jurisdiction and date found on each line of a file.

    Each input line has '.' and ',' replaced by spaces and is lower-cased.
    The first state name found yields ``jurisdiction=<name>`` (spaces
    replaced by underscores, lower-case because of the normalisation) and
    the first ``digits-digits-digits`` sequence yields
    ``effective_date=<date>``. A field that is not found is simply omitted.

    Args:
        path_in: Path of the UTF-8 text file to read.
        path_out: Path of the file to write; it is overwritten.

    Raises:
        OSError: If either file cannot be opened.
    """
    results = []
    with open(path_in, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.replace('.', ' ').replace(',', ' ').lower()

            jur = rgx.search(line)
            if jur:
                results.append('jurisdiction=' + jur.group().replace(' ', '_'))

            # The date value must come from the date match itself; using the
            # jurisdiction match here emitted the wrong value and crashed
            # whenever a line had a date but no recognised state.
            date = _DATE_RGX.search(line)
            if date:
                results.append('effective_date=' + date.group())

            # Record separator for this input line.
            # NOTE(review): because every entry is written with a trailing
            # newline, each record ends with two blank lines and fields sit
            # on separate lines - confirm this is the layout the consumer of
            # the output file expects.
            results.append('\n')

    with open(path_out, 'w', encoding='utf-8') as file:
        for r in results:
            file.write(r + '\n')


if __name__ == "__main__":
    # files
    nda('dev-0/in.tsv', 'dev-0/out.tsv')
    nda('train/in.tsv', 'train/out.tsv')
    nda('test-A/in.tsv', 'test-A/out.tsv')