concordia-aligner/dgt/process_dgt_ids.py
Rafał Jaworski ebc152e636 process dgt
2019-06-27 14:40:43 +02:00

25 lines
861 B
Python
Executable File

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys, re
# Accepts file names of the form 'pl/<ID>...xml.gz'. Group 1 is the leading
# run of uppercase letters and digits after 'pl/', used as the document number.
p = re.compile(r'^pl/([A-Z0-9]+).*\.xml\.gz$')


def process_ids(in_ids, out_ids, out_ids_sources):
    """Assign sequential integer ids to the distinct documents listed in *in_ids*.

    Each input line is split on tabs and its second field is matched against
    the module-level pattern ``p``. For every matching line the id of its
    document number is written to *out_ids* (one id per line). The first time
    a document number is seen it receives the next free id (starting at 0) and
    a line ``<id>\\t<EUR-Lex URL>\\t<source description>`` is written to
    *out_ids_sources*.

    Lines that have no second field, or whose second field does not match the
    pattern, are reported on standard output as ``Wrong line: ...`` and
    produce no output in either file.

    Args:
        in_ids: iterable of text lines (e.g. an open text file).
        out_ids: writable text stream receiving one id per accepted line.
        out_ids_sources: writable text stream receiving one line per distinct
            document number.

    Returns:
        dict mapping each document number to its assigned integer id.
    """
    ids_dictionary = {}
    for line in in_ids:
        fields = line.split('\t')
        # A line without a tab has no second field; treat it like any other
        # malformed line instead of failing with IndexError.
        m = p.match(fields[1]) if len(fields) > 1 else None
        if not m:
            print('Wrong line: ' + line.rstrip())
            continue
        doc_number = m.group(1)
        if doc_number not in ids_dictionary:
            # Ids are handed out consecutively, so the next free id always
            # equals the number of documents registered so far.
            next_free = len(ids_dictionary)
            ids_dictionary[doc_number] = next_free
            out_ids_sources.write('%d\thttps://eur-lex.europa.eu/legal-content/PL/TXT/?uri=CELEX:%s\tBaza aktów prawnych Unii Europejskiej\n' % (next_free, doc_number))
        out_ids.write(str(ids_dictionary[doc_number]) + '\n')
    return ids_dictionary


if __name__ == '__main__':
    # Explicit UTF-8: the source description written to ids_sources.txt
    # contains non-ASCII characters, so relying on the locale's default
    # encoding can fail or produce a differently encoded file.
    # NOTE(review): DGT.ids is presumed to be UTF-8 (or plain ASCII) - confirm.
    with open('DGT.ids', encoding='utf-8') as in_ids, \
            open('ids.txt', 'w', encoding='utf-8') as out_ids, \
            open('ids_sources.txt', 'w', encoding='utf-8') as out_ids_sources:
        process_ids(in_ids, out_ids, out_ids_sources)