process dgt

This commit is contained in:
Rafał Jaworski 2019-06-27 14:40:43 +02:00
parent 5ec6b5ba13
commit ebc152e636
2 changed files with 27 additions and 0 deletions

3
dgt/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
DGT*
ids.txt
ids_sources.txt

24
dgt/process_dgt_ids.py Executable file
View File

@ -0,0 +1,24 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys, re
# Accepts file names of the form "pl/<DOC>....xml.gz" and captures <DOC>, the
# leading run of uppercase letters and digits right after the "pl/" prefix.
DOC_FILENAME = re.compile(r'^pl/([A-Z0-9]+).*\.xml\.gz$')

# One row of the sources file: numeric id, EUR-Lex URL built from the document
# number, and a fixed human-readable source name.
SOURCE_ROW = (
    '%d\thttps://eur-lex.europa.eu/legal-content/PL/TXT/?uri=CELEX:%s'
    '\tBaza aktów prawnych Unii Europejskiej\n'
)


def process_ids(in_ids, out_ids, out_ids_sources):
    """Assign consecutive numeric ids to the documents listed in *in_ids*.

    Each input line is split on tabs and its second field is taken as a file
    name. The document number is extracted from that file name; the first
    time a document number is seen it receives the next free id (starting
    at 0) and one row describing it is written to *out_ids_sources*. For
    every accepted input line the id of its document is written to
    *out_ids*, one id per line.

    Lines that have no second field, or whose file name does not match the
    expected pattern, are reported on standard output as ``Wrong line: ...``
    and produce no output row.

    Args:
        in_ids: iterable of text lines (e.g. an open text file).
        out_ids: object with a ``write(str)`` method receiving one id per
            accepted input line.
        out_ids_sources: object with a ``write(str)`` method receiving one
            row per distinct document number.

    Returns:
        dict mapping each document number to the id assigned to it.
    """
    ids_dictionary = {}
    for line in in_ids:
        fields = line.split('\t')
        # A line without any tab has no file-name field at all; treat it as
        # malformed instead of failing with IndexError.
        m = DOC_FILENAME.match(fields[1]) if len(fields) > 1 else None
        if not m:
            print('Wrong line: ' + line.rstrip())
            continue

        doc_number = m.group(1)
        if doc_number not in ids_dictionary:
            # Ids are handed out in order of first appearance, so the next
            # free id always equals the number of documents seen so far.
            next_free = len(ids_dictionary)
            ids_dictionary[doc_number] = next_free
            out_ids_sources.write(SOURCE_ROW % (next_free, doc_number))
        out_ids.write(str(ids_dictionary[doc_number]) + '\n')
    return ids_dictionary


def main():
    """Read ``DGT.ids`` and write ``ids.txt`` and ``ids_sources.txt``."""
    # The sources file contains non-ASCII text, so the encoding is pinned
    # rather than inherited from the locale.
    # NOTE(review): the input is assumed to be UTF-8 as well - confirm.
    with open('DGT.ids', encoding='utf-8') as in_ids, \
            open('ids.txt', 'w', encoding='utf-8') as out_ids, \
            open('ids_sources.txt', 'w', encoding='utf-8') as out_ids_sources:
        process_ids(in_ids, out_ids, out_ids_sources)


if __name__ == '__main__':
    main()