#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Assign sequential numeric ids to the documents listed in ``DGT.ids``.

Run from the directory that contains ``DGT.ids``.  Two files are written:

* ``ids.txt`` -- one numeric document id per accepted input line.
* ``ids_sources.txt`` -- one line per distinct document:
  ``<id>\t<EUR-Lex URL>\t<source label>``.

The accompanying ``dgt/.gitignore`` excludes ``DGT*``, ``ids.txt`` and
``ids_sources.txt`` from version control, i.e. the input and both outputs.
"""

import re
import sys

# Matches the second tab-separated field of a DGT.ids line; group 1 is the
# leading run of upper-case letters and digits after "pl/", which is used as
# the document number in the EUR-Lex URL.
p = re.compile(r'^pl/([A-Z0-9]+).*\.xml\.gz$')


def assign_ids(in_ids, out_ids, out_ids_sources, report=print):
    """Map every document number found in *in_ids* to a sequential id.

    Args:
        in_ids: iterable of text lines; the second tab-separated field of
            each line is expected to look like ``pl/<DOCNUMBER>....xml.gz``.
        out_ids: writable text stream; receives the numeric id of each
            accepted line, one per line, in input order.
        out_ids_sources: writable text stream; receives one line per
            distinct document number, written when it is first seen.
        report: callable invoked with a ``'Wrong line: ...'`` message for
            every rejected line (defaults to ``print``).

    Returns:
        The number of distinct document numbers encountered.
    """
    ids_dictionary = {}
    for line in in_ids:
        fields = line.split('\t')
        # A line without a tab (e.g. a blank line) has no second field;
        # report it like any other malformed line instead of crashing.
        m = p.match(fields[1]) if len(fields) > 1 else None
        if not m:
            report('Wrong line: ' + line.rstrip())
            continue  # rejected lines produce no output in either file

        doc_number = m.group(1)
        doc_id = ids_dictionary.get(doc_number)
        if doc_id is None:
            # Ids are handed out in order of first appearance, starting at 0,
            # so the next free id always equals the number of known documents.
            doc_id = len(ids_dictionary)
            ids_dictionary[doc_number] = doc_id
            out_ids_sources.write('%d\thttps://eur-lex.europa.eu/legal-content/PL/TXT/?uri=CELEX:%s\tBaza aktów prawnych Unii Europejskiej\n' % (doc_id, doc_number))
        out_ids.write(str(doc_id) + '\n')
    return len(ids_dictionary)


def main():
    """Process ``DGT.ids`` from the current working directory."""
    # The outputs are opened explicitly as UTF-8: the source label contains
    # non-ASCII characters, so relying on the locale's default encoding can
    # fail with UnicodeEncodeError or yield a differently-encoded file.
    # The input keeps the platform default encoding, as before.
    with open('DGT.ids') as in_ids, \
            open('ids.txt', 'w', encoding='utf-8') as out_ids, \
            open('ids_sources.txt', 'w', encoding='utf-8') as out_ids_sources:
        assign_ids(in_ids, out_ids, out_ids_sources)


if __name__ == '__main__':
    main()