#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Assign sequential integer ids to the documents listed in ``DGT.ids``.

Each input line is tab-separated; its second field is expected to look like
``pl/<DOC>...xml.gz`` where ``<DOC>`` is a leading run of upper-case letters
and digits.  Two files are produced:

* ``ids.txt`` - one integer id per accepted input line, in input order.
* ``ids_sources.txt`` - one line per distinct document, written the first
  time the document is seen: ``<id> TAB <EUR-Lex URL> TAB <source name>``.

Lines that do not fit the expected shape are reported on standard output and
produce no output in either file.
"""

import re
import sys

# Second field of a valid line: "pl/", the document number (captured), any
# suffix, and the ".xml.gz" extension.  "$" also matches just before a
# trailing newline, so the field may be the last one on the line.
p = re.compile(r'^pl/([A-Z0-9]+).*\.xml\.gz$')

# Template for one ids_sources.txt record: numeric id, URL built from the
# document number (used as the CELEX identifier), human-readable source name.
_SOURCE_LINE = ('%d\thttps://eur-lex.europa.eu/legal-content/PL/TXT/'
                '?uri=CELEX:%s\tBaza aktów prawnych Unii Europejskiej\n')


def _doc_number(line):
    """Return the document number found in *line*, or None if it is malformed."""
    fields = line.split('\t')
    # A line without any tab (e.g. a blank trailing line) has no second
    # field; treat it like any other malformed line instead of crashing.
    if len(fields) < 2:
        return None
    m = p.match(fields[1])
    return m.group(1) if m else None


def main(ids_path='DGT.ids', out_ids_path='ids.txt',
         out_sources_path='ids_sources.txt'):
    """Read *ids_path* and write the id list and the id-to-source table.

    Args:
        ids_path: tab-separated input file.
        out_ids_path: output file receiving one id per accepted input line.
        out_sources_path: output file receiving one record per distinct
            document.

    Returns:
        dict mapping each document number to its assigned integer id.
    """
    ids_dictionary = {}
    # Explicit UTF-8: the source name written below contains non-ASCII
    # characters, so the result must not depend on the locale's encoding.
    with open(ids_path, encoding='utf-8') as in_ids, \
            open(out_ids_path, 'w', encoding='utf-8') as out_ids, \
            open(out_sources_path, 'w', encoding='utf-8') as out_ids_sources:
        for line in in_ids:
            doc_number = _doc_number(line)
            if doc_number is None:
                print('Wrong line: ' + line.rstrip())
                continue
            if doc_number not in ids_dictionary:
                # Ids are handed out in order of first appearance: 0, 1, 2...
                doc_id = len(ids_dictionary)
                ids_dictionary[doc_number] = doc_id
                out_ids_sources.write(_SOURCE_LINE % (doc_id, doc_number))
            out_ids.write(str(ids_dictionary[doc_number]) + '\n')
    return ids_dictionary


if __name__ == '__main__':
    main()