process dgt
This commit is contained in:
parent
5ec6b5ba13
commit
ebc152e636
3
dgt/.gitignore
vendored
Normal file
3
dgt/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
DGT*
|
||||
ids.txt
|
||||
ids_sources.txt
|
24
dgt/process_dgt_ids.py
Executable file
24
dgt/process_dgt_ids.py
Executable file
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys, re
|
||||
|
||||
# Matches the Polish-side file path of a DGT document and captures the
# leading run of uppercase letters/digits, which is used below as the
# CELEX identifier in the EUR-Lex URL.
p = re.compile(r'^pl/([A-Z0-9]+).*\.xml\.gz$')

# One line of ids_sources.txt: numeric ID, EUR-Lex URL, source name.
SOURCE_LINE = ('%d\thttps://eur-lex.europa.eu/legal-content/PL/TXT/?uri=CELEX:%s'
               '\tBaza aktów prawnych Unii Europejskiej\n')


def process_ids(in_ids, out_ids, out_ids_sources):
    """Assign consecutive numeric IDs to the documents listed in *in_ids*.

    Each input line is tab-separated; its second field is expected to be a
    file path matching ``p``.  For every matching line, the numeric ID of
    the document is written as one line to *out_ids*.  The first time a
    document number is seen it gets the next free ID (starting at 0) and a
    describing line is written to *out_ids_sources*.

    Lines whose second field is missing or does not match are reported on
    stdout as ``Wrong line: ...`` and produce no output line.

    Args:
        in_ids: iterable of text lines (e.g. an open text file).
        out_ids: writable text stream receiving one ID per matched line.
        out_ids_sources: writable text stream receiving one line per
            distinct document.

    Returns:
        dict mapping each document number to its assigned numeric ID.
    """
    ids_dictionary = {}
    for line in in_ids:
        fields = line.split('\t')
        # A line without a tab has no second field; treat it like any
        # other malformed line instead of crashing with IndexError.
        m = p.match(fields[1]) if len(fields) > 1 else None
        if not m:
            print('Wrong line: ' + line.rstrip())
            continue

        doc_number = m.group(1)
        if doc_number not in ids_dictionary:
            # IDs are dense and zero-based, so the next free ID is simply
            # the number of documents registered so far.
            next_free = len(ids_dictionary)
            ids_dictionary[doc_number] = next_free
            out_ids_sources.write(SOURCE_LINE % (next_free, doc_number))
        out_ids.write(str(ids_dictionary[doc_number]) + '\n')
    return ids_dictionary


def main():
    """Read DGT.ids and write ids.txt and ids_sources.txt in the current directory."""
    # Explicit UTF-8: the sources file contains non-ASCII text, so relying
    # on the locale's default encoding can fail or produce wrong bytes.
    with open('DGT.ids', encoding='utf-8') as in_ids, \
            open('ids.txt', 'w', encoding='utf-8') as out_ids, \
            open('ids_sources.txt', 'w', encoding='utf-8') as out_ids_sources:
        process_ids(in_ids, out_ids, out_ids_sources)


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user