Added regex for postal-codes

This commit is contained in:
Zuzanna Rachuba 2024-11-03 12:15:47 +01:00
parent c32112799b
commit 364798ca52

View File

@ -2,9 +2,7 @@ import fitz
import re import re
my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf" my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf"
postal_code_pattern = r"(?<!\S)(?:(?:[A-Za-z]{3,}\s+)(\d{5}|\d{2}-\d{3})|(\d{5}|\d{2}-\d{3})(?:\s+[A-Za-z]{3,}))(?!\S)"
postal_code_pattern = r'\b(?!0)([1-9]\d?-\d{3}|[1-9]\d{4}|[1-9]\d? \d{3})\b'
with fitz.open(my_path) as doc: with fitz.open(my_path) as doc:
order_numbers = [] order_numbers = []
order_dates = [] order_dates = []
@ -34,8 +32,6 @@ with fitz.open(my_path) as doc:
line_above = lines[i - 1] if i > 0 else '' line_above = lines[i - 1] if i > 0 else ''
line_below = lines[i + 1] if i + 1 < len(lines) else '' line_below = lines[i + 1] if i + 1 < len(lines) else ''
if not re.search(r'[^0-9a-zA-Z \-]', line):
buyers.append({ buyers.append({
'line_2': line_2, 'line_2': line_2,
'line_above': line_above, 'line_above': line_above,