From c32112799bebcf177cfaa847d21a3c106b448e2a Mon Sep 17 00:00:00 2001 From: Zuzanna Rachuba Date: Thu, 31 Oct 2024 12:46:23 +0100 Subject: [PATCH] Regular expressions for dates and postal codes --- projekt.py | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/projekt.py b/projekt.py index 0a36fc9..2d9ce13 100644 --- a/projekt.py +++ b/projekt.py @@ -1,7 +1,10 @@ import fitz -import pdfplumber +import re + my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf" +postal_code_pattern = r'\b(?!0)([1-9]\d?-\d{3}|[1-9]\d{4}|[1-9]\d? \d{3})\b' + with fitz.open(my_path) as doc: order_numbers = [] order_dates = [] @@ -12,10 +15,40 @@ with fitz.open(my_path) as doc: if "Your ref. no. PO" in text: parts = text.split("Your ref. no. PO") - for part in parts[1:]: #idziemy od 2 elementu, żeby nie brać tekstu + for part in parts[1:]: order_number = part.split()[0].strip() order_numbers.append(order_number) - - -print("Reference numbers: ", order_numbers) + date_pattern = r'\b\d{2}/\d{2}/\d{2,4}\b' + matches = re.findall(date_pattern, text) + order_dates.extend(matches) + + + lines = text.splitlines() + for i in range(1, len(lines)): + line = lines[i] + + if re.search(postal_code_pattern, line): + + line_2 = lines[i-2] if i>0 else '' + line_above = lines[i - 1] if i > 0 else '' + line_below = lines[i + 1] if i + 1 < len(lines) else '' + + + if not re.search(r'[^0-9a-zA-Z \-]', line): + buyers.append({ + 'line_2': line_2, + 'line_above': line_above, + 'postal_code_line': line, + 'line_below': line_below + }) + +print("Reference numbers:", order_numbers) +print("Document dates:", order_dates) +print("Buyers: ") +for context in buyers: + print("Company: ", context['line_2']) + print("St. :", context['line_above']) + print("City: ", context['postal_code_line']) + print("Country:", context['line_below']) + print()