Regular expressions for dates and postal codes

This commit is contained in:
Zuzanna Rachuba 2024-10-31 12:46:23 +01:00
parent e514156371
commit c32112799b

View File

@ -1,7 +1,10 @@
import fitz import fitz
import pdfplumber import re
my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf" my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf"
postal_code_pattern = r'\b(?!0)([1-9]\d?-\d{3}|[1-9]\d{4}|[1-9]\d? \d{3})\b'
with fitz.open(my_path) as doc: with fitz.open(my_path) as doc:
order_numbers = [] order_numbers = []
order_dates = [] order_dates = []
@ -12,10 +15,40 @@ with fitz.open(my_path) as doc:
if "Your ref. no. PO" in text: if "Your ref. no. PO" in text:
parts = text.split("Your ref. no. PO") parts = text.split("Your ref. no. PO")
for part in parts[1:]: #idziemy od 2 elementu, żeby nie brać tekstu for part in parts[1:]:
order_number = part.split()[0].strip() order_number = part.split()[0].strip()
order_numbers.append(order_number) order_numbers.append(order_number)
print("Reference numbers: ", order_numbers)
# Gather every NN/NN/NN..NNNN shaped token found in the page text as a document date.
date_pattern = r'\b\d{2}/\d{2}/\d{2,4}\b'
order_dates.extend(found.group() for found in re.finditer(date_pattern, text))
# Look for address blocks in the page text: a line that contains a postal code
# is recorded together with the two lines above it and the line below it.
lines = text.splitlines()
# Start at index 1 so every candidate line has at least one line above it.
for i in range(1, len(lines)):
    line = lines[i]
    if not re.search(postal_code_pattern, line):
        continue
    # i - 2 is negative when i == 1; a negative index would silently wrap
    # around and pick up the LAST line of the page, so require i >= 2.
    line_2 = lines[i - 2] if i >= 2 else ''
    # i >= 1 is guaranteed by the range above, so no guard is needed here.
    line_above = lines[i - 1]
    line_below = lines[i + 1] if i + 1 < len(lines) else ''
    # Keep the hit only when the line consists solely of ASCII letters,
    # digits, spaces and hyphens.
    if not re.search(r'[^0-9a-zA-Z \-]', line):
        buyers.append({
            'line_2': line_2,
            'line_above': line_above,
            'postal_code_line': line,
            'line_below': line_below,
        })
# Report everything that was collected: reference numbers, dates, then one
# four-line record per buyer followed by a blank separator line.
print("Reference numbers:", order_numbers)
print("Document dates:", order_dates)
print("Buyers: ")
buyer_fields = (
    ("Company: ", 'line_2'),
    ("St. :", 'line_above'),
    ("City: ", 'postal_code_line'),
    ("Country:", 'line_below'),
)
for context in buyers:
    for label, key in buyer_fields:
        print(label, context[key])
    print()