2024-10-30 13:25:15 +01:00
|
|
|
import fitz
|
2024-10-31 12:46:23 +01:00
|
|
|
import re
|
|
|
|
|
2024-10-30 13:25:15 +01:00
|
|
|
# PDF document to scan; hard-coded absolute Windows path.
my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf"

# Regex for a postal code that sits next to a word of at least three ASCII
# letters. The code is either five digits ("12345") or in the "12-345" form.
# The pieces below concatenate to a single pattern string.
postal_code_pattern = (
    r"(?<!\S)"                                  # must start at a whitespace boundary
    r"(?:"
    r"(?:[A-Za-z]{3,}\s+)(\d{5}|\d{2}-\d{3})"   # word, whitespace, code (group 1)
    r"|"
    r"(\d{5}|\d{2}-\d{3})(?:\s+[A-Za-z]{3,})"   # code (group 2), whitespace, word
    r")"
    r"(?!\S)"                                   # must end at a whitespace boundary
)
|
2024-10-30 13:25:15 +01:00
|
|
|
# Scan every page of the PDF at ``my_path`` and collect, from the extracted
# page text:
#   * order_numbers - the first whitespace-delimited token following each
#     occurrence of the "Your ref. no. PO" marker,
#   * order_dates   - every substring shaped like dd/dd/dd up to dd/dd/dddd,
#   * buyers        - for each line matching ``postal_code_pattern``, that
#     line together with the two lines above it and the one line below it.
# The results are then printed to stdout.

order_marker = "Your ref. no. PO"

# Compile both patterns once instead of rebuilding/looking them up for every
# page and every line.
date_regex = re.compile(r'\b\d{2}/\d{2}/\d{2,4}\b')
postal_code_regex = re.compile(postal_code_pattern)

order_numbers = []
order_dates = []
buyers = []

with fitz.open(my_path) as doc:
    for page in doc:
        text = page.get_text("text")

        # split() yields a single element when the marker is absent, so the
        # slice is empty and no separate membership test is needed.
        for part in text.split(order_marker)[1:]:
            tokens = part.split()
            # A marker at the very end of the page text has nothing after it;
            # skip it rather than fail with IndexError on tokens[0].
            if tokens:
                order_numbers.append(tokens[0])

        order_dates.extend(date_regex.findall(text))

        lines = text.splitlines()
        # The first line of a page is never tested as a postal-code line
        # (the scan starts at index 1), so a line above always exists.
        for i, line in enumerate(lines[1:], start=1):
            if not postal_code_regex.search(line):
                continue

            buyers.append({
                # Two lines up only exists from index 2 onward. For i == 1 a
                # bare lines[i - 2] would be lines[-1] and silently return
                # the LAST line of the page.
                'line_2': lines[i - 2] if i >= 2 else '',
                'line_above': lines[i - 1],
                'postal_code_line': line,
                'line_below': lines[i + 1] if i + 1 < len(lines) else '',
            })

print("Reference numbers:", order_numbers)
print("Document dates:", order_dates)
print("Buyers: ")
for context in buyers:
    # The labels reflect the assumed address layout around the postal-code
    # line; the values are simply the neighbouring raw text lines.
    print("Company: ", context['line_2'])
    print("St. :", context['line_above'])
    print("City: ", context['postal_code_line'])
    print("Country:", context['line_below'])
    print()
|