"""Extract order references, dates and buyer address blocks from a PDF.

Each page's plain text is scanned for:

* the token following every "Your ref. no. PO" marker,
* dates written as DD/DD/DD, DD/DD/DDD or DD/DD/DDDD (two digits, slash,
  two digits, slash, two to four digits),
* lines containing a postal-code-like token, together with the two lines
  above and the one line below them.

Run as a script; an optional first command-line argument overrides the
default PDF path.
"""
import re
import sys
from typing import Dict, List, Sequence

my_path = r"C:\Users\DELL\Downloads\A-24VU-00511.PDF.pdf"

# Postal-code-like token that does not start with 0: one or two digits, a
# hyphen and three digits ("12-345"), five digits ("12345"), or one or two
# digits, a space and three digits ("12 345").
postal_code_pattern = r'\b(?!0)([1-9]\d?-\d{3}|[1-9]\d{4}|[1-9]\d? \d{3})\b'

# Compiled once here instead of being looked up for every page / line.
_POSTAL_CODE_RE = re.compile(postal_code_pattern)
_DATE_RE = re.compile(r'\b\d{2}/\d{2}/\d{2,4}\b')
# Any character other than ASCII letters, digits, space or hyphen.
_DISALLOWED_CHAR_RE = re.compile(r'[^0-9a-zA-Z \-]')

_ORDER_MARKER = "Your ref. no. PO"


def extract_order_numbers(text: str) -> List[str]:
    """Return the whitespace-delimited token after each order marker.

    Args:
        text: Plain text of one page.

    Returns:
        One token per "Your ref. no. PO" occurrence that is followed by at
        least one non-whitespace token, in order of appearance.
    """
    order_numbers = []
    # Element 0 is whatever precedes the first marker, so skip it.
    for part in text.split(_ORDER_MARKER)[1:]:
        tokens = part.split()
        # A marker at the very end of the text (or followed only by
        # whitespace) has no token; skip it instead of raising IndexError.
        if tokens:
            order_numbers.append(tokens[0])
    return order_numbers


def extract_dates(text: str) -> List[str]:
    """Return every date-like substring of *text*, in order of appearance."""
    return _DATE_RE.findall(text)


def extract_buyers(lines: Sequence[str]) -> List[Dict[str, str]]:
    """Collect address context around lines that look like a postal-code line.

    A line qualifies when it contains a postal-code-like token and consists
    only of ASCII letters, digits, spaces and hyphens.

    Args:
        lines: The lines of one page, in order.

    Returns:
        One dict per qualifying line with the keys ``line_2`` (two lines
        above), ``line_above``, ``postal_code_line`` and ``line_below``.
        A neighbour that does not exist is reported as ``''``.
    """
    buyers = []
    # Start at 1: the first line has no line above it, so it is never
    # treated as a postal-code line.
    for i in range(1, len(lines)):
        line = lines[i]
        if not _POSTAL_CODE_RE.search(line):
            continue
        if _DISALLOWED_CHAR_RE.search(line):
            continue
        buyers.append({
            # Guard with i >= 2: at i == 1 the index i - 2 is -1, which
            # would silently wrap around to the last line of the page.
            'line_2': lines[i - 2] if i >= 2 else '',
            'line_above': lines[i - 1],
            'postal_code_line': line,
            'line_below': lines[i + 1] if i + 1 < len(lines) else '',
        })
    return buyers


def main(path: str = my_path) -> None:
    """Scan the PDF at *path* and print the extracted references.

    Args:
        path: Location of the PDF to read; defaults to ``my_path``.
    """
    # Imported here so the pure-text helpers above can be imported and
    # exercised without PyMuPDF being installed.
    import fitz

    order_numbers = []
    order_dates = []
    buyers = []

    with fitz.open(path) as doc:
        for page in doc:
            text = page.get_text("text")
            order_numbers.extend(extract_order_numbers(text))
            order_dates.extend(extract_dates(text))
            buyers.extend(extract_buyers(text.splitlines()))

    print("Reference numbers:", order_numbers)
    print("Document dates:", order_dates)
    print("Buyers: ")
    for context in buyers:
        print("Company: ", context['line_2'])
        print("St. :", context['line_above'])
        print("City: ", context['postal_code_line'])
        print("Country:", context['line_below'])
        print()


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else my_path)