# NOTE(review): this script is stored with its line breaks collapsed onto a
# single physical line, and a span of the original text is missing (see below).
# In this form it is not valid Python: the statements are not separated, and
# everything after the first '#' is read as a comment. Restore the file from
# its original before running or refactoring it.
#
# What the visible text shows, in order:
#   * Imports: re, pdfminer.high_level.extract_text, fitz, pytesseract,
#     PIL.Image and io.
#   * my_path: hard-coded path of the PDF to process; two alternative paths
#     are kept as commented-out assignments.
#   * postal_code_pattern: a raw-string regex that begins with `(?` and is then
#     cut off. The text resumes at ` 0), '')`, which is the tail of a different
#     expression -- presumably the `line_above` lookup, mirroring `line_below`
#     (TODO confirm against the original).
#   * Missing span: the definitions of TEXT, lines, i, line, line_above,
#     order_dates, order_numbers, buyers, and of dates() and ref_numbers(), plus
#     the start of what is presumably company(), are NOT visible here.
#   * Visible tail of the address logic (presumably the body of company(),
#     run per line that matches the postal-code pattern -- TODO confirm):
#       - line_2: nearest earlier non-empty line (scanning from i - 1 down to
#         0) that does not contain 'ul.' and differs from line_above; ''
#         if none is found. ('ul.' is presumably the Polish street prefix.)
#       - line_below: first non-empty line after index i; '' if none.
#       - Only when line_2, line_above, line and line_below are all non-empty
#         is a dict with keys 'line_2', 'line_above', 'postal_code_line' and
#         'line_below' built; it is appended to buyers unless an equal dict is
#         already present.
#   * Driver: calls dates(TEXT), ref_numbers(TEXT) and company(TEXT), converts
#     order_dates to a list, then prints the reference numbers, the document
#     dates, and each buyer as Company / St. / City / Country (taken from
#     line_2 / line_above / postal_code_line / line_below), followed by a
#     blank line.
import re from pdfminer.high_level import extract_text import fitz import pytesseract from PIL import Image import io my_path = r"C:/Users/DELL/Downloads/A-24VU-00511.PDF.pdf" #my_path = r"C:\Users\DELL\Downloads\Invoice_1.pdf" #my_path = r"C:\Users\DELL\Downloads\Kopia 1_240014-333-361-389-399-433-441-448-464-481_2024_06_20_CE240143 (INVOICE - ERD-FVZ%i9210621%i922024%i92NZ-1 - 1 - A1) - 1.pdf.pdf" postal_code_pattern = r"(? 0), '') line_2 = next((lines[j].strip() for j in range(i - 1, -1, -1) if len(lines[j].strip()) > 0 and 'ul.' not in lines[j].strip() and lines[j].strip() != line_above), '') line_below = next((lines[j].strip() for j in range(i + 1, len(lines)) if len(lines[j].strip()) > 0), '') if line_2 and line_above and line and line_below: buyer = { 'line_2': line_2, 'line_above': line_above, 'postal_code_line': line, 'line_below': line_below } if buyer not in buyers: buyers.append(buyer) dates(TEXT) ref_numbers(TEXT) company(TEXT) order_dates = list(order_dates) print("Reference numbers:", order_numbers) print("Document dates:", order_dates) print("Buyers:") for context in buyers: print("Company:", context['line_2']) print("St.:", context['line_above']) print("City:", context['postal_code_line']) print("Country:", context['line_below']) print()