# csi_rekrutacja/projekt.py

import re
from pdfminer.high_level import extract_text
import fitz
import pytesseract
from PIL import Image
import io


# Path of the PDF to analyse; it is passed to extract_full_text() when the
# module-level TEXT constant is built.
my_path = r"C:/Users/DELL/Downloads/A-24VU-00511.PDF.pdf"
# Alternative input documents (presumably kept for manual testing -- confirm);
# uncomment one of them to switch the input file.
#my_path = r"C:\Users\DELL\Downloads\Invoice_1.pdf"
#my_path = r"C:\Users\DELL\Downloads\Kopia 1_240014-333-361-389-399-433-441-448-464-481_2024_06_20_CE240143 (INVOICE - ERD-FVZ%i9210621%i922024%i92NZ-1 - 1 - A1) - 1.pdf.pdf"


# Whitespace-delimited "word + postal code" or "postal code + word" pair.
# The word is three or more ASCII letters (letters with diacritics do not
# match); the code is either five digits or the NN-NNN form. Group 1 holds the
# code when the word comes first, group 2 when the code comes first.
postal_code_pattern = r"(?<!\S)(?:(?:[A-Za-z]{3,}\s+)(\d{5}|\d{2}-\d{3})|(\d{5}|\d{2}-\d{3})(?:\s+[A-Za-z]{3,}))(?!\S)"
# Same pair, but with an optional hyphen between word and code. The separator
# is `\s+-?\s+`, so without a hyphen at least two whitespace characters are
# required between the two parts.
postal_code_pattern2 = r"(?<!\S)(?:[A-Za-z]{3,}\s+-?\s+(\d{5}|\d{2}-\d{3})|(\d{5}|\d{2}-\d{3})\s+-?\s+[A-Za-z]{3,})(?!\S)"
# Slash-separated date: two digits / two digits / two to four digits,
# bounded by word boundaries on both sides.
date_pattern = r'\b\d{2}/\d{2}/\d{2,4}\b'
# Literal "ref." followed by optional whitespace and any run of non-digit
# characters (matched lazily), then a captured number of at least five digits
# that does not start with zero. The pattern is lower-case only; the text it
# is applied to is lower-cased by extract_full_text().
reference_pattern = r"ref\.\s*[^0-9]*?([1-9]\d{4,})"

# Module-level accumulators filled as a side effect by the extractor functions.
# Reference numbers captured by ref_numbers(); duplicates are kept.
order_numbers = []
# Date strings found by dates(); a set, so each distinct string is kept once.
# The script rebinds this name to a list before printing.
order_dates = set()
# Address blocks (dicts of four text lines) collected by company().
buyers = []

def extract_full_text(pdf_path):
    """Return the lower-cased text of a PDF, OCR-ing pages that carry no text.

    The embedded text of the whole document is read with pdfminer's
    ``extract_text``. The document is then opened with PyMuPDF and every page
    whose text layer is empty (or whitespace only) is rendered at 300 dpi and
    passed to Tesseract with the Polish language model. OCR output is appended
    after the embedded text, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The lower-cased embedded text followed by the lower-cased OCR text of
        the pages that needed it.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = [extract_text(pdf_path).lower()]

    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # A text layer made only of whitespace holds nothing usable, so it
            # is treated like a missing one and the page is handled as an image.
            if page.get_text().strip():
                continue

            pix = page.get_pixmap(dpi=300)
            image = Image.open(io.BytesIO(pix.tobytes("png")))
            parts.append(pytesseract.image_to_string(image, lang="pol").lower())  # OCR

    return "".join(parts)


# Full lower-cased text of the document at my_path. Built once, at import
# time, which means the PDF is read (and OCR-ed where needed) on import.
TEXT = extract_full_text(my_path)


def dates(text):
    """Record every substring of *text* matching ``date_pattern``.

    Matches are added to the module-level ``order_dates`` set, so a date
    string that occurs several times is stored once. Returns nothing.
    """
    order_dates.update(re.findall(date_pattern, text))

def ref_numbers(text):
    """Record the number captured by every ``reference_pattern`` match in *text*.

    Captured numbers are appended, in order of appearance and without
    de-duplication, to the module-level ``order_numbers`` list. Returns nothing.
    """
    order_numbers.extend(re.findall(reference_pattern, text))

def company(text):
    """Collect the address block around every "postal code + word" line.

    Each line of *text* that matches ``postal_code_pattern`` or
    ``postal_code_pattern2`` is taken as the postal-code line of an address.
    Three neighbouring lines are then looked up, ignoring blank lines:

    * ``line_above`` -- the nearest non-empty line above;
    * ``line_2`` -- the nearest non-empty line above that does not contain
      ``'ul.'`` and is not equal to ``line_above``;
    * ``line_below`` -- the nearest non-empty line below.

    When all three exist, a dict with the keys ``line_2``, ``line_above``,
    ``postal_code_line`` and ``line_below`` is appended to the module-level
    ``buyers`` list unless an equal dict is already there. All four values are
    stored with surrounding whitespace removed, so the same address printed
    with different indentation is recognised as a duplicate.

    Args:
        text: Document text, one address component per line.
    """
    raw_lines = text.splitlines()
    # Strip once up front instead of re-stripping inside every lookup.
    stripped = [raw.strip() for raw in raw_lines]

    for i, raw in enumerate(raw_lines):
        if not (re.search(postal_code_pattern, raw)
                or re.search(postal_code_pattern2, raw)):
            continue

        line_above = next(
            (stripped[j] for j in range(i - 1, -1, -1) if stripped[j]),
            '',
        )
        # Skip street lines ('ul.') and anything identical to line_above.
        line_2 = next(
            (
                stripped[j]
                for j in range(i - 1, -1, -1)
                if stripped[j]
                and 'ul.' not in stripped[j]
                and stripped[j] != line_above
            ),
            '',
        )
        line_below = next(
            (stripped[j] for j in range(i + 1, len(stripped)) if stripped[j]),
            '',
        )

        # An incomplete block (e.g. match on the first or last line) is dropped.
        if not (line_2 and line_above and line_below):
            continue

        buyer = {
            'line_2': line_2,
            'line_above': line_above,
            # Stored stripped, like the other fields, so that indentation does
            # not leak into the output or defeat the duplicate check below.
            'postal_code_line': stripped[i],
            'line_below': line_below,
        }
        if buyer not in buyers:
            buyers.append(buyer)


# Run the three extractors over the document text; each one fills its
# module-level accumulator (order_dates, order_numbers, buyers) as a side effect.
dates(TEXT)
ref_numbers(TEXT)
company(TEXT)


# A set of strings iterates in an order that can change from run to run, so
# sort it to keep the report stable. The sort is lexicographic on the raw
# date strings, not chronological.
order_dates = sorted(order_dates)

print("Reference numbers:", order_numbers)
print("Document dates:", order_dates)
print("Buyers:")

# The labels assume the usual layout of an address block: name, street,
# postal code + city, country -- the captured lines are printed in that order.
for context in buyers:
    print("Company:", context['line_2'])
    print("St.:", context['line_above'])
    print("City:", context['postal_code_line'])
    print("Country:", context['line_below'])
    print()