"""Extract the VAT id, seller name and gross total from a PDF invoice via OCR.

The first page of an uploaded PDF is rendered to a PNG, run through Tesseract
and the resulting text is searched with regular expressions.
"""

import re

import cv2
import fitz
import pytesseract
from fastapi import FastAPI, File, HTTPException

# PyMuPDF renders at 72 dpi by default, so scale by DPI / 72 to get DPI output.
DPI = 300
ZOOM = DPI / 72

app = FastAPI()


def save_pdf(pdf_bytes: bytes) -> str:
    """Write *pdf_bytes* to ``invoice.pdf`` in the working directory.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.

    Returns:
        The name of the file that was written.
    """
    # NOTE(review): the file name is fixed, so overlapping requests would
    # overwrite each other's file - confirm this is acceptable.
    file_name = "invoice.pdf"
    # The context manager closes the handle even if the write fails.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name


def convert_pdf_to_png(file: str) -> str:
    """Render the first page of the PDF *file* to ``invoice.png``.

    Args:
        file: Path of the PDF to render.

    Returns:
        The name of the PNG file that was written.
    """
    file_name = "invoice.png"
    magnify = fitz.Matrix(ZOOM, ZOOM)
    # Close the document as soon as the page has been rasterized.
    with fitz.open(file) as doc:
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    return file_name


def ocr_png_file(image_name: str) -> str:
    """Run Tesseract OCR on the image *image_name* and return the text.

    Args:
        image_name: Path of the image to recognize.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    image = cv2.imread(image_name)
    # Polish language data, default OCR engine mode, page segmentation mode 6.
    custom_config = r'-l pol --oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)


def _search_or_raise(pattern: str, invoice_text: str, field: str) -> "re.Match":
    """Return the first match of *pattern*, or raise ValueError naming *field*."""
    match = re.search(pattern, invoice_text)
    if match is None:
        raise ValueError(f"could not find {field} in the invoice text")
    return match


def find_vat_id(invoice_text: str) -> str:
    """Return the digits that follow the first ``"NIP: "`` label.

    The digits must be followed by a space to be recognized.

    Args:
        invoice_text: OCR text of the invoice.

    Returns:
        The VAT id as a string of digits.

    Raises:
        ValueError: If no ``"NIP: <digits> "`` sequence is present.
    """
    match = _search_or_raise(r"NIP: ([0-9]+) ", invoice_text, "VAT id (NIP)")
    return match.group(1)


def find_seller_name(invoice_text: str) -> str:
    """Return the company name found on the line after ``"Nabywca:"``.

    The name is the text of that line up to the last position where a
    legal-form marker (``sp.``, ``spółka``, ``S.A.``, ``S.K.A.``) begins;
    the fixed suffix ``" sp.z.o.o"`` is then appended.

    Args:
        invoice_text: OCR text of the invoice.

    Returns:
        The extracted name followed by ``" sp.z.o.o"``.

    Raises:
        ValueError: If the label or a legal-form marker is not found.
    """
    # NOTE(review): "Nabywca" is Polish for "buyer" while this function is
    # named find_seller_name - confirm which party is intended.
    # NOTE(review): the dots in the lookahead are unescaped (match any
    # character) and the suffix is always " sp.z.o.o" even when the marker
    # was S.A./S.K.A. - left as is to keep the current results unchanged.
    pattern = "Nabywca:\n.*((?=sp.|spółka|S.A.|S.K.A.))"
    match = _search_or_raise(pattern, invoice_text, "seller name")
    seller_name = match.group().replace("Nabywca:\n", "")
    return seller_name.strip() + " sp.z.o.o"


def find_total_sum(invoice_text: str) -> float:
    """Return the gross total of the invoice.

    The amount is taken from the first ``"Wartość brutto <amount> PLN"`` or
    ``"PLN <amount> PLN"`` occurrence that ends a line.

    Args:
        invoice_text: OCR text of the invoice.

    Returns:
        The amount as a float.

    Raises:
        ValueError: If no amount is found or it cannot be parsed as a number.
    """
    pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    match = _search_or_raise(
        pattern_afv + "|" + pattern_bfv, invoice_text, "total sum"
    )
    # The separator in the patterns is an unescaped ".", so a decimal comma
    # ("123,45") is matched as well; normalize it so float() accepts it.
    return float(match.group().replace(",", "."))


@app.post("/invoice")
async def create_file(file: bytes = File()):
    """Extract VAT id, seller name and total from an uploaded PDF invoice.

    Args:
        file: Raw bytes of the uploaded PDF.

    Returns:
        A dict with the keys ``vat_id``, ``seller_name`` and ``total``.

    Raises:
        HTTPException: With status 422 when a field cannot be extracted
            from the recognized text.
    """
    pdf_file = save_pdf(pdf_bytes=file)
    png_file = convert_pdf_to_png(file=pdf_file)
    invoice_text = ocr_png_file(image_name=png_file)
    try:
        vat_id = find_vat_id(invoice_text=invoice_text)
        seller_name = find_seller_name(invoice_text=invoice_text)
        total_sum = find_total_sum(invoice_text=invoice_text)
    except ValueError as err:
        # An unreadable invoice is a problem with the input, not the server.
        raise HTTPException(status_code=422, detail=str(err)) from err
    return {
        "vat_id": vat_id,
        "seller_name": seller_name,
        "total": total_sum,
    }


# Manual check against a local sample file:
#
# if __name__ == '__main__':
#     file = "train/CFV 1_05_2021.pdf"
#     png_file = convert_pdf_to_png(file)
#     invoice_text = ocr_png_file(image_name=png_file)
#     vat_id = find_vat_id(invoice_text=invoice_text)
#     seller_name = find_seller_name(invoice_text=invoice_text)
#     total_sum = find_total_sum(invoice_text=invoice_text)
#     json = {
#         "vat_id": vat_id,
#         "seller_name": seller_name,
#         "total": total_sum
#     }
#     print(json)