"""Invoice OCR service.

Accepts a PDF invoice over HTTP, renders its first page to a PNG, runs
Tesseract OCR (Polish language data) on the image and extracts three fields
from the recognised text with regular expressions: the VAT ID ("NIP"), a
company name and the gross total in PLN.
"""
# Provenance: reconstructed from the patch "Add main.py"
# (commit 0936802758e79372f1482f55a5268a2b2bcd44bb,
#  Szymon Parafiński, Sun, 22 Jan 2023 17:33:49 +0100).

import re
import tempfile
from pathlib import Path

import cv2
import fitz
import pytesseract
from fastapi import FastAPI, File, HTTPException
from fastapi.concurrency import run_in_threadpool

DPI = 300
# PyMuPDF renders at 72 dpi by default, so scaling by DPI / 72 yields a
# pixmap at the requested resolution.
ZOOM = DPI / 72
app = FastAPI()

# Tesseract options: Polish language data, default OCR engine mode,
# page segmentation mode 6.
_OCR_CONFIG = r'-l pol --oem 3 --psm 6'

# "NIP: " followed by a run of digits that ends at whitespace or end of text.
_VAT_ID_RE = re.compile(r"NIP: ([0-9]+)(?=\s|$)")

# Text on the line after "Nabywca:", up to the last spot on that line where a
# legal-form marker starts.
# NOTE(review): the dots in the lookahead are unescaped, so "sp." also matches
# "sp" followed by any character; kept as-is to preserve existing matches.
_SELLER_RE = re.compile(r"Nabywca:\n(.*)(?=sp.|spółka|S.A.|S.K.A.)")

# Amount directly before " PLN" at end of line, preceded either by
# "Wartość brutto " or by "PLN ".  The separator position accepts a digit,
# "." or "," so that plain integers (3+ digits), dot decimals and comma
# decimals are all recognised.
_TOTAL_RE = re.compile(
    "(?<=Wartość brutto )[0-9]+[0-9.,][0-9]+(?= PLN\n)"
    "|(?<=PLN )[0-9]+[0-9.,][0-9]+(?= PLN\n)"
)


class InvoiceParseError(ValueError):
    """Raised when an expected field cannot be located in the invoice."""


def save_pdf(pdf_bytes, file_name="invoice.pdf"):
    """Write raw PDF bytes to disk.

    Args:
        pdf_bytes: Content of the PDF file.
        file_name: Destination path; defaults to ``invoice.pdf`` in the
            current working directory.

    Returns:
        The path the bytes were written to (``file_name``).
    """
    # The context manager guarantees the handle is closed even if the write
    # fails part-way.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name


def convert_pdf_to_png(file, file_name="invoice.png"):
    """Render the first page of a PDF to a PNG image at ``DPI`` resolution.

    Args:
        file: Path of the PDF to render.
        file_name: Destination path of the PNG; defaults to ``invoice.png``
            in the current working directory.

    Returns:
        The path of the written PNG (``file_name``).

    Raises:
        InvoiceParseError: If the document contains no pages.
    """
    magnify = fitz.Matrix(ZOOM, ZOOM)
    doc = fitz.open(file)
    try:
        if len(doc) < 1:
            raise InvoiceParseError("PDF document contains no pages")
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    finally:
        # Release the underlying file handle regardless of rendering outcome.
        doc.close()
    return file_name


def ocr_png_file(image_name):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_name: Path of the image to read.

    Returns:
        The text recognised by Tesseract.

    Raises:
        ValueError: If OpenCV cannot read the image.
    """
    image = cv2.imread(image_name)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"could not read image: {image_name!r}")
    return pytesseract.image_to_string(image, config=_OCR_CONFIG)


def find_vat_id(invoice_text):
    """Return the digits of the first ``NIP: <digits>`` entry in the text.

    Args:
        invoice_text: OCR output of the invoice.

    Returns:
        The VAT ID as a string of digits.

    Raises:
        InvoiceParseError: If no ``NIP: <digits>`` entry is present.
    """
    match = _VAT_ID_RE.search(invoice_text)
    if match is None:
        raise InvoiceParseError("VAT ID (NIP) not found in invoice text")
    return match.group(1)


def find_seller_name(invoice_text):
    """Return the company name found on the line after ``Nabywca:``.

    The name is cut off where the legal-form marker begins and the fixed
    suffix `` sp.z.o.o`` is appended.

    NOTE(review): "Nabywca" is Polish for "buyer" - confirm this label really
    marks the party that should be reported as the seller.
    NOTE(review): the suffix is appended even when the matched marker is
    "S.A." or "S.K.A."; left unchanged because API consumers may depend on
    the current output format.

    Args:
        invoice_text: OCR output of the invoice.

    Returns:
        The extracted name followed by `` sp.z.o.o``.

    Raises:
        InvoiceParseError: If the ``Nabywca:`` section cannot be matched.
    """
    match = _SELLER_RE.search(invoice_text)
    if match is None:
        raise InvoiceParseError("company name not found in invoice text")
    return match.group(1).strip() + " sp.z.o.o"


def find_total_sum(invoice_text):
    """Return the gross total of the invoice in PLN.

    Both ``1234.56`` and ``1234,56`` notations are accepted.

    Args:
        invoice_text: OCR output of the invoice.

    Returns:
        The total as a float.

    Raises:
        InvoiceParseError: If no amount is found or it is not a valid number.
    """
    match = _TOTAL_RE.search(invoice_text)
    if match is None:
        raise InvoiceParseError("total amount not found in invoice text")
    # Polish invoices commonly use a decimal comma, which float() rejects.
    amount = match.group().replace(",", ".")
    try:
        return float(amount)
    except ValueError as err:
        raise InvoiceParseError(f"invalid total amount: {amount!r}") from err


def _extract_invoice_fields(pdf_bytes):
    """Run the blocking PDF -> PNG -> OCR -> regex pipeline for one upload."""
    # A private scratch directory keeps concurrent requests from overwriting
    # each other's files and removes the uploaded document afterwards.
    with tempfile.TemporaryDirectory() as work_dir:
        pdf_path = str(Path(work_dir) / "invoice.pdf")
        png_path = str(Path(work_dir) / "invoice.png")
        pdf_file = save_pdf(pdf_bytes=pdf_bytes, file_name=pdf_path)
        png_file = convert_pdf_to_png(file=pdf_file, file_name=png_path)
        invoice_text = ocr_png_file(image_name=png_file)
    return {
        "vat_id": find_vat_id(invoice_text=invoice_text),
        "seller_name": find_seller_name(invoice_text=invoice_text),
        "total": find_total_sum(invoice_text=invoice_text),
    }


@app.post("/invoice")
async def create_file(file: bytes = File()):
    """Extract VAT ID, company name and total from an uploaded PDF invoice.

    Args:
        file: Raw bytes of the uploaded PDF.

    Returns:
        A dict with the keys ``vat_id``, ``seller_name`` and ``total``.

    Raises:
        HTTPException: 422 when a required field cannot be extracted.
    """
    try:
        # Rendering and OCR are blocking; run them off the event loop.
        return await run_in_threadpool(_extract_invoice_fields, file)
    except InvoiceParseError as err:
        raise HTTPException(status_code=422, detail=str(err)) from err


# Manual smoke test (disabled):
# if __name__ == '__main__':
#     file = "train/CFV 1_05_2021.pdf"
#     png_file = convert_pdf_to_png(file)
#     invoice_text = ocr_png_file(image_name=png_file)
#     vat_id = find_vat_id(invoice_text=invoice_text)
#     seller_name = find_seller_name(invoice_text=invoice_text)
#     total_sum = find_total_sum(invoice_text=invoice_text)
#     json = {
#         "vat_id": vat_id,
#         "seller_name": seller_name,
#         "total": total_sum
#     }
#     print(json)