DPZC-5/api.py

106 lines
2.8 KiB
Python
Raw Normal View History

2023-01-30 10:46:34 +01:00
import re

import cv2
import fitz
import pytesseract
import uvicorn
from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from tika import parser
# Target rendering resolution, in dots per inch, for the PDF -> PNG conversion.
DPI = 300
# Scale factor handed to fitz.Matrix in convert_pdf_to_png.
# NOTE(review): 72 is presumably the PDF base resolution (points per inch),
# making DPI / 72 the magnification needed to reach DPI — confirm.
ZOOM = DPI / 72
def save_pdf(pdf_bytes, file_name="invoice.pdf"):
    """Write raw PDF bytes to disk and return the path that was written.

    Args:
        pdf_bytes: The PDF content as a bytes-like object.
        file_name: Destination path. Defaults to ``"invoice.pdf"`` in the
            current working directory, which is the name this function
            always used before the parameter existed. Pass a unique path
            when several invoices may be processed at the same time.

    Returns:
        The ``file_name`` that was written.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even when write() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name
def convert_pdf_to_png(file):
    """Render the first page of a PDF to ``invoice.png``.

    The page is rendered through a ``fitz.Matrix(ZOOM, ZOOM)`` scaling
    matrix and saved under a fixed file name.

    Args:
        file: The PDF to open; passed straight to ``fitz.open``.

    Returns:
        The name of the PNG file written (always ``"invoice.png"``).
    """
    file_name = "invoice.png"
    magnify = fitz.Matrix(ZOOM, ZOOM)
    # Open the document as a context manager so it is closed again even if
    # rendering or saving fails; it was previously left open.
    with fitz.open(file) as doc:
        # Only page 0 is rendered; any further pages are ignored.
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    return file_name
def ocr_png_file(image_name):
    """Run OCR on an image file and return the recognised text.

    The image is loaded with ``cv2.imread`` and handed to
    ``pytesseract.image_to_string`` together with a fixed option string.

    Args:
        image_name: Path of the image file to read.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Option string passed to Tesseract unchanged.
    # NOTE(review): "-l pol" presumably selects the Polish language data,
    # which would have to be installed — confirm.
    tesseract_options = r'-l pol --oem 3 --psm 6'
    # NOTE(review): the result of cv2.imread is not checked before use;
    # confirm what happens for a missing or unreadable file.
    return pytesseract.image_to_string(
        cv2.imread(image_name),
        config=tesseract_options,
    )
def find_vat_id(invoice_text):
    """Return the digits of the first ``NIP: <digits>`` entry in the text.

    Args:
        invoice_text: Plain text of the invoice.

    Returns:
        The run of digits that follows the first ``"NIP: "`` label.

    Raises:
        ValueError: If the text contains no ``"NIP: "`` label followed by
            digits and a space.
    """
    # NOTE(review): the trailing literal space is required, so a number that
    # is immediately followed by a newline or the end of the text is not
    # matched — confirm this is intended for the invoices being parsed.
    result = re.search(r"NIP: ([0-9]+) ", invoice_text)
    if result is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no 'NIP: <digits> ' entry found in invoice text")
    return result.group(1)
def find_seller_name(invoice_text):
    """Return the company name found on the line after ``Nabywca:``.

    The name is the text between the ``"Nabywca:\\n"`` label and the last
    legal-form marker on that line (``sp.``, ``spółka``, ``S.A.``,
    ``S.K.A.``), with ``" sp.z.o.o"`` appended.

    Args:
        invoice_text: Plain text of the invoice.

    Returns:
        The stripped company name followed by ``" sp.z.o.o"``.

    Raises:
        ValueError: If no ``Nabywca:`` line with a legal-form marker is found.
    """
    # NOTE(review): the dots in the pattern are unescaped, so "sp." also
    # matches "sp" followed by any character — confirm whether that
    # leniency is wanted before tightening it.
    pattern = "Nabywca:\n.*((?=sp.|spółka|S.A.|S.K.A.))"
    result = re.search(pattern, invoice_text)
    if result is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no 'Nabywca:' company line found in invoice text")
    seller_name = result.group().replace("Nabywca:\n", "")
    # NOTE(review): the suffix is hard-coded regardless of which legal form
    # matched, and "Nabywca" presumably labels the buyer rather than the
    # seller — verify against the invoice layout.
    return seller_name.strip() + " sp.z.o.o"
def find_total_sum(invoice_text):
    """Return the gross total of the invoice as a float.

    Two layouts are recognised: an amount preceded by ``"Wartość brutto "``
    and an amount preceded by ``"PLN "``. In both cases the amount must be
    followed by ``" PLN"`` and a newline.

    Args:
        invoice_text: Plain text of the invoice.

    Returns:
        The first matching amount converted to ``float``.

    Raises:
        ValueError: If no amount is found, or the matched text is not a
            valid number.
    """
    pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    result = re.search(pattern_afv + "|" + pattern_bfv, invoice_text)
    if result is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no total amount found in invoice text")
    # The unescaped "." in the patterns also matches a decimal comma
    # ("123,45"), which float() rejects; normalise it to a dot first.
    return float(result.group().replace(",", "."))
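# Disabled earlier variant of the /invoice endpoint: it saved the upload as a
# PDF, rendered it to PNG and ran OCR on it instead of using tika.
# NOTE: it cannot simply be uncommented — `File` is not imported from fastapi
# and `app` is only created further down in this file.
# @app.post("/invoice")
# async def create_file(file: bytes = File()):
#     pdf_file = save_pdf(pdf_bytes=file)
#     png_file = convert_pdf_to_png(file=pdf_file)
#     invoice_text = ocr_png_file(image_name=png_file)
#     vat_id = find_vat_id(invoice_text=invoice_text)
#     seller_name = find_seller_name(invoice_text=invoice_text)
#     total_sum = find_total_sum(invoice_text=invoice_text)
#     json = {
#         "vat_id": vat_id,
#         "seller_name": seller_name,
#         "total": total_sum
#     }
#     return json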
# Application object; the /invoice route below is registered on it.
app = FastAPI()
# Origins handed to the CORS middleware; "*" is a wildcard entry.
# NOTE(review): wildcard origins together with allow_credentials=True is
# discouraged by the FastAPI CORS documentation — confirm whether
# credentials are needed, or list the allowed origins explicitly.
origins = ["*"]
# Install the CORS middleware with wildcard methods and headers as well.
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.post('/invoice')
async def root(file: UploadFile):
    """Extract the VAT id, seller name and total from an uploaded invoice.

    The upload is sent to tika for text extraction and the extracted text
    is searched by ``find_vat_id``, ``find_seller_name`` and
    ``find_total_sum``.

    Args:
        file: The uploaded invoice document.

    Returns:
        A dict with the keys ``"vat_id"``, ``"seller_name"`` and ``"total"``.

    Raises:
        HTTPException: With status 422 if no text could be extracted from
            the upload, or if one of the fields could not be located.
    """
    payload = await file.read()
    # parser.from_buffer returns a dict rather than a string; the extracted
    # text is stored under "content" and is absent or None when nothing
    # could be extracted. Passing the dict to re.search raised TypeError.
    # NOTE(review): this call looks blocking inside an async handler —
    # confirm whether it should run in a worker thread.
    parsed = parser.from_buffer(payload)
    invoice_text = parsed.get("content")
    if not invoice_text:
        raise HTTPException(
            status_code=422,
            detail="No text could be extracted from the uploaded file",
        )
    try:
        vat_id = find_vat_id(invoice_text=invoice_text)
        seller_name = find_seller_name(invoice_text=invoice_text)
        total_sum = find_total_sum(invoice_text=invoice_text)
    except (AttributeError, ValueError) as err:
        # The find_* helpers raise AttributeError or ValueError when a field
        # is missing or malformed; report that as a client error, not a 500.
        raise HTTPException(
            status_code=422,
            detail="Could not locate all invoice fields in the extracted text",
        ) from err
    return {
        "vat_id": vat_id,
        "seller_name": seller_name,
        "total": total_sum,
    }
if __name__ == "__main__":
    # Start the server only when this file is executed directly, not when it
    # is imported (for example by an external ASGI runner).
    uvicorn.run(app, port=8000, host="0.0.0.0")