90 lines
2.6 KiB
Python
90 lines
2.6 KiB
Python
|
import cv2
|
||
|
import pytesseract
|
||
|
import fitz
|
||
|
import re
|
||
|
from fastapi import FastAPI, File
|
||
|
|
||
|
|
||
|
# Target raster resolution in dots per inch; ZOOM is derived from it.
DPI = 300
|
||
|
# Scale factor used on both axes of the fitz.Matrix in convert_pdf_to_png.
# NOTE(review): 72 is presumably the PDF default of 72 units per inch - confirm.
ZOOM = DPI / 72
|
||
|
# Application instance on which the POST /invoice route is registered.
app = FastAPI()
|
||
|
|
||
|
def save_pdf(pdf_bytes):
    """Write raw PDF bytes to ``invoice.pdf`` in the working directory.

    Any existing file with that name is overwritten.

    Args:
        pdf_bytes: Bytes-like content to write.

    Returns:
        The name of the file written (always ``"invoice.pdf"``).

    Raises:
        OSError: If the file cannot be opened or written.
    """
    file_name = "invoice.pdf"
    # The context manager closes the handle even when write() raises.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name
|
||
|
|
||
|
|
||
|
def convert_pdf_to_png(file):
    """Render the first page of a PDF to ``invoice.png``.

    The page is rasterised with a ``fitz.Matrix(ZOOM, ZOOM)`` scale and the
    result is saved in the working directory, overwriting any existing file.

    Args:
        file: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        The name of the image written (always ``"invoice.png"``).
    """
    file_name = "invoice.png"
    magnify = fitz.Matrix(ZOOM, ZOOM)
    # Open the document in a context manager so it is closed (and the
    # underlying file released) even if rendering or saving fails.
    with fitz.open(file) as doc:
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    return file_name
|
||
|
|
||
|
|
||
|
def ocr_png_file(image_name):
    """Run Tesseract OCR over an image file and return the recognised text.

    The image is loaded with ``cv2.imread`` and handed to
    ``pytesseract.image_to_string`` with language ``pol``, OCR engine mode 3
    and page segmentation mode 6.

    Args:
        image_name: Path of the image to read.

    Returns:
        The text produced by Tesseract.
    """
    tesseract_options = r'-l pol --oem 3 --psm 6'
    pixels = cv2.imread(image_name)
    return pytesseract.image_to_string(pixels, config=tesseract_options)
|
||
|
|
||
|
|
||
|
def find_vat_id(invoice_text):
    """Extract the digits that follow ``"NIP: "`` in invoice text.

    The first run of ASCII digits that is preceded by ``"NIP: "`` and
    followed by a single space is returned.

    Args:
        invoice_text: Text of the invoice to search.

    Returns:
        The matched digits as a string.

    Raises:
        ValueError: If the pattern does not occur in ``invoice_text``.
    """
    # Capture the whole digit run in one group; the space after the digits
    # is part of the pattern, so an id at the very end of a line is not matched.
    result = re.search(r"NIP: ([0-9]+) ", invoice_text)
    if result is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError("VAT id (NIP) not found in invoice text")
    return result.group(1)
|
||
|
|
||
|
|
||
|
def find_seller_name(invoice_text):
    """Extract a company name from the line after ``"Nabywca:"``.

    The text between the ``"Nabywca:"`` header line and the last position on
    the next line where a legal-form token starts is taken, stripped, and
    returned with ``" sp.z.o.o"`` appended.

    NOTE(review): "Nabywca" is Polish for "buyer"; confirm this is the party
    meant by "seller". The suffix is appended regardless of which legal-form
    token matched.

    Args:
        invoice_text: Text of the invoice to search.

    Returns:
        The extracted name followed by ``" sp.z.o.o"``.

    Raises:
        ValueError: If the pattern does not occur in ``invoice_text``.
    """
    header = "Nabywca:\n"
    # The dots in the look-ahead are regex wildcards (e.g. "sp." also matches
    # "sp "); the expression is deliberately kept as-is since OCR output may
    # depend on that leniency.
    pattern = header + ".*((?=sp.|spółka|S.A.|S.K.A.))"
    result = re.search(pattern, invoice_text)
    if result is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError("seller name not found in invoice text")
    # The match always starts with the header, so slice it off.
    seller_name = result.group()[len(header):]
    return seller_name.strip() + " sp.z.o.o"
|
||
|
|
||
|
|
||
|
def find_total_sum(invoice_text):
    """Extract the gross total amount from invoice text.

    An amount is recognised when it is followed by ``" PLN"`` and a newline
    and preceded by either ``"Wartość brutto "`` or ``"PLN "``. The first
    such amount is returned.

    Args:
        invoice_text: Text of the invoice to search.

    Returns:
        The amount as a ``float``.

    Raises:
        ValueError: If no amount is found, or the matched text cannot be
            converted to a number.
    """
    pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    regex_pattern = pattern_afv + "|" + pattern_bfv
    result = re.search(regex_pattern, invoice_text)
    if result is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError("total amount not found in invoice text")
    # The separator in the pattern is a wildcard, so a comma decimal
    # separator ("123,45") is matched too; normalise it for float().
    sum_string = result.group().strip().replace(",", ".")
    return float(sum_string)
|
||
|
|
||
|
|
||
|
@app.post("/invoice")
async def create_file(file: bytes = File()):
    """Extract the VAT id, seller name and total from an uploaded invoice.

    The uploaded bytes are saved as a PDF, the first page is rendered to an
    image, the image is OCR'd, and the three fields are pulled out of the
    resulting text.

    Args:
        file: Raw bytes of the uploaded file.

    Returns:
        A dict with the keys ``"vat_id"``, ``"seller_name"`` and ``"total"``.
    """
    stored_pdf = save_pdf(pdf_bytes=file)
    rendered_png = convert_pdf_to_png(file=stored_pdf)
    text = ocr_png_file(image_name=rendered_png)
    # Dict entries are evaluated top to bottom, so the extractors run in
    # the order: VAT id, seller name, total.
    return {
        "vat_id": find_vat_id(invoice_text=text),
        "seller_name": find_seller_name(invoice_text=text),
        "total": find_total_sum(invoice_text=text),
    }
|
||
|
|
||
|
# if __name__ == '__main__':
|
||
|
# file = "train/CFV 1_05_2021.pdf"
|
||
|
# png_file = convert_pdf_to_png(file)
|
||
|
# invoice_text = ocr_png_file(image_name=png_file)
|
||
|
# vat_id = find_vat_id(invoice_text=invoice_text)
|
||
|
# seller_name = find_seller_name(invoice_text=invoice_text)
|
||
|
# total_sum = find_total_sum(invoice_text=invoice_text)
|
||
|
# json = {
|
||
|
# "vat_id": vat_id,
|
||
|
# "seller_name": seller_name,
|
||
|
# "total": total_sum
|
||
|
# }
|
||
|
# print(json)
|