From c7d11f6637f7ed49a896fba904a34ed05aeefd86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20Parafin=CC=81ski?= Date: Mon, 30 Jan 2023 10:46:34 +0100 Subject: [PATCH] fix files --- api.py | 106 ++++++++++++++++++++++++++++++++++++++++++++ main.py | 134 +++++++++++++++++++++----------------------------------- 2 files changed, 157 insertions(+), 83 deletions(-) create mode 100644 api.py diff --git a/api.py b/api.py new file mode 100644 index 0000000..25efa9a --- /dev/null +++ b/api.py @@ -0,0 +1,106 @@ +import cv2 +import pytesseract +import fitz +import re +import uvicorn +from tika import parser +from fastapi import FastAPI, UploadFile +from fastapi.middleware.cors import CORSMiddleware + + +DPI = 300 +ZOOM = DPI / 72 + +def save_pdf(pdf_bytes): + file_name = "invoice.pdf" + binary_file = open(file_name, "wb") + binary_file.write(pdf_bytes) + binary_file.close() + return file_name + + +def convert_pdf_to_png(file): + file_name = "invoice.png" + magnify = fitz.Matrix(ZOOM, ZOOM) + doc = fitz.open(file) + pix = doc[0].get_pixmap(matrix=magnify) + pix.save(file_name) + return file_name + + +def ocr_png_file(image_name): + image = cv2.imread(image_name) + custom_config = r'-l pol --oem 3 --psm 6' + ocr_string = pytesseract.image_to_string(image, config=custom_config) + return ocr_string + + +def find_vat_id(invoice_text): + pattern = "NIP: ([0-9])+ " + result = re.search(pattern, invoice_text) + vat_id = result.group() + vat_id = vat_id.replace("NIP: ", "") + return vat_id.strip() + + +def find_seller_name(invoice_text): + pattern = "Nabywca:\n.*((?=sp.|spółka|S.A.|S.K.A.))" + result = re.search(pattern, invoice_text) + seller_name = result.group() + seller_name = seller_name.replace("Nabywca:\n", "") + return seller_name.strip() + " sp.z.o.o" + + +def find_total_sum(invoice_text): + pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)" + pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)" + regex_pattern = pattern_afv + "|" + pattern_bfv + result = re.search(regex_pattern, invoice_text) + total_sum = result.group() + total_sum = total_sum.replace("Do zapłaty:", "") + sum_string = total_sum.strip() + return float(sum_string) + + +# @app.post("/invoice") +# async def create_file(file: bytes = File()): +# pdf_file = save_pdf(pdf_bytes=file) +# png_file = convert_pdf_to_png(file=pdf_file) +# invoice_text = ocr_png_file(image_name=png_file) +# vat_id = find_vat_id(invoice_text=invoice_text) +# seller_name = find_seller_name(invoice_text=invoice_text) +# total_sum = find_total_sum(invoice_text=invoice_text) +# json = { +# "vat_id": vat_id, +# "seller_name": seller_name, +# "total": total_sum +# } +# return json + +app = FastAPI() + +origins = ["*"] + +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@app.post('/invoice') +async def root(file: UploadFile): + invoice_text = parser.from_buffer(file.file.read()) + vat_id = find_vat_id(invoice_text=invoice_text) + seller_name = find_seller_name(invoice_text=invoice_text) + total_sum = find_total_sum(invoice_text=invoice_text) + json = { + "vat_id": vat_id, + "seller_name": seller_name, + "total": total_sum + } + return json + +if __name__ == '__main__': + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/main.py b/main.py index 9782fa5..204db71 100644 --- a/main.py +++ b/main.py @@ -1,90 +1,58 @@ -import cv2 -import pytesseract -import fitz -import re -from fastapi import FastAPI, File +from credentials import aws_access_key_id, aws_secret_access_key, aws_session_token, default_vpc +import boto3 +INDEKS = "444018" +key_name = f"{INDEKS}-aws-ssh-key" +security_group_name = f"{INDEKS}-security-group" -DPI = 300 -ZOOM = DPI / 72 -app = FastAPI() +user_data = f''' +#!/bin/bash +sudo yum update -y +sudo yum install git -y +git clone https://git.wmi.amu.edu.pl/s444018/DPZC-5.git +cd DPZC-5 +sudo yum install docker -y +sudo service docker start +sudo usermod -a -G docker ec2-user +sudo docker build -t invoice . +sudo docker run -d -p 80:8000 -t invoice +''' -def save_pdf(pdf_bytes): - file_name = "invoice.pdf" - binary_file = open(file_name, "wb") - binary_file.write(pdf_bytes) - binary_file.close() - return file_name +if __name__ == '__main__': + ec2 = boto3.resource( + 'ec2', + region_name='us-east-1', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_session_token=aws_session_token, + ) + key_pair = ec2.create_key_pair( + KeyName=key_name, + KeyType='ed25519', + KeyFormat='pem', + ) -def convert_pdf_to_png(file): - file_name = "invoice.png" - magnify = fitz.Matrix(ZOOM, ZOOM) - doc = fitz.open(file) - pix = doc[0].get_pixmap(matrix=magnify) - pix.save(file_name) - return file_name + security_group = ec2.create_security_group( + Description=security_group_name, + GroupName=security_group_name, + VpcId=default_vpc, + ) + inbound_rules = security_group.authorize_ingress( + GroupId=security_group.group_id, + CidrIp='0.0.0.0/0', + IpProtocol='tcp', + FromPort=80, + ToPort=80, + ) -def ocr_png_file(image_name): - image = cv2.imread(image_name) - custom_config = r'-l pol --oem 3 --psm 6' - ocr_string = pytesseract.image_to_string(image, config=custom_config) - return ocr_string - - -def find_vat_id(invoice_text): - pattern = "NIP: ([0-9])+ " - result = re.search(pattern, invoice_text) - vat_id = result.group() - vat_id = vat_id.replace("NIP: ", "") - return vat_id.strip() - - -def find_seller_name(invoice_text): - pattern = "Nabywca:\n.*((?=sp.|spółka|S.A.|S.K.A.))" - result = re.search(pattern, invoice_text) - seller_name = result.group() - seller_name = seller_name.replace("Nabywca:\n", "") - return seller_name.strip() + " sp.z.o.o" - - -def find_total_sum(invoice_text): - pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)" - pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)" - regex_pattern = pattern_afv + "|" + pattern_bfv - result = re.search(regex_pattern, invoice_text) - total_sum = result.group() - total_sum = total_sum.replace("Do zapłaty:", "") - sum_string = total_sum.strip() - return float(sum_string) - - -@app.post("/invoice") -async def create_file(file: bytes = File()): - pdf_file = save_pdf(pdf_bytes=file) - png_file = convert_pdf_to_png(file=pdf_file) - invoice_text = ocr_png_file(image_name=png_file) - vat_id = find_vat_id(invoice_text=invoice_text) - seller_name = find_seller_name(invoice_text=invoice_text) - total_sum = find_total_sum(invoice_text=invoice_text) - json = { - "vat_id": vat_id, - "seller_name": seller_name, - "total": total_sum - } - return json - -# if __name__ == '__main__': -# file = "train/CFV 1_05_2021.pdf" -# png_file = convert_pdf_to_png(file) -# invoice_text = ocr_png_file(image_name=png_file) -# vat_id = find_vat_id(invoice_text=invoice_text) -# seller_name = find_seller_name(invoice_text=invoice_text) -# total_sum = find_total_sum(invoice_text=invoice_text) -# json = { -# "vat_id": vat_id, -# "seller_name": seller_name, -# "total": total_sum -# } -# print(json) \ No newline at end of file + instance = ec2.create_instances( + ImageId='ami-0b5eea76982371e91', + MinCount=1, + MaxCount=1, + InstanceType='t2.micro', + KeyName=key_pair.name, + UserData=user_data, + SecurityGroups=[security_group.group_name], + ) \ No newline at end of file