fix files
This commit is contained in:
parent
be43590153
commit
c7d11f6637
106
api.py
Normal file
106
api.py
Normal file
@ -0,0 +1,106 @@
|
||||
import cv2
|
||||
import pytesseract
|
||||
import fitz
|
||||
import re
|
||||
import uvicorn
|
||||
from tika import parser
|
||||
from fastapi import FastAPI, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
|
||||
# Target resolution, in dots per inch, used when rasterising a PDF page.
DPI = 300
# Scale factor handed to fitz.Matrix in convert_pdf_to_png.
# NOTE(review): 72 is presumably the PDF base resolution — confirm.
ZOOM = DPI / 72
|
||||
|
||||
def save_pdf(pdf_bytes):
    """Write the given PDF bytes to ``invoice.pdf`` and return that file name.

    The file is created relative to the current working directory and is
    overwritten on every call.
    """
    file_name = "invoice.pdf"
    # The context manager closes the handle even if the write raises.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name
|
||||
|
||||
|
||||
def convert_pdf_to_png(file):
    """Render the first page of a PDF to ``invoice.png`` and return that name.

    ``file`` is passed straight to ``fitz.open``. The page is scaled by the
    module-level ``ZOOM`` factor on both axes before being saved.
    """
    file_name = "invoice.png"
    magnify = fitz.Matrix(ZOOM, ZOOM)
    # Close the document once the page is rendered instead of leaking it.
    with fitz.open(file) as doc:
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    return file_name
|
||||
|
||||
|
||||
def ocr_png_file(image_name):
    """Run Tesseract OCR on an image file and return the recognised text.

    Raises:
        FileNotFoundError: if OpenCV cannot load ``image_name``.
    """
    image = cv2.imread(image_name)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_name}")
    custom_config = r'-l pol --oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)
|
||||
|
||||
|
||||
def find_vat_id(invoice_text):
    """Return the digits of the first ``NIP: <digits> `` entry in the text.

    The number must be followed by a single space, as in the original pattern.

    Raises:
        ValueError: if the text contains no such entry.
    """
    # Capture the whole digit run so no replace/strip post-processing is needed.
    match = re.search(r"NIP: ([0-9]+) ", invoice_text)
    if match is None:
        raise ValueError("NIP (VAT id) not found in invoice text")
    return match.group(1)
|
||||
|
||||
|
||||
def find_seller_name(invoice_text):
    """Extract the company name from the line following ``Nabywca:``.

    The name is the text on that line up to the last position followed by one
    of the legal-form markers in the lookahead. The result is stripped and
    always suffixed with ``" sp.z.o.o"``, whichever marker matched.

    Raises:
        ValueError: if no ``Nabywca:`` line with a legal-form marker is found.
    """
    # The dots in the lookahead are unescaped and therefore match any
    # character; this is kept so the set of accepted inputs does not change.
    match = re.search(r"Nabywca:\n(.*)(?=sp.|spółka|S.A.|S.K.A.)", invoice_text)
    if match is None:
        raise ValueError("Company name not found in invoice text")
    return match.group(1).strip() + " sp.z.o.o"
|
||||
|
||||
|
||||
def find_total_sum(invoice_text):
    """Return the invoice total as a float.

    The amount must be followed by ``" PLN"`` and a newline, and preceded by
    either ``"Wartość brutto "`` or ``"PLN "``.

    Raises:
        ValueError: if no amount is found, or if the matched text cannot be
            converted to a float.
    """
    pattern_afv = r"(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    pattern_bfv = r"(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    match = re.search(pattern_afv + "|" + pattern_bfv, invoice_text)
    if match is None:
        raise ValueError("Total amount not found in invoice text")
    # The pattern accepts any separator character; normalise a decimal comma
    # so float() can parse amounts such as "123,45".
    return float(match.group().replace(",", "."))
|
||||
|
||||
|
||||
# @app.post("/invoice")
|
||||
# async def create_file(file: bytes = File()):
|
||||
# pdf_file = save_pdf(pdf_bytes=file)
|
||||
# png_file = convert_pdf_to_png(file=pdf_file)
|
||||
# invoice_text = ocr_png_file(image_name=png_file)
|
||||
# vat_id = find_vat_id(invoice_text=invoice_text)
|
||||
# seller_name = find_seller_name(invoice_text=invoice_text)
|
||||
# total_sum = find_total_sum(invoice_text=invoice_text)
|
||||
# json = {
|
||||
# "vat_id": vat_id,
|
||||
# "seller_name": seller_name,
|
||||
# "total": total_sum
|
||||
# }
|
||||
# return json
|
||||
|
||||
app = FastAPI()

# Every origin may call this API.
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Credentialed requests must not be combined with wildcard origins,
    # methods, or headers, so cookie/authorization support is turned off.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
@app.post('/invoice')
async def root(file: UploadFile):
    """Extract the VAT id, company name, and total from an uploaded invoice.

    The upload is sent to Tika for text extraction, then the regex helpers
    pull the individual fields out of the text.

    Returns:
        A dict with the keys ``vat_id``, ``seller_name``, and ``total``.

    Raises:
        HTTPException: 422 if no text could be extracted or a required field
            is missing from the text.
    """
    # Imported locally so this handler does not depend on the module header.
    from fastapi import HTTPException

    # Use the async read instead of blocking on the underlying file object.
    parsed = parser.from_buffer(await file.read())
    # Tika returns a dict; the extracted text is under "content" and may be
    # None when nothing could be extracted.
    invoice_text = parsed.get("content")
    if not invoice_text:
        raise HTTPException(
            status_code=422,
            detail="No text could be extracted from the uploaded file",
        )

    try:
        vat_id = find_vat_id(invoice_text=invoice_text)
        seller_name = find_seller_name(invoice_text=invoice_text)
        total_sum = find_total_sum(invoice_text=invoice_text)
    except (AttributeError, ValueError) as exc:
        # The helpers fail with one of these when a pattern does not match.
        raise HTTPException(
            status_code=422,
            detail="Required invoice fields were not found",
        ) from exc

    return {
        "vat_id": vat_id,
        "seller_name": seller_name,
        "total": total_sum,
    }
|
||||
|
||||
if __name__ == "__main__":
    # Started as a script: serve the app on every interface, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
134
main.py
134
main.py
@ -1,90 +1,58 @@
|
||||
import cv2
|
||||
import pytesseract
|
||||
import fitz
|
||||
import re
|
||||
from fastapi import FastAPI, File
|
||||
from credentials import aws_access_key_id, aws_secret_access_key, aws_session_token, default_vpc
|
||||
import boto3
|
||||
|
||||
# Identifier used as the prefix of every AWS resource name created below.
INDEKS = "444018"
# Name of the EC2 SSH key pair.
key_name = f"{INDEKS}-aws-ssh-key"
# Name (also used as the description) of the EC2 security group.
security_group_name = f"{INDEKS}-security-group"
|
||||
|
||||
# Target resolution, in dots per inch, used when rasterising a PDF page.
DPI = 300
# Scale factor handed to fitz.Matrix in convert_pdf_to_png.
# NOTE(review): 72 is presumably the PDF base resolution — confirm.
ZOOM = DPI / 72

# ASGI application object the /invoice route is registered on.
app = FastAPI()
|
||||
# Shell script passed to the EC2 instance as user data: installs git and
# docker, clones the project, then builds and runs the image with container
# port 8000 published on host port 80.
# The shebang must be the very first bytes of the payload, so the literal
# starts directly with "#!" rather than with a blank line. It is a plain
# string because nothing is interpolated.
user_data = '''#!/bin/bash
sudo yum update -y
sudo yum install git -y
git clone https://git.wmi.amu.edu.pl/s444018/DPZC-5.git
cd DPZC-5
sudo yum install docker -y
sudo service docker start
sudo usermod -a -G docker ec2-user
sudo docker build -t invoice .
sudo docker run -d -p 80:8000 -t invoice
'''
|
||||
|
||||
def save_pdf(pdf_bytes):
    """Write the given PDF bytes to ``invoice.pdf`` and return that file name.

    The file is created relative to the current working directory and is
    overwritten on every call.
    """
    file_name = "invoice.pdf"
    # The context manager closes the handle even if the write raises.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name
|
||||
if __name__ == '__main__':
    # Credentials come from the local ``credentials`` module.
    session_credentials = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "aws_session_token": aws_session_token,
    }
    ec2 = boto3.resource('ec2', region_name='us-east-1', **session_credentials)

    # NOTE(review): the returned key material is not stored anywhere in the
    # visible code — confirm whether SSH access to the instance is needed.
    key_pair = ec2.create_key_pair(
        KeyName=key_name, KeyType='ed25519', KeyFormat='pem'
    )
|
||||
|
||||
def convert_pdf_to_png(file):
    """Render the first page of a PDF to ``invoice.png`` and return that name.

    ``file`` is passed straight to ``fitz.open``. The page is scaled by the
    module-level ``ZOOM`` factor on both axes before being saved.
    """
    file_name = "invoice.png"
    magnify = fitz.Matrix(ZOOM, ZOOM)
    # Close the document once the page is rendered instead of leaking it.
    with fitz.open(file) as doc:
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    return file_name
|
||||
if __name__ == '__main__':
    # Script-only provisioning step; it uses the ``ec2`` resource created
    # under the main guard, so it is guarded the same way.
    security_group = ec2.create_security_group(
        GroupName=security_group_name,
        Description=security_group_name,
        VpcId=default_vpc,
    )

    # Allow inbound TCP traffic on port 80 from any IPv4 address.
    http_rule = {
        "CidrIp": '0.0.0.0/0',
        "IpProtocol": 'tcp',
        "FromPort": 80,
        "ToPort": 80,
    }
    inbound_rules = security_group.authorize_ingress(
        GroupId=security_group.group_id, **http_rule
    )
|
||||
|
||||
def ocr_png_file(image_name):
    """Run Tesseract OCR on an image file and return the recognised text.

    Raises:
        FileNotFoundError: if OpenCV cannot load ``image_name``.
    """
    image = cv2.imread(image_name)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_name}")
    custom_config = r'-l pol --oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)
|
||||
|
||||
|
||||
def find_vat_id(invoice_text):
    """Return the digits of the first ``NIP: <digits> `` entry in the text.

    The number must be followed by a single space, as in the original pattern.

    Raises:
        ValueError: if the text contains no such entry.
    """
    # Capture the whole digit run so no replace/strip post-processing is needed.
    match = re.search(r"NIP: ([0-9]+) ", invoice_text)
    if match is None:
        raise ValueError("NIP (VAT id) not found in invoice text")
    return match.group(1)
|
||||
|
||||
|
||||
def find_seller_name(invoice_text):
    """Extract the company name from the line following ``Nabywca:``.

    The name is the text on that line up to the last position followed by one
    of the legal-form markers in the lookahead. The result is stripped and
    always suffixed with ``" sp.z.o.o"``, whichever marker matched.

    Raises:
        ValueError: if no ``Nabywca:`` line with a legal-form marker is found.
    """
    # The dots in the lookahead are unescaped and therefore match any
    # character; this is kept so the set of accepted inputs does not change.
    match = re.search(r"Nabywca:\n(.*)(?=sp.|spółka|S.A.|S.K.A.)", invoice_text)
    if match is None:
        raise ValueError("Company name not found in invoice text")
    return match.group(1).strip() + " sp.z.o.o"
|
||||
|
||||
|
||||
def find_total_sum(invoice_text):
    """Return the invoice total as a float.

    The amount must be followed by ``" PLN"`` and a newline, and preceded by
    either ``"Wartość brutto "`` or ``"PLN "``.

    Raises:
        ValueError: if no amount is found, or if the matched text cannot be
            converted to a float.
    """
    pattern_afv = r"(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    pattern_bfv = r"(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    match = re.search(pattern_afv + "|" + pattern_bfv, invoice_text)
    if match is None:
        raise ValueError("Total amount not found in invoice text")
    # The pattern accepts any separator character; normalise a decimal comma
    # so float() can parse amounts such as "123,45".
    return float(match.group().replace(",", "."))
|
||||
|
||||
|
||||
@app.post("/invoice")
async def create_file(file: bytes = File()):
    """Run OCR on an uploaded PDF invoice and return the extracted fields.

    The bytes are saved to disk, the first page is rendered to a PNG, the PNG
    is passed through OCR, and the regex helpers pull out the fields.

    Returns:
        A dict with the keys ``vat_id``, ``seller_name``, and ``total``.
    """
    # PDF bytes -> PDF file -> PNG file -> recognised text.
    invoice_text = ocr_png_file(
        image_name=convert_pdf_to_png(file=save_pdf(pdf_bytes=file))
    )
    return {
        "vat_id": find_vat_id(invoice_text=invoice_text),
        "seller_name": find_seller_name(invoice_text=invoice_text),
        "total": find_total_sum(invoice_text=invoice_text),
    }
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# file = "train/CFV 1_05_2021.pdf"
|
||||
# png_file = convert_pdf_to_png(file)
|
||||
# invoice_text = ocr_png_file(image_name=png_file)
|
||||
# vat_id = find_vat_id(invoice_text=invoice_text)
|
||||
# seller_name = find_seller_name(invoice_text=invoice_text)
|
||||
# total_sum = find_total_sum(invoice_text=invoice_text)
|
||||
# json = {
|
||||
# "vat_id": vat_id,
|
||||
# "seller_name": seller_name,
|
||||
# "total": total_sum
|
||||
# }
|
||||
# print(json)
|
||||
if __name__ == '__main__':
    # Script-only provisioning step; it uses ``ec2``, ``key_pair`` and
    # ``security_group`` created under the main guard, so it is guarded too.
    # Launch exactly one instance that runs ``user_data`` at boot.
    launch_options = {
        "ImageId": 'ami-0b5eea76982371e91',
        "InstanceType": 't2.micro',
        "MinCount": 1,
        "MaxCount": 1,
        "KeyName": key_pair.name,
        "UserData": user_data,
        "SecurityGroups": [security_group.group_name],
    }
    instance = ec2.create_instances(**launch_options)
|
Loading…
Reference in New Issue
Block a user