Overwrite old files

This commit is contained in:
Szymon Parafiński 2023-01-30 15:49:19 +01:00
parent c7d11f6637
commit fbc4ef18c3
4 changed files with 104 additions and 139 deletions

103
api.py
View File

@ -1,81 +1,23 @@
import cv2
import pytesseract
import fitz
import re
import uvicorn
from tika import parser
from fastapi import FastAPI, UploadFile from fastapi import FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from tika import parser
import uvicorn
import re
def parse(pdf):
    """Extract seller VAT id, seller line and total from an invoice PDF.

    The PDF is handed to Tika via ``parser.from_buffer``; the extracted text
    is split into non-empty lines and scanned for three Polish markers:
    a line containing ``Razem`` (its last decimal number is the total) and
    the ``Sprzedawca:`` / ``Nabywca:`` lines that delimit the seller section.

    Args:
        pdf: Raw PDF bytes (whatever ``parser.from_buffer`` accepts).

    Returns:
        dict with keys ``'vat_id'`` (last line of the seller section with
        the ``'NIP: '`` prefix removed), ``'address'`` (first line after
        ``Sprzedawca:``) and ``'total'`` (float).

    Raises:
        ValueError: if the text lacks a ``Razem`` line with a decimal
            number, lacks the section markers, or the seller section is
            empty.
    """
    extracted = parser.from_buffer(pdf)
    # NOTE(review): Tika is reported to return None as 'content' for PDFs
    # without a text layer - treat that as empty text instead of crashing
    # on None.split(); confirm against the tika-python version in use.
    text = extracted['content'] or ''
    lines = [line for line in text.split('\n') if line != '']

    total_lines = [line for line in lines if 'Razem' in line]
    if not total_lines:
        raise ValueError("invoice text has no 'Razem' (total) line")
    amounts = re.findall(r'\d+[.]\d+', total_lines[0])
    if not amounts:
        raise ValueError("'Razem' line contains no decimal amount")
    # The last number on the line is taken as the total.
    total = float(amounts[-1])

    try:
        start = lines.index('Sprzedawca:')
        end = lines.index('Nabywca:')
    except ValueError as err:
        raise ValueError(
            "invoice text has no 'Sprzedawca:' / 'Nabywca:' section markers"
        ) from err

    seller_section = lines[start:end]
    # Element 0 is the 'Sprzedawca:' marker itself; at least one more line
    # is needed to have a seller entry at all.
    if len(seller_section) < 2:
        raise ValueError("seller section between the markers is empty")

    return {
        'vat_id': seller_section[-1].replace('NIP: ', ''),
        'address': seller_section[1],
        'total': total,
    }
# Target raster resolution, in dots per inch, for rendering a PDF page.
DPI = 300
# Scale factor passed to fitz.Matrix in convert_pdf_to_png.
# NOTE(review): 72 is presumably PyMuPDF's default render resolution
# (PDF points per inch) - confirm against the PyMuPDF docs.
ZOOM = DPI / 72
def save_pdf(pdf_bytes):
    """Write *pdf_bytes* to ``invoice.pdf`` in the current directory.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The name of the written file (always ``"invoice.pdf"``).

    Note:
        The file name is fixed, so every call overwrites the previous file.
    """
    file_name = "invoice.pdf"
    # The context manager closes the handle even if write() raises, which
    # the manual open()/close() pair did not guarantee.
    with open(file_name, "wb") as binary_file:
        binary_file.write(pdf_bytes)
    return file_name
def convert_pdf_to_png(file):
    """Render the first page of a PDF to ``invoice.png``.

    Args:
        file: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        The name of the written image (always ``"invoice.png"``).

    Note:
        Only page 0 is rendered, scaled by the module-level ``ZOOM`` on
        both axes. The output name is fixed, so each call overwrites the
        previous image.
    """
    file_name = "invoice.png"
    magnify = fitz.Matrix(ZOOM, ZOOM)
    # Open the document as a context manager so its handle is released
    # once the page has been rendered; the original never closed it.
    with fitz.open(file) as doc:
        pix = doc[0].get_pixmap(matrix=magnify)
        pix.save(file_name)
    return file_name
def ocr_png_file(image_name):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_name: Path of the image, passed to ``cv2.imread``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Raises:
        FileNotFoundError: if ``cv2.imread`` could not load the image.
    """
    image = cv2.imread(image_name)
    # cv2.imread signals a missing or unreadable file by returning None
    # rather than raising; fail here with a clear message instead of
    # letting pytesseract choke on None.
    if image is None:
        raise FileNotFoundError(f"cannot read image file: {image_name!r}")
    # Options are forwarded verbatim to Tesseract.
    custom_config = r'-l pol --oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)
def find_vat_id(invoice_text):
    """Return the first NIP (VAT id) digit run found in *invoice_text*.

    Args:
        invoice_text: Text of the invoice.

    Returns:
        The digits following the first ``NIP:`` label, as a string.

    Raises:
        ValueError: if no ``NIP:`` label followed by digits is present.
    """
    # The previous pattern demanded a literal space after the digits, so a
    # NIP that ended its line (followed by "\n") was never matched.
    result = re.search(r"NIP:\s*([0-9]+)", invoice_text)
    if result is None:
        raise ValueError("no 'NIP:' number found in invoice text")
    return result.group(1)
def find_seller_name(invoice_text):
    """Return the company name on the line following ``Nabywca:``.

    The name is the text between ``Nabywca:\\n`` and the last position on
    that line where a legal-form marker from the pattern begins. The
    result always gets ``" sp.z.o.o"`` appended, whichever marker matched.

    Args:
        invoice_text: Text of the invoice.

    Returns:
        The stripped name followed by ``" sp.z.o.o"``.

    Raises:
        ValueError: if the pattern does not match.

    NOTE(review): "Nabywca" is Polish for "buyer"; confirm that this is
    the intended anchor for a *seller* name. The dots in the pattern are
    unescaped and therefore match any character; left unchanged because
    the effect on real OCR output cannot be judged from here.
    """
    prefix = "Nabywca:\n"
    pattern = "Nabywca:\n.*((?=sp.|spółka|S.A.|S.K.A.))"
    result = re.search(pattern, invoice_text)
    if result is None:
        raise ValueError("no company name found after 'Nabywca:'")
    # The match always starts with the prefix and cannot contain another
    # newline, so slicing it off equals the former replace() call.
    seller_name = result.group()[len(prefix):]
    return seller_name.strip() + " sp.z.o.o"
def find_total_sum(invoice_text):
    """Return the gross total found in *invoice_text* as a float.

    Two layouts are recognised: an amount preceded by ``"Wartość brutto "``
    or an amount preceded by ``"PLN "``; in both cases the amount must be
    followed by ``" PLN"`` and a newline. The leftmost match wins.

    Args:
        invoice_text: Text of the invoice.

    Returns:
        The matched amount as a float.

    Raises:
        ValueError: if neither layout matches, or the matched text is not
            a valid number.
    """
    pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)"
    result = re.search(pattern_afv + "|" + pattern_bfv, invoice_text)
    if result is None:
        raise ValueError("no gross total found in invoice text")
    # The separator in the patterns is an unescaped '.', so it also matches
    # a decimal comma; normalise it so float() accepts the value.
    return float(result.group().replace(",", "."))
# @app.post("/invoice")
# async def create_file(file: bytes = File()):
# pdf_file = save_pdf(pdf_bytes=file)
# png_file = convert_pdf_to_png(file=pdf_file)
# invoice_text = ocr_png_file(image_name=png_file)
# vat_id = find_vat_id(invoice_text=invoice_text)
# seller_name = find_seller_name(invoice_text=invoice_text)
# total_sum = find_total_sum(invoice_text=invoice_text)
# json = {
# "vat_id": vat_id,
# "seller_name": seller_name,
# "total": total_sum
# }
# return json
app = FastAPI() app = FastAPI()
@ -91,16 +33,7 @@ app.add_middleware(
@app.post('/invoice') @app.post('/invoice')
async def root(file: UploadFile): async def root(file: UploadFile):
invoice_text = parser.from_buffer(file.file.read()) return parse(file.file.read())
vat_id = find_vat_id(invoice_text=invoice_text)
seller_name = find_seller_name(invoice_text=invoice_text)
total_sum = find_total_sum(invoice_text=invoice_text)
json = {
"vat_id": vat_id,
"seller_name": seller_name,
"total": total_sum
}
return json
if __name__ == '__main__':
uvicorn.run(app, host="0.0.0.0", port=8000) uvicorn.run(app, host="0.0.0.0", port=8000)

53
main.py
View File

@ -1,24 +1,10 @@
import os

# SECURITY: AWS credentials must not live in source control. The values
# that were hard-coded here have been published in this repository's
# history and should be treated as compromised (revoke / rotate them).
# They are now read from the standard AWS environment variables; each
# name is None when its variable is unset.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")
# Identifier of the VPC in which the security group is created.
VPC = "vpc-0c8e1d7baf89a7991"
# Prefix used to name the key pair and the security group.
ID = "s444018"
import boto3 import boto3
INDEKS = "444018"
key_name = f"{INDEKS}-aws-ssh-key"
security_group_name = f"{INDEKS}-security-group"
user_data = f'''
#!/bin/bash
sudo yum update -y
sudo yum install git -y
git clone https://git.wmi.amu.edu.pl/s444018/DPZC-5.git
cd DPZC-5
sudo yum install docker -y
sudo service docker start
sudo usermod -a -G docker ec2-user
sudo docker build -t invoice .
sudo docker run -d -p 80:8000 -t invoice
'''
if __name__ == '__main__':
ec2 = boto3.resource( ec2 = boto3.resource(
'ec2', 'ec2',
region_name='us-east-1', region_name='us-east-1',
@ -28,15 +14,15 @@ if __name__ == '__main__':
) )
key_pair = ec2.create_key_pair( key_pair = ec2.create_key_pair(
KeyName=key_name, KeyName=ID+'-KEY-PAIR',
KeyType='ed25519', KeyType='ed25519',
KeyFormat='pem', KeyFormat='pem',
) )
security_group = ec2.create_security_group( security_group = ec2.create_security_group(
Description=security_group_name, Description=ID+'-GROUP',
GroupName=security_group_name, GroupName=ID+'-GROUP',
VpcId=default_vpc, VpcId=VPC
) )
inbound_rules = security_group.authorize_ingress( inbound_rules = security_group.authorize_ingress(
@ -44,15 +30,26 @@ if __name__ == '__main__':
CidrIp='0.0.0.0/0', CidrIp='0.0.0.0/0',
IpProtocol='tcp', IpProtocol='tcp',
FromPort=80, FromPort=80,
ToPort=80, ToPort=80
) )
instance = ec2.create_instances( instances = ec2.create_instances(
ImageId='ami-0b5eea76982371e91', ImageId='ami-0b5eea76982371e91',
MinCount=1, MinCount=1,
MaxCount=1, MaxCount=1,
InstanceType='t2.micro', InstanceType='t2.micro',
KeyName=key_pair.name, KeyName=key_pair.name,
UserData=user_data, UserData=f'''
SecurityGroups=[security_group.group_name], #!/bin/bash
sudo yum update -y
sudo yum install git -y
git clone https://git.wmi.amu.edu.pl/s444376/DPZC_Ola.git
cd DPZC_Ola/Zadanie_5.2_5.3
sudo yum install docker -y
sudo service docker start
sudo usermod -a -G docker ec2-user
docker build -t invoice .
docker run -d -p 80:8000 -t invoice
''',
SecurityGroups=[security_group.group_name]
) )

22
web/index.html Normal file
View File

@ -0,0 +1,22 @@
<html lang="en">
<head>
<script>
// Upload the PDF chosen in #docpicker to the /invoice endpoint and show
// the JSON reply (or an error message) in #result.
async function parse(){
    const output = document.getElementById("result")
    const picked = document.getElementById("docpicker").files[0]
    // onchange can fire with an empty selection; nothing to upload then.
    if (!picked) {
        return
    }
    // textContent instead of innerHTML: the server reply is data, not markup.
    output.textContent = "Dodawanie pliku..."
    const data = new FormData()
    data.append('file', picked)
    output.textContent = "Proszę chwilę poczekać, zaraz wypluje info o fakturze :)"
    try {
        // `resp` was an implicit global before; keep it local.
        const resp = await fetch('http://IP_ADDRESS_HERE:80/invoice', {method: "POST", body: data})
        if (!resp.ok) {
            throw new Error("HTTP " + resp.status)
        }
        output.textContent = "Info"
        const parsed = await resp.json()
        output.textContent = JSON.stringify(parsed)
    } catch (err) {
        // Network failure, non-2xx status or a non-JSON body.
        output.textContent = "Błąd: " + err.message
    }
}
</script>
<title>Chmura</title>
</head>
<body>
<input type="file" id="docpicker" accept=".pdf" onchange="parse()" />
<p id="result"></p>
</body>
</html>

View File

@ -0,0 +1,13 @@
{
"Version": "2012-10-17",
"Id": "MyPolicy",
"Statement": [
{
"Sid": "PublicReadForGetBucketObjects",
"Effect": "Allow",
"Principal": "*",
"Action": "s3:GetObject",
"Resource": "arn:aws:s3:::invoices-ocr/*"
}
]
}