diff --git a/api.py b/api.py index 25efa9a..3c8c89a 100644 --- a/api.py +++ b/api.py @@ -1,81 +1,23 @@ -import cv2 -import pytesseract -import fitz -import re -import uvicorn -from tika import parser from fastapi import FastAPI, UploadFile from fastapi.middleware.cors import CORSMiddleware +from tika import parser +import uvicorn +import re +def parse(pdf): + content = parser.from_buffer(pdf) + content = content['content'].split('\n') + content = [c for c in content if c != ''] + total = float(re.findall(r'\d+[.]\d+', list(filter(lambda x: 'Razem' in x, content))[0])[-1]) + content = content[content.index('Sprzedawca:') : content.index('Nabywca:')] + seller = content[1] + nip = content[-1].replace('NIP: ', '') + return { + 'vat_id' : nip, + 'address' : seller, + 'total' : total + } -DPI = 300 -ZOOM = DPI / 72 - -def save_pdf(pdf_bytes): - file_name = "invoice.pdf" - binary_file = open(file_name, "wb") - binary_file.write(pdf_bytes) - binary_file.close() - return file_name - - -def convert_pdf_to_png(file): - file_name = "invoice.png" - magnify = fitz.Matrix(ZOOM, ZOOM) - doc = fitz.open(file) - pix = doc[0].get_pixmap(matrix=magnify) - pix.save(file_name) - return file_name - - -def ocr_png_file(image_name): - image = cv2.imread(image_name) - custom_config = r'-l pol --oem 3 --psm 6' - ocr_string = pytesseract.image_to_string(image, config=custom_config) - return ocr_string - - -def find_vat_id(invoice_text): - pattern = "NIP: ([0-9])+ " - result = re.search(pattern, invoice_text) - vat_id = result.group() - vat_id = vat_id.replace("NIP: ", "") - return vat_id.strip() - - -def find_seller_name(invoice_text): - pattern = "Nabywca:\n.*((?=sp.|spółka|S.A.|S.K.A.))" - result = re.search(pattern, invoice_text) - seller_name = result.group() - seller_name = seller_name.replace("Nabywca:\n", "") - return seller_name.strip() + " sp.z.o.o" - - -def find_total_sum(invoice_text): - pattern_afv = "(?<=Wartość brutto )[0-9]{1,}.[0-9]{1,}(?= PLN\n)" - pattern_bfv = "(?<=PLN )[0-9]{1,}.[0-9]{1,}(?= PLN\n)" - regex_pattern = pattern_afv + "|" + pattern_bfv - result = re.search(regex_pattern, invoice_text) - total_sum = result.group() - total_sum = total_sum.replace("Do zapłaty:", "") - sum_string = total_sum.strip() - return float(sum_string) - - -# @app.post("/invoice") -# async def create_file(file: bytes = File()): -# pdf_file = save_pdf(pdf_bytes=file) -# png_file = convert_pdf_to_png(file=pdf_file) -# invoice_text = ocr_png_file(image_name=png_file) -# vat_id = find_vat_id(invoice_text=invoice_text) -# seller_name = find_seller_name(invoice_text=invoice_text) -# total_sum = find_total_sum(invoice_text=invoice_text) -# json = { -# "vat_id": vat_id, -# "seller_name": seller_name, -# "total": total_sum -# } -# return json app = FastAPI() @@ -91,16 +33,7 @@ app.add_middleware( @app.post('/invoice') async def root(file: UploadFile): - invoice_text = parser.from_buffer(file.file.read()) - vat_id = find_vat_id(invoice_text=invoice_text) - seller_name = find_seller_name(invoice_text=invoice_text) - total_sum = find_total_sum(invoice_text=invoice_text) - json = { - "vat_id": vat_id, - "seller_name": seller_name, - "total": total_sum - } - return json + return parse(file.file.read()) -if __name__ == '__main__': - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + +uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/main.py b/main.py index 204db71..96b71f6 100644 --- a/main.py +++ b/main.py @@ -1,58 +1,55 @@ -from credentials import aws_access_key_id, aws_secret_access_key, aws_session_token, default_vpc +aws_access_key_id="ASIA47BSNPAUXFZ4T47L" +aws_secret_access_key="wvB7oGz94O9j/tLhsTxRjfqmOFsTUctujm1kQioX" +aws_session_token="FwoGZXIvYXdzECcaDLqPCWJkmkN46SLS3yLDAR7y4YnKdvWiPKmi0URNSk0ftfYJenKox99GuNk7ukXYJK076N4kN+XzUxaCoyh/N10rQawrqdk2+C5X8UZ774eV02cprw5HI7geB11hTafb03jp0zawLoAjbGKj+2Tyuaxxpmyw3clijyie5uRAWhgyl2rn42UgDkpd8Cr58kQymvVVL60QGPbCEMrij9+/ZFsmanoNlg8DXhpkxsX5ISdvhFTlOCBM5hzFSIIsr+M5DJlDjZIK/fGZK2fYeZZYQ6xTgijkmt+eBjItmJhbCtRcFJZu+nxtNM2f2r15czCLz2bLGBX1+TH4/pd68UujBAvdDWdvuhV+" +VPC="vpc-0c8e1d7baf89a7991" +ID="s444018" + import boto3 +ec2 = boto3.resource( + 'ec2', + region_name='us-east-1', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_session_token=aws_session_token, +) -INDEKS = "444018" -key_name = f"{INDEKS}-aws-ssh-key" -security_group_name = f"{INDEKS}-security-group" +key_pair = ec2.create_key_pair( + KeyName=ID+'-KEY-PAIR', + KeyType='ed25519', + KeyFormat='pem', +) -user_data = f''' -#!/bin/bash -sudo yum update -y -sudo yum install git -y -git clone https://git.wmi.amu.edu.pl/s444018/DPZC-5.git -cd DPZC-5 -sudo yum install docker -y -sudo service docker start -sudo usermod -a -G docker ec2-user -sudo docker build -t invoice . -sudo docker run -d -p 80:8000 -t invoice -''' +security_group = ec2.create_security_group( + Description=ID+'-GROUP', + GroupName=ID+'-GROUP', + VpcId=VPC +) -if __name__ == '__main__': - ec2 = boto3.resource( - 'ec2', - region_name='us-east-1', - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token, - ) +inbound_rules = security_group.authorize_ingress( + GroupId=security_group.group_id, + CidrIp='0.0.0.0/0', + IpProtocol='tcp', + FromPort=80, + ToPort=80 +) - key_pair = ec2.create_key_pair( - KeyName=key_name, - KeyType='ed25519', - KeyFormat='pem', - ) - - security_group = ec2.create_security_group( - Description=security_group_name, - GroupName=security_group_name, - VpcId=default_vpc, - ) - - inbound_rules = security_group.authorize_ingress( - GroupId=security_group.group_id, - CidrIp='0.0.0.0/0', - IpProtocol='tcp', - FromPort=80, - ToPort=80, - ) - - instance = ec2.create_instances( - ImageId='ami-0b5eea76982371e91', - MinCount=1, - MaxCount=1, - InstanceType='t2.micro', - KeyName=key_pair.name, - UserData=user_data, - SecurityGroups=[security_group.group_name], - ) \ No newline at end of file +instances = ec2.create_instances( + ImageId='ami-0b5eea76982371e91', + MinCount=1, + MaxCount=1, + InstanceType='t2.micro', + KeyName=key_pair.name, + UserData=f''' + #!/bin/bash + sudo yum update -y + sudo yum install git -y + git clone https://git.wmi.amu.edu.pl/s444376/DPZC_Ola.git + cd DPZC_Ola/Zadanie_5.2_5.3 + sudo yum install docker -y + sudo service docker start + sudo usermod -a -G docker ec2-user + docker build -t invoice . + docker run -d -p 80:8000 -t invoice + ''', + SecurityGroups=[security_group.group_name] +) \ No newline at end of file diff --git a/web/index.html b/web/index.html new file mode 100644 index 0000000..4e8169e --- /dev/null +++ b/web/index.html @@ -0,0 +1,22 @@ + + + + Chmura + + + +

+ + \ No newline at end of file diff --git a/web/website-bucket-policy.json b/web/website-bucket-policy.json new file mode 100644 index 0000000..869b3bc --- /dev/null +++ b/web/website-bucket-policy.json @@ -0,0 +1,13 @@ +{ + "Version": "2012-10-17", + "Id": "MyPolicy", + "Statement": [ + { + "Sid": "PublicReadForGetBucketObjects", + "Effect": "Allow", + "Principal": "*", + "Action": "s3:GetObject", + "Resource": "arn:aws:s3:::invoices-ocr/*" + } + ] +} \ No newline at end of file