commit c676bece108d0f4900a492852aaf290be52fc93b Author: s464979 Date: Tue Jan 21 21:13:50 2025 +0100 Upload files to "/" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..de9f15e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,36 @@ +FROM ubuntu:20.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get upgrade -y \ + && apt-get install -y \ + python3 \ + python3-pip \ + python3-dev \ + libpoppler-cpp-dev \ + build-essential \ + curl \ + git \ + nginx \ + && apt-get clean + +RUN pip3 install --no-cache-dir Flask PyMuPDF Flask-Cors + +WORKDIR /app + +COPY . /app + +RUN rm -f /etc/nginx/sites-enabled/default \ + && echo 'server { \ + listen 80; \ + server_name localhost; \ + location / { \ + root /app/static; \ + index index.html; \ + } \ + }' > /etc/nginx/sites-available/static_site \ + && ln -s /etc/nginx/sites-available/static_site /etc/nginx/sites-enabled/ + +EXPOSE 80 8080 + +CMD service nginx start && python3 invoice_service.py diff --git a/app.config b/app.config new file mode 100644 index 0000000..5b7dfda --- /dev/null +++ b/app.config @@ -0,0 +1,6 @@ +files: + "/etc/httpd/conf.d/wsgi_custom.conf": + mode: "000644" + owner: root + group: root + content: WSGIApplicationGroup %{GLOBAL} \ No newline at end of file diff --git a/application.py b/application.py new file mode 100644 index 0000000..0cd3efa --- /dev/null +++ b/application.py @@ -0,0 +1,170 @@ +from flask import Flask, request, jsonify +import boto3 +import re +import os +from flask_cors import CORS +import time + +# from dotenv import load_dotenv + +# load_dotenv() + +# AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") +# AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") +# AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN") +# AWS_REGION = os.getenv("AWS_REGION") +# BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME", "s464979-test") + +AWS_REGION = "us-east-1" +BUCKET_NAME = "s464979-test-2" + +s3_client = boto3.client( + 's3', + # aws_access_key_id=AWS_ACCESS_KEY_ID, + # aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + # aws_session_token=AWS_SESSION_TOKEN, + region_name=AWS_REGION +) + +textract_client = boto3.client( + 'textract', + # aws_access_key_id=AWS_ACCESS_KEY_ID, + # aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + # aws_session_token=AWS_SESSION_TOKEN, + region_name=AWS_REGION +) + +application = Flask(__name__) +CORS(application) + +@application.route('/') +def hello_world(): + return 'Hello, World!' + +@application.route('/invoice', methods=['POST']) +def process_invoice(): + file = request.files['file'] + file_name = file.filename + + s3_client.upload_fileobj(file, BUCKET_NAME, file_name) + + response = textract_client.start_document_analysis( + DocumentLocation={'S3Object': {'Bucket': BUCKET_NAME, 'Name': file_name}}, + FeatureTypes=['FORMS'] + ) + job_id = response['JobId'] + + wait_for_textract_job(job_id) + + pages = get_textract_job_results(job_id) + + kvs = get_kv_map(pages) + + print("Extracted Key-Value Pairs:", kvs) + + sprzedawca = kvs.get("Sprzedawca:", "") + nip = re.search(r"\b\d{10}\b", sprzedawca) + wartosc_brutto = kvs.get("Wartosc brutto", "") + + return jsonify({ + "seller": sprzedawca, + "vat_id": nip.group(0) if nip else "", + "total": wartosc_brutto + }) + + +def get_textract_job_results(job_id): + pages = [] + response = textract_client.get_document_analysis(JobId=job_id) + pages.append(response) + + # If the response is paginated, continue to fetch all pages + next_token = response.get('NextToken', None) + while next_token: + response = textract_client.get_document_analysis(JobId=job_id, NextToken=next_token) + pages.append(response) + next_token = response.get('NextToken', None) + + return pages + + +def get_kv_map(pages): + """ + Returns a dictionary of all extracted key-value pairs from a list of + Textract result pages. + """ + kvs = {} + + # We first map BLOCK ids to the full block object for quick reference + block_map = {} + for page in pages: + for block in page['Blocks']: + block_map[block['Id']] = block + + # Next, find the KEY blocks + for page in pages: + for block in page['Blocks']: + if block['BlockType'] == 'KEY_VALUE_SET' and 'KEY' in block.get('EntityTypes', []): + key_block = block + value_block = find_value_block(key_block, block_map) # We'll define this next + key = get_text(key_block, block_map) + val = get_text(value_block, block_map) if value_block else "" + kvs[key] = val + + return kvs + + +def find_value_block(key_block, block_map): + """ + From a KEY block, find its associated VALUE block by looking at 'Relationships'. + """ + if 'Relationships' in key_block: + for rel in key_block['Relationships']: + if rel['Type'] == 'VALUE': + for value_id in rel['Ids']: + value_block = block_map[value_id] + return value_block + return None + + +def get_text(block, block_map): + """ + Recursively get text from a block by following relationships to 'CHILD' blocks. + """ + text = [] + if 'Relationships' in block: + for rel in block['Relationships']: + if rel['Type'] == 'CHILD': + for child_id in rel['Ids']: + word = block_map[child_id] + if word['BlockType'] == 'WORD': + text.append(word['Text']) + elif word['BlockType'] == 'SELECTION_ELEMENT': + if word['SelectionStatus'] == 'SELECTED': + text.append('X') + return " ".join(text) + + +def extract_text_from_textract(result): + return " ".join([block['Text'] for block in result['Blocks'] if block['BlockType'] == 'LINE']) + +def extract_vat_id(text): + match = re.search(r'\b\d{10}\b', text) + return match.group(0) if match else None + + +def wait_for_textract_job(job_id): + while True: + response = textract_client.get_document_analysis(JobId=job_id) + status = response['JobStatus'] + print(f"Status Textract Job: {status}") + + if status == 'SUCCEEDED': + return response + elif status == 'FAILED': + raise RuntimeError("Textract job failed") + + time.sleep(2) + +if __name__ == '__main__': + application.run(port=5000, debug=True) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0a1ff07 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +flask>=2.0.0 +flask-cors>=3.0.0 +boto3>=1.20.0 +python-dotenv>=0.21.0 +flask_cors>=5.0.0 \ No newline at end of file