DPZC_3/5/test.ipynb
2023-01-28 22:54:41 +01:00

12 KiB
Raw Blame History

from fastapi import FastAPI, UploadFile
from tika import parser
import uvicorn, re
from fastapi.middleware.cors import CORSMiddleware



# Extract the text of a sample invoice PDF with Tika and pull out three
# fields: the total from the "Razem" line, the seller name and the seller NIP.
faktura = parser.from_file("/Users/mikolajpaterka/Documents/Studia/Mgr_II/Praktycznie zastosowania_chmury_obliczeniowej/Zestaw_3/5_train/CFV 4_05_2021.pdf")

# Work on the non-empty lines of the extracted text.
dane = [line for line in faktura['content'].split('\n') if line != '']

# The total is the last decimal number on the first line mentioning "Razem".
razem = [line for line in dane if 'Razem' in line]
razem = float(re.findall(r'\d+[.]\d+', razem[0])[-1])

# The seller section spans from the "Sprzedawca:" heading up to "Nabywca:".
sprzedawca = dane[dane.index('Sprzedawca:') : dane.index('Nabywca:')]
sprzedawca_nazwa = sprzedawca[1]

# Strip the "NIP:" label with a whitespace-tolerant pattern. A literal
# replace('NIP: ', '') leaves the label in place whenever the separator is
# not exactly one ASCII space (e.g. a non-breaking space or a double space
# in the extracted text); \s matches any Unicode whitespace.
nip = re.sub(r'^\s*NIP\s*:?\s*', '', sprzedawca[-1]).strip()

print(nip)
print(sprzedawca_nazwa)
print(razem)
NIP: 5741997874
Firma krzak sp.z.o.o
246.0
# Bring in the parser module from the tika package.
from tika import parser

parsed_pdf = parser.from_file("AFV 1_05_2021.pdf")

# The 'metadata' entry of the parse result holds the document's
# key-value metadata; show it first, then its type
# (the original note here records <class 'dict'>).
metadata = parsed_pdf['metadata']
print(metadata)
print(type(metadata))
2023-01-27 20:29:38,916 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2023-01-27 20:29:43,923 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2023-01-27 20:29:48,927 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2023-01-27 20:29:53,933 [MainThread  ] [ERROR]  Tika startup log message not received after 3 tries.
2023-01-27 20:29:53,936 [MainThread  ] [ERROR]  Failed to receive startup confirmation from startServer.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In [10], line 4
      1 # import parser object from tike
      2 from tika import parser
----> 4 parsed_pdf = parser.from_file("AFV 1_05_2021.pdf")
      6 # ['metadata'] attribute returns
      7 # key-value pairs of meta-data
      8 print(parsed_pdf['metadata'])

File /opt/homebrew/lib/python3.10/site-packages/tika/parser.py:40, in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response)
     24 '''
     25 Parses a file for metadata and content
     26 :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
   (...)
     37         'content' has a str value and metadata has a dict type value.
     38 '''
     39 if not xmlContent:
---> 40     output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
     41 else:
     42     output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
     43                         headers=headers, config_path=config_path, requestOptions=requestOptions)

File /opt/homebrew/lib/python3.10/site-packages/tika/tika.py:337, in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)
    335 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
    336 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
--> 337     status, response = callServer('put', serverEndpoint, service, f,
    338                                   headers, verbose, tikaServerJar, config_path=config_path,
    339                                   rawResponse=rawResponse, requestOptions=requestOptions)
    341 if file_type == 'remote': os.unlink(path)
    342 return (status, response)

File /opt/homebrew/lib/python3.10/site-packages/tika/tika.py:532, in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)
    530 global TikaClientOnly
    531 if not TikaClientOnly:
--> 532     serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    534 serviceUrl  = serverEndpoint + service
    535 if verb not in httpVerbs:

File /opt/homebrew/lib/python3.10/site-packages/tika/tika.py:602, in checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    600         if not status:
    601             log.error("Failed to receive startup confirmation from startServer.")
--> 602             raise RuntimeError("Unable to start Tika server.")
    603 return serverEndpoint

RuntimeError: Unable to start Tika server.
# Bring in the parser module from the tika package.
from tika import parser

# Parse the PDF file.
parsed_pdf = parser.from_file("AFV 1_05_2021.pdf")

# Keep the extracted text, stored under the 'content' key
# (the original note here records it as a string).
data = parsed_pdf['content']

# Print the content on one line group, then its type
# (the original note here records <class 'str'>).
print(data, type(data), sep='\n')


import tika
from tika import parser

# Parse the invoice once, then dump the metadata followed by the text content.
parsed = parser.from_file('AFV 1_05_2021.pdf')
for section in ("metadata", "content"):
    print(parsed[section])
curl -X POST 54.89.242.59:80/invoice -F "file=@AFV 1_05_2021.pdf"

curl ec2-54-89-242-59.compute-1.amazonaws.com

ec2-54-89-242-59.compute-1.amazonaws.com 
# Smoke test for the invoice web service: upload every PDF from ./5_train
# to the /invoice endpoint and check that the JSON reply carries a non-empty
# "vat_id", a non-empty "address" and a float "total".
import glob
import sys

import requests

# requests requires an explicit scheme; a bare "host:port" string is
# rejected before any connection is attempted.
WEBSERVICE_URL = "http://100.24.236.135:80"

print("xd")

# Not used anywhere in this cell; kept in case another cell reads it.
txtfiles = []

for pdf_path in glob.glob("./5_train/*.pdf"):
    print(f"Testowanie pliku {pdf_path}")
    # Close the upload handle as soon as the request has been sent.
    with open(pdf_path, 'rb') as pdf_file:
        response = requests.post(WEBSERVICE_URL + "/invoice", files={'file': pdf_file})
    try:
        data = response.json()
        has_all_keys = "vat_id" in data and "address" in data and "total" in data
        # A reply is valid only when both text fields are non-empty and the
        # total is a float (the previous check was inverted and accepted
        # only replies whose vat_id and address were empty).
        if has_all_keys and data["vat_id"] and data["address"] \
                and isinstance(data["total"], float):
            print(f'OK: {data["vat_id"]}\t{data["address"]}\t{data["total"]}')
        else:
            print("NIEPOPRAWNA ODPOWIEDŹ")
    except Exception as exc:
        # Best-effort reporting: a malformed or non-JSON reply must not stop
        # the remaining files from being tested, but Ctrl-C still works.
        print("Błędna odpowiedź z serwera (%s):" % (type(exc),), file=sys.stderr)
xd