12 KiB
12 KiB
from fastapi import FastAPI, UploadFile
from tika import parser
import uvicorn, re
from fastapi.middleware.cors import CORSMiddleware
# Parse one sample invoice PDF with Tika and extract the gross total, the
# seller's name and the seller's NIP (tax id), then print them.
# NOTE(review): the path below is absolute and machine-specific; adjust it
# before running this cell anywhere else.
faktura = parser.from_file("/Users/mikolajpaterka/Documents/Studia/Mgr_II/Praktycznie zastosowania_chmury_obliczeniowej/Zestaw_3/5_train/CFV 4_05_2021.pdf")

# Presumably Tika leaves 'content' empty/None when no text can be extracted
# (TODO confirm); fail with a clear message instead of an AttributeError on
# None.split().
tekst = faktura['content']
if not tekst:
    raise ValueError("Tika returned no text content for the invoice")

# Non-empty lines of the extracted text, in document order.
dane = [i for i in tekst.split('\n') if i != '']

# The total is the last decimal number on the first line mentioning 'Razem'.
wiersze_razem = [x for x in dane if 'Razem' in x]
if not wiersze_razem:
    raise ValueError("No 'Razem' line found in the invoice text")
razem = float(re.findall(r'\d+[.]\d+', wiersze_razem[0])[-1])

# Seller section: every line from the 'Sprzedawca:' header up to (but not
# including) the 'Nabywca:' header. The name follows the header directly and
# the NIP is expected on the section's last line.
sprzedawca = dane[dane.index('Sprzedawca:') : dane.index('Nabywca:')]
sprzedawca_nazwa = sprzedawca[1]

# Strip the 'NIP' label with a whitespace-tolerant pattern. A literal
# replace('NIP: ', '') misses the label when the extracted text uses a
# non-breaking space or different spacing after the colon, which left the
# label in the printed value.
nip = re.sub(r'^\s*NIP\s*:?\s*', '', sprzedawca[-1]).strip()

print(nip)
print(sprzedawca_nazwa)
print(razem)
NIP: 5741997874 Firma krzak sp.z.o.o 246.0
# Bring in Tika's parser module.
from tika import parser

# Run the PDF through Tika; the result is indexed by string keys below.
parsed_pdf = parser.from_file("AFV 1_05_2021.pdf")

# Show the 'metadata' entry of the parse result, then its Python type
# (the original notes expect a dict of key-value pairs here).
metadata = parsed_pdf['metadata']
print(metadata)
print(type(metadata))
2023-01-27 20:29:38,916 [MainThread ] [WARNI] Failed to see startup log message; retrying... 2023-01-27 20:29:43,923 [MainThread ] [WARNI] Failed to see startup log message; retrying... 2023-01-27 20:29:48,927 [MainThread ] [WARNI] Failed to see startup log message; retrying... 2023-01-27 20:29:53,933 [MainThread ] [ERROR] Tika startup log message not received after 3 tries. 2023-01-27 20:29:53,936 [MainThread ] [ERROR] Failed to receive startup confirmation from startServer.
[0;31m---------------------------------------------------------------------------[0m [0;31mRuntimeError[0m Traceback (most recent call last) Cell [0;32mIn [10], line 4[0m [1;32m 1[0m [39m# import parser object from tike[39;00m [1;32m 2[0m [39mfrom[39;00m [39mtika[39;00m [39mimport[39;00m parser [0;32m----> 4[0m parsed_pdf [39m=[39m parser[39m.[39;49mfrom_file([39m"[39;49m[39mAFV 1_05_2021.pdf[39;49m[39m"[39;49m) [1;32m 6[0m [39m# ['metadata'] attribute returns[39;00m [1;32m 7[0m [39m# key-value pairs of meta-data[39;00m [1;32m 8[0m [39mprint[39m(parsed_pdf[[39m'[39m[39mmetadata[39m[39m'[39m]) File [0;32m/opt/homebrew/lib/python3.10/site-packages/tika/parser.py:40[0m, in [0;36mfrom_file[0;34m(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response)[0m [1;32m 24[0m [39m'''[39;00m [1;32m 25[0m [39mParses a file for metadata and content[39;00m [1;32m 26[0m [39m:param filename: path to file which needs to be parsed or binary file using open(path,'rb')[39;00m [0;32m (...)[0m [1;32m 37[0m [39m 'content' has a str value and metadata has a dict type value.[39;00m [1;32m 38[0m [39m'''[39;00m [1;32m 39[0m [39mif[39;00m [39mnot[39;00m xmlContent: [0;32m---> 40[0m output [39m=[39m parse1(service, filename, serverEndpoint, headers[39m=[39;49mheaders, config_path[39m=[39;49mconfig_path, requestOptions[39m=[39;49mrequestOptions) [1;32m 41[0m [39melse[39;00m: [1;32m 42[0m output [39m=[39m parse1(service, filename, serverEndpoint, services[39m=[39m{[39m'[39m[39mmeta[39m[39m'[39m: [39m'[39m[39m/meta[39m[39m'[39m, [39m'[39m[39mtext[39m[39m'[39m: [39m'[39m[39m/tika[39m[39m'[39m, [39m'[39m[39mall[39m[39m'[39m: [39m'[39m[39m/rmeta/xml[39m[39m'[39m}, [1;32m 43[0m headers[39m=[39mheaders, config_path[39m=[39mconfig_path, requestOptions[39m=[39mrequestOptions) File [0;32m/opt/homebrew/lib/python3.10/site-packages/tika/tika.py:337[0m, in [0;36mparse1[0;34m(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, 
rawResponse, headers, config_path, requestOptions)[0m [1;32m 335[0m headers[39m.[39mupdate({[39m'[39m[39mAccept[39m[39m'[39m: responseMimeType, [39m'[39m[39mContent-Disposition[39m[39m'[39m: make_content_disposition_header(path[39m.[39mencode([39m'[39m[39mutf-8[39m[39m'[39m) [39mif[39;00m [39mtype[39m(path) [39mis[39;00m unicode_string [39melse[39;00m path)}) [1;32m 336[0m [39mwith[39;00m urlOrPath [39mif[39;00m _is_file_object(urlOrPath) [39melse[39;00m [39mopen[39m(path, [39m'[39m[39mrb[39m[39m'[39m) [39mas[39;00m f: [0;32m--> 337[0m status, response [39m=[39m callServer([39m'[39;49m[39mput[39;49m[39m'[39;49m, serverEndpoint, service, f, [1;32m 338[0m headers, verbose, tikaServerJar, config_path[39m=[39;49mconfig_path, [1;32m 339[0m rawResponse[39m=[39;49mrawResponse, requestOptions[39m=[39;49mrequestOptions) [1;32m 341[0m [39mif[39;00m file_type [39m==[39m [39m'[39m[39mremote[39m[39m'[39m: os[39m.[39munlink(path) [1;32m 342[0m [39mreturn[39;00m (status, response) File [0;32m/opt/homebrew/lib/python3.10/site-packages/tika/tika.py:532[0m, in [0;36mcallServer[0;34m(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)[0m [1;32m 530[0m [39mglobal[39;00m TikaClientOnly [1;32m 531[0m [39mif[39;00m [39mnot[39;00m TikaClientOnly: [0;32m--> 532[0m serverEndpoint [39m=[39m checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path) [1;32m 534[0m serviceUrl [39m=[39m serverEndpoint [39m+[39m service [1;32m 535[0m [39mif[39;00m verb [39mnot[39;00m [39min[39;00m httpVerbs: File [0;32m/opt/homebrew/lib/python3.10/site-packages/tika/tika.py:602[0m, in [0;36mcheckTikaServer[0;34m(scheme, serverHost, port, tikaServerJar, classpath, config_path)[0m [1;32m 600[0m [39mif[39;00m [39mnot[39;00m status: [1;32m 601[0m log[39m.[39merror([39m"[39m[39mFailed to receive startup confirmation from startServer.[39m[39m"[39m) [0;32m--> 602[0m [39mraise[39;00m 
[39mRuntimeError[39;00m([39m"[39m[39mUnable to start Tika server.[39m[39m"[39m) [1;32m 603[0m [39mreturn[39;00m serverEndpoint [0;31mRuntimeError[0m: Unable to start Tika server.
# Bring in Tika's parser module.
from tika import parser

# Parse the PDF file with Tika.
parsed_pdf = parser.from_file("AFV 1_05_2021.pdf")

# Keep the extracted text of the document (the original notes expect a str).
data = parsed_pdf['content']

# Print the content first, then its Python type.
for shown in (data, type(data)):
    print(shown)
import tika
from tika import parser

# Parse the PDF once, then print its metadata followed by its text content.
parsed = parser.from_file('AFV 1_05_2021.pdf')
for key in ("metadata", "content"):
    print(parsed[key])
curl -X POST 54.89.242.59:80/invoice -F "file=@AFV 1_05_2021.pdf"
curl ec2-54-89-242-59.compute-1.amazonaws.com
ec2-54-89-242-59.compute-1.amazonaws.com
# Smoke test for the invoice web service: upload every training PDF to the
# /invoice endpoint and check that the JSON reply carries a non-empty
# "vat_id", a non-empty "address" and a float "total".
import glob
import sys

import requests

# Base URL of the service. The scheme is required: requests refuses URLs
# without one, so a bare "host:port" string never reaches the server.
WEBSERVICE_URL = "http://100.24.236.135:80"

# Upper bound (seconds) for one upload, so an unresponsive server cannot hang
# the whole run.
REQUEST_TIMEOUT = 60

print("xd")
txtfiles = []
for file in glob.glob("./5_train/*.pdf"):
    print(f"Testowanie pliku {file}")
    # The context manager closes the PDF handle after the upload; the
    # original left one open file per tested invoice.
    with open(file, 'rb') as pdf:
        files = {'file': pdf}
        try:
            response = requests.post(
                WEBSERVICE_URL + "/invoice",
                files=files,
                timeout=REQUEST_TIMEOUT,
            )
            data = response.json()
        except (requests.RequestException, ValueError) as err:
            # Connection problems and non-JSON bodies are reported and the
            # run moves on to the next file.
            print(f"Błędna odpowiedź z serwera ({type(err)}):", file=sys.stderr)
            continue

    # A valid reply is a JSON object whose vat_id and address are present AND
    # non-empty; the original condition was inverted and accepted only empty
    # values.
    if (
        isinstance(data, dict)
        and data.get("vat_id")
        and data.get("address")
        and isinstance(data.get("total"), float)
    ):
        print(f'OK: {data["vat_id"]}\t{data["address"]}\t{data["total"]}')
    else:
        print("NIEPOPRAWNA ODPOWIEDŹ")
xd