215 lines
12 KiB
Plaintext
215 lines
12 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from fastapi import FastAPI, UploadFile\n",
|
||
|
"from tika import parser\n",
|
||
|
"import uvicorn, re\n",
|
||
|
"from fastapi.middleware.cors import CORSMiddleware"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 36,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"NIP: 5741997874\n",
|
||
|
"Firma krzak sp.z.o.o\n",
|
||
|
"246.0\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"# Pull the seller name, the seller's NIP and the invoice total out of one sample invoice.\n",
"# NOTE(review): machine-specific absolute path - point it at your own copy of the training set.\n",
"INVOICE_PATH = \"/Users/mikolajpaterka/Documents/Studia/Mgr_II/Praktycznie zastosowania_chmury_obliczeniowej/Zestaw_3/5_train/CFV 4_05_2021.pdf\"\n",
"\n",
"faktura = parser.from_file(INVOICE_PATH)\n",
"# Keep only the non-empty lines of the text stored under 'content'.\n",
"dane = [linia for linia in faktura['content'].split('\\n') if linia != '']\n",
"\n",
"# The total is the last decimal number on the first line that mentions 'Razem'.\n",
"linie_razem = [linia for linia in dane if 'Razem' in linia]\n",
"kwoty = re.findall(r'\\d+[.]\\d+', linie_razem[0]) if linie_razem else []\n",
"if not kwoty:\n",
"    # Fail with a readable message instead of a bare IndexError.\n",
"    raise ValueError(\"no 'Razem' line with a decimal amount found in the invoice text\")\n",
"razem = float(kwoty[-1])\n",
"\n",
"# The seller block runs from 'Sprzedawca:' up to, but not including, 'Nabywca:'.\n",
"sprzedawca = dane[dane.index('Sprzedawca:') : dane.index('Nabywca:')]\n",
"sprzedawca_nazwa = sprzedawca[1]\n",
"# Drop the 'NIP:' label together with whatever whitespace follows it. The saved output of this\n",
"# cell still shows the label, which suggests the literal replace('NIP: ', '') did not match.\n",
"nip = re.sub(r'^\\s*NIP:\\s*', '', sprzedawca[-1]).strip()\n",
"\n",
"print(nip)\n",
"print(sprzedawca_nazwa)\n",
"print(razem)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"2023-01-27 20:29:38,916 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n",
|
||
|
"2023-01-27 20:29:43,923 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n",
|
||
|
"2023-01-27 20:29:48,927 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n",
|
||
|
"2023-01-27 20:29:53,933 [MainThread ] [ERROR] Tika startup log message not received after 3 tries.\n",
|
||
|
"2023-01-27 20:29:53,936 [MainThread ] [ERROR] Failed to receive startup confirmation from startServer.\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"ename": "RuntimeError",
|
||
|
"evalue": "Unable to start Tika server.",
|
||
|
"output_type": "error",
|
||
|
"traceback": [
|
||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
|
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
||
|
"Cell \u001b[0;32mIn [10], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# import parser object from tike\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtika\u001b[39;00m \u001b[39mimport\u001b[39;00m parser\n\u001b[0;32m----> 4\u001b[0m parsed_pdf \u001b[39m=\u001b[39m parser\u001b[39m.\u001b[39;49mfrom_file(\u001b[39m\"\u001b[39;49m\u001b[39mAFV 1_05_2021.pdf\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m 6\u001b[0m \u001b[39m# ['metadata'] attribute returns\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[39m# key-value pairs of meta-data\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[39mprint\u001b[39m(parsed_pdf[\u001b[39m'\u001b[39m\u001b[39mmetadata\u001b[39m\u001b[39m'\u001b[39m])\n",
|
||
|
"File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/tika/parser.py:40\u001b[0m, in \u001b[0;36mfrom_file\u001b[0;34m(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[39m'''\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[39mParses a file for metadata and content\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[39m:param filename: path to file which needs to be parsed or binary file using open(path,'rb')\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[39m 'content' has a str value and metadata has a dict type value.\u001b[39;00m\n\u001b[1;32m 38\u001b[0m \u001b[39m'''\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m xmlContent:\n\u001b[0;32m---> 40\u001b[0m output \u001b[39m=\u001b[39m parse1(service, filename, serverEndpoint, headers\u001b[39m=\u001b[39;49mheaders, config_path\u001b[39m=\u001b[39;49mconfig_path, requestOptions\u001b[39m=\u001b[39;49mrequestOptions)\n\u001b[1;32m 41\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 42\u001b[0m output \u001b[39m=\u001b[39m parse1(service, filename, serverEndpoint, services\u001b[39m=\u001b[39m{\u001b[39m'\u001b[39m\u001b[39mmeta\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39m/meta\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39m/tika\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mall\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39m/rmeta/xml\u001b[39m\u001b[39m'\u001b[39m},\n\u001b[1;32m 43\u001b[0m headers\u001b[39m=\u001b[39mheaders, config_path\u001b[39m=\u001b[39mconfig_path, requestOptions\u001b[39m=\u001b[39mrequestOptions)\n",
|
||
|
"File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/tika/tika.py:337\u001b[0m, in \u001b[0;36mparse1\u001b[0;34m(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)\u001b[0m\n\u001b[1;32m 335\u001b[0m headers\u001b[39m.\u001b[39mupdate({\u001b[39m'\u001b[39m\u001b[39mAccept\u001b[39m\u001b[39m'\u001b[39m: responseMimeType, \u001b[39m'\u001b[39m\u001b[39mContent-Disposition\u001b[39m\u001b[39m'\u001b[39m: make_content_disposition_header(path\u001b[39m.\u001b[39mencode(\u001b[39m'\u001b[39m\u001b[39mutf-8\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mif\u001b[39;00m \u001b[39mtype\u001b[39m(path) \u001b[39mis\u001b[39;00m unicode_string \u001b[39melse\u001b[39;00m path)})\n\u001b[1;32m 336\u001b[0m \u001b[39mwith\u001b[39;00m urlOrPath \u001b[39mif\u001b[39;00m _is_file_object(urlOrPath) \u001b[39melse\u001b[39;00m \u001b[39mopen\u001b[39m(path, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[0;32m--> 337\u001b[0m status, response \u001b[39m=\u001b[39m callServer(\u001b[39m'\u001b[39;49m\u001b[39mput\u001b[39;49m\u001b[39m'\u001b[39;49m, serverEndpoint, service, f,\n\u001b[1;32m 338\u001b[0m headers, verbose, tikaServerJar, config_path\u001b[39m=\u001b[39;49mconfig_path,\n\u001b[1;32m 339\u001b[0m rawResponse\u001b[39m=\u001b[39;49mrawResponse, requestOptions\u001b[39m=\u001b[39;49mrequestOptions)\n\u001b[1;32m 341\u001b[0m \u001b[39mif\u001b[39;00m file_type \u001b[39m==\u001b[39m \u001b[39m'\u001b[39m\u001b[39mremote\u001b[39m\u001b[39m'\u001b[39m: os\u001b[39m.\u001b[39munlink(path)\n\u001b[1;32m 342\u001b[0m \u001b[39mreturn\u001b[39;00m (status, response)\n",
|
||
|
"File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/tika/tika.py:532\u001b[0m, in \u001b[0;36mcallServer\u001b[0;34m(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[39mglobal\u001b[39;00m TikaClientOnly\n\u001b[1;32m 531\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m TikaClientOnly:\n\u001b[0;32m--> 532\u001b[0m serverEndpoint \u001b[39m=\u001b[39m checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)\n\u001b[1;32m 534\u001b[0m serviceUrl \u001b[39m=\u001b[39m serverEndpoint \u001b[39m+\u001b[39m service\n\u001b[1;32m 535\u001b[0m \u001b[39mif\u001b[39;00m verb \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m httpVerbs:\n",
|
||
|
"File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/tika/tika.py:602\u001b[0m, in \u001b[0;36mcheckTikaServer\u001b[0;34m(scheme, serverHost, port, tikaServerJar, classpath, config_path)\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m status:\n\u001b[1;32m 601\u001b[0m log\u001b[39m.\u001b[39merror(\u001b[39m\"\u001b[39m\u001b[39mFailed to receive startup confirmation from startServer.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 602\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mUnable to start Tika server.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 603\u001b[0m \u001b[39mreturn\u001b[39;00m serverEndpoint\n",
|
||
|
"\u001b[0;31mRuntimeError\u001b[0m: Unable to start Tika server."
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"# Parse the sample invoice with Tika's parser module and inspect its metadata.\n",
"from tika import parser\n",
"\n",
"parsed_pdf = parser.from_file(\"AFV 1_05_2021.pdf\")\n",
"\n",
"# Show the value stored under 'metadata', then the type of that value.\n",
"metadata = parsed_pdf['metadata']\n",
"print(metadata)\n",
"print(type(metadata))\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
"# Parse the sample invoice with Tika's parser module and inspect the extracted text.\n",
"from tika import parser\n",
"\n",
"parsed_pdf = parser.from_file(\"AFV 1_05_2021.pdf\")\n",
"\n",
"# The extracted text is stored under the 'content' key.\n",
"data = parsed_pdf['content']\n",
"\n",
"# Print the content first and its Python type on the following line.\n",
"print(data, type(data), sep='\\n')\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
"# Print the metadata and then the extracted content of the sample invoice.\n",
"import tika\n",
"from tika import parser\n",
"\n",
"parsed = parser.from_file('AFV 1_05_2021.pdf')\n",
"for key in (\"metadata\", \"content\"):\n",
"    print(parsed[key])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
"# Shell commands: the leading `!` hands each line to the system shell, so this code cell\n",
"# can actually be executed by the IPython kernel instead of failing as invalid Python.\n",
"\n",
"# Upload a sample invoice to the /invoice endpoint.\n",
"!curl -X POST 54.89.242.59:80/invoice -F \"file=@AFV 1_05_2021.pdf\"\n",
"\n",
"# Request the root URL of the host by name.\n",
"!curl ec2-54-89-242-59.compute-1.amazonaws.com\n",
"\n",
"# Host name kept for reference: ec2-54-89-242-59.compute-1.amazonaws.com"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 121,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"xd\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
"# Smoke test: post every training PDF to the invoice web service and validate the JSON answer.\n",
"import glob\n",
"import sys\n",
"\n",
"import requests\n",
"\n",
"# requests refuses URLs without a scheme, so the base URL must start with http://.\n",
"WEBSERVICE_URL = \"http://100.24.236.135:80\"\n",
"PDF_GLOB = \"./5_train/*.pdf\"  # resolved against the kernel's working directory\n",
"REQUEST_TIMEOUT_S = 30  # do not hang forever on an unreachable server\n",
"\n",
"pdf_paths = sorted(glob.glob(PDF_GLOB))\n",
"if not pdf_paths:\n",
"    # Otherwise the loop below silently does nothing when the pattern matches no files.\n",
"    print(f\"No PDF files matched {PDF_GLOB}\", file=sys.stderr)\n",
"\n",
"for pdf_path in pdf_paths:\n",
"    print(f\"Testowanie pliku {pdf_path}\")\n",
"    try:\n",
"        # The with-block closes the upload handle even when the request fails.\n",
"        with open(pdf_path, 'rb') as pdf_file:\n",
"            response = requests.post(WEBSERVICE_URL + \"/invoice\", files={'file': pdf_file}, timeout=REQUEST_TIMEOUT_S)\n",
"        data = response.json()\n",
"        # A valid answer has all three fields, non-empty vat_id and address, and a float total.\n",
"        has_fields = all(key in data for key in (\"vat_id\", \"address\", \"total\"))\n",
"        if has_fields and data[\"vat_id\"] and data[\"address\"] and isinstance(data[\"total\"], float):\n",
"            print(f'OK: {data[\"vat_id\"]}\\t{data[\"address\"]}\\t{data[\"total\"]}')\n",
"        else:\n",
"            print(\"NIEPOPRAWNA ODPOWIEDŹ\")\n",
"    except Exception as exc:\n",
"        # Covers connection errors and non-JSON bodies without swallowing KeyboardInterrupt.\n",
"        print(\"Błędna odpowiedź z serwera (%s):\"%(type(exc)), file=sys.stderr)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.9"
|
||
|
},
|
||
|
"orig_nbformat": 4,
|
||
|
"vscode": {
|
||
|
"interpreter": {
|
||
|
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
|
||
|
}
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|