diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d368aad
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,28 @@
+# The Debian release is pinned because the tessdata path used below
+# (/usr/share/tesseract-ocr/4.00) depends on the distro's Tesseract packaging.
+# NOTE(review): confirm the path still matches whenever this tag is bumped.
+FROM python:3.7-bullseye
+
+# Tesseract engine and language packs. Recommended packages are skipped and the
+# apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update \
+    && apt-get -y install --no-install-recommends \
+        tesseract-ocr \
+        tesseract-ocr-jpn \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /install
+
+# Copies the contents of ./api (not the directory itself) into /install.
+COPY ./api .
+
+# get Polish language train data
+# RUN wget -P /usr/share/tesseract-ocr/4.00/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/4.00/pol.traineddata
+
+RUN mv ./tesseract_data/pol.traineddata /usr/share/tesseract-ocr/4.00/tessdata/
+
+RUN pip3 install --no-cache-dir pipenv_to_requirements gunicorn && \
+    pipenv run pipenv_to_requirements && \
+    pip3 install --no-cache-dir -r requirements.txt
+
+WORKDIR /install
diff --git a/Jenkinsfile b/Jenkinsfile
index 476b3c7..0ac1541 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,10 +1,11 @@
 pipeline {
-    agent any
-
+    // Build the agent image from the Dockerfile in the repository root.
+    agent { dockerfile true }
+
     stages {
         stage('Build') {
             steps {
-                echo 'Building..'
+                sh 'python api/main.py -i api/img/biedra.jpg'
             }
         }
         stage('Test') {
@@ -12,10 +13,5 @@ pipeline {
                 echo 'Testing..'
             }
         }
-        stage('Deploy') {
-            steps {
-                echo 'Deploying....'
- } - } } } \ No newline at end of file diff --git a/api/Pipfile b/api/Pipfile new file mode 100644 index 0000000..7a6bbae --- /dev/null +++ b/api/Pipfile @@ -0,0 +1,16 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +opencv-python = "*" +numpy = "*" +argparse = "*" +pytesseract = "*" +pillow = "*" + +[requires] +python_version = "3.7" diff --git a/api/Pipfile.lock b/api/Pipfile.lock new file mode 100644 index 0000000..1b84ca7 --- /dev/null +++ b/api/Pipfile.lock @@ -0,0 +1,131 @@ +{ + "_meta": { + "hash": { + "sha256": "1fa95346c7318e1652a62ece78fc08ff06679b98ef5b652b2a1cf546c2c9ac64" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "argparse": { + "hashes": [ + "sha256:62b089a55be1d8949cd2bc7e0df0bddb9e028faefc8c32038cc84862aefdd6e4", + "sha256:c31647edb69fd3d465a847ea3157d37bed1f95f19760b11a47aa91c04b666314" + ], + "index": "pypi", + "version": "==1.4.0" + }, + "numpy": { + "hashes": [ + "sha256:0a7a1dd123aecc9f0076934288ceed7fd9a81ba3919f11a855a7887cbe82a02f", + "sha256:0c0763787133dfeec19904c22c7e358b231c87ba3206b211652f8cbe1241deb6", + "sha256:3d52298d0be333583739f1aec9026f3b09fdfe3ddf7c7028cb16d9d2af1cca7e", + "sha256:43bb4b70585f1c2d153e45323a886839f98af8bfa810f7014b20be714c37c447", + "sha256:475963c5b9e116c38ad7347e154e5651d05a2286d86455671f5b1eebba5feb76", + "sha256:64874913367f18eb3013b16123c9fed113962e75d809fca5b78ebfbb73ed93ba", + "sha256:683828e50c339fc9e68720396f2de14253992c495fdddef77a1e17de55f1decc", + "sha256:6ca4000c4a6f95a78c33c7dadbb9495c10880be9c89316aa536eac359ab820ae", + "sha256:75fd817b7061f6378e4659dd792c84c0b60533e867f83e0d1e52d5d8e53df88c", + "sha256:7d81d784bdbed30137aca242ab307f3e65c8d93f4c7b7d8f322110b2e90177f9", + "sha256:8d0af8d3664f142414fd5b15cabfd3b6cc3ef242a3c7a7493257025be5a6955f", + 
"sha256:9679831005fb16c6df3dd35d17aa31dc0d4d7573d84f0b44cc481490a65c7725", + "sha256:a8f67ebfae9f575d85fa859b54d3bdecaeece74e3274b0b5c5f804d7ca789fe1", + "sha256:acbf5c52db4adb366c064d0b7c7899e3e778d89db585feadd23b06b587d64761", + "sha256:ada4805ed51f5bcaa3a06d3dd94939351869c095e30a2b54264f5a5004b52170", + "sha256:c7354e8f0eca5c110b7e978034cd86ed98a7a5ffcf69ca97535445a595e07b8e", + "sha256:e2e9d8c87120ba2c591f60e32736b82b67f72c37ba88a4c23c81b5b8fa49c018", + "sha256:e467c57121fe1b78a8f68dd9255fbb3bb3f4f7547c6b9e109f31d14569f490c3", + "sha256:ede47b98de79565fcd7f2decb475e2dcc85ee4097743e551fe26cfc7eb3ff143", + "sha256:f58913e9227400f1395c7b800503ebfdb0772f1c33ff8cb4d6451c06cabdf316", + "sha256:fe39f5fd4103ec4ca3cb8600b19216cd1ff316b4990f4c0b6057ad982c0a34d5" + ], + "index": "pypi", + "version": "==1.17.4" + }, + "opencv-python": { + "hashes": [ + "sha256:04bec0a6d3a00360a7fb769b755ff4489a4ac8291821b785151f63e6d8bb59ea", + "sha256:1a2d1801c038f055852bd2379186ca8b19b4ea24afb0b8410293bc802211579b", + "sha256:1c7d235faef511aca7669f1aa650897b6c058dfde6412ea3fc58feb0fce78814", + "sha256:22c2ee5f97f85903bfb28c056566b2ecaa1d2f804b880ab39ebf94528a402992", + "sha256:25127990671dc8bd27ae8b880d7a39f9aae863052a8fbebe8977c6ce8e5fc0c9", + "sha256:3cef82b6a1f748d2f4527f5932a86d54ebd10bd89f6cf59b003c36b1015055f7", + "sha256:499a0413e7110a934ab56e635252a4c86f8be64de59f94a62318a7b895dc809e", + "sha256:5f2cf5a0ab244a0a1dbe5ec426c277b55e06ac6a472ad61be77ef643a238cbd3", + "sha256:5fec35916a6b9ce935f2e2806084303fd4e3fbb0c973a8db8f54b5aca54613cb", + "sha256:6183c9c7fab4590e0651bc941cde780988c3ad9889bd62de19d581a6f59523ea", + "sha256:67a236db8db84d7fb0f6e127f360ce6669350ef324839132e22879ec90588dab", + "sha256:6c32d36f52a6e0c02d1ab0bb95223cb4dd5525a7e8292a747116126b3d34c578", + "sha256:73a467a78ffd902d2c0265ab6b2e2cdda423d61b3d08685e0c7d0b4572142ff1", + "sha256:76de8a247970d150b1672c6646cda91217d562682e713721fc9b9bf1434553c4", + 
"sha256:919d5c3ec1a62258ba8c68b869b1056186e2355c4474739b199c295547e66cc1", + "sha256:982d4e80c14356098cde57a6c7d18fe0928a1c3118675bac2252ef38f152e1ab", + "sha256:9d025e6bf2989bcbc7744c26d8bd90c2629a92d8de3ba2416f62ce2a94615dd9", + "sha256:bb59f98205cd81e29f45eed043cf0f98531486dc0b3f671c9e06fecf08f7ccef", + "sha256:c8119248457e909dcd7b598621ed1d139419d69377e8cb4e2b2c49c819de287d", + "sha256:ce7b1f25be04b04f2e678b2bf23a975137f77406dcee66a88a2daeb77cda3e76", + "sha256:d64428bf59ab4d27620b00a2ad6fea2b4d62016a17849c82a7517ec12db97d55", + "sha256:e2ffa3161b8662112f1880734e8b9549d0c9e818e59f652a9d1c5bf31e36586a", + "sha256:e6fc00ac42c800fad5fb3927cfb9bf4e60bb3302cb9805f45b826d5d2546119a", + "sha256:e793df2e12093b3a01006b5b27f321e306193c7a5c9e2a6c8bf652e1ad2d6a86", + "sha256:eae543b3e9253ff702103333aabd87736b5ed5e46ab834d8e0b929f08f494dee", + "sha256:f0af656402b73ead2d9f593c2774c04b01e2d0c63e4f99e0dc2f3fde99be22b4" + ], + "index": "pypi", + "version": "==4.1.2.30" + }, + "pillow": { + "hashes": [ + "sha256:047d9473cf68af50ac85f8ee5d5f21a60f849bc17d348da7fc85711287a75031", + "sha256:0f66dc6c8a3cc319561a633b6aa82c44107f12594643efa37210d8c924fc1c71", + "sha256:12c9169c4e8fe0a7329e8658c7e488001f6b4c8e88740e76292c2b857af2e94c", + "sha256:248cffc168896982f125f5c13e9317c059f74fffdb4152893339f3be62a01340", + "sha256:27faf0552bf8c260a5cee21a76e031acaea68babb64daf7e8f2e2540745082aa", + "sha256:285edafad9bc60d96978ed24d77cdc0b91dace88e5da8c548ba5937c425bca8b", + "sha256:384b12c9aa8ef95558abdcb50aada56d74bc7cc131dd62d28c2d0e4d3aadd573", + "sha256:38950b3a707f6cef09cd3cbb142474357ad1a985ceb44d921bdf7b4647b3e13e", + "sha256:4aad1b88933fd6dc2846552b89ad0c74ddbba2f0884e2c162aa368374bf5abab", + "sha256:4ac6148008c169603070c092e81f88738f1a0c511e07bd2bb0f9ef542d375da9", + "sha256:4deb1d2a45861ae6f0b12ea0a786a03d19d29edcc7e05775b85ec2877cb54c5e", + "sha256:59aa2c124df72cc75ed72c8d6005c442d4685691a30c55321e00ed915ad1a291", + 
"sha256:5a47d2123a9ec86660fe0e8d0ebf0aa6bc6a17edc63f338b73ea20ba11713f12", + "sha256:5cc901c2ab9409b4b7ac7b5bcc3e86ac14548627062463da0af3b6b7c555a871", + "sha256:6c1db03e8dff7b9f955a0fb9907eb9ca5da75b5ce056c0c93d33100a35050281", + "sha256:7ce80c0a65a6ea90ef9c1f63c8593fcd2929448613fc8da0adf3e6bfad669d08", + "sha256:809c19241c14433c5d6135e1b6c72da4e3b56d5c865ad5736ab99af8896b8f41", + "sha256:83792cb4e0b5af480588601467c0764242b9a483caea71ef12d22a0d0d6bdce2", + "sha256:846fa202bd7ee0f6215c897a1d33238ef071b50766339186687bd9b7a6d26ac5", + "sha256:9f5529fc02009f96ba95bea48870173426879dc19eec49ca8e08cd63ecd82ddb", + "sha256:a423c2ea001c6265ed28700df056f75e26215fd28c001e93ef4380b0f05f9547", + "sha256:ac4428094b42907aba5879c7c000d01c8278d451a3b7cccd2103e21f6397ea75", + "sha256:b1ae48d87f10d1384e5beecd169c77502fcc04a2c00a4c02b85f0a94b419e5f9", + "sha256:bf4e972a88f8841d8fdc6db1a75e0f8d763e66e3754b03006cbc3854d89f1cb1", + "sha256:c6414f6aad598364aaf81068cabb077894eb88fed99c6a65e6e8217bab62ae7a", + "sha256:c710fcb7ee32f67baf25aa9ffede4795fd5d93b163ce95fdc724383e38c9df96", + "sha256:c7be4b8a09852291c3c48d3c25d1b876d2494a0a674980089ac9d5e0d78bd132", + "sha256:c9e5ffb910b14f090ac9c38599063e354887a5f6d7e6d26795e916b4514f2c1a", + "sha256:e0697b826da6c2472bb6488db4c0a7fa8af0d52fa08833ceb3681358914b14e5", + "sha256:e9a3edd5f714229d41057d56ac0f39ad9bdba6767e8c888c951869f0bdd129b0" + ], + "index": "pypi", + "version": "==6.2.1" + }, + "pytesseract": { + "hashes": [ + "sha256:ae1dce01413d1f8eb0614fd65d831e26e649dc1a31699b7275455c57aa563b59" + ], + "index": "pypi", + "version": "==0.3.0" + } + }, + "develop": {} +} diff --git a/README.md b/api/README.md similarity index 100% rename from README.md rename to api/README.md diff --git a/api/img/biedra.jpg b/api/img/biedra.jpg new file mode 100644 index 0000000..12498dc Binary files /dev/null and b/api/img/biedra.jpg differ diff --git a/api/main.py b/api/main.py new file mode 100644 index 0000000..b55e082 --- /dev/null +++ b/api/main.py @@ 
-0,0 +1,98 @@
+import argparse
+import cv2
+import os
+import sys
+import re
+import warnings
+import pytesseract
+import numpy as np
+from PIL import Image
+
+# Markers delimiting the item section of the OCR'd receipt text.
+RECEIPT_HEADER = re.compile('PARAGON.*FISKALNY.*')
+RECEIPT_SUMMARY = re.compile('SPRZEDA.*')
+
+
+def parse_args():
+    """Parse the command line and return the options as a dict."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("-i", "--image", required=True, help="Path to the image")
+    return vars(ap.parse_args())
+
+
+def crop_receipt(img):
+    """Return a grayscale crop of the largest-area contour found in ``img``.
+
+    ``img`` is a BGR image as returned by ``cv2.imread``. When edge detection
+    yields no contours the whole grayscale image is returned instead.
+    """
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    # Blur only the copy used for edge detection; the crop stays sharp for OCR.
+    edged = cv2.Canny(cv2.GaussianBlur(gray, (5, 5), 0), 75, 200)
+    contours, _ = cv2.findContours(edged,
+                                   cv2.RETR_LIST,
+                                   cv2.CHAIN_APPROX_SIMPLE)
+    if len(contours) == 0:
+        # max() on an empty sequence raises ValueError.
+        return gray
+    x, y, w, h = cv2.boundingRect(max(contours, key=cv2.contourArea))
+    return gray[y:y+h, x:x+w]
+
+
+def extract_item_lines(text):
+    """Return the lines of ``text`` between the receipt header and summary.
+
+    The last header match and the last summary match win. The slice starts on
+    the line after the header (or after the first line when no header matches)
+    and ends before line ``index_stop - 2``, where ``index_stop`` is the
+    summary line (or the last line when no summary matches).
+    """
+    text_lines = text.split('\n')
+    index_start = 0
+    index_stop = len(text_lines) - 1
+    for i, line in enumerate(text_lines):
+        if RECEIPT_HEADER.match(line):
+            index_start = i
+        if RECEIPT_SUMMARY.match(line):
+            index_stop = i
+    # Clamp at 0 so a small index_stop cannot become a negative index, which
+    # Python would silently interpret as counting from the end of the list.
+    return text_lines[index_start + 1:max(index_stop - 2, 0)]
+
+
+def main():
+    """Crop the receipt from the input image, OCR it and print the item lines."""
+    args = parse_args()
+
+    if not os.path.isfile(args["image"]):
+        print(f"Could not find an image '{args['image']}'")
+        sys.exit(-1)
+
+    img = cv2.imread(args["image"])
+    if img is None:
+        # cv2.imread signals an undecodable file by returning None.
+        print(f"Could not read an image '{args['image']}'")
+        sys.exit(-1)
+
+    img_cut = crop_receipt(img)
+    # img_cut has a single channel, so the conversion must be GRAY2RGB;
+    # COLOR_BGR2RGB only accepts 3- or 4-channel input and raises cv2.error.
+    img_out = cv2.cvtColor(img_cut, cv2.COLOR_GRAY2RGB)
+    text = pytesseract.image_to_string(Image.fromarray(img_out), lang="pol")
+
+    for item_line in extract_item_lines(text):
+        print(item_line)
+        # regex = re.compile("([ A-Za-ząćęłśźż]+).*(\d{1,3},\d{2})[A-E]$")
+        # m = regex.match(item_line)
+        # if m:
+        #     print(item_line, "===>", m.group(1), m.group(2))
+        # else:
+        #     print("skipped!")
+
+    # Debug preview (requires a display):
+    # cv2.imshow("cropped", img_out)
+    # cv2.waitKey(0)
+    # cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/api/tesseract_data/pol.traineddata b/api/tesseract_data/pol.traineddata
new file mode 100644
index
0000000..fba7958 Binary files /dev/null and b/api/tesseract_data/pol.traineddata differ