145 lines
4.7 KiB
Python
145 lines
4.7 KiB
Python
import argparse
|
|
import cv2
|
|
import os
|
|
import sys
|
|
import re
|
|
import warnings
|
|
import pytesseract
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
|
|
def img_to_products(img: Image) -> list:
    """Run the whole receipt pipeline on *img* and return the parsed products.

    The image is cropped by ``preprocessor``, the crop is passed to
    ``get_text`` for OCR, and the recognised text is parsed by
    ``get_products``, whose result is returned unchanged.
    """
    return get_products(get_text(preprocessor(img)))
|
|
|
|
|
|
def img_to_products_debug(img: Image) -> (list, str, Image):
    """Run the receipt pipeline and also return its intermediate results.

    Returns:
        A 3-tuple ``(products, ocr_text, preprocessed_image)`` where
        ``preprocessed_image`` is the output of ``preprocessor``,
        ``ocr_text`` is the output of ``get_text`` on that image and
        ``products`` is the output of ``get_products`` on that text.
    """
    cropped = preprocessor(img)
    recognised = get_text(cropped)
    return get_products(recognised), recognised, cropped
|
|
|
|
|
|
def recognize(img: Image, debug: bool = False) -> Image:
    """Preprocess *img* and discard the result.

    NOTE(review): this looks like an unfinished stub -- ``debug`` is never
    used and the function always returns ``None`` despite the annotated
    return type. Confirm the intended behaviour before relying on it.
    """
    preprocessor(img)
    return None
|
|
|
|
|
|
def preprocessor(img: Image, debug: bool = False) -> Image:
    """Crop *img* to its largest detected contour and return the crop as RGB.

    The image is converted to grayscale, blurred and run through Canny edge
    detection; the bounding rectangle of the contour with the largest area
    selects the region of interest. The crop itself is taken from the
    unblurred grayscale image and expanded to three channels.

    Args:
        img: Image array in BGR channel order (it is passed to
            ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``), e.g. the result of
            ``cv2.imread``.
        debug: When true, show the detected rectangle and the final crop in
            OpenCV windows ("MARK CROP" and "CROPPED"). The caller is
            responsible for ``cv2.waitKey`` / ``cv2.destroyAllWindows``.

    Returns:
        The cropped region as a 3-channel RGB array. If no contour is found
        the whole image is returned (converted the same way).
    """
    # Keep the sharp grayscale image: edges are detected on a blurred copy,
    # but the crop handed on for OCR comes from the unblurred version.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edged = cv2.Canny(blurred, 75, 200)

    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
    # (contours, hierarchy); the last two items are the same in all of them.
    contours, _hierarchy = cv2.findContours(edged.copy(),
                                            cv2.RETR_LIST,
                                            cv2.CHAIN_APPROX_SIMPLE)[-2:]

    if len(contours) == 0:
        # Nothing detected (e.g. a blank image): fall back to the full frame
        # instead of letting max() raise ValueError on an empty sequence.
        x, y = 0, 0
        h, w = gray.shape[:2]
    else:
        max_area_contour = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(max_area_contour)

    if debug:
        box_img = img.copy()
        cv2.rectangle(box_img, (x, y), (x + w, y + h), (0, 0, 255),
                      thickness=2, lineType=8)
        cv2.imshow("MARK CROP", box_img)

    img_cut = gray[y:y + h, x:x + w]
    # The crop is single-channel, so GRAY2RGB is the matching conversion;
    # BGR2RGB expects a 3- or 4-channel input.
    img_out = cv2.cvtColor(img_cut, cv2.COLOR_GRAY2RGB)

    if debug:
        cv2.imshow("CROPPED", img_out)

    return img_out
|
|
|
|
|
|
def get_text(img: Image, debug: bool = False) -> str:
    """Return the text that Tesseract recognises in *img*.

    *img* is wrapped with ``Image.fromarray`` and passed to
    ``pytesseract.image_to_string`` with the config string ``"-l pol"``.
    ``debug`` is accepted but not used.
    """
    pil_image = Image.fromarray(img)
    return pytesseract.image_to_string(pil_image, config="-l pol")
|
|
|
|
|
|
def get_products(ocr_text, debug: bool = False) -> list:
    """Extract product lines from the OCR text of a receipt.

    The text is split into lines. The last line starting with
    ``PARAGON ... FISKALNY`` marks the start of the item section and the
    last line starting with ``SPRZEDA`` marks its end; when a marker is
    missing, the first line / last line index is used instead. Lines from
    the one after the start marker up to (but excluding) the two lines
    before the end marker are matched against the item pattern.

    Args:
        ocr_text: Recognised text, lines separated by ``"\\n"``.
        debug: Accepted for signature parity with the other steps; unused.

    Returns:
        A list of ``(line, name, price)`` tuples, one per matching line.
        ``name`` is the leading run of letters and spaces (trailing space
        included) and ``price`` is the ``"d,dd"`` amount that precedes the
        trailing tax letter ``A``-``E``. Non-matching lines print
        ``"skipped!"`` and are left out.
    """
    header_re = re.compile('PARAGON.*FISKALNY.*')
    footer_re = re.compile('SPRZEDA.*')
    # Raw string so that \d is a regex escape rather than an (invalid) string
    # escape. The name class covers the full Polish alphabet in both cases,
    # and the filler between name and price is matched lazily so that it
    # cannot swallow the leading digits of the price ("12,50" stays "12,50"
    # instead of becoming "2,50").
    item_re = re.compile(
        r"([ A-Za-zĄĆĘŁŃÓŚŹŻąćęłńóśźż]+).*?(\d{1,3},\d{2})[A-E]$"
    )

    out_list = []
    text_lines = ocr_text.split('\n')
    index_start = 0
    index_stop = len(text_lines) - 1

    # The final line is not scanned for markers; index_stop already defaults
    # to it. The last occurrence of each marker wins.
    for i, line in enumerate(text_lines[:-1]):
        if header_re.match(line):
            index_start = i
        if footer_re.match(line):
            index_stop = i

    for item_line in text_lines[index_start + 1: index_stop - 2]:
        m = item_re.match(item_line)
        if m:
            out_list.append((item_line, m.group(1), m.group(2)))
        else:
            print("skipped!")

    return out_list
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: crop the given image, OCR it and parse the
    # recognised text into products. With --show-steps the intermediate
    # images are displayed until a key is pressed.
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True, help="Path to the image")
    ap.add_argument("-s", "--show-steps", required=False,
                    help="Display image on every step", action='store_true')
    args = vars(ap.parse_args())

    if not os.path.isfile(args["image"]):
        print(f"Could not find an image '{args['image']}'")
        sys.exit(-1)

    DEBUG = args["show_steps"]

    img = cv2.imread(args["image"])
    if img is None:
        # cv2.imread reports an unreadable or unsupported file by returning
        # None rather than raising; stop here with a clear message instead
        # of failing later inside the preprocessing step.
        print(f"Could not read an image '{args['image']}'")
        sys.exit(-1)

    img_postproc = preprocessor(img, debug=DEBUG)
    ocr_text = get_text(img_postproc, debug=DEBUG)
    # The parsed list is currently neither printed nor stored.
    product_list = get_products(ocr_text, debug=DEBUG)

    if DEBUG:
        # Keep the debug windows open until a key is pressed.
        cv2.waitKey(0)
        cv2.destroyAllWindows()
|
|
|
|
# out_img = img.copy()
|
|
# gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
# gray = cv2.GaussianBlur(gray, (5, 5), 0)
|
|
# edged = cv2.Canny(gray, 75, 200)
|
|
|
|
# contours, hierarchy = cv2.findContours(edged.copy(),
|
|
# cv2.RETR_LIST,
|
|
# cv2.CHAIN_APPROX_SIMPLE)
|
|
|
|
# max_area_contour = max(contours, key=cv2.contourArea)
|
|
# x, y, w, h = cv2.boundingRect(max_area_contour)
|
|
# # out_img = gray[y:y+h, x:x+w]
|
|
# # ret, out_img = cv2.threshold(gray[y:y+h, x:x+w], 155, 255, cv2.THRESH_TOZERO)
|
|
# img_cut = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[y:y+h, x:x+w]
|
|
# img_out = cv2.cvtColor(img_cut, cv2.COLOR_BGR2RGB)
|
|
|
|
|
|
# text = pytesseract.image_to_string(Image.fromarray(img_out), config="-l pol")
|
|
# text_lines = text.split('\n')
|
|
# index_start = 0
|
|
# index_stop = len(text_lines) - 1
|
|
# for i in range(len(text_lines) - 1):
|
|
# if(re.compile('PARAGON.*FISKALNY.*').match(text_lines[i])):
|
|
# index_start = i
|
|
# if(re.compile('SPRZEDA.*').match(text_lines[i])):
|
|
# index_stop = i
|
|
|
|
# for item_line in text_lines[index_start + 1: index_stop - 2]:
|
|
# print(item_line)
|
|
|
|
|
|
# regex = re.compile("([ A-Za-ząćęłśźż]+).*(\d{1,3},\d{2})[A-E]$")
|
|
# m = regex.match(item_line)
|
|
# if m:
|
|
# print(item_line, "===>", m.group(1), m.group(2))
|
|
# else:
|
|
# print("skipped!")
|
|
|
|
# # cv2.drawContours(out_img, contours, -1, (0, 255, 0), 3)
|
|
# # cv2.rectangle(out_img, (x, y), (x+w, y+h), (0, 0, 255), 2)
|
|
# cv2.imshow("cropped", img_out)
|
|
# # cv2.imshow("Edged", edged)
|
|
# cv2.waitKey(0)
|
|
# cv2.destroyAllWindows()
|