tasks
20
.gitignore
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
# ---> JupyterNotebooks
|
||||
# gitignore template for Jupyter Notebooks
|
||||
# website: http://jupyter.org/
|
||||
|
||||
.ipynb_checkpoints
|
||||
*/.ipynb_checkpoints/*
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# Remove previous ipynb_checkpoints
|
||||
# git rm -r .ipynb_checkpoints/
|
||||
|
||||
datasets/att_faces/
|
||||
datasets/glasses/
|
||||
datasets/INRIAPerson/
|
||||
datasets/yaleextb/
|
||||
vid/gen-*
|
||||
dnn/
|
27
README.md
Normal file
@ -0,0 +1,27 @@
|
||||
# Widzenie komputerowe – materiały do zajęć
|
||||
|
||||
## Cele przedmiotu
|
||||
|
||||
Celem kursu jest nabycie umiejętności wykorzystywania klasycznych i współczesnych algorytmów z dziedziny widzenia komputerowego. Uczestnicy zajęć będą przetwarzać obrazy cyfrowe oraz materiały wideo w celu odwzorowania zadań, do których jest przystosowany system wzrokowy człowieka, np. takich jak rozpoznawanie i śledzenie obiektów. Kurs będzie opierał się na praktycznym wykorzystaniu biblioteki OpenCV.
|
||||
|
||||
## Wymagania wstępne w zakresie wiedzy, umiejętności oraz kompetencji społecznych
|
||||
|
||||
* Umiejętność programowania na poziomie inżyniera informatyki.
|
||||
* Znajomość podstaw uczenia maszynowego.
|
||||
|
||||
## Oprogramowanie
|
||||
|
||||
* Ubuntu 20.04
|
||||
* Python 3.7
|
||||
* OpenCV 4.5.3
|
||||
* FFmpeg 4.1.8
|
||||
|
||||
## Literatura
|
||||
|
||||
* L. Venturi, & K. Korda (2020). Hands-On Vision and Behavior for Self-Driving Cars. Packt Publishing.
|
||||
* D.M. Escriva, & R. Laganiere (2019). OpenCV 4 Computer Vision Application Programming Cookbook (wyd. 4). Packt Publishing.
|
||||
* A.F. Villan (2019). Mastering OpenCV 4 with Python. Packt Publishing.
|
||||
* E.R. Davies (2017). Computer Vision: Principles, Algorithms, Applications, Learning (wyd. 5). Academic Press.
|
||||
* R. Szeliski (2021). Computer Vision: Algorithms and Applications. Springer-Verlag.
|
||||
* D. Fouhey, & J. Johnson (2021). EECS 442: Computer Vision. University of Michigan.
|
||||
* S. Mallick (2021). LearnOpenCV.
|
137
blink_detection_lab8.py
Normal file
@ -0,0 +1,137 @@
|
||||
# import the necessary packages
|
||||
from scipy.spatial import distance as dist
|
||||
from imutils.video import FileVideoStream
|
||||
from imutils.video import VideoStream
|
||||
from imutils import face_utils
|
||||
import numpy as np
|
||||
import argparse
|
||||
import imutils
|
||||
import time
|
||||
import dlib
|
||||
import cv2
|
||||
|
||||
|
||||
def eye_aspect_ratio(eye):
    """Return the eye aspect ratio (EAR) for one eye.

    The ratio is the sum of the two vertical landmark distances divided by
    twice the horizontal landmark distance.

    Args:
        eye: indexable holding at least six (x, y) landmark points. Points
            1/5 and 2/4 are used as the two vertical pairs and points 0/3
            as the horizontal pair.

    Returns:
        The value ``(A + B) / (2.0 * C)``, where ``A`` and ``B`` are the
        vertical distances and ``C`` is the horizontal distance.
    """
    # euclidean distances between the two sets of vertical eye landmarks
    vertical_outer = dist.euclidean(eye[1], eye[5])
    vertical_inner = dist.euclidean(eye[2], eye[4])
    # euclidean distance between the horizontal eye landmarks
    horizontal = dist.euclidean(eye[0], eye[3])
    return (vertical_outer + vertical_inner) / (2.0 * horizontal)
|
||||
|
||||
|
||||
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--shape-predictor", required=True,
                help="path to facial landmark predictor")
ap.add_argument("-v", "--video", type=str, default="",
                help="path to input video file")
args = vars(ap.parse_args())

# an eye aspect ratio below EYE_AR_THRESH is treated as a closed eye; the
# ratio must stay below it for EYE_AR_CONSEC_FRAMES consecutive frames
# before a blink is counted
EYE_AR_THRESH = 0.3
EYE_AR_CONSEC_FRAMES = 3

# COUNTER: consecutive frames seen so far with the ratio below the threshold
# TOTAL: number of blinks counted so far
COUNTER = 0
TOTAL = 0

# initialize dlib's face detector (HOG-based) and then create
# the facial landmark predictor
print("[INFO] loading facial landmark predictor...")
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(args["shape_predictor"])

# index ranges of the left and right eye inside the landmark array
(lStart, lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
(rStart, rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]

# start the video stream thread
print("[INFO] starting video stream thread...")
if args["video"]:
    vs = FileVideoStream(args["video"]).start()
    fileStream = True
else:
    # --video defaults to "", and a file stream over an empty path has
    # nothing to read, so fall back to the first attached camera instead
    vs = VideoStream(src=0).start()
    fileStream = False
# give the reader thread a moment to start before frames are requested
time.sleep(1.0)
|
||||
|
||||
|
||||
|
||||
# loop over frames from the video stream; try/finally makes sure the blink
# total is reported and the window and stream are released however the loop
# ends (end of input, `q` key, Ctrl+C or an unexpected error), and lets real
# errors surface instead of being silently swallowed
# NOTE(review): nesting below was reconstructed - the extracted text carried
# no indentation; confirm the per-face section against the working script
try:
    while True:
        # if this is a file video stream, check whether there are any
        # more frames left in the buffer to process
        if fileStream and not vs.more():
            break

        frame = vs.read()
        # NOTE(review): presumably read() can hand back None once the
        # source has nothing more to deliver - confirm against imutils
        if frame is None:
            break

        # resize the frame and convert it to grayscale
        frame = imutils.resize(frame, width=450)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # detect faces in the grayscale frame
        rects = detector(gray, 0)

        for rect in rects:
            # determine the facial landmarks for the face region, then
            # convert the landmark (x, y)-coordinates to a NumPy array
            shape = predictor(gray, rect)
            shape = face_utils.shape_to_np(shape)

            # extract the left and right eye coordinates, then use them
            # to compute the eye aspect ratio for both eyes
            leftEye = shape[lStart:lEnd]
            rightEye = shape[rStart:rEnd]
            leftEAR = eye_aspect_ratio(leftEye)
            rightEAR = eye_aspect_ratio(rightEye)

            # average the eye aspect ratio together for both eyes
            ear = (leftEAR + rightEAR) / 2.0

            # visualize each of the eyes
            leftEyeHull = cv2.convexHull(leftEye)
            rightEyeHull = cv2.convexHull(rightEye)
            cv2.drawContours(frame, [leftEyeHull], -1, (0, 255, 0), 1)
            cv2.drawContours(frame, [rightEyeHull], -1, (0, 255, 0), 1)

            if ear < EYE_AR_THRESH:
                # ratio below the blink threshold: extend the current run
                COUNTER += 1
            else:
                # if the eyes were closed for a sufficient number of
                # frames, then increment the total number of blinks
                if COUNTER >= EYE_AR_CONSEC_FRAMES:
                    TOTAL += 1
                # reset the eye frame counter
                COUNTER = 0

            # draw the total number of blinks on the frame along with
            # the computed eye aspect ratio for the frame
            cv2.putText(frame, "Blinks: {}".format(TOTAL), (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
            cv2.putText(frame, "EAR: {:.2f}".format(ear), (300, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        # show the frame
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF

        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break
finally:
    # report the result and do a bit of cleanup
    print(f"BLINKS COUNT: {TOTAL}")
    cv2.destroyAllWindows()
    vs.stop()
|
BIN
datasets/att_faces.zip
Normal file
BIN
datasets/glasses.zip
Normal file
BIN
datasets/inria-person-sub.zip
Normal file
BIN
datasets/yaleextb.zip
Normal file
32207
get-pip.py
Normal file
BIN
img/aitech-logotyp-1.jpg
Normal file
After Width: | Height: | Size: 76 KiB |
BIN
img/aitech-logotyp-2.jpg
Normal file
After Width: | Height: | Size: 63 KiB |
BIN
img/app-billboard.png
Normal file
After Width: | Height: | Size: 856 KiB |
BIN
img/baboon-3d.png
Normal file
After Width: | Height: | Size: 55 KiB |
BIN
img/baboon.png
Normal file
After Width: | Height: | Size: 190 KiB |
BIN
img/bakery.jpg
Normal file
After Width: | Height: | Size: 641 KiB |
BIN
img/bear-mask.jpg
Normal file
After Width: | Height: | Size: 14 KiB |
BIN
img/bear.jpg
Normal file
After Width: | Height: | Size: 113 KiB |
BIN
img/billboards.jpg
Normal file
After Width: | Height: | Size: 417 KiB |
BIN
img/binary-image.png
Normal file
After Width: | Height: | Size: 13 KiB |
BIN
img/blobs.png
Normal file
After Width: | Height: | Size: 141 KiB |
BIN
img/boat_1.jpg
Normal file
After Width: | Height: | Size: 718 KiB |
BIN
img/boat_2.jpg
Normal file
After Width: | Height: | Size: 649 KiB |
BIN
img/bologna-on-wall.jpg
Normal file
After Width: | Height: | Size: 102 KiB |
BIN
img/bologna.png
Normal file
After Width: | Height: | Size: 263 KiB |
BIN
img/book-python-cover.jpg
Normal file
After Width: | Height: | Size: 49 KiB |
BIN
img/book-python-in-hands.jpg
Normal file
After Width: | Height: | Size: 46 KiB |
BIN
img/caribou.jpg
Normal file
After Width: | Height: | Size: 303 KiB |
BIN
img/coins.png
Normal file
After Width: | Height: | Size: 273 KiB |
BIN
img/coins_detected.png
Normal file
After Width: | Height: | Size: 144 KiB |
BIN
img/document-alignment.png
Normal file
After Width: | Height: | Size: 706 KiB |
BIN
img/flamingo.jpg
Normal file
After Width: | Height: | Size: 263 KiB |
BIN
img/football-multi.png
Normal file
After Width: | Height: | Size: 298 KiB |
BIN
img/grabcut-result.png
Normal file
After Width: | Height: | Size: 545 KiB |
BIN
img/highgui-baboon.png
Normal file
After Width: | Height: | Size: 132 KiB |
BIN
img/highgui-canny.png
Normal file
After Width: | Height: | Size: 94 KiB |
BIN
img/hsv-cylinder.png
Normal file
After Width: | Height: | Size: 158 KiB |
BIN
img/hv-histograms.jpg
Normal file
After Width: | Height: | Size: 100 KiB |
BIN
img/kitty.png
Normal file
After Width: | Height: | Size: 10 KiB |
BIN
img/lena-grayscale.png
Normal file
After Width: | Height: | Size: 47 KiB |
BIN
img/lena.png
Normal file
After Width: | Height: | Size: 464 KiB |
BIN
img/lincoln-mask.jpg
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
img/lincoln.jpg
Normal file
After Width: | Height: | Size: 49 KiB |
BIN
img/linux_foundation.png
Normal file
After Width: | Height: | Size: 6.8 KiB |
BIN
img/man-at-desk-results.jpg
Normal file
After Width: | Height: | Size: 679 KiB |
BIN
img/man-at-desk.jpg
Normal file
After Width: | Height: | Size: 110 KiB |
BIN
img/man-with-pipe.png
Normal file
After Width: | Height: | Size: 194 KiB |
BIN
img/man-without-pipe.png
Normal file
After Width: | Height: | Size: 190 KiB |
BIN
img/messi5.jpg
Normal file
After Width: | Height: | Size: 71 KiB |
BIN
img/mexico-meme.jpg
Normal file
After Width: | Height: | Size: 278 KiB |
BIN
img/noisy-lena.png
Normal file
After Width: | Height: | Size: 592 KiB |
BIN
img/panorama-manual.png
Normal file
After Width: | Height: | Size: 198 KiB |
BIN
img/park-results.png
Normal file
After Width: | Height: | Size: 664 KiB |
BIN
img/park-to-eq.jpg
Normal file
After Width: | Height: | Size: 306 KiB |
BIN
img/park.jpg
Normal file
After Width: | Height: | Size: 357 KiB |
BIN
img/parrot.jpg
Normal file
After Width: | Height: | Size: 90 KiB |
BIN
img/pedestrians.jpg
Normal file
After Width: | Height: | Size: 519 KiB |
BIN
img/people.jpg
Normal file
After Width: | Height: | Size: 364 KiB |
BIN
img/pipe.png
Normal file
After Width: | Height: | Size: 26 KiB |
BIN
img/poker-chips.jpg
Normal file
After Width: | Height: | Size: 237 KiB |
BIN
img/rgb-colors.png
Normal file
After Width: | Height: | Size: 11 KiB |
BIN
img/rgb-cube.png
Normal file
After Width: | Height: | Size: 98 KiB |
BIN
img/road-lanes-detected.png
Normal file
After Width: | Height: | Size: 602 KiB |
BIN
img/road-lanes-mask.jpg
Normal file
After Width: | Height: | Size: 13 KiB |
BIN
img/road-lanes.jpg
Normal file
After Width: | Height: | Size: 83 KiB |
BIN
img/road-sign.jpg
Normal file
After Width: | Height: | Size: 131 KiB |
BIN
img/runners.jpg
Normal file
After Width: | Height: | Size: 415 KiB |
BIN
img/selfie-background.jpg
Normal file
After Width: | Height: | Size: 327 KiB |
BIN
img/selfie-man.jpg
Normal file
After Width: | Height: | Size: 278 KiB |
BIN
img/selfie-out.png
Normal file
After Width: | Height: | Size: 380 KiB |
BIN
img/shapes.png
Normal file
After Width: | Height: | Size: 9.2 KiB |
BIN
img/soyjaks-final.png
Normal file
After Width: | Height: | Size: 381 KiB |
BIN
img/soyjaks.jpg
Normal file
After Width: | Height: | Size: 383 KiB |
BIN
img/st-louis-arch-0.033.jpg
Normal file
After Width: | Height: | Size: 881 KiB |
BIN
img/st-louis-arch-0.25.jpg
Normal file
After Width: | Height: | Size: 1.4 MiB |
BIN
img/st-louis-arch-15.jpg
Normal file
After Width: | Height: | Size: 2.3 MiB |
BIN
img/st-louis-arch-2.5.jpg
Normal file
After Width: | Height: | Size: 2.2 MiB |
BIN
img/swimmingpool.jpg
Normal file
After Width: | Height: | Size: 906 KiB |
BIN
img/text_no_ocr.png
Normal file
After Width: | Height: | Size: 42 KiB |
BIN
img/text_no_ocr_sample_result.png
Normal file
After Width: | Height: | Size: 69 KiB |
BIN
img/texture.jpg
Normal file
After Width: | Height: | Size: 507 KiB |
BIN
img/x_small.jpg
Normal file
After Width: | Height: | Size: 943 B |
BIN
vid/bike.mp4
Normal file
BIN
vid/blinking-man.mp4
Normal file
BIN
vid/blinking-woman1.mp4
Normal file
BIN
vid/blinking-woman2.mp4
Normal file
BIN
vid/football.mp4
Normal file
BIN
vid/protest.mp4
Normal file
2012
wko-01.ipynb
Normal file
1343
wko-02.ipynb
Normal file
1240
wko-03.ipynb
Normal file
586
wko-04.ipynb
Normal file
1434
wko-05.ipynb
Normal file
860
wko-06.ipynb
Normal file
920
wko-07.ipynb
Normal file
459
wko-08.ipynb
Normal file
@ -0,0 +1,459 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "909d3c02",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](img/aitech-logotyp-1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Widzenie komputerowe </h1>\n",
|
||||
"<h2> 08. <i>Rozpoznawanie twarzy</i> [laboratoria]</h2> \n",
|
||||
"<h3>Andrzej Wójtowicz (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](img/aitech-logotyp-2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7a9fde6b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W poniższych materiałach zaprezentujemy klasyczne metody rozpoznawania twarzy. Opisywane zagadnienia można odnaleźć w *5.2.3 Principal component analysis* R. Szeliski (2022) *Computer Vision: Algorithms and Applications* oraz [dokumentacji](https://docs.opencv.org/4.5.3/da/d60/tutorial_face_main.html).\n",
|
||||
"\n",
|
||||
"Na początku załadujmy niezbędne biblioteki."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1d86977a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cv2 as cv\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"%matplotlib inline\n",
|
||||
"import sklearn.metrics\n",
|
||||
"import ipywidgets\n",
|
||||
"import os\n",
|
||||
"import random"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c5a62135",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Rozpakujmy zbiór danych, na którym będziemy pracować:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0e0f1723",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!cd datasets && unzip -qo yaleextb.zip"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e6a0efb1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nasz zbiór zawiera po kilkadziesiąt zdjęć kilkudziesięciu osób, które zostały sfotografowane w różnych warunkach oświetlenia. Wczytane zdjęcia podzielimy na zbiór treningowy i testowy w stosunku 3/1 oraz wyświetlimy kilka przykładowych zdjęć:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b775bbf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset_dir = \"datasets/yaleextb\"\n",
|
||||
"\n",
|
||||
"img_data = []\n",
|
||||
"img_labels = []\n",
|
||||
"\n",
|
||||
"images = os.listdir(dataset_dir)\n",
|
||||
"\n",
|
||||
"n_examples = 15\n",
|
||||
"\n",
|
||||
"for i in range(1, 40):\n",
|
||||
" i_str = str(i).zfill(2)\n",
|
||||
" images_p = [img for img in images if img.startswith(f\"yaleB{i_str}\")]\n",
|
||||
" \n",
|
||||
" for img in images_p[:n_examples]:\n",
|
||||
" img_data.append(cv.imread(f\"{dataset_dir}/{img}\", cv.IMREAD_GRAYSCALE))\n",
|
||||
" img_labels.append(i)\n",
|
||||
"\n",
|
||||
"random.seed(1337)\n",
|
||||
"selector = random.choices([False, True], k=len(images), weights=[3, 1])\n",
|
||||
"train_data = [x for x, y in zip(img_data, selector) if not y]\n",
|
||||
"train_labels = [x for x, y in zip(img_labels, selector) if not y]\n",
|
||||
"test_data = [x for x, y in zip(img_data, selector) if y]\n",
|
||||
"test_labels = [x for x, y in zip(img_labels, selector) if y]\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(12,5))\n",
|
||||
"for i in range(4):\n",
|
||||
" plt.subplot(251 + i)\n",
|
||||
" plt.imshow(train_data[i], cmap='gray');\n",
|
||||
"for i in range(4):\n",
|
||||
" plt.subplot(256 + i)\n",
|
||||
" plt.imshow(train_data[-i-20], cmap='gray');"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6e315630",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Pierwszym modelem jest *Eigenfaces* zaimplementowany w [`EigenFaceRecognizer`](https://docs.opencv.org/4.5.3/dd/d7c/classcv_1_1face_1_1EigenFaceRecognizer.html). Główny pomysł polega na użyciu PCA do redukcji wymiarów. W naszym przykładzie zachowamy 60 wektorów własnych."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0473c8ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = cv.face.EigenFaceRecognizer_create(60)\n",
|
||||
"model.train(np.array(train_data), np.array(train_labels))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7a753f2d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zachowane wektory własne możemy zwizualizować:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f797fe86",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"img_shape = train_data[0].shape\n",
|
||||
"plt.figure(figsize=(12,5))\n",
|
||||
"for i in range(5):\n",
|
||||
" e_v = model.getEigenVectors()[:,i]\n",
|
||||
" e_v = np.reshape(e_v, img_shape)\n",
|
||||
"\n",
|
||||
" plt.subplot(151+i)\n",
|
||||
" plt.imshow(e_v, cmap='gray');"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "19545151",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Możemy zobaczyć jakie potencjalne twarze znajdują się w naszej przestrzeni. Do *uśrednionej* twarzy dodajemy kolejne wektory własne z odpowiednimi wagami. Poniżej mamy przykład wykorzystujący 6 wektorów:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5265f337",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mean = model.getMean()\n",
|
||||
"W = model.getEigenVectors()\n",
|
||||
"\n",
|
||||
"def generate_face(**args):\n",
|
||||
" img = mean.copy()\n",
|
||||
" for i, k in enumerate(args.keys()):\n",
|
||||
" img = np.add(img, W[:,i]*(10*args[k]))\n",
|
||||
" \n",
|
||||
" img = np.reshape(img, img_shape)\n",
|
||||
" plt.figure(figsize=(5,5))\n",
|
||||
" plt.imshow(img, cmap='gray')\n",
|
||||
" plt.show()\n",
|
||||
" \n",
|
||||
"ipywidgets.interactive(generate_face, \n",
|
||||
" w_0=ipywidgets.IntSlider(min=-128, max=128),\n",
|
||||
" w_1=ipywidgets.IntSlider(min=-128, max=128),\n",
|
||||
" w_2=ipywidgets.IntSlider(min=-128, max=128),\n",
|
||||
" w_3=ipywidgets.IntSlider(min=-128, max=128),\n",
|
||||
" w_4=ipywidgets.IntSlider(min=-128, max=128),\n",
|
||||
" w_5=ipywidgets.IntSlider(min=-128, max=128))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fd4bdce6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Możemy teraz spróbować zrobić rekonstrukcję np. pierwszej twarzy ze zbioru treningowego. Pobieramy dla niej projekcje (wagi) z naszego modelu i podobnie jak wyżej wykorzystujemy uśrednioną twarz i wektory własne. Możemy zobaczyć, że użycie większej liczby wektorów powoduje zwiększenie precyzji rekonstrukcji:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2619c6f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pro = model.getProjections()[0]\n",
|
||||
"\n",
|
||||
"def reconstruct_face(k):\n",
|
||||
" img = mean.copy()\n",
|
||||
"\n",
|
||||
" for i in range(k):\n",
|
||||
" img = np.add(img, W[:,i]*pro[0,i])\n",
|
||||
" \n",
|
||||
" return img\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(12,6))\n",
|
||||
"for i in range(6):\n",
|
||||
" k = (i+1)*10\n",
|
||||
" r_face = np.reshape(reconstruct_face(k), img_shape)\n",
|
||||
" j = 0 if i <= 4 else 10\n",
|
||||
" plt.subplot(151+i+100)\n",
|
||||
" plt.imshow(r_face, cmap='gray')\n",
|
||||
" plt.title(f\"k = {k}\")\n",
|
||||
" \n",
|
||||
"plt.subplot(257)\n",
|
||||
"plt.imshow(train_data[0], cmap='gray');\n",
|
||||
"plt.title(\"original\");"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae87277a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Spróbujmy teraz odnaleźć osoby znajdujące się na dwóch przykładowych obrazach ze zbioru testowego. Dla nieznanej twarzy obliczamy projekcje i szukamy metodą najbliższego sąsiada projekcji ze zbioru treningowego. Poniżej mamy przykład z poprawnym rozpoznaniem osoby oraz z niepoprawnym rozpoznaniem:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "828f3134",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_face(query_id):\n",
|
||||
" query_face = test_data[query_id]\n",
|
||||
" query_label = test_labels[query_id]\n",
|
||||
"\n",
|
||||
" x = np.reshape(query_face, mean.shape)\n",
|
||||
" x_coeff = np.dot(x - mean, W)\n",
|
||||
"\n",
|
||||
" best_face = None\n",
|
||||
" best_label = None\n",
|
||||
" best_dist = float('inf')\n",
|
||||
"\n",
|
||||
" for i, p in enumerate(model.getProjections()):\n",
|
||||
" dist = np.linalg.norm(np.reshape(p, 60) - np.reshape(x_coeff, 60))\n",
|
||||
"\n",
|
||||
" if dist < best_dist:\n",
|
||||
" best_face = train_data[i]\n",
|
||||
" best_label = train_labels[i]\n",
|
||||
" best_dist = dist\n",
|
||||
" \n",
|
||||
" return query_face, query_label, best_face, best_label\n",
|
||||
"\n",
|
||||
"qf_1, ql_1, bf_1, bl_1 = find_face(45)\n",
|
||||
"qf_2, ql_2, bf_2, bl_2 = find_face(10)\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(8,11))\n",
|
||||
"plt.subplot(221)\n",
|
||||
"plt.imshow(qf_1, cmap='gray')\n",
|
||||
"plt.title(f\"Face 1: query label = {ql_1}\")\n",
|
||||
"plt.subplot(222)\n",
|
||||
"plt.imshow(bf_1, cmap='gray');\n",
|
||||
"plt.title(f\"Face 1: best label = {bl_1}\")\n",
|
||||
"plt.subplot(223)\n",
|
||||
"plt.imshow(qf_2, cmap='gray')\n",
|
||||
"plt.title(f\"Face 2: query label = {ql_2}\")\n",
|
||||
"plt.subplot(224)\n",
|
||||
"plt.imshow(bf_2, cmap='gray');\n",
|
||||
"plt.title(f\"Face 2: best label = {bl_2}\");"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43f9a8e5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Bardziej kompaktowe wykonanie predykcji możemy uzyskać poprzez metodę `predict()`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bf736bdd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(test_labels[45], model.predict(test_data[45])[0])\n",
|
||||
"print(test_labels[10], model.predict(test_data[10])[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eeaf62b5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Jak widać poniżej, metoda ta nie uzyskuje szczególnie zadowalających wyników (generalnie słabo sobie radzi w sytuacji zmian oświetlenia):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12c65438",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = []\n",
|
||||
"for test_img in test_data:\n",
|
||||
" p_label, p_conf = model.predict(test_img)\n",
|
||||
" predictions.append(p_label)\n",
|
||||
" \n",
|
||||
"print(f\"Accuracy: {sklearn.metrics.accuracy_score(test_labels, predictions) * 100:.2f} %\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea5d879b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Poniżej krótko zaprezentujemy jeszcze dwa rozwinięcia tego algorytmu. Pierwszym z nich jest *Fisherfaces* zaimplementowany w [`FisherFaceRecognizer`](https://docs.opencv.org/4.5.3/d2/de9/classcv_1_1face_1_1FisherFaceRecognizer.html). Tym razem przy pomocy LDA chcemy dodatkowo uwzględnić rozrzut pomiędzy klasami (por. [przykład](https://sthalles.github.io/fisher-linear-discriminant/)). Poniżej tworzymy model z 40 komponentami:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4eb5b746",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = cv.face.FisherFaceRecognizer_create(40)\n",
|
||||
"model.train(np.array(train_data), np.array(train_labels))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e9f334be",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zauważmy, że uzyskujemy tutaj ponad dwukrotnie lepszy wynik:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "96faa192",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = []\n",
|
||||
"for test_img in test_data:\n",
|
||||
" p_label, p_conf = model.predict(test_img)\n",
|
||||
" predictions.append(p_label)\n",
|
||||
" \n",
|
||||
"print(f\"Accuracy: {sklearn.metrics.accuracy_score(test_labels, predictions) * 100:.2f} %\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "02220e5f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Dalszym rozwinięciem jest model *Local Binary Patterns Histograms* (LBPH) zaimplementowany w [`LBPHFaceRecognizer`](https://docs.opencv.org/4.5.3/df/d25/classcv_1_1face_1_1LBPHFaceRecognizer.html). W tym wypadku chcemy np. uwzględnić możliwość innego oświetlenia osób niż takie, które występuje w naszym zbiorze treningowym. Podobnie jak wcześniej zależy nam na redukcji wymiarów, ale tym razem uzyskamy to poprzez wyliczanie cech (progowanie) dla poszczególnych pikseli w zadanych regionach."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "61eeffdf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = cv.face.LBPHFaceRecognizer_create(radius=10, neighbors=10, grid_x=32, grid_y=32)\n",
|
||||
"model.train(np.array(train_data), np.array(train_labels))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0d64cb5a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Uzyskany wynik jest o kilka punktów procentowych lepszy od poprzedniego modelu, jednak możemy zauważyć, że zmiana domyślnych parametrów na takie, które zwiększają precyzję, powoduje również zwiększenie czasu potrzebnego na wykonanie predykcji:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ca2e319d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = []\n",
|
||||
"for test_img in test_data:\n",
|
||||
" p_label, p_conf = model.predict(test_img)\n",
|
||||
" predictions.append(p_label)\n",
|
||||
" \n",
|
||||
"print(f\"Accuracy: {sklearn.metrics.accuracy_score(test_labels, predictions) * 100:.2f} %\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "00196405",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Zadanie 1\n",
|
||||
"\n",
|
||||
"W katalogu `datasets` znajduje się zbiór zdjęć `att_faces`. Sprawdź jakiego typu są to zdjęcia oraz jak powyższe algorytmy działają na tym zbiorze."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51b8a256",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# miejsce na eksperymenty"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Andrzej Wójtowicz",
|
||||
"email": "andre@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
},
|
||||
"subtitle": "08. Rozpoznawanie twarzy [laboratoria]",
|
||||
"title": "Widzenie komputerowe",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
556
wko-08_att_faces.ipynb
Normal file
790
wko-09.ipynb
Normal file
840
wko-10.ipynb
Normal file
@ -0,0 +1,840 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3c8a4b52",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](img/aitech-logotyp-1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Widzenie komputerowe </h1>\n",
|
||||
"<h2> 10. <i>Metody głębokiego uczenia (2)</i> [laboratoria]</h2> \n",
|
||||
"<h3>Andrzej Wójtowicz (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](img/aitech-logotyp-2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "783d6d64",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W poniższym materiale zobaczymy w jaki sposób korzystać z wytrenowanych modeli sieci neuronowych w zagadnieniach związanych z wykrywaniem wielu obiektów, szacowaniem pozy człowieka, wykrywaniem i rozpoznawaniem tekstu oraz super rozdzielczością.\n",
|
||||
"\n",
|
||||
"Uwaga: realizacja poniższych treści będzie wymagała pobrania ok. 700 MB danych.\n",
|
||||
"\n",
|
||||
"Na początku załadujmy niezbędne biblioteki:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ef18510f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cv2 as cv\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e45afc56",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Pobrane pliki będziemy zapisywać w katalogu `dnn`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7aac31ef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p dnn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f792eb4f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Wykrywanie obiektów\n",
|
||||
"\n",
|
||||
"## SSD\n",
|
||||
"\n",
|
||||
"W poprzednich materiałach korzystaliśmy z [SSD](https://arxiv.org/pdf/1512.02325.pdf) do wykrywania wielu twarzy na zdjęciu. W poniższym przykładzie możemy zobaczyć jego użycie do wykrywania wielu obiektów - sieć została wytrenowana na zbiorze [Common Objects in Context](https://cocodataset.org/) (COCO). Użyjemy modelu dostępnego dla frameworku [Tensorflow](https://github.com/tensorflow/models/tree/master/research/object_detection) (inne modele możemy znaleźć w [Detection Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md)):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aa10b6fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget -q --show-progress -O dnn/ssd_mobilenet_v2_coco_2018_03_29.tar.gz http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz\n",
|
||||
"!cd dnn && tar xzf ssd_mobilenet_v2_coco_2018_03_29.tar.gz && rm ssd_mobilenet_v2_coco_2018_03_29.tar.gz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "99ec1efa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Pobraliśmy model i generujemy konfigurację:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eac9a8da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget -q --show-progress -O dnn/ssd_mobilenet_v2_coco_2018_03_29/tf_text_graph_ssd.py https://raw.githubusercontent.com/opencv/opencv/4.5.3/samples/dnn/tf_text_graph_ssd.py\n",
|
||||
"!wget -q --show-progress -O dnn/ssd_mobilenet_v2_coco_2018_03_29/tf_text_graph_common.py https://raw.githubusercontent.com/opencv/opencv/4.5.3/samples/dnn/tf_text_graph_common.py\n",
|
||||
"!cd dnn/ssd_mobilenet_v2_coco_2018_03_29 && python3 tf_text_graph_ssd.py --input frozen_inference_graph.pb --output net.pbtxt --config pipeline.config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "232e2987",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wczytujemy model:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9b4180e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = cv.dnn.readNetFromTensorflow(\"dnn/ssd_mobilenet_v2_coco_2018_03_29/frozen_inference_graph.pb\",\n",
|
||||
" \"dnn/ssd_mobilenet_v2_coco_2018_03_29/net.pbtxt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0bbfd2a4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Pobieramy i wczytujemy etykiety klas obiektów:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "17335a42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget -q -O - https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/data/mscoco_complete_label_map.pbtxt | grep display_name | grep -o '\".*\"' | tr -d '\"' > dnn/ssd_mobilenet_v2_coco_2018_03_29/coco-labels.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "662e1a33",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('dnn/ssd_mobilenet_v2_coco_2018_03_29/coco-labels.txt', 'r') as f_fd:\n",
|
||||
" classes = f_fd.read().splitlines()\n",
|
||||
" \n",
|
||||
"print(len(classes), classes[:5])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "94cace8a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Spróbujemy sprawdzić jakie obiekty znajdują się na poniższym zdjęciu:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "91834aba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image = cv.imread('img/messi5.jpg')\n",
|
||||
"plt.figure(figsize=[7,7])\n",
|
||||
"plt.imshow(image[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43774ae3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Sieć zwraca nam listę obiektów z oznaczeniem współrzędnych na zdjęciu oraz identyfikatorem obiektu (ustawiliśmy próg odcięcia na 0.5):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "84652c91",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"height, width, _ = image.shape\n",
|
||||
"\n",
|
||||
"image_blob = cv.dnn.blobFromImage(image=image, scalefactor=1, size=(300, 300), mean=(0,0,0), \n",
|
||||
" swapRB=True, crop=False)\n",
|
||||
"\n",
|
||||
"model.setInput(image_blob)\n",
|
||||
"detections = model.forward()\n",
|
||||
"\n",
|
||||
"image_out = image.copy()\n",
|
||||
"\n",
|
||||
"for i in range(detections.shape[2]):\n",
|
||||
" confidence = detections[0, 0, i, 2]\n",
|
||||
" if confidence > 0.5:\n",
|
||||
"\n",
|
||||
" box = detections[0, 0, i, 3:7] * np.array([width, height, width, height])\n",
|
||||
" (x1, y1, x2, y2) = box.astype('int')\n",
|
||||
" \n",
|
||||
" class_id = int(detections[0, 0, i, 1])\n",
|
||||
"\n",
|
||||
" cv.rectangle(image_out, (x1, y1), (x2, y2), (0, 255, 0), 6)\n",
|
||||
" label = '{:} ({:.3f})'.format(classes[class_id], confidence)\n",
|
||||
" label_size, base_line = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.65, 1)\n",
|
||||
" cv.rectangle(image_out, (x1, y1 - label_size[1]), (x1 + label_size[0], y1 + base_line), \n",
|
||||
" (255, 255, 255), cv.FILLED)\n",
|
||||
" cv.putText(image_out, label, (x1, y1), cv.FONT_HERSHEY_SIMPLEX, 0.65, (0, 0, 0))\n",
|
||||
" \n",
|
||||
"plt.figure(figsize=[12,12])\n",
|
||||
"plt.imshow(image_out[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3fa16e91",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## YOLOv4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "27ce3522",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Innym popularnym modelem do wykrywania obiektów jest [You Only Look Once](https://github.com/AlexeyAB/darknet) (YOLO). Porównując YOLO do innych sieci, model ten nie analizuje poszczególnych regionów, ale patrzy na obraz całościowo, co w pewien sposób stanowi balans między szybkością a precyzją. Ze względu na tę cechę model ten dobrze nadaje się do wykrywania obiektów w czasie rzeczywistym. Model powinien dobrze sobie radzić gdy zostanie mu przedstawiona nieznana wcześniej reprezentacja obiektu (np. zacieniony) lub gdy obiekt znajduje się w otoczeniu innych nieoczekiwanych obiektów.\n",
|
||||
"\n",
|
||||
"YOLO jest dostępne w kilku wersjach, natomiast my sprawdzimy jak sobie radzi wersja kompaktowa:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c3e7fb1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p dnn/yolo_v4_tiny\n",
|
||||
"!wget -q --show-progress -O dnn/yolo_v4_tiny/yolov4-tiny.weights https://github.com/AlexeyAB/darknet/releases/download/yolov4/yolov4-tiny.weights\n",
|
||||
"!wget -q --show-progress -O dnn/yolo_v4_tiny/yolov4-tiny.cfg https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg\n",
|
||||
"!wget -q --show-progress -O dnn/yolo_v4_tiny/coco.names https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/coco.names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9497b09c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wczytujemy model:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e8cc6a3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = cv.dnn.readNetFromDarknet(\"dnn/yolo_v4_tiny/yolov4-tiny.cfg\", \n",
|
||||
" \"dnn/yolo_v4_tiny/yolov4-tiny.weights\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "df331450",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wczytujemy etykiety obiektów:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8f01d354",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('dnn/yolo_v4_tiny/coco.names', 'r') as f_fd:\n",
|
||||
" classes = f_fd.read().splitlines()\n",
|
||||
" \n",
|
||||
"print(len(classes), classes[:5])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3fc5e3fc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Przetestujemy działanie na poniższym zdjęciu:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "df65dee0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image = cv.imread('img/pedestrians.jpg')\n",
|
||||
"plt.figure(figsize=[7,7])\n",
|
||||
"plt.imshow(image[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9fbb6325",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Podczas korzystania z tego modelu musimy się zmierzyć z kilkoma subtelnościami. Model wykorzystuje framework Darknet, więc musimy wskazać, że chodzi nam o predykcje pochodzące z ostatniej warstwy. Dodatkowo mamy kilka progów odcięcia do zdefiniowania, tj. miarę obiektowości (*objectness*), pewności (*confidence*) oraz tłumienia niemaksymalnego aby ograniczyć występowanie nakładających się na siebie ramek z wykrytymi obiektami (por. [`cv.dnn.NMSBoxes()`](https://docs.opencv.org/4.5.3/d6/d0f/group__dnn.html#ga9d118d70a1659af729d01b10233213ee)). Poniżej mamy wynik działania:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d8450888",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"height, width, _ = image.shape\n",
|
||||
"\n",
|
||||
"image_blob = cv.dnn.blobFromImage(image=image, scalefactor=1/255, size=(416, 416), mean=(0,0,0), \n",
|
||||
" swapRB=True, crop=False)\n",
|
||||
"\n",
|
||||
"model.setInput(image_blob)\n",
|
||||
"detections = model.forward([model.getLayerNames()[i[0] - 1] for i in model.getUnconnectedOutLayers()])\n",
|
||||
"\n",
|
||||
"image_out = image.copy()\n",
|
||||
"\n",
|
||||
"class_ids = []\n",
|
||||
"confidences = []\n",
|
||||
"boxes = []\n",
|
||||
"\n",
|
||||
"for out in detections:\n",
|
||||
" for detection in out:\n",
|
||||
" if detection[4] > 0.5: # objectness thr.\n",
|
||||
" scores = detection[5:]\n",
|
||||
" class_id = np.argmax(scores)\n",
|
||||
" confidence = scores[class_id]\n",
|
||||
" if confidence > 0.5: # confidence thr.\n",
|
||||
" center_x = int(detection[0] * width)\n",
|
||||
" center_y = int(detection[1] * height)\n",
|
||||
" b_width = int(detection[2] * width)\n",
|
||||
" b_height = int(detection[3] * height)\n",
|
||||
"\n",
|
||||
" b_left = int(center_x - b_width / 2)\n",
|
||||
" b_top = int(center_y - b_height / 2)\n",
|
||||
" class_ids.append(class_id)\n",
|
||||
" confidences.append(float(confidence))\n",
|
||||
" boxes.append([b_left, b_top, b_width, b_height])\n",
|
||||
"\n",
|
||||
"indices = cv.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.5)\n",
|
||||
"for i in indices:\n",
|
||||
" idx = i[0]\n",
|
||||
" box = boxes[idx]\n",
|
||||
" x1 = box[0]\n",
|
||||
" y1 = box[1]\n",
|
||||
" x2 = box[0] + box[2]\n",
|
||||
" y2 = box[1] + box[3]\n",
|
||||
" cv.rectangle(image_out, (x1, y1), (x2, y2), (0, 255, 0), 6)\n",
|
||||
" label = '{:} ({:.3f})'.format(classes[class_ids[idx]], confidences[idx])\n",
|
||||
" \n",
|
||||
" label_size, base_line = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.65, 1)\n",
|
||||
" cv.rectangle(image_out, (x1, y1 - label_size[1]), (x1 + label_size[0], y1 + base_line), \n",
|
||||
" (255, 255, 255), cv.FILLED)\n",
|
||||
" cv.putText(image_out, label, (x1, y1), cv.FONT_HERSHEY_SIMPLEX, 0.65, (0, 0, 0))\n",
|
||||
" \n",
|
||||
"plt.figure(figsize=[12,12])\n",
|
||||
"plt.imshow(image_out[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "41e32b8e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Szacowanie pozy człowieka\n",
|
||||
"\n",
|
||||
"Kolejnym interesującym zagadnieniem jest szacowanie pozy człowieka (ang. *human pose estimation*) na podstawie zdjęcia. Celem jest tutaj wykrycie charakterystycznych punktów orientacyjnych, które mogą potem zostać wykorzystane np. do treningu sportowego, kontroli gestów, korekcji postawy itp. W tym celu wykorzystamy [OpenPose](https://github.com/CMU-Perceptual-Computing-Lab/openpose)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6a3fedf2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p dnn/openpose\n",
|
||||
"!wget -q --show-progress -O dnn/openpose/pose_iter_160000.caffemodel http://posefs1.perception.cs.cmu.edu/Users/tsimon/Projects/coco/data/models/mpi/pose_iter_160000.caffemodel\n",
|
||||
"!wget -q --show-progress -O dnn/openpose/pose_deploy_linevec_faster_4_stages.prototxt https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/models/pose/mpi/pose_deploy_linevec_faster_4_stages.prototxt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "851c965b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wczytujemy model:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d55edcb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = cv.dnn.readNetFromCaffe(\"dnn/openpose/pose_deploy_linevec_faster_4_stages.prototxt\",\n",
|
||||
" \"dnn/openpose/pose_iter_160000.caffemodel\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c2c701c3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Będziemy chcieli przeanalizować poniższe zdjęcie:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aeaed6eb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image = cv.imread(\"img/messi5.jpg\")\n",
|
||||
"plt.figure(figsize=[7,7])\n",
|
||||
"plt.imshow(image[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "00a42dd5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zdefiniujemy poniżej połączenia pomiędzy 15 punktami orientacyjnymi:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "894acae5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pose_points_n = 15\n",
|
||||
"pose_pairs = [[0,1], [1,2], [2,3], [3,4], [1,5], [5,6], [6,7], [1,14], [14,8], [8,9], [9,10], [14,11], [11,12], [12,13]]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5a8a5028",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W wyniku otrzymujemy mapy prawdopodobieństwa występowania danego punktu orientacyjnego:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "24ca95c6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"height, width, _ = image.shape\n",
|
||||
"\n",
|
||||
"image_blob = cv.dnn.blobFromImage(image, 1.0/255, (368, 368), (0, 0, 0), swapRB=False, crop=False)\n",
|
||||
"model.setInput(image_blob)\n",
|
||||
"\n",
|
||||
"output = model.forward()\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(20,3))\n",
|
||||
"for i in range(pose_points_n):\n",
|
||||
" prob_map = output[0, i, :, :]\n",
|
||||
" disp_map = cv.resize(prob_map, (width, height), cv.INTER_LINEAR)\n",
|
||||
" plt.subplot(2, 8, i+1)\n",
|
||||
" plt.axis('off')\n",
|
||||
" plt.imshow(disp_map, cmap='jet', vmin=0, vmax=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c8be6dc1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Przeskalowujemy wyniki do rozmiarów obrazu wejściowego i przy pomocy [`cv.minMaxLoc()`](https://docs.opencv.org/4.5.3/d2/de8/group__core__array.html#gab473bf2eb6d14ff97e89b355dac20707) znajdujemy wartość maksymalną (dodatkowo sprawdzamy czy wartość prawdopodobieństwa jest odpowiednio duża):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a3163987",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scale_x = width / output.shape[3]\n",
|
||||
"scale_y = height / output.shape[2]\n",
|
||||
"\n",
|
||||
"points = []\n",
|
||||
"\n",
|
||||
"for i in range(pose_points_n):\n",
|
||||
" prob_map = output[0, i, :, :]\n",
|
||||
" \n",
|
||||
" _, prob, _, point = cv.minMaxLoc(prob_map)\n",
|
||||
" \n",
|
||||
" x = scale_x * point[0]\n",
|
||||
" y = scale_y * point[1]\n",
|
||||
"\n",
|
||||
" if prob > 0.1: # thr.\n",
|
||||
" points.append((int(x), int(y)))\n",
|
||||
" else:\n",
|
||||
" points.append(None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f1f8cac3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Możemy teraz nanieść punkty na obraz i połączyć je w szkielet:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fcbda6c6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image_points = image.copy()\n",
|
||||
"image_skeleton = image.copy()\n",
|
||||
"\n",
|
||||
"for i, p in enumerate(points):\n",
|
||||
" cv.circle(image_points, p, 8, (255, 255, 0), thickness=-1, lineType=cv.FILLED)\n",
|
||||
" cv.putText(image_points, \"{}\".format(i), p, cv.FONT_HERSHEY_SIMPLEX, 1, (0,255,255), 2, lineType=cv.LINE_AA)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for pair in pose_pairs:\n",
|
||||
" part_a = pair[0]\n",
|
||||
" part_b = pair[1]\n",
|
||||
"\n",
|
||||
" if points[part_a] and points[part_b]:\n",
|
||||
" cv.line(image_skeleton, points[part_a], points[part_b], (0, 255, 255), 4)\n",
|
||||
" cv.circle(image_skeleton, points[part_a], 7, (255, 255, 0), thickness=-1, lineType=cv.FILLED)\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(20,20))\n",
|
||||
"plt.subplot(121)\n",
|
||||
"plt.imshow(image_points[:,:,::-1])\n",
|
||||
"plt.subplot(122)\n",
|
||||
"plt.imshow(image_skeleton[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea3421fd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Wykrywanie i rozpoznawanie tekstu\n",
|
||||
"\n",
|
||||
"W kolejnym przykładzie zobaczymy jak możemy wykryć na zdjęciu tekst przy pomocy [DB](https://github.com/MhLiao/DB) oraz rozpoznać go przy pomocy [CRNN](https://arxiv.org/pdf/1507.05717.pdf)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5ef81ed2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import gdown\n",
|
||||
"\n",
|
||||
"for url, output in [('https://drive.google.com/uc?export=dowload&id=19YWhArrNccaoSza0CfkXlA8im4-lAGsR', 'dnn/DB_TD500_resnet50.onnx'), \n",
|
||||
" ('https://drive.google.com/uc?export=dowload&id=12diBsVJrS9ZEl6BNUiRp9s0xPALBS7kt', 'dnn/crnn_cs.onnx'),\n",
|
||||
" ('https://drive.google.com/uc?export=dowload&id=1oKXxXKusquimp7XY1mFvj9nwLzldVgBR', 'dnn/alphabet_94.txt')]:\n",
|
||||
" gdown.download(url, output, quiet=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "72721bc5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Będziemy pracować na poniższym zdjęciu:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "86e3f889",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image = cv.imread('img/road-sign.jpg')\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(5,7))\n",
|
||||
"plt.imshow(image[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ec7d3ce4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wczytujemy obsługiwany alfabet:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5d27f129",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('dnn/alphabet_94.txt', 'r') as f_fd:\n",
|
||||
" alphabet = f_fd.read().splitlines()\n",
|
||||
" \n",
|
||||
"print(len(alphabet), alphabet[:15])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d3373c60",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"OpenCV posiada gotowe API dla sieci DB poprzez [`cv.dnn_TextDetectionModel_DB()`](https://docs.opencv.org/4.5.3/db/d0f/classcv_1_1dnn_1_1TextDetectionModel__DB.html):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3c3bfc2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text_detector = cv.dnn_TextDetectionModel_DB(\"dnn/DB_TD500_resnet50.onnx\")\n",
|
||||
"\n",
|
||||
"text_detector.setBinaryThreshold(0.4).setPolygonThreshold(0.5)\n",
|
||||
"text_detector.setInputParams(scale=1.0/255, size=(640, 640), \n",
|
||||
" mean=(122.67891434, 116.66876762, 104.00698793), swapRB=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "31300a5f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W wyniku otrzymujemy ramki, na których występuje tekst (choć jak widzimy, są też wyniki fałszywie pozytywne):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d14502d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"boxes, confs = text_detector.detect(image)\n",
|
||||
"\n",
|
||||
"image_out = image.copy()\n",
|
||||
"\n",
|
||||
"cv.polylines(image_out, boxes, True, (255, 0, 255), 4)\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(5,7))\n",
|
||||
"plt.imshow(image_out[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3c3eae71",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W kolejnym kroku przygotowujemy model do rozpoznawania tekstu przy pomocy [`cv.dnn_TextRecognitionModel()`](https://docs.opencv.org/4.5.3/de/dee/classcv_1_1dnn_1_1TextRecognitionModel.html):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d6b29f6a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text_recognizer = cv.dnn_TextRecognitionModel(\"dnn/crnn_cs.onnx\")\n",
|
||||
"text_recognizer.setDecodeType(\"CTC-greedy\")\n",
|
||||
"text_recognizer.setVocabulary(alphabet)\n",
|
||||
"text_recognizer.setInputParams(scale=1/127.5, size=(100, 32), mean=(127.5, 127.5, 127.5), swapRB=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a17f6437",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Każdą wykrytą ramkę rzutujemy na rozmiar 100x32 i rozpoznajemy tekst:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d6909f83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for box in boxes:\n",
|
||||
" vertices = np.asarray(box).astype(np.float32)\n",
|
||||
" output_size = (100, 32)\n",
|
||||
" target_vertices = np.array([\n",
|
||||
" [0, output_size[1] - 1],\n",
|
||||
" [0, 0],\n",
|
||||
" [output_size[0] - 1, 0],\n",
|
||||
" [output_size[0] - 1, output_size[1] - 1]],\n",
|
||||
" dtype=\"float32\")\n",
|
||||
" rotation_matrix = cv.getPerspectiveTransform(vertices, target_vertices)\n",
|
||||
" cropped_roi = cv.warpPerspective(image, rotation_matrix, output_size)\n",
|
||||
" \n",
|
||||
" result = text_recognizer.recognize(cropped_roi)\n",
|
||||
" print(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e0b4b3c0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Super rozdzielczość\n",
|
||||
"\n",
|
||||
"Podczas zwiększania rozdzielczości brakujące piksele muszą być w jakiś sposób interpolowane. Przy niewielkich powiększeniach zwykle wystarczą nam tradycyjne metody, jednak jeśli pracujemy z obrazem w niskiej rozdzielczości i chcemy go znacząco powiększyć, to chcielibyśmy również uzyskać wysoką jakość np. poprzez uwzględnienie informacji z otoczenia pikseli. Problematyka ta dotyczy zagadnienia super rozdzielczości (ang. *super-resolution*).\n",
|
||||
"\n",
|
||||
"W [artykule](https://arxiv.org/pdf/1902.06068.pdf) z 2020 r. możemy znaleźć porównanie dostępnych w tamtym czasie modeli (zob. wykres na str. 15); np. możemy zobaczyć, że model [EDSR](https://github.com/Saafke/EDSR_Tensorflow) radzi sobie całkiem nieźle, aczkolwiek kosztem sporego narzutu obliczeniowego (por. również benchmarki [OpenCV](https://github.com/opencv/opencv_contrib/blob/master/modules/dnn_superres/README.md)). Przetestujemy EDSR na powiększeniu 4-krotnym:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6b9f6be9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget -q --show-progress -O dnn/EDSR_x4.pb https://raw.githubusercontent.com/Saafke/EDSR_Tensorflow/master/models/EDSR_x4.pb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "92f09e43",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Przy pomocy [`cv.dnn_superres.DnnSuperResImpl_create()`](https://docs.opencv.org/4.5.3/d8/d11/classcv_1_1dnn__superres_1_1DnnSuperResImpl.html) przygotowujemy model:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "368ca179",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sr = cv.dnn_superres.DnnSuperResImpl_create()\n",
|
||||
"sr.readModel('dnn/EDSR_x4.pb')\n",
|
||||
"sr.setModel('edsr', 4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9e0169f3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Następnie zwiększymy rozdzielczość zadanego obrazu (operacja może zająć trochę czasu):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45c89529",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image = cv.imread('img/parrot.jpg')\n",
|
||||
"\n",
|
||||
"image_EDSR = sr.upsample(image)\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(25,25))\n",
|
||||
"plt.subplot(211)\n",
|
||||
"plt.imshow(image[:,:,::-1])\n",
|
||||
"plt.subplot(212)\n",
|
||||
"plt.imshow(image_EDSR[:,:,::-1]);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3c157587",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Zadanie 1\n",
|
||||
"\n",
|
||||
"Przy pomocy biblioteki [MediaPipe](https://google.github.io/mediapipe/solutions/selfie_segmentation.html) podmień tło w selfie `img/selfie-man.jpg` na `img/selfie-background.jpg` (możesz również odbić obraz w poziomie)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d6116f61",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Wynik działania programu](img/selfie-out.png)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Andrzej Wójtowicz",
|
||||
"email": "andre@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
},
|
||||
"subtitle": "10. Metody głębokiego uczenia w widzeniu komputerowym [laboratoria]",
|
||||
"title": "Widzenie komputerowe",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|