"""Labeled Faces in the Wild (LFW) dataset This dataset is a collection of JPEG pictures of famous people collected over the internet, all details are available on the official website: http://vis-www.cs.umass.edu/lfw/ """ # Copyright (c) 2011 Olivier Grisel # License: BSD 3 clause import logging from numbers import Integral, Real from os import PathLike, listdir, makedirs, remove from os.path import exists, isdir, join import numpy as np from joblib import Memory from ..utils import Bunch from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params from ..utils.fixes import tarfile_extractall from ._base import ( RemoteFileMetadata, _fetch_remote, get_data_home, load_descr, ) logger = logging.getLogger(__name__) # The original data can be found in: # http://vis-www.cs.umass.edu/lfw/lfw.tgz ARCHIVE = RemoteFileMetadata( filename="lfw.tgz", url="https://ndownloader.figshare.com/files/5976018", checksum="055f7d9c632d7370e6fb4afc7468d40f970c34a80d4c6f50ffec63f5a8d536c0", ) # The original funneled data can be found in: # http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz FUNNELED_ARCHIVE = RemoteFileMetadata( filename="lfw-funneled.tgz", url="https://ndownloader.figshare.com/files/5976015", checksum="b47c8422c8cded889dc5a13418c4bc2abbda121092b3533a83306f90d900100a", ) # The original target data can be found in: # http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt', # http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt', # http://vis-www.cs.umass.edu/lfw/pairs.txt', TARGETS = ( RemoteFileMetadata( filename="pairsDevTrain.txt", url="https://ndownloader.figshare.com/files/5976012", checksum="1d454dada7dfeca0e7eab6f65dc4e97a6312d44cf142207be28d688be92aabfa", ), RemoteFileMetadata( filename="pairsDevTest.txt", url="https://ndownloader.figshare.com/files/5976009", checksum="7cb06600ea8b2814ac26e946201cdb304296262aad67d046a16a7ec85d0ff87c", ), RemoteFileMetadata( filename="pairs.txt", url="https://ndownloader.figshare.com/files/5976006", checksum="ea42330c62c92989f9d7c03237ed5d591365e89b3e649747777b70e692dc1592", ), ) # # Common private utilities for data fetching from the original LFW website # local disk caching, and image decoding. # def _check_fetch_lfw( data_home=None, funneled=True, download_if_missing=True, n_retries=3, delay=1.0 ): """Helper function to download any missing LFW data""" data_home = get_data_home(data_home=data_home) lfw_home = join(data_home, "lfw_home") if not exists(lfw_home): makedirs(lfw_home) for target in TARGETS: target_filepath = join(lfw_home, target.filename) if not exists(target_filepath): if download_if_missing: logger.info("Downloading LFW metadata: %s", target.url) _fetch_remote( target, dirname=lfw_home, n_retries=n_retries, delay=delay ) else: raise OSError("%s is missing" % target_filepath) if funneled: data_folder_path = join(lfw_home, "lfw_funneled") archive = FUNNELED_ARCHIVE else: data_folder_path = join(lfw_home, "lfw") archive = ARCHIVE if not exists(data_folder_path): archive_path = join(lfw_home, archive.filename) if not exists(archive_path): if download_if_missing: logger.info("Downloading LFW data (~200MB): %s", archive.url) _fetch_remote( archive, dirname=lfw_home, n_retries=n_retries, delay=delay ) else: raise OSError("%s is missing" % archive_path) import tarfile logger.debug("Decompressing the data archive to %s", data_folder_path) with tarfile.open(archive_path, "r:gz") as fp: tarfile_extractall(fp, path=lfw_home) remove(archive_path) return lfw_home, data_folder_path def _load_imgs(file_paths, slice_, color, resize): """Internally used to load images""" try: from PIL import Image except ImportError: raise ImportError( "The Python Imaging Library (PIL) is required to load data " "from jpeg files. Please refer to " "https://pillow.readthedocs.io/en/stable/installation.html " "for installing PIL." ) # compute the portion of the images to load to respect the slice_ parameter # given by the caller default_slice = (slice(0, 250), slice(0, 250)) if slice_ is None: slice_ = default_slice else: slice_ = tuple(s or ds for s, ds in zip(slice_, default_slice)) h_slice, w_slice = slice_ h = (h_slice.stop - h_slice.start) // (h_slice.step or 1) w = (w_slice.stop - w_slice.start) // (w_slice.step or 1) if resize is not None: resize = float(resize) h = int(resize * h) w = int(resize * w) # allocate some contiguous memory to host the decoded image slices n_faces = len(file_paths) if not color: faces = np.zeros((n_faces, h, w), dtype=np.float32) else: faces = np.zeros((n_faces, h, w, 3), dtype=np.float32) # iterate over the collected file path to load the jpeg files as numpy # arrays for i, file_path in enumerate(file_paths): if i % 1000 == 0: logger.debug("Loading face #%05d / %05d", i + 1, n_faces) # Checks if jpeg reading worked. Refer to issue #3594 for more # details. pil_img = Image.open(file_path) pil_img = pil_img.crop( (w_slice.start, h_slice.start, w_slice.stop, h_slice.stop) ) if resize is not None: pil_img = pil_img.resize((w, h)) face = np.asarray(pil_img, dtype=np.float32) if face.ndim == 0: raise RuntimeError( "Failed to read the image file %s, " "Please make sure that libjpeg is installed" % file_path ) face /= 255.0 # scale uint8 coded colors to the [0.0, 1.0] floats if not color: # average the color channels to compute a gray levels # representation face = face.mean(axis=2) faces[i, ...] = face return faces # # Task #1: Face Identification on picture with names # def _fetch_lfw_people( data_folder_path, slice_=None, color=False, resize=None, min_faces_per_person=0 ): """Perform the actual data loading for the lfw people dataset This operation is meant to be cached by a joblib wrapper. """ # scan the data folder content to retain people with more that # `min_faces_per_person` face pictures person_names, file_paths = [], [] for person_name in sorted(listdir(data_folder_path)): folder_path = join(data_folder_path, person_name) if not isdir(folder_path): continue paths = [join(folder_path, f) for f in sorted(listdir(folder_path))] n_pictures = len(paths) if n_pictures >= min_faces_per_person: person_name = person_name.replace("_", " ") person_names.extend([person_name] * n_pictures) file_paths.extend(paths) n_faces = len(file_paths) if n_faces == 0: raise ValueError( "min_faces_per_person=%d is too restrictive" % min_faces_per_person ) target_names = np.unique(person_names) target = np.searchsorted(target_names, person_names) faces = _load_imgs(file_paths, slice_, color, resize) # shuffle the faces with a deterministic RNG scheme to avoid having # all faces of the same person in a row, as it would break some # cross validation and learning algorithms such as SGD and online # k-means that make an IID assumption indices = np.arange(n_faces) np.random.RandomState(42).shuffle(indices) faces, target = faces[indices], target[indices] return faces, target, target_names @validate_params( { "data_home": [str, PathLike, None], "funneled": ["boolean"], "resize": [Interval(Real, 0, None, closed="neither"), None], "min_faces_per_person": [Interval(Integral, 0, None, closed="left"), None], "color": ["boolean"], "slice_": [tuple, Hidden(None)], "download_if_missing": ["boolean"], "return_X_y": ["boolean"], "n_retries": [Interval(Integral, 1, None, closed="left")], "delay": [Interval(Real, 0.0, None, closed="neither")], }, prefer_skip_nested_validation=True, ) def fetch_lfw_people( *, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True, return_X_y=False, n_retries=3, delay=1.0, ): """Load the Labeled Faces in the Wild (LFW) people dataset \ (classification). Download it if necessary. ================= ======================= Classes 5749 Samples total 13233 Dimensionality 5828 Features real, between 0 and 255 ================= ======================= Read more in the :ref:`User Guide `. Parameters ---------- data_home : str or path-like, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. funneled : bool, default=True Download and use the funneled variant of the dataset. resize : float or None, default=0.5 Ratio used to resize the each face picture. If `None`, no resizing is performed. min_faces_per_person : int, default=None The extracted dataset will only retain pictures of people that have at least `min_faces_per_person` different pictures. color : bool, default=False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than the shape with color = False. slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172)) Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid use statistical correlation from the background. download_if_missing : bool, default=True If False, raise an OSError if the data is not locally available instead of trying to download the data from the source site. return_X_y : bool, default=False If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch object. See below for more information about the `dataset.data` and `dataset.target` object. .. versionadded:: 0.20 n_retries : int, default=3 Number of retries when HTTP errors are encountered. .. versionadded:: 1.5 delay : float, default=1.0 Number of seconds between retries. .. versionadded:: 1.5 Returns ------- dataset : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : numpy array of shape (13233, 2914) Each row corresponds to a ravelled face image of original size 62 x 47 pixels. Changing the ``slice_`` or resize parameters will change the shape of the output. images : numpy array of shape (13233, 62, 47) Each row is a face image corresponding to one of the 5749 people in the dataset. Changing the ``slice_`` or resize parameters will change the shape of the output. target : numpy array of shape (13233,) Labels associated to each face image. Those labels range from 0-5748 and correspond to the person IDs. target_names : numpy array of shape (5749,) Names of all persons in the dataset. Position in array corresponds to the person ID in the target array. DESCR : str Description of the Labeled Faces in the Wild (LFW) dataset. (data, target) : tuple if ``return_X_y`` is True A tuple of two ndarray. The first containing a 2D array of shape (n_samples, n_features) with each row representing one sample and each column representing the features. The second ndarray of shape (n_samples,) containing the target samples. .. versionadded:: 0.20 Examples -------- >>> from sklearn.datasets import fetch_lfw_people >>> lfw_people = fetch_lfw_people() >>> lfw_people.data.shape (13233, 2914) >>> lfw_people.target.shape (13233,) >>> for name in lfw_people.target_names[:5]: ... print(name) AJ Cook AJ Lamas Aaron Eckhart Aaron Guiel Aaron Patterson """ lfw_home, data_folder_path = _check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing, n_retries=n_retries, delay=delay, ) logger.debug("Loading LFW people faces from %s", lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage m = Memory(location=lfw_home, compress=6, verbose=0) load_func = m.cache(_fetch_lfw_people) # load and memoize the pairs as np arrays faces, target, target_names = load_func( data_folder_path, resize=resize, min_faces_per_person=min_faces_per_person, color=color, slice_=slice_, ) X = faces.reshape(len(faces), -1) fdescr = load_descr("lfw.rst") if return_X_y: return X, target # pack the results as a Bunch instance return Bunch( data=X, images=faces, target=target, target_names=target_names, DESCR=fdescr ) # # Task #2: Face Verification on pairs of face pictures # def _fetch_lfw_pairs( index_file_path, data_folder_path, slice_=None, color=False, resize=None ): """Perform the actual data loading for the LFW pairs dataset This operation is meant to be cached by a joblib wrapper. """ # parse the index file to find the number of pairs to be able to allocate # the right amount of memory before starting to decode the jpeg files with open(index_file_path, "rb") as index_file: split_lines = [ln.decode().strip().split("\t") for ln in index_file] pair_specs = [sl for sl in split_lines if len(sl) > 2] n_pairs = len(pair_specs) # iterating over the metadata lines for each pair to find the filename to # decode and load in memory target = np.zeros(n_pairs, dtype=int) file_paths = list() for i, components in enumerate(pair_specs): if len(components) == 3: target[i] = 1 pair = ( (components[0], int(components[1]) - 1), (components[0], int(components[2]) - 1), ) elif len(components) == 4: target[i] = 0 pair = ( (components[0], int(components[1]) - 1), (components[2], int(components[3]) - 1), ) else: raise ValueError("invalid line %d: %r" % (i + 1, components)) for j, (name, idx) in enumerate(pair): try: person_folder = join(data_folder_path, name) except TypeError: person_folder = join(data_folder_path, str(name, "UTF-8")) filenames = list(sorted(listdir(person_folder))) file_path = join(person_folder, filenames[idx]) file_paths.append(file_path) pairs = _load_imgs(file_paths, slice_, color, resize) shape = list(pairs.shape) n_faces = shape.pop(0) shape.insert(0, 2) shape.insert(0, n_faces // 2) pairs.shape = shape return pairs, target, np.array(["Different persons", "Same person"]) @validate_params( { "subset": [StrOptions({"train", "test", "10_folds"})], "data_home": [str, PathLike, None], "funneled": ["boolean"], "resize": [Interval(Real, 0, None, closed="neither"), None], "color": ["boolean"], "slice_": [tuple, Hidden(None)], "download_if_missing": ["boolean"], "n_retries": [Interval(Integral, 1, None, closed="left")], "delay": [Interval(Real, 0.0, None, closed="neither")], }, prefer_skip_nested_validation=True, ) def fetch_lfw_pairs( *, subset="train", data_home=None, funneled=True, resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True, n_retries=3, delay=1.0, ): """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification). Download it if necessary. ================= ======================= Classes 2 Samples total 13233 Dimensionality 5828 Features real, between 0 and 255 ================= ======================= In the official `README.txt`_ this task is described as the "Restricted" task. As I am not sure as to implement the "Unrestricted" variant correctly, I left it as unsupported for now. .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt The original images are 250 x 250 pixels, but the default slice and resize arguments reduce them to 62 x 47. Read more in the :ref:`User Guide `. Parameters ---------- subset : {'train', 'test', '10_folds'}, default='train' Select the dataset to load: 'train' for the development training set, 'test' for the development test set, and '10_folds' for the official evaluation set that is meant to be used with a 10-folds cross validation. data_home : str or path-like, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. funneled : bool, default=True Download and use the funneled variant of the dataset. resize : float, default=0.5 Ratio used to resize the each face picture. color : bool, default=False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than the shape with color = False. slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172)) Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid use statistical correlation from the background. download_if_missing : bool, default=True If False, raise an OSError if the data is not locally available instead of trying to download the data from the source site. n_retries : int, default=3 Number of retries when HTTP errors are encountered. .. versionadded:: 1.5 delay : float, default=1.0 Number of seconds between retries. .. versionadded:: 1.5 Returns ------- data : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : ndarray of shape (2200, 5828). Shape depends on ``subset``. Each row corresponds to 2 ravel'd face images of original size 62 x 47 pixels. Changing the ``slice_``, ``resize`` or ``subset`` parameters will change the shape of the output. pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset`` Each row has 2 face images corresponding to same or different person from the dataset containing 5749 people. Changing the ``slice_``, ``resize`` or ``subset`` parameters will change the shape of the output. target : numpy array of shape (2200,). Shape depends on ``subset``. Labels associated to each pair of images. The two label values being different persons or the same person. target_names : numpy array of shape (2,) Explains the target values of the target array. 0 corresponds to "Different person", 1 corresponds to "same person". DESCR : str Description of the Labeled Faces in the Wild (LFW) dataset. Examples -------- >>> from sklearn.datasets import fetch_lfw_pairs >>> lfw_pairs_train = fetch_lfw_pairs(subset='train') >>> list(lfw_pairs_train.target_names) ['Different persons', 'Same person'] >>> lfw_pairs_train.pairs.shape (2200, 2, 62, 47) >>> lfw_pairs_train.data.shape (2200, 5828) >>> lfw_pairs_train.target.shape (2200,) """ lfw_home, data_folder_path = _check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing, n_retries=n_retries, delay=delay, ) logger.debug("Loading %s LFW pairs from %s", subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage m = Memory(location=lfw_home, compress=6, verbose=0) load_func = m.cache(_fetch_lfw_pairs) # select the right metadata file according to the requested subset label_filenames = { "train": "pairsDevTrain.txt", "test": "pairsDevTest.txt", "10_folds": "pairs.txt", } if subset not in label_filenames: raise ValueError( "subset='%s' is invalid: should be one of %r" % (subset, list(sorted(label_filenames.keys()))) ) index_file_path = join(lfw_home, label_filenames[subset]) # load and memoize the pairs as np arrays pairs, target, target_names = load_func( index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_ ) fdescr = load_descr("lfw.rst") # pack the results as a Bunch instance return Bunch( data=pairs.reshape(len(pairs), -1), pairs=pairs, target=target, target_names=target_names, DESCR=fdescr, )