Inzynierka/Lib/site-packages/joblib/hashing.py

"""
Fast cryptographic hash of Python objects, with a special case for fast
hashing of numpy arrays.
"""

# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# Copyright (c) 2009 Gael Varoquaux
# License: BSD Style, 3 clauses.

import pickle
import hashlib
import sys
import types
import struct
import io
import decimal


Pickler = pickle._Pickler


class _ConsistentSet(object):
    """ Wrapper giving the items of a set a reproducible order, so
        that hashing a set does not depend on its iteration order.

        The ordered items are stored as a list in ``_sequence``.
    """
    def __init__(self, set_sequence):
        try:
            # Fast path: natural ordering, which works when all the
            # elements are mutually comparable.
            ordered = sorted(set_sequence)
        except (TypeError, decimal.InvalidOperation):
            # Slow path: the elements cannot be compared with each
            # other, so order their hashes instead. Within this module
            # the name ``hash`` is the module-level ``hash`` function,
            # which shadows the builtin.
            ordered = sorted(hash(item) for item in set_sequence)
        self._sequence = ordered


class _MyHash(object):
    """ Plain container standing in for objects that won't normally
        pickle: the identifying components are kept, in order, in
        ``args``.
    """

    def __init__(self, *args):
        # ``args`` is already a tuple; ``tuple()`` returns it unchanged.
        self.args = tuple(args)


class Hasher(Pickler):
    """ A subclass of pickler, to do cryptographic hashing, rather than
        pickling.
    """

    def __init__(self, hash_name='md5'):
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name)

    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()

    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) == type(pickle):
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    # The dispatch table of the pickler is not accessible in Python
    # 3, as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables define
                    # interactively in __main__
                    setattr(mod, my_name, obj)

    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    def _batch_setitems(self, items):
        # forces order of keys in dict to ensure consistent hash.
        try:
            # Trying first to compare dict assuming the type of keys is
            # consistent and orderable.
            # This fails on python 3 when keys are unorderable
            # but we keep it in a try as it's faster.
            Pickler._batch_setitems(self, iter(sorted(items)))
        except TypeError:
            # If keys are unorderable, sorting them using their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                      for k, v in items)))

    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set


class NumpyHasher(Hasher):
    """ Hasher with a dedicated treatment of numpy arrays and dtypes,
        to be used when numpy is loaded.
    """

    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
            Parameters
            ----------
            hash_name: string
                The hash algorithm to be used
            coerce_mmap: boolean
                Make no difference between np.memmap and np.ndarray
                objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # numpy is imported lazily, to avoid tight coupling
        import numpy as np
        self.np = np
        # Use np.getbuffer when numpy provides it, memoryview otherwise.
        self._getbuffer = getattr(np, 'getbuffer', memoryview)

    def save(self, obj):
        """ Override of the save method: the raw bytes of ndarrays (and
            ndarray subclasses) without object dtype are fed directly
            to the hash instead of being pickled. Of course, this is a
            total abuse of the Pickler class.
        """
        np = self.np
        if isinstance(obj, np.ndarray) and not obj.dtype.hasobject:
            # The update function of the hash requires a c_contiguous
            # buffer.
            is_0d = obj.shape == ()
            if not is_0d and obj.flags.c_contiguous:
                contiguous = obj
            elif not is_0d and obj.flags.f_contiguous:
                contiguous = obj.T
            else:
                # Two cases end up here:
                # - 0d arrays, which need to be flattened because
                #   viewing them as bytes raises a ValueError;
                # - non-single-segment arrays, for which flattening
                #   creates a copy.
                # XXX: There might be a more efficient way of doing this
                contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g.
            # datetime64, see
            # https://github.com/numpy/numpy/issues/4983. The
            # workaround is to view the array as bytes before taking
            # the memoryview.
            as_bytes = contiguous.view(np.uint8)
            self._hash.update(self._getbuffer(as_bytes))

            # The class is part of the hashed description, so that
            # objects with the same binary content but different
            # classes are told apart. With coerce_mmap, memmaps are
            # described as plain ndarrays, to be able to reload
            # previously computed results with memmap.
            coerce = self.coerce_mmap and isinstance(obj, np.memmap)
            klass = np.ndarray if coerce else obj.__class__

            # The dtype, shape and strides are added too, to
            # distinguish different views on the same data. This
            # description replaces the array and goes through the
            # regular pickling path below.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, np.dtype):
            # numpy.dtype consistent hashing is tricky to get right.
            # Atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``, but this interning
            # does not resist a simple pickle.load/dump roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4')``. Because pickle relies on memoization
            # during pickling, it is easy to produce different hashes
            # for seemingly identical objects, such as
            # ``[np.dtype('f4'), np.dtype('f4')]`` and
            # ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing,
            # each dtype is serialized by its own ``pickle.dumps``
            # call, unrelated to the current Hasher instance.
            hasher = self._hash
            hasher.update("_HASHED_DTYPE".encode('utf-8'))
            hasher.update(pickle.dumps(obj))
            return
        Hasher.save(self, obj)


def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to identify uniquely Python objects
        containing numpy arrays.

        A ``NumpyHasher`` is used when numpy has already been imported
        (it is found in ``sys.modules``), a plain ``Hasher`` otherwise.

        Parameters
        -----------
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray

        Returns
        -------
        The value returned by the ``hash`` method of the hasher.

        Raises
        ------
        ValueError
            If ``hash_name`` is not one of the valid options.
    """
    valid_hash_names = ('md5', 'sha1')
    if hash_name not in valid_hash_names:
        raise ValueError("Valid options for 'hash_name' are {}. "
                         "Got hash_name={!r} instead."
                         .format(valid_hash_names, hash_name))
    if 'numpy' not in sys.modules:
        return Hasher(hash_name=hash_name).hash(obj)
    numpy_hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    return numpy_hasher.hash(obj)
first commit 2023-06-02 12:51:02 +02:00			`"""`
			`Fast cryptographic hash of Python objects, with a special case for fast`
			`hashing of numpy arrays.`
			`"""`

			`# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>`
			`# Copyright (c) 2009 Gael Varoquaux`
			`# License: BSD Style, 3 clauses.`

			`import pickle`
			`import hashlib`
			`import sys`
			`import types`
			`import struct`
			`import io`
			`import decimal`


			`Pickler = pickle._Pickler`


			`class _ConsistentSet(object):`
			`""" Class used to ensure the hash of Sets is preserved`
			`whatever the order of its items.`
			`"""`
			`def __init__(self, set_sequence):`
			`# Forces order of elements in set to ensure consistent hash.`
			`try:`
			`# Trying first to order the set assuming the type of elements is`
			`# consistent and orderable.`
			`# This fails on python 3 when elements are unorderable`
			`# but we keep it in a try as it's faster.`
			`self._sequence = sorted(set_sequence)`
			`except (TypeError, decimal.InvalidOperation):`
			`# If elements are unorderable, sorting them using their hash.`
			`# This is slower but works in any case.`
			`self._sequence = sorted((hash(e) for e in set_sequence))`


			`class _MyHash(object):`
			`""" Class used to hash objects that won't normally pickle """`

			`def __init__(self, *args):`
			`self.args = args`


			`class Hasher(Pickler):`
			`""" A subclass of pickler, to do cryptographic hashing, rather than`
			`pickling.`
			`"""`

			`def __init__(self, hash_name='md5'):`
			`self.stream = io.BytesIO()`
			`# By default we want a pickle protocol that only changes with`
			`# the major python version and not the minor one`
			`protocol = 3`
			`Pickler.__init__(self, self.stream, protocol=protocol)`
			`# Initialise the hash obj`
			`self._hash = hashlib.new(hash_name)`

			`def hash(self, obj, return_digest=True):`
			`try:`
			`self.dump(obj)`
			`except pickle.PicklingError as e:`
			`e.args += ('PicklingError while hashing %r: %r' % (obj, e),)`
			`raise`
			`dumps = self.stream.getvalue()`
			`self._hash.update(dumps)`
			`if return_digest:`
			`return self._hash.hexdigest()`

			`def save(self, obj):`
			`if isinstance(obj, (types.MethodType, type({}.pop))):`
			`# the Pickler cannot pickle instance methods; here we decompose`
			`# them into components that make them uniquely identifiable`
			`if hasattr(obj, '__func__'):`
			`func_name = obj.__func__.__name__`
			`else:`
			`func_name = obj.__name__`
			`inst = obj.__self__`
			`if type(inst) == type(pickle):`
			`obj = _MyHash(func_name, inst.__name__)`
			`elif inst is None:`
			`# type(None) or type(module) do not pickle`
			`obj = _MyHash(func_name, inst)`
			`else:`
			`cls = obj.__self__.__class__`
			`obj = _MyHash(func_name, inst, cls)`
			`Pickler.save(self, obj)`

			`def memoize(self, obj):`
			`# We want hashing to be sensitive to value instead of reference.`
			`# For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]`
			`# to hash to the same value and that's why we disable memoization`
			`# for strings`
			`if isinstance(obj, (bytes, str)):`
			`return`
			`Pickler.memoize(self, obj)`

			`# The dispatch table of the pickler is not accessible in Python`
			`# 3, as these lines are only bugware for IPython, we skip them.`
			`def save_global(self, obj, name=None, pack=struct.pack):`
			`# We have to override this method in order to deal with objects`
			`# defined interactively in IPython that are not injected in`
			`# __main__`
			`kwargs = dict(name=name, pack=pack)`
			`del kwargs['pack']`
			`try:`
			`Pickler.save_global(self, obj, **kwargs)`
			`except pickle.PicklingError:`
			`Pickler.save_global(self, obj, **kwargs)`
			`module = getattr(obj, "__module__", None)`
			`if module == '__main__':`
			`my_name = name`
			`if my_name is None:`
			`my_name = obj.__name__`
			`mod = sys.modules[module]`
			`if not hasattr(mod, my_name):`
			`# IPython doesn't inject the variables define`
			`# interactively in __main__`
			`setattr(mod, my_name, obj)`

			`dispatch = Pickler.dispatch.copy()`
			`# builtin`
			`dispatch[type(len)] = save_global`
			`# type`
			`dispatch[type(object)] = save_global`
			`# classobj`
			`dispatch[type(Pickler)] = save_global`
			`# function`
			`dispatch[type(pickle.dump)] = save_global`

			`def _batch_setitems(self, items):`
			`# forces order of keys in dict to ensure consistent hash.`
			`try:`
			`# Trying first to compare dict assuming the type of keys is`
			`# consistent and orderable.`
			`# This fails on python 3 when keys are unorderable`
			`# but we keep it in a try as it's faster.`
			`Pickler._batch_setitems(self, iter(sorted(items)))`
			`except TypeError:`
			`# If keys are unorderable, sorting them using their hash. This is`
			`# slower but works in any case.`
			`Pickler._batch_setitems(self, iter(sorted((hash(k), v)`
			`for k, v in items)))`

			`def save_set(self, set_items):`
			`# forces order of items in Set to ensure consistent hash`
			`Pickler.save(self, _ConsistentSet(set_items))`

			`dispatch[type(set())] = save_set`


			`class NumpyHasher(Hasher):`
			`""" Special case the hasher for when numpy is loaded.`
			`"""`

			`def __init__(self, hash_name='md5', coerce_mmap=False):`
			`"""`
			`Parameters`
			`----------`
			`hash_name: string`
			`The hash algorithm to be used`
			`coerce_mmap: boolean`
			`Make no difference between np.memmap and np.ndarray`
			`objects.`
			`"""`
			`self.coerce_mmap = coerce_mmap`
			`Hasher.__init__(self, hash_name=hash_name)`
			`# delayed import of numpy, to avoid tight coupling`
			`import numpy as np`
			`self.np = np`
			`if hasattr(np, 'getbuffer'):`
			`self._getbuffer = np.getbuffer`
			`else:`
			`self._getbuffer = memoryview`

			`def save(self, obj):`
			`""" Subclass the save method, to hash ndarray subclass, rather`
			`than pickling them. Off course, this is a total abuse of`
			`the Pickler class.`
			`"""`
			`if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:`
			`# Compute a hash of the object`
			`# The update function of the hash requires a c_contiguous buffer.`
			`if obj.shape == ():`
			`# 0d arrays need to be flattened because viewing them as bytes`
			`# raises a ValueError exception.`
			`obj_c_contiguous = obj.flatten()`
			`elif obj.flags.c_contiguous:`
			`obj_c_contiguous = obj`
			`elif obj.flags.f_contiguous:`
			`obj_c_contiguous = obj.T`
			`else:`
			`# Cater for non-single-segment arrays: this creates a`
			`# copy, and thus alleviates this issue.`
			`# XXX: There might be a more efficient way of doing this`
			`obj_c_contiguous = obj.flatten()`

			`# memoryview is not supported for some dtypes, e.g. datetime64, see`
			`# https://github.com/numpy/numpy/issues/4983. The`
			`# workaround is to view the array as bytes before`
			`# taking the memoryview.`
			`self._hash.update(`
			`self._getbuffer(obj_c_contiguous.view(self.np.uint8)))`

			`# We store the class, to be able to distinguish between`
			`# Objects with the same binary content, but different`
			`# classes.`
			`if self.coerce_mmap and isinstance(obj, self.np.memmap):`
			`# We don't make the difference between memmap and`
			`# normal ndarrays, to be able to reload previously`
			`# computed results with memmap.`
			`klass = self.np.ndarray`
			`else:`
			`klass = obj.__class__`
			`# We also return the dtype and the shape, to distinguish`
			`# different views on the same data with different dtypes.`

			`# The object will be pickled by the pickler hashed at the end.`
			`obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))`
			`elif isinstance(obj, self.np.dtype):`
			`# numpy.dtype consistent hashing is tricky to get right. This comes`
			`# from the fact that atomic np.dtype objects are interned:`
			# ``np.dtype('f4') is np.dtype('f4')``. The situation is
			`# complicated by the fact that this interning does not resist a`
			`# simple pickle.load/dump roundtrip:`
			# ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
			`# np.dtype('f4') Because pickle relies on memoization during`
			`# pickling, it is easy to`
			`# produce different hashes for seemingly identical objects, such as`
			# ``[np.dtype('f4'), np.dtype('f4')]``
			# and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
			`# To prevent memoization from interfering with hashing, we isolate`
			`# the serialization (and thus the pickle memoization) of each dtype`
			# using each time a different ``pickle.dumps`` call unrelated to
			`# the current Hasher instance.`
			`self._hash.update("_HASHED_DTYPE".encode('utf-8'))`
			`self._hash.update(pickle.dumps(obj))`
			`return`
			`Hasher.save(self, obj)`


			`def hash(obj, hash_name='md5', coerce_mmap=False):`
			`""" Quick calculation of a hash to identify uniquely Python objects`
			`containing numpy arrays.`


			`Parameters`
			`-----------`
			`hash_name: 'md5' or 'sha1'`
			`Hashing algorithm used. sha1 is supposedly safer, but md5 is`
			`faster.`
			`coerce_mmap: boolean`
			`Make no difference between np.memmap and np.ndarray`
			`"""`
			`valid_hash_names = ('md5', 'sha1')`
			`if hash_name not in valid_hash_names:`
			`raise ValueError("Valid options for 'hash_name' are {}. "`
			`"Got hash_name={!r} instead."`
			`.format(valid_hash_names, hash_name))`
			`if 'numpy' in sys.modules:`
			`hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)`
			`else:`
			`hasher = Hasher(hash_name=hash_name)`
			`return hasher.hash(obj)`