from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT
from datetime import timedelta
from typing import Optional

__all__ = ['default_pg_timeout', 'default_pg_nccl_timeout']

# Default process group wide timeout, if applicable.
# This only applies to the non-NCCL backends.
# To make an attempt at backwards compatibility with THD, we use an
# extraordinarily high default timeout, given that THD did not have timeouts.
default_pg_timeout: timedelta = _DEFAULT_PG_TIMEOUT
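
# Usage sketch (illustrative, not part of this module): the Python layer's
# init_process_group accepts a `timeout` argument, and this module-level
# default is used when callers do not pass one, e.g.
#
#     import torch.distributed as dist
#     from datetime import timedelta
#     dist.init_process_group("gloo", timeout=timedelta(minutes=30))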

# Separate timeout for PGNCCL, mainly because it has always been that way in
# the C++ layer, but until recently there was one default that applied across
# all backends in the Python layer. Later, we could consider merging them back
# together at the C++ layer if we can align on the same value. Note that the
# NCCL timeout is only enforced if TORCH_NCCL_BLOCKING_WAIT or
# TORCH_NCCL_ASYNC_ERROR_HANDLING is set to 1.
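
# Example (illustrative only): enforcement can be enabled via an environment
# variable before the process group is created, e.g.
#
#     import os
#     os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"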

try:
    from torch._C._distributed_c10d import _DEFAULT_PG_NCCL_TIMEOUT

    default_pg_nccl_timeout: Optional[timedelta] = _DEFAULT_PG_NCCL_TIMEOUT
except ImportError:
    # If C++ NCCL support is not compiled, we don't have access to the default
    # NCCL value; if anyone is actually trying to use NCCL in this state, it
    # should error.
    default_pg_nccl_timeout = None
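
# A minimal sketch (not part of the upstream module; the helper name and the
# backend-string check are assumptions for illustration): how the two defaults
# might be resolved per backend.
def _resolve_default_timeout(backend: str) -> timedelta:
    # NCCL gets its own default when the C++ extension exposes one; every
    # other backend uses the process-group-wide default.
    if backend == "nccl" and default_pg_nccl_timeout is not None:
        return default_pg_nccl_timeout
    return default_pg_timeout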