#pragma once
#include <ATen/Config.h>
#include <c10/macros/Macros.h>
#include <cstdint>
#include <functional>
#include <string>

namespace at {

// Integer division rounded up, e.g. divup(10, 3) == 4.
inline int64_t divup(int64_t x, int64_t y) {
  return (x + y - 1) / y;
}

// Called during new thread initialization
TORCH_API void init_num_threads();

// Sets the number of threads to be used in parallel regions
TORCH_API void set_num_threads(int);

// Returns the maximum number of threads that may be used in a parallel region
TORCH_API int get_num_threads();

// Returns the current thread number (starting from 0)
// in the current parallel region, or 0 in the sequential region
TORCH_API int get_thread_num();

// Checks whether the code runs in a parallel region
TORCH_API bool in_parallel_region();

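// Example (illustrative sketch, not part of this header): querying and
// adjusting the intra-op thread count; the value passed to set_num_threads
// below is an arbitrary choice for demonstration.
//
//   at::init_num_threads();            // once per thread that enters ATen
//   int prev = at::get_num_threads();  // current intra-op maximum
//   at::set_num_threads(2);            // cap intra-op parallelism at 2 threads
//   // ... run parallel work ...
//   at::set_num_threads(prev);         // restore the previous setting
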
namespace internal {

// Initializes num_threads lazily at the first parallel call
inline void lazy_init_num_threads() {
  thread_local bool init = false;
  if (C10_UNLIKELY(!init)) {
    at::init_num_threads();
    init = true;
  }
}

// Overrides the value returned by get_thread_num() for the current thread
TORCH_API void set_thread_num(int);

// RAII guard that temporarily sets the current thread number and restores
// the previous value on destruction
class TORCH_API ThreadIdGuard {
 public:
  ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) {
    set_thread_num(new_id);
  }

  ~ThreadIdGuard() {
    set_thread_num(old_id_);
  }

 private:
  int old_id_;
};

} // namespace internal

/*
parallel_for

begin: index at which to start applying the user function

end: index at which to stop applying the user function

grain_size: number of elements per chunk; affects the degree of parallelization

f: user function applied in parallel to the chunks, with signature
  void f(int64_t begin, int64_t end)

Warning: parallel_for does NOT copy thread-local state
from the current thread to the worker threads.
This means, for example, that Tensor operations CANNOT be used in the
body of your function; only data pointers may be used.
*/
template <class F>
inline void parallel_for(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const F& f);

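// Example (illustrative sketch): doubling a raw float buffer in parallel.
// In line with the warning above, only the raw pointer is captured, not a
// Tensor; the buffer size and grain size are arbitrary demonstration values.
//
//   std::vector<float> data(10000, 1.0f);
//   float* ptr = data.data();
//   at::parallel_for(0, data.size(), /*grain_size=*/2048,
//       [ptr](int64_t begin, int64_t end) {
//         for (int64_t i = begin; i < end; ++i) {
//           ptr[i] *= 2.0f;
//         }
//       });
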
/*
parallel_reduce

begin: index at which to start applying the reduction

end: index at which to stop applying the reduction

grain_size: number of elements per chunk; affects the number of elements in
the intermediate results tensor and the degree of parallelization

ident: identity for the binary combination function sf; sf(ident, x) needs to
return x

f: function for the reduction over a chunk; f needs to have the signature
  scalar_t f(int64_t partial_begin, int64_t partial_end, scalar_t identity)

sf: function to combine two partial results; sf needs to have the signature
  scalar_t sf(scalar_t x, scalar_t y)

For example, you might have a tensor of 10000 entries and want to sum together
all the elements. parallel_reduce with a grain_size of 2500 will then allocate
an intermediate results tensor with 4 elements. It will then execute the
function "f" you provide, passing the beginning and end index of each chunk
(0-2499, 2500-4999, etc.) together with the combination identity, and write
the result for each chunk into the intermediate results tensor. After that it
reduces the partial results from each chunk into a single number using the
combination function sf and the identity ident. For a total summation these
would be "+" and 0 respectively. This is similar to TBB's approach [1], where
you need to provide a function to accumulate a subrange, a function to combine
two partial results, and an identity.

Warning: parallel_reduce does NOT copy thread-local state
from the current thread to the worker threads.
This means, for example, that Tensor operations CANNOT be used in the
body of your function; only data pointers may be used.

[1] https://software.intel.com/en-us/node/506154
*/
template <class scalar_t, class F, class SF>
inline scalar_t parallel_reduce(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const scalar_t ident,
    const F& f,
    const SF& sf);

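// Example (illustrative sketch): the summation described above, over a raw
// float buffer of 10000 elements, with a grain_size of 2500, identity 0, and
// "+" as the combination function. Only the data pointer is captured.
//
//   std::vector<float> data(10000, 1.0f);
//   const float* ptr = data.data();
//   float total = at::parallel_reduce(
//       0, data.size(), /*grain_size=*/2500, /*ident=*/0.0f,
//       [ptr](int64_t begin, int64_t end, float partial) {
//         for (int64_t i = begin; i < end; ++i) {
//           partial += ptr[i];
//         }
//         return partial;
//       },
//       std::plus<float>());
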
// Returns a detailed string describing parallelization settings
TORCH_API std::string get_parallel_info();

// Sets the number of threads used for inter-op parallelism
TORCH_API void set_num_interop_threads(int);

// Returns the number of threads used for inter-op parallelism
TORCH_API int get_num_interop_threads();

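// Example (illustrative sketch): printing the parallel configuration and the
// inter-op pool size, e.g. at startup for debugging.
//
//   std::cout << at::get_parallel_info() << std::endl;
//   std::cout << "inter-op threads: " << at::get_num_interop_threads() << "\n";
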
// Launches an inter-op parallel task
TORCH_API void launch(std::function<void()> func);
namespace internal {
// Variant of launch() that does not propagate the caller's thread-local state
void launch_no_thread_state(std::function<void()> fn);
} // namespace internal

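// Example (illustrative sketch): running an independent task on the inter-op
// thread pool via launch(). The std::promise / std::future synchronization is
// the caller's choice, not something provided by this API.
//
//   std::promise<void> done;
//   auto fut = done.get_future();
//   at::launch([&done] {
//     // ... independent work ...
//     done.set_value();
//   });
//   fut.wait();
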
// Launches an intra-op parallel task
TORCH_API void intraop_launch(std::function<void()> func);

// Returns the number of intra-op threads used by default
TORCH_API int intraop_default_num_threads();

} // namespace at

#if AT_PARALLEL_OPENMP
#include <ATen/ParallelOpenMP.h> // IWYU pragma: keep
#elif AT_PARALLEL_NATIVE
#include <ATen/ParallelNative.h> // IWYU pragma: keep
#elif AT_PARALLEL_NATIVE_TBB
#include <ATen/ParallelNativeTBB.h> // IWYU pragma: keep
#endif

#include <ATen/Parallel-inl.h> // IWYU pragma: keep