786 lines
32 KiB
C++
786 lines
32 KiB
C++
/* Copyright 2017 The OpenXLA Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
#ifndef XLA_SERVICE_BUFFER_ASSIGNMENT_H_
|
|
#define XLA_SERVICE_BUFFER_ASSIGNMENT_H_
|
|
|
|
#include <functional>
|
|
#include <iosfwd>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "absl/container/flat_hash_map.h"
|
|
#include "absl/container/flat_hash_set.h"
|
|
#include "absl/types/span.h"
|
|
#include "xla/hlo/ir/hlo_computation.h"
|
|
#include "xla/hlo/ir/hlo_instruction.h"
|
|
#include "xla/hlo/ir/hlo_module.h"
|
|
#include "xla/hlo/utils/hlo_live_range.h"
|
|
#include "xla/service/buffer_assignment.pb.h"
|
|
#include "xla/service/heap_simulator/heap_simulator.h"
|
|
#include "xla/service/hlo.pb.h"
|
|
#include "xla/service/hlo_alias_analysis.h"
|
|
#include "xla/service/hlo_dataflow_analysis.h"
|
|
#include "xla/service/logical_buffer.h"
|
|
#include "xla/service/memory_space_assignment/memory_space_assignment.h"
|
|
#include "xla/service/tuple_points_to_analysis.h"
|
|
#include "xla/statusor.h"
|
|
#include "xla/types.h"
|
|
#include "tsl/platform/logging.h"
|
|
|
|
namespace xla {
|
|
|
|
// Walk the call graph of the HLO module and place each computation into either
|
|
// thread_local_computations or global_computations depending upon whether the
|
|
// computation requires thread-local allocations or global allocations. The
|
|
// elements in thread_local_computations and global_computations are in post
|
|
// order (if computation A has an instruction which calls computation B, then A
|
|
// will appear after B in the vector).
|
|
Status GatherComputationsByAllocationType(
|
|
const HloModule* module,
|
|
std::vector<const HloComputation*>* thread_local_computations,
|
|
std::vector<const HloComputation*>* global_computations);
|
|
|
|
// This class abstracts an allocation of contiguous memory which can hold the
|
|
// values described by LogicalBuffers. Each LogicalBuffer occupies a sub-range
|
|
// of the allocation, represented by a Slice. A single BufferAllocation may hold
|
|
// LogicalBuffers with disjoint liveness, which may have overlapping Slices. A
|
|
// single BufferAllocation may also hold LogicalBuffers with overlapping
|
|
// liveness, which must have disjoint Slices.
|
|
//
|
|
// The abstraction includes information required by the backends for allocation,
|
|
// use, and deallocation of the buffer. This includes the LogicalBuffers which
|
|
// are held in this allocation through the execution of the computation.
|
|
class BufferAllocation {
|
|
public:
|
|
// Holds a unique identifier for each allocation. Values are assigned
|
|
// contiguously and can be used as array indexes.
|
|
using Index = int64_t;
|
|
|
|
BufferAllocation(Index index, int64_t size, LogicalBuffer::Color color)
|
|
: index_(index), size_(size), color_(color) {}
|
|
~BufferAllocation() {}
|
|
|
|
// Returns the index of this allocation.
|
|
Index index() const { return index_; }
|
|
|
|
// Whether this allocation is used in a parallel calling context such as
|
|
// inside of a map or reduce computation. Such allocations need to be thread
|
|
// local.
|
|
bool is_thread_local() const { return is_thread_local_; }
|
|
void set_is_thread_local(bool is_thread_local) {
|
|
is_thread_local_ = is_thread_local;
|
|
}
|
|
|
|
// Whether this allocation can be used by more than one logical buffer.
|
|
bool is_reusable() const {
|
|
// We do not reuse thread-local buffers for now, because they are
|
|
// dynamically allocated and their lifetimes are hard to compute.
|
|
//
|
|
// TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
|
|
// assumes longer buffer liveness than indicated by the analysis.
|
|
return !is_thread_local() && !is_tuple();
|
|
}
|
|
|
|
// Whether this allocation is readonly i.e. backed by memory we cannot write
|
|
// to.
|
|
bool is_readonly() const {
|
|
// Entry parameters are generally readonly, except when they are aliased
|
|
// with any output.
|
|
return (is_entry_computation_parameter() &&
|
|
!is_parameter_aliased_with_output_) ||
|
|
is_constant();
|
|
}
|
|
|
|
bool is_tuple() const { return is_tuple_; }
|
|
void set_is_tuple(bool is_tuple) { is_tuple_ = is_tuple; }
|
|
|
|
// Whether this allocation holds a LogicalBuffer from a parameter of the entry
|
|
// computation. These buffers have lifetimes which may be longer than the
|
|
// XLA computation.
|
|
bool is_entry_computation_parameter() const {
|
|
return is_entry_computation_parameter_;
|
|
}
|
|
|
|
// Whether this allocation holds a constant. On the CPU and GPU backends
|
|
// constant allocations are not allocated dynamically, instead we resolve
|
|
// references to these buffer allocations to a global in the readonly section
|
|
// of the binary.
|
|
bool is_constant() const { return is_constant_; }
|
|
|
|
// If this allocation holds a Buffer from a parameter of the entry
|
|
// computation, this methods returns the parameter number. CHECKs otherwise.
|
|
int64_t parameter_number() const {
|
|
CHECK(is_entry_computation_parameter_);
|
|
return parameter_number_;
|
|
}
|
|
|
|
// If this allocation is for a parameter of the entry computation, this
|
|
// function returns which subshape of the parameter the allocation is for.
|
|
const ShapeIndex& param_shape_index() const {
|
|
CHECK(is_entry_computation_parameter_);
|
|
return param_shape_index_;
|
|
}
|
|
|
|
// Returns whether this allocation is assigned a LogicalBuffer which may
|
|
// be live out of the entry computation.
|
|
bool maybe_live_out() const { return maybe_live_out_; }
|
|
|
|
void set_maybe_live_out(bool value) { maybe_live_out_ = value; }
|
|
|
|
// Returns the size of the allocation. Necessarily this must be at least as
|
|
// large as any LogicalBuffer assigned to this allocation.
|
|
int64_t size() const { return size_; }
|
|
|
|
// Returns the color of the allocation. Only logical buffers with a matching
|
|
// color can reside in this allocation.
|
|
LogicalBuffer::Color color() const { return color_; }
|
|
|
|
struct OffsetSize {
|
|
int64_t offset = 0;
|
|
int64_t size = 0;
|
|
};
|
|
|
|
// Access to the logical buffers assigned to this allocation, and their
|
|
// associated logical offsets and sizes.
|
|
const absl::flat_hash_map<const HloValue*, OffsetSize>& assigned_buffers()
|
|
const {
|
|
return assigned_buffers_;
|
|
}
|
|
|
|
// A Slice represents a contiguous portion of a memory allocation. It is used
|
|
// to identify the memory range that a LogicalBuffer corresponds to.
|
|
class Slice {
|
|
public:
|
|
Slice() {}
|
|
Slice(const BufferAllocation* allocation, int64_t offset, int64_t size)
|
|
: allocation_(allocation), offset_(offset), size_(size) {}
|
|
|
|
const BufferAllocation* allocation() const { return allocation_; }
|
|
Index index() const { return allocation_->index(); }
|
|
int64_t offset() const { return offset_; }
|
|
int64_t size() const { return size_; }
|
|
|
|
bool operator==(const Slice& other) const {
|
|
return index() == other.index() && offset_ == other.offset_ &&
|
|
size_ == other.size_;
|
|
}
|
|
bool operator!=(const Slice& other) const { return !(*this == other); }
|
|
bool operator<(const Slice& other) const {
|
|
if (index() != other.index()) return index() < other.index();
|
|
if (offset_ != other.offset_) return offset_ < other.offset_;
|
|
return size_ < other.size_;
|
|
}
|
|
|
|
// Returns true iff this slice's memory range has a non-empty intersection
|
|
// with the other slice's memory range.
|
|
bool OverlapsWith(const Slice& other) const {
|
|
const int64_t end = offset_ + size_;
|
|
const int64_t other_end = other.offset_ + other.size_;
|
|
return index() == other.index() && offset_ < other_end &&
|
|
end > other.offset_;
|
|
}
|
|
|
|
template <typename H>
|
|
friend H AbslHashValue(H h, const Slice& s) {
|
|
return H::combine(std::move(h), s.index(), s.offset(), s.size());
|
|
}
|
|
|
|
std::string ToString() const;
|
|
|
|
private:
|
|
const BufferAllocation* allocation_ = nullptr;
|
|
int64_t offset_ = 0;
|
|
int64_t size_ = 0;
|
|
};
|
|
|
|
// GetSlice returns the Slice of contiguous memory that holds the value
|
|
// described by the given 'buffer'.
|
|
// REQUIRES: 'buffer' must be assigned to this allocation.
|
|
Slice GetSlice(const HloValue& buffer) const;
|
|
|
|
std::string ToString() const;
|
|
BufferAllocationProto ToProto() const;
|
|
|
|
// Whether the buffer is a parameter to or live out of the entry computation.
|
|
bool IsInputOrOutput() const {
|
|
return is_entry_computation_parameter() || maybe_live_out();
|
|
}
|
|
|
|
// Whether the buffer is a temporary buffer allocated before
|
|
// Executable::ExecuteOnStream.
|
|
bool IsPreallocatedTempBuffer() const {
|
|
// Parameters do not need temporary buffers.
|
|
return !is_entry_computation_parameter() &&
|
|
// LogicalBuffers that maybe pointed to by the output should live out
|
|
// of the computation.
|
|
!maybe_live_out() &&
|
|
// Thread-local buffers are allocated using `alloca`s.
|
|
!is_thread_local() &&
|
|
// Constant buffers are allocated as global values.
|
|
!is_constant();
|
|
}
|
|
|
|
// Add a heap trace which was used to assign slices to logical buffers in this
|
|
// allocation. A single BufferAllocation may include multiple heap traces
|
|
// in the case of the temporary block where there is a heap trace per
|
|
// computation.
|
|
void AddHeapTrace(const HeapSimulatorTrace& heap_trace) {
|
|
heap_traces_.push_back(heap_trace);
|
|
heap_traces_.back().set_buffer_allocation_index(index());
|
|
}
|
|
|
|
// Return the set of heap traces used to assign slices to logical buffers in
|
|
// this allocation.
|
|
const std::vector<HeapSimulatorTrace> HeapTraces() const {
|
|
return heap_traces_;
|
|
}
|
|
|
|
// Returns the LogicalBuffers which are live at the point of peak memory usage
|
|
// for this allocation. The point of peak memory usage is the point at which
|
|
// the total size of all live logical buffers is maximal. If peak memory is
|
|
// reached at multiple points, the set of logical buffers live at the earliest
|
|
// maximal point is returned. The vector is stably sorted by
|
|
// BufferValue::Index.
|
|
const std::vector<const HloValue*>& PeakMemoryLogicalBuffers() const {
|
|
return peak_buffers_;
|
|
}
|
|
|
|
// Get the number of bytes lost to fragmentation. This is equal to the
|
|
// difference between the size of the allocation and the size of the maximal
|
|
// live set.
|
|
int64_t fragmentation_bytes() const { return fragmentation_bytes_; }
|
|
|
|
bool operator==(const BufferAllocation& other) const {
|
|
return index_ == other.index_;
|
|
}
|
|
bool operator!=(const BufferAllocation& other) const {
|
|
return !(*this == other);
|
|
}
|
|
bool operator<(const BufferAllocation& other) const {
|
|
return index() < other.index();
|
|
}
|
|
|
|
void set_entry_computation_parameter(int64_t parameter_number,
|
|
ShapeIndex param_shape_index,
|
|
bool parameter_aliased_with_output) {
|
|
is_entry_computation_parameter_ = true;
|
|
is_parameter_aliased_with_output_ = parameter_aliased_with_output;
|
|
parameter_number_ = parameter_number;
|
|
param_shape_index_ = std::move(param_shape_index);
|
|
}
|
|
|
|
void set_constant(bool is_constant) { is_constant_ = is_constant; }
|
|
|
|
private:
|
|
// Only BufferAssigner and BufferAssignment can modify BufferAllocation.
|
|
friend class BufferAssigner;
|
|
friend class BufferAssignment;
|
|
|
|
// Adds a LogicalBuffer to the set assigned to this buffer.
|
|
void AddAssignment(const HloValue& buffer, int64_t offset, int64_t size);
|
|
|
|
void set_index(Index index) { index_ = index; }
|
|
void set_size(int64_t size) { size_ = size; }
|
|
|
|
// The index of the allocation in the BufferAssignment.
|
|
Index index_;
|
|
|
|
// Size of the allocation in bytes.
|
|
int64_t size_;
|
|
|
|
// Whether this buffer needs to be thread-local.
|
|
bool is_thread_local_ = false;
|
|
|
|
// Whether this buffer holds a tuple.
|
|
bool is_tuple_ = false;
|
|
|
|
// Color of the allocation.
|
|
LogicalBuffer::Color color_;
|
|
|
|
// Whether this allocation holds an entry computation parameter. Entry
|
|
// computation parameters are special because they have lifetimes which may
|
|
// outlast the computation.
|
|
bool is_entry_computation_parameter_ = false;
|
|
|
|
// Whether this entry computation parameter is aliased with output.
|
|
bool is_parameter_aliased_with_output_ = false;
|
|
|
|
// If this allocation holds an entry computation parameter, this field
|
|
// indicates the index (starting from 0) of the parameter.
|
|
int64_t parameter_number_ = 0;
|
|
|
|
// If this buffer is for an entry computation parameter, which subshape of the
|
|
// parameter is it for?
|
|
ShapeIndex param_shape_index_;
|
|
|
|
// Whether the allocation contains a LogicalBuffer which may be live-out of
|
|
// the entry computation. Note that this flag is conservatively computed by
|
|
// TuplePointsToAnalysis. That is, an allocation marked `maybe_live_out_`
|
|
// might not actually escape.
|
|
bool maybe_live_out_ = false;
|
|
|
|
// See comment on the is_constant() accessor.
|
|
bool is_constant_ = false;
|
|
|
|
// Mapping from the set of buffers assigned to this allocation to their
|
|
// logical offsets and sizes.
|
|
absl::flat_hash_map<const HloValue*, OffsetSize> assigned_buffers_;
|
|
|
|
int64_t fragmentation_bytes_ = 0;
|
|
std::vector<HeapSimulatorTrace> heap_traces_;
|
|
|
|
// Set of buffers live at the point of peak memory usage for this allocation.
|
|
std::vector<const HloValue*> peak_buffers_;
|
|
};
|
|
|
|
// Add stream operators for nicer output of CHECK/RET_CHECK failures.
|
|
std::ostream& operator<<(std::ostream& out, const BufferAllocation& s);
|
|
std::ostream& operator<<(std::ostream& out, const BufferAllocation::Slice& s);
|
|
|
|
// This class encapsulates an assignment of the LogicalBuffers in an XLA
|
|
// module to a set of BufferAllocations.
|
|
class BufferAssignment {
|
|
public:
|
|
// This is a think wrapper around BufferIsolationConfig. Please see the
|
|
// documentation for BufferIsolationConfig for details on how buffer isolation
|
|
// works. hlo_value_compare is the concrete implementation of the HloValue
|
|
// comparison that uses the isolation_order_salt value in the
|
|
// BufferIsolationConfig.
|
|
struct BufferIsolationOptions {
|
|
std::function<bool(const HloValue*, const HloValue*)> hlo_value_compare;
|
|
buffer_assignment::BufferIsolationConfig config;
|
|
};
|
|
|
|
// Returns the vector containing all buffer allocations in this assignment.
|
|
const std::vector<BufferAllocation>& Allocations() const {
|
|
return allocations_;
|
|
}
|
|
|
|
// Returns the total size allocation holding all temporary buffers.
|
|
int64_t temp_allocation_total_size() const {
|
|
return temp_allocation_total_size_;
|
|
}
|
|
|
|
uint64_t multiheap_size_constraint_per_heap() const {
|
|
return multiheap_size_constraint_per_heap_;
|
|
}
|
|
|
|
// Returns whether the given buffer has been assigned an allocation.
|
|
bool HasAllocation(const HloValue& value) const;
|
|
|
|
// Returns whether the given (logical) buffer with the id has been assigned an
|
|
// allocation.
|
|
bool HasAllocation(HloValue::Id value_id) const;
|
|
|
|
bool HasAllocation(const HloBuffer& buffer) const;
|
|
|
|
// Returns the allocation that a particular LogicalBuffer has been assigned
|
|
// to. CHECKs if buffer has not been assigned an allocation.
|
|
const BufferAllocation& GetAssignedAllocation(const HloValue& value) const;
|
|
|
|
const BufferAllocation& GetAssignedAllocation(
|
|
const HloBuffer& hlo_buffer) const;
|
|
|
|
// Returns the allocation with the given index. CHECKs if no allocation exists
|
|
// with the given index.
|
|
const BufferAllocation& GetAllocation(BufferAllocation::Index index) const;
|
|
|
|
// Returns the allocation with the given instruction and shape index. nullptr
|
|
// if no allocation exists.
|
|
const BufferAllocation* GetInstructionAllocation(
|
|
const HloInstruction* hlo, const ShapeIndex& shape_index) const;
|
|
|
|
// Builds and returns a vector containing the slices which might contain the
|
|
// subvalue at the given index of given instruction.
|
|
std::set<BufferAllocation::Slice> GetAllSlices(
|
|
const HloInstruction* instruction, const ShapeIndex& index) const;
|
|
|
|
// Convenience function which returns whether the buffer of the
|
|
// instruction at the given index is assigned an allocation.
|
|
bool HasAllocationAt(const HloInstruction* instruction,
|
|
const ShapeIndex& index) const;
|
|
|
|
// Convenience function which returns whether the top-level buffer of the
|
|
// instruction (index == {}) is assigned an allocation.
|
|
bool HasTopLevelAllocation(const HloInstruction* instruction) const;
|
|
|
|
// Convenience function which returns the unique slice containing the buffer
|
|
// at the given index of the given instruction. If a slice is not assigned or
|
|
// the slice cannot be determined at compile time then an error is returned.
|
|
StatusOr<BufferAllocation::Slice> GetUniqueSlice(
|
|
const HloInstruction* instruction, const ShapeIndex& index) const;
|
|
// Like GetUniqueSlice but fixes the index to the top-level of the shape
|
|
// (index = {}).
|
|
StatusOr<BufferAllocation::Slice> GetUniqueTopLevelSlice(
|
|
const HloInstruction* instruction) const;
|
|
// Like GetUniqueTopLevelSlice but returns the slice for the output of the
|
|
// entry computation of the HLO module (ie, the result of the XLA
|
|
// computation).
|
|
StatusOr<BufferAllocation::Slice> GetUniqueTopLevelOutputSlice() const;
|
|
|
|
// Returns the set BufferValues which may be the source of the value at the
|
|
// given index and instruction.
|
|
const std::vector<const HloValue*>& GetSourceBuffers(
|
|
const HloInstruction* instruction, const ShapeIndex& index) const {
|
|
return dataflow_analysis().GetValueSet(instruction, index).values();
|
|
}
|
|
|
|
// Returns true if 'hlo_a{shape_index_a}' and 'hlo_b{shape_index_b}'
|
|
// share the same BufferAllocation::Slice.
|
|
// Returns false otherwise.
|
|
// REQUIRES: BufferAssignment assigned allocations to both instructions.
|
|
bool SharesSliceAtIndex(const HloInstruction* hlo_a,
|
|
const ShapeIndex& shape_index_a,
|
|
const HloInstruction* hlo_b,
|
|
const ShapeIndex& shape_index_b) const;
|
|
|
|
// Returns true if the top-level buffers of hlo_a and hlo_b are the same.
|
|
// REQUIRES: HasTopLevelAllocation(hlo_a) && HasTopLevelAllocation(hlo_b).
|
|
bool SharesTopLevelSlice(const HloInstruction* hlo_a,
|
|
const HloInstruction* hlo_b) const {
|
|
return SharesSliceAtIndex(hlo_a, {}, hlo_b, {});
|
|
}
|
|
|
|
// Returns true if hlo_a and hlo_b both have at least one buffer assigned for
|
|
// their top-level and each of their nested shape indices, and if hlo_a's
|
|
// buffers are all different from hlo_b's buffers.
|
|
bool HaveDisjointSlices(const HloInstruction* hlo_a,
|
|
const HloInstruction* hlo_b) const;
|
|
|
|
const HloDataflowAnalysis& dataflow_analysis() const {
|
|
return alias_analysis_->dataflow_analysis();
|
|
}
|
|
|
|
HloAliasAnalysis& alias_analysis() const { return *alias_analysis_; }
|
|
|
|
const HloOrdering& hlo_ordering() const { return *hlo_ordering_; }
|
|
|
|
// Returns the HloLiveRange object used to construct this assignment.
|
|
const HloLiveRange& hlo_live_range() const { return *hlo_live_range_; }
|
|
|
|
std::string ToString() const;
|
|
// Verbose string tailored to debugging OOMs, includes the Hlo op metadata for
|
|
// every buffer associated with each allocation.
|
|
std::string ToVerboseString(size_t max_buffers_to_show) const;
|
|
std::string BufferInfoString() const;
|
|
|
|
// Convert BufferAssignment to or from a proto.
|
|
BufferAssignmentProto ToProto() const;
|
|
static StatusOr<std::unique_ptr<BufferAssignment>> FromProto(
|
|
const BufferAssignmentProto& proto, const HloModule* module,
|
|
BufferValue::SizeFunction buffer_size,
|
|
HloDataflowAnalysis::CanShareBuffer can_share_buffer);
|
|
|
|
// Statistics for the assignment. Values initialized to -1 are not always
|
|
// collected; fragmentation is only collected for instructions that have a
|
|
// sequential total ordering.
|
|
struct Stats {
|
|
int64_t parameter_allocation_count = 0;
|
|
int64_t parameter_allocation_bytes = 0;
|
|
int64_t constant_allocation_count = 0;
|
|
int64_t constant_allocation_bytes = 0;
|
|
int64_t maybe_live_out_allocation_count = 0;
|
|
int64_t maybe_live_out_allocation_bytes = 0;
|
|
int64_t preallocated_temp_allocation_count = 0;
|
|
int64_t preallocated_temp_allocation_bytes = 0;
|
|
int64_t preallocated_temp_fragmentation_bytes = -1;
|
|
int64_t total_allocation_count = 0;
|
|
int64_t total_allocation_bytes = 0;
|
|
int64_t total_fragmentation_bytes = -1;
|
|
|
|
std::string ToString() const;
|
|
};
|
|
const Stats& GetStats() const { return stats_; }
|
|
|
|
private:
|
|
// Only BufferAssigner can build or modify BufferAssignments.
|
|
friend class BufferAssigner;
|
|
|
|
BufferAssignment(const HloModule* module,
|
|
std::unique_ptr<HloOrdering> hlo_ordering,
|
|
BufferValue::SizeFunction buffer_size,
|
|
LogicalBuffer::AlignmentFunction color_alignment,
|
|
std::unique_ptr<HloAliasAnalysis> alias_analysis,
|
|
std::unique_ptr<HloLiveRange> hlo_live_range)
|
|
: module_(module),
|
|
hlo_ordering_(std::move(hlo_ordering)),
|
|
buffer_size_(std::move(buffer_size)),
|
|
color_alignment_(std::move(color_alignment)),
|
|
alias_analysis_(std::move(alias_analysis)),
|
|
hlo_live_range_(std::move(hlo_live_range)) {
|
|
int32_t raw_value = module->config()
|
|
.debug_options()
|
|
.xla_multiheap_size_constraint_per_heap();
|
|
// -1 means no constraint.
|
|
multiheap_size_constraint_per_heap_ =
|
|
(raw_value == -1) ? UINT64_MAX : raw_value;
|
|
}
|
|
|
|
// Creates and returns a new BufferAllocation, with no assigned
|
|
// LogicalBuffers. Ownership is maintained internally.
|
|
BufferAllocation* NewEmptyAllocation(int64_t size,
|
|
LogicalBuffer::Color color);
|
|
|
|
// Helper that calls NewEmptyAllocation and AddAssignment in one call,
|
|
// creating an allocation containing a single LogicalBuffer.
|
|
BufferAllocation* NewAllocation(const HloBuffer& buffer, int64_t size);
|
|
|
|
// Adds a LogicalBuffer to the set assigned to the given allocation.
|
|
void AddAssignment(BufferAllocation* allocation, const HloBuffer& buffer,
|
|
int64_t offset, int64_t size);
|
|
|
|
void AddAssignment(BufferAllocation* allocation, const HloValue& value,
|
|
int64_t offset, int64_t size);
|
|
|
|
// Returns the HloModule used to construct this assignment.
|
|
const HloModule& module() const { return *module_; }
|
|
|
|
// Mutable accessors for allocations.
|
|
BufferAllocation* GetMutableAssignedAllocation(const HloBuffer& buffer);
|
|
BufferAllocation* GetMutableAllocation(BufferAllocation::Index index);
|
|
|
|
int64_t HloBufferSize(const HloBuffer& buffer) {
|
|
auto iter = cached_buffer_sizes_.find(buffer.id());
|
|
if (iter != cached_buffer_sizes_.end()) return iter->second;
|
|
int64_t result = 0;
|
|
for (const HloValue* value : buffer.values()) {
|
|
result = std::max(result, buffer_size_(*value));
|
|
}
|
|
cached_buffer_sizes_.insert({buffer.id(), result});
|
|
return result;
|
|
}
|
|
|
|
// Combines allocations of temporary buffers into one big BufferAllocation.
|
|
void CombineTempAllocations(
|
|
const absl::flat_hash_set<BufferValue::Color>& private_stack_colors);
|
|
|
|
// Computes stats for the assignment, to be retrieved by GetStats.
|
|
Status ComputeSummaryStats();
|
|
|
|
// The vector of buffer allocations. Indexed by BufferAllocation::Index.
|
|
std::vector<BufferAllocation> allocations_;
|
|
|
|
// The total size of all temporary buffers.
|
|
int64_t temp_allocation_total_size_ = 0;
|
|
|
|
uint64_t multiheap_size_constraint_per_heap_;
|
|
|
|
// Maps Buffers to the index of the BufferAllocation which holds the buffer.
|
|
absl::flat_hash_map<const HloValue*, BufferAllocation::Index>
|
|
allocation_index_for_value_;
|
|
|
|
const HloModule* module_;
|
|
|
|
const std::unique_ptr<HloOrdering> hlo_ordering_;
|
|
|
|
// Function which returns the buffer size for a given logical buffer (shape).
|
|
BufferValue::SizeFunction buffer_size_;
|
|
|
|
// Function which returns the alignment for a given logical buffer color.
|
|
LogicalBuffer::AlignmentFunction color_alignment_;
|
|
|
|
std::unique_ptr<HloAliasAnalysis> alias_analysis_;
|
|
|
|
std::unique_ptr<HloLiveRange> hlo_live_range_;
|
|
|
|
Stats stats_;
|
|
|
|
absl::flat_hash_map<HloBuffer::Id, int64_t> cached_buffer_sizes_;
|
|
|
|
BufferAssignment(const BufferAssignment&) = delete;
|
|
BufferAssignment& operator=(const BufferAssignment&) = delete;
|
|
};
|
|
|
|
// A class which constructs a buffer assignment.
|
|
class BufferAssigner {
|
|
public:
|
|
using Colorer = std::function<Status(HloAliasAnalysis*, const HloOrdering&)>;
|
|
using MustNotLiveOut =
|
|
std::function<bool(const HloInstruction*, const ShapeIndex&)>;
|
|
using PrivateStacks = absl::flat_hash_map<BufferValue::Color,
|
|
std::vector<const HloComputation*>>;
|
|
|
|
static Colorer DefaultColorer() {
|
|
return [](HloAliasAnalysis* alias_analysis, const HloOrdering&) {
|
|
for (HloValue* value : alias_analysis->dataflow_analysis().values()) {
|
|
const HloPosition& defining_position = value->defining_position();
|
|
if (defining_position.shape().has_layout()) {
|
|
value->set_color(BufferValue::Color(
|
|
defining_position.shape().layout().memory_space()));
|
|
} else {
|
|
value->set_color(BufferValue::Color(0));
|
|
}
|
|
}
|
|
return OkStatus();
|
|
};
|
|
}
|
|
|
|
// Returns false if a buffer cannot be assigned to given allocation.
|
|
|
|
// Build and return a BufferAssignment for the given module. The given
|
|
// HloOrdering is used to determine buffer liveness. buffer_size and
|
|
// color_alignment are functions which returns the size and alignment of a
|
|
// LogicalBuffer. If preset_assignments is provided, those pre-set assignment
|
|
// offsets will be used. The caller guarantees that those assignments are
|
|
// valid and they do not overwrite each other.
|
|
static StatusOr<std::unique_ptr<BufferAssignment>> Run(
|
|
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
|
|
BufferValue::SizeFunction buffer_size,
|
|
LogicalBuffer::AlignmentFunction color_alignment,
|
|
bool allocate_buffers_for_constants = false,
|
|
Colorer colorer = DefaultColorer(),
|
|
std::optional<MustNotLiveOut> must_not_live_out = std::nullopt,
|
|
HloDataflowAnalysis::CanShareBuffer can_share_buffer = nullptr,
|
|
std::unique_ptr<memory_space_assignment::PresetAssignments>
|
|
preset_assignments = {},
|
|
const PrivateStacks& private_stacks = {},
|
|
GlobalDecreasingSizeBestFitHeap<HloValue>::BufferIntervalCompare
|
|
heap_buffer_interval_compare = nullptr,
|
|
std::optional<BufferAssignment::BufferIsolationOptions>
|
|
isolation_options = std::nullopt);
|
|
|
|
private:
|
|
BufferAssigner(bool allocate_buffers_for_constants, Colorer colorer,
|
|
std::optional<MustNotLiveOut> must_not_live_out,
|
|
std::unique_ptr<memory_space_assignment::PresetAssignments>
|
|
preset_assignments)
|
|
: allocate_buffers_for_constants_(allocate_buffers_for_constants),
|
|
colorer_(colorer),
|
|
must_not_live_out_(must_not_live_out),
|
|
preset_assignments_(std::move(preset_assignments)) {}
|
|
virtual ~BufferAssigner() = default;
|
|
|
|
// Create a buffer assignment.
|
|
StatusOr<std::unique_ptr<BufferAssignment>> CreateAssignment(
|
|
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
|
|
BufferValue::SizeFunction buffer_size,
|
|
LogicalBuffer::AlignmentFunction color_alignment,
|
|
HloDataflowAnalysis::CanShareBuffer can_share_buffer,
|
|
const PrivateStacks& private_stacks,
|
|
GlobalDecreasingSizeBestFitHeap<HloValue>::BufferIntervalCompare
|
|
heap_buffer_interval_compare,
|
|
std::optional<BufferAssignment::BufferIsolationOptions>
|
|
isolation_options);
|
|
|
|
// Assigns buffers to the instructions in the given computations. "assignment"
|
|
// is modified to reflect the new buffer assignments. If is_thread_local is
|
|
// true, then all assigned buffers have the is_thread_local flag set to
|
|
// true.
|
|
Status AssignBuffersForComputations(
|
|
const std::vector<const HloComputation*>& computations,
|
|
bool is_thread_local,
|
|
absl::flat_hash_map<const HloComputation*,
|
|
absl::flat_hash_set<const HloValue*>>*
|
|
buffers_to_assign_sequentially,
|
|
BufferAssignment* assignment);
|
|
|
|
// Returns true if buffer's live range interferences with buffer2's.
|
|
bool LiveRangeInterferes(const HloValue* buffer1, const HloValue* buffer2,
|
|
BufferAssignment* assignment);
|
|
|
|
// Assigns pre-set assignments, if provided. These assignments will be added
|
|
// to assigned_buffers and skip buffer allocation.
|
|
Status AssignPresetBuffers(
|
|
absl::flat_hash_set<const HloBuffer*>* assigned_buffers,
|
|
BufferAssignment* assignment);
|
|
|
|
// Assigns a single hlo buffer to an HLO allocation.
|
|
Status AssignSingleHloBuffer(
|
|
const HloBuffer* hlo_buffer, bool is_thread_local,
|
|
absl::flat_hash_map<const HloComputation*,
|
|
absl::flat_hash_set<const HloValue*>>*
|
|
buffers_to_assign_sequentially,
|
|
std::vector<BufferAllocation::Index>* allocation_indices,
|
|
BufferAssignment* assignment);
|
|
|
|
// Assigns 'buffers_to_assign_sequentially' using heap simulation, assuming
|
|
// the HLO instructions will be executed in the sequential order given by
|
|
// assignment->liveness().hlo_ordering().SequentialOrder. If
|
|
// 'run_whole_module_heap_simulation' is true, the heap simulation will be run
|
|
// assuming all global computations are sequentially ordered.
|
|
Status AssignBuffersWithSequentialOrdering(
|
|
const absl::flat_hash_map<const HloComputation*,
|
|
absl::flat_hash_set<const HloValue*>>&
|
|
buffers_to_assign_sequentially,
|
|
bool run_whole_module_heap_simulation, BufferAssignment* assignment,
|
|
const PrivateStacks& private_stacks,
|
|
GlobalDecreasingSizeBestFitHeap<HloValue>::BufferIntervalCompare
|
|
heap_buffer_interval_compare,
|
|
std::optional<BufferAssignment::BufferIsolationOptions>
|
|
isolation_options);
|
|
|
|
// Isolates the buffers packed by heap simulator using the provided isolation
|
|
// options. Please see the documentation for BufferIsolationConfig for more
|
|
// details.
|
|
void IsolateHeapBuffers(
|
|
std::optional<BufferAssignment::BufferIsolationOptions> isolation_options,
|
|
const BufferAssignment* assignment, LogicalBuffer::Color color,
|
|
HeapSimulator::Result<HloValue>& result) const;
|
|
|
|
// Uses the results of the heap simulator to create a single allocation, with
|
|
// LogicalBuffers packed to specific offsets.
|
|
void AssignBuffersFromHeapSimulator(
|
|
HeapSimulator::Result<HloValue>& result, BufferAssignment* assignment,
|
|
LogicalBuffer::Color color,
|
|
std::optional<BufferAssignment::BufferIsolationOptions>
|
|
isolation_options);
|
|
|
|
// Tries to assign the given instruction to the given buffer. Returns if the
|
|
// assignment was successful.
|
|
bool MaybeAssignBuffer(BufferAllocation* allocation, const HloBuffer& buffer,
|
|
BufferAssignment* assignment);
|
|
|
|
// Split a set of buffers into several sets, each of which contains buffers
|
|
// colored with the same color.
|
|
absl::flat_hash_map<LogicalBuffer::Color,
|
|
absl::flat_hash_set<const HloValue*>>
|
|
SplitBuffersByColor(
|
|
const absl::flat_hash_set<const HloValue*>& buffers) const;
|
|
|
|
// Split a set of buffers into several sets, each of which contains buffers
|
|
// with defining instructions that are dominated by the given private stack
|
|
// computation. This function CHECK-fails if there are outstanding buffers
|
|
// that do not have a dominating private stack computation.
|
|
absl::flat_hash_map<const HloComputation*,
|
|
absl::flat_hash_set<const HloValue*>>
|
|
SplitBuffersByPrivateStackComputation(
|
|
const absl::flat_hash_set<const HloValue*>& buffers,
|
|
absl::Span<const HloComputation* const> private_stack_computations,
|
|
const CallGraph& call_graph) const;
|
|
|
|
// If true, allocate buffers for constant instructions.
|
|
bool allocate_buffers_for_constants_;
|
|
|
|
// Functor used to assign colors to newly allocated logical buffers.
|
|
Colorer colorer_;
|
|
|
|
// An optional function that returns true if the given instruction can't live
|
|
// out of a computation.
|
|
std::optional<MustNotLiveOut> must_not_live_out_;
|
|
|
|
// Description of any buffer offsets that are already set by an earlier pass.
|
|
std::unique_ptr<memory_space_assignment::PresetAssignments>
|
|
preset_assignments_;
|
|
|
|
BufferAssigner(const BufferAssigner&) = delete;
|
|
BufferAssigner& operator=(const BufferAssigner&) = delete;
|
|
};
|
|
|
|
} // namespace xla
|
|
|
|
#endif // XLA_SERVICE_BUFFER_ASSIGNMENT_H_
|