1
0
mirror of https://github.com/godotengine/godot.git synced 2025-11-04 12:00:25 +00:00

Add Persistent Buffers

This work is a heavily refactored and rewritten version of TheForge's
initial code.

TheForge's original code had too many race conditions and was
fundamentally flawed, as it was too easy to run into those data races
by accident.

However, they identified the proper places that needed changes, and the
idea was sound. I used their work as a blueprint to design this work.

This PR implements:

 - Introduction of UMA buffers used by a few buffers
(most notably the ones filled by _fill_instance_data).

Ironically, this change seems to benefit PC more than it does
Mobile.

Updates D3D12 Memory Allocator to get GPU_UPLOAD heap support.

Metal implementation by Stuart Carnie.

Co-authored-by: Stuart Carnie <stuart.carnie@gmail.com>
Co-authored-by: TheForge team
This commit is contained in:
Stuart Carnie
2025-10-18 07:00:58 +11:00
parent 5950fca36c
commit 230adb7511
38 changed files with 2848 additions and 1466 deletions

View File

@@ -267,7 +267,7 @@ Error RenderingDevice::_buffer_initialize(Buffer *p_buffer, Span<uint8_t> p_data
Error RenderingDevice::_insert_staging_block(StagingBuffers &p_staging_buffers) {
StagingBufferBlock block;
block.driver_id = driver->buffer_create(p_staging_buffers.block_size, p_staging_buffers.usage_bits, RDD::MEMORY_ALLOCATION_TYPE_CPU);
block.driver_id = driver->buffer_create(p_staging_buffers.block_size, p_staging_buffers.usage_bits, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn);
ERR_FAIL_COND_V(!block.driver_id, ERR_CANT_CREATE);
block.frame_used = 0;
@@ -455,19 +455,29 @@ Error RenderingDevice::buffer_copy(RID p_src_buffer, RID p_dst_buffer, uint32_t
return OK;
}
Error RenderingDevice::buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data) {
Error RenderingDevice::buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data, bool p_skip_check) {
ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
copy_bytes_count += p_size;
ERR_FAIL_COND_V_MSG(draw_list.active, ERR_INVALID_PARAMETER,
ERR_FAIL_COND_V_MSG(draw_list.active && !p_skip_check, ERR_INVALID_PARAMETER,
"Updating buffers is forbidden during creation of a draw list");
ERR_FAIL_COND_V_MSG(compute_list.active, ERR_INVALID_PARAMETER,
ERR_FAIL_COND_V_MSG(compute_list.active && !p_skip_check, ERR_INVALID_PARAMETER,
"Updating buffers is forbidden during creation of a compute list");
Buffer *buffer = _get_buffer_from_owner(p_buffer);
ERR_FAIL_NULL_V_MSG(buffer, ERR_INVALID_PARAMETER, "Buffer argument is not a valid buffer of any type.");
ERR_FAIL_COND_V_MSG(p_offset + p_size > buffer->size, ERR_INVALID_PARAMETER, "Attempted to write buffer (" + itos((p_offset + p_size) - buffer->size) + " bytes) past the end.");
if (buffer->usage.has_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) {
uint8_t *dst_data = driver->buffer_persistent_map_advance(buffer->driver_id, frames_drawn);
memcpy(dst_data + p_offset, p_data, p_size);
direct_copy_count++;
buffer_flush(p_buffer);
return OK;
}
_check_transfer_worker_buffer(buffer);
// Submitting may get chunked for various reasons, so convert this to a task.
@@ -597,8 +607,9 @@ Error RenderingDevice::driver_callback_add(RDD::DriverCallback p_callback, void
String RenderingDevice::get_perf_report() const {
String perf_report_text;
perf_report_text += " gpu:" + String::num_int64(prev_gpu_copy_count);
perf_report_text += " bytes:" + String::num_int64(prev_copy_bytes_count);
perf_report_text += " gpu:" + String::num_int64(gpu_copy_count);
perf_report_text += " direct:" + String::num_int64(direct_copy_count);
perf_report_text += " bytes:" + String::num_int64(copy_bytes_count);
perf_report_text += " lazily alloc:" + String::num_int64(driver->get_lazily_memory_used());
return perf_report_text;
@@ -608,6 +619,7 @@ void RenderingDevice::update_perf_report() {
prev_gpu_copy_count = gpu_copy_count;
prev_copy_bytes_count = copy_bytes_count;
gpu_copy_count = 0;
direct_copy_count = 0;
copy_bytes_count = 0;
}
@@ -659,7 +671,7 @@ Vector<uint8_t> RenderingDevice::buffer_get_data(RID p_buffer, uint32_t p_offset
_check_transfer_worker_buffer(buffer);
RDD::BufferID tmp_buffer = driver->buffer_create(buffer->size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU);
RDD::BufferID tmp_buffer = driver->buffer_create(buffer->size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn);
ERR_FAIL_COND_V(!tmp_buffer, Vector<uint8_t>());
RDD::BufferCopyRegion region;
@@ -784,12 +796,38 @@ uint64_t RenderingDevice::buffer_get_device_address(RID p_buffer) {
return driver->buffer_get_device_address(buffer->driver_id);
}
// Advances the persistent mapping of `p_buffer` for the current frame and returns the
// pointer handed back by the driver's buffer_persistent_map_advance().
//
// p_buffer: RID of a buffer owned by this RenderingDevice.
//
// Returns nullptr (after reporting an error) when the render-thread guard trips, when
// `p_buffer` does not resolve to a buffer, or when the buffer does not carry
// BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. Each successful call bumps `direct_copy_count`,
// which get_perf_report() prints as "direct:".
uint8_t *RenderingDevice::buffer_persistent_map_advance(RID p_buffer) {
	// The function returns a pointer, so the failure value is nullptr rather than 0.
	ERR_RENDER_THREAD_GUARD_V(nullptr);

	Buffer *buffer = _get_buffer_from_owner(p_buffer);
	ERR_FAIL_NULL_V_MSG(buffer, nullptr, "Buffer argument is not a valid buffer of any type.");

	// buffer_update() only takes the persistent-map path for buffers that carry this flag;
	// enforce the same precondition here instead of forwarding an arbitrary buffer to the driver.
	ERR_FAIL_COND_V_MSG(!buffer->usage.has_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT), nullptr,
			"Buffer must have been created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT to be persistently mapped.");

	direct_copy_count++;
	return driver->buffer_persistent_map_advance(buffer->driver_id, frames_drawn);
}
// Forwards a flush request for `p_buffer` to the driver.
//
// p_buffer: RID of a buffer owned by this RenderingDevice. When it does not resolve to a
// buffer, an error is reported and the driver is not called.
void RenderingDevice::buffer_flush(RID p_buffer) {
	ERR_RENDER_THREAD_GUARD();

	// Resolve the RID first; the driver only understands its own buffer IDs.
	Buffer *const target = _get_buffer_from_owner(p_buffer);
	ERR_FAIL_NULL_MSG(target, "Buffer argument is not a valid buffer of any type.");

	driver->buffer_flush(target->driver_id);
}
RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, Span<uint8_t> p_data, BitField<StorageBufferUsage> p_usage, BitField<BufferCreationBits> p_creation_bits) {
ERR_FAIL_COND_V(p_data.size() && (uint32_t)p_data.size() != p_size_bytes, RID());
Buffer buffer;
buffer.size = p_size_bytes;
buffer.usage = (RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT | RDD::BUFFER_USAGE_STORAGE_BIT);
if (p_creation_bits.has_flag(BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT)) {
buffer.usage.set_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT);
// This is a precaution: Persistent buffers are meant for frequent CPU -> GPU transfers.
// Writing to this buffer from GPU might cause sync issues if both CPU & GPU try to write at the
// same time. It's probably fine (since CPU always advances the pointer before writing) but let's
// stick to the known/intended use cases and scream if we deviate from it.
buffer.usage.clear_flag(RDD::BUFFER_USAGE_TRANSFER_TO_BIT);
}
if (p_usage.has_flag(STORAGE_BUFFER_USAGE_DISPATCH_INDIRECT)) {
buffer.usage.set_flag(RDD::BUFFER_USAGE_INDIRECT_BIT);
}
@@ -801,7 +839,7 @@ RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, Span<uint8_t>
buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT);
}
buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU);
buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn);
ERR_FAIL_COND_V(!buffer.driver_id, RID());
// Storage buffers are assumed to be mutable.
@@ -833,7 +871,7 @@ RID RenderingDevice::texture_buffer_create(uint32_t p_size_elements, DataFormat
Buffer texture_buffer;
texture_buffer.size = size_bytes;
BitField<RDD::BufferUsageBits> usage = (RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT | RDD::BUFFER_USAGE_TEXEL_BIT);
texture_buffer.driver_id = driver->buffer_create(size_bytes, usage, RDD::MEMORY_ALLOCATION_TYPE_GPU);
texture_buffer.driver_id = driver->buffer_create(size_bytes, usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn);
ERR_FAIL_COND_V(!texture_buffer.driver_id, RID());
// Texture buffers are assumed to be immutable unless they don't have initial data.
@@ -1884,7 +1922,7 @@ void RenderingDevice::_texture_create_reinterpret_buffer(Texture *p_texture) {
uint32_t pixel_bytes = get_image_format_pixel_size(p_texture->format);
uint32_t row_pitch = STEPIFY(p_texture->width * pixel_bytes, row_pitch_step);
uint64_t buffer_size = STEPIFY(pixel_bytes * row_pitch * p_texture->height * p_texture->depth, transfer_alignment);
p_texture->shared_fallback->buffer = driver->buffer_create(buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_GPU);
p_texture->shared_fallback->buffer = driver->buffer_create(buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn);
buffer_memory += driver->buffer_get_allocation_size(p_texture->shared_fallback->buffer);
RDG::ResourceTracker *tracker = RDG::resource_tracker_create();
@@ -1938,7 +1976,7 @@ Vector<uint8_t> RenderingDevice::texture_get_data(RID p_texture, uint32_t p_laye
work_buffer_size = STEPIFY(work_buffer_size, work_mip_alignment) + mip_layouts[i].size;
}
RDD::BufferID tmp_buffer = driver->buffer_create(work_buffer_size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU);
RDD::BufferID tmp_buffer = driver->buffer_create(work_buffer_size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn);
ERR_FAIL_COND_V(!tmp_buffer, Vector<uint8_t>());
thread_local LocalVector<RDD::BufferTextureCopyRegion> command_buffer_texture_copy_regions_vector;
@@ -3052,7 +3090,7 @@ RID RenderingDevice::vertex_buffer_create(uint32_t p_size_bytes, Span<uint8_t> p
if (p_creation_bits.has_flag(BUFFER_CREATION_DEVICE_ADDRESS_BIT)) {
buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT);
}
buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU);
buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn);
ERR_FAIL_COND_V(!buffer.driver_id, RID());
// Vertex buffers are assumed to be immutable unless they don't have initial data or they've been marked for storage explicitly.
@@ -3224,7 +3262,7 @@ RID RenderingDevice::index_buffer_create(uint32_t p_index_count, IndexBufferForm
if (p_creation_bits.has_flag(BUFFER_CREATION_DEVICE_ADDRESS_BIT)) {
index_buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT);
}
index_buffer.driver_id = driver->buffer_create(index_buffer.size, index_buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU);
index_buffer.driver_id = driver->buffer_create(index_buffer.size, index_buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn);
ERR_FAIL_COND_V(!index_buffer.driver_id, RID());
// Index buffers are assumed to be immutable unless they don't have initial data.
@@ -3279,7 +3317,7 @@ RID RenderingDevice::index_array_create(RID p_index_buffer, uint32_t p_index_off
/****************/
static const char *SHADER_UNIFORM_NAMES[RenderingDevice::UNIFORM_TYPE_MAX] = {
"Sampler", "CombinedSampler", "Texture", "Image", "TextureBuffer", "SamplerTextureBuffer", "ImageBuffer", "UniformBuffer", "StorageBuffer", "InputAttachment"
"Sampler", "CombinedSampler", "Texture", "Image", "TextureBuffer", "SamplerTextureBuffer", "ImageBuffer", "UniformBuffer", "UniformBufferDynamic", "StorageBuffer", "StorageBufferDynamic", "InputAttachment"
};
String RenderingDevice::_shader_uniform_debug(RID p_shader, int p_set) {
@@ -3450,7 +3488,16 @@ RID RenderingDevice::uniform_buffer_create(uint32_t p_size_bytes, Span<uint8_t>
if (p_creation_bits.has_flag(BUFFER_CREATION_DEVICE_ADDRESS_BIT)) {
buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT);
}
buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU);
if (p_creation_bits.has_flag(BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT)) {
buffer.usage.set_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT);
// This is a precaution: Persistent buffers are meant for frequent CPU -> GPU transfers.
// Writing to this buffer from GPU might cause sync issues if both CPU & GPU try to write at the
// same time. It's probably fine (since CPU always advances the pointer before writing) but let's
// stick to the known/intended use cases and scream if we deviate from it.
buffer.usage.clear_flag(RDD::BUFFER_USAGE_TRANSFER_TO_BIT);
}
buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn);
ERR_FAIL_COND_V(!buffer.driver_id, RID());
// Uniform buffers are assumed to be immutable unless they don't have initial data.
@@ -3527,8 +3574,7 @@ RID RenderingDevice::uniform_set_create(const VectorView<RD::Uniform> &p_uniform
const Uniform &uniform = uniforms[uniform_idx];
ERR_FAIL_INDEX_V(uniform.uniform_type, RD::UNIFORM_TYPE_MAX, RID());
ERR_FAIL_COND_V_MSG(uniform.uniform_type != set_uniform.type, RID(),
"Mismatch uniform type for binding (" + itos(set_uniform.binding) + "), set (" + itos(p_shader_set) + "). Expected '" + SHADER_UNIFORM_NAMES[set_uniform.type] + "', supplied: '" + SHADER_UNIFORM_NAMES[uniform.uniform_type] + "'.");
ERR_FAIL_COND_V_MSG(uniform.uniform_type != set_uniform.type, RID(), "Shader '" + shader->name + "' Mismatch uniform type for binding (" + itos(set_uniform.binding) + "), set (" + itos(p_shader_set) + "). Expected '" + SHADER_UNIFORM_NAMES[set_uniform.type] + "', supplied: '" + SHADER_UNIFORM_NAMES[uniform.uniform_type] + "'.");
RDD::BoundUniform &driver_uniform = driver_uniforms[i];
driver_uniform.type = uniform.uniform_type;
@@ -3759,7 +3805,8 @@ RID RenderingDevice::uniform_set_create(const VectorView<RD::Uniform> &p_uniform
case UNIFORM_TYPE_IMAGE_BUFFER: {
// Todo.
} break;
case UNIFORM_TYPE_UNIFORM_BUFFER: {
case UNIFORM_TYPE_UNIFORM_BUFFER:
case UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: {
ERR_FAIL_COND_V_MSG(uniform.get_id_count() != 1, RID(),
"Uniform buffer supplied (binding: " + itos(uniform.binding) + ") must provide one ID (" + itos(uniform.get_id_count()) + " provided).");
@@ -3780,7 +3827,8 @@ RID RenderingDevice::uniform_set_create(const VectorView<RD::Uniform> &p_uniform
driver_uniform.ids.push_back(buffer->driver_id);
_check_transfer_worker_buffer(buffer);
} break;
case UNIFORM_TYPE_STORAGE_BUFFER: {
case UNIFORM_TYPE_STORAGE_BUFFER:
case UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: {
ERR_FAIL_COND_V_MSG(uniform.get_id_count() != 1, RID(),
"Storage buffer supplied (binding: " + itos(uniform.binding) + ") must provide one ID (" + itos(uniform.get_id_count()) + " provided).");
@@ -5630,7 +5678,7 @@ RenderingDevice::TransferWorker *RenderingDevice::_acquire_transfer_worker(uint3
uint32_t new_staging_buffer_size = next_power_of_2(expected_buffer_size);
transfer_worker->staging_buffer_size_allocated = new_staging_buffer_size;
transfer_worker->staging_buffer = driver->buffer_create(new_staging_buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU);
transfer_worker->staging_buffer = driver->buffer_create(new_staging_buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn);
}
}
@@ -7786,6 +7834,8 @@ void RenderingDevice::_bind_methods() {
BIND_BITFIELD_FLAG(BUFFER_CREATION_DEVICE_ADDRESS_BIT);
BIND_BITFIELD_FLAG(BUFFER_CREATION_AS_STORAGE_BIT);
// Not exposed on purpose. This flag is too dangerous to be exposed to regular GD users.
//BIND_BITFIELD_FLAG(BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
BIND_ENUM_CONSTANT(UNIFORM_TYPE_SAMPLER); //for sampling only (sampler GLSL type)
BIND_ENUM_CONSTANT(UNIFORM_TYPE_SAMPLER_WITH_TEXTURE); // for sampling only); but includes a texture); (samplerXX GLSL type)); first a sampler then a texture
@@ -7797,6 +7847,8 @@ void RenderingDevice::_bind_methods() {
BIND_ENUM_CONSTANT(UNIFORM_TYPE_UNIFORM_BUFFER); //regular uniform buffer (or UBO).
BIND_ENUM_CONSTANT(UNIFORM_TYPE_STORAGE_BUFFER); //storage buffer ("buffer" qualifier) like UBO); but supports storage); for compute mostly
BIND_ENUM_CONSTANT(UNIFORM_TYPE_INPUT_ATTACHMENT); //used for sub-pass read/write); for mobile mostly
BIND_ENUM_CONSTANT(UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC); // Exposed in case a BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT buffer created by C++ makes it into GD users.
BIND_ENUM_CONSTANT(UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC); // Exposed in case a BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT buffer created by C++ makes it into GD users.
BIND_ENUM_CONSTANT(UNIFORM_TYPE_MAX);
BIND_ENUM_CONSTANT(RENDER_PRIMITIVE_POINTS);