1
0
mirror of https://github.com/godotengine/godot.git synced 2025-11-11 13:10:58 +00:00

Add Persistent Buffers

This work is heavily refactored and rewritten from TheForge's initial
code.

TheForge's original code had too many race conditions and was
fundamentally flawed, as it was too easy to trigger those data races
by accident.

However, they identified the proper places that needed changes, and the
idea was sound. I used their work as a blueprint to design this work.

This PR implements:

 - Introduction of UMA buffers used by a few buffers
(most notably the ones filled by _fill_instance_data).

Ironically, this change seems to positively affect PC more than it does
Mobile.

Updates D3D12 Memory Allocator to get GPU_UPLOAD heap support.

Metal implementation by Stuart Carnie.

Co-authored-by: Stuart Carnie <stuart.carnie@gmail.com>
Co-authored-by: TheForge team
This commit is contained in:
Stuart Carnie
2025-10-18 07:00:58 +11:00
parent 5950fca36c
commit 230adb7511
38 changed files with 2848 additions and 1466 deletions

View File

@@ -767,26 +767,38 @@ void RenderForwardClustered::_setup_environment(const RenderDataRD *p_render_dat
RD::get_singleton()->buffer_update(scene_state.implementation_uniform_buffers[p_index], 0, sizeof(SceneState::UBO), &scene_state.ubo);
}
void RenderForwardClustered::_update_instance_data_buffer(RenderListType p_render_list) {
if (scene_state.instance_data[p_render_list].size() > 0) {
if (scene_state.instance_buffer[p_render_list] == RID() || scene_state.instance_buffer_size[p_render_list] < scene_state.instance_data[p_render_list].size()) {
if (scene_state.instance_buffer[p_render_list] != RID()) {
RD::get_singleton()->free_rid(scene_state.instance_buffer[p_render_list]);
}
uint32_t new_size = nearest_power_of_2_templated(MAX(uint64_t(INSTANCE_DATA_BUFFER_MIN_SIZE), scene_state.instance_data[p_render_list].size()));
scene_state.instance_buffer[p_render_list] = RD::get_singleton()->storage_buffer_create(new_size * sizeof(SceneState::InstanceData));
scene_state.instance_buffer_size[p_render_list] = new_size;
void RenderForwardClustered::SceneState::grow_instance_buffer(RenderListType p_render_list, uint32_t p_req_element_count, bool p_append) {
if (p_req_element_count > 0) {
if (instance_buffer[p_render_list].get_size(0u) < p_req_element_count * sizeof(SceneState::InstanceData)) {
instance_buffer[p_render_list].uninit();
uint32_t new_size = nearest_power_of_2_templated(MAX(uint64_t(INSTANCE_DATA_BUFFER_MIN_SIZE), p_req_element_count));
instance_buffer[p_render_list].set_size(0u, new_size * sizeof(SceneState::InstanceData), true);
curr_gpu_ptr[p_render_list] = nullptr;
}
const bool must_remap = instance_buffer[p_render_list].prepare_for_map(p_append);
if (must_remap) {
curr_gpu_ptr[p_render_list] = nullptr;
}
RD::get_singleton()->buffer_update(scene_state.instance_buffer[p_render_list], 0, sizeof(SceneState::InstanceData) * scene_state.instance_data[p_render_list].size(), scene_state.instance_data[p_render_list].ptr());
}
}
void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, int *p_render_info, uint32_t p_offset, int32_t p_max_elements, bool p_update_buffer) {
RenderList *rl = &render_list[p_render_list];
uint32_t element_total = p_max_elements >= 0 ? uint32_t(p_max_elements) : rl->elements.size();
scene_state.instance_data[p_render_list].resize(p_offset + element_total);
rl->element_info.resize(p_offset + element_total);
// If p_offset == 0, grow_instance_buffer resets and increments the buffer.
// If this behavior ever changes, _render_shadow_begin may need to change.
scene_state.grow_instance_buffer(p_render_list, p_offset + element_total, p_offset != 0u);
if (!scene_state.curr_gpu_ptr[p_render_list] && element_total > 0u) {
// The old buffer was replaced for another larger one. We must start copying from scratch.
element_total += p_offset;
p_offset = 0u;
scene_state.curr_gpu_ptr[p_render_list] = reinterpret_cast<SceneState::InstanceData *>(scene_state.instance_buffer[p_render_list].map_raw_for_upload(0u));
}
if (p_render_info) {
p_render_info[RS::VIEWPORT_RENDER_INFO_OBJECTS_IN_FRAME] += element_total;
}
@@ -797,7 +809,7 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i
GeometryInstanceSurfaceDataCache *surface = rl->elements[i + p_offset];
GeometryInstanceForwardClustered *inst = surface->owner;
SceneState::InstanceData &instance_data = scene_state.instance_data[p_render_list][i + p_offset];
SceneState::InstanceData instance_data;
if (likely(inst->store_transform_cache)) {
RendererRD::MaterialStorage::store_transform_transposed_3x4(inst->transform, instance_data.transform);
@@ -836,7 +848,9 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i
instance_data.set_compressed_aabb(surface_aabb);
instance_data.set_uv_scale(uv_scale);
bool cant_repeat = instance_data.flags & INSTANCE_DATA_FLAG_MULTIMESH || inst->mesh_instance.is_valid();
scene_state.curr_gpu_ptr[p_render_list][i + p_offset] = instance_data;
const bool cant_repeat = instance_data.flags & INSTANCE_DATA_FLAG_MULTIMESH || inst->mesh_instance.is_valid();
if (prev_surface != nullptr && !cant_repeat && prev_surface->sort.sort_key1 == surface->sort.sort_key1 && prev_surface->sort.sort_key2 == surface->sort.sort_key2 && inst->mirror == prev_surface->owner->mirror && repeats < RenderElementInfo::MAX_REPEATS) {
//this element is the same as the previous one, count repeats to draw it using instancing
@@ -870,8 +884,8 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i
}
}
if (p_update_buffer) {
_update_instance_data_buffer(p_render_list);
if (p_update_buffer && element_total > 0u) {
RenderingDevice::get_singleton()->buffer_flush(scene_state.instance_buffer[p_render_list]._get(0u));
}
}
@@ -2722,7 +2736,8 @@ void RenderForwardClustered::_render_shadow_begin() {
_update_render_base_uniform_set();
render_list[RENDER_LIST_SECONDARY].clear();
scene_state.instance_data[RENDER_LIST_SECONDARY].clear();
// No need to reset scene_state.curr_gpu_ptr or scene_state.instance_buffer[RENDER_LIST_SECONDARY]
// because _fill_instance_data will do that if it detects p_offset == 0u.
}
void RenderForwardClustered::_render_shadow_append(RID p_framebuffer, const PagedArray<RenderGeometryInstance *> &p_instances, const Projection &p_projection, const Transform3D &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_reverse_cull_face, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, float p_lod_distance_multiplier, float p_screen_mesh_lod_threshold, const Rect2i &p_rect, bool p_flip_y, bool p_clear_region, bool p_begin, bool p_end, RenderingMethod::RenderInfo *p_render_info, const Size2i &p_viewport_size, const Transform3D &p_main_cam_transform) {
@@ -2797,7 +2812,11 @@ void RenderForwardClustered::_render_shadow_append(RID p_framebuffer, const Page
}
void RenderForwardClustered::_render_shadow_process() {
_update_instance_data_buffer(RENDER_LIST_SECONDARY);
RenderingDevice *rd = RenderingDevice::get_singleton();
if (scene_state.instance_buffer[RENDER_LIST_SECONDARY].get_size(0u) > 0u) {
rd->buffer_flush(scene_state.instance_buffer[RENDER_LIST_SECONDARY]._get(0u));
}
//render shadows one after the other, so this can be done un-barriered and the driver can optimize (as well as allow us to run compute at the same time)
for (uint32_t i = 0; i < scene_state.shadow_passes.size(); i++) {
@@ -3258,11 +3277,14 @@ RID RenderForwardClustered::_setup_render_pass_uniform_set(RenderListType p_rend
{
RD::Uniform u;
u.binding = 2;
u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
RID instance_buffer = scene_state.instance_buffer[p_render_list];
if (instance_buffer == RID()) {
instance_buffer = scene_shader.default_vec4_xform_buffer; // any buffer will do since its not used
u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC;
if (scene_state.instance_buffer[p_render_list].get_size(0u) == 0u) {
// Any buffer will do since it's not used, so just create one.
// We can't use scene_shader.default_vec4_xform_buffer because it's not dynamic.
scene_state.instance_buffer[p_render_list].set_size(0u, INSTANCE_DATA_BUFFER_MIN_SIZE * sizeof(SceneState::InstanceData), true);
scene_state.instance_buffer[p_render_list].prepare_for_upload();
}
RID instance_buffer = scene_state.instance_buffer[p_render_list]._get(0u);
u.append_id(instance_buffer);
uniforms.push_back(u);
}
@@ -3624,11 +3646,14 @@ RID RenderForwardClustered::_setup_sdfgi_render_pass_uniform_set(RID p_albedo_te
{
RD::Uniform u;
u.binding = 2;
u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
RID instance_buffer = scene_state.instance_buffer[RENDER_LIST_SECONDARY];
if (instance_buffer == RID()) {
instance_buffer = scene_shader.default_vec4_xform_buffer; // any buffer will do since its not used
u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC;
if (scene_state.instance_buffer[RENDER_LIST_SECONDARY].get_size(0u) == 0u) {
// Any buffer will do since it's not used, so just create one.
// We can't use scene_shader.default_vec4_xform_buffer because it's not dynamic.
scene_state.instance_buffer[RENDER_LIST_SECONDARY].set_size(0u, INSTANCE_DATA_BUFFER_MIN_SIZE * sizeof(SceneState::InstanceData), true);
scene_state.instance_buffer[RENDER_LIST_SECONDARY].prepare_for_upload();
}
RID instance_buffer = scene_state.instance_buffer[RENDER_LIST_SECONDARY]._get(0u);
u.append_id(instance_buffer);
uniforms.push_back(u);
}
@@ -5125,9 +5150,7 @@ RenderForwardClustered::~RenderForwardClustered() {
RD::get_singleton()->free_rid(scene_state.lightmap_buffer);
RD::get_singleton()->free_rid(scene_state.lightmap_capture_buffer);
for (uint32_t i = 0; i < RENDER_LIST_MAX; i++) {
if (scene_state.instance_buffer[i] != RID()) {
RD::get_singleton()->free_rid(scene_state.instance_buffer[i]);
}
scene_state.instance_buffer[i].uninit();
}
memdelete_arr(scene_state.lightmap_captures);
}

View File

@@ -31,6 +31,7 @@
#pragma once
#include "core/templates/paged_allocator.h"
#include "servers/rendering/multi_uma_buffer.h"
#include "servers/rendering/renderer_rd/cluster_builder_rd.h"
#include "servers/rendering/renderer_rd/effects/fsr2.h"
#ifdef METAL_ENABLED
@@ -398,9 +399,8 @@ private:
uint32_t max_lightmaps;
RID lightmap_buffer;
RID instance_buffer[RENDER_LIST_MAX];
uint32_t instance_buffer_size[RENDER_LIST_MAX] = { 0, 0, 0 };
LocalVector<InstanceData> instance_data[RENDER_LIST_MAX];
MultiUmaBuffer<1u> instance_buffer[RENDER_LIST_MAX] = { MultiUmaBuffer<1u>("RENDER_LIST_OPAQUE"), MultiUmaBuffer<1u>("RENDER_LIST_MOTION"), MultiUmaBuffer<1u>("RENDER_LIST_ALPHA"), MultiUmaBuffer<1u>("RENDER_LIST_SECONDARY") };
InstanceData *curr_gpu_ptr[RENDER_LIST_MAX] = {};
LightmapCaptureData *lightmap_captures = nullptr;
uint32_t max_lightmap_captures;
@@ -433,6 +433,7 @@ private:
LocalVector<ShadowPass> shadow_passes;
void grow_instance_buffer(RenderListType p_render_list, uint32_t p_req_element_count, bool p_append);
} scene_state;
static RenderForwardClustered *singleton;
@@ -464,7 +465,6 @@ private:
void _render_list(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
void _render_list_with_draw_list(RenderListParameters *p_params, RID p_framebuffer, BitField<RD::DrawFlags> p_draw_flags = RD::DRAW_DEFAULT_ALL, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth_value = 0.0, uint32_t p_clear_stencil_value = 0, const Rect2 &p_region = Rect2());
void _update_instance_data_buffer(RenderListType p_render_list);
void _fill_instance_data(RenderListType p_render_list, int *p_render_info = nullptr, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true);
void _fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, bool p_using_sdfgi = false, bool p_using_opaque_gi = false, bool p_using_motion_pass = false, bool p_append = false);

View File

@@ -667,7 +667,9 @@ void SceneShaderForwardClustered::init(const String p_defines) {
shader_versions.push_back(ShaderRD::VariantDefine(group, version, false));
}
shader.initialize(shader_versions, p_defines);
Vector<uint64_t> dynamic_buffers;
dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(RenderForwardClustered::RENDER_PASS_UNIFORM_SET, 2));
shader.initialize(shader_versions, p_defines, Vector<RD::PipelineImmutableSampler>(), dynamic_buffers);
if (RendererCompositorRD::get_singleton()->is_xr_enabled()) {
shader.enable_group(SHADER_GROUP_MULTIVIEW);