diff --git a/doc/classes/RenderingDevice.xml b/doc/classes/RenderingDevice.xml
index 7b4fb65ac93..df54f172cb4 100644
--- a/doc/classes/RenderingDevice.xml
+++ b/doc/classes/RenderingDevice.xml
@@ -2151,7 +2151,17 @@
 			Input attachment uniform.
-
+
+			Same as UNIFORM_TYPE_UNIFORM_BUFFER but for buffers created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT.
+			[b]Note:[/b] This uniform type is not available to GDScript users due to being too dangerous (i.e. wrong usage can result in visual glitches).
+			It's exposed in case GDScript users receive a buffer created with that flag from Godot.
+
+
+			Same as UNIFORM_TYPE_STORAGE_BUFFER but for buffers created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT.
+			[b]Note:[/b] This uniform type is not available to GDScript users due to being too dangerous (i.e. wrong usage can result in visual glitches).
+			It's exposed in case GDScript users receive a buffer created with that flag from Godot.
+
+
 			Represents the size of the [enum UniformType] enum.
diff --git a/drivers/d3d12/rendering_device_driver_d3d12.cpp b/drivers/d3d12/rendering_device_driver_d3d12.cpp
index 6865ae0bc97..310e151dfa8 100644
--- a/drivers/d3d12/rendering_device_driver_d3d12.cpp
+++ b/drivers/d3d12/rendering_device_driver_d3d12.cpp
@@ -99,6 +99,8 @@ GODOT_MSVC_WARNING_POP
 static const D3D12_RANGE VOID_RANGE = {};
 
+static const uint32_t MAX_DYNAMIC_BUFFERS = 8u; // Minimum guaranteed by Vulkan.
+
 /*****************/
 /**** GENERIC ****/
 /*****************/
@@ -1012,12 +1014,22 @@ void RenderingDeviceDriverD3D12::_resource_transitions_flush(CommandBufferInfo *
 /**** BUFFERS ****/
 /*****************/
 
-RDD::BufferID RenderingDeviceDriverD3D12::buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) {
-	// D3D12 debug layers complain at CBV creation time if the size is not multiple of the value per the spec
-	// but also if you give a rounded size at that point because it will extend beyond the
-	// memory of the resource. Therefore, it seems the only way is to create it with a
-	// rounded size.
-	CD3DX12_RESOURCE_DESC1 resource_desc = CD3DX12_RESOURCE_DESC1::Buffer(STEPIFY(p_size, D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT));
+RDD::BufferID RenderingDeviceDriverD3D12::buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) {
+	uint32_t alignment = D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT; // 16 bytes is reasonable.
+	if (p_usage.has_flag(BUFFER_USAGE_UNIFORM_BIT)) {
+		// 256 bytes is absurd. Only use it when required.
+		alignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
+	}
+
+	// Unlike Vulkan, we don't have VMA to take care of the details, so we must align the size ourselves.
+	p_size = STEPIFY(p_size, alignment);
+
+	const size_t original_size = p_size;
+	if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) {
+		p_size = p_size * frames.size();
+	}
+
+	CD3DX12_RESOURCE_DESC1 resource_desc = CD3DX12_RESOURCE_DESC1::Buffer(p_size);
 	if (p_usage.has_flag(RDD::BUFFER_USAGE_STORAGE_BIT)) {
 		resource_desc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
 	} else {
@@ -1044,6 +1056,12 @@ RDD::BufferID RenderingDeviceDriverD3D12::buffer_create(uint64_t p_size, BitFiel
 		} break;
 		case MEMORY_ALLOCATION_TYPE_GPU: {
 			// Use default parameters.
+			if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) {
+				allocation_desc.HeapType = dynamic_persistent_upload_heap;
+
+				// We can't use STORAGE for write access, just for read.
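// Illustrative sketch (not part of the patch): the layout buffer_create() sets up for
// BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. One aligned copy of the buffer per frame in
// flight lives inside a single allocation; the CPU writes only the current frame's
// copy while the GPU may still be reading older ones. Names and the frame count
// below are hypothetical.

#include <cstdint>

static constexpr uint64_t kAlignment = 16u; // E.g. D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT.
static constexpr uint64_t kFrameCount = 3u; // E.g. frames.size() in the driver.

// One frame's worth of data, rounded up like STEPIFY(p_size, alignment).
inline uint64_t aligned_frame_size(uint64_t p_requested) {
	return (p_requested + kAlignment - 1u) / kAlignment * kAlignment;
}

// What the GPU allocation holds: [ frame 0 | frame 1 | frame 2 ].
inline uint64_t total_allocation_size(uint64_t p_requested) {
	return aligned_frame_size(p_requested) * kFrameCount; // Mirrors p_size * frames.size() above.
}

// The pointer handed out for frame i starts at this offset into the persistent
// mapping (compare buffer_persistent_map_advance() below).
inline uint64_t subregion_offset(uint64_t p_requested, uint64_t p_frame_idx) {
	return aligned_frame_size(p_requested) * p_frame_idx;
}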
+ resource_desc.Flags = resource_desc.Flags & ~D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + } } break; } @@ -1074,14 +1092,30 @@ RDD::BufferID RenderingDeviceDriverD3D12::buffer_create(uint64_t p_size, BitFiel // Bookkeep. - BufferInfo *buf_info = VersatileResource::allocate(resources_allocator); + BufferInfo *buf_info; + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + void *persistent_ptr = nullptr; + res = buffer->Map(0, &VOID_RANGE, &persistent_ptr); + ERR_FAIL_COND_V_MSG(!SUCCEEDED(res), BufferID(), "Map failed with error " + vformat("0x%08ux", (uint64_t)res) + "."); + + BufferDynamicInfo *dyn_buffer = VersatileResource::allocate(resources_allocator); + buf_info = dyn_buffer; +#ifdef DEBUG_ENABLED + dyn_buffer->last_frame_mapped = p_frames_drawn - 1ul; +#endif + dyn_buffer->frame_idx = 0u; + dyn_buffer->persistent_ptr = (uint8_t *)persistent_ptr; + } else { + buf_info = VersatileResource::allocate(resources_allocator); + } buf_info->resource = buffer.Get(); buf_info->owner_info.resource = buffer; buf_info->owner_info.allocation = allocation; buf_info->owner_info.states.subresource_states.push_back(initial_state); buf_info->states_ptr = &buf_info->owner_info.states; - buf_info->size = p_size; + buf_info->size = original_size; buf_info->flags.usable_as_uav = (resource_desc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + buf_info->flags.is_dynamic = p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT); return BufferID(buf_info); } @@ -1094,7 +1128,12 @@ bool RenderingDeviceDriverD3D12::buffer_set_texel_format(BufferID p_buffer, Data void RenderingDeviceDriverD3D12::buffer_free(BufferID p_buffer) { BufferInfo *buf_info = (BufferInfo *)p_buffer.id; - VersatileResource::free(resources_allocator, buf_info); + if (buf_info->is_dynamic()) { + buf_info->resource->Unmap(0, &VOID_RANGE); + VersatileResource::free(resources_allocator, (BufferDynamicInfo *)buf_info); + } else { + VersatileResource::free(resources_allocator, buf_info); + } } uint64_t RenderingDeviceDriverD3D12::buffer_get_allocation_size(BufferID p_buffer) { @@ -1115,6 +1154,17 @@ void RenderingDeviceDriverD3D12::buffer_unmap(BufferID p_buffer) { buf_info->resource->Unmap(0, &VOID_RANGE); } +uint8_t *RenderingDeviceDriverD3D12::buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) { + BufferDynamicInfo *buf_info = (BufferDynamicInfo *)p_buffer.id; + ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), nullptr, "Buffer must have BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. Use buffer_map() instead."); +#ifdef DEBUG_ENABLED + ERR_FAIL_COND_V_MSG(buf_info->last_frame_mapped == p_frames_drawn, nullptr, "Buffers with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT must only be mapped once per frame. Otherwise there could be race conditions with the GPU. 
Amalgamate all data uploading into one map() call, use an extra buffer, or remove the bit.");
+	buf_info->last_frame_mapped = p_frames_drawn;
+#endif
+	buf_info->frame_idx = (buf_info->frame_idx + 1u) % frames.size();
+	return buf_info->persistent_ptr + buf_info->frame_idx * buf_info->size;
+}
+
 uint64_t RenderingDeviceDriverD3D12::buffer_get_device_address(BufferID p_buffer) {
 	const BufferInfo *buf_info = (const BufferInfo *)p_buffer.id;
 	return buf_info->resource->GetGPUVirtualAddress();
@@ -3420,7 +3470,7 @@ void RenderingDeviceDriverD3D12::shader_destroy_modules(ShaderID p_shader) {
 /**** UNIFORM SET ****/
 /*********************/
 
-static void _add_descriptor_count_for_uniform(RenderingDevice::UniformType p_type, uint32_t p_binding_length, bool p_double_srv_uav_ambiguous, uint32_t &r_num_resources, uint32_t &r_num_samplers, bool &r_srv_uav_ambiguity) {
+static void _add_descriptor_count_for_uniform(RenderingDevice::UniformType p_type, uint32_t p_binding_length, bool p_double_srv_uav_ambiguous, uint32_t &r_num_resources, uint32_t &r_num_samplers, bool &r_srv_uav_ambiguity, uint32_t p_frame_count) {
 	r_srv_uav_ambiguity = false;
 
 	// Some resource types can be SRV or UAV, depending on what NIR-DXIL decided for a specific shader variant.
@@ -3440,10 +3490,18 @@ static void _add_descriptor_count_for_uniform(RenderingDevice::UniformType p_typ
 		case RenderingDevice::UNIFORM_TYPE_UNIFORM_BUFFER: {
 			r_num_resources += 1;
 		} break;
+		case RenderingDevice::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: {
+			r_num_resources += p_frame_count;
+		} break;
 		case RenderingDevice::UNIFORM_TYPE_STORAGE_BUFFER: {
 			r_num_resources += p_double_srv_uav_ambiguous ? 2 : 1;
 			r_srv_uav_ambiguity = true;
 		} break;
+		case RenderingDevice::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: {
+			// Dynamic storage buffers can only be SRV (we can't guarantee they get placed in
+			// the D3D12_HEAP_TYPE_GPU_UPLOAD heap, and D3D12_HEAP_TYPE_UPLOAD doesn't support UAV).
+			r_num_resources += p_frame_count;
+		} break;
 		case RenderingDevice::UNIFORM_TYPE_IMAGE: {
 			r_num_resources += p_binding_length * (p_double_srv_uav_ambiguous ? 2 : 1);
 			r_srv_uav_ambiguity = true;
@@ -3460,6 +3518,11 @@ RDD::UniformSetID RenderingDeviceDriverD3D12::uniform_set_create(VectorView(resources_allocator);
+	// We first gather dynamic buffers in a local array because TightLocalVector's
+	// growth is not efficient when the number of elements is unknown.
+	const BufferDynamicInfo *dynamic_buffers[MAX_DYNAMIC_BUFFERS];
+	uint32_t num_dynamic_buffers = 0u;
+
 	// Do a first pass to count resources and samplers.
 	uint32_t num_resource_descs = 0;
 	uint32_t num_sampler_descs = 0;
@@ -3476,7 +3539,7 @@ RDD::UniformSetID RenderingDeviceDriverD3D12::uniform_set_create(VectorViewresources_desc_info.reserve(num_resource_descs);
@@ -3599,64 +3662,94 @@ RDD::UniformSetID RenderingDeviceDriverD3D12::uniform_set_create(VectorViewis_dynamic(), UniformSetID(),
+					"Sent a buffer with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_UNIFORM_BUFFER instead of UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC.");
+			} else {
+				ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), UniformSetID(),
+						"Sent a buffer without BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC instead of UNIFORM_TYPE_UNIFORM_BUFFER.");
+				ERR_FAIL_COND_V_MSG(num_dynamic_buffers >= MAX_DYNAMIC_BUFFERS, UniformSetID(),
+						"Uniform set exceeded the limit of dynamic/persistent buffers.
(" + itos(MAX_DYNAMIC_BUFFERS) + ")."); + + dynamic_buffers[num_dynamic_buffers++] = (const BufferDynamicInfo *)buf_info; + } + D3D12_CONSTANT_BUFFER_VIEW_DESC cbv_desc = {}; cbv_desc.BufferLocation = buf_info->resource->GetGPUVirtualAddress(); - cbv_desc.SizeInBytes = STEPIFY(buf_info->size, 256); - device->CreateConstantBufferView(&cbv_desc, desc_heap_walkers.resources.get_curr_cpu_handle()); - desc_heap_walkers.resources.advance(); + cbv_desc.SizeInBytes = STEPIFY(buf_info->size, D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT); + + const uint32_t subregion_count = uniform.type == UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC ? frames.size() : 1u; + for (uint32_t j = 0u; j < subregion_count; ++j) { + device->CreateConstantBufferView(&cbv_desc, desc_heap_walkers.resources.get_curr_cpu_handle()); + desc_heap_walkers.resources.advance(); #ifdef DEV_ENABLED - uniform_set_info->resources_desc_info.push_back({ D3D12_DESCRIPTOR_RANGE_TYPE_CBV, {} }); + uniform_set_info->resources_desc_info.push_back({ D3D12_DESCRIPTOR_RANGE_TYPE_CBV, {} }); #endif + cbv_desc.BufferLocation += cbv_desc.SizeInBytes; + } NeededState &ns = resource_states[buf_info]; ns.is_buffer = true; ns.shader_uniform_idx_mask |= ((uint64_t)1 << i); ns.states |= D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER; } break; - case UNIFORM_TYPE_STORAGE_BUFFER: { + case UNIFORM_TYPE_STORAGE_BUFFER: + case UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { BufferInfo *buf_info = (BufferInfo *)uniform.ids[0].id; - // SRV first. [[SRV_UAV_AMBIGUITY]] - { - D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc = {}; - srv_desc.Format = DXGI_FORMAT_R32_TYPELESS; - srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; - srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; - srv_desc.Buffer.FirstElement = 0; - srv_desc.Buffer.NumElements = (buf_info->size + 3) / 4; - srv_desc.Buffer.StructureByteStride = 0; - srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW; + if (uniform.type == UNIFORM_TYPE_STORAGE_BUFFER) { + ERR_FAIL_COND_V_MSG(buf_info->is_dynamic(), UniformSetID(), + "Sent a buffer with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_STORAGE_BUFFER instead of UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC."); + } else { + ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), UniformSetID(), + "Sent a buffer without BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC instead of UNIFORM_TYPE_STORAGE_BUFFER."); + ERR_FAIL_COND_V_MSG(num_dynamic_buffers >= MAX_DYNAMIC_BUFFERS, UniformSetID(), + "Uniform set exceeded the limit of dynamic/persistent buffers. 
(" + itos(MAX_DYNAMIC_BUFFERS) + ")."); + + dynamic_buffers[num_dynamic_buffers++] = (const BufferDynamicInfo *)buf_info; + } + + D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc = {}; + srv_desc.Format = DXGI_FORMAT_R32_TYPELESS; + srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; + srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + srv_desc.Buffer.FirstElement = 0; + srv_desc.Buffer.NumElements = (buf_info->size + 3u) / 4u; + srv_desc.Buffer.StructureByteStride = 0; + srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW; + + D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc = {}; + uav_desc.Format = DXGI_FORMAT_R32_TYPELESS; + uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; + uav_desc.Buffer.FirstElement = 0; + uav_desc.Buffer.NumElements = (buf_info->size + 3u) / 4u; + uav_desc.Buffer.StructureByteStride = 0; + uav_desc.Buffer.CounterOffsetInBytes = 0; + uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW; + + const uint32_t subregion_count = uniform.type == UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC ? frames.size() : 1u; + for (uint32_t j = 0u; j < subregion_count; ++j) { + // SRV first. [[SRV_UAV_AMBIGUITY]] device->CreateShaderResourceView(buf_info->resource, &srv_desc, desc_heap_walkers.resources.get_curr_cpu_handle()); #ifdef DEV_ENABLED uniform_set_info->resources_desc_info.push_back({ D3D12_DESCRIPTOR_RANGE_TYPE_SRV, srv_desc.ViewDimension }); #endif desc_heap_walkers.resources.advance(); - } + srv_desc.Buffer.FirstElement += srv_desc.Buffer.NumElements; - // UAV then. [[SRV_UAV_AMBIGUITY]] - { + // UAV then. [[SRV_UAV_AMBIGUITY]] if (buf_info->flags.usable_as_uav) { - D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc = {}; - uav_desc.Format = DXGI_FORMAT_R32_TYPELESS; - uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; - uav_desc.Buffer.FirstElement = 0; - uav_desc.Buffer.NumElements = (buf_info->size + 3) / 4; - uav_desc.Buffer.StructureByteStride = 0; - uav_desc.Buffer.CounterOffsetInBytes = 0; - uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW; device->CreateUnorderedAccessView(buf_info->resource, nullptr, &uav_desc, desc_heap_walkers.resources.get_curr_cpu_handle()); #ifdef DEV_ENABLED uniform_set_info->resources_desc_info.push_back({ D3D12_DESCRIPTOR_RANGE_TYPE_UAV, {} }); #endif - } else { - // If can't transition to UAV, leave this one empty since it won't be - // used, and trying to create an UAV view would trigger a validation error. 
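// Illustrative sketch (not part of the patch): why the loop above advances
// FirstElement by NumElements per frame copy. For raw (R32_TYPELESS) buffer views,
// NumElements counts 4-byte words, so the per-frame views tile the same
// ID3D12Resource back to back. Helper names are hypothetical.

#include <cstdint>

inline uint32_t raw_view_num_elements(uint64_t p_frame_size) {
	return (uint32_t)((p_frame_size + 3u) / 4u); // Matches (buf_info->size + 3u) / 4u above.
}

// Descriptor heap contents for one dynamic buffer with 3 frames in flight:
//   [ view(frame 0) ][ view(frame 1) ][ view(frame 2) ]
// where view(frame j) starts at FirstElement = j * raw_view_num_elements(size).
inline uint64_t first_element_for_frame(uint64_t p_frame_size, uint32_t p_frame_idx) {
	return (uint64_t)p_frame_idx * raw_view_num_elements(p_frame_size);
}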
+ uav_desc.Buffer.FirstElement += uav_desc.Buffer.NumElements; + desc_heap_walkers.resources.advance(); } - - desc_heap_walkers.resources.advance(); } NeededState &ns = resource_states[buf_info]; @@ -3685,6 +3778,11 @@ RDD::UniformSetID RenderingDeviceDriverD3D12::uniform_set_create(VectorViewdynamic_buffers.resize(num_dynamic_buffers); + for (size_t i = 0u; i < num_dynamic_buffers; ++i) { + uniform_set_info->dynamic_buffers[i] = dynamic_buffers[i]; + } + DEV_ASSERT(desc_heap_walkers.resources.is_at_eof()); DEV_ASSERT(desc_heap_walkers.samplers.is_at_eof()); @@ -3708,6 +3806,31 @@ void RenderingDeviceDriverD3D12::uniform_set_free(UniformSetID p_uniform_set) { VersatileResource::free(resources_allocator, uniform_set_info); } +uint32_t RenderingDeviceDriverD3D12::uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const { + uint32_t mask = 0u; + uint32_t shift = 0u; +#ifdef DEV_ENABLED + uint32_t curr_dynamic_offset = 0u; +#endif + + for (uint32_t i = 0; i < p_set_count; i++) { + const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_sets[i].id; + // At this point this assert should already have been validated. + DEV_ASSERT(curr_dynamic_offset + usi->dynamic_buffers.size() <= MAX_DYNAMIC_BUFFERS); + + for (const BufferDynamicInfo *dynamic_buffer : usi->dynamic_buffers) { + DEV_ASSERT(dynamic_buffer->frame_idx < 16u); + mask |= dynamic_buffer->frame_idx << shift; + shift += 4u; + } +#ifdef DEV_ENABLED + curr_dynamic_offset += usi->dynamic_buffers.size(); +#endif + } + + return mask; +} + // ----- COMMANDS ----- void RenderingDeviceDriverD3D12::command_uniform_set_prepare_for_use(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { @@ -3885,14 +4008,23 @@ void RenderingDeviceDriverD3D12::_command_check_descriptor_sets(CommandBufferID } } -void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index, bool p_for_compute) { +void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index, uint32_t p_dynamic_offsets, bool p_for_compute) { _command_check_descriptor_sets(p_cmd_buffer); + uint32_t shift = 0u; + UniformSetInfo *uniform_set_info = (UniformSetInfo *)p_uniform_set.id; const ShaderInfo *shader_info_in = (const ShaderInfo *)p_shader.id; const ShaderInfo::UniformSet &shader_set = shader_info_in->sets[p_set_index]; const CommandBufferInfo *cmd_buf_info = (const CommandBufferInfo *)p_cmd_buffer.id; + // The value of p_dynamic_offsets depends on all the other UniformSets bound after us + // (caller already filtered out bits that came before us). + // Turn that mask into something that is unique to us, *so that we don't create unnecessary entries in the cache*. + // We may not even have dynamic buffers at all in this set. In that case p_dynamic_offsets becomes 0. + const uint32_t used_dynamic_buffers_mask = (1u << (uniform_set_info->dynamic_buffers.size() * 4u)) - 1u; + p_dynamic_offsets = p_dynamic_offsets & used_dynamic_buffers_mask; + using SetRootDescriptorTableFn = void (STDMETHODCALLTYPE ID3D12GraphicsCommandList::*)(UINT, D3D12_GPU_DESCRIPTOR_HANDLE); SetRootDescriptorTableFn set_root_desc_table_fn = p_for_compute ? 
&ID3D12GraphicsCommandList::SetComputeRootDescriptorTable : &ID3D12GraphicsCommandList1::SetGraphicsRootDescriptorTable; @@ -3901,7 +4033,8 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd UniformSetInfo::RecentBind *last_bind = nullptr; for (int i = 0; i < (int)ARRAY_SIZE(uniform_set_info->recent_binds); i++) { if (uniform_set_info->recent_binds[i].segment_serial == frames[frame_idx].segment_serial) { - if (uniform_set_info->recent_binds[i].root_signature_crc == root_sig_crc) { + if (uniform_set_info->recent_binds[i].root_signature_crc == root_sig_crc && + uniform_set_info->recent_binds[i].dynamic_state_mask == p_dynamic_offsets) { for (const RootDescriptorTable &table : uniform_set_info->recent_binds[i].root_tables.resources) { (cmd_buf_info->cmd_list.Get()->*set_root_desc_table_fn)(table.root_param_idx, table.start_gpu_handle); } @@ -3940,10 +4073,11 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd set_heap_walkers.resources = uniform_set_info->desc_heaps.resources.make_walker(); set_heap_walkers.samplers = uniform_set_info->desc_heaps.samplers.make_walker(); + const uint32_t binding_count = shader_set.bindings.size(); #ifdef DEV_ENABLED // Whether we have stages where the uniform is actually used should match // whether we have any root signature locations for it. - for (uint32_t i = 0; i < shader_set.bindings.size(); i++) { + for (uint32_t i = 0; i < binding_count; i++) { bool has_rs_locations = false; if (shader_set.bindings[i].root_sig_locations.resource.root_param_idx != UINT32_MAX || shader_set.bindings[i].root_sig_locations.sampler.root_param_idx != UINT32_MAX) { @@ -3967,21 +4101,25 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd RootDescriptorTable *resources = nullptr; RootDescriptorTable *samplers = nullptr; } tables; - for (uint32_t i = 0; i < shader_set.bindings.size(); i++) { + for (uint32_t i = 0; i < binding_count; i++) { const ShaderInfo::UniformBindingInfo &binding = shader_set.bindings[i]; uint32_t num_resource_descs = 0; uint32_t num_sampler_descs = 0; bool srv_uav_ambiguity = false; - _add_descriptor_count_for_uniform(binding.type, binding.length, false, num_resource_descs, num_sampler_descs, srv_uav_ambiguity); + const uint32_t frame_count_for_binding = 1u; // _add_descriptor_count_for_uniform wants frames.size() so we can create N entries. + // However we are binding now, and we must bind only one (not N of them), so set 1u. + _add_descriptor_count_for_uniform(binding.type, binding.length, false, num_resource_descs, num_sampler_descs, srv_uav_ambiguity, frame_count_for_binding); + + uint32_t dynamic_resources_to_skip = 0u; bool resource_used = false; - if (shader_set.bindings[i].stages) { + if (binding.stages) { { - const ShaderInfo::UniformBindingInfo::RootSignatureLocation &rs_loc_resource = shader_set.bindings[i].root_sig_locations.resource; + const ShaderInfo::UniformBindingInfo::RootSignatureLocation &rs_loc_resource = binding.root_sig_locations.resource; if (rs_loc_resource.root_param_idx != UINT32_MAX) { // Location used? 
DEV_ASSERT(num_resource_descs); - DEV_ASSERT(!(srv_uav_ambiguity && (shader_set.bindings[i].res_class != RES_CLASS_SRV && shader_set.bindings[i].res_class != RES_CLASS_UAV))); // [[SRV_UAV_AMBIGUITY]] + DEV_ASSERT(!(srv_uav_ambiguity && (binding.res_class != RES_CLASS_SRV && binding.res_class != RES_CLASS_UAV))); // [[SRV_UAV_AMBIGUITY]] bool must_flush_table = tables.resources && rs_loc_resource.root_param_idx != tables.resources->root_param_idx; if (must_flush_table) { @@ -4010,8 +4148,16 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd tables.resources->start_gpu_handle = frame_heap_walkers.resources->get_curr_gpu_handle(); } + // For dynamic buffers, jump to the last written offset. + if (binding.type == UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC || binding.type == UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC) { + const uint32_t dyn_frame_idx = (p_dynamic_offsets >> shift) & 0xFu; + shift += 4u; + set_heap_walkers.resources.advance(num_resource_descs * dyn_frame_idx); + dynamic_resources_to_skip = num_resource_descs * (frames.size() - dyn_frame_idx - 1u); + } + // If there is ambiguity and it didn't clarify as SRVs, skip them, which come first. [[SRV_UAV_AMBIGUITY]] - if (srv_uav_ambiguity && shader_set.bindings[i].res_class != RES_CLASS_SRV) { + if (srv_uav_ambiguity && binding.res_class != RES_CLASS_SRV) { set_heap_walkers.resources.advance(num_resource_descs); } @@ -4024,7 +4170,7 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd frame_heap_walkers.resources->advance(num_resource_descs); // If there is ambiguity and it didn't clarify as UAVs, skip them, which come later. [[SRV_UAV_AMBIGUITY]] - if (srv_uav_ambiguity && shader_set.bindings[i].res_class != RES_CLASS_UAV) { + if (srv_uav_ambiguity && binding.res_class != RES_CLASS_UAV) { set_heap_walkers.resources.advance(num_resource_descs); } @@ -4033,7 +4179,7 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd } { - const ShaderInfo::UniformBindingInfo::RootSignatureLocation &rs_loc_sampler = shader_set.bindings[i].root_sig_locations.sampler; + const ShaderInfo::UniformBindingInfo::RootSignatureLocation &rs_loc_sampler = binding.root_sig_locations.sampler; if (rs_loc_sampler.root_param_idx != UINT32_MAX) { // Location used? DEV_ASSERT(num_sampler_descs); DEV_ASSERT(!srv_uav_ambiguity); // [[SRV_UAV_AMBIGUITY]] @@ -4080,7 +4226,7 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd // the shader variant a given set is created upon may not need all of them due to DXC optimizations. // Therefore, at this point we have to advance through the descriptor set descriptor's heap unconditionally. 
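// Illustrative sketch (not part of the patch): how the set's descriptor walker stays
// aligned when only the current frame's copy of a dynamic binding is bound. With
// descriptors laid out as [copy 0][copy 1]...[copy N-1], the walker first jumps to
// the selected copy; whatever is left over is skipped afterwards, which is what
// `dynamic_resources_to_skip` accounts for in the advance below.

#include <cstdint>

struct WalkerSketch { // Stand-in for the driver's descriptor heap walker.
	uint32_t cursor = 0u;
	void advance(uint32_t p_count) { cursor += p_count; }
};

// Returns how many descriptors remain to be skipped once this binding is done.
inline uint32_t select_frame_copy(WalkerSketch &r_walker, uint32_t p_descs_per_copy, uint32_t p_frame_count, uint32_t p_dyn_frame_idx) {
	r_walker.advance(p_descs_per_copy * p_dyn_frame_idx); // Jump to this frame's copy.
	// ...the p_descs_per_copy descriptors at the cursor are consumed here...
	return p_descs_per_copy * (p_frame_count - p_dyn_frame_idx - 1u);
}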
- set_heap_walkers.resources.advance(num_resource_descs); + set_heap_walkers.resources.advance(num_resource_descs + dynamic_resources_to_skip); if (srv_uav_ambiguity) { DEV_ASSERT(num_resource_descs); if (!resource_used) { @@ -4109,6 +4255,7 @@ void RenderingDeviceDriverD3D12::_command_bind_uniform_set(CommandBufferID p_cmd last_bind->root_signature_crc = root_sig_crc; last_bind->segment_serial = frames[frame_idx].segment_serial; + last_bind->dynamic_state_mask = p_dynamic_offsets; } /******************/ @@ -4983,14 +5130,16 @@ void RenderingDeviceDriverD3D12::command_bind_render_pipeline(CommandBufferID p_ cmd_buf_info->compute_pso = nullptr; } -void RenderingDeviceDriverD3D12::command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { - _command_bind_uniform_set(p_cmd_buffer, p_uniform_set, p_shader, p_set_index, false); -} - -void RenderingDeviceDriverD3D12::command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { +void RenderingDeviceDriverD3D12::command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { + uint32_t shift = 0u; for (uint32_t i = 0u; i < p_set_count; ++i) { // TODO: _command_bind_uniform_set() does WAAAAY too much stuff. A lot of it should be already cached in UniformSetID when uniform_set_create() was called. Binding is supposed to be a cheap operation, ideally a memcpy. - _command_bind_uniform_set(p_cmd_buffer, p_uniform_sets[i], p_shader, p_first_set_index + i, false); + _command_bind_uniform_set(p_cmd_buffer, p_uniform_sets[i], p_shader, p_first_set_index + i, p_dynamic_offsets >> shift, false); + const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_sets[i].id; + shift += usi->dynamic_buffers.size() * 4u; + + // At this point this assert should already have been validated. + DEV_ASSERT((shift / 4u) <= MAX_DYNAMIC_BUFFERS); } } @@ -5503,14 +5652,16 @@ void RenderingDeviceDriverD3D12::command_bind_compute_pipeline(CommandBufferID p cmd_buf_info->graphics_pso = nullptr; } -void RenderingDeviceDriverD3D12::command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { - _command_bind_uniform_set(p_cmd_buffer, p_uniform_set, p_shader, p_set_index, true); -} - -void RenderingDeviceDriverD3D12::command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { +void RenderingDeviceDriverD3D12::command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { + uint32_t shift = 0u; for (uint32_t i = 0u; i < p_set_count; ++i) { // TODO: _command_bind_uniform_set() does WAAAAY too much stuff. A lot of it should be already cached in UniformSetID when uniform_set_create() was called. Binding is supposed to be a cheap operation, ideally a memcpy. 
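// Illustrative sketch (not part of the patch): the dynamic-offset state is a packed
// mask holding one 4-bit frame index per dynamic buffer. MAX_DYNAMIC_BUFFERS (8)
// nibbles fit exactly in a uint32_t, and sets consume their nibbles from the low end,
// which is why the loops here shift the mask right by 4 bits per dynamic buffer
// already bound.

#include <cstdint>

inline uint32_t pack_dynamic_offsets(const uint32_t *p_frame_indices, uint32_t p_count) {
	uint32_t mask = 0u;
	for (uint32_t i = 0u; i < p_count; ++i) {
		mask |= (p_frame_indices[i] & 0xFu) << (i * 4u); // Mirrors uniform_sets_get_dynamic_offsets().
	}
	return mask;
}

inline uint32_t unpack_frame_index(uint32_t p_mask, uint32_t p_nth_dynamic_buffer) {
	return (p_mask >> (p_nth_dynamic_buffer * 4u)) & 0xFu;
}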
-		_command_bind_uniform_set(p_cmd_buffer, p_uniform_sets[i], p_shader, p_first_set_index + i, true);
+		_command_bind_uniform_set(p_cmd_buffer, p_uniform_sets[i], p_shader, p_first_set_index + i, p_dynamic_offsets >> shift, true);
+		const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_sets[i].id;
+		shift += usi->dynamic_buffers.size() * 4u;
+
+		// At this point this assert should already have been validated.
+		DEV_ASSERT((shift / 4u) <= MAX_DYNAMIC_BUFFERS);
 	}
 }
 
@@ -6300,6 +6451,16 @@ Error RenderingDeviceDriverD3D12::_initialize_allocator() {
 	HRESULT res = D3D12MA::CreateAllocator(&allocator_desc, &allocator);
 	ERR_FAIL_COND_V_MSG(!SUCCEEDED(res), ERR_CANT_CREATE, "D3D12MA::CreateAllocator failed with error " + vformat("0x%08ux", (uint64_t)res) + ".");
 
+	if (allocator->IsGPUUploadHeapSupported()) {
+		dynamic_persistent_upload_heap = D3D12_HEAP_TYPE_GPU_UPLOAD;
+		print_verbose("D3D12: Device supports GPU UPLOAD heap.");
+	} else {
+		dynamic_persistent_upload_heap = D3D12_HEAP_TYPE_UPLOAD;
+		// Print it as a warning (instead of verbose) because, on the rare chance this lesser-used code path
+		// causes bugs, it gives us an inkling of what's going on (i.e. it helps repro bugs locally).
+		WARN_PRINT("D3D12: Device does NOT support GPU UPLOAD heap. ReBAR must be enabled for this feature. Regular UPLOAD heaps will be used as a fallback.");
+	}
+
 	return OK;
 }
 
diff --git a/drivers/d3d12/rendering_device_driver_d3d12.h b/drivers/d3d12/rendering_device_driver_d3d12.h
index 5d77ceb1019..3ea1abe19ca 100644
--- a/drivers/d3d12/rendering_device_driver_d3d12.h
+++ b/drivers/d3d12/rendering_device_driver_d3d12.h
@@ -144,6 +144,7 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
 	MiscFeaturesSupport misc_features_support;
 	RenderingShaderContainerFormatD3D12 shader_container_format;
 	String pipeline_cache_id;
+	D3D12_HEAP_TYPE dynamic_persistent_upload_heap = D3D12_HEAP_TYPE_UPLOAD;
 
 	class CPUDescriptorsHeapPool;
 
@@ -323,16 +324,29 @@ private:
 		uint64_t size = 0;
 		struct {
 			bool usable_as_uav : 1;
+			bool is_dynamic : 1; // Only used for tracking (e.g. Vulkan needs these checks).
 		} flags = {};
+
+		bool is_dynamic() const { return flags.is_dynamic; }
+	};
+
+	struct BufferDynamicInfo : BufferInfo {
+		uint32_t frame_idx = UINT32_MAX;
+		uint8_t *persistent_ptr = nullptr;
+#ifdef DEBUG_ENABLED
+		// For tracking that a persistent buffer isn't mapped twice in the same frame.
+ uint64_t last_frame_mapped = 0; +#endif }; public: - virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) override final; + virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) override final; virtual bool buffer_set_texel_format(BufferID p_buffer, DataFormat p_format) override final; virtual void buffer_free(BufferID p_buffer) override final; virtual uint64_t buffer_get_allocation_size(BufferID p_buffer) override final; virtual uint8_t *buffer_map(BufferID p_buffer) override final; virtual void buffer_unmap(BufferID p_buffer) override final; + virtual uint8_t *buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) override final; virtual uint64_t buffer_get_device_address(BufferID p_buffer) override final; /*****************/ @@ -705,6 +719,7 @@ private: struct RecentBind { uint64_t segment_serial = 0; + uint32_t dynamic_state_mask = 0; uint32_t root_signature_crc = 0; struct { TightLocalVector resources; @@ -713,6 +728,8 @@ private: int uses = 0; } recent_binds[4]; // A better amount may be empirically found. + TightLocalVector dynamic_buffers; + #ifdef DEV_ENABLED // Filthy, but useful for dev. struct ResourceDescInfo { @@ -726,6 +743,7 @@ private: public: virtual UniformSetID uniform_set_create(VectorView p_uniforms, ShaderID p_shader, uint32_t p_set_index, int p_linear_pool_index) override final; virtual void uniform_set_free(UniformSetID p_uniform_set) override final; + virtual uint32_t uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const override final; // ----- COMMANDS ----- @@ -733,8 +751,7 @@ public: private: void _command_check_descriptor_sets(CommandBufferID p_cmd_buffer); - void _command_bind_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index, bool p_for_compute); - void _command_bind_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, bool p_for_compute); + void _command_bind_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index, uint32_t p_dynamic_offsets, bool p_for_compute); public: /******************/ @@ -823,8 +840,7 @@ public: // Binding. virtual void command_bind_render_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final; - virtual void command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final; - virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final; + virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final; // Drawing. virtual void command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) override final; @@ -871,8 +887,7 @@ public: // Binding. 
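// Illustrative sketch (not part of the patch): a hypothetical caller combining the
// two entry points declared here. Only the member functions are from this header;
// the wrapper itself is invented for illustration.
void bind_compute_sets_sketch(RenderingDeviceDriverD3D12 &p_driver, RDD::CommandBufferID p_cmd, VectorView<RDD::UniformSetID> p_sets, RDD::ShaderID p_shader) {
	// Pack the current frame index of every dynamic buffer into one 32-bit mask...
	const uint32_t dynamic_offsets = p_driver.uniform_sets_get_dynamic_offsets(p_sets, p_shader, 0u, p_sets.size());
	// ...and hand it to the batched bind, which consumes 4 bits per dynamic buffer.
	p_driver.command_bind_compute_uniform_sets(p_cmd, p_sets, p_shader, 0u, p_sets.size(), dynamic_offsets);
}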
virtual void command_bind_compute_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final; - virtual void command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final; - virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final; + virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final; // Dispatching. virtual void command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) override final; diff --git a/drivers/metal/metal_objects.h b/drivers/metal/metal_objects.h index d019c08d489..4520b79b8b3 100644 --- a/drivers/metal/metal_objects.h +++ b/drivers/metal/metal_objects.h @@ -135,6 +135,8 @@ class RenderingDeviceDriverMetal; class MDUniformSet; class MDShader; +struct MetalBufferDynamicInfo; + #pragma mark - Resource Factory struct ClearAttKey { @@ -385,11 +387,12 @@ public: BitField dirty = DIRTY_NONE; LocalVector uniform_sets; + uint32_t dynamic_offsets = 0; // Bit mask of the uniform sets that are dirty, to prevent redundant binding. uint64_t uniform_set_mask = 0; uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE]; uint32_t push_constant_data_len = 0; - uint32_t push_constant_bindings[2] = { 0 }; + uint32_t push_constant_bindings[2] = { ~0U, ~0U }; _FORCE_INLINE_ void reset(); void end_encoding(); @@ -505,11 +508,12 @@ public: BitField dirty = DIRTY_NONE; LocalVector uniform_sets; + uint32_t dynamic_offsets = 0; // Bit mask of the uniform sets that are dirty, to prevent redundant binding. 
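// Illustrative sketch (not part of the patch): how a dirty mask like `uniform_set_mask`
// below is typically drained, binding only sets whose bit is raised. The patch's
// _render_bind_uniform_sets()/_compute_bind_uniform_sets() follow this pattern.

#include <cstdint>

inline void drain_dirty_sets_sketch(uint64_t p_dirty_mask) {
	for (uint32_t index = 0u; p_dirty_mask != 0u; ++index, p_dirty_mask >>= 1u) {
		if (p_dirty_mask & 1u) {
			// ...bind uniform set `index` here...
		}
	}
}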
uint64_t uniform_set_mask = 0; uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE]; uint32_t push_constant_data_len = 0; - uint32_t push_constant_bindings[1] = { 0 }; + uint32_t push_constant_bindings[1] = { ~0U }; _FORCE_INLINE_ void reset(); void end_encoding(); @@ -559,8 +563,7 @@ public: #pragma mark - Render Commands - void render_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index); - void render_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count); + void render_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets); void render_clear_attachments(VectorView p_attachment_clears, VectorView p_rects); void render_set_viewport(VectorView p_viewports); void render_set_scissor(VectorView p_scissors); @@ -593,8 +596,7 @@ public: #pragma mark - Compute Commands - void compute_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index); - void compute_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count); + void compute_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets); void compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups); void compute_dispatch_indirect(RDD::BufferID p_indirect_buffer, uint64_t p_offset); @@ -647,6 +649,7 @@ struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) UniformInfo { struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) UniformSet { LocalVector uniforms; + LocalVector dynamic_uniforms; uint32_t buffer_size = 0; HashMap offsets; HashMap> encoders; @@ -715,10 +718,62 @@ struct ShaderCacheEntry { ~ShaderCacheEntry() = default; }; +/// Godot limits the number of dynamic buffers to 8. +/// +/// This is a minimum guarantee for Vulkan. +constexpr uint32_t MAX_DYNAMIC_BUFFERS = 8; + +/// Maximum number of queued frames. 
+/// +/// See setting: rendering/rendering_device/vsync/frame_queue_size +constexpr uint32_t MAX_FRAME_COUNT = 4; + +class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) DynamicOffsetLayout { + struct Data { + uint8_t offset : 4; + uint8_t count : 4; + }; + + union { + Data data[MAX_DYNAMIC_BUFFERS]; + uint64_t _val = 0; + }; + +public: + _FORCE_INLINE_ bool is_empty() const { return _val == 0; } + + _FORCE_INLINE_ uint32_t get_count(uint32_t p_set_index) const { + return data[p_set_index].count; + } + + _FORCE_INLINE_ uint32_t get_offset(uint32_t p_set_index) const { + return data[p_set_index].offset; + } + + _FORCE_INLINE_ void set_offset_count(uint32_t p_set_index, uint8_t p_offset, uint8_t p_count) { + data[p_set_index].offset = p_offset; + data[p_set_index].count = p_count; + } + + _FORCE_INLINE_ uint32_t get_offset_index_shift(uint32_t p_set_index, uint32_t p_dynamic_index = 0) const { + return (data[p_set_index].offset + p_dynamic_index) * 4u; + } +}; + +class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) DynamicOffsets { + uint32_t data; + +public: + _FORCE_INLINE_ uint32_t get_frame_index(const DynamicOffsetLayout &p_layout) const { + return data; + } +}; + class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDShader { public: CharString name; Vector sets; + DynamicOffsetLayout dynamic_offset_layout; bool uses_argument_buffers = true; MDShader(CharString p_name, Vector p_sets, bool p_uses_argument_buffers) : @@ -786,30 +841,49 @@ struct HashMapComparatorDefault { struct BoundUniformSet { id buffer; ResourceUsageMap usage_to_resources; + /// Size of the per-frame buffer, which is 0 when there are no dynamic uniforms. + uint32_t frame_size = 0; /// Perform a 2-way merge each key of `ResourceVector` resources from this set into the /// destination set. /// /// Assumes the vectors of resources are sorted. void merge_into(ResourceUsageMap &p_dst) const; + + /// Returns true if this bound uniform set contains dynamic uniforms. + _FORCE_INLINE_ bool is_dynamic() const { return frame_size > 0; } + + /// Calculate the offset in the Metal buffer for the current frame. + _FORCE_INLINE_ uint32_t frame_offset(uint32_t p_frame_index) const { return p_frame_index * frame_size; } + + /// Calculate the offset in the buffer for the given frame index and base offset. 
+ _FORCE_INLINE_ uint32_t make_offset(uint32_t p_frame_index, uint32_t p_base_offset) const { + return frame_offset(p_frame_index) + p_base_offset; + } + + BoundUniformSet() = default; + BoundUniformSet(id p_buffer, ResourceUsageMap &&p_usage_to_resources, uint32_t p_frame_size) : + buffer(p_buffer), usage_to_resources(std::move(p_usage_to_resources)), frame_size(p_frame_size) {} }; class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDUniformSet { private: - void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index); - void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index); - void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index); - void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index); + void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count); + void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets); + void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count); + void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets); + + void update_dynamic_uniforms(MDShader *p_shader, ResourceUsageMap &p_resource_usage, uint32_t p_set_index, BoundUniformSet &p_bound_set, uint32_t p_dynamic_offsets, uint32_t p_frame_idx); public: - uint32_t index; + uint32_t index = 0; LocalVector uniforms; HashMap bound_uniforms; - void bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index); - void bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index); + void bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count); + void bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count); - BoundUniformSet &bound_uniform_set(MDShader *p_shader, id p_device, ResourceUsageMap &p_resource_usage, uint32_t p_set_index); + BoundUniformSet &bound_uniform_set(MDShader *p_shader, id p_device, ResourceUsageMap &p_resource_usage, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count); }; class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDPipeline { diff --git a/drivers/metal/metal_objects.mm b/drivers/metal/metal_objects.mm index af106f5d35a..8d439ef342a 100644 --- a/drivers/metal/metal_objects.mm +++ b/drivers/metal/metal_objects.mm @@ -247,39 +247,26 @@ void MDCommandBuffer::encodeRenderCommandEncoderWithDescriptor(MTLRenderPassDesc #pragma mark - Render Commands -void MDCommandBuffer::render_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index) { +void MDCommandBuffer::render_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { DEV_ASSERT(type == MDCommandBufferStateType::Render); - MDUniformSet *set = 
(MDUniformSet *)(p_uniform_set.id); - if (render.uniform_sets.size() <= p_set_index) { + render.dynamic_offsets |= p_dynamic_offsets; + + if (uint32_t new_size = p_first_set_index + p_set_count; render.uniform_sets.size() < new_size) { uint32_t s = render.uniform_sets.size(); - render.uniform_sets.resize(p_set_index + 1); + render.uniform_sets.resize(new_size); // Set intermediate values to null. - std::fill(&render.uniform_sets[s], &render.uniform_sets[p_set_index] + 1, nullptr); + std::fill(&render.uniform_sets[s], render.uniform_sets.end().operator->(), nullptr); } - if (render.uniform_sets[p_set_index] != set) { - render.dirty.set_flag(RenderState::DIRTY_UNIFORMS); - render.uniform_set_mask |= 1ULL << p_set_index; - render.uniform_sets[p_set_index] = set; - } -} - -void MDCommandBuffer::render_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { - DEV_ASSERT(type == MDCommandBufferStateType::Render); + const MDShader *shader = (const MDShader *)p_shader.id; + DynamicOffsetLayout layout = shader->dynamic_offset_layout; for (size_t i = 0; i < p_set_count; ++i) { MDUniformSet *set = (MDUniformSet *)(p_uniform_sets[i].id); uint32_t index = p_first_set_index + i; - if (render.uniform_sets.size() <= index) { - uint32_t s = render.uniform_sets.size(); - render.uniform_sets.resize(index + 1); - // Set intermediate values to null. - std::fill(&render.uniform_sets[s], &render.uniform_sets[index] + 1, nullptr); - } - - if (render.uniform_sets[index] != set) { + if (render.uniform_sets[index] != set || layout.get_count(index) > 0) { render.dirty.set_flag(RenderState::DIRTY_UNIFORMS); render.uniform_set_mask |= 1ULL << index; render.uniform_sets[index] = set; @@ -524,6 +511,7 @@ void MDCommandBuffer::_render_bind_uniform_sets() { render.uniform_set_mask = 0; MDRenderShader *shader = render.pipeline->shader; + const uint32_t dynamic_offsets = render.dynamic_offsets; while (set_uniforms != 0) { // Find the index of the next set bit. @@ -534,7 +522,7 @@ void MDCommandBuffer::_render_bind_uniform_sets() { if (set == nullptr || index >= (uint32_t)shader->sets.size()) { continue; } - set->bind_uniforms(shader, render, index); + set->bind_uniforms(shader, render, index, dynamic_offsets, device_driver->frame_index, device_driver->frame_count); } } @@ -819,7 +807,8 @@ void MDCommandBuffer::render_bind_vertex_buffers(uint32_t p_binding_count, const // Reverse the buffers, as their bindings are assigned in descending order. for (uint32_t i = 0; i < p_binding_count; i += 1) { - render.vertex_buffers[i] = rid::get(p_buffers[p_binding_count - i - 1]); + const RenderingDeviceDriverMetal::BufferInfo *buf_info = (const RenderingDeviceDriverMetal::BufferInfo *)p_buffers[p_binding_count - i - 1].id; + render.vertex_buffers[i] = buf_info->metal_buffer; render.vertex_offsets[i] = p_offsets[p_binding_count - i - 1]; } @@ -837,7 +826,9 @@ void MDCommandBuffer::render_bind_vertex_buffers(uint32_t p_binding_count, const void MDCommandBuffer::render_bind_index_buffer(RDD::BufferID p_buffer, RDD::IndexBufferFormat p_format, uint64_t p_offset) { DEV_ASSERT(type == MDCommandBufferStateType::Render); - render.index_buffer = rid::get(p_buffer); + const RenderingDeviceDriverMetal::BufferInfo *buffer = (const RenderingDeviceDriverMetal::BufferInfo *)p_buffer.id; + + render.index_buffer = buffer->metal_buffer; render.index_type = p_format == RDD::IndexBufferFormat::INDEX_BUFFER_FORMAT_UINT16 ? 
MTLIndexTypeUInt16 : MTLIndexTypeUInt32; render.index_offset = p_offset; } @@ -880,7 +871,7 @@ void MDCommandBuffer::render_draw_indexed_indirect(RDD::BufferID p_indirect_buff id enc = render.encoder; - id indirect_buffer = rid::get(p_indirect_buffer); + const RenderingDeviceDriverMetal::BufferInfo *indirect_buffer = (const RenderingDeviceDriverMetal::BufferInfo *)p_indirect_buffer.id; NSUInteger indirect_offset = p_offset; for (uint32_t i = 0; i < p_draw_count; i++) { @@ -888,7 +879,7 @@ void MDCommandBuffer::render_draw_indexed_indirect(RDD::BufferID p_indirect_buff indexType:render.index_type indexBuffer:render.index_buffer indexBufferOffset:0 - indirectBuffer:indirect_buffer + indirectBuffer:indirect_buffer->metal_buffer indirectBufferOffset:indirect_offset]; indirect_offset += p_stride; } @@ -906,12 +897,12 @@ void MDCommandBuffer::render_draw_indirect(RDD::BufferID p_indirect_buffer, uint id enc = render.encoder; - id indirect_buffer = rid::get(p_indirect_buffer); + const RenderingDeviceDriverMetal::BufferInfo *indirect_buffer = (const RenderingDeviceDriverMetal::BufferInfo *)p_indirect_buffer.id; NSUInteger indirect_offset = p_offset; for (uint32_t i = 0; i < p_draw_count; i++) { [enc drawPrimitives:render.pipeline->raster_state.render_primitive - indirectBuffer:indirect_buffer + indirectBuffer:indirect_buffer->metal_buffer indirectBufferOffset:indirect_offset]; indirect_offset += p_stride; } @@ -944,7 +935,10 @@ void MDCommandBuffer::RenderState::reset() { index_type = MTLIndexTypeUInt16; dirty = DIRTY_NONE; uniform_sets.clear(); + dynamic_offsets = 0; uniform_set_mask = 0; + push_constant_bindings[0] = ~0U; + push_constant_bindings[1] = ~0U; push_constant_data_len = 0; clear_values.clear(); viewports.clear(); @@ -1041,6 +1035,7 @@ void MDCommandBuffer::_compute_bind_uniform_sets() { compute.uniform_set_mask = 0; MDComputeShader *shader = compute.pipeline->shader; + const uint32_t dynamic_offsets = compute.dynamic_offsets; while (set_uniforms != 0) { // Find the index of the next set bit. @@ -1051,7 +1046,7 @@ void MDCommandBuffer::_compute_bind_uniform_sets() { if (set == nullptr || index >= (uint32_t)shader->sets.size()) { continue; } - set->bind_uniforms(shader, compute, index); + set->bind_uniforms(shader, compute, index, dynamic_offsets, device_driver->frame_index, device_driver->frame_count); } } @@ -1060,7 +1055,9 @@ void MDCommandBuffer::ComputeState::reset() { encoder = nil; dirty = DIRTY_NONE; uniform_sets.clear(); + dynamic_offsets = 0; uniform_set_mask = 0; + push_constant_bindings[0] = ~0U; push_constant_data_len = 0; // Keep the keys, as they are likely to be used again. 
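// Illustrative sketch (not part of the patch): the reset above now seeds
// push_constant_bindings with ~0U instead of 0. Since 0 is a valid Metal buffer
// index, ~0U is assumed here to act as a "no binding assigned" sentinel that a
// consumer can test before pushing constants.

#include <cstdint>

inline bool has_push_constant_binding(uint32_t p_binding) {
	return p_binding != ~0u; // ~0U is assumed to mean "not assigned".
}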
for (KeyValue>> &kv : resource_usage) { @@ -1068,39 +1065,26 @@ void MDCommandBuffer::ComputeState::reset() { } } -void MDCommandBuffer::compute_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index) { +void MDCommandBuffer::compute_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); - MDUniformSet *set = (MDUniformSet *)(p_uniform_set.id); - if (compute.uniform_sets.size() <= p_set_index) { - uint32_t s = render.uniform_sets.size(); - compute.uniform_sets.resize(p_set_index + 1); + compute.dynamic_offsets |= p_dynamic_offsets; + + if (uint32_t new_size = p_first_set_index + p_set_count; compute.uniform_sets.size() < new_size) { + uint32_t s = compute.uniform_sets.size(); + compute.uniform_sets.resize(new_size); // Set intermediate values to null. - std::fill(&compute.uniform_sets[s], &compute.uniform_sets[p_set_index] + 1, nullptr); + std::fill(&compute.uniform_sets[s], compute.uniform_sets.end().operator->(), nullptr); } - if (compute.uniform_sets[p_set_index] != set) { - compute.dirty.set_flag(ComputeState::DIRTY_UNIFORMS); - compute.uniform_set_mask |= 1ULL << p_set_index; - compute.uniform_sets[p_set_index] = set; - } -} - -void MDCommandBuffer::compute_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { - DEV_ASSERT(type == MDCommandBufferStateType::Compute); + const MDShader *shader = (const MDShader *)p_shader.id; + DynamicOffsetLayout layout = shader->dynamic_offset_layout; for (size_t i = 0; i < p_set_count; ++i) { MDUniformSet *set = (MDUniformSet *)(p_uniform_sets[i].id); uint32_t index = p_first_set_index + i; - if (compute.uniform_sets.size() <= index) { - uint32_t s = compute.uniform_sets.size(); - compute.uniform_sets.resize(index + 1); - // Set intermediate values to null. 
- std::fill(&compute.uniform_sets[s], &compute.uniform_sets[index] + 1, nullptr); - } - - if (compute.uniform_sets[index] != set) { + if (compute.uniform_sets[index] != set || layout.get_count(index) > 0) { compute.dirty.set_flag(ComputeState::DIRTY_UNIFORMS); compute.uniform_set_mask |= 1ULL << index; compute.uniform_sets[index] = set; @@ -1124,10 +1108,10 @@ void MDCommandBuffer::compute_dispatch_indirect(RDD::BufferID p_indirect_buffer, _compute_set_dirty_state(); - id indirectBuffer = rid::get(p_indirect_buffer); + const RenderingDeviceDriverMetal::BufferInfo *indirectBuffer = (const RenderingDeviceDriverMetal::BufferInfo *)p_indirect_buffer.id; id enc = compute.encoder; - [enc dispatchThreadgroupsWithIndirectBuffer:indirectBuffer indirectBufferOffset:p_offset threadsPerThreadgroup:compute.pipeline->compute_state.local]; + [enc dispatchThreadgroupsWithIndirectBuffer:indirectBuffer->metal_buffer indirectBufferOffset:p_offset threadsPerThreadgroup:compute.pipeline->compute_state.local]; } void MDCommandBuffer::_end_compute_dispatch() { @@ -1164,7 +1148,7 @@ MDRenderShader::MDRenderShader(CharString p_name, frag(p_frag) { } -void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index) { +void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count) { DEV_ASSERT(p_shader->uses_argument_buffers); DEV_ASSERT(p_state.encoder != nil); @@ -1173,48 +1157,54 @@ void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandB id __unsafe_unretained enc = p_state.encoder; id __unsafe_unretained device = enc.device; - BoundUniformSet &bus = bound_uniform_set(p_shader, device, p_state.resource_usage, p_set_index); + BoundUniformSet &bus = bound_uniform_set(p_shader, device, p_state.resource_usage, p_set_index, p_dynamic_offsets, p_frame_idx, p_frame_count); // Set the buffer for the vertex stage. { uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_VERTEX); if (offset) { - [enc setVertexBuffer:bus.buffer offset:*offset atIndex:p_set_index]; + [enc setVertexBuffer:bus.buffer offset:bus.make_offset(p_frame_idx, *offset) atIndex:p_set_index]; } } // Set the buffer for the fragment stage. 
{ uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_FRAGMENT); if (offset) { - [enc setFragmentBuffer:bus.buffer offset:*offset atIndex:p_set_index]; + [enc setFragmentBuffer:bus.buffer offset:bus.make_offset(p_frame_idx, *offset) atIndex:p_set_index]; } } } -void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index) { +void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets) { DEV_ASSERT(!p_shader->uses_argument_buffers); DEV_ASSERT(p_state.encoder != nil); id __unsafe_unretained enc = p_state.encoder; UniformSet const &set = p_shader->sets[p_set_index]; + DynamicOffsetLayout layout = p_shader->dynamic_offset_layout; + uint32_t dynamic_index = 0; for (uint32_t i = 0; i < MIN(uniforms.size(), set.uniforms.size()); i++) { RDD::BoundUniform const &uniform = uniforms[i]; const UniformInfo &ui = set.uniforms[i]; + uint32_t frame_idx; + if (uniform.is_dynamic()) { + uint32_t shift = layout.get_offset_index_shift(p_set_index, dynamic_index); + dynamic_index++; + frame_idx = (p_dynamic_offsets >> shift) & 0xf; + } else { + frame_idx = 0; + } + static const RDC::ShaderStage stage_usages[2] = { RDC::ShaderStage::SHADER_STAGE_VERTEX, RDC::ShaderStage::SHADER_STAGE_FRAGMENT }; for (const RDC::ShaderStage stage : stage_usages) { ShaderStageUsage const stage_usage = ShaderStageUsage(1 << stage); const BindingInfo *bi = ui.bindings.getptr(stage); - if (bi == nullptr) { - // No binding for this stage. - continue; - } - - if ((ui.active_stages & stage_usage) == 0) { - // Not active for this state, so don't bind anything. + if (bi == nullptr || (ui.active_stages & stage_usage) == 0) { + // No binding for this stage or it is not active continue; } @@ -1321,20 +1311,22 @@ void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::Ren case RDD::UNIFORM_TYPE_IMAGE_BUFFER: { CRASH_NOW_MSG("not implemented: UNIFORM_TYPE_IMAGE_BUFFER"); } break; - case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: { - id buffer = rid::get(uniform.ids[0]); + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: + case RDD::UNIFORM_TYPE_STORAGE_BUFFER: { + const RenderingDeviceDriverMetal::BufferInfo *buf_info = (const RenderingDeviceDriverMetal::BufferInfo *)uniform.ids[0].id; if (stage == RDD::SHADER_STAGE_VERTEX) { - [enc setVertexBuffer:buffer offset:0 atIndex:bi->index]; + [enc setVertexBuffer:buf_info->metal_buffer offset:0 atIndex:bi->index]; } else { - [enc setFragmentBuffer:buffer offset:0 atIndex:bi->index]; + [enc setFragmentBuffer:buf_info->metal_buffer offset:0 atIndex:bi->index]; } } break; - case RDD::UNIFORM_TYPE_STORAGE_BUFFER: { - id buffer = rid::get(uniform.ids[0]); + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: + case RDD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { + const MetalBufferDynamicInfo *buf_info = (const MetalBufferDynamicInfo *)uniform.ids[0].id; if (stage == RDD::SHADER_STAGE_VERTEX) { - [enc setVertexBuffer:buffer offset:0 atIndex:bi->index]; + [enc setVertexBuffer:buf_info->metal_buffer offset:frame_idx * buf_info->size_bytes atIndex:bi->index]; } else { - [enc setFragmentBuffer:buffer offset:0 atIndex:bi->index]; + [enc setFragmentBuffer:buf_info->metal_buffer offset:frame_idx * buf_info->size_bytes atIndex:bi->index]; } } break; case RDD::UNIFORM_TYPE_INPUT_ATTACHMENT: { @@ -1368,15 +1360,15 @@ void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::Ren } } -void MDUniformSet::bind_uniforms(MDShader *p_shader, 
MDCommandBuffer::RenderState &p_state, uint32_t p_set_index) { +void MDUniformSet::bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count) { if (p_shader->uses_argument_buffers) { - bind_uniforms_argument_buffers(p_shader, p_state, p_set_index); + bind_uniforms_argument_buffers(p_shader, p_state, p_set_index, p_dynamic_offsets, p_frame_idx, p_frame_count); } else { - bind_uniforms_direct(p_shader, p_state, p_set_index); + bind_uniforms_direct(p_shader, p_state, p_set_index, p_dynamic_offsets); } } -void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index) { +void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count) { DEV_ASSERT(p_shader->uses_argument_buffers); DEV_ASSERT(p_state.encoder != nil); @@ -1385,40 +1377,46 @@ void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandB id enc = p_state.encoder; id device = enc.device; - BoundUniformSet &bus = bound_uniform_set(p_shader, device, p_state.resource_usage, p_set_index); + BoundUniformSet &bus = bound_uniform_set(p_shader, device, p_state.resource_usage, p_set_index, p_dynamic_offsets, p_frame_idx, p_frame_count); uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_COMPUTE); if (offset) { - [enc setBuffer:bus.buffer offset:*offset atIndex:p_set_index]; + [enc setBuffer:bus.buffer offset:bus.make_offset(p_frame_idx, *offset) atIndex:p_set_index]; } } -void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index) { +void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets) { DEV_ASSERT(!p_shader->uses_argument_buffers); DEV_ASSERT(p_state.encoder != nil); id __unsafe_unretained enc = p_state.encoder; UniformSet const &set = p_shader->sets[p_set_index]; + DynamicOffsetLayout layout = p_shader->dynamic_offset_layout; + uint32_t dynamic_index = 0; for (uint32_t i = 0; i < uniforms.size(); i++) { RDD::BoundUniform const &uniform = uniforms[i]; const UniformInfo &ui = set.uniforms[i]; + uint32_t frame_idx; + if (uniform.is_dynamic()) { + uint32_t shift = layout.get_offset_index_shift(p_set_index, dynamic_index); + dynamic_index++; + frame_idx = (p_dynamic_offsets >> shift) & 0xf; + } else { + frame_idx = 0; + } + const RDC::ShaderStage stage = RDC::ShaderStage::SHADER_STAGE_COMPUTE; const ShaderStageUsage stage_usage = ShaderStageUsage(1 << stage); const BindingInfo *bi = ui.bindings.getptr(stage); - if (bi == nullptr) { + if (bi == nullptr || (ui.active_stages & stage_usage) == 0) { // No binding for this stage. continue; } - if ((ui.active_stages & stage_usage) == 0) { - // Not active for this state, so don't bind anything. 
- continue; - } - switch (uniform.type) { case RDD::UNIFORM_TYPE_SAMPLER: { size_t count = uniform.ids.size(); @@ -1490,13 +1488,15 @@ void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::Com case RDD::UNIFORM_TYPE_IMAGE_BUFFER: { CRASH_NOW_MSG("not implemented: UNIFORM_TYPE_IMAGE_BUFFER"); } break; - case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: { - id buffer = rid::get(uniform.ids[0]); - [enc setBuffer:buffer offset:0 atIndex:bi->index]; - } break; + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: case RDD::UNIFORM_TYPE_STORAGE_BUFFER: { - id buffer = rid::get(uniform.ids[0]); - [enc setBuffer:buffer offset:0 atIndex:bi->index]; + const RenderingDeviceDriverMetal::BufferInfo *buf_info = (const RenderingDeviceDriverMetal::BufferInfo *)uniform.ids[0].id; + [enc setBuffer:buf_info->metal_buffer offset:0 atIndex:bi->index]; + } break; + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: + case RDD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { + const MetalBufferDynamicInfo *buf_info = (const MetalBufferDynamicInfo *)uniform.ids[0].id; + [enc setBuffer:buf_info->metal_buffer offset:frame_idx * buf_info->size_bytes atIndex:bi->index]; } break; case RDD::UNIFORM_TYPE_INPUT_ATTACHMENT: { size_t count = uniform.ids.size(); @@ -1519,19 +1519,23 @@ void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::Com } } -void MDUniformSet::bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index) { +void MDUniformSet::bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count) { if (p_shader->uses_argument_buffers) { - bind_uniforms_argument_buffers(p_shader, p_state, p_set_index); + bind_uniforms_argument_buffers(p_shader, p_state, p_set_index, p_dynamic_offsets, p_frame_idx, p_frame_count); } else { - bind_uniforms_direct(p_shader, p_state, p_set_index); + bind_uniforms_direct(p_shader, p_state, p_set_index, p_dynamic_offsets); } } -BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, id p_device, ResourceUsageMap &p_resource_usage, uint32_t p_set_index) { +BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, id p_device, ResourceUsageMap &p_resource_usage, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count) { BoundUniformSet *sus = bound_uniforms.getptr(p_shader); if (sus != nullptr) { - sus->merge_into(p_resource_usage); - return *sus; + BoundUniformSet &bs = *sus; + if (bs.is_dynamic()) { + update_dynamic_uniforms(p_shader, p_resource_usage, p_set_index, bs, p_dynamic_offsets, p_frame_idx); + } + bs.merge_into(p_resource_usage); + return bs; } UniformSet const &set = p_shader->sets[p_set_index]; @@ -1546,9 +1550,18 @@ BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, id enc_buffer = nil; + uint32_t frame_size = set.buffer_size; + uint32_t buffer_size = frame_size; + if (!set.dynamic_uniforms.is_empty()) { + // We need to store a copy of the argument buffer for each frame that could be in flight, just + // like the dynamic buffers themselves. 
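+	// A hedged sketch of the per-frame offset math this implies. Assuming BoundUniformSet keeps
+	// this per-frame stride (0 for sets without dynamic uniforms), the make_offset() used by the
+	// binding paths can be thought of as (illustrative, not necessarily the exact implementation):
+	//
+	//   uint32_t BoundUniformSet::make_offset(uint32_t p_frame_idx, uint32_t p_offset) const {
+	//       // Dynamic sets keep one argument-buffer copy per frame in flight;
+	//       // non-dynamic sets have frame_size == 0, so p_offset passes through unchanged.
+	//       return p_frame_idx * frame_size + p_offset;
+	//   }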
+ buffer_size *= p_frame_count; + } else { + frame_size = 0; + } if (set.buffer_size > 0) { - MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceHazardTrackingModeTracked; - enc_buffer = [p_device newBufferWithLength:set.buffer_size options:options]; + MTLResourceOptions options = MTLResourceHazardTrackingModeUntracked | MTLResourceStorageModeShared; + enc_buffer = [p_device newBufferWithLength:buffer_size options:options]; for (KeyValue> const &kv : set.encoders) { RDD::ShaderStage const stage = kv.key; ShaderStageUsage const stage_usage = ShaderStageUsage(1 << stage); @@ -1647,16 +1660,18 @@ BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, id buffer = rid::get(uniform.ids[0]); - [enc setBuffer:buffer offset:0 atIndex:bi->index]; - add_usage(buffer, stage, bi->usage); - } break; + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: case RDD::UNIFORM_TYPE_STORAGE_BUFFER: { - id buffer = rid::get(uniform.ids[0]); - [enc setBuffer:buffer offset:0 atIndex:bi->index]; - add_usage(buffer, stage, bi->usage); + const RenderingDeviceDriverMetal::BufferInfo *buf_info = (const RenderingDeviceDriverMetal::BufferInfo *)uniform.ids[0].id; + [enc setBuffer:buf_info->metal_buffer offset:0 atIndex:bi->index]; + add_usage(buf_info->metal_buffer, stage, bi->usage); } break; + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: + case RDD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { + const MetalBufferDynamicInfo *buf_info = (const MetalBufferDynamicInfo *)uniform.ids[0].id; + add_usage(buf_info->metal_buffer, stage, bi->usage); + } break; + case RDD::UNIFORM_TYPE_INPUT_ATTACHMENT: { size_t count = uniform.ids.size(); if (count == 1) { @@ -1679,6 +1694,16 @@ BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, id 0) { + void *ptr = enc_buffer.contents; + for (uint32_t i = 1; i < p_frame_count; i++) { + void *dst = (void *)((uintptr_t)ptr + i * frame_size); + memcpy(dst, ptr, frame_size); + } + } } ResourceUsageMap usage_to_resources; @@ -1693,10 +1718,57 @@ BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, idvalue; + if (bs.is_dynamic()) { + update_dynamic_uniforms(p_shader, p_resource_usage, p_set_index, bs, p_dynamic_offsets, p_frame_idx); + } bs.merge_into(p_resource_usage); - return bound_uniforms.get(p_shader); + return bs; +} + +void MDUniformSet::update_dynamic_uniforms(MDShader *p_shader, ResourceUsageMap &p_resource_usage, uint32_t p_set_index, BoundUniformSet &p_bound_set, uint32_t p_dynamic_offsets, uint32_t p_frame_idx) { + // This shouldn't be called if the set doesn't have dynamic uniforms. + DEV_ASSERT(p_bound_set.is_dynamic()); + + UniformSet const &set = p_shader->sets[p_set_index]; + DEV_ASSERT(!set.dynamic_uniforms.is_empty()); // Programming error if this is empty. + + DynamicOffsetLayout layout = p_shader->dynamic_offset_layout; + + for (KeyValue> const &kv : set.encoders) { + RDD::ShaderStage const stage = kv.key; + ShaderStageUsage const stage_usage = ShaderStageUsage(1 << stage); + id const __unsafe_unretained enc = kv.value; + + [enc setArgumentBuffer:p_bound_set.buffer offset:p_bound_set.make_offset(p_frame_idx, set.offsets[stage])]; + + uint32_t dynamic_index = 0; + + for (uint32_t i : set.dynamic_uniforms) { + RDD::BoundUniform const &uniform = uniforms[i]; + const UniformInfo &ui = set.uniforms[i]; + + const BindingInfo *bi = ui.bindings.getptr(stage); + if (bi == nullptr) { + // No binding for this stage. + continue; + } + + if ((ui.active_stages & stage_usage) == None) { + // Not active for this state, so don't bind anything. 
+ continue; + } + + uint32_t shift = layout.get_offset_index_shift(p_set_index, dynamic_index); + dynamic_index++; + uint32_t frame_idx = (p_dynamic_offsets >> shift) & 0xf; + + const MetalBufferDynamicInfo *buf_info = (const MetalBufferDynamicInfo *)uniform.ids[0].id; + [enc setBuffer:buf_info->metal_buffer + offset:frame_idx * buf_info->size_bytes + atIndex:bi->index]; + } + } } MTLFmtCaps MDSubpass::getRequiredFmtCapsForAttachmentAt(uint32_t p_index) const { diff --git a/drivers/metal/rendering_device_driver_metal.h b/drivers/metal/rendering_device_driver_metal.h index fcefae591ef..0012ac22f06 100644 --- a/drivers/metal/rendering_device_driver_metal.h +++ b/drivers/metal/rendering_device_driver_metal.h @@ -48,6 +48,7 @@ class RenderingContextDriverMetal; class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) RenderingDeviceDriverMetal : public RenderingDeviceDriver { friend struct ShaderCacheEntry; + friend class MDCommandBuffer; template using Result = std::variant; @@ -58,6 +59,12 @@ class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) RenderingDeviceDriverMet RenderingContextDriver::Device context_device; id device = nil; + uint32_t frame_count = 1; + /// frame_index is a cyclic counter derived from the current frame number modulo frame_count, + /// cycling through values from 0 to frame_count - 1 + uint32_t frame_index = 0; + uint32_t frames_drawn = 0; + MetalDeviceProperties *device_properties = nullptr; MetalDeviceProfile device_profile; RenderingShaderContainerFormatMetal *shader_container_format = nullptr; @@ -101,12 +108,27 @@ public: #pragma mark - Buffers public: - virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) override final; + struct BufferInfo { + id metal_buffer; + + _FORCE_INLINE_ bool is_dynamic() const { return _frame_idx != UINT32_MAX; } + _FORCE_INLINE_ uint32_t frame_index() const { return _frame_idx; } + _FORCE_INLINE_ void set_frame_index(uint32_t p_frame_index) { _frame_idx = p_frame_index; } + + protected: + // If dynamic buffer, then its range is [0; RenderingDeviceDriverMetal::frame_count) + // else it's UINT32_MAX. + uint32_t _frame_idx = UINT32_MAX; + }; + + virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) override final; virtual bool buffer_set_texel_format(BufferID p_buffer, DataFormat p_format) override final; virtual void buffer_free(BufferID p_buffer) override final; virtual uint64_t buffer_get_allocation_size(BufferID p_buffer) override final; virtual uint8_t *buffer_map(BufferID p_buffer) override final; virtual void buffer_unmap(BufferID p_buffer) override final; + virtual uint8_t *buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) override final; + virtual void buffer_flush(BufferID p_buffer) override final; virtual uint64_t buffer_get_device_address(BufferID p_buffer) override final; #pragma mark - Texture @@ -253,6 +275,7 @@ public: public: virtual UniformSetID uniform_set_create(VectorView p_uniforms, ShaderID p_shader, uint32_t p_set_index, int p_linear_pool_index) override final; virtual void uniform_set_free(UniformSetID p_uniform_set) override final; + virtual uint32_t uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const override final; #pragma mark - Commands @@ -323,8 +346,7 @@ public: // Binding. 
virtual void command_bind_render_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final; - virtual void command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final; - virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final; + virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final; // Drawing. virtual void command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) override final; @@ -364,8 +386,7 @@ public: // Binding. virtual void command_bind_compute_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final; - virtual void command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final; - virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final; + virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final; // Dispatching. virtual void command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) override final; @@ -438,3 +459,17 @@ public: RenderingDeviceDriverMetal(RenderingContextDriverMetal *p_context_driver); ~RenderingDeviceDriverMetal(); }; + +// Defined outside because we need to forward declare it in metal_objects.h +struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MetalBufferDynamicInfo : public RenderingDeviceDriverMetal::BufferInfo { + uint64_t size_bytes; // Contains the real buffer size / frame_count. + uint32_t next_frame_index(uint32_t p_frame_count) { + // This is the next frame index to use for this buffer. + _frame_idx = (_frame_idx + 1u) % p_frame_count; + return _frame_idx; + } +#ifdef DEBUG_ENABLED + // For tracking that a persistent buffer isn't mapped twice in the same frame. 
+ uint64_t last_frame_mapped = 0; +#endif +}; diff --git a/drivers/metal/rendering_device_driver_metal.mm b/drivers/metal/rendering_device_driver_metal.mm index 3502259e579..105d7f8eac8 100644 --- a/drivers/metal/rendering_device_driver_metal.mm +++ b/drivers/metal/rendering_device_driver_metal.mm @@ -64,9 +64,7 @@ #import #import #import -#import -#import -#import +#include #pragma mark - Logging @@ -121,20 +119,44 @@ _FORCE_INLINE_ static bool operator==(MTLSize p_a, MTLSize p_b) { /**** BUFFERS ****/ /*****************/ -RDD::BufferID RenderingDeviceDriverMetal::buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) { - MTLResourceOptions options = MTLResourceHazardTrackingModeTracked; +RDD::BufferID RenderingDeviceDriverMetal::buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) { + const uint64_t original_size = p_size; + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + p_size = round_up_to_alignment(p_size, 16u) * frame_count; + } + + MTLResourceOptions options = 0; switch (p_allocation_type) { case MEMORY_ALLOCATION_TYPE_CPU: - options |= MTLResourceStorageModeShared; + options = MTLResourceHazardTrackingModeTracked | MTLResourceStorageModeShared; break; case MEMORY_ALLOCATION_TYPE_GPU: - options |= MTLResourceStorageModePrivate; + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + options = MTLResourceHazardTrackingModeUntracked | MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined; + } else { + options = MTLResourceHazardTrackingModeTracked | MTLResourceStorageModePrivate; + } break; } id obj = [device newBufferWithLength:p_size options:options]; ERR_FAIL_NULL_V_MSG(obj, BufferID(), "Can't create buffer of size: " + itos(p_size)); - return rid::make(obj); + + BufferInfo *buf_info; + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + MetalBufferDynamicInfo *dyn_buffer = memnew(MetalBufferDynamicInfo); + buf_info = dyn_buffer; +#ifdef DEBUG_ENABLED + dyn_buffer->last_frame_mapped = p_frames_drawn - 1ul; +#endif + dyn_buffer->set_frame_index(0u); + dyn_buffer->size_bytes = round_up_to_alignment(original_size, 16u); + } else { + buf_info = memnew(BufferInfo); + } + buf_info->metal_buffer = obj; + + return BufferID(buf_info); } bool RenderingDeviceDriverMetal::buffer_set_texel_format(BufferID p_buffer, DataFormat p_format) { @@ -143,28 +165,49 @@ bool RenderingDeviceDriverMetal::buffer_set_texel_format(BufferID p_buffer, Data } void RenderingDeviceDriverMetal::buffer_free(BufferID p_buffer) { - rid::release(p_buffer); + BufferInfo *buf_info = (BufferInfo *)p_buffer.id; + buf_info->metal_buffer = nil; // Tell ARC to release. 
+ + if (buf_info->is_dynamic()) { + memdelete((MetalBufferDynamicInfo *)buf_info); + } else { + memdelete(buf_info); + } } uint64_t RenderingDeviceDriverMetal::buffer_get_allocation_size(BufferID p_buffer) { - id obj = rid::get(p_buffer); - return obj.allocatedSize; + const BufferInfo *buf_info = (const BufferInfo *)p_buffer.id; + return buf_info->metal_buffer.allocatedSize; } uint8_t *RenderingDeviceDriverMetal::buffer_map(BufferID p_buffer) { - id obj = rid::get(p_buffer); - ERR_FAIL_COND_V_MSG(obj.storageMode != MTLStorageModeShared, nullptr, "Unable to map private buffers"); - return (uint8_t *)obj.contents; + const BufferInfo *buf_info = (const BufferInfo *)p_buffer.id; + ERR_FAIL_COND_V_MSG(buf_info->metal_buffer.storageMode != MTLStorageModeShared, nullptr, "Unable to map private buffers"); + return (uint8_t *)buf_info->metal_buffer.contents; } void RenderingDeviceDriverMetal::buffer_unmap(BufferID p_buffer) { // Nothing to do. } +uint8_t *RenderingDeviceDriverMetal::buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) { + MetalBufferDynamicInfo *buf_info = (MetalBufferDynamicInfo *)p_buffer.id; + ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), nullptr, "Buffer must have BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. Use buffer_map() instead."); +#ifdef DEBUG_ENABLED + ERR_FAIL_COND_V_MSG(buf_info->last_frame_mapped == p_frames_drawn, nullptr, "Buffers with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT must only be mapped once per frame. Otherwise there could be race conditions with the GPU. Amalgamate all data uploading into one map(), use an extra buffer or remove the bit."); + buf_info->last_frame_mapped = p_frames_drawn; +#endif + return (uint8_t *)buf_info->metal_buffer.contents + buf_info->next_frame_index(frame_count) * buf_info->size_bytes; +} + +void RenderingDeviceDriverMetal::buffer_flush(BufferID p_buffer) { + // Nothing to do. +} + uint64_t RenderingDeviceDriverMetal::buffer_get_device_address(BufferID p_buffer) { if (@available(iOS 16.0, macOS 13.0, *)) { - id obj = rid::get(p_buffer); - return obj.gpuAddress; + const BufferInfo *buf_info = (const BufferInfo *)p_buffer.id; + return buf_info->metal_buffer.gpuAddress; } else { #if DEV_ENABLED WARN_PRINT_ONCE("buffer_get_device_address is not supported on this OS version."); @@ -1202,6 +1245,10 @@ RDD::ShaderID RenderingDeviceDriverMetal::shader_create_from_container(const Ref uint32_t uniform_sets_count = mtl_refl.uniform_sets.size(); uniform_sets.resize(uniform_sets_count); + DynamicOffsetLayout dynamic_offset_layout; + uint8_t dynamic_offset = 0; + uint8_t dynamic_count = 0; + // Create sets. 
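+	// While creating the sets below, every dynamic uniform gets a consecutive 4-bit slot in the
+	// packed dynamic-offsets mask via set_offset_count(). A hedged sketch of the lookup this
+	// enables (first_slot is a hypothetical name for whatever set_offset_count() records):
+	//
+	//   uint32_t DynamicOffsetLayout::get_offset_index_shift(uint32_t p_set, uint32_t p_nth) const {
+	//       return (first_slot[p_set] + p_nth) * 4u; // One nibble per dynamic buffer.
+	//   }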
for (uint32_t i = 0; i < uniform_sets_count; i++) { UniformSet &set = uniform_sets.write[i]; @@ -1215,6 +1262,16 @@ RDD::ShaderID RenderingDeviceDriverMetal::shader_create_from_container(const Ref const ShaderUniform &uniform = refl_set.ptr()[j]; const RSCM::UniformData &bind = mtl_set.ptr()[j]; + switch (uniform.type) { + case UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: + case UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: { + set.dynamic_uniforms.push_back(j); + dynamic_count++; + } break; + default: { + } break; + } + UniformInfo &ui = *iter; ++iter; ui.binding = uniform.binding; @@ -1235,6 +1292,11 @@ RDD::ShaderID RenderingDeviceDriverMetal::shader_create_from_container(const Ref ui.bindings_secondary.insert((RDC::ShaderStage)info.shader_stage, bi); } } + if (dynamic_count > 0) { + dynamic_offset_layout.set_offset_count(i, dynamic_offset, dynamic_count); + dynamic_offset += dynamic_count; + dynamic_count = 0; + } } for (uint32_t i = 0; i < uniform_sets_count; i++) { @@ -1329,6 +1391,8 @@ RDD::ShaderID RenderingDeviceDriverMetal::shader_create_from_container(const Ref shader = rs; } + shader->dynamic_offset_layout = dynamic_offset_layout; + return RDD::ShaderID(shader); } @@ -1365,6 +1429,38 @@ void RenderingDeviceDriverMetal::uniform_set_free(UniformSetID p_uniform_set) { memdelete(obj); } +uint32_t RenderingDeviceDriverMetal::uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const { + const MDShader *shader = (const MDShader *)p_shader.id; + const DynamicOffsetLayout layout = shader->dynamic_offset_layout; + + if (layout.is_empty()) { + return 0u; + } + + uint32_t mask = 0u; + + for (uint32_t i = 0; i < p_set_count; i++) { + const uint32_t index = p_first_set_index + i; + uint32_t shift = layout.get_offset_index_shift(index); + const uint32_t count = layout.get_count(index); + DEV_ASSERT(shader->sets[index].dynamic_uniforms.size() == count); + if (count == 0) { + continue; + } + + const MDUniformSet *usi = (const MDUniformSet *)p_uniform_sets[i].id; + for (uint32_t uniform_index : shader->sets[index].dynamic_uniforms) { + const RDD::BoundUniform &uniform = usi->uniforms[uniform_index]; + DEV_ASSERT(uniform.is_dynamic()); + const MetalBufferDynamicInfo *buf_info = (const MetalBufferDynamicInfo *)uniform.ids[0].id; + mask |= buf_info->frame_index() << shift; + shift += 4u; + } + } + + return mask; +} + void RenderingDeviceDriverMetal::command_uniform_set_prepare_for_use(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { } @@ -1372,26 +1468,25 @@ void RenderingDeviceDriverMetal::command_uniform_set_prepare_for_use(CommandBuff void RenderingDeviceDriverMetal::command_clear_buffer(CommandBufferID p_cmd_buffer, BufferID p_buffer, uint64_t p_offset, uint64_t p_size) { MDCommandBuffer *cmd = (MDCommandBuffer *)(p_cmd_buffer.id); - id buffer = rid::get(p_buffer); id blit = cmd->blit_command_encoder(); - [blit fillBuffer:buffer + [blit fillBuffer:((const BufferInfo *)p_buffer.id)->metal_buffer range:NSMakeRange(p_offset, p_size) value:0]; } void RenderingDeviceDriverMetal::command_copy_buffer(CommandBufferID p_cmd_buffer, BufferID p_src_buffer, BufferID p_dst_buffer, VectorView p_regions) { MDCommandBuffer *cmd = (MDCommandBuffer *)(p_cmd_buffer.id); - id src = rid::get(p_src_buffer); - id dst = rid::get(p_dst_buffer); + const BufferInfo *src = (const BufferInfo *)p_src_buffer.id; + const BufferInfo *dst = (const BufferInfo *)p_dst_buffer.id; id blit = cmd->blit_command_encoder(); 
for (uint32_t i = 0; i < p_regions.size(); i++) { BufferCopyRegion region = p_regions[i]; - [blit copyFromBuffer:src + [blit copyFromBuffer:src->metal_buffer sourceOffset:region.src_offset - toBuffer:dst + toBuffer:dst->metal_buffer destinationOffset:region.dst_offset size:region.size]; } @@ -1627,7 +1722,7 @@ void RenderingDeviceDriverMetal::_copy_texture_buffer(CommandBufferID p_cmd_buff BufferID p_buffer, VectorView p_regions) { MDCommandBuffer *cmd = (MDCommandBuffer *)(p_cmd_buffer.id); - id buffer = rid::get(p_buffer); + const BufferInfo *buffer = (const BufferInfo *)p_buffer.id; id texture = rid::get(p_texture); id enc = cmd->blit_command_encoder(); @@ -1683,7 +1778,7 @@ void RenderingDeviceDriverMetal::_copy_texture_buffer(CommandBufferID p_cmd_buff if (p_source == CopySource::Buffer) { for (uint32_t lyrIdx = 0; lyrIdx < region.texture_subresources.layer_count; lyrIdx++) { - [enc copyFromBuffer:buffer + [enc copyFromBuffer:buffer->metal_buffer sourceOffset:region.buffer_offset + (bytesPerImg * lyrIdx) sourceBytesPerRow:bytesPerRow sourceBytesPerImage:bytesPerImg @@ -1701,7 +1796,7 @@ void RenderingDeviceDriverMetal::_copy_texture_buffer(CommandBufferID p_cmd_buff sourceLevel:mip_level sourceOrigin:txt_origin sourceSize:txt_size - toBuffer:buffer + toBuffer:buffer->metal_buffer destinationOffset:region.buffer_offset + (bytesPerImg * lyrIdx) destinationBytesPerRow:bytesPerRow destinationBytesPerImage:bytesPerImg @@ -1896,14 +1991,9 @@ void RenderingDeviceDriverMetal::command_bind_render_pipeline(CommandBufferID p_ cb->bind_pipeline(p_pipeline); } -void RenderingDeviceDriverMetal::command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { +void RenderingDeviceDriverMetal::command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { MDCommandBuffer *cb = (MDCommandBuffer *)(p_cmd_buffer.id); - cb->render_bind_uniform_set(p_uniform_set, p_shader, p_set_index); -} - -void RenderingDeviceDriverMetal::command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { - MDCommandBuffer *cb = (MDCommandBuffer *)(p_cmd_buffer.id); - cb->render_bind_uniform_sets(p_uniform_sets, p_shader, p_first_set_index, p_set_count); + cb->render_bind_uniform_sets(p_uniform_sets, p_shader, p_first_set_index, p_set_count, p_dynamic_offsets); } void RenderingDeviceDriverMetal::command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) { @@ -2377,14 +2467,9 @@ void RenderingDeviceDriverMetal::command_bind_compute_pipeline(CommandBufferID p cb->bind_pipeline(p_pipeline); } -void RenderingDeviceDriverMetal::command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { +void RenderingDeviceDriverMetal::command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { MDCommandBuffer *cb = (MDCommandBuffer *)(p_cmd_buffer.id); - cb->compute_bind_uniform_set(p_uniform_set, p_shader, p_set_index); -} - -void RenderingDeviceDriverMetal::command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID 
p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { - MDCommandBuffer *cb = (MDCommandBuffer *)(p_cmd_buffer.id); - cb->compute_bind_uniform_sets(p_uniform_sets, p_shader, p_first_set_index, p_set_count); + cb->compute_bind_uniform_sets(p_uniform_sets, p_shader, p_first_set_index, p_set_count, p_dynamic_offsets); } void RenderingDeviceDriverMetal::command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) { @@ -2491,6 +2576,8 @@ void RenderingDeviceDriverMetal::command_insert_breadcrumb(CommandBufferID p_cmd #pragma mark - Submission void RenderingDeviceDriverMetal::begin_segment(uint32_t p_frame_index, uint32_t p_frames_drawn) { + frame_index = p_frame_index; + frames_drawn = p_frames_drawn; } void RenderingDeviceDriverMetal::end_segment() { @@ -2508,8 +2595,8 @@ void RenderingDeviceDriverMetal::set_object_name(ObjectType p_type, ID p_driver_ // Can't set label after creation. } break; case OBJECT_TYPE_BUFFER: { - id buffer = rid::get(p_driver_id); - buffer.label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; + const BufferInfo *buf_info = (const BufferInfo *)p_driver_id.id; + buf_info->metal_buffer.label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; } break; case OBJECT_TYPE_SHADER: { NSString *label = [NSString stringWithUTF8String:p_name.utf8().get_data()]; @@ -2898,6 +2985,8 @@ static MetalDeviceProfile device_profile_from_properties(MetalDeviceProperties * } break; } + res.update_options(); + return res; } @@ -2912,6 +3001,8 @@ Error RenderingDeviceDriverMetal::initialize(uint32_t p_device_index, uint32_t p _check_capabilities(); + frame_count = p_frame_count; + // Set the pipeline cache ID based on the Metal version. pipeline_cache_id = "metal-driver-" + get_api_version(); diff --git a/drivers/metal/rendering_shader_container_metal.h b/drivers/metal/rendering_shader_container_metal.h index 1d30eb5060b..b85cfad9e37 100644 --- a/drivers/metal/rendering_shader_container_metal.h +++ b/drivers/metal/rendering_shader_container_metal.h @@ -100,12 +100,26 @@ struct MetalDeviceProfile { bool simdPermute = false; }; + /** + * @brief Options to configure the Metal device profile. + * + * This structure allows customization of the Metal device profile, + * such as the argument buffers tier, which can affect how shaders are compiled. + */ + struct Options { + ArgumentBuffersTier argument_buffers_tier = ArgumentBuffersTier::Tier1; + }; + Platform platform = Platform::macOS; GPU gpu = GPU::Apple4; Features features; + Options options; static const MetalDeviceProfile *get_profile(Platform p_platform, GPU p_gpu); + // Configure any options for the device profile, which may include overrides from the environment. 
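+	// (Currently the only override handled is the GODOT_MTL_ARGUMENT_BUFFERS_TIER environment
+	// variable; see MetalDeviceProfile::update_options() in rendering_shader_container_metal.mm.)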
+ void update_options(); + MetalDeviceProfile() = default; private: diff --git a/drivers/metal/rendering_shader_container_metal.mm b/drivers/metal/rendering_shader_container_metal.mm index 65efa989785..596d839f792 100644 --- a/drivers/metal/rendering_shader_container_metal.mm +++ b/drivers/metal/rendering_shader_container_metal.mm @@ -84,9 +84,35 @@ const MetalDeviceProfile *MetalDeviceProfile::get_profile(MetalDeviceProfile::Pl res.features.mslVersionMinor = 1; } + res.update_options(); + return &profiles.insert(key, res)->value; } +void MetalDeviceProfile::update_options() { + options.argument_buffers_tier = features.argument_buffers_tier; + + if (OS::get_singleton()->has_environment(U"GODOT_MTL_ARGUMENT_BUFFERS_TIER")) { + uint64_t tier = OS::get_singleton()->get_environment(U"GODOT_MTL_ARGUMENT_BUFFERS_TIER").to_int(); + switch (tier) { + case 1: + // All devices support tier 1 argument buffers. + options.argument_buffers_tier = ArgumentBuffersTier::Tier1; + break; + case 2: + if (features.argument_buffers_tier >= ArgumentBuffersTier::Tier2) { + options.argument_buffers_tier = ArgumentBuffersTier::Tier2; + } else { + WARN_PRINT("Current device does not support tier 2 argument buffers, leaving as default."); + } + break; + default: + WARN_PRINT(vformat("Invalid value for GODOT_MTL_ARGUMENT_BUFFERS_TIER: %d. Falling back to device default.", tier)); + break; + } + } +} + void RenderingShaderContainerMetal::_initialize_toolchain_properties() { if (compiler_props.is_valid()) { return; } @@ -313,12 +339,9 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(Span - bool disable_argument_buffers = false; - if (String v = OS::get_singleton()->get_environment("GODOT_MTL_DISABLE_ARGUMENT_BUFFERS"); v == "1") { - disable_argument_buffers = true; - } + bool argument_buffers_allowed = get_shader_reflection().has_dynamic_buffers == false; - if (device_profile->features.argument_buffers_tier >= MetalDeviceProfile::ArgumentBuffersTier::Tier2 && !disable_argument_buffers) { + if (device_profile->options.argument_buffers_tier >= MetalDeviceProfile::ArgumentBuffersTier::Tier2 && argument_buffers_allowed) { msl_options.argument_buffers_tier = CompilerMSL::Options::ArgumentBuffersTier::Tier2; msl_options.argument_buffers = true; mtl_reflection_data.set_uses_argument_buffers(true); diff --git a/drivers/vulkan/rendering_device_driver_vulkan.cpp b/drivers/vulkan/rendering_device_driver_vulkan.cpp index 90291bd9447..a340fe566ca 100644 --- a/drivers/vulkan/rendering_device_driver_vulkan.cpp +++ b/drivers/vulkan/rendering_device_driver_vulkan.cpp @@ -58,6 +58,8 @@ static const uint32_t BREADCRUMB_BUFFER_ENTRIES = 512u; #endif +static const uint32_t MAX_DYNAMIC_BUFFERS = 8u; // Minimum guaranteed by Vulkan.
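+// Each bound dynamic buffer contributes a 4-bit frame index to one packed uint32_t
+// (p_dynamic_offsets), which is why 8 buffers is also the natural cap: 8 x 4 bits = 32 bits.
+// A hedged sketch of the scheme shared by uniform_sets_get_dynamic_offsets() and the
+// command_bind_*_uniform_sets() unpacking loops (illustrative only):
+//
+//   uint32_t pack_frame_indices(const uint32_t *p_frame_indices, uint32_t p_count) {
+//       uint32_t mask = 0u;
+//       for (uint32_t i = 0u; i < p_count; i++) {
+//           DEV_ASSERT(p_frame_indices[i] < 16u); // Must fit in one nibble.
+//           mask |= p_frame_indices[i] << (i * 4u);
+//       }
+//       return mask;
+//   }
+//
+//   uint32_t unpack_frame_index(uint32_t p_mask, uint32_t p_nth) {
+//       return (p_mask >> (p_nth * 4u)) & 0xFu;
+//   }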
+ static const VkFormat RD_TO_VK_FORMAT[RDD::DATA_FORMAT_MAX] = { VK_FORMAT_R4G4_UNORM_PACK8, VK_FORMAT_R4G4B4A4_UNORM_PACK16, @@ -1569,7 +1571,7 @@ Error RenderingDeviceDriverVulkan::initialize(uint32_t p_device_index, uint32_t max_descriptor_sets_per_pool = GLOBAL_GET("rendering/rendering_device/vulkan/max_descriptors_per_pool"); #if defined(DEBUG_ENABLED) || defined(DEV_ENABLED) - breadcrumb_buffer = buffer_create(2u * sizeof(uint32_t) * BREADCRUMB_BUFFER_ENTRIES, BufferUsageBits::BUFFER_USAGE_TRANSFER_TO_BIT, MemoryAllocationType::MEMORY_ALLOCATION_TYPE_CPU); + breadcrumb_buffer = buffer_create(2u * sizeof(uint32_t) * BREADCRUMB_BUFFER_ENTRIES, BufferUsageBits::BUFFER_USAGE_TRANSFER_TO_BIT, MemoryAllocationType::MEMORY_ALLOCATION_TYPE_CPU, UINT64_MAX); #endif #if defined(SWAPPY_FRAME_PACING_ENABLED) @@ -1634,11 +1636,28 @@ static_assert(ENUM_MEMBERS_EQUAL(RDD::BUFFER_USAGE_VERTEX_BIT, VK_BUFFER_USAGE_V static_assert(ENUM_MEMBERS_EQUAL(RDD::BUFFER_USAGE_INDIRECT_BIT, VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT)); static_assert(ENUM_MEMBERS_EQUAL(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT, VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT)); -RDD::BufferID RenderingDeviceDriverVulkan::buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) { +RDD::BufferID RenderingDeviceDriverVulkan::buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) { + uint32_t alignment = 16u; // 16 bytes is reasonable. + if (p_usage.has_flag(BUFFER_USAGE_UNIFORM_BIT)) { + // Some GPUs (e.g. NVIDIA) have absurdly high alignments, like 256 bytes. + alignment = MAX(alignment, physical_device_properties.limits.minUniformBufferOffsetAlignment); + } + if (p_usage.has_flag(BUFFER_USAGE_STORAGE_BIT)) { + // This shouldn't be a problem since it's often <= 16 bytes. But do it just in case. + alignment = MAX(alignment, physical_device_properties.limits.minStorageBufferOffsetAlignment); + } + // Align the size. This is specially important for BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT buffers. + // For the rest, it should work thanks to VMA taking care of the details. But still align just in case. + p_size = STEPIFY(p_size, alignment); + + const size_t original_size = p_size; + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + p_size = p_size * frame_count; + } VkBufferCreateInfo create_info = {}; create_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; create_info.size = p_size; - create_info.usage = p_usage; + create_info.usage = p_usage & ~BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT; create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; VmaMemoryUsage vma_usage = VMA_MEMORY_USAGE_UNKNOWN; @@ -1670,6 +1689,9 @@ RDD::BufferID RenderingDeviceDriverVulkan::buffer_create(uint64_t p_size, BitFie // We must set it right now or else vmaFindMemoryTypeIndexForBufferInfo will use wrong parameters. alloc_create_info.usage = vma_usage; } + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + alloc_create_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + } alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; if (p_size <= SMALL_ALLOCATION_MAX_SIZE) { uint32_t mem_type_index = 0; @@ -1698,11 +1720,26 @@ RDD::BufferID RenderingDeviceDriverVulkan::buffer_create(uint64_t p_size, BitFie } // Bookkeep. 
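+	// Dynamic persistent buffers were allocated frame_count times larger above and stay mapped
+	// for their whole lifetime; buffer_persistent_map_advance() then steps through one sub-range
+	// per frame, so the CPU never writes into a region the GPU may still be reading:
+	//
+	//   |<-- size -->|<-- size -->| ... |<-- size -->|   (frame_count sub-ranges)
+	//    frame_idx 0  frame_idx 1       frame_idx N-1
+	//
+	//   write pointer for the current frame = persistent_ptr + frame_idx * size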
- BufferInfo *buf_info = VersatileResource::allocate(resources_allocator); + BufferInfo *buf_info; + if (p_usage.has_flag(BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + void *persistent_ptr = nullptr; + VkResult err = vmaMapMemory(allocator, allocation, &persistent_ptr); + ERR_FAIL_COND_V_MSG(err, BufferID(), "vmaMapMemory failed with error " + itos(err) + "."); + + BufferDynamicInfo *dyn_buffer = VersatileResource::allocate(resources_allocator); + buf_info = dyn_buffer; +#ifdef DEBUG_ENABLED + dyn_buffer->last_frame_mapped = p_frames_drawn - 1ul; +#endif + dyn_buffer->frame_idx = 0u; + dyn_buffer->persistent_ptr = (uint8_t *)persistent_ptr; + } else { + buf_info = VersatileResource::allocate(resources_allocator); + } buf_info->vk_buffer = vk_buffer; buf_info->allocation.handle = allocation; buf_info->allocation.size = alloc_info.size; - buf_info->size = p_size; + buf_info->size = original_size; return BufferID(buf_info); } @@ -1730,6 +1767,10 @@ void RenderingDeviceDriverVulkan::buffer_free(BufferID p_buffer) { vkDestroyBufferView(vk_device, buf_info->vk_view, VKC::get_allocation_callbacks(VK_OBJECT_TYPE_BUFFER_VIEW)); } + if (buf_info->is_dynamic()) { + vmaUnmapMemory(allocator, buf_info->allocation.handle); + } + if (!Engine::get_singleton()->is_extra_gpu_memory_tracking_enabled()) { vmaDestroyBuffer(allocator, buf_info->vk_buffer, buf_info->allocation.handle); } else { @@ -1737,7 +1778,11 @@ void RenderingDeviceDriverVulkan::buffer_free(BufferID p_buffer) { vmaFreeMemory(allocator, buf_info->allocation.handle); } - VersatileResource::free(resources_allocator, buf_info); + if (buf_info->is_dynamic()) { + VersatileResource::free(resources_allocator, (BufferDynamicInfo *)buf_info); + } else { + VersatileResource::free(resources_allocator, buf_info); + } } uint64_t RenderingDeviceDriverVulkan::buffer_get_allocation_size(BufferID p_buffer) { @@ -1747,6 +1792,7 @@ uint64_t RenderingDeviceDriverVulkan::buffer_get_allocation_size(BufferID p_buff uint8_t *RenderingDeviceDriverVulkan::buffer_map(BufferID p_buffer) { const BufferInfo *buf_info = (const BufferInfo *)p_buffer.id; + ERR_FAIL_COND_V_MSG(buf_info->is_dynamic(), nullptr, "Buffer must NOT have BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. Use buffer_persistent_map_advance() instead."); void *data_ptr = nullptr; VkResult err = vmaMapMemory(allocator, buf_info->allocation.handle, &data_ptr); ERR_FAIL_COND_V_MSG(err, nullptr, "vmaMapMemory failed with error " + itos(err) + "."); @@ -1758,6 +1804,38 @@ void RenderingDeviceDriverVulkan::buffer_unmap(BufferID p_buffer) { vmaUnmapMemory(allocator, buf_info->allocation.handle); } +uint8_t *RenderingDeviceDriverVulkan::buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) { + BufferDynamicInfo *buf_info = (BufferDynamicInfo *)p_buffer.id; + ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), nullptr, "Buffer must have BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. Use buffer_map() instead."); +#ifdef DEBUG_ENABLED + ERR_FAIL_COND_V_MSG(buf_info->last_frame_mapped == p_frames_drawn, nullptr, "Buffers with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT must only be mapped once per frame. Otherwise there could be race conditions with the GPU. 
Amalgamate all data uploading into one map(), use an extra buffer or remove the bit."); + buf_info->last_frame_mapped = p_frames_drawn; +#endif + buf_info->frame_idx = (buf_info->frame_idx + 1u) % frame_count; + return buf_info->persistent_ptr + buf_info->frame_idx * buf_info->size; +} + +void RenderingDeviceDriverVulkan::buffer_flush(BufferID p_buffer) { + BufferDynamicInfo *buf_info = (BufferDynamicInfo *)p_buffer.id; + + VkMemoryPropertyFlags mem_props_flags; + vmaGetAllocationMemoryProperties(allocator, buf_info->allocation.handle, &mem_props_flags); + + const bool needs_flushing = !(mem_props_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + + if (needs_flushing) { + if (buf_info->is_dynamic()) { + pending_flushes.allocations.push_back(buf_info->allocation.handle); + pending_flushes.offsets.push_back(buf_info->frame_idx * buf_info->size); + pending_flushes.sizes.push_back(buf_info->size); + } else { + pending_flushes.allocations.push_back(buf_info->allocation.handle); + pending_flushes.offsets.push_back(0u); + pending_flushes.sizes.push_back(VK_WHOLE_SIZE); + } + } +} + uint64_t RenderingDeviceDriverVulkan::buffer_get_device_address(BufferID p_buffer) { const BufferInfo *buf_info = (const BufferInfo *)p_buffer.id; VkBufferDeviceAddressInfo address_info = {}; @@ -2780,6 +2858,18 @@ Error RenderingDeviceDriverVulkan::command_queue_execute_and_present(CommandQueu wait_semaphores_stages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); } + if (!pending_flushes.allocations.is_empty()) { + // We must do this now, even if p_cmd_buffers is empty; because afterwards pending_flushes.allocations + // could become dangling. We cannot delay this call for the next frame(s). + err = vmaFlushAllocations(allocator, pending_flushes.allocations.size(), + pending_flushes.allocations.ptr(), pending_flushes.offsets.ptr(), + pending_flushes.sizes.ptr()); + pending_flushes.allocations.clear(); + pending_flushes.offsets.clear(); + pending_flushes.sizes.clear(); + ERR_FAIL_COND_V(err != VK_SUCCESS, FAILED); + } + if (p_cmd_buffers.size() > 0) { thread_local LocalVector command_buffers; thread_local LocalVector present_semaphores; @@ -3713,9 +3803,15 @@ RDD::ShaderID RenderingDeviceDriverVulkan::shader_create_from_container(const Re case UNIFORM_TYPE_UNIFORM_BUFFER: { layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; } break; + case UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: { + layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + } break; case UNIFORM_TYPE_STORAGE_BUFFER: { layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; } break; + case UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { + layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC; + } break; case UNIFORM_TYPE_INPUT_ATTACHMENT: { layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; } break; @@ -3942,6 +4038,13 @@ VkDescriptorPool RenderingDeviceDriverVulkan::_descriptor_set_pool_find_or_creat curr_vk_size++; vk_sizes_count++; } + if (p_key.uniform_type[UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC]) { + *curr_vk_size = {}; + curr_vk_size->type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + curr_vk_size->descriptorCount = p_key.uniform_type[UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC] * max_descriptor_sets_per_pool; + curr_vk_size++; + vk_sizes_count++; + } if (p_key.uniform_type[UNIFORM_TYPE_STORAGE_BUFFER]) { *curr_vk_size = {}; curr_vk_size->type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; @@ -3949,6 +4052,13 @@ VkDescriptorPool RenderingDeviceDriverVulkan::_descriptor_set_pool_find_or_creat 
curr_vk_size++; vk_sizes_count++; } + if (p_key.uniform_type[UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC]) { + *curr_vk_size = {}; + curr_vk_size->type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC; + curr_vk_size->descriptorCount = p_key.uniform_type[UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC] * max_descriptor_sets_per_pool; + curr_vk_size++; + vk_sizes_count++; + } if (p_key.uniform_type[UNIFORM_TYPE_INPUT_ATTACHMENT]) { *curr_vk_size = {}; curr_vk_size->type = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; @@ -4012,6 +4122,12 @@ RDD::UniformSetID RenderingDeviceDriverVulkan::uniform_set_create(VectorViewbuffer = buf_info->vk_buffer; vk_buf_info->range = buf_info->size; + ERR_FAIL_COND_V_MSG(buf_info->is_dynamic(), UniformSetID(), + "Sent a buffer with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_UNIFORM_BUFFER instead of UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC."); + vk_writes[writes_amount].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; vk_writes[writes_amount].pBufferInfo = vk_buf_info; } break; + case UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: { + const BufferInfo *buf_info = (const BufferInfo *)uniform.ids[0].id; + VkDescriptorBufferInfo *vk_buf_info = ALLOCA_SINGLE(VkDescriptorBufferInfo); + *vk_buf_info = {}; + vk_buf_info->buffer = buf_info->vk_buffer; + vk_buf_info->range = buf_info->size; + + ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), UniformSetID(), + "Sent a buffer without BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC instead of UNIFORM_TYPE_UNIFORM_BUFFER."); + ERR_FAIL_COND_V_MSG(num_dynamic_buffers >= MAX_DYNAMIC_BUFFERS, UniformSetID(), + "Uniform set exceeded the limit of dynamic/persistent buffers. (" + itos(MAX_DYNAMIC_BUFFERS) + ")."); + + dynamic_buffers[num_dynamic_buffers++] = buf_info; + vk_writes[writes_amount].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + vk_writes[writes_amount].pBufferInfo = vk_buf_info; + } break; case UNIFORM_TYPE_STORAGE_BUFFER: { const BufferInfo *buf_info = (const BufferInfo *)uniform.ids[0].id; VkDescriptorBufferInfo *vk_buf_info = ALLOCA_SINGLE(VkDescriptorBufferInfo); @@ -4157,9 +4292,28 @@ RDD::UniformSetID RenderingDeviceDriverVulkan::uniform_set_create(VectorViewbuffer = buf_info->vk_buffer; vk_buf_info->range = buf_info->size; + ERR_FAIL_COND_V_MSG(buf_info->is_dynamic(), UniformSetID(), + "Sent a buffer with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_STORAGE_BUFFER instead of UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC."); + vk_writes[writes_amount].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; vk_writes[writes_amount].pBufferInfo = vk_buf_info; } break; + case UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { + const BufferInfo *buf_info = (const BufferInfo *)uniform.ids[0].id; + VkDescriptorBufferInfo *vk_buf_info = ALLOCA_SINGLE(VkDescriptorBufferInfo); + *vk_buf_info = {}; + vk_buf_info->buffer = buf_info->vk_buffer; + vk_buf_info->range = buf_info->size; + + ERR_FAIL_COND_V_MSG(!buf_info->is_dynamic(), UniformSetID(), + "Sent a buffer without BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT but binding (" + itos(uniform.binding) + "), set (" + itos(p_set_index) + ") is UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC instead of UNIFORM_TYPE_STORAGE_BUFFER."); + ERR_FAIL_COND_V_MSG(num_dynamic_buffers >= MAX_DYNAMIC_BUFFERS, UniformSetID(), + "Uniform set exceeded the limit of dynamic/persistent buffers. 
(" + itos(MAX_DYNAMIC_BUFFERS) + ")."); + + dynamic_buffers[num_dynamic_buffers++] = buf_info; + vk_writes[writes_amount].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC; + vk_writes[writes_amount].pBufferInfo = vk_buf_info; + } break; case UNIFORM_TYPE_INPUT_ATTACHMENT: { num_descriptors = uniform.ids.size(); VkDescriptorImageInfo *vk_img_infos = ALLOCA_ARRAY(VkDescriptorImageInfo, num_descriptors); @@ -4223,6 +4377,10 @@ RDD::UniformSetID RenderingDeviceDriverVulkan::uniform_set_create(VectorViewvk_descriptor_pool = vk_pool; } usi->pool_sets_it = pool_sets_it; + usi->dynamic_buffers.resize(num_dynamic_buffers); + for (uint32_t i = 0u; i < num_dynamic_buffers; ++i) { + usi->dynamic_buffers[i] = dynamic_buffers[i]; + } return UniformSetID(usi); } @@ -4249,6 +4407,31 @@ bool RenderingDeviceDriverVulkan::uniform_sets_have_linear_pools() const { return true; } +uint32_t RenderingDeviceDriverVulkan::uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const { + uint32_t mask = 0u; + uint32_t shift = 0u; +#ifdef DEV_ENABLED + uint32_t curr_dynamic_offset = 0u; +#endif + + for (uint32_t i = 0; i < p_set_count; i++) { + const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_sets[i].id; + // At this point this assert should already have been validated. + DEV_ASSERT(curr_dynamic_offset + usi->dynamic_buffers.size() <= MAX_DYNAMIC_BUFFERS); + + for (const BufferInfo *dynamic_buffer : usi->dynamic_buffers) { + DEV_ASSERT(dynamic_buffer->frame_idx < 16u); + mask |= dynamic_buffer->frame_idx << shift; + shift += 4u; + } +#ifdef DEV_ENABLED + curr_dynamic_offset += usi->dynamic_buffers.size(); +#endif + } + + return mask; +} + void RenderingDeviceDriverVulkan::linear_uniform_set_pools_reset(int p_linear_pool_index) { if (linear_descriptor_pools_enabled) { DescriptorSetPools &pools_to_reset = linear_descriptor_set_pools[p_linear_pool_index]; @@ -4844,14 +5027,7 @@ void RenderingDeviceDriverVulkan::command_bind_render_pipeline(CommandBufferID p vkCmdBindPipeline(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, (VkPipeline)p_pipeline.id); } -void RenderingDeviceDriverVulkan::command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { - const CommandBufferInfo *command_buffer = (const CommandBufferInfo *)p_cmd_buffer.id; - const ShaderInfo *shader_info = (const ShaderInfo *)p_shader.id; - const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_set.id; - vkCmdBindDescriptorSets(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, shader_info->vk_pipeline_layout, p_set_index, 1, &usi->vk_descriptor_set, 0, nullptr); -} - -void RenderingDeviceDriverVulkan::command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { +void RenderingDeviceDriverVulkan::command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { if (p_set_count == 0) { return; } @@ -4860,13 +5036,29 @@ void RenderingDeviceDriverVulkan::command_bind_render_uniform_sets(CommandBuffer sets.clear(); sets.resize(p_set_count); + uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; + uint32_t shift = 0u; + uint32_t curr_dynamic_offset = 0u; + for (uint32_t i = 0; i < p_set_count; i++) { - sets[i] = ((const UniformSetInfo 
*)p_uniform_sets[i].id)->vk_descriptor_set; + const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_sets[i].id; + + sets[i] = usi->vk_descriptor_set; + + // At this point this assert should already have been validated. + DEV_ASSERT(curr_dynamic_offset + usi->dynamic_buffers.size() <= MAX_DYNAMIC_BUFFERS); + + const uint32_t dynamic_offset_count = usi->dynamic_buffers.size(); + for (uint32_t j = 0u; j < dynamic_offset_count; ++j) { + const uint32_t frame_idx = (p_dynamic_offsets >> shift) & 0xFu; + shift += 4u; + dynamic_offsets[curr_dynamic_offset++] = uint32_t(frame_idx * usi->dynamic_buffers[j]->size); + } } const CommandBufferInfo *command_buffer = (const CommandBufferInfo *)p_cmd_buffer.id; const ShaderInfo *shader_info = (const ShaderInfo *)p_shader.id; - vkCmdBindDescriptorSets(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, shader_info->vk_pipeline_layout, p_first_set_index, p_set_count, &sets[0], 0, nullptr); + vkCmdBindDescriptorSets(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, shader_info->vk_pipeline_layout, p_first_set_index, p_set_count, &sets[0], curr_dynamic_offset, dynamic_offsets); } void RenderingDeviceDriverVulkan::command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) { @@ -5290,14 +5482,7 @@ void RenderingDeviceDriverVulkan::command_bind_compute_pipeline(CommandBufferID vkCmdBindPipeline(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, (VkPipeline)p_pipeline.id); } -void RenderingDeviceDriverVulkan::command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) { - const CommandBufferInfo *command_buffer = (const CommandBufferInfo *)p_cmd_buffer.id; - const ShaderInfo *shader_info = (const ShaderInfo *)p_shader.id; - const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_set.id; - vkCmdBindDescriptorSets(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, shader_info->vk_pipeline_layout, p_set_index, 1, &usi->vk_descriptor_set, 0, nullptr); -} - -void RenderingDeviceDriverVulkan::command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { +void RenderingDeviceDriverVulkan::command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) { if (p_set_count == 0) { return; } @@ -5306,13 +5491,29 @@ void RenderingDeviceDriverVulkan::command_bind_compute_uniform_sets(CommandBuffe sets.clear(); sets.resize(p_set_count); + uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; + uint32_t shift = 0u; + uint32_t curr_dynamic_offset = 0u; + for (uint32_t i = 0; i < p_set_count; i++) { - sets[i] = ((const UniformSetInfo *)p_uniform_sets[i].id)->vk_descriptor_set; + const UniformSetInfo *usi = (const UniformSetInfo *)p_uniform_sets[i].id; + + sets[i] = usi->vk_descriptor_set; + + // At this point this assert should already have been validated. 
+ DEV_ASSERT(curr_dynamic_offset + usi->dynamic_buffers.size() <= MAX_DYNAMIC_BUFFERS); + + const uint32_t dynamic_offset_count = usi->dynamic_buffers.size(); + for (uint32_t j = 0u; j < dynamic_offset_count; ++j) { + const uint32_t frame_idx = (p_dynamic_offsets >> shift) & 0xFu; + shift += 4u; + dynamic_offsets[curr_dynamic_offset++] = uint32_t(frame_idx * usi->dynamic_buffers[j]->size); + } } const CommandBufferInfo *command_buffer = (const CommandBufferInfo *)p_cmd_buffer.id; const ShaderInfo *shader_info = (const ShaderInfo *)p_shader.id; - vkCmdBindDescriptorSets(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, shader_info->vk_pipeline_layout, p_first_set_index, p_set_count, &sets[0], 0, nullptr); + vkCmdBindDescriptorSets(command_buffer->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, shader_info->vk_pipeline_layout, p_first_set_index, p_set_count, &sets[0], curr_dynamic_offset, dynamic_offsets); } void RenderingDeviceDriverVulkan::command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) { diff --git a/drivers/vulkan/rendering_device_driver_vulkan.h b/drivers/vulkan/rendering_device_driver_vulkan.h index ff3fe7963fe..8789fde04c5 100644 --- a/drivers/vulkan/rendering_device_driver_vulkan.h +++ b/drivers/vulkan/rendering_device_driver_vulkan.h @@ -147,6 +147,14 @@ class RenderingDeviceDriverVulkan : public RenderingDeviceDriver { #endif DeviceFunctions device_functions; + struct PendingFlushes { + LocalVector allocations; + LocalVector offsets; + LocalVector sizes; + }; + + PendingFlushes pending_flushes; + void _register_requested_device_extension(const CharString &p_extension_name, bool p_required); Error _initialize_device_extensions(); Error _check_device_features(); @@ -194,14 +202,29 @@ public: } allocation; uint64_t size = 0; VkBufferView vk_view = VK_NULL_HANDLE; // For texel buffers. + // If dynamic buffer, then its range is [0; RenderingDeviceDriverVulkan::frame_count) + // else it's UINT32_MAX. + uint32_t frame_idx = UINT32_MAX; + + bool is_dynamic() const { return frame_idx != UINT32_MAX; } }; - virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) override final; + struct BufferDynamicInfo : BufferInfo { + uint8_t *persistent_ptr = nullptr; +#ifdef DEBUG_ENABLED + // For tracking that a persistent buffer isn't mapped twice in the same frame. 
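+		// (Compared against p_frames_drawn by buffer_persistent_map_advance(); mapping twice in
+		// the same frame fails in debug builds, since a second map could hand out a sub-range
+		// the GPU is still reading from.)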
+ uint64_t last_frame_mapped = 0; +#endif + }; + + virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) override final; virtual bool buffer_set_texel_format(BufferID p_buffer, DataFormat p_format) override final; virtual void buffer_free(BufferID p_buffer) override final; virtual uint64_t buffer_get_allocation_size(BufferID p_buffer) override final; virtual uint8_t *buffer_map(BufferID p_buffer) override final; virtual void buffer_unmap(BufferID p_buffer) override final; + virtual uint8_t *buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) override final; + virtual void buffer_flush(BufferID p_buffer) override final; virtual uint64_t buffer_get_device_address(BufferID p_buffer) override final; /*****************/ @@ -473,6 +496,7 @@ private: VkDescriptorPool vk_descriptor_pool = VK_NULL_HANDLE; VkDescriptorPool vk_linear_descriptor_pool = VK_NULL_HANDLE; DescriptorSetPools::Iterator pool_sets_it; + TightLocalVector dynamic_buffers; }; public: @@ -480,6 +504,7 @@ public: virtual void linear_uniform_set_pools_reset(int p_linear_pool_index) override final; virtual void uniform_set_free(UniformSetID p_uniform_set) override final; virtual bool uniform_sets_have_linear_pools() const override final; + virtual uint32_t uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const override final; // ----- COMMANDS ----- @@ -567,8 +592,7 @@ public: // Binding. virtual void command_bind_render_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final; - virtual void command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final; - virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final; + virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final; // Drawing. virtual void command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) override final; @@ -610,8 +634,7 @@ public: // Binding. virtual void command_bind_compute_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final; - virtual void command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final; - virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final; + virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final; // Dispatching. 
virtual void command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) override final; diff --git a/editor/export/shader_baker_export_plugin.cpp b/editor/export/shader_baker_export_plugin.cpp index 51535cce880..7d1dba6136c 100644 --- a/editor/export/shader_baker_export_plugin.cpp +++ b/editor/export/shader_baker_export_plugin.cpp @@ -413,6 +413,7 @@ void ShaderBakerExportPlugin::_customize_shader_version(ShaderRD *p_shader, RID work_item.cache_path = group_items[group].cache_path; work_item.shader_name = p_shader->get_name(); work_item.stage_sources = p_shader->version_build_variant_stage_sources(p_version, i); + work_item.dynamic_buffers = p_shader->get_dynamic_buffers(); work_item.variant = i; WorkerThreadPool::TaskID task_id = WorkerThreadPool::get_singleton()->add_template_task(this, &ShaderBakerExportPlugin::_process_work_item, work_item); @@ -428,7 +429,7 @@ void ShaderBakerExportPlugin::_customize_shader_version(ShaderRD *p_shader, RID void ShaderBakerExportPlugin::_process_work_item(WorkItem p_work_item) { if (!tasks_cancelled) { // Only process the item if the tasks haven't been cancelled by the user yet. - Vector spirv_data = ShaderRD::compile_stages(p_work_item.stage_sources); + Vector spirv_data = ShaderRD::compile_stages(p_work_item.stage_sources, p_work_item.dynamic_buffers); ERR_FAIL_COND_MSG(spirv_data.is_empty(), "Unable to retrieve SPIR-V data for shader"); Ref shader_container = shader_container_format->create_container(); diff --git a/editor/export/shader_baker_export_plugin.h b/editor/export/shader_baker_export_plugin.h index 208814492a5..11b2446b926 100644 --- a/editor/export/shader_baker_export_plugin.h +++ b/editor/export/shader_baker_export_plugin.h @@ -52,6 +52,7 @@ protected: String cache_path; String shader_name; Vector stage_sources; + Vector dynamic_buffers; int64_t variant = 0; }; diff --git a/servers/rendering/multi_uma_buffer.h b/servers/rendering/multi_uma_buffer.h new file mode 100644 index 00000000000..605a6aac018 --- /dev/null +++ b/servers/rendering/multi_uma_buffer.h @@ -0,0 +1,384 @@ +/**************************************************************************/ +/* multi_uma_buffer.h */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#pragma once + +#include "servers/rendering/rendering_server.h" + +class MultiUmaBufferBase { +protected: + LocalVector<RID> buffers; + uint32_t curr_idx = UINT32_MAX; + uint64_t last_frame_mapped = UINT64_MAX; + const uint32_t max_extra_buffers; +#ifdef DEBUG_ENABLED + const char *debug_name; +#endif + + MultiUmaBufferBase(uint32_t p_max_extra_buffers, const char *p_debug_name) : + max_extra_buffers(p_max_extra_buffers) +#ifdef DEBUG_ENABLED + , + debug_name(p_debug_name) +#endif + { + } + +#ifdef DEV_ENABLED + ~MultiUmaBufferBase() { + DEV_ASSERT(buffers.is_empty() && "Forgot to call uninit()!"); + } +#endif + +public: + void uninit() { + if (is_print_verbose_enabled()) { + print_line("MultiUmaBuffer '" +#ifdef DEBUG_ENABLED + + String(debug_name) + +#else + "{DEBUG_ENABLED unavailable}" +#endif + "' used a total of " + itos(buffers.size()) + + " buffers. A large number may indicate a waste of VRAM and can be brought down by tweaking MAX_EXTRA_BUFFERS for this buffer."); + } + + RenderingDevice *rd = RD::RenderingDevice::get_singleton(); + + for (RID buffer : buffers) { + if (buffer.is_valid()) { + rd->free_rid(buffer); + } + } + + buffers.clear(); + } + + void shrink_to_max_extra_buffers() { + DEV_ASSERT(curr_idx == 0u && "This function can only be called after reset and before upload_and_advance is called again!"); + + RenderingDevice *rd = RD::RenderingDevice::get_singleton(); + + uint32_t elem_count = buffers.size(); + + if (elem_count > max_extra_buffers) { + if (is_print_verbose_enabled()) { + print_line("MultiUmaBuffer '" +#ifdef DEBUG_ENABLED + + String(debug_name) + +#else + "{DEBUG_ENABLED unavailable}" +#endif + "' peaked to " + itos(elem_count) + " elements and shrinking it to " + itos(max_extra_buffers) + + ". If you see this message often, then something is wrong with rendering or MAX_EXTRA_BUFFERS needs to be increased."); + } + } + + while (elem_count > max_extra_buffers) { + --elem_count; + if (buffers[elem_count].is_valid()) { + rd->free_rid(buffers[elem_count]); + } + buffers.remove_at(elem_count); + } + } +}; + +/// Interface for making it easier to work with UMA. +/// +/// # What is UMA? +/// +/// It stands for Unified Memory Architecture. There are two kinds of UMA: +/// 1. HW UMA. This is the case of iGPUs (especially Android, iOS, Apple ARM-based macOS, PS4 & PS5). +/// The CPU and GPU share the same die and same memory. So regular RAM and VRAM are internally the +/// same thing. There may be some differences between them in practice due to cache synchronization +/// behaviors, or because regular RAM bandwidth may be purposely throttled (as is the case of PS4 & PS5). +/// 2. "Pretended UMA". On PC, desktop GPUs with ReBAR enabled can pretend VRAM behaves like normal +/// RAM, while internally the data is moved across the PCIe Bus. This can cause differences +/// in execution time of the routines that write to GPU buffers as the region is often uncached +/// (i.e. write-combined) and PCIe latency and bandwidth are vastly different from regular RAM. +/// Without ReBAR, the amount of UMA memory is limited to 256MB (shared by the entire system).
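+///
+/// As a rough sketch of what "uncached / write-combined" means in practice (illustrative
+/// only; `dst`, `src` and `size_bytes` are placeholders, not part of this API):
+///
+/// uint8_t *dst = (uint8_t *)some_mapped_write_combined_memory;
+/// memcpy(dst, src, size_bytes); // OK: sequential, write-only access.
+/// dst[0] += 1; // Avoid: reading back from write-combined memory bypasses
+/// // the CPU caches and is extremely slow.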
+/// +/// Since often this type of memory is uncached, it is not well-suited for downloading GPU -> CPU, +/// but rather for uploading CPU -> GPU. +/// +/// # When to use UMA buffers? +/// +/// UMA buffers have various caveats and improper usage might lead to visual glitches. Therefore they +/// should be used sparingly, where it makes a difference. Do all of the following checks pass? +/// 1. Data is uploaded from CPU to GPU every (or almost every) frame. +/// 2. Data is always uploaded from scratch. Partial uploads are unsupported. +/// 3. If uploading multiple times per frame (e.g. for multiple passes), the number of uploads +/// per frame is relatively stable (occasional spikes are fine if using MAX_EXTRA_BUFFERS). +/// +/// # Why the caveats? +/// +/// This is due to our inability to detect race conditions. If you write to a UMA buffer, submit +/// GPU commands and then write more data to it, we can't guarantee that you won't be writing to a +/// region the GPU is currently reading from. Tools like the validation layers cannot detect this +/// race condition at all, making it very hard to troubleshoot. +/// +/// Therefore the safest approach is to use an interface that forces users to upload everything at once. +/// There is one exception for performance: map_raw_for_upload() will return a pointer, and it is your +/// responsibility to make sure you don't use that pointer again after submitting. +/// USE THIS API CALL SPARINGLY AND WITH CARE. +/// +/// Since we forbid uploading more data to a buffer once it has been uploaded to, this interface will create +/// more buffers. This means users will need more UniformSets (i.e. uniform_set_create). +/// +/// # How to use +/// +/// Example code 01: +/// MultiUmaBuffer<1> uma_buffer = MultiUmaBuffer<1>("Debug name displayed if run with --verbose"); +/// uma_buffer.set_size(0, max_size_bytes, false); +/// +/// for(uint32_t i = 0u; i < num_passes; ++i) { +/// uma_buffer.prepare_for_upload(); // Creates a new buffer (if none exists already) +/// // of max_size_bytes. Must be called. +/// uma_buffer.upload(0, src_data, size_bytes); +/// +/// if(!uniform_set[i]) { +/// RD::Uniform u; +/// u.binding = 1; +/// u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC; +/// u.append_id(uma_buffer._get(0u)); +/// uniform_set[i] = rd->uniform_set_create( ... ); +/// } +/// } +/// +/// // On shutdown (or if you need to call set_size again). +/// uma_buffer.uninit(); +/// +/// Example code 02: +/// +/// uma_buffer.prepare_for_upload(); +/// RID rid = uma_buffer.get_for_upload(0u); +/// rd->buffer_update(rid, 0, sizeof(BakeParameters), &bake_parameters); +/// RD::Uniform u; // Skipping full initialization of u. See Example 01. +/// u.append_id(rid); +/// +/// Example code 03: +/// +/// void *dst_data = uma_buffer.map_raw_for_upload(0u); +/// memcpy(dst_data, src_data, size_bytes); +/// rd->buffer_flush(uma_buffer._get(0u)); +/// RD::Uniform u; // Skipping full initialization of u. See Example 01. +/// u.append_id(uma_buffer._get(0u)); +/// +/// # Tricks +/// +/// Godot's shadow mapping code calls uma_buffer.uniform_buffers._get(-p_pass_offset) (i.e.
a negative value) +/// because for various reasons its shadow mapping code was written like this: +/// +/// for( uint32_t i = 0u; i < num_passes; ++i ) { +/// uma_buffer.prepare_for_upload(); +/// uma_buffer.upload(0, src_data, size_bytes); +/// } +/// for( uint32_t i = 0u; i < num_passes; ++i ) { +/// RD::Uniform u; +/// u.binding = 1; +/// u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC; +/// u.append_id(uma_buffer._get(-(num_passes - 1u - i))); +/// uniform_set[i] = rd->uniform_set_create( ... ); +/// } +/// +/// Every time prepare_for_upload() is called, uma_buffer._get(-idx) will return a different RID(*). +/// Thus with a negative value we can address previous ones. This is fine as long as the value idx +/// doesn't exceed the number of times the user called prepare_for_upload() for this frame. +/// +/// (*) This RID will be returned again on the next frame after the same number of prepare_for_upload() +/// calls; unless the number of times it was called exceeded MAX_EXTRA_BUFFERS. +/// +/// # Template parameters +/// +/// ## NUM_BUFFERS +/// +/// How many buffers to track. E.g. instead of doing this: +/// MultiUmaBuffer<1> omni_lights = /*...*/; +/// MultiUmaBuffer<1> spot_lights = /*...*/; +/// MultiUmaBuffer<1> directional_lights = /*...*/; +/// +/// omni_lights.set_size(0u, omni_size); +/// spot_lights.set_size(0u, spot_size); +/// directional_lights.set_size(0u, dir_size); +/// +/// omni_lights.prepare_for_upload(); +/// spot_lights.prepare_for_upload(); +/// directional_lights.prepare_for_upload(); +/// +/// You can do this: +/// +/// MultiUmaBuffer<3> lights = /*...*/; +/// +/// lights.set_size(0u, omni_size); +/// lights.set_size(1u, spot_size); +/// lights.set_size(2u, dir_size); +/// +/// lights.prepare_for_upload(); +/// +/// This approach works as long as all buffers need prepare_for_upload() called at the same time. +/// It saves some overhead. +/// +/// ## MAX_EXTRA_BUFFERS +/// +/// Upper limit on the number of buffers per frame. +/// +/// There are times when rendering might spike for exceptional reasons, calling prepare_for_upload() +/// many more times than usual, never to do so again. This causes an increase in memory usage that +/// is never reclaimed until shutdown. +/// +/// MAX_EXTRA_BUFFERS can be used to handle such spikes, by deallocating the extra buffers. +/// Example: +/// MultiUmaBuffer<1, 6> buffer; +/// +/// // Normal frame (assuming up to 6 passes is considered normal): +/// for(uint32_t i = 0u; i < 6u; ++i) { +/// buffer.prepare_for_upload(); +/// ... +/// buffer.upload(...); +/// } +/// +/// // Exceptional frame: +/// for(uint32_t i = 0u; i < 24u; ++i) { +/// buffer.prepare_for_upload(); +/// ... +/// buffer.upload(...); +/// } +/// +/// After the frame is done, those extra 18 buffers will be deleted. +/// Launching Godot with --verbose will print diagnostic information.
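+///
+/// # Per-frame reuse (condensed sketch)
+///
+/// A condensed sketch of the reuse rule described in # Tricks (it assumes
+/// MAX_EXTRA_BUFFERS is never exceeded; RIDs A and B are placeholders):
+///
+/// // Frame N:
+/// buffer.prepare_for_upload(); // _get(0) now returns RID A.
+/// buffer.prepare_for_upload(); // _get(0) now returns RID B.
+/// // Frame N+1: +/// buffer.prepare_for_upload(); // New frame detected: curr_idx resets, _get(0) returns A again.
+/// buffer.prepare_for_upload(); // _get(0) returns B again.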
+template <uint32_t NUM_BUFFERS, uint32_t MAX_EXTRA_BUFFERS = UINT32_MAX> class MultiUmaBuffer : public MultiUmaBufferBase { + uint32_t buffer_sizes[NUM_BUFFERS] = {}; +#ifdef DEV_ENABLED + bool can_upload[NUM_BUFFERS] = {}; +#endif + + void push() { + RenderingDevice *rd = RD::RenderingDevice::get_singleton(); + for (uint32_t i = 0u; i < NUM_BUFFERS; ++i) { + const bool is_storage = buffer_sizes[i] & 0x80000000u; + const uint32_t size_bytes = buffer_sizes[i] & ~0x80000000u; + RID buffer; + if (is_storage) { + buffer = rd->storage_buffer_create(size_bytes, Vector<uint8_t>(), 0, RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT); + } else { + buffer = rd->uniform_buffer_create(size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT); + } + buffers.push_back(buffer); + } + } + +public: + MultiUmaBuffer(const char *p_debug_name) : + MultiUmaBufferBase(MAX_EXTRA_BUFFERS, p_debug_name) {} + + uint32_t get_curr_idx() const { return curr_idx; } + + void set_size(uint32_t p_idx, uint32_t p_size_bytes, bool p_is_storage) { + DEV_ASSERT(buffers.is_empty()); + buffer_sizes[p_idx] = p_size_bytes | (p_is_storage ? 0x80000000u : 0u); + curr_idx = UINT32_MAX; + last_frame_mapped = UINT64_MAX; + } + + uint32_t get_size(uint32_t p_idx) const { return buffer_sizes[p_idx] & ~0x80000000u; } + + // Gets the raw buffer. Use with care. + // If you call this function, make sure to have called prepare_for_upload() first. + // Do not call _get() then prepare_for_upload(). + RID _get(uint32_t p_idx) { + return buffers[curr_idx * NUM_BUFFERS + p_idx]; + } + + /** + * @param p_append True if you wish to append more data to the existing buffer. + * @return True if the internal buffer changed (a remap is needed). False if it's possible to keep appending to the current buffer. + */ + bool prepare_for_map(bool p_append) { + RenderingDevice *rd = RD::RenderingDevice::get_singleton(); + const uint64_t frames_drawn = rd->get_frames_drawn(); + + if (last_frame_mapped == frames_drawn) { + if (!p_append) { + ++curr_idx; + } + } else { + p_append = false; + curr_idx = 0u; + if (max_extra_buffers != UINT32_MAX) { + shrink_to_max_extra_buffers(); + } + } + last_frame_mapped = frames_drawn; + if (curr_idx * NUM_BUFFERS >= buffers.size()) { + push(); + } + +#ifdef DEV_ENABLED + if (!p_append) { + for (size_t i = 0u; i < NUM_BUFFERS; ++i) { + can_upload[i] = true; + } + } +#endif + return !p_append; + } + + void prepare_for_upload() { + prepare_for_map(false); + } + + void *map_raw_for_upload(uint32_t p_idx) { +#ifdef DEV_ENABLED + DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice."); + can_upload[p_idx] = false; +#endif + RenderingDevice *rd = RD::RenderingDevice::get_singleton(); + return rd->buffer_persistent_map_advance(buffers[curr_idx * NUM_BUFFERS + p_idx]); + } + + RID get_for_upload(uint32_t p_idx) { +#ifdef DEV_ENABLED + DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice."); + can_upload[p_idx] = false; +#endif + return buffers[curr_idx * NUM_BUFFERS + p_idx]; + } + + void upload(uint32_t p_idx, const void *p_src_data, uint32_t p_size_bytes) { +#ifdef DEV_ENABLED + DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first!
Or called get_for_upload/upload() twice."); + can_upload[p_idx] = false; +#endif + RenderingDevice *rd = RD::RenderingDevice::get_singleton(); + rd->buffer_update(buffers[curr_idx * NUM_BUFFERS + p_idx], 0, p_size_bytes, p_src_data, true); + } +}; diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp index c4c6edec30c..cb893150cf1 100644 --- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp +++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp @@ -767,26 +767,38 @@ void RenderForwardClustered::_setup_environment(const RenderDataRD *p_render_dat RD::get_singleton()->buffer_update(scene_state.implementation_uniform_buffers[p_index], 0, sizeof(SceneState::UBO), &scene_state.ubo); } -void RenderForwardClustered::_update_instance_data_buffer(RenderListType p_render_list) { - if (scene_state.instance_data[p_render_list].size() > 0) { - if (scene_state.instance_buffer[p_render_list] == RID() || scene_state.instance_buffer_size[p_render_list] < scene_state.instance_data[p_render_list].size()) { - if (scene_state.instance_buffer[p_render_list] != RID()) { - RD::get_singleton()->free_rid(scene_state.instance_buffer[p_render_list]); - } - uint32_t new_size = nearest_power_of_2_templated(MAX(uint64_t(INSTANCE_DATA_BUFFER_MIN_SIZE), scene_state.instance_data[p_render_list].size())); - scene_state.instance_buffer[p_render_list] = RD::get_singleton()->storage_buffer_create(new_size * sizeof(SceneState::InstanceData)); - scene_state.instance_buffer_size[p_render_list] = new_size; +void RenderForwardClustered::SceneState::grow_instance_buffer(RenderListType p_render_list, uint32_t p_req_element_count, bool p_append) { + if (p_req_element_count > 0) { + if (instance_buffer[p_render_list].get_size(0u) < p_req_element_count * sizeof(SceneState::InstanceData)) { + instance_buffer[p_render_list].uninit(); + uint32_t new_size = nearest_power_of_2_templated(MAX(uint64_t(INSTANCE_DATA_BUFFER_MIN_SIZE), p_req_element_count)); + instance_buffer[p_render_list].set_size(0u, new_size * sizeof(SceneState::InstanceData), true); + curr_gpu_ptr[p_render_list] = nullptr; + } + + const bool must_remap = instance_buffer[p_render_list].prepare_for_map(p_append); + if (must_remap) { + curr_gpu_ptr[p_render_list] = nullptr; } - RD::get_singleton()->buffer_update(scene_state.instance_buffer[p_render_list], 0, sizeof(SceneState::InstanceData) * scene_state.instance_data[p_render_list].size(), scene_state.instance_data[p_render_list].ptr()); } } + void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, int *p_render_info, uint32_t p_offset, int32_t p_max_elements, bool p_update_buffer) { RenderList *rl = &render_list[p_render_list]; uint32_t element_total = p_max_elements >= 0 ? uint32_t(p_max_elements) : rl->elements.size(); - scene_state.instance_data[p_render_list].resize(p_offset + element_total); rl->element_info.resize(p_offset + element_total); + // If p_offset == 0, grow_instance_buffer resets and increments the buffer. + // If this behavior ever changes, _render_shadow_begin may need to change. + scene_state.grow_instance_buffer(p_render_list, p_offset + element_total, p_offset != 0u); + if (!scene_state.curr_gpu_ptr[p_render_list] && element_total > 0u) { + // The old buffer was replaced with another, larger one. We must start copying from scratch.
+ element_total += p_offset; + p_offset = 0u; + scene_state.curr_gpu_ptr[p_render_list] = reinterpret_cast<SceneState::InstanceData *>(scene_state.instance_buffer[p_render_list].map_raw_for_upload(0u)); + } + if (p_render_info) { p_render_info[RS::VIEWPORT_RENDER_INFO_OBJECTS_IN_FRAME] += element_total; } @@ -797,7 +809,7 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i GeometryInstanceSurfaceDataCache *surface = rl->elements[i + p_offset]; GeometryInstanceForwardClustered *inst = surface->owner; - SceneState::InstanceData &instance_data = scene_state.instance_data[p_render_list][i + p_offset]; + SceneState::InstanceData instance_data; if (likely(inst->store_transform_cache)) { RendererRD::MaterialStorage::store_transform_transposed_3x4(inst->transform, instance_data.transform); @@ -836,7 +848,9 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i instance_data.set_compressed_aabb(surface_aabb); instance_data.set_uv_scale(uv_scale); - bool cant_repeat = instance_data.flags & INSTANCE_DATA_FLAG_MULTIMESH || inst->mesh_instance.is_valid(); + scene_state.curr_gpu_ptr[p_render_list][i + p_offset] = instance_data; + + const bool cant_repeat = instance_data.flags & INSTANCE_DATA_FLAG_MULTIMESH || inst->mesh_instance.is_valid(); if (prev_surface != nullptr && !cant_repeat && prev_surface->sort.sort_key1 == surface->sort.sort_key1 && prev_surface->sort.sort_key2 == surface->sort.sort_key2 && inst->mirror == prev_surface->owner->mirror && repeats < RenderElementInfo::MAX_REPEATS) { //this element is the same as the previous one, count repeats to draw it using instancing @@ -870,8 +884,8 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i } } - if (p_update_buffer) { - _update_instance_data_buffer(p_render_list); + if (p_update_buffer && element_total > 0u) { + RenderingDevice::get_singleton()->buffer_flush(scene_state.instance_buffer[p_render_list]._get(0u)); } } @@ -2722,7 +2736,8 @@ void RenderForwardClustered::_render_shadow_begin() { _update_render_base_uniform_set(); render_list[RENDER_LIST_SECONDARY].clear(); - scene_state.instance_data[RENDER_LIST_SECONDARY].clear(); + // No need to reset scene_state.curr_gpu_ptr or scene_state.instance_buffer[RENDER_LIST_SECONDARY] + // because _fill_instance_data will do that if it detects p_offset == 0u.
} void RenderForwardClustered::_render_shadow_append(RID p_framebuffer, const PagedArray<RenderGeometryInstance *> &p_instances, const Projection &p_projection, const Transform3D &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_reverse_cull_face, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, float p_lod_distance_multiplier, float p_screen_mesh_lod_threshold, const Rect2i &p_rect, bool p_flip_y, bool p_clear_region, bool p_begin, bool p_end, RenderingMethod::RenderInfo *p_render_info, const Size2i &p_viewport_size, const Transform3D &p_main_cam_transform) { @@ -2797,7 +2812,11 @@ } void RenderForwardClustered::_render_shadow_process() { - _update_instance_data_buffer(RENDER_LIST_SECONDARY); + RenderingDevice *rd = RenderingDevice::get_singleton(); + if (scene_state.instance_buffer[RENDER_LIST_SECONDARY].get_size(0u) > 0u) { + rd->buffer_flush(scene_state.instance_buffer[RENDER_LIST_SECONDARY]._get(0u)); + } + //render shadows one after the other, so this can be done un-barriered and the driver can optimize (as well as allow us to run compute at the same time) for (uint32_t i = 0; i < scene_state.shadow_passes.size(); i++) { @@ -3258,11 +3277,14 @@ RID RenderForwardClustered::_setup_render_pass_uniform_set(RenderListType p_rend { RD::Uniform u; u.binding = 2; - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; - RID instance_buffer = scene_state.instance_buffer[p_render_list]; - if (instance_buffer == RID()) { - instance_buffer = scene_shader.default_vec4_xform_buffer; // any buffer will do since its not used + u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC; + if (scene_state.instance_buffer[p_render_list].get_size(0u) == 0u) { + // Any buffer will do since it's not used, so just create one. + // We can't use scene_shader.default_vec4_xform_buffer because it's not dynamic. + scene_state.instance_buffer[p_render_list].set_size(0u, INSTANCE_DATA_BUFFER_MIN_SIZE * sizeof(SceneState::InstanceData), true); + scene_state.instance_buffer[p_render_list].prepare_for_upload(); } + RID instance_buffer = scene_state.instance_buffer[p_render_list]._get(0u); u.append_id(instance_buffer); uniforms.push_back(u); } @@ -3624,11 +3646,14 @@ RID RenderForwardClustered::_setup_sdfgi_render_pass_uniform_set(RID p_albedo_te { RD::Uniform u; u.binding = 2; - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; - RID instance_buffer = scene_state.instance_buffer[RENDER_LIST_SECONDARY]; - if (instance_buffer == RID()) { - instance_buffer = scene_shader.default_vec4_xform_buffer; // any buffer will do since its not used + u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC; + if (scene_state.instance_buffer[RENDER_LIST_SECONDARY].get_size(0u) == 0u) { + // Any buffer will do since it's not used, so just create one. + // We can't use scene_shader.default_vec4_xform_buffer because it's not dynamic.
+ scene_state.instance_buffer[RENDER_LIST_SECONDARY].set_size(0u, INSTANCE_DATA_BUFFER_MIN_SIZE * sizeof(SceneState::InstanceData), true); + scene_state.instance_buffer[RENDER_LIST_SECONDARY].prepare_for_upload(); } + RID instance_buffer = scene_state.instance_buffer[RENDER_LIST_SECONDARY]._get(0u); u.append_id(instance_buffer); uniforms.push_back(u); } @@ -5125,9 +5150,7 @@ RenderForwardClustered::~RenderForwardClustered() { RD::get_singleton()->free_rid(scene_state.lightmap_buffer); RD::get_singleton()->free_rid(scene_state.lightmap_capture_buffer); for (uint32_t i = 0; i < RENDER_LIST_MAX; i++) { - if (scene_state.instance_buffer[i] != RID()) { - RD::get_singleton()->free_rid(scene_state.instance_buffer[i]); - } + scene_state.instance_buffer[i].uninit(); } memdelete_arr(scene_state.lightmap_captures); } diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h index 3605389b0b6..a5b3199f312 100644 --- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h +++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h @@ -31,6 +31,7 @@ #pragma once #include "core/templates/paged_allocator.h" +#include "servers/rendering/multi_uma_buffer.h" #include "servers/rendering/renderer_rd/cluster_builder_rd.h" #include "servers/rendering/renderer_rd/effects/fsr2.h" #ifdef METAL_ENABLED @@ -398,9 +399,8 @@ private: uint32_t max_lightmaps; RID lightmap_buffer; - RID instance_buffer[RENDER_LIST_MAX]; - uint32_t instance_buffer_size[RENDER_LIST_MAX] = { 0, 0, 0 }; - LocalVector<InstanceData> instance_data[RENDER_LIST_MAX]; + MultiUmaBuffer<1u> instance_buffer[RENDER_LIST_MAX] = { MultiUmaBuffer<1u>("RENDER_LIST_OPAQUE"), MultiUmaBuffer<1u>("RENDER_LIST_MOTION"), MultiUmaBuffer<1u>("RENDER_LIST_ALPHA"), MultiUmaBuffer<1u>("RENDER_LIST_SECONDARY") }; + InstanceData *curr_gpu_ptr[RENDER_LIST_MAX] = {}; LightmapCaptureData *lightmap_captures = nullptr; uint32_t max_lightmap_captures; @@ -433,6 +433,7 @@ private: LocalVector<ShadowPass> shadow_passes; + void grow_instance_buffer(RenderListType p_render_list, uint32_t p_req_element_count, bool p_append); } scene_state; static RenderForwardClustered *singleton; @@ -464,7 +465,6 @@ private: void _render_list(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element); void _render_list_with_draw_list(RenderListParameters *p_params, RID p_framebuffer, BitField<RD::DrawFlags> p_draw_flags = RD::DRAW_DEFAULT_ALL, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth_value = 0.0, uint32_t p_clear_stencil_value = 0, const Rect2 &p_region = Rect2()); - void _update_instance_data_buffer(RenderListType p_render_list); void _fill_instance_data(RenderListType p_render_list, int *p_render_info = nullptr, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true); void _fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, bool p_using_sdfgi = false, bool p_using_opaque_gi = false, bool p_using_motion_pass = false, bool p_append = false); diff --git a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp index f6714580bd7..c79c6998272 100644 ---
b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp @@ -667,7 +667,9 @@ void SceneShaderForwardClustered::init(const String p_defines) { shader_versions.push_back(ShaderRD::VariantDefine(group, version, false)); } - shader.initialize(shader_versions, p_defines); + Vector dynamic_buffers; + dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(RenderForwardClustered::RENDER_PASS_UNIFORM_SET, 2)); + shader.initialize(shader_versions, p_defines, Vector<RD::PipelineImmutableSampler>(), dynamic_buffers); if (RendererCompositorRD::get_singleton()->is_xr_enabled()) { shader.enable_group(SHADER_GROUP_MULTIVIEW); diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp index 5dfcf29cbc9..ee12788ac7c 100644 --- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp +++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp @@ -424,13 +424,10 @@ bool RenderForwardMobile::_render_buffers_can_be_storage() { return false; } -RID RenderForwardMobile::_setup_render_pass_uniform_set(RenderListType p_render_list, const RenderDataRD *p_render_data, RID p_radiance_texture, const RendererRD::MaterialStorage::Samplers &p_samplers, bool p_use_directional_shadow_atlas, int p_index) { +RID RenderForwardMobile::_setup_render_pass_uniform_set(RenderListType p_render_list, const RenderDataRD *p_render_data, RID p_radiance_texture, const RendererRD::MaterialStorage::Samplers &p_samplers, bool p_use_directional_shadow_atlas, uint32_t p_pass_offset) { RendererRD::LightStorage *light_storage = RendererRD::LightStorage::get_singleton(); RendererRD::TextureStorage *texture_storage = RendererRD::TextureStorage::get_singleton(); - //there should always be enough uniform buffers for render passes, otherwise bugs - ERR_FAIL_INDEX_V(p_index, (int)scene_state.uniform_buffers.size(), RID()); bool is_multiview = false; Ref<RenderBufferDataForwardMobile> rb_data; @@ -454,19 +451,26 @@ RID RenderForwardMobile::_setup_render_pass_uniform_set(RenderListType p_render_ { RD::Uniform u; u.binding = 0; - u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER; - u.append_id(scene_state.uniform_buffers[p_index]); + u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC; + // Negative on purpose. We've created multiple uniform_buffers by calling prepare_for_upload() + // many times in a row, now we must reference those. + // We use 0u - p_pass_offset instead of -p_pass_offset to make MSVC warnings shut up. + // See the "Tricks" section of MultiUmaBuffer documentation. + u.append_id(scene_state.uniform_buffers._get(uint32_t(0u - p_pass_offset))); uniforms.push_back(u); } { RD::Uniform u; u.binding = 1; - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; - RID instance_buffer = scene_state.instance_buffer[p_render_list]; - if (instance_buffer == RID()) { - instance_buffer = scene_shader.default_vec4_xform_buffer; // Any buffer will do since its not used. + u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC; + if (scene_state.instance_buffer[p_render_list].get_size(0u) == 0u) { + // Any buffer will do since it's not used, so just create one. + // We can't use scene_shader.default_vec4_xform_buffer because it's not dynamic.
+ scene_state.instance_buffer[p_render_list].set_size(0u, INSTANCE_DATA_BUFFER_MIN_SIZE * sizeof(SceneState::InstanceData), true); + scene_state.instance_buffer[p_render_list].prepare_for_upload(); } + RID instance_buffer = scene_state.instance_buffer[p_render_list]._get(0u); u.append_id(instance_buffer); uniforms.push_back(u); } @@ -886,6 +890,7 @@ void RenderForwardMobile::_render_scene(RenderDataRD *p_render_data, const Color render_list[RENDER_LIST_OPAQUE].sort_by_key(); } render_list[RENDER_LIST_ALPHA].sort_by_reverse_depth_and_priority(); + _fill_instance_data(RENDER_LIST_OPAQUE); _fill_instance_data(RENDER_LIST_ALPHA); @@ -1507,12 +1512,9 @@ void RenderForwardMobile::_render_shadow_begin() { _update_render_base_uniform_set(); render_list[RENDER_LIST_SECONDARY].clear(); - scene_state.instance_data[RENDER_LIST_SECONDARY].clear(); } void RenderForwardMobile::_render_shadow_append(RID p_framebuffer, const PagedArray<RenderGeometryInstance *> &p_instances, const Projection &p_projection, const Transform3D &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, float p_lod_distance_multiplier, float p_screen_mesh_lod_threshold, const Rect2i &p_rect, bool p_flip_y, bool p_clear_region, bool p_begin, bool p_end, RenderingMethod::RenderInfo *p_render_info, const Transform3D &p_main_cam_transform) { - uint32_t shadow_pass_index = scene_state.shadow_passes.size(); - SceneState::ShadowPass shadow_pass; if (p_render_info) { @@ -1539,7 +1541,7 @@ render_data.instances = &p_instances; render_data.render_info = p_render_info; - _setup_environment(&render_data, true, Vector2(1, 1), Color(), false, p_use_pancake, shadow_pass_index); + _setup_environment(&render_data, true, Vector2(1, 1), Color(), false, p_use_pancake); if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_DISABLE_LOD) { scene_data.screen_mesh_lod_threshold = 0.0; @@ -1580,13 +1582,17 @@ } void RenderForwardMobile::_render_shadow_process() { - _update_instance_data_buffer(RENDER_LIST_SECONDARY); + RenderingDevice *rd = RenderingDevice::get_singleton(); + if (scene_state.instance_buffer[RENDER_LIST_SECONDARY].get_size(0u) > 0u) { + rd->buffer_flush(scene_state.instance_buffer[RENDER_LIST_SECONDARY]._get(0u)); + } + //render shadows one after the other, so this can be done un-barriered and the driver can optimize (as well as allow us to run compute at the same time) for (uint32_t i = 0; i < scene_state.shadow_passes.size(); i++) { //render passes need to be configured after instance buffer is done, since they need the latest version SceneState::ShadowPass &shadow_pass = scene_state.shadow_passes[i]; - shadow_pass.rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_SECONDARY, nullptr, RID(), RendererRD::MaterialStorage::get_singleton()->samplers_rd_get_default(), false, i); + shadow_pass.rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_SECONDARY, nullptr, RID(), RendererRD::MaterialStorage::get_singleton()->samplers_rd_get_default(), false, scene_state.shadow_passes.size() - 1u - i); } RD::get_singleton()->draw_command_end_label(); @@ -1899,17 +1905,19 @@ RID RenderForwardMobile::_render_buffers_get_velocity_texture(Ref<RenderSceneBuffersRD> p_render_buffers) { -void RenderForwardMobile::_update_instance_data_buffer(RenderListType p_render_list) { - if (scene_state.instance_data[p_render_list].size() > 0) { - if (scene_state.instance_buffer[p_render_list] == RID() || scene_state.instance_buffer_size[p_render_list] < scene_state.instance_data[p_render_list].size()) { - if
(scene_state.instance_buffer[p_render_list] != RID()) { - RD::get_singleton()->free_rid(scene_state.instance_buffer[p_render_list]); - } - uint32_t new_size = nearest_power_of_2_templated(MAX(uint64_t(INSTANCE_DATA_BUFFER_MIN_SIZE), scene_state.instance_data[p_render_list].size())); - scene_state.instance_buffer[p_render_list] = RD::get_singleton()->storage_buffer_create(new_size * sizeof(SceneState::InstanceData)); - scene_state.instance_buffer_size[p_render_list] = new_size; +void RenderForwardMobile::SceneState::grow_instance_buffer(RenderListType p_render_list, uint32_t p_req_element_count, bool p_append) { + if (p_req_element_count > 0) { + if (instance_buffer[p_render_list].get_size(0u) < p_req_element_count * sizeof(SceneState::InstanceData)) { + instance_buffer[p_render_list].uninit(); + uint32_t new_size = nearest_power_of_2_templated(MAX(uint64_t(INSTANCE_DATA_BUFFER_MIN_SIZE), p_req_element_count)); + instance_buffer[p_render_list].set_size(0u, new_size * sizeof(SceneState::InstanceData), true); + curr_gpu_ptr[p_render_list] = nullptr; + } + + const bool must_remap = instance_buffer[p_render_list].prepare_for_map(p_append); + if (must_remap) { + curr_gpu_ptr[p_render_list] = nullptr; } - RD::get_singleton()->buffer_update(scene_state.instance_buffer[p_render_list], 0, sizeof(SceneState::InstanceData) * scene_state.instance_data[p_render_list].size(), scene_state.instance_data[p_render_list].ptr()); } } @@ -1917,16 +1925,22 @@ void RenderForwardMobile::_fill_instance_data(RenderListType p_render_list, uint RenderList *rl = &render_list[p_render_list]; uint32_t element_total = p_max_elements >= 0 ? uint32_t(p_max_elements) : rl->elements.size(); - scene_state.instance_data[p_render_list].resize(p_offset + element_total); rl->element_info.resize(p_offset + element_total); uint64_t frame = RSG::rasterizer->get_frame_number(); + scene_state.grow_instance_buffer(p_render_list, p_offset + element_total, p_offset != 0u); + if (!scene_state.curr_gpu_ptr[p_render_list] && element_total > 0u) { + // The old buffer was replaced with another, larger one. We must start copying from scratch. + element_total += p_offset; + p_offset = 0u; + scene_state.curr_gpu_ptr[p_render_list] = reinterpret_cast<SceneState::InstanceData *>(scene_state.instance_buffer[p_render_list].map_raw_for_upload(0u)); + } for (uint32_t i = 0; i < element_total; i++) { GeometryInstanceSurfaceDataCache *surface = rl->elements[i + p_offset]; GeometryInstanceForwardMobile *inst = surface->owner; - SceneState::InstanceData &instance_data = scene_state.instance_data[p_render_list][i + p_offset]; + SceneState::InstanceData instance_data; if (inst->prev_transform_dirty && frame > inst->prev_transform_change_frame + 1 && inst->prev_transform_change_frame) { inst->prev_transform = inst->transform; @@ -1972,14 +1986,16 @@ void RenderForwardMobile::_fill_instance_data(RenderListType p_render_list, uint instance_data.set_compressed_aabb(surface_aabb); instance_data.set_uv_scale(uv_scale); + scene_state.curr_gpu_ptr[p_render_list][i + p_offset] = instance_data; + RenderElementInfo &element_info = rl->element_info[p_offset + i]; // Sets lod_index and uses_lightmap at once.
element_info.value = uint32_t(surface->sort.sort_key1 & 0x1FF); } - if (p_update_buffer) { - _update_instance_data_buffer(p_render_list); + if (p_update_buffer && element_total > 0u) { + RenderingDevice::get_singleton()->buffer_flush(scene_state.instance_buffer[p_render_list]._get(0u)); } } @@ -2182,22 +2198,20 @@ void RenderForwardMobile::_fill_render_list(RenderListType p_render_list, const } } -void RenderForwardMobile::_setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, const Color &p_default_bg_color, bool p_opaque_render_buffers, bool p_pancake_shadows, int p_index) { +void RenderForwardMobile::_setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, const Color &p_default_bg_color, bool p_opaque_render_buffers, bool p_pancake_shadows) { RID env = is_environment(p_render_data->environment) ? p_render_data->environment : RID(); RID reflection_probe_instance = p_render_data->reflection_probe.is_valid() ? RendererRD::LightStorage::get_singleton()->reflection_probe_instance_get_probe(p_render_data->reflection_probe) : RID(); // May do this earlier in RenderSceneRenderRD::render_scene - if (p_index >= (int)scene_state.uniform_buffers.size()) { - uint32_t from = scene_state.uniform_buffers.size(); - scene_state.uniform_buffers.resize(p_index + 1); - for (uint32_t i = from; i < scene_state.uniform_buffers.size(); i++) { - scene_state.uniform_buffers[i] = p_render_data->scene_data->create_uniform_buffer(); - } + if (scene_state.uniform_buffers.get_size(0u) == 0u) { + scene_state.uniform_buffers.set_size(0u, p_render_data->scene_data->get_uniform_buffer_size_bytes(), false); } float luminance_multiplier = p_render_data->render_buffers.is_valid() ? p_render_data->render_buffers->get_luminance_multiplier() : 1.0; - p_render_data->scene_data->update_ubo(scene_state.uniform_buffers[p_index], get_debug_draw_mode(), env, reflection_probe_instance, p_render_data->camera_attributes, p_pancake_shadows, p_screen_size, p_default_bg_color, luminance_multiplier, p_opaque_render_buffers, false); + // Start a new setup. 
+ scene_state.uniform_buffers.prepare_for_upload(); + p_render_data->scene_data->update_ubo(scene_state.uniform_buffers.get_for_upload(0u), get_debug_draw_mode(), env, reflection_probe_instance, p_render_data->camera_attributes, p_pancake_shadows, p_screen_size, p_default_bg_color, luminance_multiplier, p_opaque_render_buffers, false); } /// RENDERING /// @@ -3395,13 +3409,9 @@ RenderForwardMobile::~RenderForwardMobile() { RSG::light_storage->directional_shadow_atlas_set_size(0); { - for (const RID &rid : scene_state.uniform_buffers) { - RD::get_singleton()->free_rid(rid); - } + scene_state.uniform_buffers.uninit(); for (uint32_t i = 0; i < RENDER_LIST_MAX; i++) { - if (scene_state.instance_buffer[i].is_valid()) { - RD::get_singleton()->free_rid(scene_state.instance_buffer[i]); - } + scene_state.instance_buffer[i].uninit(); } RD::get_singleton()->free_rid(scene_state.lightmap_buffer); RD::get_singleton()->free_rid(scene_state.lightmap_capture_buffer); diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h index ba8409d7285..a9be5308ea4 100644 --- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h +++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h @@ -31,6 +31,7 @@ #pragma once #include "core/templates/paged_allocator.h" +#include "servers/rendering/multi_uma_buffer.h" #include "servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.h" #include "servers/rendering/renderer_rd/renderer_scene_render_rd.h" @@ -161,18 +162,17 @@ private: /* Render Scene */ - RID _setup_render_pass_uniform_set(RenderListType p_render_list, const RenderDataRD *p_render_data, RID p_radiance_texture, const RendererRD::MaterialStorage::Samplers &p_samplers, bool p_use_directional_shadow_atlas = false, int p_index = 0); + RID _setup_render_pass_uniform_set(RenderListType p_render_list, const RenderDataRD *p_render_data, RID p_radiance_texture, const RendererRD::MaterialStorage::Samplers &p_samplers, bool p_use_directional_shadow_atlas = false, uint32_t p_pass_offset = 0u); void _pre_opaque_render(RenderDataRD *p_render_data); uint64_t lightmap_texture_array_version = 0xFFFFFFFF; void _update_render_base_uniform_set(); - void _update_instance_data_buffer(RenderListType p_render_list); void _fill_instance_data(RenderListType p_render_list, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true); void _fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, bool p_append = false); - void _setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, const Color &p_default_bg_color, bool p_opaque_render_buffers = false, bool p_pancake_shadows = false, int p_index = 0); + void _setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, const Color &p_default_bg_color, bool p_opaque_render_buffers = false, bool p_pancake_shadows = false); void _setup_lightmaps(const RenderDataRD *p_render_data, const PagedArray<RID> &p_lightmaps, const Transform3D &p_cam_transform); RID render_base_uniform_set; @@ -193,7 +193,7 @@ private: /* Scene state */ struct SceneState { - LocalVector<RID> uniform_buffers; + MultiUmaBuffer<1u> uniform_buffers = MultiUmaBuffer<1u>("SceneState::uniform_buffers"); struct PushConstantUbershader { SceneShaderForwardMobile::ShaderSpecialization specialization; @@ -274,9 +274,8 @@ private:
static_assert(std::is_trivially_destructible_v<InstanceData>); static_assert(std::is_trivially_constructible_v<InstanceData>); - RID instance_buffer[RENDER_LIST_MAX]; - uint32_t instance_buffer_size[RENDER_LIST_MAX] = { 0, 0, 0 }; - LocalVector<InstanceData> instance_data[RENDER_LIST_MAX]; + MultiUmaBuffer<1u> instance_buffer[RENDER_LIST_MAX] = { MultiUmaBuffer<1u>("RENDER_LIST_OPAQUE"), MultiUmaBuffer<1u>("RENDER_LIST_ALPHA"), MultiUmaBuffer<1u>("RENDER_LIST_SECONDARY") }; + InstanceData *curr_gpu_ptr[RENDER_LIST_MAX] = {}; // !BAS! We need to change lightmaps, we're not going to do this with a buffer but pushing the used lightmap in LightmapData lightmaps[MAX_LIGHTMAPS]; @@ -311,6 +310,8 @@ private: }; LocalVector<ShadowPass> shadow_passes; + + void grow_instance_buffer(RenderListType p_render_list, uint32_t p_req_element_count, bool p_append); } scene_state; /* Render List */ diff --git a/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp index c939ad1f4c0..bb1faa19988 100644 --- a/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp +++ b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp @@ -601,7 +601,10 @@ void SceneShaderForwardMobile::init(const String p_defines) { immutable_shadow_sampler.append_id(shadow_sampler); immutable_shadow_sampler.uniform_type = RenderingDeviceCommons::UNIFORM_TYPE_SAMPLER; immutable_samplers.push_back(immutable_shadow_sampler); - shader.initialize(shader_versions, p_defines, immutable_samplers); + Vector dynamic_buffers; + dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(RenderForwardMobile::RENDER_PASS_UNIFORM_SET, 0)); + dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(RenderForwardMobile::RENDER_PASS_UNIFORM_SET, 1)); + shader.initialize(shader_versions, p_defines, immutable_samplers, dynamic_buffers); if (RendererCompositorRD::get_singleton()->is_xr_enabled()) { enable_multiview_shader_group(); diff --git a/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp b/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp index d172ccdd7c1..18adf2d813b 100644 --- a/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp +++ b/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp @@ -748,8 +748,6 @@ void RendererCanvasRenderRD::canvas_render_items(RID p_to_render_target, Item *p Item *canvas_group_owner = nullptr; bool skip_item = false; - state.last_instance_index = 0; - bool update_skeletons = false; bool time_used = false; @@ -916,8 +914,13 @@ void RendererCanvasRenderRD::canvas_render_items(RID p_to_render_target, Item *p } texture_info_map.clear(); - state.current_data_buffer_index = (state.current_data_buffer_index + 1) % BATCH_DATA_BUFFER_COUNT; - state.current_instance_buffer_index = 0; + state.instance_data = nullptr; + if (state.instance_data_index > 0) { // If there was any remaining instance data, it must be flushed.
+ RID buf = state.instance_buffers._get(0); + RD::get_singleton()->buffer_flush(buf); + state.instance_data_index = 0; + } } RID RendererCanvasRenderRD::light_create() { @@ -1747,7 +1750,10 @@ RendererCanvasRenderRD::RendererCanvasRenderRD() { variants.push_back(base_define + "#define USE_ATTRIBUTES\n#define USE_POINT_SIZE\n"); // SHADER_VARIANT_ATTRIBUTES_POINTS } - shader.canvas_shader.initialize(variants, global_defines); + Vector dynamic_buffers; + dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(BATCH_UNIFORM_SET, 4)); + + shader.canvas_shader.initialize(variants, global_defines, {}, dynamic_buffers); shader.default_version_data = memnew(CanvasShaderData); shader.default_version_data->version = shader.canvas_shader.version_create(); @@ -2058,12 +2064,7 @@ void fragment() { state.max_instances_per_buffer = uint32_t(GLOBAL_GET("rendering/2d/batching/item_buffer_size")); state.max_instance_buffer_size = state.max_instances_per_buffer * sizeof(InstanceData); state.canvas_instance_batches.reserve(200); - - for (uint32_t i = 0; i < BATCH_DATA_BUFFER_COUNT; i++) { - DataBuffer &db = state.canvas_instance_data_buffers[i]; - db.instance_buffers.push_back(RD::get_singleton()->storage_buffer_create(state.max_instance_buffer_size)); - } - state.instance_data_array = memnew_arr(InstanceData, state.max_instances_per_buffer); + state.instance_buffers.set_size(0, state.max_instance_buffer_size, true); } } @@ -2122,7 +2123,6 @@ uint32_t RendererCanvasRenderRD::get_pipeline_compilations(RS::PipelineSource p_ void RendererCanvasRenderRD::_render_batch_items(RenderTarget p_to_render_target, int p_item_count, const Transform2D &p_canvas_transform_inverse, Light *p_lights, bool &r_sdf_used, bool p_to_backbuffer, RenderingMethod::RenderInfo *r_render_info) { // Record batches - uint32_t instance_index = 0; { RendererRD::MaterialStorage *material_storage = RendererRD::MaterialStorage::get_singleton(); Item *current_clip = nullptr; @@ -2132,7 +2132,7 @@ void RendererCanvasRenderRD::_render_batch_items(RenderTarget p_to_render_target bool batch_broken = false; Batch *current_batch = _new_batch(batch_broken); // Override the start position and index as we want to start from where we finished off last time. 
- current_batch->start = state.last_instance_index; + current_batch->start = state.instance_data_index; for (int i = 0; i < p_item_count; i++) { Item *ci = items[i]; @@ -2173,7 +2173,7 @@ void RendererCanvasRenderRD::_render_batch_items(RenderTarget p_to_render_target if (ci->repeat_source_item == nullptr || ci->repeat_size == Vector2()) { Transform2D base_transform = p_canvas_transform_inverse * ci->final_transform; - _record_item_commands(ci, p_to_render_target, base_transform, current_clip, p_lights, instance_index, batch_broken, r_sdf_used, current_batch); + _record_item_commands(ci, p_to_render_target, base_transform, current_clip, p_lights, batch_broken, r_sdf_used, current_batch); } else { Point2 start_pos = ci->repeat_size * -(ci->repeat_times / 2); Point2 offset; @@ -2186,20 +2186,11 @@ void RendererCanvasRenderRD::_render_batch_items(RenderTarget p_to_render_target Transform2D base_transform = ci->final_transform; base_transform.columns[2] += ci->repeat_source_item->final_transform.basis_xform(offset); base_transform = p_canvas_transform_inverse * base_transform; - _record_item_commands(ci, p_to_render_target, base_transform, current_clip, p_lights, instance_index, batch_broken, r_sdf_used, current_batch); + _record_item_commands(ci, p_to_render_target, base_transform, current_clip, p_lights, batch_broken, r_sdf_used, current_batch); } } } } - - // Copy over remaining data needed for rendering. - if (instance_index > 0) { - RD::get_singleton()->buffer_update( - state.canvas_instance_data_buffers[state.current_data_buffer_index].instance_buffers[state.current_instance_buffer_index], - state.last_instance_index * sizeof(InstanceData), - instance_index * sizeof(InstanceData), - state.instance_data_array); - } } if (state.canvas_instance_batches.is_empty()) { @@ -2284,63 +2275,28 @@ void RendererCanvasRenderRD::_render_batch_items(RenderTarget p_to_render_target state.current_batch_index = 0; state.canvas_instance_batches.clear(); - state.last_instance_index += instance_index; } -RendererCanvasRenderRD::InstanceData *RendererCanvasRenderRD::new_instance_data(float *p_world, uint32_t *p_lights, uint32_t p_base_flags, uint32_t p_index, uint32_t p_uniforms_ofs, TextureInfo *p_info) { - InstanceData *instance_data = &state.instance_data_array[p_index]; - // Zero out most fields. - for (int i = 0; i < 4; i++) { - instance_data->modulation[i] = 0.0; - instance_data->ninepatch_margins[i] = 0.0; - instance_data->src_rect[i] = 0.0; - instance_data->dst_rect[i] = 0.0; - } - - instance_data->pad[0] = 0.0; - instance_data->pad[1] = 0.0; - - instance_data->lights[0] = p_lights[0]; - instance_data->lights[1] = p_lights[1]; - instance_data->lights[2] = p_lights[2]; - instance_data->lights[3] = p_lights[3]; - - for (int i = 0; i < 6; i++) { - instance_data->world[i] = p_world[i]; - } - - instance_data->flags = p_base_flags; // Reset on each command for safety. 
- - instance_data->color_texture_pixel_size[0] = p_info->texpixel_size.width; - instance_data->color_texture_pixel_size[1] = p_info->texpixel_size.height; - - instance_data->instance_uniforms_ofs = p_uniforms_ofs; - - return instance_data; -} - -void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTarget p_render_target, const Transform2D &p_base_transform, Item *&r_current_clip, Light *p_lights, uint32_t &r_index, bool &r_batch_broken, bool &r_sdf_used, Batch *&r_current_batch) { +void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTarget p_render_target, const Transform2D &p_base_transform, Item *&r_current_clip, Light *p_lights, bool &r_batch_broken, bool &r_sdf_used, Batch *&r_current_batch) { const RenderingServer::CanvasItemTextureFilter texture_filter = p_item->texture_filter == RS::CANVAS_ITEM_TEXTURE_FILTER_DEFAULT ? default_filter : p_item->texture_filter; const RenderingServer::CanvasItemTextureRepeat texture_repeat = p_item->texture_repeat == RS::CANVAS_ITEM_TEXTURE_REPEAT_DEFAULT ? default_repeat : p_item->texture_repeat; Transform2D base_transform = p_base_transform; - float world[6]; + InstanceData template_instance; + memset(&template_instance, 0, sizeof(InstanceData)); + Transform2D draw_transform; // Used by transform command - _update_transform_2d_to_mat2x3(base_transform, world); + _update_transform_2d_to_mat2x3(base_transform, template_instance.world); Color base_color = p_item->final_modulate; bool use_linear_colors = p_render_target.use_linear_colors; - uint32_t base_flags = 0; - uint32_t uniforms_ofs = static_cast<uint32_t>(p_item->instance_allocated_shader_uniforms_offset); + template_instance.instance_uniforms_ofs = static_cast<uint32_t>(p_item->instance_allocated_shader_uniforms_offset); bool reclip = false; bool skipping = false; - // TODO: consider making lights a per-batch property and then baking light operations in the shader for better performance. - uint32_t lights[4] = { 0, 0, 0, 0 }; - uint16_t light_count = 0; uint16_t shadow_mask = 0; @@ -2350,7 +2306,8 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar while (light) { if (light->render_index_cache >= 0 && p_item->light_mask & light->item_mask && p_item->z_final >= light->z_min && p_item->z_final <= light->z_max && p_item->global_rect_cache.intersects(light->rect_cache)) { uint32_t light_index = light->render_index_cache; - lights[light_count >> 2] |= light_index << ((light_count & 3) * 8); + // TODO: consider making lights a per-batch property and then baking light operations in the shader for better performance.
+ template_instance.lights[light_count >> 2] |= light_index << ((light_count & 3) * 8); if (p_item->light_mask & light->item_shadow_mask) { shadow_mask |= 1 << light_count; @@ -2365,8 +2322,8 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar light = light->next_ptr; } - base_flags |= light_count << INSTANCE_FLAGS_LIGHT_COUNT_SHIFT; - base_flags |= shadow_mask << INSTANCE_FLAGS_SHADOW_MASKED_SHIFT; + template_instance.flags |= light_count << INSTANCE_FLAGS_LIGHT_COUNT_SHIFT; + template_instance.flags |= shadow_mask << INSTANCE_FLAGS_SHADOW_MASKED_SHIFT; } bool use_lighting = (light_count > 0 || using_directional_lights); @@ -2430,9 +2387,11 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar if (r_current_batch->tex_info != tex_info) { r_current_batch = _new_batch(r_batch_broken); r_current_batch->tex_info = tex_info; + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; } - InstanceData *instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + InstanceData *instance_data = new_instance_data(template_instance); Rect2 src_rect; Rect2 dst_rect; @@ -2505,7 +2464,7 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->dst_rect[2] = dst_rect.size.width; instance_data->dst_rect[3] = dst_rect.size.height; - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); } break; case Item::Command::TYPE_NINEPATCH: { @@ -2531,9 +2490,11 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar if (r_current_batch->tex_info != tex_info) { r_current_batch = _new_batch(r_batch_broken); r_current_batch->tex_info = tex_info; + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; } - InstanceData *instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + InstanceData *instance_data = new_instance_data(template_instance); Rect2 src_rect; Rect2 dst_rect(np->rect.position.x, np->rect.position.y, np->rect.size.x, np->rect.size.y); @@ -2582,7 +2543,7 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->ninepatch_margins[2] = np->margin[SIDE_RIGHT]; instance_data->ninepatch_margins[3] = np->margin[SIDE_BOTTOM]; - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); } break; case Item::Command::TYPE_POLYGON: { @@ -2606,6 +2567,8 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar if (r_current_batch->tex_info != tex_info) { r_current_batch = _new_batch(r_batch_broken); r_current_batch->tex_info = tex_info; + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; } // pipeline variant @@ -2615,7 +2578,7 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar r_current_batch->render_primitive = _primitive_type_to_render_primitive(polygon->primitive); } - InstanceData *instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + InstanceData *instance_data = new_instance_data(template_instance); Color color = base_color; if (use_linear_colors) { @@ 
-2627,7 +2590,7 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->modulation[2] = color.b; instance_data->modulation[3] = color.a; - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); } break; case Item::Command::TYPE_PRIMITIVE: { @@ -2673,9 +2636,11 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar if (r_current_batch->tex_info != tex_info) { r_current_batch = _new_batch(r_batch_broken); r_current_batch->tex_info = tex_info; + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; } - InstanceData *instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + InstanceData *instance_data = new_instance_data(template_instance); for (uint32_t j = 0; j < MIN(3u, primitive->point_count); j++) { instance_data->points[j * 2 + 0] = primitive->points[j].x; @@ -2690,10 +2655,10 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->colors[j * 2 + 1] = (uint32_t(Math::make_half_float(col.a)) << 16) | Math::make_half_float(col.b); } - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); if (primitive->point_count == 4) { - instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + instance_data = new_instance_data(template_instance); for (uint32_t j = 0; j < 3; j++) { int offset = j == 0 ? 0 : 1; @@ -2710,7 +2675,7 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->colors[j * 2 + 1] = (uint32_t(Math::make_half_float(col.a)) << 16) | Math::make_half_float(col.b); } - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); } } break; @@ -2736,7 +2701,9 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar _prepare_batch_texture_info(m->texture, tex_state, tex_info); } r_current_batch->tex_info = tex_info; - instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; + instance_data = new_instance_data(template_instance); r_current_batch->mesh_instance_count = 1; _update_transform_2d_to_mat2x3(base_transform * draw_transform * m->transform, instance_data->world); @@ -2763,7 +2730,9 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar _prepare_batch_texture_info(mm->texture, tex_state, tex_info); } r_current_batch->tex_info = tex_info; - instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; + instance_data = new_instance_data(template_instance); r_current_batch->flags |= 1; // multimesh, trails disabled @@ -2785,7 +2754,9 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar _prepare_batch_texture_info(pt->texture, tex_state, tex_info); } r_current_batch->tex_info = tex_info; - instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + 
template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; + instance_data = new_instance_data(template_instance); uint32_t divisor = 1; r_current_batch->mesh_instance_count = particles_storage->particles_get_amount(pt->particles, divisor); @@ -2828,13 +2799,13 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->modulation[2] = modulated.b; instance_data->modulation[3] = modulated.a; - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); } break; case Item::Command::TYPE_TRANSFORM: { const Item::CommandTransform *transform = static_cast(c); draw_transform = transform->xform; - _update_transform_2d_to_mat2x3(base_transform * transform->xform, world); + _update_transform_2d_to_mat2x3(base_transform * transform->xform, template_instance.world); } break; case Item::Command::TYPE_CLIP_IGNORE: { @@ -2906,10 +2877,12 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar if (r_current_batch->tex_info != tex_info) { r_current_batch = _new_batch(r_batch_broken); r_current_batch->tex_info = tex_info; + template_instance.color_texture_pixel_size[0] = tex_info->texpixel_size.width; + template_instance.color_texture_pixel_size[1] = tex_info->texpixel_size.height; } - _update_transform_2d_to_mat2x3(base_transform, world); - InstanceData *instance_data = new_instance_data(world, lights, base_flags, r_index, uniforms_ofs, tex_info); + _update_transform_2d_to_mat2x3(base_transform, template_instance.world); + InstanceData *instance_data = new_instance_data(template_instance); Rect2 src_rect; Rect2 dst_rect; @@ -2941,7 +2914,7 @@ void RendererCanvasRenderRD::_record_item_commands(const Item *p_item, RenderTar instance_data->dst_rect[2] = dst_rect.size.width; instance_data->dst_rect[3] = dst_rect.size.height; - _add_to_batch(r_index, r_batch_broken, r_current_batch); + _add_to_batch(r_batch_broken, r_current_batch); p_item->debug_redraw_time -= RSG::rasterizer->get_frame_delta_time(); @@ -2984,9 +2957,7 @@ void RendererCanvasRenderRD::_render_batch(RD::DrawListID p_draw_list, CanvasSha { RendererRD::TextureStorage *ts = RendererRD::TextureStorage::get_singleton(); - RIDSetKey key( - p_batch->tex_info->state, - state.canvas_instance_data_buffers[state.current_data_buffer_index].instance_buffers[p_batch->instance_buffer_index]); + RIDSetKey key(p_batch->tex_info->state, p_batch->instance_buffer); const RID *uniform_set = rid_set_to_uniform_set.getptr(key); if (uniform_set == nullptr) { @@ -2995,7 +2966,7 @@ void RendererCanvasRenderRD::_render_batch(RD::DrawListID p_draw_list, CanvasSha uniform_ptrw[1] = RD::Uniform(RD::UNIFORM_TYPE_TEXTURE, 1, p_batch->tex_info->normal); uniform_ptrw[2] = RD::Uniform(RD::UNIFORM_TYPE_TEXTURE, 2, p_batch->tex_info->specular); uniform_ptrw[3] = RD::Uniform(RD::UNIFORM_TYPE_SAMPLER, 3, p_batch->tex_info->sampler); - uniform_ptrw[4] = RD::Uniform(RD::UNIFORM_TYPE_STORAGE_BUFFER, 4, state.canvas_instance_data_buffers[state.current_data_buffer_index].instance_buffers[p_batch->instance_buffer_index]); + uniform_ptrw[4] = RD::Uniform(RD::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC, 4, p_batch->instance_buffer); RID rid = RD::get_singleton()->uniform_set_create(state.batch_texture_uniforms, shader.default_version_rd_shader, BATCH_UNIFORM_SET); ERR_FAIL_COND_MSG(rid.is_null(), "Failed to create uniform set for batch."); @@ -3194,10 +3165,24 @@ void 
RendererCanvasRenderRD::_render_batch(RD::DrawListID p_draw_list, CanvasSha } } +RendererCanvasRenderRD::InstanceData *RendererCanvasRenderRD::new_instance_data(const InstanceData &template_instance) { + DEV_ASSERT(state.instance_data != nullptr); + + InstanceData *instance_data = &state.instance_data[state.instance_data_index]; + memcpy(instance_data, &template_instance, sizeof(InstanceData)); + return instance_data; +} + RendererCanvasRenderRD::Batch *RendererCanvasRenderRD::_new_batch(bool &r_batch_broken) { if (state.canvas_instance_batches.is_empty()) { Batch new_batch; - new_batch.instance_buffer_index = state.current_instance_buffer_index; + // This will still be a valid pointer when multiple calls to _render_batch_items + // are made in the same draw call. + if (state.instance_data == nullptr) { + // If there is no existing instance buffer, we must allocate a new one. + _allocate_instance_buffer(); + } + new_batch.instance_buffer = state.instance_buffers._get(0); state.canvas_instance_batches.push_back(new_batch); return state.canvas_instance_batches.ptr(); } @@ -3212,43 +3197,30 @@ RendererCanvasRenderRD::Batch *RendererCanvasRenderRD::_new_batch(bool &r_batch_ Batch new_batch = state.canvas_instance_batches[state.current_batch_index]; new_batch.instance_count = 0; new_batch.start = state.canvas_instance_batches[state.current_batch_index].start + state.canvas_instance_batches[state.current_batch_index].instance_count; - new_batch.instance_buffer_index = state.current_instance_buffer_index; state.current_batch_index++; state.canvas_instance_batches.push_back(new_batch); return &state.canvas_instance_batches[state.current_batch_index]; } -void RendererCanvasRenderRD::_add_to_batch(uint32_t &r_index, bool &r_batch_broken, Batch *&r_current_batch) { +void RendererCanvasRenderRD::_add_to_batch(bool &r_batch_broken, Batch *&r_current_batch) { r_current_batch->instance_count++; - r_index++; - if (r_index + state.last_instance_index >= state.max_instances_per_buffer) { - // Copy over all data needed for rendering right away - // then go back to recording item commands. - RD::get_singleton()->buffer_update( - state.canvas_instance_data_buffers[state.current_data_buffer_index].instance_buffers[state.current_instance_buffer_index], - state.last_instance_index * sizeof(InstanceData), - r_index * sizeof(InstanceData), - state.instance_data_array); + state.instance_data_index++; + if (state.instance_data_index >= state.max_instances_per_buffer) { + RD::get_singleton()->buffer_flush(r_current_batch->instance_buffer); + state.instance_data = nullptr; _allocate_instance_buffer(); - r_index = 0; - state.last_instance_index = 0; + state.instance_data_index = 0; r_batch_broken = false; // Force a new batch to be created r_current_batch = _new_batch(r_batch_broken); r_current_batch->start = 0; + r_current_batch->instance_buffer = state.instance_buffers._get(0); } } void RendererCanvasRenderRD::_allocate_instance_buffer() { - state.current_instance_buffer_index++; - - if (state.current_instance_buffer_index < state.canvas_instance_data_buffers[state.current_data_buffer_index].instance_buffers.size()) { - // We already allocated another buffer in a previous frame, so we can just use it. - return; - } - - // Allocate a new buffer.
- RID buf = RD::get_singleton()->storage_buffer_create(state.max_instance_buffer_size); - state.canvas_instance_data_buffers[state.current_data_buffer_index].instance_buffers.push_back(buf); + state.instance_buffers.prepare_for_upload(); + state.instance_data = reinterpret_cast(state.instance_buffers.map_raw_for_upload(0)); } void RendererCanvasRenderRD::_prepare_batch_texture_info(RID p_texture, TextureState &p_state, TextureInfo *p_info) { @@ -3337,12 +3309,7 @@ RendererCanvasRenderRD::~RendererCanvasRenderRD() { RD::get_singleton()->free_rid(state.shadow_occluder_buffer); } - memdelete_arr(state.instance_data_array); - for (uint32_t i = 0; i < BATCH_DATA_BUFFER_COUNT; i++) { - for (uint32_t j = 0; j < state.canvas_instance_data_buffers[i].instance_buffers.size(); j++) { - RD::get_singleton()->free_rid(state.canvas_instance_data_buffers[i].instance_buffers[j]); - } - } + state.instance_buffers.uninit(); // Disable the callback, as we're tearing everything down texture_storage->canvas_texture_set_invalidation_callback(default_canvas_texture, nullptr, nullptr); diff --git a/servers/rendering/renderer_rd/renderer_canvas_render_rd.h b/servers/rendering/renderer_rd/renderer_canvas_render_rd.h index 6245072bd4d..8f849981889 100644 --- a/servers/rendering/renderer_rd/renderer_canvas_render_rd.h +++ b/servers/rendering/renderer_rd/renderer_canvas_render_rd.h @@ -31,6 +31,7 @@ #pragma once #include "core/templates/lru.h" +#include "servers/rendering/multi_uma_buffer.h" #include "servers/rendering/renderer_canvas_render.h" #include "servers/rendering/renderer_rd/pipeline_hash_map_rd.h" #include "servers/rendering/renderer_rd/shaders/canvas.glsl.gen.h" @@ -495,10 +496,12 @@ class RendererCanvasRenderRD : public RendererCanvasRender { HashMap> canvas_texture_to_uniform_set; struct Batch { - // Position in the UBO measured in bytes + /// First instance index into the instance buffer for this batch. uint32_t start = 0; + /// Number of instances in this batch. uint32_t instance_count = 0; - uint32_t instance_buffer_index = 0; + /// Resource ID of the instance buffer for this batch. + RID instance_buffer; // UMA TextureInfo *tex_info; @@ -528,11 +531,6 @@ class RendererCanvasRenderRD : public RendererCanvasRender { HashMap, PagedAllocator>> texture_info_map; - // per-frame buffers - struct DataBuffer { - LocalVector instance_buffers; - }; - struct State { //state buffer struct Buffer { @@ -555,13 +553,17 @@ class RendererCanvasRenderRD : public RendererCanvasRender { uint32_t flags; }; - DataBuffer canvas_instance_data_buffers[BATCH_DATA_BUFFER_COUNT]; LocalVector canvas_instance_batches; - uint32_t current_data_buffer_index = 0; - uint32_t current_instance_buffer_index = 0; uint32_t current_batch_index = 0; - uint32_t last_instance_index = 0; - InstanceData *instance_data_array = nullptr; + + static_assert(std::is_trivially_destructible_v); + static_assert(std::is_trivially_constructible_v); + + MultiUmaBuffer<1u> instance_buffers = MultiUmaBuffer<1u>("CANVAS_INSTANCE_DATA"); + /// A pointer to the current instance buffer retrieved from instance_buffers. + InstanceData *instance_data = nullptr; + /// The index of the next instance to be added to instance_data. 
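The replacement state is deliberately small: one persistently mapped buffer, a write cursor, and a template instance that each recorded command copies and then patches. A minimal sketch of that write pattern, with hypothetical names (Instance, DynBuffer) standing in for the engine types:

#include <cstdint>
#include <cstring>

// Sketch only; assumes `mapped` points at persistently mapped, CPU-visible memory.
struct Instance { float world[8]; uint32_t flags; uint32_t lights[4]; };

struct DynBuffer {
    Instance *mapped = nullptr; // state.instance_data above.
    uint32_t cursor = 0;        // state.instance_data_index above.
    uint32_t capacity = 16384;  // state.max_instances_per_buffer above.
};

// Copy the pre-filled template into the next free slot. The caller then only
// patches the per-command fields (rects, colors, ...), which is cheaper than
// passing world/lights/flags as separate parameters on every call.
Instance *emit_instance(DynBuffer &buf, const Instance &template_instance) {
    Instance *slot = &buf.mapped[buf.cursor];
    std::memcpy(slot, &template_instance, sizeof(Instance));
    return slot; // The cursor advances when the instance joins a batch.
}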
+ uint32_t instance_data_index = 0; uint32_t max_instances_per_buffer = 16384; uint32_t max_instance_buffer_size = 16384 * sizeof(InstanceData); @@ -619,12 +621,14 @@ class RendererCanvasRenderRD : public RendererCanvasRender { inline RID _get_pipeline_specialization_or_ubershader(CanvasShaderData *p_shader_data, PipelineKey &r_pipeline_key, PushConstant &r_push_constant, RID p_mesh_instance = RID(), void *p_surface = nullptr, uint32_t p_surface_index = 0, RID *r_vertex_array = nullptr); void _render_batch_items(RenderTarget p_to_render_target, int p_item_count, const Transform2D &p_canvas_transform_inverse, Light *p_lights, bool &r_sdf_used, bool p_to_backbuffer = false, RenderingMethod::RenderInfo *r_render_info = nullptr); - void _record_item_commands(const Item *p_item, RenderTarget p_render_target, const Transform2D &p_base_transform, Item *&r_current_clip, Light *p_lights, uint32_t &r_index, bool &r_batch_broken, bool &r_sdf_used, Batch *&r_current_batch); + void _record_item_commands(const Item *p_item, RenderTarget p_render_target, const Transform2D &p_base_transform, Item *&r_current_clip, Light *p_lights, bool &r_batch_broken, bool &r_sdf_used, Batch *&r_current_batch); void _render_batch(RD::DrawListID p_draw_list, CanvasShaderData *p_shader_data, RenderingDevice::FramebufferFormatID p_framebuffer_format, Light *p_lights, Batch const *p_batch, RenderingMethod::RenderInfo *r_render_info = nullptr); void _prepare_batch_texture_info(RID p_texture, TextureState &p_state, TextureInfo *p_info); - InstanceData *new_instance_data(float *p_world, uint32_t *p_lights, uint32_t p_base_flags, uint32_t p_index, uint32_t p_uniforms_ofs, TextureInfo *p_info); + + // non-UMA + InstanceData *new_instance_data(const InstanceData &template_instance); [[nodiscard]] Batch *_new_batch(bool &r_batch_broken); - void _add_to_batch(uint32_t &r_index, bool &r_batch_broken, Batch *&r_current_batch); + void _add_to_batch(bool &r_batch_broken, Batch *&r_current_batch); void _allocate_instance_buffer(); _FORCE_INLINE_ void _update_transform_2d_to_mat2x4(const Transform2D &p_transform, float *p_mat2x4); diff --git a/servers/rendering/renderer_rd/shader_rd.cpp b/servers/rendering/renderer_rd/shader_rd.cpp index c2f723803f0..1429a7d21d2 100644 --- a/servers/rendering/renderer_rd/shader_rd.cpp +++ b/servers/rendering/renderer_rd/shader_rd.cpp @@ -302,7 +302,7 @@ void ShaderRD::_compile_variant(uint32_t p_variant, CompileData p_data) { } Vector variant_stage_sources = _build_variant_stage_sources(variant, p_data); - Vector variant_stages = compile_stages(variant_stage_sources); + Vector variant_stages = compile_stages(variant_stage_sources, dynamic_buffers); ERR_FAIL_COND(variant_stages.is_empty()); Vector shader_data = RD::get_singleton()->shader_compile_binary_from_spirv(variant_stages, name + ":" + itos(variant)); @@ -783,6 +783,10 @@ const String &ShaderRD::get_name() const { return name; } +const Vector &ShaderRD::get_dynamic_buffers() const { + return dynamic_buffers; +} + bool ShaderRD::shader_cache_cleanup_on_start = false; ShaderRD::ShaderRD() { @@ -801,12 +805,13 @@ ShaderRD::ShaderRD() { base_compute_defines = base_compute_define_text.ascii(); } -void ShaderRD::initialize(const Vector &p_variant_defines, const String &p_general_defines, const Vector &p_immutable_samplers) { +void ShaderRD::initialize(const Vector &p_variant_defines, const String &p_general_defines, const Vector &p_immutable_samplers, const Vector &p_dynamic_buffers) { ERR_FAIL_COND(variant_defines.size()); 
ERR_FAIL_COND(p_variant_defines.is_empty()); general_defines = p_general_defines.utf8(); immutable_samplers = p_immutable_samplers; + dynamic_buffers = p_dynamic_buffers; // When initialized this way, there is just one group and its always enabled. group_to_variant_map.insert(0, LocalVector{}); @@ -846,6 +851,11 @@ void ShaderRD::_initialize_cache() { hash_build.append(variant_defines[E.value[i]].text.get_data()); } + for (const uint64_t dyn_buffer : dynamic_buffers) { + hash_build.append("[dynamic_buffer]"); + hash_build.append(uitos(dyn_buffer)); + } + group_sha256[E.key] = hash_build.as_string().sha256_text(); if (!shader_cache_user_dir.is_empty()) { @@ -880,12 +890,13 @@ void ShaderRD::_initialize_cache() { } // Same as above, but allows specifying shader compilation groups. -void ShaderRD::initialize(const Vector &p_variant_defines, const String &p_general_defines, const Vector &p_immutable_samplers) { +void ShaderRD::initialize(const Vector &p_variant_defines, const String &p_general_defines, const Vector &p_immutable_samplers, const Vector &p_dynamic_buffers) { ERR_FAIL_COND(variant_defines.size()); ERR_FAIL_COND(p_variant_defines.is_empty()); general_defines = p_general_defines.utf8(); immutable_samplers = p_immutable_samplers; + dynamic_buffers = p_dynamic_buffers; int max_group_id = 0; @@ -962,7 +973,7 @@ void ShaderRD::set_shader_cache_save_debug(bool p_enable) { shader_cache_save_debug = p_enable; } -Vector ShaderRD::compile_stages(const Vector &p_stage_sources) { +Vector ShaderRD::compile_stages(const Vector &p_stage_sources, const Vector &p_dynamic_buffers) { RD::ShaderStageSPIRVData stage; Vector stages; String error; @@ -974,6 +985,7 @@ Vector ShaderRD::compile_stages(const Vector & } stage.spirv = RD::get_singleton()->shader_compile_spirv_from_source(RD::ShaderStage(i), p_stage_sources[i], RD::SHADER_LANGUAGE_GLSL, &error); + stage.dynamic_buffers = p_dynamic_buffers; stage.shader_stage = RD::ShaderStage(i); if (!stage.spirv.is_empty()) { stages.push_back(stage); diff --git a/servers/rendering/renderer_rd/shader_rd.h b/servers/rendering/renderer_rd/shader_rd.h index 38bca63684c..e5bd4e5bb30 100644 --- a/servers/rendering/renderer_rd/shader_rd.h +++ b/servers/rendering/renderer_rd/shader_rd.h @@ -65,6 +65,7 @@ private: Vector group_enabled; Vector immutable_samplers; + Vector dynamic_buffers; struct Version { Mutex *mutex = nullptr; @@ -225,6 +226,8 @@ public: const String &get_name() const; + const Vector &get_dynamic_buffers() const; + static void shaders_embedded_set_lock(); static const ShaderVersionPairSet &shaders_embedded_set_get(); static void shaders_embedded_set_unlock(); @@ -237,15 +240,26 @@ public: static void set_shader_cache_save_compressed_zstd(bool p_enable); static void set_shader_cache_save_debug(bool p_enable); - static Vector compile_stages(const Vector &p_stage_sources); + static Vector compile_stages(const Vector &p_stage_sources, const Vector &p_dynamic_buffers); static PackedByteArray save_shader_cache_bytes(const LocalVector &p_variants, const Vector> &p_variant_data); Vector version_build_variant_stage_sources(RID p_version, int p_variant); RS::ShaderNativeSourceCode version_get_native_source_code(RID p_version); String version_get_cache_file_relative_path(RID p_version, int p_group, const String &p_api_name); - void initialize(const Vector &p_variant_defines, const String &p_general_defines = "", const Vector &p_immutable_samplers = Vector()); - void initialize(const Vector &p_variant_defines, const String &p_general_defines = "", const Vector 
&p_immutable_samplers = Vector()); + struct DynamicBuffer { + static uint64_t encode(uint32_t p_set_id, uint32_t p_binding) { + return uint64_t(p_set_id) << 32ul | uint64_t(p_binding); + } + }; + + // p_dynamic_buffers specifies which buffers will be persistent/dynamic when used. + // See DynamicBuffer::encode. We need this argument because SPIR-V does not distinguish between a + // uniform buffer and a dynamic uniform buffer. At shader level they're the same thing, but the PSO + // is created slightly differently and they're bound differently. + // On D3D12 the Root Layout is also different. + void initialize(const Vector &p_variant_defines, const String &p_general_defines = "", const Vector &p_immutable_samplers = Vector(), const Vector &p_dynamic_buffers = Vector()); + void initialize(const Vector &p_variant_defines, const String &p_general_defines = "", const Vector &p_immutable_samplers = Vector(), const Vector &p_dynamic_buffers = Vector()); virtual ~ShaderRD(); }; diff --git a/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h b/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h index 9d3aa409212..78fea1a66be 100644 --- a/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h +++ b/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h @@ -94,6 +94,8 @@ public: void update_ubo(RID p_uniform_buffer, RS::ViewportDebugDraw p_debug_mode, RID p_env, RID p_reflection_probe_instance, RID p_camera_attributes, bool p_pancake_shadows, const Size2i &p_screen_size, const Color &p_default_bg_color, float p_luminance_multiplier, bool p_opaque_render_buffers, bool p_apply_alpha_multiplier); virtual RID get_uniform_buffer() const override; + static uint32_t get_uniform_buffer_size_bytes() { return sizeof(UBODATA); } + private: RID uniform_buffer; // loaded into this uniform buffer (supplied externally) diff --git a/servers/rendering/rendering_device.cpp b/servers/rendering/rendering_device.cpp index 3d2caeeaa31..fac973716e5 100644 --- a/servers/rendering/rendering_device.cpp +++ b/servers/rendering/rendering_device.cpp @@ -267,7 +267,7 @@ Error RenderingDevice::_buffer_initialize(Buffer *p_buffer, Span p_data Error RenderingDevice::_insert_staging_block(StagingBuffers &p_staging_buffers) { StagingBufferBlock block; - block.driver_id = driver->buffer_create(p_staging_buffers.block_size, p_staging_buffers.usage_bits, RDD::MEMORY_ALLOCATION_TYPE_CPU); + block.driver_id = driver->buffer_create(p_staging_buffers.block_size, p_staging_buffers.usage_bits, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn); ERR_FAIL_COND_V(!block.driver_id, ERR_CANT_CREATE); block.frame_used = 0; @@ -455,19 +455,29 @@ Error RenderingDevice::buffer_copy(RID p_src_buffer, RID p_dst_buffer, uint32_t return OK; } -Error RenderingDevice::buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data) { +Error RenderingDevice::buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data, bool p_skip_check) { ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE); copy_bytes_count += p_size; + + ERR_FAIL_COND_V_MSG(draw_list.active && !p_skip_check, ERR_INVALID_PARAMETER, "Updating buffers is forbidden during creation of a draw list"); - ERR_FAIL_COND_V_MSG(compute_list.active, ERR_INVALID_PARAMETER, + ERR_FAIL_COND_V_MSG(compute_list.active && !p_skip_check, ERR_INVALID_PARAMETER, "Updating buffers is forbidden during creation of a compute list"); Buffer *buffer = _get_buffer_from_owner(p_buffer);
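Since SPIR-V reflection alone cannot distinguish a dynamic UBO/SSBO from a regular one, callers must declare them up front as (set, binding) pairs packed by encode(). A hypothetical declaration (the set/binding numbers below are illustrative, not from a real Godot shader):

// Hypothetical shader with a dynamic UBO at set 1, binding 0 and a dynamic
// SSBO at set 1, binding 4. encode() packs the set into the high 32 bits.
Vector<uint64_t> dynamic_buffers;
dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(1u, 0u)); // 0x0000000100000000
dynamic_buffers.push_back(ShaderRD::DynamicBuffer::encode(1u, 4u)); // 0x0000000100000004
shader.initialize(variant_defines, general_defines, immutable_samplers, dynamic_buffers);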
ERR_FAIL_NULL_V_MSG(buffer, ERR_INVALID_PARAMETER, "Buffer argument is not a valid buffer of any type."); ERR_FAIL_COND_V_MSG(p_offset + p_size > buffer->size, ERR_INVALID_PARAMETER, "Attempted to write buffer (" + itos((p_offset + p_size) - buffer->size) + " bytes) past the end."); + if (buffer->usage.has_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT)) { + uint8_t *dst_data = driver->buffer_persistent_map_advance(buffer->driver_id, frames_drawn); + + memcpy(dst_data + p_offset, p_data, p_size); + direct_copy_count++; + buffer_flush(p_buffer); + return OK; + } + _check_transfer_worker_buffer(buffer); // Submitting may get chunked for various reasons, so convert this to a task. @@ -597,8 +607,9 @@ Error RenderingDevice::driver_callback_add(RDD::DriverCallback p_callback, void String RenderingDevice::get_perf_report() const { String perf_report_text; - perf_report_text += " gpu:" + String::num_int64(prev_gpu_copy_count); - perf_report_text += " bytes:" + String::num_int64(prev_copy_bytes_count); + perf_report_text += " gpu:" + String::num_int64(gpu_copy_count); + perf_report_text += " direct:" + String::num_int64(direct_copy_count); + perf_report_text += " bytes:" + String::num_int64(copy_bytes_count); perf_report_text += " lazily alloc:" + String::num_int64(driver->get_lazily_memory_used()); return perf_report_text; @@ -608,6 +619,7 @@ void RenderingDevice::update_perf_report() { prev_gpu_copy_count = gpu_copy_count; prev_copy_bytes_count = copy_bytes_count; gpu_copy_count = 0; + direct_copy_count = 0; copy_bytes_count = 0; } @@ -659,7 +671,7 @@ Vector RenderingDevice::buffer_get_data(RID p_buffer, uint32_t p_offset _check_transfer_worker_buffer(buffer); - RDD::BufferID tmp_buffer = driver->buffer_create(buffer->size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU); + RDD::BufferID tmp_buffer = driver->buffer_create(buffer->size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn); ERR_FAIL_COND_V(!tmp_buffer, Vector()); RDD::BufferCopyRegion region; @@ -784,12 +796,38 @@ uint64_t RenderingDevice::buffer_get_device_address(RID p_buffer) { return driver->buffer_get_device_address(buffer->driver_id); } +uint8_t *RenderingDevice::buffer_persistent_map_advance(RID p_buffer) { + ERR_RENDER_THREAD_GUARD_V(0); + + Buffer *buffer = _get_buffer_from_owner(p_buffer); + ERR_FAIL_NULL_V_MSG(buffer, nullptr, "Buffer argument is not a valid buffer of any type."); + direct_copy_count++; + return driver->buffer_persistent_map_advance(buffer->driver_id, frames_drawn); +} + +void RenderingDevice::buffer_flush(RID p_buffer) { + ERR_RENDER_THREAD_GUARD(); + + Buffer *buffer = _get_buffer_from_owner(p_buffer); + ERR_FAIL_NULL_MSG(buffer, "Buffer argument is not a valid buffer of any type."); + driver->buffer_flush(buffer->driver_id); +} + RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, Span p_data, BitField p_usage, BitField p_creation_bits) { ERR_FAIL_COND_V(p_data.size() && (uint32_t)p_data.size() != p_size_bytes, RID()); Buffer buffer; buffer.size = p_size_bytes; buffer.usage = (RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT | RDD::BUFFER_USAGE_STORAGE_BIT); + if (p_creation_bits.has_flag(BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT)) { + buffer.usage.set_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT); + + // This is a precaution: Persistent buffers are meant for frequent CPU -> GPU transfers. + // Writing to this buffer from GPU might cause sync issues if both CPU & GPU try to write at the + // same time. 
It's probably fine (since CPU always advances the pointer before writing) but let's + // stick to the known/intended use cases and scream if we deviate from it. + buffer.usage.clear_flag(RDD::BUFFER_USAGE_TRANSFER_TO_BIT); + } if (p_usage.has_flag(STORAGE_BUFFER_USAGE_DISPATCH_INDIRECT)) { buffer.usage.set_flag(RDD::BUFFER_USAGE_INDIRECT_BIT); } @@ -801,7 +839,7 @@ RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, Span buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT); } - buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU); + buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn); ERR_FAIL_COND_V(!buffer.driver_id, RID()); // Storage buffers are assumed to be mutable. @@ -833,7 +871,7 @@ RID RenderingDevice::texture_buffer_create(uint32_t p_size_elements, DataFormat Buffer texture_buffer; texture_buffer.size = size_bytes; BitField usage = (RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT | RDD::BUFFER_USAGE_TEXEL_BIT); - texture_buffer.driver_id = driver->buffer_create(size_bytes, usage, RDD::MEMORY_ALLOCATION_TYPE_GPU); + texture_buffer.driver_id = driver->buffer_create(size_bytes, usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn); ERR_FAIL_COND_V(!texture_buffer.driver_id, RID()); // Texture buffers are assumed to be immutable unless they don't have initial data. @@ -1884,7 +1922,7 @@ void RenderingDevice::_texture_create_reinterpret_buffer(Texture *p_texture) { uint32_t pixel_bytes = get_image_format_pixel_size(p_texture->format); uint32_t row_pitch = STEPIFY(p_texture->width * pixel_bytes, row_pitch_step); uint64_t buffer_size = STEPIFY(pixel_bytes * row_pitch * p_texture->height * p_texture->depth, transfer_alignment); - p_texture->shared_fallback->buffer = driver->buffer_create(buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_GPU); + p_texture->shared_fallback->buffer = driver->buffer_create(buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT | RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn); buffer_memory += driver->buffer_get_allocation_size(p_texture->shared_fallback->buffer); RDG::ResourceTracker *tracker = RDG::resource_tracker_create(); @@ -1938,7 +1976,7 @@ Vector RenderingDevice::texture_get_data(RID p_texture, uint32_t p_laye work_buffer_size = STEPIFY(work_buffer_size, work_mip_alignment) + mip_layouts[i].size; } - RDD::BufferID tmp_buffer = driver->buffer_create(work_buffer_size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU); + RDD::BufferID tmp_buffer = driver->buffer_create(work_buffer_size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn); ERR_FAIL_COND_V(!tmp_buffer, Vector()); thread_local LocalVector command_buffer_texture_copy_regions_vector; @@ -3052,7 +3090,7 @@ RID RenderingDevice::vertex_buffer_create(uint32_t p_size_bytes, Span p if (p_creation_bits.has_flag(BUFFER_CREATION_DEVICE_ADDRESS_BIT)) { buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT); } - buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU); + buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn); ERR_FAIL_COND_V(!buffer.driver_id, RID()); // Vertex buffers are assumed to be immutable unless they don't have initial data or they've been marked for storage explicitly. 
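Taken together, the intended engine-internal usage of the new creation bit reduces to three calls per buffer. A sketch under the signatures added in this patch; size_bytes, cpu_data, and the zero usage flags are placeholders, and error handling is omitted:

// Created once. The driver allocates one region per frame in flight behind a
// single persistently mapped allocation.
RID buf = RD::get_singleton()->storage_buffer_create(
		size_bytes, Span<uint8_t>(), 0, RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);

// Once per frame: advance to this frame's region, write, flush before drawing.
uint8_t *dst = RD::get_singleton()->buffer_persistent_map_advance(buf);
memcpy(dst, cpu_data, size_bytes);
RD::get_singleton()->buffer_flush(buf);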
@@ -3224,7 +3262,7 @@ RID RenderingDevice::index_buffer_create(uint32_t p_index_count, IndexBufferForm if (p_creation_bits.has_flag(BUFFER_CREATION_DEVICE_ADDRESS_BIT)) { index_buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT); } - index_buffer.driver_id = driver->buffer_create(index_buffer.size, index_buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU); + index_buffer.driver_id = driver->buffer_create(index_buffer.size, index_buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn); ERR_FAIL_COND_V(!index_buffer.driver_id, RID()); // Index buffers are assumed to be immutable unless they don't have initial data. @@ -3279,7 +3317,7 @@ RID RenderingDevice::index_array_create(RID p_index_buffer, uint32_t p_index_off /****************/ static const char *SHADER_UNIFORM_NAMES[RenderingDevice::UNIFORM_TYPE_MAX] = { - "Sampler", "CombinedSampler", "Texture", "Image", "TextureBuffer", "SamplerTextureBuffer", "ImageBuffer", "UniformBuffer", "StorageBuffer", "InputAttachment" + "Sampler", "CombinedSampler", "Texture", "Image", "TextureBuffer", "SamplerTextureBuffer", "ImageBuffer", "UniformBuffer", "UniformBufferDynamic", "StorageBuffer", "StorageBufferDynamic", "InputAttachment" }; String RenderingDevice::_shader_uniform_debug(RID p_shader, int p_set) { @@ -3450,7 +3488,16 @@ RID RenderingDevice::uniform_buffer_create(uint32_t p_size_bytes, Span if (p_creation_bits.has_flag(BUFFER_CREATION_DEVICE_ADDRESS_BIT)) { buffer.usage.set_flag(RDD::BUFFER_USAGE_DEVICE_ADDRESS_BIT); } - buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU); + if (p_creation_bits.has_flag(BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT)) { + buffer.usage.set_flag(RDD::BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT); + + // This is a precaution: Persistent buffers are meant for frequent CPU -> GPU transfers. + // Writing to this buffer from GPU might cause sync issues if both CPU & GPU try to write at the + // same time. It's probably fine (since CPU always advances the pointer before writing) but let's + // stick to the known/intended use cases and scream if we deviate from it. + buffer.usage.clear_flag(RDD::BUFFER_USAGE_TRANSFER_TO_BIT); + } + buffer.driver_id = driver->buffer_create(buffer.size, buffer.usage, RDD::MEMORY_ALLOCATION_TYPE_GPU, frames_drawn); ERR_FAIL_COND_V(!buffer.driver_id, RID()); // Uniform buffers are assumed to be immutable unless they don't have initial data. @@ -3527,8 +3574,7 @@ RID RenderingDevice::uniform_set_create(const VectorView &p_uniform const Uniform &uniform = uniforms[uniform_idx]; ERR_FAIL_INDEX_V(uniform.uniform_type, RD::UNIFORM_TYPE_MAX, RID()); - ERR_FAIL_COND_V_MSG(uniform.uniform_type != set_uniform.type, RID(), - "Mismatch uniform type for binding (" + itos(set_uniform.binding) + "), set (" + itos(p_shader_set) + "). Expected '" + SHADER_UNIFORM_NAMES[set_uniform.type] + "', supplied: '" + SHADER_UNIFORM_NAMES[uniform.uniform_type] + "'."); + ERR_FAIL_COND_V_MSG(uniform.uniform_type != set_uniform.type, RID(), "Shader '" + shader->name + "' Mismatch uniform type for binding (" + itos(set_uniform.binding) + "), set (" + itos(p_shader_set) + "). Expected '" + SHADER_UNIFORM_NAMES[set_uniform.type] + "', supplied: '" + SHADER_UNIFORM_NAMES[uniform.uniform_type] + "'."); RDD::BoundUniform &driver_uniform = driver_uniforms[i]; driver_uniform.type = uniform.uniform_type; @@ -3759,7 +3805,8 @@ RID RenderingDevice::uniform_set_create(const VectorView &p_uniform case UNIFORM_TYPE_IMAGE_BUFFER: { // Todo. 
} break; - case UNIFORM_TYPE_UNIFORM_BUFFER: { + case UNIFORM_TYPE_UNIFORM_BUFFER: + case UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC: { ERR_FAIL_COND_V_MSG(uniform.get_id_count() != 1, RID(), "Uniform buffer supplied (binding: " + itos(uniform.binding) + ") must provide one ID (" + itos(uniform.get_id_count()) + " provided)."); @@ -3780,7 +3827,8 @@ RID RenderingDevice::uniform_set_create(const VectorView &p_uniform driver_uniform.ids.push_back(buffer->driver_id); _check_transfer_worker_buffer(buffer); } break; - case UNIFORM_TYPE_STORAGE_BUFFER: { + case UNIFORM_TYPE_STORAGE_BUFFER: + case UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC: { ERR_FAIL_COND_V_MSG(uniform.get_id_count() != 1, RID(), "Storage buffer supplied (binding: " + itos(uniform.binding) + ") must provide one ID (" + itos(uniform.get_id_count()) + " provided)."); @@ -5630,7 +5678,7 @@ RenderingDevice::TransferWorker *RenderingDevice::_acquire_transfer_worker(uint3 uint32_t new_staging_buffer_size = next_power_of_2(expected_buffer_size); transfer_worker->staging_buffer_size_allocated = new_staging_buffer_size; - transfer_worker->staging_buffer = driver->buffer_create(new_staging_buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU); + transfer_worker->staging_buffer = driver->buffer_create(new_staging_buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU, frames_drawn); } } @@ -7786,6 +7834,8 @@ void RenderingDevice::_bind_methods() { BIND_BITFIELD_FLAG(BUFFER_CREATION_DEVICE_ADDRESS_BIT); BIND_BITFIELD_FLAG(BUFFER_CREATION_AS_STORAGE_BIT); + // Not exposed on purpose. This flag is too dangerous to be exposed to regular GD users. + //BIND_BITFIELD_FLAG(BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT); BIND_ENUM_CONSTANT(UNIFORM_TYPE_SAMPLER); //for sampling only (sampler GLSL type) BIND_ENUM_CONSTANT(UNIFORM_TYPE_SAMPLER_WITH_TEXTURE); // for sampling only, but includes a texture, (samplerXX GLSL type), first a sampler then a texture @@ -7797,6 +7847,8 @@ void RenderingDevice::_bind_methods() { BIND_ENUM_CONSTANT(UNIFORM_TYPE_UNIFORM_BUFFER); //regular uniform buffer (or UBO). BIND_ENUM_CONSTANT(UNIFORM_TYPE_STORAGE_BUFFER); //storage buffer ("buffer" qualifier) like UBO, but supports storage, for compute mostly BIND_ENUM_CONSTANT(UNIFORM_TYPE_INPUT_ATTACHMENT); //used for sub-pass read/write, for mobile mostly + BIND_ENUM_CONSTANT(UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC); // Exposed in case a BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT buffer created by C++ makes it into GD users. + BIND_ENUM_CONSTANT(UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC); // Exposed in case a BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT buffer created by C++ makes it into GD users. BIND_ENUM_CONSTANT(UNIFORM_TYPE_MAX); BIND_ENUM_CONSTANT(RENDER_PRIMITIVE_POINTS); diff --git a/servers/rendering/rendering_device.h b/servers/rendering/rendering_device.h index 312dcfcc130..a3c0ce4ad15 100644 --- a/servers/rendering/rendering_device.h +++ b/servers/rendering/rendering_device.h @@ -189,6 +189,7 @@ private: // swapchain semaphore to be signaled (which causes bubbles).
bool split_swapchain_into_its_own_cmd_buffer = true; uint32_t gpu_copy_count = 0; + uint32_t direct_copy_count = 0; uint32_t copy_bytes_count = 0; uint32_t prev_gpu_copy_count = 0; uint32_t prev_copy_bytes_count = 0; @@ -206,11 +207,55 @@ private: public: Error buffer_copy(RID p_src_buffer, RID p_dst_buffer, uint32_t p_src_offset, uint32_t p_dst_offset, uint32_t p_size); - Error buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data); + /** + * @brief Updates the given GPU buffer at offset and size with the given CPU data. + * @remarks + * Buffer update is queued into the render graph. The render graph will reorder this operation so + * that it happens together with other buffer_update() in bulk and before rendering operations + * (or compute dispatches) that need it. + * + * This means that the following will not work as intended: + * @code + * buffer_update(buffer_a, ..., data_source_x, ...); + * draw_list_draw(buffer_a); // render data_source_x. + * buffer_update(buffer_a, ..., data_source_y, ...); + * draw_list_draw(buffer_a); // render data_source_y. + * @endcode + * + * Because it will be *reordered* to become the following: + * @code + * buffer_update(buffer_a, ..., data_source_x, ...); + * buffer_update(buffer_a, ..., data_source_y, ...); + * draw_list_draw(buffer_a); // render data_source_y. <-- Oops! should be data_source_x + * draw_list_draw(buffer_a); // render data_source_y. + * @endcode + * + * When p_skip_check = false, we will perform checks to prevent this situation from happening + * (buffer_update must not be called while creating a draw or compute list). + * Do NOT set it to true for user-facing public API because users had trouble understanding + * this problem when manually creating draw lists. + * + * Godot internally can set p_skip_check = true when it believes it will only update + * the buffer once and it needs to be done while a draw/compute list is being created. + * + * Important: The Vulkan & Metal APIs do not allow issuing copies while inside a RenderPass. + * We can do it because Godot's render graph will reorder them. + * + * @param p_buffer GPU buffer to update. + * @param p_offset Offset in bytes (relative to p_buffer). + * @param p_size Size in bytes of the data. + * @param p_data CPU data to transfer to GPU. + * Pointer can be deleted after buffer_update returns. + * @param p_skip_check Must always be false for user-facing public API. See remarks. + * @return Status result of the operation. + */ + Error buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data, bool p_skip_check = false); Error buffer_clear(RID p_buffer, uint32_t p_offset, uint32_t p_size); Vector buffer_get_data(RID p_buffer, uint32_t p_offset = 0, uint32_t p_size = 0); // This causes stall, only use to retrieve large buffers for saving.
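The dynamic-buffer branch added to buffer_update() above is what sidesteps the reordering hazard described in these remarks: for a buffer created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT, the copy happens immediately through the persistent mapping of the current frame's region instead of being queued in the render graph, so there is nothing left to reorder. A hypothetical per-frame scene UBO (SceneData is a placeholder type) shows the intended call order:

// scene_ubo was created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT.
// For such a buffer this is an immediate memcpy into this frame's mapped
// region (plus a flush), so it must run once per frame, before the draw
// lists that read it.
RD::get_singleton()->buffer_update(scene_ubo, 0, sizeof(SceneData), &scene_data);
// ... then record draw lists that bind scene_ubo as a *_BUFFER_DYNAMIC uniform.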
Error buffer_get_data_async(RID p_buffer, const Callable &p_callback, uint32_t p_offset = 0, uint32_t p_size = 0); uint64_t buffer_get_device_address(RID p_buffer); + uint8_t *buffer_persistent_map_advance(RID p_buffer); + void buffer_flush(RID p_buffer); private: /******************/ @@ -788,6 +833,7 @@ public: enum BufferCreationBits { BUFFER_CREATION_DEVICE_ADDRESS_BIT = (1 << 0), BUFFER_CREATION_AS_STORAGE_BIT = (1 << 1), + BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT = (1 << 2), }; enum StorageBufferUsage { @@ -1656,6 +1702,8 @@ public: String get_device_api_version() const; String get_device_pipeline_cache_uuid() const; + uint64_t get_frames_drawn() const { return frames_drawn; } + bool is_composite_alpha_supported() const; uint64_t get_driver_resource(DriverResource p_resource, RID p_rid = RID(), uint64_t p_index = 0); diff --git a/servers/rendering/rendering_device_commons.h b/servers/rendering/rendering_device_commons.h index 738f3c8680d..e51566453a4 100644 --- a/servers/rendering/rendering_device_commons.h +++ b/servers/rendering/rendering_device_commons.h @@ -607,6 +607,7 @@ public: struct ShaderStageSPIRVData { ShaderStage shader_stage = SHADER_STAGE_MAX; Vector spirv; + Vector dynamic_buffers; }; /*********************/ @@ -626,6 +627,8 @@ public: UNIFORM_TYPE_UNIFORM_BUFFER, // Regular uniform buffer (or UBO). UNIFORM_TYPE_STORAGE_BUFFER, // Storage buffer ("buffer" qualifier) like UBO, but supports storage, for compute mostly. UNIFORM_TYPE_INPUT_ATTACHMENT, // Used for sub-pass read/write, for mobile mostly. + UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC, // Same as UNIFORM but created with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. + UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC, // Same as STORAGE but created with BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT. UNIFORM_TYPE_MAX }; @@ -1062,6 +1065,7 @@ public: uint32_t fragment_output_mask = 0; bool is_compute = false; bool has_multiview = false; + bool has_dynamic_buffers = false; uint32_t compute_local_size[3] = {}; uint32_t push_constant_size = 0; diff --git a/servers/rendering/rendering_device_driver.h b/servers/rendering/rendering_device_driver.h index 2d208a5c8fd..abf11ef0fc7 100644 --- a/servers/rendering/rendering_device_driver.h +++ b/servers/rendering/rendering_device_driver.h @@ -171,19 +171,30 @@ public: BUFFER_USAGE_VERTEX_BIT = (1 << 7), BUFFER_USAGE_INDIRECT_BIT = (1 << 8), BUFFER_USAGE_DEVICE_ADDRESS_BIT = (1 << 17), + // There is no Vulkan equivalent. Try to use unused/unclaimed bits. + BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT = (1 << 31), }; enum { BUFFER_WHOLE_SIZE = ~0ULL }; - virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type) = 0; + /** Allocates a new GPU buffer. Must be destroyed with buffer_free(). + * @param p_size The size in bytes of the buffer. + * @param p_usage Usage flags. + * @param p_allocation_type See MemoryAllocationType. + * @param p_frames_drawn Used for debug checks when BUFFER_USAGE_DYNAMIC_PERSISTENT_BIT is set. + * @return The buffer. + */ + virtual BufferID buffer_create(uint64_t p_size, BitField p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) = 0; // Only for a buffer with BUFFER_USAGE_TEXEL_BIT.
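Drivers may implement the per-frame advance however their allocator prefers; conceptually it is a ring of frames-in-flight regions inside one allocation that is mapped exactly once, so the CPU never writes a region the GPU may still be reading. A generic sketch with hypothetical field names (not the actual driver bookkeeping):

#include <cstdint>

// One allocation of region_size * frame_count bytes, mapped once at creation.
struct DynamicBufferInfo {
	uint8_t *persistent_ptr = nullptr; // Base of the mapped allocation.
	uint64_t region_size = 0;          // The size the caller asked for.
	uint32_t frame_count = 0;          // Frames in flight.
	uint32_t frame_idx = 0;            // Region currently owned by the CPU.
};

uint8_t *persistent_map_advance(DynamicBufferInfo &buf) {
	buf.frame_idx = (buf.frame_idx + 1u) % buf.frame_count;
	return buf.persistent_ptr + buf.frame_idx * buf.region_size;
}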
virtual bool buffer_set_texel_format(BufferID p_buffer, DataFormat p_format) = 0; virtual void buffer_free(BufferID p_buffer) = 0; virtual uint64_t buffer_get_allocation_size(BufferID p_buffer) = 0; virtual uint8_t *buffer_map(BufferID p_buffer) = 0; virtual void buffer_unmap(BufferID p_buffer) = 0; + virtual uint8_t *buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) = 0; + virtual void buffer_flush(BufferID p_buffer) {} // Only for a buffer with BUFFER_USAGE_DEVICE_ADDRESS_BIT. virtual uint64_t buffer_get_device_address(BufferID p_buffer) = 0; @@ -499,12 +510,17 @@ public: // Flag to indicate that this is an immutable sampler so it is skipped when creating uniform // sets, as it would be set previously when creating the pipeline layout. bool immutable_sampler = false; + + _FORCE_INLINE_ bool is_dynamic() const { + return type == UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC || type == UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC; + } }; virtual UniformSetID uniform_set_create(VectorView p_uniforms, ShaderID p_shader, uint32_t p_set_index, int p_linear_pool_index) = 0; virtual void linear_uniform_set_pools_reset(int p_linear_pool_index) {} virtual void uniform_set_free(UniformSetID p_uniform_set) = 0; virtual bool uniform_sets_have_linear_pools() const { return false; } + virtual uint32_t uniform_sets_get_dynamic_offsets(VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const = 0; // ----- COMMANDS ----- @@ -646,8 +662,7 @@ public: // Binding. virtual void command_bind_render_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) = 0; - virtual void command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) = 0; - virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) = 0; + virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) = 0; // Drawing. virtual void command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) = 0; @@ -689,8 +704,7 @@ public: // Binding. virtual void command_bind_compute_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) = 0; - virtual void command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) = 0; - virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) = 0; + virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) = 0; // Dispatching. 
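On the Vulkan side, a mask like the p_dynamic_offsets parameter above is typically expanded at bind time into the offset array vkCmdBindDescriptorSets() expects; since every dynamic buffer advances in lockstep with the frame index, the driver can derive the offsets itself. A Vulkan-flavoured sketch, where frame_idx and region_sizes are assumptions rather than engine code:

#include <vulkan/vulkan.h>

// Expand the dynamic-binding mask into the offsets vkCmdBindDescriptorSets()
// expects. Each offset selects the current frame's region of that buffer.
void bind_with_dynamic_offsets(VkCommandBuffer cmd, VkPipelineLayout layout,
		uint32_t first_set, uint32_t set_count, const VkDescriptorSet *sets,
		uint32_t dynamic_mask, uint32_t frame_idx, const uint32_t *region_sizes) {
	uint32_t offsets[32];
	uint32_t offset_count = 0;
	for (uint32_t mask = dynamic_mask; mask != 0u; mask &= mask - 1u) {
		const uint32_t idx = __builtin_ctz(mask); // Lowest set bit = next dynamic binding.
		offsets[offset_count++] = frame_idx * region_sizes[idx];
	}
	vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layout,
			first_set, set_count, sets, offset_count, offsets);
}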
virtual void command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) = 0; diff --git a/servers/rendering/rendering_device_graph.cpp b/servers/rendering/rendering_device_graph.cpp index 9e419ed6dc6..e0f6ff25f8a 100644 --- a/servers/rendering/rendering_device_graph.cpp +++ b/servers/rendering/rendering_device_graph.cpp @@ -772,7 +772,7 @@ void RenderingDeviceGraph::_run_compute_list_command(RDD::CommandBufferID p_comm } break; case ComputeListInstruction::TYPE_BIND_UNIFORM_SETS: { const ComputeListBindUniformSetsInstruction *bind_uniform_sets_instruction = reinterpret_cast(instruction); - driver->command_bind_compute_uniform_sets(p_command_buffer, VectorView(bind_uniform_sets_instruction->uniform_set_ids(), bind_uniform_sets_instruction->set_count), bind_uniform_sets_instruction->shader, bind_uniform_sets_instruction->first_set_index, bind_uniform_sets_instruction->set_count); + driver->command_bind_compute_uniform_sets(p_command_buffer, VectorView(bind_uniform_sets_instruction->uniform_set_ids(), bind_uniform_sets_instruction->set_count), bind_uniform_sets_instruction->shader, bind_uniform_sets_instruction->first_set_index, bind_uniform_sets_instruction->set_count, bind_uniform_sets_instruction->dynamic_offsets_mask); instruction_data_cursor += sizeof(ComputeListBindUniformSetsInstruction) + sizeof(RDD::UniformSetID) * bind_uniform_sets_instruction->set_count; } break; case ComputeListInstruction::TYPE_DISPATCH: { @@ -865,7 +865,7 @@ void RenderingDeviceGraph::_run_draw_list_command(RDD::CommandBufferID p_command } break; case DrawListInstruction::TYPE_BIND_UNIFORM_SETS: { const DrawListBindUniformSetsInstruction *bind_uniform_sets_instruction = reinterpret_cast(instruction); - driver->command_bind_render_uniform_sets(p_command_buffer, VectorView(bind_uniform_sets_instruction->uniform_set_ids(), bind_uniform_sets_instruction->set_count), bind_uniform_sets_instruction->shader, bind_uniform_sets_instruction->first_set_index, bind_uniform_sets_instruction->set_count); + driver->command_bind_render_uniform_sets(p_command_buffer, VectorView(bind_uniform_sets_instruction->uniform_set_ids(), bind_uniform_sets_instruction->set_count), bind_uniform_sets_instruction->shader, bind_uniform_sets_instruction->first_set_index, bind_uniform_sets_instruction->set_count, bind_uniform_sets_instruction->dynamic_offsets_mask); instruction_data_cursor += sizeof(DrawListBindUniformSetsInstruction) + sizeof(RDD::UniformSetID) * bind_uniform_sets_instruction->set_count; } break; case DrawListInstruction::TYPE_BIND_VERTEX_BUFFERS: { @@ -1430,7 +1430,7 @@ void RenderingDeviceGraph::_print_draw_list(const uint8_t *p_instruction_data, u const DrawListBindUniformSetsInstruction *bind_uniform_sets_instruction = reinterpret_cast(instruction); print_line("\tBIND UNIFORM SETS COUNT", bind_uniform_sets_instruction->set_count); for (uint32_t i = 0; i < bind_uniform_sets_instruction->set_count; i++) { - print_line("\tBIND UNIFORM SET ID", itos(bind_uniform_sets_instruction->uniform_set_ids()[i].id), "START INDEX", bind_uniform_sets_instruction->first_set_index); + print_line("\tBIND UNIFORM SET ID", itos(bind_uniform_sets_instruction->uniform_set_ids()[i].id), "START INDEX", bind_uniform_sets_instruction->first_set_index, "DYNAMIC_OFFSETS", bind_uniform_sets_instruction->dynamic_offsets_mask); } instruction_data_cursor += sizeof(DrawListBindUniformSetsInstruction) + sizeof(RDD::UniformSetID) * bind_uniform_sets_instruction->set_count; } break; @@ -1532,7 +1532,7 @@ 
void RenderingDeviceGraph::_print_compute_list(const uint8_t *p_instruction_data const ComputeListBindUniformSetsInstruction *bind_uniform_sets_instruction = reinterpret_cast(instruction); print_line("\tBIND UNIFORM SETS COUNT", bind_uniform_sets_instruction->set_count); for (uint32_t i = 0; i < bind_uniform_sets_instruction->set_count; i++) { - print_line("\tBIND UNIFORM SET ID", itos(bind_uniform_sets_instruction->uniform_set_ids()[i].id), "START INDEX", bind_uniform_sets_instruction->first_set_index); + print_line("\tBIND UNIFORM SET ID", itos(bind_uniform_sets_instruction->uniform_set_ids()[i].id), "START INDEX", bind_uniform_sets_instruction->first_set_index, "DYNAMIC_OFFSETS", bind_uniform_sets_instruction->dynamic_offsets_mask); } instruction_data_cursor += sizeof(ComputeListBindUniformSetsInstruction) + sizeof(RDD::UniformSetID) * bind_uniform_sets_instruction->set_count; } break; @@ -1746,6 +1746,7 @@ void RenderingDeviceGraph::add_compute_list_bind_uniform_sets(RDD::ShaderID p_sh instruction->shader = p_shader; instruction->first_set_index = p_first_set_index; instruction->set_count = p_set_count; + instruction->dynamic_offsets_mask = driver->uniform_sets_get_dynamic_offsets(p_uniform_sets, p_shader, p_first_set_index, p_set_count); RDD::UniformSetID *ids = instruction->uniform_set_ids(); for (uint32_t i = 0; i < p_set_count; i++) { @@ -1864,6 +1865,7 @@ void RenderingDeviceGraph::add_draw_list_bind_uniform_sets(RDD::ShaderID p_shade instruction->shader = p_shader; instruction->first_set_index = p_first_index; instruction->set_count = p_set_count; + instruction->dynamic_offsets_mask = driver->uniform_sets_get_dynamic_offsets(p_uniform_sets, p_shader, p_first_index, p_set_count); for (uint32_t i = 0; i < p_set_count; i++) { instruction->uniform_set_ids()[i] = p_uniform_sets[i]; diff --git a/servers/rendering/rendering_device_graph.h b/servers/rendering/rendering_device_graph.h index 1478f3dd3d4..7b986617f0e 100644 --- a/servers/rendering/rendering_device_graph.h +++ b/servers/rendering/rendering_device_graph.h @@ -489,6 +489,7 @@ private: RDD::ShaderID shader; uint32_t first_set_index = 0; uint32_t set_count = 0; + uint32_t dynamic_offsets_mask = 0u; _FORCE_INLINE_ RDD::UniformSetID *uniform_set_ids() { return reinterpret_cast(&this[1]); @@ -620,6 +621,7 @@ private: RDD::ShaderID shader; uint32_t first_set_index = 0; uint32_t set_count = 0; + uint32_t dynamic_offsets_mask = 0u; _FORCE_INLINE_ RDD::UniformSetID *uniform_set_ids() { return reinterpret_cast(&this[1]); diff --git a/servers/rendering/rendering_shader_container.cpp b/servers/rendering/rendering_shader_container.cpp index 089ddecfa2f..b108568d1a9 100644 --- a/servers/rendering/rendering_shader_container.cpp +++ b/servers/rendering/rendering_shader_container.cpp @@ -32,6 +32,7 @@ #include "core/io/compression.h" +#include "servers/rendering/renderer_rd/shader_rd.h" #include "thirdparty/spirv-reflect/spirv_reflect.h" static inline uint32_t aligned_to(uint32_t p_size, uint32_t p_alignment) { @@ -138,6 +139,8 @@ Error RenderingShaderContainer::reflect_spirv(const String &p_shader_name, Span< r_refl[i].shader_stage = p_spirv[i].shader_stage; r_refl[i]._spirv_data = p_spirv[i].spirv; + const Vector &dynamic_buffers = p_spirv[i].dynamic_buffers; + if (p_spirv[i].shader_stage == RDC::SHADER_STAGE_COMPUTE) { reflection.is_compute = true; ERR_FAIL_COND_V_MSG(spirv_size != 1, FAILED, @@ -217,11 +220,23 @@ Error RenderingShaderContainer::reflect_spirv(const String &p_shader_name, Span< may_be_writable = true; } break; case 
SPV_REFLECT_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { - uniform.type = RDC::UNIFORM_TYPE_UNIFORM_BUFFER; + const uint64_t key = ShaderRD::DynamicBuffer::encode(binding.set, binding.binding); + if (dynamic_buffers.has(key)) { + uniform.type = RDC::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC; + reflection.has_dynamic_buffers = true; + } else { + uniform.type = RDC::UNIFORM_TYPE_UNIFORM_BUFFER; + } need_block_size = true; } break; case SPV_REFLECT_DESCRIPTOR_TYPE_STORAGE_BUFFER: { - uniform.type = RDC::UNIFORM_TYPE_STORAGE_BUFFER; + const uint64_t key = ShaderRD::DynamicBuffer::encode(binding.set, binding.binding); + if (dynamic_buffers.has(key)) { + uniform.type = RDC::UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC; + reflection.has_dynamic_buffers = true; + } else { + uniform.type = RDC::UNIFORM_TYPE_STORAGE_BUFFER; + } need_block_size = true; may_be_writable = true; } break; @@ -486,6 +501,7 @@ void RenderingShaderContainer::set_from_shader_reflection(const RenderingDeviceC reflection_data.specialization_constants_count = p_reflection.specialization_constants.size(); reflection_data.is_compute = p_reflection.is_compute; reflection_data.has_multiview = p_reflection.has_multiview; + reflection_data.has_dynamic_buffers = p_reflection.has_dynamic_buffers; reflection_data.compute_local_size[0] = p_reflection.compute_local_size[0]; reflection_data.compute_local_size[1] = p_reflection.compute_local_size[1]; reflection_data.compute_local_size[2] = p_reflection.compute_local_size[2]; @@ -542,6 +558,7 @@ RenderingDeviceCommons::ShaderReflection RenderingShaderContainer::get_shader_re shader_refl.fragment_output_mask = reflection_data.fragment_output_mask; shader_refl.is_compute = reflection_data.is_compute; shader_refl.has_multiview = reflection_data.has_multiview; + shader_refl.has_dynamic_buffers = reflection_data.has_dynamic_buffers; shader_refl.compute_local_size[0] = reflection_data.compute_local_size[0]; shader_refl.compute_local_size[1] = reflection_data.compute_local_size[1]; shader_refl.compute_local_size[2] = reflection_data.compute_local_size[2]; diff --git a/servers/rendering/rendering_shader_container.h b/servers/rendering/rendering_shader_container.h index 94209f48307..830d285206f 100644 --- a/servers/rendering/rendering_shader_container.h +++ b/servers/rendering/rendering_shader_container.h @@ -57,6 +57,7 @@ protected: uint32_t specialization_constants_count = 0; uint32_t is_compute = 0; uint32_t has_multiview = 0; + uint32_t has_dynamic_buffers = 0; uint32_t compute_local_size[3] = {}; uint32_t set_count = 0; uint32_t push_constant_size = 0; diff --git a/thirdparty/d3d12ma/D3D12MemAlloc.cpp b/thirdparty/d3d12ma/D3D12MemAlloc.cpp index 80d910e6942..89b71ffe35e 100644 --- a/thirdparty/d3d12ma/D3D12MemAlloc.cpp +++ b/thirdparty/d3d12ma/D3D12MemAlloc.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -33,10 +33,11 @@ #include #endif -#if !defined(_MSC_VER) -#include - -#include +// Includes needed for MinGW - see #71. +#ifndef _MSC_VER + #include + // guiddef.h must be included first. + #include #endif //////////////////////////////////////////////////////////////////////////////// @@ -113,6 +114,14 @@ especially to test compatibility with D3D12_RESOURCE_HEAP_TIER_1 on modern GPUs. 
#define D3D12MA_DEFAULT_BLOCK_SIZE (64ull * 1024 * 1024) #endif +#ifndef D3D12MA_OPTIONS16_SUPPORTED + #if D3D12_SDK_VERSION >= 610 + #define D3D12MA_OPTIONS16_SUPPORTED 1 + #else + #define D3D12MA_OPTIONS16_SUPPORTED 0 + #endif +#endif + #ifndef D3D12MA_DEBUG_LOG #define D3D12MA_DEBUG_LOG(format, ...) /* @@ -134,15 +143,11 @@ especially to test compatibility with D3D12_RESOURCE_HEAP_TIER_1 on modern GPUs. #define D3D12MA_IID_PPV_ARGS(ppType) __uuidof(**(ppType)), reinterpret_cast(ppType) -#ifdef __ID3D12Device8_INTERFACE_DEFINED__ - #define D3D12MA_CREATE_NOT_ZEROED_AVAILABLE 1 -#endif - namespace D3D12MA { -static constexpr UINT HEAP_TYPE_COUNT = 4; -static constexpr UINT STANDARD_HEAP_TYPE_COUNT = 3; // Only DEFAULT, UPLOAD, READBACK. -static constexpr UINT DEFAULT_POOL_MAX_COUNT = 9; +static constexpr UINT HEAP_TYPE_COUNT = 5; +static constexpr UINT STANDARD_HEAP_TYPE_COUNT = 4; // Only DEFAULT, UPLOAD, READBACK, GPU_UPLOAD. +static constexpr UINT DEFAULT_POOL_MAX_COUNT = STANDARD_HEAP_TYPE_COUNT * 3; static const UINT NEW_BLOCK_SIZE_SHIFT_MAX = 3; // Minimum size of a free suballocation to register it in the free suballocation collection. static const UINT64 MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER = 16; @@ -153,6 +158,14 @@ static const WCHAR* const HeapTypeNames[] = L"UPLOAD", L"READBACK", L"CUSTOM", + L"GPU_UPLOAD", +}; +static const WCHAR* const StandardHeapTypeNames[] = +{ + L"DEFAULT", + L"UPLOAD", + L"READBACK", + L"GPU_UPLOAD", }; static const D3D12_HEAP_FLAGS RESOURCE_CLASS_HEAP_FLAGS = @@ -160,6 +173,8 @@ static const D3D12_HEAP_FLAGS RESOURCE_CLASS_HEAP_FLAGS = static const D3D12_RESIDENCY_PRIORITY D3D12_RESIDENCY_PRIORITY_NONE = D3D12_RESIDENCY_PRIORITY(0); +static const D3D12_HEAP_TYPE D3D12_HEAP_TYPE_GPU_UPLOAD_COPY = (D3D12_HEAP_TYPE)5; + #ifndef _D3D12MA_ENUM_DECLARATIONS // Local copy of this enum, as it is provided only by , so it may not be available. @@ -458,23 +473,28 @@ static IterT BinaryFindSorted(const IterT& beg, const IterT& end, const KeyT& va return end; } -static UINT HeapTypeToIndex(D3D12_HEAP_TYPE type) +static UINT StandardHeapTypeToIndex(D3D12_HEAP_TYPE type) { switch (type) { case D3D12_HEAP_TYPE_DEFAULT: return 0; case D3D12_HEAP_TYPE_UPLOAD: return 1; case D3D12_HEAP_TYPE_READBACK: return 2; - case D3D12_HEAP_TYPE_CUSTOM: return 3; + case D3D12_HEAP_TYPE_GPU_UPLOAD_COPY: return 3; default: D3D12MA_ASSERT(0); return UINT_MAX; } } -static D3D12_HEAP_TYPE IndexToHeapType(UINT heapTypeIndex) +static D3D12_HEAP_TYPE IndexToStandardHeapType(UINT heapTypeIndex) { - D3D12MA_ASSERT(heapTypeIndex < 4); - // D3D12_HEAP_TYPE_DEFAULT starts at 1. 
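GPU_UPLOAD heaps are CPU-visible VRAM (resizable BAR), available only with a recent Agility SDK and a supporting driver, which is why the code above gates on OPTIONS16. A sketch of the runtime capability check; device is assumed to be a valid ID3D12Device*:

#if D3D12MA_OPTIONS16_SUPPORTED
// Query whether GPU upload heaps exist before routing dynamic persistent
// buffers to D3D12_HEAP_TYPE_GPU_UPLOAD; fall back to UPLOAD otherwise.
D3D12_FEATURE_DATA_D3D12_OPTIONS16 options16 = {};
BOOL gpu_upload_supported = FALSE;
if (SUCCEEDED(device->CheckFeatureSupport(
			D3D12_FEATURE_D3D12_OPTIONS16, &options16, sizeof(options16)))) {
	gpu_upload_supported = options16.GPUUploadHeapSupported;
}
#endif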
- return (D3D12_HEAP_TYPE)(heapTypeIndex + 1); + switch(heapTypeIndex) + { + case 0: return D3D12_HEAP_TYPE_DEFAULT; + case 1: return D3D12_HEAP_TYPE_UPLOAD; + case 2: return D3D12_HEAP_TYPE_READBACK; + case 3: return D3D12_HEAP_TYPE_GPU_UPLOAD_COPY; + default: D3D12MA_ASSERT(0); return D3D12_HEAP_TYPE_CUSTOM; + } } static UINT64 HeapFlagsToAlignment(D3D12_HEAP_FLAGS flags, bool denyMsaaTextures) @@ -522,7 +542,8 @@ static bool IsHeapTypeStandard(D3D12_HEAP_TYPE type) { return type == D3D12_HEAP_TYPE_DEFAULT || type == D3D12_HEAP_TYPE_UPLOAD || - type == D3D12_HEAP_TYPE_READBACK; + type == D3D12_HEAP_TYPE_READBACK || + type == D3D12_HEAP_TYPE_GPU_UPLOAD_COPY; } static D3D12_HEAP_PROPERTIES StandardHeapTypeToHeapProperties(D3D12_HEAP_TYPE type) @@ -2780,8 +2801,8 @@ class AllocationObjectAllocator { D3D12MA_CLASS_NO_COPY(AllocationObjectAllocator); public: - AllocationObjectAllocator(const ALLOCATION_CALLBACKS& allocationCallbacks) - : m_Allocator(allocationCallbacks, 1024) {} + AllocationObjectAllocator(const ALLOCATION_CALLBACKS& allocationCallbacks, bool useMutex) + : m_UseMutex(useMutex), m_Allocator(allocationCallbacks, 1024) {} template Allocation* Allocate(Types... args); @@ -2789,6 +2810,7 @@ public: private: D3D12MA_MUTEX m_Mutex; + bool m_UseMutex; PoolAllocator m_Allocator; }; @@ -2796,13 +2818,13 @@ private: template Allocation* AllocationObjectAllocator::Allocate(Types... args) { - MutexLock mutexLock(m_Mutex); + MutexLock mutexLock(m_Mutex, m_UseMutex); return m_Allocator.Alloc(std::forward(args)...); } void AllocationObjectAllocator::Free(Allocation* alloc) { - MutexLock mutexLock(m_Mutex); + MutexLock mutexLock(m_Mutex, m_UseMutex); m_Allocator.Free(alloc); } #endif // _D3D12MA_ALLOCATION_OBJECT_ALLOCATOR_FUNCTIONS @@ -2864,75 +2886,9 @@ struct AllocationRequest UINT64 sumFreeSize; // Sum size of free items that overlap with proposed allocation. UINT64 sumItemSize; // Sum size of items to make lost that overlap with proposed allocation. SuballocationList::iterator item; - BOOL zeroInitialized = FALSE; // TODO Implement proper handling in TLSF and Linear, using ZeroInitializedRange class. }; #endif // _D3D12MA_ALLOCATION_REQUEST -#ifndef _D3D12MA_ZERO_INITIALIZED_RANGE -/* -Keeps track of the range of bytes that are surely initialized with zeros. -Everything outside of it is considered uninitialized memory that may contain -garbage data. - -The range is left-inclusive. -*/ -class ZeroInitializedRange -{ -public: - void Reset(UINT64 size); - BOOL IsRangeZeroInitialized(UINT64 beg, UINT64 end) const; - void MarkRangeAsUsed(UINT64 usedBeg, UINT64 usedEnd); - -private: - UINT64 m_ZeroBeg = 0, m_ZeroEnd = 0; -}; - -#ifndef _D3D12MA_ZERO_INITIALIZED_RANGE_FUNCTIONS -void ZeroInitializedRange::Reset(UINT64 size) -{ - D3D12MA_ASSERT(size > 0); - m_ZeroBeg = 0; - m_ZeroEnd = size; -} - -BOOL ZeroInitializedRange::IsRangeZeroInitialized(UINT64 beg, UINT64 end) const -{ - D3D12MA_ASSERT(beg < end); - return m_ZeroBeg <= beg && end <= m_ZeroEnd; -} - -void ZeroInitializedRange::MarkRangeAsUsed(UINT64 usedBeg, UINT64 usedEnd) -{ - D3D12MA_ASSERT(usedBeg < usedEnd); - // No new bytes marked. - if (usedEnd <= m_ZeroBeg || m_ZeroEnd <= usedBeg) - { - return; - } - // All bytes marked. - if (usedBeg <= m_ZeroBeg && m_ZeroEnd <= usedEnd) - { - m_ZeroBeg = m_ZeroEnd = 0; - } - // Some bytes marked. - else - { - const UINT64 remainingZeroBefore = usedBeg > m_ZeroBeg ? usedBeg - m_ZeroBeg : 0; - const UINT64 remainingZeroAfter = usedEnd < m_ZeroEnd ? 
m_ZeroEnd - usedEnd : 0; - D3D12MA_ASSERT(remainingZeroBefore > 0 || remainingZeroAfter > 0); - if (remainingZeroBefore > remainingZeroAfter) - { - m_ZeroEnd = usedBeg; - } - else - { - m_ZeroBeg = usedEnd; - } - } -} -#endif // _D3D12MA_ZERO_INITIALIZED_RANGE_FUNCTIONS -#endif // _D3D12MA_ZERO_INITIALIZED_RANGE - #ifndef _D3D12MA_BLOCK_METADATA /* Data structure used for bookkeeping of allocations and unused ranges of memory @@ -3111,610 +3067,6 @@ void BlockMetadata::PrintDetailedMap_End(JsonWriter& json) const #endif // _D3D12MA_BLOCK_METADATA_FUNCTIONS #endif // _D3D12MA_BLOCK_METADATA -#if 0 -#ifndef _D3D12MA_BLOCK_METADATA_GENERIC -class BlockMetadata_Generic : public BlockMetadata -{ -public: - BlockMetadata_Generic(const ALLOCATION_CALLBACKS* allocationCallbacks, bool isVirtual); - virtual ~BlockMetadata_Generic() = default; - - size_t GetAllocationCount() const override { return m_Suballocations.size() - m_FreeCount; } - UINT64 GetSumFreeSize() const override { return m_SumFreeSize; } - UINT64 GetAllocationOffset(AllocHandle allocHandle) const override { return (UINT64)allocHandle - 1; } - - void Init(UINT64 size) override; - bool Validate() const override; - bool IsEmpty() const override; - void GetAllocationInfo(AllocHandle allocHandle, VIRTUAL_ALLOCATION_INFO& outInfo) const override; - - bool CreateAllocationRequest( - UINT64 allocSize, - UINT64 allocAlignment, - bool upperAddress, - AllocationRequest* pAllocationRequest) override; - - void Alloc( - const AllocationRequest& request, - UINT64 allocSize, - void* privateData) override; - - void Free(AllocHandle allocHandle) override; - void Clear() override; - - void SetAllocationPrivateData(AllocHandle allocHandle, void* privateData) override; - - void AddStatistics(Statistics& inoutStats) const override; - void AddDetailedStatistics(DetailedStatistics& inoutStats) const override; - void WriteAllocationInfoToJson(JsonWriter& json) const override; - -private: - UINT m_FreeCount; - UINT64 m_SumFreeSize; - SuballocationList m_Suballocations; - // Suballocations that are free and have size greater than certain threshold. - // Sorted by size, ascending. - Vector m_FreeSuballocationsBySize; - ZeroInitializedRange m_ZeroInitializedRange; - - SuballocationList::const_iterator FindAtOffset(UINT64 offset) const; - bool ValidateFreeSuballocationList() const; - - // Checks if requested suballocation with given parameters can be placed in given pFreeSuballocItem. - // If yes, fills pOffset and returns true. If no, returns false. - bool CheckAllocation( - UINT64 allocSize, - UINT64 allocAlignment, - SuballocationList::const_iterator suballocItem, - AllocHandle* pAllocHandle, - UINT64* pSumFreeSize, - UINT64* pSumItemSize, - BOOL *pZeroInitialized) const; - // Given free suballocation, it merges it with following one, which must also be free. - void MergeFreeWithNext(SuballocationList::iterator item); - // Releases given suballocation, making it free. - // Merges it with adjacent free suballocations if applicable. - // Returns iterator to new free suballocation at this place. - SuballocationList::iterator FreeSuballocation(SuballocationList::iterator suballocItem); - // Given free suballocation, it inserts it into sorted list of - // m_FreeSuballocationsBySize if it's suitable. - void RegisterFreeSuballocation(SuballocationList::iterator item); - // Given free suballocation, it removes it from sorted list of - // m_FreeSuballocationsBySize if it's suitable. 
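// [Editor's sketch, not part of the patch.] The generic metadata being removed here keeps
// its free ranges in m_FreeSuballocationsBySize sorted by size, so BinaryFindFirstNotLess is
// equivalent to a std::lower_bound by size, followed by a forward scan for a candidate that
// also satisfies alignment. A simplified standalone version:
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>
static std::ptrdiff_t FindFirstFreeRangeNotSmaller(const std::vector<uint64_t>& sortedFreeSizes,
        uint64_t neededSize)
{
    auto it = std::lower_bound(sortedFreeSizes.begin(), sortedFreeSizes.end(), neededSize);
    return it == sortedFreeSizes.end() ? -1 : (it - sortedFreeSizes.begin());
}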
- void UnregisterFreeSuballocation(SuballocationList::iterator item); - - D3D12MA_CLASS_NO_COPY(BlockMetadata_Generic) -}; - -#ifndef _D3D12MA_BLOCK_METADATA_GENERIC_FUNCTIONS -BlockMetadata_Generic::BlockMetadata_Generic(const ALLOCATION_CALLBACKS* allocationCallbacks, bool isVirtual) - : BlockMetadata(allocationCallbacks, isVirtual), - m_FreeCount(0), - m_SumFreeSize(0), - m_Suballocations(*allocationCallbacks), - m_FreeSuballocationsBySize(*allocationCallbacks) -{ - D3D12MA_ASSERT(allocationCallbacks); -} - -void BlockMetadata_Generic::Init(UINT64 size) -{ - BlockMetadata::Init(size); - m_ZeroInitializedRange.Reset(size); - - m_FreeCount = 1; - m_SumFreeSize = size; - - Suballocation suballoc = {}; - suballoc.offset = 0; - suballoc.size = size; - suballoc.type = SUBALLOCATION_TYPE_FREE; - suballoc.privateData = NULL; - - D3D12MA_ASSERT(size > MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER); - m_Suballocations.push_back(suballoc); - SuballocationList::iterator suballocItem = m_Suballocations.end(); - --suballocItem; - m_FreeSuballocationsBySize.push_back(suballocItem); -} - -bool BlockMetadata_Generic::Validate() const -{ - D3D12MA_VALIDATE(!m_Suballocations.empty()); - - // Expected offset of new suballocation as calculated from previous ones. - UINT64 calculatedOffset = 0; - // Expected number of free suballocations as calculated from traversing their list. - UINT calculatedFreeCount = 0; - // Expected sum size of free suballocations as calculated from traversing their list. - UINT64 calculatedSumFreeSize = 0; - // Expected number of free suballocations that should be registered in - // m_FreeSuballocationsBySize calculated from traversing their list. - size_t freeSuballocationsToRegister = 0; - // True if previous visited suballocation was free. - bool prevFree = false; - - for (const auto& subAlloc : m_Suballocations) - { - // Actual offset of this suballocation doesn't match expected one. - D3D12MA_VALIDATE(subAlloc.offset == calculatedOffset); - - const bool currFree = (subAlloc.type == SUBALLOCATION_TYPE_FREE); - // Two adjacent free suballocations are invalid. They should be merged. - D3D12MA_VALIDATE(!prevFree || !currFree); - - const Allocation* const alloc = (Allocation*)subAlloc.privateData; - if (!IsVirtual()) - { - D3D12MA_VALIDATE(currFree == (alloc == NULL)); - } - - if (currFree) - { - calculatedSumFreeSize += subAlloc.size; - ++calculatedFreeCount; - if (subAlloc.size >= MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - ++freeSuballocationsToRegister; - } - - // Margin required between allocations - every free space must be at least that large. - D3D12MA_VALIDATE(subAlloc.size >= GetDebugMargin()); - } - else - { - if (!IsVirtual()) - { - D3D12MA_VALIDATE(alloc->GetOffset() == subAlloc.offset); - D3D12MA_VALIDATE(alloc->GetSize() == subAlloc.size); - } - - // Margin required between allocations - previous allocation must be free. - D3D12MA_VALIDATE(GetDebugMargin() == 0 || prevFree); - } - - calculatedOffset += subAlloc.size; - prevFree = currFree; - } - - // Number of free suballocations registered in m_FreeSuballocationsBySize doesn't - // match expected one. - D3D12MA_VALIDATE(m_FreeSuballocationsBySize.size() == freeSuballocationsToRegister); - - UINT64 lastSize = 0; - for (size_t i = 0; i < m_FreeSuballocationsBySize.size(); ++i) - { - SuballocationList::iterator suballocItem = m_FreeSuballocationsBySize[i]; - - // Only free suballocations can be registered in m_FreeSuballocationsBySize. 
- D3D12MA_VALIDATE(suballocItem->type == SUBALLOCATION_TYPE_FREE); - // They must be sorted by size ascending. - D3D12MA_VALIDATE(suballocItem->size >= lastSize); - - lastSize = suballocItem->size; - } - - // Check if totals match calculacted values. - D3D12MA_VALIDATE(ValidateFreeSuballocationList()); - D3D12MA_VALIDATE(calculatedOffset == GetSize()); - D3D12MA_VALIDATE(calculatedSumFreeSize == m_SumFreeSize); - D3D12MA_VALIDATE(calculatedFreeCount == m_FreeCount); - - return true; -} - -bool BlockMetadata_Generic::IsEmpty() const -{ - return (m_Suballocations.size() == 1) && (m_FreeCount == 1); -} - -void BlockMetadata_Generic::GetAllocationInfo(AllocHandle allocHandle, VIRTUAL_ALLOCATION_INFO& outInfo) const -{ - Suballocation& suballoc = *FindAtOffset((UINT64)allocHandle - 1).dropConst(); - outInfo.Offset = suballoc.offset; - outInfo.Size = suballoc.size; - outInfo.pPrivateData = suballoc.privateData; -} - -bool BlockMetadata_Generic::CreateAllocationRequest( - UINT64 allocSize, - UINT64 allocAlignment, - bool upperAddress, - AllocationRequest* pAllocationRequest) -{ - D3D12MA_ASSERT(allocSize > 0); - D3D12MA_ASSERT(!upperAddress && "ALLOCATION_FLAG_UPPER_ADDRESS can be used only with linear algorithm."); - D3D12MA_ASSERT(pAllocationRequest != NULL); - D3D12MA_HEAVY_ASSERT(Validate()); - - // There is not enough total free space in this block to fullfill the request: Early return. - if (m_SumFreeSize < allocSize + GetDebugMargin()) - { - return false; - } - - // New algorithm, efficiently searching freeSuballocationsBySize. - const size_t freeSuballocCount = m_FreeSuballocationsBySize.size(); - if (freeSuballocCount > 0) - { - // Find first free suballocation with size not less than allocSize + GetDebugMargin(). - SuballocationList::iterator* const it = BinaryFindFirstNotLess( - m_FreeSuballocationsBySize.data(), - m_FreeSuballocationsBySize.data() + freeSuballocCount, - allocSize + GetDebugMargin(), - SuballocationItemSizeLess()); - size_t index = it - m_FreeSuballocationsBySize.data(); - for (; index < freeSuballocCount; ++index) - { - if (CheckAllocation( - allocSize, - allocAlignment, - m_FreeSuballocationsBySize[index], - &pAllocationRequest->allocHandle, - &pAllocationRequest->sumFreeSize, - &pAllocationRequest->sumItemSize, - &pAllocationRequest->zeroInitialized)) - { - pAllocationRequest->item = m_FreeSuballocationsBySize[index]; - return true; - } - } - } - - return false; -} - -void BlockMetadata_Generic::Alloc( - const AllocationRequest& request, - UINT64 allocSize, - void* privateData) -{ - D3D12MA_ASSERT(request.item != m_Suballocations.end()); - Suballocation& suballoc = *request.item; - // Given suballocation is a free block. - D3D12MA_ASSERT(suballoc.type == SUBALLOCATION_TYPE_FREE); - // Given offset is inside this suballocation. - UINT64 offset = (UINT64)request.allocHandle - 1; - D3D12MA_ASSERT(offset >= suballoc.offset); - const UINT64 paddingBegin = offset - suballoc.offset; - D3D12MA_ASSERT(suballoc.size >= paddingBegin + allocSize); - const UINT64 paddingEnd = suballoc.size - paddingBegin - allocSize; - - // Unregister this free suballocation from m_FreeSuballocationsBySize and update - // it to become used. - UnregisterFreeSuballocation(request.item); - - suballoc.offset = offset; - suballoc.size = allocSize; - suballoc.type = SUBALLOCATION_TYPE_ALLOCATION; - suballoc.privateData = privateData; - - // If there are any free bytes remaining at the end, insert new free suballocation after current one. 
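// [Editor's sketch, not part of the patch.] The Alloc() logic around this point carves the
// chosen free range into up to three pieces, [padding][allocation][padding], and re-registers
// the paddings as free ranges. Standalone restatement:
#include <cstdint>
struct IllustrativeRange { uint64_t offset; uint64_t size; bool isFree; };
static int SplitFreeRange(IllustrativeRange freeRange, uint64_t allocOffset, uint64_t allocSize,
        IllustrativeRange out[3])
{
    int count = 0;
    const uint64_t padBegin = allocOffset - freeRange.offset;
    const uint64_t padEnd = freeRange.size - padBegin - allocSize;
    if (padBegin > 0)
        out[count++] = { freeRange.offset, padBegin, true }; // leading free remainder
    out[count++] = { allocOffset, allocSize, false };        // the allocation itself
    if (padEnd > 0)
        out[count++] = { allocOffset + allocSize, padEnd, true }; // trailing free remainder
    return count;
}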
- if (paddingEnd) - { - Suballocation paddingSuballoc = {}; - paddingSuballoc.offset = offset + allocSize; - paddingSuballoc.size = paddingEnd; - paddingSuballoc.type = SUBALLOCATION_TYPE_FREE; - SuballocationList::iterator next = request.item; - ++next; - const SuballocationList::iterator paddingEndItem = - m_Suballocations.insert(next, paddingSuballoc); - RegisterFreeSuballocation(paddingEndItem); - } - - // If there are any free bytes remaining at the beginning, insert new free suballocation before current one. - if (paddingBegin) - { - Suballocation paddingSuballoc = {}; - paddingSuballoc.offset = offset - paddingBegin; - paddingSuballoc.size = paddingBegin; - paddingSuballoc.type = SUBALLOCATION_TYPE_FREE; - const SuballocationList::iterator paddingBeginItem = - m_Suballocations.insert(request.item, paddingSuballoc); - RegisterFreeSuballocation(paddingBeginItem); - } - - // Update totals. - m_FreeCount = m_FreeCount - 1; - if (paddingBegin > 0) - { - ++m_FreeCount; - } - if (paddingEnd > 0) - { - ++m_FreeCount; - } - m_SumFreeSize -= allocSize; - - m_ZeroInitializedRange.MarkRangeAsUsed(offset, offset + allocSize); -} - -void BlockMetadata_Generic::Free(AllocHandle allocHandle) -{ - FreeSuballocation(FindAtOffset((UINT64)allocHandle - 1).dropConst()); -} - -void BlockMetadata_Generic::Clear() -{ - m_FreeCount = 1; - m_SumFreeSize = GetSize(); - - m_Suballocations.clear(); - Suballocation suballoc = {}; - suballoc.offset = 0; - suballoc.size = GetSize(); - suballoc.type = SUBALLOCATION_TYPE_FREE; - m_Suballocations.push_back(suballoc); - - m_FreeSuballocationsBySize.clear(); - m_FreeSuballocationsBySize.push_back(m_Suballocations.begin()); -} - -SuballocationList::const_iterator BlockMetadata_Generic::FindAtOffset(UINT64 offset) const -{ - const UINT64 last = m_Suballocations.crbegin()->offset; - if (last == offset) - return m_Suballocations.crbegin(); - const UINT64 first = m_Suballocations.cbegin()->offset; - if (first == offset) - return m_Suballocations.cbegin(); - - const size_t suballocCount = m_Suballocations.size(); - const UINT64 step = (last - first + m_Suballocations.cbegin()->size) / suballocCount; - auto findSuballocation = [&](auto begin, auto end) -> SuballocationList::const_iterator - { - for (auto suballocItem = begin; - suballocItem != end; - ++suballocItem) - { - const Suballocation& suballoc = *suballocItem; - if (suballoc.offset == offset) - return suballocItem; - } - D3D12MA_ASSERT(false && "Not found!"); - return m_Suballocations.end(); - }; - // If requested offset is closer to the end of range, search from the end - if ((offset - first) > suballocCount * step / 2) - { - return findSuballocation(m_Suballocations.crbegin(), m_Suballocations.crend()); - } - return findSuballocation(m_Suballocations.cbegin(), m_Suballocations.cend()); -} - -bool BlockMetadata_Generic::ValidateFreeSuballocationList() const -{ - UINT64 lastSize = 0; - for (size_t i = 0, count = m_FreeSuballocationsBySize.size(); i < count; ++i) - { - const SuballocationList::iterator it = m_FreeSuballocationsBySize[i]; - - D3D12MA_VALIDATE(it->type == SUBALLOCATION_TYPE_FREE); - D3D12MA_VALIDATE(it->size >= MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER); - D3D12MA_VALIDATE(it->size >= lastSize); - lastSize = it->size; - } - return true; -} - -bool BlockMetadata_Generic::CheckAllocation( - UINT64 allocSize, - UINT64 allocAlignment, - SuballocationList::const_iterator suballocItem, - AllocHandle* pAllocHandle, - UINT64* pSumFreeSize, - UINT64* pSumItemSize, - BOOL* pZeroInitialized) const -{ - 
D3D12MA_ASSERT(allocSize > 0); - D3D12MA_ASSERT(suballocItem != m_Suballocations.cend()); - D3D12MA_ASSERT(pAllocHandle != NULL && pZeroInitialized != NULL); - - *pSumFreeSize = 0; - *pSumItemSize = 0; - *pZeroInitialized = FALSE; - - const Suballocation& suballoc = *suballocItem; - D3D12MA_ASSERT(suballoc.type == SUBALLOCATION_TYPE_FREE); - - *pSumFreeSize = suballoc.size; - - // Size of this suballocation is too small for this request: Early return. - if (suballoc.size < allocSize) - { - return false; - } - - // Start from offset equal to beginning of this suballocation and debug margin of previous allocation if present. - UINT64 offset = suballoc.offset + (suballocItem == m_Suballocations.cbegin() ? 0 : GetDebugMargin()); - - // Apply alignment. - offset = AlignUp(offset, allocAlignment); - - // Calculate padding at the beginning based on current offset. - const UINT64 paddingBegin = offset - suballoc.offset; - - // Fail if requested size plus margin after is bigger than size of this suballocation. - if (paddingBegin + allocSize + GetDebugMargin() > suballoc.size) - { - return false; - } - - // All tests passed: Success. Offset is already filled. - *pZeroInitialized = m_ZeroInitializedRange.IsRangeZeroInitialized(offset, offset + allocSize); - *pAllocHandle = (AllocHandle)(offset + 1); - return true; -} - -void BlockMetadata_Generic::MergeFreeWithNext(SuballocationList::iterator item) -{ - D3D12MA_ASSERT(item != m_Suballocations.end()); - D3D12MA_ASSERT(item->type == SUBALLOCATION_TYPE_FREE); - - SuballocationList::iterator nextItem = item; - ++nextItem; - D3D12MA_ASSERT(nextItem != m_Suballocations.end()); - D3D12MA_ASSERT(nextItem->type == SUBALLOCATION_TYPE_FREE); - - item->size += nextItem->size; - --m_FreeCount; - m_Suballocations.erase(nextItem); -} - -SuballocationList::iterator BlockMetadata_Generic::FreeSuballocation(SuballocationList::iterator suballocItem) -{ - // Change this suballocation to be marked as free. - Suballocation& suballoc = *suballocItem; - suballoc.type = SUBALLOCATION_TYPE_FREE; - suballoc.privateData = NULL; - - // Update totals. - ++m_FreeCount; - m_SumFreeSize += suballoc.size; - - // Merge with previous and/or next suballocation if it's also free. - bool mergeWithNext = false; - bool mergeWithPrev = false; - - SuballocationList::iterator nextItem = suballocItem; - ++nextItem; - if ((nextItem != m_Suballocations.end()) && (nextItem->type == SUBALLOCATION_TYPE_FREE)) - { - mergeWithNext = true; - } - - SuballocationList::iterator prevItem = suballocItem; - if (suballocItem != m_Suballocations.begin()) - { - --prevItem; - if (prevItem->type == SUBALLOCATION_TYPE_FREE) - { - mergeWithPrev = true; - } - } - - if (mergeWithNext) - { - UnregisterFreeSuballocation(nextItem); - MergeFreeWithNext(suballocItem); - } - - if (mergeWithPrev) - { - UnregisterFreeSuballocation(prevItem); - MergeFreeWithNext(prevItem); - RegisterFreeSuballocation(prevItem); - return prevItem; - } - else - { - RegisterFreeSuballocation(suballocItem); - return suballocItem; - } -} - -void BlockMetadata_Generic::RegisterFreeSuballocation(SuballocationList::iterator item) -{ - D3D12MA_ASSERT(item->type == SUBALLOCATION_TYPE_FREE); - D3D12MA_ASSERT(item->size > 0); - - // You may want to enable this validation at the beginning or at the end of - // this function, depending on what do you want to check. 
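// [Editor's sketch, not part of the patch.] The fit test performed by CheckAllocation above,
// restated: start past the previous allocation's debug margin, align the offset up, then
// require the request plus the trailing margin to still fit in the free range. Assumes a
// power-of-two alignment, as the library's AlignUp() does.
#include <cstdint>
static bool FitsWithAlignment(uint64_t rangeOffset, uint64_t rangeSize, bool isFirstRange,
        uint64_t allocSize, uint64_t alignment, uint64_t debugMargin)
{
    uint64_t offset = rangeOffset + (isFirstRange ? 0 : debugMargin);
    offset = (offset + alignment - 1) & ~(alignment - 1); // AlignUp
    const uint64_t padBegin = offset - rangeOffset;
    return padBegin + allocSize + debugMargin <= rangeSize;
}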
- D3D12MA_HEAVY_ASSERT(ValidateFreeSuballocationList()); - - if (item->size >= MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - if (m_FreeSuballocationsBySize.empty()) - { - m_FreeSuballocationsBySize.push_back(item); - } - else - { - m_FreeSuballocationsBySize.InsertSorted(item, SuballocationItemSizeLess()); - } - } - - //D3D12MA_HEAVY_ASSERT(ValidateFreeSuballocationList()); -} - -void BlockMetadata_Generic::UnregisterFreeSuballocation(SuballocationList::iterator item) -{ - D3D12MA_ASSERT(item->type == SUBALLOCATION_TYPE_FREE); - D3D12MA_ASSERT(item->size > 0); - - // You may want to enable this validation at the beginning or at the end of - // this function, depending on what do you want to check. - D3D12MA_HEAVY_ASSERT(ValidateFreeSuballocationList()); - - if (item->size >= MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - SuballocationList::iterator* const it = BinaryFindFirstNotLess( - m_FreeSuballocationsBySize.data(), - m_FreeSuballocationsBySize.data() + m_FreeSuballocationsBySize.size(), - item, - SuballocationItemSizeLess()); - for (size_t index = it - m_FreeSuballocationsBySize.data(); - index < m_FreeSuballocationsBySize.size(); - ++index) - { - if (m_FreeSuballocationsBySize[index] == item) - { - m_FreeSuballocationsBySize.remove(index); - return; - } - D3D12MA_ASSERT((m_FreeSuballocationsBySize[index]->size == item->size) && "Not found."); - } - D3D12MA_ASSERT(0 && "Not found."); - } - - //D3D12MA_HEAVY_ASSERT(ValidateFreeSuballocationList()); -} - -void BlockMetadata_Generic::SetAllocationPrivateData(AllocHandle allocHandle, void* privateData) -{ - Suballocation& suballoc = *FindAtOffset((UINT64)allocHandle - 1).dropConst(); - suballoc.privateData = privateData; -} - -void BlockMetadata_Generic::AddStatistics(Statistics& inoutStats) const -{ - inoutStats.BlockCount++; - inoutStats.AllocationCount += (UINT)m_Suballocations.size() - m_FreeCount; - inoutStats.BlockBytes += GetSize(); - inoutStats.AllocationBytes += GetSize() - m_SumFreeSize; -} - -void BlockMetadata_Generic::AddDetailedStatistics(DetailedStatistics& inoutStats) const -{ - inoutStats.Stats.BlockCount++; - inoutStats.Stats.BlockBytes += GetSize(); - - for (const auto& suballoc : m_Suballocations) - { - if (suballoc.type == SUBALLOCATION_TYPE_FREE) - AddDetailedStatisticsUnusedRange(inoutStats, suballoc.size); - else - AddDetailedStatisticsAllocation(inoutStats, suballoc.size); - } -} - -void BlockMetadata_Generic::WriteAllocationInfoToJson(JsonWriter& json) const -{ - PrintDetailedMap_Begin(json, GetSumFreeSize(), GetAllocationCount(), m_FreeCount); - for (const auto& suballoc : m_Suballocations) - { - if (suballoc.type == SUBALLOCATION_TYPE_FREE) - PrintDetailedMap_UnusedRange(json, suballoc.offset, suballoc.size); - else - PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.size, suballoc.privateData); - } - PrintDetailedMap_End(json); -} -#endif // _D3D12MA_BLOCK_METADATA_GENERIC_FUNCTIONS -#endif // _D3D12MA_BLOCK_METADATA_GENERIC -#endif // #if 0 - #ifndef _D3D12MA_BLOCK_METADATA_LINEAR class BlockMetadata_Linear : public BlockMetadata { @@ -4024,6 +3376,10 @@ bool BlockMetadata_Linear::CreateAllocationRequest( D3D12MA_ASSERT(allocSize > 0 && "Cannot allocate empty block!"); D3D12MA_ASSERT(pAllocationRequest != NULL); D3D12MA_HEAVY_ASSERT(Validate()); + + if(allocSize > GetSize()) + return false; + pAllocationRequest->size = allocSize; return upperAddress ? 
CreateAllocationRequest_UpperAddress( @@ -5685,7 +5041,8 @@ void BlockMetadata_TLSF::WriteAllocationInfoToJson(JsonWriter& json) const } D3D12MA_ASSERT(i == 0); - PrintDetailedMap_Begin(json, GetSumFreeSize(), GetAllocationCount(), m_BlocksFreeCount + static_cast(m_NullBlock->size)); + PrintDetailedMap_Begin(json, GetSumFreeSize(), GetAllocationCount(), m_BlocksFreeCount + + (m_NullBlock->size > 0 ? 1 : 0)); for (; i < blockCount; ++i) { Block* block = blockList[i]; @@ -6501,6 +5858,7 @@ public: AllocatorPimpl* GetAllocator() const { return m_Allocator; } const POOL_DESC& GetDesc() const { return m_Desc; } + bool AlwaysCommitted() const { return (m_Desc.Flags & POOL_FLAG_ALWAYS_COMMITTED) != 0; } bool SupportsCommittedAllocations() const { return m_Desc.BlockSize == 0; } LPCWSTR GetName() const { return m_Name; } @@ -6564,6 +5922,7 @@ public: BOOL IsUMA() const { return m_D3D12Architecture.UMA; } BOOL IsCacheCoherentUMA() const { return m_D3D12Architecture.CacheCoherentUMA; } bool SupportsResourceHeapTier2() const { return m_D3D12Options.ResourceHeapTier >= D3D12_RESOURCE_HEAP_TIER_2; } + bool IsGPUUploadHeapSupported() const { return m_GPUUploadHeapSupported != FALSE; } bool UseMutex() const { return m_UseMutex; } AllocationObjectAllocator& GetAllocationObjectAllocator() { return m_AllocationObjectAllocator; } UINT GetCurrentFrameIndex() const { return m_CurrentFrameIndex.load(); } @@ -6572,6 +5931,7 @@ public: 0: D3D12_HEAP_TYPE_DEFAULT 1: D3D12_HEAP_TYPE_UPLOAD 2: D3D12_HEAP_TYPE_READBACK + 3: D3D12_HEAP_TYPE_GPU_UPLOAD else: 0: D3D12_HEAP_TYPE_DEFAULT + buffer 1: D3D12_HEAP_TYPE_DEFAULT + texture @@ -6582,8 +5942,11 @@ public: 6: D3D12_HEAP_TYPE_READBACK + buffer 7: D3D12_HEAP_TYPE_READBACK + texture 8: D3D12_HEAP_TYPE_READBACK + texture RT or DS + 9: D3D12_HEAP_TYPE_GPU_UPLOAD + buffer + 10: D3D12_HEAP_TYPE_GPU_UPLOAD + texture + 11: D3D12_HEAP_TYPE_GPU_UPLOAD + texture RT or DS */ - UINT GetDefaultPoolCount() const { return SupportsResourceHeapTier2() ? 3 : 9; } + UINT GetDefaultPoolCount() const { return SupportsResourceHeapTier2() ? 
4 : 12; } BlockVector** GetDefaultPools() { return m_BlockVectors; } HRESULT Init(const ALLOCATOR_DESC& desc); @@ -6631,8 +5994,8 @@ public: void SetResidencyPriority(ID3D12Pageable* obj, D3D12_RESIDENCY_PRIORITY priority) const; void SetCurrentFrameIndex(UINT frameIndex); - // For more deailed stats use outCutomHeaps to access statistics divided into L0 and L1 group - void CalculateStatistics(TotalStatistics& outStats, DetailedStatistics outCutomHeaps[2] = NULL); + // For more detailed stats use outCustomHeaps to access statistics divided into L0 and L1 group + void CalculateStatistics(TotalStatistics& outStats, DetailedStatistics outCustomHeaps[2] = NULL); void GetBudget(Budget* outLocalBudget, Budget* outNonLocalBudget); void GetBudgetForHeapType(Budget& outBudget, D3D12_HEAP_TYPE heapType); @@ -6646,6 +6009,7 @@ private: const bool m_UseMutex; const bool m_AlwaysCommitted; const bool m_MsaaAlwaysCommitted; + const bool m_PreferSmallBuffersCommitted; bool m_DefaultPoolsNotZeroed = false; ID3D12Device* m_Device; // AddRef #ifdef __ID3D12Device1_INTERFACE_DEFINED__ @@ -6669,6 +6033,7 @@ D3D12MA_ATOMIC_UINT32 m_CurrentFrameIndex; DXGI_ADAPTER_DESC m_AdapterDesc; D3D12_FEATURE_DATA_D3D12_OPTIONS m_D3D12Options; + BOOL m_GPUUploadHeapSupported = FALSE; D3D12_FEATURE_DATA_ARCHITECTURE m_D3D12Architecture; AllocationObjectAllocator m_AllocationObjectAllocator; @@ -6683,7 +6048,8 @@ dedicated allocation (committed resource rather than placed resource). */ template<typename D3D12_RESOURCE_DESC_T> - static bool PrefersCommittedAllocation(const D3D12_RESOURCE_DESC_T& resourceDesc); + bool PrefersCommittedAllocation(const D3D12_RESOURCE_DESC_T& resourceDesc, + ALLOCATION_FLAGS strategy); // Allocates and registers new committed resource with implicit heap, as dedicated allocation. // Creates and returns Allocation object and optionally D3D12 resource. @@ -6735,13 +6101,14 @@ AllocatorPimpl::AllocatorPimpl(const ALLOCATION_CALLBACKS& allocationCallbacks, : m_UseMutex((desc.Flags & ALLOCATOR_FLAG_SINGLETHREADED) == 0), m_AlwaysCommitted((desc.Flags & ALLOCATOR_FLAG_ALWAYS_COMMITTED) != 0), m_MsaaAlwaysCommitted((desc.Flags & ALLOCATOR_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED) != 0), + m_PreferSmallBuffersCommitted((desc.Flags & ALLOCATOR_FLAG_DONT_PREFER_SMALL_BUFFERS_COMMITTED) == 0), m_Device(desc.pDevice), m_Adapter(desc.pAdapter), m_PreferredBlockSize(desc.PreferredBlockSize != 0 ? desc.PreferredBlockSize : D3D12MA_DEFAULT_BLOCK_SIZE), m_AllocationCallbacks(allocationCallbacks), m_CurrentFrameIndex(0), // Below this line don't use allocationCallbacks but m_AllocationCallbacks!!! - m_AllocationObjectAllocator(m_AllocationCallbacks) + m_AllocationObjectAllocator(m_AllocationCallbacks, m_UseMutex) { // desc.pAllocationCallbacks intentionally ignored here, preprocessed by CreateAllocator.
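// [Editor's sketch, not part of the patch.] The m_UseMutex plumbing above means the pooled
// Allocation-object allocator now honors ALLOCATOR_FLAG_SINGLETHREADED as well: an
// application that already serializes all allocator calls can skip the internal locks.
#include "D3D12MemAlloc.h"
static D3D12MA::Allocator* CreateSingleThreadedAllocator(ID3D12Device* device, IDXGIAdapter* adapter)
{
    D3D12MA::ALLOCATOR_DESC desc = {};
    desc.Flags = D3D12MA::ALLOCATOR_FLAG_SINGLETHREADED; // disables internal mutexes
    desc.pDevice = device;
    desc.pAdapter = adapter;
    D3D12MA::Allocator* allocator = nullptr;
    return SUCCEEDED(D3D12MA::CreateAllocator(&desc, &allocator)) ? allocator : nullptr;
}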
ZeroMemory(&m_D3D12Options, sizeof(m_D3D12Options)); @@ -6753,7 +6120,7 @@ AllocatorPimpl::AllocatorPimpl(const ALLOCATION_CALLBACKS& allocationCallbacks, { m_CommittedAllocations[i].Init( m_UseMutex, - (D3D12_HEAP_TYPE)(D3D12_HEAP_TYPE_DEFAULT + i), + IndexToStandardHeapType(i), NULL); // pool } @@ -6808,6 +6175,17 @@ HRESULT AllocatorPimpl::Init(const ALLOCATOR_DESC& desc) m_D3D12Options.ResourceHeapTier = (D3D12MA_FORCE_RESOURCE_HEAP_TIER); #endif +#if D3D12MA_OPTIONS16_SUPPORTED + { + D3D12_FEATURE_DATA_D3D12_OPTIONS16 options16 = {}; + hr = m_Device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS16, &options16, sizeof(options16)); + if (SUCCEEDED(hr)) + { + m_GPUUploadHeapSupported = options16.GPUUploadHeapSupported; + } + } +#endif // #if D3D12MA_OPTIONS16_SUPPORTED + hr = m_Device->CheckFeatureSupport(D3D12_FEATURE_ARCHITECTURE, &m_D3D12Architecture, sizeof(m_D3D12Architecture)); if (FAILED(hr)) { @@ -6907,7 +6285,7 @@ UINT AllocatorPimpl::StandardHeapTypeToMemorySegmentGroup(D3D12_HEAP_TYPE heapTy D3D12MA_ASSERT(IsHeapTypeStandard(heapType)); if (IsUMA()) return DXGI_MEMORY_SEGMENT_GROUP_LOCAL_COPY; - return heapType == D3D12_HEAP_TYPE_DEFAULT ? + return (heapType == D3D12_HEAP_TYPE_DEFAULT || heapType == D3D12_HEAP_TYPE_GPU_UPLOAD_COPY) ? DXGI_MEMORY_SEGMENT_GROUP_LOCAL_COPY : DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL_COPY; } @@ -7259,7 +6637,7 @@ void AllocatorPimpl::SetCurrentFrameIndex(UINT frameIndex) #endif } -void AllocatorPimpl::CalculateStatistics(TotalStatistics& outStats, DetailedStatistics outCutomHeaps[2]) +void AllocatorPimpl::CalculateStatistics(TotalStatistics& outStats, DetailedStatistics outCustomHeaps[2]) { // Init stats for (size_t i = 0; i < HEAP_TYPE_COUNT; i++) @@ -7267,21 +6645,22 @@ void AllocatorPimpl::CalculateStatistics(TotalStatistics& outStats, DetailedStat for (size_t i = 0; i < DXGI_MEMORY_SEGMENT_GROUP_COUNT; i++) ClearDetailedStatistics(outStats.MemorySegmentGroup[i]); ClearDetailedStatistics(outStats.Total); - if (outCutomHeaps) + if (outCustomHeaps) { - ClearDetailedStatistics(outCutomHeaps[0]); - ClearDetailedStatistics(outCutomHeaps[1]); + ClearDetailedStatistics(outCustomHeaps[0]); + ClearDetailedStatistics(outCustomHeaps[1]); } - // Process default pools. 3 standard heap types only. Add them to outStats.HeapType[i]. + // Process default pools. 4 standard heap types only. Add them to outStats.HeapType[i]. if (SupportsResourceHeapTier2()) { - // DEFAULT, UPLOAD, READBACK. + // DEFAULT, UPLOAD, READBACK, GPU_UPLOAD. for (size_t heapTypeIndex = 0; heapTypeIndex < STANDARD_HEAP_TYPE_COUNT; ++heapTypeIndex) { BlockVector* const pBlockVector = m_BlockVectors[heapTypeIndex]; D3D12MA_ASSERT(pBlockVector); - pBlockVector->AddDetailedStatistics(outStats.HeapType[heapTypeIndex]); + const size_t outputIndex = heapTypeIndex < 3 ? heapTypeIndex : 4; // GPU_UPLOAD 3 -> 4 + pBlockVector->AddDetailedStatistics(outStats.HeapType[outputIndex]); } } else @@ -7293,7 +6672,9 @@ void AllocatorPimpl::CalculateStatistics(TotalStatistics& outStats, DetailedStat { BlockVector* const pBlockVector = m_BlockVectors[heapTypeIndex * 3 + heapSubType]; D3D12MA_ASSERT(pBlockVector); - pBlockVector->AddDetailedStatistics(outStats.HeapType[heapTypeIndex]); + + const size_t outputIndex = heapTypeIndex < 3 ? 
heapTypeIndex : 4; // GPU_UPLOAD 3 -> 4 + pBlockVector->AddDetailedStatistics(outStats.HeapType[outputIndex]); } } } @@ -7308,6 +6689,9 @@ AddDetailedStatistics( outStats.MemorySegmentGroup[StandardHeapTypeToMemorySegmentGroup(D3D12_HEAP_TYPE_READBACK)], outStats.HeapType[2]); + AddDetailedStatistics( + outStats.MemorySegmentGroup[StandardHeapTypeToMemorySegmentGroup(D3D12_HEAP_TYPE_GPU_UPLOAD_COPY)], + outStats.HeapType[4]); // Process custom pools. DetailedStatistics tmpStats; @@ -7327,20 +6711,21 @@ AddDetailedStatistics( outStats.MemorySegmentGroup[memorySegment], tmpStats); - if (outCutomHeaps) - AddDetailedStatistics(outCutomHeaps[memorySegment], tmpStats); + if (outCustomHeaps) + AddDetailedStatistics(outCustomHeaps[memorySegment], tmpStats); } } - // Process committed allocations. 3 standard heap types only. + // Process committed allocations. Standard heap types only. for (UINT heapTypeIndex = 0; heapTypeIndex < STANDARD_HEAP_TYPE_COUNT; ++heapTypeIndex) { ClearDetailedStatistics(tmpStats); m_CommittedAllocations[heapTypeIndex].AddDetailedStatistics(tmpStats); + const size_t outputIndex = heapTypeIndex < 3 ? heapTypeIndex : 4; // GPU_UPLOAD 3 -> 4 AddDetailedStatistics( - outStats.HeapType[heapTypeIndex], tmpStats); + outStats.HeapType[outputIndex], tmpStats); AddDetailedStatistics( - outStats.MemorySegmentGroup[StandardHeapTypeToMemorySegmentGroup(IndexToHeapType(heapTypeIndex))], tmpStats); + outStats.MemorySegmentGroup[StandardHeapTypeToMemorySegmentGroup(IndexToStandardHeapType(heapTypeIndex))], tmpStats); } // Sum up memory segment groups to totals. @@ -7360,19 +6745,24 @@ D3D12MA_ASSERT(outStats.Total.Stats.BlockCount == outStats.HeapType[0].Stats.BlockCount + outStats.HeapType[1].Stats.BlockCount + - outStats.HeapType[2].Stats.BlockCount + outStats.HeapType[3].Stats.BlockCount); + outStats.HeapType[2].Stats.BlockCount + outStats.HeapType[3].Stats.BlockCount + + outStats.HeapType[4].Stats.BlockCount); D3D12MA_ASSERT(outStats.Total.Stats.AllocationCount == outStats.HeapType[0].Stats.AllocationCount + outStats.HeapType[1].Stats.AllocationCount + - outStats.HeapType[2].Stats.AllocationCount + outStats.HeapType[3].Stats.AllocationCount); + outStats.HeapType[2].Stats.AllocationCount + outStats.HeapType[3].Stats.AllocationCount + + outStats.HeapType[4].Stats.AllocationCount); D3D12MA_ASSERT(outStats.Total.Stats.BlockBytes == outStats.HeapType[0].Stats.BlockBytes + outStats.HeapType[1].Stats.BlockBytes + - outStats.HeapType[2].Stats.BlockBytes + outStats.HeapType[3].Stats.BlockBytes); + outStats.HeapType[2].Stats.BlockBytes + outStats.HeapType[3].Stats.BlockBytes + + outStats.HeapType[4].Stats.BlockBytes); D3D12MA_ASSERT(outStats.Total.Stats.AllocationBytes == outStats.HeapType[0].Stats.AllocationBytes + outStats.HeapType[1].Stats.AllocationBytes + - outStats.HeapType[2].Stats.AllocationBytes + outStats.HeapType[3].Stats.AllocationBytes); + outStats.HeapType[2].Stats.AllocationBytes + outStats.HeapType[3].Stats.AllocationBytes + + outStats.HeapType[4].Stats.AllocationBytes); D3D12MA_ASSERT(outStats.Total.UnusedRangeCount == outStats.HeapType[0].UnusedRangeCount + outStats.HeapType[1].UnusedRangeCount + - outStats.HeapType[2].UnusedRangeCount + outStats.HeapType[3].UnusedRangeCount); + outStats.HeapType[2].UnusedRangeCount +
outStats.HeapType[3].UnusedRangeCount + + outStats.HeapType[4].UnusedRangeCount); } void AllocatorPimpl::GetBudget(Budget* outLocalBudget, Budget* outNonLocalBudget) @@ -7420,6 +6810,7 @@ void AllocatorPimpl::GetBudgetForHeapType(Budget& outBudget, D3D12_HEAP_TYPE hea switch (heapType) { case D3D12_HEAP_TYPE_DEFAULT: + case D3D12_HEAP_TYPE_GPU_UPLOAD_COPY: GetBudget(&outBudget, NULL); break; case D3D12_HEAP_TYPE_UPLOAD: @@ -7476,6 +6867,9 @@ json.WriteBool(m_D3D12Architecture.UMA); json.WriteString(L"CacheCoherentUMA"); json.WriteBool(m_D3D12Architecture.CacheCoherentUMA); + + json.WriteString(L"GPUUploadHeapSupported"); + json.WriteBool(m_GPUUploadHeapSupported != FALSE); } json.EndObject(); } @@ -7508,6 +6902,17 @@ json.AddDetailedStatisticsInfoObject(stats.HeapType[0]); } json.EndObject(); + + if(IsGPUUploadHeapSupported()) + { + json.WriteString(L"GPU_UPLOAD"); + json.BeginObject(); + { + json.WriteString(L"Stats"); + json.AddDetailedStatisticsInfoObject(stats.HeapType[4]); + } + json.EndObject(); + } } json.WriteString(L"UPLOAD"); json.BeginObject(); @@ -7558,6 +6963,17 @@ } json.EndObject(); + if(IsGPUUploadHeapSupported()) + { + json.WriteString(L"GPU_UPLOAD"); + json.BeginObject(); + { + json.WriteString(L"Stats"); + json.AddDetailedStatisticsInfoObject(stats.HeapType[4]); + } + json.EndObject(); + } + json.WriteString(L"CUSTOM"); json.BeginObject(); { @@ -7684,7 +7100,7 @@ { for (uint8_t heapType = 0; heapType < STANDARD_HEAP_TYPE_COUNT; ++heapType) { - json.WriteString(HeapTypeNames[heapType]); + json.WriteString(StandardHeapTypeNames[heapType]); json.BeginObject(); writeHeapInfo(m_BlockVectors[heapType], m_CommittedAllocations + heapType, false); json.EndObject(); } @@ -7701,11 +7117,11 @@ L" - Textures", L" - Textures RT/DS", }; - json.BeginString(HeapTypeNames[heapType]); + json.BeginString(StandardHeapTypeNames[heapType]); json.EndString(heapSubTypeName[heapSubType]); json.BeginObject(); - writeHeapInfo(m_BlockVectors[heapType + heapSubType], m_CommittedAllocations + heapType, false); + writeHeapInfo(m_BlockVectors[heapType * 3 + heapSubType], m_CommittedAllocations + heapType, false); json.EndObject(); } } @@ -7763,8 +7179,19 @@ void AllocatorPimpl::FreeStatsString(WCHAR* pStatsString) } template<typename D3D12_RESOURCE_DESC_T> -bool AllocatorPimpl::PrefersCommittedAllocation(const D3D12_RESOURCE_DESC_T& resourceDesc) +bool AllocatorPimpl::PrefersCommittedAllocation(const D3D12_RESOURCE_DESC_T& resourceDesc, + ALLOCATION_FLAGS strategy) { + // Prefer creating small buffers <= 32 KB as committed, because drivers pack them better, + // while placed buffers require 64 KB alignment. + if(resourceDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER && + resourceDesc.Width <= D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT / 2 && + strategy != ALLOCATION_FLAG_STRATEGY_MIN_TIME && // Creating as committed would be slower. + m_PreferSmallBuffersCommitted) + { + return true; + } + // Intentional. It may change in the future.
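// [Editor's note, illustrative only.] The new heuristic above in numbers: a placed buffer
// occupies a 64 KB-aligned slot (D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT), so e.g. a 4 KB
// buffer wastes 60 KB inside a block, while drivers can pack committed small buffers more
// tightly. Hence buffers <= 32 KB prefer committed, unless the caller asked for
// ALLOCATION_FLAG_STRATEGY_MIN_TIME or the allocator was created with
// ALLOCATOR_FLAG_DONT_PREFER_SMALL_BUFFERS_COMMITTED. Standalone restatement:
#include <cstdint>
static bool PrefersCommittedSmallBuffer(uint64_t bufferWidth, bool strategyMinTime,
        bool dontPreferSmallBuffersCommitted)
{
    const uint64_t placementAlignment = 65536; // D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT
    return !dontPreferSmallBuffersCommitted &&
            !strategyMinTime && // creating committed resources is slower
            bufferWidth <= placementAlignment / 2; // <= 32 KB
}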
return false; } @@ -7897,16 +7324,8 @@ HRESULT AllocatorPimpl::AllocateCommittedResource( } if (SUCCEEDED(hr)) { - BOOL wasZeroInitialized = TRUE; -#if D3D12MA_CREATE_NOT_ZEROED_AVAILABLE - if((committedAllocParams.m_HeapFlags & D3D12_HEAP_FLAG_CREATE_NOT_ZEROED) != 0) - { - wasZeroInitialized = FALSE; - } -#endif - Allocation* alloc = m_AllocationObjectAllocator.Allocate( - this, resourceSize, createParams.GetBaseResourceDesc()->Alignment, wasZeroInitialized); + this, resourceSize, createParams.GetBaseResourceDesc()->Alignment); alloc->InitCommitted(committedAllocParams.m_List); alloc->SetResourcePointer(res, createParams.GetBaseResourceDesc()); alloc->SetPrivateData(pPrivateData); @@ -7965,16 +7384,7 @@ HRESULT AllocatorPimpl::AllocateHeap( if (SUCCEEDED(hr)) { SetResidencyPriority(heap, committedAllocParams.m_ResidencyPriority); - - BOOL wasZeroInitialized = TRUE; -#if D3D12MA_CREATE_NOT_ZEROED_AVAILABLE - if((heapDesc.Flags & D3D12_HEAP_FLAG_CREATE_NOT_ZEROED) != 0) - { - wasZeroInitialized = FALSE; - } -#endif - - (*ppAllocation) = m_AllocationObjectAllocator.Allocate(this, allocInfo.SizeInBytes, allocInfo.Alignment, wasZeroInitialized); + (*ppAllocation) = m_AllocationObjectAllocator.Allocate(this, allocInfo.SizeInBytes, allocInfo.Alignment); (*ppAllocation)->InitHeap(committedAllocParams.m_List, heap); (*ppAllocation)->SetPrivateData(pPrivateData); committedAllocParams.m_List->Register(*ppAllocation); @@ -7995,13 +7405,17 @@ HRESULT AllocatorPimpl::CalcAllocationParams(const ALLOCATION_DESC& allocDesc, U outCommittedAllocationParams = CommittedAllocationParameters(); outPreferCommitted = false; + if (allocDesc.HeapType == D3D12_HEAP_TYPE_GPU_UPLOAD_COPY && !IsGPUUploadHeapSupported()) + return E_NOTIMPL; + bool msaaAlwaysCommitted; if (allocDesc.CustomPool != NULL) { PoolPimpl* const pool = allocDesc.CustomPool->m_Pimpl; msaaAlwaysCommitted = pool->GetBlockVector()->DeniesMsaaTextures(); - outBlockVector = pool->GetBlockVector(); + if(!pool->AlwaysCommitted()) + outBlockVector = pool->GetBlockVector(); const auto& desc = pool->GetDesc(); outCommittedAllocationParams.m_ProtectedSession = desc.pProtectedSession; @@ -8020,7 +7434,7 @@ HRESULT AllocatorPimpl::CalcAllocationParams(const ALLOCATION_DESC& allocDesc, U outCommittedAllocationParams.m_HeapProperties = StandardHeapTypeToHeapProperties(allocDesc.HeapType); outCommittedAllocationParams.m_HeapFlags = allocDesc.ExtraHeapFlags; - outCommittedAllocationParams.m_List = &m_CommittedAllocations[HeapTypeToIndex(allocDesc.HeapType)]; + outCommittedAllocationParams.m_List = &m_CommittedAllocations[StandardHeapTypeToIndex(allocDesc.HeapType)]; // outCommittedAllocationParams.m_ResidencyPriority intentionally left with default value. const ResourceClass resourceClass = (resDesc != NULL) ? 
@@ -8040,12 +7454,6 @@ HRESULT AllocatorPimpl::CalcAllocationParams(const ALLOCATION_DESC& allocDesc, U outPreferCommitted = true; } } - - const D3D12_HEAP_FLAGS extraHeapFlags = allocDesc.ExtraHeapFlags & ~RESOURCE_CLASS_HEAP_FLAGS; - if (outBlockVector != NULL && extraHeapFlags != 0) - { - outBlockVector = NULL; - } } if ((allocDesc.Flags & ALLOCATION_FLAG_COMMITTED) != 0 || @@ -8063,7 +7471,7 @@ HRESULT AllocatorPimpl::CalcAllocationParams(const ALLOCATION_DESC& allocDesc, U { if (resDesc->SampleDesc.Count > 1 && msaaAlwaysCommitted) outBlockVector = NULL; - if (!outPreferCommitted && PrefersCommittedAllocation(*resDesc)) + if (!outPreferCommitted && PrefersCommittedAllocation(*resDesc, allocDesc.Flags & ALLOCATION_FLAG_STRATEGY_MASK)) outPreferCommitted = true; } @@ -8075,12 +7483,7 @@ UINT AllocatorPimpl::CalcDefaultPoolIndex(const ALLOCATION_DESC& allocDesc, Reso D3D12_HEAP_FLAGS extraHeapFlags = allocDesc.ExtraHeapFlags & ~RESOURCE_CLASS_HEAP_FLAGS; #if D3D12MA_CREATE_NOT_ZEROED_AVAILABLE - // If allocator was created with ALLOCATOR_FLAG_DEFAULT_POOLS_NOT_ZEROED, also ignore - // D3D12_HEAP_FLAG_CREATE_NOT_ZEROED. - if(m_DefaultPoolsNotZeroed) - { - extraHeapFlags &= ~D3D12_HEAP_FLAG_CREATE_NOT_ZEROED; - } + extraHeapFlags &= ~D3D12_HEAP_FLAG_CREATE_NOT_ZEROED; #endif if (extraHeapFlags != 0) @@ -8094,6 +7497,7 @@ UINT AllocatorPimpl::CalcDefaultPoolIndex(const ALLOCATION_DESC& allocDesc, Reso case D3D12_HEAP_TYPE_DEFAULT: poolIndex = 0; break; case D3D12_HEAP_TYPE_UPLOAD: poolIndex = 1; break; case D3D12_HEAP_TYPE_READBACK: poolIndex = 2; break; + case D3D12_HEAP_TYPE_GPU_UPLOAD_COPY: poolIndex = 3; break; default: D3D12MA_ASSERT(0); } @@ -8149,6 +7553,9 @@ void AllocatorPimpl::CalcDefaultPoolParams(D3D12_HEAP_TYPE& outHeapType, D3D12_H case 2: outHeapType = D3D12_HEAP_TYPE_READBACK; break; + case 3: + outHeapType = D3D12_HEAP_TYPE_GPU_UPLOAD_COPY; + break; default: D3D12MA_ASSERT(0); } @@ -8156,7 +7563,7 @@ void AllocatorPimpl::CalcDefaultPoolParams(D3D12_HEAP_TYPE& outHeapType, D3D12_H void AllocatorPimpl::RegisterPool(Pool* pool, D3D12_HEAP_TYPE heapType) { - const UINT heapTypeIndex = HeapTypeToIndex(heapType); + const UINT heapTypeIndex = (UINT)heapType - 1; MutexLockWrite lock(m_PoolsMutex[heapTypeIndex], m_UseMutex); m_Pools[heapTypeIndex].PushBack(pool->m_Pimpl); @@ -8164,7 +7571,7 @@ void AllocatorPimpl::RegisterPool(Pool* pool, D3D12_HEAP_TYPE heapType) void AllocatorPimpl::UnregisterPool(Pool* pool, D3D12_HEAP_TYPE heapType) { - const UINT heapTypeIndex = HeapTypeToIndex(heapType); + const UINT heapTypeIndex = (UINT)heapType - 1; MutexLockWrite lock(m_PoolsMutex[heapTypeIndex], m_UseMutex); m_Pools[heapTypeIndex].Remove(pool->m_Pimpl); @@ -8184,12 +7591,13 @@ HRESULT AllocatorPimpl::UpdateD3D12Budget() D3D12_RESOURCE_ALLOCATION_INFO AllocatorPimpl::GetResourceAllocationInfoNative(const D3D12_RESOURCE_DESC& resourceDesc) const { + // This is how new D3D12 headers define GetResourceAllocationInfo function - + // different signature depending on these macros. 
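// [Editor's sketch, not part of the patch; assumes an SDK that exposes the public
// D3D12_HEAP_TYPE_GPU_UPLOAD enum mirrored by the library's internal
// D3D12_HEAP_TYPE_GPU_UPLOAD_COPY constant.] How a caller would reach the new default pools:
// request the GPU upload heap when the allocator reports support, falling back to the classic
// UPLOAD heap otherwise, mirroring the E_NOTIMPL check in CalcAllocationParams above.
#include "D3D12MemAlloc.h"
static HRESULT CreateCpuWritableBuffer(D3D12MA::Allocator* allocator,
        const D3D12_RESOURCE_DESC& bufferDesc, D3D12MA::Allocation** outAlloc)
{
    D3D12MA::ALLOCATION_DESC allocDesc = {};
    allocDesc.HeapType = allocator->IsGPUUploadHeapSupported()
            ? D3D12_HEAP_TYPE_GPU_UPLOAD
            : D3D12_HEAP_TYPE_UPLOAD;
    // GENERIC_READ is the initial state the UPLOAD fallback path requires.
    return allocator->CreateResource(&allocDesc, &bufferDesc, D3D12_RESOURCE_STATE_GENERIC_READ,
            nullptr, outAlloc, IID_NULL, nullptr);
}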
#if defined(_MSC_VER) || !defined(_WIN32) return m_Device->GetResourceAllocationInfo(0, 1, &resourceDesc); #else - D3D12_RESOURCE_ALLOCATION_INFO ret; - m_Device->GetResourceAllocationInfo(&ret, 0, 1, &resourceDesc); - return ret; + D3D12_RESOURCE_ALLOCATION_INFO retVal; + return *m_Device->GetResourceAllocationInfo(&retVal, 0, 1, &resourceDesc); #endif } @@ -8198,12 +7606,14 @@ D3D12_RESOURCE_ALLOCATION_INFO AllocatorPimpl::GetResourceAllocationInfoNative(c { D3D12MA_ASSERT(m_Device8 != NULL); D3D12_RESOURCE_ALLOCATION_INFO1 info1Unused; + + // This is how new D3D12 headers define GetResourceAllocationInfo function - + // different signature depending on these macros. #if defined(_MSC_VER) || !defined(_WIN32) return m_Device8->GetResourceAllocationInfo2(0, 1, &resourceDesc, &info1Unused); #else - D3D12_RESOURCE_ALLOCATION_INFO ret; - m_Device8->GetResourceAllocationInfo2(&ret, 0, 1, &resourceDesc, &info1Unused); - return ret; + D3D12_RESOURCE_ALLOCATION_INFO retVal; + return *m_Device8->GetResourceAllocationInfo2(&retVal, 0, 1, &resourceDesc, &info1Unused); #endif } #endif // #ifdef __ID3D12Device8_INTERFACE_DEFINED__ @@ -8211,6 +7621,7 @@ D3D12_RESOURCE_ALLOCATION_INFO AllocatorPimpl::GetResourceAllocationInfoNative(c template<typename D3D12_RESOURCE_DESC_T> D3D12_RESOURCE_ALLOCATION_INFO AllocatorPimpl::GetResourceAllocationInfo(D3D12_RESOURCE_DESC_T& inOutResourceDesc) const { +#ifdef __ID3D12Device1_INTERFACE_DEFINED__ /* Optional optimization: Microsoft documentation says: https://docs.microsoft.com/en-us/windows/win32/api/d3d12/nf-d3d12-id3d12device-getresourceallocationinfo @@ -8226,6 +7637,7 @@ AlignUp(inOutResourceDesc.Width, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT), // SizeInBytes D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT }; // Alignment } +#endif // #ifdef __ID3D12Device1_INTERFACE_DEFINED__ #if D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT if (inOutResourceDesc.Alignment == 0 && @@ -8987,7 +8399,7 @@ HRESULT BlockVector::CommitAllocationRequest( if (pBlock->m_pMetadata->IsEmpty()) m_HasEmptyBlock = false; - *pAllocation = m_hAllocator->GetAllocationObjectAllocator().Allocate(m_hAllocator, size, alignment, allocRequest.zeroInitialized); + *pAllocation = m_hAllocator->GetAllocationObjectAllocator().Allocate(m_hAllocator, size, alignment); pBlock->m_pMetadata->Alloc(allocRequest, size, *pAllocation); (*pAllocation)->InitPlaced(allocRequest.allocHandle, pBlock); @@ -9900,11 +9312,6 @@ void Allocation::SetName(LPCWSTR Name) void Allocation::ReleaseThis() { - if (this == NULL) - { - return; - } - SAFE_RELEASE(m_Resource); switch (m_PackedData.GetType()) @@ -9925,7 +9332,7 @@ m_Allocator->GetAllocationObjectAllocator().Free(this); } -Allocation::Allocation(AllocatorPimpl* allocator, UINT64 size, UINT64 alignment, BOOL wasZeroInitialized) +Allocation::Allocation(AllocatorPimpl* allocator, UINT64 size, UINT64 alignment) : m_Allocator{ allocator }, m_Size{ size }, m_Alignment{ alignment }, @@ -9939,7 +9346,6 @@ m_PackedData.SetResourceDimension(D3D12_RESOURCE_DIMENSION_UNKNOWN); m_PackedData.SetResourceFlags(D3D12_RESOURCE_FLAG_NONE); m_PackedData.SetTextureLayout(D3D12_TEXTURE_LAYOUT_UNKNOWN); - m_PackedData.SetWasZeroInitialized(wasZeroInitialized); } void Allocation::InitCommitted(CommittedAllocationList* list) @@ -9973,7 +9379,6 @@ void Allocation::SwapBlockAllocation(Allocation* allocation)
D3D12MA_ASSERT(allocation->m_PackedData.GetType() == TYPE_PLACED); D3D12MA_SWAP(m_Resource, allocation->m_Resource); - m_PackedData.SetWasZeroInitialized(allocation->m_PackedData.WasZeroInitialized()); m_Placed.block->m_pMetadata->SetAllocationPrivateData(m_Placed.allocHandle, allocation); D3D12MA_SWAP(m_Placed, allocation->m_Placed); m_Placed.block->m_pMetadata->SetAllocationPrivateData(m_Placed.allocHandle, this); @@ -10051,11 +9456,6 @@ void DefragmentationContext::GetStats(DEFRAGMENTATION_STATS* pStats) void DefragmentationContext::ReleaseThis() { - if (this == NULL) - { - return; - } - D3D12MA_DELETE(m_Pimpl->GetAllocs(), this); } @@ -10108,6 +9508,8 @@ HRESULT Pool::BeginDefragmentation(const DEFRAGMENTATION_DESC* pDesc, Defragment // Check for support if (m_Pimpl->GetBlockVector()->GetAlgorithm() & POOL_FLAG_ALGORITHM_LINEAR) return E_NOINTERFACE; + if(m_Pimpl->AlwaysCommitted()) + return E_NOINTERFACE; AllocatorPimpl* allocator = m_Pimpl->GetAllocator(); *ppContext = D3D12MA_NEW(allocator->GetAllocs(), DefragmentationContext)(allocator, *pDesc, m_Pimpl->GetBlockVector()); @@ -10116,11 +9518,6 @@ HRESULT Pool::BeginDefragmentation(const DEFRAGMENTATION_DESC* pDesc, Defragment void Pool::ReleaseThis() { - if (this == NULL) - { - return; - } - D3D12MA_DELETE(m_Pimpl->GetAllocator()->GetAllocs(), this); } @@ -10151,6 +9548,11 @@ BOOL Allocator::IsCacheCoherentUMA() const return m_Pimpl->IsCacheCoherentUMA(); } +BOOL Allocator::IsGPUUploadHeapSupported() const +{ + return m_Pimpl->IsGPUUploadHeapSupported(); +} + UINT64 Allocator::GetMemoryCapacity(UINT memorySegmentGroup) const { return m_Pimpl->GetMemoryCapacity(memorySegmentGroup); @@ -10331,6 +9733,12 @@ HRESULT Allocator::CreatePool( D3D12MA_ASSERT(0 && "Invalid arguments passed to Allocator::CreatePool."); return E_INVALIDARG; } + if ((pPoolDesc->Flags & POOL_FLAG_ALWAYS_COMMITTED) != 0 && + (pPoolDesc->BlockSize != 0 || pPoolDesc->MinBlockCount > 0)) + { + D3D12MA_ASSERT(0 && "Invalid arguments passed to Allocator::CreatePool while POOL_FLAG_ALWAYS_COMMITTED is specified."); + return E_INVALIDARG; + } if (!m_Pimpl->HeapFlagsFulfillResourceHeapTier(pPoolDesc->HeapFlags)) { D3D12MA_ASSERT(0 && "Invalid pPoolDesc->HeapFlags passed to Allocator::CreatePool. Did you forget to handle ResourceHeapTier=1?"); diff --git a/thirdparty/d3d12ma/D3D12MemAlloc.h b/thirdparty/d3d12ma/D3D12MemAlloc.h index 4ab7be318e6..9cd1c33ef63 100644 --- a/thirdparty/d3d12ma/D3D12MemAlloc.h +++ b/thirdparty/d3d12ma/D3D12MemAlloc.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ /** \mainpage D3D12 Memory Allocator -Version 2.1.0-development (2022-12-15) +Version 2.1.0-development (2024-07-05) -Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. \n +Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. 
\n License: MIT Documentation of all members: D3D12MemAlloc.h @@ -38,7 +38,16 @@ Documentation of all members: D3D12MemAlloc.h - [Creating resources](@ref quick_start_creating_resources) - [Resource reference counting](@ref quick_start_resource_reference_counting) - [Mapping memory](@ref quick_start_mapping_memory) + - [Helper structures](@ref quick_start_helper_structures) - \subpage custom_pools +- \subpage optimal_allocation + - [Avoiding running out of memory](@ref optimal_allocation_avoiding_running_out_of_memory) + - [Allocation performance](@ref optimal_allocation_allocation_Performance) + - [Sub-allocating buffers](@ref optimal_allocation_suballocating_buffers) + - [Residency priority](@ref optimal_allocation_residency_priority) + - [GPU upload heap](@ref optimal_allocation_gpu_upload_heap) + - [Committed versus placed resources](@ref optimal_allocation_committed_vs_placed) + - [Resource alignment](@ref optimal_allocation_resource_alignment) - \subpage defragmentation - \subpage statistics - \subpage resource_aliasing @@ -52,49 +61,87 @@ Documentation of all members: D3D12MemAlloc.h - [Versioning and compatibility](@ref general_considerations_versioning_and_compatibility) - [Features not supported](@ref general_considerations_features_not_supported) -\section main_see_also See also +\section main_see_also Web links -- [Product page on GPUOpen](https://gpuopen.com/gaming-product/d3d12-memory-allocator/) -- [Source repository on GitHub](https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator) +- [Direct3D 12 Memory Allocator at GPUOpen.com](https://gpuopen.com/gaming-product/d3d12-memory-allocator/) - product page +- [GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator at GitHub.com](https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator) - source code repository */ // If using this library on a platform different than Windows PC or want to use different version of DXGI, -// you should include D3D12-compatible headers before this library on your own and define this macro. +// you should include D3D12-compatible headers before this library on your own and define +// D3D12MA_D3D12_HEADERS_ALREADY_INCLUDED. +// Alternatively, if you are targeting the open sourced DirectX headers, defining D3D12MA_USING_DIRECTX_HEADERS +// will include them rather than the ones provided by the Windows SDK. #ifndef D3D12MA_D3D12_HEADERS_ALREADY_INCLUDED - #include <d3d12.h> + #if defined(D3D12MA_USING_DIRECTX_HEADERS) + #include <directx/d3d12.h> + #include <dxguids/dxguids.h> + #else + #include <d3d12.h> + #endif + #include <dxgi1_4.h> #endif -// Define this macro to 0 to disable usage of DXGI 1.4 (needed for IDXGIAdapter3 and query for memory budget). #ifndef D3D12MA_DXGI_1_4 #ifdef __IDXGIAdapter3_INTERFACE_DEFINED__ + /// Define this macro to 0 to disable usage of DXGI 1.4 (which is used for `IDXGIAdapter3` and query for memory budget). #define D3D12MA_DXGI_1_4 1 #else #define D3D12MA_DXGI_1_4 0 #endif #endif -/* When defined to value other than 0, the library will try to use -D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT or D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT -for created textures when possible, which can save memory because some small textures -may get their alignment 4K and their size a multiply of 4K instead of 64K. +#ifndef D3D12MA_CREATE_NOT_ZEROED_AVAILABLE + #ifdef __ID3D12Device8_INTERFACE_DEFINED__ + /// This macro is defined to 0 or 1 automatically. Define it to 0 to disable support for `D3D12_HEAP_FLAG_CREATE_NOT_ZEROED`.
+ #define D3D12MA_CREATE_NOT_ZEROED_AVAILABLE 1 + #else + #define D3D12MA_CREATE_NOT_ZEROED_AVAILABLE 0 + #endif +#endif -#define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 0 - Disables small texture alignment. -#define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 1 - Enables conservative algorithm that will use small alignment only for some textures - that are surely known to support it. -#define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 2 - Enables query for small alignment to D3D12 (based on Microsoft sample) which will - enable small alignment for more textures, but will also generate D3D Debug Layer - error #721 on call to ID3D12Device::GetResourceAllocationInfo, which you should just - ignore. -*/ #ifndef D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT + /** \brief + When defined to value other than 0, the library will try to use + `D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT` or `D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT` + for created textures when possible, which can save memory because some small textures + may get their alignment 4 KB and their size a multiple of 4 KB instead of 64 KB. + + - `#define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 0` - + Disables small texture alignment. + - `#define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 1` (the default) - + Enables conservative algorithm that will use small alignment only for some textures + that are surely known to support it. + - `#define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 2` - + Enables query for small alignment to D3D12 (based on Microsoft sample) which will + enable small alignment for more textures, but will also generate D3D Debug Layer + error #721 on call to `ID3D12Device::GetResourceAllocationInfo`, which you should just + ignore. + */ #define D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT 1 #endif +#ifndef D3D12MA_RECOMMENDED_ALLOCATOR_FLAGS + /// Set of flags recommended for use in D3D12MA::ALLOCATOR_DESC::Flags for optimal performance. + #define D3D12MA_RECOMMENDED_ALLOCATOR_FLAGS (ALLOCATOR_FLAG_DEFAULT_POOLS_NOT_ZEROED | ALLOCATOR_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED) +#endif + +#ifndef D3D12MA_RECOMMENDED_HEAP_FLAGS + #if D3D12MA_CREATE_NOT_ZEROED_AVAILABLE + #define D3D12MA_RECOMMENDED_HEAP_FLAGS (D3D12_HEAP_FLAG_CREATE_NOT_ZEROED) + #else + /// Set of flags recommended for use in D3D12MA::POOL_DESC::HeapFlags for optimal performance. + #define D3D12MA_RECOMMENDED_HEAP_FLAGS (D3D12_HEAP_FLAG_NONE) + #endif +#endif + +#ifndef D3D12MA_RECOMMENDED_POOL_FLAGS + /// Set of flags recommended for use in D3D12MA::POOL_DESC::Flags for optimal performance.
+ #define D3D12MA_RECOMMENDED_POOL_FLAGS (POOL_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED) +#endif + + /// \cond INTERNAL #define D3D12MA_CLASS_NO_COPY(className) \ @@ -151,9 +198,9 @@ class D3D12MA_API IUnknownImpl : public IUnknown { public: virtual ~IUnknownImpl() = default; - virtual HRESULT STDMETHODCALLTYPE QueryInterface(REFIID riid, void** ppvObject); - virtual ULONG STDMETHODCALLTYPE AddRef(); - virtual ULONG STDMETHODCALLTYPE Release(); + HRESULT STDMETHODCALLTYPE QueryInterface(REFIID riid, void** ppvObject) override; + ULONG STDMETHODCALLTYPE AddRef() override; + ULONG STDMETHODCALLTYPE Release() override; protected: virtual void ReleaseThis() { delete this; } private: @@ -256,18 +303,18 @@ enum ALLOCATION_FLAGS */ ALLOCATION_FLAG_CAN_ALIAS = 0x10, - /** Allocation strategy that chooses smallest possible free range for the allocation + /** %Allocation strategy that chooses smallest possible free range for the allocation to minimize memory usage and fragmentation, possibly at the expense of allocation time. */ ALLOCATION_FLAG_STRATEGY_MIN_MEMORY = 0x00010000, - /** Allocation strategy that chooses first suitable free range for the allocation - + /** %Allocation strategy that chooses first suitable free range for the allocation - not necessarily in terms of the smallest offset but the one that is easiest and fastest to find to minimize allocation time, possibly at the expense of allocation quality. */ ALLOCATION_FLAG_STRATEGY_MIN_TIME = 0x00020000, - /** Allocation strategy that chooses always the lowest offset in available space. + /** %Allocation strategy that chooses always the lowest offset in available space. This is not the most efficient strategy but achieves highly packed data. Used internally by defragmentation, not recomended in typical usage. */ @@ -288,7 +335,7 @@ enum ALLOCATION_FLAGS /// \brief Parameters of created D3D12MA::Allocation object. To be used with Allocator::CreateResource. struct ALLOCATION_DESC { - /// Flags. + /// Flags for the allocation. ALLOCATION_FLAGS Flags; /** \brief The type of memory heap where the new allocation should be placed. @@ -315,7 +362,8 @@ struct ALLOCATION_DESC D3D12_HEAP_FLAGS ExtraHeapFlags; /** \brief Custom pool to place the new resource in. Optional. - When not NULL, the resource will be created inside specified custom pool. + When not null, the resource will be created inside specified custom pool. + Members `HeapType`, `ExtraHeapFlags` are then ignored. */ Pool* CustomPool; /// Custom general-purpose pointer that will be stored in D3D12MA::Allocation. @@ -393,8 +441,9 @@ struct TotalStatistics - 1 = `D3D12_HEAP_TYPE_UPLOAD` - 2 = `D3D12_HEAP_TYPE_READBACK` - 3 = `D3D12_HEAP_TYPE_CUSTOM` + - 4 = `D3D12_HEAP_TYPE_GPU_UPLOAD` */ - DetailedStatistics HeapType[4]; + DetailedStatistics HeapType[5]; /** \brief One element for each memory segment group located at the following indices: - 0 = `DXGI_MEMORY_SEGMENT_GROUP_LOCAL` @@ -404,9 +453,9 @@ struct TotalStatistics - When `IsUMA() == FALSE` (discrete graphics card): - `DXGI_MEMORY_SEGMENT_GROUP_LOCAL` (index 0) represents GPU memory - (resources allocated in `D3D12_HEAP_TYPE_DEFAULT` or `D3D12_MEMORY_POOL_L1`). + (resources allocated in `D3D12_HEAP_TYPE_DEFAULT`, `D3D12_HEAP_TYPE_GPU_UPLOAD` or `D3D12_MEMORY_POOL_L1`). - `DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL` (index 1) represents system memory - (resources allocated in `D3D12_HEAP_TYPE_UPLOAD`, `D3D12_HEAP_TYPE_READBACK`, or `D3D12_MEMORY_POOL_L0`). 
+      (resources allocated in `D3D12_HEAP_TYPE_UPLOAD`, `D3D12_HEAP_TYPE_READBACK`, or `D3D12_MEMORY_POOL_L0`).
     - When `IsUMA() == TRUE` (integrated graphics chip):
       - `DXGI_MEMORY_SEGMENT_GROUP_LOCAL` = (index 0) represents memory shared for all the resources.
       - `DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL` = (index 1) is unused and always 0.
@@ -501,7 +550,13 @@ public:
     */
     ID3D12Resource* GetResource() const { return m_Resource; }

-    /// Releases the resource currently pointed by the allocation (if any), sets it to new one, incrementing its reference counter (if not null).
+    /** \brief Releases the resource currently pointed to by the allocation (if not null) and sets it to a new one, incrementing its reference counter (if not null).
+
+    \warning
+    This is an advanced feature that should be used only in special cases, e.g. during \subpage defragmentation.
+    Typically, an allocation object should reference the resource that was created together with it.
+    If you swap it to another resource of different size, \subpage statistics and budgets can be calculated incorrectly.
+    */
     void SetResource(ID3D12Resource* pResource);

     /** \brief Returns memory heap that the resource is created in.
@@ -533,26 +588,6 @@ public:
     */
     LPCWSTR GetName() const { return m_Name; }

-    /** \brief Returns `TRUE` if the memory of the allocation was filled with zeros when the allocation was created.
-
-    Returns `TRUE` only if the allocator is sure that the entire memory where the
-    allocation was created was filled with zeros at the moment the allocation was made.
-
-    Returns `FALSE` if the memory could potentially contain garbage data.
-    If it's a render-target or depth-stencil texture, it then needs proper
-    initialization with `ClearRenderTargetView`, `ClearDepthStencilView`, `DiscardResource`,
-    or a copy operation, as described on page
-    "ID3D12Device::CreatePlacedResource method - Notes on the required resource initialization" in Microsoft documentation.
-    Please note that rendering a fullscreen triangle or quad to the texture as
-    a render target is not a proper way of initialization!
-
-    See also articles:
-
-    - "Coming to DirectX 12: More control over memory allocation" on DirectX Developer Blog
-    - ["Initializing DX12 Textures After Allocation and Aliasing"](https://asawicki.info/news_1724_initializing_dx12_textures_after_allocation_and_aliasing).
-    */
-    BOOL WasZeroInitialized() const { return m_PackedData.WasZeroInitialized(); }
-
 protected:
     void ReleaseThis() override;

@@ -611,29 +646,26 @@ private:
     {
     public:
         PackedData() :
-            m_Type(0), m_ResourceDimension(0), m_ResourceFlags(0), m_TextureLayout(0), m_WasZeroInitialized(0) { }
+            m_Type(0), m_ResourceDimension(0), m_ResourceFlags(0), m_TextureLayout(0) { }

         Type GetType() const { return (Type)m_Type; }
         D3D12_RESOURCE_DIMENSION GetResourceDimension() const { return (D3D12_RESOURCE_DIMENSION)m_ResourceDimension; }
         D3D12_RESOURCE_FLAGS GetResourceFlags() const { return (D3D12_RESOURCE_FLAGS)m_ResourceFlags; }
         D3D12_TEXTURE_LAYOUT GetTextureLayout() const { return (D3D12_TEXTURE_LAYOUT)m_TextureLayout; }
-        BOOL WasZeroInitialized() const { return (BOOL)m_WasZeroInitialized; }

         void SetType(Type type);
         void SetResourceDimension(D3D12_RESOURCE_DIMENSION resourceDimension);
         void SetResourceFlags(D3D12_RESOURCE_FLAGS resourceFlags);
         void SetTextureLayout(D3D12_TEXTURE_LAYOUT textureLayout);
-        void SetWasZeroInitialized(BOOL wasZeroInitialized) { m_WasZeroInitialized = wasZeroInitialized ? 1 : 0; }
     private:
         UINT m_Type : 2;               // enum Type
         UINT m_ResourceDimension : 3;  // enum D3D12_RESOURCE_DIMENSION
         UINT m_ResourceFlags : 24;     // flags D3D12_RESOURCE_FLAGS
         UINT m_TextureLayout : 9;      // enum D3D12_TEXTURE_LAYOUT
-        UINT m_WasZeroInitialized : 1; // BOOL
     } m_PackedData;

-    Allocation(AllocatorPimpl* allocator, UINT64 size, UINT64 alignment, BOOL wasZeroInitialized);
+    Allocation(AllocatorPimpl* allocator, UINT64 size, UINT64 alignment);
     // Nothing here, everything already done in Release.
     virtual ~Allocation() = default;
@@ -833,7 +865,7 @@ enum POOL_FLAGS
     /// Zero
     POOL_FLAG_NONE = 0,

-    /** \brief Enables alternative, linear allocation algorithm in this pool.
+    /** Enables alternative, linear allocation algorithm in this pool.

     Specify this flag to enable linear allocation algorithm, which always creates
     new allocations after last one and doesn't reuse space from allocations freed in
@@ -846,13 +878,20 @@
     */
     POOL_FLAG_ALGORITHM_LINEAR = 0x1,

-    /** \brief Optimization, allocate MSAA textures as committed resources always.
+    /** Optimization, allocate MSAA textures as committed resources always.

     Specify this flag to create MSAA textures with implicit heaps, as if they were created
     with flag D3D12MA::ALLOCATION_FLAG_COMMITTED. Usage of this flag enables the pool to create
     its heaps on a smaller alignment not suitable for MSAA textures.
+
+    You should always use this flag unless you really need to create some MSAA textures in this pool as placed.
     */
     POOL_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED = 0x2,
+    /** Every allocation made in this pool will be created as a committed resource - will have its own memory block.
+
+    There is also an equivalent flag for the entire allocator: D3D12MA::ALLOCATOR_FLAG_ALWAYS_COMMITTED.
+    */
+    POOL_FLAG_ALWAYS_COMMITTED = 0x4,

     // Bit mask to extract only `ALGORITHM` bits from entire set of flags.
     POOL_FLAG_ALGORITHM_MASK = POOL_FLAG_ALGORITHM_LINEAR
@@ -861,7 +900,10 @@
 /// \brief Parameters of created D3D12MA::Pool object. To be used with D3D12MA::Allocator::CreatePool.
 struct POOL_DESC
 {
-    /// Flags.
+    /** \brief Flags for the pool.
+
+    It is recommended to use #D3D12MA_RECOMMENDED_POOL_FLAGS.
+    */
     POOL_FLAGS Flags;

     /** \brief The parameters of memory heap where allocations of this pool should be placed.
@@ -876,7 +918,8 @@ struct POOL_DESC
     `D3D12_HEAP_FLAG_ALLOW_ONLY_NON_RT_DS_TEXTURES`, `D3D12_HEAP_FLAG_ALLOW_ONLY_RT_DS_TEXTURES`.
     Except if ResourceHeapTier = 2, then it may be `D3D12_HEAP_FLAG_ALLOW_ALL_BUFFERS_AND_TEXTURES` = 0.
-    
+
+    It is recommended to also add #D3D12MA_RECOMMENDED_HEAP_FLAGS. You can specify additional flags if needed.
     */
     D3D12_HEAP_FLAGS HeapFlags;
@@ -1027,9 +1070,11 @@ enum ALLOCATOR_FLAGS
     */
     ALLOCATOR_FLAG_SINGLETHREADED = 0x1,

-    /**
-    Every allocation will have its own memory block.
-    To be used for debugging purposes.
+    /** Every allocation will be created as a committed resource - will have its own memory block.
+
+    Affects both default pools and custom pools.
+    To be used for debugging purposes only.
+    There is also an equivalent flag for custom pools: D3D12MA::POOL_FLAG_ALWAYS_COMMITTED.
     */
     ALLOCATOR_FLAG_ALWAYS_COMMITTED = 0x2,
@@ -1049,19 +1094,34 @@ enum ALLOCATOR_FLAGS
     */
     ALLOCATOR_FLAG_DEFAULT_POOLS_NOT_ZEROED = 0x4,

-    /** \brief Optimization, allocate MSAA textures as committed resources always.
+    /** Optimization, allocate MSAA textures as committed resources always.
     Specify this flag to create MSAA textures with implicit heaps, as if they were created
     with flag D3D12MA::ALLOCATION_FLAG_COMMITTED. Usage of this flag enables all default pools
     to create their heaps on a smaller alignment not suitable for MSAA textures.
+
+    You should always use this flag unless you really need to create some MSAA textures as placed.
     */
     ALLOCATOR_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED = 0x8,
+    /** Disable optimization that prefers creating small buffers as committed to avoid 64 KB alignment.
+
+    By default, the library prefers creating small buffers <= 32 KB as committed,
+    because drivers tend to pack them better, while placed buffers require 64 KB alignment.
+    This, however, may decrease performance, as creating committed resources involves allocation of implicit heaps,
+    which may take longer than creating placed resources in existing heaps.
+    Passing this flag will disable this committed preference globally for the allocator.
+    It can also be disabled for a single allocation by using #ALLOCATION_FLAG_STRATEGY_MIN_TIME.
+    */
+    ALLOCATOR_FLAG_DONT_PREFER_SMALL_BUFFERS_COMMITTED = 0x10,
 };

 /// \brief Parameters of created Allocator object. To be used with CreateAllocator().
 struct ALLOCATOR_DESC
 {
-    /// Flags.
+    /** \brief Flags for the entire allocator.
+
+    It is recommended to use #D3D12MA_RECOMMENDED_ALLOCATOR_FLAGS.
+    */
     ALLOCATOR_FLAGS Flags;

     /** Direct3D device object that the allocator should be attached to.
@@ -1121,9 +1181,16 @@ public:
     - "ID3D12Device::GetCustomHeapProperties method (d3d12.h)"
     */
     BOOL IsCacheCoherentUMA() const;
+    /** \brief Returns true if GPU Upload Heaps are supported on the current system.
+
+    When true, you can use `D3D12_HEAP_TYPE_GPU_UPLOAD`.
+
+    This flag is fetched from `D3D12_FEATURE_D3D12_OPTIONS16::GPUUploadHeapSupported`.
+    */
+    BOOL IsGPUUploadHeapSupported() const;
     /** \brief Returns total amount of memory of specific segment group, in bytes.
-    \param memorySegmentGroup use `DXGI_MEMORY_SEGMENT_GROUP_LOCAL` or DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL`.
+    \param memorySegmentGroup use `DXGI_MEMORY_SEGMENT_GROUP_LOCAL` or `DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL`.

     This information is taken from `DXGI_ADAPTER_DESC`.
     It is not recommended to use this number.
@@ -1324,7 +1391,7 @@ public:
     - `pNonLocalBudget` returns the budget of the system memory available for D3D12 resources.
     - When IsUMA() `== TRUE` (integrated graphics chip):
       - `pLocalBudget` returns the budget of the shared memory available for all D3D12 resources.
-        All memory is considered "local". 
+        All memory is considered "local".
       - `pNonLocalBudget` is not applicable and returns zeros.

     This function is called "get" not "calculate" because it is very fast, suitable to be called
@@ -1438,11 +1505,11 @@ enum VIRTUAL_ALLOCATION_FLAGS
     */
     VIRTUAL_ALLOCATION_FLAG_UPPER_ADDRESS = ALLOCATION_FLAG_UPPER_ADDRESS,

-    /// Allocation strategy that tries to minimize memory usage.
+    /// %Allocation strategy that tries to minimize memory usage.
     VIRTUAL_ALLOCATION_FLAG_STRATEGY_MIN_MEMORY = ALLOCATION_FLAG_STRATEGY_MIN_MEMORY,
-    /// Allocation strategy that tries to minimize allocation time.
+    /// %Allocation strategy that tries to minimize allocation time.
     VIRTUAL_ALLOCATION_FLAG_STRATEGY_MIN_TIME = ALLOCATION_FLAG_STRATEGY_MIN_TIME,
-    /** \brief Allocation strategy that chooses always the lowest offset in available space.
+    /** %Allocation strategy that chooses always the lowest offset in available space.
     This is not the most efficient strategy but achieves highly packed data.
     */
     VIRTUAL_ALLOCATION_FLAG_STRATEGY_MIN_OFFSET = ALLOCATION_FLAG_STRATEGY_MIN_OFFSET,
@@ -1456,7 +1523,7 @@ enum VIRTUAL_ALLOCATION_FLAGS
 /// Parameters of created virtual allocation to be passed to VirtualBlock::Allocate().
 struct VIRTUAL_ALLOCATION_DESC
 {
-    /// Flags.
+    /// Flags for the virtual allocation.
     VIRTUAL_ALLOCATION_FLAGS Flags;
     /** \brief Size of the allocation.
@@ -1581,6 +1648,143 @@
 Note you don't need to create D3D12MA::Allocator to use virtual blocks.
 */
 D3D12MA_API HRESULT CreateVirtualBlock(const VIRTUAL_BLOCK_DESC* pDesc, VirtualBlock** ppVirtualBlock);

+#ifndef D3D12MA_NO_HELPERS
+
+/** \brief Helper structure that helps with complete and concise initialization of the D3D12MA::ALLOCATION_DESC structure.
+ */
+struct CALLOCATION_DESC : public ALLOCATION_DESC
+{
+    /// Default constructor. Leaves the structure uninitialized.
+    CALLOCATION_DESC() = default;
+    /// Constructor initializing from the base D3D12MA::ALLOCATION_DESC structure.
+    explicit CALLOCATION_DESC(const ALLOCATION_DESC& o) noexcept
+        : ALLOCATION_DESC(o)
+    {
+    }
+    /// Constructor initializing description of an allocation to be created in a specific custom pool.
+    explicit CALLOCATION_DESC(Pool* customPool,
+        ALLOCATION_FLAGS flags = ALLOCATION_FLAG_NONE,
+        void* privateData = NULL) noexcept
+    {
+        Flags = flags;
+        HeapType = (D3D12_HEAP_TYPE)0;
+        ExtraHeapFlags = D3D12_HEAP_FLAG_NONE;
+        CustomPool = customPool;
+        pPrivateData = privateData;
+    }
+    /// Constructor initializing description of an allocation to be created in a default pool of a specific `D3D12_HEAP_TYPE`.
+    explicit CALLOCATION_DESC(D3D12_HEAP_TYPE heapType,
+        ALLOCATION_FLAGS flags = ALLOCATION_FLAG_NONE,
+        void* privateData = NULL,
+        D3D12_HEAP_FLAGS extraHeapFlags = D3D12MA_RECOMMENDED_HEAP_FLAGS) noexcept
+    {
+        Flags = flags;
+        HeapType = heapType;
+        ExtraHeapFlags = extraHeapFlags;
+        CustomPool = NULL;
+        pPrivateData = privateData;
+    }
+};
+
+/** \brief Helper structure that helps with complete and concise initialization of the D3D12MA::POOL_DESC structure.
+ */
+struct CPOOL_DESC : public POOL_DESC
+{
+    /// Default constructor. Leaves the structure uninitialized.
+    CPOOL_DESC() = default;
+    /// Constructor initializing from the base D3D12MA::POOL_DESC structure.
+    explicit CPOOL_DESC(const POOL_DESC& o) noexcept
+        : POOL_DESC(o)
+    {
+    }
+    /// Constructor initializing description of a custom pool created in one of the standard `D3D12_HEAP_TYPE`.
+    explicit CPOOL_DESC(D3D12_HEAP_TYPE heapType,
+        D3D12_HEAP_FLAGS heapFlags,
+        POOL_FLAGS flags = D3D12MA_RECOMMENDED_POOL_FLAGS,
+        UINT64 blockSize = 0,
+        UINT minBlockCount = 0,
+        UINT maxBlockCount = UINT_MAX,
+        D3D12_RESIDENCY_PRIORITY residencyPriority = D3D12_RESIDENCY_PRIORITY_NORMAL) noexcept
+    {
+        Flags = flags;
+        HeapProperties = {};
+        HeapProperties.Type = heapType;
+        HeapFlags = heapFlags;
+        BlockSize = blockSize;
+        MinBlockCount = minBlockCount;
+        MaxBlockCount = maxBlockCount;
+        MinAllocationAlignment = 0;
+        pProtectedSession = NULL;
+        ResidencyPriority = residencyPriority;
+    }
+    /// Constructor initializing description of a custom pool created with custom `D3D12_HEAP_PROPERTIES`.
+    explicit CPOOL_DESC(const D3D12_HEAP_PROPERTIES heapProperties,
+        D3D12_HEAP_FLAGS heapFlags,
+        POOL_FLAGS flags = D3D12MA_RECOMMENDED_POOL_FLAGS,
+        UINT64 blockSize = 0,
+        UINT minBlockCount = 0,
+        UINT maxBlockCount = UINT_MAX,
+        D3D12_RESIDENCY_PRIORITY residencyPriority = D3D12_RESIDENCY_PRIORITY_NORMAL) noexcept
+    {
+        Flags = flags;
+        HeapProperties = heapProperties;
+        HeapFlags = heapFlags;
+        BlockSize = blockSize;
+        MinBlockCount = minBlockCount;
+        MaxBlockCount = maxBlockCount;
+        MinAllocationAlignment = 0;
+        pProtectedSession = NULL;
+        ResidencyPriority = residencyPriority;
+    }
+};
+
+/** \brief Helper structure that helps with complete and concise initialization of the D3D12MA::VIRTUAL_BLOCK_DESC structure.
+ */
+struct CVIRTUAL_BLOCK_DESC : public VIRTUAL_BLOCK_DESC
+{
+    /// Default constructor. Leaves the structure uninitialized.
+    CVIRTUAL_BLOCK_DESC() = default;
+    /// Constructor initializing from the base D3D12MA::VIRTUAL_BLOCK_DESC structure.
+    explicit CVIRTUAL_BLOCK_DESC(const VIRTUAL_BLOCK_DESC& o) noexcept
+        : VIRTUAL_BLOCK_DESC(o)
+    {
+    }
+    /// Constructor initializing description of a virtual block with given parameters.
+    explicit CVIRTUAL_BLOCK_DESC(UINT64 size,
+        VIRTUAL_BLOCK_FLAGS flags = VIRTUAL_BLOCK_FLAG_NONE,
+        const ALLOCATION_CALLBACKS* allocationCallbacks = NULL) noexcept
+    {
+        Flags = flags;
+        Size = size;
+        pAllocationCallbacks = allocationCallbacks;
+    }
+};
+
+/** \brief Helper structure that helps with complete and concise initialization of the D3D12MA::VIRTUAL_ALLOCATION_DESC structure.
+ */
+struct CVIRTUAL_ALLOCATION_DESC : public VIRTUAL_ALLOCATION_DESC
+{
+    /// Default constructor. Leaves the structure uninitialized.
+    CVIRTUAL_ALLOCATION_DESC() = default;
+    /// Constructor initializing from the base D3D12MA::VIRTUAL_ALLOCATION_DESC structure.
+    explicit CVIRTUAL_ALLOCATION_DESC(const VIRTUAL_ALLOCATION_DESC& o) noexcept
+        : VIRTUAL_ALLOCATION_DESC(o)
+    {
+    }
+    /// Constructor initializing description of a virtual allocation with given parameters.
+    explicit CVIRTUAL_ALLOCATION_DESC(UINT64 size, UINT64 alignment,
+        VIRTUAL_ALLOCATION_FLAGS flags = VIRTUAL_ALLOCATION_FLAG_NONE,
+        void* privateData = NULL) noexcept
+    {
+        Flags = flags;
+        Size = size;
+        Alignment = alignment;
+        pPrivateData = privateData;
+    }
+};
+
+#endif // #ifndef D3D12MA_NO_HELPERS
+
 } // namespace D3D12MA

 /// \cond INTERNAL
@@ -1597,13 +1801,13 @@ DEFINE_ENUM_FLAG_OPERATORS(D3D12MA::VIRTUAL_ALLOCATION_FLAGS);

 \section quick_start_project_setup Project setup and initialization

-This is a small, standalone C++ library. It consists of a pair of 2 files:
+This is a small, standalone C++ library. It consists of 2 files:
 "D3D12MemAlloc.h" header file with public interface and "D3D12MemAlloc.cpp" with internal implementation.
 The only external dependencies are WinAPI, Direct3D 12, and parts of C/C++ standard library
 (but STL containers, exceptions, or RTTI are not used).

-The library is developed and tested using Microsoft Visual Studio 2019, but it
+The library is developed and tested using Microsoft Visual Studio 2022, but it
 should work with other compilers as well. It is designed for 64-bit code.

 To use the library in your project:

@@ -1625,19 +1829,25 @@ D3D12MA::Allocator object.
 Please note that all symbols of the library are declared inside #D3D12MA namespace.

 \code
-IDXGIAdapter* adapter = (...)
-ID3D12Device* device = (...)
+IDXGIAdapter* adapter = ...
+ID3D12Device* device = ...
D3D12MA::ALLOCATOR_DESC allocatorDesc = {}; allocatorDesc.pDevice = device; allocatorDesc.pAdapter = adapter; +allocatorDesc.Flags = D3D12MA_RECOMMENDED_ALLOCATOR_FLAGS; D3D12MA::Allocator* allocator; HRESULT hr = D3D12MA::CreateAllocator(&allocatorDesc, &allocator); +// Check hr... \endcode (5.) Right before destroying the D3D12 device, destroy the allocator object. +\code +allocator->Release(); +\endcode + Objects of this library must be destroyed by calling `Release` method. They are somewhat compatible with COM: they implement `IUnknown` interface with its virtual methods: `AddRef`, `Release`, `QueryInterface`, and they are reference-counted internally. @@ -1645,10 +1855,6 @@ You can use smart pointers designed for COM with objects of this library - e.g. The reference counter is thread-safe. `QueryInterface` method supports only `IUnknown`, as classes of this library don't define their own GUIDs. -\code -allocator->Release(); -\endcode - \section quick_start_creating_resources Creating resources @@ -1690,8 +1896,10 @@ HRESULT hr = allocator->CreateResource( NULL, &allocation, IID_NULL, NULL); +// Check hr... -// Use allocation->GetResource()... +ID3D12Resource* res = allocation->GetResource(); +// Use res... \endcode You need to release the allocation object when no longer needed. @@ -1711,10 +1919,10 @@ parts of them are occupied and which parts are free to be used for new resources It is important to remember that resources created as placed don't have their memory initialized to zeros, but may contain garbage data, so they need to be fully initialized before usage, e.g. using Clear (`ClearRenderTargetView`), Discard (`DiscardResource`), -or copy (`CopyResource`). +or Copy (`CopyResource`). The library also automatically handles resource heap tier. -When `D3D12_FEATURE_DATA_D3D12_OPTIONS::ResourceHeapTier` equals `D3D12_RESOURCE_HEAP_TIER_1`, +When `D3D12_FEATURE_DATA_D3D12_OPTIONS::ResourceHeapTier == D3D12_RESOURCE_HEAP_TIER_1`, resources of 3 types: buffers, textures that are render targets or depth-stencil, and other textures must be kept in separate heaps. When `D3D12_RESOURCE_HEAP_TIER_2`, they can be kept together. By using this library, you don't need to handle this @@ -1737,7 +1945,7 @@ When only D3D12MA::Allocation object is obtained from a function call like D3D12 it remembers the `ID3D12Resource` that was created with it and holds a reference to it. The resource can be obtained by calling `allocation->GetResource()`, which doesn't increment the resource reference counter. -Calling `allocation->Release()` will decrease the resource reference counter, which is = 1 in this case, +Calling `allocation->Release()` will decrease the resource reference counter, which is 1 in this case, so the resource will be released. Second option is to retrieve a pointer to the resource along with D3D12MA::Allocation. 
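+A minimal sketch of this second option (reusing `allocationDesc` and `resourceDesc` from the example above;
+the variable names are assumptions for illustration, not part of the library):
+
+\code
+D3D12MA::Allocation* allocation = NULL;
+ID3D12Resource* resource = NULL;
+HRESULT hr = allocator->CreateResource(
+    &allocationDesc, &resourceDesc,
+    D3D12_RESOURCE_STATE_COMMON, NULL,
+    &allocation, IID_PPV_ARGS(&resource));
+// Check hr...
+
+// Both objects hold a reference to the resource: release both when no longer needed.
+resource->Release();
+allocation->Release();
+\endcode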
@@ -1803,7 +2011,7 @@ Example for buffer created and filled in `UPLOAD` heap type:

 \code
 const UINT64 bufSize = 65536;
-const float* bufData = (...);
+const float* bufData = ...;

 D3D12_RESOURCE_DESC resourceDesc = {};
 resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
@@ -1831,8 +2039,9 @@
 HRESULT hr = allocator->CreateResource(
     &allocation,
     IID_PPV_ARGS(&resource));

+D3D12_RANGE emptyRange = {0, 0};
 void* mappedPtr;
-hr = resource->Map(0, NULL, &mappedPtr);
+hr = resource->Map(0, &emptyRange, &mappedPtr);

 memcpy(mappedPtr, bufData, bufSize);
@@ -1840,6 +2049,68 @@
 resource->Unmap(0, NULL);
 \endcode

+\section quick_start_helper_structures Helper structures
+
+DirectX 12 Agility SDK offers a library of helpers in files "build\native\include\d3dx12\*.h".
+They include structures that help with complete and concise initialization of the core D3D12 `*_DESC` structures
+by using some basic C++ features (constructors, static methods, default parameters).
+They inherit from these structures, so they support implicit casting to them.
+For example, structure `CD3DX12_RESOURCE_DESC` can be used to conveniently fill in structure `D3D12_RESOURCE_DESC`.
+
+Similarly, this library provides a set of helper structures that aid in initialization of some of the `*_DESC` structures defined in the library.
+These are:
+
+- D3D12MA::CALLOCATION_DESC, which inherits from D3D12MA::ALLOCATION_DESC.
+- D3D12MA::CPOOL_DESC, which inherits from D3D12MA::POOL_DESC.
+- D3D12MA::CVIRTUAL_BLOCK_DESC, which inherits from D3D12MA::VIRTUAL_BLOCK_DESC.
+- D3D12MA::CVIRTUAL_ALLOCATION_DESC, which inherits from D3D12MA::VIRTUAL_ALLOCATION_DESC.
+
+For example, when you want to create a buffer in the `UPLOAD` heap using minimal allocation time, you can use base structures:
+
+\code
+D3D12MA::ALLOCATION_DESC allocDesc;
+allocDesc.Flags = D3D12MA::ALLOCATION_FLAG_STRATEGY_MIN_TIME;
+allocDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD;
+allocDesc.ExtraHeapFlags = D3D12_HEAP_FLAG_NONE;
+allocDesc.CustomPool = NULL;
+allocDesc.pPrivateData = NULL;
+
+D3D12_RESOURCE_DESC resDesc;
+resDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+resDesc.Alignment = 0;
+resDesc.Width = myBufSize;
+resDesc.Height = 1;
+resDesc.DepthOrArraySize = 1;
+resDesc.MipLevels = 1;
+resDesc.Format = DXGI_FORMAT_UNKNOWN;
+resDesc.SampleDesc.Count = 1;
+resDesc.SampleDesc.Quality = 0;
+resDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+resDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+D3D12MA::Allocation* alloc;
+ID3D12Resource* res;
+HRESULT hr = allocator->CreateResource(&allocDesc, &resDesc,
+    D3D12_RESOURCE_STATE_COMMON, NULL, &alloc, IID_PPV_ARGS(&res));
+// Check hr...
+\endcode
+
+Or you can use helper structs from both the D3DX12 library and this library to make the code shorter:
+
+\code
+D3D12MA::CALLOCATION_DESC allocDesc = D3D12MA::CALLOCATION_DESC{
+    D3D12_HEAP_TYPE_UPLOAD,
+    D3D12MA::ALLOCATION_FLAG_STRATEGY_MIN_TIME };
+
+CD3DX12_RESOURCE_DESC resDesc = CD3DX12_RESOURCE_DESC::Buffer(myBufSize);
+
+D3D12MA::Allocation* alloc;
+ID3D12Resource* res;
+HRESULT hr = allocator->CreateResource(&allocDesc, &resDesc,
+    D3D12_RESOURCE_STATE_COMMON, NULL, &alloc, IID_PPV_ARGS(&res));
+// Check hr...
+\endcode
+
 \page custom_pools Custom memory pools

 A "pool" is a collection of memory blocks that share certain properties.
@@ -1855,6 +2126,8 @@ to obtain object D3D12MA::Pool.
 Example:

 \code
 POOL_DESC poolDesc = {};
 poolDesc.HeapProperties.Type = D3D12_HEAP_TYPE_DEFAULT;
+poolDesc.Flags = D3D12MA_RECOMMENDED_POOL_FLAGS;
+poolDesc.HeapFlags = D3D12MA_RECOMMENDED_HEAP_FLAGS | D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS;

 Pool* pool;
 HRESULT hr = allocator->CreatePool(&poolDesc, &pool);
@@ -1921,6 +2194,458 @@ in some cases, e.g. to have separate memory usage statistics for some group of r
 extended allocation parameters, like custom `D3D12_HEAP_PROPERTIES`, which are available only in custom pools.

+\page optimal_allocation Optimal resource allocation
+
+This library tries to automatically make optimal choices for the resources you create,
+so you don't need to care about them.
+There are some advanced features of Direct3D 12 that you may use to optimize your memory management.
+There are also some settings in %D3D12MA that you may change to alter its default behavior.
+This page provides miscellaneous advice about features of D3D12 and %D3D12MA that are
+non-essential, but may improve the stability or performance of your app.
+
+\section optimal_allocation_avoiding_running_out_of_memory Avoiding running out of memory
+
+When trying to allocate more memory than available in the current heap
+(e.g., video memory on the graphics card, system memory), one of a few bad things can happen:
+
+- The allocation (resource creation) function call can fail with an `HRESULT` value other than `S_OK`.
+- The allocation may succeed, but take a long time (even a significant fraction of a second).
+- Some resources are automatically demoted from video memory to system memory, degrading the app performance.
+- Even a crash of the entire graphics driver can happen, resulting in the D3D12 "device removal", which is usually
+  catastrophic for the application.
+
+Unfortunately, there is no way to be 100% protected against memory overcommitment.
+The best approach is to avoid allocating too much memory.
+
+The full capacity of the memory can be queried using function D3D12MA::Allocator::GetMemoryCapacity.
+However, this is not recommended, because the amount of memory available to the application
+is typically smaller than the full capacity, as some portion of it is reserved by the operating system
+or used by other processes.
+
+Because of this, the recommended way of fetching the **memory budget** available to the application
+is using function D3D12MA::Allocator::GetBudget.
+Preventing the value D3D12MA::Budget::UsageBytes from exceeding D3D12MA::Budget::BudgetBytes
+is probably the best we can do in trying to avoid the consequences of over-commitment.
+For more information, see also: \subpage statistics.
+
+Example:
+
+\code
+D3D12MA::Budget videoMemBudget = {};
+allocator->GetBudget(&videoMemBudget, NULL);
+
+UINT64 freeBytes = videoMemBudget.BudgetBytes - videoMemBudget.UsageBytes;
+gameStreamingSystem->SetAvailableFreeMemory(freeBytes);
+\endcode
+
+\par Implementation detail
+The DXGI interface offers the function `IDXGIAdapter3::QueryVideoMemoryInfo` that queries the current memory usage and budget.
+This library automatically makes use of it when available (when you use a recent enough version of the DirectX SDK).
+If not, it falls back to estimating the usage and budget based on the total amount of the allocated memory
+and 80% of the full memory capacity, respectively.
+
+\par Implementation detail
+Allocating large heaps and creating placed resources in them is one of the main features of this library.
+However, if allocating a new block would exceed the budget, the library will automatically prefer creating the resource as committed,
+so it has exactly the right size. This can lower the chance of getting into trouble in case of over-commitment.
+
+When creating non-essential resources, you can use D3D12MA::ALLOCATION_FLAG_WITHIN_BUDGET.
+Then, in case the allocation would exceed the budget, the library will return failure from the function
+without attempting to allocate the actual D3D12 memory.
+
+It may also be a good idea to support failed resource creation.
+For non-essential resources, when function D3D12MA::Allocator::CreateResource fails with a result other than `S_OK`,
+it is worth implementing some way of recovery instead of terminating or crashing the entire app.
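+For illustration, a minimal sketch combining both ideas (`resDesc` stands for any resource description,
+as in the earlier examples; the fallback logic is only an assumed placeholder):
+
+\code
+D3D12MA::CALLOCATION_DESC allocDesc = D3D12MA::CALLOCATION_DESC{
+    D3D12_HEAP_TYPE_DEFAULT,
+    D3D12MA::ALLOCATION_FLAG_WITHIN_BUDGET };
+
+D3D12MA::Allocation* alloc = NULL;
+HRESULT hr = allocator->CreateResource(&allocDesc, &resDesc,
+    D3D12_RESOURCE_STATE_COMMON, NULL, &alloc, IID_NULL, NULL);
+if(FAILED(hr))
+{
+    // Over budget or out of memory: skip this optional resource,
+    // lower quality settings, or retry later instead of crashing.
+}
+\endcode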
+\section optimal_allocation_allocation_Performance Allocation performance
+
+Creating D3D12 resources (buffers and textures) can be a time-consuming operation.
+The duration can be unpredictable, spanning from a small fraction of a millisecond to a significant fraction of a second.
+Thus, it is recommended to allocate all the memory and create all the resources needed upfront
+rather than doing it during application runtime.
+For example, a video game can try to create its resources on startup or when loading a new level.
+Of course, it is not always possible.
+For example, open-world games may require loading and unloading some graphical assets in the background (often called "streaming").
+
+Creating and releasing D3D12 resources **on a separate thread** in the background may help.
+Both `ID3D12Device` and D3D12MA::Allocator objects are thread-safe, synchronized internally.
+However, cases were observed where resource creation calls like `ID3D12Device::CreateCommittedResource`
+were blocking other D3D12 calls like `ExecuteCommandLists` or `Present`
+somewhere inside the graphics driver, so hitches can happen even when using multithreading.
+
+The most expensive part is typically **the allocation of a new D3D12 memory heap**.
+This library tackles this problem by automatically allocating large heaps (64 MB by default)
+and creating resources as placed inside of them.
+When a new requested resource can be placed in a free space of an existing heap and doesn't require allocating a new heap,
+this operation is typically much faster, as it only requires creating a new `ID3D12Resource` object
+and not allocating new memory.
+This is the main benefit of using %D3D12MA compared to the naive approach of using Direct3D 12 directly
+and creating each resource as committed with `CreateCommittedResource`, which would result in a separate allocation of an implicit heap every time.
+
+When **a large number of small buffers** needs to be created, the overhead of creating even just separate `ID3D12Resource` objects can be significant.
+It can be avoided by creating one or few larger buffers and manually sub-allocating parts of them for specific needs.
+This library can also help with it. See section "Sub-allocating buffers" below.
+
+\par Implementation detail
+The CPU performance overhead of using this library is low.
+It uses a high-quality allocation algorithm called Two-Level Segregated Fit (TLSF),
+which in most cases can find a free place for a new allocation in a few steps.
+The library also doesn't perform too many CPU heap allocations.
+In many cases, the allocation happens with 0 new CPU heap allocations performed by the library.
+Even the creation of a D3D12MA::Allocation object itself doesn't typically involve a CPU heap allocation,
+because these objects are returned out of a dedicated memory pool.
+
+Another reason for the slowness of D3D12 memory allocation is the guarantee that the **newly allocated memory is filled with zeros**.
+When creating and destroying resources placed in an existing heap, this overhead is not present,
+and the memory is not zeroed - it may contain random data left by the resource previously allocated in that place.
+In recent versions of the DirectX 12 SDK, clearing the memory of the newly created D3D12 heaps can also be disabled for improved performance.
+%D3D12MA can use this feature when:
+
+- D3D12MA::ALLOCATOR_FLAG_DEFAULT_POOLS_NOT_ZEROED is used during the creation of the main allocator object.
+- `D3D12_HEAP_FLAG_CREATE_NOT_ZEROED` is passed to D3D12MA::POOL_DESC::HeapFlags during the creation of a custom pool.
+
+It is recommended to always use these flags.
+The downside is that when the memory is not filled with zeros and you don't properly clear it or otherwise initialize its content before use
+(which is required by D3D12), you may observe incorrect behavior.
+This problem mostly affects render-target and depth-stencil textures.
+
+When an allocation needs to be made in performance-critical code, you can use D3D12MA::ALLOCATION_FLAG_STRATEGY_MIN_TIME.
+It influences multiple heuristics inside the library to prefer faster allocation
+at the expense of possibly less optimal placement in the memory.
+
+If the resource to be created is non-essential, while the performance is paramount,
+you can also use D3D12MA::ALLOCATION_FLAG_NEVER_ALLOCATE.
+It will create the resource only if it can be placed inside an existing memory heap
+and return failure from the function if a new heap would need to be allocated,
+which should guarantee good performance of such a function call.
+
+\section optimal_allocation_suballocating_buffers Sub-allocating buffers
+
+When a large number of small buffers needs to be created, the overhead of creating separate `ID3D12Resource` objects can be significant.
+It can also cause a significant waste of memory, as placed buffers need to be aligned to `D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT` = 64 KB by default.
+These problems can be avoided by creating one or few larger buffers and manually sub-allocating parts of them for specific needs.
+
+It requires implementing a custom allocator for the data inside the buffer and using offsets to individual regions.
+When all the regions can be allocated linearly and freed all at once, implementing such an allocator is trivial.
+When every region has the same size, implementing an allocator is also quite simple when using a "free list" algorithm.
+However, when regions can have different sizes and can be allocated and freed in random order,
+it requires a full allocation algorithm.
+%D3D12MA can help with it by exposing its core allocation algorithm for custom usages.
+For more details and example code, see chapter: \subpage virtual_allocator.
+It can be used for all the cases mentioned above without too much performance overhead,
+because the D3D12MA::VirtualAllocation object is just a lightweight handle.
+
+When sub-allocating a buffer, you need to remember to explicitly request the proper alignment required for each region.
+For example, data used as a constant buffer must be aligned to `D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT` = 256 B.
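+As a preview of that API, a minimal sketch of sub-allocating regions of one big buffer
+(the sizes here are arbitrary examples):
+
+\code
+D3D12MA::VIRTUAL_BLOCK_DESC blockDesc = {};
+blockDesc.Size = 1048576; // Size of the entire buffer to be sub-allocated.
+
+D3D12MA::VirtualBlock* block;
+HRESULT hr = D3D12MA::CreateVirtualBlock(&blockDesc, &block);
+// Check hr...
+
+D3D12MA::VIRTUAL_ALLOCATION_DESC allocDesc = {};
+allocDesc.Size = 4096;
+allocDesc.Alignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
+
+D3D12MA::VirtualAllocation alloc;
+UINT64 offset;
+hr = block->Allocate(&allocDesc, &alloc, &offset);
+// On success, use the region of the big buffer at [offset, offset + 4096).
+
+block->FreeAllocation(alloc);
+block->Release();
+\endcode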
+
+\section optimal_allocation_residency_priority Residency priority
+
+When too much video memory is allocated, one of the things that can happen is the system
+demoting some heaps to the system memory.
+Moving data between memory pools or reaching out directly to the system memory through the PCI Express bus
+can have a large performance overhead, which can slow down the application or even make a game unplayable.
+Unfortunately, it is not possible to fully control or prevent this demotion.
+The best thing to do is to avoid memory over-commitment.
+For more information, see section "Avoiding running out of memory" above.
+
+Recent versions of the DirectX 12 SDK offer the function `ID3D12Device1::SetResidencyPriority` that sets a hint
+about the priority of a resource - how important it is for it to stay resident in the video memory.
+Setting the priority happens at the level of an entire memory heap.
+%D3D12MA offers an interface to set this priority in the form of the D3D12MA::POOL_DESC::ResidencyPriority parameter.
+It affects all allocations made out of the custom pool created with it, both placed inside large heaps
+and created as committed.
+
+It is recommended to create a custom pool with high residency priority for all resources
+that are critical for performance, especially those that are written by the GPU,
+like render-target and depth-stencil textures, UAV textures, and buffers.
+It is also worth creating them as committed, so that each one will have its own implicit heap.
+This can minimize the chance that an entire large heap is demoted to system memory, degrading the performance
+of all the resources placed in it.
+
+Example:
+
+\code
+D3D12MA::CPOOL_DESC poolDesc = D3D12MA::CPOOL_DESC{
+    D3D12_HEAP_TYPE_DEFAULT,
+    D3D12MA_RECOMMENDED_HEAP_FLAGS | D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS };
+poolDesc.ResidencyPriority = D3D12_RESIDENCY_PRIORITY_HIGH; // !!!
+
+D3D12MA::Pool* pool;
+HRESULT hr = allocator->CreatePool(&poolDesc, &pool);
+// Check hr...
+
+D3D12MA::CALLOCATION_DESC allocDesc = D3D12MA::CALLOCATION_DESC{
+    pool,
+    D3D12MA::ALLOCATION_FLAG_COMMITTED }; // !!!
+
+CD3DX12_RESOURCE_DESC resDesc = CD3DX12_RESOURCE_DESC::Buffer(
+    1048576); // Requested buffer size.
+
+D3D12MA::Allocation* alloc;
+hr = allocator->CreateResource(&allocDesc, &resDesc, D3D12_RESOURCE_STATE_COMMON,
+    NULL, &alloc, IID_NULL, NULL);
+// Check hr...
+\endcode
+
+When you have a committed allocation created, you can also set the residency priority of its resource
+using the D3D12 function:
+
+\code
+D3D12MA::Allocation* committedAlloc = ...
+ID3D12Pageable* res = committedAlloc->GetResource();
+D3D12_RESIDENCY_PRIORITY priority = D3D12_RESIDENCY_PRIORITY_HIGH;
+device1->SetResidencyPriority(1, &res, &priority);
+\endcode
+
+Note this is not the same as explicit eviction controlled using `ID3D12Device::Evict` and `MakeResident` functions.
+Resources evicted explicitly are illegal to access until they are made resident again,
+while the demotion described here happens automatically and only slows down the execution.
+
+\section optimal_allocation_gpu_upload_heap GPU upload heap
+
+Direct3D 12 offers a fixed set of memory heap types:
+
+- `D3D12_HEAP_TYPE_DEFAULT`: Represents the video memory. It is available and fast to access for the GPU.
+  It should be used for all resources that are written by the GPU (like render-target and depth-stencil textures,
+  UAV) and resources that are frequently read by the GPU (like textures intended for sampling,
+  vertex, index, and constant buffers).
+- `D3D12_HEAP_TYPE_UPLOAD`: Represents the system memory that is uncached and write-combined.
+  It can be mapped and accessed by the CPU code using a pointer.
+  It supports only buffers, not textures.
+  It is intended for "staging buffers" that are filled by the CPU code and then used as a source of copy operations to the `DEFAULT` heap.
+  It can also be accessed directly by the GPU - shaders can read from buffers created in this memory.
+- `D3D12_HEAP_TYPE_READBACK`: Represents the system memory that is cached.
+  It is intended for buffers used as a destination of copy operations from the `DEFAULT` heap.
+
+Note that in systems with a discrete graphics card, access to system memory is fast from the CPU code
+(like the C++ code mapping D3D12 buffers and accessing them through a pointer),
+while access to the video memory is fast from the GPU code (like shaders reading and writing buffers and textures).
+Any copy operation or direct access between these memory heap types happens through the PCI Express bus, which can be relatively slow.
+
+Modern systems offer a feature called **Resizable BAR (ReBAR)** that gives the CPU direct access to the full video memory.
+To be available, this feature needs to be supported by the whole hardware-software environment, including:
+
+- A supporting motherboard and its UEFI.
+- A supporting graphics card and its graphics driver.
+- A supporting operating system.
+- The feature needs to be enabled in the UEFI settings. It is typically called "Above 4G Decoding" and "Resizable Bar".
+
+Recent versions of the DirectX 12 SDK give access to this feature in the form of a new, 4th memory heap type: `D3D12_HEAP_TYPE_GPU_UPLOAD`.
+Resources created in it behave logically similarly to resources in the `D3D12_HEAP_TYPE_UPLOAD` heap:
+
+- They support mapping and direct access from the CPU code through a pointer.
+- The mapped memory is uncached and write-combined, so it should only be written sequentially
+  (e.g., number-by-number or using `memcpy`). It shouldn't be accessed randomly or read,
+  because that is extremely slow for uncached memory.
+- Only buffers are supported.
+- Those buffers can be used as a source of copy operations or directly accessed by the GPU.
+
+The main difference is that resources created in the new `D3D12_HEAP_TYPE_GPU_UPLOAD` are placed in the video memory,
+while resources created in the old `D3D12_HEAP_TYPE_UPLOAD` are placed in the system memory.
+This determines which budgets are consumed by new resources allocated in those heaps.
+It also determines which operations involve transferring data through the PCI Express bus.
+
+- As `D3D12_HEAP_TYPE_UPLOAD` uses the system memory, writes from the CPU code through a mapped pointer are faster,
+  while copies or direct access from the GPU are slower because they need to go through PCIe.
+- As the new `D3D12_HEAP_TYPE_GPU_UPLOAD` uses the video memory,
+  copies or direct access from the GPU are faster,
+  while writes from the CPU code through a mapped pointer can be slower, because they need to go through PCIe.
+  For maximum performance of copy operations from this heap, a graphics or compute queue should be used, not a copy queue.
+
+The GPU Upload Heap can be used for performance optimization of some resources that need to be written by the CPU and read by the GPU.
+It can be especially beneficial for resources that need to change frequently (often called "dynamic").
+
+%D3D12MA supports the GPU upload heap when a recent enough version of the DirectX 12 SDK is used and when the current system supports it.
+The support can be queried using function D3D12MA::Allocator::IsGPUUploadHeapSupported().
+When it returns `TRUE`, you can create resources using `D3D12_HEAP_TYPE_GPU_UPLOAD`.
+You can also just try creating such a resource. Example:
+
+\code
+    D3D12MA::CALLOCATION_DESC allocDesc = D3D12MA::CALLOCATION_DESC{
+        D3D12_HEAP_TYPE_GPU_UPLOAD }; // !!!
+
+    CD3DX12_RESOURCE_DESC resDesc = CD3DX12_RESOURCE_DESC::Buffer(
+        1048576); // Requested buffer size.
+
+    D3D12MA::Allocation* alloc;
+    ID3D12Resource* res;
+    HRESULT hr = allocator->CreateResource(&allocDesc, &resDesc, D3D12_RESOURCE_STATE_COMMON,
+        NULL, &alloc, IID_PPV_ARGS(&res));
+    if(SUCCEEDED(hr))
+    {
+        // Fast path for data upload.
+
+        D3D12_RANGE emptyRange = {0, 0};
+        void* mappedPtr = NULL;
+        hr = res->Map(0, &emptyRange, &mappedPtr);
+        memcpy(mappedPtr, srcData, 1048576);
+        res->Unmap(0, NULL); // Optional. You can leave it persistently mapped.
+
+        D3D12_GPU_VIRTUAL_ADDRESS gpuva = res->GetGPUVirtualAddress();
+        // Use gpuva to access the buffer on the GPU...
+    }
+    else if(hr == E_NOTIMPL)
+    {
+        // GPU Upload Heap not supported in this system.
+        // Fall back to creating a staging buffer in UPLOAD and another copy in DEFAULT.
+
+        allocDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD;
+        // ...
+    }
+    else
+        // Some other error code e.g., out of memory...
+\endcode
+
+\section optimal_allocation_committed_vs_placed Committed versus placed resources
+
+When using the D3D12 API directly, there are 3 ways of creating resources:
+
+1. **Committed**, using function `ID3D12Device::CreateCommittedResource`.
+   It creates the resource with its own memory heap, which is called an "implicit heap" and cannot be accessed directly.
+2. **Placed**, using function `ID3D12Device::CreatePlacedResource`.
+   An `ID3D12Heap` needs to be created beforehand using `ID3D12Device::CreateHeap`.
+   Then, the resource can be created as placed inside the heap at a specific offset.
+3. **Reserved**, using function `ID3D12Device::CreateReservedResource`.
+   This library doesn't support them directly.
+
+A naive solution would be to create all the resources as committed.
+It works, because in D3D12 there is no strict limit on the number of resources or heaps that can be created.
+However, there are certain advantages and disadvantages of using committed versus placed resources:
+
+- The biggest advantage of using placed resources is the allocation performance.
+  Once a heap is allocated, creating and releasing resources placed in it can be much faster than
+  creating them as committed, which would involve allocating a new heap for each resource.
+  - Using a large number of small heaps can put an extra burden on the software stack,
+    including the D3D12 runtime, graphics driver, operating system, and developer tools like Radeon Memory Visualizer (RMV).
+- The advantage of committed resources is that their implicit heaps have exactly the right size,
+  while creating resources as placed inside larger heaps can lead to some memory being wasted because:
+  - Some part of the allocated heap memory is unused.
+  - After placed resources of various sizes are created and released in random order,
+    gaps between remaining resources can be too small to fit new allocations.
+    This is also known as "fragmentation". A solution to this problem is implementing \subpage defragmentation.
+  - The alignment required by placed resources can leave gaps between them, while the driver can pack individual committed resources better.
+    For details, see section "Resource alignment" below.
+- The advantage of committed resources is that they are always created with a new heap, which is initialized with zeros.
+  When a resource is created as placed, the memory may contain random data left by the resource previously allocated in that place.
+  When the memory is not filled with zeros and you don't properly clear it or otherwise initialize its content before use
+  (which is required by D3D12), you may observe incorrect behavior.
+  On the other hand, using committed resources and having every new resource filled with zeros can leave this kind of bug undetected.
+- Manual eviction with the `ID3D12Device::Evict` and `MakeResident` functions works at the level of the entire heap,
+  and so does `ID3D12Device1::SetResidencyPriority`, so creating resources as committed allows more fine-grained control
+  over the eviction and residency priority of individual resources.
+- The advantage of placed resources is that they can be created in a region of a heap overlapping with some other resources.
+  This approach is commonly called "aliasing".
+  It can save memory, but it needs careful control over the resources that overlap in memory
+  to make sure they are not used at the same time, that an aliasing barrier is issued between their usages,
+  and that the resource used after aliasing is correctly cleared every time.
+  Committed resources don't offer this possibility, because every committed resource has its own exclusive memory heap.
+  For more information, see chapter \subpage resource_aliasing.
+
+When creating resources with the help of %D3D12MA using function D3D12MA::Allocator::CreateResource,
+you typically don't need to care about all this.
+The library automatically makes the choice of creating the new resource as committed or placed.
+However, in cases when you need the information or control over this choice between committed and placed,
+the library offers facilities to do that, described below.
+
+\par Implementation detail
+%D3D12MA creates large heaps (the default size is 64 MB) and creates resources as placed in them.
+However, it may decide that it is required or preferred to create a specific resource as committed for many reasons, including:
+- When the resource is large (larger than half of the default heap size).
+- When allocating an entire new heap would exceed the current budget or when we are already over the budget.
+- When the resource is a very small buffer. Placed buffers need to be aligned to 64 KB by default,
+  while creating them as committed can allow the driver to pack them better.
+  This heuristic can be disabled for an individual resource by using D3D12MA::ALLOCATION_FLAG_STRATEGY_MIN_TIME
+  and for the entire allocator by using D3D12MA::ALLOCATOR_FLAG_DONT_PREFER_SMALL_BUFFERS_COMMITTED.
+- When the resource uses non-standard flags specified via D3D12MA::ALLOCATION_DESC::ExtraHeapFlags.
+
+You can check whether an allocation was created as a committed resource by checking if its heap is null.
+Committed resources have an implicit heap that is not directly accessible.
+
+\code
+bool isCommitted = allocation->GetHeap() == NULL;
+\endcode
+
+You can request a new resource to be created as committed by using D3D12MA::ALLOCATION_FLAG_COMMITTED.
+Note that committed resources can also be created out of \subpage custom_pools.
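+For example, a minimal sketch of requesting a committed resource (`resDesc` as in the earlier examples):
+
+\code
+D3D12MA::CALLOCATION_DESC allocDesc = D3D12MA::CALLOCATION_DESC{
+    D3D12_HEAP_TYPE_DEFAULT,
+    D3D12MA::ALLOCATION_FLAG_COMMITTED };
+
+D3D12MA::Allocation* alloc;
+ID3D12Resource* res;
+HRESULT hr = allocator->CreateResource(&allocDesc, &resDesc,
+    D3D12_RESOURCE_STATE_COMMON, NULL, &alloc, IID_PPV_ARGS(&res));
+// Check hr...
+
+// The resource got its own implicit heap:
+assert(alloc->GetHeap() == NULL);
+\endcode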
+You can also request all resources to be created as committed globally for the entire allocator
+by using D3D12MA::ALLOCATOR_FLAG_ALWAYS_COMMITTED.
+However, this contradicts the main purpose of using this library.
+It can also prevent certain other features of the library from being used.
+This flag should be used only for debugging purposes.
+
+You can create a custom pool with an explicit block size by specifying a non-zero D3D12MA::POOL_DESC::BlockSize.
+When doing this, all **resources created in such a pool are placed** in those blocks (heaps) and never created as committed.
+Example:
+
+\code
+D3D12MA::CPOOL_DESC poolDesc = D3D12MA::CPOOL_DESC{
+    D3D12_HEAP_TYPE_DEFAULT,
+    D3D12MA_RECOMMENDED_HEAP_FLAGS | D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS };
+poolDesc.BlockSize = 100llu * 1024 * 1024; // 100 MB. Explicit BlockSize guarantees placed.
+
+D3D12MA::Pool* pool;
+HRESULT hr = allocator->CreatePool(&poolDesc, &pool);
+// Check hr...
+
+D3D12MA::CALLOCATION_DESC allocDesc = D3D12MA::CALLOCATION_DESC{ pool };
+
+CD3DX12_RESOURCE_DESC resDesc = CD3DX12_RESOURCE_DESC::Buffer(
+    90llu * 1024 * 1024); // 90 MB
+
+D3D12MA::Allocation* alloc;
+ID3D12Resource* res;
+hr = allocator->CreateResource(&allocDesc, &resDesc, D3D12_RESOURCE_STATE_COMMON,
+    NULL, &alloc, IID_PPV_ARGS(&res));
+// Check hr...
+
+// Even a large buffer like this, filling 90% of the block, was created as placed!
+assert(alloc->GetHeap() != NULL);
+\endcode
+
+You can request a new resource to be created as placed by using D3D12MA::ALLOCATION_FLAG_CAN_ALIAS.
+This is required especially if you plan to create another resource in the same region of memory, aliasing with your resource -
+hence the name of this flag.
+
+Note D3D12MA::ALLOCATION_FLAG_CAN_ALIAS can even be combined with D3D12MA::ALLOCATION_FLAG_COMMITTED.
+In this case, the resource is not created as committed, but it is also not placed as part of a larger heap.
+What happens instead is that a new heap is created with the exact size required for the resource,
+and the resource is created in it, placed at offset 0.
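+A minimal sketch of this pattern (`resDesc1` and `resDesc2` are hypothetical descriptions of two resources
+that are never used at the same time):
+
+\code
+D3D12MA::ALLOCATION_DESC allocDesc = {};
+allocDesc.HeapType = D3D12_HEAP_TYPE_DEFAULT;
+allocDesc.Flags = D3D12MA::ALLOCATION_FLAG_CAN_ALIAS;
+
+D3D12MA::Allocation* alloc;
+HRESULT hr = allocator->CreateResource(&allocDesc, &resDesc1,
+    D3D12_RESOURCE_STATE_COMMON, NULL, &alloc, IID_NULL, NULL);
+// Check hr...
+
+// Create another resource in the same region of memory.
+ID3D12Resource* res2;
+hr = allocator->CreateAliasingResource(alloc, 0, &resDesc2,
+    D3D12_RESOURCE_STATE_COMMON, NULL, IID_PPV_ARGS(&res2));
+// Check hr... Remember about aliasing barriers and proper initialization
+// of the resource used after aliasing.
+\endcode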
+\section optimal_allocation_resource_alignment Resource alignment
+
+Certain types of resources require certain alignment in memory.
+An alignment is a requirement for the address or offset to the beginning of the resource to be a multiple of some value, which is always a power of 2.
+For committed resources, the problem is non-existent, because committed resources have their own implicit heaps
+where they are created at offset 0, which meets any alignment requirement.
+For placed resources, %D3D12MA takes care of the alignment automatically.
+
+\par Implementation detail
+Default alignment required for MSAA textures is `D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT` = 4 MB.
+Default alignment required for buffers and other textures is `D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT` = 64 KB.
+
+Because the alignment required for buffers is 64 KB, **small buffers** can waste a lot of memory in between when created as placed.
+When such small buffers are created as committed, some graphics drivers are able to pack them better.
+%D3D12MA automatically takes advantage of this by preferring to create small buffers as committed.
+This heuristic is enabled by default. It is also a tradeoff - it can make the allocation of these buffers slower.
+It can be disabled for an individual resource by using D3D12MA::ALLOCATION_FLAG_STRATEGY_MIN_TIME
+and for the entire allocator by using D3D12MA::ALLOCATOR_FLAG_DONT_PREFER_SMALL_BUFFERS_COMMITTED.
+
+For certain textures that meet a complex set of requirements, special **"small alignment"** can be applied.
+Details can be found in Microsoft documentation of the `D3D12_RESOURCE_DESC` structure.
+For MSAA textures, the small alignment is `D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT` = 64 KB.
+For other textures, the small alignment is `D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT` = 4 KB.
+%D3D12MA uses this feature automatically.
+This behavior can be disabled or controlled by predefining the macro #D3D12MA_USE_SMALL_RESOURCE_PLACEMENT_ALIGNMENT.
+
+D3D12 also has a concept of **alignment of the entire heap**, passed through `D3D12_HEAP_DESC::Alignment`.
+This library automatically sets the alignment as small as possible.
+Unfortunately, any heap that has a chance of hosting an MSAA texture needs to have the alignment set to 4 MB.
+This problem can be overcome by passing D3D12MA::ALLOCATOR_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED on the creation of the main allocator object
+and D3D12MA::POOL_FLAG_MSAA_TEXTURES_ALWAYS_COMMITTED on the creation of any custom pool that supports textures, not only buffers.
+With those flags, the alignment of the heaps created by %D3D12MA can be lower, but any MSAA textures are created as committed.
+You should always use these flags in your code unless you really need to create some MSAA textures as placed.
+
 \page defragmentation Defragmentation

 Interleaved allocations and deallocations of many objects of varying size can
@@ -2339,6 +3064,7 @@ As an extra feature, the core allocation algorithm of the library is exposed thr
 It doesn't allocate any real GPU memory. It just keeps track of used and free regions of a "virtual block".
 You can use it to allocate your own memory or other objects, even completely unrelated to D3D12.
 A common use case is sub-allocation of pieces of one large GPU buffer.
+Another suggested use case is allocating descriptors in an `ID3D12DescriptorHeap`.

 \section virtual_allocator_creating_virtual_block Creating virtual block

@@ -2523,7 +3249,7 @@ void CustomFree(void* pMemory, void* pPrivateData)
     _aligned_free(pMemory);
 }

-(...)
+...

 D3D12MA::ALLOCATION_CALLBACKS allocationCallbacks = {};
 allocationCallbacks.pAllocate = &CustomAllocate;
@@ -2532,10 +3258,12 @@ allocationCallbacks.pFree = &CustomFree;
 D3D12MA::ALLOCATOR_DESC allocatorDesc = {};
 allocatorDesc.pDevice = device;
 allocatorDesc.pAdapter = adapter;
+allocatorDesc.Flags = D3D12MA_RECOMMENDED_ALLOCATOR_FLAGS;
 allocatorDesc.pAllocationCallbacks = &allocationCallbacks;

 D3D12MA::Allocator* allocator;
 HRESULT hr = D3D12MA::CreateAllocator(&allocatorDesc, &allocator);
+// Check hr...
 \endcode

diff --git a/thirdparty/d3d12ma/LICENSE.txt b/thirdparty/d3d12ma/LICENSE.txt
index bc2ab4dc052..eb08d9e879a 100644
--- a/thirdparty/d3d12ma/LICENSE.txt
+++ b/thirdparty/d3d12ma/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/thirdparty/d3d12ma/README.md b/thirdparty/d3d12ma/README.md new file mode 100644 index 00000000000..0536781ee9e --- /dev/null +++ b/thirdparty/d3d12ma/README.md @@ -0,0 +1,8 @@ +Upstream: +https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator + +commit d26c54a6f40c66611dd33c77df4198784b53a8e2 +Author: Adam Sawicki +Date: Wed Apr 9 15:07:43 2025 +0200 + + Minor fixes after #71 diff --git a/thirdparty/d3d12ma/patches/0001-mingw-support.patch b/thirdparty/d3d12ma/patches/0001-mingw-support.patch deleted file mode 100644 index 90d4d434019..00000000000 --- a/thirdparty/d3d12ma/patches/0001-mingw-support.patch +++ /dev/null @@ -1,45 +0,0 @@ -diff --git a/thirdparty/d3d12ma/D3D12MemAlloc.cpp b/thirdparty/d3d12ma/D3D12MemAlloc.cpp -index 8e2488091a..80d910e694 100644 ---- a/thirdparty/d3d12ma/D3D12MemAlloc.cpp -+++ b/thirdparty/d3d12ma/D3D12MemAlloc.cpp -@@ -33,6 +33,12 @@ - #include - #endif - -+#if !defined(_MSC_VER) -+#include -+ -+#include -+#endif -+ - //////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - // -@@ -8178,7 +8184,13 @@ HRESULT AllocatorPimpl::UpdateD3D12Budget() - - D3D12_RESOURCE_ALLOCATION_INFO AllocatorPimpl::GetResourceAllocationInfoNative(const D3D12_RESOURCE_DESC& resourceDesc) const - { -+#if defined(_MSC_VER) || !defined(_WIN32) - return m_Device->GetResourceAllocationInfo(0, 1, &resourceDesc); -+#else -+ D3D12_RESOURCE_ALLOCATION_INFO ret; -+ m_Device->GetResourceAllocationInfo(&ret, 0, 1, &resourceDesc); -+ return ret; -+#endif - } - - #ifdef __ID3D12Device8_INTERFACE_DEFINED__ -@@ -8186,7 +8198,13 @@ D3D12_RESOURCE_ALLOCATION_INFO AllocatorPimpl::GetResourceAllocationInfoNative(c - { - D3D12MA_ASSERT(m_Device8 != NULL); - D3D12_RESOURCE_ALLOCATION_INFO1 info1Unused; -+#if defined(_MSC_VER) || !defined(_WIN32) - return m_Device8->GetResourceAllocationInfo2(0, 1, &resourceDesc, &info1Unused); -+#else -+ D3D12_RESOURCE_ALLOCATION_INFO ret; -+ m_Device8->GetResourceAllocationInfo2(&ret, 0, 1, &resourceDesc, &info1Unused); -+ return ret; -+#endif - } - #endif // #ifdef __ID3D12Device8_INTERFACE_DEFINED__ -