Re-Implement GPU particles on master.

- No new features yet. - Unlike Godot 3.x, sorting is done on the GPU.
2025-11-22 15:06:45 +00:00 · 2020-08-19 10:38:24 -03:00
parent a3f5dac84f
commit f5f27bacdb
16 changed files with 1919 additions and 63 deletions
--- a/servers/rendering/rasterizer_rd/shaders/SCsub
+++ b/servers/rendering/rasterizer_rd/shaders/SCsub
@@ -37,3 +37,6 @@ if "RD_GLSL" in env["BUILDERS"]:
    env.RD_GLSL("sdfgi_debug_probes.glsl")
    env.RD_GLSL("volumetric_fog.glsl")
    env.RD_GLSL("shadow_reduce.glsl")
+    env.RD_GLSL("particles.glsl")  # particle simulation compute shader
+    env.RD_GLSL("particles_copy.glsl")  # fills the sort buffer / per-instance data from particle state
+    env.RD_GLSL("sort.glsl")  # bitonic sort of the sort buffer
--- a/servers/rendering/rasterizer_rd/shaders/particles.glsl
+++ b/servers/rendering/rasterizer_rd/shaders/particles.glsl
@@ -0,0 +1,262 @@
+#[compute]
+
+#version 450
+
+VERSION_DEFINES
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+#define SAMPLER_NEAREST_CLAMP 0
+#define SAMPLER_LINEAR_CLAMP 1
+#define SAMPLER_NEAREST_WITH_MIPMAPS_CLAMP 2
+#define SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP 3
+#define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_CLAMP 4
+#define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_CLAMP 5
+#define SAMPLER_NEAREST_REPEAT 6
+#define SAMPLER_LINEAR_REPEAT 7
+#define SAMPLER_NEAREST_WITH_MIPMAPS_REPEAT 8
+#define SAMPLER_LINEAR_WITH_MIPMAPS_REPEAT 9
+#define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_REPEAT 10
+#define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_REPEAT 11
+// The SAMPLER_* values above span 0..11, one per entry of material_samplers[12] below (presumably used as indices into it).
+/* SET 0: GLOBAL DATA */
+// Nothing in the fixed part of this shader reads set 0; presumably it exists for the code substituted for COMPUTE_SHADER_CODE.
+layout(set = 0, binding = 1) uniform sampler material_samplers[12];
+
+layout(set = 0, binding = 2, std430) restrict readonly buffer GlobalVariableData {
+	vec4 data[];
+}
+global_variables;
+
+/* SET 1: FRAME AND PARTICLE DATA */
+
+// A history of per-frame parameters is kept so that trails behave deterministically.
+struct FrameParams {
+	bool emitting; // copied into the particle's is_active flag when the particle restarts
+	float system_phase; // current emission phase; compared against each particle's restart_phase
+	float prev_system_phase; // phase at the previous step; bounds the restart window together with system_phase
+	uint cycle; // emission cycle counter; feeds the random seed and particle_number in main()
+
+	float explosiveness; // restart_phase is scaled by (1.0 - explosiveness)
+	float randomness; // when > 0.0, a hashed per-particle offset is added to restart_phase
+	float time; // not read by the fixed part of this shader
+	float delta; // default time step (local_delta) for this frame
+
+	uint random_seed; // not read by the fixed part of this shader
+	uint pad[3];
+
+	mat4 emission_transform; // not read by the fixed part of this shader
+};
+
+layout(set = 1, binding = 0, std430) restrict buffer FrameHistory {
+	FrameParams data[]; // main() indexes this with (invocation id % params.trail_size)
+}
+frame_history;
+
+struct ParticleData {
+	mat4 xform; // particle transform; main() advances xform[3].xyz by velocity * local_delta
+	vec3 velocity; // zeroed on clear (and on restart unless ENABLE_KEEP_DATA is defined)
+	bool is_active; // gates both the substituted material code and the velocity integration
+	vec4 color; // reset to vec4(1.0) under the same conditions as velocity
+	vec4 custom; // reset to vec4(0.0) under the same conditions as velocity
+};
+
+layout(set = 1, binding = 1, std430) restrict buffer Particles {
+	ParticleData data[]; // main() indexes this with gl_GlobalInvocationID.x
+}
+particles;
+
+/* SET 2: MATERIAL */
+// MATERIAL_UNIFORMS is a placeholder token; presumably the engine substitutes the material's uniform declarations for it.
+#ifdef USE_MATERIAL_UNIFORMS
+layout(set = 2, binding = 0, std140) uniform MaterialUniforms{
+	/* clang-format off */
+MATERIAL_UNIFORMS
+	/* clang-format on */
+} material;
+#endif
+// Per-dispatch parameters.
+layout(push_constant, binding = 0, std430) uniform Params {
+	float lifetime; // converts a phase difference into a time step when use_fractional_delta is set
+	bool clear; // when true, every particle is reset; those not restarting this step are also deactivated
+	uint total_particles; // divisor for restart_phase; total_particles * trail_size bounds the invocation range
+	uint trail_size; // splits the invocation id into index (id / trail_size) and frame (id % trail_size)
+	bool use_fractional_delta; // restarting particles get a time step measured from their restart phase
+	uint pad[3];
+}
+params;
+
+uint hash(uint x) { // integer bit mixer: two multiply-xorshift rounds, then a final xorshift
+	const uint MIX_MULTIPLIER = 0x45d9f3bu;
+	x = (x ^ (x >> 16u)) * MIX_MULTIPLIER;
+	x = (x ^ (x >> 16u)) * MIX_MULTIPLIER;
+	return x ^ (x >> 16u);
+}
+
+/* clang-format off */
+
+COMPUTE_SHADER_GLOBALS
+
+/* clang-format on */
+
+void main() { // Advances one particle slot by one step: restart/clear handling, material code, velocity integration.
+	uint particle = gl_GlobalInvocationID.x;
+	// One invocation per (particle, trail frame) pair; invocations beyond that count exit here.
+	if (particle >= params.total_particles * params.trail_size) {
+		return; //discard
+	}
+
+	uint index = particle / params.trail_size;
+	uint frame = (particle % params.trail_size);
+
+#define FRAME frame_history.data[frame]
+#define PARTICLE particles.data[particle]
+	// apply_forces, apply_velocity and mass are not read by the active code below; presumably meant for the substituted material code.
+	bool apply_forces = true;
+	bool apply_velocity = true;
+	float local_delta = FRAME.delta;
+
+	float mass = 1.0;
+	// Nominal restart phase: index / total_particles spreads the particles evenly over [0, 1).
+	float restart_phase = float(index) / float(params.total_particles);
+	// Optional jitter: a hash of (cycle, index) shifts the phase by less than randomness / total_particles.
+	if (FRAME.randomness > 0.0) {
+		uint seed = FRAME.cycle;
+		if (restart_phase >= FRAME.system_phase) {
+			seed -= uint(1); // restart phase not reached yet in this cycle: seed with the previous cycle number
+		}
+		seed *= uint(params.total_particles);
+		seed += uint(index);
+		float random = float(hash(seed) % uint(65536)) / 65536.0; // low 16 bits of the hash mapped to [0, 1)
+		restart_phase += FRAME.randomness * random * 1.0 / float(params.total_particles);
+	}
+
+	restart_phase *= (1.0 - FRAME.explosiveness);
+	// Decide whether restart_phase falls inside the phase interval covered by this step.
+	bool restart = false;
+
+	if (FRAME.system_phase > FRAME.prev_system_phase) {
+		// restart_phase >= prev_system_phase is used so particles emit in the first frame they are processed
+
+		if (restart_phase >= FRAME.prev_system_phase && restart_phase < FRAME.system_phase) {
+			restart = true;
+			if (params.use_fractional_delta) {
+				local_delta = (FRAME.system_phase - restart_phase) * params.lifetime; // time elapsed since the restart phase
+			}
+		}
+		// Otherwise the phase did not increase: the interval is treated as running from prev_system_phase through 1.0 to system_phase.
+	} else if (FRAME.delta > 0.0) {
+		if (restart_phase >= FRAME.prev_system_phase) {
+			restart = true;
+			if (params.use_fractional_delta) {
+				local_delta = (1.0 - restart_phase + FRAME.system_phase) * params.lifetime;
+			}
+
+		} else if (restart_phase < FRAME.system_phase) {
+			restart = true;
+			if (params.use_fractional_delta) {
+				local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
+			}
+		}
+	}
+	// Particles whose restart phase has not been reached yet in this cycle are counted in the previous cycle.
+	uint current_cycle = FRAME.cycle;
+
+	if (FRAME.system_phase < restart_phase) {
+		current_cycle -= uint(1);
+	}
+	// NOTE(review): this adds the raw invocation id, not index; confirm that is intended when trail_size > 1.
+	uint particle_number = current_cycle * uint(params.total_particles) + particle;
+	// A restarting particle becomes active only while the system is emitting.
+	if (restart) {
+		PARTICLE.is_active = FRAME.emitting;
+	}
+	// Reset stored state on clear; a restart also resets it unless ENABLE_KEEP_DATA is defined.
+#ifdef ENABLE_KEEP_DATA
+	if (params.clear) {
+#else
+	if (params.clear || restart) {
+#endif
+		PARTICLE.color = vec4(1.0);
+		PARTICLE.custom = vec4(0.0);
+		PARTICLE.velocity = vec3(0.0);
+		if (!restart) {
+			PARTICLE.is_active = false;
+		}
+		PARTICLE.xform = mat4(
+				vec4(1.0, 0.0, 0.0, 0.0),
+				vec4(0.0, 1.0, 0.0, 0.0),
+				vec4(0.0, 0.0, 1.0, 0.0),
+				vec4(0.0, 0.0, 0.0, 1.0));
+	}
+	// COMPUTE_SHADER_CODE is a placeholder token (presumably replaced by generated material code); it runs only for active particles.
+	if (PARTICLE.is_active) {
+		/* clang-format off */
+
+COMPUTE_SHADER_CODE
+
+		/* clang-format on */
+	}
+
+#if !defined(DISABLE_VELOCITY)
+	// Integrate position with the (possibly fractional) time step chosen above.
+	if (PARTICLE.is_active) {
+		PARTICLE.xform[3].xyz += PARTICLE.velocity * local_delta;
+	}
+#endif
+	// Disabled block: it uses names (attractors, out_velocity_active, shader_active, ...) not declared in the visible shader source.
+#if 0
+	if (PARTICLE.is_active) {
+		//execute shader
+
+
+
+
+		//!defined(DISABLE_FORCE)
+
+		if (false) {
+			vec3 force = vec3(0.0);
+			for (int i = 0; i < attractor_count; i++) {
+				vec3 rel_vec = xform[3].xyz - attractors[i].pos;
+				float dist = length(rel_vec);
+				if (attractors[i].radius < dist)
+					continue;
+				if (attractors[i].eat_radius > 0.0 && attractors[i].eat_radius > dist) {
+					out_velocity_active.a = 0.0;
+				}
+
+				rel_vec = normalize(rel_vec);
+
+				float attenuation = pow(dist / attractors[i].radius, attractors[i].attenuation);
+
+				if (attractors[i].dir == vec3(0.0)) {
+					//towards center
+					force += attractors[i].strength * rel_vec * attenuation * mass;
+				} else {
+					force += attractors[i].strength * attractors[i].dir * attenuation * mass;
+				}
+			}
+
+			out_velocity_active.xyz += force * local_delta;
+		}
+
+#if !defined(DISABLE_VELOCITY)
+
+		if (true) {
+			xform[3].xyz += out_velocity_active.xyz * local_delta;
+		}
+#endif
+	} else {
+		xform = mat4(0.0);
+	}
+
+
+	xform = transpose(xform);
+
+	out_velocity_active.a = mix(0.0, 1.0, shader_active);
+
+	out_xform_1 = xform[0];
+	out_xform_2 = xform[1];
+	out_xform_3 = xform[2];
+#endif
+}
--- a/servers/rendering/rasterizer_rd/shaders/particles_copy.glsl
+++ b/servers/rendering/rasterizer_rd/shaders/particles_copy.glsl
@@ -0,0 +1,82 @@
+#[compute]
+
+#version 450
+
+VERSION_DEFINES
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+struct ParticleData { // same field list and order as ParticleData in particles.glsl
+	mat4 xform;
+	vec3 velocity;
+	bool is_active;
+	vec4 color;
+	vec4 custom;
+};
+
+layout(set = 0, binding = 1, std430) restrict readonly buffer Particles {
+	ParticleData data[];
+}
+particles;
+
+layout(set = 0, binding = 2, std430) restrict writeonly buffer Transforms {
+	vec4 data[]; // 5 vec4 per instance: rows 0..2 of the transposed transform, then color, then custom
+}
+instances;
+
+#ifdef USE_SORT_BUFFER
+// MODE_FILL_SORT_BUFFER also writes sort_buffer, so that mode needs USE_SORT_BUFFER defined as well.
+layout(set = 1, binding = 0, std430) restrict buffer SortBuffer {
+	vec2 data[]; // x = sort key (dot of sort_direction and xform[3].xyz), y = particle index stored as float
+}
+sort_buffer;
+
+#endif // USE_SORT_BUFFER
+
+layout(push_constant, binding = 0, std430) uniform Params {
+	vec3 sort_direction; // direction the particle origins are projected onto to build the sort key
+	uint total_particles; // invocations with an id >= this value return without writing
+}
+params;
+
+void main() {
+#ifdef MODE_FILL_SORT_BUFFER
+	// Emit one (sort key, particle index) pair per particle.
+	uint id = gl_GlobalInvocationID.x;
+	if (id >= params.total_particles) {
+		return; // past the last particle
+	}
+
+	// Key: projection of the particle origin onto the sort direction.
+	vec3 origin = particles.data[id].xform[3].xyz;
+	sort_buffer.data[id] = vec2(dot(params.sort_direction, origin), float(id));
+#endif
+
+#ifdef MODE_FILL_INSTANCES
+	// Each output instance occupies 5 vec4 slots: 3 transform rows, color, custom.
+	uint id = gl_GlobalInvocationID.x;
+	if (id >= params.total_particles) {
+		return; // past the last particle
+	}
+
+	// First vec4 slot of this instance; based on the invocation id, not on the sorted index.
+	uint base = gl_GlobalInvocationID.x * 5u;
+
+#ifdef USE_SORT_BUFFER
+	id = uint(sort_buffer.data[id].y); // read the particle index stored at this slot of the sort buffer
+#endif
+
+	// Inactive particles keep an all-zero transform (zero scale), which makes them invisible.
+	mat4 rows = mat4(0.0);
+	if (particles.data[id].is_active) {
+		rows = transpose(particles.data[id].xform);
+	}
+
+	instances.data[base] = rows[0];
+	instances.data[base + 1u] = rows[1];
+	instances.data[base + 2u] = rows[2];
+	instances.data[base + 3u] = particles.data[id].color;
+	instances.data[base + 4u] = particles.data[id].custom;
+
+#endif
+}
--- a/servers/rendering/rasterizer_rd/shaders/sort.glsl
+++ b/servers/rendering/rasterizer_rd/shaders/sort.glsl
@@ -0,0 +1,203 @@
+#[compute]
+
+#version 450
+
+VERSION_DEFINES
+
+// Original version here:
+// https://github.com/GPUOpen-LibrariesAndSDKs/GPUParticles11/blob/master/gpuparticles11/src/Shaders
+
+//
+// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+
+#define SORT_SIZE 512
+#define NUM_THREADS (SORT_SIZE / 2)
+#define INVERSION (16 * 2 + 8 * 3)
+#define ITERATIONS 1
+// A workgroup has SORT_SIZE / 2 threads; each thread handles one compare-exchange pair per pass. INVERSION is not referenced in this file.
+layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in;
+// Workgroup-shared staging array; MODE_SORT_STEP operates directly on sort_buffer and does not declare it.
+#ifndef MODE_SORT_STEP
+
+shared vec2 g_LDS[SORT_SIZE];
+
+#endif
+// Elements are ordered by .x (ascending); .y is carried along unchanged as payload.
+layout(set = 1, binding = 0, std430) restrict buffer SortBuffer {
+	vec2 data[];
+}
+sort_buffer;
+
+layout(push_constant, binding = 0, std430) uniform Params {
+	uint total_elements; // number of valid entries in sort_buffer.data
+	uint pad[3];
+	ivec4 job_params; // read only by MODE_SORT_STEP: x - 1 is the low-index mask, y the partner offset, z the factor applied to index_low
+}
+params;
+
+void main() { // Bitonic sort of sort_buffer by .x; exactly one MODE_SORT_* variant is expected to be defined.
+#ifdef MODE_SORT_BLOCK
+	// Sorts each SORT_SIZE-element chunk of sort_buffer independently in shared memory.
+	uvec3 Gid = gl_WorkGroupID;
+	uvec3 DTid = gl_GlobalInvocationID;
+	uvec3 GTid = gl_LocalInvocationID;
+	uint GI = gl_LocalInvocationIndex;
+	// numElementsInThreadGroup uses signed math: a chunk starting past total_elements must yield 0, not wrap around.
+	int GlobalBaseIndex = int((Gid.x * SORT_SIZE) + GTid.x);
+	int LocalBaseIndex = int(GI);
+	int numElementsInThreadGroup = clamp(int(params.total_elements) - int(Gid.x) * SORT_SIZE, 0, SORT_SIZE);
+
+	// Load shared data
+
+	int i;
+	for (i = 0; i < 2 * ITERATIONS; ++i) {
+		if (GI + i * NUM_THREADS < numElementsInThreadGroup)
+			g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];
+	}
+
+	groupMemoryBarrier();
+	barrier();
+
+	// Bitonic sort
+	for (int nMergeSize = 2; nMergeSize <= SORT_SIZE; nMergeSize = nMergeSize * 2) {
+		for (int nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {
+			for (i = 0; i < ITERATIONS; ++i) {
+				int tmp_index = int(GI + NUM_THREADS * i);
+				int index_low = tmp_index & (nMergeSubSize - 1);
+				int index_high = 2 * (tmp_index - index_low);
+				int index = index_high + index_low;
+				// First sub-pass of a merge pairs mirrored elements; later sub-passes pair elements nMergeSubSize apart.
+				int nSwapElem = nMergeSubSize == nMergeSize >> 1 ? index_high + (2 * nMergeSubSize - 1) - index_low : index_high + nMergeSubSize + index_low;
+				if (nSwapElem < numElementsInThreadGroup) {
+					vec2 a = g_LDS[index];
+					vec2 b = g_LDS[nSwapElem];
+
+					if (a.x > b.x) {
+						g_LDS[index] = b;
+						g_LDS[nSwapElem] = a;
+					}
+				}
+				groupMemoryBarrier();
+				barrier();
+			}
+		}
+	}
+
+	// Store shared data
+	for (i = 0; i < 2 * ITERATIONS; ++i) {
+		if (GI + i * NUM_THREADS < numElementsInThreadGroup) {
+			sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];
+		}
+	}
+
+#endif
+
+#ifdef MODE_SORT_STEP
+	// One compare-exchange pass executed directly on sort_buffer; job_params selects each thread's partner element.
+	uvec3 Gid = gl_WorkGroupID;
+	uvec3 GTid = gl_LocalInvocationID;
+
+	ivec4 tgp;
+
+	tgp.x = int(Gid.x) * 256;
+	tgp.y = 0;
+	tgp.z = int(params.total_elements);
+	tgp.w = min(512, max(0, tgp.z - int(Gid.x) * 512)); // not used further in this mode
+
+	uint localID = int(tgp.x) + GTid.x; // calculate threadID within this sortable-array
+
+	uint index_low = localID & (params.job_params.x - 1);
+	uint index_high = 2 * (localID - index_low);
+
+	uint index = tgp.y + index_high + index_low;
+	uint nSwapElem = tgp.y + index_high + params.job_params.y + params.job_params.z * index_low;
+
+	if (nSwapElem < tgp.y + tgp.z) {
+		vec2 a = sort_buffer.data[index];
+		vec2 b = sort_buffer.data[nSwapElem];
+
+		if (a.x > b.x) {
+			sort_buffer.data[index] = b;
+			sort_buffer.data[nSwapElem] = a;
+		}
+	}
+
+#endif
+
+#ifdef MODE_SORT_INNER
+	// Runs the sub-passes from SORT_SIZE / 2 down to 1 on one SORT_SIZE-element chunk, staged in shared memory.
+	uvec3 Gid = gl_WorkGroupID;
+	uvec3 DTid = gl_GlobalInvocationID;
+	uvec3 GTid = gl_LocalInvocationID;
+	uint GI = gl_LocalInvocationIndex;
+
+	ivec4 tgp;
+
+	tgp.x = int(Gid.x * 256);
+	tgp.y = 0;
+	tgp.z = int(params.total_elements);
+	tgp.w = min(512, max(0, tgp.z - int(Gid.x) * 512));
+	// tgp.w is signed, as in MODE_SORT_STEP: with unsigned operands max(0, ...) never clamps and an out-of-range group would get 512.
+	int GlobalBaseIndex = int(tgp.y + tgp.x * 2 + GTid.x);
+	int LocalBaseIndex = int(GI);
+	int i;
+
+	// Load shared data
+	for (i = 0; i < 2; ++i) {
+		if (GI + i * NUM_THREADS < tgp.w)
+			g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];
+	}
+
+	groupMemoryBarrier();
+	barrier();
+
+	// sort threadgroup shared memory
+	for (int nMergeSubSize = SORT_SIZE >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {
+		int tmp_index = int(GI);
+		int index_low = tmp_index & (nMergeSubSize - 1);
+		int index_high = 2 * (tmp_index - index_low);
+		int index = index_high + index_low;
+
+		int nSwapElem = index_high + nMergeSubSize + index_low;
+
+		if (nSwapElem < tgp.w) {
+			vec2 a = g_LDS[index];
+			vec2 b = g_LDS[nSwapElem];
+
+			if (a.x > b.x) {
+				g_LDS[index] = b;
+				g_LDS[nSwapElem] = a;
+			}
+		}
+		groupMemoryBarrier();
+		barrier();
+	}
+
+	// Store shared data
+	for (i = 0; i < 2; ++i) {
+		if (GI + i * NUM_THREADS < tgp.w) {
+			sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];
+		}
+	}
+
+#endif
+}