1
0
mirror of https://github.com/godotengine/godot.git synced 2025-11-22 15:06:45 +00:00

Re-Implement GPU particles on master.

- No new features yet.
- Unlike Godot 3.x, sorting happens on the GPU.
This commit is contained in:
Juan Linietsky
2020-08-19 10:38:24 -03:00
parent a3f5dac84f
commit f5f27bacdb
16 changed files with 1919 additions and 63 deletions

View File

@@ -37,3 +37,6 @@ if "RD_GLSL" in env["BUILDERS"]:
env.RD_GLSL("sdfgi_debug_probes.glsl")
env.RD_GLSL("volumetric_fog.glsl")
env.RD_GLSL("shadow_reduce.glsl")
env.RD_GLSL("particles.glsl")
env.RD_GLSL("particles_copy.glsl")
env.RD_GLSL("sort.glsl")

View File

@@ -0,0 +1,262 @@
#[compute]
#version 450
VERSION_DEFINES
// One-dimensional workgroups of 64 invocations. main() below uses
// gl_GlobalInvocationID.x as the flat particle slot it processes.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Indices into material_samplers[] (declared below), one per combination of
// filtering, mipmapping/anisotropy and clamp/repeat addressing.
// NOTE(review): the ordering presumably has to match the sampler array bound
// by the host renderer - confirm against the binding code.
#define SAMPLER_NEAREST_CLAMP 0
#define SAMPLER_LINEAR_CLAMP 1
#define SAMPLER_NEAREST_WITH_MIPMAPS_CLAMP 2
#define SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP 3
#define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_CLAMP 4
#define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_CLAMP 5
#define SAMPLER_NEAREST_REPEAT 6
#define SAMPLER_LINEAR_REPEAT 7
#define SAMPLER_NEAREST_WITH_MIPMAPS_REPEAT 8
#define SAMPLER_LINEAR_WITH_MIPMAPS_REPEAT 9
#define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_REPEAT 10
#define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_REPEAT 11
/* SET 0: GLOBAL DATA */
// Twelve samplers, one per SAMPLER_* index above. Nothing in the code visible
// in this file reads them; presumably they are used by the material code that
// replaces the COMPUTE_SHADER_* placeholders - confirm.
layout(set = 0, binding = 1) uniform sampler material_samplers[12];
// Read-only, unsized array of vec4 values. Like the samplers, it is not
// referenced by the code visible in this file.
layout(set = 0, binding = 2, std430) restrict readonly buffer GlobalVariableData {
vec4 data[];
}
global_variables;
/* Set 1: FRAME AND PARTICLE DATA */
// A history of frame parameters is kept so that particle trails behave deterministically.
// Per-frame emitter state. main() reads one of these through the FRAME macro
// (frame_history.data[frame]). The field order and padding define the buffer
// layout, so they must not be changed independently of the code that fills
// the buffer.
struct FrameParams {
// Copied into ParticleData.is_active when a particle restarts.
bool emitting;
// Current and previous emitter phase; a particle restarts when its
// restart phase lies between prev_system_phase and system_phase.
float system_phase;
float prev_system_phase;
// Emission cycle counter; feeds the per-particle random seed and particle_number.
uint cycle;
// restart phases are scaled by (1.0 - explosiveness).
float explosiveness;
// Strength of the random jitter added to each particle's restart phase.
float randomness;
// Not read by the code visible in this file.
float time;
// Default time step used for the particle (local_delta) this frame.
float delta;
// Not read by the code visible in this file.
uint random_seed;
// Explicit padding. NOTE(review): presumably keeps emission_transform on a
// 16-byte boundary and mirrors the host-side struct - confirm.
uint pad[3];
// Not read by the code visible in this file; presumably for injected material code.
mat4 emission_transform;
};
// Storage buffer holding one FrameParams entry per trail frame. main() indexes
// it with (particle % params.trail_size) through the FRAME macro.
layout(set = 1, binding = 0, std430) restrict buffer FrameHistory {
FrameParams data[];
}
frame_history;
// State of a single particle as stored in the Particles buffer.
// particles_copy.glsl declares a struct with the same members, so the two
// declarations have to be kept in sync.
struct ParticleData {
// Particle transform; column 3 (xform[3].xyz) is the position that main()
// advances by velocity * local_delta.
mat4 xform;
// Linear velocity, reset to zero when the particle is cleared or restarted.
vec3 velocity;
// False for particles that are not currently alive; inactive particles skip
// the material code and the velocity integration.
bool is_active;
// Reset to vec4(1.0) on clear/restart.
vec4 color;
// Reset to vec4(0.0) on clear/restart.
vec4 custom;
};
// Read/write storage buffer of particle state. main() indexes it with the flat
// invocation index (gl_GlobalInvocationID.x) through the PARTICLE macro.
layout(set = 1, binding = 1, std430) restrict buffer Particles {
ParticleData data[];
}
particles;
/* SET 2: MATERIAL */
// Optional uniform block for material parameters, only declared when
// USE_MATERIAL_UNIFORMS is defined. MATERIAL_UNIFORMS is a placeholder token;
// NOTE(review): presumably substituted with the generated member list by the
// shader compiler before GLSL compilation - confirm.
#ifdef USE_MATERIAL_UNIFORMS
layout(set = 2, binding = 0, std140) uniform MaterialUniforms{
/* clang-format off */
MATERIAL_UNIFORMS
/* clang-format on */
} material;
#endif
// Per-dispatch push constants for the particle simulation.
layout(push_constant, binding = 0, std430) uniform Params {
// Multiplied with a phase difference to obtain a fractional time step.
float lifetime;
// When true, every particle is reset (and deactivated unless it restarts this frame).
bool clear;
// Number of particles; together with trail_size it bounds the valid invocation range.
uint total_particles;
// Number of trail frames per particle; divisor/modulus used to split the flat
// invocation index into a particle index and a frame index.
uint trail_size;
// When true, a restarting particle only advances by the part of the frame
// that elapsed after its restart phase.
bool use_fractional_delta;
// Explicit padding. NOTE(review): presumably matches the host-side struct size - confirm.
uint pad[3];
}
params;
// Integer mixing function: three xor-shift-by-16 steps, the first two each
// followed by a multiplication with the same odd constant. Used by main() to
// derive a per-particle pseudo-random value from a seed.
uint hash(uint x) {
	const uint k = 0x45d9f3bu;
	x ^= x >> 16u;
	x *= k;
	x ^= x >> 16u;
	x *= k;
	x ^= x >> 16u;
	return x;
}
/* clang-format off */
COMPUTE_SHADER_GLOBALS
/* clang-format on */
// Simulation entry point. Each invocation handles one slot of the particle
// buffer: it decides whether the particle restarts this frame, resets its
// state if required, runs the material code and integrates the velocity.
//
// The locals declared here (index, frame, local_delta, restart, mass, ...) are
// in scope where COMPUTE_SHADER_CODE is placed; NOTE(review): presumably the
// injected material code refers to them by name, so they should not be renamed
// without checking the shader generator.
void main() {
uint particle = gl_GlobalInvocationID.x;
// Invocations beyond the last slot (total_particles * trail_size) do nothing.
if (particle >= params.total_particles * params.trail_size) {
return; //discard
}
// Split the flat slot index into a particle index and a trail frame index.
uint index = particle / params.trail_size;
uint frame = (particle % params.trail_size);
// FRAME selects the frame parameters by trail frame, PARTICLE selects the
// particle state by the flat slot index.
#define FRAME frame_history.data[frame]
#define PARTICLE particles.data[particle]
// apply_forces / apply_velocity are not read by the live code in this file.
bool apply_forces = true;
bool apply_velocity = true;
// Time step for this particle; shortened below when it restarts mid-frame
// and fractional delta is enabled.
float local_delta = FRAME.delta;
// Only read by the disabled (#if 0) attractor code in this file.
float mass = 1.0;
// Phase within the emission cycle at which this particle (re)starts;
// particles are spread evenly over the cycle by their index.
float restart_phase = float(index) / float(params.total_particles);
if (FRAME.randomness > 0.0) {
// Seed from the emission cycle and the particle index. Particles whose
// restart phase has not been reached yet still belong to the previous cycle.
uint seed = FRAME.cycle;
if (restart_phase >= FRAME.system_phase) {
seed -= uint(1);
}
seed *= uint(params.total_particles);
seed += uint(index);
// Pseudo-random value in [0, 1) with 16 bits of resolution.
float random = float(hash(seed) % uint(65536)) / 65536.0;
// Jitter the restart phase by at most one particle spacing, scaled by randomness.
restart_phase += FRAME.randomness * random * 1.0 / float(params.total_particles);
}
// Higher explosiveness compresses all restart phases towards the start of the cycle.
restart_phase *= (1.0 - FRAME.explosiveness);
bool restart = false;
if (FRAME.system_phase > FRAME.prev_system_phase) {
// Phase advanced without wrapping around during this frame.
// restart_phase >= prev_system_phase is used so particles emit in the first frame they are processed
if (restart_phase >= FRAME.prev_system_phase && restart_phase < FRAME.system_phase) {
restart = true;
if (params.use_fractional_delta) {
// Only simulate the time elapsed since the restart phase was crossed.
local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
}
}
} else if (FRAME.delta > 0.0) {
// Phase did not increase although time advanced: treated as a wrap-around
// into the next cycle. A restart phase can be hit either before the wrap
// (first branch) or after it (second branch).
if (restart_phase >= FRAME.prev_system_phase) {
restart = true;
if (params.use_fractional_delta) {
local_delta = (1.0 - restart_phase + FRAME.system_phase) * params.lifetime;
}
} else if (restart_phase < FRAME.system_phase) {
restart = true;
if (params.use_fractional_delta) {
local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
}
}
}
// Cycle this particle currently belongs to (previous cycle if its restart
// phase has not been reached yet).
uint current_cycle = FRAME.cycle;
if (FRAME.system_phase < restart_phase) {
current_cycle -= uint(1);
}
// Not read by the live code in this file. Note that it is built from the flat
// slot index (particle), not from index.
uint particle_number = current_cycle * uint(params.total_particles) + particle;
// A restarting particle becomes active only while the emitter is emitting.
if (restart) {
PARTICLE.is_active = FRAME.emitting;
}
// Reset particle state on clear, and also on restart unless ENABLE_KEEP_DATA
// is defined.
#ifdef ENABLE_KEEP_DATA
if (params.clear) {
#else
if (params.clear || restart) {
#endif
PARTICLE.color = vec4(1.0);
PARTICLE.custom = vec4(0.0);
PARTICLE.velocity = vec3(0.0);
// A cleared particle that is not restarting this frame is deactivated.
if (!restart) {
PARTICLE.is_active = false;
}
// Identity transform.
PARTICLE.xform = mat4(
vec4(1.0, 0.0, 0.0, 0.0),
vec4(0.0, 1.0, 0.0, 0.0),
vec4(0.0, 0.0, 1.0, 0.0),
vec4(0.0, 0.0, 0.0, 1.0));
}
// COMPUTE_SHADER_CODE is a placeholder token; NOTE(review): presumably replaced
// with the generated material code, which then runs for active particles only.
if (PARTICLE.is_active) {
/* clang-format off */
COMPUTE_SHADER_CODE
/* clang-format on */
}
// Integrate position (translation column of xform) unless velocity is disabled.
#if !defined(DISABLE_VELOCITY)
if (PARTICLE.is_active) {
PARTICLE.xform[3].xyz += PARTICLE.velocity * local_delta;
}
#endif
// Disabled legacy code (attractors and output packing). It refers to
// identifiers that are not declared anywhere in the visible part of this file
// (attractor_count, attractors, xform, out_velocity_active, shader_active,
// out_xform_*), so it would not compile if enabled as-is.
#if 0
if (PARTICLE.is_active) {
//execute shader
//!defined(DISABLE_FORCE)
if (false) {
vec3 force = vec3(0.0);
for (int i = 0; i < attractor_count; i++) {
vec3 rel_vec = xform[3].xyz - attractors[i].pos;
float dist = length(rel_vec);
if (attractors[i].radius < dist)
continue;
if (attractors[i].eat_radius > 0.0 && attractors[i].eat_radius > dist) {
out_velocity_active.a = 0.0;
}
rel_vec = normalize(rel_vec);
float attenuation = pow(dist / attractors[i].radius, attractors[i].attenuation);
if (attractors[i].dir == vec3(0.0)) {
//towards center
force += attractors[i].strength * rel_vec * attenuation * mass;
} else {
force += attractors[i].strength * attractors[i].dir * attenuation * mass;
}
}
out_velocity_active.xyz += force * local_delta;
}
#if !defined(DISABLE_VELOCITY)
if (true) {
xform[3].xyz += out_velocity_active.xyz * local_delta;
}
#endif
} else {
xform = mat4(0.0);
}
xform = transpose(xform);
out_velocity_active.a = mix(0.0, 1.0, shader_active);
out_xform_1 = xform[0];
out_xform_2 = xform[1];
out_xform_3 = xform[2];
#endif
}

View File

@@ -0,0 +1,82 @@
#[compute]
#version 450
VERSION_DEFINES
// One-dimensional workgroups of 64 invocations; main() handles one particle
// per invocation.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Particle state as read from the Particles buffer. particles.glsl declares a
// struct with the same members; the two declarations have to be kept in sync
// because both shaders address the same kind of buffer.
struct ParticleData {
// Particle transform; xform[3].xyz is used as the position for the sort key.
mat4 xform;
vec3 velocity;
// Inactive particles are written out with an all-zero transform.
bool is_active;
vec4 color;
vec4 custom;
};
// Source particle state (read-only in this shader).
layout(set = 0, binding = 1, std430) restrict readonly buffer Particles {
ParticleData data[];
}
particles;
// Destination instance data (write-only). main() writes five consecutive vec4
// per instance: three rows of the transposed transform, then color, then custom.
layout(set = 0, binding = 2, std430) restrict writeonly buffer Transforms {
vec4 data[];
}
instances;
// Optional key/index pairs used for depth sorting, only declared when
// USE_SORT_BUFFER is defined. main() stores the sort key in .x and the
// particle index (converted to float) in .y.
#ifdef USE_SORT_BUFFER
layout(set = 1, binding = 0, std430) restrict buffer SortBuffer {
vec2 data[];
}
sort_buffer;
#endif // USE_SORT_BUFFER
// Per-dispatch push constants for the copy/sort-fill passes.
layout(push_constant, binding = 0, std430) uniform Params {
// Direction the particle positions are projected onto to obtain the sort key.
vec3 sort_direction;
// Number of particles; invocations at or beyond this index exit immediately.
uint total_particles;
}
params;
// Entry point with two compile-time variants:
//  - MODE_FILL_SORT_BUFFER: writes one (key, index) pair per particle, where the
//    key is the particle position projected onto params.sort_direction.
//  - MODE_FILL_INSTANCES: writes the per-instance data (3 transform rows, color,
//    custom) for each particle, optionally fetching particles in the order
//    given by the sort buffer.
void main() {
#ifdef MODE_FILL_SORT_BUFFER
	uint slot = gl_GlobalInvocationID.x;
	if (slot >= params.total_particles) {
		return; // Outside the particle range, nothing to do.
	}

	vec3 position = particles.data[slot].xform[3].xyz;
	sort_buffer.data[slot].x = dot(params.sort_direction, position);
	sort_buffer.data[slot].y = float(slot);
#endif

#ifdef MODE_FILL_INSTANCES
	uint slot = gl_GlobalInvocationID.x;
	if (slot >= params.total_particles) {
		return; // Outside the particle range, nothing to do.
	}

	// Each instance occupies 5 vec4: 3 (xform rows) + 1 (color) + 1 (custom).
	uint base = slot * 5u;

	// Instance `slot` shows the particle stored at `source`; with a sort buffer
	// the source index comes from the sorted (key, index) pairs.
#ifdef USE_SORT_BUFFER
	uint source = uint(sort_buffer.data[slot].y);
#else
	uint source = slot;
#endif

	// Inactive particles keep an all-zero transform (zero scale, so invisible).
	mat4 rows = mat4(0.0);
	if (particles.data[source].is_active) {
		rows = transpose(particles.data[source].xform);
	}

	instances.data[base] = rows[0];
	instances.data[base + 1u] = rows[1];
	instances.data[base + 2u] = rows[2];
	instances.data[base + 3u] = particles.data[source].color;
	instances.data[base + 4u] = particles.data[source].custom;
#endif
}

View File

@@ -0,0 +1,203 @@
#[compute]
#version 450
VERSION_DEFINES
// Original version here:
// https://github.com/GPUOpen-LibrariesAndSDKs/GPUParticles11/blob/master/gpuparticles11/src/Shaders
//
// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// Number of elements one workgroup sorts in shared memory.
#define SORT_SIZE 512
// Each invocation handles two elements (one compare/swap pair), hence half as
// many invocations as elements.
#define NUM_THREADS (SORT_SIZE / 2)
// Not referenced anywhere in the visible part of this file.
#define INVERSION (16 * 2 + 8 * 3)
// Number of compare/swap pairs each invocation processes per pass in
// MODE_SORT_BLOCK (and half the number of elements it loads/stores).
#define ITERATIONS 1
layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in;
// Workgroup-shared staging array for the block and inner sort modes.
// MODE_SORT_STEP works directly on the storage buffer and does not declare it.
#ifndef MODE_SORT_STEP
shared vec2 g_LDS[SORT_SIZE];
#endif
// Elements being sorted. Elements are ordered by ascending .x; the whole vec2
// is moved on a swap, so .y travels along with its key as payload.
layout(set = 1, binding = 0, std430) restrict buffer SortBuffer {
vec2 data[];
}
sort_buffer;
// Per-dispatch push constants for the sort passes.
layout(push_constant, binding = 0, std430) uniform Params {
// Total number of elements in sort_buffer that take part in the sort.
uint total_elements;
// Explicit padding. NOTE(review): presumably aligns job_params to 16 bytes
// to mirror the host-side struct - confirm.
uint pad[3];
// Only read in MODE_SORT_STEP: x is used as (x - 1) bit mask to split the
// invocation id, y is a constant offset to the swap partner and z a
// multiplier applied to the low index part. w is not read.
ivec4 job_params;
}
params;
// Bitonic sort entry point with three compile-time variants, all ordering
// sort_buffer.data by ascending .x:
//  - MODE_SORT_BLOCK: fully sorts each block of SORT_SIZE elements in shared memory.
//  - MODE_SORT_STEP:  performs one global compare/swap pass described by
//                     params.job_params directly on the storage buffer.
//  - MODE_SORT_INNER: finishes a merge inside each SORT_SIZE block in shared memory.
//
// The number of valid elements per workgroup is computed with signed
// arithmetic in every mode. Doing it in unsigned arithmetic would wrap around
// for a workgroup that starts past the end of the data, making the group
// believe it owns a full block and access the buffer out of range.
void main() {
#ifdef MODE_SORT_BLOCK

	uvec3 Gid = gl_WorkGroupID;
	uvec3 GTid = gl_LocalInvocationID;
	uint GI = gl_LocalInvocationIndex;

	int GlobalBaseIndex = int((Gid.x * SORT_SIZE) + GTid.x);
	int LocalBaseIndex = int(GI);

	// Valid elements in this block: 0 past the end, at most SORT_SIZE.
	int numElementsInThreadGroup = clamp(int(params.total_elements) - int(Gid.x) * SORT_SIZE, 0, SORT_SIZE);

	// Load shared data
	int i;
	for (i = 0; i < 2 * ITERATIONS; ++i) {
		if (GI + i * NUM_THREADS < numElementsInThreadGroup) {
			g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];
		}
	}

	groupMemoryBarrier();
	barrier();

	// Bitonic sort
	for (int nMergeSize = 2; nMergeSize <= SORT_SIZE; nMergeSize = nMergeSize * 2) {
		for (int nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {
			for (i = 0; i < ITERATIONS; ++i) {
				int tmp_index = int(GI + NUM_THREADS * i);
				int index_low = tmp_index & (nMergeSubSize - 1);
				int index_high = 2 * (tmp_index - index_low);
				int index = index_high + index_low;

				// First sub-pass of a merge pairs elements mirrored around the
				// middle of the merge range; later sub-passes pair elements a
				// fixed distance apart.
				int nSwapElem = nMergeSubSize == nMergeSize >> 1 ? index_high + (2 * nMergeSubSize - 1) - index_low : index_high + nMergeSubSize + index_low;

				// index < nSwapElem always holds, so this check covers both.
				if (nSwapElem < numElementsInThreadGroup) {
					vec2 a = g_LDS[index];
					vec2 b = g_LDS[nSwapElem];

					if (a.x > b.x) {
						g_LDS[index] = b;
						g_LDS[nSwapElem] = a;
					}
				}

				// Executed by every invocation (outside the bounds check) so the
				// barrier stays in uniform control flow.
				groupMemoryBarrier();
				barrier();
			}
		}
	}

	// Store shared data
	for (i = 0; i < 2 * ITERATIONS; ++i) {
		if (GI + i * NUM_THREADS < numElementsInThreadGroup) {
			sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];
		}
	}

#endif

#ifdef MODE_SORT_STEP

	uvec3 Gid = gl_WorkGroupID;
	uvec3 GTid = gl_LocalInvocationID;

	// x: first pair handled by this workgroup, y: base offset of the array,
	// z: element count, w: valid elements in this group's 512-element window
	// (w is computed but not read in this mode).
	ivec4 tgp;

	tgp.x = int(Gid.x) * 256;
	tgp.y = 0;
	tgp.z = int(params.total_elements);
	tgp.w = min(512, max(0, tgp.z - int(Gid.x) * 512));

	uint localID = int(tgp.x) + GTid.x; // calculate threadID within this sortable-array

	uint index_low = localID & (params.job_params.x - 1);
	uint index_high = 2 * (localID - index_low);

	uint index = tgp.y + index_high + index_low;
	uint nSwapElem = tgp.y + index_high + params.job_params.y + params.job_params.z * index_low;

	if (nSwapElem < tgp.y + tgp.z) {
		vec2 a = sort_buffer.data[index];
		vec2 b = sort_buffer.data[nSwapElem];

		if (a.x > b.x) {
			sort_buffer.data[index] = b;
			sort_buffer.data[nSwapElem] = a;
		}
	}

#endif

#ifdef MODE_SORT_INNER

	uvec3 Gid = gl_WorkGroupID;
	uvec3 GTid = gl_LocalInvocationID;
	uint GI = gl_LocalInvocationIndex;

	// Same meaning as in MODE_SORT_STEP; w is the number of valid elements in
	// this group's block, computed in signed arithmetic so it clamps to 0 for
	// a group that starts past the end of the data.
	ivec4 tgp;

	tgp.x = int(Gid.x * 256);
	tgp.y = 0;
	tgp.z = int(params.total_elements.x);
	tgp.w = min(512, max(0, tgp.z - int(Gid.x) * 512));

	int GlobalBaseIndex = int(tgp.y + tgp.x * 2 + GTid.x);
	int LocalBaseIndex = int(GI);
	int i;

	// Load shared data
	for (i = 0; i < 2; ++i) {
		if (GI + i * NUM_THREADS < tgp.w) {
			g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];
		}
	}

	groupMemoryBarrier();
	barrier();

	// sort threadgroup shared memory
	for (int nMergeSubSize = SORT_SIZE >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {
		int tmp_index = int(GI);
		int index_low = tmp_index & (nMergeSubSize - 1);
		int index_high = 2 * (tmp_index - index_low);
		int index = index_high + index_low;

		int nSwapElem = index_high + nMergeSubSize + index_low;

		if (nSwapElem < tgp.w) {
			vec2 a = g_LDS[index];
			vec2 b = g_LDS[nSwapElem];

			if (a.x > b.x) {
				g_LDS[index] = b;
				g_LDS[nSwapElem] = a;
			}
		}

		groupMemoryBarrier();
		barrier();
	}

	// Store shared data
	for (i = 0; i < 2; ++i) {
		if (GI + i * NUM_THREADS < tgp.w) {
			sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];
		}
	}

#endif
}