From ea159e47a592bdf0ae6f90d5c39fdaf9153b2a05 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Mon, 21 Nov 2022 01:11:36 +0100 Subject: [PATCH] radv: Add new LBVH shaders. Contrary to the previous implementation, this actually implements an LBVH builder. Part-of: --- src/amd/vulkan/bvh/build_interface.h | 28 ++++ src/amd/vulkan/bvh/lbvh_generate_ir.comp | 138 ++++++++++++++++ src/amd/vulkan/bvh/lbvh_main.comp | 157 +++++++++++++++++++ src/amd/vulkan/bvh/meson.build | 2 + src/amd/vulkan/radv_acceleration_structure.c | 31 ++++ src/amd/vulkan/radv_private.h | 4 + 6 files changed, 360 insertions(+) create mode 100644 src/amd/vulkan/bvh/lbvh_generate_ir.comp create mode 100644 src/amd/vulkan/bvh/lbvh_main.comp diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h index cc829ca209e..dbb866fa3d3 100644 --- a/src/amd/vulkan/bvh/build_interface.h +++ b/src/amd/vulkan/bvh/build_interface.h @@ -65,6 +65,34 @@ struct lbvh_internal_args { uint32_t src_count; }; +#define LBVH_RIGHT_CHILD_BIT_SHIFT 29 +#define LBVH_RIGHT_CHILD_BIT (1 << LBVH_RIGHT_CHILD_BIT_SHIFT) +/* Per-node bookkeeping written by lbvh_main.comp and read/updated by lbvh_generate_ir.comp. */ +struct lbvh_node_info { + /* Number of children that have been processed (or are invalid/leaves) in + * the lbvh_generate_ir pass. 
+ */ + uint32_t path_count; + /* children: IR node ids or RADV_BVH_INVALID_NODE; parent: parent index, LBVH_RIGHT_CHILD_BIT set for a right child. */ + uint32_t children[2]; + uint32_t parent; +}; +/* Push constants of lbvh_main.comp. */ +struct lbvh_main_args { + VOID_REF bvh; + REF(key_id_pair) src_ids; + VOID_REF node_info; + uint32_t id_count; + uint32_t internal_node_base; +}; +/* Push constants of lbvh_generate_ir.comp. */ +struct lbvh_generate_ir_args { + VOID_REF bvh; + VOID_REF node_info; + VOID_REF header; + uint32_t internal_node_base; +}; + #define RADV_COPY_MODE_COPY 0 #define RADV_COPY_MODE_SERIALIZE 1 #define RADV_COPY_MODE_DESERIALIZE 2 diff --git a/src/amd/vulkan/bvh/lbvh_generate_ir.comp b/src/amd/vulkan/bvh/lbvh_generate_ir.comp new file mode 100644 index 00000000000..3956509d449 --- /dev/null +++ b/src/amd/vulkan/bvh/lbvh_generate_ir.comp @@ -0,0 +1,138 @@ +/* + * Copyright © 2022 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_KHR_memory_scope_semantics : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +#include "build_interface.h" + +TYPE(lbvh_node_info, 4); + +layout(push_constant) uniform CONSTS +{ + lbvh_generate_ir_args args; +}; +/* Bottom-up pass: each invocation walks from node global_id towards the root; the last path to reach a node writes its IR box node. */ +void +main(void) +{ + uint32_t global_id = gl_GlobalInvocationID.x; + + uint32_t idx = global_id; + /* Id and bounds of the node this invocation emitted last, kept so its bounds need not be re-read. */ + uint32_t previous_id = RADV_BVH_INVALID_NODE; + radv_aabb previous_bounds; + previous_bounds.min = vec3(INFINITY); + previous_bounds.max = vec3(-INFINITY); + + for (;;) { + uint32_t count = 0; + + /* Check if all children have been processed. As this is an atomic the last path coming from + * a child will pass here, while earlier paths break. + */ + count = atomicAdd( + DEREF(INDEX(lbvh_node_info, args.node_info, idx)).path_count, 1, gl_ScopeDevice, + gl_StorageSemanticsBuffer, + gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible); + if (count != 2) + break; + + /* We allocate nodes on demand with the atomic here to ensure children come before their + * parents, which is a requirement of the converter. 
+ */ + uint32_t dst_idx = + atomicAdd(DEREF(REF(radv_ir_header)(args.header)).ir_internal_node_count, 1); + + uint32_t current_offset = args.internal_node_base + dst_idx * SIZEOF(radv_ir_box_node); + uint32_t current_id = pack_ir_node_id(current_offset, radv_ir_node_internal); + + REF(radv_ir_box_node) node = REF(radv_ir_box_node)(OFFSET(args.bvh, current_offset)); + radv_aabb bounds = previous_bounds; + + lbvh_node_info info = DEREF(INDEX(lbvh_node_info, args.node_info, idx)); + + uint32_t children[2] = info.children; + + /* If we came from one of the children, its bounds are already in previous_bounds; fetch only the other one. */ + int32_t previous_child_index = -1; + if (previous_id == children[0]) + previous_child_index = 0; + else if (previous_id == children[1]) + previous_child_index = 1; + + if (previous_child_index == -1) { + if (children[0] != RADV_BVH_INVALID_NODE) { + uint32_t child_offset = ir_id_to_offset(children[0]); + REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); + radv_aabb child_bounds = DEREF(child).aabb; + bounds.min = min(bounds.min, child_bounds.min); + bounds.max = max(bounds.max, child_bounds.max); + } + previous_child_index = 0; + } + + /* Fetch the non-cached child */ + if (children[1 - previous_child_index] != RADV_BVH_INVALID_NODE) { + uint32_t child_offset = ir_id_to_offset(children[1 - previous_child_index]); + REF(radv_ir_node) child = REF(radv_ir_node)(OFFSET(args.bvh, child_offset)); + radv_aabb child_bounds = DEREF(child).aabb; + bounds.min = min(bounds.min, child_bounds.min); + bounds.max = max(bounds.max, child_bounds.max); + } + + radv_ir_box_node node_value; + + node_value.base.aabb = bounds; + node_value.in_final_tree = FINAL_TREE_UNKNOWN; + node_value.children = children; + + DEREF(node) = node_value; + /* A node without a parent is the root: the walk ends here. */ + if (info.parent == RADV_BVH_INVALID_NODE) + break; + + idx = info.parent & ~LBVH_RIGHT_CHILD_BIT; + /* Store this node's IR id in the parent's child slot selected by LBVH_RIGHT_CHILD_BIT. */ + DEREF(INDEX(lbvh_node_info, args.node_info, idx)) + .children[(info.parent >> LBVH_RIGHT_CHILD_BIT_SHIFT) & 1] = 
current_id; + + previous_id = current_id; + previous_bounds = bounds; + /* NOTE(review): presumably orders the writes above before the parent's path_count atomic - confirm. */ + memoryBarrierBuffer(); + } +} diff --git a/src/amd/vulkan/bvh/lbvh_main.comp b/src/amd/vulkan/bvh/lbvh_main.comp new file mode 100644 index 00000000000..7d8487b4f5d --- /dev/null +++ b/src/amd/vulkan/bvh/lbvh_main.comp @@ -0,0 +1,157 @@ +/* + * Copyright © 2022 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +#include "build_interface.h" + +TYPE(lbvh_node_info, 4); + +layout(push_constant) uniform CONSTS +{ + lbvh_main_args args; +}; +/* Length of the common prefix of leaves i and j; -1 if j is outside [0, id_count). */ +int32_t +longest_common_prefix(int32_t i, uint32_t key_i, int32_t j) +{ + if (j < 0 || j >= args.id_count) + return -1; + + uint32_t key_j = DEREF(INDEX(key_id_pair, args.src_ids, j)).key; + /* Equal keys are disambiguated by their indices, on top of the 32 matching key bits. */ + uint32_t diff = key_i ^ key_j; + int32_t ret = 0; + if (key_i == key_j) { + ret += 32; + diff = i ^ j; + } + + return ret + 31 - findMSB(diff); +} + +/* + * The LBVH algorithm constructs a radix tree of the sorted nodes according to their key. + * + * We do this by making the following decision: + * + * Node N always either starts or ends at leaf N. + * + * From there it follows that we always have to extend it into the direction which has + * a longer common prefix with the direct neighbour. Then we try to make the node cover + * as many leaves as possible without including the other neighbour. + * + * For finding the split point we compute the longest common prefix of all the leaves within the + * node, and look for the first leaf whose common prefix with leaf N has exactly that length. + * + * To give an example: leaves=[000,001,010,011,100,101,110,111], node=5 (with value 101) + * + * lcp(101, 100) = 2 and lcp(101, 110) = 1, so we extend down. 
+ * lcp(101, 011) = 0, so the range of the node is [4,5] with values [100, 101] + * + * The lcp of all the leaves in the range is the same as the lcp of the first and last leaf, in this + * case that is lcp(101, 100) = 2. Then we have lcp(101, 101) = 3 and lcp(101, 100) = 2, so the first + * leaf whose lcp is not longer than that is 4. Hence the two children of this node have range [4,4] and [5,5] + */ +void +main() +{ + if (args.id_count <= 1) { /* Degenerate tree: one root with at most one leaf; path_count = 2 lets lbvh_generate_ir emit it. */ + REF(lbvh_node_info) dst = REF(lbvh_node_info)(args.node_info); + DEREF(dst).parent = RADV_BVH_INVALID_NODE; + DEREF(dst).path_count = 2; + DEREF(dst).children[0] = + args.id_count == 1 ? DEREF(INDEX(key_id_pair, args.src_ids, 0)).id : RADV_BVH_INVALID_NODE; + DEREF(dst).children[1] = RADV_BVH_INVALID_NODE; + return; + } + /* General case: this invocation builds internal node id, whose leaf range starts or ends at leaf id. */ + int32_t id = int32_t(gl_GlobalInvocationID.x); + uint32_t id_key = DEREF(INDEX(key_id_pair, args.src_ids, id)).key; + + int32_t left_lcp = longest_common_prefix(id, id_key, id - 1); + int32_t right_lcp = longest_common_prefix(id, id_key, id + 1); + int32_t dir = right_lcp > left_lcp ? 1 : -1; + int32_t lcp_min = min(left_lcp, right_lcp); + + /* Determine the bounds for the binary search for the length of the range that + * this subtree is going to own. + */ + int32_t lmax = 128; + while (longest_common_prefix(id, id_key, id + dir * lmax) > lcp_min) { + lmax *= 2; + } + /* Binary search for the furthest leaf that still shares more than lcp_min bits with leaf id. */ + int32_t length = 0; + for (int32_t t = lmax / 2; t >= 1; t /= 2) { + if (longest_common_prefix(id, id_key, id + (length + t) * dir) > lcp_min) + length += t; + } + int32_t other_end = id + length * dir; + + /* The number of bits in the prefix that is the same for all elements in the + * range. 
+ */ + int32_t lcp_node = longest_common_prefix(id, id_key, other_end); + int32_t child_range = 0; + for (int32_t diff = 2; diff < 2 * length; diff *= 2) { + int32_t t = DIV_ROUND_UP(length, diff); + if (longest_common_prefix(id, id_key, id + (child_range + t) * dir) > lcp_node) + child_range += t; + } + + int32_t child_split = id + child_range * dir; + + /* If dir = -1, right = child_split; otherwise left = child_split. */ + int32_t left = child_split + min(dir, 0); + int32_t right = left + 1; + + /* If the number of leaves covered by a child is 1, we can use the leaf directly. */ + bool left_leaf = min(id, other_end) == left; + bool right_leaf = max(id, other_end) == right; + /* A non-leaf child is the internal node with index left/right; record this node as its parent. */ + if (!left_leaf) + DEREF(INDEX(lbvh_node_info, args.node_info, left)).parent = id; + if (!right_leaf) + DEREF(INDEX(lbvh_node_info, args.node_info, right)).parent = LBVH_RIGHT_CHILD_BIT | id; + /* Leaf children count as already processed; slots of internal children are overwritten by lbvh_generate_ir. */ + REF(lbvh_node_info) dst = INDEX(lbvh_node_info, args.node_info, id); + DEREF(dst).path_count = (left_leaf ? 1 : 0) + (right_leaf ? 1 : 0); + DEREF(dst).children[0] = DEREF(INDEX(key_id_pair, args.src_ids, left)).id; + DEREF(dst).children[1] = DEREF(INDEX(key_id_pair, args.src_ids, right)).id; + if (id == 0) + DEREF(dst).parent = RADV_BVH_INVALID_NODE; +} \ No newline at end of file diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build index 16fd7fc5153..c63818292b6 100644 --- a/src/amd/vulkan/bvh/meson.build +++ b/src/amd/vulkan/bvh/meson.build @@ -21,6 +21,8 @@ bvh_shaders = [ 'copy.comp', 'lbvh_internal.comp', + 'lbvh_generate_ir.comp', + 'lbvh_main.comp', 'leaf.comp', 'morton.comp', 'ploc_internal.comp', diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c index 6ecd5feeca0..b0a5aa20254 100644 --- a/src/amd/vulkan/radv_acceleration_structure.c +++ b/src/amd/vulkan/radv_acceleration_structure.c @@ -43,6 +43,14 @@ static const uint32_t lbvh_internal_spv[] = { #include "bvh/lbvh_internal.comp.spv.h" }; +static const uint32_t lbvh_main_spv[] = { +#include 
"bvh/lbvh_main.comp.spv.h" +}; + +static const uint32_t lbvh_generate_ir_spv[] = { +#include "bvh/lbvh_generate_ir.comp.spv.h" +}; + static const uint32_t ploc_spv[] = { #include "bvh/ploc_internal.comp.spv.h" }; @@ -316,6 +324,10 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) &state->alloc); radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.ploc_pipeline, &state->alloc); + radv_DestroyPipeline(radv_device_to_handle(device), + state->accel_struct_build.lbvh_generate_ir_pipeline, &state->alloc); + radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.lbvh_main_pipeline, + &state->alloc); radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.lbvh_internal_pipeline, &state->alloc); radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline, @@ -330,6 +342,10 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device) state->accel_struct_build.copy_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), state->accel_struct_build.ploc_p_layout, &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->accel_struct_build.lbvh_generate_ir_p_layout, &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->accel_struct_build.lbvh_main_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), state->accel_struct_build.lbvh_internal_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), @@ -434,6 +450,21 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (result != VK_SUCCESS) return result; + result = create_build_pipeline_spv(device, lbvh_main_spv, sizeof(lbvh_main_spv), + sizeof(struct lbvh_main_args), + &device->meta_state.accel_struct_build.lbvh_main_pipeline, + &device->meta_state.accel_struct_build.lbvh_main_p_layout); + if (result != VK_SUCCESS) + return 
result; + + result = + create_build_pipeline_spv(device, lbvh_generate_ir_spv, sizeof(lbvh_generate_ir_spv), + sizeof(struct lbvh_generate_ir_args), + &device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline, + &device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout); + if (result != VK_SUCCESS) + return result; + result = create_build_pipeline_spv(device, ploc_spv, sizeof(ploc_spv), sizeof(struct ploc_args), &device->meta_state.accel_struct_build.ploc_pipeline, &device->meta_state.accel_struct_build.ploc_p_layout); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 1117cf84911..8dfa91b5138 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -679,6 +679,10 @@ struct radv_meta_state { VkPipeline morton_pipeline; VkPipelineLayout lbvh_internal_p_layout; VkPipeline lbvh_internal_pipeline; + VkPipelineLayout lbvh_main_p_layout; + VkPipeline lbvh_main_pipeline; + VkPipelineLayout lbvh_generate_ir_p_layout; + VkPipeline lbvh_generate_ir_pipeline; VkPipelineLayout ploc_p_layout; VkPipeline ploc_pipeline; VkPipelineLayout convert_leaf_p_layout;