From 60a91dddf4be6e7e48c488e2d9fe1b374aaaa276 Mon Sep 17 00:00:00 2001
From: Konstantin Seurer <konstantin.seurer@gmail.com>
Date: Tue, 14 Jun 2022 17:12:43 +0200
Subject: [PATCH] radv: Switch to the GLSL internal implementation

Signed-off-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17028>
---
 src/amd/vulkan/radv_acceleration_structure.c | 185 ++-----------------
 1 file changed, 18 insertions(+), 167 deletions(-)

diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index 67d6e1c6bae..3a692faef78 100644
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@@ -33,6 +33,10 @@ static const uint32_t morton_spv[] = {
 #include "bvh/morton.comp.spv.h"
 };
 
+static const uint32_t internal_spv[] = {
+#include "bvh/internal.comp.spv.h"
+};
+
 /* Min and max bounds of the bvh used to compute morton codes */
 #define SCRATCH_TOTAL_BOUNDS_SIZE (6 * sizeof(float))
 
@@ -392,13 +396,12 @@ struct morton_constants {
    uint64_t ids_addr;
 };
 
-struct build_internal_constants {
-   uint64_t node_dst_addr;
-   uint64_t scratch_addr;
+struct internal_constants {
+   uint64_t bvh_addr;
+   uint64_t src_ids_addr;
+   uint64_t dst_ids_addr;
    uint32_t dst_offset;
-   uint32_t dst_scratch_offset;
-   uint32_t src_scratch_offset;
-   uint32_t fill_header;
+   uint32_t fill_count;
 };
 
 /* This inverts a 3x3 matrix using cofactors, as in e.g.
@@ -725,157 +728,6 @@ build_leaf_shader(struct radv_device *dev)
    return b.shader;
 }
 
-static void
-determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def *node_id,
-                 nir_variable *bounds_vars[2])
-{
-   nir_ssa_def *node_type = nir_iand_imm(b, node_id, 7);
-   node_addr =
-      nir_iadd(b, node_addr, nir_u2u64(b, nir_ishl_imm(b, nir_iand_imm(b, node_id, ~7u), 3)));
-
-   nir_push_if(b, nir_ieq_imm(b, node_type, radv_bvh_node_triangle));
-   {
-      nir_ssa_def *positions[3];
-      for (unsigned i = 0; i < 3; ++i)
-         positions[i] = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, i * 12));
-      nir_ssa_def *bounds[] = {positions[0], positions[0]};
-      for (unsigned i = 1; i < 3; ++i) {
-         bounds[0] = nir_fmin(b, bounds[0], positions[i]);
-         bounds[1] = nir_fmax(b, bounds[1], positions[i]);
-      }
-      nir_store_var(b, bounds_vars[0], bounds[0], 7);
-      nir_store_var(b, bounds_vars[1], bounds[1], 7);
-   }
-   nir_push_else(b, NULL);
-   nir_push_if(b, nir_ieq_imm(b, node_type, radv_bvh_node_internal));
-   {
-      nir_ssa_def *input_bounds[4][2];
-      for (unsigned i = 0; i < 4; ++i)
-         for (unsigned j = 0; j < 2; ++j)
-            input_bounds[i][j] =
-               nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, 16 + i * 24 + j * 12));
-      nir_ssa_def *bounds[] = {input_bounds[0][0], input_bounds[0][1]};
-      for (unsigned i = 1; i < 4; ++i) {
-         bounds[0] = nir_fmin(b, bounds[0], input_bounds[i][0]);
-         bounds[1] = nir_fmax(b, bounds[1], input_bounds[i][1]);
-      }
-
-      nir_store_var(b, bounds_vars[0], bounds[0], 7);
-      nir_store_var(b, bounds_vars[1], bounds[1], 7);
-   }
-   nir_push_else(b, NULL);
-   nir_push_if(b, nir_ieq_imm(b, node_type, radv_bvh_node_instance));
-   { /* Instances */
-      nir_ssa_def *bounds[2];
-      for (unsigned i = 0; i < 2; ++i)
-         bounds[i] = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, 64 + i * 12));
-      nir_store_var(b, bounds_vars[0], bounds[0], 7);
-      nir_store_var(b, bounds_vars[1], bounds[1], 7);
-   }
-   nir_push_else(b, NULL);
-   { /* AABBs */
-      nir_ssa_def *bounds[2];
-      for (unsigned i = 0; i < 2; ++i)
-         bounds[i] = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, i * 12));
-      nir_store_var(b, bounds_vars[0], bounds[0], 7);
-      nir_store_var(b, bounds_vars[1], bounds[1], 7);
-   }
-   nir_pop_if(b, NULL);
-   nir_pop_if(b, NULL);
-   nir_pop_if(b, NULL);
-}
-
-static nir_shader *
-build_internal_shader(struct radv_device *dev)
-{
-   const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
-   nir_builder b = create_accel_build_shader(dev, "accel_build_internal_shader");
-
-   /*
-    * push constants:
-    *   i32 x 2: node dst address
-    *   i32 x 2: scratch address
-    *   i32: dst offset
-    *   i32: dst scratch offset
-    *   i32: src scratch offset
-    *   i32: src_node_count | (fill_header << 31)
-    */
-   nir_ssa_def *pconst0 =
-      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
-   nir_ssa_def *pconst1 =
-      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);
-
-   nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b0011));
-   nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b1100));
-   nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);
-   nir_ssa_def *dst_scratch_offset = nir_channel(&b, pconst1, 1);
-   nir_ssa_def *src_scratch_offset = nir_channel(&b, pconst1, 2);
-   nir_ssa_def *src_node_count = nir_iand_imm(&b, nir_channel(&b, pconst1, 3), 0x7FFFFFFFU);
-   nir_ssa_def *fill_header =
-      nir_ine_imm(&b, nir_iand_imm(&b, nir_channel(&b, pconst1, 3), 0x80000000U), 0);
-
-   nir_ssa_def *global_id =
-      nir_iadd(&b,
-               nir_imul_imm(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),
-                            b.shader->info.workgroup_size[0]),
-               nir_channels(&b, nir_load_local_invocation_id(&b), 1));
-   nir_ssa_def *src_idx = nir_imul_imm(&b, global_id, 4);
-   nir_ssa_def *src_count = nir_umin(&b, nir_imm_int(&b, 4), nir_isub(&b, src_node_count, src_idx));
-
-   nir_ssa_def *node_offset = nir_iadd(&b, node_dst_offset, nir_ishl_imm(&b, global_id, 7));
-   nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset));
-
-   nir_ssa_def *src_base_addr = nir_iadd(
-      &b, scratch_addr,
-      nir_u2u64(&b, nir_iadd(&b, src_scratch_offset, nir_imul_imm(&b, src_idx, KEY_ID_PAIR_SIZE))));
-
-   nir_ssa_def *src_nodes[4];
-   for (uint32_t i = 0; i < 4; i++) {
-      src_nodes[i] =
-         nir_build_load_global(&b, 1, 32, nir_iadd_imm(&b, src_base_addr, i * KEY_ID_PAIR_SIZE));
-      nir_build_store_global(&b, src_nodes[i], nir_iadd_imm(&b, node_dst_addr, i * 4));
-   }
-
-   nir_ssa_def *total_bounds[2] = {
-      nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
-      nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
-   };
-
-   for (unsigned i = 0; i < 4; ++i) {
-      nir_variable *bounds[2] = {
-         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
-         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
-      };
-      nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
-      nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
-
-      nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count));
-      determine_bounds(&b, node_addr, src_nodes[i], bounds);
-      nir_pop_if(&b, NULL);
-      nir_build_store_global(&b, nir_load_var(&b, bounds[0]),
-                             nir_iadd_imm(&b, node_dst_addr, 16 + 24 * i));
-      nir_build_store_global(&b, nir_load_var(&b, bounds[1]),
-                             nir_iadd_imm(&b, node_dst_addr, 28 + 24 * i));
-      total_bounds[0] = nir_fmin(&b, total_bounds[0], nir_load_var(&b, bounds[0]));
-      total_bounds[1] = nir_fmax(&b, total_bounds[1], nir_load_var(&b, bounds[1]));
-   }
-
-   nir_ssa_def *node_id =
-      nir_iadd_imm(&b, nir_ushr_imm(&b, node_offset, 3), radv_bvh_node_internal);
-   nir_ssa_def *dst_scratch_addr = nir_iadd(
-      &b, scratch_addr,
-      nir_u2u64(&b,
-                nir_iadd(&b, dst_scratch_offset, nir_imul_imm(&b, global_id, KEY_ID_PAIR_SIZE))));
-   nir_build_store_global(&b, node_id, dst_scratch_addr);
-
-   nir_push_if(&b, fill_header);
-   nir_build_store_global(&b, node_id, node_addr);
-   nir_build_store_global(&b, total_bounds[0], nir_iadd_imm(&b, node_addr, 8));
-   nir_build_store_global(&b, total_bounds[1], nir_iadd_imm(&b, node_addr, 20));
-   nir_pop_if(&b, NULL);
-   return b.shader;
-}
-
 enum copy_mode {
    COPY_MODE_COPY,
    COPY_MODE_SERIALIZE,
@@ -1245,7 +1097,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
 {
    VkResult result;
    nir_shader *leaf_cs = build_leaf_shader(device);
-   nir_shader *internal_cs = build_internal_shader(device);
    nir_shader *copy_cs = build_copy_shader(device);
 
    result = create_build_pipeline(device, leaf_cs, sizeof(struct build_primitive_constants),
@@ -1254,9 +1105,10 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
    if (result != VK_SUCCESS)
       return result;
 
-   result = create_build_pipeline(device, internal_cs, sizeof(struct build_internal_constants),
-                                  &device->meta_state.accel_struct_build.internal_pipeline,
-                                  &device->meta_state.accel_struct_build.internal_p_layout);
+   result = create_build_pipeline_spv(device, internal_spv, sizeof(internal_spv),
+                                      sizeof(struct internal_constants),
+                                      &device->meta_state.accel_struct_build.internal_pipeline,
+                                      &device->meta_state.accel_struct_build.internal_p_layout);
    if (result != VK_SUCCESS)
       return result;
 
@@ -1503,13 +1355,12 @@ radv_CmdBuildAccelerationStructuresKHR(
          if (final_iter)
             dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);
 
-         const struct build_internal_constants consts = {
-            .node_dst_addr = radv_accel_struct_get_va(accel_struct),
-            .scratch_addr = pInfos[i].scratchData.deviceAddress,
+         const struct internal_constants consts = {
+            .bvh_addr = radv_accel_struct_get_va(accel_struct),
+            .src_ids_addr = pInfos[i].scratchData.deviceAddress + src_scratch_offset,
+            .dst_ids_addr = pInfos[i].scratchData.deviceAddress + dst_scratch_offset,
             .dst_offset = dst_node_offset,
-            .dst_scratch_offset = dst_scratch_offset,
-            .src_scratch_offset = src_scratch_offset,
-            .fill_header = bvh_states[i].node_count | (final_iter ? 0x80000000U : 0),
+            .fill_count = bvh_states[i].node_count | (final_iter ? 0x80000000U : 0),
          };
 
          radv_CmdPushConstants(commandBuffer,