radv: emit compute pipelines directly from the cmdbuf

Using this intermediate CS isn't really useful and it prevents us from optimizing register writes in the near future. This will also be removed for graphics pipelines. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28977>
2024-04-29 15:06:26 +02:00
parent 72a73a6f8a
commit 8c4d0b287f
4 changed files with 40 additions and 54 deletions
@@ -1901,6 +1901,29 @@ radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader
   cmd_buffer->state.emitted_ps_epilog = ps_epilog;
 }

+static void
+radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
+                         const struct radv_shader *shader)
+{
+   uint64_t va = radv_shader_get_va(shader);
+
+   radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
+
+   radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+   radeon_emit(cs, shader->config.rsrc1);
+   radeon_emit(cs, shader->config.rsrc2);
+   if (pdev->info.gfx_level >= GFX10) {
+      radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
+   }
+
+   radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, radv_get_compute_resource_limits(pdev, shader));
+
+   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
+   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
+   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
+}
+
 static void
 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
@@ -6585,17 +6608,21 @@ static void
 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_compute_pipeline *pipeline)
 {
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
      return;

-   assert(!pipeline->base.ctx_cs.cdw);
+   radeon_check_space(device->ws, cmd_buffer->cs, pdev->info.gfx_level >= GFX10 ? 19 : 16);
+
+   if (pipeline->base.type == RADV_PIPELINE_COMPUTE) {
+      radv_emit_compute_shader(pdev, cmd_buffer->cs, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]);
+   } else {
+      radv_emit_compute_shader(pdev, cmd_buffer->cs, cmd_buffer->state.rt_prolog);
+   }

   cmd_buffer->state.emitted_compute_pipeline = pipeline;

-   radeon_check_space(device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
-   radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
-
   if (radv_device_fault_detection_enabled(device))
      radv_save_pipeline(cmd_buffer, &pipeline->base);
 }
@@ -37,7 +37,7 @@
 #include "sid.h"
 #include "vk_format.h"

-static uint32_t
+uint32_t
 radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader *cs)
 {
   unsigned threads_per_threadgroup;
@@ -95,53 +95,13 @@ radv_get_compute_pipeline_metadata(const struct radv_device *device, const struc
 }

 void
-radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
-                         const struct radv_shader *shader)
-{
-   uint64_t va = radv_shader_get_va(shader);
-
-   radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
-
-   radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-   radeon_emit(cs, shader->config.rsrc1);
-   radeon_emit(cs, shader->config.rsrc2);
-   if (pdev->info.gfx_level >= GFX10) {
-      radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
-   }
-
-   radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, radv_get_compute_resource_limits(pdev, shader));
-
-   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
-   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
-   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
-}
-
-static void
-radv_compute_generate_pm4(const struct radv_device *device, struct radv_compute_pipeline *pipeline,
-                          struct radv_shader *shader)
-{
-   const struct radv_physical_device *pdev = radv_device_physical(device);
-   struct radeon_cmdbuf *cs = &pipeline->base.cs;
-
-   cs->reserved_dw = cs->max_dw = pdev->info.gfx_level >= GFX10 ? 19 : 16;
-   cs->buf = malloc(cs->max_dw * 4);
-
-   radv_emit_compute_shader(pdev, cs, shader);
-
-   assert(pipeline->base.cs.cdw <= pipeline->base.cs.max_dw);
-}
-
-void
-radv_compute_pipeline_init(const struct radv_device *device, struct radv_compute_pipeline *pipeline,
-                           const struct radv_pipeline_layout *layout, struct radv_shader *shader)
+radv_compute_pipeline_init(struct radv_compute_pipeline *pipeline, const struct radv_pipeline_layout *layout,
+                           struct radv_shader *shader)
 {
   pipeline->base.need_indirect_descriptor_sets |= radv_shader_need_indirect_descriptor_sets(shader);

   pipeline->base.push_constant_size = layout->push_constant_size;
   pipeline->base.dynamic_offset_count = layout->dynamic_offset_count;
-
-   radv_compute_generate_pm4(device, pipeline, shader);
 }

 struct radv_shader *
@@ -321,7 +281,7 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkC
      return result;
   }

-   radv_compute_pipeline_init(device, pipeline, pipeline_layout, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
+   radv_compute_pipeline_init(pipeline, pipeline_layout, pipeline->base.shaders[MESA_SHADER_COMPUTE]);

   if (pipeline->base.create_flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) {
      const VkComputePipelineIndirectBufferInfoNV *indirect_buffer =
@@ -42,14 +42,13 @@ struct radv_compute_pipeline_metadata {
   uint64_t inline_push_const_mask;
 };

+uint32_t radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader *cs);
+
 void radv_get_compute_pipeline_metadata(const struct radv_device *device, const struct radv_compute_pipeline *pipeline,
                                        struct radv_compute_pipeline_metadata *metadata);

-void radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
-                              const struct radv_shader *shader);
-
-void radv_compute_pipeline_init(const struct radv_device *device, struct radv_compute_pipeline *pipeline,
-                                const struct radv_pipeline_layout *layout, struct radv_shader *shader);
+void radv_compute_pipeline_init(struct radv_compute_pipeline *pipeline, const struct radv_pipeline_layout *layout,
+                                struct radv_shader *shader);

 struct radv_shader *radv_compile_cs(struct radv_device *device, struct vk_pipeline_cache *cache,
                                    struct radv_shader_stage *cs_stage, bool keep_executable_info,
@@ -1031,7 +1031,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra
      compute_rt_stack_size(pCreateInfo, pipeline);
      compile_rt_prolog(device, pipeline);

-      radv_compute_pipeline_init(device, &pipeline->base, pipeline_layout, pipeline->prolog);
+      radv_compute_pipeline_init(&pipeline->base, pipeline_layout, pipeline->prolog);
   }

   /* write shader VAs into group handles */