diff --git a/src/intel/dev/intel_device_info.h b/src/intel/dev/intel_device_info.h index 9ba9949b5e6..3dc5b4bbeed 100644 --- a/src/intel/dev/intel_device_info.h +++ b/src/intel/dev/intel_device_info.h @@ -136,6 +136,7 @@ struct intel_device_info bool has_aux_map; bool has_tiling_uapi; bool has_ray_tracing; + bool has_ray_query; bool has_local_mem; bool has_lsc; bool has_mesh_shading; diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 87a49cd2226..4d3b1d28a4c 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -483,6 +483,78 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage); } +static inline uint32_t +ilog2_round_up(uint32_t value) +{ + assert(value != 0); + return 32 - __builtin_clz(value - 1); +} + +static void +anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipeline_state, + struct anv_pipeline *pipeline, + VkShaderStageFlags stages) +{ + struct anv_device *device = cmd_buffer->device; + + uint64_t ray_shadow_size = + align_u64(brw_rt_ray_queries_shadow_stacks_size(&device->info, + pipeline->ray_queries), + 4096); + if (ray_shadow_size > 0 && + (!cmd_buffer->state.ray_query_shadow_bo || + cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) { + unsigned shadow_size_log2 = MAX2(ilog2_round_up(ray_shadow_size), 16); + unsigned bucket = shadow_size_log2 - 16; + assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos)); + + struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]); + if (bo == NULL) { + struct anv_bo *new_bo; + VkResult result = anv_device_alloc_bo(device, "RT queries shadow", + ray_shadow_size, + ANV_BO_ALLOC_LOCAL_MEM, /* alloc_flags */ + 0, /* explicit_address */ + &new_bo); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + bo = 
p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo); + if (bo != NULL) { + anv_device_release_bo(device, bo); + } else { + bo = new_bo; + } + } + cmd_buffer->state.ray_query_shadow_bo = bo; + + /* Add the ray query buffers to the batch list. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + cmd_buffer->state.ray_query_shadow_bo); + } + + /* Add the HW buffer to the list of BO used. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + device->ray_query_bo); + + /* Fill the push constants & mark them dirty. */ + struct anv_state ray_query_global_state = + anv_genX(&device->info, cmd_buffer_ray_query_globals)(cmd_buffer); + + struct anv_address ray_query_globals_addr = (struct anv_address) { + .bo = device->dynamic_state_pool.block_pool.bo, + .offset = ray_query_global_state.offset, + }; + pipeline_state->push_constants.ray_query_globals = + anv_address_physical(ray_query_globals_addr); + cmd_buffer->state.push_constants_dirty |= stages; +} + void anv_CmdBindPipeline( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -490,6 +562,8 @@ void anv_CmdBindPipeline( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + struct anv_cmd_pipeline_state *state; + VkShaderStageFlags stages = 0; switch (pipelineBindPoint) { case VK_PIPELINE_BIND_POINT_COMPUTE: { @@ -502,6 +576,9 @@ void anv_CmdBindPipeline( cmd_buffer->state.compute.pipeline_dirty = true; set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, &compute_pipeline->cs->bind_map); + + state = &cmd_buffer->state.compute.base; + stages = VK_SHADER_STAGE_COMPUTE_BIT; break; } @@ -525,6 +602,9 @@ void anv_CmdBindPipeline( anv_dynamic_state_copy(&cmd_buffer->state.gfx.dynamic, &gfx_pipeline->dynamic_state, gfx_pipeline->dynamic_state_mask); + + state = &cmd_buffer->state.gfx.base; + stages = gfx_pipeline->active_stages; break; } @@ -541,6 +621,8 @@ void 
anv_CmdBindPipeline( anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer, rt_pipeline->stack_size); } + + state = &cmd_buffer->state.rt.base; break; } @@ -548,6 +630,9 @@ void anv_CmdBindPipeline( assert(!"invalid bind point"); break; } + + if (pipeline->ray_queries > 0) + anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages); } void anv_CmdSetRasterizerDiscardEnableEXT( @@ -1675,13 +1760,6 @@ void anv_CmdSetFragmentShadingRateKHR( } } -static inline uint32_t -ilog2_round_up(uint32_t value) -{ - assert(value != 0); - return 32 - __builtin_clz(value - 1); -} - void anv_CmdSetRayTracingPipelineStackSizeKHR( VkCommandBuffer commandBuffer, uint32_t pipelineStackSize) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index d86dd90adfe..99fa3242ca5 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -221,6 +221,7 @@ get_device_extensions(const struct anv_physical_device *device, device->use_call_secondary, .KHR_pipeline_executable_properties = true, .KHR_push_descriptor = true, + .KHR_ray_query = device->info.has_ray_tracing, .KHR_relaxed_block_layout = true, .KHR_sampler_mirror_clamp_to_edge = true, .KHR_sampler_ycbcr_conversion = true, @@ -1640,6 +1641,12 @@ void anv_GetPhysicalDeviceFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR: { + VkPhysicalDeviceRayQueryFeaturesKHR *features = (void *)ext; + features->rayQuery = pdevice->info.has_ray_tracing; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext; features->robustBufferAccess2 = true; @@ -3331,9 +3338,22 @@ VkResult anv_CreateDevice( device->workaround_bo->size, INTEL_DEBUG_BLOCK_TYPE_FRAME); + if (device->vk.enabled_extensions.KHR_ray_query) { + uint32_t ray_queries_size = + align_u32(brw_rt_ray_queries_hw_stacks_size(&device->info), 4096); + + result = anv_device_alloc_bo(device, "ray queries", + 
ray_queries_size, + ANV_BO_ALLOC_LOCAL_MEM, + 0 /* explicit_address */, + &device->ray_query_bo); + if (result != VK_SUCCESS) + goto fail_workaround_bo; + } + result = anv_device_init_trivial_batch(device); if (result != VK_SUCCESS) - goto fail_workaround_bo; + goto fail_ray_query_bo; if (device->info.ver >= 12 && device->vk.enabled_extensions.KHR_fragment_shading_rate) { @@ -3403,6 +3423,9 @@ VkResult anv_CreateDevice( anv_scratch_pool_finish(device, &device->scratch_pool); fail_trivial_batch: anv_device_release_bo(device, device->trivial_batch_bo); + fail_ray_query_bo: + if (device->ray_query_bo) + anv_device_release_bo(device, device->ray_query_bo); fail_workaround_bo: anv_device_release_bo(device, device->workaround_bo); fail_surface_aux_map_pool: @@ -3487,6 +3510,13 @@ void anv_DestroyDevice( anv_scratch_pool_finish(device, &device->scratch_pool); + if (device->vk.enabled_extensions.KHR_ray_query) { + for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) { + if (device->ray_query_shadow_bos[i] != NULL) + anv_device_release_bo(device, device->ray_query_shadow_bos[i]); + } + anv_device_release_bo(device, device->ray_query_bo); + } anv_device_release_bo(device, device->workaround_bo); anv_device_release_bo(device, device->trivial_batch_bo); diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index 3a7d87cae08..1a65879c44d 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -119,6 +119,8 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer); +struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer); + void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct intel_l3_config *l3_config, diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index bccc28eff6e..6334575637c 100644 
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -34,6 +34,8 @@ #define MAX_SAMPLER_TABLE_SIZE 128 #define BINDLESS_OFFSET 255 +#define sizeof_field(type, field) sizeof(((type *)0)->field) + struct apply_pipeline_layout_state { const struct anv_physical_device *pdevice; @@ -1322,6 +1324,21 @@ lower_tex(nir_builder *b, nir_tex_instr *tex, return true; } +static bool +lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *rq_globals = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0), + .base = offsetof(struct anv_push_constants, ray_query_globals), + .range = sizeof_field(struct anv_push_constants, ray_query_globals)); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, rq_globals); + + return true; +} + static bool apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state) { @@ -1360,6 +1377,8 @@ apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state) return lower_image_intrinsic(b, intrin, state); case nir_intrinsic_load_constant: return lower_load_constant(b, intrin, state); + case nir_intrinsic_load_ray_query_global_intel: + return lower_ray_query_globals(b, intrin, state); default: return false; } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 57637ee4cc0..243ecde2fbe 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -142,6 +142,7 @@ anv_shader_compile_to_nir(struct anv_device *device, .post_depth_coverage = pdevice->info.ver >= 9, .runtime_descriptor_array = true, .float_controls = pdevice->info.ver >= 8, + .ray_query = pdevice->info.has_ray_tracing, .ray_tracing = pdevice->info.has_ray_tracing, .shader_clock = true, .shader_viewport_index_layer = true, @@ -871,6 +872,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, NIR_PASS_V(nir, nir_lower_explicit_io, 
nir_var_mem_push_const, nir_address_format_32bit_offset); + NIR_PASS_V(nir, brw_nir_lower_ray_queries, &pdevice->info); + /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ anv_nir_apply_pipeline_layout(pdevice, pipeline->device->robust_buffer_access, @@ -1485,6 +1488,8 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline, } else { anv_pipeline_add_executable(pipeline, stage, bin->stats, 0); } + + pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries); } static uint32_t diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 60ed616d904..a0f0c5d7db2 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1210,6 +1210,21 @@ struct anv_device { struct anv_scratch_pool scratch_pool; struct anv_bo *rt_scratch_bos[16]; + /** Shadow ray query BO + * + * The ray_query_bo only holds the current ray being traced. When using + * more than 1 ray query per thread, we cannot fit all the queries in + * there, so we need another buffer to hold query data that is not + * currently being used by the HW for tracing, similar to a scratch space. + * + * The size of the shadow buffer depends on the number of queries per + * shader. + */ + struct anv_bo *ray_query_shadow_bos[16]; + /** Ray query buffer used to communicate with HW unit. + */ + struct anv_bo *ray_query_bo; + struct anv_shader_bin *rt_trampoline; struct anv_shader_bin *rt_trivial_return; @@ -2618,8 +2633,8 @@ struct anv_push_constants { /* Robust access pushed registers. */ uint64_t push_reg_mask[MESA_SHADER_STAGES]; - /** Pad out to a multiple of 32 bytes */ - uint32_t pad[2]; + /** Ray query globals (RT_DISPATCH_GLOBALS) */ + uint64_t ray_query_globals; /* Base addresses for descriptor sets */ uint64_t desc_sets[MAX_SETS]; @@ -3105,6 +3120,11 @@ struct anv_cmd_state { struct anv_state null_surface_state; struct anv_dynamic_render_pass dynamic_render_pass; + + /** + * A buffer used for spill/fill of ray queries. 
+ */ + struct anv_bo * ray_query_shadow_bo; }; struct anv_cmd_pool { @@ -3463,6 +3483,8 @@ struct anv_pipeline { enum anv_pipeline_type type; VkPipelineCreateFlags flags; + uint32_t ray_queries; + struct util_dynarray executables; const struct intel_l3_config * l3_config; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 9e52f749ac4..040a61bf3ce 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -5461,6 +5461,47 @@ void genX(CmdDispatchIndirect)( trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer, 0, 0, 0); } +struct anv_state +genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VERx10 >= 125 + struct anv_device *device = cmd_buffer->device; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + BRW_RT_DISPATCH_GLOBALS_SIZE, + 64); + struct brw_rt_scratch_layout layout; + uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in + * some cases? + */ + brw_rt_compute_scratch_layout(&layout, &device->info, + stack_ids_per_dss, 1 << 10); + + struct GFX_RT_DISPATCH_GLOBALS rtdg = { + .MemBaseAddress = (struct anv_address) { + /* The ray query HW computes offsets from the top of the buffer, so + * set the address to the end of the buffer. + */ + .bo = device->ray_query_bo, + .offset = device->ray_query_bo->size + }, + .AsyncRTStackSize = layout.ray_stack_stride / 64, + .NumDSSRTStacks = layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .ResumeShaderTable = (struct anv_address) { + .bo = cmd_buffer->state.ray_query_shadow_bo, + }, + }; + GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg); + + return state; +#else + unreachable("Not supported"); +#endif +} + #if GFX_VERx10 >= 125 static void calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])