diff --git a/src/intel/dev/intel_device_info.h b/src/intel/dev/intel_device_info.h index 9ba9949b5e6..3dc5b4bbeed 100644 --- a/src/intel/dev/intel_device_info.h +++ b/src/intel/dev/intel_device_info.h @@ -136,6 +136,7 @@ struct intel_device_info bool has_aux_map; bool has_tiling_uapi; bool has_ray_tracing; + bool has_ray_query; bool has_local_mem; bool has_lsc; bool has_mesh_shading; diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 87a49cd2226..4d3b1d28a4c 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -483,6 +483,78 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage); } +static inline uint32_t +ilog2_round_up(uint32_t value) +{ + assert(value != 0); + return 32 - __builtin_clz(value - 1); +} + +static void +anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipeline_state, + struct anv_pipeline *pipeline, + VkShaderStageFlags stages) +{ + struct anv_device *device = cmd_buffer->device; + + uint64_t ray_shadow_size = + align_u64(brw_rt_ray_queries_shadow_stacks_size(&device->info, + pipeline->ray_queries), + 4096); + if (ray_shadow_size > 0 && + (!cmd_buffer->state.ray_query_shadow_bo || + cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) { + unsigned shadow_size_log2 = MAX2(ilog2_round_up(ray_shadow_size), 16); + unsigned bucket = shadow_size_log2 - 16; + assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos)); + + struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]); + if (bo == NULL) { + struct anv_bo *new_bo; + VkResult result = anv_device_alloc_bo(device, "RT queries shadow", + ray_shadow_size, + ANV_BO_ALLOC_LOCAL_MEM, /* alloc_flags */ + 0, /* explicit_address */ + &new_bo); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + bo = 
p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo); + if (bo != NULL) { + anv_device_release_bo(device, bo); + } else { + bo = new_bo; + } + } + cmd_buffer->state.ray_query_shadow_bo = bo; + + /* Add the ray query buffers to the batch list. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + cmd_buffer->state.ray_query_shadow_bo); + } + + /* Add the HW buffer to the list of BO used. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + device->ray_query_bo); + + /* Fill the push constants & mark them dirty. */ + struct anv_state ray_query_global_state = + anv_genX(&device->info, cmd_buffer_ray_query_globals)(cmd_buffer); + + struct anv_address ray_query_globals_addr = (struct anv_address) { + .bo = device->dynamic_state_pool.block_pool.bo, + .offset = ray_query_global_state.offset, + }; + pipeline_state->push_constants.ray_query_globals = + anv_address_physical(ray_query_globals_addr); + cmd_buffer->state.push_constants_dirty |= stages; +} + void anv_CmdBindPipeline( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -490,6 +562,8 @@ void anv_CmdBindPipeline( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + struct anv_cmd_pipeline_state *state; + VkShaderStageFlags stages = 0; switch (pipelineBindPoint) { case VK_PIPELINE_BIND_POINT_COMPUTE: { @@ -502,6 +576,9 @@ void anv_CmdBindPipeline( cmd_buffer->state.compute.pipeline_dirty = true; set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, &compute_pipeline->cs->bind_map); + + state = &cmd_buffer->state.compute.base; + stages = VK_SHADER_STAGE_COMPUTE_BIT; break; } @@ -525,6 +602,9 @@ void anv_CmdBindPipeline( anv_dynamic_state_copy(&cmd_buffer->state.gfx.dynamic, &gfx_pipeline->dynamic_state, gfx_pipeline->dynamic_state_mask); + + state = &cmd_buffer->state.gfx.base; + stages = gfx_pipeline->active_stages; break; } @@ -541,6 +621,8 @@ void 
anv_CmdBindPipeline( anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer, rt_pipeline->stack_size); } + + state = &cmd_buffer->state.rt.base; break; } @@ -548,6 +630,9 @@ void anv_CmdBindPipeline( assert(!"invalid bind point"); break; } + + if (pipeline->ray_queries > 0) + anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages); } void anv_CmdSetRasterizerDiscardEnableEXT( @@ -1675,13 +1760,6 @@ void anv_CmdSetFragmentShadingRateKHR( } } -static inline uint32_t -ilog2_round_up(uint32_t value) -{ - assert(value != 0); - return 32 - __builtin_clz(value - 1); -} - void anv_CmdSetRayTracingPipelineStackSizeKHR( VkCommandBuffer commandBuffer, uint32_t pipelineStackSize) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index d86dd90adfe..99fa3242ca5 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -221,6 +221,7 @@ get_device_extensions(const struct anv_physical_device *device, device->use_call_secondary, .KHR_pipeline_executable_properties = true, .KHR_push_descriptor = true, + .KHR_ray_query = device->info.has_ray_tracing, .KHR_relaxed_block_layout = true, .KHR_sampler_mirror_clamp_to_edge = true, .KHR_sampler_ycbcr_conversion = true, @@ -1640,6 +1641,12 @@ void anv_GetPhysicalDeviceFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR: { + VkPhysicalDeviceRayQueryFeaturesKHR *features = (void *)ext; + features->rayQuery = pdevice->info.has_ray_tracing; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext; features->robustBufferAccess2 = true; @@ -3331,9 +3338,22 @@ VkResult anv_CreateDevice( device->workaround_bo->size, INTEL_DEBUG_BLOCK_TYPE_FRAME); + if (device->vk.enabled_extensions.KHR_ray_query) { + uint32_t ray_queries_size = + align_u32(brw_rt_ray_queries_hw_stacks_size(&device->info), 4096); + + result = anv_device_alloc_bo(device, "ray queries", + 
ray_queries_size, + ANV_BO_ALLOC_LOCAL_MEM, + 0 /* explicit_address */, + &device->ray_query_bo); + if (result != VK_SUCCESS) + goto fail_workaround_bo; + } + result = anv_device_init_trivial_batch(device); if (result != VK_SUCCESS) - goto fail_workaround_bo; + goto fail_ray_query_bo; if (device->info.ver >= 12 && device->vk.enabled_extensions.KHR_fragment_shading_rate) { @@ -3403,6 +3423,9 @@ VkResult anv_CreateDevice( anv_scratch_pool_finish(device, &device->scratch_pool); fail_trivial_batch: anv_device_release_bo(device, device->trivial_batch_bo); + fail_ray_query_bo: + if (device->ray_query_bo) + anv_device_release_bo(device, device->ray_query_bo); fail_workaround_bo: anv_device_release_bo(device, device->workaround_bo); fail_surface_aux_map_pool: @@ -3487,6 +3510,13 @@ void anv_DestroyDevice( anv_scratch_pool_finish(device, &device->scratch_pool); + if (device->vk.enabled_extensions.KHR_ray_query) { + for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) { + if (device->ray_query_shadow_bos[i] != NULL) + anv_device_release_bo(device, device->ray_query_shadow_bos[i]); + } + anv_device_release_bo(device, device->ray_query_bo); + } anv_device_release_bo(device, device->workaround_bo); anv_device_release_bo(device, device->trivial_batch_bo); diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index 3a7d87cae08..1a65879c44d 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -119,6 +119,8 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer); +struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer); + void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct intel_l3_config *l3_config, diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index bccc28eff6e..6334575637c 100644 
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -34,6 +34,8 @@ #define MAX_SAMPLER_TABLE_SIZE 128 #define BINDLESS_OFFSET 255 +#define sizeof_field(type, field) sizeof(((type *)0)->field) + struct apply_pipeline_layout_state { const struct anv_physical_device *pdevice; @@ -1322,6 +1324,21 @@ lower_tex(nir_builder *b, nir_tex_instr *tex, return true; } +static bool +lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *rq_globals = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0), + .base = offsetof(struct anv_push_constants, ray_query_globals), + .range = sizeof_field(struct anv_push_constants, ray_query_globals)); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, rq_globals); + + return true; +} + static bool apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state) { @@ -1360,6 +1377,8 @@ apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state) return lower_image_intrinsic(b, intrin, state); case nir_intrinsic_load_constant: return lower_load_constant(b, intrin, state); + case nir_intrinsic_load_ray_query_global_intel: + return lower_ray_query_globals(b, intrin, state); default: return false; } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 57637ee4cc0..243ecde2fbe 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -142,6 +142,7 @@ anv_shader_compile_to_nir(struct anv_device *device, .post_depth_coverage = pdevice->info.ver >= 9, .runtime_descriptor_array = true, .float_controls = pdevice->info.ver >= 8, + .ray_query = pdevice->info.has_ray_tracing, .ray_tracing = pdevice->info.has_ray_tracing, .shader_clock = true, .shader_viewport_index_layer = true, @@ -871,6 +872,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, NIR_PASS_V(nir, nir_lower_explicit_io, 
nir_var_mem_push_const, nir_address_format_32bit_offset); + NIR_PASS_V(nir, brw_nir_lower_ray_queries, &pdevice->info); + /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ anv_nir_apply_pipeline_layout(pdevice, pipeline->device->robust_buffer_access, @@ -1485,6 +1488,8 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline, } else { anv_pipeline_add_executable(pipeline, stage, bin->stats, 0); } + + pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries); } static uint32_t diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 60ed616d904..a0f0c5d7db2 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1210,6 +1210,21 @@ struct anv_device { struct anv_scratch_pool scratch_pool; struct anv_bo *rt_scratch_bos[16]; + /** Shadow ray query BO + * + * The ray_query_bo only holds the current ray being traced. When using + * more than 1 ray query per thread, we cannot fit all the queries in + * there, so we need another buffer to hold query data that is not + * currently being used by the HW for tracing, similar to a scratch space. + * + * The size of the shadow buffer depends on the number of queries per + * shader. + */ + struct anv_bo *ray_query_shadow_bos[16]; + /** Ray query buffer used to communicate with HW unit. + */ + struct anv_bo *ray_query_bo; + struct anv_shader_bin *rt_trampoline; struct anv_shader_bin *rt_trivial_return; @@ -2618,8 +2633,8 @@ struct anv_push_constants { /* Robust access pushed registers. */ uint64_t push_reg_mask[MESA_SHADER_STAGES]; - /** Pad out to a multiple of 32 bytes */ - uint32_t pad[2]; + /** Ray query globals (RT_DISPATCH_GLOBALS) */ + uint64_t ray_query_globals; /* Base addresses for descriptor sets */ uint64_t desc_sets[MAX_SETS]; @@ -3105,6 +3120,11 @@ struct anv_cmd_state { struct anv_state null_surface_state; struct anv_dynamic_render_pass dynamic_render_pass; + + /** + * A buffer used for spill/fill of ray queries. 
+ */ + struct anv_bo * ray_query_shadow_bo; }; struct anv_cmd_pool { @@ -3463,6 +3483,8 @@ struct anv_pipeline { enum anv_pipeline_type type; VkPipelineCreateFlags flags; + uint32_t ray_queries; + struct util_dynarray executables; const struct intel_l3_config * l3_config; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 9e52f749ac4..040a61bf3ce 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -5461,6 +5461,47 @@ void genX(CmdDispatchIndirect)( trace_intel_end_compute(&cmd_buffer->trace, cmd_buffer, 0, 0, 0); } +struct anv_state +genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VERx10 >= 125 + struct anv_device *device = cmd_buffer->device; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + BRW_RT_DISPATCH_GLOBALS_SIZE, + 64); + struct brw_rt_scratch_layout layout; + uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in + * some cases? + */ + brw_rt_compute_scratch_layout(&layout, &device->info, + stack_ids_per_dss, 1 << 10); + + struct GFX_RT_DISPATCH_GLOBALS rtdg = { + .MemBaseAddress = (struct anv_address) { + /* The ray query HW computes offsets from the top of the buffer, so + * set the address to the end of the buffer. + */ + .bo = device->ray_query_bo, + .offset = device->ray_query_bo->size + }, + .AsyncRTStackSize = layout.ray_stack_stride / 64, + .NumDSSRTStacks = layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .ResumeShaderTable = (struct anv_address) { + .bo = cmd_buffer->state.ray_query_shadow_bo, + }, + }; + GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg); + + return state; +#else + unreachable("Not supported"); +#endif +} + #if GFX_VERx10 >= 125 static void calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])