diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 2e17e544b27..3504813f8d2 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -121,7 +121,7 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
    cmd_buffer->level = level;
 
    list_inithead(&cmd_buffer->private_objs);
-   list_inithead(&cmd_buffer->submit_jobs);
+   list_inithead(&cmd_buffer->jobs);
    list_inithead(&cmd_buffer->list_link);
 
    assert(pool);
@@ -188,15 +188,20 @@ v3dv_job_destroy(struct v3dv_job *job)
 
    list_del(&job->list_link);
 
-   switch (job->type) {
-   case V3DV_JOB_TYPE_GPU_CL:
-      job_destroy_gpu_cl_resources(job);
-      break;
-   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
-      job_destroy_cpu_wait_events_resources(job);
-      break;
-   default:
-      break;
+   /* Cloned jobs don't make deep copies of the original jobs, so they don't
+    * own any of their resources.
+    */
+   if (!job->is_clone) {
+      switch (job->type) {
+      case V3DV_JOB_TYPE_GPU_CL:
+         job_destroy_gpu_cl_resources(job);
+         break;
+      case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
+         job_destroy_cpu_wait_events_resources(job);
+         break;
+      default:
+         break;
+      }
    }
 
    vk_free(&job->device->alloc, job);
@@ -237,7 +242,7 @@ static void
 cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
 {
    list_for_each_entry_safe(struct v3dv_job, job,
-                            &cmd_buffer->submit_jobs, list_link) {
+                            &cmd_buffer->jobs, list_link) {
       v3dv_job_destroy(job);
    }
 
@@ -317,6 +322,9 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
    const struct v3dv_physical_device *physical_device =
       &cmd_buffer->device->instance->physicalDevice;
 
+   if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+      return false;
+
    if (!cmd_buffer->state.job)
       return false;
 
@@ -547,6 +555,16 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
    v3dv_job_emit_binning_flush(cmd_buffer->state.job);
 }
 
+static void
+cmd_buffer_end_render_pass_secondary(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   assert(cmd_buffer->state.job);
+   v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
+                                    cl_packet_length(RETURN_FROM_SUB_LIST));
+   v3dv_return_if_oom(cmd_buffer, NULL);
+   cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
+}
+
 static struct v3dv_job *
 cmd_buffer_create_cpu_job(struct v3dv_device *device,
                           enum v3dv_job_type type,
@@ -581,7 +599,7 @@ cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
          v3dv_return_if_oom(cmd_buffer, NULL);
 
          job->cpu.query_end = state->query.end.states[i];
-         list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+         list_addtail(&job->list_link, &cmd_buffer->jobs);
       }
    }
 }
@@ -599,24 +617,46 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
       return;
    }
 
-   assert(v3dv_cl_offset(&job->bcl) != 0);
+   /* If we have created a job for a command buffer then we should have
+    * recorded something into it: if the job was started in a render pass, it
+    * should at least have the start frame commands, otherwise, it should have
+    * a transfer command. The only exceptions are secondary command buffers
+    * inside a render pass.
+    */
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+          v3dv_cl_offset(&job->bcl) > 0);
 
    /* When we merge multiple subpasses into the same job we must only emit one
     * RCL, so we do that here, when we decided that we need to finish the job.
     * Any rendering that happens outside a render pass is never merged, so
     * the RCL should have been emitted by the time we got here.
+    *
+    * Secondaries that execute inside a render pass don't emit their own RCL,
+    * they will instead be branched to from the primary command buffer under the
+    * primary's RCL.
     */
    assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
-   if (cmd_buffer->state.pass)
-      cmd_buffer_end_render_pass_frame(cmd_buffer);
+   if (cmd_buffer->state.pass) {
+      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+         cmd_buffer_end_render_pass_frame(cmd_buffer);
+      } else {
+         cmd_buffer_end_render_pass_secondary(cmd_buffer);
+      }
+   }
 
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
    cmd_buffer->state.job = NULL;
 
    /* If we have recorded any state with this last GPU job that requires to
-    * emit CPU jobs after the job is completed, add them now.
+    * emit CPU jobs after the job is completed, add them now. The only
+    * exception is secondary command buffers inside a render pass, because in
+    * that case we want to defer this until we finish recording the primary
+    * job into which we execute the secondary.
     */
-   cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
+       !cmd_buffer->state.pass) {
+      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
+   }
 }
 
 void
@@ -738,9 +778,6 @@ v3dv_AllocateCommandBuffers(VkDevice _device,
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
    V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool);
 
-   /* FIXME: implement secondary command buffers */
-   assert(pAllocateInfo->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
    VkResult result = VK_SUCCESS;
    uint32_t i;
 
@@ -796,6 +833,55 @@ v3dv_DestroyCommandPool(VkDevice _device,
    vk_free2(&device->alloc, pAllocator, pool);
 }
 
+static VkResult
+cmd_buffer_begin_render_pass_secondary(
+   struct v3dv_cmd_buffer *cmd_buffer,
+   const VkCommandBufferInheritanceInfo *inheritance_info)
+{
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+   assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
+   assert(inheritance_info);
+
+   cmd_buffer->state.pass =
+      v3dv_render_pass_from_handle(inheritance_info->renderPass);
+   assert(cmd_buffer->state.pass);
+
+   cmd_buffer->state.framebuffer =
+      v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
+
+   /* Secondaries that execute inside a render pass won't start subpasses
+    * so we want to create a job for them here.
+    */
+   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass);
+   if (!job) {
+      v3dv_flag_oom(cmd_buffer, NULL);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   /* Secondary command buffers don't know about the render area, but our
+    * scissor setup accounts for it, so let's make sure we make it large
+    * enough that it doesn't actually constrain any rendering. This should
+    * be fine, since the Vulkan spec states:
+    *
+    *    "The application must ensure (using scissor if necessary) that all
+    *     rendering is contained within the render area."
+    *
+    * FIXME: set up constants for the max framebuffer dimensions and use them
+    * here and when filling in VkPhysicalDeviceLimits.
+    */
+   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+   cmd_buffer->state.render_area.offset.x = 0;
+   cmd_buffer->state.render_area.offset.y = 0;
+   cmd_buffer->state.render_area.extent.width =
+      framebuffer ? framebuffer->width : 4096;
+   cmd_buffer->state.render_area.extent.height =
+      framebuffer ? framebuffer->height : 4096;
+
+   return VK_SUCCESS;
+}
+
 VkResult
 v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                         const VkCommandBufferBeginInfo *pBeginInfo)
@@ -814,6 +900,22 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
 
    cmd_buffer->usage_flags = pBeginInfo->flags;
 
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+         result =
+            cmd_buffer_begin_render_pass_secondary(cmd_buffer,
+                                                   pBeginInfo->pInheritanceInfo);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      /* If the primary may have an active occlusion query we need to honor
+       * that in the secondary.
+       */
+      if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable)
+         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+   }
+
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
 
    return VK_SUCCESS;
@@ -860,24 +962,32 @@ emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
    }
 }
 
-/* Checks whether the clip rectangle covers a region that is aligned to
- * tile boundaries, which means that for all tiles covered by the clip
+/* Checks whether the render area rectangle covers a region that is aligned to
+ * tile boundaries, which means that for all tiles covered by the render area
  * region, there are no uncovered pixels (unless they are also outside the
  * framebuffer).
  */
 static void
-cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer,
-                                 const VkRect2D *clip_rect,
-                                 const VkExtent2D *fb_extent)
+cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
 {
    /* Render areas and scissor/viewport are only relevant inside render passes,
     * otherwise we are dealing with transfer operations where these elements
     * don't apply.
     */
-   if (!cmd_buffer->state.pass) {
-      cmd_buffer->state.tile_aligned_render_area = true;
-      return;
-   }
+   assert(cmd_buffer->state.pass);
+   const VkRect2D *rect = &cmd_buffer->state.render_area;
+
+   /* We should only call this at the beginning of a subpass, which should
+    * always be started from a primary command buffer, so we should always
+    * have framebuffer information available.
+    */
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+   assert(cmd_buffer->state.framebuffer);
+
+   const VkExtent2D fb_extent = {
+      .width = cmd_buffer->state.framebuffer->width,
+      .height = cmd_buffer->state.framebuffer->height
+   };
 
    VkExtent2D granularity;
    v3dv_subpass_get_granularity(cmd_buffer->state.pass,
@@ -885,12 +995,18 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer,
                                 &granularity);
 
    cmd_buffer->state.tile_aligned_render_area =
-      clip_rect->offset.x % granularity.width == 0 &&
-      clip_rect->offset.y % granularity.height == 0 &&
-      (clip_rect->extent.width % granularity.width == 0 ||
-       clip_rect->offset.x + clip_rect->extent.width >= fb_extent->width) &&
-      (clip_rect->extent.height % granularity.height == 0 ||
-       clip_rect->offset.y + clip_rect->extent.height >= fb_extent->height);
+      rect->offset.x % granularity.width == 0 &&
+      rect->offset.y % granularity.height == 0 &&
+      (rect->extent.width % granularity.width == 0 ||
+       rect->offset.x + rect->extent.width >= fb_extent.width) &&
+      (rect->extent.height % granularity.height == 0 ||
+       rect->offset.y + rect->extent.height >= fb_extent.height);
+
+   if (!cmd_buffer->state.tile_aligned_render_area) {
+      perf_debug("Render area for subpass %d of render pass %p doesn't "
+                 "match render pass granularity.\n",
+                 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
+   }
 }
 
 void
@@ -1601,12 +1717,22 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
    struct v3dv_job *job = cmd_buffer->state.job;
    assert(job);
 
-   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
-
    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    const struct v3dv_framebuffer *framebuffer = state->framebuffer;
-   const uint32_t fb_layers = framebuffer->layers;
 
+   /* We can't emit the RCL until we have a framebuffer, which we may not have
+    * if we are recording a secondary command buffer. In that case, we will
+    * have to wait until vkCmdExecuteCommands is called from a primary command
+    * buffer.
+    */
+   if (!framebuffer) {
+      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+      return;
+   }
+
+   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
+
+   const uint32_t fb_layers = framebuffer->layers;
    v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
                                     MAX2(fb_layers, 1) * 256 *
                                     cl_packet_length(SUPERTILE_COORDINATES));
@@ -1748,6 +1874,8 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
 static void
 cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
 {
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
    assert(cmd_buffer->state.pass);
    assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
@@ -1795,8 +1923,8 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
    if (att_count == 0)
       return;
 
-   perf_debug("Render area doesn't match render pass granularity, falling "
-              "back to vkCmdClearAttachments for VK_ATTACHMENT_LOAD_OP_CLEAR");
+   perf_debug("Render area doesn't match render pass granularity, falling back "
+              "to vkCmdClearAttachments for VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
 
    /* From the Vulkan 1.0 spec:
     *
@@ -1863,6 +1991,8 @@ struct v3dv_job *
 v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
 {
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    assert(subpass_idx < state->pass->subpass_count);
 
@@ -1875,13 +2005,7 @@ v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
     * this in each subpass because the subset of attachments used can change
     * and with that the tile size selected by the hardware can change too.
     */
-   assert(state->framebuffer);
-   VkExtent2D fb_extent = {
-      .width = state->framebuffer->width,
-      .height = state->framebuffer->height
-   };
-   cmd_buffer_update_tile_alignment(cmd_buffer, &state->render_area, &fb_extent);
-
+   cmd_buffer_update_tile_alignment(cmd_buffer);
 
    /* If we can't use TLB clears then we need to emit draw clears for any
     * LOAD_OP_CLEAR attachments in this subpass now.
@@ -1896,6 +2020,8 @@ struct v3dv_job *
 v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
                                uint32_t subpass_idx)
 {
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    assert(subpass_idx < state->pass->subpass_count);
 
@@ -1927,17 +2053,13 @@ v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   /* Emit last subpass */
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+   /* Finalize last subpass */
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    assert(state->subpass_idx == state->pass->subpass_count - 1);
-
-   /* See v3dv_cmd_buffer_subpass_finish for why we can get here without an
-    * active job.
-    */
-   if (cmd_buffer->state.job) {
-      v3dv_cmd_buffer_subpass_finish(cmd_buffer);
-      v3dv_cmd_buffer_finish_job(cmd_buffer);
-   }
+   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
 
    /* We are no longer inside a render pass */
    state->framebuffer = NULL;
@@ -1953,22 +2075,194 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
    if (cmd_buffer->state.oom)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
-   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   if (job) {
-      /* We get here if we recorded commands after the last render pass in the
-       * command buffer. Make sure we finish this last job.
-       *
-       * FIXME: is this even possible?
-       */
-      assert(v3dv_cl_offset(&job->bcl) != 0);
+   /* Primaries should have ended any recording jobs by the time they hit
+    * vkCmdEndRenderPass (if we are inside a render pass). Commands outside
+    * a render pass instance (for both primaries and secondaries) spawn
+    * complete jobs too. So the only case where we can get here without
+    * finishing a recording job is when we are recording a secondary
+    * inside a render pass.
+    */
+   if (cmd_buffer->state.job) {
+      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
+             cmd_buffer->state.pass);
       v3dv_cmd_buffer_finish_job(cmd_buffer);
    }
 
+   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
+
    return VK_SUCCESS;
 }
 
+static void
+emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer);
+
+static void
+ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
+                   uint32_t slot_size,
+                   uint32_t used_count,
+                   uint32_t *alloc_count,
+                   void **ptr);
+
+static void
+cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
+                                          struct v3dv_cmd_buffer *secondary)
+{
+   struct v3dv_cmd_buffer_state *p_state = &primary->state;
+   struct v3dv_cmd_buffer_state *s_state = &secondary->state;
+
+   const uint32_t total_state_count =
+      p_state->query.end.used_count + s_state->query.end.used_count;
+   ensure_array_state(primary,
+                      sizeof(struct v3dv_end_query_cpu_job_info),
+                      total_state_count,
+                      &p_state->query.end.alloc_count,
+                      (void **) &p_state->query.end.states);
+   v3dv_return_if_oom(primary, NULL);
+
+   for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
+      const struct v3dv_end_query_cpu_job_info *s_qstate =
+         &secondary->state.query.end.states[i];
+
+      struct v3dv_end_query_cpu_job_info *p_qstate =
+         &p_state->query.end.states[p_state->query.end.used_count++];
+
+      p_qstate->pool = s_qstate->pool;
+      p_qstate->query = s_qstate->query;
+   }
+}
+
+static void
+cmd_buffer_execute_inside_pass(struct v3dv_cmd_buffer *primary,
+                               uint32_t cmd_buffer_count,
+                               const VkCommandBuffer *cmd_buffers)
+{
+   struct v3dv_job *primary_job = primary->state.job;
+   assert(primary_job);
+
+   if (primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
+      emit_occlusion_query(primary);
+
+   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
+
+      assert(secondary->usage_flags &
+             VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
+
+      /* Secondaries that run inside a render pass only record commands inside
+       * a subpass, so they don't create complete jobs (they don't have an RCL
+       * and their BCL doesn't include tiling setup). These are provided by
+       * the primary command buffer instead, so we just want to branch to the
+       * BCL commands recorded in the secondary from the primary's BCL.
+       *
+       * Because of this, these secondary command buffers should have exactly
+       * one job (the default), with no RCL commands.
+       */
+      assert(list_length(&secondary->jobs) == 1);
+      list_for_each_entry(struct v3dv_job, secondary_job,
+                          &secondary->jobs, list_link) {
+         assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
+         assert(secondary_job->bcl.bo);
+
+         set_foreach(secondary_job->bos, entry) {
+            struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+            v3dv_job_add_bo(primary_job, bo);
+         }
+
+         /* Skip branch if command buffer is empty */
+         if (v3dv_cl_offset(&secondary_job->bcl) == 0)
+            continue;
+
+         v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
+                                          cl_packet_length(BRANCH_TO_SUB_LIST));
+         v3dv_return_if_oom(primary, NULL);
+
+         cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
+            branch.address = v3dv_cl_address(secondary_job->bcl.bo, 0);
+         }
+      }
+
+      /* If the secondary has recorded any vkCmdEndQuery commands, we need to
+       * copy this state to the primary so it is processed properly when the
+       * current primary job is finished.
+       */
+      cmd_buffer_copy_secondary_end_query_state(primary, secondary);
+   }
+}
+
+/* Clones a job for inclusion in the given command buffer. Note that this
+ * doesn't make a deep copy, so the cloned job doesn't own any resources.
+ * Useful when we need to have a job in more than one list, which happens
+ * for jobs recorded in secondary command buffers when we want to execute
+ * them in primaries.
+ */
+static struct v3dv_job *
+job_clone(struct v3dv_job *job, struct v3dv_cmd_buffer *cmd_buffer)
+{
+   struct v3dv_job *clone_job = vk_alloc(&job->device->alloc,
+                                         sizeof(struct v3dv_job), 8,
+                                         VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   if (!clone_job)
+      return NULL;
+
+   *clone_job = *job;
+   clone_job->is_clone = true;
+   clone_job->cmd_buffer = cmd_buffer;
+   return clone_job;
+}
+
+static void
+cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
+                                uint32_t cmd_buffer_count,
+                                const VkCommandBuffer *cmd_buffers)
+{
+   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
+
+      assert(!(secondary->usage_flags &
+               VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
+
+      /* Secondary command buffers that execute outside a render pass create
+       * complete jobs with an RCL and tile setup, so we simply want to merge
+       * their job list into the primary's. However, because they may be
+       * executed into multiple primaries at the same time and we only have a
+       * single list_link in each job, we can't just add them to the primary's
+       * job list and we instead have to clone them first.
+       *
+       * Alternatively, we could create an "execute secondary" CPU job that
+       * when executed in a queue, would submit all the jobs in the referenced
+       * secondary command buffer. However, this would raise some challenges
+       * to make it work with the implementation of wait threads in the queue
+       * which we use for event waits, for example.
+       */
+      list_for_each_entry(struct v3dv_job, secondary_job,
+                          &secondary->jobs, list_link) {
+         struct v3dv_job *clone_job = job_clone(secondary_job, primary);
+         if (!clone_job) {
+            v3dv_flag_oom(primary, NULL);
+            return;
+         }
+
+         list_addtail(&clone_job->list_link, &primary->jobs);
+      }
+   }
+}
+
+void
+v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
+                        uint32_t commandBufferCount,
+                        const VkCommandBuffer *pCommandBuffers)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
+
+   if (primary->state.pass != NULL) {
+      cmd_buffer_execute_inside_pass(primary,
+                                     commandBufferCount, pCommandBuffers);
+   } else {
+      cmd_buffer_execute_outside_pass(primary,
+                                      commandBufferCount, pCommandBuffers);
+   }
+}
+
 /* This goes though the list of possible dynamic states in the pipeline and,
  * for those that are not configured as dynamic, copies relevant state into
  * the command buffer.
@@ -2451,6 +2745,11 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
    maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
                         cmd_buffer->state.render_area.extent.height);
 
+   minx = vp_minx;
+   miny = vp_miny;
+   maxx = vp_maxx;
+   maxy = vp_maxy;
+
    /* Clip against user provided scissor if needed.
     *
     * FIXME: right now we only allow one scissor. Below would need to be
@@ -3683,7 +3982,7 @@ v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
    job->cpu.query_reset.first = first;
    job->cpu.query_reset.count = count;
 
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
 static void
@@ -3763,7 +4062,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
 
       job->cpu.query_end.pool = pool;
       job->cpu.query_end.query = query;
-      list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+      list_addtail(&job->list_link, &cmd_buffer->jobs);
    }
 
    cmd_buffer->state.query.active_query = NULL;
@@ -3803,7 +4102,7 @@ v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
    job->cpu.query_copy_results.stride = stride;
    job->cpu.query_copy_results.flags = flags;
 
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
 void
@@ -3821,7 +4120,7 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
 
    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);
    job->tfu = *tfu;
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
 void
@@ -3847,7 +4146,7 @@ v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
    job->cpu.event_set.event = event;
    job->cpu.event_set.state = 1;
 
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
 void
@@ -3873,7 +4172,7 @@ v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
    job->cpu.event_set.event = event;
    job->cpu.event_set.state = 0;
 
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
 void
@@ -3922,5 +4221,5 @@ v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
    for (uint32_t i = 0; i < eventCount; i++)
       job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);
 
-   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index d0545aaa8fb..4e17f37beb9 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -622,7 +622,7 @@ v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice,
       .sparseResidency16Samples = false,
       .sparseResidencyAliased = false,
       .variableMultisampleRate = false,
-      .inheritedQueries = false,
+      .inheritedQueries = true,
    };
 }
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 7e5967915fe..b8ed1ae74e4 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -698,6 +698,12 @@ struct v3dv_event_wait_cpu_job_info {
 struct v3dv_job {
    struct list_head list_link;
 
+   /* We only create job clones when executing secondary command buffers into
+    * primaries. These clones don't make deep copies of the original object
+    * so we want to flag them to avoid freeing resources they don't own.
+    */
+   bool is_clone;
+
    enum v3dv_job_type type;
 
    struct v3dv_device *device;
@@ -955,8 +961,12 @@ struct v3dv_cmd_buffer {
       } blit;
    } meta;
 
-   /* List of jobs to submit to the kernel */
-   struct list_head submit_jobs;
+   /* List of jobs in the command buffer. For primary command buffers it
+    * represents the jobs we want to submit to the GPU. For secondary command
+    * buffers it represents jobs that will be merged into a primary command
+    * buffer via vkCmdExecuteCommands.
+    */
+   struct list_head jobs;
 };
 
 struct v3dv_job *v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 8526400dfdf..f77f9f17e4f 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -320,7 +320,7 @@ event_wait_thread_func(void *_job)
     */
    struct v3dv_queue *queue = &job->device->queue;
    list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next,
-                            &job->cmd_buffer->submit_jobs, list_link) {
+                            &job->cmd_buffer->jobs, list_link) {
       /* We don't want to spawn more than one wait thread per command buffer.
        * If this job also requires a wait for events, we will do the wait here.
        */
@@ -695,11 +695,11 @@ queue_submit_cmd_buffer(struct v3dv_queue *queue,
    assert(cmd_buffer);
    assert(cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
 
-   if (list_is_empty(&cmd_buffer->submit_jobs))
+   if (list_is_empty(&cmd_buffer->jobs))
       return queue_submit_noop_job(queue, pSubmit);
 
    list_for_each_entry_safe(struct v3dv_job, job,
-                            &cmd_buffer->submit_jobs, list_link) {
+                            &cmd_buffer->jobs, list_link) {
       VkResult result = queue_submit_job(queue, job,
                                          pSubmit->waitSemaphoreCount > 0,
                                          wait_thread);