brw: Use correct builder size for MEMORY_FENCE/INTERLOCK virtual opcodes

brw_memory_fence() overrides the instructions generated by the MEMORY_FENCE or INTERLOCK opcodes to be force_writemask_all with exec_size == 1. But the IR was emitting it in SIMD8 (regardless of dispatch width).

Instead, just emit the IR as SIMD1/NoMask so the IR matches what we actually generate. Have size_written indicate that the entire destination is written, however, as it is ultimately going to be a SEND that writes a whole register.

We were also using a UD register for the source of FS_OPCODE_SCHEDULING_FENCE when the generator overrides it to UW, so just specify UW in the IR as well so that they line up.

Also add validation for MEMORY_FENCE/INTERLOCK that we've done the exec_size and masking right in the IR.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33297>
2025-01-17 22:56:24 -08:00
parent accef5e8f5
commit c0a32af125
4 changed files with 24 additions and 13 deletions
@@ -295,6 +295,7 @@ brw_emit_urb_fence(fs_visitor &s)
                              brw_vec8_grf(0, 0),
                              brw_imm_ud(true),
                              brw_imm_ud(0));
+   fence->size_written = REG_SIZE * reg_unit(s.devinfo);
   fence->sfid = BRW_SFID_URB;
   /* The logical thing here would likely be a THREADGROUP fence but that's
    * still failing some tests like in dEQP-VK.mesh_shader.ext.query.*
@@ -4969,6 +4969,8 @@ emit_fence(const brw_builder &bld, enum opcode opcode,
           uint8_t sfid, uint32_t desc,
           bool commit_enable, uint8_t bti)
 {
+   const struct intel_device_info *devinfo = bld.shader->devinfo;
+
   assert(opcode == SHADER_OPCODE_INTERLOCK ||
          opcode == SHADER_OPCODE_MEMORY_FENCE);

@@ -4978,6 +4980,7 @@ emit_fence(const brw_builder &bld, enum opcode opcode,
                             brw_imm_ud(bti));
   fence->sfid = sfid;
   fence->desc = desc;
+   fence->size_written = commit_enable ? REG_SIZE * reg_unit(devinfo) : 0;

   return dst;
 }
@@ -5938,7 +5941,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
      unsigned fence_regs_count = 0;
      brw_reg fence_regs[4] = {};

-      const brw_builder ubld = bld.group(8, 0);
+      const brw_builder ubld1 = bld.exec_all().group(1, 0);

      /* A memory barrier with acquire semantics requires us to
       * guarantee that memory operations of the specified storage
@@ -5980,7 +5983,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
      if (devinfo->ver >= 12 &&
          (!nir_intrinsic_has_memory_scope(instr) ||
           (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
-         ubld.exec_all().group(1, 0).SYNC(TGL_SYNC_ALLWR);
+         ubld1.SYNC(TGL_SYNC_ALLWR);
      }

      if (devinfo->has_lsc) {
@@ -5989,14 +5992,14 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
            lsc_fence_descriptor_for_intrinsic(devinfo, instr);
         if (ugm_fence) {
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
+               emit_fence(ubld1, opcode, GFX12_SFID_UGM, desc,
                          true /* commit_enable */,
                          0 /* bti; ignored for LSC */);
         }

         if (tgm_fence) {
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
+               emit_fence(ubld1, opcode, GFX12_SFID_TGM, desc,
                          true /* commit_enable */,
                          0 /* bti; ignored for LSC */);
         }
@@ -6009,10 +6012,10 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
                * Before SLM fence compiler needs to insert SYNC.ALLWR in order
                * to avoid the SLM data race.
                */
-               ubld.exec_all().group(1, 0).SYNC(TGL_SYNC_ALLWR);
+               ubld1.SYNC(TGL_SYNC_ALLWR);
            }
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
+               emit_fence(ubld1, opcode, GFX12_SFID_SLM, desc,
                          true /* commit_enable */,
                          0 /* BTI; ignored for LSC */);
         }
@@ -6020,14 +6023,14 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
         if (urb_fence) {
            assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, BRW_SFID_URB, desc,
+               emit_fence(ubld1, opcode, BRW_SFID_URB, desc,
                          true /* commit_enable */,
                          0 /* BTI; ignored for LSC */);
         }
      } else if (devinfo->ver >= 11) {
         if (tgm_fence || ugm_fence || urb_fence) {
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
+               emit_fence(ubld1, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
                          true /* commit_enable HSD ES # 1404612949 */,
                          0 /* BTI = 0 means data cache */);
         }
@@ -6035,7 +6038,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
         if (slm_fence) {
            assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
+               emit_fence(ubld1, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
                          true /* commit_enable HSD ES # 1404612949 */,
                          GFX7_BTI_SLM);
         }
@@ -6048,7 +6051,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,

         if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
            fence_regs[fence_regs_count++] =
-               emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
+               emit_fence(ubld1, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
                          commit_enable, 0 /* BTI */);
         }
      }
@@ -6085,9 +6088,9 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       */
      if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
          fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
-         ubld.exec_all().group(1, 0).emit(
-            FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
-            fence_regs, fence_regs_count);
+         ubld1.emit(FS_OPCODE_SCHEDULING_FENCE,
+                    retype(brw_null_reg(), BRW_TYPE_UW),
+                    fence_regs, fence_regs_count);
      }

      break;
@@ -314,6 +314,12 @@ brw_validate(const fs_visitor &s)
            validate_memory_logical(s, inst);
            break;

+         case SHADER_OPCODE_MEMORY_FENCE:
+         case SHADER_OPCODE_INTERLOCK:
+            fsv_assert(inst->exec_size == 1);
+            fsv_assert(inst->force_writemask_all);
+            break;
+
         default:
            break;
         }
@@ -110,6 +110,7 @@ brw_workaround_memory_fence_before_eot(fs_visitor &s)
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
+      dummy_fence->size_written = REG_SIZE * reg_unit(s.devinfo);
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
      progress = true;
      /* TODO: remove this break if we ever have shader with multiple EOT. */