diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c index 4980ccbbde3..5e0ad96adf8 100644 --- a/src/intel/compiler/brw_disasm.c +++ b/src/intel/compiler/brw_disasm.c @@ -679,6 +679,7 @@ static const char* const lsc_flush_type[] = { [LSC_FLUSH_TYPE_DISCARD] = "discard", [LSC_FLUSH_TYPE_CLEAN] = "clean", [LSC_FLUSH_TYPE_L3ONLY] = "l3only", + [LSC_FLUSH_TYPE_NONE_6] = "none_6", }; static const char* const lsc_addr_size[] = { diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index 1c1301e9403..17fde1e2d1e 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -1781,6 +1781,7 @@ brw_memory_fence(struct brw_codegen *p, struct brw_reg src, enum opcode send_op, enum brw_message_target sfid, + uint32_t desc, bool commit_enable, unsigned bti); diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index addf63e870d..4e3b126dc47 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -1991,6 +1991,11 @@ enum PACKED lsc_flush_type { * Flush "RW" section of the L3 cache, but leave L1 and L2 caches untouched. */ LSC_FLUSH_TYPE_L3ONLY = 5, + /* + * HW maps this flush type internally to NONE. + */ + LSC_FLUSH_TYPE_NONE_6 = 6, + }; enum PACKED lsc_backup_fence_routing { diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 32cfae00093..09207d9f806 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -3256,7 +3256,8 @@ brw_set_memory_fence_message(struct brw_codegen *p, static void gfx12_set_memory_fence_message(struct brw_codegen *p, struct brw_inst *insn, - enum brw_message_target sfid) + enum brw_message_target sfid, + uint32_t desc) { const unsigned mlen = 1; /* g0 header */ /* Completion signaled by write to register. No data returned. 
*/ @@ -3268,8 +3269,8 @@ gfx12_set_memory_fence_message(struct brw_codegen *p, brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) | brw_message_desc(p->devinfo, mlen, rlen, false)); } else { - enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP; - enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE; + enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc); + enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc); if (sfid == GFX12_SFID_TGM) { scope = LSC_FENCE_TILE; @@ -3288,6 +3289,7 @@ brw_memory_fence(struct brw_codegen *p, struct brw_reg src, enum opcode send_op, enum brw_message_target sfid, + uint32_t desc, bool commit_enable, unsigned bti) { @@ -3307,7 +3309,7 @@ brw_memory_fence(struct brw_codegen *p, /* All DG2 hardware requires LSC for fence messages, even A-step */ if (devinfo->has_lsc) - gfx12_set_memory_fence_message(p, insn, sfid); + gfx12_set_memory_fence_message(p, insn, sfid, desc); else brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti); } diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0cbc6b6016c..225df2d04e6 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -8618,6 +8618,75 @@ fs_visitor::fixup_3src_null_dest() DEPENDENCY_VARIABLES); } +static bool +needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) +{ + /* This workaround is about making sure that any instruction writing + * through UGM has completed before we hit EOT. + * + * The workaround talks about UGM writes or atomic messages, but what is + * important is anything that hasn't completed. Usually any SEND + * instruction that has a destination register will be read by something + * else so we don't need to care about those as they will be synchronized + * by other parts of the shader or optimized away. What is left are + * instructions that don't have a destination register. 
+ */ + if (inst->sfid != GFX12_SFID_UGM) + return false; + + return inst->dst.file == BAD_FILE; +} + +/* Wa_22013689345 + * + * We need to emit a UGM fence message before EOT if the shader has any UGM + * write or atomic message. + * + * TODO/FINISHME: According to Curro we could avoid the fence in some cases. + * We probably need better criteria in needs_dummy_fence(). + */ +void +fs_visitor::emit_dummy_memory_fence_before_eot() +{ + bool progress = false; + bool has_ugm_write_or_atomic = false; + + if (!intel_device_info_is_dg2(devinfo)) + return; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (!inst->eot) { + if (needs_dummy_fence(devinfo, inst)) + has_ugm_write_or_atomic = true; + continue; + } + + if (!has_ugm_write_or_atomic) + break; + + const fs_builder ibld(this, block, inst); + const fs_builder ubld = ibld.exec_all().group(1, 0); + + fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE, + dst, brw_vec8_grf(0, 0), + /* commit enable */ brw_imm_ud(1), + /* bti */ brw_imm_ud(0)); + dummy_fence->sfid = GFX12_SFID_UGM; + dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE, + LSC_FLUSH_TYPE_NONE_6, false); + ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst); + progress = true; + /* TODO: remove this break if a shader can ever have multiple EOTs. */ + break; + } + + if (progress) { + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | + DEPENDENCY_VARIABLES); + } +} + /** * Find the first instruction in the program that might start a region of * divergent control flow due to a HALT jump. 
There is no @@ -8927,6 +8996,7 @@ fs_visitor::run_vs() assign_vs_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9049,6 +9119,7 @@ fs_visitor::run_tcs() assign_tcs_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9077,6 +9148,7 @@ fs_visitor::run_tes() assign_tes_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9120,6 +9192,7 @@ fs_visitor::run_gs() assign_gs_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9220,6 +9293,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) assign_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); } @@ -9255,6 +9329,7 @@ fs_visitor::run_cs(bool allow_spilling) assign_curb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); return !failed; @@ -9283,6 +9358,7 @@ fs_visitor::run_bs(bool allow_spilling) assign_curb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); return !failed; @@ -9327,6 +9403,7 @@ fs_visitor::run_task(bool allow_spilling) assign_curb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); return !failed; @@ -9371,6 +9448,7 @@ fs_visitor::run_mesh(bool allow_spilling) assign_curb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); return !failed; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index f4107e18321..3f6489a88cd 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -136,6 +136,7 @@ public: void 
setup_cs_payload(); bool fixup_sends_duplicate_payload(); void fixup_3src_null_dest(); + void emit_dummy_memory_fence_before_eot(); bool fixup_nomask_control_flow(); void assign_curb_setup(); void assign_urb_setup(); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 0af40c739e5..c7a4aaf2150 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2382,6 +2382,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_memory_fence(p, dst, src[0], send_op, brw_message_target(inst->sfid), + inst->desc, /* commit_enable */ src[1].ud, /* bti */ src[2].ud); send_count++; diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp index bf548265099..d51a8ba8e1c 100644 --- a/src/intel/compiler/brw_vec4_generator.cpp +++ b/src/intel/compiler/brw_vec4_generator.cpp @@ -1926,6 +1926,7 @@ generate_code(struct brw_codegen *p, case SHADER_OPCODE_MEMORY_FENCE: brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, brw_message_target(inst->sfid), + inst->desc, /* commit_enable */ false, /* bti */ 0); send_count++;