From 296b4d95a3e35b2ab62ebcbd7e3a578fcbdd5b26 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 25 Aug 2022 12:16:39 +0100 Subject: [PATCH] aco/gfx11: workaround LdsDirectVALUHazard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (gfx1100): Totals from 57858 (42.85% of 135032) affected shaders: Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/README-ISA.md | 8 ++ src/amd/compiler/aco_insert_NOPs.cpp | 97 +++++++++++++++ src/amd/compiler/aco_opcodes.py | 2 +- src/amd/compiler/tests/test_assembler.cpp | 8 +- src/amd/compiler/tests/test_insert_nops.cpp | 126 ++++++++++++++++++++ 5 files changed, 236 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index 040d2815353..48f0924cc7c 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -294,3 +294,11 @@ stability issues: https://reviews.llvm.org/D103348 ### VcmpxPermlaneHazard Same as GFX10. + +### LdsDirectVALUHazard + +Triggered by: +LDSDIR instruction writing a VGPR soon after it's read or written by a VALU instruction. + +Mitigated by: +A vdst wait, preferably using the LDSDIR instruction's own wait_vdst field. 
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 02308055fe3..66a35900945 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -865,6 +866,96 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& } } +/* GFX11 */ +unsigned +parse_vdst_wait(aco_ptr& instr) +{ + if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) + return 0; + else if (instr->isLDSDIR()) + return instr->ldsdir().wait_vdst; + else if (instr->opcode == aco_opcode::s_waitcnt_depctr) + return (instr->sopp().imm >> 12) & 0xf; + else + return 15; +} + +struct LdsDirectVALUHazardGlobalState { + unsigned wait_vdst = 15; + PhysReg vgpr; + std::set loop_headers_visited; +}; + +struct LdsDirectVALUHazardBlockState { + unsigned num_valu = 0; + bool has_trans = false; +}; + +bool +handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state, + LdsDirectVALUHazardBlockState& block_state, + aco_ptr& instr) +{ + if (instr->isVALU() || instr->isVINTERP_INREG()) { + instr_class cls = instr_info.classes[(int)instr->opcode]; + block_state.has_trans |= cls == instr_class::valu_transcendental32 || + cls == instr_class::valu_double_transcendental; + + bool uses_vgpr = false; + for (Definition& def : instr->definitions) + uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1); + for (Operand& op : instr->operands) { + uses_vgpr |= + !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1); + } + if (uses_vgpr) { + /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */ + global_state.wait_vdst = + MIN2(global_state.wait_vdst, block_state.has_trans ? 
0 : block_state.num_valu); + return true; + } + + block_state.num_valu++; + } + + if (parse_vdst_wait(instr) == 0) + return true; + + return block_state.num_valu >= global_state.wait_vdst; +} + +bool +handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state, + LdsDirectVALUHazardBlockState& block_state, Block* block) +{ + if (block->kind & block_kind_loop_header) { + if (global_state.loop_headers_visited.count(block->index)) + return false; + global_state.loop_headers_visited.insert(block->index); + } + + return true; +} + +unsigned +handle_lds_direct_valu_hazard(State& state, aco_ptr& instr) +{ + /* LdsDirectVALUHazard + * Handle LDSDIR writing a VGPR after it's used by a VALU instruction. + */ + if (instr->ldsdir().wait_vdst == 0) + return 0; /* early exit */ + + LdsDirectVALUHazardGlobalState global_state; + global_state.wait_vdst = instr->ldsdir().wait_vdst; + global_state.vgpr = instr->definitions[0].physReg(); + LdsDirectVALUHazardBlockState block_state; + search_backwards( + state, global_state, block_state); + return global_state.wait_vdst; +} + void handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& instr, std::vector>& new_instructions) @@ -886,6 +977,12 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) { ctx.has_Vcmpx = false; } + + if (instr->isLDSDIR()) { + unsigned count = handle_lds_direct_valu_hazard(state, instr); + LDSDIR_instruction* ldsdir = &instr->ldsdir(); + ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count); + } } template diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 098733c688a..52a52afb674 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -99,7 +99,7 @@ class Format(Enum): return [('uint8_t', 'attr', 0), ('uint8_t', 'attr_chan', 0), ('memory_sync_info', 'sync', 'memory_sync_info()'), - ('uint8_t', 'wait_vdst', 0)] + ('uint8_t', 
'wait_vdst', 15)] elif self == Format.MTBUF: return [('unsigned', 'dfmt', None), ('unsigned', 'nfmt', None), diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index ad1f5b4920c..eee7f658af6 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -791,19 +791,19 @@ BEGIN_TEST(assembler.gfx11.ldsdir) bld.ldsdir(aco_opcode::lds_direct_load, dst, op).instr->ldsdir().wait_vdst = 6; //! lds_direct_load v42 ; ce10002a - bld.ldsdir(aco_opcode::lds_direct_load, dst, op); + bld.ldsdir(aco_opcode::lds_direct_load, dst, op).instr->ldsdir().wait_vdst = 0; //! lds_param_load v42, attr56.x wait_vdst:8 ; ce08e02a bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0).instr->ldsdir().wait_vdst = 8; //! lds_param_load v42, attr56.x ; ce00e02a - bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0); + bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0).instr->ldsdir().wait_vdst = 0; //! lds_param_load v42, attr34.y ; ce00892a - bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1); + bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1).instr->ldsdir().wait_vdst = 0; //! lds_param_load v42, attr12.z ; ce00322a - bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2); + bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2).instr->ldsdir().wait_vdst = 0; finish_assembler_test(); END_TEST diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index de8d8de8e2e..8bec022f7ee 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -306,3 +306,129 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) finish_insert_nops_test(); END_TEST + +BEGIN_TEST(insert_nops.lds_direct_valu) + if (!setup_cs(NULL, GFX11)) + return; + + /* WaW */ + //>> p_unit_test 0 + //! v1: %0:v[0] = v_mov_b32 0 + //! 
v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* WaR */ + //! p_unit_test 1 + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* No hazard. */ + //! p_unit_test 2 + //! v1: %0:v[1] = v_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* multiple hazards, nearest should be considered */ + //! p_unit_test 3 + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + //! v1: %0:v[0] = v_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* independent VALU instructions increase wait_vdst */ + //! p_unit_test 4 + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 5 + //! 
v1: %0:v[0] = v_mov_b32 0 + //; for i in range(10): insert_pattern('v_nop') + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + for (unsigned i = 0; i < 10; i++) + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 6 + //! v1: %0:v[0] = v_mov_b32 0 + //; for i in range(20): insert_pattern('v_nop') + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + for (unsigned i = 0; i < 20; i++) + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* transcendental requires wait_vdst=0 */ + //! p_unit_test 7 + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.vop1(aco_opcode::v_nop); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 8 + //! v1: %0:v[0] = v_sqrt_f32 %0:v[0] + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* transcendental is fine if it's before the instruction */ + //! p_unit_test 9 + //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! 
v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* non-VALU does not increase wait_vdst */ + //! p_unit_test 10 + //! v1: %0:v[0] = v_mov_b32 0 + //! s1: %0:m0 = s_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* consider instructions which wait on vdst */ + //! p_unit_test 11 + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! s_waitcnt_depctr va_vdst(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.vop1(aco_opcode::v_nop); + bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + finish_insert_nops_test(); +END_TEST