From d2d94b62f2a4f8686c17b7c33ae02aa2b2029a27 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 19 May 2022 14:12:08 +0100 Subject: [PATCH] aco: initialize scratch base registers on GFX9-GFX10.3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (navi21): Totals from 1142 (0.70% of 162293) affected shaders: Instrs: 271636 -> 271974 (+0.12%) CodeSize: 1532020 -> 1533792 (+0.12%) Latency: 7484066 -> 7485698 (+0.02%) InvThroughput: 4048824 -> 4049579 (+0.02%) SClause: 4171 -> 4212 (+0.98%) PreSGPRs: 11203 -> 12276 (+9.58%) fossil-db (vega10): Totals from 3327 (2.06% of 161355) affected shaders: Instrs: 257413 -> 257601 (+0.07%) CodeSize: 1424244 -> 1425372 (+0.08%) Latency: 8598402 -> 8600466 (+0.02%) InvThroughput: 7906335 -> 7908234 (+0.02%) SClause: 4932 -> 4973 (+0.83%) PreSGPRs: 22010 -> 25405 (+15.42%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_dead_code_analysis.cpp | 3 +- src/amd/compiler/aco_insert_exec_mask.cpp | 8 +++- .../compiler/aco_instruction_selection.cpp | 11 ++++- src/amd/compiler/aco_ir.cpp | 3 +- src/amd/compiler/aco_ir.h | 3 +- src/amd/compiler/aco_live_var_analysis.cpp | 8 ++-- src/amd/compiler/aco_lower_to_hw_instr.cpp | 41 +++++++++++++++++++ src/amd/compiler/aco_opcodes.py | 2 + src/amd/compiler/aco_scheduler.cpp | 3 +- 9 files changed, 72 insertions(+), 10 deletions(-) diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp index 3d565f0c141..d558b688030 100644 --- a/src/amd/compiler/aco_dead_code_analysis.cpp +++ b/src/amd/compiler/aco_dead_code_analysis.cpp @@ -83,7 +83,8 @@ process_block(dce_ctx& ctx, Block& block) bool is_dead(const std::vector& uses, Instruction* instr) { - if (instr->definitions.empty() || instr->isBranch()) + if (instr->definitions.empty() || instr->isBranch() || + instr->opcode == aco_opcode::p_init_scratch) return false; if (std::any_of(instr->definitions.begin(), instr->definitions.end(), [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; })) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index e1dd3929910..c96ee88c92e 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -249,6 +249,12 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector> assert(startpgm->opcode == aco_opcode::p_startpgm); bld.insert(std::move(startpgm)); + unsigned count = 1; + if (block->instructions[1]->opcode == aco_opcode::p_init_scratch) { + bld.insert(std::move(block->instructions[1])); + count++; + } + Operand start_exec(bld.lm); /* exec seems to need to be manually initialized with combined shaders */ @@ -274,7 +280,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector> ctx.info[0].exec.emplace_back(start_exec, mask); } - return 1; + return count; } /* loop entry block */ diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 876c15bb643..4980c040bf2 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11207,9 +11207,16 @@ add_startpgm(struct isel_context* ctx) * handling spilling. */ ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); - if (ctx->args->ac.scratch_offset.used) { - /* FIXME: Fix scratch loads/stores on GFX11. */ + if (ctx->program->gfx_level <= GFX10_3) { ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset); + + if (ctx->program->gfx_level >= GFX9) { + Operand scratch_offset(ctx->program->scratch_offset); + scratch_offset.setLateKill(true); + Builder bld(ctx->program, ctx->block); + bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), + ctx->program->private_segment_buffer, scratch_offset); + } } if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) { diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 5d325f863a2..b74af9417d8 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -592,7 +592,8 @@ needs_exec_mask(const Instruction* instr) case aco_opcode::p_end_linear_vgpr: case aco_opcode::p_logical_start: case aco_opcode::p_logical_end: - case aco_opcode::p_startpgm: return instr->reads_exec(); + case aco_opcode::p_startpgm: + case aco_opcode::p_init_scratch: return instr->reads_exec(); default: break; } } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index bc989875741..f2721aa0d95 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -451,6 +451,8 @@ struct PhysReg { /* helper expressions for special registers */ static constexpr PhysReg m0{124}; +static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */ +static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */ static constexpr PhysReg vcc{106}; static constexpr PhysReg vcc_hi{107}; static constexpr PhysReg tba{108}; /* GFX6-GFX8 */ @@ -2104,7 +2106,6 @@ public: bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */ bool needs_vcc = false; - bool needs_flat_scr = false; CompilationProgress progress; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 0449fa1cdff..0d6274d6b82 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -293,12 +293,14 @@ calc_waves_per_workgroup(Program* program) uint16_t get_extra_sgprs(Program* program) { + /* We don't use this register on GFX6-8 and it's removed on GFX10+. */ + bool needs_flat_scr = program->config->scratch_bytes_per_wave && program->gfx_level == GFX9; + if (program->gfx_level >= GFX10) { - assert(!program->needs_flat_scr); assert(!program->dev.xnack_enabled); return 0; } else if (program->gfx_level >= GFX8) { - if (program->needs_flat_scr) + if (needs_flat_scr) return 6; else if (program->dev.xnack_enabled) return 4; @@ -308,7 +310,7 @@ get_extra_sgprs(Program* program) return 0; } else { assert(!program->dev.xnack_enabled); - if (program->needs_flat_scr) + if (needs_flat_scr) return 4; else if (program->needs_vcc) return 2; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 20069adc1c2..1920dfbe6b9 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2327,6 +2327,47 @@ lower_to_hw_instr(Program* program) } break; } + case aco_opcode::p_init_scratch: { + assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3); + if (!program->config->scratch_bytes_per_wave) + break; + + Operand scratch_addr = instr->operands[0]; + Operand scratch_addr_lo(scratch_addr.physReg(), s1); + if (program->stage != compute_cs) { + bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr, + Operand::zero()); + scratch_addr_lo.setFixed(instr->definitions[0].physReg()); + } + Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1); + + /* Since we know what the high 16 bits of scratch_hi is, we can set all the high 16 + * bits in the same instruction that we add the carry. + */ + uint32_t hi_add = 0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1); + + if (program->gfx_level >= GFX10) { + Operand scratch_lo(instr->definitions[0].physReg(), s1); + Operand scratch_hi(instr->definitions[0].physReg().advance(4), s1); + + bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo.physReg(), s1), + Definition(scc, s1), scratch_addr_lo, instr->operands[1]); + bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi.physReg(), s1), + Definition(scc, s1), scratch_addr_hi, Operand::c32(hi_add), + Operand(scc, s1)); + + /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register + * 20/21) */ + bld.sopk(aco_opcode::s_setreg_b32, scratch_lo, (31 << 11) | 20); + bld.sopk(aco_opcode::s_setreg_b32, scratch_hi, (31 << 11) | 21); + } else { + bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1), + scratch_addr_lo, instr->operands[1]); + bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1), + scratch_addr_hi, Operand::c32(hi_add), Operand(scc, s1)); + } + break; + } default: break; } } else if (instr->isBranch()) { diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 64a4b398c95..820e09b1989 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -318,6 +318,8 @@ opcode("p_extract") # src1=index, src2=bits, src3=signext # (src0 & ((1 << bits) - 1)) << (index * bits) opcode("p_insert") # src1=index, src2=bits +opcode("p_init_scratch") + # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 3aeb30c5ccf..0edd7862b6d 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -573,7 +573,8 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) /* don't move non-reorderable instructions */ if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || - instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32) + instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 || + instr->opcode == aco_opcode::p_init_scratch) return hazard_fail_unreorderable; memory_event_set instr_set;