aco: only break SMEM clauses if XNACK is enabled (mostly APUs)
According to LLVM, it seems only required for APUs like RAVEN, but we still ensure that SMEM stores are in their own clause. pipeline-db (VEGA10): Totals from affected shaders: SGPRS: 1775364 -> 1775364 (0.00 %) VGPRS: 1287176 -> 1287176 (0.00 %) Spilled SGPRs: 725 -> 725 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Code Size: 65386620 -> 65107460 (-0.43 %) bytes Max Waves: 287099 -> 287099 (0.00 %) pipeline-db (POLARIS10): Totals from affected shaders: SGPRS: 1797743 -> 1797743 (0.00 %) VGPRS: 1271108 -> 1271108 (0.00 %) Spilled SGPRs: 730 -> 730 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Code Size: 64046244 -> 63782324 (-0.41 %) bytes Max Waves: 254875 -> 254875 (0.00 %) This only affects GFX6-GFX9 chips because the compiler uses a different pass for GFX10. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4349> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4349>
This commit is contained in:
committed by
Marge Bot
parent
68f325b256
commit
2f424c83e0
@@ -274,6 +274,41 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
|
||||
}
|
||||
}
|
||||
|
||||
/* A SMEM clause is any group of consecutive SMEM instructions. The
|
||||
* instructions in this group may return out of order and/or may be replayed.
|
||||
*
|
||||
* To fix this potential hazard correctly, we have to make sure that when a
|
||||
* clause has more than one instruction, no instruction in the clause writes
|
||||
* to a register that is read by another instruction in the clause (including
|
||||
* itself). In this case, we have to break the SMEM clause by inserting non
|
||||
* SMEM instructions.
|
||||
*
|
||||
* SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
|
||||
*/
|
||||
void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
|
||||
aco_ptr<Instruction>& instr, int *NOPs)
|
||||
{
|
||||
/* break off from previous SMEM clause if needed */
|
||||
if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
|
||||
/* Don't allow clauses with store instructions since the clause's
|
||||
* instructions may use the same address. */
|
||||
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
|
||||
*NOPs = 1;
|
||||
} else if (program->xnack_enabled) {
|
||||
for (Operand op : instr->operands) {
|
||||
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
|
||||
*NOPs = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Definition def = instr->definitions[0];
|
||||
if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
|
||||
*NOPs = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
|
||||
void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
|
||||
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
@@ -300,24 +335,7 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
||||
}
|
||||
}
|
||||
|
||||
/* break off from prevous SMEM clause if needed */
|
||||
if (!NOPs & (ctx.smem_clause || ctx.smem_write)) {
|
||||
/* Don't allow clauses with store instructions since the clause's
|
||||
* instructions may use the same address. */
|
||||
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
|
||||
NOPs = 1;
|
||||
} else {
|
||||
for (Operand op : instr->operands) {
|
||||
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
|
||||
NOPs = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Definition def = instr->definitions[0];
|
||||
if (!NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
|
||||
NOPs = 1;
|
||||
}
|
||||
}
|
||||
handle_smem_clause_hazards(program, ctx, instr, &NOPs);
|
||||
} else if (instr->isSALU()) {
|
||||
if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
|
||||
instr->opcode == aco_opcode::s_getreg_b32) {
|
||||
@@ -414,8 +432,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
||||
if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
|
||||
ctx.smem_clause = false;
|
||||
ctx.smem_write = false;
|
||||
BITSET_ZERO(ctx.smem_clause_read_write);
|
||||
BITSET_ZERO(ctx.smem_clause_write);
|
||||
|
||||
if (program->xnack_enabled) {
|
||||
BITSET_ZERO(ctx.smem_clause_read_write);
|
||||
BITSET_ZERO(ctx.smem_clause_write);
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->format == Format::SMEM) {
|
||||
@@ -424,15 +445,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
||||
} else {
|
||||
ctx.smem_clause = true;
|
||||
|
||||
for (Operand op : instr->operands) {
|
||||
if (!op.isConstant()) {
|
||||
set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
|
||||
if (program->xnack_enabled) {
|
||||
for (Operand op : instr->operands) {
|
||||
if (!op.isConstant()) {
|
||||
set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Definition def = instr->definitions[0];
|
||||
set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
|
||||
set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
|
||||
Definition def = instr->definitions[0];
|
||||
set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
|
||||
set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
|
||||
}
|
||||
}
|
||||
} else if (instr->isVALU()) {
|
||||
for (Definition def : instr->definitions) {
|
||||
|
||||
@@ -1150,6 +1150,24 @@ setup_nir(isel_context *ctx, nir_shader *nir)
|
||||
nir_index_ssa_defs(func);
|
||||
}
|
||||
|
||||
void
|
||||
setup_xnack(Program *program)
|
||||
{
|
||||
switch (program->family) {
|
||||
/* GFX8 APUs */
|
||||
case CHIP_CARRIZO:
|
||||
case CHIP_STONEY:
|
||||
/* GFX9 APUS */
|
||||
case CHIP_RAVEN:
|
||||
case CHIP_RAVEN2:
|
||||
case CHIP_RENOIR:
|
||||
program->xnack_enabled = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
isel_context
|
||||
setup_isel_context(Program* program,
|
||||
unsigned shader_count,
|
||||
@@ -1308,6 +1326,8 @@ setup_isel_context(Program* program,
|
||||
ctx.block->loop_nest_depth = 0;
|
||||
ctx.block->kind = block_kind_top_level;
|
||||
|
||||
setup_xnack(program);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
|
||||
@@ -1252,8 +1252,9 @@ public:
|
||||
uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
|
||||
unsigned workgroup_size; /* if known; otherwise UINT_MAX */
|
||||
|
||||
bool xnack_enabled = false;
|
||||
|
||||
bool needs_vcc = false;
|
||||
bool needs_xnack_mask = false;
|
||||
bool needs_flat_scr = false;
|
||||
|
||||
uint32_t allocateId()
|
||||
|
||||
@@ -302,19 +302,19 @@ uint16_t get_extra_sgprs(Program *program)
|
||||
{
|
||||
if (program->chip_class >= GFX10) {
|
||||
assert(!program->needs_flat_scr);
|
||||
assert(!program->needs_xnack_mask);
|
||||
assert(!program->xnack_enabled);
|
||||
return 2;
|
||||
} else if (program->chip_class >= GFX8) {
|
||||
if (program->needs_flat_scr)
|
||||
return 6;
|
||||
else if (program->needs_xnack_mask)
|
||||
else if (program->xnack_enabled)
|
||||
return 4;
|
||||
else if (program->needs_vcc)
|
||||
return 2;
|
||||
else
|
||||
return 0;
|
||||
} else {
|
||||
assert(!program->needs_xnack_mask);
|
||||
assert(!program->xnack_enabled);
|
||||
if (program->needs_flat_scr)
|
||||
return 4;
|
||||
else if (program->needs_vcc)
|
||||
|
||||
Reference in New Issue
Block a user