aco: only break SMEM clauses if XNACK is enabled (mostly APUs)

According to LLVM, it seems only required for APUs like RAVEN, but
we still ensure that SMEM stores are in their own clause.

pipeline-db (VEGA10):
Totals from affected shaders:
SGPRS: 1775364 -> 1775364 (0.00 %)
VGPRS: 1287176 -> 1287176 (0.00 %)
Spilled SGPRs: 725 -> 725 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Code Size: 65386620 -> 65107460 (-0.43 %) bytes
Max Waves: 287099 -> 287099 (0.00 %)

pipeline-db (POLARIS10):
Totals from affected shaders:
SGPRS: 1797743 -> 1797743 (0.00 %)
VGPRS: 1271108 -> 1271108 (0.00 %)
Spilled SGPRs: 730 -> 730 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Code Size: 64046244 -> 63782324 (-0.41 %) bytes
Max Waves: 254875 -> 254875 (0.00 %)

This only affects GFX6-GFX9 chips because the compiler uses a
different pass for GFX10.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4349>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4349>
This commit is contained in:
Samuel Pitoiset
2020-03-27 15:16:39 +01:00
committed by Marge Bot
parent 68f325b256
commit 2f424c83e0
4 changed files with 75 additions and 31 deletions

View File

@@ -274,6 +274,41 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
}
}
/* A SMEM clause is any group of consecutive SMEM instructions. The
* instructions in this group may return out of order and/or may be replayed.
*
* To fix this potential hazard correctly, we have to make sure that when a
* clause has more than one instruction, no instruction in the clause writes
* to a register that is read by another instruction in the clause (including
* itself). In this case, we have to break the SMEM clause by inserting non
* SMEM instructions.
*
* SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
*/
void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
aco_ptr<Instruction>& instr, int *NOPs)
{
/* break off from previous SMEM clause if needed */
if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
/* Don't allow clauses with store instructions since the clause's
* instructions may use the same address. */
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
*NOPs = 1;
} else if (program->xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
*NOPs = 1;
break;
}
}
Definition def = instr->definitions[0];
if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
*NOPs = 1;
}
}
}
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
@@ -300,24 +335,7 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
}
}
/* break off from prevous SMEM clause if needed */
if (!NOPs & (ctx.smem_clause || ctx.smem_write)) {
/* Don't allow clauses with store instructions since the clause's
* instructions may use the same address. */
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
NOPs = 1;
} else {
for (Operand op : instr->operands) {
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
NOPs = 1;
break;
}
}
Definition def = instr->definitions[0];
if (!NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
NOPs = 1;
}
}
handle_smem_clause_hazards(program, ctx, instr, &NOPs);
} else if (instr->isSALU()) {
if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
instr->opcode == aco_opcode::s_getreg_b32) {
@@ -414,8 +432,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
ctx.smem_clause = false;
ctx.smem_write = false;
BITSET_ZERO(ctx.smem_clause_read_write);
BITSET_ZERO(ctx.smem_clause_write);
if (program->xnack_enabled) {
BITSET_ZERO(ctx.smem_clause_read_write);
BITSET_ZERO(ctx.smem_clause_write);
}
}
if (instr->format == Format::SMEM) {
@@ -424,15 +445,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
} else {
ctx.smem_clause = true;
for (Operand op : instr->operands) {
if (!op.isConstant()) {
set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
if (program->xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant()) {
set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
}
}
}
Definition def = instr->definitions[0];
set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
Definition def = instr->definitions[0];
set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
}
}
} else if (instr->isVALU()) {
for (Definition def : instr->definitions) {

View File

@@ -1150,6 +1150,24 @@ setup_nir(isel_context *ctx, nir_shader *nir)
nir_index_ssa_defs(func);
}
void
setup_xnack(Program *program)
{
switch (program->family) {
/* GFX8 APUs */
case CHIP_CARRIZO:
case CHIP_STONEY:
/* GFX9 APUS */
case CHIP_RAVEN:
case CHIP_RAVEN2:
case CHIP_RENOIR:
program->xnack_enabled = true;
break;
default:
break;
}
}
isel_context
setup_isel_context(Program* program,
unsigned shader_count,
@@ -1308,6 +1326,8 @@ setup_isel_context(Program* program,
ctx.block->loop_nest_depth = 0;
ctx.block->kind = block_kind_top_level;
setup_xnack(program);
return ctx;
}

View File

@@ -1252,8 +1252,9 @@ public:
uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
unsigned workgroup_size; /* if known; otherwise UINT_MAX */
bool xnack_enabled = false;
bool needs_vcc = false;
bool needs_xnack_mask = false;
bool needs_flat_scr = false;
uint32_t allocateId()

View File

@@ -302,19 +302,19 @@ uint16_t get_extra_sgprs(Program *program)
{
if (program->chip_class >= GFX10) {
assert(!program->needs_flat_scr);
assert(!program->needs_xnack_mask);
assert(!program->xnack_enabled);
return 2;
} else if (program->chip_class >= GFX8) {
if (program->needs_flat_scr)
return 6;
else if (program->needs_xnack_mask)
else if (program->xnack_enabled)
return 4;
else if (program->needs_vcc)
return 2;
else
return 0;
} else {
assert(!program->needs_xnack_mask);
assert(!program->xnack_enabled);
if (program->needs_flat_scr)
return 4;
else if (program->needs_vcc)