aco: create 16-bit mad/fma

fossil-db (Navi, fp16 enabled):
Totals from 1 (0.00% of 127638) affected shaders:
CodeSize: 4868 -> 4552 (-6.49%)
Instrs: 956 -> 863 (-9.73%)
Cycles: 3824 -> 3452 (-9.73%)
VMEM: 504 -> 490 (-2.78%)
SMEM: 109 -> 107 (-1.83%)
VClause: 19 -> 20 (+5.26%)
Copies: 54 -> 58 (+7.41%)
PreVGPRs: 43 -> 41 (-4.65%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5245>
This commit is contained in:
Rhys Perry
2020-05-14 21:09:36 +01:00
committed by Marge Bot
parent 1b10764e50
commit 7f511efa16
2 changed files with 43 additions and 9 deletions
+22 -6
View File
@@ -1103,6 +1103,10 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
}
break;
}
case aco_opcode::v_mul_f16: {
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
break;
}
case aco_opcode::v_and_b32: /* abs */
if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x7FFFFFFF) &&
instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr)
@@ -2415,11 +2419,15 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
bool mad32 = instr->opcode == aco_opcode::v_add_f32 ||
instr->opcode == aco_opcode::v_sub_f32 ||
instr->opcode == aco_opcode::v_subrev_f32;
if (mad32) {
bool need_fma = block.fp_mode.denorm32 != 0;
bool mad16 = instr->opcode == aco_opcode::v_add_f16 ||
instr->opcode == aco_opcode::v_sub_f16 ||
instr->opcode == aco_opcode::v_subrev_f16;
if (mad16 || mad32) {
bool need_fma = mad32 ? block.fp_mode.denorm32 != 0 :
(block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
if (need_fma && instr->definitions[0].isPrecise())
return;
if (need_fma && !ctx.program->has_fast_fma32)
if (need_fma && mad32 && !ctx.program->has_fast_fma32)
return;
uint32_t uses_src0 = UINT32_MAX;
@@ -2500,12 +2508,15 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
/* neg of the multiplication result */
neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx];
}
if (instr->opcode == aco_opcode::v_sub_f32)
if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
else if (instr->opcode == aco_opcode::v_subrev_f32)
else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16)
neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
if (mad16)
mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) :
(ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16);
aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(mad_op, Format::VOP3A, 3, 1)};
for (unsigned i = 0; i < 3; i++)
@@ -2730,7 +2741,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
/* check literals */
else if (!instr->usesModifiers()) {
/* FMA can only take literals on GFX10+ */
if (instr->opcode == aco_opcode::v_fma_f32 && ctx.program->chip_class < GFX10)
if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
ctx.program->chip_class < GFX10)
return;
bool sgpr_used = false;
@@ -2903,6 +2915,10 @@ void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr)
aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
if (instr->opcode == aco_opcode::v_fma_f32)
new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16)
new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
else if (instr->opcode == aco_opcode::v_fma_f16)
new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
if (info->literal_idx == 2) { /* add literal -> madak */
+21 -3
View File
@@ -1735,7 +1735,10 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
op = instr->operands[i];
else if ((instr->opcode == aco_opcode::v_mad_f32 ||
(instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && !instr->usesModifiers())
(instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
instr->opcode == aco_opcode::v_mad_f16 ||
instr->opcode == aco_opcode::v_mad_legacy_f16 ||
(instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers())
op = instr->operands[2];
if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
@@ -2011,13 +2014,19 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* try to optimize v_mad_f32 -> v_mac_f32 */
if ((instr->opcode == aco_opcode::v_mad_f32 ||
(instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) &&
(instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
instr->opcode == aco_opcode::v_mad_f16 ||
instr->opcode == aco_opcode::v_mad_legacy_f16 ||
(instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) &&
instr->operands[2].isTemp() &&
instr->operands[2].isKillBeforeDef() &&
instr->operands[2].getTemp().type() == RegType::vgpr &&
instr->operands[1].isTemp() &&
instr->operands[1].getTemp().type() == RegType::vgpr &&
!instr->usesModifiers()) {
!instr->usesModifiers() &&
instr->operands[0].physReg().byte() == 0 &&
instr->operands[1].physReg().byte() == 0 &&
instr->operands[2].physReg().byte() == 0) {
unsigned def_id = instr->definitions[0].tempId();
auto it = ctx.affinities.find(def_id);
if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
@@ -2031,6 +2040,13 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
case aco_opcode::v_fma_f32:
instr->opcode = aco_opcode::v_fmac_f32;
break;
case aco_opcode::v_mad_f16:
case aco_opcode::v_mad_legacy_f16:
instr->opcode = aco_opcode::v_mac_f16;
break;
case aco_opcode::v_fma_f16:
instr->opcode = aco_opcode::v_fmac_f16;
break;
default:
break;
}
@@ -2041,6 +2057,8 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
instr->opcode == aco_opcode::v_mac_f32 ||
instr->opcode == aco_opcode::v_fmac_f32 ||
instr->opcode == aco_opcode::v_mac_f16 ||
instr->opcode == aco_opcode::v_fmac_f16 ||
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
instr->definitions[0].setFixed(instr->operands[2].physReg());