From ca0164f48769732eb11627003e0ea45c1ba7a605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Thu, 24 Nov 2022 20:53:18 +0100 Subject: [PATCH] r300: improve conversion to native swizzles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't add extra movs to construct the swizzles, but just split the instruction into separate channels, if possible. Idea by Filip Gawin. shader-db for RV370: total instructions in shared programs: 84632 -> 83565 (-1.26%) instructions in affected programs: 12613 -> 11546 (-8.46%) helped: 295 HURT: 8 total temps in shared programs: 12437 -> 12237 (-1.61%) temps in affected programs: 1807 -> 1607 (-11.07%) helped: 153 HURT: 20 LOST: 1 GAINED: 19 The HURT instructions and the single lost shaders are some fluctuations from pair scheduling. The number of instructions before pair scheduling is always lower or equivalent. Partial fix for: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6339 Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Tested-by: Filip Gawin Part-of: --- .../r300/compiler/radeon_dataflow_swizzles.c | 159 +++++++++++++++++- 1 file changed, 151 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_dataflow_swizzles.c b/src/gallium/drivers/r300/compiler/radeon_dataflow_swizzles.c index a976d7ed8d6..cf304db3611 100644 --- a/src/gallium/drivers/r300/compiler/radeon_dataflow_swizzles.c +++ b/src/gallium/drivers/r300/compiler/radeon_dataflow_swizzles.c @@ -36,6 +36,19 @@ #include "radeon_compiler_util.h" #include "radeon_swizzle.h" +static unsigned int get_swizzle_split(struct radeon_compiler * c, + struct rc_swizzle_split * split, struct rc_instruction * inst, + unsigned src, unsigned * usemask) +{ + *usemask = 0; + for(unsigned int chan = 0; chan < 4; ++chan) { + if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) != RC_SWIZZLE_UNUSED) + *usemask |= 1 << chan; + } + + c->SwizzleCaps->Split(inst->U.I.SrcReg[src], *usemask, split); 
+ return split->NumPhases; +} static void rewrite_source(struct radeon_compiler * c, struct rc_instruction * inst, unsigned src) @@ -44,13 +57,7 @@ static void rewrite_source(struct radeon_compiler * c, unsigned int tempreg = rc_find_free_temporary(c); unsigned int usemask; - usemask = 0; - for(unsigned int chan = 0; chan < 4; ++chan) { - if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) != RC_SWIZZLE_UNUSED) - usemask |= 1 << chan; - } - - c->SwizzleCaps->Split(inst->U.I.SrcReg[src], usemask, &split); + get_swizzle_split(c, &split, inst, src, &usemask); for(unsigned int phase = 0; phase < split.NumPhases; ++phase) { struct rc_instruction * mov = rc_insert_new_instruction(c, inst->Prev); @@ -419,6 +426,110 @@ static unsigned try_rewrite_constant(struct radeon_compiler *c, return 1; } +/** + * Set all channels not specified by writemask to unused. + */ +static void clear_channels(struct rc_instruction * inst, unsigned writemask) +{ + inst->U.I.DstReg.WriteMask = writemask; + for (unsigned chan = 0; chan < 4; chan++) { + if (writemask & (1 << chan)) + continue; + + const struct rc_opcode_info * opcode = + rc_get_opcode_info(inst->U.I.Opcode); + for (unsigned src = 0; src < opcode->NumSrcRegs; src++) { + SET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan, RC_SWIZZLE_UNUSED); + } + } + /* TODO: We could in theory add constant swizzles back as well, + * they will be all legal when we have just a single channel, + * to save some sources and help the pair scheduling later. 
*/ +} + +static bool try_splitting_single_channel(struct radeon_compiler * c, + struct rc_instruction * inst) +{ + for (unsigned chan = 0; chan < 3; chan++) { + struct rc_instruction * new_inst; + new_inst = rc_insert_new_instruction(c, inst); + memcpy(&new_inst->U.I, &inst->U.I, sizeof(struct rc_sub_instruction)); + clear_channels(new_inst, inst->U.I.DstReg.WriteMask ^ (1 << chan)); + + const struct rc_opcode_info * opcode = + rc_get_opcode_info(new_inst->U.I.Opcode); + bool valid_swizzles = true; + + for (unsigned src = 0; src < opcode->NumSrcRegs; ++src) { + struct rc_src_register *reg = &new_inst->U.I.SrcReg[src]; + + if (!c->SwizzleCaps->IsNative(new_inst->U.I.Opcode, *reg)) + valid_swizzles = false; + } + + if (!valid_swizzles) { + rc_remove_instruction(new_inst); + } else { + clear_channels(inst, 1 << chan); + return true; + } + } + return false; +} + +static bool try_splitting_instruction(struct radeon_compiler * c, + struct rc_instruction * inst) +{ + /* Adding more output instructions in FS is bad for performance. */ + if (inst->U.I.DstReg.File == RC_FILE_OUTPUT) + return false; + + /* When only single channel of the swizzle is wrong, like xwzw, + * it is best to just split the single channel out. + */ + if (inst->U.I.DstReg.WriteMask == RC_MASK_XYZW || + inst->U.I.DstReg.WriteMask == RC_MASK_XYZ) { + if (try_splitting_single_channel(c, inst)) + return true; + } + + for (unsigned chan = 0; chan < 3; chan++) { + if (!(inst->U.I.DstReg.WriteMask & (1 << chan))) + continue; + + unsigned next_chan; + for (next_chan = chan + 1; next_chan < 4; next_chan++) { + if (!(inst->U.I.DstReg.WriteMask & (1 << next_chan))) + continue; + + /* We don't want to split the last used x/y/z channel and the + * w channel. Pair scheduling might be able to put it back + * together, but we don't trust it that much. + * + * Next is W already, rewrite the original inst and we are done. 
+ */ + if (next_chan == 3) { + clear_channels(inst, (1 << chan) | (1 << next_chan)); + return true; + } + + struct rc_instruction * new_inst; + new_inst = rc_insert_new_instruction(c, inst->Prev); + memcpy(&new_inst->U.I, &inst->U.I, sizeof(struct rc_sub_instruction)); + clear_channels(new_inst, 1 << chan); + break; + } + + /* No next chan */ + if (next_chan == 4) { + clear_channels(inst, 1 << chan); + return true; + } + } + assert(0 && "Unreachable\n"); + return false; +} + void rc_dataflow_swizzles(struct radeon_compiler * c, void *user) { struct rc_instruction * inst; @@ -428,8 +539,40 @@ void rc_dataflow_swizzles(struct radeon_compiler * c, void *user) inst = inst->Next) { const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - unsigned int src; + unsigned src, usemask; + unsigned total_splits = 0; + struct rc_swizzle_split split; + /* If multiple sources need splitting or some source needs to be split + * too many times, it is actually better to just split the whole ALU + * instruction into separate channels instead of inserting extra movs. + */ + for (src = 0; src < opcode->NumSrcRegs; ++src) { + /* Don't count invalid swizzles from immediates, we can just + * insert new immediates with the correct order later. + */ + if (rc_src_reg_is_immediate(c, inst->U.I.SrcReg[src].File, + inst->U.I.SrcReg[src].Index) + && c->Program.Constants.Count < R300_PFS_NUM_CONST_REGS) { + total_splits++; + } else { + total_splits += get_swizzle_split(c, &split, inst, + src, &usemask); + } + } + + /* Even if there is only a single split, i.e., two extra movs, this still + * amounts to three instructions, the same as when we split + * the original instruction right away. + */ + if (total_splits > opcode->NumSrcRegs && opcode->IsComponentwise) { + if (try_splitting_instruction(c, inst)) + continue; + } + + /* For texturing or non-componentwise opcodes we do the old way + * of adding extra movs. 
+ */ for(src = 0; src < opcode->NumSrcRegs; ++src) { struct rc_src_register *reg = &inst->U.I.SrcReg[src]; if (c->SwizzleCaps->IsNative(inst->U.I.Opcode, *reg)) {