r300: improve conversion to native swizzles

Don't add extra movs to construct the swizzles, but just split the
instruction into separate channels, if possible. Idea by Filip Gawin.

shader-db for RV370:
total instructions in shared programs: 84632 -> 83565 (-1.26%)
instructions in affected programs: 12613 -> 11546 (-8.46%)
helped: 295
HURT: 8

total temps in shared programs: 12437 -> 12237 (-1.61%)
temps in affected programs: 1807 -> 1607 (-11.07%)
helped: 153
HURT: 20

LOST:   1
GAINED: 19

The HURT instruction counts and the single lost shader are fluctuations
caused by pair scheduling. The number of instructions before pair
scheduling is always lower or equal.

Partial fix for: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6339

Signed-off-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Reviewed-by: Filip Gawin <filip@gawin.net>
Tested-by: Filip Gawin <filip@gawin.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20009>
This commit is contained in:
Pavel Ondračka
2022-11-24 20:53:18 +01:00
parent 384fc52dd3
commit ca0164f487
@@ -36,6 +36,19 @@
#include "radeon_compiler_util.h"
#include "radeon_swizzle.h"
/**
 * Ask the swizzle capabilities how a source operand has to be split.
 *
 * The mask of channels whose swizzle is not RC_SWIZZLE_UNUSED in source
 * operand \p src of \p inst is stored to \p usemask, and
 * c->SwizzleCaps->Split() is called with that mask to fill \p split.
 *
 * \return split->NumPhases as filled in by the Split() callback.
 */
static unsigned int get_swizzle_split(struct radeon_compiler * c,
	struct rc_swizzle_split * split, struct rc_instruction * inst,
	unsigned src, unsigned * usemask)
{
	const unsigned int swizzle = inst->U.I.SrcReg[src].Swizzle;
	unsigned int used = 0;
	unsigned int chan;

	/* Collect every channel that actually reads something. */
	for (chan = 0; chan < 4; ++chan) {
		if (GET_SWZ(swizzle, chan) == RC_SWIZZLE_UNUSED)
			continue;
		used |= 1 << chan;
	}
	*usemask = used;

	c->SwizzleCaps->Split(inst->U.I.SrcReg[src], used, split);

	return split->NumPhases;
}
static void rewrite_source(struct radeon_compiler * c,
struct rc_instruction * inst, unsigned src)
@@ -44,13 +57,7 @@ static void rewrite_source(struct radeon_compiler * c,
unsigned int tempreg = rc_find_free_temporary(c);
unsigned int usemask;
usemask = 0;
for(unsigned int chan = 0; chan < 4; ++chan) {
if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) != RC_SWIZZLE_UNUSED)
usemask |= 1 << chan;
}
c->SwizzleCaps->Split(inst->U.I.SrcReg[src], usemask, &split);
get_swizzle_split(c, &split, inst, src, &usemask);
for(unsigned int phase = 0; phase < split.NumPhases; ++phase) {
struct rc_instruction * mov = rc_insert_new_instruction(c, inst->Prev);
@@ -419,6 +426,110 @@ static unsigned try_rewrite_constant(struct radeon_compiler *c,
return 1;
}
/**
 * Restrict an instruction to the channels given by writemask.
 *
 * The destination write mask is set to \p writemask and, for every channel
 * not contained in it, the swizzle of each source operand of the opcode is
 * set to RC_SWIZZLE_UNUSED.
 */
static void clear_channels(struct rc_instruction * inst, unsigned writemask)
{
	/* Channels x, y, z, w that are NOT kept by the new write mask. */
	const unsigned dropped = ~writemask & 0xf;

	inst->U.I.DstReg.WriteMask = writemask;

	if (dropped) {
		const struct rc_opcode_info * info =
			rc_get_opcode_info(inst->U.I.Opcode);

		for (unsigned src = 0; src < info->NumSrcRegs; src++) {
			for (unsigned chan = 0; chan < 4; chan++) {
				if (dropped & (1 << chan))
					SET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan, RC_SWIZZLE_UNUSED);
			}
		}
	}

	/* TODO: We could in theory add constant swizzles back as well,
	 * they will be all legal when we have just a single channel,
	 * to save some sources and help the pair scheduling later. */
}
/**
 * Try to make the swizzles native by splitting exactly one of the x/y/z
 * channels out of the instruction.
 *
 * For each of the channels x, y and z in turn, a copy of \p inst is inserted
 * after it and restricted to the original write mask with that channel
 * toggled off. If every source of the copy is reported native by
 * c->SwizzleCaps->IsNative(), \p inst itself is restricted to the single
 * channel and the split is kept. Otherwise the copy is removed again and the
 * next channel is tried.
 *
 * \return true when a split was performed, false when no channel worked
 *         (in which case \p inst is left unmodified).
 */
static bool try_splitting_single_channel(struct radeon_compiler * c,
				struct rc_instruction * inst)
{
	for (unsigned chan = 0; chan < 3; chan++) {
		const unsigned chan_bit = 1 << chan;
		struct rc_instruction * rest = rc_insert_new_instruction(c, inst);

		memcpy(&rest->U.I, &inst->U.I, sizeof(struct rc_sub_instruction));
		clear_channels(rest, inst->U.I.DstReg.WriteMask ^ chan_bit);

		const struct rc_opcode_info * info =
			rc_get_opcode_info(rest->U.I.Opcode);
		unsigned non_native = 0;

		/* Every source of the remainder has to be native now. */
		for (unsigned src = 0; src < info->NumSrcRegs; ++src) {
			if (!c->SwizzleCaps->IsNative(rest->U.I.Opcode,
						      rest->U.I.SrcReg[src]))
				non_native++;
		}

		if (non_native) {
			/* Splitting this channel did not help, undo and retry. */
			rc_remove_instruction(rest);
			continue;
		}

		clear_channels(inst, chan_bit);
		return true;
	}

	return false;
}
/**
 * Split an instruction into several instructions that each write fewer
 * channels, instead of fixing up its source swizzles with extra movs.
 *
 * Instructions writing to RC_FILE_OUTPUT are never split. For xyz and xyzw
 * write masks, splitting out a single channel is attempted first. Otherwise
 * every written x/y/z channel gets its own copy inserted before \p inst,
 * except for the last one, which is kept in \p inst itself, together with
 * the w channel when w is written.
 *
 * \return true when \p inst was split or rewritten, false otherwise.
 */
static bool try_splitting_instruction(struct radeon_compiler * c,
				struct rc_instruction * inst)
{
	/* Adding more output instructions in FS is bad for performance. */
	if (inst->U.I.DstReg.File == RC_FILE_OUTPUT)
		return false;

	const unsigned mask = inst->U.I.DstReg.WriteMask;

	/* When only single channel of the swizzle is wrong, like xwzw,
	 * it is best to just split the single channel out.
	 */
	if ((mask == RC_MASK_XYZW || mask == RC_MASK_XYZ) &&
	    try_splitting_single_channel(c, inst))
		return true;

	for (unsigned chan = 0; chan < 3; chan++) {
		if ((mask & (1 << chan)) == 0)
			continue;

		/* Find the next written channel after this one, 4 if none. */
		unsigned next_chan = chan + 1;
		while (next_chan < 4 && (mask & (1 << next_chan)) == 0)
			next_chan++;

		/* No next chan */
		if (next_chan == 4) {
			clear_channels(inst, 1 << chan);
			return true;
		}

		/* We don't want to split the last used x/y/z channel and the
		 * w channel. Pair scheduling might be able to put it back
		 * together, but we don't trust it that much.
		 *
		 * Next is W already, rewrite the original inst and we are done.
		 */
		if (next_chan == 3) {
			clear_channels(inst, (1 << chan) | (1 << next_chan));
			return true;
		}

		/* More x/y/z channels follow: give this one its own copy. */
		struct rc_instruction * single =
			rc_insert_new_instruction(c, inst->Prev);
		memcpy(&single->U.I, &inst->U.I, sizeof(struct rc_sub_instruction));
		clear_channels(single, 1 << chan);
	}

	assert(0 && "Unreachable\n");
	return false;
}
void rc_dataflow_swizzles(struct radeon_compiler * c, void *user)
{
struct rc_instruction * inst;
@@ -428,8 +539,40 @@ void rc_dataflow_swizzles(struct radeon_compiler * c, void *user)
inst = inst->Next) {
const struct rc_opcode_info * opcode =
rc_get_opcode_info(inst->U.I.Opcode);
unsigned int src;
unsigned src, usemask;
unsigned total_splits = 0;
struct rc_swizzle_split split;
/* If multiple sources need splitting, or some source needs to be split
 * too many times, it is actually better to just split the whole ALU
 * instruction into separate channels instead of inserting extra movs.
 */
for (src = 0; src < opcode->NumSrcRegs; ++src) {
/* Don't count invalid swizzles from immediates, we can just
* insert new immediates with the correct order later.
*/
if (rc_src_reg_is_immediate(c, inst->U.I.SrcReg[src].File,
inst->U.I.SrcReg[src].Index)
&& c->Program.Constants.Count < R300_PFS_NUM_CONST_REGS) {
total_splits++;
} else {
total_splits += get_swizzle_split(c, &split, inst,
src, &usemask);
}
}
/* Even if there is only a single split, i.e., two extra movs, this still
 * amounts to three instructions, the same as when we split
 * the original instruction right away.
 */
if (total_splits > opcode->NumSrcRegs && opcode->IsComponentwise) {
if (try_splitting_instruction(c, inst))
continue;
}
/* For texturing or non-componentwise opcodes we do the old way
* of adding extra movs.
*/
for(src = 0; src < opcode->NumSrcRegs; ++src) {
struct rc_src_register *reg = &inst->U.I.SrcReg[src];
if (c->SwizzleCaps->IsNative(inst->U.I.Opcode, *reg)) {