aco: use v_permlane(x)16_b32 for masked swizzle
Should be cheaper than ds_swizzle.

Totals from 8 (0.01% of 134913) affected shaders:
CodeSize: 16316 -> 16388 (+0.44%)
Instrs: 3088 -> 3086 (-0.06%)
Latency: 49558 -> 49508 (-0.10%)
InvThroughput: 9180 -> 9198 (+0.20%)
Copies: 376 -> 384 (+2.13%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21412>
This commit is contained in:
@@ -249,6 +249,10 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
|
||||
|
||||
uint16_t dpp_ctrl = 0xffff;
|
||||
|
||||
/* DPP16 before DPP8 before v_permlane(x)16_b32
|
||||
* because DPP16 supports modifiers and v_permlane
|
||||
* can't be folded into valu instructions.
|
||||
*/
|
||||
if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
|
||||
unsigned res[4] = {0, 1, 2, 3};
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
@@ -262,12 +266,22 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
|
||||
dpp_ctrl = dpp_row_half_mirror;
|
||||
} else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && or_mask < 8 &&
|
||||
xor_mask < 8) {
|
||||
// DPP8 comes last, as it does not allow several modifiers like `abs` that are available with DPP16
|
||||
Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src);
|
||||
for (unsigned i = 0; i < 8; i++) {
|
||||
ret->dpp8().lane_sel[i] = (((i & and_mask) | or_mask) ^ xor_mask) & 0x7;
|
||||
}
|
||||
return ret;
|
||||
} else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10 && or_mask < 0x10) {
|
||||
uint64_t lane_mask = 0;
|
||||
for (unsigned i = 0; i < 16; i++)
|
||||
lane_mask |= uint64_t(((i & and_mask) | or_mask) ^ (xor_mask & 0xf)) << i * 4;
|
||||
aco_opcode opcode =
|
||||
xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
|
||||
Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
|
||||
Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
|
||||
Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
|
||||
ret->vop3().opsel = 0x3; /* set BOUND_CTRL/FETCH_INACTIVE */
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (dpp_ctrl != 0xffff)
|
||||
|
||||
Reference in New Issue
Block a user