nir: create ffma from builders more often
We will not be able to combine instructions into ffma later if they are
exact, so create them from the start. They can be lowered later if they are
unwanted.

fossil-db (GFX10.3):
Totals from 16589 (11.34% of 146267) affected shaders:
VGPRs: 938872 -> 938704 (-0.02%)
SpillSGPRs: 11334 -> 10785 (-4.84%)
CodeSize: 96551964 -> 96498040 (-0.06%); split: -0.08%, +0.02%
MaxWaves: 338760 -> 338772 (+0.00%)
Instrs: 18356857 -> 18350486 (-0.03%); split: -0.06%, +0.02%
Latency: 561563310 -> 561414360 (-0.03%); split: -0.08%, +0.05%
InvThroughput: 145629673 -> 145594740 (-0.02%); split: -0.04%, +0.01%

fossil-db (GFX10):
Totals from 16252 (11.11% of 146267) affected shaders:
VGPRs: 893820 -> 893744 (-0.01%)
SpillSGPRs: 11334 -> 10785 (-4.84%)
CodeSize: 95890244 -> 95839124 (-0.05%); split: -0.08%, +0.02%
MaxWaves: 367704 -> 367734 (+0.01%)
Instrs: 18199741 -> 18194437 (-0.03%); split: -0.06%, +0.03%
Latency: 560912971 -> 560854179 (-0.01%); split: -0.07%, +0.06%
InvThroughput: 142899814 -> 142877939 (-0.02%); split: -0.03%, +0.02%

fossil-db (GFX9):
Totals from 16287 (11.12% of 146401) affected shaders:
SGPRs: 1312784 -> 1312768 (-0.00%); split: -0.05%, +0.05%
VGPRs: 931440 -> 931444 (+0.00%); split: -0.00%, +0.00%
SpillSGPRs: 14623 -> 14597 (-0.18%)
CodeSize: 94428788 -> 94344404 (-0.09%); split: -0.10%, +0.01%
MaxWaves: 90105 -> 90109 (+0.00%)
Instrs: 18486905 -> 18473434 (-0.07%); split: -0.08%, +0.01%
Latency: 720947295 -> 720818323 (-0.02%); split: -0.07%, +0.05%
InvThroughput: 365240104 -> 365224659 (-0.00%); split: -0.02%, +0.01%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8056>
This commit is contained in:
@@ -33,10 +33,10 @@ nir_cross3(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
|
||||
unsigned yzx[3] = { 1, 2, 0 };
|
||||
unsigned zxy[3] = { 2, 0, 1 };
|
||||
|
||||
return nir_fsub(b, nir_fmul(b, nir_swizzle(b, x, yzx, 3),
|
||||
nir_swizzle(b, y, zxy, 3)),
|
||||
nir_fmul(b, nir_swizzle(b, x, zxy, 3),
|
||||
nir_swizzle(b, y, yzx, 3)));
|
||||
return nir_ffma(b, nir_swizzle(b, x, yzx, 3),
|
||||
nir_swizzle(b, y, zxy, 3),
|
||||
nir_fneg(b, nir_fmul(b, nir_swizzle(b, x, zxy, 3),
|
||||
nir_swizzle(b, y, yzx, 3))));
|
||||
}
|
||||
|
||||
nir_ssa_def*
|
||||
@@ -149,7 +149,7 @@ nir_smoothstep(nir_builder *b, nir_ssa_def *edge0, nir_ssa_def *edge1, nir_ssa_d
|
||||
nir_fsub(b, edge1, edge0)));
|
||||
|
||||
/* result = t * t * (3 - 2 * t) */
|
||||
return nir_fmul(b, t, nir_fmul(b, t, nir_fsub(b, f3, nir_fmul(b, f2, t))));
|
||||
return nir_fmul(b, t, nir_fmul(b, t, nir_a_minus_bc(b, f3, f2, t)));
|
||||
}
|
||||
|
||||
nir_ssa_def*
|
||||
@@ -226,9 +226,9 @@ nir_atan(nir_builder *b, nir_ssa_def *y_over_x)
|
||||
build_fsum(b, polynomial_terms, ARRAY_SIZE(polynomial_terms));
|
||||
|
||||
/* range-reduction fixup */
|
||||
tmp = nir_fadd(b,
|
||||
nir_fmul(b, nir_b2f(b, nir_flt(b, one, abs_y_over_x), bit_size),
|
||||
nir_fadd_imm(b, nir_fmul_imm(b, tmp, -2.0f), M_PI_2)),
|
||||
tmp = nir_ffma(b,
|
||||
nir_b2f(b, nir_flt(b, one, abs_y_over_x), bit_size),
|
||||
nir_ffma_imm12(b, tmp, -2.0f, M_PI_2),
|
||||
tmp);
|
||||
|
||||
/* sign fixup */
|
||||
@@ -303,8 +303,7 @@ nir_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
|
||||
* coordinate system.
|
||||
*/
|
||||
nir_ssa_def *arc =
|
||||
nir_fadd(b, nir_fmul_imm(b, nir_b2f(b, flip, bit_size), M_PI_2),
|
||||
nir_atan(b, tan));
|
||||
nir_ffma_imm1(b, nir_b2f(b, flip, bit_size), M_PI_2, nir_atan(b, tan));
|
||||
|
||||
/* Rather convoluted calculation of the sign of the result. When x < 0 we
|
||||
* cannot use fsign because we need to be able to distinguish between
|
||||
|
||||
Reference in New Issue
Block a user