6b9f838d62
Again, load the data just once in GRF, share it across lanes.
Shader-db on dg2:
total instructions in shared programs: 23214555 -> 23215400 (<.01%)
instructions in affected programs: 199977 -> 200822 (0.42%)
helped: 3
HURT: 38
helped stats (abs) min: 5 max: 670 x̄: 283.67 x̃: 176
helped stats (rel) min: 1.34% max: 49.41% x̄: 22.15% x̃: 15.70%
HURT stats (abs) min: 1 max: 185 x̄: 44.63 x̃: 32
HURT stats (rel) min: 0.13% max: 42.86% x̄: 10.25% x̃: 9.30%
95% mean confidence interval for instructions value: -18.65 59.87
95% mean confidence interval for instructions %-change: 3.29% 12.47%
Inconclusive result (value mean confidence interval includes 0).
total loops in shared programs: 5928 -> 5928 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0
total cycles in shared programs: 851137495 -> 851152449 (<.01%)
cycles in affected programs: 16406137 -> 16421091 (0.09%)
helped: 9
HURT: 32
helped stats (abs) min: 10 max: 13498 x̄: 6443.22 x̃: 5581
helped stats (rel) min: 0.11% max: 4.75% x̄: 1.45% x̃: 0.34%
HURT stats (abs) min: 3 max: 15056 x̄: 2279.47 x̃: 735
HURT stats (rel) min: 0.10% max: 23.71% x̄: 4.58% x̃: 4.65%
95% mean confidence interval for cycles value: -1315.40 2044.87
95% mean confidence interval for cycles %-change: 1.71% 4.80%
Inconclusive result (value mean confidence interval includes 0).
total spills in shared programs: 11856 -> 11825 (-0.26%)
spills in affected programs: 2368 -> 2337 (-1.31%)
helped: 4
HURT: 0
total fills in shared programs: 16258 -> 16207 (-0.31%)
fills in affected programs: 2930 -> 2879 (-1.74%)
helped: 4
HURT: 0
total sends in shared programs: 1038194 -> 1038185 (<.01%)
sends in affected programs: 40 -> 31 (-22.50%)
helped: 4
HURT: 0
helped stats (abs) min: 1 max: 4 x̄: 2.25 x̃: 2
helped stats (rel) min: 10.00% max: 33.33% x̄: 21.46% x̃: 21.25%
95% mean confidence interval for sends value: -4.64 0.14
95% mean confidence interval for sends %-change: -40.41% -2.51%
Inconclusive result (value mean confidence interval includes 0).
LOST: 0
GAINED: 0
Results for some VK/DX titles (on DG2 only): the change mostly adds
instructions, except for the unity spaceship demo where a CS
shader gets additional SIMDness. The reason for the additional
instructions is that, since we're doing block loads, we need to find
the live channels in control flow to select a single lane value that
is valid.
aztec_ruins_high:
Totals from 3 (1.12% of 269) affected shaders:
Instrs: 17732 -> 17896 (+0.92%)
Cycles: 796518 -> 819302 (+2.86%)
cyberpunk_2077:
Totals from 17 (0.17% of 10301) affected shaders:
Instrs: 10848 -> 11658 (+7.47%)
Cycles: 248243 -> 259168 (+4.40%); split: -0.57%, +4.97%
fallout_4_dxvk_g2:
Totals from 2 (0.12% of 1638) affected shaders:
Instrs: 3157 -> 3368 (+6.68%)
Cycles: 487807 -> 490426 (+0.54%); split: -0.26%, +0.79%
Max live registers: 139 -> 141 (+1.44%)
red_dead_redemption2:
Totals from 68 (1.14% of 5970) affected shaders:
Instrs: 34871 -> 36486 (+4.63%)
Cycles: 551430 -> 565211 (+2.50%)
Send messages: 2074 -> 2072 (-0.10%)
Max live registers: 5078 -> 5077 (-0.02%)
total_war_warhammer2:
Totals from 5 (1.05% of 478) affected shaders:
Instrs: 6905 -> 6971 (+0.96%); split: -0.16%, +1.12%
Cycles: 97035 -> 97989 (+0.98%); split: -0.07%, +1.05%
unity spaceship demo (instruction count going up due to a CS shader
bump from SIMD8->16):
Totals from 53 (9.71% of 546) affected shaders:
Instrs: 223748 -> 233223 (+4.23%); split: -0.01%, +4.25%
Cycles: 23134697 -> 25207080 (+8.96%); split: -0.17%, +9.13%
Subgroup size: 480 -> 488 (+1.67%)
Spill count: 2156 -> 2242 (+3.99%); split: -0.19%, +4.17%
Fill count: 4617 -> 4845 (+4.94%); split: -0.09%, +5.02%
Max live registers: 5991 -> 6050 (+0.98%); split: -0.40%, +1.39%
Max dispatch width: 480 -> 488 (+1.67%)
witcher_3_dxvk_g2:
Totals from 27 (2.51% of 1074) affected shaders:
Instrs: 57067 -> 57677 (+1.07%); split: -0.03%, +1.10%
Cycles: 1397871 -> 1436704 (+2.78%); split: -0.35%, +3.13%
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
122 lines
4.1 KiB
C
122 lines
4.1 KiB
C
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
|
|
|
|
#include "isl/isl.h"
|
|
|
|
#include "brw_nir.h"
|
|
|
|
static bool
|
|
brw_nir_blockify_uniform_loads_instr(nir_builder *b,
|
|
nir_instr *instr,
|
|
void *cb_data)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
|
|
const struct intel_device_info *devinfo = cb_data;
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
switch (intrin->intrinsic) {
|
|
case nir_intrinsic_load_ubo:
|
|
case nir_intrinsic_load_ssbo:
|
|
/* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
|
|
*
|
|
* "The surface base address must be OWord-aligned."
|
|
*
|
|
* We can't make that guarantee with SSBOs where the alignment is
|
|
* 4bytes.
|
|
*/
|
|
if (devinfo->ver < 9)
|
|
return false;
|
|
|
|
if (nir_src_is_divergent(intrin->src[1]))
|
|
return false;
|
|
|
|
if (nir_dest_bit_size(intrin->dest) != 32)
|
|
return false;
|
|
|
|
/* Without the LSC, we can only do block loads of at least 4dwords (1
|
|
* oword).
|
|
*/
|
|
if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
|
|
return false;
|
|
|
|
intrin->intrinsic =
|
|
intrin->intrinsic == nir_intrinsic_load_ubo ?
|
|
nir_intrinsic_load_ubo_uniform_block_intel :
|
|
nir_intrinsic_load_ssbo_uniform_block_intel;
|
|
return true;
|
|
|
|
case nir_intrinsic_load_shared:
|
|
/* Block loads on shared memory are not supported before the LSC. */
|
|
if (!devinfo->has_lsc)
|
|
return false;
|
|
|
|
if (nir_src_is_divergent(intrin->src[0]))
|
|
return false;
|
|
|
|
if (nir_dest_bit_size(intrin->dest) != 32)
|
|
return false;
|
|
|
|
/* Without the LSC, we can only do block loads of at least 4dwords (1
|
|
* oword).
|
|
*/
|
|
if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
|
|
return false;
|
|
|
|
intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
|
|
return true;
|
|
|
|
case nir_intrinsic_load_global_constant:
|
|
if (nir_src_is_divergent(intrin->src[0]))
|
|
return false;
|
|
|
|
if (nir_dest_bit_size(intrin->dest) != 32)
|
|
return false;
|
|
|
|
/* Without the LSC, we can only do block loads of at least 4dwords (1
|
|
* oword).
|
|
*/
|
|
if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
|
|
return false;
|
|
|
|
intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
|
|
return true;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool
|
|
brw_nir_blockify_uniform_loads(nir_shader *shader,
|
|
const struct intel_device_info *devinfo)
|
|
{
|
|
return nir_shader_instructions_pass(shader,
|
|
brw_nir_blockify_uniform_loads_instr,
|
|
nir_metadata_block_index |
|
|
nir_metadata_dominance |
|
|
nir_metadata_live_ssa_defs,
|
|
(void *) devinfo);
|
|
}
|