nir: Apply nir_opt_offsets to nir_intrinsic_load_uniform as well.
Doing this for ir3 required adding a struct for limits of how much base to fold in (which NTT wants as well for its case of shared vars), otherwise the later work to lower to the 1<<9 word limit would emit more instructions. The shader-db results are that sometimes the reduction in NIR instruction count results in fewer sampler prefetches due to the shader being estimated to be shorter (dota2, nexuiz): total instructions in shared programs: 8996651 -> 8996776 (<.01%) total cat5 in shared programs: 86561 -> 86577 (0.02%) Reviewed-by: Rob Clark <robdclark@chromium.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14023>
This commit is contained in:
@@ -211,8 +211,14 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets)
|
||||
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
|
||||
}
|
||||
|
||||
if (opt_offsets)
|
||||
NIR_PASS_V(nir, nir_opt_offsets);
|
||||
if (opt_offsets) {
|
||||
static const nir_opt_offsets_options offset_options = {
|
||||
.uniform_max = 0,
|
||||
.buffer_max = ~0,
|
||||
.shared_max = ~0,
|
||||
};
|
||||
NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
|
||||
}
|
||||
|
||||
/* Do late algebraic optimization to turn add(a,
|
||||
* neg(b)) back into subs, then the mandatory cleanup
|
||||
|
||||
+12
-1
@@ -5254,7 +5254,18 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);
|
||||
|
||||
bool nir_opt_move(nir_shader *shader, nir_move_options options);
|
||||
|
||||
bool nir_opt_offsets(nir_shader *shader);
|
||||
/**
 * Options for nir_opt_offsets: per-intrinsic-class upper bounds on the
 * constant that the pass may accumulate when it extracts constant additions
 * from an offset source.  An addition is only extracted while the running
 * total stays <= the relevant maximum, so ~0 effectively means "no limit".
 */
typedef struct {
|
||||
/** Maximum accumulated constant offset for nir_intrinsic_load_uniform. */
uint32_t uniform_max;
|
||||
|
||||
/**
 * Maximum accumulated constant offset for the shared-memory intrinsics
 * (load_shared, load_shared_ir3, store_shared, store_shared_ir3).
 */
uint32_t shared_max;
|
||||
|
||||
/**
 * Maximum accumulated constant offset for nir_intrinsic_load_buffer_amd
 * and nir_intrinsic_store_buffer_amd.
 */
uint32_t buffer_max;
|
||||
} nir_opt_offsets_options;
|
||||
|
||||
bool nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options);
|
||||
|
||||
bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
|
||||
bool indirect_load_ok, bool expensive_alu_ok);
|
||||
|
||||
@@ -31,10 +31,11 @@
|
||||
typedef struct
|
||||
{
|
||||
struct hash_table *range_ht;
|
||||
const nir_opt_offsets_options *options;
|
||||
} opt_offsets_state;
|
||||
|
||||
static nir_ssa_def *
|
||||
try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const)
|
||||
try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const, uint32_t max)
|
||||
{
|
||||
if (instr->type != nir_instr_type_alu)
|
||||
return NULL;
|
||||
@@ -66,15 +67,18 @@ try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *
|
||||
|
||||
for (unsigned i = 0; i < 2; ++i) {
|
||||
if (nir_src_is_const(alu->src[i].src)) {
|
||||
*out_const += nir_src_as_uint(alu->src[i].src);
|
||||
nir_ssa_def *replace_src =
|
||||
try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const);
|
||||
return replace_src ? replace_src : alu->src[1 - i].src.ssa;
|
||||
uint32_t offset = nir_src_as_uint(alu->src[i].src);
|
||||
if (offset + *out_const <= max) {
|
||||
*out_const += offset;
|
||||
nir_ssa_def *replace_src =
|
||||
try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const, max);
|
||||
return replace_src ? replace_src : alu->src[1 - i].src.ssa;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const);
|
||||
nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const);
|
||||
nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const, max);
|
||||
nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const, max);
|
||||
if (!replace_src0 && !replace_src1)
|
||||
return NULL;
|
||||
|
||||
@@ -88,7 +92,8 @@ static bool
|
||||
try_fold_load_store(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
opt_offsets_state *state,
|
||||
unsigned offset_src_idx)
|
||||
unsigned offset_src_idx,
|
||||
uint32_t max)
|
||||
{
|
||||
/* Assume that BASE is the constant offset of a load/store.
|
||||
* Try to constant-fold additions to the offset source
|
||||
@@ -103,7 +108,7 @@ try_fold_load_store(nir_builder *b,
|
||||
return false;
|
||||
|
||||
if (!nir_src_is_const(*off_src)) {
|
||||
replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const);
|
||||
replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const, max);
|
||||
} else if (nir_src_as_uint(*off_src)) {
|
||||
off_const += nir_src_as_uint(*off_src);
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
@@ -128,21 +133,18 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
/* Note that while it's tempting to include nir_intrinsic_load_uniform
|
||||
* here, freedreno doesn't want that because it can have to move the base
|
||||
* back to a register plus a small constant offset, and it's not clever
|
||||
* enough to minimize the code that that emits.
|
||||
*/
|
||||
case nir_intrinsic_load_uniform:
|
||||
return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max);
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_load_shared_ir3:
|
||||
return try_fold_load_store(b, intrin, state, 0);
|
||||
return try_fold_load_store(b, intrin, state, 0, state->options->shared_max);
|
||||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_store_shared_ir3:
|
||||
return try_fold_load_store(b, intrin, state, 1);
|
||||
return try_fold_load_store(b, intrin, state, 1, state->options->shared_max);
|
||||
case nir_intrinsic_load_buffer_amd:
|
||||
return try_fold_load_store(b, intrin, state, 1);
|
||||
return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max);
|
||||
case nir_intrinsic_store_buffer_amd:
|
||||
return try_fold_load_store(b, intrin, state, 2);
|
||||
return try_fold_load_store(b, intrin, state, 2, state->options->buffer_max);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -151,10 +153,11 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
|
||||
}
|
||||
|
||||
bool
|
||||
nir_opt_offsets(nir_shader *shader)
|
||||
nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options)
|
||||
{
|
||||
opt_offsets_state state;
|
||||
state.range_ht = NULL;
|
||||
state.options = options;
|
||||
|
||||
bool p = nir_shader_instructions_pass(shader, process_instr,
|
||||
nir_metadata_block_index |
|
||||
|
||||
@@ -117,7 +117,17 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
|
||||
progress |= OPT(s, nir_lower_alu);
|
||||
progress |= OPT(s, nir_lower_pack);
|
||||
progress |= OPT(s, nir_opt_constant_folding);
|
||||
progress |= OPT(s, nir_opt_offsets);
|
||||
|
||||
static const nir_opt_offsets_options offset_options = {
|
||||
/* How large an offset we can encode in the instr's immediate field.
|
||||
*/
|
||||
.uniform_max = (1 << 9) - 1,
|
||||
|
||||
.shared_max = ~0,
|
||||
|
||||
.buffer_max = ~0,
|
||||
};
|
||||
progress |= OPT(s, nir_opt_offsets, &offset_options);
|
||||
|
||||
nir_load_store_vectorize_options vectorize_opts = {
|
||||
.modes = nir_var_mem_ubo,
|
||||
|
||||
Reference in New Issue
Block a user