diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 3ceaa5a3f75..9e5d6c02e83 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1970,6 +1970,8 @@ soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader) } } +bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v); + /* unreachable block elimination: */ bool ir3_remove_unreachable(struct ir3 *ir); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 5f651a7b622..d62752f5901 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -157,6 +157,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->local_mem_size = dev_info->cs_shared_mem_size; compiler->num_predicates = 1; + compiler->bitops_can_write_predicates = false; if (compiler->gen >= 6) { compiler->samgq_workaround = true; @@ -215,6 +216,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble; compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk; compiler->num_predicates = 4; + compiler->bitops_can_write_predicates = true; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 9703052fd5d..271b01fa12b 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -217,6 +217,9 @@ struct ir3_compiler { /* Number of available predicate registers (p0.c) */ uint32_t num_predicates; + /* True if bitops (and.b, or.b, xor.b, not.b) can write to p0.c */ + bool bitops_can_write_predicates; + /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */ uint32_t max_variable_workgroup_size; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 6b3b46df509..c510e0eed99 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -4986,6 +4986,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, progress |= IR3_PASS(ir, ir3_cp, so); progress |= IR3_PASS(ir, ir3_cse); progress |= IR3_PASS(ir, ir3_dce, so); + progress |= IR3_PASS(ir, ir3_opt_predicates, so); } while (progress); /* at this point, for binning pass, throw away unneeded outputs: diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c index bdf8628cee6..9758a15fded 100644 --- a/src/freedreno/ir3/ir3_cp.c +++ b/src/freedreno/ir3/ir3_cp.c @@ -89,24 +89,6 @@ is_eligible_mov(struct ir3_instruction *instr, return false; } -/* we can end up with extra cmps.s from frontend, which uses a - * - * cmps.s p0.x, cond, 0 - * - * as a way to mov into the predicate register. But frequently 'cond' - * is itself a cmps.s/cmps.f/cmps.u. So detect this special case. - */ -static bool -is_foldable_double_cmp(struct ir3_instruction *cmp) -{ - struct ir3_instruction *cond = ssa(cmp->srcs[0]); - return (cmp->dsts[0]->flags & IR3_REG_PREDICATE) && cond && - (cmp->srcs[1]->flags & IR3_REG_IMMED) && - (cmp->srcs[1]->iim_val == 0) && - (cmp->cat2.condition == IR3_COND_NE) && - (!cond->address || cond->address->def->instr->block == cmp->block); -} - /* propagate register flags from src to dst.. negates need special * handling to cancel each other out. */ @@ -611,32 +593,6 @@ instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) ctx->progress = true; } - /* Re-write the instruction writing predicate register to get rid - * of the double cmps. - */ - if ((instr->opc == OPC_CMPS_S) && is_foldable_double_cmp(instr)) { - struct ir3_instruction *cond = ssa(instr->srcs[0]); - switch (cond->opc) { - case OPC_CMPS_S: - case OPC_CMPS_F: - case OPC_CMPS_U: - instr->opc = cond->opc; - instr->flags = cond->flags; - instr->cat2 = cond->cat2; - if (cond->address) - ir3_instr_set_address(instr, cond->address->def->instr); - instr->srcs[0] = ir3_reg_clone(ctx->shader, cond->srcs[0]); - instr->srcs[1] = ir3_reg_clone(ctx->shader, cond->srcs[1]); - instr->barrier_class |= cond->barrier_class; - instr->barrier_conflict |= cond->barrier_conflict; - unuse(cond); - ctx->progress = true; - break; - default: - break; - } - } - /* Handle converting a sam.s2en (taking samp/tex idx params via register) * into a normal sam (encoding immediate samp/tex idx) if they are * immediate. This saves some instructions and regs in the common case diff --git a/src/freedreno/ir3/ir3_opt_predicates.c b/src/freedreno/ir3/ir3_opt_predicates.c new file mode 100644 index 00000000000..e68cf36e9ee --- /dev/null +++ b/src/freedreno/ir3/ir3_opt_predicates.c @@ -0,0 +1,163 @@ +/* + * Copyright © 2024 Igalia S.L. + * SPDX-License-Identifier: MIT + */ + +#include "ir3.h" +#include "ir3_shader.h" + +/* This pass tries to optimize away cmps.s.ne instructions created by + * ir3_get_predicate in order to write predicates. It does two things: + * - Look through chains of multiple cmps.s.ne instructions and remove all but + * the first. + * - If the source of the cmps.s.ne can write directly to predicates (true for + * bitops on a6xx+), remove the cmps.s.ne. + * + * In both cases, no instructions are actually removed but clones are made and + * we rely on DCE to remove anything that became unused. Note that it's fine to + * always make a clone since even in the case that the original instruction is + * also used for non-predicate sources (so it won't be DCE'd), we replaced a + * cmps.ne.s with another instruction so this pass should never increase + * instruction count. + */ + +struct opt_predicates_ctx { + struct ir3 *ir; + + /* Map from instructions to their clones with a predicate destination. Used + * to prevent instructions being cloned multiple times. + */ + struct hash_table *predicate_clones; +}; + +static struct ir3_instruction * +clone_with_predicate_dst(struct opt_predicates_ctx *ctx, + struct ir3_instruction *instr) +{ + struct hash_entry *entry = + _mesa_hash_table_search(ctx->predicate_clones, instr); + if (entry) + return entry->data; + + assert(instr->dsts_count == 1); + assert(!(instr->dsts[0]->flags & IR3_REG_SHARED)); + + struct ir3_instruction *clone = ir3_instr_clone(instr); + ir3_instr_move_after(clone, instr); + clone->dsts[0]->flags |= IR3_REG_PREDICATE; + clone->dsts[0]->flags &= ~IR3_REG_HALF; + _mesa_hash_table_insert(ctx->predicate_clones, instr, clone); + return clone; +} + +static bool +can_write_predicate(struct opt_predicates_ctx *ctx, + struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_CMPS_S: + case OPC_CMPS_U: + case OPC_CMPS_F: + return true; + case OPC_AND_B: + case OPC_OR_B: + case OPC_NOT_B: + case OPC_XOR_B: + case OPC_GETBIT_B: + return ctx->ir->compiler->bitops_can_write_predicates; + default: + return false; + } +} + +/* Detects the pattern used by ir3_get_predicate to write a predicate register: + * cmps.s.ne pssa_x, ssa_y, 0 + */ +static bool +is_gpr_to_predicate_mov(struct ir3_instruction *instr) +{ + return (instr->opc == OPC_CMPS_S) && + (instr->cat2.condition == IR3_COND_NE) && + (instr->srcs[0]->flags & IR3_REG_SSA) && + (instr->srcs[1]->flags & IR3_REG_IMMED) && + (instr->srcs[1]->iim_val == 0); +} + +/* Look through a chain of cmps.s.ne 0 instructions to find the initial source. + * Return it if it can write to predicates. Otherwise, return the first + * cmps.s.ne in the chain. + */ +static struct ir3_register * +resolve_predicate_def(struct opt_predicates_ctx *ctx, struct ir3_register *src) +{ + struct ir3_register *def = src->def; + + while (is_gpr_to_predicate_mov(def->instr)) { + struct ir3_register *next_def = def->instr->srcs[0]->def; + + if (!can_write_predicate(ctx, next_def->instr)) + return def; + + def = next_def; + } + + return def; +} + +/* Find all predicate sources and try to replace their defs with instructions + * that can directly write to predicates. + */ +static bool +opt_instr(struct opt_predicates_ctx *ctx, struct ir3_instruction *instr) +{ + bool progress = false; + + foreach_src (src, instr) { + if (!(src->flags & IR3_REG_PREDICATE)) + continue; + + struct ir3_register *def = resolve_predicate_def(ctx, src); + + if (src->def == def) + continue; + + assert(can_write_predicate(ctx, def->instr) && + !(def->flags & IR3_REG_PREDICATE)); + + struct ir3_instruction *predicate = + clone_with_predicate_dst(ctx, def->instr); + assert(predicate->dsts_count == 1); + + src->def = predicate->dsts[0]; + progress = true; + } + + return progress; +} + +static bool +opt_blocks(struct opt_predicates_ctx *ctx) +{ + bool progress = false; + + foreach_block (block, &ctx->ir->block_list) { + foreach_instr (instr, &block->instr_list) { + progress |= opt_instr(ctx, instr); + } + } + + return progress; +} + +bool +ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v) +{ + struct opt_predicates_ctx *ctx = rzalloc(NULL, struct opt_predicates_ctx); + ctx->ir = ir; + ctx->predicate_clones = _mesa_pointer_hash_table_create(ctx); + + bool progress = opt_blocks(ctx); + + ralloc_free(ctx); + return progress; +} diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index b67d4b9a2de..e400bd179f3 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -104,6 +104,7 @@ libfreedreno_ir3_files = files( 'ir3_nir_move_varying_inputs.c', 'ir3_nir_lower_layer_id.c', 'ir3_nir_opt_preamble.c', + 'ir3_opt_predicates.c', 'ir3_postsched.c', 'ir3_print.c', 'ir3_ra.c',