From c6a6902e4d6744ccf8f97b153296a9ecd151bc63 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 1 Feb 2024 14:51:22 +0100 Subject: [PATCH] ir3: optimize bitwise ops that can directly write predicates On a6xx+, bitwise operations can directly write to predicate registers. The result will be 1 iff the result of the non-predicate operation would be non-zero. When generating instructions that need a predicate source, ir3 will insert a cmps.s.ne 0 instruction to guarantee a predicate can be produced. This is kept in place by this patch and we add a pass that tries to optimize useless comparisons away. Concretely: - Look through chains of multiple cmps.s.ne instructions and remove all but the first. - If the source of the cmps.s.ne can write directly to predicates, remove the cmps.s.ne. In both cases, no instructions are actually removed but clones are made and we rely on DCE to remove anything that became unused. Note that it's fine to always make a clone since even in the case that the original instruction is also used for non-predicate sources (so it won't be DCE'd), we replaced a cmps.ne.s with another instruction so this pass should never increase instruction count. Note that this pass replaces the double-comparison folding that was performed by ir3_cp before. Signed-off-by: Job Noorman Part-of: --- src/freedreno/ir3/ir3.h | 2 + src/freedreno/ir3/ir3_compiler.c | 2 + src/freedreno/ir3/ir3_compiler.h | 3 + src/freedreno/ir3/ir3_compiler_nir.c | 1 + src/freedreno/ir3/ir3_cp.c | 44 ------- src/freedreno/ir3/ir3_opt_predicates.c | 163 +++++++++++++++++++++++++ src/freedreno/ir3/meson.build | 1 + 7 files changed, 172 insertions(+), 44 deletions(-) create mode 100644 src/freedreno/ir3/ir3_opt_predicates.c diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 3ceaa5a3f75..9e5d6c02e83 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1970,6 +1970,8 @@ soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader) } } +bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v); + /* unreachable block elimination: */ bool ir3_remove_unreachable(struct ir3 *ir); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 5f651a7b622..d62752f5901 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -157,6 +157,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->local_mem_size = dev_info->cs_shared_mem_size; compiler->num_predicates = 1; + compiler->bitops_can_write_predicates = false; if (compiler->gen >= 6) { compiler->samgq_workaround = true; @@ -215,6 +216,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble; compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk; compiler->num_predicates = 4; + compiler->bitops_can_write_predicates = true; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 9703052fd5d..271b01fa12b 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -217,6 +217,9 @@ struct ir3_compiler { /* Number of available predicate registers (p0.c) */ uint32_t num_predicates; + /* True if bitops (and.b, or.b, xor.b, not.b) can write to p0.c */ + bool bitops_can_write_predicates; + /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */ uint32_t max_variable_workgroup_size; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 6b3b46df509..c510e0eed99 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -4986,6 +4986,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, progress |= IR3_PASS(ir, ir3_cp, so); progress |= IR3_PASS(ir, ir3_cse); progress |= IR3_PASS(ir, ir3_dce, so); + progress |= IR3_PASS(ir, ir3_opt_predicates, so); } while (progress); /* at this point, for binning pass, throw away unneeded outputs: diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c index bdf8628cee6..9758a15fded 100644 --- a/src/freedreno/ir3/ir3_cp.c +++ b/src/freedreno/ir3/ir3_cp.c @@ -89,24 +89,6 @@ is_eligible_mov(struct ir3_instruction *instr, return false; } -/* we can end up with extra cmps.s from frontend, which uses a - * - * cmps.s p0.x, cond, 0 - * - * as a way to mov into the predicate register. But frequently 'cond' - * is itself a cmps.s/cmps.f/cmps.u. So detect this special case. - */ -static bool -is_foldable_double_cmp(struct ir3_instruction *cmp) -{ - struct ir3_instruction *cond = ssa(cmp->srcs[0]); - return (cmp->dsts[0]->flags & IR3_REG_PREDICATE) && cond && - (cmp->srcs[1]->flags & IR3_REG_IMMED) && - (cmp->srcs[1]->iim_val == 0) && - (cmp->cat2.condition == IR3_COND_NE) && - (!cond->address || cond->address->def->instr->block == cmp->block); -} - /* propagate register flags from src to dst.. negates need special * handling to cancel each other out. */ @@ -611,32 +593,6 @@ instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) ctx->progress = true; } - /* Re-write the instruction writing predicate register to get rid - * of the double cmps. - */ - if ((instr->opc == OPC_CMPS_S) && is_foldable_double_cmp(instr)) { - struct ir3_instruction *cond = ssa(instr->srcs[0]); - switch (cond->opc) { - case OPC_CMPS_S: - case OPC_CMPS_F: - case OPC_CMPS_U: - instr->opc = cond->opc; - instr->flags = cond->flags; - instr->cat2 = cond->cat2; - if (cond->address) - ir3_instr_set_address(instr, cond->address->def->instr); - instr->srcs[0] = ir3_reg_clone(ctx->shader, cond->srcs[0]); - instr->srcs[1] = ir3_reg_clone(ctx->shader, cond->srcs[1]); - instr->barrier_class |= cond->barrier_class; - instr->barrier_conflict |= cond->barrier_conflict; - unuse(cond); - ctx->progress = true; - break; - default: - break; - } - } - /* Handle converting a sam.s2en (taking samp/tex idx params via register) * into a normal sam (encoding immediate samp/tex idx) if they are * immediate. This saves some instructions and regs in the common case diff --git a/src/freedreno/ir3/ir3_opt_predicates.c b/src/freedreno/ir3/ir3_opt_predicates.c new file mode 100644 index 00000000000..e68cf36e9ee --- /dev/null +++ b/src/freedreno/ir3/ir3_opt_predicates.c @@ -0,0 +1,163 @@ +/* + * Copyright © 2024 Igalia S.L. + * SPDX-License-Identifier: MIT + */ + +#include "ir3.h" +#include "ir3_shader.h" + +/* This pass tries to optimize away cmps.s.ne instructions created by + * ir3_get_predicate in order to write predicates. It does two things: + * - Look through chains of multiple cmps.s.ne instructions and remove all but + * the first. + * - If the source of the cmps.s.ne can write directly to predicates (true for + * bitops on a6xx+), remove the cmps.s.ne. + * + * In both cases, no instructions are actually removed but clones are made and + * we rely on DCE to remove anything that became unused. Note that it's fine to + * always make a clone since even in the case that the original instruction is + * also used for non-predicate sources (so it won't be DCE'd), we replaced a + * cmps.ne.s with another instruction so this pass should never increase + * instruction count. + */ + +struct opt_predicates_ctx { + struct ir3 *ir; + + /* Map from instructions to their clones with a predicate destination. Used + * to prevent instructions being cloned multiple times. + */ + struct hash_table *predicate_clones; +}; + +static struct ir3_instruction * +clone_with_predicate_dst(struct opt_predicates_ctx *ctx, + struct ir3_instruction *instr) +{ + struct hash_entry *entry = + _mesa_hash_table_search(ctx->predicate_clones, instr); + if (entry) + return entry->data; + + assert(instr->dsts_count == 1); + assert(!(instr->dsts[0]->flags & IR3_REG_SHARED)); + + struct ir3_instruction *clone = ir3_instr_clone(instr); + ir3_instr_move_after(clone, instr); + clone->dsts[0]->flags |= IR3_REG_PREDICATE; + clone->dsts[0]->flags &= ~IR3_REG_HALF; + _mesa_hash_table_insert(ctx->predicate_clones, instr, clone); + return clone; +} + +static bool +can_write_predicate(struct opt_predicates_ctx *ctx, + struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_CMPS_S: + case OPC_CMPS_U: + case OPC_CMPS_F: + return true; + case OPC_AND_B: + case OPC_OR_B: + case OPC_NOT_B: + case OPC_XOR_B: + case OPC_GETBIT_B: + return ctx->ir->compiler->bitops_can_write_predicates; + default: + return false; + } +} + +/* Detects the pattern used by ir3_get_predicate to write a predicate register: + * cmps.s.ne pssa_x, ssa_y, 0 + */ +static bool +is_gpr_to_predicate_mov(struct ir3_instruction *instr) +{ + return (instr->opc == OPC_CMPS_S) && + (instr->cat2.condition == IR3_COND_NE) && + (instr->srcs[0]->flags & IR3_REG_SSA) && + (instr->srcs[1]->flags & IR3_REG_IMMED) && + (instr->srcs[1]->iim_val == 0); +} + +/* Look through a chain of cmps.s.ne 0 instructions to find the initial source. + * Return it if it can write to predicates. Otherwise, return the first + * cmps.s.ne in the chain. + */ +static struct ir3_register * +resolve_predicate_def(struct opt_predicates_ctx *ctx, struct ir3_register *src) +{ + struct ir3_register *def = src->def; + + while (is_gpr_to_predicate_mov(def->instr)) { + struct ir3_register *next_def = def->instr->srcs[0]->def; + + if (!can_write_predicate(ctx, next_def->instr)) + return def; + + def = next_def; + } + + return def; +} + +/* Find all predicate sources and try to replace their defs with instructions + * that can directly write to predicates. + */ +static bool +opt_instr(struct opt_predicates_ctx *ctx, struct ir3_instruction *instr) +{ + bool progress = false; + + foreach_src (src, instr) { + if (!(src->flags & IR3_REG_PREDICATE)) + continue; + + struct ir3_register *def = resolve_predicate_def(ctx, src); + + if (src->def == def) + continue; + + assert(can_write_predicate(ctx, def->instr) && + !(def->flags & IR3_REG_PREDICATE)); + + struct ir3_instruction *predicate = + clone_with_predicate_dst(ctx, def->instr); + assert(predicate->dsts_count == 1); + + src->def = predicate->dsts[0]; + progress = true; + } + + return progress; +} + +static bool +opt_blocks(struct opt_predicates_ctx *ctx) +{ + bool progress = false; + + foreach_block (block, &ctx->ir->block_list) { + foreach_instr (instr, &block->instr_list) { + progress |= opt_instr(ctx, instr); + } + } + + return progress; +} + +bool +ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v) +{ + struct opt_predicates_ctx *ctx = rzalloc(NULL, struct opt_predicates_ctx); + ctx->ir = ir; + ctx->predicate_clones = _mesa_pointer_hash_table_create(ctx); + + bool progress = opt_blocks(ctx); + + ralloc_free(ctx); + return progress; +} diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index b67d4b9a2de..e400bd179f3 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -104,6 +104,7 @@ libfreedreno_ir3_files = files( 'ir3_nir_move_varying_inputs.c', 'ir3_nir_lower_layer_id.c', 'ir3_nir_opt_preamble.c', + 'ir3_opt_predicates.c', 'ir3_postsched.c', 'ir3_print.c', 'ir3_ra.c',