diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index e27bc1ddee2..83417e41e24 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -172,6 +172,9 @@ struct fd_dev_info { bool broken_ds_ubwc_quirk; + /* See ir3_compiler::has_scalar_alu. */ + bool has_scalar_alu; + struct { uint32_t PC_POWER_CNTL; uint32_t TPL1_DBG_ECO_CNTL; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 03505788024..5c8a81126d9 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -389,6 +389,7 @@ a6xx_gen3 = A6XXProps( enable_lrz_fast_clear = True, lrz_track_quirk = True, has_per_view_viewport = True, + has_scalar_alu = True, ) a6xx_gen4 = A6XXProps( @@ -412,6 +413,7 @@ a6xx_gen4 = A6XXProps( enable_lrz_fast_clear = True, has_lrz_dir_tracking = True, has_per_view_viewport = True, + has_scalar_alu = True, ) a6xx_a690_quirk = A6XXProps( @@ -790,6 +792,7 @@ a7xx_base = A6XXProps( has_per_view_viewport = True, line_width_min = 1.0, line_width_max = 127.5, + has_scalar_alu = True, ) a7xx_725 = A7XXProps( diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index 381136ddc57..713c7651854 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -792,6 +792,33 @@ ir3_instr_set_address(struct ir3_instruction *instr, } } +/* Does this instruction use the scalar ALU? 
+ */ +bool +is_scalar_alu(struct ir3_instruction *instr, + const struct ir3_compiler *compiler) +{ + /* MOVMSK seems to always need (ss) even with other scalar ALU instructions + */ + return instr->opc != OPC_MOVMSK && + instr->opc != OPC_SCAN_CLUSTERS_MACRO && + instr->opc != OPC_SCAN_MACRO && + is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) && + /* scalar->scalar mov instructions (but NOT cov) were supported before the + * scalar ALU was supported, but they still required (ss) whereas on GPUs + * that have a scalar ALU they are executed on it and do not require (ss). + * We have to be careful to return false for these if scalar ALU isn't + * supported, so that we treat them like vector->scalar mov instructions + * (such as requiring (ss)). + */ + compiler->has_scalar_alu && + /* moves from normal to shared seem to use a separate ALU as before and + * require a (ss) on dependent instructions. + */ + ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) || + (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST))); +} + void ir3_block_clear_mark(struct ir3_block *block) { diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 2c510382f3f..3d0607b32b3 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1908,9 +1908,11 @@ struct log_stream; void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr); /* delay calculation: */ -int ir3_delayslots(struct ir3_instruction *assigner, +int ir3_delayslots(struct ir3_compiler *compiler, + struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned n, bool soft); -unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner, +unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler, + struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned assigner_n, unsigned consumer_n); @@ -1923,7 +1925,10 @@ is_local_mem_load(struct ir3_instruction *instr) instr->opc == OPC_LDLW; } -/* 
Does this instruction need (ss) to wait for its result? */ +bool is_scalar_alu(struct ir3_instruction *instr, + const struct ir3_compiler *compiler); + +/* Does this instruction sometimes need (ss) to wait for its result? */ static inline bool is_ss_producer(struct ir3_instruction *instr) { @@ -1931,9 +1936,23 @@ is_ss_producer(struct ir3_instruction *instr) if (dst->flags & IR3_REG_SHARED) return true; } + return is_sfu(instr) || is_local_mem_load(instr); } +static inline bool +needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer, + struct ir3_instruction *consumer) +{ + if (is_scalar_alu(producer, compiler) && + is_scalar_alu(consumer, compiler) && + (producer->dsts[0]->flags & IR3_REG_HALF) == + (consumer->srcs[0]->flags & IR3_REG_HALF)) + return false; + + return is_ss_producer(producer); +} + /* The soft delay for approximating the cost of (ss). */ static inline unsigned soft_ss_delay(struct ir3_instruction *instr) diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 769e6c562d9..cd98feede54 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -223,6 +223,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->bitops_can_write_predicates = true; compiler->has_branch_and_or = true; compiler->has_predication = true; + compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; @@ -233,6 +234,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, * earlier gen's. */ compiler->max_const_safe = 256; + + compiler->has_scalar_alu = false; } /* This is just a guess for a4xx. 
*/ diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index cd86462e291..77d36767deb 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -267,6 +267,15 @@ struct ir3_compiler { bool load_shader_consts_via_preamble; bool load_inline_uniforms_via_preamble_ldgk; + + /* True if there is a scalar ALU capable of executing a subset of + * cat2-cat4 instructions with a shared register destination. This also + * implies expanded MOV/COV capability when writing to shared registers, + * as MOV/COV is now executed on the scalar ALU except when reading from a + * normal register, as well as the ability for ldc to write to a shared + * register. + */ + bool has_scalar_alu; }; void ir3_compiler_destroy(struct ir3_compiler *compiler); diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c index d74e6faabba..2eed2af9e1b 100644 --- a/src/freedreno/ir3/ir3_delay.c +++ b/src/freedreno/ir3/ir3_delay.c @@ -26,6 +26,8 @@ #include "ir3.h" +#include "ir3_compiler.h" + /* The maximum number of nop's we may need to insert between two instructions. 
*/ #define MAX_NOPS 6 @@ -43,7 +45,8 @@ * assigns a value and the one that consumes */ int -ir3_delayslots(struct ir3_instruction *assigner, +ir3_delayslots(struct ir3_compiler *compiler, + struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned n, bool soft) { /* generally don't count false dependencies, since this can just be @@ -63,13 +66,27 @@ ir3_delayslots(struct ir3_instruction *assigner, if (writes_addr0(assigner) || writes_addr1(assigner)) return 6; - if (soft && is_ss_producer(assigner)) + if (soft && needs_ss(compiler, assigner, consumer)) return soft_ss_delay(assigner); /* handled via sync flags: */ - if (is_ss_producer(assigner) || is_sy_producer(assigner)) + if (needs_ss(compiler, assigner, consumer) || + is_sy_producer(assigner)) return 0; + /* scalar ALU -> scalar ALU dependencies where the source and destination + * register sizes match don't require any nops. + */ + if (is_scalar_alu(assigner, compiler)) { + assert(is_scalar_alu(consumer, compiler)); + /* If the sizes don't match then we need (ss) and needs_ss() should've + * returned above. + */ + assert((assigner->dsts[0]->flags & IR3_REG_HALF) == + (consumer->srcs[n]->flags & IR3_REG_HALF)); + return 0; + } + /* As far as we know, shader outputs don't need any delay.
*/ if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK) return 0; @@ -96,11 +113,12 @@ ir3_delayslots(struct ir3_instruction *assigner, } unsigned -ir3_delayslots_with_repeat(struct ir3_instruction *assigner, +ir3_delayslots_with_repeat(struct ir3_compiler *compiler, + struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned assigner_n, unsigned consumer_n) { - unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false); + unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false); struct ir3_register *src = consumer->srcs[consumer_n]; struct ir3_register *dst = assigner->dsts[assigner_n]; diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 5b592a5e234..f25e2f448f4 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -61,7 +61,10 @@ struct ir3_nop_state { struct ir3_legalize_state { regmask_t needs_ss; + regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */ + regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */ regmask_t needs_ss_war; /* write after read */ + regmask_t needs_ss_scalar_war; /* scalar ALU read -> non-scalar ALU write */ regmask_t needs_sy; bool needs_ss_for_const; @@ -101,6 +104,9 @@ apply_ss(struct ir3_instruction *instr, instr->flags |= IR3_INSTR_SS; regmask_init(&state->needs_ss_war, mergedregs); regmask_init(&state->needs_ss, mergedregs); + regmask_init(&state->needs_ss_scalar_war, mergedregs); + regmask_init(&state->needs_ss_scalar_full, mergedregs); + regmask_init(&state->needs_ss_scalar_half, mergedregs); state->needs_ss_for_const = false; } @@ -114,14 +120,14 @@ apply_sy(struct ir3_instruction *instr, } static bool -count_instruction(struct ir3_instruction *n) +count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler) { /* NOTE: don't count branch/jump since we don't know yet if they will * be eliminated later in resolve_jumps()..
really should do that * earlier so we don't have this constraint. */ - return is_alu(n) || - (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) && + return (is_alu(n) && !is_scalar_alu(n, compiler)) || + (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) && (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO)); } @@ -363,6 +369,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) struct ir3_legalize_state *pstate = &pbd->state; regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss); + regmask_or_shared(&state->needs_ss_scalar_full, + &state->needs_ss_scalar_full, + &pstate->needs_ss_scalar_full); + regmask_or_shared(&state->needs_ss_scalar_half, + &state->needs_ss_scalar_half, + &pstate->needs_ss_scalar_half); + regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war, + &pstate->needs_ss_scalar_war); } memcpy(&bd->state, state, sizeof(*state)); @@ -419,6 +433,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) apply_ss(n, state, mergedregs); } + bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler); + /* NOTE: consider dst register too.. it could happen that * texture sample instruction (for example) writes some * components which are unused. A subsequent instruction @@ -443,6 +459,34 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) last_input_needs_ss = false; } + /* There is a fast feedback path for scalar ALU instructions which + * only takes 1 cycle of latency, similar to the normal 3 cycle + * latency path for ALU instructions. For this fast path the + * producer and consumer must use the same register size (i.e. no + * writing a full register and then reading half of it or vice + * versa). If we don't hit this path, either because of a mismatched + * size or a read via the regular ALU, then the write latency is + * variable and we must use (ss) to wait for the scalar ALU. 
This is + * different from the fixed 6 cycle latency for mismatched vector + * ALU accesses. + */ + if (n_is_scalar_alu) { + /* Check if we have a mismatched size RaW dependency */ + if (regmask_get((reg->flags & IR3_REG_HALF) ? + &state->needs_ss_scalar_half : + &state->needs_ss_scalar_full, reg)) { + apply_ss(n, state, mergedregs); + last_input_needs_ss = false; + } + } else { + /* check if we have a scalar -> vector RaW dependency */ + if (regmask_get(&state->needs_ss_scalar_half, reg) || + regmask_get(&state->needs_ss_scalar_full, reg)) { + apply_ss(n, state, mergedregs); + last_input_needs_ss = false; + } + } + if (regmask_get(&state->needs_sy, reg)) { apply_sy(n, state, mergedregs); } @@ -455,7 +499,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } foreach_dst (reg, n) { - if (regmask_get(&state->needs_ss_war, reg)) { + if (regmask_get(&state->needs_ss_war, reg) || + (!n_is_scalar_alu && + regmask_get(&state->needs_ss_scalar_war, reg))) { apply_ss(n, state, mergedregs); last_input_needs_ss = false; } @@ -483,6 +529,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n && + !n_is_scalar_alu && ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) && (last_n->repeat == 0)) { /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ @@ -528,8 +575,16 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) regmask_set(&state->needs_ss, n->dsts[0]); foreach_dst (dst, n) { - if (dst->flags & IR3_REG_SHARED) - regmask_set(&state->needs_ss, dst); + if (dst->flags & IR3_REG_SHARED) { + if (n_is_scalar_alu) { + if (dst->flags & IR3_REG_HALF) + regmask_set(&state->needs_ss_scalar_full, dst); + else + regmask_set(&state->needs_ss_scalar_half, dst); + } else { + regmask_set(&state->needs_ss, dst); + } + } } if (is_tex_or_prefetch(n)) { @@ -566,17 +621,31 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * their 
src register(s): */ if (is_tex(n) || is_mem(n) || is_ss_producer(n)) { - foreach_src (reg, n) { - regmask_set(&state->needs_ss_war, reg); + if (n_is_scalar_alu) { + /* Scalar ALU also does not immediately read its source because it + * is not executed right away, but scalar ALU instructions are + * executed in-order so subsequent scalar ALU instructions don't + * need to wait for previous ones. + */ + foreach_src (reg, n) { + if (reg->flags & IR3_REG_SHARED) { + regmask_set(&state->needs_ss_scalar_war, reg); + } + } + } else { + foreach_src (reg, n) { + regmask_set(&state->needs_ss_war, reg); + } } } - if (count_instruction(n)) + bool count = count_instruction(n, ctx->compiler); + if (count) cycle += 1; delay_update(state, n, cycle, mergedregs); - if (count_instruction(n)) + if (count) cycle += n->repeat; if (ctx->early_input_release && is_input(n)) { @@ -1496,9 +1565,15 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) rzalloc(ctx, struct ir3_legalize_block_data); regmask_init(&bd->state.needs_ss_war, mergedregs); + regmask_init(&bd->state.needs_ss_scalar_war, mergedregs); + regmask_init(&bd->state.needs_ss_scalar_full, mergedregs); + regmask_init(&bd->state.needs_ss_scalar_half, mergedregs); regmask_init(&bd->state.needs_ss, mergedregs); regmask_init(&bd->state.needs_sy, mergedregs); regmask_init(&bd->begin_state.needs_ss_war, mergedregs); + regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs); + regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs); + regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs); regmask_init(&bd->begin_state.needs_ss, mergedregs); regmask_init(&bd->begin_state.needs_sy, mergedregs); diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index c56829e9025..7ad22504896 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -406,14 +406,15 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state, unsigned d = 0; if 
(src_n >= 0 && dep && state->direction == F) { + struct ir3_compiler *compiler = state->ctx->ir->compiler; /* get the dst_n this corresponds to */ unsigned dst_n = state->dst_n[num]; - unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true); - d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n); + unsigned d_soft = ir3_delayslots(compiler, dep->instr, node->instr, src_n, true); + d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr, dst_n, src_n); node->delay = MAX2(node->delay, d_soft); if (is_sy_producer(dep->instr)) node->has_sy_src = true; - if (is_ss_producer(dep->instr)) + if (needs_ss(compiler, dep->instr, node->instr)) node->has_ss_src = true; } diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index 7fb3f53ca76..a0089e78fed 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -90,6 +90,7 @@ */ struct ir3_sched_ctx { + struct ir3_compiler *compiler; struct ir3_block *block; /* the current block */ struct dag *dag; @@ -173,7 +174,8 @@ struct ir3_sched_node { static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr); -static void sched_node_add_dep(struct ir3_instruction *instr, +static void sched_node_add_dep(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr, struct ir3_instruction *src, int i); static bool @@ -182,10 +184,11 @@ is_scheduled(struct ir3_instruction *instr) return !!(instr->flags & IR3_INSTR_MARK); } -/* check_src_cond() passing a ir3_sched_ctx. */ +/* check_src_cond() passing the user and ir3_sched_ctx. 
*/ static bool sched_check_src_cond(struct ir3_instruction *instr, bool (*cond)(struct ir3_instruction *, + struct ir3_instruction *, struct ir3_sched_ctx *), struct ir3_sched_ctx *ctx) { @@ -197,7 +200,7 @@ sched_check_src_cond(struct ir3_instruction *instr, if (sched_check_src_cond(src, cond, ctx)) return true; } else { - if (cond(src, ctx)) + if (cond(src, instr, ctx)) return true; } } @@ -208,7 +211,8 @@ sched_check_src_cond(struct ir3_instruction *instr, /* Is this a sy producer that hasn't been waited on yet? */ static bool -is_outstanding_sy(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx) +is_outstanding_sy(struct ir3_instruction *instr, struct ir3_instruction *use, + struct ir3_sched_ctx *ctx) { if (!is_sy_producer(instr)) return false; @@ -224,9 +228,10 @@ is_outstanding_sy(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx) } static bool -is_outstanding_ss(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx) +is_outstanding_ss(struct ir3_instruction *instr, struct ir3_instruction *use, + struct ir3_sched_ctx *ctx) { - if (!is_ss_producer(instr)) + if (!needs_ss(ctx->compiler, instr, use)) return false; /* The sched node is only valid within the same block, we cannot @@ -932,7 +937,7 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr, /* don't need to remove old dag edge since old addr is * already scheduled: */ - sched_node_add_dep(indirect, new_addr, 0); + sched_node_add_dep(ctx, indirect, new_addr, 0); di(indirect, "new address"); } } @@ -955,7 +960,8 @@ sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) } static void -sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, +sched_node_add_dep(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr, struct ir3_instruction *src, int i) { /* don't consider dependencies in other blocks: */ @@ -978,8 +984,8 @@ sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, if (instr->opc == 
OPC_META_COLLECT) sn->collect = instr; - unsigned d_soft = ir3_delayslots(src, instr, i, true); - unsigned d = ir3_delayslots(src, instr, i, false); + unsigned d_soft = ir3_delayslots(ctx->compiler, src, instr, i, true); + unsigned d = ir3_delayslots(ctx->compiler, src, instr, i, false); /* delays from (ss) and (sy) are considered separately and more accurately in * the scheduling heuristic, so ignore it when calculating the ip of @@ -1036,7 +1042,7 @@ is_output_only(struct ir3_instruction *instr) } static void -sched_node_add_deps(struct ir3_instruction *instr) +sched_node_add_deps(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { /* There's nothing to do for phi nodes, since they always go first. And * phi nodes can reference sources later in the same block, so handling @@ -1049,7 +1055,7 @@ sched_node_add_deps(struct ir3_instruction *instr) * the DAG easily in a single pass. */ foreach_ssa_src_n (src, i, instr) { - sched_node_add_dep(instr, src, i); + sched_node_add_dep(ctx, instr, src, i); } /* NOTE that all inputs must be scheduled before a kill, so @@ -1098,7 +1104,7 @@ sched_dag_init(struct ir3_sched_ctx *ctx) dag_validate(ctx->dag, sched_dag_validate_cb, NULL); foreach_instr (instr, &ctx->unscheduled_list) - sched_node_add_deps(instr); + sched_node_add_deps(ctx, instr); dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL); } @@ -1234,6 +1240,8 @@ ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx); + ctx->compiler = ir->compiler; + foreach_block (block, &ir->block_list) { foreach_instr (instr, &block->instr_list) { instr->data = NULL;