nv50: flatten simple IF/ELSE/ENDIF constructs
Less branching means fewer instructions and less thread divergence.
This commit is contained in:
@@ -125,6 +125,20 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
|
||||
}
|
||||
}
|
||||
|
||||
/* Return whether this instruction can be executed conditionally. */
|
||||
boolean
|
||||
nv50_nvi_can_predicate(struct nv_instruction *nvi)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (nvi->flags_src)
|
||||
return FALSE;
|
||||
for (i = 0; i < 4 && nvi->src[i]; ++i)
|
||||
if (nvi->src[i]->value->reg.file == NV_FILE_IMM)
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
ubyte
|
||||
nv50_supported_src_mods(uint opcode, int s)
|
||||
{
|
||||
|
||||
@@ -432,6 +432,7 @@ void nv_print_program(struct nv_basic_block *b);
|
||||
boolean nv_op_commutative(uint opcode);
|
||||
int nv50_indirect_opnd(struct nv_instruction *);
|
||||
boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s);
|
||||
boolean nv50_nvi_can_predicate(struct nv_instruction *);
|
||||
boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *);
|
||||
ubyte nv50_supported_src_mods(uint opcode, int s);
|
||||
int nv_nvi_refcount(struct nv_instruction *);
|
||||
|
||||
@@ -119,6 +119,15 @@ nvi_isnop(struct nv_instruction *nvi)
|
||||
return values_equal(nvi->def[0], nvi->src[0]->value);
|
||||
}
|
||||
|
||||
/* Context handed to the nv_pass_* functions while they walk basic blocks. */
struct nv_pass {
|
||||
struct nv_pc *pc; /* program being processed; filled in by the nv_pc_exec_pass* entry points */
|
||||
int n; /* general-purpose counter; zeroed by nv_pc_exec_pass0 in this excerpt */
|
||||
void *priv; /* opaque pass-private pointer; no use is visible in this excerpt */
|
||||
};
|
||||
|
||||
static int
|
||||
nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b);
|
||||
|
||||
static void
|
||||
nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
|
||||
{
|
||||
@@ -204,6 +213,13 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
|
||||
int
|
||||
nv_pc_exec_pass2(struct nv_pc *pc)
|
||||
{
|
||||
struct nv_pass pass;
|
||||
|
||||
pass.pc = pc;
|
||||
|
||||
pc->pass_seq++;
|
||||
nv_pass_flatten(&pass, pc->root);
|
||||
|
||||
debug_printf("preparing %u blocks for emission\n", pc->num_blocks);
|
||||
|
||||
pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
|
||||
@@ -273,12 +289,6 @@ check_swap_src_0_1(struct nv_instruction *nvi)
|
||||
nvi->set_cond = cc_swapped[nvi->set_cond];
|
||||
}
|
||||
|
||||
/* Context handed to the nv_pass_* functions while they walk basic blocks. */
struct nv_pass {
|
||||
struct nv_pc *pc; /* program being processed; filled in by the nv_pc_exec_pass* entry points */
|
||||
int n; /* general-purpose counter; zeroed by nv_pc_exec_pass0 in this excerpt */
|
||||
void *priv; /* opaque pass-private pointer; no use is visible in this excerpt */
|
||||
};
|
||||
|
||||
static int
|
||||
nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
|
||||
{
|
||||
@@ -863,24 +873,95 @@ nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE.
 * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with
 * BREAK and dummy ELSE block.
 *
 * NOTE(review): this excerpt is a rendered diff, so two function names and
 * two bodies appear back to back: bb_simple_if_endif with a single return
 * expression, and bb_is_if_else_endif with the two-way check below.
 * Presumably the former is the pre-commit version and the latter its
 * replacement -- confirm against the real file.
 */
|
||||
static INLINE boolean
|
||||
bb_simple_if_endif(struct nv_basic_block *bb)
|
||||
bb_is_if_else_endif(struct nv_basic_block *bb)
|
||||
{
|
||||
/* Single-expression form: both successors exist, out[0] leads straight
 * into out[1] and has no second successor.
 */
return (bb->out[0] && bb->out[1] &&
|
||||
bb->out[0]->out[0] == bb->out[1] &&
|
||||
!bb->out[0]->out[1]);
|
||||
/* Two-way form: a block without two successors never qualifies. */
if (!bb->out[0] || !bb->out[1])
|
||||
return FALSE;
|
||||
|
||||
/* If out[0]'s first outgoing edge is a CFG_EDGE_LOOP_LEAVE edge, its
 * second successor is the one compared against out[1]'s first successor,
 * and only out[1] is required to have no second successor.
 */
if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) {
|
||||
return (bb->out[0]->out[1] == bb->out[1]->out[0] &&
|
||||
!bb->out[1]->out[1]);
|
||||
} else {
|
||||
/* Otherwise both successors must share the same first successor and
 * neither may have a second one.
 */
return (bb->out[0]->out[0] == bb->out[1]->out[0] &&
|
||||
!bb->out[0]->out[1] &&
|
||||
!bb->out[1]->out[1]);
|
||||
}
|
||||
}
|
||||
|
||||
/* predicate instructions and remove branch at the end */
|
||||
static void
|
||||
predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b,
|
||||
struct nv_value *p, ubyte cc)
|
||||
{
|
||||
struct nv_instruction *nvi;
|
||||
|
||||
if (!b->entry)
|
||||
return;
|
||||
for (nvi = b->entry; nvi->next; nvi = nvi->next) {
|
||||
if (!nvi_isnop(nvi)) {
|
||||
nvi->cc = cc;
|
||||
nv_reference(pc, &nvi->flags_src, p);
|
||||
}
|
||||
}
|
||||
|
||||
if (nvi->opcode == NV_OP_BRA)
|
||||
nv_nvi_delete(nvi);
|
||||
else
|
||||
if (!nvi_isnop(nvi)) {
|
||||
nvi->cc = cc;
|
||||
nv_reference(pc, &nvi->flags_src, p);
|
||||
}
|
||||
}
|
||||
|
||||
/* Flatten IF/ELSE/ENDIF constructs starting at @b by predicating both
 * branches on the condition of the block's exit instruction, then continue
 * with the other blocks via DESCEND_ARBITRARY. Always returns 0.
 *
 * NOTE: Run this after register allocation, we can just cut out the cflow
 * instructions and hook the predicates to the conditional OPs if they are
 * not using immediates; better than inserting SELECT to join definitions.
 *
 * NOTE: Should adapt prior optimization to make this possible more often.
 *
 * NOTE(review): this excerpt is a rendered diff, so some statements appear
 * in two variants (two opening `if` lines, `int j` / `int i`, two
 * DESCEND_ARBITRARY calls). Presumably one of each pair is the removed
 * pre-commit line -- confirm against the real file.
 */
|
||||
static int
|
||||
nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
|
||||
{
|
||||
int j;
|
||||
struct nv_instruction *nvi;
|
||||
struct nv_value *pred;
|
||||
int i;
|
||||
int n0 = 0, n1 = 0; /* instructions counted in b->out[0] / b->out[1] */
|
||||
|
||||
/* Variant that only counts matching constructs in ctx->n and logs the total. */
if (bb_simple_if_endif(b)) {
|
||||
++ctx->n;
|
||||
debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n);
|
||||
/* Variant that attempts the actual flattening. */
if (bb_is_if_else_endif(b)) {
|
||||
|
||||
debug_printf("nv_pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);
|
||||
|
||||
/* Scan the first branch; nvi stays non-NULL at the first instruction
 * that cannot be predicated, and n0 counts the instructions before it.
 */
for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0)
|
||||
if (!nv50_nvi_can_predicate(nvi))
|
||||
break;
|
||||
/* The second branch is scanned only if the first one passed completely. */
if (!nvi) {
|
||||
for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1)
|
||||
if (!nv50_nvi_can_predicate(nvi))
|
||||
break;
|
||||
if (nvi) {
|
||||
debug_printf("cannot predicate: "); nv_print_instruction(nvi);
|
||||
}
|
||||
} else {
|
||||
debug_printf("cannot predicate: "); nv_print_instruction(nvi);
|
||||
}
|
||||
|
||||
/* Flatten only if every instruction in both branches passed (nvi is NULL)
 * and each branch is short enough.
 */
if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */
|
||||
/* The predicate is the value feeding the flags source of b's exit instruction. */
assert(b->exit && b->exit->flags_src);
|
||||
pred = b->exit->flags_src->value;
|
||||
|
||||
/* The two branches get different condition codes on the same predicate value. */
predicate_instructions(ctx->pc, b->out[0], pred, NV_CC_NE | NV_CC_U);
|
||||
predicate_instructions(ctx->pc, b->out[1], pred, NV_CC_EQ);
|
||||
|
||||
/* With both sides predicated, the branch ending b is deleted. */
assert(b->exit && b->exit->opcode == NV_OP_BRA);
|
||||
nv_nvi_delete(b->exit);
|
||||
}
|
||||
}
|
||||
/* Continue with the remaining blocks; DESCEND_ARBITRARY is defined outside this excerpt. */
DESCEND_ARBITRARY(j, nv_pass_flatten);
|
||||
DESCEND_ARBITRARY(i, nv_pass_flatten);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -960,11 +1041,6 @@ nv_pc_exec_pass0(struct nv_pc *pc)
|
||||
pass.n = 0;
|
||||
pass.pc = pc;
|
||||
|
||||
pc->pass_seq++;
|
||||
ret = nv_pass_flatten(&pass, pc->root);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* Do this first, so we don't have to pay attention
|
||||
* to whether sources are supported memory loads.
|
||||
*/
|
||||
|
||||
@@ -591,7 +591,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
|
||||
|
||||
|
||||
static struct nv_value *
|
||||
bld_predicate(struct bld_context *bld, struct nv_value *src)
|
||||
bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only)
|
||||
{
|
||||
struct nv_instruction *nvi = src->insn;
|
||||
|
||||
@@ -600,6 +600,14 @@ bld_predicate(struct bld_context *bld, struct nv_value *src)
|
||||
nvi->bb != bld->pc->current_block) {
|
||||
nvi = new_instruction(bld->pc, NV_OP_CVT);
|
||||
nv_reference(bld->pc, &nvi->src[0], src);
|
||||
} else
|
||||
if (bool_only) {
|
||||
while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_CVT ||
|
||||
nvi->opcode == NV_OP_NEG) {
|
||||
/* TGSI SET gets conversion to f32, we only need source 0/~0 */
|
||||
if (!nvi->def[0]->insn->flags_src)
|
||||
nvi = nvi->src[0]->value->insn;
|
||||
}
|
||||
}
|
||||
|
||||
if (!nvi->flags_def) {
|
||||
@@ -614,7 +622,7 @@ bld_kil(struct bld_context *bld, struct nv_value *src)
|
||||
{
|
||||
struct nv_instruction *nvi;
|
||||
|
||||
src = bld_predicate(bld, src);
|
||||
src = bld_predicate(bld, src, FALSE);
|
||||
nvi = new_instruction(bld->pc, NV_OP_KIL);
|
||||
nvi->fixed = 1;
|
||||
nvi->flags_src = new_ref(bld->pc, src);
|
||||
@@ -1223,7 +1231,7 @@ bld_instruction(struct bld_context *bld,
|
||||
src0 = emit_fetch(bld, insn, 0, c);
|
||||
src1 = emit_fetch(bld, insn, 1, c);
|
||||
src2 = emit_fetch(bld, insn, 2, c);
|
||||
src0 = bld_predicate(bld, src0);
|
||||
src0 = bld_predicate(bld, src0, FALSE);
|
||||
|
||||
src1 = bld_insn_1(bld, NV_OP_MOV, src1);
|
||||
src1->insn->flags_src = new_ref(bld->pc, src0);
|
||||
@@ -1304,7 +1312,7 @@ bld_instruction(struct bld_context *bld,
|
||||
bld->join_bb[bld->cond_lvl] = bld->pc->current_block;
|
||||
bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
|
||||
|
||||
src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0));
|
||||
src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE);
|
||||
|
||||
bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user