diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c index 4024a10b114..75b8dbfe107 100644 --- a/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -181,3 +181,169 @@ bi_opt_push_ubo(bi_context *ctx) free(analysis.blocks); } + +typedef struct { + BITSET_DECLARE(row, PAN_MAX_PUSH); +} adjacency_row; + +/* Find the connected component containing `node` with depth-first search */ +static void +bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited, + unsigned *component, unsigned *size, unsigned node) +{ + unsigned neighbour; + + BITSET_SET(visited, node); + component[(*size)++] = node; + + BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) { + if (!BITSET_TEST(visited, neighbour)) { + bi_find_component(adjacency, visited, component, size, + neighbour); + } + } +} + +static bool +bi_is_uniform(bi_index idx) +{ + return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM); +} + +/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */ +static unsigned +bi_uniform_word(bi_index idx) +{ + assert(bi_is_uniform(idx)); + assert(idx.offset <= 1); + + return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset; +} + +/* + * Create an undirected graph where nodes are 32-bit uniform indices and edges + * represent that two nodes are used in the same instruction. + * + * The graph is constructed as an adjacency matrix stored in adjacency. + */ +static void +bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency) +{ + bi_foreach_instr_global(ctx, I) { + unsigned nodes[BI_MAX_SRCS] = {}; + unsigned node_count = 0; + + /* Set nodes[] to 32-bit uniforms accessed */ + bi_foreach_src(I, s) { + if (bi_is_uniform(I->src[s])) { + unsigned word = bi_uniform_word(I->src[s]); + + if (word >= ctx->info.push_offset) + nodes[node_count++] = word; + } + } + + /* Create clique connecting nodes[] */ + for (unsigned i = 0; i < node_count; ++i) { + for (unsigned j = 0; j < node_count; ++j) { + if (i == j) + continue; + + unsigned x = nodes[i], y = nodes[j]; + assert(MAX2(x, y) < ctx->info.push->count); + + /* Add undirected edge between the nodes */ + BITSET_SET(adjacency[x].row, y); + BITSET_SET(adjacency[y].row, x); + } + } + } +} + +/* + * Optimization pass to reorder uniforms. The goal is to reduce the number of + * moves we emit when lowering FAU. The pass groups uniforms used by the same + * instruction. + * + * The pass works by creating a graph of pushed uniforms, where edges denote the + * "both 32-bit uniforms required by the same instruction" relationship. We + * perform depth-first search on this graph to find the connected components, + * where each connected component is a cluster of uniforms that are used + * together. We then select pairs of uniforms from each connected component. + * The remaining unpaired uniforms (from components of odd sizes) are paired + * together arbitrarily. + * + * After a new ordering is selected, pushed uniforms in the program and the + * panfrost_ubo_push data structure must be remapped to use the new ordering. + */ +void +bi_opt_reorder_push(bi_context *ctx) +{ + adjacency_row adjacency[PAN_MAX_PUSH] = { 0 }; + BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 }; + + unsigned ordering[PAN_MAX_PUSH] = { 0 }; + unsigned unpaired[PAN_MAX_PUSH] = { 0 }; + unsigned pushed = 0, unpaired_count = 0; + + struct panfrost_ubo_push *push = ctx->info.push; + unsigned push_offset = ctx->info.push_offset; + + bi_create_fau_interference_graph(ctx, adjacency); + + for (unsigned i = push_offset; i < push->count; ++i) { + if (BITSET_TEST(visited, i)) continue; + + unsigned component[PAN_MAX_PUSH] = { 0 }; + unsigned size = 0; + bi_find_component(adjacency, visited, component, &size, i); + + /* If there is an odd number of uses, at least one use must be + * unpaired. Arbitrarily take the last one. + */ + if (size % 2) + unpaired[unpaired_count++] = component[--size]; + + /* The rest of uses are paired */ + assert((size % 2) == 0); + + /* Push the paired uses */ + memcpy(ordering + pushed, component, sizeof(unsigned) * size); + pushed += size; + } + + /* Push unpaired nodes at the end */ + memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count); + pushed += unpaired_count; + + /* Ordering is a permutation. Invert it for O(1) lookup. */ + unsigned old_to_new[PAN_MAX_PUSH] = { 0 }; + + for (unsigned i = 0; i < push_offset; ++i) { + old_to_new[i] = i; + } + + for (unsigned i = 0; i < pushed; ++i) { + assert(ordering[i] >= push_offset); + old_to_new[ordering[i]] = push_offset + i; + } + + /* Use new ordering throughout the program */ + bi_foreach_instr_global(ctx, I) { + bi_foreach_src(I, s) { + if (bi_is_uniform(I->src[s])) { + unsigned node = bi_uniform_word(I->src[s]); + unsigned new_node = old_to_new[node]; + I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1); + I->src[s].offset = new_node & 1; + } + } + } + + /* Use new ordering for push */ + struct panfrost_ubo_push old = *push; + for (unsigned i = 0; i < pushed; ++i) + push->words[push_offset + i] = old.words[ordering[i]]; + + push->count = push_offset + pushed; +} diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 42cb08248a5..64f2e3fe4d3 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -3981,6 +3981,7 @@ bi_compile_variant_nir(nir_shader *nir, bi_opt_dead_code_eliminate(ctx); bi_opt_cse(ctx); bi_opt_dead_code_eliminate(ctx); + bi_opt_reorder_push(ctx); bi_validate(ctx, "Optimization passes"); } @@ -4060,7 +4061,8 @@ bi_compile_variant(nir_shader *nir, .push = &info->push, .bifrost = &info->bifrost, .tls_size = info->tls_size, - .sysvals = &info->sysvals + .sysvals = &info->sysvals, + .push_offset = info->push.count }; unsigned offset = binary->size; diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 86a8548c3dc..cc37c661c2f 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -682,6 +682,7 @@ struct bi_shader_info { struct panfrost_sysvals *sysvals; unsigned tls_size; unsigned work_reg_count; + unsigned push_offset; }; /* State of index-driven vertex shading for current shader */ @@ -986,6 +987,7 @@ void bi_opt_dead_code_eliminate(bi_context *ctx); void bi_opt_fuse_dual_texture(bi_context *ctx); void bi_opt_dce_post_ra(bi_context *ctx); void bi_opt_push_ubo(bi_context *ctx); +void bi_opt_reorder_push(bi_context *ctx); void bi_lower_swizzle(bi_context *ctx); void bi_lower_fau(bi_context *ctx); void bi_assign_scoreboard(bi_context *ctx);