diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 0cff9fae04e..4945e7a968d 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -107,6 +107,99 @@ collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
 	}
 }
 
+static bool
+should_double_threadsize(struct ir3_shader_variant *v,
+		unsigned regs_count)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	switch (v->type) {
+	case MESA_SHADER_COMPUTE: {
+		unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+
+		/* For a5xx, if the workgroup size is greater than the maximum number
+		 * of threads per core with 32 threads per wave (512) then we have to
+		 * use the doubled threadsize because otherwise the workgroup wouldn't
+		 * fit. For smaller workgroup sizes, we follow the blob and use the
+		 * smaller threadsize.
+		 */
+		if (compiler->gpu_id < 600) {
+			return v->local_size_variable || threads_per_wg >
+				compiler->threadsize_base * compiler->max_waves;
+		}
+
+		/* On a6xx, we prefer the larger threadsize unless the workgroup is
+		 * small enough that it would be useless. Note that because
+		 * threadsize_base is bumped to 64, we don't have to worry about the
+		 * workgroup fitting, unlike the a5xx case.
+		 */
+		if (!v->local_size_variable) {
+			if (threads_per_wg <= compiler->threadsize_base)
+				return false;
+		}
+	}
+	/* fallthrough */
+	case MESA_SHADER_FRAGMENT: {
+		/* Check that doubling the threadsize wouldn't exceed the regfile size */
+		return regs_count * 2 <= compiler->reg_size_vec4;
+	}
+
+	default:
+		/* On a6xx+, it's impossible to use a doubled wavesize in the geometry
+		 * stages - the bit doesn't exist. The blob never used it for the VS
+		 * on earlier gens anyway.
+		 */
+		return false;
+	}
+}
+
+/* Get the maximum number of waves that could be used even if this shader
+ * didn't use any registers.
+ */
+static unsigned
+get_reg_independent_max_waves(struct ir3_shader_variant *v, bool double_threadsize)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	unsigned max_waves = compiler->max_waves;
+
+	/* If this is a compute shader, compute the limit based on shared size */
+	if (v->type == MESA_SHADER_COMPUTE) {
+		/* Shared is allocated in chunks of 1k */
+		unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
+		if (shared_per_wg > 0 && !v->local_size_variable) {
+			unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
+			unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+			unsigned waves_per_wg =
+				DIV_ROUND_UP(threads_per_wg,
+					compiler->threadsize_base *
+					(double_threadsize ? 2 : 1) * compiler->wave_granularity);
+			max_waves =
+				MIN2(max_waves, waves_per_wg * wgs_per_core * compiler->wave_granularity);
+		}
+	}
+
+	/* Compute the limit based on branchstack */
+	if (v->branchstack > 0) {
+		unsigned branchstack_max_waves =
+			compiler->branchstack_size / v->branchstack *
+			compiler->wave_granularity;
+		max_waves = MIN2(max_waves, branchstack_max_waves);
+	}
+
+	return max_waves;
+}
+
+/* Get the maximum number of waves that could be launched limited by reg size.
+ */
+static unsigned
+get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
+		unsigned reg_count, bool double_threadsize)
+{
+	return reg_count ?
+		(compiler->reg_size_vec4 / (reg_count * (double_threadsize ? 2 : 1)) *
+		 compiler->wave_granularity) :
+		compiler->max_waves;
+}
+
 void
 ir3_collect_info(struct ir3_shader_variant *v)
 {
@@ -200,6 +293,20 @@ ir3_collect_info(struct ir3_shader_variant *v)
 			}
 		}
 	}
+
+	/* TODO: for a5xx and below, is there a separate regfile for
+	 * half-registers?
+	 */
+	unsigned regs_count =
+		info->max_reg + 1 + (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
+
+	info->double_threadsize = should_double_threadsize(v, regs_count);
+	unsigned reg_independent_max_waves =
+		get_reg_independent_max_waves(v, info->double_threadsize);
+	unsigned reg_dependent_max_waves =
+		get_reg_dependent_max_waves(compiler, regs_count, info->double_threadsize);
+	info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+	assert(info->max_waves <= v->shader->compiler->max_waves);
 }
 
 static struct ir3_register * reg_create(struct ir3 *shader,
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index c869844fa38..bc11f89b94b 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -64,6 +64,11 @@ struct ir3_info {
 	int8_t max_reg; /* highest GPR # used by shader */
 	int8_t max_half_reg;
 	int16_t max_const;
+	/* This is the maximum # of waves that can be executed at once in one core,
+	 * assuming that they are all executing this shader.
+	 */
+	int8_t max_waves;
+	bool double_threadsize;
 	bool multi_dword_ldp_stp;
 
 	/* number of sync bits: */
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index c27e8bcedfe..ed8b43364c5 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -79,6 +79,13 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 	compiler->gpu_id = gpu_id;
 	compiler->set = ir3_ra_alloc_reg_set(compiler, false);
 
+	/* All known GPUs have 32k local memory (aka shared) */
+	compiler->local_mem_size = 32 * 1024;
+	/* TODO see if older GPUs were different here */
+	compiler->branchstack_size = 64;
+	compiler->wave_granularity = 2;
+	compiler->max_waves = 16;
+
 	if (compiler->gpu_id >= 600) {
 		compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true);
 		compiler->samgq_workaround = true;
@@ -123,6 +130,34 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 		compiler->max_const_safe = 256;
 	}
 
+	if (compiler->gpu_id == 650) {
+		/* This changed mid-generation for a650, so that using r32.x and above
+		 * requires using the smallest threadsize.
+		 */
+		compiler->reg_size_vec4 = 64;
+	} else if (compiler->gpu_id >= 600) {
+		compiler->reg_size_vec4 = 96;
+	} else if (compiler->gpu_id >= 400) {
+		/* On a4xx-a5xx, using r24.x and above requires using the smallest
+		 * threadsize.
+		 */
+		compiler->reg_size_vec4 = 48;
+	} else {
+		/* TODO: confirm this */
+		compiler->reg_size_vec4 = 96;
+	}
+
+	if (compiler->gpu_id >= 600) {
+		compiler->threadsize_base = 64;
+	} else if (compiler->gpu_id >= 400) {
+		/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
+		 * 1.1 subgroupSize which is 32.
+		 */
+		compiler->threadsize_base = 32;
+	} else {
+		compiler->threadsize_base = 8;
+	}
+
 	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
 		compiler->flat_bypass = true;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 54a78f37726..6f7058f37e5 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -107,6 +107,44 @@ struct ir3_compiler {
 	 */
 	uint32_t const_upload_unit;
 
+	/* The base number of threads per wave. Some stages may be able to double
+	 * this.
+	 */
+	uint32_t threadsize_base;
+
+	/* On at least a6xx, waves are always launched in pairs. In calculations
+	 * about occupancy, we pretend that each wave pair is actually one wave,
+	 * which simplifies many of the calculations, but means we have to
+	 * multiply threadsize_base by this number.
+	 */
+	uint32_t wave_granularity;
+
+	/* The maximum number of simultaneous waves per core. */
+	uint32_t max_waves;
+
+	/* This is the theoretical maximum number of vec4 registers that one wave of
+	 * the base threadsize could use. To get the actual size of the register
+	 * file in bytes one would need to compute:
+	 *
+	 *   reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
+	 *
+	 * However, this number is more often what we actually need. For example, a
+	 * max_reg more than half of this will result in a doubled threadsize
+	 * being impossible (because double-sized waves take up twice as many
+	 * registers). Also, the formula for the occupancy given a particular
+	 * register footprint is simpler.
+	 *
+	 * It is in vec4 units because the register file is allocated
+	 * with vec4 granularity, so it's in the same units as max_reg.
+	 */
+	uint32_t reg_size_vec4;
+
+	/* The size of local memory in bytes */
+	uint32_t local_mem_size;
+
+	/* The number of total branch stack entries, divided by wave_granularity. */
+	uint32_t branchstack_size;
+
 	/* Whether clip+cull distances are supported */
 	bool has_clip_cull;
 