turnip,freedreno/a6xx: tell hw the size of shared mem used by CS

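Previously the register was hard-coded to 0x41, so only 2k of shared
memory was made available to compute shaders regardless of how much
the shader actually required.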

It was found that the 5 lowest bits of SP_CS_UNKNOWN_A9B1 control
the size of shared memory available to compute shaders:
AVAILABLE_SIZE = (SP_CS_UNKNOWN_A9B1_SHARED_SIZE + 1) * 1k,
up to 32k. As a special case, SP_CS_UNKNOWN_A9B1_SHARED_SIZE = 0
also enables all 32k of shared memory.

Fixes tests:
 dEQP-VK.rasterization.line_continuity.line-strip
 dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp
 dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_nonlocal.workgroup.guard_local.buffer.comp
 dEQP-VK.memory_model.write_after_read.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9157>
This commit is contained in:
Danylo Piliaiev
2021-02-15 13:14:56 +02:00
parent dab845d457
commit 0fa7ec1473
7 changed files with 27 additions and 18 deletions
@@ -49,9 +49,6 @@ dEQP-VK.image.subresource_layout.3d.all_levels.a8b8g8r8_snorm_pack32,Fail
dEQP-VK.image.subresource_layout.3d.all_levels.r16g16b16a16_snorm,Fail
dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp,Fail
dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.image.comp,Fail
dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp,Fail
dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_nonlocal.workgroup.guard_local.buffer.comp,Fail
dEQP-VK.memory_model.write_after_read.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp,Fail
dEQP-VK.memory.requirements.dedicated_allocation.buffer.regular,Fail
dEQP-VK.memory.requirements.dedicated_allocation.image.transient_tiling_optimal,Fail
dEQP-VK.pipeline.extended_dynamic_state.after_pipelines.depth_compare_greater_equal_greater,Fail
@@ -66,7 +63,6 @@ dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_combined_image_sampl
dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_sampled_image,Crash
dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_sampler,Crash
dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_storage_image,Crash
dEQP-VK.rasterization.line_continuity.line-strip,Fail
dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail
dEQP-VK.spirv_assembly.instruction.compute.opquantize.infinities,Fail
dEQP-VK.spirv_assembly.instruction.graphics.opquantize.carry_bit_geom,Fail
+2 -2
View File
@@ -7412,7 +7412,7 @@ clusters:
00000080 SP_FS_TEX_COUNT: 128
0000f000 SP_UNKNOWN_A9A8: 0xf000
00421800 SP_CS_CTRL_REG0: { THREADMODE = MULTI | HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 48 | BRANCHSTACK = 8 | THREADSIZE = THREAD64 | VARYING }
0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE_2K | UNK1 = 0xf }
0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE = 31 }
00000000 SP_CS_BRANCH_COND: 0
00000000 SP_CS_OBJ_FIRST_EXEC_OFFSET: 0
8c415420 SP_CS_OBJ_START: 0x8c415420
@@ -7494,7 +7494,7 @@ clusters:
00000080 SP_FS_TEX_COUNT: 128
0000f000 SP_UNKNOWN_A9A8: 0xf000
00421800 SP_CS_CTRL_REG0: { THREADMODE = MULTI | HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 48 | BRANCHSTACK = 8 | THREADSIZE = THREAD64 | VARYING }
0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE_2K | UNK1 = 0xf }
0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE = 31 }
00000000 SP_CS_BRANCH_COND: 0
00000000 SP_CS_OBJ_FIRST_EXEC_OFFSET: 0
8c415420 SP_CS_OBJ_START: 0x8c415420
+1
View File
@@ -3413,6 +3413,7 @@ emit_instructions(struct ir3_context *ctx)
ctx->s->info.clip_distance_array_size;
ctx->so->pvtmem_size = ctx->s->scratch_size;
ctx->so->shared_size = ctx->s->shared_size;
/* NOTE: need to do something more clever when we support >1 fxn */
nir_foreach_register (reg, &fxn->registers) {
+3
View File
@@ -559,6 +559,9 @@ struct ir3_shader_variant {
/* Whether we should use the new per-wave layout rather than per-fiber. */
bool pvtmem_per_wave;
/* Size in bytes of required shared memory */
unsigned shared_size;
/* About Linkage:
* + Let the frag shader determine the position/compmask for the
* varyings, since it is the place where we know if the varying
+15 -10
View File
@@ -3059,17 +3059,22 @@ to upconvert to 32b float internally?
<reg32 offset="0xa9b0" name="SP_CS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0"/>
<!-- set for compute shaders, always 0x41 -->
<!-- set for compute shaders -->
<reg32 offset="0xa9b1" name="SP_CS_UNKNOWN_A9B1">
<doc>
bit 0 seems to toggle between 2k and 32k of shared storage
the ldl/stl offset seems to be rewritten to 0 when it is beyond
this limit. This is different from ldlw/stlw, which wraps at
64k (and has 36k of storage on A640 - reads between 36k-64k
always return 0)
</doc>
<bitfield name="SHARED_SIZE_2K" pos="0" type="boolean"/>
<bitfield name="UNK1" low="1" high="6" type="uint"/>
<bitfield name="SHARED_SIZE" low="0" high="4" type="uint">
<doc>
If 0 - all 32k of shared storage is enabled, otherwise
(SHARED_SIZE + 1) * 1k is enabled.
The ldl/stl offset seems to be rewritten to 0 when it is beyond
this limit. This is different from ldlw/stlw, which wraps at
64k (and has 36k of storage on A640 - reads between 36k-64k
always return 0)
</doc>
</bitfield>
<bitfield name="UNK5" pos="5" type="boolean"/>
<!-- always 1 ? -->
<bitfield name="UNK6" pos="6" type="boolean"/>
</reg32>
<reg32 offset="0xa9b2" name="SP_CS_BRANCH_COND" type="hex"/>
<reg32 offset="0xa9b3" name="SP_CS_OBJ_FIRST_EXEC_OFFSET" type="uint"/>
+3 -1
View File
@@ -532,8 +532,10 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
tu_cs_emit(cs, 0x41);
tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
uint32_t local_invocation_id =
ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
@@ -77,8 +77,10 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));
uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
OUT_RING(ring, 0x41);
OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
uint32_t local_invocation_id, work_group_id;
local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);