diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index e4179575b8d..69984faffc7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -780,7 +780,7 @@ struct si_streamout_target { struct si_resource *buf_filled_size; unsigned buf_filled_size_offset; unsigned buf_filled_size_draw_count_offset; - bool buf_filled_size_valid; + bool buf_filled_size_valid; /* only for legacy streamout */ unsigned stride; }; diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index b1ed4f11f7c..1dc7970ac07 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -142,41 +142,52 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ if (sctx->gfx_level >= GFX12) { bool first_target = util_bitcount(enabled_mask) == 1; - /* The first enabled streamout target will contain the ordered ID/offset buffer for all - * targets. + /* The first enabled streamout target allocates the ordered ID/offset buffer for all + * targets. The other targets only hold the reference to the buffer because they need + * it for glDrawTransformFeedbackStream if stream != 0. */ - if (first_target && !append_bitmask) { - /* The layout is: - * struct { - * struct { - * uint32_t ordered_id; // equal for all buffers - * uint32_t dwords_written; - * } buffer[4]; - * }; - * - * The buffer must be initialized to 0 and the address must be aligned to 64 - * because it's faster when the atomic doesn't straddle a 64B block boundary. - */ - unsigned alloc_size = 32; - unsigned alignment = 64; - - si_resource_reference(&t->buf_filled_size, NULL); - u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, alignment, - &t->buf_filled_size_offset, - (struct pipe_resource **)&t->buf_filled_size); - - /* Offset to dwords_written of the first enabled streamout buffer. */ - t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset + i * 8 + 4; - } - if (first_target) { + /* If not appending, we need to reset the buffer. */ + if (!append_bitmask) { + /* The layout is: + * struct { + * struct { + * uint32_t ordered_id; // equal for all buffers + * uint32_t dwords_written; // it's actually in bytes + * } buffer[4]; + * }; + * + * The buffer must be initialized to 0 and the address must be aligned to 64 + * because it's faster when the atomic doesn't straddle a 64B block boundary. + */ + unsigned alloc_size = 32; + unsigned alignment = 64; + + si_resource_reference(&t->buf_filled_size, NULL); + u_suballocator_alloc(&sctx->allocator_zeroed_memory, alloc_size, alignment, + &t->buf_filled_size_offset, + (struct pipe_resource **)&t->buf_filled_size); + } + + /* Bind the buffer to the shader for global_atomic_ordered_add_b64. */ struct pipe_shader_buffer sbuf; sbuf.buffer = &t->buf_filled_size->b.b; sbuf.buffer_offset = t->buf_filled_size_offset; sbuf.buffer_size = 32; /* unused, the shader only uses the low 32 bits of the address */ si_set_internal_shader_buffer(sctx, SI_STREAMOUT_STATE_BUF, &sbuf); + } else { + /* All other streamout targets use the same buffer as the first one. */ + struct si_streamout_target *first = sctx->streamout.targets[ffs(enabled_mask) - 1]; + + assert(first != t); + assert(first->buf_filled_size); + si_resource_reference(&t->buf_filled_size, first->buf_filled_size); + t->buf_filled_size_offset = first->buf_filled_size_offset; } + + /* Offset to dwords_written of the streamout buffer. */ + t->buf_filled_size_draw_count_offset = t->buf_filled_size_offset + i * 8 + 4; } else { /* GFX6-11 */ if (!t->buf_filled_size) {