diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 6403569695a..a6e6c1b21fa 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -2066,17 +2066,6 @@ panfrost_emit_varying(const struct panfrost_device *dev, /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time, * rather than draw time (under good conditions). */ -struct pan_linkage { - /* Uploaded attribute descriptors */ - mali_ptr producer, consumer; - - /* Varyings buffers required */ - uint32_t present; - - /* Per-vertex stride for general varying buffer */ - uint32_t stride; -}; - static void panfrost_emit_varying_descs( struct pan_pool *pool, @@ -2100,6 +2089,12 @@ panfrost_emit_varying_descs( struct panfrost_ptr T = panfrost_pool_alloc_desc_array(pool, producer_count + consumer_count, ATTRIBUTE); + /* Take a reference if we're being put on the CSO */ + if (!pool->owned) { + out->bo = pool->transient_bo; + panfrost_bo_reference(out->bo); + } + struct mali_attribute_packed *descs = T.cpu; out->producer = producer_count ? T.gpu : 0; out->consumer = consumer_count ? 
T.gpu + @@ -2177,7 +2172,6 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); struct panfrost_shader_state *vs, *fs; - struct pan_linkage linkage; vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); @@ -2188,11 +2182,28 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, if (!point_coord_replace || pan_is_bifrost(dev)) point_coord_mask = 0; - /* Emit ATTRIBUTE descriptors */ - panfrost_emit_varying_descs(&batch->pool, vs, fs, &ctx->streamout, point_coord_mask, &linkage); + /* In good conditions, we only need to link varyings once */ + bool prelink = + (point_coord_mask == 0) && + (ctx->streamout.num_targets == 0) && + !vs->info.separable && + !fs->info.separable; + + /* Try to reduce copies */ + struct pan_linkage _linkage; + struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage; + + /* Emit ATTRIBUTE descriptors if needed */ + if (!prelink || vs->linkage.bo == NULL) { + struct pan_pool *pool = + prelink ? 
&ctx->descs : &batch->pool; + + panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage); + } struct pipe_stream_output_info *so = &vs->stream_output; - unsigned xfb_base = pan_xfb_base(linkage.present); + unsigned present = linkage->present, stride = linkage->stride; + unsigned xfb_base = pan_xfb_base(present); struct panfrost_ptr T = panfrost_pool_alloc_desc_array(&batch->pool, xfb_base + @@ -2220,30 +2231,30 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, } panfrost_emit_varyings(batch, - &varyings[pan_varying_index(linkage.present, PAN_VARY_GENERAL)], - linkage.stride, vertex_count); + &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], + stride, vertex_count); /* fp32 vec4 gl_Position */ *position = panfrost_emit_varyings(batch, - &varyings[pan_varying_index(linkage.present, PAN_VARY_POSITION)], + &varyings[pan_varying_index(present, PAN_VARY_POSITION)], sizeof(float) * 4, vertex_count); - if (linkage.present & BITFIELD_BIT(PAN_VARY_PSIZ)) { + if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) { *psiz = panfrost_emit_varyings(batch, - &varyings[pan_varying_index(linkage.present, PAN_VARY_PSIZ)], + &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2, vertex_count); } - pan_emit_special_input(varyings, linkage.present, + pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD); - pan_emit_special_input(varyings, linkage.present, PAN_VARY_FACE, + pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING); - pan_emit_special_input(varyings, linkage.present, PAN_VARY_FRAGCOORD, + pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD); *buffers = T.gpu; - *vs_attribs = linkage.producer; - *fs_attribs = linkage.consumer; + *vs_attribs = linkage->producer; + *fs_attribs = linkage->consumer; } void diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 
b120707b246..74f12bcecd8 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -977,6 +977,7 @@ panfrost_delete_shader_state( struct panfrost_shader_state *shader_state = &cso->variants[i]; panfrost_bo_unreference(shader_state->bin.bo); panfrost_bo_unreference(shader_state->state.bo); + panfrost_bo_unreference(shader_state->linkage.bo); } free(cso->variants); diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index d16ae59f469..1b783834a76 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -235,6 +235,23 @@ struct panfrost_rasterizer { struct mali_stencil_mask_misc_packed stencil_misc; }; +/* Result of linking vertex shader varyings against a fragment shader: the uploaded ATTRIBUTE descriptors plus the buffer layout they assume */ +struct pan_linkage { + /* BO backing the descriptors when the upload is owned by the CSO + * instead of the pool; a reference is held on it and dropped in + * panfrost_delete_shader_state. NULL means no CSO-owned upload exists yet (the draw path tests this to decide whether to link). NOTE(review): the stack-local linkage used for non-prelinked draws never writes this field. */ + struct panfrost_bo *bo; + + /* GPU addresses of the uploaded ATTRIBUTE descriptors, handed to the vertex (producer) and fragment (consumer) stages */ + mali_ptr producer, consumer; + + /* Bitmask of the PAN_VARY_* varying buffers required */ + uint32_t present; + + /* Per-vertex stride of the general (PAN_VARY_GENERAL) varying buffer */ + uint32_t stride; +}; + /* Variants bundle together to form the backing CSO, bundling multiple * shaders with varying emulated features baked in */ @@ -251,6 +268,9 @@ struct panfrost_shader_state { struct pan_shader_info info; + /* Cached varying linkage, reused across draws. Only filled in on the vertex shader state, and only when both stages are non-separable, no point coord replacement applies and no stream-out targets are bound */ + struct pan_linkage linkage; + struct pipe_stream_output_info stream_output; uint64_t so_mask; diff --git a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h index e53ec3fe3a9..c98a2b3d449 100644 --- a/src/panfrost/lib/pan_pool.h +++ b/src/panfrost/lib/pan_pool.h @@ -77,7 +77,8 @@ struct pan_pool_ref { static inline struct pan_pool_ref pan_take_ref(struct pan_pool *pool, mali_ptr ptr) { - panfrost_bo_reference(pool->transient_bo); + if (!pool->owned) + panfrost_bo_reference(pool->transient_bo); return (struct pan_pool_ref) { .gpu = ptr, diff --git a/src/panfrost/lib/pan_shader.c 
b/src/panfrost/lib/pan_shader.c index 0c2d2a2efa7..b4b30eee6ff 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -174,6 +174,7 @@ pan_shader_compile(const struct panfrost_device *dev, info->stage = s->info.stage; info->contains_barrier = s->info.uses_memory_barrier || s->info.uses_control_barrier; + info->separable = s->info.separate_shader; switch (info->stage) { case MESA_SHADER_VERTEX: diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index f317df75927..f8fa8c0e165 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -190,6 +190,7 @@ struct pan_shader_info { } vs; }; + bool separable; bool contains_barrier; bool writes_global; uint64_t outputs_written;