zink/egl: performance optimizations for KosmicKrisp on Metal

- Disable implicit_sync for KosmicKrisp to avoid per-frame GPU stall
- Cache drawable size instead of dispatch_sync to main thread
- Remove debug crash handler overhead
This commit is contained in:
Luca Mignatti
2025-12-31 13:05:19 -06:00
parent 03a56f0032
commit f9624417ea
6 changed files with 58 additions and 84 deletions

View File

@@ -43,34 +43,11 @@
#include "loader_dri_helper.h"
#if defined(__APPLE__) && defined(VK_USE_PLATFORM_METAL_EXT)
#include <execinfo.h>
#include <pthread.h>
#include <signal.h>
#include <dispatch/dispatch.h>
#include <objc/message.h>
#include <objc/runtime.h>
#include <vulkan/vulkan_metal.h>
/* Fatal-signal handler: dump a backtrace to stderr, then restore the
 * default disposition and re-raise so the process still terminates with
 * the original signal (preserving core dumps and the exit status).
 *
 * Only async-signal-safe calls are made here.  The previous version
 * used fprintf(), which is NOT async-signal-safe (CERT SIG30-C) and can
 * deadlock if the signal arrives while another thread holds the stdio
 * lock; write() and backtrace_symbols_fd() are safe.
 */
static void
crash_handler(int sig)
{
   void *array[50];
   int size = backtrace(array, 50);

   /* Render the signal number by hand: snprintf() is not
    * async-signal-safe either.  Digits come out least-significant
    * first, so reverse them afterwards. */
   char num[16];
   int len = 0;
   unsigned v = (unsigned)sig;
   do {
      num[len++] = (char)('0' + v % 10);
      v /= 10;
   } while (v != 0 && len < (int)sizeof(num));
   for (int i = 0, j = len - 1; i < j; i++, j--) {
      char tmp = num[i];
      num[i] = num[j];
      num[j] = tmp;
   }

   static const char head[] = "\n\n=== CRASH HANDLER: Signal ";
   static const char mid[] = " ===\nStack trace:\n";
   static const char tail[] = "=== END STACK TRACE ===\n\n";

   /* Nothing useful can be done if write() fails inside a signal
    * handler, so the results are deliberately ignored. */
   (void)!write(STDERR_FILENO, head, sizeof(head) - 1);
   (void)!write(STDERR_FILENO, num, (size_t)len);
   (void)!write(STDERR_FILENO, mid, sizeof(mid) - 1);
   backtrace_symbols_fd(array, size, STDERR_FILENO);
   (void)!write(STDERR_FILENO, tail, sizeof(tail) - 1);

   signal(sig, SIG_DFL);
   raise(sig);
}
/* Install crash_handler for the fatal signals we care about.  Runs
 * automatically at image load time (before main) via the constructor
 * attribute. */
__attribute__((constructor)) static void
install_crash_handler(void)
{
   static const int fatal_signals[] = {SIGSEGV, SIGBUS, SIGABRT};

   for (size_t i = 0; i < sizeof(fatal_signals) / sizeof(fatal_signals[0]); i++)
      signal(fatal_signals[i], crash_handler);
}
#endif
static struct dri_image *
@@ -345,70 +322,24 @@ static const __DRIextension *kopper_loader_extensions[] = {
#ifdef VK_USE_PLATFORM_METAL_EXT
/* Carries the layer pointer into get_drawable_size_main_thread() and
 * the queried drawable size back out; needed because dispatch_sync_f()
 * passes only a single void * context to the callback. */
struct get_size_ctx {
/* Opaque layer handle -- presumably a CAMetalLayer *, taken from the
 * EGL surface's NativeSurface by the caller; TODO confirm. */
void *layer;
/* Size reported by [layer drawableSize], filled in by the callback. */
double w;
double h;
};
/* dispatch_sync_f() callback: queries [layer drawableSize] and stores
 * the result in the get_size_ctx passed as 'data'.  Intended to run on
 * the main thread (see the caller), since off-main-thread CoreAnimation
 * layer access was observed to crash (SIGBUS). */
static void
get_drawable_size_main_thread(void *data)
{
struct get_size_ctx *ctx = data;
/* Local stand-in for CGSize so this file can stay plain C.  It must
 * match CGSize's layout (two doubles) for the objc_msgSend cast below
 * to be valid -- NOTE(review): assumed, not checked at compile time. */
typedef struct {
double width;
double height;
} MGLSize;
/* Check superlayer to verify attachment */
/* NOTE(review): the superlayer result is fetched but never examined, so
 * this "check" has no effect beyond the message send itself -- confirm
 * whether it can be dropped. */
id superlayer = ((id(*)(id, SEL))objc_msgSend)(
(id)ctx->layer, sel_registerName("superlayer"));
/* objc_msgSend must be cast to the exact function type when the message
 * returns a struct by value. */
MGLSize (*msgSendSize)(id, SEL) = (MGLSize(*)(id, SEL))objc_msgSend;
MGLSize size = msgSendSize((id)ctx->layer, sel_registerName("drawableSize"));
ctx->w = size.width;
ctx->h = size.height;
}
/* Report the current drawable size for a surfaceless Metal kopper
 * surface.
 *
 * PERFORMANCE FIX: return the cached surface dimensions instead of
 * querying the CAMetalLayer on every call via dispatch_sync to the main
 * thread.  Each dispatch_sync_f() blocked the calling thread until the
 * main thread serviced it, several times per frame, causing large
 * stalls.  As rendered here, the old query path's results were
 * unconditionally overwritten by the cached assignments anyway, making
 * it pure dead weight.  The cached values are kept up to date on resize
 * (kopper_update_size()), so they are safe to return.
 */
static void
surfaceless_metal_kopper_get_drawable_info(struct dri_drawable *draw, int *w,
                                           int *h, void *loaderPrivate)
{
   struct dri2_egl_surface *dri2_surf = loaderPrivate;

   /* 'draw' is unused: the size comes from the EGL surface, not the DRI
    * drawable. */
   (void)draw;

   *w = dri2_surf->base.Width;
   *h = dri2_surf->base.Height;
}
#include <objc/message.h>

View File

@@ -3,6 +3,7 @@
DRI_CONF_SECTION_DEBUG
DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
DRI_CONF_OPT_B(radeonsi_inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
DRI_CONF_ALLOW_GLSL_COMPAT_SHADERS(true)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_PERFORMANCE

View File

@@ -1236,7 +1236,10 @@ zink_init_screen_caps(struct zink_screen *screen)
caps->mesh.pipeline_statistic_queries = screen->info.mesh_feats.meshShaderQueries;
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes && screen->info.feats.features.shaderFloat64) {
/* Enable subgroup operations if the Vulkan driver supports them.
* Note: shaderFloat64 was previously required here for GL_ARB_shader_ballot's
* uint64 ballot masks, but subgroup shuffle/basic ops don't need it. */
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes) {
caps->shader_subgroup_size = screen->info.subgroup.subgroupSize;
if (screen->info.have_EXT_mesh_shader)
caps->shader_subgroup_supported_stages = screen->info.subgroup.supportedStages & BITFIELD_MASK(MESA_SHADER_MESH_STAGES);
@@ -2886,6 +2889,7 @@ init_driver_workarounds(struct zink_screen *screen)
case VK_DRIVER_ID_MESA_V3DV:
case VK_DRIVER_ID_MESA_PANVK:
case VK_DRIVER_ID_MESA_NVK:
case VK_DRIVER_ID_MESA_KOSMICKRISP:
screen->driver_workarounds.implicit_sync = false;
break;
default:

View File

@@ -453,6 +453,8 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr)
set_type(types, &instr->src[1], TYPE_UINT);
break;
case nir_intrinsic_reduce:
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_exclusive_scan:
switch (nir_intrinsic_reduction_op(instr)) {
case nir_op_iand:
case nir_op_ior:

View File

@@ -1513,6 +1513,42 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
UNREACHABLE("Bad reduction op");
}
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_inclusive_scan:
switch (nir_intrinsic_reduction_op(instr)) {
case nir_op_iadd:
case nir_op_fadd:
P(ctx, "simd_prefix_inclusive_sum(");
break;
case nir_op_imul:
case nir_op_fmul:
P(ctx, "simd_prefix_inclusive_product(");
break;
default:
/* Metal only supports sum and product for prefix operations.
* Other ops would need to be lowered in NIR. */
UNREACHABLE("Unsupported inclusive_scan op");
}
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_exclusive_scan:
switch (nir_intrinsic_reduction_op(instr)) {
case nir_op_iadd:
case nir_op_fadd:
P(ctx, "simd_prefix_exclusive_sum(");
break;
case nir_op_imul:
case nir_op_fmul:
P(ctx, "simd_prefix_exclusive_product(");
break;
default:
UNREACHABLE("Unsupported exclusive_scan op");
}
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;

View File

@@ -452,8 +452,8 @@ kk_get_device_properties(const struct kk_physical_device *pdev,
VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR, // | TODO_KOSMICKRISP
// VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT, // | TODO_KOSMICKRISP
// VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
// VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR,
.subgroupQuadOperationsInAllStages = true,