zink/egl: performance optimizations for KosmicKrisp on Metal

- Disable implicit_sync for KosmicKrisp to avoid per-frame GPU stall
- Cache drawable size instead of dispatch_sync to main thread
- Remove debug crash handler overhead
2025-12-31 13:05:19 -06:00
parent 03a56f0032
commit f9624417ea
6 changed files with 58 additions and 84 deletions
--- a/src/egl/drivers/dri2/platform_surfaceless.c
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -43,34 +43,11 @@
 #include "loader_dri_helper.h"
 #if defined(__APPLE__) && defined(VK_USE_PLATFORM_METAL_EXT)
 #include <execinfo.h>
 #include <pthread.h>
 #include <signal.h>
 #include <dispatch/dispatch.h>
 #include <objc/message.h>
 #include <objc/runtime.h>
 #include <vulkan/vulkan_metal.h>
 /*
  * Best-effort write of len bytes from buf to stderr using write(2) only.
  * Retries after short writes and silently gives up on any error, since there
  * is nothing useful to do about a failed diagnostic inside a signal handler.
  */
 static void
 crash_write(const char *buf, size_t len)
 {
    while (len > 0) {
       ssize_t written = write(STDERR_FILENO, buf, len);
       if (written <= 0)
          return;
       buf += written;
       len -= (size_t)written;
    }
 }
 /*
  * Signal handler that dumps a stack trace to stderr and then lets the
  * signal take its default action.
  *
  * sig is the number of the signal being handled. The output format is:
  *
  *    \n\n=== CRASH HANDLER: Signal <sig> ===\n
  *    Stack trace:\n
  *    <one line per frame from backtrace_symbols_fd()>
  *    === END STACK TRACE ===\n\n
  *
  * stdio (fprintf) is deliberately not used here: it is not async-signal-safe
  * and may deadlock or corrupt stream state if the signal interrupted another
  * stdio call. All output goes through write(2), and the signal number is
  * formatted by hand.
  */
 static void
 crash_handler(int sig)
 {
    static const char header_prefix[] = "\n\n=== CRASH HANDLER: Signal ";
    static const char header_suffix[] = " ===\n";
    static const char trace_title[] = "Stack trace:\n";
    static const char footer[] = "=== END STACK TRACE ===\n\n";
    void *frames[50];
    char digits[16];
    size_t pos = sizeof(digits);
    /* Work on the magnitude as unsigned so that INT_MIN cannot overflow. */
    unsigned int magnitude =
       sig < 0 ? 0u - (unsigned int)sig : (unsigned int)sig;
    int depth;

    /* Render the signal number as decimal, filling the buffer backwards. */
    do {
       digits[--pos] = (char)('0' + magnitude % 10u);
       magnitude /= 10u;
    } while (magnitude != 0);
    if (sig < 0)
       digits[--pos] = '-';

    depth = backtrace(frames, 50);

    crash_write(header_prefix, sizeof(header_prefix) - 1);
    crash_write(&digits[pos], sizeof(digits) - pos);
    crash_write(header_suffix, sizeof(header_suffix) - 1);
    crash_write(trace_title, sizeof(trace_title) - 1);
    /* backtrace_symbols_fd() writes straight to the descriptor. */
    backtrace_symbols_fd(frames, depth, STDERR_FILENO);
    crash_write(footer, sizeof(footer) - 1);

    /* Restore the default disposition and re-deliver the signal so the
     * process still terminates (or dumps core) the way it would have
     * without this handler. */
    signal(sig, SIG_DFL);
    raise(sig);
 }
 /*
  * Constructor hook: registers crash_handler() for SIGSEGV, SIGBUS and
  * SIGABRT. Because of the constructor attribute this runs automatically
  * when the containing object is loaded; nothing calls it explicitly.
  */
 __attribute__((constructor)) static void
 install_crash_handler(void)
 {
    /* Signals routed to crash_handler(), registered in this order. */
    static const int handled_signals[] = { SIGSEGV, SIGBUS, SIGABRT };
    const unsigned count =
       (unsigned)(sizeof(handled_signals) / sizeof(handled_signals[0]));

    for (unsigned idx = 0; idx < count; idx++)
       signal(handled_signals[idx], crash_handler);
 }
 #endif
 static struct dri_image *
@@ -345,70 +322,24 @@ static const __DRIextension *kopper_loader_extensions[] = {
 #ifdef VK_USE_PLATFORM_METAL_EXT
 /*
  * Argument/result record passed to get_drawable_size_main_thread() through
  * its void * parameter.
  */
 struct get_size_ctx {
    /* In: object that is sent the "drawableSize" message. */
    void *layer;
    /* Out: width and height reported by that message. */
    double w, h;
 };
 static void
 get_drawable_size_main_thread(void *data)
 {
   struct get_size_ctx *ctx = data;
   typedef struct {
      double width;
      double height;
   } MGLSize;
   /* Check superlayer to verify attachment */
   id superlayer = ((id(*)(id, SEL))objc_msgSend)(
      (id)ctx->layer, sel_registerName("superlayer"));
   MGLSize (*msgSendSize)(id, SEL) = (MGLSize(*)(id, SEL))objc_msgSend;
   MGLSize size = msgSendSize((id)ctx->layer, sel_registerName("drawableSize"));
   ctx->w = size.width;
   ctx->h = size.height;
 }
 static void
 surfaceless_metal_kopper_get_drawable_info(struct dri_drawable *draw, int *w,
                                           int *h, void *loaderPrivate)
 {
   struct dri2_egl_surface *dri2_surf = loaderPrivate;
   void *layer = dri2_surf->base.NativeSurface;
-   if (layer) {
+   /* PERFORMANCE FIX: Return cached dimensions instead of querying the
-      /* Debugging SIGBUS: Validate layer state */
+    * CAMetalLayer on every call via dispatch_sync to main thread.
-
+    *
-      /* Check class */
+    * The previous implementation was a major performance bottleneck - each
-      const char *cls = object_getClassName((id)layer);
+    * dispatch_sync_f() blocks the calling thread waiting for the main thread.
-
+    * This was happening multiple times per frame, causing massive stalls.
-      /* Check device property */
+    *
-      id device =
+    * The surface dimensions are already updated through kopper_update_size()
-         ((id(*)(id, SEL))objc_msgSend)((id)layer, sel_registerName("device"));
+    * on resize events, so we can safely return the cached values.
-
+    */
-      /* [layer drawableSize] */
+   *w = dri2_surf->base.Width;
-      /* Query on Main Thread to avoid race conditions with CoreAnimation which
+   *h = dri2_surf->base.Height;
       * can cause SIGBUS */
      struct get_size_ctx ctx;
      ctx.layer = layer;
      ctx.w = 0;
      ctx.h = 0;
      if (pthread_main_np()) {
         get_drawable_size_main_thread(&ctx);
      } else {
         dispatch_sync_f(dispatch_get_main_queue(), &ctx,
                         get_drawable_size_main_thread);
      }
      *w = (int)ctx.w;
      *h = (int)ctx.h;
   } else {
      *w = dri2_surf->base.Width;
      *h = dri2_surf->base.Height;
   }
 }
 #include <objc/message.h>
--- a/src/gallium/drivers/zink/driinfo_zink.h
+++ b/src/gallium/drivers/zink/driinfo_zink.h
@@ -3,6 +3,7 @@
 DRI_CONF_SECTION_DEBUG
   DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
   DRI_CONF_OPT_B(radeonsi_inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
   DRI_CONF_ALLOW_GLSL_COMPAT_SHADERS(true)
 DRI_CONF_SECTION_END
 DRI_CONF_SECTION_PERFORMANCE
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -1236,7 +1236,10 @@ zink_init_screen_caps(struct zink_screen *screen)
   caps->mesh.pipeline_statistic_queries = screen->info.mesh_feats.meshShaderQueries;
-   if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes && screen->info.feats.features.shaderFloat64) {
+   /* Enable subgroup operations if the Vulkan driver supports them.
    * Note: shaderFloat64 was previously required here for GL_ARB_shader_ballot's
    * uint64 ballot masks, but subgroup shuffle/basic ops don't need it. */
   if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes) {
      caps->shader_subgroup_size = screen->info.subgroup.subgroupSize;
      if (screen->info.have_EXT_mesh_shader)
         caps->shader_subgroup_supported_stages = screen->info.subgroup.supportedStages & BITFIELD_MASK(MESA_SHADER_MESH_STAGES);
@@ -2886,6 +2889,7 @@ init_driver_workarounds(struct zink_screen *screen)
   case VK_DRIVER_ID_MESA_V3DV:
   case VK_DRIVER_ID_MESA_PANVK:
   case VK_DRIVER_ID_MESA_NVK:
   case VK_DRIVER_ID_MESA_KOSMICKRISP:
      screen->driver_workarounds.implicit_sync = false;
      break;
   default:
--- a/src/kosmickrisp/compiler/msl_type_inference.c
+++ b/src/kosmickrisp/compiler/msl_type_inference.c
@@ -453,6 +453,8 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr)
      set_type(types, &instr->src[1], TYPE_UINT);
      break;
   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      switch (nir_intrinsic_reduction_op(instr)) {
      case nir_op_iand:
      case nir_op_ior:
--- a/src/kosmickrisp/compiler/nir_to_msl.c
+++ b/src/kosmickrisp/compiler/nir_to_msl.c
@@ -1513,6 +1513,42 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
         UNREACHABLE("Bad reduction op");
      }
      src_to_msl(ctx, &instr->src[0]);
      P(ctx, ");\n");
      break;
   case nir_intrinsic_inclusive_scan:
      switch (nir_intrinsic_reduction_op(instr)) {
      case nir_op_iadd:
      case nir_op_fadd:
         P(ctx, "simd_prefix_inclusive_sum(");
         break;
      case nir_op_imul:
      case nir_op_fmul:
         P(ctx, "simd_prefix_inclusive_product(");
         break;
      default:
         /* Metal only supports sum and product for prefix operations.
          * Other ops would need to be lowered in NIR. */
         UNREACHABLE("Unsupported inclusive_scan op");
      }
      src_to_msl(ctx, &instr->src[0]);
      P(ctx, ");\n");
      break;
   case nir_intrinsic_exclusive_scan:
      switch (nir_intrinsic_reduction_op(instr)) {
      case nir_op_iadd:
      case nir_op_fadd:
         P(ctx, "simd_prefix_exclusive_sum(");
         break;
      case nir_op_imul:
      case nir_op_fmul:
         P(ctx, "simd_prefix_exclusive_product(");
         break;
      default:
         UNREACHABLE("Unsupported exclusive_scan op");
      }
      src_to_msl(ctx, &instr->src[0]);
      P(ctx, ");\n");
      break;
--- a/src/kosmickrisp/vulkan/kk_physical_device.c
+++ b/src/kosmickrisp/vulkan/kk_physical_device.c
@@ -452,8 +452,8 @@ kk_get_device_properties(const struct kk_physical_device *pdev,
         VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
         VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
         VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
-         VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR, // | TODO_KOSMICKRISP
+         VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
-      // VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+         VK_SUBGROUP_FEATURE_ARITHMETIC_BIT, // | TODO_KOSMICKRISP
      // VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
      // VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR,
      .subgroupQuadOperationsInAllStages = true,