zink/egl: performance optimizations for KosmicKrisp on Metal

- Disable implicit_sync for KosmicKrisp to avoid per-frame GPU stall
- Cache drawable size instead of dispatch_sync to main thread
- Remove debug crash handler overhead
This commit is contained in:
Luca Mignatti
2025-12-31 13:05:19 -06:00
parent 03a56f0032
commit f9624417ea
6 changed files with 58 additions and 84 deletions

View File

@@ -43,34 +43,11 @@
#include "loader_dri_helper.h"
#if defined(__APPLE__) && defined(VK_USE_PLATFORM_METAL_EXT)
#include <execinfo.h>
#include <pthread.h>
#include <signal.h>
#include <dispatch/dispatch.h>
#include <objc/message.h>
#include <objc/runtime.h>
#include <vulkan/vulkan_metal.h>
/* Fatal-signal handler: dump a backtrace to stderr, then restore the
 * default disposition and re-raise so the process still terminates with
 * the original signal (preserving core dumps and the exit status).
 *
 * Only async-signal-safe calls are made here.  The previous version
 * used fprintf(), which is NOT async-signal-safe (CERT SIG30-C) and can
 * deadlock if the signal arrives while another thread holds the stdio
 * lock; write() and backtrace_symbols_fd() are safe.
 */
static void
crash_handler(int sig)
{
   void *array[50];
   int size = backtrace(array, 50);

   /* Render the signal number by hand: snprintf() is not
    * async-signal-safe either.  Digits come out least-significant
    * first, so reverse them afterwards. */
   char num[16];
   int len = 0;
   unsigned v = (unsigned)sig;
   do {
      num[len++] = (char)('0' + v % 10);
      v /= 10;
   } while (v != 0 && len < (int)sizeof(num));
   for (int i = 0, j = len - 1; i < j; i++, j--) {
      char tmp = num[i];
      num[i] = num[j];
      num[j] = tmp;
   }

   static const char head[] = "\n\n=== CRASH HANDLER: Signal ";
   static const char mid[] = " ===\nStack trace:\n";
   static const char tail[] = "=== END STACK TRACE ===\n\n";

   /* Nothing useful can be done if write() fails inside a signal
    * handler, so the results are deliberately ignored. */
   (void)!write(STDERR_FILENO, head, sizeof(head) - 1);
   (void)!write(STDERR_FILENO, num, (size_t)len);
   (void)!write(STDERR_FILENO, mid, sizeof(mid) - 1);
   backtrace_symbols_fd(array, size, STDERR_FILENO);
   (void)!write(STDERR_FILENO, tail, sizeof(tail) - 1);

   signal(sig, SIG_DFL);
   raise(sig);
}
/* Install crash_handler for the fatal signals we care about.  Runs
 * automatically at image load time (before main) via the constructor
 * attribute. */
__attribute__((constructor)) static void
install_crash_handler(void)
{
   static const int fatal_signals[] = {SIGSEGV, SIGBUS, SIGABRT};

   for (size_t i = 0; i < sizeof(fatal_signals) / sizeof(fatal_signals[0]); i++)
      signal(fatal_signals[i], crash_handler);
}
#endif
static struct dri_image *
@@ -345,70 +322,24 @@ static const __DRIextension *kopper_loader_extensions[] = {
#ifdef VK_USE_PLATFORM_METAL_EXT
/* Carries the layer pointer into get_drawable_size_main_thread() and
 * the queried drawable size back out; needed because dispatch_sync_f()
 * passes only a single void * context to the callback. */
struct get_size_ctx {
/* Opaque layer handle -- presumably a CAMetalLayer *, taken from the
 * EGL surface's NativeSurface by the caller; TODO confirm. */
void *layer;
/* Size reported by [layer drawableSize], filled in by the callback. */
double w;
double h;
};
/* dispatch_sync_f() callback: queries [layer drawableSize] and stores
 * the result in the get_size_ctx passed as 'data'.  Intended to run on
 * the main thread (see the caller), since off-main-thread CoreAnimation
 * layer access was observed to crash (SIGBUS). */
static void
get_drawable_size_main_thread(void *data)
{
struct get_size_ctx *ctx = data;
/* Local stand-in for CGSize so this file can stay plain C.  It must
 * match CGSize's layout (two doubles) for the objc_msgSend cast below
 * to be valid -- NOTE(review): assumed, not checked at compile time. */
typedef struct {
double width;
double height;
} MGLSize;
/* Check superlayer to verify attachment */
/* NOTE(review): the superlayer result is fetched but never examined, so
 * this "check" has no effect beyond the message send itself -- confirm
 * whether it can be dropped. */
id superlayer = ((id(*)(id, SEL))objc_msgSend)(
(id)ctx->layer, sel_registerName("superlayer"));
/* objc_msgSend must be cast to the exact function type when the message
 * returns a struct by value. */
MGLSize (*msgSendSize)(id, SEL) = (MGLSize(*)(id, SEL))objc_msgSend;
MGLSize size = msgSendSize((id)ctx->layer, sel_registerName("drawableSize"));
ctx->w = size.width;
ctx->h = size.height;
}
/* Report the current drawable size for a surfaceless Metal kopper
 * surface.
 *
 * PERFORMANCE FIX: return the cached surface dimensions instead of
 * querying the CAMetalLayer on every call via dispatch_sync to the main
 * thread.  Each dispatch_sync_f() blocked the calling thread until the
 * main thread serviced it, several times per frame, causing large
 * stalls.  As rendered here, the old query path's results were
 * unconditionally overwritten by the cached assignments anyway, making
 * it pure dead weight.  The cached values are kept up to date on resize
 * (kopper_update_size()), so they are safe to return.
 */
static void
surfaceless_metal_kopper_get_drawable_info(struct dri_drawable *draw, int *w,
                                           int *h, void *loaderPrivate)
{
   struct dri2_egl_surface *dri2_surf = loaderPrivate;

   /* 'draw' is unused: the size comes from the EGL surface, not the DRI
    * drawable. */
   (void)draw;

   *w = dri2_surf->base.Width;
   *h = dri2_surf->base.Height;
}
#include <objc/message.h>

View File

@@ -3,6 +3,7 @@
DRI_CONF_SECTION_DEBUG
DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
DRI_CONF_OPT_B(radeonsi_inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
DRI_CONF_ALLOW_GLSL_COMPAT_SHADERS(true)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_PERFORMANCE

View File

@@ -1236,7 +1236,10 @@ zink_init_screen_caps(struct zink_screen *screen)
caps->mesh.pipeline_statistic_queries = screen->info.mesh_feats.meshShaderQueries;
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes && screen->info.feats.features.shaderFloat64) {
/* Enable subgroup operations if the Vulkan driver supports them.
* Note: shaderFloat64 was previously required here for GL_ARB_shader_ballot's
* uint64 ballot masks, but subgroup shuffle/basic ops don't need it. */
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes) {
caps->shader_subgroup_size = screen->info.subgroup.subgroupSize;
if (screen->info.have_EXT_mesh_shader)
caps->shader_subgroup_supported_stages = screen->info.subgroup.supportedStages & BITFIELD_MASK(MESA_SHADER_MESH_STAGES);
@@ -2886,6 +2889,7 @@ init_driver_workarounds(struct zink_screen *screen)
case VK_DRIVER_ID_MESA_V3DV:
case VK_DRIVER_ID_MESA_PANVK:
case VK_DRIVER_ID_MESA_NVK:
case VK_DRIVER_ID_MESA_KOSMICKRISP:
screen->driver_workarounds.implicit_sync = false;
break;
default:

View File

@@ -453,6 +453,8 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr)
set_type(types, &instr->src[1], TYPE_UINT);
break;
case nir_intrinsic_reduce:
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_exclusive_scan:
switch (nir_intrinsic_reduction_op(instr)) {
case nir_op_iand:
case nir_op_ior:

View File

@@ -1513,6 +1513,42 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
UNREACHABLE("Bad reduction op");
}
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_inclusive_scan:
switch (nir_intrinsic_reduction_op(instr)) {
case nir_op_iadd:
case nir_op_fadd:
P(ctx, "simd_prefix_inclusive_sum(");
break;
case nir_op_imul:
case nir_op_fmul:
P(ctx, "simd_prefix_inclusive_product(");
break;
default:
/* Metal only supports sum and product for prefix operations.
* Other ops would need to be lowered in NIR. */
UNREACHABLE("Unsupported inclusive_scan op");
}
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_exclusive_scan:
switch (nir_intrinsic_reduction_op(instr)) {
case nir_op_iadd:
case nir_op_fadd:
P(ctx, "simd_prefix_exclusive_sum(");
break;
case nir_op_imul:
case nir_op_fmul:
P(ctx, "simd_prefix_exclusive_product(");
break;
default:
UNREACHABLE("Unsupported exclusive_scan op");
}
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;

View File

@@ -452,8 +452,8 @@ kk_get_device_properties(const struct kk_physical_device *pdev,
VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR, // | TODO_KOSMICKRISP
// VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT, // | TODO_KOSMICKRISP
// VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
// VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR,
.subgroupQuadOperationsInAllStages = true,