zink/egl: performance optimizations for KosmicKrisp on Metal
- Disable implicit_sync for KosmicKrisp to avoid a per-frame GPU stall.
- Cache the drawable size instead of dispatch_sync'ing to the main thread.
- Remove the debug crash-handler overhead.
This commit is contained in:
@@ -43,34 +43,11 @@
|
||||
#include "loader_dri_helper.h"
|
||||
|
||||
#if defined(__APPLE__) && defined(VK_USE_PLATFORM_METAL_EXT)
|
||||
#include <execinfo.h>
|
||||
#include <pthread.h>
|
||||
#include <signal.h>
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <objc/message.h>
|
||||
#include <objc/runtime.h>
|
||||
#include <vulkan/vulkan_metal.h>
|
||||
|
||||
static void
|
||||
crash_handler(int sig)
|
||||
{
|
||||
void *array[50];
|
||||
int size = backtrace(array, 50);
|
||||
fprintf(stderr, "\n\n=== CRASH HANDLER: Signal %d ===\n", sig);
|
||||
fprintf(stderr, "Stack trace:\n");
|
||||
backtrace_symbols_fd(array, size, STDERR_FILENO);
|
||||
fprintf(stderr, "=== END STACK TRACE ===\n\n");
|
||||
signal(sig, SIG_DFL);
|
||||
raise(sig);
|
||||
}
|
||||
|
||||
__attribute__((constructor)) static void
|
||||
install_crash_handler(void)
|
||||
{
|
||||
signal(SIGSEGV, crash_handler);
|
||||
signal(SIGBUS, crash_handler);
|
||||
signal(SIGABRT, crash_handler);
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct dri_image *
|
||||
@@ -345,70 +322,24 @@ static const __DRIextension *kopper_loader_extensions[] = {
|
||||
|
||||
#ifdef VK_USE_PLATFORM_METAL_EXT
|
||||
|
||||
/* Argument/result carrier for get_drawable_size_main_thread(): the layer
 * to query goes in, its drawableSize comes out. */
struct get_size_ctx {
   void *layer; /* in: CAMetalLayer* (opaque ObjC object, driven via objc_msgSend) */
   double w;    /* out: drawableSize.width */
   double h;    /* out: drawableSize.height */
};
|
||||
|
||||
static void
|
||||
get_drawable_size_main_thread(void *data)
|
||||
{
|
||||
struct get_size_ctx *ctx = data;
|
||||
typedef struct {
|
||||
double width;
|
||||
double height;
|
||||
} MGLSize;
|
||||
|
||||
/* Check superlayer to verify attachment */
|
||||
id superlayer = ((id(*)(id, SEL))objc_msgSend)(
|
||||
(id)ctx->layer, sel_registerName("superlayer"));
|
||||
|
||||
MGLSize (*msgSendSize)(id, SEL) = (MGLSize(*)(id, SEL))objc_msgSend;
|
||||
MGLSize size = msgSendSize((id)ctx->layer, sel_registerName("drawableSize"));
|
||||
ctx->w = size.width;
|
||||
ctx->h = size.height;
|
||||
}
|
||||
|
||||
static void
|
||||
surfaceless_metal_kopper_get_drawable_info(struct dri_drawable *draw, int *w,
|
||||
int *h, void *loaderPrivate)
|
||||
{
|
||||
struct dri2_egl_surface *dri2_surf = loaderPrivate;
|
||||
void *layer = dri2_surf->base.NativeSurface;
|
||||
|
||||
if (layer) {
|
||||
/* Debugging SIGBUS: Validate layer state */
|
||||
|
||||
/* Check class */
|
||||
const char *cls = object_getClassName((id)layer);
|
||||
|
||||
/* Check device property */
|
||||
id device =
|
||||
((id(*)(id, SEL))objc_msgSend)((id)layer, sel_registerName("device"));
|
||||
|
||||
/* [layer drawableSize] */
|
||||
/* Query on Main Thread to avoid race conditions with CoreAnimation which
|
||||
* can cause SIGBUS */
|
||||
|
||||
struct get_size_ctx ctx;
|
||||
ctx.layer = layer;
|
||||
ctx.w = 0;
|
||||
ctx.h = 0;
|
||||
|
||||
if (pthread_main_np()) {
|
||||
get_drawable_size_main_thread(&ctx);
|
||||
} else {
|
||||
dispatch_sync_f(dispatch_get_main_queue(), &ctx,
|
||||
get_drawable_size_main_thread);
|
||||
}
|
||||
|
||||
*w = (int)ctx.w;
|
||||
*h = (int)ctx.h;
|
||||
} else {
|
||||
*w = dri2_surf->base.Width;
|
||||
*h = dri2_surf->base.Height;
|
||||
}
|
||||
/* PERFORMANCE FIX: Return cached dimensions instead of querying the
|
||||
* CAMetalLayer on every call via dispatch_sync to main thread.
|
||||
*
|
||||
* The previous implementation was a major performance bottleneck - each
|
||||
* dispatch_sync_f() blocks the calling thread waiting for the main thread.
|
||||
* This was happening multiple times per frame, causing massive stalls.
|
||||
*
|
||||
* The surface dimensions are already updated through kopper_update_size()
|
||||
* on resize events, so we can safely return the cached values.
|
||||
*/
|
||||
*w = dri2_surf->base.Width;
|
||||
*h = dri2_surf->base.Height;
|
||||
}
|
||||
|
||||
#include <objc/message.h>
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
DRI_CONF_SECTION_DEBUG
|
||||
DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
|
||||
DRI_CONF_OPT_B(radeonsi_inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
|
||||
DRI_CONF_ALLOW_GLSL_COMPAT_SHADERS(true)
|
||||
DRI_CONF_SECTION_END
|
||||
|
||||
DRI_CONF_SECTION_PERFORMANCE
|
||||
|
||||
@@ -1236,7 +1236,10 @@ zink_init_screen_caps(struct zink_screen *screen)
|
||||
|
||||
caps->mesh.pipeline_statistic_queries = screen->info.mesh_feats.meshShaderQueries;
|
||||
|
||||
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes && screen->info.feats.features.shaderFloat64) {
|
||||
/* Enable subgroup operations if the Vulkan driver supports them.
|
||||
* Note: shaderFloat64 was previously required here for GL_ARB_shader_ballot's
|
||||
* uint64 ballot masks, but subgroup shuffle/basic ops don't need it. */
|
||||
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes) {
|
||||
caps->shader_subgroup_size = screen->info.subgroup.subgroupSize;
|
||||
if (screen->info.have_EXT_mesh_shader)
|
||||
caps->shader_subgroup_supported_stages = screen->info.subgroup.supportedStages & BITFIELD_MASK(MESA_SHADER_MESH_STAGES);
|
||||
@@ -2886,6 +2889,7 @@ init_driver_workarounds(struct zink_screen *screen)
|
||||
case VK_DRIVER_ID_MESA_V3DV:
|
||||
case VK_DRIVER_ID_MESA_PANVK:
|
||||
case VK_DRIVER_ID_MESA_NVK:
|
||||
case VK_DRIVER_ID_MESA_KOSMICKRISP:
|
||||
screen->driver_workarounds.implicit_sync = false;
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -453,6 +453,8 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr)
|
||||
set_type(types, &instr->src[1], TYPE_UINT);
|
||||
break;
|
||||
case nir_intrinsic_reduce:
|
||||
case nir_intrinsic_inclusive_scan:
|
||||
case nir_intrinsic_exclusive_scan:
|
||||
switch (nir_intrinsic_reduction_op(instr)) {
|
||||
case nir_op_iand:
|
||||
case nir_op_ior:
|
||||
|
||||
@@ -1513,6 +1513,42 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
|
||||
UNREACHABLE("Bad reduction op");
|
||||
}
|
||||
|
||||
src_to_msl(ctx, &instr->src[0]);
|
||||
P(ctx, ");\n");
|
||||
break;
|
||||
case nir_intrinsic_inclusive_scan:
|
||||
switch (nir_intrinsic_reduction_op(instr)) {
|
||||
case nir_op_iadd:
|
||||
case nir_op_fadd:
|
||||
P(ctx, "simd_prefix_inclusive_sum(");
|
||||
break;
|
||||
case nir_op_imul:
|
||||
case nir_op_fmul:
|
||||
P(ctx, "simd_prefix_inclusive_product(");
|
||||
break;
|
||||
default:
|
||||
/* Metal only supports sum and product for prefix operations.
|
||||
* Other ops would need to be lowered in NIR. */
|
||||
UNREACHABLE("Unsupported inclusive_scan op");
|
||||
}
|
||||
|
||||
src_to_msl(ctx, &instr->src[0]);
|
||||
P(ctx, ");\n");
|
||||
break;
|
||||
case nir_intrinsic_exclusive_scan:
|
||||
switch (nir_intrinsic_reduction_op(instr)) {
|
||||
case nir_op_iadd:
|
||||
case nir_op_fadd:
|
||||
P(ctx, "simd_prefix_exclusive_sum(");
|
||||
break;
|
||||
case nir_op_imul:
|
||||
case nir_op_fmul:
|
||||
P(ctx, "simd_prefix_exclusive_product(");
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("Unsupported exclusive_scan op");
|
||||
}
|
||||
|
||||
src_to_msl(ctx, &instr->src[0]);
|
||||
P(ctx, ");\n");
|
||||
break;
|
||||
|
||||
@@ -452,8 +452,8 @@ kk_get_device_properties(const struct kk_physical_device *pdev,
|
||||
VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
|
||||
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
|
||||
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
|
||||
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR, // | TODO_KOSMICKRISP
|
||||
// VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
|
||||
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
|
||||
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT, // | TODO_KOSMICKRISP
|
||||
// VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
|
||||
// VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR,
|
||||
.subgroupQuadOperationsInAllStages = true,
|
||||
|
||||
Reference in New Issue
Block a user