zink/egl: performance optimizations for KosmicKrisp on Metal
- Disable implicit_sync for KosmicKrisp to avoid per-frame GPU stall
- Cache drawable size instead of dispatch_sync to main thread
- Remove debug crash handler overhead
This commit is contained in:
@@ -43,34 +43,11 @@
|
|||||||
#include "loader_dri_helper.h"
|
#include "loader_dri_helper.h"
|
||||||
|
|
||||||
#if defined(__APPLE__) && defined(VK_USE_PLATFORM_METAL_EXT)
|
#if defined(__APPLE__) && defined(VK_USE_PLATFORM_METAL_EXT)
|
||||||
#include <execinfo.h>
|
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <signal.h>
|
|
||||||
#include <dispatch/dispatch.h>
|
#include <dispatch/dispatch.h>
|
||||||
#include <objc/message.h>
|
#include <objc/message.h>
|
||||||
#include <objc/runtime.h>
|
#include <objc/runtime.h>
|
||||||
#include <vulkan/vulkan_metal.h>
|
#include <vulkan/vulkan_metal.h>
|
||||||
|
|
||||||
static void
|
|
||||||
crash_handler(int sig)
|
|
||||||
{
|
|
||||||
void *array[50];
|
|
||||||
int size = backtrace(array, 50);
|
|
||||||
fprintf(stderr, "\n\n=== CRASH HANDLER: Signal %d ===\n", sig);
|
|
||||||
fprintf(stderr, "Stack trace:\n");
|
|
||||||
backtrace_symbols_fd(array, size, STDERR_FILENO);
|
|
||||||
fprintf(stderr, "=== END STACK TRACE ===\n\n");
|
|
||||||
signal(sig, SIG_DFL);
|
|
||||||
raise(sig);
|
|
||||||
}
|
|
||||||
|
|
||||||
__attribute__((constructor)) static void
|
|
||||||
install_crash_handler(void)
|
|
||||||
{
|
|
||||||
signal(SIGSEGV, crash_handler);
|
|
||||||
signal(SIGBUS, crash_handler);
|
|
||||||
signal(SIGABRT, crash_handler);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static struct dri_image *
|
static struct dri_image *
|
||||||
@@ -345,70 +322,24 @@ static const __DRIextension *kopper_loader_extensions[] = {
|
|||||||
|
|
||||||
#ifdef VK_USE_PLATFORM_METAL_EXT
|
#ifdef VK_USE_PLATFORM_METAL_EXT
|
||||||
|
|
||||||
struct get_size_ctx {
|
|
||||||
void *layer;
|
|
||||||
double w;
|
|
||||||
double h;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void
|
|
||||||
get_drawable_size_main_thread(void *data)
|
|
||||||
{
|
|
||||||
struct get_size_ctx *ctx = data;
|
|
||||||
typedef struct {
|
|
||||||
double width;
|
|
||||||
double height;
|
|
||||||
} MGLSize;
|
|
||||||
|
|
||||||
/* Check superlayer to verify attachment */
|
|
||||||
id superlayer = ((id(*)(id, SEL))objc_msgSend)(
|
|
||||||
(id)ctx->layer, sel_registerName("superlayer"));
|
|
||||||
|
|
||||||
MGLSize (*msgSendSize)(id, SEL) = (MGLSize(*)(id, SEL))objc_msgSend;
|
|
||||||
MGLSize size = msgSendSize((id)ctx->layer, sel_registerName("drawableSize"));
|
|
||||||
ctx->w = size.width;
|
|
||||||
ctx->h = size.height;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
surfaceless_metal_kopper_get_drawable_info(struct dri_drawable *draw, int *w,
|
surfaceless_metal_kopper_get_drawable_info(struct dri_drawable *draw, int *w,
|
||||||
int *h, void *loaderPrivate)
|
int *h, void *loaderPrivate)
|
||||||
{
|
{
|
||||||
struct dri2_egl_surface *dri2_surf = loaderPrivate;
|
struct dri2_egl_surface *dri2_surf = loaderPrivate;
|
||||||
void *layer = dri2_surf->base.NativeSurface;
|
|
||||||
|
|
||||||
if (layer) {
|
/* PERFORMANCE FIX: Return cached dimensions instead of querying the
|
||||||
/* Debugging SIGBUS: Validate layer state */
|
* CAMetalLayer on every call via dispatch_sync to main thread.
|
||||||
|
*
|
||||||
/* Check class */
|
* The previous implementation was a major performance bottleneck - each
|
||||||
const char *cls = object_getClassName((id)layer);
|
* dispatch_sync_f() blocks the calling thread waiting for the main thread.
|
||||||
|
* This was happening multiple times per frame, causing massive stalls.
|
||||||
/* Check device property */
|
*
|
||||||
id device =
|
* The surface dimensions are already updated through kopper_update_size()
|
||||||
((id(*)(id, SEL))objc_msgSend)((id)layer, sel_registerName("device"));
|
* on resize events, so we can safely return the cached values.
|
||||||
|
*/
|
||||||
/* [layer drawableSize] */
|
*w = dri2_surf->base.Width;
|
||||||
/* Query on Main Thread to avoid race conditions with CoreAnimation which
|
*h = dri2_surf->base.Height;
|
||||||
* can cause SIGBUS */
|
|
||||||
|
|
||||||
struct get_size_ctx ctx;
|
|
||||||
ctx.layer = layer;
|
|
||||||
ctx.w = 0;
|
|
||||||
ctx.h = 0;
|
|
||||||
|
|
||||||
if (pthread_main_np()) {
|
|
||||||
get_drawable_size_main_thread(&ctx);
|
|
||||||
} else {
|
|
||||||
dispatch_sync_f(dispatch_get_main_queue(), &ctx,
|
|
||||||
get_drawable_size_main_thread);
|
|
||||||
}
|
|
||||||
|
|
||||||
*w = (int)ctx.w;
|
|
||||||
*h = (int)ctx.h;
|
|
||||||
} else {
|
|
||||||
*w = dri2_surf->base.Width;
|
|
||||||
*h = dri2_surf->base.Height;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#include <objc/message.h>
|
#include <objc/message.h>
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
DRI_CONF_SECTION_DEBUG
|
DRI_CONF_SECTION_DEBUG
|
||||||
DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
|
DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
|
||||||
DRI_CONF_OPT_B(radeonsi_inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
|
DRI_CONF_OPT_B(radeonsi_inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
|
||||||
|
DRI_CONF_ALLOW_GLSL_COMPAT_SHADERS(true)
|
||||||
DRI_CONF_SECTION_END
|
DRI_CONF_SECTION_END
|
||||||
|
|
||||||
DRI_CONF_SECTION_PERFORMANCE
|
DRI_CONF_SECTION_PERFORMANCE
|
||||||
|
|||||||
@@ -1236,7 +1236,10 @@ zink_init_screen_caps(struct zink_screen *screen)
|
|||||||
|
|
||||||
caps->mesh.pipeline_statistic_queries = screen->info.mesh_feats.meshShaderQueries;
|
caps->mesh.pipeline_statistic_queries = screen->info.mesh_feats.meshShaderQueries;
|
||||||
|
|
||||||
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes && screen->info.feats.features.shaderFloat64) {
|
/* Enable subgroup operations if the Vulkan driver supports them.
|
||||||
|
* Note: shaderFloat64 was previously required here for GL_ARB_shader_ballot's
|
||||||
|
* uint64 ballot masks, but subgroup shuffle/basic ops don't need it. */
|
||||||
|
if (screen->info.feats12.subgroupBroadcastDynamicId && screen->info.feats12.shaderSubgroupExtendedTypes) {
|
||||||
caps->shader_subgroup_size = screen->info.subgroup.subgroupSize;
|
caps->shader_subgroup_size = screen->info.subgroup.subgroupSize;
|
||||||
if (screen->info.have_EXT_mesh_shader)
|
if (screen->info.have_EXT_mesh_shader)
|
||||||
caps->shader_subgroup_supported_stages = screen->info.subgroup.supportedStages & BITFIELD_MASK(MESA_SHADER_MESH_STAGES);
|
caps->shader_subgroup_supported_stages = screen->info.subgroup.supportedStages & BITFIELD_MASK(MESA_SHADER_MESH_STAGES);
|
||||||
@@ -2886,6 +2889,7 @@ init_driver_workarounds(struct zink_screen *screen)
|
|||||||
case VK_DRIVER_ID_MESA_V3DV:
|
case VK_DRIVER_ID_MESA_V3DV:
|
||||||
case VK_DRIVER_ID_MESA_PANVK:
|
case VK_DRIVER_ID_MESA_PANVK:
|
||||||
case VK_DRIVER_ID_MESA_NVK:
|
case VK_DRIVER_ID_MESA_NVK:
|
||||||
|
case VK_DRIVER_ID_MESA_KOSMICKRISP:
|
||||||
screen->driver_workarounds.implicit_sync = false;
|
screen->driver_workarounds.implicit_sync = false;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -453,6 +453,8 @@ infer_types_from_intrinsic(struct hash_table *types, nir_intrinsic_instr *instr)
|
|||||||
set_type(types, &instr->src[1], TYPE_UINT);
|
set_type(types, &instr->src[1], TYPE_UINT);
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_reduce:
|
case nir_intrinsic_reduce:
|
||||||
|
case nir_intrinsic_inclusive_scan:
|
||||||
|
case nir_intrinsic_exclusive_scan:
|
||||||
switch (nir_intrinsic_reduction_op(instr)) {
|
switch (nir_intrinsic_reduction_op(instr)) {
|
||||||
case nir_op_iand:
|
case nir_op_iand:
|
||||||
case nir_op_ior:
|
case nir_op_ior:
|
||||||
|
|||||||
@@ -1513,6 +1513,42 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
|
|||||||
UNREACHABLE("Bad reduction op");
|
UNREACHABLE("Bad reduction op");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
src_to_msl(ctx, &instr->src[0]);
|
||||||
|
P(ctx, ");\n");
|
||||||
|
break;
|
||||||
|
case nir_intrinsic_inclusive_scan:
|
||||||
|
switch (nir_intrinsic_reduction_op(instr)) {
|
||||||
|
case nir_op_iadd:
|
||||||
|
case nir_op_fadd:
|
||||||
|
P(ctx, "simd_prefix_inclusive_sum(");
|
||||||
|
break;
|
||||||
|
case nir_op_imul:
|
||||||
|
case nir_op_fmul:
|
||||||
|
P(ctx, "simd_prefix_inclusive_product(");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
/* Metal only supports sum and product for prefix operations.
|
||||||
|
* Other ops would need to be lowered in NIR. */
|
||||||
|
UNREACHABLE("Unsupported inclusive_scan op");
|
||||||
|
}
|
||||||
|
|
||||||
|
src_to_msl(ctx, &instr->src[0]);
|
||||||
|
P(ctx, ");\n");
|
||||||
|
break;
|
||||||
|
case nir_intrinsic_exclusive_scan:
|
||||||
|
switch (nir_intrinsic_reduction_op(instr)) {
|
||||||
|
case nir_op_iadd:
|
||||||
|
case nir_op_fadd:
|
||||||
|
P(ctx, "simd_prefix_exclusive_sum(");
|
||||||
|
break;
|
||||||
|
case nir_op_imul:
|
||||||
|
case nir_op_fmul:
|
||||||
|
P(ctx, "simd_prefix_exclusive_product(");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
UNREACHABLE("Unsupported exclusive_scan op");
|
||||||
|
}
|
||||||
|
|
||||||
src_to_msl(ctx, &instr->src[0]);
|
src_to_msl(ctx, &instr->src[0]);
|
||||||
P(ctx, ");\n");
|
P(ctx, ");\n");
|
||||||
break;
|
break;
|
||||||
|
|||||||
@@ -452,8 +452,8 @@ kk_get_device_properties(const struct kk_physical_device *pdev,
|
|||||||
VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
|
VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
|
||||||
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
|
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
|
||||||
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
|
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
|
||||||
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR, // | TODO_KOSMICKRISP
|
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
|
||||||
// VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
|
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT, // | TODO_KOSMICKRISP
|
||||||
// VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
|
// VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
|
||||||
// VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR,
|
// VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR,
|
||||||
.subgroupQuadOperationsInAllStages = true,
|
.subgroupQuadOperationsInAllStages = true,
|
||||||
|
|||||||
Reference in New Issue
Block a user