From 65e64b6e2d7a327922b35ca9aa6eb3c4ef538c72 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sat, 27 Apr 2024 11:30:15 -0400 Subject: [PATCH] agx: handle discard with force early tests We need to predicate the store, since we can't do a hardware demote after running tests. This is similar to what the blob does. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/compiler/README.md | 2 + src/asahi/lib/agx_bg_eot.c | 2 +- src/asahi/lib/agx_nir_lower_tilebuffer.c | 35 ++++++++++--- src/asahi/lib/agx_nir_prolog_epilog.c | 64 ++++++++++++++++++++++-- src/asahi/lib/agx_tilebuffer.h | 2 +- src/gallium/drivers/asahi/agx_state.c | 2 +- 6 files changed, 91 insertions(+), 16 deletions(-) diff --git a/src/asahi/compiler/README.md b/src/asahi/compiler/README.md index 074a112bb29..914c8c5c2b8 100644 --- a/src/asahi/compiler/README.md +++ b/src/asahi/compiler/README.md @@ -71,6 +71,8 @@ the fragment epilog): depth and/or stencil are written by the fragment shader. Depth/stencil writes must be deferred to the epilog for correctness when the epilog can discard (i.e. when alpha-to-coverage is enabled). +* `r3h` contains the logically emitted sample mask, if the fragment shader uses + forced early tests. This predicates the epilog's stores. * The vec4 of 32-bit registers beginning at `r(4 * (i + 1))` contains the colour output for render target `i`. 
When dual source blending is enabled, there is only a single render target and the dual source colour is treated as the diff --git a/src/asahi/lib/agx_bg_eot.c b/src/asahi/lib/agx_bg_eot.c index 6c8b9f6c181..b3442763b24 100644 --- a/src/asahi/lib/agx_bg_eot.c +++ b/src/asahi/lib/agx_bg_eot.c @@ -35,7 +35,7 @@ agx_compile_bg_eot_shader(struct agx_bg_eot_cache *cache, nir_shader *shader, agx_preprocess_nir(shader, cache->dev->libagx); if (tib) { unsigned bindless_base = 0; - agx_nir_lower_tilebuffer(shader, tib, NULL, &bindless_base, NULL); + agx_nir_lower_tilebuffer(shader, tib, NULL, &bindless_base, NULL, NULL); agx_nir_lower_monolithic_msaa(shader, tib->nr_samples); agx_nir_lower_multisampled_image_store(shader); agx_nir_lower_texture(shader); diff --git a/src/asahi/lib/agx_nir_lower_tilebuffer.c b/src/asahi/lib/agx_nir_lower_tilebuffer.c index 5493e7e957f..569403f8ede 100644 --- a/src/asahi/lib/agx_nir_lower_tilebuffer.c +++ b/src/asahi/lib/agx_nir_lower_tilebuffer.c @@ -25,6 +25,7 @@ struct ctx { unsigned bindless_base; bool any_memory_stores; uint8_t outputs_written; + nir_def *write_samples; }; static bool @@ -46,7 +47,8 @@ tib_filter(const nir_instr *instr, UNUSED const void *_) static void store_tilebuffer(nir_builder *b, struct agx_tilebuffer_layout *tib, enum pipe_format format, enum pipe_format logical_format, - unsigned rt, nir_def *value, unsigned write_mask) + unsigned rt, nir_def *value, nir_def *samples, + unsigned write_mask) { /* The hardware cannot extend for a 32-bit format. Extend ourselves. 
*/ if (format == PIPE_FORMAT_R32_UINT && value->bit_size == 16) { @@ -84,10 +86,12 @@ store_tilebuffer(nir_builder *b, struct agx_tilebuffer_layout *tib, value = nir_u2u16(b, value); } + if (!samples) + samples = nir_imm_intN_t(b, ALL_SAMPLES, 16); + uint8_t offset_B = agx_tilebuffer_offset_B(tib, rt); - nir_store_local_pixel_agx(b, value, nir_imm_intN_t(b, ALL_SAMPLES, 16), - .base = offset_B, .write_mask = write_mask, - .format = format); + nir_store_local_pixel_agx(b, value, samples, .base = offset_B, + .write_mask = write_mask, .format = format); } static nir_def * @@ -181,7 +185,8 @@ image_coords(nir_builder *b) static void store_memory(nir_builder *b, unsigned bindless_base, unsigned nr_samples, - enum pipe_format format, unsigned rt, nir_def *value) + enum pipe_format format, unsigned rt, nir_def *value, + nir_def *samples) { nir_def *image = handle_for_rt(b, bindless_base, rt, true); nir_def *tex_image = handle_for_rt(b, bindless_base, rt, false); @@ -216,10 +221,16 @@ store_memory(nir_builder *b, unsigned bindless_base, unsigned nr_samples, if (nr_samples > 1) { nir_def *coverage = nir_load_sample_mask(b); + + if (samples != NULL) + coverage = nir_iand(b, coverage, samples); + nir_def *covered = nir_ubitfield_extract( b, coverage, nir_u2u32(b, sample), nir_imm_int(b, 1)); cond = nir_iand(b, cond, nir_ine_imm(b, covered, 0)); + } else if (samples != NULL) { + cond = nir_iand(b, cond, nir_ine_imm(b, samples, 0)); } nir_push_if(b, cond); @@ -290,6 +301,13 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data) *(ctx->translucent) = true; } + if (ctx->write_samples) { + assert(ctx->translucent != NULL && + "sample masking requires translucency"); + + *(ctx->translucent) = true; + } + /* But we ignore the NIR write mask for that, since it's basically an * optimization hint. 
*/ @@ -307,11 +325,11 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data) if (tib->spilled[rt]) { store_memory(b, ctx->bindless_base, tib->nr_samples, logical_format, - rt, value); + rt, value, ctx->write_samples); ctx->any_memory_stores = true; } else { store_tilebuffer(b, tib, format, logical_format, rt, value, - write_mask); + ctx->write_samples, write_mask); } return NIR_LOWER_INSTR_PROGRESS_REPLACE; @@ -338,7 +356,7 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data) bool agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib, uint8_t *colormasks, unsigned *bindless_base, - bool *translucent) + nir_def *write_samples, bool *translucent) { assert(shader->info.stage == MESA_SHADER_FRAGMENT); @@ -346,6 +364,7 @@ agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib, .tib = tib, .colormasks = colormasks, .translucent = translucent, + .write_samples = write_samples, }; /* Allocate 1 texture + 1 PBE descriptor for each spilled descriptor */ diff --git a/src/asahi/lib/agx_nir_prolog_epilog.c b/src/asahi/lib/agx_nir_prolog_epilog.c index e9ce52af58d..a6f4bd2cd9c 100644 --- a/src/asahi/lib/agx_nir_prolog_epilog.c +++ b/src/asahi/lib/agx_nir_prolog_epilog.c @@ -312,12 +312,16 @@ agx_nir_fs_epilog(nir_builder *b, const void *key_) .src_type = nir_type_float | size); } - /* Grab the sample ID early, this has to happen in the first block. */ - nir_def *sample_id = NULL; + /* Grab registers early, this has to happen in the first block. 
*/ + nir_def *sample_id = NULL, *write_samples = NULL; if (key->link.sample_shading) { sample_id = nir_load_exported_agx(b, 1, 16, .base = 1); } + if (key->link.sample_mask_after_force_early) { + write_samples = nir_load_exported_agx(b, 1, 16, .base = 7); + } + /* Now lower the resulting program using the key */ struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout( key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples, true); @@ -400,7 +404,7 @@ agx_nir_fs_epilog(nir_builder *b, const void *key_) unsigned rt_spill = key->link.rt_spill_base; NIR_PASS(_, b->shader, agx_nir_lower_tilebuffer, &tib, colormasks, &rt_spill, - &force_translucent); + write_samples, &force_translucent); NIR_PASS(_, b->shader, agx_nir_lower_texture); NIR_PASS(_, b->shader, agx_nir_lower_multisampled_image_store); @@ -441,10 +445,17 @@ agx_nir_fs_epilog(nir_builder *b, const void *key_) b->shader->info.fs.uses_sample_shading = key->link.sample_shading; } +struct lower_epilog_ctx { + struct agx_fs_epilog_link_info *info; + nir_variable *masked_samples; +}; + static bool lower_output_to_epilog(nir_builder *b, nir_intrinsic_instr *intr, void *data) { - struct agx_fs_epilog_link_info *info = data; + struct lower_epilog_ctx *ctx = data; + struct agx_fs_epilog_link_info *info = ctx->info; + if (intr->intrinsic == nir_intrinsic_store_zs_agx) { assert(nir_src_as_uint(intr->src[0]) == 0xff && "msaa not yet lowered"); b->cursor = nir_instr_remove(&intr->instr); @@ -464,6 +475,32 @@ lower_output_to_epilog(nir_builder *b, nir_intrinsic_instr *intr, void *data) return true; } + if (intr->intrinsic == nir_intrinsic_discard_agx && + b->shader->info.fs.early_fragment_tests) { + + if (!ctx->masked_samples) { + b->cursor = nir_before_impl(nir_shader_get_entrypoint(b->shader)); + + ctx->masked_samples = + nir_local_variable_create(b->impl, glsl_uint16_t_type(), NULL); + + nir_store_var(b, ctx->masked_samples, nir_imm_intN_t(b, 0xFF, 16), + nir_component_mask(1)); + } + + b->cursor = 
nir_before_instr(&intr->instr); + + nir_def *mask = nir_load_var(b, ctx->masked_samples); + nir_def *mask_2 = + nir_ixor(b, intr->src[0].ssa, nir_imm_intN_t(b, 0xff, 16)); + + mask = nir_iand(b, mask, mask_2); + nir_store_var(b, ctx->masked_samples, mask, nir_component_mask(1)); + + nir_instr_remove(&intr->instr); + return true; + } + if (intr->intrinsic != nir_intrinsic_store_output) return false; @@ -525,9 +562,26 @@ bool agx_nir_lower_fs_output_to_epilog(nir_shader *s, struct agx_fs_epilog_link_info *out) { + struct lower_epilog_ctx ctx = {.info = out}; + nir_shader_intrinsics_pass(s, lower_output_to_epilog, nir_metadata_dominance | nir_metadata_block_index, - out); + &ctx); + + if (ctx.masked_samples) { + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(s))); + + nir_export_agx(&b, nir_load_var(&b, ctx.masked_samples), .base = 7); + out->sample_mask_after_force_early = true; + + bool progress; + do { + progress = false; + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_opt_dce); + } while (progress); + } out->sample_shading = s->info.fs.uses_sample_shading; return true; diff --git a/src/asahi/lib/agx_tilebuffer.h b/src/asahi/lib/agx_tilebuffer.h index c6be543702a..c08d5027dad 100644 --- a/src/asahi/lib/agx_tilebuffer.h +++ b/src/asahi/lib/agx_tilebuffer.h @@ -93,7 +93,7 @@ agx_build_tilebuffer_layout(const enum pipe_format *formats, uint8_t nr_cbufs, bool agx_nir_lower_tilebuffer(struct nir_shader *shader, struct agx_tilebuffer_layout *tib, uint8_t *colormasks, unsigned *bindless_base, - bool *translucent); + struct nir_def *write_samples, bool *translucent); bool agx_nir_lower_to_per_sample(struct nir_shader *shader); diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 7a4cb130925..969a2dc093f 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -1674,7 +1674,7 @@ agx_compile_variant(struct agx_device *dev, struct 
pipe_context *pctx, unsigned rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) + (2 * BITSET_LAST_BIT(nir->info.images_used)); unsigned rt_spill = rt_spill_base; - NIR_PASS(_, nir, agx_nir_lower_tilebuffer, &tib, NULL, &rt_spill, + NIR_PASS(_, nir, agx_nir_lower_tilebuffer, &tib, NULL, &rt_spill, NULL, NULL); }