From 48e638ab296ca68fe9b4edc7bdb78bb8cdaf5f17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Tue, 4 May 2021 13:41:14 +0200
Subject: [PATCH] nir: Add AMD specific intrinsics for NGG shader based
 culling.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new intrinsics fall into the following categories:

1. New viewport intrinsics:
Added for the components that were missing (the x and y offsets).
RADV will emit new SGPR arguments which will contain the
viewport information for culling shaders. These are used to
compute the screen space coordinates for small primitive culling.

2. load_cull_xxx:
Load the culling settings at runtime.
These will be passed in a new SGPR argument in RADV.

3. overwrite_xxx:
These are needed because system values such as vertex and
instance ID are not writeable, but we need to change them
after repacking shader invocations of VS and TES.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
---
 src/compiler/nir/nir_divergence_analysis.c |  8 ++++++++
 src/compiler/nir/nir_intrinsics.py         | 24 +++++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index c2da94ac5cf..7eb8d537084 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -124,6 +124,8 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_viewport_y_scale:
    case nir_intrinsic_load_viewport_z_scale:
    case nir_intrinsic_load_viewport_offset:
+   case nir_intrinsic_load_viewport_x_offset:
+   case nir_intrinsic_load_viewport_y_offset:
    case nir_intrinsic_load_viewport_z_offset:
    case nir_intrinsic_load_blend_const_color_a_float:
    case nir_intrinsic_load_blend_const_color_b_float:
@@ -146,6 +148,12 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_workgroup_num_input_vertices_amd:
    case nir_intrinsic_load_workgroup_num_input_primitives_amd:
    case nir_intrinsic_load_shader_query_enabled_amd:
+   case nir_intrinsic_load_cull_front_face_enabled_amd:
+   case nir_intrinsic_load_cull_back_face_enabled_amd:
+   case nir_intrinsic_load_cull_ccw_amd:
+   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
+   case nir_intrinsic_load_cull_any_enabled_amd:
+   case nir_intrinsic_load_cull_small_prim_precision_amd:
       is_divergent = false;
       break;
 
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index ab5a5fac57b..74839171589 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -781,9 +781,13 @@ system_value("shader_record_ptr", 1, bit_sizes=[64])
 #
 # Panfrost needs to implement all coordinate transformation in the
 # vertex shader; system values allow us to share this routine in NIR.
+#
+# RADV uses these for NGG primitive culling.
 system_value("viewport_x_scale", 1)
 system_value("viewport_y_scale", 1)
 system_value("viewport_z_scale", 1)
+system_value("viewport_x_offset", 1)
+system_value("viewport_y_offset", 1)
 system_value("viewport_z_offset", 1)
 system_value("viewport_scale", 3)
 system_value("viewport_offset", 3)
@@ -1179,7 +1183,19 @@ system_value("workgroup_num_input_primitives_amd", 1)
 # For NGG passthrough mode only. Pre-packed argument for export_primitive_amd.
 system_value("packed_passthrough_primitive_amd", 1)
 # Whether NGG GS should execute shader query.
-system_value("shader_query_enabled_amd", 1, bit_sizes=[1])
+system_value("shader_query_enabled_amd", dest_comp=1, bit_sizes=[1])
+# Whether the shader should cull front facing triangles.
+intrinsic("load_cull_front_face_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
+# Whether the shader should cull back facing triangles.
+intrinsic("load_cull_back_face_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
+# True if face culling should use CCW (false if CW).
+intrinsic("load_cull_ccw_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
+# Whether the shader should cull small primitives that are not visible in a pixel.
+intrinsic("load_cull_small_primitives_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
+# Whether any culling setting is enabled in the shader.
+intrinsic("load_cull_any_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
+# Small primitive culling precision
+intrinsic("load_cull_small_prim_precision_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
 # Initial edge flag in a Vertex Shader. src = {vertex index}.
 intrinsic("load_initial_edgeflag_amd", src_comp=[1], dest_comp=1, indices=[])
 # Exports the current invocation's vertex. This is a placeholder where all vertex attribute export instructions should be emitted.
@@ -1188,6 +1204,12 @@ intrinsic("export_vertex_amd", src_comp=[], indices=[])
 intrinsic("export_primitive_amd", src_comp=[1], indices=[])
 # Allocates export space for vertices and primitives. src[] = {num_vertices, num_primitives}.
 intrinsic("alloc_vertices_and_primitives_amd", src_comp=[1, 1], indices=[])
+# Overwrites VS input registers, for use with vertex compaction after culling. src = {vertex_id, instance_id}.
+intrinsic("overwrite_vs_arguments_amd", src_comp=[1, 1], indices=[])
+# Overwrites TES input registers, for use with vertex compaction after culling. src = {tes_u, tes_v, rel_patch_id, patch_id}.
+intrinsic("overwrite_tes_arguments_amd", src_comp=[1, 1, 1, 1], indices=[])
+# Overwrites the input vertex and primitive count in the current subgroup after culling. src = {num_vertices, num_primitives}.
+intrinsic("overwrite_subgroup_num_vertices_and_primitives_amd", src_comp=[1, 1], indices=[])
 
 # src = [index] BINDING = which table BASE = offset within handle
 intrinsic("load_sbt_amd", src_comp=[-1], dest_comp=0, indices=[BINDING, BASE],