intel/brw: Move brw_compile_* functions out of vec4-specific files

These functions contain code that is shared by both the fs and vec4 backends. Moving them out will make it easier to delete the vec4 files later. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
2024-02-14 18:17:59 -08:00
parent c11d7743b3
commit 9bfccc1935
7 changed files with 762 additions and 736 deletions
@@ -0,0 +1,400 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_vec4_gs_visitor.h"
+#include "gfx6_gs_visitor.h"
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_prim.h"
+#include "brw_nir.h"
+#include "brw_private.h"
+#include "dev/intel_debug.h"
+
+/* Lookup table from MESA_PRIM_* primitive types to the corresponding
+ * _3DPRIM_* hardware topology values.  brw_compile_gs() indexes it with
+ * nir->info.gs.output_primitive to fill in prog_data->output_topology.
+ * The table is sized to cover every entry up to and including
+ * MESA_PRIM_TRIANGLE_STRIP_ADJACENCY.
+ */
+static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
+   [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
+   [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
+   [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
+   [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
+   [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
+   [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+   [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+   [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
+   [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+   [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
+   [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+   [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+   [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+   [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+};
+
+/**
+ * Compile a geometry shader.
+ *
+ * Lowers the NIR shader in params->base.nir, fills in params->prog_data
+ * (control data layout, output vertex size, URB entry size, output topology,
+ * URB read length and dispatch mode) and then generates code with either the
+ * scalar backend (fs_visitor/fs_generator) or the vec4 backend, depending on
+ * compiler->scalar_stage[MESA_SHADER_GEOMETRY].
+ *
+ * Returns the generated assembly, or NULL on failure.  When a visitor fails,
+ * params->base.error_str is set to a copy of its fail_msg.  When the computed
+ * URB entry size exceeds the hardware maximum, NULL is returned without
+ * setting error_str.
+ *
+ * NOTE(review): prog_data->base.vue_map is read below but is not computed in
+ * this function -- presumably the caller fills it in beforehand; confirm.
+ */
+extern "C" const unsigned *
+brw_compile_gs(const struct brw_compiler *compiler,
+               struct brw_compile_gs_params *params)
+{
+   nir_shader *nir = params->base.nir;
+   const struct brw_gs_prog_key *key = params->key;
+   struct brw_gs_prog_data *prog_data = params->prog_data;
+
+   struct brw_gs_compile c;
+   memset(&c, 0, sizeof(c));
+   c.key = *key;
+
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
+   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
+
+   prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
+   prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;
+
+   /* The GLSL linker will have already matched up GS inputs and the outputs
+    * of prior stages.  The driver does extend VS outputs in some cases, but
+    * only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
+    * geometry shader support.  So we can safely ignore that.
+    *
+    * For SSO pipelines, we use a fixed VUE map layout based on variable
+    * locations, so we can rely on rendezvous-by-location making this work.
+    */
+   GLbitfield64 inputs_read = nir->info.inputs_read;
+   brw_compute_vue_map(compiler->devinfo,
+                       &c.input_vue_map, inputs_read,
+                       nir->info.separate_shader, 1);
+
+   brw_nir_apply_key(nir, compiler, &key->base, 8);
+   brw_nir_lower_vue_inputs(nir, &c.input_vue_map);
+   brw_nir_lower_vue_outputs(nir);
+   brw_postprocess_nir(nir, compiler, debug_enabled,
+                       key->base.robust_flags);
+
+   prog_data->base.clip_distance_mask =
+      ((1 << nir->info.clip_distance_array_size) - 1);
+   prog_data->base.cull_distance_mask =
+      ((1 << nir->info.cull_distance_array_size) - 1) <<
+      nir->info.clip_distance_array_size;
+
+   prog_data->include_primitive_id =
+      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+
+   prog_data->invocations = nir->info.gs.invocations;
+
+   /* On Gfx8+, have NIR fill in prog_data->static_vertex_count. */
+   if (compiler->devinfo->ver >= 8)
+      nir_gs_count_vertices_and_primitives(
+         nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);
+
+   if (compiler->devinfo->ver >= 7) {
+      if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
+         /* When the output type is points, the geometry shader may output data
+          * to multiple streams, and EndPrimitive() has no effect.  So we
+          * configure the hardware to interpret the control data as stream ID.
+          */
+         prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+
+         /* We only have to emit control bits if we are using non-zero streams */
+         if (nir->info.gs.active_stream_mask != (1 << 0))
+            c.control_data_bits_per_vertex = 2;
+         else
+            c.control_data_bits_per_vertex = 0;
+      } else {
+         /* When the output type is triangle_strip or line_strip, EndPrimitive()
+          * may be used to terminate the current strip and start a new one
+          * (similar to primitive restart), and outputting data to multiple
+          * streams is not supported.  So we configure the hardware to interpret
+          * the control data as EndPrimitive information (a.k.a. "cut bits").
+          */
+         prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+
+         /* We only need to output control data if the shader actually calls
+          * EndPrimitive().
+          */
+         c.control_data_bits_per_vertex =
+            nir->info.gs.uses_end_primitive ? 1 : 0;
+      }
+   } else {
+      /* There are no control data bits in gfx6. */
+      c.control_data_bits_per_vertex = 0;
+   }
+   c.control_data_header_size_bits =
+      nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
+
+   /* 1 HWORD = 32 bytes = 256 bits */
+   prog_data->control_data_header_size_hwords =
+      ALIGN(c.control_data_header_size_bits, 256) / 256;
+
+   /* Compute the output vertex size.
+    *
+    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
+    * Size (p168):
+    *
+    *     [0,62] indicating [1,63] 16B units
+    *
+    *     Specifies the size of each vertex stored in the GS output entry
+    *     (following any Control Header data) as a number of 128-bit units
+    *     (minus one).
+    *
+    *     Programming Restrictions: The vertex size must be programmed as a
+    *     multiple of 32B units with the following exception: Rendering is
+    *     disabled (as per SOL stage state) and the vertex size output by the
+    *     GS thread is 16B.
+    *
+    *     If rendering is enabled (as per SOL state) the vertex size must be
+    *     programmed as a multiple of 32B units. In other words, the only time
+    *     software can program a vertex size with an odd number of 16B units
+    *     is when rendering is disabled.
+    *
+    * Note: B=bytes in the above text.
+    *
+    * It doesn't seem worth the extra trouble to optimize the case where the
+    * vertex size is 16B (especially since this would require special-casing
+    * the GEN assembly that writes to the URB).  So we just set the vertex
+    * size to a multiple of 32B (2 vec4's) in all cases.
+    *
+    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
+    * budget that as follows:
+    *
+    *   512 bytes for varyings (a varying component is 4 bytes and
+    *             gl_MaxGeometryOutputComponents = 128)
+    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *             bytes)
+    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *             even if it's not used)
+    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *             whenever clip planes are enabled, even if the shader doesn't
+    *             write to gl_ClipDistance)
+    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
+    *             (see below)--this causes up to 1 VUE slot to be wasted
+    *   400 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
+    * per interpolation type, so this is plenty.
+    *
+    */
+   unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
+   assert(compiler->devinfo->ver == 6 ||
+          output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
+   prog_data->output_vertex_size_hwords =
+      ALIGN(output_vertex_size_bytes, 32) / 32;
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     64 bytes for the control data header (cut indices or StreamID bits)
+    *   4096 bytes for varyings (a varying component is 4 bytes and
+    *              gl_MaxGeometryTotalOutputComponents = 1024)
+    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
+    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *              even if it's not used)
+    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *              whenever clip planes are enabled, even if the shader doesn't
+    *              write to gl_ClipDistance)
+    *   4096 bytes overhead since the VUE size must be a multiple of 32
+    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
+    *   8128 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot per
+    * interpolation type, which works out to 3072 bytes, so this would allow
+    * us to accommodate 2 interpolation types without any danger of running
+    * out of URB space.
+    *
+    * In practice, the risk of running out of URB space is very small, since
+    * the above figures are all worst-case, and most of them scale with the
+    * number of output vertices.  So we'll just calculate the amount of space
+    * we need, and if it's too large, fail to compile.
+    *
+    * The above is for gfx7+ where we have a single URB entry that will hold
+    * all the output. In gfx6, we will have to allocate URB entries for every
+    * vertex we emit, so our URB entries only need to be large enough to hold
+    * a single vertex. Also, gfx6 does not have a control data header.
+    */
+   unsigned output_size_bytes;
+   if (compiler->devinfo->ver >= 7) {
+      output_size_bytes =
+         prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
+      output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
+   } else {
+      output_size_bytes = prog_data->output_vertex_size_hwords * 32;
+   }
+
+   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
+    * which comes before the control header.
+    */
+   if (compiler->devinfo->ver >= 8)
+      output_size_bytes += 32;
+
+   /* Shaders can technically set max_vertices = 0, at which point we
+    * may have a URB size of 0 bytes.  Nothing good can come from that,
+    * so enforce a minimum size.
+    */
+   if (output_size_bytes == 0)
+      output_size_bytes = 1;
+
+   /* Fail the compile if the entry does not fit in the per-generation
+    * maximum.  NOTE(review): no error_str is set on this path.
+    */
+   unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (compiler->devinfo->ver == 6)
+      max_output_size_bytes = GFX6_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (output_size_bytes > max_output_size_bytes)
+      return NULL;
+
+
+   /* URB entry sizes are stored as a multiple of 64 bytes in gfx7+ and
+    * a multiple of 128 bytes in gfx6.
+    */
+   if (compiler->devinfo->ver >= 7) {
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+   } else {
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+   }
+
+   assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
+   prog_data->output_topology =
+      gl_prim_to_hw_prim[nir->info.gs.output_primitive];
+
+   prog_data->vertices_in = nir->info.gs.vertices_in;
+
+   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
+    * need to program a URB read length of ceiling(num_slots / 2).
+    */
+   prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
+
+   /* Now that prog_data setup is done, we are ready to actually compile the
+    * program.
+    */
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "GS Input ");
+      brw_print_vue_map(stderr, &c.input_vue_map, MESA_SHADER_GEOMETRY);
+      fprintf(stderr, "GS Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
+   }
+
+   /* Scalar backend: run the fs_visitor and, on success, generate code with
+    * fs_generator.  This path always returns; the vec4 code below is only
+    * reached when the stage is not scalar.
+    */
+   if (is_scalar) {
+      fs_visitor v(compiler, &params->base, &c, prog_data, nir,
+                   params->base.stats != NULL, debug_enabled);
+      if (v.run_gs()) {
+         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+
+         assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
+         prog_data->base.base.dispatch_grf_start_reg =
+            v.payload().num_regs / reg_unit(compiler->devinfo);
+
+         fs_generator g(compiler, &params->base,
+                        &prog_data->base.base, false, MESA_SHADER_GEOMETRY);
+         if (unlikely(debug_enabled)) {
+            const char *label =
+               nir->info.label ? nir->info.label : "unnamed";
+            char *name = ralloc_asprintf(params->base.mem_ctx,
+                                         "%s geometry shader %s",
+                                         label, nir->info.name);
+            g.enable_debug(name);
+         }
+         g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
+                         v.performance_analysis.require(), params->base.stats);
+         g.add_const_data(nir->constant_data, nir->constant_data_size);
+         return g.get_assembly();
+      }
+
+      params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+
+      return NULL;
+   }
+
+   if (compiler->devinfo->ver >= 7) {
+      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
+       * so without spilling. If the GS invocations count > 1, then we can't use
+       * dual object mode.
+       */
+      if (prog_data->invocations <= 1 &&
+          !INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) {
+         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+         brw::vec4_gs_visitor v(compiler, &params->base, &c, prog_data, nir,
+                                true /* no_spills */,
+                                debug_enabled);
+
+         /* Back up 'nr_params' and 'param' as they can be modified by the
+          * DUAL_OBJECT visitor. If it fails, we will run the fallback
+          * (DUAL_INSTANCED or SINGLE mode) and we need to restore the original
+          * values.  The backup is allocated with a NULL ralloc context, so it
+          * is freed explicitly on both the success and the failure path.
+          */
+         const unsigned param_count = prog_data->base.base.nr_params;
+         uint32_t *param = ralloc_array(NULL, uint32_t, param_count);
+         memcpy(param, prog_data->base.base.param,
+                sizeof(uint32_t) * param_count);
+
+         if (v.run()) {
+            /* Success! Backup is not needed */
+            ralloc_free(param);
+            return brw_vec4_generate_assembly(compiler, &params->base,
+                                              nir, &prog_data->base,
+                                              v.cfg,
+                                              v.performance_analysis.require(),
+                                              debug_enabled);
+         } else {
+            /* These variables could be modified by the execution of the GS
+             * visitor if it packed the uniforms in the push constant buffer.
+             * As it failed, we need to restore them so we can start again with
+             * DUAL_INSTANCED or SINGLE mode.
+             *
+             * FIXME: Could more variables be modified by this execution?
+             */
+            memcpy(prog_data->base.base.param, param,
+                   sizeof(uint32_t) * param_count);
+            prog_data->base.base.nr_params = param_count;
+            ralloc_free(param);
+         }
+      }
+   }
+
+   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
+    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
+    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
+    *
+    * FIXME: Single dispatch mode requires that the driver can handle
+    * interleaving of input registers, but this is already supported (dual
+    * instance mode has the same requirement). However, to take full advantage
+    * of single dispatch mode to reduce register pressure we would also need to
+    * do interleaved outputs, but currently, the vec4 visitor and generator
+    * classes do not support this, so at the moment register pressure in
+    * single and dual instance modes is the same.
+    *
+    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
+    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
+    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
+    * is also supported. When InstanceCount=1 (one instance per object) software
+    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
+    * the best choice for performance, followed by SINGLE mode."
+    *
+    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
+    * mode is more performant when invocations > 1. Gfx6 only supports
+    * SINGLE mode.
+    */
+   if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7)
+      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE;
+   else
+      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE;
+
+   /* Pick the visitor implementation by hardware generation.  Both are used
+    * through a vec4_gs_visitor pointer and deleted before returning.  Unlike
+    * the DUAL_OBJECT attempt above, spilling is allowed here.
+    */
+   brw::vec4_gs_visitor *gs = NULL;
+   const unsigned *ret = NULL;
+
+   if (compiler->devinfo->ver >= 7)
+      gs = new brw::vec4_gs_visitor(compiler, &params->base, &c, prog_data,
+                                    nir, false /* no_spills */,
+                                    debug_enabled);
+   else
+      gs = new brw::gfx6_gs_visitor(compiler, &params->base, &c, prog_data,
+                                    nir, false /* no_spills */,
+                                    debug_enabled);
+
+   if (!gs->run()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, gs->fail_msg);
+   } else {
+      ret = brw_vec4_generate_assembly(compiler, &params->base, nir,
+                                       &prog_data->base, gs->cfg,
+                                       gs->performance_analysis.require(),
+                                       debug_enabled);
+   }
+
+   delete gs;
+   return ret;
+}
+
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "intel_nir.h"
+#include "brw_nir.h"
+#include "brw_vec4_tcs.h"
+#include "brw_fs.h"
+#include "brw_private.h"
+#include "dev/intel_debug.h"
+
+/**
+ * Return the number of patches to accumulate before a MULTI_PATCH mode thread is
+ * launched.  In cases with a large number of input control points and a large
+ * amount of VS outputs, the VS URB space needed to store an entire 8 patches
+ * worth of data can be prohibitive, so it can be beneficial to launch threads
+ * early.
+ *
+ * See the 3DSTATE_HS::Patch Count Threshold documentation for the recommended
+ * values.  Note that 0 means to "disable" early dispatch, meaning to wait for
+ * a full 8 patches as normal.
+ *
+ * \param input_control_points  number of input control points per patch;
+ *                              brw_compile_tcs() passes key->input_vertices.
+ *
+ * \return 0 for up to 4 control points, then 5, 4, 3 and 2 for up to 6, 8,
+ *         10 and 14 control points respectively, and 1 for anything larger.
+ */
+static int
+get_patch_count_threshold(int input_control_points)
+{
+   if (input_control_points <= 4)
+      return 0;
+   else if (input_control_points <= 6)
+      return 5;
+   else if (input_control_points <= 8)
+      return 4;
+   else if (input_control_points <= 10)
+      return 3;
+   else if (input_control_points <= 14)
+      return 2;
+
+   /* Return patch count 1 for PATCHLIST_15 - PATCHLIST_32 */
+   return 1;
+}
+
+/**
+ * Compile a tessellation control shader.
+ *
+ * Overrides the NIR output masks with the ones from the program key, computes
+ * the input and output VUE maps, lowers the shader, fills in
+ * params->prog_data (dispatch mode, instance count, patch count threshold,
+ * URB entry size) and then generates code with either the scalar backend
+ * (fs_visitor/fs_generator) or the vec4 backend, depending on
+ * compiler->scalar_stage[MESA_SHADER_TESS_CTRL].
+ *
+ * Returns the generated assembly, or NULL on failure.  When a visitor fails,
+ * params->base.error_str is set to a copy of its fail_msg.  When the output
+ * size exceeds GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES, NULL is returned without
+ * setting error_str.
+ */
+extern "C" const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                struct brw_compile_tcs_params *params)
+{
+   const struct intel_device_info *devinfo = compiler->devinfo;
+   nir_shader *nir = params->base.nir;
+   const struct brw_tcs_prog_key *key = params->key;
+   struct brw_tcs_prog_data *prog_data = params->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
+
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
+   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
+   const unsigned *assembly;
+
+   vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
+   prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;
+
+   /* The output masks come from the key rather than from the shader itself;
+    * the output VUE map below is built from these overridden values.
+    */
+   nir->info.outputs_written = key->outputs_written;
+   nir->info.patch_outputs_written = key->patch_outputs_written;
+
+   struct intel_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
+                       nir->info.separate_shader, 1);
+   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
+                            nir->info.outputs_written,
+                            nir->info.patch_outputs_written);
+
+   brw_nir_apply_key(nir, compiler, &key->base, 8);
+   brw_nir_lower_vue_inputs(nir, &input_vue_map);
+   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
+                             key->_tes_primitive_mode);
+   if (key->quads_workaround)
+      intel_nir_apply_tcs_quads_workaround(nir);
+   /* Only lower patch-vertices-in when the key supplies a vertex count. */
+   if (key->input_vertices > 0)
+      intel_nir_lower_patch_vertices_in(nir, key->input_vertices);
+
+   brw_postprocess_nir(nir, compiler, debug_enabled,
+                       key->base.robust_flags);
+
+   bool has_primitive_id =
+      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+
+   prog_data->patch_count_threshold = get_patch_count_threshold(key->input_vertices);
+
+   /* In MULTI_PATCH mode the instance count is the number of output vertices;
+    * in SINGLE_PATCH mode each thread handles verts_per_thread of them (8 for
+    * the scalar backend, 2 for vec4).  include_primitive_id is only written
+    * in the MULTI_PATCH case.
+    */
+   if (compiler->use_tcs_multi_patch) {
+      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
+      prog_data->instances = nir->info.tess.tcs_vertices_out;
+      prog_data->include_primitive_id = has_primitive_id;
+   } else {
+      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
+      prog_data->instances =
+         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
+   }
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     32 bytes for the patch header (tessellation factors)
+    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
+    *              gl_MaxTessPatchComponents = 120)
+    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
+    *              gl_MaxPatchVertices = 32 and
+    *              gl_MaxTessControlOutputComponents = 128)
+    *
+    *  15808 bytes left for varying packing overhead
+    */
+   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
+   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
+   unsigned output_size_bytes = 0;
+   /* Note that the patch header is counted in num_per_patch_slots. */
+   output_size_bytes += num_per_patch_slots * 16;
+   output_size_bytes += nir->info.tess.tcs_vertices_out *
+                        num_per_vertex_slots * 16;
+
+   /* NOTE(review): no error_str is set when the size limit is exceeded. */
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
+      return NULL;
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   /* HS does not use the usual payload pushing from URB to GRFs,
+    * because we don't have enough registers for a full-size payload, and
+    * the hardware is broken on Haswell anyway.
+    */
+   vue_prog_data->urb_read_length = 0;
+
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "TCS Input ");
+      brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
+      fprintf(stderr, "TCS Output ");
+      brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
+   }
+
+   if (is_scalar) {
+      /* The scalar backend uses SIMD16 on ver >= 20 and SIMD8 otherwise. */
+      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+      fs_visitor v(compiler, &params->base, &key->base,
+                   &prog_data->base.base, nir, dispatch_width,
+                   params->base.stats != NULL, debug_enabled);
+      if (!v.run_tcs()) {
+         params->base.error_str =
+            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      assert(v.payload().num_regs % reg_unit(devinfo) == 0);
+      prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
+
+      fs_generator g(compiler, &params->base,
+                     &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
+      if (unlikely(debug_enabled)) {
+         g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
+                                        "%s tessellation control shader %s",
+                                        nir->info.label ? nir->info.label
+                                                        : "unnamed",
+                                        nir->info.name));
+      }
+
+      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                      v.performance_analysis.require(), params->base.stats);
+
+      g.add_const_data(nir->constant_data, nir->constant_data_size);
+
+      assembly = g.get_assembly();
+   } else {
+      brw::vec4_tcs_visitor v(compiler, &params->base, key, prog_data,
+                              nir, debug_enabled);
+      if (!v.run()) {
+         params->base.error_str =
+            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      /* NOTE(review): this checks the global DEBUG_TCS flag directly instead
+       * of debug_enabled, unlike the other debug output in this function.
+       */
+      if (INTEL_DEBUG(DEBUG_TCS))
+         v.dump_instructions();
+
+
+      assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
+                                            &prog_data->base, v.cfg,
+                                            v.performance_analysis.require(),
+                                            debug_enabled);
+   }
+
+   return assembly;
+}
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_vec4.h"
+#include "brw_fs.h"
+#include "brw_eu.h"
+#include "brw_nir.h"
+#include "brw_vec4_vs.h"
+#include "brw_private.h"
+#include "dev/intel_debug.h"
+
+using namespace brw;
+
+/**
+ * Compile a vertex shader.
+ *
+ * Lowers the NIR shader in params->base.nir, fills in params->prog_data
+ * (inputs read, clip/cull distance masks, system-value usage flags,
+ * attribute slot count, URB read length and URB entry size) and then
+ * generates code with the scalar backend (fs_visitor/fs_generator) when
+ * compiler->scalar_stage[MESA_SHADER_VERTEX] is set, or with the vec4
+ * backend otherwise.
+ *
+ * Debug printing is keyed on params->base.debug_flag when it is non-zero and
+ * on DEBUG_VS otherwise.
+ *
+ * Returns the generated assembly, or NULL on failure, in which case
+ * params->base.error_str is set to a copy of the failing visitor's fail_msg.
+ */
+extern "C" const unsigned *
+brw_compile_vs(const struct brw_compiler *compiler,
+               struct brw_compile_vs_params *params)
+{
+   struct nir_shader *nir = params->base.nir;
+   const struct brw_vs_prog_key *key = params->key;
+   struct brw_vs_prog_data *prog_data = params->prog_data;
+   const bool debug_enabled =
+      brw_should_print_shader(nir, params->base.debug_flag ?
+                                   params->base.debug_flag : DEBUG_VS);
+
+   prog_data->base.base.stage = MESA_SHADER_VERTEX;
+   prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;
+
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
+   brw_nir_apply_key(nir, compiler, &key->base, 8);
+
+   const unsigned *assembly = NULL;
+
+   /* Record the input masks before the input lowering below runs. */
+   prog_data->inputs_read = nir->info.inputs_read;
+   prog_data->double_inputs_read = nir->info.vs.double_inputs;
+
+   brw_nir_lower_vs_inputs(nir, params->edgeflag_is_last, key->gl_attrib_wa_flags);
+   brw_nir_lower_vue_outputs(nir);
+   brw_postprocess_nir(nir, compiler, debug_enabled,
+                       key->base.robust_flags);
+
+   prog_data->base.clip_distance_mask =
+      ((1 << nir->info.clip_distance_array_size) - 1);
+   prog_data->base.cull_distance_mask =
+      ((1 << nir->info.cull_distance_array_size) - 1) <<
+      nir->info.clip_distance_array_size;
+
+   unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
+
+   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
+    * incoming vertex attribute.  So, add an extra slot.
+    */
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
+       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
+       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
+       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID)) {
+      nr_attribute_slots++;
+   }
+
+   /* gl_DrawID and IsIndexedDraw share their very own vec4 */
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID) ||
+       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
+      nr_attribute_slots++;
+   }
+
+   /* Record which system values the shader reads in prog_data.  The flags
+    * are only ever set to true here, never cleared.
+    */
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW))
+      prog_data->uses_is_indexed_draw = true;
+
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX))
+      prog_data->uses_firstvertex = true;
+
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE))
+      prog_data->uses_baseinstance = true;
+
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE))
+      prog_data->uses_vertexid = true;
+
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID))
+      prog_data->uses_instanceid = true;
+
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
+          prog_data->uses_drawid = true;
+
+   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
+    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
+    * vec4 mode, the hardware appears to wedge unless we read something.
+    */
+   if (is_scalar)
+      prog_data->base.urb_read_length =
+         DIV_ROUND_UP(nr_attribute_slots, 2);
+   else
+      prog_data->base.urb_read_length =
+         DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
+
+   prog_data->nr_attribute_slots = nr_attribute_slots;
+
+   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
+    * (overwriting the original contents), we need to make sure the size is
+    * the larger of the two.
+    */
+   const unsigned vue_entries =
+      MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
+
+   /* The entry size is vue_entries rounded up to units of 8 slots on gfx6
+    * and units of 4 slots on every other generation.
+    */
+   if (compiler->devinfo->ver == 6) {
+      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
+   } else {
+      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
+   }
+
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "VS Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
+   }
+
+   if (is_scalar) {
+      /* The scalar backend uses SIMD16 on ver >= 20 and SIMD8 otherwise. */
+      const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
+      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+
+      fs_visitor v(compiler, &params->base, &key->base,
+                   &prog_data->base.base, nir, dispatch_width,
+                   params->base.stats != NULL, debug_enabled);
+      if (!v.run_vs()) {
+         params->base.error_str =
+            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
+      prog_data->base.base.dispatch_grf_start_reg =
+         v.payload().num_regs / reg_unit(compiler->devinfo);
+
+      fs_generator g(compiler, &params->base,
+                     &prog_data->base.base, v.runtime_check_aads_emit,
+                     MESA_SHADER_VERTEX);
+      if (unlikely(debug_enabled)) {
+         const char *debug_name =
+            ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
+                            nir->info.label ? nir->info.label :
+                               "unnamed",
+                            nir->info.name);
+
+         g.enable_debug(debug_name);
+      }
+      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                      v.performance_analysis.require(), params->base.stats);
+      g.add_const_data(nir->constant_data, nir->constant_data_size);
+      assembly = g.get_assembly();
+   }
+
+   /* vec4 backend: taken when the scalar path above did not produce any
+    * assembly, which is always the case when is_scalar is false.
+    * NOTE(review): this would also run if the scalar path succeeded but
+    * get_assembly() returned NULL -- confirm whether that can happen.
+    */
+   if (!assembly) {
+      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+      vec4_vs_visitor v(compiler, &params->base, key, prog_data,
+                        nir, debug_enabled);
+      if (!v.run()) {
+         params->base.error_str =
+            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      assembly = brw_vec4_generate_assembly(compiler, &params->base,
+                                            nir, &prog_data->base,
+                                            v.cfg,
+                                            v.performance_analysis.require(),
+                                            debug_enabled);
+   }
+
+   return assembly;
+}
@@ -22,16 +22,11 @@
 */

 #include "brw_vec4.h"
-#include "brw_fs.h"
-#include "brw_eu.h"
 #include "brw_cfg.h"
-#include "brw_nir.h"
 #include "brw_vec4_builder.h"
 #include "brw_vec4_vs.h"
 #include "brw_dead_control_flow.h"
-#include "brw_private.h"
 #include "dev/intel_debug.h"
-#include "util/u_math.h"

 #define MAX_INSTRUCTION (1 << 30)

@@ -2545,163 +2540,3 @@ vec4_visitor::run()

 } /* namespace brw */

-extern "C" {
-
-const unsigned *
-brw_compile_vs(const struct brw_compiler *compiler,
-               struct brw_compile_vs_params *params)
-{
-   struct nir_shader *nir = params->base.nir;
-   const struct brw_vs_prog_key *key = params->key;
-   struct brw_vs_prog_data *prog_data = params->prog_data;
-   const bool debug_enabled =
-      brw_should_print_shader(nir, params->base.debug_flag ?
-                                   params->base.debug_flag : DEBUG_VS);
-
-   prog_data->base.base.stage = MESA_SHADER_VERTEX;
-   prog_data->base.base.ray_queries = nir->info.ray_queries;
-   prog_data->base.base.total_scratch = 0;
-
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
-   brw_nir_apply_key(nir, compiler, &key->base, 8);
-
-   const unsigned *assembly = NULL;
-
-   prog_data->inputs_read = nir->info.inputs_read;
-   prog_data->double_inputs_read = nir->info.vs.double_inputs;
-
-   brw_nir_lower_vs_inputs(nir, params->edgeflag_is_last, key->gl_attrib_wa_flags);
-   brw_nir_lower_vue_outputs(nir);
-   brw_postprocess_nir(nir, compiler, debug_enabled,
-                       key->base.robust_flags);
-
-   prog_data->base.clip_distance_mask =
-      ((1 << nir->info.clip_distance_array_size) - 1);
-   prog_data->base.cull_distance_mask =
-      ((1 << nir->info.cull_distance_array_size) - 1) <<
-      nir->info.clip_distance_array_size;
-
-   unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
-
-   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
-    * incoming vertex attribute.  So, add an extra slot.
-    */
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
-       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
-       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
-       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID)) {
-      nr_attribute_slots++;
-   }
-
-   /* gl_DrawID and IsIndexedDraw share its very own vec4 */
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID) ||
-       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
-      nr_attribute_slots++;
-   }
-
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW))
-      prog_data->uses_is_indexed_draw = true;
-
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX))
-      prog_data->uses_firstvertex = true;
-
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE))
-      prog_data->uses_baseinstance = true;
-
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE))
-      prog_data->uses_vertexid = true;
-
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID))
-      prog_data->uses_instanceid = true;
-
-   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
-          prog_data->uses_drawid = true;
-
-   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
-    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
-    * vec4 mode, the hardware appears to wedge unless we read something.
-    */
-   if (is_scalar)
-      prog_data->base.urb_read_length =
-         DIV_ROUND_UP(nr_attribute_slots, 2);
-   else
-      prog_data->base.urb_read_length =
-         DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
-
-   prog_data->nr_attribute_slots = nr_attribute_slots;
-
-   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
-    * (overwriting the original contents), we need to make sure the size is
-    * the larger of the two.
-    */
-   const unsigned vue_entries =
-      MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
-
-   if (compiler->devinfo->ver == 6) {
-      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
-   } else {
-      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
-   }
-
-   if (unlikely(debug_enabled)) {
-      fprintf(stderr, "VS Output ");
-      brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
-   }
-
-   if (is_scalar) {
-      const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
-
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_vs()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg =
-         v.payload().num_regs / reg_unit(compiler->devinfo);
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, v.runtime_check_aads_emit,
-                     MESA_SHADER_VERTEX);
-      if (unlikely(debug_enabled)) {
-         const char *debug_name =
-            ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
-                            nir->info.label ? nir->info.label :
-                               "unnamed",
-                            nir->info.name);
-
-         g.enable_debug(debug_name);
-      }
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-      assembly = g.get_assembly();
-   }
-
-   if (!assembly) {
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
-
-      vec4_vs_visitor v(compiler, &params->base, key, prog_data,
-                        nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assembly = brw_vec4_generate_assembly(compiler, &params->base,
-                                            nir, &prog_data->base,
-                                            v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
-   }
-
-   return assembly;
-}
-
-} /* extern "C" */
@@ -28,14 +28,8 @@
 */

 #include "brw_vec4_gs_visitor.h"
-#include "gfx6_gs_visitor.h"
-#include "brw_eu.h"
 #include "brw_cfg.h"
 #include "brw_fs.h"
-#include "brw_nir.h"
-#include "brw_prim.h"
-#include "brw_private.h"
-#include "dev/intel_debug.h"

 namespace brw {

@@ -562,390 +556,5 @@ vec4_gs_visitor::gs_end_primitive()
   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
 }

-static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
-   [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
-   [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
-   [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
-   [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
-   [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
-   [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
-   [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
-   [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
-   [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
-   [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
-   [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
-   [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
-   [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
-   [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
-};
-
 } /* namespace brw */

-extern "C" const unsigned *
-brw_compile_gs(const struct brw_compiler *compiler,
-               struct brw_compile_gs_params *params)
-{
-   nir_shader *nir = params->base.nir;
-   const struct brw_gs_prog_key *key = params->key;
-   struct brw_gs_prog_data *prog_data = params->prog_data;
-
-   struct brw_gs_compile c;
-   memset(&c, 0, sizeof(c));
-   c.key = *key;
-
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
-   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
-
-   prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
-   prog_data->base.base.ray_queries = nir->info.ray_queries;
-   prog_data->base.base.total_scratch = 0;
-
-   /* The GLSL linker will have already matched up GS inputs and the outputs
-    * of prior stages.  The driver does extend VS outputs in some cases, but
-    * only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
-    * geometry shader support.  So we can safely ignore that.
-    *
-    * For SSO pipelines, we use a fixed VUE map layout based on variable
-    * locations, so we can rely on rendezvous-by-location making this work.
-    */
-   GLbitfield64 inputs_read = nir->info.inputs_read;
-   brw_compute_vue_map(compiler->devinfo,
-                       &c.input_vue_map, inputs_read,
-                       nir->info.separate_shader, 1);
-
-   brw_nir_apply_key(nir, compiler, &key->base, 8);
-   brw_nir_lower_vue_inputs(nir, &c.input_vue_map);
-   brw_nir_lower_vue_outputs(nir);
-   brw_postprocess_nir(nir, compiler, debug_enabled,
-                       key->base.robust_flags);
-
-   prog_data->base.clip_distance_mask =
-      ((1 << nir->info.clip_distance_array_size) - 1);
-   prog_data->base.cull_distance_mask =
-      ((1 << nir->info.cull_distance_array_size) - 1) <<
-      nir->info.clip_distance_array_size;
-
-   prog_data->include_primitive_id =
-      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
-
-   prog_data->invocations = nir->info.gs.invocations;
-
-   if (compiler->devinfo->ver >= 8)
-      nir_gs_count_vertices_and_primitives(
-         nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);
-
-   if (compiler->devinfo->ver >= 7) {
-      if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
-         /* When the output type is points, the geometry shader may output data
-          * to multiple streams, and EndPrimitive() has no effect.  So we
-          * configure the hardware to interpret the control data as stream ID.
-          */
-         prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
-
-         /* We only have to emit control bits if we are using non-zero streams */
-         if (nir->info.gs.active_stream_mask != (1 << 0))
-            c.control_data_bits_per_vertex = 2;
-         else
-            c.control_data_bits_per_vertex = 0;
-      } else {
-         /* When the output type is triangle_strip or line_strip, EndPrimitive()
-          * may be used to terminate the current strip and start a new one
-          * (similar to primitive restart), and outputting data to multiple
-          * streams is not supported.  So we configure the hardware to interpret
-          * the control data as EndPrimitive information (a.k.a. "cut bits").
-          */
-         prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
-
-         /* We only need to output control data if the shader actually calls
-          * EndPrimitive().
-          */
-         c.control_data_bits_per_vertex =
-            nir->info.gs.uses_end_primitive ? 1 : 0;
-      }
-   } else {
-      /* There are no control data bits in gfx6. */
-      c.control_data_bits_per_vertex = 0;
-   }
-   c.control_data_header_size_bits =
-      nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
-
-   /* 1 HWORD = 32 bytes = 256 bits */
-   prog_data->control_data_header_size_hwords =
-      ALIGN(c.control_data_header_size_bits, 256) / 256;
-
-   /* Compute the output vertex size.
-    *
-    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
-    * Size (p168):
-    *
-    *     [0,62] indicating [1,63] 16B units
-    *
-    *     Specifies the size of each vertex stored in the GS output entry
-    *     (following any Control Header data) as a number of 128-bit units
-    *     (minus one).
-    *
-    *     Programming Restrictions: The vertex size must be programmed as a
-    *     multiple of 32B units with the following exception: Rendering is
-    *     disabled (as per SOL stage state) and the vertex size output by the
-    *     GS thread is 16B.
-    *
-    *     If rendering is enabled (as per SOL state) the vertex size must be
-    *     programmed as a multiple of 32B units. In other words, the only time
-    *     software can program a vertex size with an odd number of 16B units
-    *     is when rendering is disabled.
-    *
-    * Note: B=bytes in the above text.
-    *
-    * It doesn't seem worth the extra trouble to optimize the case where the
-    * vertex size is 16B (especially since this would require special-casing
-    * the GEN assembly that writes to the URB).  So we just set the vertex
-    * size to a multiple of 32B (2 vec4's) in all cases.
-    *
-    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
-    * budget that as follows:
-    *
-    *   512 bytes for varyings (a varying component is 4 bytes and
-    *             gl_MaxGeometryOutputComponents = 128)
-    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
-    *             bytes)
-    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
-    *             even if it's not used)
-    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
-    *             whenever clip planes are enabled, even if the shader doesn't
-    *             write to gl_ClipDistance)
-    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
-    *             (see below)--this causes up to 1 VUE slot to be wasted
-    *   400 bytes available for varying packing overhead
-    *
-    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
-    * per interpolation type, so this is plenty.
-    *
-    */
-   unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
-   assert(compiler->devinfo->ver == 6 ||
-          output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
-   prog_data->output_vertex_size_hwords =
-      ALIGN(output_vertex_size_bytes, 32) / 32;
-
-   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
-    * That divides up as follows:
-    *
-    *     64 bytes for the control data header (cut indices or StreamID bits)
-    *   4096 bytes for varyings (a varying component is 4 bytes and
-    *              gl_MaxGeometryTotalOutputComponents = 1024)
-    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
-    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
-    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
-    *              even if it's not used)
-    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
-    *              whenever clip planes are enabled, even if the shader doesn't
-    *              write to gl_ClipDistance)
-    *   4096 bytes overhead since the VUE size must be a multiple of 32
-    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
-    *   8128 bytes available for varying packing overhead
-    *
-    * Worst-case varying packing overhead is 3/4 of a varying slot per
-    * interpolation type, which works out to 3072 bytes, so this would allow
-    * us to accommodate 2 interpolation types without any danger of running
-    * out of URB space.
-    *
-    * In practice, the risk of running out of URB space is very small, since
-    * the above figures are all worst-case, and most of them scale with the
-    * number of output vertices.  So we'll just calculate the amount of space
-    * we need, and if it's too large, fail to compile.
-    *
-    * The above is for gfx7+ where we have a single URB entry that will hold
-    * all the output. In gfx6, we will have to allocate URB entries for every
-    * vertex we emit, so our URB entries only need to be large enough to hold
-    * a single vertex. Also, gfx6 does not have a control data header.
-    */
-   unsigned output_size_bytes;
-   if (compiler->devinfo->ver >= 7) {
-      output_size_bytes =
-         prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
-      output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
-   } else {
-      output_size_bytes = prog_data->output_vertex_size_hwords * 32;
-   }
-
-   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
-    * which comes before the control header.
-    */
-   if (compiler->devinfo->ver >= 8)
-      output_size_bytes += 32;
-
-   /* Shaders can technically set max_vertices = 0, at which point we
-    * may have a URB size of 0 bytes.  Nothing good can come from that,
-    * so enforce a minimum size.
-    */
-   if (output_size_bytes == 0)
-      output_size_bytes = 1;
-
-   unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
-   if (compiler->devinfo->ver == 6)
-      max_output_size_bytes = GFX6_MAX_GS_URB_ENTRY_SIZE_BYTES;
-   if (output_size_bytes > max_output_size_bytes)
-      return NULL;
-
-
-   /* URB entry sizes are stored as a multiple of 64 bytes in gfx7+ and
-    * a multiple of 128 bytes in gfx6.
-    */
-   if (compiler->devinfo->ver >= 7) {
-      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
-   } else {
-      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
-   }
-
-   assert(nir->info.gs.output_primitive < ARRAY_SIZE(brw::gl_prim_to_hw_prim));
-   prog_data->output_topology =
-      brw::gl_prim_to_hw_prim[nir->info.gs.output_primitive];
-
-   prog_data->vertices_in = nir->info.gs.vertices_in;
-
-   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
-    * need to program a URB read length of ceiling(num_slots / 2).
-    */
-   prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
-
-   /* Now that prog_data setup is done, we are ready to actually compile the
-    * program.
-    */
-   if (unlikely(debug_enabled)) {
-      fprintf(stderr, "GS Input ");
-      brw_print_vue_map(stderr, &c.input_vue_map, MESA_SHADER_GEOMETRY);
-      fprintf(stderr, "GS Output ");
-      brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
-   }
-
-   if (is_scalar) {
-      fs_visitor v(compiler, &params->base, &c, prog_data, nir,
-                   params->base.stats != NULL, debug_enabled);
-      if (v.run_gs()) {
-         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
-
-         assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
-         prog_data->base.base.dispatch_grf_start_reg =
-            v.payload().num_regs / reg_unit(compiler->devinfo);
-
-         fs_generator g(compiler, &params->base,
-                        &prog_data->base.base, false, MESA_SHADER_GEOMETRY);
-         if (unlikely(debug_enabled)) {
-            const char *label =
-               nir->info.label ? nir->info.label : "unnamed";
-            char *name = ralloc_asprintf(params->base.mem_ctx,
-                                         "%s geometry shader %s",
-                                         label, nir->info.name);
-            g.enable_debug(name);
-         }
-         g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
-                         v.performance_analysis.require(), params->base.stats);
-         g.add_const_data(nir->constant_data, nir->constant_data_size);
-         return g.get_assembly();
-      }
-
-      params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-
-      return NULL;
-   }
-
-   if (compiler->devinfo->ver >= 7) {
-      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
-       * so without spilling. If the GS invocations count > 1, then we can't use
-       * dual object mode.
-       */
-      if (prog_data->invocations <= 1 &&
-          !INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) {
-         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
-
-         brw::vec4_gs_visitor v(compiler, &params->base, &c, prog_data, nir,
-                                true /* no_spills */,
-                                debug_enabled);
-
-         /* Backup 'nr_params' and 'param' as they can be modified by the
-          * the DUAL_OBJECT visitor. If it fails, we will run the fallback
-          * (DUAL_INSTANCED or SINGLE mode) and we need to restore original
-          * values.
-          */
-         const unsigned param_count = prog_data->base.base.nr_params;
-         uint32_t *param = ralloc_array(NULL, uint32_t, param_count);
-         memcpy(param, prog_data->base.base.param,
-                sizeof(uint32_t) * param_count);
-
-         if (v.run()) {
-            /* Success! Backup is not needed */
-            ralloc_free(param);
-            return brw_vec4_generate_assembly(compiler, &params->base,
-                                              nir, &prog_data->base,
-                                              v.cfg,
-                                              v.performance_analysis.require(),
-                                              debug_enabled);
-         } else {
-            /* These variables could be modified by the execution of the GS
-             * visitor if it packed the uniforms in the push constant buffer.
-             * As it failed, we need restore them so we can start again with
-             * DUAL_INSTANCED or SINGLE mode.
-             *
-             * FIXME: Could more variables be modified by this execution?
-             */
-            memcpy(prog_data->base.base.param, param,
-                   sizeof(uint32_t) * param_count);
-            prog_data->base.base.nr_params = param_count;
-            ralloc_free(param);
-         }
-      }
-   }
-
-   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
-    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
-    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
-    *
-    * FIXME: Single dispatch mode requires that the driver can handle
-    * interleaving of input registers, but this is already supported (dual
-    * instance mode has the same requirement). However, to take full advantage
-    * of single dispatch mode to reduce register pressure we would also need to
-    * do interleaved outputs, but currently, the vec4 visitor and generator
-    * classes do not support this, so at the moment register pressure in
-    * single and dual instance modes is the same.
-    *
-    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
-    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
-    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
-    * is also supported. When InstanceCount=1 (one instance per object) software
-    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
-    * the best choice for performance, followed by SINGLE mode."
-    *
-    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
-    * mode is more performant when invocations > 1. Gfx6 only supports
-    * SINGLE mode.
-    */
-   if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7)
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE;
-   else
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE;
-
-   brw::vec4_gs_visitor *gs = NULL;
-   const unsigned *ret = NULL;
-
-   if (compiler->devinfo->ver >= 7)
-      gs = new brw::vec4_gs_visitor(compiler, &params->base, &c, prog_data,
-                                    nir, false /* no_spills */,
-                                    debug_enabled);
-   else
-      gs = new brw::gfx6_gs_visitor(compiler, &params->base, &c, prog_data,
-                                    nir, false /* no_spills */,
-                                    debug_enabled);
-
-   if (!gs->run()) {
-      params->base.error_str =
-         ralloc_strdup(params->base.mem_ctx, gs->fail_msg);
-   } else {
-      ret = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                       &prog_data->base, gs->cfg,
-                                       gs->performance_analysis.require(),
-                                       debug_enabled);
-   }
-
-   delete gs;
-   return ret;
-}
@@ -28,11 +28,7 @@
 */

 #include "intel_nir.h"
-#include "brw_nir.h"
 #include "brw_vec4_tcs.h"
-#include "brw_fs.h"
-#include "brw_private.h"
-#include "dev/intel_debug.h"

 namespace brw {

@@ -320,181 +316,5 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
   }
 }

-/**
- * Return the number of patches to accumulate before a MULTI_PATCH mode thread is
- * launched.  In cases with a large number of input control points and a large
- * amount of VS outputs, the VS URB space needed to store an entire 8 patches
- * worth of data can be prohibitive, so it can be beneficial to launch threads
- * early.
- *
- * See the 3DSTATE_HS::Patch Count Threshold documentation for the recommended
- * values.  Note that 0 means to "disable" early dispatch, meaning to wait for
- * a full 8 patches as normal.
- */
-static int
-get_patch_count_threshold(int input_control_points)
-{
-   if (input_control_points <= 4)
-      return 0;
-   else if (input_control_points <= 6)
-      return 5;
-   else if (input_control_points <= 8)
-      return 4;
-   else if (input_control_points <= 10)
-      return 3;
-   else if (input_control_points <= 14)
-      return 2;
-
-   /* Return patch count 1 for PATCHLIST_15 - PATCHLIST_32 */
-   return 1;
-}
-
 } /* namespace brw */

-extern "C" const unsigned *
-brw_compile_tcs(const struct brw_compiler *compiler,
-                struct brw_compile_tcs_params *params)
-{
-   const struct intel_device_info *devinfo = compiler->devinfo;
-   nir_shader *nir = params->base.nir;
-   const struct brw_tcs_prog_key *key = params->key;
-   struct brw_tcs_prog_data *prog_data = params->prog_data;
-   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
-
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
-   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
-   const unsigned *assembly;
-
-   vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
-   prog_data->base.base.ray_queries = nir->info.ray_queries;
-   prog_data->base.base.total_scratch = 0;
-
-   nir->info.outputs_written = key->outputs_written;
-   nir->info.patch_outputs_written = key->patch_outputs_written;
-
-   struct intel_vue_map input_vue_map;
-   brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
-                       nir->info.separate_shader, 1);
-   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
-                            nir->info.outputs_written,
-                            nir->info.patch_outputs_written);
-
-   brw_nir_apply_key(nir, compiler, &key->base, 8);
-   brw_nir_lower_vue_inputs(nir, &input_vue_map);
-   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
-                             key->_tes_primitive_mode);
-   if (key->quads_workaround)
-      intel_nir_apply_tcs_quads_workaround(nir);
-   if (key->input_vertices > 0)
-      intel_nir_lower_patch_vertices_in(nir, key->input_vertices);
-
-   brw_postprocess_nir(nir, compiler, debug_enabled,
-                       key->base.robust_flags);
-
-   bool has_primitive_id =
-      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
-
-   prog_data->patch_count_threshold = brw::get_patch_count_threshold(key->input_vertices);
-
-   if (compiler->use_tcs_multi_patch) {
-      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
-      prog_data->instances = nir->info.tess.tcs_vertices_out;
-      prog_data->include_primitive_id = has_primitive_id;
-   } else {
-      unsigned verts_per_thread = is_scalar ? 8 : 2;
-      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
-      prog_data->instances =
-         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
-   }
-
-   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
-    * That divides up as follows:
-    *
-    *     32 bytes for the patch header (tessellation factors)
-    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
-    *              gl_MaxTessPatchComponents = 120)
-    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
-    *              gl_MaxPatchVertices = 32 and
-    *              gl_MaxTessControlOutputComponents = 128)
-    *
-    *  15808 bytes left for varying packing overhead
-    */
-   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
-   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
-   unsigned output_size_bytes = 0;
-   /* Note that the patch header is counted in num_per_patch_slots. */
-   output_size_bytes += num_per_patch_slots * 16;
-   output_size_bytes += nir->info.tess.tcs_vertices_out *
-                        num_per_vertex_slots * 16;
-
-   assert(output_size_bytes >= 1);
-   if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
-      return NULL;
-
-   /* URB entry sizes are stored as a multiple of 64 bytes. */
-   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
-
-   /* HS does not use the usual payload pushing from URB to GRFs,
-    * because we don't have enough registers for a full-size payload, and
-    * the hardware is broken on Haswell anyway.
-    */
-   vue_prog_data->urb_read_length = 0;
-
-   if (unlikely(debug_enabled)) {
-      fprintf(stderr, "TCS Input ");
-      brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
-      fprintf(stderr, "TCS Output ");
-      brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
-   }
-
-   if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_tcs()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
-      if (unlikely(debug_enabled)) {
-         g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
-                                        "%s tessellation control shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-
-      assembly = g.get_assembly();
-   } else {
-      brw::vec4_tcs_visitor v(compiler, &params->base, key, prog_data,
-                              nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      if (INTEL_DEBUG(DEBUG_TCS))
-         v.dump_instructions();
-
-
-      assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                            &prog_data->base, v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
-   }
-
-   return assembly;
-}
@@ -47,7 +47,10 @@ libintel_compiler_brw_files = files(
  'brw_clip_util.c',
  'brw_compile_clip.c',
  'brw_compile_ff_gs.c',
+  'brw_compile_gs.cpp',
  'brw_compile_sf.c',
+  'brw_compile_tcs.cpp',
+  'brw_compile_vs.cpp',
  'brw_compiler.c',
  'brw_compiler.h',
  'brw_dead_control_flow.cpp',