diff --git a/src/mesa/main/feedback.c b/src/mesa/main/feedback.c
index e6d925b274d..3a035b67eeb 100644
--- a/src/mesa/main/feedback.c
+++ b/src/mesa/main/feedback.c
@@ -637,9 +637,11 @@ _mesa_RenderMode( GLenum mode )
 	 return 0;
    }
 
-   ctx->RenderMode = mode;
    st_RenderMode( ctx, mode );
 
+   /* set last: st_RenderMode() above reads ctx->RenderMode as the old mode */
+   ctx->RenderMode = mode;
+
    return result;
 }
 
diff --git a/src/mesa/meson.build b/src/mesa/meson.build
index 522a4c2eb11..8fba301efc5 100644
--- a/src/mesa/meson.build
+++ b/src/mesa/meson.build
@@ -351,6 +351,7 @@ files_libmesa = files(
   'state_tracker/st_draw.c',
   'state_tracker/st_draw.h',
   'state_tracker/st_draw_feedback.c',
+  'state_tracker/st_draw_hw_select.c',
   'state_tracker/st_extensions.c',
   'state_tracker/st_extensions.h',
   'state_tracker/st_format.c',
diff --git a/src/mesa/state_tracker/st_cb_feedback.c b/src/mesa/state_tracker/st_cb_feedback.c
index c5e6f779cd0..e937386fde0 100644
--- a/src/mesa/state_tracker/st_cb_feedback.c
+++ b/src/mesa/state_tracker/st_cb_feedback.c
@@ -291,12 +291,16 @@ st_RenderMode(struct gl_context *ctx, GLenum newMode )
       st_init_draw_functions(st->screen, &ctx->Driver);
    }
    else if (newMode == GL_SELECT) {
-      if (!st->selection_stage)
-         st->selection_stage = draw_glselect_stage(ctx, draw);
-      draw_set_rasterize_stage(draw, st->selection_stage);
-      /* Plug in new vbo draw function */
-      ctx->Driver.DrawGallium = _mesa_draw_gallium_fallback;
-      ctx->Driver.DrawGalliumMultiMode = _mesa_draw_gallium_multimode_fallback;
+      if (ctx->Const.HardwareAcceleratedSelect)
+         st_init_hw_select_draw_functions(st->screen, &ctx->Driver);
+      else {
+         if (!st->selection_stage)
+            st->selection_stage = draw_glselect_stage(ctx, draw);
+         draw_set_rasterize_stage(draw, st->selection_stage);
+         /* Plug in new vbo draw function */
+         ctx->Driver.DrawGallium = _mesa_draw_gallium_fallback;
+         ctx->Driver.DrawGalliumMultiMode = _mesa_draw_gallium_multimode_fallback;
+      }
    }
    else {
       struct gl_program *vp = st->ctx->VertexProgram._Current;
@@ -311,4 +315,8 @@ st_RenderMode(struct gl_context *ctx, GLenum newMode )
       if (vp)
          st->dirty |= ST_NEW_VERTEX_PROGRAM(st, vp);
    }
+
+   /* ctx->RenderMode is still the mode being left: restore GS state after HW GL_SELECT. */
+   if (ctx->RenderMode == GL_SELECT && ctx->Const.HardwareAcceleratedSelect)
+      st->dirty |= ST_NEW_GS_SSBOS | ST_NEW_GS_CONSTANTS | ST_NEW_GS_STATE;
 }
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 7c889727dbc..ffd50ec3e17 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -64,6 +64,7 @@
 #include "util/u_upload_mgr.h"
 #include "util/u_vbuf.h"
 #include "util/u_memory.h"
+#include "util/hash_table.h"
 #include "cso_cache/cso_context.h"
 #include "compiler/glsl/glsl_parser_extras.h"
 #include "nir/nir_to_tgsi.h"
@@ -958,6 +959,12 @@ st_destroy_context(struct st_context *st)
    st_release_program(st, &st->tep);
    st_release_program(st, &st->cp);
 
+   if (st->hw_select_shaders) { /* may be NULL */
+      hash_table_foreach(st->hw_select_shaders, entry)
+         st->pipe->delete_gs_state(st->pipe, entry->data); /* data is a GS CSO */
+      _mesa_hash_table_destroy(st->hw_select_shaders, NULL); /* CSOs were released above */
+   }
+
    /* release framebuffer in the winsys buffers list */
    LIST_FOR_EACH_ENTRY_SAFE_REV(stfb, next, &st->winsys_buffers, head) {
       _mesa_reference_framebuffer(&stfb, NULL);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index d3eb1075617..749d3a4fea5 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -383,6 +383,8 @@ struct st_context
       struct st_zombie_shader_node list;
       simple_mtx_t mutex;
    } zombie_shaders;
+
+   struct hash_table *hw_select_shaders; /* data: GS CSOs, freed in st_destroy_context() */
 };
 
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index cd0b8d6e8b2..1fbfc5c1170 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -519,3 +519,70 @@ st_draw_quad(struct st_context *st,
 
    return true;
 }
+
+static void
+st_hw_select_draw_gallium(struct gl_context *ctx,
+                          struct pipe_draw_info *info,
+                          unsigned drawid_offset,
+                          const struct pipe_draw_start_count_bias *draws,
+                          unsigned num_draws)
+{ /* DrawGallium hook installed by st_init_hw_select_draw_functions() */
+   struct st_context *st = st_context(ctx);
+
+   prepare_draw(st, ctx, ST_PIPELINE_RENDER_STATE_MASK, ST_PIPELINE_RENDER);
+
+   if (!prepare_indexed_draw(st, ctx, info, draws, num_draws))
+      return;
+
+   if (!st_draw_hw_select_prepare_common(ctx) ||
+       !st_draw_hw_select_prepare_mode(ctx, info))
+      return; /* NOTE(review): draw is dropped; confirm index buffer ownership on this path */
+
+   cso_multi_draw(st->cso_context, info, drawid_offset, draws, num_draws);
+}
+
+static void
+st_hw_select_draw_gallium_multimode(struct gl_context *ctx,
+                                    struct pipe_draw_info *info,
+                                    const struct pipe_draw_start_count_bias *draws,
+                                    const unsigned char *mode,
+                                    unsigned num_draws)
+{ /* DrawGalliumMultiMode hook: mode[i] is the primitive type used for draws[i] */
+   struct st_context *st = st_context(ctx);
+
+   prepare_draw(st, ctx, ST_PIPELINE_RENDER_STATE_MASK, ST_PIPELINE_RENDER);
+
+   if (!prepare_indexed_draw(st, ctx, info, draws, num_draws))
+      return;
+
+   if (!st_draw_hw_select_prepare_common(ctx))
+      return;
+
+   unsigned i, first;
+   struct cso_context *cso = st->cso_context;
+
+   /* Batch consecutive draws that share a mode; i == num_draws flushes the last run. */
+   for (i = 0, first = 0; i <= num_draws; i++) {
+      if (i == num_draws || mode[i] != mode[first]) {
+         info->mode = mode[first]; /* NOTE(review): reads mode[0] even if num_draws == 0 */
+
+         if (st_draw_hw_select_prepare_mode(ctx, info))
+            cso_multi_draw(cso, info, 0, &draws[first], i - first); /* run is [first, i) */
+
+         first = i;
+
+         /* We can pass the reference only once. st_buffer_object keeps
+          * the reference alive for later draws.
+          */
+         info->take_index_buffer_ownership = false;
+      }
+   }
+}
+
+void
+st_init_hw_select_draw_functions(struct pipe_screen *screen,
+                                 struct dd_function_table *functions)
+{ /* installs the HW GL_SELECT draw hooks; 'screen' is not used */
+   functions->DrawGallium = st_hw_select_draw_gallium;
+   functions->DrawGalliumMultiMode = st_hw_select_draw_gallium_multimode;
+}
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index e0e6e472bbd..46ee64df6fa 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -99,4 +99,13 @@ st_indirect_draw_vbo(struct gl_context *ctx,
                      const struct _mesa_index_buffer *ib,
                      bool primitive_restart,
                      unsigned restart_index);
+
+bool
+st_draw_hw_select_prepare_common(struct gl_context *ctx);
+bool
+st_draw_hw_select_prepare_mode(struct gl_context *ctx, struct pipe_draw_info *info);
+void
+st_init_hw_select_draw_functions(struct pipe_screen *screen,
+                                 struct dd_function_table *functions);
+
 #endif
diff --git a/src/mesa/state_tracker/st_draw_hw_select.c b/src/mesa/state_tracker/st_draw_hw_select.c
new file mode 100644
index 00000000000..5a032da046a
--- /dev/null
+++ b/src/mesa/state_tracker/st_draw_hw_select.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/enums.h"
+#include "main/context.h"
+
+#include "st_context.h"
+#include "st_nir.h"
+#include "st_draw.h"
+
+#include "nir.h"
+#include "nir_builtin_builder.h"
+
+#include "u_memory.h"
+
+union state_key { /* describes which GS variant hw_select_create_gs() builds */
+   struct {
+      unsigned num_user_clip_planes:4; /* user planes clipped in addition to the 6 frustum planes */
+      unsigned face_culling_enabled:1; /* only checked by the triangle/quad builder */
+      unsigned result_offset_from_attribute:1; /* read offset from VARYING_SLOT_VAR0, else from a uniform */
+      unsigned primitive:4; /* enum primitive_state */
+   };
+   uint32_t u32; /* the whole key as a single word */
+};
+
+enum primitive_state { /* values of state_key.primitive */
+   HW_SELECT_PRIM_NONE, /* hw_select_create_gs() has no variant for this value */
+   HW_SELECT_PRIM_POINTS,
+   HW_SELECT_PRIM_LINES,
+   HW_SELECT_PRIM_TRIANGLES,
+   HW_SELECT_PRIM_QUADS, /* fed to the GS as lines_adjacency (4 input vertices) */
+};
+
+struct geometry_constant { /* layout of the GS constant buffer; see set_uniform_location() */
+   float depth_scale;       /* (far - near) / 2 */
+   float depth_transport;   /* (far + near) / 2 */
+   uint32_t culling_config; /* XORed with "determinant < 0" in face_culling() */
+   uint32_t result_offset;  /* offset into the result SSBO */
+   float clip_planes[MAX_CLIP_PLANES][4]; /* enabled user planes, packed from slot 0 */
+};
+
+#define set_uniform_location(var, field, packed)                 \
+   do { /* packed: dword index; else vec4 slot + component */    \
+      unsigned offset = Offset(struct geometry_constant, field); \
+      var->data.driver_location = offset >> (packed ? 2 : 4);    \
+      var->data.location_frac = (offset >> 2) & 0x3;             \
+   } while (0)
+
+static nir_ssa_def *
+has_nan_or_inf(nir_builder *b, nir_ssa_def *v)
+{ /* returns a boolean: true if any component of the vec4 'v' is NaN or +/-Inf */
+   nir_ssa_def *nan = nir_bany_fnequal4(b, v, v); /* only NaN compares unequal to itself */
+
+   nir_ssa_def *imm = nir_imm_float(b, INFINITY);
+   nir_ssa_def *inf = nir_bany(b, nir_feq(b, nir_fabs(b, v), imm)); /* |x| == Inf catches both signs */
+
+   return nir_ior(b, nan, inf);
+}
+
+static void
+return_if_true(nir_builder *b, nir_ssa_def *cond)
+{ /* emits "if (cond) return;" into the shader being built */
+   nir_if *if_cond = nir_push_if(b, cond);
+   nir_jump(b, nir_jump_return); /* hw_select_create_gs() runs nir_lower_returns() afterwards */
+   nir_pop_if(b, if_cond);
+}
+
+static void
+get_input_vertices(nir_builder *b, nir_ssa_def **v)
+{ /* loads the gs.vertices_in input positions into v[], which must be large enough */
+   const int num_in_vert = b->shader->info.gs.vertices_in;
+
+   nir_variable *in_pos = nir_variable_create(
+      b->shader, nir_var_shader_in, glsl_array_type(glsl_vec4_type(), num_in_vert, 0),
+      "gl_Position");
+   in_pos->data.location = VARYING_SLOT_POS;
+
+   nir_ssa_def *is_nan_or_inf = NULL;
+   for (int i = 0; i < num_in_vert; i++) {
+      v[i] = nir_load_array_var_imm(b, in_pos, i);
+      nir_ssa_def *r = has_nan_or_inf(b, v[i]);
+      is_nan_or_inf = i ? nir_ior(b, is_nan_or_inf, r) : r; /* OR over all vertices */
+   }
+   return_if_true(b, is_nan_or_inf); /* shader returns early if any position is NaN/Inf */
+}
+
+static void
+face_culling(nir_builder *b, nir_ssa_def **v, bool packed)
+{
+   /* Use the z value of the face normal to determine if the face points to us:
+    *   Nz = (x1 - x0) * (y2 - y0) - (y1 - y0) * (x2 - x0)
+    *
+    * That formula is for NDC (Normalized Device Coordinates), but here we are
+    * in clip space (Vd = Vc / Vc.w), so multiply Nz by w0*w1*w2 to get the
+    * clip space value:
+    *   det = x0 * (y1 * w2 - y2 * w1) +
+    *         x1 * (y2 * w0 - y0 * w2) +
+    *         x2 * (y0 * w1 - y1 * w0)
+    *
+    * Only the sign of det matters, but the signs of w0/w1/w2 must be taken
+    * into account too, because a negative w flips the sign of Nz.
+    */
+   nir_ssa_def *y1w2 = nir_fmul(b, nir_channel(b, v[1], 1), nir_channel(b, v[2], 3));
+   nir_ssa_def *y2w1 = nir_fmul(b, nir_channel(b, v[2], 1), nir_channel(b, v[1], 3));
+   nir_ssa_def *y2w0 = nir_fmul(b, nir_channel(b, v[2], 1), nir_channel(b, v[0], 3));
+   nir_ssa_def *y0w2 = nir_fmul(b, nir_channel(b, v[0], 1), nir_channel(b, v[2], 3));
+   nir_ssa_def *y0w1 = nir_fmul(b, nir_channel(b, v[0], 1), nir_channel(b, v[1], 3));
+   nir_ssa_def *y1w0 = nir_fmul(b, nir_channel(b, v[1], 1), nir_channel(b, v[0], 3));
+   nir_ssa_def *t0 = nir_fmul(b, nir_channel(b, v[0], 0), nir_fsub(b, y1w2, y2w1));
+   nir_ssa_def *t1 = nir_fmul(b, nir_channel(b, v[1], 0), nir_fsub(b, y2w0, y0w2));
+   nir_ssa_def *t2 = nir_fmul(b, nir_channel(b, v[2], 0), nir_fsub(b, y0w1, y1w0));
+   nir_ssa_def *det = nir_fadd(b, nir_fadd(b, t0, t1), t2);
+
+   /* flip the sign of det when an odd number of the three w values is negative */
+   nir_ssa_def *n0 = nir_flt(b, nir_channel(b, v[0], 3), nir_imm_float(b, 0));
+   nir_ssa_def *n1 = nir_flt(b, nir_channel(b, v[1], 3), nir_imm_float(b, 0));
+   nir_ssa_def *n2 = nir_flt(b, nir_channel(b, v[2], 3), nir_imm_float(b, 0));
+   nir_ssa_def *cond = nir_ixor(b, nir_ixor(b, n0, n1), n2);
+   det = nir_bcsel(b, cond, nir_fneg(b, det), det);
+
+   nir_variable *culling_config = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_uint_type(), "culling_config");
+   set_uniform_location(culling_config, culling_config, packed);
+   nir_ssa_def *config = nir_i2b(b, nir_load_var(b, culling_config));
+
+   /* det < 0 then z points to camera */
+   nir_ssa_def *zero = nir_imm_zero(b, 1, det->bit_size);
+   nir_ssa_def *is_zero = nir_feq(b, det, zero);
+   nir_ssa_def *is_neg = nir_flt(b, det, zero);
+   nir_ssa_def *cull = nir_ixor(b, is_neg, config); /* config selects which sign is culled */
+   return_if_true(b, nir_ior(b, is_zero, cull)); /* det == 0 is always culled */
+}
+
+static void
+fast_frustum_culling(nir_builder *b, nir_ssa_def **v)
+{
+   nir_ssa_def *cull = NULL;
+
+   /* there are six culling planes for the visible volume:
+    *   1.  x + w = 0
+    *   2. -x + w = 0
+    *   3.  y + w = 0
+    *   4. -y + w = 0
+    *   5.  z + w = 0
+    *   6. -z + w = 0
+    *
+    * if all vertices of the primitive are outside (plane equation <0) of
+    * any plane, the primitive must be invisible.
+    */
+   for (int i = 0; i < 6; i++) {
+      nir_ssa_def *outside = NULL;
+
+      for (int j = 0; j < b->shader->info.gs.vertices_in; j++) {
+         nir_ssa_def *c = nir_channel(b, v[j], i >> 1); /* i >> 1 picks x, y or z */
+         if (i & 1)
+            c = nir_fneg(b, c); /* odd i tests the negated coordinate */
+
+         nir_ssa_def *r = nir_flt(b, nir_channel(b, v[j], 3), c); /* w < c  <=>  w - c < 0 */
+         outside = j ? nir_iand(b, outside, r) : r; /* every vertex outside this plane */
+      }
+
+      cull = i ? nir_ior(b, cull, outside) : outside; /* any single plane is enough */
+   }
+
+   return_if_true(b, cull);
+}
+
+static nir_ssa_def *
+get_intersection(nir_builder *b, nir_ssa_def *v1, nir_ssa_def *v2,
+                 nir_ssa_def *d1, nir_ssa_def *d2)
+{ /* d1/d2 are the plane dot products of v1/v2; callers pass values of opposite sign */
+   nir_ssa_def *factor = nir_fdiv(b, d1, nir_fsub(b, d1, d2)); /* where d interpolates to 0 */
+   return nir_fmad(b, nir_fsub(b, v2, v1), factor, v1); /* v1 + (v2 - v1) * factor */
+}
+
+#define begin_for_loop(name, max)                                       \
+   nir_variable *name##_index =                                         \
+      nir_local_variable_create(b->impl, glsl_int_type(), #name "_i");  \
+   nir_store_var(b, name##_index, nir_imm_int(b, 0), 1);                \
+   /* body runs while idx < max and may read 'idx' */                   \
+   nir_loop *name = nir_push_loop(b);                                   \
+   { /* closed by end_for_loop(name) */                                 \
+      nir_ssa_def *idx = nir_load_var(b, name##_index);                 \
+      nir_if *if_in_loop = nir_push_if(b, nir_ilt(b, idx, max));
+
+#define end_for_loop(name)                                              \
+         nir_store_var(b, name##_index, nir_iadd_imm(b, idx, 1), 1);    \
+      nir_push_else(b, if_in_loop);                                     \
+         nir_jump(b, nir_jump_break);                                   \
+      nir_pop_if(b, if_in_loop);                                        \
+   }                                                                    \
+   nir_pop_loop(b, name);
+
+static void
+clip_with_plane(nir_builder *b, nir_variable *vert, nir_variable *num_vert,
+                int max_vert, nir_ssa_def *plane)
+{
+   nir_variable *all_clipped = nir_local_variable_create(
+      b->impl, glsl_bool_type(), "all_clipped");
+   nir_store_var(b, all_clipped, nir_imm_true(b), 1);
+
+   nir_variable *dist = nir_local_variable_create(
+      b->impl, glsl_array_type(glsl_float_type(), max_vert, 0), "dist");
+
+   nir_ssa_def *num = nir_load_var(b, num_vert);
+   begin_for_loop(dist_loop, num)
+   {
+      nir_ssa_def *v = nir_load_array_var(b, vert, idx);
+      nir_ssa_def *d = nir_fdot(b, v, plane);
+      nir_store_array_var(b, dist, idx, d, 1);
+
+      nir_ssa_def *clipped = nir_flt(b, d, nir_imm_float(b, 0));
+      nir_store_var(b, all_clipped,
+                    nir_iand(b, nir_load_var(b, all_clipped), clipped), 1);
+   }
+   end_for_loop(dist_loop)
+
+   return_if_true(b, nir_load_var(b, all_clipped));
+
+   /* Use +/0/- to denote the sign of dist[i], which means:
+    * +: inside plane
+    * -: outside plane
+    * 0: just on the plane
+    *
+    * Some examples:
+    * ++++: no vertex clipped
+    * ----: all vertices clipped
+    * +-++: one vertex clipped, need to insert two vertices at '-', array grows
+    * +--+: two vertices clipped, need to insert two vertices at '--', array same
+    * +---: three vertices clipped, need to insert two vertices at '---', array trims
+    * +-0+: one vertex clipped, need to insert one vertex at '-', array same
+    *
+    * Plane clipping only produces convex polygons, so '-' must be contiguous,
+    * there's no '+-+-', so one clip plane can only grow the array by 1.
+    */
+
+   /* when the array grows or a '-' has been replaced by an inserted vertex,
+    * save the original vert to be used by the following calculation.
+    */
+   nir_variable *saved =
+      nir_local_variable_create(b->impl, glsl_vec4_type(), "saved");
+
+   nir_variable *vert_index =
+      nir_local_variable_create(b->impl, glsl_int_type(), "vert_index");
+   nir_store_var(b, vert_index, nir_imm_int(b, 0), 1);
+
+   begin_for_loop(vert_loop, num)
+   {
+      nir_ssa_def *di = nir_load_array_var(b, dist, idx);
+      nir_if *if_clipped = nir_push_if(b, nir_flt(b, di, nir_imm_float(b, 0)));
+      {
+         /* - case, we need to take care of sign change and insert vertex */
+
+         nir_ssa_def *prev = nir_bcsel(b, nir_ieq_imm(b, idx, 0),
+                                       nir_iadd_imm(b, num, -1),
+                                       nir_iadd_imm(b, idx, -1));
+         nir_ssa_def *dp = nir_load_array_var(b, dist, prev);
+         nir_if *prev_if = nir_push_if(b, nir_flt(b, nir_imm_float(b, 0), dp));
+         {
+            /* +- case, replace - with the inserted vertex
+             * assert(vert_index <= idx), the array is sure not to grow here
+             * but vert[idx] needs to be saved when vert_index==idx
+             */
+
+            nir_ssa_def *vi = nir_load_array_var(b, vert, idx);
+            nir_store_var(b, saved, vi, 0xf);
+
+            nir_ssa_def *vp = nir_load_array_var(b, vert, prev);
+            nir_ssa_def *iv = get_intersection(b, vp, vi, dp, di);
+            nir_ssa_def *index = nir_load_var(b, vert_index);
+            nir_store_array_var(b, vert, index, iv, 0xf);
+
+            nir_store_var(b, vert_index, nir_iadd_imm(b, index, 1), 1);
+         }
+         nir_pop_if(b, prev_if);
+
+         nir_ssa_def *next = nir_bcsel(b, nir_ieq(b, idx, nir_iadd_imm(b, num, -1)),
+                                       nir_imm_int(b, 0), nir_iadd_imm(b, idx, 1));
+         nir_ssa_def *dn = nir_load_array_var(b, dist, next);
+         nir_if *next_if = nir_push_if(b, nir_flt(b, nir_imm_float(b, 0), dn));
+         {
+            /* -+ case, may grow the array:
+             *   vert_index > idx: +-+ case, array grows, current vertex is in 'saved',
+             *     save the next + to 'saved', it will be replaced by the inserted vertex.
+             *   vert_index <= idx: --+ case, the last - is replaced by the inserted vertex,
+             *     no need to save the last -, because the + case won't use the - value.
+             */
+
+            nir_ssa_def *index = nir_load_var(b, vert_index);
+            nir_ssa_def *vi = nir_bcsel(b, nir_flt(b, idx, index),
+                                        nir_load_var(b, saved),
+                                        nir_load_array_var(b, vert, idx));
+            nir_ssa_def *vn = nir_load_array_var(b, vert, next);
+            nir_ssa_def *iv = get_intersection(b, vn, vi, dn, di);
+
+            nir_store_var(b, saved, nir_load_array_var(b, vert, index), 0xf);
+            nir_store_array_var(b, vert, index, iv, 0xf);
+
+            nir_store_var(b, vert_index, nir_iadd_imm(b, index, 1), 1);
+         }
+         nir_pop_if(b, next_if);
+      }
+      nir_push_else(b, if_clipped);
+      {
+         /* +/0 case, just keep the vert
+          *   vert_index > idx: array grew, vert[idx] is an inserted vertex or the prev
+          *     +/0 vertex, the current vertex is in 'saved', need to save the next vertex
+          *   vert_index < idx: array trimmed
+          */
+
+         nir_ssa_def *index = nir_load_var(b, vert_index);
+         nir_ssa_def *vi = nir_bcsel(b, nir_flt(b, idx, index),
+                                     nir_load_var(b, saved),
+                                     nir_load_array_var(b, vert, idx));
+
+         nir_store_var(b, saved, nir_load_array_var(b, vert, index), 0xf);
+         nir_store_array_var(b, vert, index, vi, 0xf);
+
+         nir_store_var(b, vert_index, nir_iadd_imm(b, index, 1), 1);
+      }
+      nir_pop_if(b, if_clipped);
+   }
+   end_for_loop(vert_loop);
+
+   nir_copy_var(b, num_vert, vert_index); /* vert_index is the vertex count after clipping */
+}
+
+static nir_ssa_def *
+get_user_clip_plane(nir_builder *b, int index, bool packed)
+{ /* 'index' is a slot in geometry_constant.clip_planes, not a GL clip plane number */
+   char name[16];
+   snprintf(name, sizeof(name), "gl_ClipPlane%d", index);
+   nir_variable *plane = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_vec4_type(), name);
+
+   set_uniform_location(plane, clip_planes[index][0], packed);
+
+   return nir_load_var(b, plane);
+}
+
+static void
+get_depth_range_transform(nir_builder *b, bool packed, nir_ssa_def **trans)
+{ /* fills trans[0..1] with the depth_scale / depth_transport uniforms */
+   nir_variable *depth_scale = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_float_type(), "depth_scale");
+   set_uniform_location(depth_scale, depth_scale, packed);
+
+   nir_variable *depth_transport = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_float_type(), "depth_transport");
+   set_uniform_location(depth_transport, depth_transport, packed);
+
+   trans[0] = nir_load_var(b, depth_scale);     /* multiplier */
+   trans[1] = nir_load_var(b, depth_transport); /* addend */
+}
+
+static nir_ssa_def *
+get_window_space_depth(nir_builder *b, nir_ssa_def *v, nir_ssa_def **trans)
+{
+   nir_ssa_def *z = nir_channel(b, v, 2);
+   nir_ssa_def *w = nir_channel(b, v, 3);
+
+   /* do the perspective division; if w==0, xyz must be 0 too (otherwise the vertex
+    * can't pass the clip test), 0/0=NaN, but we want it to be the nearest point.
+    */
+   nir_ssa_def *c = nir_feq(b, w, nir_imm_float(b, 0));
+   nir_ssa_def *d = nir_bcsel(b, c, nir_imm_float(b, -1), nir_fdiv(b, z, w));
+
+   /* map [-1, 1] to [near, far] set by glDepthRange(near, far) */
+   return nir_fmad(b, trans[0], d, trans[1]); /* trans[0] * d + trans[1] */
+}
+
+static void
+update_result_buffer(nir_builder *b, nir_ssa_def *dmin, nir_ssa_def *dmax,
+                     bool offset_from_attribute, bool packed)
+{
+   nir_ssa_def *offset;
+   if (offset_from_attribute) {
+      nir_variable *in_offset = nir_variable_create(
+         b->shader, nir_var_shader_in,
+         glsl_array_type(glsl_uint_type(), b->shader->info.gs.vertices_in, 0),
+         "result_offset");
+      in_offset->data.location = VARYING_SLOT_VAR0;
+      offset = nir_load_array_var_imm(b, in_offset, 0); /* only vertex 0's value is used */
+   } else {
+      nir_variable *uni_offset = nir_variable_create(
+         b->shader, nir_var_uniform, glsl_uint_type(), "result_offset");
+      set_uniform_location(uni_offset, result_offset, packed);
+      offset = nir_load_var(b, uni_offset);
+   }
+
+   nir_variable_create(b->shader, nir_var_mem_ssbo,
+                       glsl_array_type(glsl_uint_type(), 0, 0), "result");
+   /* driver_location is left at 0, i.e. SSBO slot 0 */
+
+   nir_ssa_def *ssbo = nir_imm_int(b, 0);
+   nir_ssbo_atomic_exchange(b, 32, ssbo, offset, nir_imm_int(b, 1)); /* +0: set to 1 (presumably the hit flag) */
+   nir_ssbo_atomic_umin(b, 32, ssbo, nir_iadd_imm(b, offset, 4), dmin); /* +4: running minimum depth */
+   nir_ssbo_atomic_umax(b, 32, ssbo, nir_iadd_imm(b, offset, 8), dmax); /* +8: running maximum depth */
+}
+
+static void
+build_point_nir_shader(nir_builder *b, union state_key state, bool packed)
+{
+   assert(b->shader->info.gs.vertices_in == 1);
+
+   nir_ssa_def *v;
+   get_input_vertices(b, &v);
+
+   fast_frustum_culling(b, &v);
+
+   nir_ssa_def *outside = NULL;
+   for (int i = 0; i < state.num_user_clip_planes; i++) {
+      nir_ssa_def *p = get_user_clip_plane(b, i, packed);
+      nir_ssa_def *d = nir_fdot(b, v, p);
+      nir_ssa_def *r = nir_flt(b, d, nir_imm_float(b, 0));
+      outside = i ? nir_ior(b, outside, r) : r; /* outside any one plane is enough */
+   }
+   if (outside) /* stays NULL when the key has no user clip planes */
+      return_if_true(b, outside);
+
+   nir_ssa_def *trans[2];
+   get_depth_range_transform(b, packed, trans);
+
+   nir_ssa_def *depth = get_window_space_depth(b, v, trans);
+   nir_ssa_def *fdepth = nir_fmul_imm(b, depth, 4294967295.0); /* scale by UINT32_MAX */
+   nir_ssa_def *idepth = nir_f2uN(b, fdepth, 32);
+
+   update_result_buffer(b, idepth, idepth, state.result_offset_from_attribute, packed); /* min == max for a point */
+}
+
+static nir_variable *
+create_clip_planes(nir_builder *b, int num_clip_planes, bool packed)
+{ /* returns a local vec4 array: 6 frustum planes followed by the user clip planes */
+   nir_variable *clip_planes = nir_local_variable_create(
+      b->impl, glsl_array_type(glsl_vec4_type(), num_clip_planes, 0), "clip_planes");
+
+   nir_ssa_def *unit_clip_planes[6] = {
+      nir_imm_vec4(b,  1,  0,  0,  1),
+      nir_imm_vec4(b, -1,  0,  0,  1),
+      nir_imm_vec4(b,  0,  1,  0,  1),
+      nir_imm_vec4(b,  0, -1,  0,  1),
+      nir_imm_vec4(b,  0,  0,  1,  1),
+      nir_imm_vec4(b,  0,  0, -1,  1),
+   };
+   for (int i = 0; i < 6; i++) /* both callers pass num_clip_planes >= 6 */
+      nir_store_array_var_imm(b, clip_planes, i, unit_clip_planes[i], 0xf);
+
+   for (int i = 6; i < num_clip_planes; i++) {
+      nir_ssa_def *p = get_user_clip_plane(b, i - 6, packed); /* user plane slots start at 0 */
+      nir_store_array_var_imm(b, clip_planes, i, p, 0xf);
+   }
+
+   return clip_planes;
+}
+
+static void
+build_line_nir_shader(nir_builder *b, union state_key state, bool packed)
+{
+   assert(b->shader->info.gs.vertices_in == 2);
+
+   nir_ssa_def *v[2];
+   get_input_vertices(b, v);
+
+   fast_frustum_culling(b, v);
+
+   nir_variable *vert0 = nir_local_variable_create(b->impl, glsl_vec4_type(), "vert0");
+   nir_store_var(b, vert0, v[0], 0xf);
+
+   nir_variable *vert1 = nir_local_variable_create(b->impl, glsl_vec4_type(), "vert1");
+   nir_store_var(b, vert1, v[1], 0xf);
+
+   const int num_clip_planes = 6 + state.num_user_clip_planes; /* frustum + user planes */
+   nir_variable *clip_planes = create_clip_planes(b, num_clip_planes, packed);
+
+   begin_for_loop(clip_loop, nir_imm_int(b, num_clip_planes))
+   {
+      nir_ssa_def *plane = nir_load_array_var(b, clip_planes, idx);
+      nir_ssa_def *v0 = nir_load_var(b, vert0);
+      nir_ssa_def *v1 = nir_load_var(b, vert1);
+      nir_ssa_def *d0 = nir_fdot(b, v0, plane);
+      nir_ssa_def *d1 = nir_fdot(b, v1, plane);
+      nir_ssa_def *n0 = nir_flt(b, d0, nir_imm_float(b, 0));
+      nir_ssa_def *n1 = nir_flt(b, d1, nir_imm_float(b, 0));
+
+      return_if_true(b, nir_iand(b, n0, n1)); /* both ends outside this plane */
+
+      nir_if *clip_if = nir_push_if(b, nir_ior(b, n0, n1)); /* exactly one end is outside here */
+      {
+         nir_ssa_def *iv = get_intersection(b, v0, v1, d0, d1);
+         nir_store_var(b, vert0, nir_bcsel(b, n0, iv, v0), 0xf); /* replace the outside end */
+         nir_store_var(b, vert1, nir_bcsel(b, n1, iv, v1), 0xf);
+      }
+      nir_pop_if(b, clip_if);
+   }
+   end_for_loop(clip_loop)
+
+   nir_ssa_def *trans[2];
+   get_depth_range_transform(b, packed, trans);
+
+   nir_ssa_def *d0 = get_window_space_depth(b, nir_load_var(b, vert0), trans);
+   nir_ssa_def *d1 = get_window_space_depth(b, nir_load_var(b, vert1), trans);
+
+   nir_ssa_def *dmin = nir_fmin(b, d0, d1);
+   nir_ssa_def *dmax = nir_fmax(b, d0, d1);
+
+   nir_ssa_def *fdmin = nir_fmul_imm(b, dmin, 4294967295.0); /* scale by UINT32_MAX */
+   nir_ssa_def *idmin = nir_f2uN(b, fdmin, 32);
+
+   nir_ssa_def *fdmax = nir_fmul_imm(b, dmax, 4294967295.0);
+   nir_ssa_def *idmax = nir_f2uN(b, fdmax, 32);
+
+   update_result_buffer(b, idmin, idmax, state.result_offset_from_attribute, packed);
+}
+
+static void
+build_planar_primitive_nir_shader(nir_builder *b, union state_key state, bool packed)
+{
+   const int num_in_vert = b->shader->info.gs.vertices_in;
+   assert(num_in_vert == 3 || num_in_vert == 4); /* triangle, or quad as lines_adjacency */
+
+   nir_ssa_def *v[4];
+   get_input_vertices(b, v);
+
+   if (state.face_culling_enabled)
+      face_culling(b, v, packed); /* reads only v[0..2] */
+
+   /* fast frustum culling, this should filter out most primitives */
+   fast_frustum_culling(b, v);
+
+   const int num_clip_planes = 6 + state.num_user_clip_planes;
+   const int max_vert = num_in_vert + num_clip_planes; /* one extra slot per clip plane */
+
+   /* TODO: could use shared memory (ie. AMD GPU LDS) for this array
+    * to reduce register usage.
+    */
+   nir_variable *vert = nir_local_variable_create(
+      b->impl, glsl_array_type(glsl_vec4_type(), max_vert, 0), "vert");
+   for (int i = 0; i < num_in_vert; i++)
+      nir_store_array_var_imm(b, vert, i, v[i], 0xf);
+
+   nir_variable *num_vert =
+      nir_local_variable_create(b->impl, glsl_int_type(), "num_vert");
+   nir_store_var(b, num_vert, nir_imm_int(b, num_in_vert), 1);
+
+   nir_variable *clip_planes = create_clip_planes(b, num_clip_planes, packed);
+
+   /* accurate clipping with all clip planes */
+   begin_for_loop(clip_loop, nir_imm_int(b, num_clip_planes))
+   {
+      nir_ssa_def *plane = nir_load_array_var(b, clip_planes, idx);
+      clip_with_plane(b, vert, num_vert, max_vert, plane); /* updates vert[] and num_vert */
+   }
+   end_for_loop(clip_loop)
+
+   nir_ssa_def *trans[2];
+   get_depth_range_transform(b, packed, trans);
+
+   nir_variable *dmin =
+      nir_local_variable_create(b->impl, glsl_float_type(), "dmin");
+   nir_store_var(b, dmin, nir_imm_float(b, 1), 1); /* start value for the fmin reduction */
+
+   nir_variable *dmax =
+      nir_local_variable_create(b->impl, glsl_float_type(), "dmax");
+   nir_store_var(b, dmax, nir_imm_float(b, 0), 1); /* start value for the fmax reduction */
+
+   begin_for_loop(depth_loop, nir_load_var(b, num_vert))
+   {
+      nir_ssa_def *vtx = nir_load_array_var(b, vert, idx);
+      nir_ssa_def *depth = get_window_space_depth(b, vtx, trans);
+      nir_store_var(b, dmin, nir_fmin(b, nir_load_var(b, dmin), depth), 1);
+      nir_store_var(b, dmax, nir_fmax(b, nir_load_var(b, dmax), depth), 1);
+   }
+   end_for_loop(depth_loop)
+
+   nir_ssa_def *fdmin = nir_fmul_imm(b, nir_load_var(b, dmin), 4294967295.0); /* scale by UINT32_MAX */
+   nir_ssa_def *idmin = nir_f2uN(b, fdmin, 32);
+
+   nir_ssa_def *fdmax = nir_fmul_imm(b, nir_load_var(b, dmax), 4294967295.0);
+   nir_ssa_def *idmax = nir_f2uN(b, fdmax, 32);
+
+   update_result_buffer(b, idmin, idmax, state.result_offset_from_attribute, packed);
+}
+
+static void *
+hw_select_create_gs(struct st_context *st, union state_key state)
+{ /* builds the GS variant for 'state'; returns what st_nir_finish_builtin_shader() returns */
+   const nir_shader_compiler_options *options =
+      st_get_nir_compiler_options(st, MESA_SHADER_GEOMETRY);
+
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
+                                                  "hw select GS");
+
+   nir_shader *nir = b.shader;
+   nir->info.inputs_read = VARYING_BIT_POS;
+   nir->info.num_ssbos = 1; /* the result buffer written by update_result_buffer() */
+   nir->info.gs.output_primitive = SHADER_PRIM_POINTS;
+   nir->info.gs.vertices_out = 1;
+   nir->info.gs.invocations = 1;
+   nir->info.gs.active_stream_mask = 1;
+
+   if (state.result_offset_from_attribute)
+      nir->info.inputs_read |= VARYING_BIT_VAR(0); /* result offset arrives in VAR0 */
+
+   bool packed = st->ctx->Const.PackedDriverUniformStorage; /* selects the set_uniform_location() addressing */
+
+   switch (state.primitive) {
+   case HW_SELECT_PRIM_POINTS:
+      nir->info.gs.input_primitive = SHADER_PRIM_POINTS;
+      nir->info.gs.vertices_in = 1;
+      build_point_nir_shader(&b, state, packed);
+      break;
+   case HW_SELECT_PRIM_LINES:
+      nir->info.gs.input_primitive = SHADER_PRIM_LINES;
+      nir->info.gs.vertices_in = 2;
+      build_line_nir_shader(&b, state, packed);
+      break;
+   case HW_SELECT_PRIM_TRIANGLES:
+      nir->info.gs.input_primitive = SHADER_PRIM_TRIANGLES;
+      nir->info.gs.vertices_in = 3;
+      build_planar_primitive_nir_shader(&b, state, packed);
+      break;
+   case HW_SELECT_PRIM_QUADS:
+      /* geometry shaders have no quad primitive, use lines_adjacency instead */
+      nir->info.gs.input_primitive = SHADER_PRIM_LINES_ADJACENCY;
+      nir->info.gs.vertices_in = 4;
+      build_planar_primitive_nir_shader(&b, state, packed);
+      break;
+   default:
+      unreachable("unexpected primitive");
+   }
+
+   nir_lower_returns(nir); /* the builders emit nir_jump_return via return_if_true() */
+
+   return st_nir_finish_builtin_shader(st, nir);
+}
+
+/* Bind state shared by all HW select geometry shaders: constant buffer 0 (depth
+ * transform, culling config, result offset, enabled user clip planes) and shader
+ * buffer 0 (name stack result buffer). Fails if a user GS/TCS/TES is bound.
+ */
+bool
+st_draw_hw_select_prepare_common(struct gl_context *ctx)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+
+   if (st->gp || st->tcp || st->tep) {
+      fprintf(stderr, "HW GL_SELECT does not support user geometry/tessellation shader\n");
+      return false;
+   }
+
+   const float znear = ctx->ViewportArray[0].Near;
+   const float zfar = ctx->ViewportArray[0].Far;
+   struct geometry_constant consts;
+   consts.depth_scale = (zfar - znear) / 2;
+   consts.depth_transport = (zfar + znear) / 2;
+   /* only consumed by shaders built with face culling enabled */
+   consts.culling_config = (ctx->Polygon.CullFaceMode == GL_BACK) !=
+                           (ctx->Polygon.FrontFace == GL_CCW);
+   /* only consumed when the result offset is not passed by attribute */
+   consts.result_offset = st->ctx->Select.ResultOffset;
+
+   /* pack the enabled user clip planes densely at the start of the array */
+   int nplanes = 0;
+   u_foreach_bit(plane, ctx->Transform.ClipPlanesEnabled) {
+      COPY_4V(consts.clip_planes[nplanes], ctx->Transform._ClipUserPlane[plane]);
+      nplanes++;
+   }
+
+   /* trim the unused clip plane slots (4 floats each) off the upload size */
+   const struct pipe_constant_buffer cb = {
+      .buffer_size = sizeof(consts) - (MAX_CLIP_PLANES - nplanes) * 4 * sizeof(float),
+      .user_buffer = &consts,
+   };
+   pipe->set_constant_buffer(pipe, PIPE_SHADER_GEOMETRY, 0, false, &cb);
+
+   const struct pipe_shader_buffer results = {
+      .buffer = ctx->Select.Result->buffer,
+      .buffer_size = MAX_NAME_STACK_RESULT_NUM * 3 * sizeof(int),
+   };
+   pipe->set_shader_buffers(pipe, PIPE_SHADER_GEOMETRY, 0, 1, &results, 0x1);
+
+   return true;
+}
+
+/* Build the GS variant key for drawing `mode` with the current GL state; an
+ * all-zero key (with the reason printed to stderr) means "unsupported". */
+static union state_key
+make_state_key(struct gl_context *ctx, int mode)
+{
+   union state_key key = {0};
+
+   switch (mode) {
+   case GL_POINTS:
+      key.primitive = HW_SELECT_PRIM_POINTS;
+      break;
+   case GL_LINES:
+   case GL_LINE_STRIP:
+   case GL_LINE_LOOP:
+      key.primitive = HW_SELECT_PRIM_LINES;
+      break;
+   case GL_QUADS:
+      key.primitive = HW_SELECT_PRIM_QUADS;
+      break;
+   case GL_TRIANGLES:
+   case GL_TRIANGLE_STRIP:
+   case GL_TRIANGLE_FAN:
+   case GL_QUAD_STRIP: /* will be broken into triangles */
+   case GL_POLYGON:    /* will be broken into triangles */
+      key.primitive = HW_SELECT_PRIM_TRIANGLES;
+      break;
+   default:
+      fprintf(stderr, "HW GL_SELECT does not support draw mode %s\n",
+              _mesa_enum_to_string(mode));
+      return (union state_key){0};
+   }
+
+   /* TODO: support gl_ClipDistance/gl_CullDistance, but it costs more regs */
+   const struct gl_program *vp = ctx->st->vp;
+   if (vp->info.clip_distance_array_size || vp->info.cull_distance_array_size) {
+      fprintf(stderr, "HW GL_SELECT does not support gl_ClipDistance/gl_CullDistance\n");
+      return (union state_key){0};
+   }
+
+   key.num_user_clip_planes = util_bitcount(ctx->Transform.ClipPlanesEnabled);
+   key.result_offset_from_attribute =
+      ctx->VertexProgram._VPMode == VP_MODE_FF &&
+      (ctx->VertexProgram._VaryingInputs & VERT_BIT_SELECT_RESULT_OFFSET) != 0;
+
+   /* face culling only applies to planar (triangle/quad) primitives */
+   if (key.primitive == HW_SELECT_PRIM_TRIANGLES ||
+       key.primitive == HW_SELECT_PRIM_QUADS)
+      key.face_culling_enabled = ctx->Polygon.CullFlag;
+
+   return key;
+}
+
+/* Look up (creating and caching on first use) the geometry shader for this draw,
+ * bind it, and rewrite info->mode to one a geometry shader accepts. Returns
+ * false if the draw is unsupported or the shader cache/shader can't be created.
+ */
+bool
+st_draw_hw_select_prepare_mode(struct gl_context *ctx, struct pipe_draw_info *info)
+{
+   union state_key key = make_state_key(ctx, info->mode);
+   if (!key.u32)
+      return false;
+
+   struct st_context *st = st_context(ctx);
+   if (!st->hw_select_shaders) {
+      st->hw_select_shaders = _mesa_hash_table_create_u32_keys(NULL);
+      if (!st->hw_select_shaders)
+         return false;
+   }
+
+   struct hash_entry *he = _mesa_hash_table_search(st->hw_select_shaders,
+                                                   (void*)(uintptr_t)key.u32);
+   void *gs;
+   if (he) {
+      gs = he->data;
+   } else {
+      gs = hw_select_create_gs(st, key);
+      if (!gs)
+         return false;
+      _mesa_hash_table_insert(st->hw_select_shaders, (void*)(uintptr_t)key.u32, gs);
+   }
+
+   cso_set_geometry_shader_handle(st->cso_context, gs);
+
+   /* Swap draw modes a geometry shader cannot consume for an equivalent one
+    * that reads the same vertex buffer structure and produces primitives with
+    * the same vertices (the primitive type may differ, because the geometry
+    * shader operates on vertices and emits nothing).
+    * QUAD_STRIP and POLYGON become same-shaped triangles. They can't be broken
+    * further into lines or points: a primitive needs >= 3 vertices to stay on
+    * the 2D (planar) path, whose algorithm differs from the line/point paths.
+    */
+   switch (info->mode) {
+   case GL_QUADS:
+      info->mode = GL_LINES_ADJACENCY;
+      break;
+   case GL_QUAD_STRIP:
+      info->mode = GL_TRIANGLE_STRIP;
+      break;
+   case GL_POLYGON:
+      info->mode = GL_TRIANGLE_FAN;
+      break;
+   default:
+      break;
+   }
+
+   /* Plain glBegin/End draws pass the result offset by attribute so that name
+    * stack changes need not flush vertices and several glBegin/End sections
+    * can merge into one draw; for them the result buffer is marked used in
+    * glEnd. Other draws (glDrawArrays, display list replay) never merge across
+    * name stack calls, so mark the result buffer used here.
+    */
+   if (!key.result_offset_from_attribute)
+      ctx->Select.ResultUsed = GL_TRUE;
+
+   return true;
+}