diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 0acf608c593..0221dc676b1 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -196,6 +196,7 @@ files_libnir = files(
   'nir_lower_ssbo.c',
   'nir_lower_subgroups.c',
   'nir_lower_system_values.c',
+  'nir_lower_task_shader.c',
   'nir_lower_tex_shadow.c',
   'nir_lower_tex.c',
   'nir_lower_texcoord_replace.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 93b6b39065a..c969bbff1a7 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5249,6 +5249,12 @@ typedef enum {
 bool nir_lower_gs_intrinsics(nir_shader *shader,
                              nir_lower_gs_intrinsics_flags options);
 
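+/* Options for nir_lower_task_shader. When payload_to_shared_for_atomics
+ * is set, task_payload atomics are implemented with atomics on a shared
+ * memory copy of the payload, so the backend only has to support plain
+ * load/store for task_payload.
+ */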
+typedef struct {
+   bool payload_to_shared_for_atomics : 1;
+} nir_lower_task_shader_options;
+
+bool nir_lower_task_shader(nir_shader *shader, nir_lower_task_shader_options options);
+
 typedef unsigned (*nir_lower_bit_size_callback)(const nir_instr *, void *);
 
 bool nir_lower_bit_size(nir_shader *shader,
diff --git a/src/compiler/nir/nir_lower_task_shader.c b/src/compiler/nir/nir_lower_task_shader.c
new file mode 100644
index 00000000000..6e48efe6e46
--- /dev/null
+++ b/src/compiler/nir/nir_lower_task_shader.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright © 2022 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Timur Kristóf
+ *
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "util/u_math.h"
+
+typedef struct {
+   uint32_t task_count_shared_addr;
+} lower_task_nv_state;
+
+typedef struct {
+   /* If true, lower all task_payload I/O to use shared memory. */
+   bool payload_in_shared;
+   /* Shared memory address where task_payload will be located. */
+   uint32_t payload_shared_addr;
+} lower_task_state;
+
+static bool
+lower_nv_task_output(nir_builder *b,
+                     nir_instr *instr,
+                     void *state)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   lower_task_nv_state *s = (lower_task_nv_state *) state;
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_load_output: {
+      b->cursor = nir_after_instr(instr);
+      nir_ssa_def *load =
+         nir_load_shared(b, 1, 32, nir_imm_int(b, 0),
+                         .base = s->task_count_shared_addr);
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, load);
+      nir_instr_remove(instr);
+      return true;
+   }
+
+   case nir_intrinsic_store_output: {
+      b->cursor = nir_after_instr(instr);
+      nir_ssa_def *store_val = intrin->src[0].ssa;
+      nir_store_shared(b, store_val, nir_imm_int(b, 0),
+                       .base = s->task_count_shared_addr);
+      nir_instr_remove(instr);
+      return true;
+   }
+
+   default:
+      return false;
+   }
+}
+
+static void
+append_launch_mesh_workgroups_to_nv_task(nir_builder *b,
+                                         lower_task_nv_state *s)
+{
+   /* At the beginning of the shader, write 0 to the task count.
+    * This ensures that 0 mesh workgroups are launched when the
+    * shader doesn't write the TASK_COUNT output.
+    */
+   b->cursor = nir_before_cf_list(&b->impl->body);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   nir_store_shared(b, zero, zero, .base = s->task_count_shared_addr);
+
+   /* At the end of the shader, read the task count from shared memory
+    * and emit launch_mesh_workgroups.
+    */
+   b->cursor = nir_after_cf_list(&b->impl->body);
+   nir_ssa_def *task_count =
+      nir_load_shared(b, 1, 32, zero, .base = s->task_count_shared_addr);
+
+   /* NV_mesh_shader doesn't offer a way to choose which task_payload
+    * variables should be passed to mesh shaders, so we just pass all of them.
+    */
+   uint32_t range = b->shader->info.task_payload_size;
+
+   nir_ssa_def *one = nir_imm_int(b, 1);
+   nir_ssa_def *dispatch_3d = nir_vec3(b, task_count, one, one);
+   nir_launch_mesh_workgroups(b, dispatch_3d, .base = 0, .range = range);
+}
+
+/**
+ * For NV_mesh_shader:
+ * Task shaders have only one output, TASK_COUNT: a 32-bit unsigned int
+ * that contains the 1-dimensional mesh dispatch size.
+ * This output should behave like a shared variable.
+ *
+ * We lower this output to a shared variable and then emit
+ * the new launch_mesh_workgroups intrinsic at the end of the shader.
+ */
+static void
+nir_lower_nv_task_count(nir_shader *shader)
+{
+   lower_task_nv_state state = {
+      .task_count_shared_addr = ALIGN(shader->info.shared_size, 4),
+   };
+
+   /* Account for any padding added by the alignment above. */
+   shader->info.shared_size = state.task_count_shared_addr + 4;
+   nir_shader_instructions_pass(shader, lower_nv_task_output,
+                                nir_metadata_none, &state);
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   nir_builder builder;
+   nir_builder_init(&builder, impl);
+
+   append_launch_mesh_workgroups_to_nv_task(&builder, &state);
+   nir_metadata_preserve(impl, nir_metadata_none);
+}
+
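+/* Returns the shared memory intrinsic that corresponds to a given
+ * task_payload intrinsic, e.g. task_payload_atomic_add becomes
+ * shared_atomic_add and load_task_payload becomes load_shared.
+ */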
+static nir_intrinsic_op
+shared_opcode_for_task_payload(nir_intrinsic_op task_payload_op)
+{
+   switch (task_payload_op) {
+#define OP(O) case nir_intrinsic_task_payload_##O: return nir_intrinsic_shared_##O;
+   OP(atomic_exchange)
+   OP(atomic_comp_swap)
+   OP(atomic_add)
+   OP(atomic_imin)
+   OP(atomic_umin)
+   OP(atomic_imax)
+   OP(atomic_umax)
+   OP(atomic_and)
+   OP(atomic_or)
+   OP(atomic_xor)
+   OP(atomic_fadd)
+   OP(atomic_fmin)
+   OP(atomic_fmax)
+   OP(atomic_fcomp_swap)
+#undef OP
+   case nir_intrinsic_load_task_payload:
+      return nir_intrinsic_load_shared;
+   case nir_intrinsic_store_task_payload:
+      return nir_intrinsic_store_shared;
+   default:
+      unreachable("Invalid task payload atomic");
+   }
+}
+
+static bool
+lower_task_payload_to_shared(nir_builder *b,
+                             nir_intrinsic_instr *intrin,
+                             lower_task_state *s)
+{
+   /* This assumes that shared and task_payload intrinsics
+    * have the same number of sources and same indices.
+    */
+   unsigned base = nir_intrinsic_base(intrin);
+   intrin->intrinsic = shared_opcode_for_task_payload(intrin->intrinsic);
+   nir_intrinsic_set_base(intrin, base + s->payload_shared_addr);
+
+   return true;
+}
+
+/* Copies the task_payload from shared memory to its real storage,
+ * splitting the copy into vec4 (16-byte) chunks that are distributed
+ * across all invocations in the workgroup. Note that the copy is
+ * rounded up to whole chunks per invocation, so it can overshoot
+ * payload_size when that is not a multiple of invocations * 16 bytes.
+ */
+static void
+emit_shared_to_payload_copy(nir_builder *b,
+                            uint32_t payload_addr,
+                            uint32_t payload_size,
+                            lower_task_state *s)
+{
+   const unsigned invocations = b->shader->info.workgroup_size[0] *
+                                b->shader->info.workgroup_size[1] *
+                                b->shader->info.workgroup_size[2];
+   const unsigned bytes_per_copy = 16;
+   const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
+   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
+   const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;
+
+   nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
+   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);
+
+   /* Wait for all previous shared stores to finish.
+    * This is necessary because we placed the payload in shared memory.
+    */
+   nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
+                      .memory_scope = NIR_SCOPE_WORKGROUP,
+                      .memory_semantics = NIR_MEMORY_ACQ_REL,
+                      .memory_modes = nir_var_mem_shared);
+
+   for (unsigned i = 0; i < copies_per_invocation; ++i) {
+      unsigned const_off = bytes_per_copy * invocations * i;
+
+      /* Read from shared memory. */
+      nir_ssa_def *copy =
+         nir_load_shared(b, 4, 32, addr, .align_mul = 16,
+                         .base = base_shared_addr + const_off);
+
+      /* Write to task payload memory. */
+      nir_store_task_payload(b, copy, addr, .base = const_off);
+   }
+}
+
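+/* Lowers a launch_mesh_workgroups intrinsic: when the payload was moved
+ * to shared memory, copies it back to the real task_payload storage
+ * first, then deletes everything that follows the launch, because it is
+ * a terminating instruction.
+ */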
+static bool
+lower_task_launch_mesh_workgroups(nir_builder *b,
+                                  nir_intrinsic_instr *intrin,
+                                  lower_task_state *s)
+{
+   if (s->payload_in_shared) {
+      /* Copy the payload from shared memory.
+       * Because launch_mesh_workgroups may only occur in
+       * workgroup-uniform control flow, here we assume that
+       * all invocations in the workgroup are active and therefore
+       * they can all participate in the copy.
+       *
+       * TODO: Skip the copy when the mesh dispatch size is (0, 0, 0).
+       * This is problematic because the dispatch size can be divergent,
+       * and may differ across subgroups.
+       */
+
+      uint32_t payload_addr = nir_intrinsic_base(intrin);
+      uint32_t payload_size = nir_intrinsic_range(intrin);
+
+      b->cursor = nir_before_instr(&intrin->instr);
+      emit_shared_to_payload_copy(b, payload_addr, payload_size, s);
+   }
+
+   /* The launch_mesh_workgroups intrinsic is a terminating instruction,
+    * so let's delete everything after it.
+    */
+   b->cursor = nir_after_instr(&intrin->instr);
+   nir_block *current_block = nir_cursor_current_block(b->cursor);
+
+   /* Delete the following instructions in the current block. */
+   nir_foreach_instr_reverse_safe(instr, current_block) {
+      if (instr == &intrin->instr)
+         break;
+      nir_instr_remove(instr);
+   }
+
+   /* Delete the following CF nodes at the same level. */
+   b->cursor = nir_after_instr(&intrin->instr);
+   nir_cf_list extracted;
+   nir_cf_node *end_node = &current_block->cf_node;
+   while (!nir_cf_node_is_last(end_node))
+      end_node = nir_cf_node_next(end_node);
+   nir_cf_extract(&extracted, b->cursor, nir_after_cf_node(end_node));
+   nir_cf_delete(&extracted);
+
+   /* Terminate the task shader. */
+   b->cursor = nir_after_instr(&intrin->instr);
+   nir_jump(b, nir_jump_return);
+
+   return true;
+}
+
+static bool
+lower_task_intrin(nir_builder *b,
+                  nir_instr *instr,
+                  void *state)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   lower_task_state *s = (lower_task_state *) state;
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_task_payload_atomic_add:
+   case nir_intrinsic_task_payload_atomic_imin:
+   case nir_intrinsic_task_payload_atomic_umin:
+   case nir_intrinsic_task_payload_atomic_imax:
+   case nir_intrinsic_task_payload_atomic_umax:
+   case nir_intrinsic_task_payload_atomic_and:
+   case nir_intrinsic_task_payload_atomic_or:
+   case nir_intrinsic_task_payload_atomic_xor:
+   case nir_intrinsic_task_payload_atomic_exchange:
+   case nir_intrinsic_task_payload_atomic_comp_swap:
+   case nir_intrinsic_task_payload_atomic_fadd:
+   case nir_intrinsic_task_payload_atomic_fmin:
+   case nir_intrinsic_task_payload_atomic_fmax:
+   case nir_intrinsic_task_payload_atomic_fcomp_swap:
+   case nir_intrinsic_store_task_payload:
+   case nir_intrinsic_load_task_payload:
+      if (s->payload_in_shared)
+         return lower_task_payload_to_shared(b, intrin, s);
+      return false;
+   case nir_intrinsic_launch_mesh_workgroups:
+      return lower_task_launch_mesh_workgroups(b, intrin, s);
+   default:
+      return false;
+   }
+}
+
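+/* Returns true when the shader uses any atomic intrinsic on
+ * task_payload memory.
+ */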
+static bool
+uses_task_payload_atomics(nir_shader *shader)
+{
+   nir_foreach_function(func, shader) {
+      if (!func->impl)
+         continue;
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_task_payload_atomic_add:
+            case nir_intrinsic_task_payload_atomic_imin:
+            case nir_intrinsic_task_payload_atomic_umin:
+            case nir_intrinsic_task_payload_atomic_imax:
+            case nir_intrinsic_task_payload_atomic_umax:
+            case nir_intrinsic_task_payload_atomic_and:
+            case nir_intrinsic_task_payload_atomic_or:
+            case nir_intrinsic_task_payload_atomic_xor:
+            case nir_intrinsic_task_payload_atomic_exchange:
+            case nir_intrinsic_task_payload_atomic_comp_swap:
+            case nir_intrinsic_task_payload_atomic_fadd:
+            case nir_intrinsic_task_payload_atomic_fmin:
+            case nir_intrinsic_task_payload_atomic_fmax:
+            case nir_intrinsic_task_payload_atomic_fcomp_swap:
+               return true;
+            default:
+               break;
+            }
+         }
+      }
+   }
+
+   return false;
+}
+
+/**
+ * Common task shader lowering to make the job of the backends easier.
+ *
+ * - Lowers the NV_mesh_shader TASK_COUNT output to launch_mesh_workgroups.
+ * - Removes all code after launch_mesh_workgroups, enforcing the
+ *   fact that it's a terminating instruction.
+ * - Ensures that task shaders always have at least one
+ *   launch_mesh_workgroups instruction, so the backend doesn't
+ *   need to implement a special case when the shader doesn't have it.
+ * - Optionally, implements task_payload using shared memory when
+ *   task_payload atomics are used.
+ *   This is useful when the backend is otherwise not capable of
+ *   handling the same atomic features for task_payload as it can
+ *   for shared memory.
+ *   If this is used, the backend only has to implement the basic
+ *   load/store operations for task_payload.
+ *
+ * Note that this pass operates on lowered explicit I/O intrinsics, so
+ * it should be called after nir_lower_io + nir_lower_explicit_io.
+ */
+bool
+nir_lower_task_shader(nir_shader *shader,
+                      nir_lower_task_shader_options options)
+{
+   if (shader->info.stage != MESA_SHADER_TASK)
+      return false;
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   nir_builder builder;
+   nir_builder_init(&builder, impl);
+
+   if (shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_TASK_COUNT)) {
+      /* NV_mesh_shader:
+       * If the shader writes TASK_COUNT, lower that to emit
+       * the new launch_mesh_workgroups intrinsic instead.
+       */
+      nir_lower_nv_task_count(shader);
+   } else {
+      /* To make sure that task shaders always have a code path that
+       * executes launch_mesh_workgroups, let's add one at the end.
+       * If the shader already had a launch_mesh_workgroups, the code
+       * after it (including this one) is removed when that launch
+       * is lowered.
+       */
+      builder.cursor = nir_after_cf_list(&builder.impl->body);
+      nir_launch_mesh_workgroups(&builder, nir_imm_zero(&builder, 3, 32));
+   }
+
+   bool payload_in_shared = options.payload_to_shared_for_atomics &&
+                            uses_task_payload_atomics(shader);
+
+   lower_task_state state = {
+      .payload_shared_addr = ALIGN(shader->info.shared_size, 16),
+      .payload_in_shared = payload_in_shared,
+   };
+
+   if (payload_in_shared)
+      shader->info.shared_size =
+         state.payload_shared_addr + shader->info.task_payload_size;
+
+   nir_shader_instructions_pass(shader, lower_task_intrin,
+                                nir_metadata_none, &state);
+
+   /* Delete all code that potentially can't be reached due to
+    * launch_mesh_workgroups being a terminating instruction.
+    */
+   nir_lower_returns(shader);
+
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, shader, nir_opt_dead_cf);
+      NIR_PASS(progress, shader, nir_opt_dce);
+   } while (progress);
+
+   return true;
+}
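+
+/* A sketch of how a backend might invoke this pass (hypothetical call
+ * site; the pass itself does not prescribe one):
+ *
+ *    nir_lower_task_shader_options opts = {
+ *       .payload_to_shared_for_atomics = true,
+ *    };
+ *    NIR_PASS_V(nir, nir_lower_task_shader, opts);
+ */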