diff --git a/src/intel/compiler/brw/brw_compiler.c b/src/intel/compiler/brw/brw_compiler.c
index 562a11b35fc..7b5b11bf7cc 100644
--- a/src/intel/compiler/brw/brw_compiler.c
+++ b/src/intel/compiler/brw/brw_compiler.c
@@ -280,6 +280,7 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
       DEBUG_SOFT64,
       DEBUG_NO_SEND_GATHER,
       DEBUG_NO_VRT,
+      DEBUG_NO_FILL_OPT,
    };
    for (uint32_t i = 0; i < ARRAY_SIZE(debug_bits); i++) {
       insert_u64_bit(&config, INTEL_DEBUG(debug_bits[i]));
diff --git a/src/intel/compiler/brw/brw_opt_fill_spill.cpp b/src/intel/compiler/brw/brw_opt_fill_spill.cpp
new file mode 100644
index 00000000000..9c38c6c9400
--- /dev/null
+++ b/src/intel/compiler/brw/brw_opt_fill_spill.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "brw_shader.h"
+#include "brw_builder.h"
+
+/**
+ * \file
+ *
+ * Attempt to eliminate spurious fills and spills.
+ *
+ * NOTE: This pass is run after register allocation but before
+ * brw_lower_vgrfs_to_fixed_grfs.
+ */
+
+/* Return true when the scratch byte ranges touched by \p a and \p b overlap.
+ * A spill's size comes from its payload source; a fill's from its
+ * destination write.
+ */
+static bool
+scratch_intersects(const intel_device_info *devinfo,
+                   const brw_scratch_inst *a, const brw_scratch_inst *b)
+{
+   const auto a_first = a->offset;
+   const auto a_last = (a->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        a->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        a->size_written) + a_first - 1;
+   const auto b_first = b->offset;
+   const auto b_last = (b->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        b->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        b->size_written) + b_first - 1;
+
+   return a_last >= b_first && b_last >= a_first;
+}
+
+/* Return true when the scratch byte range of \p super fully contains that
+ * of \p sub.
+ */
+static bool
+scratch_superset(const intel_device_info *devinfo,
+                 const brw_scratch_inst *super, const brw_scratch_inst *sub)
+{
+   const auto a_first = super->offset;
+   const auto a_last = (super->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        super->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        super->size_written) + a_first - 1;
+   const auto b_first = sub->offset;
+   const auto b_last = (sub->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        sub->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        sub->size_written) + b_first - 1;
+
+   return a_first <= b_first && a_last >= b_last;
+}
+
+bool
+brw_opt_fill_and_spill(brw_shader &s)
+{
+   assert(s.grf_used > 0);
+
+   const intel_device_info *devinfo = s.devinfo;
+   bool progress = false;
+
+   foreach_block(block, s.cfg) {
+      bool block_progress = false;
+
+      foreach_inst_in_block(brw_inst, inst, block) {
+         if (inst->opcode != SHADER_OPCODE_LSC_SPILL)
+            continue;
+
+         const brw_reg spilled =
+            brw_lower_vgrf_to_fixed_grf(devinfo, inst,
+                                        inst->src[SPILL_SRC_PAYLOAD2]);
+
+         /* Check for a fill from the same location while the register being
+          * spilled still contains the data. In this case, replace the fill
+          * with a simple move.
+          */
+         foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
+            /* Write to the register being spilled invalidates the value. */
+            const brw_reg scan_dst =
+               brw_lower_vgrf_to_fixed_grf(devinfo, scan_inst, scan_inst->dst);
+
+            if (regions_overlap(scan_dst, scan_inst->size_written,
+                                spilled,
+                                inst->size_read(devinfo, SPILL_SRC_PAYLOAD2))) {
+               break;
+            }
+
+            /* Spill to the same location invalidates the value. */
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
+                scratch_intersects(devinfo, scan_inst->as_scratch(),
+                                   inst->as_scratch())) {
+               break;
+            }
+
+            /* Instruction is a fill from the same location as the spill. */
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
+                scan_inst->force_writemask_all == inst->force_writemask_all &&
+                scan_inst->as_scratch()->offset == inst->as_scratch()->offset) {
+               /* This limitation is necessary because (currently) a spill may
+                * be split into multiple writes while the corresponding fill is
+                * implemented as a single transpose read. When this occurs,
+                * this optimization pass would have to be smarter than it
+                * currently is.
+                *
+                * FINISHME: This would not be an issue if the splitting
+                * occurred during spill lowering.
+                */
+               if (scan_inst->size_written != inst->size_read(devinfo, SPILL_SRC_PAYLOAD2))
+                  continue;
+
+               const unsigned reg_count = DIV_ROUND_UP(scan_inst->size_written, REG_SIZE);
+               const unsigned max_reg_count = 2 * reg_unit(devinfo);
+
+               /* If the resulting MOV would try to write more than 2
+                * registers, skip the optimization.
+                *
+                * FINISHME: It shouldn't be hard to generate multiple MOV
+                * instructions below to handle this case.
+                */
+               if (reg_count > max_reg_count)
+                  continue;
+
+               if (scan_inst->dst.equals(inst->src[SPILL_SRC_PAYLOAD2])) {
+                  scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_NOP);
+               } else {
+                  scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_MOV);
+                  scan_inst->src[0] = inst->src[SPILL_SRC_PAYLOAD2];
+               }
+
+               s.shader_stats.fill_count--;
+               block_progress = true;
+            }
+         }
+
+         /* Scan again. This time check whether there is a spill to the same
+          * location without an intervening fill from that location. In this
+          * case, the first spill is "killed" and can be removed.
+          */
+         foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
+                scratch_intersects(devinfo, inst->as_scratch(),
+                                   scan_inst->as_scratch())) {
+               break;
+            }
+
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
+                scratch_superset(devinfo, scan_inst->as_scratch(),
+                                 inst->as_scratch())) {
+               inst = brw_transform_inst(s, inst, BRW_OPCODE_NOP);
+               s.shader_stats.spill_count--;
+               block_progress = true;
+               break;
+            }
+         }
+      }
+
+      if (block_progress) {
+         foreach_inst_in_block_safe(brw_inst, inst, block) {
+            if (inst->opcode == BRW_OPCODE_NOP)
+               inst->remove();
+         }
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
+                            BRW_DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw/brw_reg_allocate.cpp b/src/intel/compiler/brw/brw_reg_allocate.cpp
index bd3d3bd9b7e..f39365606da 100644
--- a/src/intel/compiler/brw/brw_reg_allocate.cpp
+++ b/src/intel/compiler/brw/brw_reg_allocate.cpp
@@ -1200,6 +1200,10 @@ brw_reg_alloc::spill_reg(unsigned spill_reg)
           * scratch space and the scratch read message, which operates on
           * 32 bit channels. It shouldn't hurt in any case because the
           * unspill destination is a block-local temporary.
+          *
+          * FINISHME: However, this will prevent brw_opt_fill_and_spill
+          * from making progress if the lsc_fill is NoMask and the
+          * lsc_spill is not.
           */
          emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
                       unspill_dst, subset_spill_offset, count, ip);
diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp
index d5f091566ac..ef5221852a3 100644
--- a/src/intel/compiler/brw/brw_shader.cpp
+++ b/src/intel/compiler/brw/brw_shader.cpp
@@ -1278,6 +1278,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
       s.debug_optimizer(nir, "post_ra_alloc", iteration, pass_num);
 
       if (s.spilled_any_registers) {
+         if (!INTEL_DEBUG(DEBUG_NO_FILL_OPT))
+            OPT(brw_opt_fill_and_spill);
+
          OPT(brw_lower_fill_and_spill);
       }
 
diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h
index b53b4f86bb4..bfa81f6b315 100644
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@@ -364,6 +364,7 @@ bool brw_opt_copy_propagation_defs(brw_shader &s);
 bool brw_opt_cse_defs(brw_shader &s);
 bool brw_opt_dead_code_eliminate(brw_shader &s);
 bool brw_opt_eliminate_find_live_channel(brw_shader &s);
+bool brw_opt_fill_and_spill(brw_shader &s);
 bool brw_opt_register_coalesce(brw_shader &s);
 bool brw_opt_remove_extra_rounding_modes(brw_shader &s);
 bool brw_opt_remove_redundant_halts(brw_shader &s);
diff --git a/src/intel/compiler/brw/meson.build b/src/intel/compiler/brw/meson.build
index cb44c62083e..dea40949c08 100644
--- a/src/intel/compiler/brw/meson.build
+++ b/src/intel/compiler/brw/meson.build
@@ -83,6 +83,7 @@ libintel_compiler_brw_files = files(
   'brw_opt_copy_propagation.cpp',
   'brw_opt_cse.cpp',
   'brw_opt_dead_code_eliminate.cpp',
+  'brw_opt_fill_spill.cpp',
   'brw_opt_register_coalesce.cpp',
   'brw_opt_saturate_propagation.cpp',
   'brw_opt_txf_combiner.cpp',
diff --git a/src/intel/dev/intel_debug.c b/src/intel/dev/intel_debug.c
index 380c1c703a9..c724d01ac4e 100644
--- a/src/intel/dev/intel_debug.c
+++ b/src/intel/dev/intel_debug.c
@@ -75,6 +75,7 @@ static const struct debug_control_bitset debug_control[] = {
    OPT1("ann", DEBUG_ANNOTATION),
    OPT1("no8", DEBUG_NO8),
    OPT1("no-oaconfig", DEBUG_NO_OACONFIG),
+   OPT1("no-fill-opt", DEBUG_NO_FILL_OPT),
    OPT1("spill_fs", DEBUG_SPILL_FS),
    OPT1("spill_vec4", DEBUG_SPILL_VEC4),
    OPT1("cs", DEBUG_CS),
diff --git a/src/intel/dev/intel_debug.h b/src/intel/dev/intel_debug.h
index a6da7287a44..1f250d6bb1a 100644
--- a/src/intel/dev/intel_debug.h
+++ b/src/intel/dev/intel_debug.h
@@ -60,6 +60,7 @@ enum intel_debug_flag {
    DEBUG_MDA,
    DEBUG_ANNOTATION,
    DEBUG_NO_OACONFIG,
+   DEBUG_NO_FILL_OPT,
    DEBUG_SPILL_FS,
    DEBUG_SPILL_VEC4,
    DEBUG_HEX,