diff --git a/src/intel/compiler/brw/brw_opt_fill_spill.cpp b/src/intel/compiler/brw/brw_opt_fill_spill.cpp
index 9c38c6c9400..f2e8a531bfb 100644
--- a/src/intel/compiler/brw/brw_opt_fill_spill.cpp
+++ b/src/intel/compiler/brw/brw_opt_fill_spill.cpp
@@ -149,6 +149,80 @@ brw_opt_fill_and_spill(brw_shader &s)
          }
       }
 
+      /* Optimize multiple fills from the same offset in a single block. */
+      foreach_inst_in_block(brw_inst, inst, block) {
+         if (inst->opcode != SHADER_OPCODE_LSC_FILL)
+            continue;
+
+         brw_reg inst_dst = brw_lower_vgrf_to_fixed_grf(devinfo, inst,
+                                                        inst->dst);
+
+         foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
+            /* Instruction is a fill from the same location as the previous
+             * fill.
+             */
+            brw_reg scan_dst = brw_lower_vgrf_to_fixed_grf(devinfo, scan_inst,
+                                                           scan_inst->dst);
+
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
+                scan_inst->force_writemask_all == inst->force_writemask_all &&
+                scan_inst->as_scratch()->offset == inst->as_scratch()->offset &&
+                scan_inst->size_written == inst->size_written &&
+                scan_inst->group == inst->group &&
+                scan_inst->as_scratch()->use_transpose == inst->as_scratch()->use_transpose) {
+               const unsigned reg_count = DIV_ROUND_UP(scan_inst->size_written, REG_SIZE);
+               const unsigned max_reg_count = 2 * reg_unit(devinfo);
+
+               /* If the resulting MOV would try to write more than 2
+                * registers, skip the optimization.
+                *
+                * FINISHME: It shouldn't be hard to generate multiple MOV
+                * instructions below to handle this case.
+                */
+               if (reg_count > max_reg_count)
+                  continue;
+
+               if (scan_dst.equals(inst_dst)) {
+                  scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_NOP);
+               } else {
+                  /* This can occur for fills in wider SIMD modes. In SIMD32
+                   * on Xe2, a fill to r16 followed by a fill to r17 from the
+                   * same location can't be trivially replaced. The resulting
+                   * `mov(32) r17, r16` would have the same problem as memcpy
+                   * with overlapping ranges.
+                   *
+                   * FINISHME: This is fixable, but it requires emitting two
+                   * MOVs with half SIMD size. It might also "just work" if
+                   * scan_dst.nr < inst_dst.nr.
+                   */
+                  if (regions_overlap(scan_dst, scan_inst->size_written,
+                                      inst_dst, inst->size_written)) {
+                     break;
+                  }
+
+                  scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_MOV);
+                  scan_inst->src[0] = inst->dst;
+               }
+
+               s.shader_stats.fill_count--;
+               block_progress = true;
+            } else {
+               /* A spill to the same location invalidates the value. */
+               if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
+                   scratch_intersects(devinfo, inst->as_scratch(),
+                                      scan_inst->as_scratch())) {
+                  break;
+               }
+
+               /* Write to the register being filled invalidates the value. */
+               if (regions_overlap(scan_dst, scan_inst->size_written,
+                                   inst_dst, inst->size_written)) {
+                  break;
+               }
+            }
+         }
+      }
+
       if (block_progress) {
          foreach_inst_in_block_safe(brw_inst, inst, block) {
             if (inst->opcode == BRW_OPCODE_NOP)