diff --git a/src/intel/compiler/brw/brw_opt_fill_spill.cpp b/src/intel/compiler/brw/brw_opt_fill_spill.cpp index 9c38c6c9400..f2e8a531bfb 100644 --- a/src/intel/compiler/brw/brw_opt_fill_spill.cpp +++ b/src/intel/compiler/brw/brw_opt_fill_spill.cpp @@ -149,6 +149,80 @@ brw_opt_fill_and_spill(brw_shader &s) } } + /* Optimize multiple fills from the same offset in a single block. */ + foreach_inst_in_block(brw_inst, inst, block) { + if (inst->opcode != SHADER_OPCODE_LSC_FILL) + continue; + + brw_reg inst_dst = brw_lower_vgrf_to_fixed_grf(devinfo, inst, + inst->dst); + + foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) { + /* Instruction is a fill from the same location as the previous + * fill. + */ + brw_reg scan_dst = brw_lower_vgrf_to_fixed_grf(devinfo, scan_inst, + scan_inst->dst); + + if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL && + scan_inst->force_writemask_all == inst->force_writemask_all && + scan_inst->as_scratch()->offset == inst->as_scratch()->offset && + scan_inst->size_written == inst->size_written && + scan_inst->group == inst->group && + scan_inst->as_scratch()->use_transpose == inst->as_scratch()->use_transpose) { + const unsigned reg_count = DIV_ROUND_UP(scan_inst->size_written, REG_SIZE); + const unsigned max_reg_count = 2 * reg_unit(devinfo); + + /* If the resulting MOV would try to write more than 2 + * registers, skip the optimization. + * + * FINISHME: It shouldn't be hard to generate multiple MOV + * instructions below to handle this case. + */ + if (reg_count > max_reg_count) + continue; + + if (scan_dst.equals(inst_dst)) { + scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_NOP); + } else { + /* This can occur for fills in wider SIMD modes. In SIMD32 + * on Xe2, a fill to r16 followed by a fill to r17 from the + * same location can't be trivially replaced. The resulting + * `mov(32) r17, r16` would have the same problems of memcpy + * with overlapping ranges. 
+ * + * FINISHME: This is fixable, but it requires emitting two + * MOVs with half SIMD size. It might also "just work" if + * scan_dst.nr < inst_dst.nr. + */ + if (regions_overlap(scan_dst, scan_inst->size_written, + inst_dst, inst->size_written)) { + break; + } + + scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_MOV); + scan_inst->src[0] = inst->dst; + } + + s.shader_stats.fill_count--; + block_progress = true; + } else { + /* A spill to the same location invalidates the value. */ + if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL && + scratch_intersects(devinfo, inst->as_scratch(), + scan_inst->as_scratch())) { + break; + } + + /* Write to the register being filled invalidates the value. */ + if (regions_overlap(scan_dst, scan_inst->size_written, + inst_dst, inst->size_written)) { + break; + } + } + } + } + if (block_progress) { foreach_inst_in_block_safe(brw_inst, inst, block) { if (inst->opcode == BRW_OPCODE_NOP)