From e9ae997ffcdeda8f49a25b44ec707447946ddef0 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 30 Jun 2025 15:51:58 -0700 Subject: [PATCH] brw: Only apply GRF 127 send workaround to Gfx9 The portion of the Bspec dedicated to Gfx6-Gfx11 says that this workaround applies to "Pre-CNL" (with CNL being Gfx10). There is no mention of this workaround in the sections for Xe or Xe2. No shader-db or fossil-db changes on Skylake or older Intel platforms. shader-db: Lunar Lake, Meteor Lake, DG2, Tiger Lake, and Ice Lake (Lunar Lake shown) total instructions in shared programs: 17107031 -> 17107027 (<.01%) instructions in affected programs: 32182 -> 32178 (-0.01%) helped: 16 / HURT: 14 total cycles in shared programs: 895016760 -> 894975410 (<.01%) cycles in affected programs: 312774834 -> 312733484 (-0.01%) helped: 9279 / HURT: 8091 LOST: 40 GAINED: 33 The pre-Xe2 platforms had a lot more lost / gained shaders. This appears to be due to churn in the cycle counts and the SIMD32 heuristic. fossil-db: Lunar Lake Totals: Instrs: 208667436 -> 208671853 (+0.00%); split: -0.00%, +0.01% Subgroup size: 14241168 -> 14241200 (+0.00%) Cycle count: 31495149690 -> 31481397970 (-0.04%); split: -0.17%, +0.13% Spill count: 508467 -> 508701 (+0.05%); split: -0.10%, +0.14% Fill count: 611979 -> 612583 (+0.10%); split: -0.07%, +0.17% Scratch Memory Size: 35288064 -> 35311616 (+0.07%); split: -0.07%, +0.14% Totals from 205773 (29.10% of 707019) affected shaders: Instrs: 103153541 -> 103157958 (+0.00%); split: -0.01%, +0.01% Subgroup size: 4563584 -> 4563616 (+0.00%) Cycle count: 12979963010 -> 12966211290 (-0.11%); split: -0.42%, +0.32% Spill count: 494741 -> 494975 (+0.05%); split: -0.10%, +0.15% Fill count: 597988 -> 598592 (+0.10%); split: -0.07%, +0.17% Scratch Memory Size: 33351680 -> 33375232 (+0.07%); split: -0.08%, +0.15% Meteor Lake and DG2 had similar results. (Meteor Lake shown) Totals: Instrs: 233063764 -> 233057897 (-0.00%); split: -0.01%, +0.00% Subgroup size: 9892840 -> 9892856 (+0.00%) Cycle count: 25387597341 -> 25373885583 (-0.05%); split: -0.36%, +0.31% Spill count: 518469 -> 517940 (-0.10%); split: -0.19%, +0.09% Fill count: 559444 -> 558537 (-0.16%); split: -0.29%, +0.13% Scratch Memory Size: 19694592 -> 19658752 (-0.18%); split: -0.21%, +0.03% Max dispatch width: 7135248 -> 7131672 (-0.05%); split: +0.13%, -0.18% Totals from 301996 (37.49% of 805603) affected shaders: Instrs: 144535999 -> 144530132 (-0.00%); split: -0.01%, +0.01% Subgroup size: 3768528 -> 3768544 (+0.00%) Cycle count: 18687102311 -> 18673390553 (-0.07%); split: -0.50%, +0.42% Spill count: 515687 -> 515158 (-0.10%); split: -0.20%, +0.09% Fill count: 557638 -> 556731 (-0.16%); split: -0.29%, +0.13% Scratch Memory Size: 18662400 -> 18626560 (-0.19%); split: -0.22%, +0.03% Max dispatch width: 2029872 -> 2026296 (-0.18%); split: +0.44%, -0.62% Tiger Lake Totals: Instrs: 238813279 -> 238766482 (-0.02%); split: -0.04%, +0.02% Subgroup size: 9851320 -> 9851328 (+0.00%) Cycle count: 23668877036 -> 23646286421 (-0.10%); split: -0.51%, +0.42% Spill count: 559060 -> 554241 (-0.86%); split: -1.12%, +0.26% Fill count: 595926 -> 591843 (-0.69%); split: -1.46%, +0.78% Scratch Memory Size: 19929088 -> 19764224 (-0.83%); split: -1.19%, +0.36% Max dispatch width: 7102184 -> 7101840 (-0.00%); split: +0.13%, -0.13% Totals from 284125 (35.42% of 802235) affected shaders: Instrs: 144695094 -> 144648297 (-0.03%); split: -0.06%, +0.03% Subgroup size: 3567312 -> 3567320 (+0.00%) Cycle count: 11303753658 -> 11281163043 (-0.20%); split: -1.07%, +0.87% Spill count: 554624 -> 549805 (-0.87%); split: -1.13%, +0.26% Fill count: 592252 -> 588169 (-0.69%); split: -1.47%, +0.78% Scratch Memory Size: 19553280 -> 19388416 (-0.84%); split: -1.21%, +0.37% Max dispatch width: 1895488 -> 1895144 (-0.02%); split: +0.48%, -0.50% Ice Lake Totals: Instrs: 239034316 -> 239049108 (+0.01%); split: -0.03%, +0.04% Subgroup size: 9926440 -> 9926448 (+0.00%) Cycle count: 24944253156 -> 24919967386 (-0.10%); split: -0.25%, +0.15% Spill count: 575498 -> 571612 (-0.68%); split: -1.18%, +0.51% Fill count: 709760 -> 716665 (+0.97%); split: -1.31%, +2.28% Scratch Memory Size: 20699136 -> 20599808 (-0.48%); split: -1.45%, +0.97% Max dispatch width: 7140856 -> 7143568 (+0.04%); split: +0.15%, -0.12% Totals from 233451 (29.01% of 804669) affected shaders: Instrs: 127440610 -> 127455402 (+0.01%); split: -0.07%, +0.08% Subgroup size: 2835784 -> 2835792 (+0.00%) Cycle count: 11818511030 -> 11794225260 (-0.21%); split: -0.53%, +0.32% Spill count: 559557 -> 555671 (-0.69%); split: -1.22%, +0.52% Fill count: 694460 -> 701365 (+0.99%); split: -1.34%, +2.33% Scratch Memory Size: 19774464 -> 19675136 (-0.50%); split: -1.52%, +1.02% Max dispatch width: 1602736 -> 1605448 (+0.17%); split: +0.69%, -0.52% Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_eu_validate.c | 12 ++++---- src/intel/compiler/brw_opt_bank_conflicts.cpp | 15 ++++++---- src/intel/compiler/brw_reg_allocate.cpp | 28 +++++++++++++++---- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index aa66f4f2e29..326b431e9af 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -369,11 +369,13 @@ send_restrictions(const struct brw_isa_info *isa, inst->src[0].nr < 112, "send with EOT must use g112-g127"); - ERROR_IF(!dst_is_null(inst) && - (inst->dst.nr + brw_eu_inst_rlen(devinfo, inst->raw) > 127) && - (inst->src[0].nr + brw_eu_inst_mlen(devinfo, inst->raw) > inst->dst.nr), - "r127 must not be used for return address when there is " - "a src and dest overlap"); + if (devinfo->ver < 10) { + ERROR_IF(!dst_is_null(inst) && + (inst->dst.nr + brw_eu_inst_rlen(devinfo, inst->raw) > 127) && + (inst->src[0].nr + brw_eu_inst_mlen(devinfo, inst->raw) > inst->dst.nr), + "r127 must not be used for return address when there is " + "a src and dest overlap"); + } } return error_msg; diff --git a/src/intel/compiler/brw_opt_bank_conflicts.cpp b/src/intel/compiler/brw_opt_bank_conflicts.cpp index 1a447498b8c..8817ef9c753 100644 --- a/src/intel/compiler/brw_opt_bank_conflicts.cpp +++ b/src/intel/compiler/brw_opt_bank_conflicts.cpp @@ -541,16 +541,19 @@ namespace { for (unsigned reg = 0; reg < 2; reg++) constrained[p.atom_of_reg(reg)] = true; - /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference", - * subsection "EUISA Instructions", Send Message (page 990): + /* Bspec says: * - * "r127 must not be used for return address when there is a src and - * dest overlap in send instruction." + * [Pre-CNL] r127 must not be used for return address when there is a + * src and dest overlap in send instruction. + * + * The Intel Broadwell PRM, vol 07, section "Instruction Set Reference", + * subsection "EUISA Instructions", Send Message (page 990) contains the + * same text. * * Register allocation ensures that, so don't move 127 around to avoid - * breaking that property. + * breaking that property. The workaround will only be applied to Gfx9. */ - constrained[p.atom_of_reg(127)] = true; + constrained[p.atom_of_reg(127)] = v->devinfo->ver < 10; foreach_block_and_inst(block, brw_inst, inst, v->cfg) { /* Assume that anything referenced via fixed GRFs is baked into the diff --git a/src/intel/compiler/brw_reg_allocate.cpp b/src/intel/compiler/brw_reg_allocate.cpp index fc326489a28..a199d680356 100644 --- a/src/intel/compiler/brw_reg_allocate.cpp +++ b/src/intel/compiler/brw_reg_allocate.cpp @@ -556,11 +556,14 @@ brw_reg_alloc::setup_inst_interference(const brw_inst *inst) } if (grf127_send_hack_node >= 0) { - /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference", - * subsection "EUISA Instructions", Send Message (page 990): + /* Bspec says: * - * "r127 must not be used for return address when there is a src and - * dest overlap in send instruction." + * [Pre-CNL] r127 must not be used for return address when there is a + * src and dest overlap in send instruction. + * + * The Intel Broadwell PRM, vol 07, section "Instruction Set Reference", + * subsection "EUISA Instructions", Send Message (page 990) contains the + * same text. * * We are avoiding using grf127 as part of the destination of send * messages adding a node interference to the grf127_send_hack_node. @@ -634,8 +637,21 @@ brw_reg_alloc::build_interference_graph(bool allow_spilling) first_payload_node = node_count; node_count += payload_node_count; - grf127_send_hack_node = node_count; - node_count++; + /* Bspec says: + * + * [Pre-CNL] r127 must not be used for return address when there is a + * src and dest overlap in send instruction. + * + * The Intel Broadwell PRM, vol 07, section "Instruction Set Reference", + * subsection "EUISA Instructions", Send Message (page 990) contains the + * same text. + * + * The workaround will only be applied to Gfx9. + */ + if (devinfo->ver < 10) + grf127_send_hack_node = node_count++; + else + grf127_send_hack_node = -1; first_vgrf_node = node_count; node_count += fs->alloc.count;