intel/brw: Allow using performance analysis pass pre-register allocation.

Mainly this involves changing 'struct state' so that the dep_ready
array is allocated with a dynamic size based on the number of VGRFs of
the program instead of assuming a fixed XE3_MAX_GRF count of GRF
dependencies.  VGRF register dependencies are then handled by using
one dep_ready entry per VGRF allocation instead of one per hardware
register.

The ability to use the performance analysis pass pre-regalloc will
mostly be useful on xe3+, but this also has the side effect of saving
some memory on xe2 and earlier platforms since we no longer need to
allocate XE3_MAX_GRF dep_ready entries for them.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36618>
This commit is contained in:
Francisco Jerez
2025-07-16 14:56:39 -07:00
committed by Marge Bot
parent 3936a43496
commit dfc2a89d96
+35 -19
View File
@@ -65,10 +65,8 @@ namespace {
* potentially depend on.
*/
enum intel_eu_dependency_id {
/* Register part of the GRF. */
EU_DEPENDENCY_ID_GRF0 = 0,
/* Address register part of the ARF. */
EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE3_MAX_GRF,
EU_DEPENDENCY_ID_ADDR0 = 0,
/* Accumulator register part of the ARF. */
EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
/* Flag register part of the ARF. */
@@ -77,15 +75,33 @@ namespace {
EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 16,
/* SBID token read completion. Only used on Gfx12+. */
EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
/* Number of computation dependencies currently tracked. */
EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
/* Register part of the GRF. */
EU_DEPENDENCY_ID_GRF0 = EU_DEPENDENCY_ID_SBID_RD0 + 32,
EU_DEPENDENCY_ID_INVALID = ~0u
};
/**
 * Number of GRF dependency IDs that need to be tracked for shader \p s,
 * i.e. how many entries following EU_DEPENDENCY_ID_GRF0 are required.
 *
 * While s->grf_used is still zero (presumably because register allocation
 * has not run yet) one ID is used per virtual register allocation, as given
 * by s->alloc.count.  Otherwise the count is the maximum GRF file size
 * selected by the device version.
 */
unsigned
num_grf_dependency_ids(const brw_shader *s)
{
   /* One dependency ID per VGRF allocation. */
   if (!s->grf_used)
      return s->alloc.count;

   /* One dependency ID per hardware register, newest platforms first. */
   const auto ver = s->devinfo->ver;

   if (ver >= 30)
      return XE3_MAX_GRF;

   if (ver >= 20)
      return XE2_MAX_GRF;

   return BRW_MAX_GRF;
}
/**
* State of our modeling of the program execution.
*/
struct state {
state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
state(const brw_shader *s) :
unit_ready(), dep_ready(), num_dependency_ids(EU_DEPENDENCY_ID_GRF0 + num_grf_dependency_ids(s)), unit_busy(), weight(1.0)
{
dep_ready = new unsigned[num_dependency_ids]();
}
~state() {
delete[] dep_ready;
}
/**
* Time at which a given unit will be ready to execute the next
* computation, in clock units.
@@ -95,7 +111,9 @@ namespace {
* Time at which an instruction dependent on a given dependency ID will
* be ready to execute, in clock units.
*/
unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
unsigned *dep_ready;
unsigned num_dependency_ids;
/**
* Aggregated utilization of a given unit excluding idle cycles,
* in clock units.
@@ -736,7 +754,7 @@ namespace {
void
stall_on_dependency(state &st, enum intel_eu_dependency_id id)
{
if (id < ARRAY_SIZE(st.dep_ready))
if (id < st.num_dependency_ids)
st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
st.dep_ready[id]);
}
@@ -775,7 +793,7 @@ namespace {
mark_read_dependency(state &st, const perf_desc &perf,
enum intel_eu_dependency_id id)
{
if (id < ARRAY_SIZE(st.dep_ready))
if (id < st.num_dependency_ids)
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
}
@@ -791,7 +809,7 @@ namespace {
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
else if (id < ARRAY_SIZE(st.dep_ready))
else if (id < st.num_dependency_ids)
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
}
@@ -803,13 +821,12 @@ namespace {
const int delta)
{
if (r.file == VGRF) {
const unsigned i = r.nr + r.offset / REG_SIZE + delta;
assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
const unsigned i = r.nr;
return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
} else if (r.file == FIXED_GRF) {
const unsigned i = r.nr + delta;
assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
assert(i < XE3_MAX_GRF);
return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
} else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
@@ -824,7 +841,7 @@ namespace {
return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
} else {
return EU_NUM_DEPENDENCY_IDS;
return EU_DEPENDENCY_ID_INVALID;
}
}
@@ -846,11 +863,10 @@ namespace {
tgl_swsb_rd_dependency_id(tgl_swsb swsb)
{
if (swsb.mode) {
assert(swsb.sbid <
EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
assert(swsb.sbid < EU_DEPENDENCY_ID_GRF0 - EU_DEPENDENCY_ID_SBID_RD0);
return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
} else {
return EU_NUM_DEPENDENCY_IDS;
return EU_DEPENDENCY_ID_INVALID;
}
}
@@ -866,7 +882,7 @@ namespace {
EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
} else {
return EU_NUM_DEPENDENCY_IDS;
return EU_DEPENDENCY_ID_INVALID;
}
}
@@ -1075,7 +1091,7 @@ namespace {
const float loop_weight = 10;
unsigned halt_count = 0;
unsigned elapsed = 0;
state st;
state st { s };
foreach_block(block, s->cfg) {
const unsigned elapsed0 = elapsed;