nak/sm32: Remove unnecessary NOP filling

Previous code took inspiration from the SM50 encoder where scheduling
instructions are interleaved every 3 instructions and jumps between
scheduling blocks are not permitted.
in Kepler scheduling instructions are interleaved once every 7
instructions, if we disallow jumps inside scheduling blocks we need
to fill the remaining instructions in the block with NOPs.
This lead to 1-instruction basic block generating 6 unnecessary NOPs.

In the new code basic blocks are tightly packed, only inserting padding
NOPs at the end of the function, reducing the emitted code in complex
CFGs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35821>
This commit is contained in:
Lorenzo Rossi
2025-06-29 17:14:51 +02:00
committed by Marge Bot
parent 69919fd21b
commit 061b0511d2
+50 -44
View File
@@ -3406,7 +3406,7 @@ fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op {
}
fn encode_instr(
instr: Option<&Box<Instr>>,
instr: &Instr,
sm: &ShaderModel32,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
@@ -3419,16 +3419,9 @@ fn encode_instr(
sched: 0,
};
if let Some(instr) = instr {
as_sm32_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
e.set_instr_dependency(&instr.deps);
} else {
let nop = OpNop { label: None };
nop.encode(&mut e);
e.set_pred(&true.into());
e.set_instr_dependency(&InstrDeps::new());
}
as_sm32_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
e.set_instr_dependency(&instr.deps);
encoded.extend(&e.inst[..]);
@@ -3436,48 +3429,61 @@ fn encode_instr(
}
fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec<u32> {
const INSTR_LEN_BYTES: usize = 8;
assert!(s.functions.len() == 1);
let func = &s.functions[0];
// --- Compute label addresses ---
// We need a schedule instruction every 7 instructions, these don't
// define jump boundaries so we can have multible blocks in the same
// 7-instr group.
let mut ip = 0_usize;
let mut labels = FxHashMap::default();
for b in &func.blocks {
// We ensure blocks will have groups of 7 instructions with a
// schedule instruction before each groups. As we should never jump
// to a schedule instruction, we account for that here.
labels.insert(b.label, ip + 8);
let block_num_instrs = b.instrs.len().next_multiple_of(7);
// Every 7 instructions, we have a new schedule instruction so we
// need to account for that.
ip += (block_num_instrs + (block_num_instrs / 7)) * 8;
let num_sched = (ip / 7) + 1;
labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
ip += b.instrs.len();
}
let mut encoded = Vec::new();
for b in &func.blocks {
for sched_chunk in b.instrs.chunks(7) {
let sched_i = encoded.len();
let mut sched_instr = [0u32; 2];
encoded.extend(&sched_instr[..]); // Push now, will edit later
let mut bv = BitMutView::new(&mut sched_instr);
bv.set_field(0..2, 0b00);
bv.set_field(58..64, 0b000010); // 0x80
let mut bv = bv.subset_mut(2..58);
for (i, instr) in sched_chunk.iter().enumerate() {
let sched =
encode_instr(Some(instr), sm, &labels, &mut encoded);
bv.set_field(i * 8..(i + 1) * 8, sched);
}
// Encode remaining ops in chunk as NOPs
for _ in sched_chunk.len()..7 {
encode_instr(None, sm, &labels, &mut encoded);
}
encoded[sched_i] = sched_instr[0];
encoded[sched_i + 1] = sched_instr[1];
// --- Real encoding ---
// Create an instruction iterator and iterate it in chunks of 7.
// fill the last chunk with a nop (it should never be executed).
let mut instr_iter = func
.blocks
.iter()
.flat_map(|b| b.instrs.iter().map(|x| &**x))
.peekable();
let mut filling_instr = Instr {
pred: true.into(),
op: Op::Nop(OpNop { label: None }),
deps: InstrDeps::new(),
};
filling_instr.deps.set_delay(1);
let mut sched_chunk_gen = || {
if instr_iter.peek().is_none() {
return None;
}
Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
};
let mut encoded = Vec::new();
while let Some(sched_chunk) = sched_chunk_gen() {
let sched_i = encoded.len();
let mut sched_instr = [0u32; 2];
encoded.extend(&sched_instr[..]); // Push now, will edit later
let mut bv = BitMutView::new(&mut sched_instr);
bv.set_field(0..2, 0b00);
bv.set_field(58..64, 0b000010); // 0x80
let mut bv = bv.subset_mut(2..58);
for (i, instr) in sched_chunk.iter().enumerate() {
let sched = encode_instr(instr, sm, &labels, &mut encoded);
bv.set_field(i * 8..(i + 1) * 8, sched);
}
encoded[sched_i] = sched_instr[0];
encoded[sched_i + 1] = sched_instr[1];
}
encoded