From 061b0511d256645ae00a4c8e415c05476009c736 Mon Sep 17 00:00:00 2001 From: Lorenzo Rossi Date: Sun, 29 Jun 2025 17:14:51 +0200 Subject: [PATCH] nak/sm32: Remove unnecessary NOP filling Previous code took inspiration from the SM50 encoder where scheduling instructions are interleaved every 3 instructions and jumps between scheduling blocks are not permitted. In Kepler, scheduling instructions are interleaved once every 7 instructions; if we disallow jumps inside scheduling blocks, we need to fill the remaining instructions in the block with NOPs. This led to a 1-instruction basic block generating 6 unnecessary NOPs. In the new code, basic blocks are tightly packed, only inserting padding NOPs at the end of the function, reducing the emitted code in complex CFGs. Part-of: --- src/nouveau/compiler/nak/sm32.rs | 94 +++++++++++++++++--------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/src/nouveau/compiler/nak/sm32.rs b/src/nouveau/compiler/nak/sm32.rs index 6a8ab6dceb9..64dea5164df 100644 --- a/src/nouveau/compiler/nak/sm32.rs +++ b/src/nouveau/compiler/nak/sm32.rs @@ -3406,7 +3406,7 @@ fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op { } fn encode_instr( - instr: Option<&Box>, + instr: &Instr, sm: &ShaderModel32, labels: &FxHashMap, encoded: &mut Vec, @@ -3419,16 +3419,9 @@ fn encode_instr( sched: 0, }; - if let Some(instr) = instr { - as_sm32_op(&instr.op).encode(&mut e); - e.set_pred(&instr.pred); - e.set_instr_dependency(&instr.deps); - } else { - let nop = OpNop { label: None }; - nop.encode(&mut e); - e.set_pred(&true.into()); - e.set_instr_dependency(&InstrDeps::new()); - } + as_sm32_op(&instr.op).encode(&mut e); + e.set_pred(&instr.pred); + e.set_instr_dependency(&instr.deps); encoded.extend(&e.inst[..]); @@ -3436,48 +3429,61 @@ fn encode_instr( } fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec { + const INSTR_LEN_BYTES: usize = 8; assert!(s.functions.len() == 1); let func = &s.functions[0]; + // --- Compute label
addresses --- + // We need a schedule instruction every 7 instructions; these don't + // define jump boundaries so we can have multiple blocks in the same + // 7-instr group. let mut ip = 0_usize; let mut labels = FxHashMap::default(); for b in &func.blocks { - // We ensure blocks will have groups of 7 instructions with a - // schedule instruction before each groups. As we should never jump - // to a schedule instruction, we account for that here. - labels.insert(b.label, ip + 8); - - let block_num_instrs = b.instrs.len().next_multiple_of(7); - - // Every 7 instructions, we have a new schedule instruction so we - // need to account for that. - ip += (block_num_instrs + (block_num_instrs / 7)) * 8; + let num_sched = (ip / 7) + 1; + labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES); + ip += b.instrs.len(); } - let mut encoded = Vec::new(); - for b in &func.blocks { - for sched_chunk in b.instrs.chunks(7) { - let sched_i = encoded.len(); - let mut sched_instr = [0u32; 2]; - encoded.extend(&sched_instr[..]); // Push now, will edit later - let mut bv = BitMutView::new(&mut sched_instr); - bv.set_field(0..2, 0b00); - bv.set_field(58..64, 0b000010); // 0x80 - let mut bv = bv.subset_mut(2..58); - - for (i, instr) in sched_chunk.iter().enumerate() { - let sched = - encode_instr(Some(instr), sm, &labels, &mut encoded); - - bv.set_field(i * 8..(i + 1) * 8, sched); - } - // Encode remaining ops in chunk as NOPs - for _ in sched_chunk.len()..7 { - encode_instr(None, sm, &labels, &mut encoded); - } - encoded[sched_i] = sched_instr[0]; - encoded[sched_i + 1] = sched_instr[1]; + // --- Real encoding --- + // Create an instruction iterator and iterate it in chunks of 7. + // Fill the last chunk with a nop (it should never be executed).
+ let mut instr_iter = func + .blocks + .iter() + .flat_map(|b| b.instrs.iter().map(|x| &**x)) + .peekable(); + let mut filling_instr = Instr { + pred: true.into(), + op: Op::Nop(OpNop { label: None }), + deps: InstrDeps::new(), + }; + filling_instr.deps.set_delay(1); + let mut sched_chunk_gen = || { + if instr_iter.peek().is_none() { + return None; } + Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr))) + }; + + let mut encoded = Vec::new(); + while let Some(sched_chunk) = sched_chunk_gen() { + let sched_i = encoded.len(); + + let mut sched_instr = [0u32; 2]; + encoded.extend(&sched_instr[..]); // Push now, will edit later + let mut bv = BitMutView::new(&mut sched_instr); + bv.set_field(0..2, 0b00); + bv.set_field(58..64, 0b000010); // 0x80 + let mut bv = bv.subset_mut(2..58); + + for (i, instr) in sched_chunk.iter().enumerate() { + let sched = encode_instr(instr, sm, &labels, &mut encoded); + + bv.set_field(i * 8..(i + 1) * 8, sched); + } + encoded[sched_i] = sched_instr[0]; + encoded[sched_i + 1] = sched_instr[1]; } encoded