From 061b0511d256645ae00a4c8e415c05476009c736 Mon Sep 17 00:00:00 2001
From: Lorenzo Rossi <snowycoder@gmail.com>
Date: Sun, 29 Jun 2025 17:14:51 +0200
Subject: [PATCH] nak/sm32: Remove unnecessary NOP filling

Previous code took inspiration from the SM50 encoder where scheduling
instructions are interleaved every 3 instructions and jumps between
scheduling blocks are not permitted.
In Kepler, scheduling instructions are interleaved once every 7
instructions, so if we disallow jumps inside scheduling blocks we need
to fill the remaining instructions in the block with NOPs.
This led to a 1-instruction basic block generating 6 unnecessary NOPs.

In the new code, basic blocks are tightly packed and padding NOPs are
only inserted at the end of the function, reducing the size of the
emitted code for complex CFGs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35821>
---
 src/nouveau/compiler/nak/sm32.rs | 94 +++++++++++++++++---------------
 1 file changed, 50 insertions(+), 44 deletions(-)
diff --git a/src/nouveau/compiler/nak/sm32.rs b/src/nouveau/compiler/nak/sm32.rs
index 6a8ab6dceb9..64dea5164df 100644
--- a/src/nouveau/compiler/nak/sm32.rs
+++ b/src/nouveau/compiler/nak/sm32.rs
@@ -3406,7 +3406,7 @@ fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op {
 }
 
 fn encode_instr(
-    instr: Option<&Box<Instr>>,
+    instr: &Instr,
     sm: &ShaderModel32,
     labels: &FxHashMap<Label, usize>,
     encoded: &mut Vec<u32>,
@@ -3419,16 +3419,9 @@ fn encode_instr(
         sched: 0,
     };
 
-    if let Some(instr) = instr {
-        as_sm32_op(&instr.op).encode(&mut e);
-        e.set_pred(&instr.pred);
-        e.set_instr_dependency(&instr.deps);
-    } else {
-        let nop = OpNop { label: None };
-        nop.encode(&mut e);
-        e.set_pred(&true.into());
-        e.set_instr_dependency(&InstrDeps::new());
-    }
+    as_sm32_op(&instr.op).encode(&mut e);
+    e.set_pred(&instr.pred);
+    e.set_instr_dependency(&instr.deps);
 
     encoded.extend(&e.inst[..]);
 
@@ -3436,48 +3429,61 @@ fn encode_instr(
 }
 
 fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec<u32> {
+    const INSTR_LEN_BYTES: usize = 8;
     assert!(s.functions.len() == 1);
     let func = &s.functions[0];
 
+    // --- Compute label addresses ---
+    // We need a schedule instruction every 7 instructions, these don't
+    // define jump boundaries so we can have multiple blocks in the same
+    // 7-instr group.
     let mut ip = 0_usize;
     let mut labels = FxHashMap::default();
     for b in &func.blocks {
-        // We ensure blocks will have groups of 7 instructions with a
-        // schedule instruction before each groups.  As we should never jump
-        // to a schedule instruction, we account for that here.
-        labels.insert(b.label, ip + 8);
-
-        let block_num_instrs = b.instrs.len().next_multiple_of(7);
-
-        // Every 7 instructions, we have a new schedule instruction so we
-        // need to account for that.
-        ip += (block_num_instrs + (block_num_instrs / 7)) * 8;
+        let num_sched = (ip / 7) + 1;
+        labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
+        ip += b.instrs.len();
     }
 
-    let mut encoded = Vec::new();
-    for b in &func.blocks {
-        for sched_chunk in b.instrs.chunks(7) {
-            let sched_i = encoded.len();
-            let mut sched_instr = [0u32; 2];
-            encoded.extend(&sched_instr[..]); // Push now, will edit later
-            let mut bv = BitMutView::new(&mut sched_instr);
-            bv.set_field(0..2, 0b00);
-            bv.set_field(58..64, 0b000010); // 0x80
-            let mut bv = bv.subset_mut(2..58);
-
-            for (i, instr) in sched_chunk.iter().enumerate() {
-                let sched =
-                    encode_instr(Some(instr), sm, &labels, &mut encoded);
-
-                bv.set_field(i * 8..(i + 1) * 8, sched);
-            }
-            // Encode remaining ops in chunk as NOPs
-            for _ in sched_chunk.len()..7 {
-                encode_instr(None, sm, &labels, &mut encoded);
-            }
-            encoded[sched_i] = sched_instr[0];
-            encoded[sched_i + 1] = sched_instr[1];
+    // --- Real encoding ---
+    // Create an instruction iterator and iterate it in chunks of 7.
+    // Fill the last chunk with NOPs (they should never be executed).
+    let mut instr_iter = func
+        .blocks
+        .iter()
+        .flat_map(|b| b.instrs.iter().map(|x| &**x))
+        .peekable();
+    let mut filling_instr = Instr {
+        pred: true.into(),
+        op: Op::Nop(OpNop { label: None }),
+        deps: InstrDeps::new(),
+    };
+    filling_instr.deps.set_delay(1);
+    let mut sched_chunk_gen = || {
+        if instr_iter.peek().is_none() {
+            return None;
         }
+        Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
+    };
+
+    let mut encoded = Vec::new();
+    while let Some(sched_chunk) = sched_chunk_gen() {
+        let sched_i = encoded.len();
+
+        let mut sched_instr = [0u32; 2];
+        encoded.extend(&sched_instr[..]); // Push now, will edit later
+        let mut bv = BitMutView::new(&mut sched_instr);
+        bv.set_field(0..2, 0b00);
+        bv.set_field(58..64, 0b000010); // 0x80
+        let mut bv = bv.subset_mut(2..58);
+
+        for (i, instr) in sched_chunk.iter().enumerate() {
+            let sched = encode_instr(instr, sm, &labels, &mut encoded);
+
+            bv.set_field(i * 8..(i + 1) * 8, sched);
+        }
+        encoded[sched_i] = sched_instr[0];
+        encoded[sched_i + 1] = sched_instr[1];
     }
 
     encoded