diff --git a/src/nouveau/compiler/nak/api.rs b/src/nouveau/compiler/nak/api.rs index cb5669e8d81..bcd0d6df55d 100644 --- a/src/nouveau/compiler/nak/api.rs +++ b/src/nouveau/compiler/nak/api.rs @@ -449,13 +449,7 @@ pub extern "C" fn nak_compile_shader( s.remove_annotations(); - let code = if nak.sm >= 70 { - s.encode_sm70() - } else if nak.sm >= 50 { - s.encode_sm50() - } else { - panic!("Unsupported shader model"); - }; + let code = sm.encode_shader(&s); if DEBUG.print() { let stage_name = unsafe { diff --git a/src/nouveau/compiler/nak/encode_sm50.rs b/src/nouveau/compiler/nak/encode_sm50.rs new file mode 100644 index 00000000000..cce335dcd35 --- /dev/null +++ b/src/nouveau/compiler/nak/encode_sm50.rs @@ -0,0 +1,2254 @@ +// Copyright © 2023 Collabora, Ltd. +// SPDX-License-Identifier: MIT + +use crate::ir::*; +use bitview::*; + +use std::collections::HashMap; +use std::ops::Range; + +impl Src { + fn is_reg_or_zero(&self) -> bool { + matches!(self.src_ref, SrcRef::Zero | SrcRef::Reg(_)) + } +} + +fn align_down(value: usize, align: usize) -> usize { + value / align * align +} + +fn align_up(value: usize, align: usize) -> usize { + align_down(value + (align - 1), align) +} + +struct SM50Instr { + inst: [u32; 2], + sched: u32, + sm: u8, +} + +impl BitViewable for SM50Instr { + fn bits(&self) -> usize { + BitView::new(&self.inst).bits() + } + + fn get_bit_range_u64(&self, range: Range) -> u64 { + BitView::new(&self.inst).get_bit_range_u64(range) + } +} + +impl BitMutViewable for SM50Instr { + fn set_bit_range_u64(&mut self, range: Range, val: u64) { + BitMutView::new(&mut self.inst).set_bit_range_u64(range, val); + } +} + +impl SetFieldU64 for SM50Instr { + fn set_field_u64(&mut self, range: Range, val: u64) { + BitMutView::new(&mut self.inst).set_field_u64(range, val); + } +} + +impl SM50Instr { + fn new(sm: u8) -> Self { + Self { + inst: [0x0; 2], + sched: 0x7e0, + sm, + } + } + + fn nop(sm: u8) -> Self { + let mut res = Self::new(sm); + + res.encode_nop(); + + 
res.set_instr_deps(&InstrDeps::new()); + + res + } + + fn set_bit(&mut self, bit: usize, val: bool) { + BitMutView::new(&mut self.inst).set_bit(bit, val); + } + + fn set_opcode(&mut self, opcode: u16) { + self.set_field(48..64, opcode); + } + + fn set_pred_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 3); + assert!(reg.file() == RegFile::Pred); + assert!(reg.base_idx() <= 7); + assert!(reg.comps() == 1); + self.set_field(range, reg.base_idx()); + } + + fn set_pred(&mut self, pred: &Pred) { + assert!(!pred.is_false()); + self.set_pred_reg( + 16..19, + match pred.pred_ref { + PredRef::None => RegRef::zero(RegFile::Pred, 1), + PredRef::Reg(reg) => reg, + PredRef::SSA(_) => panic!("SSA values must be lowered"), + }, + ); + self.set_bit(19, pred.pred_inv); + } + + fn set_instr_deps(&mut self, deps: &InstrDeps) { + let mut sched = BitMutView::new(&mut self.sched); + + sched.set_field(0..4, deps.delay); + sched.set_bit(4, deps.yld); + sched.set_field(5..8, deps.wr_bar().unwrap_or(7)); + sched.set_field(8..11, deps.rd_bar().unwrap_or(7)); + sched.set_field(11..17, deps.wt_bar_mask); + sched.set_field(17..21, deps.reuse_mask); + } + + fn set_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 8); + assert!(reg.file() == RegFile::GPR); + self.set_field(range, reg.base_idx()); + } + + fn set_reg_src_ref(&mut self, range: Range, src_ref: SrcRef) { + match src_ref { + SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)), + SrcRef::Reg(reg) => self.set_reg(range, reg), + _ => panic!("Not a register"), + } + } + + fn set_reg_src(&mut self, range: Range, src: Src) { + assert!(src.src_mod.is_none()); + self.set_reg_src_ref(range, src.src_ref); + } + + fn set_reg_fmod_src( + &mut self, + range: Range, + abs_bit: usize, + neg_bit: usize, + src: Src, + ) { + self.set_reg_src_ref(range, src.src_ref); + self.set_bit(abs_bit, src.src_mod.has_fabs()); + self.set_bit(neg_bit, src.src_mod.has_fneg()); + } + + fn set_reg_ineg_src( + &mut 
self, + range: Range, + neg_bit: usize, + src: Src, + ) { + self.set_reg_src_ref(range, src.src_ref); + self.set_bit(neg_bit, src.src_mod.is_ineg()); + } + + fn set_pred_dst(&mut self, range: Range, dst: Dst) { + match dst { + Dst::None => { + self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1)); + } + Dst::Reg(reg) => self.set_pred_reg(range, reg), + _ => panic!("Not a register"), + } + } + + fn set_pred_src(&mut self, range: Range, not_bit: usize, src: Src) { + // The default for predicates is true + let true_reg = RegRef::new(RegFile::Pred, 7, 1); + + let (not, reg) = match src.src_ref { + SrcRef::True => (false, true_reg), + SrcRef::False => (true, true_reg), + SrcRef::Reg(reg) => (false, reg), + _ => panic!("Not a register"), + }; + self.set_pred_reg(range, reg); + self.set_bit(not_bit, not ^ src.src_mod.is_bnot()); + } + + fn set_dst(&mut self, dst: Dst) { + let reg = match dst { + Dst::None => RegRef::zero(RegFile::GPR, 1), + Dst::Reg(reg) => reg, + _ => panic!("invalid dst {dst}"), + }; + self.set_reg(0..8, reg); + } + + fn set_src_imm32(&mut self, range: Range, u: u32) { + assert!(range.len() == 32); + self.set_field(range, u); + } + + fn set_src_imm_i20( + &mut self, + range: Range, + sign_bit: usize, + i: u32, + ) { + assert!(range.len() == 19); + assert!((i & 0xfff80000) == 0 || (i & 0xfff80000) == 0xfff80000); + + self.set_field(range, i & 0x7ffff); + self.set_field(sign_bit..sign_bit + 1, (i & 0x80000) >> 19); + } + + fn set_src_imm_f20( + &mut self, + range: Range, + sign_bit: usize, + f: u32, + ) { + assert!(range.len() == 19); + assert!((f & 0x00000fff) == 0); + + self.set_field(range, (f >> 12) & 0x7ffff); + self.set_field(sign_bit..sign_bit + 1, f >> 31); + } + + fn set_src_cb(&mut self, range: Range, cb: &CBufRef) { + let mut v = BitMutView::new_subset(self, range); + + assert!(cb.offset % 4 == 0); + + v.set_field(0..14, cb.offset >> 2); + if let CBuf::Binding(idx) = cb.buf { + v.set_field(14..19, idx); + } else { + panic!("Must be a bound 
constant buffer"); + } + } + + fn set_cb_fmod_src( + &mut self, + range: Range, + abs_bit: usize, + neg_bit: usize, + src: Src, + ) { + if let SrcRef::CBuf(cb) = &src.src_ref { + self.set_src_cb(range, cb); + } else { + panic!("Not a CBuf source"); + } + + self.set_bit(abs_bit, src.src_mod.has_fabs()); + self.set_bit(neg_bit, src.src_mod.has_fneg()); + } + + fn set_cb_ineg_src( + &mut self, + range: Range, + neg_bit: usize, + src: Src, + ) { + if let SrcRef::CBuf(cb) = &src.src_ref { + self.set_src_cb(range, cb); + } else { + panic!("Not a CBuf source"); + } + + self.set_bit(neg_bit, src.src_mod.is_ineg()); + } + + fn encode_mov(&mut self, op: &OpMov) { + match &op.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c98); + self.set_reg_src(20..28, op.src); + self.set_field(39..43, op.quad_lanes); + } + SrcRef::Imm32(i) => { + self.set_opcode(0x0100); + self.set_src_imm32(20..52, *i); + self.set_field(12..16, op.quad_lanes); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x4c98); + self.set_src_cb(20..39, cb); + self.set_field(39..43, op.quad_lanes); + } + src => panic!("Unsupported src type for MOV: {src}"), + } + + self.set_dst(op.dst); + } + + fn encode_sel(&mut self, op: &OpSel) { + match &op.srcs[1].src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x38a0); + self.set_src_imm_i20(20..39, 56, *imm32); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5ca0); + self.set_reg_src_ref(20..28, op.srcs[1].src_ref); + } + SrcRef::CBuf(cbuf) => { + self.set_opcode(0x4ca0); + self.set_src_cb(20..39, cbuf); + } + src => panic!("Unsupported src type for SEL: {src}"), + } + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.srcs[0]); + self.set_pred_src(39..42, 42, op.cond); + } + + fn encode_shfl(&mut self, op: &OpShfl) { + self.set_opcode(0xef10); + + self.set_dst(op.dst); + self.set_pred_dst(48..51, op.in_bounds); + self.set_reg_src(8..16, op.src); + + match op.lane.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_bit(28, 
false); + self.set_reg_src(20..28, op.lane); + } + SrcRef::Imm32(imm) => { + self.set_bit(28, true); + self.set_field(20..25, imm & 0x1f); + } + lane => panic!("unsupported lane src type for SHFL: {lane}"), + } + match op.c.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_bit(29, false); + self.set_reg_src(39..47, op.c); + } + SrcRef::Imm32(imm) => { + self.set_bit(29, true); + self.set_field(34..47, imm & 0x1f1f); + } + c => panic!("unsupported c src type for SHFL: {c}"), + } + + self.set_field( + 30..32, + match op.op { + ShflOp::Idx => 0u8, + ShflOp::Up => 1u8, + ShflOp::Down => 2u8, + ShflOp::Bfly => 3u8, + }, + ); + } + + fn encode_vote(&mut self, op: &OpVote) { + self.set_opcode(0x50d8); + + self.set_dst(op.ballot); + self.set_pred_dst(45..48, op.vote); + self.set_pred_src(39..42, 42, op.pred); + + self.set_field( + 48..50, + match op.op { + VoteOp::All => 0u8, + VoteOp::Any => 1u8, + VoteOp::Eq => 2u8, + }, + ); + } + + fn encode_psetp(&mut self, op: &OpPSetP) { + self.set_opcode(0x5090); + + self.set_pred_dst(3..6, op.dsts[0]); + self.set_pred_dst(0..3, op.dsts[1]); + + self.set_pred_src(12..15, 15, op.srcs[0]); + self.set_pred_src(29..32, 32, op.srcs[1]); + self.set_pred_src(39..42, 42, op.srcs[2]); + + self.set_pred_set_op(24..26, op.ops[0]); + self.set_pred_set_op(45..47, op.ops[1]); + } + + fn set_mem_type(&mut self, range: Range, mem_type: MemType) { + assert!(range.len() == 3); + self.set_field( + range, + match mem_type { + MemType::U8 => 0_u8, + MemType::I8 => 1_u8, + MemType::U16 => 2_u8, + MemType::I16 => 3_u8, + MemType::B32 => 4_u8, + MemType::B64 => 5_u8, + MemType::B128 => 6_u8, + }, + ); + } + + fn set_mem_order(&mut self, _order: &MemOrder) { + // TODO: order and scope aren't present before SM70, what should we do? 
+ } + + fn set_mem_access(&mut self, access: &MemAccess) { + self.set_field( + 45..46, + match access.space.addr_type() { + MemAddrType::A32 => 0_u8, + MemAddrType::A64 => 1_u8, + }, + ); + self.set_mem_type(48..51, access.mem_type); + self.set_mem_order(&access.order); + } + + fn set_image_dim(&mut self, range: Range, dim: ImageDim) { + assert!(range.len() == 3); + self.set_field( + range, + match dim { + ImageDim::_1D => 0_u8, + ImageDim::_1DBuffer => 1_u8, + ImageDim::_1DArray => 2_u8, + ImageDim::_2D => 3_u8, + ImageDim::_2DArray => 4_u8, + ImageDim::_3D => 5_u8, + }, + ); + } + + fn set_rnd_mode(&mut self, range: Range, rnd_mode: FRndMode) { + assert!(range.len() == 2); + self.set_field( + range, + match rnd_mode { + FRndMode::NearestEven => 0_u8, + FRndMode::NegInf => 1_u8, + FRndMode::PosInf => 2_u8, + FRndMode::Zero => 3_u8, + }, + ); + } + + fn encode_ldg(&mut self, op: &OpLd) { + self.set_opcode(0xeed0); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.addr); + self.set_field(20..44, op.offset); + + self.set_mem_access(&op.access); + } + + fn encode_ldl(&mut self, op: &OpLd) { + self.set_opcode(0xef40); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.addr); + self.set_field(20..44, op.offset); + + self.set_mem_access(&op.access); + } + + fn encode_lds(&mut self, op: &OpLd) { + self.set_opcode(0xef48); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.addr); + self.set_field(20..44, op.offset); + + self.set_mem_access(&op.access); + } + + fn encode_ld(&mut self, op: &OpLd) { + match op.access.space { + MemSpace::Global(_) => self.encode_ldg(op), + MemSpace::Local => self.encode_ldl(op), + MemSpace::Shared => self.encode_lds(op), + } + } + + fn encode_ldc(&mut self, op: &OpLdc) { + assert!(op.cb.src_mod.is_none()); + let SrcRef::CBuf(cb) = &op.cb.src_ref else { + panic!("Not a CBuf source"); + }; + let CBuf::Binding(cb_idx) = cb.buf else { + panic!("Must be a bound constant buffer"); + }; + + self.set_opcode(0xef90); + + 
self.set_dst(op.dst); + self.set_reg_src(8..16, op.offset); + self.set_field(20..36, cb.offset); + self.set_field(36..41, cb_idx); + self.set_field( + 44..46, + match op.mode { + LdcMode::Indexed => 0_u8, + LdcMode::IndexedLinear => 1_u8, + LdcMode::IndexedSegmented => 2_u8, + LdcMode::IndexedSegmentedLinear => 3_u8, + }, + ); + self.set_mem_type(48..51, op.mem_type); + } + + fn encode_stg(&mut self, op: &OpSt) { + self.set_opcode(0xeed8); + + self.set_reg_src(0..8, op.data); + self.set_reg_src(8..16, op.addr); + self.set_field(20..44, op.offset); + self.set_mem_access(&op.access); + } + + fn encode_stl(&mut self, op: &OpSt) { + self.set_opcode(0xef50); + + self.set_reg_src(0..8, op.data); + self.set_reg_src(8..16, op.addr); + self.set_field(20..44, op.offset); + self.set_mem_access(&op.access); + } + + fn encode_sts(&mut self, op: &OpSt) { + self.set_opcode(0xef58); + + self.set_reg_src(0..8, op.data); + self.set_reg_src(8..16, op.addr); + self.set_field(20..44, op.offset); + self.set_mem_access(&op.access); + } + + fn encode_st(&mut self, op: &OpSt) { + match op.access.space { + MemSpace::Global(_) => self.encode_stg(op), + MemSpace::Local => self.encode_stl(op), + MemSpace::Shared => self.encode_sts(op), + } + } + + fn encode_lop2(&mut self, op: &OpLop2) { + if let Some(imm32) = op.srcs[1].as_imm_not_i20() { + self.set_opcode(0x0400); + + self.set_dst(op.dst); + self.set_reg_src_ref(8..16, op.srcs[0].src_ref); + self.set_bit(55, op.srcs[0].src_mod.is_bnot()); + self.set_src_imm32(20..52, imm32); + + self.set_field( + 53..55, + match op.op { + LogicOp2::And => 0_u8, + LogicOp2::Or => 1_u8, + LogicOp2::Xor => 2_u8, + LogicOp2::PassB => { + panic!("PASS_B is not supported for LOP32I"); + } + }, + ); + } else { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c40); + self.set_reg_src_ref(20..28, op.srcs[1].src_ref); + } + SrcRef::Imm32(i) => { + self.set_opcode(0x3840); + self.set_src_imm_i20(20..39, 56, *i); + } + 
SrcRef::CBuf(cb) => {
+                    self.set_opcode(0x4c40);
+                    self.set_src_cb(20..39, cb);
+                }
+                // Name the instruction actually being encoded (LOP2).
+                src1 => panic!("unsupported src1 type for LOP2: {src1}"),
+            }
+
+            self.set_dst(op.dst);
+            self.set_reg_src_ref(8..16, op.srcs[0].src_ref);
+
+            // Per-source bitwise-not modifiers.
+            self.set_bit(39, op.srcs[0].src_mod.is_bnot());
+            self.set_bit(40, op.srcs[1].src_mod.is_bnot());
+
+            self.set_field(
+                41..43,
+                match op.op {
+                    LogicOp2::And => 0_u8,
+                    LogicOp2::Or => 1_u8,
+                    LogicOp2::Xor => 2_u8,
+                    LogicOp2::PassB => 3_u8,
+                },
+            );
+
+            self.set_pred_dst(48..51, Dst::None);
+        }
+    }
+
+    /// Encodes `OpShf`: opcode by shift-source kind (reg 0x5cf8, imm 0x38f8),
+    /// then the data type, the low/high sources and the wrap/dst_high flags.
+    fn encode_shf(&mut self, op: &OpShf) {
+        match &op.shift.src_ref {
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5cf8);
+                self.set_reg_src(20..28, op.shift);
+            }
+            SrcRef::Imm32(i) => {
+                self.set_opcode(0x38f8);
+                assert!(op.shift.src_mod.is_none());
+                self.set_src_imm_i20(20..39, 56, *i);
+            }
+            src1 => panic!("unsupported src1 type for SHF: {src1}"),
+        }
+
+        self.set_field(
+            37..39,
+            match op.data_type {
+                IntType::I32 => 0_u8,
+                IntType::U32 => 0_u8,
+                IntType::U64 => 2_u8,
+                IntType::I64 => 3_u8,
+                _ => panic!("Invalid shift data type"),
+            },
+        );
+
+        self.set_dst(op.dst);
+        self.set_reg_src(8..16, op.low);
+        self.set_reg_src(39..47, op.high);
+
+        self.set_bit(47, false); // .CC
+        self.set_bit(48, op.dst_high);
+        self.set_bit(49, false); // .X
+        self.set_bit(50, op.wrap);
+    }
+
+    /// Encodes `OpShl`: opcode by shift-source kind (reg 0x5c48, imm 0x3848,
+    /// cbuf 0x4c48) plus the wrap flag in bit 39.
+    fn encode_shl(&mut self, op: &OpShl) {
+        self.set_dst(op.dst);
+        self.set_reg_src(8..16, op.src);
+        match op.shift.src_ref {
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5c48);
+                self.set_reg_src(20..28, op.shift);
+            }
+            SrcRef::Imm32(i) => {
+                self.set_opcode(0x3848);
+                self.set_src_imm_i20(20..39, 56, i);
+            }
+            SrcRef::CBuf(cb) => {
+                self.set_opcode(0x4c48);
+                self.set_src_cb(20..39, &cb);
+            }
+            src1 => panic!("unsupported src1 type for SHL: {src1}"),
+        }
+
+        self.set_bit(39, op.wrap);
+    }
+
+    fn encode_shr(&mut self, op: &OpShr) {
+        self.set_dst(op.dst);
+        self.set_reg_src(8..16, op.src);
+        match op.shift.src_ref {
+            SrcRef::Zero
| SrcRef::Reg(_) => {
+                self.set_opcode(0x5c28);
+                self.set_reg_src(20..28, op.shift);
+            }
+            SrcRef::Imm32(i) => {
+                self.set_opcode(0x3828);
+                self.set_src_imm_i20(20..39, 56, i);
+            }
+            SrcRef::CBuf(cb) => {
+                self.set_opcode(0x4c28);
+                self.set_src_cb(20..39, &cb);
+            }
+            // Name the instruction actually being encoded (SHR, not SHL).
+            src1 => panic!("unsupported src1 type for SHR: {src1}"),
+        }
+
+        self.set_bit(39, op.wrap);
+        self.set_bit(48, op.signed);
+    }
+
+    /// Encodes `OpI2F`: opcode by source kind (imm 0x38b8, reg 0x5cb8,
+    /// cbuf 0x4cb8), then signedness, log2 byte sizes of the source and
+    /// destination types, and the rounding mode.
+    fn encode_i2f(&mut self, op: &OpI2F) {
+        let abs_bit = 49;
+        let neg_bit = 45;
+
+        match &op.src.src_ref {
+            SrcRef::Imm32(imm) => {
+                self.set_opcode(0x38b8);
+                self.set_src_imm_i20(20..39, 56, *imm);
+            }
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5cb8);
+                self.set_reg_fmod_src(20..28, abs_bit, neg_bit, op.src);
+            }
+            SrcRef::CBuf(_) => {
+                self.set_opcode(0x4cb8);
+                self.set_cb_fmod_src(20..39, abs_bit, neg_bit, op.src);
+            }
+            src => panic!("Unsupported src type for I2F: {src}"),
+        }
+
+        self.set_field(41..43, 0_u8); // TODO: subop
+        self.set_bit(13, op.src_type.is_signed());
+        self.set_field(8..10, (op.dst_type.bits() / 8).ilog2());
+        self.set_rnd_mode(39..41, op.rnd_mode);
+        self.set_field(10..12, (op.src_type.bits() / 8).ilog2());
+
+        self.set_dst(op.dst);
+    }
+
+    /// Encodes `OpF2F`: opcode by source kind, then type sizes, rounding
+    /// mode, integer-rounding and flush-to-zero flags.
+    fn encode_f2f(&mut self, op: &OpF2F) {
+        // NOTE(review): this assert only admits Zero/Reg sources, so the
+        // Imm32 and CBuf arms below are currently unreachable.
+        assert!(op.src.is_reg_or_zero());
+
+        let abs_bit = 49;
+        let neg_bit = 45;
+
+        match &op.src.src_ref {
+            SrcRef::Imm32(imm) => {
+                self.set_opcode(0x38a8);
+                self.set_src_imm_i20(20..39, 56, *imm);
+            }
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5ca8);
+                self.set_reg_fmod_src(20..28, abs_bit, neg_bit, op.src);
+            }
+            SrcRef::CBuf(_) => {
+                self.set_opcode(0x4ca8);
+                self.set_cb_fmod_src(20..39, abs_bit, neg_bit, op.src);
+            }
+            src => panic!("Unsupported src type for F2F: {src}"),
+        }
+
+        // no saturation in the IR, would be bit 50
+        self.set_field(8..10, (op.dst_type.bits() / 8).ilog2());
+        self.set_field(10..12, (op.src_type.bits() / 8).ilog2());
+        self.set_rnd_mode(39..41, op.rnd_mode);
+        self.set_bit(42, op.integer_rnd);
+
self.set_bit(44, op.ftz);
+
+        self.set_dst(op.dst);
+    }
+
+    /// Encodes `OpI2I`: opcode by source kind (imm 0x38e0, reg 0x5ce0,
+    /// cbuf 0x4ce0), then neg/abs/saturate flags, signedness and the log2
+    /// byte sizes of the source and destination types.
+    fn encode_i2i(&mut self, op: &OpI2I) {
+        match &op.src.src_ref {
+            SrcRef::Imm32(imm32) => {
+                self.set_opcode(0x38e0);
+                self.set_src_imm_i20(20..39, 56, *imm32);
+            }
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5ce0);
+                self.set_reg_src(20..28, op.src);
+            }
+            SrcRef::CBuf(cbuf) => {
+                self.set_opcode(0x4ce0);
+                self.set_src_cb(20..39, cbuf);
+            }
+            src => panic!("Unsupported src type for I2I: {src}"),
+        }
+
+        self.set_bit(45, op.neg);
+        self.set_bit(49, op.abs);
+        self.set_bit(50, op.saturate);
+        self.set_bit(12, op.dst_type.is_signed());
+        self.set_bit(13, op.src_type.is_signed());
+        self.set_field(8..10, (op.dst_type.bits() / 8).ilog2());
+        self.set_field(10..12, (op.src_type.bits() / 8).ilog2());
+        self.set_field(41..43, 0u8); // src.B1-3
+        self.set_bit(47, false); // dst.CC
+
+        self.set_dst(op.dst);
+    }
+
+    /// Encodes `OpIMad`.
+    ///
+    /// The form is chosen from the kinds of `srcs[2]` and `srcs[1]`:
+    /// * `srcs[2]` register/zero: `srcs[1]` may be a register (0x5a00), a
+    ///   20-bit immediate (0x3400) or a constant buffer (0x4a00); `srcs[2]`
+    ///   goes in bits 39..47.
+    /// * `srcs[2]` constant buffer (0x5200): `srcs[1]` must be a register and
+    ///   goes in bits 39..47.
+    ///
+    /// Bit 51 is one negate bit shared by `srcs[0]` and `srcs[1]`; bit 52
+    /// negates `srcs[2]`.
+    ///
+    /// # Panics
+    /// Panics on an immediate `srcs[2]` or any other unsupported source kind.
+    fn encode_imad(&mut self, op: &OpIMad) {
+        let neg_1_bit = 51;
+        let neg_2_bit = 52;
+
+        match &op.srcs[2].src_ref {
+            SrcRef::Imm32(imm) => {
+                panic!("Invalid immediate src2 for IMAD {}", *imm)
+            }
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                match &op.srcs[1].src_ref {
+                    SrcRef::Imm32(imm) => {
+                        self.set_opcode(0x3400);
+                        self.set_src_imm_i20(20..39, 56, *imm);
+                    }
+                    SrcRef::Zero | SrcRef::Reg(_) => {
+                        self.set_opcode(0x5a00);
+                        self.set_reg_ineg_src(20..28, neg_1_bit, op.srcs[1]);
+                    }
+                    SrcRef::CBuf(_) => {
+                        self.set_opcode(0x4a00);
+                        self.set_cb_ineg_src(20..39, neg_1_bit, op.srcs[1]);
+                    }
+
+                    src => panic!("Invalid src1 for IMAD {src}"),
+                }
+                // The register addend was previously never written, so the
+                // field stayed 0 (R0) and its negate bit was dropped.
+                self.set_reg_ineg_src(39..47, neg_2_bit, op.srcs[2]);
+            }
+            SrcRef::CBuf(_) => {
+                self.set_opcode(0x5200);
+                self.set_reg_ineg_src(39..47, neg_1_bit, op.srcs[1]);
+                self.set_cb_ineg_src(20..39, neg_2_bit, op.srcs[2]);
+            }
+            src => panic!("Unsupported src2 type for IMAD: {src}"),
+        }
+
+        self.set_bit(48, op.signed); // src0 signed
+        // Final value of the shared negate bit; this overrides whatever the
+        // per-source helpers above wrote to bit 51.
+        self.set_bit(
+            51,
+            op.srcs[0].src_mod.is_ineg() ^ op.srcs[1].src_mod.is_ineg(),
+        );
+        self.set_bit(53, op.signed); // src1 signed
+
+        // set_reg_src() asserts that the source has no modifier, which would
+        // reject a negated srcs[0] even though bit 51 already accounts for it.
+        self.set_reg_src_ref(8..16, op.srcs[0].src_ref);
+
self.set_dst(op.dst); + } + + fn encode_imul(&mut self, op: &OpIMul) { + assert!(op.srcs[0].src_mod.is_none()); + assert!(op.srcs[1].src_mod.is_none()); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.srcs[0]); + + if let Some(i) = op.srcs[1].as_imm_not_i20() { + self.set_opcode(0x1fc0); + self.set_src_imm32(20..52, i); + + self.set_bit(53, op.high); + self.set_bit(54, op.signed[0]); + self.set_bit(55, op.signed[1]); + } else { + match op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c38); + self.set_reg_src(20..28, op.srcs[1]); + } + SrcRef::Imm32(i) => { + self.set_opcode(0x3838); + self.set_src_imm_i20(20..39, 56, i); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x4c38); + self.set_src_cb(20..39, &cb); + } + src1 => panic!("unsupported src1 type for IMUL: {src1}"), + }; + + self.set_bit(39, op.high); + self.set_bit(40, op.signed[0]); + self.set_bit(41, op.signed[1]); + } + } + + fn encode_f2i(&mut self, op: &OpF2I) { + match &op.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5cb0); + self.set_reg_fmod_src(20..28, 49, 45, op.src); + } + SrcRef::Imm32(i) => { + self.set_opcode(0x38b0); + self.set_src_imm_f20(20..39, 56, *i); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4cb0); + self.set_cb_fmod_src(20..39, 49, 45, op.src); + } + src => panic!("Unsupported src type for F2I: {src}"), + } + + self.set_dst(op.dst); + + self.set_field(8..10, (op.dst_type.bits() / 8).ilog2()); + self.set_field(10..12, (op.src_type.bits() / 8).ilog2()); + self.set_bit(12, op.dst_type.is_signed()); + self.set_rnd_mode(39..41, op.rnd_mode); + self.set_bit(44, op.ftz); + self.set_bit(47, false); // .CC + } + + fn set_pred_set_op(&mut self, range: Range, op: PredSetOp) { + assert!(range.len() == 2); + self.set_field( + range, + match op { + PredSetOp::And => 0_u8, + PredSetOp::Or => 1_u8, + PredSetOp::Xor => 2_u8, + }, + ); + } + + fn encode_imnmx(&mut self, op: &OpIMnMx) { + match &op.srcs[1].src_ref { + SrcRef::Zero | 
SrcRef::Reg(_) => {
+                self.set_opcode(0x5c20);
+                self.set_reg_src(20..28, op.srcs[1]);
+            }
+            SrcRef::Imm32(i) => {
+                self.set_opcode(0x3820);
+                // IMNMX takes an integer immediate. The float helper
+                // (set_src_imm_f20) asserts that the low 12 bits are zero and
+                // stores the top 20 bits, which is wrong for integers.
+                self.set_src_imm_i20(20..39, 56, *i);
+            }
+            SrcRef::CBuf(cb) => {
+                self.set_opcode(0x4c20);
+                self.set_src_cb(20..39, cb);
+            }
+            src1 => panic!("unsupported src1 type for IMNMX: {src1}"),
+        }
+
+        self.set_dst(op.dst);
+        self.set_reg_src(8..16, op.srcs[0]);
+        self.set_pred_src(39..42, 42, op.min);
+        self.set_bit(47, false); // .CC
+        self.set_bit(
+            48,
+            match op.cmp_type {
+                IntCmpType::U32 => false,
+                IntCmpType::I32 => true,
+            },
+        );
+    }
+
+    /// Writes the 3-bit integer comparison code for `op` into `range`.
+    fn set_int_cmp_op(&mut self, range: Range, op: IntCmpOp) {
+        assert!(range.len() == 3);
+        self.set_field(
+            range,
+            match op {
+                IntCmpOp::Eq => 2_u8,
+                IntCmpOp::Ne => 5_u8,
+                IntCmpOp::Lt => 1_u8,
+                IntCmpOp::Le => 3_u8,
+                IntCmpOp::Gt => 4_u8,
+                IntCmpOp::Ge => 6_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpISetP`: opcode by `srcs[1]` kind (reg 0x5b60, imm 0x3660,
+    /// cbuf 0x4b60), then the predicate destination, accumulator predicate,
+    /// set-op, signedness and comparison op. Neither source may carry a
+    /// modifier.
+    fn encode_isetp(&mut self, op: &OpISetP) {
+        assert!(op.srcs[0].src_mod.is_none());
+        assert!(op.srcs[1].src_mod.is_none());
+
+        match &op.srcs[1].src_ref {
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5b60);
+                self.set_reg_src(20..28, op.srcs[1]);
+            }
+            SrcRef::Imm32(i) => {
+                self.set_opcode(0x3660);
+                self.set_src_imm_i20(20..39, 56, *i);
+            }
+            SrcRef::CBuf(cb) => {
+                self.set_opcode(0x4b60);
+                self.set_src_cb(20..39, cb);
+            }
+            _ => panic!("Unsupported src type"),
+        }
+
+        self.set_pred_dst(0..3, Dst::None); // dst1
+        self.set_pred_dst(3..6, op.dst);
+        self.set_reg_src(8..16, op.srcs[0]);
+        self.set_pred_src(39..42, 42, op.accum);
+
+        self.set_bit(43, false); // .X
+        self.set_pred_set_op(45..47, op.set_op);
+
+        self.set_field(
+            48..49,
+            match op.cmp_type {
+                IntCmpType::U32 => 0_u32,
+                IntCmpType::I32 => 1_u32,
+            },
+        );
+        self.set_int_cmp_op(49..52, op.cmp_op);
+    }
+
+    fn encode_sust(&mut self, op: &OpSuSt) {
+        self.set_opcode(0xeb20);
+
+        self.set_reg_src(8..16, op.coord);
+        self.set_reg_src(0..8, op.data);
+        self.set_reg_src(39..47, op.handle);
+
+        self.set_image_dim(33..36,
op.image_dim); + self.set_mem_order(&op.mem_order); + + assert!(op.mask == 0x1 || op.mask == 0x3 || op.mask == 0xf); + self.set_field(20..24, op.mask); + } + + fn set_atom_op(&mut self, range: Range, atom_op: AtomOp) { + assert!(range.len() == 4); + self.set_field( + range, + match atom_op { + AtomOp::Add => 0_u8, + AtomOp::Min => 1_u8, + AtomOp::Max => 2_u8, + AtomOp::Inc => 3_u8, + AtomOp::Dec => 4_u8, + AtomOp::And => 5_u8, + AtomOp::Or => 6_u8, + AtomOp::Xor => 7_u8, + AtomOp::Exch => 8_u8, + AtomOp::CmpExch => panic!("CmpXchg not yet supported"), + }, + ); + } + + fn encode_atomg(&mut self, op: &OpAtom) { + self.set_opcode(0xed00); + self.set_mem_order(&op.mem_order); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.addr); + self.set_reg_src(20..28, op.data); + self.set_field(28..48, op.addr_offset); + self.set_field( + 48..49, + match op.mem_space.addr_type() { + MemAddrType::A32 => 0_u8, + MemAddrType::A64 => 1_u8, + }, + ); + self.set_field( + 49..52, + match op.atom_type { + AtomType::U32 => 0_u8, + AtomType::I32 => 1_u8, + AtomType::U64 => 2_u8, + AtomType::F32 => 3_u8, + // NOTE: U128 => 4_u8, + AtomType::I64 => 5_u8, + // TODO: do something about ATOMG.F64 + other => panic!("ATOMG.{other} not supported on SM50"), + }, + ); + self.set_atom_op(52..56, op.atom_op); + } + + fn encode_atoms(&mut self, op: &OpAtom) { + self.set_opcode(0xec00); + self.set_mem_order(&op.mem_order); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.addr); + self.set_reg_src(20..28, op.data); + self.set_field( + 28..30, + match op.atom_type { + AtomType::U32 => 0_u8, + AtomType::I32 => 1_u8, + AtomType::U64 => 2_u8, + AtomType::I64 => 3_u8, + // TODO: do something about ATOMS.F{32,64} + other => panic!("ATOMS.{other} not supported on SM50"), + }, + ); + assert_eq!(op.addr_offset % 4, 0); + self.set_field(30..52, op.addr_offset / 4); + self.set_atom_op(52..56, op.atom_op); + } + + fn encode_atom(&mut self, op: &OpAtom) { + match op.mem_space { + MemSpace::Global(_) => 
self.encode_atomg(op), + MemSpace::Local => panic!("Atomics do not support local"), + MemSpace::Shared => self.encode_atoms(op), + } + } + + fn set_tex_dim(&mut self, range: Range, dim: TexDim) { + assert!(range.len() == 3); + self.set_field( + range, + match dim { + TexDim::_1D => 0_u8, + TexDim::Array1D => 1_u8, + TexDim::_2D => 2_u8, + TexDim::Array2D => 3_u8, + TexDim::_3D => 4_u8, + TexDim::Cube => 6_u8, + TexDim::ArrayCube => 7_u8, + }, + ); + } + + fn set_tex_lod_mode(&mut self, range: Range, lod_mode: TexLodMode) { + assert!(range.len() == 2); + self.set_field( + range, + match lod_mode { + TexLodMode::Auto => 0_u8, + TexLodMode::Zero => 1_u8, + TexLodMode::Bias => 2_u8, + TexLodMode::Lod => 3_u8, + _ => panic!("Unknown LOD mode"), + }, + ); + } + + fn encode_tex(&mut self, op: &OpTex) { + self.set_opcode(0xdeb8); + + self.set_dst(op.dsts[0]); + assert!(op.dsts[1].is_none()); + assert!(op.fault.is_none()); + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(20..28, op.srcs[1]); + + self.set_tex_dim(28..31, op.dim); + self.set_field(31..35, op.mask); + self.set_bit(35, false); // ToDo: NDV + self.set_bit(36, op.offset); + self.set_tex_lod_mode(37..39, op.lod_mode); + self.set_bit(49, false); // TODO: .NODEP + self.set_bit(50, op.z_cmpr); + } + + fn encode_tld(&mut self, op: &OpTld) { + self.set_opcode(0xdd38); + + self.set_dst(op.dsts[0]); + assert!(op.dsts[1].is_none()); + assert!(op.fault.is_none()); + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(20..28, op.srcs[1]); + + self.set_tex_dim(28..31, op.dim); + self.set_field(31..35, op.mask); + self.set_bit(35, op.offset); + self.set_bit(49, false); // TODO: .NODEP + self.set_bit(50, op.is_ms); + + assert!( + op.lod_mode == TexLodMode::Zero || op.lod_mode == TexLodMode::Lod + ); + self.set_bit(55, op.lod_mode == TexLodMode::Lod); + } + + fn encode_tld4(&mut self, op: &OpTld4) { + self.set_opcode(0xdef8); + + self.set_dst(op.dsts[0]); + assert!(op.dsts[1].is_none()); + 
assert!(op.fault.is_none()); + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(20..28, op.srcs[1]); + + self.set_tex_dim(28..31, op.dim); + self.set_field(31..35, op.mask); + self.set_bit(35, false); // ToDo: NDV + self.set_field( + 36..38, + match op.offset_mode { + Tld4OffsetMode::None => 0_u8, + Tld4OffsetMode::AddOffI => 1_u8, + Tld4OffsetMode::PerPx => 2_u8, + }, + ); + self.set_field(38..40, op.comp); + self.set_bit(49, false); // TODO: .NODEP + self.set_bit(50, op.z_cmpr); + } + + fn encode_tmml(&mut self, op: &OpTmml) { + self.set_opcode(0xdf60); + + self.set_dst(op.dsts[0]); + assert!(op.dsts[1].is_none()); + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(20..28, op.srcs[1]); + + self.set_tex_dim(28..31, op.dim); + self.set_field(31..35, op.mask); + self.set_bit(35, false); // ToDo: NDV + self.set_bit(49, false); // TODO: .NODEP + } + + fn encode_txd(&mut self, op: &OpTxd) { + self.set_opcode(0xde78); + + self.set_dst(op.dsts[0]); + assert!(op.dsts[1].is_none()); + assert!(op.fault.is_none()); + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(20..28, op.srcs[1]); + + self.set_tex_dim(28..31, op.dim); + self.set_field(31..35, op.mask); + self.set_bit(35, op.offset); + self.set_bit(49, false); // TODO: .NODEP + } + + fn encode_txq(&mut self, op: &OpTxq) { + self.set_opcode(0xdf50); + + self.set_dst(op.dsts[0]); + assert!(op.dsts[1].is_none()); + self.set_reg_src(8..16, op.src); + + self.set_field( + 22..28, + match op.query { + TexQuery::Dimension => 1_u8, + TexQuery::TextureType => 2_u8, + TexQuery::SamplerPos => 5_u8, + // TexQuery::Filter => 0x10_u8, + // TexQuery::Lod => 0x12_u8, + // TexQuery::Wrap => 0x14_u8, + // TexQuery::BorderColour => 0x16, + }, + ); + self.set_field(31..35, op.mask); + self.set_bit(49, false); // TODO: .NODEP + } + + fn encode_ipa(&mut self, op: &OpIpa) { + self.set_opcode(0xe000); + + self.set_dst(op.dst); + self.set_reg_src(8..16, 0.into()); // addr + self.set_reg_src(20..28, op.inv_w); + 
self.set_reg_src(39..47, op.offset); + + assert!(op.addr % 4 == 0); + self.set_field(28..38, op.addr); + self.set_bit(38, false); // .IDX + self.set_pred_dst(47..50, Dst::None); // TODO: What is this for? + self.set_bit(51, false); // .SAT + self.set_field( + 52..54, + match op.loc { + InterpLoc::Default => 0_u8, + InterpLoc::Centroid => 1_u8, + InterpLoc::Offset => 2_u8, + }, + ); + self.set_field( + 54..56, + match op.freq { + InterpFreq::Pass => 0_u8, + InterpFreq::PassMulW => 1_u8, + InterpFreq::Constant => 2_u8, + InterpFreq::State => 3_u8, + }, + ); + } + + fn encode_ald(&mut self, op: &OpALd) { + self.set_opcode(0xefd8); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.offset); + self.set_reg_src(39..47, op.vtx); + + assert!(!op.access.phys); + self.set_field(20..30, op.access.addr); + self.set_bit(31, op.access.patch); + self.set_bit(32, op.access.output); + self.set_field(47..49, op.access.comps - 1); + } + + fn encode_ast(&mut self, op: &OpASt) { + self.set_opcode(0xeff0); + + self.set_reg_src(0..8, op.data); + self.set_reg_src(8..16, op.offset); + self.set_reg_src(39..47, op.vtx); + + assert!(!op.access.phys); + assert!(op.access.output); + self.set_field(20..30, op.access.addr); + self.set_bit(31, op.access.patch); + self.set_bit(32, op.access.output); + self.set_field(47..49, op.access.comps - 1); + } + + fn encode_membar(&mut self, op: &OpMemBar) { + self.set_opcode(0xef98); + + self.set_field( + 8..10, + match op.scope { + MemScope::CTA => 0_u8, + MemScope::GPU => 1_u8, + MemScope::System => 2_u8, + }, + ); + } + + fn set_rel_offset( + &mut self, + range: Range, + label: &Label, + ip: usize, + labels: &HashMap, + ) { + let ip = u32::try_from(ip).unwrap(); + let ip = i32::try_from(ip).unwrap(); + + let target_ip = *labels.get(label).unwrap(); + let target_ip = u32::try_from(target_ip).unwrap(); + let target_ip = i32::try_from(target_ip).unwrap(); + + let rel_offset = target_ip - ip - 8; + + self.set_field(range, rel_offset); + } + + fn 
encode_bra(
+        &mut self,
+        op: &OpBra,
+        ip: usize,
+        labels: &HashMap,
+    ) {
+        self.set_opcode(0xe240);
+        self.set_rel_offset(20..44, &op.target, ip, labels);
+        self.set_field(0..5, 0xF_u8); // TODO: Pred?
+    }
+
+    fn encode_exit(&mut self, _op: &OpExit) {
+        self.set_opcode(0xe300);
+
+        // TODO: pred
+        self.set_pred(&Pred {
+            pred_ref: PredRef::None,
+            pred_inv: false,
+        });
+
+        // TODO: CC flags
+        self.set_field(0..4, 0xf_u8); // CC.T
+    }
+
+    fn encode_bar(&mut self, _op: &OpBar) {
+        self.set_opcode(0xf0a8);
+
+        self.set_reg_src(8..16, SrcRef::Zero.into());
+
+        // 00: RED.POPC
+        // 01: RED.AND
+        // 02: RED.OR
+        self.set_field(35..37, 0_u8);
+
+        // 00: SYNC
+        // 01: ARV
+        // 02: RED
+        // 03: SCAN
+        self.set_field(32..35, 0_u8);
+
+        self.set_pred_src(39..42, 42, SrcRef::True.into());
+    }
+
+    fn encode_cs2r(&mut self, op: &OpCS2R) {
+        self.set_opcode(0x50c8);
+        self.set_dst(op.dst);
+        self.set_field(20..28, op.idx);
+    }
+
+    fn encode_kill(&mut self, _op: &OpKill) {
+        self.set_opcode(0xe330);
+        self.set_field(0..5, 0x0f_u8);
+    }
+
+    fn encode_nop(&mut self) {
+        self.set_opcode(0x50b0);
+
+        // TODO: pred
+        self.set_pred(&Pred {
+            pred_ref: PredRef::None,
+            pred_inv: false,
+        });
+
+        // TODO: CC flags
+        self.set_field(8..12, 0xf_u8); // CC.T
+    }
+
+    fn encode_s2r(&mut self, op: &OpS2R) {
+        self.set_opcode(0xf0c8);
+        self.set_dst(op.dst);
+        self.set_field(20..28, op.idx);
+    }
+
+    /// Encodes `OpPopC`.
+    ///
+    /// The source must be a register or zero (asserted). Bit 40 carries the
+    /// bitwise-not modifier of the source.
+    ///
+    /// # Panics
+    /// Panics if the source is not a register or zero.
+    fn encode_popc(&mut self, op: &OpPopC) {
+        // NOTE(review): with this assert the Imm32 and CBuf arms below are
+        // unreachable; they are kept for when the restriction is lifted.
+        assert!(op.src.is_reg_or_zero());
+
+        match &op.src.src_ref {
+            SrcRef::Imm32(imm) => {
+                self.set_opcode(0x3808);
+                self.set_src_imm_i20(20..39, 56, *imm);
+            }
+            // Zero passes the assert above, so it must be encodable here
+            // rather than falling through to the panic arm.
+            SrcRef::Zero | SrcRef::Reg(_) => {
+                self.set_opcode(0x5c08);
+                // Write only the register index: set_reg_src() asserts that
+                // the source has no modifier, which would reject the BNot
+                // that bit 40 below exists to encode.
+                self.set_reg_src_ref(20..28, op.src.src_ref);
+            }
+            SrcRef::CBuf(cbuf) => {
+                self.set_opcode(0x4c08);
+                self.set_src_cb(20..39, cbuf);
+            }
+            src => panic!("Invalid source for POPC: {src}"),
+        }
+
+        let not_mod = matches!(op.src.src_mod, SrcMod::BNot);
+        self.set_bit(40, not_mod);
+        self.set_dst(op.dst);
+    }
+
+    fn encode_fadd(&mut self, op: &OpFAdd) {
+        if
let Some(imm32) = op.srcs[1].as_imm_not_f20() { + self.set_opcode(0x0800); + self.set_dst(op.dst); + self.set_reg_fmod_src(8..16, 54, 56, op.srcs[0]); + self.set_src_imm32(20..52, imm32); + self.set_bit(55, op.ftz); + } else { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c58); + self.set_reg_fmod_src(20..28, 49, 45, op.srcs[1]); + } + SrcRef::Imm32(imm) => { + self.set_opcode(0x3858); + self.set_src_imm_f20(20..39, 56, *imm); + assert!(op.srcs[1].src_mod.is_none()); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4c58); + self.set_cb_fmod_src(20..39, 49, 45, op.srcs[1]); + } + _ => panic!("Unsupported src type"), + } + + self.set_dst(op.dst); + self.set_reg_fmod_src(8..16, 46, 48, op.srcs[0]); + + self.set_rnd_mode(39..41, op.rnd_mode); + self.set_bit(44, op.ftz); + self.set_bit(50, op.saturate); + } + } + + fn encode_fmnmx(&mut self, op: &OpFMnMx) { + match &op.srcs[1].src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x3860); + self.set_src_imm_f20(20..39, 56, *imm32); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c60); + self.set_reg_fmod_src(20..28, 49, 45, op.srcs[1]); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4c60); + self.set_cb_fmod_src(20..39, 49, 45, op.srcs[1]); + } + src => panic!("Unsupported src type for FMNMX: {src}"), + } + + self.set_reg_fmod_src(8..16, 46, 48, op.srcs[0]); + self.set_dst(op.dst); + self.set_pred_src(39..42, 42, op.min); + self.set_bit(44, op.ftz); + } + + fn encode_fmul(&mut self, op: &OpFMul) { + if let Some(imm32) = op.srcs[1].as_imm_not_f20() { + self.set_opcode(0x1e00); + + self.set_bit(53, op.ftz); + self.set_bit(54, op.dnz); + self.set_bit(55, op.saturate); + + self.set_src_imm32(20..52, imm32); + self.set_bit( + 19, + op.srcs[0].src_mod.has_fneg() ^ op.srcs[1].src_mod.has_fneg(), + ); + } else { + match &op.srcs[1].src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x3868); + self.set_src_imm_f20(20..39, 56, *imm32); + } + SrcRef::Zero | 
SrcRef::Reg(_) => { + self.set_opcode(0x5c68); + self.set_reg_src(20..28, op.srcs[1]); + } + SrcRef::CBuf(cbuf) => { + self.set_opcode(0x4c68); + self.set_src_cb(20..39, cbuf); + } + src => panic!("Unsupported src type for FMUL: {src}"), + } + + self.set_rnd_mode(39..41, op.rnd_mode); + self.set_field(41..44, 0x0_u8); // TODO: PDIV + self.set_bit(44, op.ftz); + self.set_bit(45, op.dnz); + self.set_bit( + 48, + op.srcs[0].src_mod.has_fneg() ^ op.srcs[1].src_mod.has_fneg(), + ); + self.set_bit(50, op.saturate); + } + + self.set_reg_fmod_src(8..16, 46, 48, op.srcs[0]); + self.set_dst(op.dst); + } + + fn encode_ffma(&mut self, op: &OpFFma) { + // FFMA doesn't have any abs flags. + assert!(!op.srcs[0].src_mod.has_fabs()); + assert!(!op.srcs[1].src_mod.has_fabs()); + assert!(!op.srcs[2].src_mod.has_fabs()); + + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5980); + self.set_reg_src_ref(20..28, op.srcs[1].src_ref); + } + SrcRef::Imm32(i) => { + self.set_opcode(0x3280); + self.set_src_imm_f20(20..39, 56, *i); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x4980); + self.set_src_cb(20..39, cb); + } + src1 => panic!("unsupported src1 type for IMUL: {src1}"), + } + + self.set_dst(op.dst); + self.set_reg_src_ref(8..16, op.srcs[0].src_ref); + self.set_reg_src_ref(39..47, op.srcs[2].src_ref); + + self.set_bit( + 48, + op.srcs[0].src_mod.has_fneg() ^ op.srcs[1].src_mod.has_fneg(), + ); + self.set_bit(49, op.srcs[2].src_mod.has_fneg()); + self.set_bit(50, op.saturate); + self.set_rnd_mode(51..53, op.rnd_mode); + + self.set_bit(53, op.ftz); + self.set_bit(54, op.dnz); + } + + fn set_float_cmp_op(&mut self, range: Range, op: FloatCmpOp) { + assert!(range.len() == 4); + self.set_field( + range, + match op { + FloatCmpOp::OrdLt => 0x01_u8, + FloatCmpOp::OrdEq => 0x02_u8, + FloatCmpOp::OrdLe => 0x03_u8, + FloatCmpOp::OrdGt => 0x04_u8, + FloatCmpOp::OrdNe => 0x05_u8, + FloatCmpOp::OrdGe => 0x06_u8, + FloatCmpOp::UnordLt => 0x09_u8, + 
FloatCmpOp::UnordEq => 0x0a_u8, + FloatCmpOp::UnordLe => 0x0b_u8, + FloatCmpOp::UnordGt => 0x0c_u8, + FloatCmpOp::UnordNe => 0x0d_u8, + FloatCmpOp::UnordGe => 0x0e_u8, + FloatCmpOp::IsNum => 0x07_u8, + FloatCmpOp::IsNan => 0x08_u8, + }, + ); + } + + fn encode_fset(&mut self, op: &OpFSet) { + match &op.srcs[1].src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x3000); + self.set_src_imm_f20(20..39, 56, *imm32); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5800); + self.set_reg_fmod_src(20..28, 44, 53, op.srcs[1]); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4800); + self.set_cb_fmod_src(20..39, 44, 6, op.srcs[1]); + } + src => panic!("Unsupported src type for FSET: {src}"), + } + + self.set_reg_fmod_src(8..16, 54, 43, op.srcs[0]); + self.set_pred_src(39..42, 42, SrcRef::True.into()); + self.set_float_cmp_op(48..52, op.cmp_op); + self.set_bit(52, true); // bool float + self.set_bit(55, op.ftz); + self.set_dst(op.dst); + } + + fn encode_fsetp(&mut self, op: &OpFSetP) { + match &op.srcs[1].src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x36b0); + self.set_src_imm_f20(20..39, 56, *imm32); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5bb0); + self.set_reg_fmod_src(20..28, 44, 6, op.srcs[1]); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4bb0); + self.set_cb_fmod_src(20..39, 44, 6, op.srcs[1]); + } + src => panic!("Unsupported src type for FSETP: {src}"), + } + + self.set_pred_dst(3..6, op.dst); + self.set_pred_dst(0..3, Dst::None); // dst1 + self.set_pred_src(39..42, 42, op.accum); + self.set_pred_set_op(45..47, op.set_op); + self.set_bit(47, op.ftz); + self.set_float_cmp_op(48..52, op.cmp_op); + self.set_reg_fmod_src(8..16, 7, 43, op.srcs[0]); + } + + fn encode_fswzadd(&mut self, op: &OpFSwzAdd) { + self.set_opcode(0x50f8); + + self.set_dst(op.dst); + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(20..28, op.srcs[1]); + + self.set_field( + 39..41, + match op.rnd_mode { + FRndMode::NearestEven => 0u8, + 
FRndMode::NegInf => 1u8, + FRndMode::PosInf => 2u8, + FRndMode::Zero => 3u8, + }, + ); + + for (i, op) in op.ops.iter().enumerate() { + self.set_field( + 28 + i * 2..28 + (i + 1) * 2, + match op { + FSwzAddOp::Add => 0u8, + FSwzAddOp::SubLeft => 1u8, + FSwzAddOp::SubRight => 2u8, + FSwzAddOp::MoveLeft => 3u8, + }, + ); + } + + self.set_bit(38, false); /* .NDV */ + self.set_bit(44, op.ftz); + self.set_bit(47, false); /* dst.CC */ + } + + fn encode_rro(&mut self, op: &OpRro) { + match &op.src.src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x3890); + self.set_src_imm_f20(20..39, 56, *imm32); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c90); + self.set_reg_fmod_src(20..28, 49, 45, op.src); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4c90); + self.set_cb_fmod_src(20..39, 49, 45, op.src); + } + src => panic!("Unsupported src type for RRO: {src}"), + } + + self.set_dst(op.dst); + self.set_field( + 39..40, + match op.op { + RroOp::SinCos => 0u8, + RroOp::Exp2 => 1u8, + }, + ); + } + + fn encode_mufu(&mut self, op: &OpMuFu) { + assert!(op.src.is_reg_or_zero()); + + // TODO: This is following ALU encoding, figure out the correct form of this. 
+ self.set_opcode(0x5080); + + self.set_dst(op.dst); + self.set_reg_fmod_src(8..16, 46, 48, op.src); + + self.set_field( + 20..24, + match op.op { + MuFuOp::Cos => 0_u8, + MuFuOp::Sin => 1_u8, + MuFuOp::Exp2 => 2_u8, + MuFuOp::Log2 => 3_u8, + MuFuOp::Rcp => 4_u8, + MuFuOp::Rsq => 5_u8, + MuFuOp::Rcp64H => 6_u8, + MuFuOp::Rsq64H => 7_u8, + // SQRT is only on SM52 and later + MuFuOp::Sqrt if self.sm >= 52 => 8_u8, + MuFuOp::Sqrt => panic!("MUFU.SQRT not supported on SM50"), + MuFuOp::Tanh => panic!("MUFU.TANH not supported on SM50"), + }, + ); + } + + fn encode_flo(&mut self, op: &OpFlo) { + match op.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c30); + self.set_reg_src_ref(20..28, op.src.src_ref); + } + SrcRef::Imm32(imm) => { + self.set_opcode(0x3830); + self.set_src_imm_i20(20..39, 56, imm); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x4c30); + self.set_src_cb(20..39, &cb); + } + src => panic!("Unsupported src type for FLO: {src}"), + } + + self.set_dst(op.dst); + self.set_bit(40, op.src.src_mod.is_bnot()); + self.set_bit(48, op.signed); + self.set_bit(41, op.return_shift_amount); + self.set_bit(47, false); /* dst.CC */ + } + + fn encode_dadd(&mut self, op: &OpDAdd) { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c70); + self.set_reg_fmod_src(20..28, 49, 45, op.srcs[1]); + } + SrcRef::Imm32(imm) => { + self.set_opcode(0x3870); + self.set_src_imm_f20(20..39, 56, *imm); + assert!(op.srcs[1].src_mod.is_none()); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4c70); + self.set_cb_fmod_src(20..39, 49, 45, op.srcs[1]); + } + _ => panic!("Unsupported src type"), + } + + self.set_dst(op.dst); + self.set_reg_fmod_src(8..16, 46, 48, op.srcs[0]); + self.set_rnd_mode(39..41, op.rnd_mode); + } + + fn encode_dfma(&mut self, op: &OpDFma) { + match &op.srcs[2].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5b70); + 
self.set_reg_src_ref(20..28, op.srcs[1].src_ref); + } + SrcRef::Imm32(imm) => { + self.set_opcode(0x3670); + self.set_src_imm_f20(20..39, 56, *imm); + assert!(op.srcs[1].src_mod.is_none()); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x4b70); + self.set_src_cb(20..39, cb); + } + _ => panic!("Invalid dfma src1: {}", op.srcs[1]), + } + self.set_reg_src_ref(39..47, op.srcs[2].src_ref); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x5370); + self.set_reg_src_ref(39..47, op.srcs[1].src_ref); + self.set_src_cb(20..39, cb); + } + _ => panic!("Invalid dfma src2: {}", op.srcs[2]), + } + + self.set_dst(op.dst); + self.set_reg_src_ref(8..16, op.srcs[0].src_ref); + + assert!(!op.srcs[0].src_mod.has_fabs()); + assert!(!op.srcs[1].src_mod.has_fabs()); + assert!(!op.srcs[2].src_mod.has_fabs()); + self.set_bit( + 48, + op.srcs[0].src_mod.has_fneg() ^ op.srcs[1].src_mod.has_fneg(), + ); + self.set_bit(49, op.srcs[2].src_mod.has_fneg()); + + self.set_rnd_mode(50..52, op.rnd_mode); + } + + fn encode_dmnmx(&mut self, op: &OpDMnMx) { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c50); + self.set_reg_fmod_src(20..28, 49, 45, op.srcs[1]); + } + SrcRef::Imm32(imm32) => { + self.set_opcode(0x3850); + self.set_src_imm_f20(20..39, 56, *imm32); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4c50); + self.set_cb_fmod_src(20..39, 49, 45, op.srcs[1]); + } + src => panic!("Unsupported src type for FMNMX: {src}"), + } + + self.set_reg_fmod_src(8..16, 46, 48, op.srcs[0]); + self.set_dst(op.dst); + self.set_pred_src(39..42, 42, op.min); + } + + fn encode_dmul(&mut self, op: &OpDMul) { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c80); + self.set_reg_src_ref(20..28, op.srcs[1].src_ref); + } + SrcRef::Imm32(imm) => { + self.set_opcode(0x3880); + self.set_src_imm_f20(20..39, 56, *imm); + assert!(op.srcs[1].src_mod.is_none()); + } + SrcRef::CBuf(cb) => { + self.set_opcode(0x4c80); + self.set_src_cb(20..39, cb); + } + 
_ => panic!("Invalid dmul src1: {}", op.srcs[1]), + } + + self.set_dst(op.dst); + self.set_reg_src_ref(8..16, op.srcs[0].src_ref); + + self.set_rnd_mode(39..41, op.rnd_mode); + + assert!(!op.srcs[0].src_mod.has_fabs()); + assert!(!op.srcs[1].src_mod.has_fabs()); + self.set_bit( + 48, + op.srcs[0].src_mod.has_fneg() ^ op.srcs[1].src_mod.has_fneg(), + ); + } + + fn encode_dsetp(&mut self, op: &OpDSetP) { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5b80); + self.set_reg_fmod_src(20..28, 44, 6, op.srcs[1]); + } + SrcRef::Imm32(imm) => { + self.set_opcode(0x3680); + self.set_src_imm_f20(20..39, 56, *imm); + assert!(op.srcs[1].src_mod.is_none()); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4b80); + self.set_reg_fmod_src(20..39, 44, 6, op.srcs[1]); + } + _ => panic!("Invalid dmul src1: {}", op.srcs[1]), + } + + self.set_pred_dst(3..6, op.dst); + self.set_pred_dst(0..3, Dst::None); // dst1 + self.set_pred_src(39..42, 42, op.accum); + self.set_pred_set_op(45..47, op.set_op); + self.set_float_cmp_op(48..52, op.cmp_op); + self.set_reg_fmod_src(8..16, 7, 43, op.srcs[0]); + } + + fn encode_iadd2(&mut self, op: &OpIAdd2) { + let carry_in = match op.carry_in.src_ref { + SrcRef::Reg(reg) if reg.file() == RegFile::Carry => true, + SrcRef::Zero => false, + other => panic!("invalid carry_in src for IADD2 {other}"), + }; + let carry_out = match op.carry_out { + Dst::Reg(reg) if reg.file() == RegFile::Carry => true, + Dst::None => false, + other => panic!("invalid carry_out dst for IADD2 {other}"), + }; + + if let Some(imm32) = op.srcs[1].as_imm_not_i20() { + self.set_opcode(0x1c00); + + self.set_dst(op.dst); + self.set_reg_ineg_src(8..16, 56, op.srcs[0]); + self.set_src_imm32(20..52, imm32); + + self.set_bit(53, carry_in); + self.set_bit(52, carry_out); + } else { + match &op.srcs[1].src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c10); + self.set_reg_ineg_src(20..28, 48, op.srcs[1]); + } + SrcRef::Imm32(imm) => { + 
self.set_opcode(0x3810); + self.set_src_imm_i20(20..39, 56, *imm); + } + SrcRef::CBuf(_) => { + self.set_opcode(0x4c10); + self.set_cb_ineg_src(20..39, 48, op.srcs[1]); + } + src => panic!("Unsupported src type for IADD: {src}"), + } + + self.set_dst(op.dst); + self.set_reg_ineg_src(8..16, 49, op.srcs[0]); + + self.set_bit(43, carry_in); + self.set_bit(47, carry_out); + } + } + + fn encode_prmt(&mut self, op: &OpPrmt) { + match &op.sel.src_ref { + SrcRef::Imm32(imm) => { + self.set_opcode(0x36c0); + self.set_src_imm_i20(20..39, 56, *imm); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5bc0); + self.set_reg_src(20..28, op.sel); + } + SrcRef::CBuf(cbuf) => { + self.set_opcode(0x4bc0); + self.set_src_cb(20..39, cbuf); + } + src => panic!("Unsupported src type for PRMT: {src}"), + } + + self.set_reg_src(8..16, op.srcs[0]); + self.set_reg_src(39..47, op.srcs[1]); + self.set_dst(op.dst); + // TODO: subop? + } + + fn encode_suld(&mut self, op: &OpSuLd) { + self.set_opcode(0xeb00); + + assert!(op.mask == 0x1 || op.mask == 0x3 || op.mask == 0xf); + self.set_field(20..24, op.mask); + self.set_image_dim(33..36, op.image_dim); + + // mem_eviction_policy not a thing for sm < 70 + + let scope = match op.mem_order { + MemOrder::Constant => MemScope::System, + MemOrder::Weak => MemScope::CTA, + MemOrder::Strong(s) => s, + }; + + self.set_field( + 24..26, + match scope { + MemScope::CTA => 0_u8, + /* SM => 1_u8, */ + MemScope::GPU => 2_u8, + MemScope::System => 3_u8, + }, + ); + + self.set_dst(op.dst); + + self.set_reg_src(8..16, op.coord); + self.set_reg_src(39..47, op.handle); + } + + fn encode_suatom(&mut self, op: &OpSuAtom) { + if matches!(op.atom_op, AtomOp::CmpExch) { + self.set_opcode(0xeac0); + } else { + self.set_opcode(0xea60); + } + + let atom_type: u8 = match op.atom_type { + AtomType::U32 => 0, + AtomType::I32 => 1, + AtomType::F32 => 3, + AtomType::U64 => 2, + AtomType::I64 => 5, + _ => panic!("Unsupported atom type {}", op.atom_type), + }; + + let 
atom_op: u8 = match op.atom_op { + AtomOp::Add => 0, + AtomOp::Min => 1, + AtomOp::Max => 2, + AtomOp::Inc => 3, + AtomOp::Dec => 4, + AtomOp::And => 5, + AtomOp::Or => 6, + AtomOp::Xor => 7, + AtomOp::Exch => 8, + AtomOp::CmpExch => 0, + }; + + self.set_image_dim(33..36, op.image_dim); + self.set_field(36..39, atom_type); + self.set_field(29..33, atom_op); + + // The hardware requires that we set .D on atomics. This is safe to do + // in in the emit code because it only affects format conversion, not + // surface coordinates and atomics are required to be performed with + // image formats that that exactly match the shader data type. So, for + // instance, a uint32_t atomic has to happen on an R32_UINT or R32_SINT + // image. + self.set_bit(52, true); // .D + + self.set_dst(op.dst); + + self.set_reg_src(20..28, op.data); + self.set_reg_src(8..16, op.coord); + self.set_reg_src(39..47, op.handle); + } + + fn encode_isberd(&mut self, op: &OpIsberd) { + self.set_opcode(0xefd0); + self.set_dst(op.dst); + self.set_reg_src(8..16, op.idx); + } + + fn encode_out(&mut self, op: &OpOut) { + match &op.stream.src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0xf6e0); + self.set_src_imm_i20(20..39, 56, *imm32); + } + SrcRef::CBuf(cbuf) => { + self.set_opcode(0xebe0); + self.set_src_cb(20..39, cbuf); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0xfbe0); + self.set_reg_src(20..28, op.stream); + } + src => panic!("Unsupported src type for OUT: {src}"), + } + + self.set_field( + 39..41, + match op.out_type { + OutType::Emit => 1_u8, + OutType::Cut => 2_u8, + OutType::EmitThenCut => 3_u8, + }, + ); + + self.set_reg_src(8..16, op.handle); + self.set_dst(op.dst); + } + + fn encode_bfe(&mut self, op: &OpBfe) { + match &op.range.src_ref { + SrcRef::Imm32(imm32) => { + self.set_opcode(0x3800); + // We guarantee that imm32 is 16bits, as it's a result of a PRMT + // instruction that only fills the bottom two bytes. 
+ self.set_src_imm_i20(20..39, 56, *imm32 & 0xffff); + } + SrcRef::CBuf(cbuf) => { + self.set_opcode(0x4c00); + self.set_src_cb(20..39, cbuf); + } + SrcRef::Zero | SrcRef::Reg(_) => { + self.set_opcode(0x5c00); + self.set_reg_src(20..28, op.range); + } + src => panic!("Unsupported src type for BFE: {src}"), + } + + if op.signed { + self.set_bit(48, true); + } + + if op.reverse { + self.set_bit(40, true); + } + + self.set_reg_src(8..16, op.base); + self.set_dst(op.dst); + } + + pub fn encode( + instr: &Instr, + sm: u8, + ip: usize, + labels: &HashMap, + ) -> Self { + assert!(sm >= 50); + + let mut si = SM50Instr::new(sm); + + match &instr.op { + Op::FAdd(op) => si.encode_fadd(op), + Op::FMnMx(op) => si.encode_fmnmx(op), + Op::FMul(op) => si.encode_fmul(op), + Op::FFma(op) => si.encode_ffma(op), + Op::FSet(op) => si.encode_fset(op), + Op::FSetP(op) => si.encode_fsetp(op), + Op::FSwzAdd(op) => si.encode_fswzadd(op), + Op::Rro(op) => si.encode_rro(op), + Op::MuFu(op) => si.encode_mufu(op), + Op::Flo(op) => si.encode_flo(op), + Op::DAdd(op) => si.encode_dadd(op), + Op::DFma(op) => si.encode_dfma(op), + Op::DMnMx(op) => si.encode_dmnmx(op), + Op::DMul(op) => si.encode_dmul(op), + Op::DSetP(op) => si.encode_dsetp(op), + Op::IAdd2(op) => si.encode_iadd2(op), + Op::Mov(op) => si.encode_mov(op), + Op::Sel(op) => si.encode_sel(op), + Op::Shfl(op) => si.encode_shfl(op), + Op::Vote(op) => si.encode_vote(op), + Op::PSetP(op) => si.encode_psetp(op), + Op::SuSt(op) => si.encode_sust(op), + Op::S2R(op) => si.encode_s2r(op), + Op::PopC(op) => si.encode_popc(op), + Op::Prmt(op) => si.encode_prmt(op), + Op::Ld(op) => si.encode_ld(op), + Op::Ldc(op) => si.encode_ldc(op), + Op::St(op) => si.encode_st(op), + Op::Lop2(op) => si.encode_lop2(op), + Op::Shf(op) => si.encode_shf(op), + Op::Shl(op) => si.encode_shl(op), + Op::Shr(op) => si.encode_shr(op), + Op::F2F(op) => si.encode_f2f(op), + Op::F2I(op) => si.encode_f2i(op), + Op::I2F(op) => si.encode_i2f(op), + Op::I2I(op) => 
si.encode_i2i(op), + Op::IMad(op) => si.encode_imad(op), + Op::IMul(op) => si.encode_imul(op), + Op::IMnMx(op) => si.encode_imnmx(op), + Op::ISetP(op) => si.encode_isetp(op), + Op::Tex(op) => si.encode_tex(op), + Op::Tld(op) => si.encode_tld(op), + Op::Tld4(op) => si.encode_tld4(op), + Op::Tmml(op) => si.encode_tmml(op), + Op::Txd(op) => si.encode_txd(op), + Op::Txq(op) => si.encode_txq(op), + Op::Ipa(op) => si.encode_ipa(op), + Op::ALd(op) => si.encode_ald(op), + Op::ASt(op) => si.encode_ast(op), + Op::MemBar(op) => si.encode_membar(op), + Op::Atom(op) => si.encode_atom(op), + Op::Bra(op) => si.encode_bra(op, ip, labels), + Op::Exit(op) => si.encode_exit(op), + Op::Bar(op) => si.encode_bar(op), + Op::SuLd(op) => si.encode_suld(op), + Op::SuAtom(op) => si.encode_suatom(op), + Op::Kill(op) => si.encode_kill(op), + Op::CS2R(op) => si.encode_cs2r(op), + Op::Nop(_) => si.encode_nop(), + Op::Isberd(op) => si.encode_isberd(&op), + Op::Out(op) => si.encode_out(&op), + Op::Bfe(op) => si.encode_bfe(&op), + _ => panic!("Unhandled instruction {}", instr.op), + } + + si.set_pred(&instr.pred); + si.set_instr_deps(&instr.deps); + + si + } +} + +fn encode_instr( + instr_index: usize, + instr: Option<&Box>, + sm: u8, + labels: &HashMap, + ip: &mut usize, + sched_instr: &mut [u32; 2], +) -> [u32; 2] { + let res = instr + .map(|x| SM50Instr::encode(x, sm, *ip, labels)) + .unwrap_or_else(|| SM50Instr::nop(sm)); + + *ip += 8; + + BitMutView::new(sched_instr) + .set_field(21 * instr_index..21 * (instr_index + 1), res.sched); + + res.inst +} + +pub fn encode_sm50_shader(sm: &dyn ShaderModel, s: &Shader<'_>) -> Vec { + assert!(s.functions.len() == 1); + let func = &s.functions[0]; + + let mut num_instrs = 0_usize; + let mut labels = HashMap::new(); + for b in &func.blocks { + // We ensure blocks will have groups of 3 instructions with a + // schedule instruction before each groups. As we should never jump + // to a schedule instruction, we account for that here. 
+ labels.insert(b.label, num_instrs + 8); + + let block_num_instrs = align_up(b.instrs.len(), 3); + + // Every 3 instructions, we have a new schedule instruction so we + // need to account for that. + num_instrs += (block_num_instrs + (block_num_instrs / 3)) * 8; + } + + let mut encoded = Vec::new(); + for b in &func.blocks { + // A block is composed of groups of 3 instructions. + let block_num_instrs = align_up(b.instrs.len(), 3); + + let mut instrs_iter = b.instrs.iter(); + + for _ in 0..(block_num_instrs / 3) { + let mut ip = ((encoded.len() / 2) + 1) * 8; + + let mut sched_instr = [0x0; 2]; + + let instr0 = encode_instr( + 0, + instrs_iter.next(), + sm.sm(), + &labels, + &mut ip, + &mut sched_instr, + ); + let instr1 = encode_instr( + 1, + instrs_iter.next(), + sm.sm(), + &labels, + &mut ip, + &mut sched_instr, + ); + let instr2 = encode_instr( + 2, + instrs_iter.next(), + sm.sm(), + &labels, + &mut ip, + &mut sched_instr, + ); + + encoded.extend_from_slice(&sched_instr[..]); + encoded.extend_from_slice(&instr0[..]); + encoded.extend_from_slice(&instr1[..]); + encoded.extend_from_slice(&instr2[..]); + } + } + + encoded +} diff --git a/src/nouveau/compiler/nak/encode_sm70.rs b/src/nouveau/compiler/nak/encode_sm70.rs new file mode 100644 index 00000000000..a830a3ca7a0 --- /dev/null +++ b/src/nouveau/compiler/nak/encode_sm70.rs @@ -0,0 +1,2599 @@ +// Copyright © 2022 Collabora, Ltd. 
+// SPDX-License-Identifier: MIT + +use crate::ir::*; +use bitview::*; + +use std::collections::HashMap; +use std::ops::Range; + +struct ALURegRef { + pub reg: RegRef, + pub abs: bool, + pub neg: bool, + pub swizzle: SrcSwizzle, +} + +struct ALUCBufRef { + pub cb: CBufRef, + pub abs: bool, + pub neg: bool, + pub swizzle: SrcSwizzle, +} + +enum ALUSrc { + None, + Imm32(u32), + Reg(ALURegRef), + UReg(ALURegRef), + CBuf(ALUCBufRef), +} + +fn src_is_zero_or_gpr(src: &Src) -> bool { + match src.src_ref { + SrcRef::Zero => true, + SrcRef::Reg(reg) => reg.file() == RegFile::GPR, + _ => false, + } +} + +fn src_mod_has_abs(src_mod: SrcMod) -> bool { + match src_mod { + SrcMod::None | SrcMod::FNeg | SrcMod::INeg | SrcMod::BNot => false, + SrcMod::FAbs | SrcMod::FNegAbs => true, + } +} + +fn src_mod_has_neg(src_mod: SrcMod) -> bool { + match src_mod { + SrcMod::None | SrcMod::FAbs => false, + SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::INeg | SrcMod::BNot => true, + } +} + +fn src_mod_is_bnot(src_mod: SrcMod) -> bool { + match src_mod { + SrcMod::None => false, + SrcMod::BNot => true, + _ => panic!("Not an predicate source modifier"), + } +} + +fn dst_is_bar(dst: Dst) -> bool { + match dst { + Dst::None => false, + Dst::SSA(ssa) => ssa.file().unwrap() == RegFile::Bar, + Dst::Reg(reg) => reg.file() == RegFile::Bar, + } +} + +impl ALUSrc { + fn from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc { + let Some(src) = src else { + return ALUSrc::None; + }; + + match src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + let reg = match src.src_ref { + SrcRef::Zero => { + let file = if op_is_uniform { + RegFile::UGPR + } else { + RegFile::GPR + }; + RegRef::zero(file, 1) + } + SrcRef::Reg(reg) => reg, + _ => panic!("Invalid source ref"), + }; + assert!(reg.comps() <= 2); + let alu_ref = ALURegRef { + reg: reg, + abs: src_mod_has_abs(src.src_mod), + neg: src_mod_has_neg(src.src_mod), + swizzle: src.src_swizzle, + }; + if op_is_uniform { + assert!(reg.file() == RegFile::UGPR); + 
ALUSrc::Reg(alu_ref) + } else { + match reg.file() { + RegFile::GPR => ALUSrc::Reg(alu_ref), + RegFile::UGPR => ALUSrc::UReg(alu_ref), + _ => panic!("Invalid ALU register file"), + } + } + } + SrcRef::Imm32(i) => { + assert!(src.src_mod.is_none()); + assert!(src.src_swizzle.is_none()); + ALUSrc::Imm32(i) + } + SrcRef::CBuf(cb) => { + let alu_ref = ALUCBufRef { + cb: cb, + abs: src_mod_has_abs(src.src_mod), + neg: src_mod_has_neg(src.src_mod), + swizzle: src.src_swizzle, + }; + ALUSrc::CBuf(alu_ref) + } + _ => panic!("Invalid ALU source"), + } + } + + pub fn has_src_mod(&self) -> bool { + match self { + ALUSrc::Reg(reg) | ALUSrc::UReg(reg) => reg.abs || reg.neg, + ALUSrc::CBuf(cb) => cb.abs || cb.neg, + _ => false, + } + } +} + +struct SM70Instr { + inst: [u32; 4], + sm: u8, +} + +impl BitViewable for SM70Instr { + fn bits(&self) -> usize { + BitView::new(&self.inst).bits() + } + + fn get_bit_range_u64(&self, range: Range) -> u64 { + BitView::new(&self.inst).get_bit_range_u64(range) + } +} + +impl BitMutViewable for SM70Instr { + fn set_bit_range_u64(&mut self, range: Range, val: u64) { + BitMutView::new(&mut self.inst).set_bit_range_u64(range, val); + } +} + +impl SetFieldU64 for SM70Instr { + fn set_field_u64(&mut self, range: Range, val: u64) { + BitMutView::new(&mut self.inst).set_field_u64(range, val); + } +} + +impl SM70Instr { + fn set_bit(&mut self, bit: usize, val: bool) { + BitMutView::new(&mut self.inst).set_bit(bit, val); + } + + fn set_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 8); + assert!(reg.file() == RegFile::GPR); + self.set_field(range, reg.base_idx()); + } + + fn set_ureg(&mut self, range: Range, reg: RegRef) { + assert!(self.sm >= 75); + assert!(range.len() == 8); + assert!(reg.file() == RegFile::UGPR); + assert!(reg.base_idx() <= 63); + self.set_field(range, reg.base_idx()); + } + + fn set_pred_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 3); + assert!(reg.base_idx() <= 7); + 
assert!(reg.comps() == 1); + self.set_field(range, reg.base_idx()); + } + + fn set_reg_src(&mut self, range: Range, src: Src) { + assert!(src.src_mod.is_none()); + match src.src_ref { + SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)), + SrcRef::Reg(reg) => self.set_reg(range, reg), + _ => panic!("Not a register"), + } + } + + fn set_pred_dst(&mut self, range: Range, dst: Dst) { + match dst { + Dst::None => { + self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1)); + } + Dst::Reg(reg) => self.set_pred_reg(range, reg), + _ => panic!("Not a register"), + } + } + + fn set_pred_src_file( + &mut self, + range: Range, + not_bit: usize, + src: Src, + file: RegFile, + ) { + // The default for predicates is true + let true_reg = RegRef::new(file, 7, 1); + + let (not, reg) = match src.src_ref { + SrcRef::True => (false, true_reg), + SrcRef::False => (true, true_reg), + SrcRef::Reg(reg) => { + assert!(reg.file() == file); + (false, reg) + } + _ => panic!("Not a register"), + }; + self.set_pred_reg(range, reg); + self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod)); + } + + fn set_pred_src(&mut self, range: Range, not_bit: usize, src: Src) { + self.set_pred_src_file(range, not_bit, src, RegFile::Pred); + } + + fn set_upred_src(&mut self, range: Range, not_bit: usize, src: Src) { + self.set_pred_src_file(range, not_bit, src, RegFile::UPred); + } + + fn set_src_cb(&mut self, range: Range, cx_bit: usize, cb: &CBufRef) { + let mut v = BitMutView::new_subset(self, range); + v.set_field(6..22, cb.offset); + match cb.buf { + CBuf::Binding(idx) => { + v.set_field(22..27, idx); + self.set_bit(cx_bit, false); + } + CBuf::BindlessUGPR(reg) => { + assert!(reg.base_idx() <= 63); + assert!(reg.file() == RegFile::UGPR); + v.set_field(0..6, reg.base_idx()); + self.set_bit(cx_bit, true); + } + CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"), + } + } + + fn set_opcode(&mut self, opcode: u16) { + self.set_field(0..12, opcode); + } + + fn set_pred(&mut self, 
pred: &Pred) { + assert!(!pred.is_false()); + self.set_pred_reg( + 12..15, + match pred.pred_ref { + PredRef::None => RegRef::zero(RegFile::Pred, 1), + PredRef::Reg(reg) => reg, + PredRef::SSA(_) => panic!("SSA values must be lowered"), + }, + ); + self.set_bit(15, pred.pred_inv); + } + + fn set_dst(&mut self, dst: Dst) { + match dst { + Dst::None => self.set_reg(16..24, RegRef::zero(RegFile::GPR, 1)), + Dst::Reg(reg) => self.set_reg(16..24, reg), + _ => panic!("Not a register"), + } + } + + fn set_udst(&mut self, dst: Dst) { + match dst { + Dst::None => self.set_ureg(16..24, RegRef::zero(RegFile::UGPR, 1)), + Dst::Reg(reg) => self.set_ureg(16..24, reg), + _ => panic!("Not a register"), + } + } + + fn set_bar_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 4); + assert!(reg.file() == RegFile::Bar); + assert!(reg.comps() == 1); + self.set_field(range, reg.base_idx()); + } + + fn set_bar_dst(&mut self, range: Range, dst: Dst) { + self.set_bar_reg(range, *dst.as_reg().unwrap()); + } + + fn set_bar_src(&mut self, range: Range, src: Src) { + assert!(src.src_mod.is_none()); + self.set_bar_reg(range, *src.src_ref.as_reg().unwrap()); + } + + fn set_swizzle(&mut self, range: Range, swizzle: SrcSwizzle) { + assert!(range.len() == 2); + + self.set_field( + range, + match swizzle { + SrcSwizzle::None => 0x00_u8, + SrcSwizzle::Xx => 0x02_u8, + SrcSwizzle::Yy => 0x03_u8, + }, + ); + } + + fn set_alu_reg( + &mut self, + range: Range, + abs_bit: usize, + neg_bit: usize, + swizzle_range: Range, + file: RegFile, + is_fp16_alu: bool, + has_mod: bool, + reg: &ALURegRef, + ) { + match file { + RegFile::GPR => self.set_reg(range, reg.reg), + RegFile::UGPR => self.set_ureg(range, reg.reg), + _ => panic!("Invalid ALU src register file"), + } + + if has_mod { + self.set_bit(abs_bit, reg.abs); + self.set_bit(neg_bit, reg.neg); + } else { + assert!(!reg.abs && !reg.neg); + } + + if is_fp16_alu { + self.set_swizzle(swizzle_range, reg.swizzle); + } else { + 
assert!(reg.swizzle == SrcSwizzle::None); + } + } + + fn encode_alu_src0( + &mut self, + src: &ALUSrc, + file: RegFile, + is_fp16_alu: bool, + ) { + let reg = match src { + ALUSrc::None => return, + ALUSrc::Reg(reg) => reg, + _ => panic!("Invalid ALU src"), + }; + self.set_alu_reg(24..32, 73, 72, 74..76, file, is_fp16_alu, true, reg); + } + + fn encode_alu_src2( + &mut self, + src: &ALUSrc, + file: RegFile, + is_fp16_alu: bool, + bit74_75_are_mod: bool, + ) { + let reg = match src { + ALUSrc::None => return, + ALUSrc::Reg(reg) => reg, + _ => panic!("Invalid ALU src"), + }; + self.set_alu_reg( + 64..72, + 74, + 75, + 81..83, + file, + is_fp16_alu, + bit74_75_are_mod, + reg, + ); + } + + fn encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) { + self.set_alu_reg( + 32..40, + 62, + 63, + 60..62, + RegFile::GPR, + is_fp16_alu, + true, + reg, + ); + } + + fn encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) { + self.set_ureg(32..40, reg.reg); + self.set_bit(62, reg.abs); + self.set_bit(63, reg.neg); + + if is_fp16_alu { + self.set_swizzle(60..62, reg.swizzle); + } else { + assert!(reg.swizzle == SrcSwizzle::None); + } + + self.set_bit(91, true); + } + + fn encode_alu_imm(&mut self, imm: &u32) { + self.set_field(32..64, *imm); + } + + fn encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool) { + self.set_src_cb(32..59, 91, &cb.cb); + self.set_bit(62, cb.abs); + self.set_bit(63, cb.neg); + + if is_fp16_alu { + self.set_swizzle(60..62, cb.swizzle); + } else { + assert!(cb.swizzle == SrcSwizzle::None); + } + } + + fn encode_alu_base( + &mut self, + opcode: u16, + dst: Option<&Dst>, + src0: Option<&Src>, + src1: Option<&Src>, + src2: Option<&Src>, + is_fp16_alu: bool, + ) { + if let Some(dst) = dst { + self.set_dst(*dst); + } + + let src0 = ALUSrc::from_src(src0, false); + let src1 = ALUSrc::from_src(src1, false); + let src2 = ALUSrc::from_src(src2, false); + + // Bits 74..76 are used both for the swizzle on src0 and for the source + // modifier 
for the register source of src1 and src2.  When both are
+        // registers, it's used for src2.  The hardware elects to always support
+        // a swizzle and not support source modifiers in that case.
+        let bit74_75_are_mod = !is_fp16_alu
+            || matches!(src1, ALUSrc::None)
+            || matches!(src2, ALUSrc::None);
+        debug_assert!(bit74_75_are_mod || !src0.has_src_mod());
+
+        self.encode_alu_src0(&src0, RegFile::GPR, is_fp16_alu);
+
+        // The form value depends on which operand is not a plain register.
+        // When src2 is a uniform register, immediate or cbuf, it takes the
+        // 32.. operand slot and src1 is written to field 64..72 instead.
+        let form = match &src2 {
+            ALUSrc::None | ALUSrc::Reg(_) => {
+                self.encode_alu_src2(
+                    &src2,
+                    RegFile::GPR,
+                    is_fp16_alu,
+                    bit74_75_are_mod,
+                );
+                match &src1 {
+                    ALUSrc::None => 1_u8, // form
+                    ALUSrc::Reg(reg1) => {
+                        self.encode_alu_reg(reg1, is_fp16_alu);
+                        1_u8 // form
+                    }
+                    ALUSrc::UReg(reg1) => {
+                        self.encode_alu_ureg(reg1, is_fp16_alu);
+                        6_u8 // form
+                    }
+                    ALUSrc::Imm32(imm1) => {
+                        self.encode_alu_imm(imm1);
+                        4_u8 // form
+                    }
+                    ALUSrc::CBuf(cb1) => {
+                        self.encode_alu_cb(cb1, is_fp16_alu);
+                        5_u8 // form
+                    }
+                }
+            }
+            ALUSrc::UReg(reg2) => {
+                self.encode_alu_ureg(reg2, is_fp16_alu);
+                self.encode_alu_src2(
+                    &src1,
+                    RegFile::GPR,
+                    is_fp16_alu,
+                    bit74_75_are_mod,
+                );
+                7_u8 // form
+            }
+            ALUSrc::Imm32(imm2) => {
+                self.encode_alu_imm(imm2);
+                self.encode_alu_src2(
+                    &src1,
+                    RegFile::GPR,
+                    is_fp16_alu,
+                    bit74_75_are_mod,
+                );
+                2_u8 // form
+            }
+            ALUSrc::CBuf(cb2) => {
+                // TODO set_src_cx
+                self.encode_alu_cb(cb2, is_fp16_alu);
+                self.encode_alu_src2(
+                    &src1,
+                    RegFile::GPR,
+                    is_fp16_alu,
+                    bit74_75_are_mod,
+                );
+                3_u8 // form
+            }
+        };
+
+        self.set_field(0..9, opcode);
+        self.set_field(9..12, form);
+    }
+
+    /// Encodes a regular ALU instruction: `encode_alu_base` with
+    /// `is_fp16_alu = false`.
+    fn encode_alu(
+        &mut self,
+        opcode: u16,
+        dst: Option<&Dst>,
+        src0: Option<&Src>,
+        src1: Option<&Src>,
+        src2: Option<&Src>,
+    ) {
+        self.encode_alu_base(opcode, dst, src0, src1, src2, false);
+    }
+
+    /// Encodes an fp16 ALU instruction: `encode_alu_base` with
+    /// `is_fp16_alu = true`, which enables swizzle encoding on the sources.
+    fn encode_fp16_alu(
+        &mut self,
+        opcode: u16,
+        dst: Option<&Dst>,
+        src0: Option<&Src>,
+        src1: Option<&Src>,
+        src2: Option<&Src>,
+    ) {
+        self.encode_alu_base(opcode, dst, src0, src1, src2, true);
+    }
+
+    /// Shared encoder for uniform ALU instructions.  Only register and
+    /// 32-bit immediate sources are accepted; other source kinds panic.
+    fn
encode_ualu(
+        &mut self,
+        opcode: u16,
+        dst: Option<&Dst>,
+        src0: Option<&Src>,
+        src1: Option<&Src>,
+        src2: Option<&Src>,
+    ) {
+        // The destination, when present, is written with set_udst.
+        if let Some(dst) = dst {
+            self.set_udst(*dst);
+        }
+
+        let src0 = ALUSrc::from_src(src0, true);
+        let src1 = ALUSrc::from_src(src1, true);
+        let src2 = ALUSrc::from_src(src2, true);
+
+        // All uniform ALU requires bit 91 set
+        self.set_bit(91, true);
+
+        self.encode_alu_src0(&src0, RegFile::UGPR, false);
+        // Same form numbering as the register (1) and immediate (4, 2) cases
+        // of encode_alu_base; UReg and CBuf sources are rejected here.
+        let form = match &src2 {
+            ALUSrc::None | ALUSrc::Reg(_) => {
+                self.encode_alu_src2(&src2, RegFile::UGPR, false, true);
+                match &src1 {
+                    ALUSrc::None => 1_u8, // form
+                    ALUSrc::Reg(reg1) => {
+                        self.encode_alu_ureg(reg1, false);
+                        1_u8 // form
+                    }
+                    ALUSrc::UReg(_) => panic!("UALU never has UReg"),
+                    ALUSrc::Imm32(imm1) => {
+                        self.encode_alu_imm(imm1);
+                        4_u8 // form
+                    }
+                    ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
+                }
+            }
+            ALUSrc::UReg(_) => panic!("UALU never has UReg"),
+            ALUSrc::Imm32(imm2) => {
+                self.encode_alu_imm(imm2);
+                self.encode_alu_src2(&src1, RegFile::UGPR, false, true);
+                2_u8 // form
+            }
+            ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
+        };
+
+        self.set_field(0..9, opcode);
+        self.set_field(9..12, form);
+    }
+
+    /// Writes the scheduling/dependency fields into bits 105..126.
+    ///
+    /// A missing write or read barrier is encoded as 7.
+    fn set_instr_deps(&mut self, deps: &InstrDeps) {
+        self.set_field(105..109, deps.delay);
+        self.set_bit(109, deps.yld);
+        self.set_field(110..113, deps.wr_bar().unwrap_or(7));
+        self.set_field(113..116, deps.rd_bar().unwrap_or(7));
+        self.set_field(116..122, deps.wt_bar_mask);
+        self.set_field(122..126, deps.reuse_mask);
+    }
+
+    /// Writes a float rounding mode into a 2-bit field.
+    fn set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode) {
+        assert!(range.len() == 2);
+        self.set_field(
+            range,
+            match rnd_mode {
+                FRndMode::NearestEven => 0_u8,
+                FRndMode::NegInf => 1_u8,
+                FRndMode::PosInf => 2_u8,
+                FRndMode::Zero => 3_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpFAdd` with ALU opcode 0x021.
+    ///
+    /// When `srcs[1]` is zero or a GPR it is passed as src1; otherwise it is
+    /// passed as src2 with a zero src1.
+    fn encode_fadd(&mut self, op: &OpFAdd) {
+        if src_is_zero_or_gpr(&op.srcs[1]) {
+            self.encode_alu(
+                0x021,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+        }
else {
+            self.encode_alu(
+                0x021,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&Src::new_zero()),
+                Some(&op.srcs[1]),
+            );
+        }
+        self.set_bit(77, op.saturate);
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, op.ftz);
+    }
+
+    /// Encodes `OpFFma` with ALU opcode 0x023 and its dnz, saturate,
+    /// rounding-mode and ftz modifiers.
+    fn encode_ffma(&mut self, op: &OpFFma) {
+        self.encode_alu(
+            0x023,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            Some(&op.srcs[2]),
+        );
+        self.set_bit(76, op.dnz);
+        self.set_bit(77, op.saturate);
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, op.ftz);
+    }
+
+    /// Encodes `OpFMnMx` with ALU opcode 0x009; `op.min` is written as the
+    /// predicate source at 87..90 (bit 90 is passed alongside it).
+    fn encode_fmnmx(&mut self, op: &OpFMnMx) {
+        self.encode_alu(
+            0x009,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            Some(&Src::new_zero()),
+        );
+        self.set_pred_src(87..90, 90, op.min);
+        self.set_bit(80, op.ftz);
+    }
+
+    /// Encodes `OpFMul` with ALU opcode 0x020; src2 is always a zero source.
+    fn encode_fmul(&mut self, op: &OpFMul) {
+        self.encode_alu(
+            0x020,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            Some(&Src::new_zero()),
+        );
+        self.set_bit(76, op.dnz);
+        self.set_bit(77, op.saturate);
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, op.ftz);
+        self.set_field(84..87, 0x4_u8) // TODO: PDIV
+    }
+
+    /// Writes a float comparison operator into a 4-bit field.
+    fn set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp) {
+        assert!(range.len() == 4);
+        self.set_field(
+            range,
+            match op {
+                FloatCmpOp::OrdLt => 0x01_u8,
+                FloatCmpOp::OrdEq => 0x02_u8,
+                FloatCmpOp::OrdLe => 0x03_u8,
+                FloatCmpOp::OrdGt => 0x04_u8,
+                FloatCmpOp::OrdNe => 0x05_u8,
+                FloatCmpOp::OrdGe => 0x06_u8,
+                FloatCmpOp::UnordLt => 0x09_u8,
+                FloatCmpOp::UnordEq => 0x0a_u8,
+                FloatCmpOp::UnordLe => 0x0b_u8,
+                FloatCmpOp::UnordGt => 0x0c_u8,
+                FloatCmpOp::UnordNe => 0x0d_u8,
+                FloatCmpOp::UnordGe => 0x0e_u8,
+                FloatCmpOp::IsNum => 0x07_u8,
+                FloatCmpOp::IsNan => 0x08_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpFSet` with ALU opcode 0x00a.  Field 87..90 is hard-coded
+    /// to 7 because the source predicate is not wired up yet.
+    fn encode_fset(&mut self, op: &OpFSet) {
+        self.encode_alu(
+            0x00a,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            None,
+        );
+        self.set_float_cmp_op(76..80, op.cmp_op);
+        self.set_bit(80, op.ftz);
+        self.set_field(87..90, 0x7_u8); // TODO: src predicate
+    }
+
+    /// Writes a predicate combine operator (and/or/xor) into a 2-bit field.
+    fn set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp) {
+        assert!(range.len() == 2);
+        self.set_field(
+            range,
+            match op {
+                PredSetOp::And => 0_u8,
+                PredSetOp::Or => 1_u8,
+                PredSetOp::Xor => 2_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpFSetP` with ALU opcode 0x00b.  The second predicate
+    /// destination (84..87) is always `Dst::None`.
+    fn encode_fsetp(&mut self, op: &OpFSetP) {
+        self.encode_alu(
+            0x00b,
+            None,
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            None,
+        );
+
+        self.set_pred_set_op(74..76, op.set_op);
+        self.set_float_cmp_op(76..80, op.cmp_op);
+        self.set_bit(80, op.ftz);
+
+        self.set_pred_dst(81..84, op.dst);
+        self.set_pred_dst(84..87, Dst::None); // dst1
+
+        self.set_pred_src(87..90, 90, op.accum);
+    }
+
+    /// Encodes `OpFSwzAdd` (opcode 0x822) with both sources as plain
+    /// registers and the per-entry ops packed into field 32..40.
+    fn encode_fswzadd(&mut self, op: &OpFSwzAdd) {
+        self.set_opcode(0x822);
+        self.set_dst(op.dst);
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(64..72, op.srcs[1]);
+
+        let mut subop = 0x0_u8;
+
+        // Each op takes two bits; ops[0] gets the largest shift, so it ends
+        // up in the most significant used bits of subop.
+        for (i, swz_op) in op.ops.iter().enumerate() {
+            let swz_op = match swz_op {
+                FSwzAddOp::Add => 0,
+                FSwzAddOp::SubRight => 2,
+                FSwzAddOp::SubLeft => 1,
+                FSwzAddOp::MoveLeft => 3,
+            };
+
+            subop |= swz_op << ((op.ops.len() - i - 1) * 2);
+        }
+
+        self.set_field(32..40, subop);
+
+        self.set_bit(77, false); // NDV
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, op.ftz);
+    }
+
+    /// Encodes `OpMuFu` with ALU opcode 0x108; the function selector goes
+    /// into field 74..80.
+    fn encode_mufu(&mut self, op: &OpMuFu) {
+        self.encode_alu(0x108, Some(&op.dst), None, Some(&op.src), None);
+        self.set_field(
+            74..80,
+            match op.op {
+                MuFuOp::Cos => 0_u8,
+                MuFuOp::Sin => 1_u8,
+                MuFuOp::Exp2 => 2_u8,
+                MuFuOp::Log2 => 3_u8,
+                MuFuOp::Rcp => 4_u8,
+                MuFuOp::Rsq => 5_u8,
+                MuFuOp::Rcp64H => 6_u8,
+                MuFuOp::Rsq64H => 7_u8,
+                MuFuOp::Sqrt => 8_u8,
+                MuFuOp::Tanh => 9_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpDAdd` with ALU opcode 0x029; `srcs[1]` is passed in the
+    /// src2 position and src1 is left empty.
+    fn encode_dadd(&mut self, op: &OpDAdd) {
+        self.encode_alu(
+            0x029,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            None,
+            Some(&op.srcs[1]),
+        );
+        self.set_rnd_mode(78..80, op.rnd_mode);
+    }
+
+    /// Encodes `OpDFma` with ALU opcode 0x02b and its rounding mode.
+    fn encode_dfma(&mut self, op: &OpDFma) {
+        self.encode_alu(
+            0x02b,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            Some(&op.srcs[2]),
+        );
+
self.set_rnd_mode(78..80, op.rnd_mode);
+    }
+
+    /// Encodes `OpDMul` with ALU opcode 0x028 and its rounding mode.
+    fn encode_dmul(&mut self, op: &OpDMul) {
+        self.encode_alu(
+            0x028,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            None,
+        );
+        self.set_rnd_mode(78..80, op.rnd_mode);
+    }
+
+    /// Encodes `OpDSetP` with ALU opcode 0x02a.
+    ///
+    /// A zero/GPR `srcs[1]` is passed as src1; any other source is passed in
+    /// the src2 position instead.
+    fn encode_dsetp(&mut self, op: &OpDSetP) {
+        if src_is_zero_or_gpr(&op.srcs[1]) {
+            self.encode_alu(
+                0x02a,
+                None,
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+        } else {
+            self.encode_alu(
+                0x02a,
+                None,
+                Some(&op.srcs[0]),
+                None,
+                Some(&op.srcs[1]),
+            );
+        }
+
+        self.set_pred_set_op(74..76, op.set_op);
+        self.set_float_cmp_op(76..80, op.cmp_op);
+
+        self.set_pred_dst(81..84, op.dst);
+        self.set_pred_dst(84..87, Dst::None); /* dst1 */
+
+        self.set_pred_src(87..90, 90, op.accum);
+    }
+
+    /// Encodes `OpHAdd2` with fp16 ALU opcode 0x030.
+    ///
+    /// A zero/GPR `srcs[1]` is passed as src1; any other source is passed in
+    /// the src2 position instead.
+    fn encode_hadd2(&mut self, op: &OpHAdd2) {
+        if src_is_zero_or_gpr(&op.srcs[1]) {
+            self.encode_fp16_alu(
+                0x030,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+        } else {
+            self.encode_fp16_alu(
+                0x030,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                None,
+                Some(&op.srcs[1]),
+            );
+        }
+
+        self.set_bit(77, op.saturate);
+        self.set_bit(78, op.f32);
+        self.set_bit(80, op.ftz);
+        self.set_bit(85, false); // .BF16_V2 (SM90+)
+    }
+
+    /// Encodes `OpHFma2` with fp16 ALU opcode 0x031.  Panics if `srcs[2]`
+    /// carries a source modifier.
+    fn encode_hfma2(&mut self, op: &OpHFma2) {
+        // HFMA2 doesn't have fneg and fabs on SRC2.
+        assert!(op.srcs[2].src_mod.is_none());
+
+        self.encode_fp16_alu(
+            0x031,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            Some(&op.srcs[2]),
+        );
+
+        self.set_bit(76, op.dnz);
+        self.set_bit(77, op.saturate);
+        self.set_bit(78, op.f32);
+        self.set_bit(79, false); // .RELU (SM86+)
+        self.set_bit(80, op.ftz);
+        self.set_bit(85, false); // .BF16_V2 (SM86+)
+    }
+
+    /// Encodes `OpHMul2` with fp16 ALU opcode 0x032.  The .F32, .RELU and
+    /// .BF16_V2 bits are always cleared.
+    fn encode_hmul2(&mut self, op: &OpHMul2) {
+        self.encode_fp16_alu(
+            0x032,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            None,
+        );
+
+        self.set_bit(76, op.dnz);
+        self.set_bit(77, op.saturate);
+        self.set_bit(78, false); // .F32 (SM70-SM75)
+        self.set_bit(79, false); // .RELU (SM86+)
+        self.set_bit(80, op.ftz);
+        self.set_bit(85, false); // .BF16_V2 (SM90+)
+    }
+
+    /// Encodes `OpHSet2` with fp16 ALU opcode 0x033.
+    ///
+    /// A zero/GPR `srcs[1]` is passed as src1; any other source is passed in
+    /// the src2 position instead.  The .BF bit (71) is always set.
+    fn encode_hset2(&mut self, op: &OpHSet2) {
+        if src_is_zero_or_gpr(&op.srcs[1]) {
+            self.encode_fp16_alu(
+                0x033,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+        } else {
+            self.encode_fp16_alu(
+                0x033,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                None,
+                Some(&op.srcs[1]),
+            );
+        }
+
+        self.set_bit(65, false); // .BF16_V2 (SM90+)
+        self.set_pred_set_op(69..71, op.set_op);
+
+        // This differentiates between integer and fp16 output
+        self.set_bit(71, true); // .BF
+        self.set_float_cmp_op(76..80, op.cmp_op);
+        self.set_bit(80, op.ftz);
+
+        self.set_pred_src(87..90, 90, op.accum);
+    }
+
+    /// Encodes `OpHSetP2` with fp16 ALU opcode 0x034 and two predicate
+    /// destinations.
+    ///
+    /// A zero/GPR `srcs[1]` is passed as src1; any other source is passed in
+    /// the src2 position instead.
+    fn encode_hsetp2(&mut self, op: &OpHSetP2) {
+        if src_is_zero_or_gpr(&op.srcs[1]) {
+            self.encode_fp16_alu(
+                0x034,
+                None,
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+        } else {
+            self.encode_fp16_alu(
+                0x034,
+                None,
+                Some(&op.srcs[0]),
+                None,
+                Some(&op.srcs[1]),
+            );
+        }
+
+        self.set_bit(65, false); // .BF16_V2 (SM90+)
+        self.set_pred_set_op(69..71, op.set_op);
+        self.set_bit(71, op.horizontal); // .H_AND
+        self.set_float_cmp_op(76..80, op.cmp_op);
+        self.set_bit(80, op.ftz);
+
+        self.set_pred_dst(81..84, op.dsts[0]);
+        self.set_pred_dst(84..87, op.dsts[1]);
+
+        self.set_pred_src(87..90, 90,
op.accum);
+    }
+
+    /// Encodes `OpHMnMx2` with fp16 ALU opcode 0x040.  Asserts `sm >= 80`.
+    fn encode_hmnmx2(&mut self, op: &OpHMnMx2) {
+        assert!(self.sm >= 80);
+
+        self.encode_fp16_alu(
+            0x040,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            None,
+        );
+
+        // This differentiates between integer and fp16 output
+        // NOTE(review): the sentence above is identical to the one in
+        // encode_hset2, but the bit cleared next is labelled .F32 -- confirm
+        // that it really describes this bit.
+        self.set_bit(78, false); // .F32 (SM86)
+        self.set_bit(80, op.ftz);
+        self.set_bit(81, false); // .NAN
+        self.set_bit(82, false); // .XORSIGN
+        self.set_bit(85, false); // .BF16_V2
+
+        self.set_pred_src(87..90, 90, op.min);
+    }
+
+    /// Encodes `OpBMsk`: uniform ALU opcode 0x09b or regular ALU opcode
+    /// 0x01b, with `op.wrap` in bit 75.
+    fn encode_bmsk(&mut self, op: &OpBMsk) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x09b,
+                Some(&op.dst),
+                Some(&op.pos),
+                Some(&op.width),
+                None,
+            );
+        } else {
+            self.encode_alu(
+                0x01b,
+                Some(&op.dst),
+                Some(&op.pos),
+                Some(&op.width),
+                None,
+            );
+        }
+
+        self.set_bit(75, op.wrap);
+    }
+
+    /// Encodes `OpBRev`: uniform ALU opcode 0x0be or regular ALU opcode
+    /// 0x101.
+    fn encode_brev(&mut self, op: &OpBRev) {
+        if op.is_uniform() {
+            self.encode_ualu(0x0be, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x101, Some(&op.dst), None, Some(&op.src), None);
+        }
+    }
+
+    /// Encodes `OpFlo`: uniform ALU opcode 0x0bd or regular ALU opcode
+    /// 0x100.  Bit 63 is set when the source carries `SrcMod::BNot`.
+    fn encode_flo(&mut self, op: &OpFlo) {
+        if op.is_uniform() {
+            self.encode_ualu(0x0bd, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x100, Some(&op.dst), None, Some(&op.src), None);
+        }
+        self.set_pred_dst(81..84, Dst::None);
+        self.set_field(74..75, op.return_shift_amount as u8);
+        self.set_field(73..74, op.signed as u8);
+        let not_mod = matches!(op.src.src_mod, SrcMod::BNot);
+        self.set_field(63..64, not_mod)
+    }
+
+    /// Encodes `OpIAbs` with ALU opcode 0x013.
+    fn encode_iabs(&mut self, op: &OpIAbs) {
+        self.encode_alu(0x013, Some(&op.dst), None, Some(&op.src), None);
+    }
+
+    /// Encodes `OpIAdd3`: uniform ALU opcode 0x090 or regular ALU opcode
+    /// 0x010.  Both predicate source slots are written with `false`.
+    fn encode_iadd3(&mut self, op: &OpIAdd3) {
+        // Hardware requires at least one of these be unmodified
+        assert!(op.srcs[0].src_mod.is_none() || op.srcs[1].src_mod.is_none());
+
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x090,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+        } else {
+            self.encode_alu(
+                0x010,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+        }
+
+        self.set_pred_src(87..90, 90, false.into());
+        self.set_pred_src(77..80, 80, false.into());
+
+        self.set_pred_dst(81..84, op.overflow[0]);
+        self.set_pred_dst(84..87, op.overflow[1]);
+    }
+
+    /// Encodes `OpIAdd3X`.  It uses the same opcodes as `encode_iadd3`
+    /// (0x090 uniform, 0x010 regular), but writes `op.carry` into the
+    /// predicate source slots and sets the .X bit (74).
+    fn encode_iadd3x(&mut self, op: &OpIAdd3X) {
+        // Hardware requires at least one of these be unmodified
+        assert!(op.srcs[0].src_mod.is_none() || op.srcs[1].src_mod.is_none());
+
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x090,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+
+            self.set_upred_src(87..90, 90, op.carry[0]);
+            self.set_upred_src(77..80, 80, op.carry[1]);
+        } else {
+            self.encode_alu(
+                0x010,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+
+            self.set_pred_src(87..90, 90, op.carry[0]);
+            self.set_pred_src(77..80, 80, op.carry[1]);
+        }
+
+        self.set_bit(74, true); // .X
+
+        self.set_pred_dst(81..84, op.overflow[0]);
+        self.set_pred_dst(84..87, op.overflow[1]);
+    }
+
+    /// Encodes `OpIDp4` with ALU opcode 0x026.
+    ///
+    /// Bits 73 and 74 are set when `src_types[0]` / `src_types[1]` are `I8`
+    /// and cleared for `U8`; any other type panics.
+    fn encode_idp4(&mut self, op: &OpIDp4) {
+        self.encode_alu(
+            0x026,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            Some(&op.srcs[2]),
+        );
+
+        self.set_bit(
+            73,
+            match op.src_types[0] {
+                IntType::U8 => false,
+                IntType::I8 => true,
+                _ => panic!("Invalid DP4 source type"),
+            },
+        );
+        self.set_bit(
+            74,
+            match op.src_types[1] {
+                IntType::U8 => false,
+                IntType::I8 => true,
+                _ => panic!("Invalid DP4 source type"),
+            },
+        );
+    }
+
+    /// Encodes `OpIMad`: uniform ALU opcode 0x0a4 or regular ALU opcode
+    /// 0x024, with `op.signed` in bit 73.
+    fn encode_imad(&mut self, op: &OpIMad) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x0a4,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+        } else {
+            self.encode_alu(
+                0x024,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+        }
+        self.set_pred_dst(81..84, Dst::None);
+        self.set_bit(73, op.signed);
+    }
+
+    /// Encodes `OpIMad64`: uniform ALU opcode 0x0a5 or regular ALU opcode
+    /// 0x025, with `op.signed` in bit 73.
+    fn encode_imad64(&mut self, op: &OpIMad64) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x0a5,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+
Some(&op.srcs[2]),
+            );
+        } else {
+            self.encode_alu(
+                0x025,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+        }
+        self.set_pred_dst(81..84, Dst::None);
+        self.set_bit(73, op.signed);
+    }
+
+    /// Encodes `OpIMnMx` with ALU opcode 0x017; bit 73 is set for `I32`
+    /// comparisons and cleared for `U32`.
+    fn encode_imnmx(&mut self, op: &OpIMnMx) {
+        self.encode_alu(
+            0x017,
+            Some(&op.dst),
+            Some(&op.srcs[0]),
+            Some(&op.srcs[1]),
+            None,
+        );
+        self.set_pred_src(87..90, 90, op.min);
+        self.set_bit(
+            73,
+            match op.cmp_type {
+                IntCmpType::U32 => false,
+                IntCmpType::I32 => true,
+            },
+        );
+    }
+
+    /// Writes an integer comparison operator into a 3-bit field.
+    fn set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp) {
+        assert!(range.len() == 3);
+        self.set_field(
+            range,
+            match op {
+                IntCmpOp::Eq => 2_u8,
+                IntCmpOp::Ne => 5_u8,
+                IntCmpOp::Lt => 1_u8,
+                IntCmpOp::Le => 3_u8,
+                IntCmpOp::Gt => 4_u8,
+                IntCmpOp::Ge => 6_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpISetP`: uniform ALU opcode 0x08c or regular ALU opcode
+    /// 0x00c.  The uniform form writes its predicate sources with
+    /// `set_upred_src`, the regular form with `set_pred_src`.
+    fn encode_isetp(&mut self, op: &OpISetP) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x08c,
+                None,
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+
+            self.set_upred_src(68..71, 71, op.low_cmp);
+            self.set_upred_src(87..90, 90, op.accum);
+        } else {
+            self.encode_alu(
+                0x00c,
+                None,
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+
+            self.set_pred_src(68..71, 71, op.low_cmp);
+            self.set_pred_src(87..90, 90, op.accum);
+        }
+
+        self.set_bit(72, op.ex);
+
+        self.set_field(
+            73..74,
+            match op.cmp_type {
+                IntCmpType::U32 => 0_u32,
+                IntCmpType::I32 => 1_u32,
+            },
+        );
+        self.set_pred_set_op(74..76, op.set_op);
+        self.set_int_cmp_op(76..79, op.cmp_op);
+
+        self.set_pred_dst(81..84, op.dst);
+        self.set_pred_dst(84..87, Dst::None); // dst1
+    }
+
+    /// Encodes `OpLop3`: uniform ALU opcode 0x092 or regular ALU opcode
+    /// 0x012, with the lookup table in field 72..80.
+    fn encode_lop3(&mut self, op: &OpLop3) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x092,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+
+            self.set_upred_src(87..90, 90, SrcRef::False.into());
+        } else {
+            self.encode_alu(
+                0x012,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                Some(&op.srcs[2]),
+            );
+
+            self.set_pred_src(87..90, 90,
SrcRef::False.into());
+        }
+
+        self.set_field(72..80, op.op.lut);
+        self.set_bit(80, false); // .PAND
+        self.set_field(81..84, 7_u32); // pred
+    }
+
+    /// Encodes `OpPopC`: uniform ALU opcode 0x0bf or regular ALU opcode
+    /// 0x109.  Bit 63 is set when the source carries `SrcMod::BNot`.
+    fn encode_popc(&mut self, op: &OpPopC) {
+        if op.is_uniform() {
+            self.encode_ualu(0x0bf, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x109, Some(&op.dst), None, Some(&op.src), None);
+        }
+
+        let not_mod = matches!(op.src.src_mod, SrcMod::BNot);
+        self.set_field(63..64, not_mod)
+    }
+
+    /// Encodes `OpShf`: uniform ALU opcode 0x099 or regular ALU opcode
+    /// 0x019, with `low`, `shift` and `high` as src0, src1 and src2.
+    ///
+    /// Panics on data types other than I64/U64/I32/U32.
+    fn encode_shf(&mut self, op: &OpShf) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x099,
+                Some(&op.dst),
+                Some(&op.low),
+                Some(&op.shift),
+                Some(&op.high),
+            );
+        } else {
+            self.encode_alu(
+                0x019,
+                Some(&op.dst),
+                Some(&op.low),
+                Some(&op.shift),
+                Some(&op.high),
+            );
+        }
+
+        self.set_field(
+            73..75,
+            match op.data_type {
+                IntType::I64 => 0_u8,
+                IntType::U64 => 1_u8,
+                IntType::I32 => 2_u8,
+                IntType::U32 => 3_u8,
+                _ => panic!("Invalid shift data type"),
+            },
+        );
+        self.set_bit(75, op.wrap);
+        self.set_bit(76, op.right);
+        self.set_bit(80, op.dst_high);
+    }
+
+    /// Encodes `OpF2F`.  ALU opcode 0x104 is used when both types are at
+    /// most 32 bits wide, 0x110 otherwise.  Asserts `!op.integer_rnd`.
+    fn encode_f2f(&mut self, op: &OpF2F) {
+        assert!(!op.integer_rnd);
+        if op.src_type.bits() <= 32 && op.dst_type.bits() <= 32 {
+            self.encode_alu(0x104, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x110, Some(&op.dst), None, Some(&op.src), None);
+        }
+
+        if op.high {
+            self.set_field(60..62, 1_u8); // .H1
+        }
+
+        // Type sizes are encoded as log2 of the size in bytes.
+        self.set_field(75..77, (op.dst_type.bits() / 8).ilog2());
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, op.ftz);
+        self.set_field(84..86, (op.src_type.bits() / 8).ilog2());
+    }
+
+    /// Encodes `OpF2I`.  ALU opcode 0x105 is used when both types are at
+    /// most 32 bits wide, 0x111 otherwise.
+    fn encode_f2i(&mut self, op: &OpF2I) {
+        if op.src_type.bits() <= 32 && op.dst_type.bits() <= 32 {
+            self.encode_alu(0x105, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x111, Some(&op.dst), None, Some(&op.src), None);
+        }
+
+        self.set_bit(72, op.dst_type.is_signed());
+        self.set_field(75..77, (op.dst_type.bits() / 8).ilog2());
+        self.set_bit(77, false); // NTZ
+
self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, op.ftz);
+        self.set_field(84..86, (op.src_type.bits() / 8).ilog2());
+    }
+
+    /// Encodes `OpI2F`.  ALU opcode 0x106 is used when both types are at
+    /// most 32 bits wide, 0x112 otherwise.
+    fn encode_i2f(&mut self, op: &OpI2F) {
+        if op.src_type.bits() <= 32 && op.dst_type.bits() <= 32 {
+            self.encode_alu(0x106, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x112, Some(&op.dst), None, Some(&op.src), None);
+        }
+
+        self.set_field(60..62, 0_u8); // TODO: subop
+        self.set_bit(74, op.src_type.is_signed());
+        // Type sizes are encoded as log2 of the size in bytes.
+        self.set_field(75..77, (op.dst_type.bits() / 8).ilog2());
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_field(84..86, (op.src_type.bits() / 8).ilog2());
+    }
+
+    /// Encodes `OpFRnd`.  ALU opcode 0x107 is used when both types are at
+    /// most 32 bits wide, 0x113 otherwise.
+    fn encode_frnd(&mut self, op: &OpFRnd) {
+        if op.src_type.bits() <= 32 && op.dst_type.bits() <= 32 {
+            self.encode_alu(0x107, Some(&op.dst), None, Some(&op.src), None);
+        } else {
+            self.encode_alu(0x113, Some(&op.dst), None, Some(&op.src), None);
+        }
+
+        self.set_field(84..86, (op.src_type.bits() / 8).ilog2());
+        self.set_bit(80, op.ftz);
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_field(75..77, (op.dst_type.bits() / 8).ilog2());
+    }
+
+    /// Encodes `OpMov`.
+    ///
+    /// The uniform form uses opcode 0xc82 and accepts only a register or a
+    /// 32-bit immediate source (anything else panics).  The regular form
+    /// uses ALU opcode 0x002 and writes `quad_lanes` to field 72..76.
+    fn encode_mov(&mut self, op: &OpMov) {
+        if op.is_uniform() {
+            self.set_opcode(0xc82);
+            self.set_udst(op.dst);
+
+            // umov is encoded like a non-uniform ALU op
+            let src = ALUSrc::from_src(Some(&op.src), true);
+            let form: u8 = match &src {
+                ALUSrc::Reg(reg) => {
+                    self.encode_alu_ureg(reg, false);
+                    0x6 // form
+                }
+                ALUSrc::Imm32(imm) => {
+                    self.encode_alu_imm(imm);
+                    0x4 // form
+                }
+                _ => panic!("Invalid umov src"),
+            };
+            self.set_field(9..12, form);
+        } else {
+            self.encode_alu(0x002, Some(&op.dst), None, Some(&op.src), None);
+            self.set_field(72..76, op.quad_lanes);
+        }
+    }
+
+    /// Encodes `OpPrmt`: uniform ALU opcode 0x96 or regular ALU opcode 0x16,
+    /// with `op.sel` passed as src1 and `srcs[1]` as src2.
+    fn encode_prmt(&mut self, op: &OpPrmt) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x96,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.sel),
+                Some(&op.srcs[1]),
+            );
+        } else {
+            self.encode_alu(
+                0x16,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.sel),
+                Some(&op.srcs[1]),
+            );
+        }
+
+
self.set_field(
+            72..75,
+            match op.mode {
+                PrmtMode::Index => 0_u8,
+                PrmtMode::Forward4Extract => 1_u8,
+                PrmtMode::Backward4Extract => 2_u8,
+                PrmtMode::Replicate8 => 3_u8,
+                PrmtMode::EdgeClampLeft => 4_u8,
+                PrmtMode::EdgeClampRight => 5_u8,
+                PrmtMode::Replicate16 => 6_u8,
+            },
+        )
+    }
+
+    /// Encodes `OpSel`: uniform ALU opcode 0x087 or regular ALU opcode
+    /// 0x007, with `op.cond` as the predicate source at 87..90.
+    fn encode_sel(&mut self, op: &OpSel) {
+        if op.is_uniform() {
+            self.encode_ualu(
+                0x087,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+
+            self.set_upred_src(87..90, 90, op.cond);
+        } else {
+            self.encode_alu(
+                0x007,
+                Some(&op.dst),
+                Some(&op.srcs[0]),
+                Some(&op.srcs[1]),
+                None,
+            );
+
+            self.set_pred_src(87..90, 90, op.cond);
+        }
+    }
+
+    /// Encodes `OpShfl`.
+    ///
+    /// The opcode depends on whether `lane` and `c` are each a register/zero
+    /// or a 32-bit immediate; any other combination panics.  Immediates are
+    /// masked (`lane & 0x1f`, `c & 0x1f1f`) before being written.
+    fn encode_shfl(&mut self, op: &OpShfl) {
+        assert!(op.lane.src_mod.is_none());
+        assert!(op.c.src_mod.is_none());
+
+        match &op.lane.src_ref {
+            SrcRef::Zero | SrcRef::Reg(_) => match &op.c.src_ref {
+                SrcRef::Zero | SrcRef::Reg(_) => {
+                    self.set_opcode(0x389);
+                    self.set_reg_src(32..40, op.lane);
+                    self.set_reg_src(64..72, op.c);
+                }
+                SrcRef::Imm32(imm_c) => {
+                    self.set_opcode(0x589);
+                    self.set_reg_src(32..40, op.lane);
+                    self.set_field(40..53, *imm_c & 0x1f1f);
+                }
+                _ => panic!("Invalid instruction form"),
+            },
+            SrcRef::Imm32(imm_lane) => match &op.c.src_ref {
+                SrcRef::Zero | SrcRef::Reg(_) => {
+                    self.set_opcode(0x989);
+                    self.set_field(53..58, *imm_lane & 0x1f);
+                    self.set_reg_src(64..72, op.c);
+                }
+                SrcRef::Imm32(imm_c) => {
+                    self.set_opcode(0xf89);
+                    self.set_field(40..53, *imm_c & 0x1f1f);
+                    self.set_field(53..58, *imm_lane & 0x1f);
+                }
+                _ => panic!("Invalid instruction form"),
+            },
+            _ => panic!("Invalid instruction form"),
+        };
+
+        self.set_dst(op.dst);
+        self.set_pred_dst(81..84, op.in_bounds);
+        self.set_reg_src(24..32, op.src);
+        self.set_field(
+            58..60,
+            match op.op {
+                ShflOp::Idx => 0_u8,
+                ShflOp::Up => 1_u8,
+                ShflOp::Down => 2_u8,
+                ShflOp::Bfly => 3_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpPLop3`: opcode 0x89c for the uniform form, 0x81c
+    /// otherwise.  `ops[0].lut` is split across fields 64..67 and 72..77;
+    /// `ops[1].lut` goes into 16..24.
+    fn encode_plop3(&mut self, op: &OpPLop3) {
+        if op.is_uniform() {
+            self.set_opcode(0x89c);
+
+
self.set_upred_src(68..71, 71, op.srcs[2]);
+            self.set_upred_src(77..80, 80, op.srcs[1]);
+            self.set_upred_src(87..90, 90, op.srcs[0]);
+        } else {
+            self.set_opcode(0x81c);
+
+            // In the non-uniform form, srcs[2] may still be a uniform
+            // register; that case is flagged with bit 67.
+            if op.srcs[2].src_ref.as_reg().is_some_and(|r| r.is_uniform()) {
+                self.set_upred_src(68..71, 71, op.srcs[2]);
+                self.set_bit(67, true);
+            } else {
+                self.set_pred_src(68..71, 71, op.srcs[2]);
+            }
+            self.set_pred_src(77..80, 80, op.srcs[1]);
+            self.set_pred_src(87..90, 90, op.srcs[0]);
+        }
+        self.set_field(16..24, op.ops[1].lut);
+        self.set_field(64..67, op.ops[0].lut & 0x7);
+        self.set_field(72..77, op.ops[0].lut >> 3);
+
+        self.set_pred_dst(81..84, op.dsts[0]);
+        self.set_pred_dst(84..87, op.dsts[1]);
+    }
+
+    /// Writes a texture dimension into a 3-bit field.
+    fn set_tex_dim(&mut self, range: Range<usize>, dim: TexDim) {
+        assert!(range.len() == 3);
+        self.set_field(
+            range,
+            match dim {
+                TexDim::_1D => 0_u8,
+                TexDim::Array1D => 4_u8,
+                TexDim::_2D => 1_u8,
+                TexDim::Array2D => 5_u8,
+                TexDim::_3D => 2_u8,
+                TexDim::Cube => 3_u8,
+                TexDim::ArrayCube => 7_u8,
+            },
+        );
+    }
+
+    /// Writes a texture LOD mode into a 3-bit field.
+    fn set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode) {
+        assert!(range.len() == 3);
+        self.set_field(
+            range,
+            match lod_mode {
+                TexLodMode::Auto => 0_u8,
+                TexLodMode::Zero => 1_u8,
+                TexLodMode::Bias => 2_u8,
+                TexLodMode::Lod => 3_u8,
+                TexLodMode::Clamp => 4_u8,
+                TexLodMode::BiasClamp => 5_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpR2UR` (opcode 0x3c2) with a uniform destination and a
+    /// register source in field 24..32.
+    fn encode_r2ur(&mut self, op: &OpR2UR) {
+        self.set_opcode(0x3c2);
+        self.set_udst(op.dst);
+        self.set_reg_src(24..32, op.src);
+        self.set_pred_dst(81..84, Dst::None);
+    }
+
+    /// Encodes `OpTex` (opcode 0x361).  Field 64..72 receives the second
+    /// destination register, or 255 when `dsts[1]` is not a register.
+    fn encode_tex(&mut self, op: &OpTex) {
+        self.set_opcode(0x361);
+        self.set_bit(59, true); // .B
+
+        self.set_dst(op.dsts[0]);
+        if let Dst::Reg(reg) = op.dsts[1] {
+            self.set_reg(64..72, reg);
+        } else {
+            self.set_field(64..72, 255_u8);
+        }
+        self.set_pred_dst(81..84, op.fault);
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(32..40, op.srcs[1]);
+
+        self.set_tex_dim(61..64, op.dim);
+        self.set_field(72..76, op.mask);
+        self.set_bit(76, op.offset);
+
self.set_bit(77, false); // ToDo: NDV
+        self.set_bit(78, op.z_cmpr);
+        self.set_field(84..87, 1);
+        self.set_tex_lod_mode(87..90, op.lod_mode);
+        self.set_bit(90, false); // TODO: .NODEP
+    }
+
+    /// Encodes `OpTld` (opcode 0x367).  Only `TexLodMode::Zero` and
+    /// `TexLodMode::Lod` are accepted; other modes hit the assert.
+    fn encode_tld(&mut self, op: &OpTld) {
+        self.set_opcode(0x367);
+        self.set_bit(59, true); // .B
+
+        self.set_dst(op.dsts[0]);
+        // Second destination register, or 255 when there is none.
+        if let Dst::Reg(reg) = op.dsts[1] {
+            self.set_reg(64..72, reg);
+        } else {
+            self.set_field(64..72, 255_u8);
+        }
+        self.set_pred_dst(81..84, op.fault);
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(32..40, op.srcs[1]);
+
+        self.set_tex_dim(61..64, op.dim);
+        self.set_field(72..76, op.mask);
+        self.set_bit(76, op.offset);
+        // bit 77: .CL
+        self.set_bit(78, op.is_ms);
+        // bits 79..81: .F16
+        assert!(
+            op.lod_mode == TexLodMode::Zero || op.lod_mode == TexLodMode::Lod
+        );
+        self.set_tex_lod_mode(87..90, op.lod_mode);
+        self.set_bit(90, false); // TODO: .NODEP
+    }
+
+    /// Encodes `OpTld4` (opcode 0x364) with its offset mode in field 76..78
+    /// and the gathered component in field 87..89.
+    fn encode_tld4(&mut self, op: &OpTld4) {
+        self.set_opcode(0x364);
+        self.set_bit(59, true); // .B
+
+        self.set_dst(op.dsts[0]);
+        // Second destination register, or 255 when there is none.
+        if let Dst::Reg(reg) = op.dsts[1] {
+            self.set_reg(64..72, reg);
+        } else {
+            self.set_field(64..72, 255_u8);
+        }
+        self.set_pred_dst(81..84, op.fault);
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(32..40, op.srcs[1]);
+
+        self.set_tex_dim(61..64, op.dim);
+        self.set_field(72..76, op.mask);
+        self.set_field(
+            76..78,
+            match op.offset_mode {
+                Tld4OffsetMode::None => 0_u8,
+                Tld4OffsetMode::AddOffI => 1_u8,
+                Tld4OffsetMode::PerPx => 2_u8,
+            },
+        );
+        // NOTE(review): bit 77 is already part of the 76..78 offset-mode
+        // field written above -- confirm whether the .CL note below still
+        // applies to this instruction.
+        // bit 77: .CL
+        self.set_bit(78, op.z_cmpr);
+        self.set_bit(84, true); // !.EF
+        self.set_field(87..89, op.comp);
+        self.set_bit(90, false); // TODO: .NODEP
+    }
+
+    /// Encodes `OpTmml` (opcode 0x36a).  Unlike the other texture encoders
+    /// visible here, no fault predicate destination is written.
+    fn encode_tmml(&mut self, op: &OpTmml) {
+        self.set_opcode(0x36a);
+        self.set_bit(59, true); // .B
+
+        self.set_dst(op.dsts[0]);
+        // Second destination register, or 255 when there is none.
+        if let Dst::Reg(reg) = op.dsts[1] {
+            self.set_reg(64..72, reg);
+        } else {
+            self.set_field(64..72, 255_u8);
+        }
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(32..40,
op.srcs[1]);
+
+        self.set_tex_dim(61..64, op.dim);
+        self.set_field(72..76, op.mask);
+        self.set_bit(77, false); // ToDo: NDV
+        self.set_bit(90, false); // TODO: .NODEP
+    }
+
+    /// Encodes `OpTxd` (opcode 0x36d).
+    fn encode_txd(&mut self, op: &OpTxd) {
+        self.set_opcode(0x36d);
+        self.set_bit(59, true); // .B
+
+        self.set_dst(op.dsts[0]);
+        // Second destination register, or 255 when there is none.
+        if let Dst::Reg(reg) = op.dsts[1] {
+            self.set_reg(64..72, reg);
+        } else {
+            self.set_field(64..72, 255_u8);
+        }
+        self.set_pred_dst(81..84, op.fault);
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(32..40, op.srcs[1]);
+
+        self.set_tex_dim(61..64, op.dim);
+        self.set_field(72..76, op.mask);
+        self.set_bit(76, op.offset);
+        self.set_bit(77, false); // ToDo: NDV
+        self.set_bit(90, false); // TODO: .NODEP
+    }
+
+    /// Encodes `OpTxq` (opcode 0x370) with the query kind in field 62..64.
+    fn encode_txq(&mut self, op: &OpTxq) {
+        self.set_opcode(0x370);
+        self.set_bit(59, true); // .B
+
+        self.set_dst(op.dsts[0]);
+        // Second destination register, or 255 when there is none.
+        if let Dst::Reg(reg) = op.dsts[1] {
+            self.set_reg(64..72, reg);
+        } else {
+            self.set_field(64..72, 255_u8);
+        }
+
+        self.set_reg_src(24..32, op.src);
+        self.set_field(
+            62..64,
+            match op.query {
+                TexQuery::Dimension => 0_u8,
+                TexQuery::TextureType => 1_u8,
+                TexQuery::SamplerPos => 2_u8,
+            },
+        );
+        self.set_field(72..76, op.mask);
+    }
+
+    /// Writes an image dimension into a 3-bit field.
+    fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
+        assert!(range.len() == 3);
+        self.set_field(
+            range,
+            match dim {
+                ImageDim::_1D => 0_u8,
+                ImageDim::_1DBuffer => 1_u8,
+                ImageDim::_1DArray => 2_u8,
+                ImageDim::_2D => 3_u8,
+                ImageDim::_2DArray => 4_u8,
+                ImageDim::_3D => 5_u8,
+            },
+        );
+    }
+
+    /// Writes the memory ordering/scope bits.
+    ///
+    /// For `sm < 80` the scope goes into 77..79 and the ordering into
+    /// 79..81; for newer SMs a single combined value is written to 77..81.
+    fn set_mem_order(&mut self, order: &MemOrder) {
+        if self.sm < 80 {
+            // Constant and weak accesses have no explicit scope, so one is
+            // chosen for them here.
+            let scope = match order {
+                MemOrder::Constant => MemScope::System,
+                MemOrder::Weak => MemScope::CTA,
+                MemOrder::Strong(s) => *s,
+            };
+            self.set_field(
+                77..79,
+                match scope {
+                    MemScope::CTA => 0_u8,
+                    // SM => 1_u8,
+                    MemScope::GPU => 2_u8,
+                    MemScope::System => 3_u8,
+                },
+            );
+            self.set_field(
+                79..81,
+                match order {
+                    MemOrder::Constant => 0_u8,
+                    MemOrder::Weak => 1_u8,
+
MemOrder::Strong(_) => 2_u8,
+                    // MMIO => 3_u8,
+                },
+            );
+        } else {
+            self.set_field(
+                77..81,
+                match order {
+                    MemOrder::Constant => 0x4_u8,
+                    MemOrder::Weak => 0x0_u8,
+                    MemOrder::Strong(MemScope::CTA) => 0x5_u8,
+                    MemOrder::Strong(MemScope::GPU) => 0x7_u8,
+                    MemOrder::Strong(MemScope::System) => 0xa_u8,
+                },
+            );
+        }
+    }
+
+    /// Writes the cache eviction priority into field 84..86.
+    fn set_eviction_priority(&mut self, pri: &MemEvictionPriority) {
+        self.set_field(
+            84..86,
+            match pri {
+                MemEvictionPriority::First => 0_u8,
+                MemEvictionPriority::Normal => 1_u8,
+                MemEvictionPriority::Last => 2_u8,
+                MemEvictionPriority::Unchanged => 3_u8,
+            },
+        );
+    }
+
+    /// Encodes `OpSuLd` (opcode 0x998).  The mask must be 0x1, 0x3 or 0xf.
+    fn encode_suld(&mut self, op: &OpSuLd) {
+        self.set_opcode(0x998);
+
+        self.set_dst(op.dst);
+        self.set_reg_src(24..32, op.coord);
+        self.set_reg_src(64..72, op.handle);
+        self.set_pred_dst(81..84, op.fault);
+
+        self.set_image_dim(61..64, op.image_dim);
+        self.set_mem_order(&op.mem_order);
+        self.set_eviction_priority(&op.mem_eviction_priority);
+
+        assert!(op.mask == 0x1 || op.mask == 0x3 || op.mask == 0xf);
+        self.set_field(72..76, op.mask);
+    }
+
+    /// Encodes `OpSuSt` (opcode 0x99c).  The mask must be 0x1, 0x3 or 0xf.
+    fn encode_sust(&mut self, op: &OpSuSt) {
+        self.set_opcode(0x99c);
+
+        self.set_reg_src(24..32, op.coord);
+        self.set_reg_src(32..40, op.data);
+        self.set_reg_src(64..72, op.handle);
+
+        self.set_image_dim(61..64, op.image_dim);
+        self.set_mem_order(&op.mem_order);
+        self.set_eviction_priority(&op.mem_eviction_priority);
+
+        assert!(op.mask == 0x1 || op.mask == 0x3 || op.mask == 0xf);
+        self.set_field(72..76, op.mask);
+    }
+
+    /// Encodes `OpSuAtom`: opcode 0x396 for `AtomOp::CmpExch`, 0x394 for
+    /// every other atomic op.
+    fn encode_suatom(&mut self, op: &OpSuAtom) {
+        if matches!(op.atom_op, AtomOp::CmpExch) {
+            self.set_opcode(0x396);
+        } else {
+            self.set_opcode(0x394);
+        }
+
+        self.set_dst(op.dst);
+        self.set_reg_src(24..32, op.coord);
+        self.set_reg_src(32..40, op.data);
+        self.set_reg_src(64..72, op.handle);
+        self.set_pred_dst(81..84, op.fault);
+
+        self.set_image_dim(61..64, op.image_dim);
+        self.set_mem_order(&op.mem_order);
+        self.set_eviction_priority(&op.mem_eviction_priority);
+
+        self.set_bit(72,
false); // .BA
+        self.set_atom_type(73..76, op.atom_type);
+        self.set_atom_op(87..91, op.atom_op);
+    }
+
+    /// Writes a memory access type into a 3-bit field.
+    fn set_mem_type(&mut self, range: Range<usize>, mem_type: MemType) {
+        assert!(range.len() == 3);
+        self.set_field(
+            range,
+            match mem_type {
+                MemType::U8 => 0_u8,
+                MemType::I8 => 1_u8,
+                MemType::U16 => 2_u8,
+                MemType::I16 => 3_u8,
+                MemType::B32 => 4_u8,
+                MemType::B64 => 5_u8,
+                MemType::B128 => 6_u8,
+            },
+        );
+    }
+
+    /// Writes the address-width bit (72), memory type (73..76), memory
+    /// order and eviction priority of a `MemAccess`.
+    fn set_mem_access(&mut self, access: &MemAccess) {
+        self.set_field(
+            72..73,
+            match access.space.addr_type() {
+                MemAddrType::A32 => 0_u8,
+                MemAddrType::A64 => 1_u8,
+            },
+        );
+        self.set_mem_type(73..76, access.mem_type);
+        self.set_mem_order(&access.order);
+        self.set_eviction_priority(&access.eviction_priority);
+    }
+
+    /// Encodes a global-memory load (opcode 0x381); called by `encode_ld`
+    /// for `MemSpace::Global`.
+    fn encode_ldg(&mut self, op: &OpLd) {
+        self.set_opcode(0x381);
+
+        self.set_dst(op.dst);
+        self.set_pred_dst(81..84, Dst::None);
+
+        self.set_reg_src(24..32, op.addr);
+        self.set_field(40..64, op.offset);
+
+        self.set_mem_access(&op.access);
+    }
+
+    /// Encodes a local-memory load (opcode 0x983).  Requires strong
+    /// CTA-scope ordering and normal eviction priority.
+    fn encode_ldl(&mut self, op: &OpLd) {
+        self.set_opcode(0x983);
+        self.set_field(84..87, 1_u8);
+
+        self.set_dst(op.dst);
+        self.set_reg_src(24..32, op.addr);
+        self.set_field(40..64, op.offset);
+
+        self.set_mem_type(73..76, op.access.mem_type);
+        assert!(op.access.order == MemOrder::Strong(MemScope::CTA));
+        assert!(op.access.eviction_priority == MemEvictionPriority::Normal);
+    }
+
+    /// Encodes a shared-memory load (opcode 0x984).  Requires strong
+    /// CTA-scope ordering and normal eviction priority.
+    fn encode_lds(&mut self, op: &OpLd) {
+        self.set_opcode(0x984);
+
+        self.set_dst(op.dst);
+        self.set_reg_src(24..32, op.addr);
+        self.set_field(40..64, op.offset);
+
+        self.set_mem_type(73..76, op.access.mem_type);
+        assert!(op.access.order == MemOrder::Strong(MemScope::CTA));
+        assert!(op.access.eviction_priority == MemEvictionPriority::Normal);
+
+        self.set_bit(87, false); // !.ZD - Returns a predicate?
+ } + + fn encode_ld(&mut self, op: &OpLd) { + match op.access.space { + MemSpace::Global(_) => self.encode_ldg(op), + MemSpace::Local => self.encode_ldl(op), + MemSpace::Shared => self.encode_lds(op), + } + } + + fn encode_ldc(&mut self, op: &OpLdc) { + let SrcRef::CBuf(cb) = &op.cb.src_ref else { + panic!("LDC must take a cbuf source"); + }; + + match cb.buf { + CBuf::Binding(idx) => { + if op.is_uniform() { + self.set_opcode(0xab9); + self.set_udst(op.dst); + + assert!(op.offset.is_zero()); + assert!(op.mode == LdcMode::Indexed); + } else { + self.set_opcode(0xb82); + self.set_dst(op.dst); + + self.set_reg_src(24..32, op.offset); + self.set_field( + 78..80, + match op.mode { + LdcMode::Indexed => 0_u8, + LdcMode::IndexedLinear => 1_u8, + LdcMode::IndexedSegmented => 2_u8, + LdcMode::IndexedSegmentedLinear => 3_u8, + }, + ); + } + self.set_field(54..59, idx); + self.set_bit(91, false); // Bound + } + CBuf::BindlessUGPR(handle) => { + if op.is_uniform() { + self.set_opcode(0xab9); + self.set_udst(op.dst); + + assert!(op.offset.is_zero()); + } else { + self.set_opcode(0x582); + self.set_dst(op.dst); + + self.set_reg_src(64..72, op.offset); + } + + self.set_ureg(24..32, handle); + self.set_reg_src(64..72, op.offset); + assert!(op.mode == LdcMode::Indexed); + self.set_bit(91, true); // Bindless + } + CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"), + } + + self.set_field(38..54, cb.offset); + self.set_mem_type(73..76, op.mem_type); + } + + fn encode_stg(&mut self, op: &OpSt) { + self.set_opcode(0x386); + + self.set_reg_src(24..32, op.addr); + self.set_reg_src(32..40, op.data); + self.set_field(40..64, op.offset); + + self.set_mem_access(&op.access); + } + + fn encode_stl(&mut self, op: &OpSt) { + self.set_opcode(0x387); + self.set_field(84..87, 1_u8); + + self.set_reg_src(24..32, op.addr); + self.set_reg_src(32..40, op.data); + self.set_field(40..64, op.offset); + + self.set_mem_type(73..76, op.access.mem_type); + assert!(op.access.order == 
MemOrder::Strong(MemScope::CTA)); + assert!(op.access.eviction_priority == MemEvictionPriority::Normal); + } + + fn encode_sts(&mut self, op: &OpSt) { + self.set_opcode(0x388); + + self.set_reg_src(24..32, op.addr); + self.set_reg_src(32..40, op.data); + self.set_field(40..64, op.offset); + + self.set_mem_type(73..76, op.access.mem_type); + assert!(op.access.order == MemOrder::Strong(MemScope::CTA)); + assert!(op.access.eviction_priority == MemEvictionPriority::Normal); + } + + fn encode_st(&mut self, op: &OpSt) { + match op.access.space { + MemSpace::Global(_) => self.encode_stg(op), + MemSpace::Local => self.encode_stl(op), + MemSpace::Shared => self.encode_sts(op), + } + } + + fn set_atom_op(&mut self, range: Range, atom_op: AtomOp) { + assert!(range.len() == 4); + self.set_field( + range, + match atom_op { + AtomOp::Add | AtomOp::CmpExch => 0_u8, + AtomOp::Min => 1_u8, + AtomOp::Max => 2_u8, + AtomOp::Inc => 3_u8, + AtomOp::Dec => 4_u8, + AtomOp::And => 5_u8, + AtomOp::Or => 6_u8, + AtomOp::Xor => 7_u8, + AtomOp::Exch => 8_u8, + }, + ); + } + + fn set_atom_type(&mut self, range: Range, atom_type: AtomType) { + assert!(range.len() == 3); + self.set_field( + range, + match atom_type { + AtomType::U32 => 0_u8, + AtomType::I32 => 1_u8, + AtomType::U64 => 2_u8, + AtomType::F32 => 3_u8, + AtomType::F16x2 => 4_u8, + AtomType::I64 => 5_u8, + AtomType::F64 => 6_u8, + }, + ); + } + + fn encode_atomg(&mut self, op: &OpAtom) { + if op.atom_op == AtomOp::CmpExch { + self.set_opcode(0x3a9); + + self.set_reg_src(32..40, op.cmpr); + self.set_reg_src(64..72, op.data); + } else { + self.set_opcode(0x3a8); + + self.set_reg_src(32..40, op.data); + + self.set_atom_op(87..91, op.atom_op); + } + + self.set_dst(op.dst); + self.set_pred_dst(81..84, Dst::None); + + self.set_reg_src(24..32, op.addr); + self.set_field(40..64, op.addr_offset); + + self.set_field( + 72..73, + match op.mem_space.addr_type() { + MemAddrType::A32 => 0_u8, + MemAddrType::A64 => 1_u8, + }, + ); + + 
self.set_atom_type(73..76, op.atom_type); + self.set_mem_order(&op.mem_order); + self.set_eviction_priority(&op.mem_eviction_priority); + } + + fn encode_atoms(&mut self, op: &OpAtom) { + if op.atom_op == AtomOp::CmpExch { + self.set_opcode(0x38d); + + self.set_reg_src(32..40, op.cmpr); + self.set_reg_src(64..72, op.data); + } else { + self.set_opcode(0x38c); + + self.set_reg_src(32..40, op.data); + + self.set_atom_op(87..91, op.atom_op); + } + + self.set_dst(op.dst); + self.set_reg_src(24..32, op.addr); + self.set_field(40..64, op.addr_offset); + + assert!(op.mem_order == MemOrder::Strong(MemScope::CTA)); + assert!(op.mem_eviction_priority == MemEvictionPriority::Normal); + + self.set_atom_type(73..76, op.atom_type); + } + + fn encode_atom(&mut self, op: &OpAtom) { + match op.mem_space { + MemSpace::Global(_) => self.encode_atomg(op), + MemSpace::Local => panic!("Atomics do not support local"), + MemSpace::Shared => self.encode_atoms(op), + } + } + + fn encode_al2p(&mut self, op: &OpAL2P) { + self.set_opcode(0x920); + + self.set_dst(op.dst); + self.set_reg_src(24..32, op.offset); + + self.set_field(40..50, op.access.addr); + self.set_field(74..76, 0_u8); // comps + assert!(!op.access.patch); + self.set_bit(79, op.access.output); + } + + fn encode_ald(&mut self, op: &OpALd) { + self.set_opcode(0x321); + + self.set_dst(op.dst); + self.set_reg_src(32..40, op.vtx); + self.set_reg_src(24..32, op.offset); + + self.set_field(40..50, op.access.addr); + self.set_field(74..76, op.access.comps - 1); + self.set_field(76..77, op.access.patch); + self.set_field(77..78, op.access.phys); + self.set_field(79..80, op.access.output); + } + + fn encode_ast(&mut self, op: &OpASt) { + self.set_opcode(0x322); + + self.set_reg_src(32..40, op.data); + self.set_reg_src(64..72, op.vtx); + self.set_reg_src(24..32, op.offset); + + self.set_field(40..50, op.access.addr); + self.set_field(74..76, op.access.comps - 1); + self.set_field(76..77, op.access.patch); + self.set_field(77..78, 
op.access.phys); + assert!(op.access.output); + } + + fn encode_ipa(&mut self, op: &OpIpa) { + self.set_opcode(0x326); + + self.set_dst(op.dst); + + assert!(op.addr % 4 == 0); + self.set_field(64..72, op.addr >> 2); + + self.set_field( + 76..78, + match op.loc { + InterpLoc::Default => 0_u8, + InterpLoc::Centroid => 1_u8, + InterpLoc::Offset => 2_u8, + }, + ); + self.set_field( + 78..80, + match op.freq { + InterpFreq::Pass => 0_u8, + InterpFreq::Constant => 1_u8, + InterpFreq::State => 2_u8, + InterpFreq::PassMulW => { + panic!("InterpFreq::PassMulW is invalid on SM70+"); + } + }, + ); + + assert!(op.inv_w.is_zero()); + self.set_reg_src(32..40, op.offset); + + // TODO: What is this for? + self.set_pred_dst(81..84, Dst::None); + } + + fn encode_ldtram(&mut self, op: &OpLdTram) { + self.set_opcode(0x3ad); + self.set_dst(op.dst); + self.set_ureg(24..32, RegRef::zero(RegFile::UGPR, 1)); + + assert!(op.addr % 4 == 0); + self.set_field(64..72, op.addr >> 2); + + self.set_bit(72, op.use_c); + + // Unknown but required + self.set_bit(91, true); + } + + fn encode_cctl(&mut self, op: &OpCCtl) { + assert!(matches!(op.mem_space, MemSpace::Global(_))); + self.set_opcode(0x98f); + + self.set_reg_src(24..32, op.addr); + self.set_field(32..64, op.addr_offset); + + self.set_field( + 87..91, + match op.op { + CCtlOp::PF1 => 0_u8, + CCtlOp::PF2 => 1_u8, + CCtlOp::WB => 2_u8, + CCtlOp::IV => 3_u8, + CCtlOp::IVAll => 4_u8, + CCtlOp::RS => 5_u8, + CCtlOp::IVAllP => 6_u8, + CCtlOp::WBAll => 7_u8, + CCtlOp::WBAllP => 8_u8, + }, + ); + } + + fn encode_membar(&mut self, op: &OpMemBar) { + self.set_opcode(0x992); + + self.set_bit(72, false); // !.MMIO + self.set_field( + 76..79, + match op.scope { + MemScope::CTA => 0_u8, + // SM => 1_u8, + MemScope::GPU => 2_u8, + MemScope::System => 3_u8, + }, + ); + self.set_bit(80, false); // .SC + } + + fn set_rel_offset( + &mut self, + range: Range, + label: &Label, + ip: usize, + labels: &HashMap, + ) { + let ip = u64::try_from(ip).unwrap(); + let ip 
= i64::try_from(ip).unwrap(); + + let target_ip = *labels.get(label).unwrap(); + let target_ip = u64::try_from(target_ip).unwrap(); + let target_ip = i64::try_from(target_ip).unwrap(); + + let rel_offset = target_ip - ip - 4; + + self.set_field(range, rel_offset); + } + + fn encode_bclear(&mut self, op: &OpBClear) { + self.set_opcode(0x355); + + self.set_dst(Dst::None); + self.set_bar_dst(24..28, op.dst); + + self.set_bit(84, true); // .CLEAR + } + + fn encode_bmov(&mut self, op: &OpBMov) { + if dst_is_bar(op.dst) { + self.set_opcode(0x356); + + self.set_bar_dst(24..28, op.dst); + self.set_reg_src(32..40, op.src); + + self.set_bit(84, op.clear); + } else { + self.set_opcode(0x355); + + self.set_dst(op.dst); + self.set_bar_src(24..28, op.src); + + self.set_bit(84, op.clear); + } + } + + fn encode_break(&mut self, op: &OpBreak) { + self.set_opcode(0x942); + assert!(op.bar_in.src_ref.as_reg() == op.bar_out.as_reg()); + self.set_bar_dst(16..20, op.bar_out); + self.set_pred_src(87..90, 90, op.cond); + } + + fn encode_bssy( + &mut self, + op: &OpBSSy, + ip: usize, + labels: &HashMap, + ) { + self.set_opcode(0x945); + assert!(op.bar_in.src_ref.as_reg() == op.bar_out.as_reg()); + self.set_bar_dst(16..20, op.bar_out); + self.set_rel_offset(34..64, &op.target, ip, labels); + self.set_pred_src(87..90, 90, op.cond); + } + + fn encode_bsync(&mut self, op: &OpBSync) { + self.set_opcode(0x941); + self.set_bar_src(16..20, op.bar); + self.set_pred_src(87..90, 90, op.cond); + } + + fn encode_bra( + &mut self, + op: &OpBra, + ip: usize, + labels: &HashMap, + ) { + self.set_opcode(0x947); + self.set_rel_offset(34..82, &op.target, ip, labels); + self.set_field(87..90, 0x7_u8); // TODO: Pred? 
+ } + + fn encode_exit(&mut self, _op: &OpExit) { + self.set_opcode(0x94d); + + // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3 + self.set_field(84..85, false); + self.set_field(85..86, false); // .NO_ATEXIT + self.set_field(87..90, 0x7_u8); // TODO: Predicate + self.set_field(90..91, false); // NOT + } + + fn encode_warpsync(&mut self, op: &OpWarpSync) { + self.encode_alu(0x148, None, None, Some(&Src::from(op.mask)), None); + self.set_pred_src(87..90, 90, SrcRef::True.into()); + } + + fn encode_bar(&mut self, _op: &OpBar) { + self.set_opcode(0xb1d); + + // self.set_opcode(0x31d); + + // // src0 == src1 + // self.set_reg_src(32..40, SrcRef::Zero.into()); + + // // 00: RED.POPC + // // 01: RED.AND + // // 02: RED.OR + // self.set_field(74..76, 0_u8); + + // // 00: SYNC + // // 01: ARV + // // 02: RED + // // 03: SCAN + // self.set_field(77..79, 0_u8); + + // self.set_pred_src(87..90, 90, SrcRef::True.into()); + } + + fn encode_cs2r(&mut self, op: &OpCS2R) { + self.set_opcode(0x805); + self.set_dst(op.dst); + self.set_field(72..80, op.idx); + self.set_bit(80, op.dst.as_reg().unwrap().comps() == 2); // .64 + } + + fn encode_isberd(&mut self, op: &OpIsberd) { + self.set_opcode(0x923); + self.set_dst(op.dst); + self.set_reg_src(24..32, op.idx); + } + + fn encode_kill(&mut self, _op: &OpKill) { + self.set_opcode(0x95b); + self.set_pred_src(87..90, 90, SrcRef::True.into()); + } + + fn encode_nop(&mut self, _op: &OpNop) { + self.set_opcode(0x918); + } + + fn encode_pixld(&mut self, op: &OpPixLd) { + self.set_opcode(0x925); + self.set_dst(op.dst); + self.set_field( + 78..81, + match op.val { + PixVal::MsCount => 0_u8, + PixVal::CovMask => 1_u8, + PixVal::CentroidOffset => 2_u8, + PixVal::MyIndex => 3_u8, + PixVal::InnerCoverage => 4_u8, + }, + ); + self.set_pred_dst(81..84, Dst::None); + } + + fn encode_s2r(&mut self, op: &OpS2R) { + assert!(!op.is_uniform()); + self.set_opcode(if op.is_uniform() { 0x9c3 } else { 0x919 }); + self.set_dst(op.dst); + self.set_field(72..80, op.idx); + 
} + + fn encode_out(&mut self, op: &OpOut) { + self.encode_alu( + 0x124, + Some(&op.dst), + Some(&op.handle), + Some(&op.stream), + None, + ); + + self.set_field( + 78..80, + match op.out_type { + OutType::Emit => 1_u8, + OutType::Cut => 2_u8, + OutType::EmitThenCut => 3_u8, + }, + ); + } + + fn encode_out_final(&mut self, op: &OpOutFinal) { + self.encode_alu( + 0x124, + Some(&Dst::None), + Some(&op.handle), + Some(&Src::new_zero()), + None, + ); + } + + fn encode_vote(&mut self, op: &OpVote) { + if op.is_uniform() { + self.set_opcode(0x886); + self.set_udst(op.ballot); + } else { + self.set_opcode(0x806); + self.set_dst(op.ballot); + } + + self.set_field( + 72..74, + match op.op { + VoteOp::All => 0_u8, + VoteOp::Any => 1_u8, + VoteOp::Eq => 2_u8, + }, + ); + + self.set_pred_dst(81..84, op.vote); + self.set_pred_src(87..90, 90, op.pred); + } + + pub fn encode( + instr: &Instr, + sm: u8, + ip: usize, + labels: &HashMap, + ) -> [u32; 4] { + assert!(sm >= 70); + + let mut si = SM70Instr { + inst: [0; 4], + sm: sm, + }; + + match &instr.op { + Op::FAdd(op) => si.encode_fadd(op), + Op::FFma(op) => si.encode_ffma(op), + Op::FMnMx(op) => si.encode_fmnmx(op), + Op::FMul(op) => si.encode_fmul(op), + Op::FSet(op) => si.encode_fset(op), + Op::FSetP(op) => si.encode_fsetp(op), + Op::FSwzAdd(op) => si.encode_fswzadd(op), + Op::DAdd(op) => si.encode_dadd(op), + Op::DFma(op) => si.encode_dfma(op), + Op::DMul(op) => si.encode_dmul(op), + Op::DSetP(op) => si.encode_dsetp(op), + Op::HAdd2(op) => si.encode_hadd2(op), + Op::HFma2(op) => si.encode_hfma2(op), + Op::HMul2(op) => si.encode_hmul2(op), + Op::HSet2(op) => si.encode_hset2(op), + Op::HSetP2(op) => si.encode_hsetp2(op), + Op::HMnMx2(op) => si.encode_hmnmx2(op), + Op::MuFu(op) => si.encode_mufu(op), + Op::BMsk(op) => si.encode_bmsk(op), + Op::BRev(op) => si.encode_brev(op), + Op::Flo(op) => si.encode_flo(op), + Op::IAbs(op) => si.encode_iabs(op), + Op::IAdd3(op) => si.encode_iadd3(op), + Op::IAdd3X(op) => si.encode_iadd3x(op), 
+ Op::IDp4(op) => si.encode_idp4(op), + Op::IMad(op) => si.encode_imad(op), + Op::IMad64(op) => si.encode_imad64(op), + Op::IMnMx(op) => si.encode_imnmx(op), + Op::ISetP(op) => si.encode_isetp(op), + Op::Lop3(op) => si.encode_lop3(op), + Op::PopC(op) => si.encode_popc(op), + Op::Shf(op) => si.encode_shf(op), + Op::F2F(op) => si.encode_f2f(op), + Op::F2I(op) => si.encode_f2i(op), + Op::I2F(op) => si.encode_i2f(op), + Op::FRnd(op) => si.encode_frnd(op), + Op::Mov(op) => si.encode_mov(op), + Op::Prmt(op) => si.encode_prmt(op), + Op::Sel(op) => si.encode_sel(op), + Op::Shfl(op) => si.encode_shfl(op), + Op::PLop3(op) => si.encode_plop3(op), + Op::R2UR(op) => si.encode_r2ur(op), + Op::Tex(op) => si.encode_tex(op), + Op::Tld(op) => si.encode_tld(op), + Op::Tld4(op) => si.encode_tld4(op), + Op::Tmml(op) => si.encode_tmml(op), + Op::Txd(op) => si.encode_txd(op), + Op::Txq(op) => si.encode_txq(op), + Op::SuLd(op) => si.encode_suld(op), + Op::SuSt(op) => si.encode_sust(op), + Op::SuAtom(op) => si.encode_suatom(op), + Op::Ld(op) => si.encode_ld(op), + Op::Ldc(op) => si.encode_ldc(op), + Op::St(op) => si.encode_st(op), + Op::Atom(op) => si.encode_atom(op), + Op::AL2P(op) => si.encode_al2p(op), + Op::ALd(op) => si.encode_ald(op), + Op::ASt(op) => si.encode_ast(op), + Op::Ipa(op) => si.encode_ipa(op), + Op::LdTram(op) => si.encode_ldtram(op), + Op::CCtl(op) => si.encode_cctl(op), + Op::MemBar(op) => si.encode_membar(op), + Op::BClear(op) => si.encode_bclear(op), + Op::BMov(op) => si.encode_bmov(op), + Op::Break(op) => si.encode_break(op), + Op::BSSy(op) => si.encode_bssy(op, ip, labels), + Op::BSync(op) => si.encode_bsync(op), + Op::Bra(op) => si.encode_bra(op, ip, labels), + Op::Exit(op) => si.encode_exit(op), + Op::WarpSync(op) => si.encode_warpsync(op), + Op::Bar(op) => si.encode_bar(op), + Op::CS2R(op) => si.encode_cs2r(op), + Op::Isberd(op) => si.encode_isberd(op), + Op::Kill(op) => si.encode_kill(op), + Op::Nop(op) => si.encode_nop(op), + Op::PixLd(op) => 
si.encode_pixld(op), + Op::S2R(op) => si.encode_s2r(op), + Op::Out(op) => si.encode_out(op), + Op::OutFinal(op) => si.encode_out_final(op), + Op::Vote(op) => si.encode_vote(op), + _ => panic!("Unhandled instruction"), + } + + si.set_pred(&instr.pred); + si.set_instr_deps(&instr.deps); + + si.inst + } +} + +pub fn encode_sm70_shader(sm: &dyn ShaderModel, s: &Shader<'_>) -> Vec { + assert!(s.functions.len() == 1); + let func = &s.functions[0]; + + let mut ip = 0_usize; + let mut labels = HashMap::new(); + for b in &func.blocks { + labels.insert(b.label, ip); + for instr in &b.instrs { + if let Op::Nop(op) = &instr.op { + if let Some(label) = op.label { + labels.insert(label, ip); + } + } + ip += 4; + } + } + + let mut encoded = Vec::new(); + for b in &func.blocks { + for instr in &b.instrs { + let e = SM70Instr::encode(instr, sm.sm(), encoded.len(), &labels); + encoded.extend_from_slice(&e[..]); + } + } + encoded +} diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 3771b103b45..f8cd24a6b69 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6251,6 +6251,8 @@ pub trait ShaderModel { fn num_regs(&self, file: RegFile) -> u32; fn op_can_be_uniform(&self, op: &Op) -> bool; + + fn encode_shader(&self, s: &Shader<'_>) -> Vec; } pub struct Shader<'a> { diff --git a/src/nouveau/compiler/nak/sm50.rs b/src/nouveau/compiler/nak/sm50.rs index d7ad43dac44..7908e26eb85 100644 --- a/src/nouveau/compiler/nak/sm50.rs +++ b/src/nouveau/compiler/nak/sm50.rs @@ -38,6 +38,10 @@ impl ShaderModel for ShaderModel50 { fn op_can_be_uniform(&self, _op: &Op) -> bool { false } + + fn encode_shader(&self, s: &Shader<'_>) -> Vec { + encode_sm50_shader(self, s) + } } impl Src { @@ -2212,70 +2216,68 @@ fn encode_instr( res.inst } -impl Shader<'_> { - pub fn encode_sm50(&self) -> Vec { - assert!(self.functions.len() == 1); - let func = &self.functions[0]; +fn encode_sm50_shader(sm: &ShaderModel50, s: &Shader<'_>) -> Vec { + 
assert!(s.functions.len() == 1); + let func = &s.functions[0]; - let mut num_instrs = 0_usize; - let mut labels = HashMap::new(); - for b in &func.blocks { - // We ensure blocks will have groups of 3 instructions with a - // schedule instruction before each groups. As we should never jump - // to a schedule instruction, we account for that here. - labels.insert(b.label, num_instrs + 8); + let mut num_instrs = 0_usize; + let mut labels = HashMap::new(); + for b in &func.blocks { + // We ensure blocks will have groups of 3 instructions with a + // schedule instruction before each groups. As we should never jump + // to a schedule instruction, we account for that here. + labels.insert(b.label, num_instrs + 8); - let block_num_instrs = b.instrs.len().next_multiple_of(3); + let block_num_instrs = b.instrs.len().next_multiple_of(3); - // Every 3 instructions, we have a new schedule instruction so we - // need to account for that. - num_instrs += (block_num_instrs + (block_num_instrs / 3)) * 8; - } - - let mut encoded = Vec::new(); - for b in &func.blocks { - // A block is composed of groups of 3 instructions. 
- let block_num_instrs = b.instrs.len().next_multiple_of(3); - - let mut instrs_iter = b.instrs.iter(); - - for _ in 0..(block_num_instrs / 3) { - let mut ip = ((encoded.len() / 2) + 1) * 8; - - let mut sched_instr = [0x0; 2]; - - let instr0 = encode_instr( - 0, - instrs_iter.next(), - self.sm.sm(), - &labels, - &mut ip, - &mut sched_instr, - ); - let instr1 = encode_instr( - 1, - instrs_iter.next(), - self.sm.sm(), - &labels, - &mut ip, - &mut sched_instr, - ); - let instr2 = encode_instr( - 2, - instrs_iter.next(), - self.sm.sm(), - &labels, - &mut ip, - &mut sched_instr, - ); - - encoded.extend_from_slice(&sched_instr[..]); - encoded.extend_from_slice(&instr0[..]); - encoded.extend_from_slice(&instr1[..]); - encoded.extend_from_slice(&instr2[..]); - } - } - - encoded + // Every 3 instructions, we have a new schedule instruction so we + // need to account for that. + num_instrs += (block_num_instrs + (block_num_instrs / 3)) * 8; } + + let mut encoded = Vec::new(); + for b in &func.blocks { + // A block is composed of groups of 3 instructions. 
+ let block_num_instrs = b.instrs.len().next_multiple_of(3); + + let mut instrs_iter = b.instrs.iter(); + + for _ in 0..(block_num_instrs / 3) { + let mut ip = ((encoded.len() / 2) + 1) * 8; + + let mut sched_instr = [0x0; 2]; + + let instr0 = encode_instr( + 0, + instrs_iter.next(), + sm.sm, + &labels, + &mut ip, + &mut sched_instr, + ); + let instr1 = encode_instr( + 1, + instrs_iter.next(), + sm.sm, + &labels, + &mut ip, + &mut sched_instr, + ); + let instr2 = encode_instr( + 2, + instrs_iter.next(), + sm.sm, + &labels, + &mut ip, + &mut sched_instr, + ); + + encoded.extend_from_slice(&sched_instr[..]); + encoded.extend_from_slice(&instr0[..]); + encoded.extend_from_slice(&instr1[..]); + encoded.extend_from_slice(&instr2[..]); + } + } + + encoded } diff --git a/src/nouveau/compiler/nak/sm70.rs b/src/nouveau/compiler/nak/sm70.rs index 8221e3a882a..5c2a4034645 100644 --- a/src/nouveau/compiler/nak/sm70.rs +++ b/src/nouveau/compiler/nak/sm70.rs @@ -91,6 +91,10 @@ impl ShaderModel for ShaderModel70 { _ => false, } } + + fn encode_shader(&self, s: &Shader<'_>) -> Vec { + encode_sm70_shader(self, s) + } } struct ALURegRef { @@ -2656,37 +2660,30 @@ impl SM70Instr { } } -impl Shader<'_> { - pub fn encode_sm70(&self) -> Vec { - assert!(self.functions.len() == 1); - let func = &self.functions[0]; +fn encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec { + assert!(s.functions.len() == 1); + let func = &s.functions[0]; - let mut ip = 0_usize; - let mut labels = HashMap::new(); - for b in &func.blocks { - labels.insert(b.label, ip); - for instr in &b.instrs { - if let Op::Nop(op) = &instr.op { - if let Some(label) = op.label { - labels.insert(label, ip); - } + let mut ip = 0_usize; + let mut labels = HashMap::new(); + for b in &func.blocks { + labels.insert(b.label, ip); + for instr in &b.instrs { + if let Op::Nop(op) = &instr.op { + if let Some(label) = op.label { + labels.insert(label, ip); } - ip += 4; } + ip += 4; } - - let mut encoded = Vec::new(); - for b in 
&func.blocks { - for instr in &b.instrs { - let e = SM70Instr::encode( - instr, - self.sm.sm(), - encoded.len(), - &labels, - ); - encoded.extend_from_slice(&e[..]); - } - } - encoded } + + let mut encoded = Vec::new(); + for b in &func.blocks { + for instr in &b.instrs { + let e = SM70Instr::encode(instr, sm.sm, encoded.len(), &labels); + encoded.extend_from_slice(&e[..]); + } + } + encoded }