From 818ec3242bed9cca2e5e73ab8340bae7cb42842f Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Thu, 27 Apr 2023 17:29:19 -0500
Subject: [PATCH] nak: Optimize OpLop3 and OpPLop3

Part-of:
---
Review notes (placed after the "---" separator, so they are not part of
the commit message):

 * nak_opt_copy_prop.rs, Op::PLop3: the inner loop walks the source index
   `s`, but the forwarded source was `lop.srcs[i]`, where `i` is the
   destination index (0..2).  It now forwards `lop.srcs[s]`, like the
   Op::Lop3 arm does.
 * nak_opt_lop.rs, opt_plop3(): dedup_srcs() compares only src_ref, so it
   must not run while bnot modifiers are still attached, otherwise p and
   !p are merged as the same value.  Modifiers are now folded into the
   LUTs first, then sources are deduplicated, then constants and unused
   sources are folded.  This adds 6 lines; the hunk header and diffstat
   below are updated accordingly.
 * Generic parameter lists that had been stripped from this copy of the
   patch (new_lut, eval, HashMap, Option) are written out again.
 * The blob ids on the "index" lines still name the original blobs.

 src/nouveau/compiler/nak.rs               |   6 +
 src/nouveau/compiler/nak_ir.rs            |  39 +++-
 src/nouveau/compiler/nak_opt_copy_prop.rs |  72 ++++++
 src/nouveau/compiler/nak_opt_lop.rs       | 268 ++++++++++++++++++++++
 4 files changed, 382 insertions(+), 3 deletions(-)
 create mode 100644 src/nouveau/compiler/nak_opt_lop.rs

diff --git a/src/nouveau/compiler/nak.rs b/src/nouveau/compiler/nak.rs
index 6bca71bc779..2d33c17e664 100644
--- a/src/nouveau/compiler/nak.rs
+++ b/src/nouveau/compiler/nak.rs
@@ -15,6 +15,7 @@ mod nak_liveness;
 mod nak_lower_par_copies;
 mod nak_opt_copy_prop;
 mod nak_opt_dce;
+mod nak_opt_lop;
 mod nir;
 mod union_find;
 mod util;
@@ -433,6 +434,11 @@ pub extern "C" fn nak_compile_shader(
         println!("NAK IR:\n{}", &s);
     }
 
+    s.opt_lop();
+    if DEBUG.print() {
+        println!("NAK IR:\n{}", &s);
+    }
+
     s.opt_dce();
     if DEBUG.print() {
         println!("NAK IR:\n{}", &s);
diff --git a/src/nouveau/compiler/nak_ir.rs b/src/nouveau/compiler/nak_ir.rs
index 08dc1776168..533e59a8691 100644
--- a/src/nouveau/compiler/nak_ir.rs
+++ b/src/nouveau/compiler/nak_ir.rs
@@ -418,13 +418,13 @@ pub enum CBuf {
     BindlessGPR(RegRef),
 }
 
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
 pub struct CBufRef {
     pub buf: CBuf,
     pub offset: u16,
 }
 
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
 pub enum SrcRef {
     Zero,
     True,
@@ -971,15 +971,22 @@ impl fmt::Display for IntCmpType {
     }
 }
 
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
 pub struct LogicOp {
     pub lut: u8,
 }
 
 impl LogicOp {
+    pub const SRC_MASKS: [u8; 3] = [0xf0, 0xcc, 0xaa];
+
     #[inline]
     pub fn new_lut<F: Fn(u8, u8, u8) -> u8>(f: &F) -> LogicOp {
         LogicOp {
-            lut: f(0xf0, 0xcc, 0xaa),
+            lut: f(
+                LogicOp::SRC_MASKS[0],
+                LogicOp::SRC_MASKS[1],
+                LogicOp::SRC_MASKS[2],
+            ),
         }
     }
 
@@ -989,6 +996,32 @@ impl LogicOp {
         }
     }
 
+    pub fn src_used(&self, src_idx: usize) -> bool {
+        let mask = LogicOp::SRC_MASKS[src_idx];
+        let shift = LogicOp::SRC_MASKS[src_idx].trailing_zeros();
+        self.lut & !mask != (self.lut >> shift) & !mask
+    }
+
+    pub fn fix_src(&mut self, src_idx: usize, val: bool) {
+        let mask = LogicOp::SRC_MASKS[src_idx];
+        let shift = LogicOp::SRC_MASKS[src_idx].trailing_zeros();
+        if val {
+            let t_bits = self.lut & mask;
+            self.lut = t_bits | (t_bits >> shift)
+        } else {
+            let f_bits = self.lut & !mask;
+            self.lut = (f_bits << shift) | f_bits
+        };
+    }
+
+    pub fn invert_src(&mut self, src_idx: usize) {
+        let mask = LogicOp::SRC_MASKS[src_idx];
+        let shift = LogicOp::SRC_MASKS[src_idx].trailing_zeros();
+        let t_bits = self.lut & mask;
+        let f_bits = self.lut & !mask;
+        self.lut = (f_bits << shift) | (t_bits >> shift);
+    }
+
     pub fn eval<
         T: BitAnd<Output = T> + BitOr<Output = T> + Copy + Not<Output = T>,
     >(
diff --git a/src/nouveau/compiler/nak_opt_copy_prop.rs b/src/nouveau/compiler/nak_opt_copy_prop.rs
index 0a4c28bdce5..d0a47de2e79 100644
--- a/src/nouveau/compiler/nak_opt_copy_prop.rs
+++ b/src/nouveau/compiler/nak_opt_copy_prop.rs
@@ -310,6 +310,36 @@ impl CopyPropPass {
         for b in &mut f.blocks {
             for instr in &mut b.instrs {
                 match &instr.op {
+                    Op::Lop3(lop) => {
+                        let dst = lop.dst.as_ssa().unwrap();
+                        assert!(dst.comps() == 1);
+                        let dst = dst[0];
+
+                        let op = lop.op;
+                        if op.lut == 0 {
+                            self.add_copy(
+                                dst,
+                                SrcType::ALU,
+                                SrcRef::Zero.into(),
+                            );
+                        } else if op.lut == !0 {
+                            self.add_copy(
+                                dst,
+                                SrcType::ALU,
+                                SrcRef::Imm32(u32::MAX).into(),
+                            );
+                        } else {
+                            for s in 0..3 {
+                                if op.lut == LogicOp::SRC_MASKS[s] {
+                                    self.add_copy(
+                                        dst,
+                                        SrcType::ALU,
+                                        lop.srcs[s],
+                                    );
+                                }
+                            }
+                        }
+                    }
                     Op::Mov(mov) => {
                         let dst = mov.dst.as_ssa().unwrap();
                         assert!(dst.comps() == 1);
@@ -317,6 +347,48 @@ impl CopyPropPass {
                             self.add_copy(dst[0], SrcType::GPR, mov.src);
                         }
                     }
+                    Op::PLop3(lop) => {
+                        for i in 0..2 {
+                            let dst = match lop.dsts[i] {
+                                Dst::SSA(vec) => {
+                                    assert!(vec.comps() == 1);
+                                    vec[0]
+                                }
+                                _ => continue,
+                            };
+
+                            let op = lop.ops[i];
+                            if op.lut == 0 {
+                                self.add_copy(
+                                    dst,
+                                    SrcType::Pred,
+                                    SrcRef::False.into(),
+                                );
+                            } else if op.lut == !0 {
+                                self.add_copy(
+                                    dst,
+                                    SrcType::Pred,
+                                    SrcRef::True.into(),
+                                );
+                            } else {
+                                for s in 0..3 {
+                                    if op.lut == LogicOp::SRC_MASKS[s] {
+                                        self.add_copy(
+                                            dst,
+                                            SrcType::Pred,
+                                            lop.srcs[s],
+                                        );
+                                    } else if op.lut == !LogicOp::SRC_MASKS[s] {
+                                        self.add_copy(
+                                            dst,
+                                            SrcType::Pred,
+                                            lop.srcs[s].bnot(),
+                                        );
+                                    }
+                                }
+                            }
+                        }
+                    }
                     Op::FMov(mov) => {
                         let dst = mov.dst.as_ssa().unwrap();
                         assert!(dst.comps() == 1);
diff --git a/src/nouveau/compiler/nak_opt_lop.rs b/src/nouveau/compiler/nak_opt_lop.rs
new file mode 100644
index 00000000000..69f9ec05d87
--- /dev/null
+++ b/src/nouveau/compiler/nak_opt_lop.rs
@@ -0,0 +1,268 @@
+/*
+ * Copyright © 2022 Collabora, Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+use crate::nak_ir::*;
+
+use std::collections::HashMap;
+use std::slice;
+
+struct LopEntry {
+    op: LogicOp,
+    srcs_used: u8,
+    srcs: [Src; 3],
+}
+
+struct LopPass {
+    use_counts: HashMap<SSAValue, u32>,
+    ssa_lop: HashMap<SSAValue, LopEntry>,
+}
+
+fn src_as_bool(src: &Src) -> Option<bool> {
+    assert!(src.src_mod.is_none());
+    match src.src_ref {
+        SrcRef::Zero | SrcRef::False | SrcRef::Imm32(0) => Some(false),
+        SrcRef::True | SrcRef::Imm32(u32::MAX) => Some(true),
+        _ => return None,
+    }
+}
+
+impl LopPass {
+    fn new(f: &Function) -> LopPass {
+        let mut use_counts = HashMap::new();
+        for b in &f.blocks {
+            for instr in &b.instrs {
+                if let Pred::SSA(ssa) = instr.pred {
+                    use_counts.entry(ssa).and_modify(|e| *e += 1).or_insert(1);
+                }
+
+                for src in instr.srcs() {
+                    if let SrcRef::SSA(vec) = src.src_ref {
+                        for ssa in vec.iter() {
+                            use_counts
+                                .entry(*ssa)
+                                .and_modify(|e| *e += 1)
+                                .or_insert(1);
+                        }
+                    }
+                }
+            }
+        }
+        LopPass {
+            use_counts: use_counts,
+            ssa_lop: HashMap::new(),
+        }
+    }
+
+    fn add_lop(&mut self, ssa: SSAValue, op: LogicOp, srcs: [Src; 3]) {
+        let mut srcs_used = 0;
+        for i in 0..3 {
+            if op.src_used(i) {
+                srcs_used |= 1 << i;
+                assert!(src_as_bool(&srcs[i]).is_none());
+            }
+        }
+        let entry = LopEntry {
+            op: op,
+            srcs_used: srcs_used,
+            srcs: srcs,
+        };
+        self.ssa_lop.insert(ssa, entry);
+    }
+
+    fn dedup_srcs(&self, op: &mut LogicOp, srcs: &[Src; 3]) {
+        if srcs[0].src_ref == srcs[1].src_ref {
+            *op = LogicOp::new_lut(&|x, _, z| op.eval(x, x, z))
+        }
+        if srcs[0].src_ref == srcs[2].src_ref {
+            *op = LogicOp::new_lut(&|x, y, _| op.eval(x, y, x))
+        }
+        if srcs[1].src_ref == srcs[2].src_ref {
+            *op = LogicOp::new_lut(&|x, y, _| op.eval(x, y, y))
+        }
+    }
+
+    fn try_prop_to_src(
+        &self,
+        ops: &mut [LogicOp],
+        srcs: &mut [Src; 3],
+        src_idx: usize,
+    ) {
+        loop {
+            assert!(srcs[src_idx].src_mod.is_none());
+            let ssa = match srcs[src_idx].src_ref {
+                SrcRef::SSA(vec) => {
+                    assert!(vec.comps() == 1);
+                    vec[0]
+                }
+                _ => return,
+            };
+
+            let entry = match self.ssa_lop.get(&ssa) {
+                Some(e) => e,
+                None => return,
+            };
+
+            let entry_use_count = *self.use_counts.get(&ssa).unwrap();
+            if entry.srcs_used.count_ones() > 1 && entry_use_count > 1 {
+                return;
+            }
+
+            let mut entry_srcs = [usize::MAX; 3];
+            let mut next_src = 0_usize;
+            for i in 0..3 {
+                if entry.srcs_used & (1 << i) == 0 {
+                    continue;
+                }
+
+                let mut found = false;
+                for j in 0..3 {
+                    if entry.srcs[i].src_ref == srcs[j].src_ref {
+                        entry_srcs[i] = j;
+                        found = true;
+                        break;
+                    }
+                }
+                if found {
+                    continue;
+                }
+
+                loop {
+                    if next_src >= srcs.len() {
+                        return;
+                    }
+
+                    /* All callers of this function need to ensure that
+                     * constant sources are already folded so we know we
+                     * can always re-use them.
+                     */
+                    if next_src == src_idx
+                        || src_as_bool(&srcs[next_src]).is_some()
+                    {
+                        entry_srcs[i] = next_src;
+                        next_src += 1;
+                        break;
+                    }
+                    next_src += 1;
+                }
+            }
+
+            /* Clear out the propagated source. What we put here doesn't matter
+             * since it's no longer used. It may be overwritten by one of the
+             * entry sources but there is no guarantee of this.
+             */
+            srcs[src_idx] = match ssa.file() {
+                RegFile::GPR | RegFile::UGPR => SrcRef::Zero.into(),
+                RegFile::Pred | RegFile::UPred => SrcRef::True.into(),
+            };
+
+            for i in 0..3 {
+                if entry_srcs[i] != usize::MAX {
+                    srcs[entry_srcs[i]] = entry.srcs[i];
+                }
+            }
+            for op in ops.iter_mut() {
+                *op = LogicOp::new_lut(&|x, y, z| {
+                    let mut s = [x, y, z];
+                    let mut es = [0; 3];
+                    for i in 0..3 {
+                        if entry_srcs[i] != usize::MAX {
+                            es[i] = s[entry_srcs[i]];
+                        }
+                    }
+                    let e = entry.op.eval(es[0], es[1], es[2]);
+                    s[src_idx] = e;
+                    op.eval(s[0], s[1], s[2])
+                });
+            }
+        }
+    }
+
+    fn opt_lop3(&mut self, op: &mut OpLop3) {
+        self.dedup_srcs(&mut op.op, &op.srcs);
+
+        for (i, src) in op.srcs.iter_mut().enumerate() {
+            assert!(src.src_mod.is_none());
+
+            if let Some(b) = src_as_bool(src) {
+                op.op.fix_src(i, b);
+            }
+
+            if !op.op.src_used(i) {
+                /* Replace unused sources with RZ */
+                *src = SrcRef::Zero.into();
+            }
+        }
+
+        for i in 0..3 {
+            self.try_prop_to_src(slice::from_mut(&mut op.op), &mut op.srcs, i);
+        }
+
+        if let Dst::SSA(ssa) = op.dst {
+            assert!(ssa.comps() == 1);
+            self.add_lop(ssa[0], op.op, op.srcs);
+        }
+    }
+
+    fn opt_plop3(&mut self, op: &mut OpPLop3) {
+        /* Fold source modifiers into the LUTs before deduplicating.
+         * dedup_srcs() only compares src_ref, so p and !p would otherwise
+         * be merged as if they were the same value.
+         */
+        for (i, src) in op.srcs.iter_mut().enumerate() {
+            if src.src_mod.is_bnot() {
+                op.ops[0].invert_src(i);
+                op.ops[1].invert_src(i);
+                src.src_mod = SrcMod::None;
+            }
+        }
+
+        self.dedup_srcs(&mut op.ops[0], &op.srcs);
+        self.dedup_srcs(&mut op.ops[1], &op.srcs);
+
+        /* Replace unused sources with PT */
+        for (i, src) in op.srcs.iter_mut().enumerate() {
+            if let Some(b) = src_as_bool(src) {
+                op.ops[0].fix_src(i, b);
+                op.ops[1].fix_src(i, b);
+            }
+
+            if !op.ops[0].src_used(i) && !op.ops[1].src_used(i) {
+                *src = SrcRef::True.into();
+            }
+        }
+
+        for i in 0..3 {
+            self.try_prop_to_src(&mut op.ops, &mut op.srcs, i);
+        }
+
+        for i in 0..2 {
+            if let Dst::SSA(ssa) = op.dsts[i] {
+                assert!(ssa.comps() == 1);
+                self.add_lop(ssa[0], op.ops[i], op.srcs);
+            }
+        }
+    }
+
+    fn run(&mut self, f: &mut Function) {
+        for b in &mut f.blocks {
+            for instr in &mut b.instrs {
+                match &mut instr.op {
+                    Op::Lop3(op) => self.opt_lop3(op),
+                    Op::PLop3(op) => self.opt_plop3(op),
+                    _ => (),
+                }
+            }
+        }
+    }
+}
+
+impl Shader {
+    pub fn opt_lop(&mut self) {
+        for f in &mut self.functions {
+            let mut pass = LopPass::new(f);
+            pass.run(f);
+        }
+    }
+}