nak/sm50: Encode cache ops on Maxwell

We just sort of YOLO'd it before, with no real plan.  But it passed all
the tests, so no one ever cared.  It turns out the cache ops on Maxwell are
mostly the same as the ones we already added to Kepler, we just need to
encode them.  The only big difference is that we no longer need to avoid
the L1 cache on Maxwell as it's either coherent or disabled in hardware
for global memory (I don't know which).

The only substantive change this MR makes is that images are now using
.ca by default rather than .cg.  However, this is the same choice we're
currently making for global access and it still passes all the memory
model tests so it should be okay.

Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35265>
This commit is contained in:
Faith Ekstrand
2025-05-30 14:59:15 -04:00
committed by Marge Bot
parent a3b4401fe6
commit fb3125c4e3
2 changed files with 74 additions and 36 deletions

View File

@@ -2307,25 +2307,45 @@ impl fmt::Display for LdCacheOp {
impl LdCacheOp {
pub fn select(
_sm: &dyn ShaderModel,
sm: &dyn ShaderModel,
space: MemSpace,
order: MemOrder,
_eviction_priority: MemEvictionPriority,
) -> Self {
match space {
// From the CUDA 10.2 docs:
//
// "L1 caching in Kepler GPUs is reserved only for local memory
// accesses, such as register spills and stack data. Global
// loads are cached in L2 only (or in the Read-Only Data Cache)."
//
// We assume that CacheAll is also safe for shared memory.
MemSpace::Global(_) => match order {
MemOrder::Constant => LdCacheOp::CacheAll,
MemOrder::Strong(MemScope::System) => {
LdCacheOp::CacheInvalidate
}
_ => LdCacheOp::CacheGlobal,
_ => {
// From the CUDA 10.2 docs:
//
// "The default load instruction cache operation is
// ld.ca, which allocates cache lines in all levels (L1
// and L2) with normal eviction policy. Global data is
// coherent at the L2 level, but multiple L1 caches are
// not coherent for global data. If one thread stores to
// global memory via one L1 cache, and a second thread
// loads that address via a second L1 cache with ld.ca,
// the second thread may get stale L1 cache data"
//
// and
//
// "L1 caching in Kepler GPUs is reserved only for local
// memory accesses, such as register spills and stack
// data. Global loads are cached in L2 only (or in the
// Read-Only Data Cache)."
//
// We follow suit and use CacheGlobal for all global memory
// access on Kepler. On Maxwell, it appears safe to use
// CacheAll for everything.
if sm.sm() >= 50 {
LdCacheOp::CacheAll
} else {
LdCacheOp::CacheGlobal
}
}
},
MemSpace::Local | MemSpace::Shared => LdCacheOp::CacheAll,
}
@@ -2356,7 +2376,7 @@ impl fmt::Display for StCacheOp {
impl StCacheOp {
pub fn select(
_sm: &dyn ShaderModel,
sm: &dyn ShaderModel,
space: MemSpace,
order: MemOrder,
_eviction_priority: MemEvictionPriority,
@@ -2364,10 +2384,15 @@ impl StCacheOp {
match space {
MemSpace::Global(_) => match order {
MemOrder::Constant => panic!("Cannot store to constant"),
MemOrder::Strong(MemScope::System) => {
StCacheOp::WriteThrough
MemOrder::Strong(MemScope::System) => StCacheOp::WriteThrough,
_ => {
// See the corresponding comment in LdCacheOp::select()
if sm.sm() >= 50 {
StCacheOp::WriteBack
} else {
StCacheOp::CacheGlobal
}
}
_ => StCacheOp::CacheGlobal,
},
MemSpace::Local | MemSpace::Shared => StCacheOp::WriteBack,
}

View File

@@ -2386,10 +2386,6 @@ impl SM50Encoder<'_> {
);
}
fn set_mem_order(&mut self, _order: &MemOrder) {
// TODO: order and scope aren't present before SM70, what should we do?
}
fn set_mem_access(&mut self, access: &MemAccess) {
self.set_field(
45..46,
@@ -2399,7 +2395,26 @@ impl SM50Encoder<'_> {
},
);
self.set_mem_type(48..51, access.mem_type);
self.set_mem_order(&access.order);
}
// Encodes a load cache operation into the given bit `range` of the
// instruction word. The numeric values (CacheAll = 0, CacheGlobal = 1,
// CacheInvalidate = 3) are presumably the SM50 hardware encodings for
// ld.ca / ld.cg / ld.cv — confirm against the ISA docs. Any other
// LdCacheOp variant has no encoding on this generation and panics.
fn set_ld_cache_op(&mut self, range: Range<usize>, op: LdCacheOp) {
let cache_op = match op {
LdCacheOp::CacheAll => 0_u8,
LdCacheOp::CacheGlobal => 1_u8,
LdCacheOp::CacheInvalidate => 3_u8,
// Value 2 is intentionally unmapped here; reject anything we
// don't know how to encode rather than emit a bad instruction.
_ => panic!("Unsupported cache op: ld{op}"),
};
self.set_field(range, cache_op);
}
// Encodes a store cache operation into the given bit `range` of the
// instruction word. The mapping (WriteBack = 0, CacheGlobal = 1,
// CacheStreaming = 2, WriteThrough = 3) covers every StCacheOp variant
// exhaustively, so unlike the load case no panic arm is needed. These
// values presumably match the st.wb / st.cg / st.cs / st.wt hardware
// encodings — confirm against the ISA docs.
fn set_st_cache_op(&mut self, range: Range<usize>, op: StCacheOp) {
let cache_op = match op {
StCacheOp::WriteBack => 0_u8,
StCacheOp::CacheGlobal => 1_u8,
StCacheOp::CacheStreaming => 2_u8,
StCacheOp::WriteThrough => 3_u8,
};
self.set_field(range, cache_op);
}
fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
@@ -2451,22 +2466,13 @@ impl SM50Op for OpSuLd {
}
e.set_image_dim(33..36, self.image_dim);
// mem_eviction_policy not a thing for sm < 70
let scope = match self.mem_order {
MemOrder::Constant | MemOrder::Weak => MemScope::CTA,
MemOrder::Strong(s) => s,
};
e.set_field(
24..26,
match scope {
MemScope::CTA => 0_u8,
/* SM => 1_u8, */
MemScope::GPU => 2_u8,
MemScope::System => 3_u8,
},
let cache_op = LdCacheOp::select(
e.sm,
MemSpace::Global(MemAddrType::A64),
self.mem_order,
self.mem_eviction_priority,
);
e.set_ld_cache_op(24..26, cache_op);
e.set_dst(&self.dst);
@@ -2498,8 +2504,15 @@ impl SM50Op for OpSuSt {
e.set_reg_src(0..8, &self.data);
e.set_reg_src(39..47, &self.handle);
let cache_op = StCacheOp::select(
e.sm,
MemSpace::Global(MemAddrType::A64),
self.mem_order,
self.mem_eviction_priority,
);
e.set_st_cache_op(24..26, cache_op);
e.set_image_dim(33..36, self.image_dim);
e.set_mem_order(&self.mem_order);
}
}
@@ -2582,6 +2595,7 @@ impl SM50Op for OpLd {
e.set_field(20..44, self.offset);
e.set_mem_access(&self.access);
e.set_ld_cache_op(46..48, self.access.ld_cache_op(e.sm));
}
}
@@ -2635,6 +2649,7 @@ impl SM50Op for OpSt {
e.set_reg_src(8..16, &self.addr);
e.set_field(20..44, self.offset);
e.set_mem_access(&self.access);
e.set_st_cache_op(46..48, self.access.st_cache_op(e.sm));
}
}
@@ -2745,8 +2760,6 @@ impl SM50Op for OpAtom {
e.set_atom_op(52..56, self.atom_op);
}
e.set_mem_order(&self.mem_order);
e.set_reg_src(8..16, &self.addr);
e.set_field(28..48, self.addr_offset);
e.set_field(