diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs
index 3b3c46a5cac..373f471e922 100644
--- a/src/nouveau/compiler/nak/ir.rs
+++ b/src/nouveau/compiler/nak/ir.rs
@@ -2307,25 +2307,45 @@ impl fmt::Display for LdCacheOp {
 
 impl LdCacheOp {
     pub fn select(
-        _sm: &dyn ShaderModel,
+        sm: &dyn ShaderModel,
         space: MemSpace,
         order: MemOrder,
         _eviction_priority: MemEvictionPriority,
     ) -> Self {
         match space {
-            // From the CUDA 10.2 docs:
-            //
-            //    "L1 caching in Kepler GPUs is reserved only for local memory
-            //    accesses, such as register spills and stack data. Global
-            //    loads are cached in L2 only (or in the Read-Only Data Cache)."
-            //
-            // We assume that CacheAll is also safe for shared memory.
             MemSpace::Global(_) => match order {
                 MemOrder::Constant => LdCacheOp::CacheAll,
                 MemOrder::Strong(MemScope::System) => {
                     LdCacheOp::CacheInvalidate
                 }
-                _ => LdCacheOp::CacheGlobal,
+                _ => {
+                    // From the CUDA 10.2 docs:
+                    //
+                    //    "The default load instruction cache operation is
+                    //    ld.ca, which allocates cache lines in all levels (L1
+                    //    and L2) with normal eviction policy. Global data is
+                    //    coherent at the L2 level, but multiple L1 caches are
+                    //    not coherent for global data. If one thread stores to
+                    //    global memory via one L1 cache, and a second thread
+                    //    loads that address via a second L1 cache with ld.ca,
+                    //    the second thread may get stale L1 cache data"
+                    //
+                    // and
+                    //
+                    //    "L1 caching in Kepler GPUs is reserved only for local
+                    //    memory accesses, such as register spills and stack
+                    //    data. Global loads are cached in L2 only (or in the
+                    //    Read-Only Data Cache)."
+                    //
+                    // We follow suit and use CacheGlobal for all global memory
+                    // access on Kepler.  On Maxwell, it appears safe to use
+                    // CacheAll for everything.
+                    if sm.sm() >= 50 {
+                        LdCacheOp::CacheAll
+                    } else {
+                        LdCacheOp::CacheGlobal
+                    }
+                }
             },
             MemSpace::Local | MemSpace::Shared => LdCacheOp::CacheAll,
         }
@@ -2356,7 +2376,7 @@ impl fmt::Display for StCacheOp {
 
 impl StCacheOp {
     pub fn select(
-        _sm: &dyn ShaderModel,
+        sm: &dyn ShaderModel,
         space: MemSpace,
         order: MemOrder,
         _eviction_priority: MemEvictionPriority,
@@ -2364,10 +2384,15 @@ impl StCacheOp {
         match space {
             MemSpace::Global(_) => match order {
                 MemOrder::Constant => panic!("Cannot store to constant"),
-                MemOrder::Strong(MemScope::System) => {
-                    StCacheOp::WriteThrough
+                MemOrder::Strong(MemScope::System) => StCacheOp::WriteThrough,
+                _ => {
+                    // See the corresponding comment in LdCacheOp::select()
+                    if sm.sm() >= 50 {
+                        StCacheOp::WriteBack
+                    } else {
+                        StCacheOp::CacheGlobal
+                    }
                 }
-                _ => StCacheOp::CacheGlobal,
             },
             MemSpace::Local | MemSpace::Shared => StCacheOp::WriteBack,
         }
diff --git a/src/nouveau/compiler/nak/sm50.rs b/src/nouveau/compiler/nak/sm50.rs
index c2adc286610..eb8b61ef7b5 100644
--- a/src/nouveau/compiler/nak/sm50.rs
+++ b/src/nouveau/compiler/nak/sm50.rs
@@ -2386,10 +2386,6 @@ impl SM50Encoder<'_> {
         );
     }
 
-    fn set_mem_order(&mut self, _order: &MemOrder) {
-        // TODO: order and scope aren't present before SM70, what should we do?
-    }
-
     fn set_mem_access(&mut self, access: &MemAccess) {
         self.set_field(
             45..46,
@@ -2399,7 +2395,26 @@ impl SM50Encoder<'_> {
             },
         );
         self.set_mem_type(48..51, access.mem_type);
-        self.set_mem_order(&access.order);
+    }
+
+    fn set_ld_cache_op(&mut self, range: Range<usize>, op: LdCacheOp) {
+        let cache_op = match op {
+            LdCacheOp::CacheAll => 0_u8,
+            LdCacheOp::CacheGlobal => 1_u8,
+            LdCacheOp::CacheInvalidate => 3_u8,
+            _ => panic!("Unsupported cache op: ld{op}"),
+        };
+        self.set_field(range, cache_op);
+    }
+
+    fn set_st_cache_op(&mut self, range: Range<usize>, op: StCacheOp) {
+        let cache_op = match op {
+            StCacheOp::WriteBack => 0_u8,
+            StCacheOp::CacheGlobal => 1_u8,
+            StCacheOp::CacheStreaming => 2_u8,
+            StCacheOp::WriteThrough => 3_u8,
+        };
+        self.set_field(range, cache_op);
     }
 
     fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
@@ -2451,22 +2466,13 @@ impl SM50Op for OpSuLd {
         }
         e.set_image_dim(33..36, self.image_dim);
 
-        // mem_eviction_policy not a thing for sm < 70
-
-        let scope = match self.mem_order {
-            MemOrder::Constant | MemOrder::Weak => MemScope::CTA,
-            MemOrder::Strong(s) => s,
-        };
-
-        e.set_field(
-            24..26,
-            match scope {
-                MemScope::CTA => 0_u8,
-                /* SM => 1_u8, */
-                MemScope::GPU => 2_u8,
-                MemScope::System => 3_u8,
-            },
+        let cache_op = LdCacheOp::select(
+            e.sm,
+            MemSpace::Global(MemAddrType::A64),
+            self.mem_order,
+            self.mem_eviction_priority,
         );
+        e.set_ld_cache_op(24..26, cache_op);
 
         e.set_dst(&self.dst);
 
@@ -2498,8 +2504,15 @@ impl SM50Op for OpSuSt {
         e.set_reg_src(0..8, &self.data);
         e.set_reg_src(39..47, &self.handle);
 
+        let cache_op = StCacheOp::select(
+            e.sm,
+            MemSpace::Global(MemAddrType::A64),
+            self.mem_order,
+            self.mem_eviction_priority,
+        );
+        e.set_st_cache_op(24..26, cache_op);
+
         e.set_image_dim(33..36, self.image_dim);
-        e.set_mem_order(&self.mem_order);
     }
 }
 
@@ -2582,6 +2595,7 @@ impl SM50Op for OpLd {
         e.set_field(20..44, self.offset);
 
         e.set_mem_access(&self.access);
+        e.set_ld_cache_op(46..48, self.access.ld_cache_op(e.sm));
     }
 }
 
@@ -2635,6 +2649,7 @@ impl SM50Op for OpSt {
         e.set_reg_src(8..16, &self.addr);
         e.set_field(20..44, self.offset);
         e.set_mem_access(&self.access);
+        e.set_st_cache_op(46..48, self.access.st_cache_op(e.sm));
     }
 }
 
@@ -2745,8 +2760,6 @@ impl SM50Op for OpAtom {
                     e.set_atom_op(52..56, self.atom_op);
                 }
 
-                e.set_mem_order(&self.mem_order);
-
                 e.set_reg_src(8..16, &self.addr);
                 e.set_field(28..48, self.addr_offset);
                 e.set_field(