diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 3b3c46a5cac..373f471e922 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -2307,25 +2307,45 @@ impl fmt::Display for LdCacheOp { impl LdCacheOp { pub fn select( - _sm: &dyn ShaderModel, + sm: &dyn ShaderModel, space: MemSpace, order: MemOrder, _eviction_priority: MemEvictionPriority, ) -> Self { match space { - // From the CUDA 10.2 docs: - // - // "L1 caching in Kepler GPUs is reserved only for local memory - // accesses, such as register spills and stack data. Global - // loads are cached in L2 only (or in the Read-Only Data Cache)." - // - // We assume that CacheAll is also safe for shared memory. MemSpace::Global(_) => match order { MemOrder::Constant => LdCacheOp::CacheAll, MemOrder::Strong(MemScope::System) => { LdCacheOp::CacheInvalidate } - _ => LdCacheOp::CacheGlobal, + _ => { + // From the CUDA 10.2 docs: + // + // "The default load instruction cache operation is + // ld.ca, which allocates cache lines in all levels (L1 + // and L2) with normal eviction policy. Global data is + // coherent at the L2 level, but multiple L1 caches are + // not coherent for global data. If one thread stores to + // global memory via one L1 cache, and a second thread + // loads that address via a second L1 cache with ld.ca, + // the second thread may get stale L1 cache data" + // + // and + // + // "L1 caching in Kepler GPUs is reserved only for local + // memory accesses, such as register spills and stack + // data. Global loads are cached in L2 only (or in the + // Read-Only Data Cache)." + // + // We follow suit and use CacheGlobal for all global memory + // access on Kepler. On Maxwell, it appears safe to use + // CacheAll for everything, so SM50+ selects CacheAll. 
+ if sm.sm() >= 50 { + LdCacheOp::CacheAll + } else { + LdCacheOp::CacheGlobal + } + } }, MemSpace::Local | MemSpace::Shared => LdCacheOp::CacheAll, } @@ -2356,7 +2376,7 @@ impl fmt::Display for StCacheOp { impl StCacheOp { pub fn select( - _sm: &dyn ShaderModel, + sm: &dyn ShaderModel, space: MemSpace, order: MemOrder, _eviction_priority: MemEvictionPriority, @@ -2364,10 +2384,15 @@ impl StCacheOp { match space { MemSpace::Global(_) => match order { MemOrder::Constant => panic!("Cannot store to constant"), - MemOrder::Strong(MemScope::System) => { - StCacheOp::WriteThrough + MemOrder::Strong(MemScope::System) => StCacheOp::WriteThrough, + _ => { + // See the corresponding comment in LdCacheOp::select() + if sm.sm() >= 50 { + StCacheOp::WriteBack + } else { + StCacheOp::CacheGlobal + } } - _ => StCacheOp::CacheGlobal, }, MemSpace::Local | MemSpace::Shared => StCacheOp::WriteBack, } diff --git a/src/nouveau/compiler/nak/sm50.rs b/src/nouveau/compiler/nak/sm50.rs index c2adc286610..eb8b61ef7b5 100644 --- a/src/nouveau/compiler/nak/sm50.rs +++ b/src/nouveau/compiler/nak/sm50.rs @@ -2386,10 +2386,6 @@ impl SM50Encoder<'_> { ); } - fn set_mem_order(&mut self, _order: &MemOrder) { - // TODO: order and scope aren't present before SM70, what should we do? 
- } - - fn set_mem_access(&mut self, access: &MemAccess) { self.set_field( 45..46, @@ -2399,7 +2395,26 @@ impl SM50Encoder<'_> { }, ); self.set_mem_type(48..51, access.mem_type); - self.set_mem_order(&access.order); + } + + fn set_ld_cache_op(&mut self, range: Range<usize>, op: LdCacheOp) { + let cache_op = match op { + LdCacheOp::CacheAll => 0_u8, + LdCacheOp::CacheGlobal => 1_u8, + LdCacheOp::CacheInvalidate => 3_u8, + _ => panic!("Unsupported cache op: ld{op}"), + }; + self.set_field(range, cache_op); + } + + fn set_st_cache_op(&mut self, range: Range<usize>, op: StCacheOp) { + let cache_op = match op { + StCacheOp::WriteBack => 0_u8, + StCacheOp::CacheGlobal => 1_u8, + StCacheOp::CacheStreaming => 2_u8, + StCacheOp::WriteThrough => 3_u8, + }; + self.set_field(range, cache_op); } fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) { @@ -2451,22 +2466,13 @@ impl SM50Op for OpSuLd { } e.set_image_dim(33..36, self.image_dim); - // mem_eviction_policy not a thing for sm < 70 - - let scope = match self.mem_order { - MemOrder::Constant | MemOrder::Weak => MemScope::CTA, - MemOrder::Strong(s) => s, - }; - - e.set_field( - 24..26, - match scope { - MemScope::CTA => 0_u8, - /* SM => 1_u8, */ - MemScope::GPU => 2_u8, - MemScope::System => 3_u8, - }, + let cache_op = LdCacheOp::select( + e.sm, + MemSpace::Global(MemAddrType::A64), + self.mem_order, + self.mem_eviction_priority, ); + e.set_ld_cache_op(24..26, cache_op); e.set_dst(&self.dst); @@ -2498,8 +2504,15 @@ impl SM50Op for OpSuSt { e.set_reg_src(0..8, &self.data); e.set_reg_src(39..47, &self.handle); + let cache_op = StCacheOp::select( + e.sm, + MemSpace::Global(MemAddrType::A64), + self.mem_order, + self.mem_eviction_priority, + ); + e.set_st_cache_op(24..26, cache_op); + e.set_image_dim(33..36, self.image_dim); - e.set_mem_order(&self.mem_order); } } @@ -2582,6 +2595,7 @@ impl SM50Op for OpLd { e.set_field(20..44, self.offset); e.set_mem_access(&self.access); + e.set_ld_cache_op(46..48, self.access.ld_cache_op(e.sm)); } } @@ 
-2635,6 +2649,7 @@ impl SM50Op for OpSt { e.set_reg_src(8..16, &self.addr); e.set_field(20..44, self.offset); e.set_mem_access(&self.access); + e.set_st_cache_op(46..48, self.access.st_cache_op(e.sm)); } } @@ -2745,8 +2760,6 @@ impl SM50Op for OpAtom { e.set_atom_op(52..56, self.atom_op); } - e.set_mem_order(&self.mem_order); - e.set_reg_src(8..16, &self.addr); e.set_field(28..48, self.addr_offset); e.set_field(