diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index b135de6afd2..8ac816f6565 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -126,15 +126,9 @@ template uint32_t get_gfx12_cpol(const T& instr) { - bool glc = instr.cache.value & ac_glc; - bool slc = instr.cache.value & ac_slc; - bool dlc = instr.cache.value & ac_dlc; - if (instr_info.is_atomic[(int)instr.opcode]) { - return (glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2; - } else { - return (instr.definitions.empty() || glc || slc || dlc) ? 3 /*SCOPE_SYS*/ - : 0 /*SCOPE_CU*/; - } + uint32_t scope = instr.cache.gfx12.scope; + uint32_t th = instr.cache.gfx12.temporal_hint; + return scope | (th << 2); } void @@ -276,8 +270,7 @@ emit_smem_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0; } else { encoding |= opcode << 13; - if (is_load) - encoding |= ((glc || dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21; + encoding |= get_gfx12_cpol(smem) << 21; } if (ctx.gfx_level <= GFX9) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index f1dde7a0171..3e42019f293 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4436,20 +4436,47 @@ get_gfx6_cache_flags(bool glc, bool slc, bool dlc) ac_hw_cache_flags get_load_cache_flags(Builder& bld, bool glc, bool slc) { - bool dlc = glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); - return get_gfx6_cache_flags(glc, slc, dlc); + if (bld.program->gfx_level >= GFX12) { + ac_hw_cache_flags cache = {0}; + cache.gfx12.scope = (glc || slc) ? 
gfx12_scope_memory : gfx12_scope_cu; + return cache; + } else { + bool dlc = glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3); + return get_gfx6_cache_flags(glc, slc, dlc); + } } ac_hw_cache_flags get_store_cache_flags(Builder& bld, bool glc, bool slc) { - return get_gfx6_cache_flags(glc, slc, false); + if (bld.program->gfx_level >= GFX12) { + ac_hw_cache_flags cache = {0}; + cache.gfx12.scope = gfx12_scope_memory; + return cache; + } else { + return get_gfx6_cache_flags(glc, slc, false); + } } ac_hw_cache_flags get_atomic_cache_flags(Builder& bld, bool return_previous) { - return get_gfx6_cache_flags(return_previous, false, false); + if (bld.program->gfx_level >= GFX12) { + ac_hw_cache_flags cache = {0}; + cache.gfx12.temporal_hint = return_previous ? gfx12_atomic_return : 0; + return cache; + } else { + return get_gfx6_cache_flags(return_previous, false, false); + } +} + +void +set_cache_flags_swizzled(Builder& bld, ac_hw_cache_flags* cache) +{ + if (bld.program->gfx_level >= GFX12) + cache->gfx12.swizzled = true; + else + cache->value |= ac_swizzled; } Temp @@ -4568,7 +4595,7 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne mubuf->mubuf().idxen = idxen; mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc); if (info.swizzle_component_size != 0) - mubuf->mubuf().cache.value |= ac_swizzled; + set_cache_flags_swizzled(bld, &mubuf->mubuf().cache); mubuf->mubuf().sync = info.sync; mubuf->mubuf().offset = const_offset; RegClass rc = RegClass::get(RegType::vgpr, bytes_size); @@ -7174,7 +7201,7 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) glc &= ctx->program->gfx_level < GFX11; ac_hw_cache_flags cache = get_store_cache_flags(bld, glc, slc); if (swizzled) - cache.value |= ac_swizzled; + set_cache_flags_swizzled(bld, &cache); Operand vaddr_op(v1); if (offen && idxen) @@ -7648,7 +7675,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) 
mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private); bool glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4; mubuf->mubuf().cache = get_store_cache_flags(bld, glc, false); - mubuf->mubuf().cache.value |= ac_swizzled; + set_cache_flags_swizzled(bld, &mubuf->mubuf().cache); } } } diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 61e3d605745..67e9bc17611 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1426,7 +1426,8 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; - bool swizzled = mubuf.cache.value & ac_swizzled; + bool swizzled = ctx.program->gfx_level >= GFX12 ? mubuf.cache.gfx12.swizzled + : (mubuf.cache.value & ac_swizzled); /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr * overflow for scratch accesses works only on GFX9+ and saddr overflow * never works. Since swizzling is the only thing that separates diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index dfd86114998..2a80cd81ef3 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -266,14 +266,73 @@ template static void print_cache_flags(enum amd_gfx_level gfx_level, const T& instr, FILE* output) { - if (instr.cache.value & ac_glc) - fprintf(output, " glc"); - if (instr.cache.value & ac_slc) - fprintf(output, " slc"); - if (instr.cache.value & ac_dlc) - fprintf(output, " dlc"); - if (instr.cache.value & ac_swizzled) - fprintf(output, " swizzled"); + if (gfx_level >= GFX12) { + if (instr_info.is_atomic[(unsigned)instr.opcode]) { + if (instr.cache.gfx12.temporal_hint & gfx12_atomic_return) + fprintf(output, " atomic_return"); + if (instr.cache.gfx12.temporal_hint & gfx12_atomic_non_temporal) + fprintf(output, " non_temporal"); + if (instr.cache.gfx12.temporal_hint & gfx12_atomic_accum_deferred_scope) + fprintf(output, " 
accum_deferred_scope"); + } else if (!instr.definitions.empty()) { /* has a result register: a load, so use the load hint names */ + switch (instr.cache.gfx12.temporal_hint) { + case gfx12_load_regular_temporal: break; + case gfx12_load_non_temporal: fprintf(output, " non_temporal"); break; + case gfx12_load_high_temporal: fprintf(output, " high_temporal"); break; + case gfx12_load_last_use_discard: fprintf(output, " last_use_discard"); break; + case gfx12_load_near_non_temporal_far_regular_temporal: + fprintf(output, " near_non_temporal_far_regular_temporal"); + break; + case gfx12_load_near_regular_temporal_far_non_temporal: + fprintf(output, " near_regular_temporal_far_non_temporal"); + break; + case gfx12_load_near_non_temporal_far_high_temporal: + fprintf(output, " near_non_temporal_far_high_temporal"); + break; + case gfx12_load_reserved: fprintf(output, " reserved"); break; + default: fprintf(output, " tmp:%u", (unsigned)instr.cache.gfx12.temporal_hint); + } + } else { /* no definitions: a store, so use the store hint names */ + switch (instr.cache.gfx12.temporal_hint) { + case gfx12_store_regular_temporal: break; + case gfx12_store_non_temporal: fprintf(output, " non_temporal"); break; + case gfx12_store_high_temporal: fprintf(output, " high_temporal"); break; + case gfx12_store_high_temporal_stay_dirty: + fprintf(output, " high_temporal_stay_dirty"); + break; + case gfx12_store_near_non_temporal_far_regular_temporal: + fprintf(output, " near_non_temporal_far_regular_temporal"); + break; + case gfx12_store_near_regular_temporal_far_non_temporal: + fprintf(output, " near_regular_temporal_far_non_temporal"); + break; + case gfx12_store_near_non_temporal_far_high_temporal: + fprintf(output, " near_non_temporal_far_high_temporal"); + break; + case gfx12_store_near_non_temporal_far_writeback: + fprintf(output, " near_non_temporal_far_writeback"); + break; + default: fprintf(output, " tmp:%u", (unsigned)instr.cache.gfx12.temporal_hint); + } + } + switch (instr.cache.gfx12.scope) { + case gfx12_scope_cu: break; + case gfx12_scope_se: fprintf(output, " se"); break; + case 
gfx12_scope_device: fprintf(output, " device"); break; + case gfx12_scope_memory: fprintf(output, " memory"); break; + } + if (instr.cache.gfx12.swizzled) + fprintf(output, " swizzled"); + } else { + if (instr.cache.value & ac_glc) + fprintf(output, " glc"); + if (instr.cache.value & ac_slc) + fprintf(output, " slc"); + if (instr.cache.value & ac_dlc) + fprintf(output, " dlc"); + if (instr.cache.value & ac_swizzled) + fprintf(output, " swizzled"); + } } static void diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index 604c91acc68..81546aa4d3a 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -411,17 +411,22 @@ BEGIN_TEST(assembler.smem) //! s_load_b32 s4, s[16:17], s8 offset:0x2a ; f4000108 1000002a bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1); - ac_hw_cache_flags cache_coherent; - ac_hw_cache_flags cache_non_temporal; - cache_coherent.value = ac_glc; - cache_non_temporal.value = ac_dlc; + ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}}; + if (gfx >= GFX12) { + cache_coherent.gfx12.scope = gfx12_scope_device; + cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal; + } else { + cache_coherent.value = ac_glc; + cache_non_temporal.value = ac_dlc; + } //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc ; f4204110 10000000 - //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_SYS ; f4620110 10000000 + //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_DEV ; f4420110 10000000 bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent; //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc ; f4202110 10000000 - //~gfx12! (then repeated 1 times) + //~gfx12! 
s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 th:TH_LOAD_NT ; f4820110 10000000 bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_non_temporal; @@ -488,28 +493,36 @@ BEGIN_TEST(assembler.mubuf) bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false); /* Various flags */ - ac_hw_cache_flags cache_coherent; - ac_hw_cache_flags cache_sys_coherent; - ac_hw_cache_flags cache_non_temporal; - ac_hw_cache_flags cache_atomic_rtn; - cache_coherent.value = ac_glc; - cache_sys_coherent.value = ac_slc; - cache_non_temporal.value = ac_dlc; - cache_atomic_rtn.value = ac_glc; + ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}}; + if (gfx >= GFX12) { + cache_coherent.gfx12.scope = gfx12_scope_device; + cache_sys_coherent.gfx12.scope = gfx12_scope_memory; + cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal; + cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return; + } else { + cache_coherent.value = ac_glc; + cache_sys_coherent.value = ac_slc; + cache_non_temporal.value = ac_dlc; + cache_atomic_rtn.value = ac_glc; + } //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80 - //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000 + //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_DEV ; c405007c 0088402a 00000000 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) ->mubuf() .cache = cache_coherent; //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80 - //~gfx12! (then repeated 2 times) + //~gfx12! 
buffer_load_b32 v42, off, s[32:35], null th:TH_LOAD_NT ; c405007c 0090402a 00000000 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) ->mubuf() .cache = cache_non_temporal; //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80 + //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false) ->mubuf() .cache = cache_sys_coherent; @@ -564,11 +577,11 @@ BEGIN_TEST(assembler.mubuf) /* Stores */ //~gfx11! buffer_store_b32 v10, off, s[32:35], s30 ; e0680000 1e080a80 - //~gfx12! buffer_store_b32 v10, off, s[32:35], s30 scope:SCOPE_SYS ; c406801e 008c400a 00000000 + //~gfx12! buffer_store_b32 v10, off, s[32:35], s30 ; c406801e 0080400a 00000000 bld.mubuf(aco_opcode::buffer_store_dword, op_s4, Operand(v1), op_s1, op_v1, 0, false); //~gfx11! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen ; e06c0000 1e48140a - //~gfx12! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen scope:SCOPE_SYS ; c406c01e 408c4014 0000000a + //~gfx12! 
buffer_store_b64 v[20:21], v10, s[32:35], s30 offen ; c406c01e 40804014 0000000a bld.mubuf(aco_opcode::buffer_store_dwordx2, op_s4, op_v1, op_s1, op_v2, 0, true); /* Atomic with return */ @@ -647,28 +660,35 @@ BEGIN_TEST(assembler.mtbuf) false); /* Various flags */ - ac_hw_cache_flags cache_coherent; - ac_hw_cache_flags cache_sys_coherent; - ac_hw_cache_flags cache_non_temporal; - cache_coherent.value = ac_glc; - cache_sys_coherent.value = ac_slc; - cache_non_temporal.value = ac_dlc; + ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}}; + if (gfx >= GFX12) { + cache_coherent.gfx12.scope = gfx12_scope_device; + cache_sys_coherent.gfx12.scope = gfx12_scope_memory; + cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal; + } else { + cache_coherent.value = ac_glc; + cache_sys_coherent.value = ac_slc; + cache_non_temporal.value = ac_dlc; + } //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80 - //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080 + //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_DEV ; c420007c 1908402a 00000080 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false) ->mtbuf() .cache = cache_coherent; //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80 - //~gfx12! (then repeated 2 times) + //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] th:TH_LOAD_NT ; c420007c 1910402a 00000080 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false) ->mtbuf() .cache = cache_non_temporal; //~gfx11! 
tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80 + //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt, nfmt, 0, false) ->mtbuf() @@ -686,12 +706,12 @@ BEGIN_TEST(assembler.mtbuf) /* Stores */ //~gfx11! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9920000 1e080a80 - //~gfx12! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c421001e 190c400a 00000080 + //~gfx12! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c421001e 1900400a 00000080 bld.mtbuf(aco_opcode::tbuffer_store_format_x, op_s4, Operand(v1), op_s1, op_v1, dfmt, nfmt, 0, false); //~gfx11! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9928000 1e48140a - //~gfx12! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen scope:SCOPE_SYS ; c421401e 590c4014 0000000a + //~gfx12! 
tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c421401e 59004014 0000000a bld.mtbuf(aco_opcode::tbuffer_store_format_xy, op_s4, op_v1, op_s1, op_v2, dfmt, nfmt, 0, true); @@ -740,26 +760,34 @@ BEGIN_TEST(assembler.mimg) 0x1; /* Various flags */ - ac_hw_cache_flags cache_coherent; - ac_hw_cache_flags cache_sys_coherent; - ac_hw_cache_flags cache_non_temporal; - ac_hw_cache_flags cache_atomic_rtn; - cache_coherent.value = ac_glc; - cache_sys_coherent.value = ac_slc; - cache_non_temporal.value = ac_dlc; - cache_atomic_rtn.value = ac_glc; + ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}}; + if (gfx >= GFX12) { + cache_coherent.gfx12.scope = gfx12_scope_device; + cache_sys_coherent.gfx12.scope = gfx12_scope_memory; + cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal; + cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return; + } else { + cache_coherent.value = ac_glc; + cache_sys_coherent.value = ac_slc; + cache_non_temporal.value = ac_dlc; + cache_atomic_rtn.value = ac_glc; + } //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a - //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a + //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT ; e7c6c000 10108054 0000000a bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache = cache_non_temporal; //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a - //~gfx12! (then repeated 2 times) + //~gfx12! 
image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_DEV ; e7c6c000 10088054 0000000a bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache = cache_coherent; //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a + //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache = cache_sys_coherent; @@ -816,7 +844,7 @@ BEGIN_TEST(assembler.mimg) /* Stores */ //~gfx11! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; f0180f00 00101e0a - //~gfx12! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; d3c18000 000c801e 0000000a + //~gfx12! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; d3c18000 0000801e 0000000a bld.mimg(aco_opcode::image_store, op_s8, Operand(s4), op_v4, op_v1); //~gfx11! 
image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0300f04 00100a14 @@ -907,14 +935,21 @@ BEGIN_TEST(assembler.flat) bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84); /* Various flags */ - ac_hw_cache_flags cache_coherent; - ac_hw_cache_flags cache_sys_coherent; - ac_hw_cache_flags cache_non_temporal; - ac_hw_cache_flags cache_atomic_rtn; - cache_coherent.value = ac_glc; - cache_sys_coherent.value = ac_slc; - cache_non_temporal.value = ac_dlc; - cache_atomic_rtn.value = ac_glc; + ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}}; + ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}}; + if (gfx >= GFX12) { + cache_coherent.gfx12.scope = gfx12_scope_device; + cache_sys_coherent.gfx12.scope = gfx12_scope_memory; + cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal; + cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return; + } else { + cache_coherent.value = ac_glc; + cache_sys_coherent.value = ac_slc; + cache_non_temporal.value = ac_dlc; + cache_atomic_rtn.value = ac_glc; + } //~gfx11! flat_load_b32 v42, v[20:21] slc ; dc508000 2a7c0014 //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS ; ec05007c 000c002a 00000014 @@ -922,17 +957,18 @@ BEGIN_TEST(assembler.flat) cache_sys_coherent; //~gfx11! flat_load_b32 v42, v[20:21] glc ; dc504000 2a7c0014 - //~gfx12! (then repeated 2 times) + //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_DEV ; ec05007c 0008002a 00000014 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache = cache_coherent; //~gfx11! flat_load_b32 v42, v[20:21] dlc ; dc502000 2a7c0014 + //~gfx12! flat_load_b32 v42, v[20:21] th:TH_LOAD_NT ; ec05007c 0010002a 00000014 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache = cache_non_temporal; /* Stores */ //~gfx11! flat_store_b32 v[20:21], v10 ; dc680000 007c0a14 - //~gfx12! 
flat_store_b32 v[20:21], v10 scope:SCOPE_SYS ; ec06807c 050c0000 00000014 + //~gfx12! flat_store_b32 v[20:21], v10 ; ec06807c 05000000 00000014 bld.flat(aco_opcode::flat_store_dword, op_v2, Operand(s1), op_v1); /* Atomic with return */