diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index b8be00c1fcf..3c04d304aad 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -359,9 +359,10 @@ force_waitcnt(wait_ctx& ctx, wait_imm& imm) void kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { - if (debug_flags & DEBUG_FORCE_WAITCNT) { + if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) { /* Force emitting waitcnt states right after the instruction if there is - * something to wait for. + * something to wait for. This is also applied for s_setpc_b64 to ensure + * waitcnt states are inserted before jumping to the PS epilog. */ force_waitcnt(ctx, imm); } diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 1920dfbe6b9..6250b7f285d 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2368,6 +2368,10 @@ lower_to_hw_instr(Program* program) } break; } + case aco_opcode::p_jump_to_epilog: { + bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]); + break; + } default: break; } } else if (instr->isBranch()) { diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 820e09b1989..cb3c731a5b7 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -320,6 +320,8 @@ opcode("p_insert") # src1=index, src2=bits opcode("p_init_scratch") +# jumps to a shader epilog +opcode("p_jump_to_epilog") # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 0edd7862b6d..6cebcf95622 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -574,7 +574,7 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) /* don't move non-reorderable instructions */ if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 || - instr->opcode == aco_opcode::p_init_scratch) + instr->opcode == aco_opcode::p_init_scratch || instr->opcode == aco_opcode::p_jump_to_epilog) return hazard_fail_unreorderable; memory_event_set instr_set; diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index db013e18353..72a033abb56 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -261,6 +261,7 @@ validate_ir(Program* program) bool flat = instr->isFlatLike(); bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() || instr->opcode == aco_opcode::p_create_vector || + instr->opcode == aco_opcode::p_jump_to_epilog || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) || (instr->isScratch() && i == 0); @@ -511,6 +512,18 @@ validate_ir(Program* program) unsigned comp = data_bits / MAX2(op_bits, 1); check(instr->operands[1].constantValue() < comp, "Index must be in-bounds", instr.get()); + } else if (instr->opcode == aco_opcode::p_jump_to_epilog) { + check(instr->definitions.size() == 0, "p_jump_to_epilog must have 0 definitions", + instr.get()); + check(instr->operands.size() > 0 && + instr->operands[0].getTemp().type() == RegType::sgpr && + instr->operands[0].getTemp().size() == 2, + "First operand of p_jump_to_epilog must be a SGPR", instr.get()); + for (unsigned i = 1; i < instr->operands.size(); i++) { + check(instr->operands[i].getTemp().type() == RegType::vgpr || + instr->operands[i].isUndefined(), + "Other operands of p_jump_to_epilog must be VGPRs or undef", instr.get()); + } } break; }