bfd55711c1
This adds basic flow control support for If-Then-Else blocks using predicates (stored in the EXEC register) and a predicate stack for nested flow control.
162 lines
5.7 KiB
C++
162 lines
5.7 KiB
C++
//===-- SILowerFlowControl.cpp - Use predicates for flow control ----------===//
|
|
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is distributed under the University of Illinois Open Source
|
|
// License. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass lowers the pseudo flow control instructions (SI_IF_NZ, ELSE, ENDIF)
|
|
// to predicated instructions.
|
|
//
|
|
// All flow control (except loops) is handled using predicated instructions and
|
|
// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
|
|
// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
|
|
// by writting to the 64-bit EXEC register (each bit corresponds to a
|
|
// single vector ALU). Typically, for predicates, a vector ALU will write
|
|
// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
|
|
// Vector ALU) and then the ScalarALU will AND the VCC register with the
|
|
// EXEC to update the predicates.
|
|
//
|
|
// For example:
|
|
// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
|
|
// SI_IF_NZ %VCC
|
|
// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
|
|
// ELSE
|
|
// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
|
|
// ENDIF
|
|
//
|
|
// becomes:
|
|
//
|
|
// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask
|
|
// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask
|
|
// S_CBRANCH_EXECZ label0 // This instruction is an
|
|
// // optimization which allows us to
|
|
// // branch if all the bits of
|
|
// // EXEC are zero.
|
|
// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
|
|
//
|
|
// label0:
|
|
// %EXEC = S_NOT_B64 %EXEC // Invert the exec mask for the
|
|
// // Then block.
|
|
// %EXEC = S_AND_B64 %SGPR0, %EXEC
|
|
// S_BRANCH_EXECZ label1 // Use our branch optimization
|
|
// // instruction again.
|
|
// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
|
|
// label1:
|
|
// S_MOV_B64 // Restore the old EXEC value
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
class SILowerFlowControlPass : public MachineFunctionPass {
|
|
|
|
private:
|
|
static char ID;
|
|
const TargetInstrInfo *TII;
|
|
std::vector<unsigned> PredicateStack;
|
|
std::vector<unsigned> UnusedRegisters;
|
|
|
|
void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
|
|
void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
|
|
|
|
public:
|
|
SILowerFlowControlPass(TargetMachine &tm) :
|
|
MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
|
|
|
|
virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
|
|
const char *getPassName() const {
|
|
return "SI Lower flow control instructions";
|
|
}
|
|
|
|
};
|
|
|
|
} // End anonymous namespace
|
|
|
|
char SILowerFlowControlPass::ID = 0;
|
|
|
|
FunctionPass *llvm::createSILowerFlowControlPass(TargetMachine &tm) {
|
|
return new SILowerFlowControlPass(tm);
|
|
}
|
|
|
|
bool SILowerFlowControlPass::runOnMachineFunction(MachineFunction &MF) {
|
|
|
|
// Find all the unused registers that can be used for the predicate stack.
|
|
for (TargetRegisterClass::iterator S = AMDGPU::SReg_64RegClass.begin(),
|
|
I = AMDGPU::SReg_64RegClass.end();
|
|
I != S; --I) {
|
|
unsigned Reg = *I;
|
|
if (!MF.getRegInfo().isPhysRegOrOverlapUsed(Reg)) {
|
|
UnusedRegisters.push_back(Reg);
|
|
}
|
|
}
|
|
|
|
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
BB != BB_E; ++BB) {
|
|
MachineBasicBlock &MBB = *BB;
|
|
for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
|
|
I != MBB.end(); I = Next, Next = llvm::next(I)) {
|
|
MachineInstr &MI = *I;
|
|
switch (MI.getOpcode()) {
|
|
default: break;
|
|
case AMDGPU::SI_IF_NZ:
|
|
pushExecMask(MBB, I);
|
|
BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
|
|
AMDGPU::EXEC)
|
|
.addOperand(MI.getOperand(0)) // VCC
|
|
.addReg(AMDGPU::EXEC);
|
|
MI.eraseFromParent();
|
|
break;
|
|
case AMDGPU::ELSE:
|
|
BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_NOT_B64),
|
|
AMDGPU::EXEC)
|
|
.addReg(AMDGPU::EXEC);
|
|
BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
|
|
AMDGPU::EXEC)
|
|
.addReg(PredicateStack.back())
|
|
.addReg(AMDGPU::EXEC);
|
|
MI.eraseFromParent();
|
|
break;
|
|
case AMDGPU::ENDIF:
|
|
popExecMask(MBB, I);
|
|
MI.eraseFromParent();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void SILowerFlowControlPass::pushExecMask(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator I) {
|
|
|
|
assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
|
|
unsigned StackReg = UnusedRegisters.back();
|
|
UnusedRegisters.pop_back();
|
|
PredicateStack.push_back(StackReg);
|
|
BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
|
|
StackReg)
|
|
.addReg(AMDGPU::EXEC);
|
|
}
|
|
|
|
void SILowerFlowControlPass::popExecMask(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator I) {
|
|
unsigned StackReg = PredicateStack.back();
|
|
PredicateStack.pop_back();
|
|
UnusedRegisters.push_back(StackReg);
|
|
BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
|
|
AMDGPU::EXEC)
|
|
.addReg(StackReg);
|
|
}
|