Warp Scheduling + Control Divergence working and stable

This commit is contained in:
felsabbagh3
2019-02-22 07:00:35 -05:00
parent 087a39ccf4
commit af4303a4ca
13 changed files with 298246 additions and 496 deletions

View File

@@ -47,7 +47,7 @@ ostream &Harp::operator<<(ostream& os, Instruction &inst) {
return os;
}
bool checkUnanimous(unsigned p, const std::vector<std::vector<Reg<bool> > >& m,
bool checkUnanimous(unsigned p, const std::vector<std::vector<Reg<Word> > >& m,
const std::vector<bool> &tm) {
bool same;
unsigned i;
@@ -57,15 +57,19 @@ bool checkUnanimous(unsigned p, const std::vector<std::vector<Reg<bool> > >& m,
break;
}
}
if (i == m.size())
throw DivergentBranchException();
if (i == m.size()) throw DivergentBranchException();
std::cout << "same: " << same << " with -> ";
for (; i < m.size(); ++i) {
if (tm[i]) {
std::cout << " " << (bool(m[i][p]));
if (same != (bool(m[i][p]))) {
std::cout << " FALSE\n";
return false;
}
}
}
std::cout << " TRUE\n";
return true;
}
@@ -122,11 +126,20 @@ void Instruction::executeOn(Warp &c) {
vector<Reg<bool> > &pReg(c.pred[t]);
stack<DomStackEntry> &domStack(c.domStack);
std::cout << std::hex << "opcode: " << op << " func3: " << func3 << "\n";
if (op == GPGPU) std::cout << "OPCODE MATCHED GPGPU\n";
// If this thread is masked out, don't execute the instruction, unless it's
// a split or join.
// if (((predicated && !pReg[pred]) || !c.tmask[t]) &&
// op != SPLIT && op != JOIN) continue;
predicated = (op == GPGPU) && ((func3 == 7) || (func3 == 2));
bool split = (op == GPGPU) && (func3 == 2);
bool join = (op == GPGPU) && (func3 == 3);
if (((predicated && !reg[pred]) || !c.tmask[t]) && !split && !join) continue;
++c.insts;
Word memAddr;
@@ -134,13 +147,14 @@ void Instruction::executeOn(Warp &c) {
Word shamt;
Word temp;
Word data_read;
// Word pred;
DomStackEntry e(pred, c.reg, c.tmask, c.pc);
int op1, op2;
switch (op) {
case NOP: break;
case NOP:
std::cout << "NOP_INST\n";
break;
case R_INST:
std::cout << "R_INST\n";
switch (func3)
{
case 0:
@@ -207,7 +221,7 @@ void Instruction::executeOn(Warp &c) {
break;
case L_INST:
std::cout << "L_INST\n";
memAddr = ((reg[rsrc[0]] + immsrc) & 0xFFFFFFFC);
shift_by = ((reg[rsrc[0]] + immsrc) & 0x00000003) * 8;
data_read = c.core->mem.read(memAddr, c.supervisorMode);
@@ -245,6 +259,7 @@ void Instruction::executeOn(Warp &c) {
}
break;
case I_INST:
std::cout << "I_INST\n";
switch (func3)
{
@@ -330,6 +345,7 @@ void Instruction::executeOn(Warp &c) {
}
break;
case S_INST:
std::cout << "S_INST\n";
++c.stores;
memAddr = reg[rsrc[0]] + immsrc;
// std::cout << "STORE MEM ADDRESS: " << std::hex << reg[rsrc[0]] << " + " << immsrc << "\n";
@@ -359,6 +375,7 @@ void Instruction::executeOn(Warp &c) {
#endif
break;
case B_INST:
std::cout << "B_INST\n";
switch (func3)
{
case 0:
@@ -412,13 +429,17 @@ void Instruction::executeOn(Warp &c) {
}
break;
case LUI_INST:
std::cout << "LUI_INST\n";
reg[rdest] = (immsrc << 12) & 0xfffff000;
break;
case AUIPC_INST:
std::cout << "AUIPC_INST\n";
reg[rdest] = ((immsrc << 12) & 0xfffff000) + (c.pc - 4);
break;
case JAL_INST:
std::cout << "JAL_INST\n";
if (!pcSet) nextPc = (c.pc - 4) + immsrc;
if (!pcSet) std::cout << "JAL... SETTING PC: " << nextPc << "\n";
if (rdest != 0)
{
reg[rdest] = c.pc;
@@ -426,8 +447,9 @@ void Instruction::executeOn(Warp &c) {
pcSet = true;
break;
case JALR_INST:
std::cout << "JALR_INST\n";
if (!pcSet) nextPc = reg[rsrc[0]] + immsrc;
if (!pcSet) std::cout << "JALR... SETTING PC: " << nextPc << "\n";
if (rdest != 0)
{
reg[rdest] = c.pc;
@@ -435,6 +457,7 @@ void Instruction::executeOn(Warp &c) {
pcSet = true;
break;
case SYS_INST:
std::cout << "SYS_INST\n";
temp = reg[rsrc[0]];
switch (func3)
{
@@ -505,65 +528,107 @@ void Instruction::executeOn(Warp &c) {
c.interrupt(0);
break;
case FENCE:
std::cout << "FENCE_INST\n";
break;
case PJ_INST:
// pred jump reg
std::cout << "pred jump... src: " << rsrc[0] << std::hex << " val: " << reg[rsrc[0]] << " dest: " << reg[rsrc[1]] << "\n";
if (reg[rsrc[0]])
{
if (!pcSet) nextPc = reg[rsrc[1]];
pcSet = true;
}
break;
case GPGPU:
std::cout << "GPGPU\n";
switch(func3)
{
case 0:
// WSPAWN
D(0, "Spawning a new warp.");
// std::cout << "SIZE: " << c.core->w.size() << "\n";
for (unsigned i = 0; i < c.core->w.size(); ++i)
std::cout << "WSPAWN\n";
if (sjOnce)
{
// std::cout << "WHATTT\n";
Warp &newWarp(c.core->w[i]);
// std::cout << "STARTING\n";
if (newWarp.spawned == false) {
// std::cout << "ABOUT TO START\n";
newWarp.pc = reg[rsrc[0]];
newWarp.reg[0] = reg;
newWarp.csr = c.csr;
newWarp.activeThreads = 1;
newWarp.supervisorMode = false;
newWarp.spawned = true;
break;
sjOnce = false;
D(0, "Spawning a new warp.");
// std::cout << "SIZE: " << c.core->w.size() << "\n";
for (unsigned i = 0; i < c.core->w.size(); ++i)
{
// std::cout << "WHATTT\n";
Warp &newWarp(c.core->w[i]);
// std::cout << "STARTING\n";
if (newWarp.spawned == false) {
// std::cout << "ABOUT TO START\n";
newWarp.pc = reg[rsrc[0]];
newWarp.reg[0] = reg;
newWarp.csr = c.csr;
newWarp.activeThreads = 1;
newWarp.supervisorMode = false;
newWarp.spawned = true;
break;
}
}
}
break;
case 2:
{
// SPLIT
c.domStack.push(c.tmask);
c.domStack.push(e);
for (unsigned i = 0; i < e.tmask.size(); ++i)
std::cout << "SPLIT\n";
if (sjOnce)
{
c.tmask[i] = !e.tmask[i] && c.tmask[i];
sjOnce = false;
if (checkUnanimous(pred, c.reg, c.tmask)) {
std::cout << "Unanimous pred: " << pred << " val: " << reg[pred] << "\n";
DomStackEntry e(c.tmask);
e.uni = true;
c.domStack.push(e);
break;
}
DomStackEntry e(pred, c.reg, c.tmask, c.pc);
c.domStack.push(c.tmask);
c.domStack.push(e);
for (unsigned i = 0; i < e.tmask.size(); ++i)
{
c.tmask[i] = !e.tmask[i] && c.tmask[i];
}
}
}
break;
case 3:
// JOIN
if (!c.domStack.top().fallThrough) {
if (!pcSet) nextPc = c.domStack.top().pc;
pcSet = true;
std::cout << "JOIN\n";
if (sjOnce)
{
sjOnce = false;
if (!c.domStack.empty() && c.domStack.top().uni) {
D(2, "Uni branch at join");
c.tmask = c.domStack.top().tmask;
c.domStack.pop();
break;
}
if (!c.domStack.top().fallThrough) {
if (!pcSet) nextPc = c.domStack.top().pc;
pcSet = true;
}
c.tmask = c.domStack.top().tmask;
c.domStack.pop();
}
c.tmask = c.domStack.top().tmask;
c.domStack.pop();
break;
case 4:
// JMPRT
std::cout << "JMPRT\n";
nextActiveThreads = 1;
if (!pcSet) nextPc = reg[rsrc[0]];
pcSet = true;
break;
case 5:
// CLONE
// std::cout << "CLONE\n";
std::cout << "CLONE\n";
// std::cout << "CLONING REG: " << rsrc[0] << " lane: " << reg[rsrc[0]] << "\n";
c.reg[reg[rsrc[0]]] = reg;
break;
case 6:
// JALRS
std::cout << "JALRS\n";
nextActiveThreads = reg[rsrc[1]];
reg[rdest] = c.pc;
if (!pcSet) nextPc = reg[rsrc[0]];
@@ -571,14 +636,6 @@ void Instruction::executeOn(Warp &c) {
// std::cout << "ACTIVE_THREDS: " << rsrc[1] << " val: " << reg[rsrc[1]] << "\n";
// std::cout << "nextPC: " << rsrc[0] << " val: " << std::hex << reg[rsrc[0]] << "\n";
break;
case 7:
// pred jump reg
if (reg[rsrc[0]])
{
nextPc = reg[rsrc[1]];
pcSet = true;
}
break;
default:
cout << "ERROR: UNSUPPORTED GPGPU INSTRUCTION " << *this << "\n";
}
@@ -593,6 +650,20 @@ void Instruction::executeOn(Warp &c) {
c.activeThreads = nextActiveThreads;
// if (nextActiveThreads != 0)
// {
// for (int i = 7; i >= c.activeThreads; i--)
// {
// c.tmask[i] = c.tmask[i] && false;
// }
// }
// std::cout << "new thread mask: ";
// for (int i = 0; i < c.tmask.size(); ++i) std::cout << " " << c.tmask[i];
// std::cout << "\n";
// This way, if pc was set by a side effect (such as interrupt), it will
// retain its new value.
if (pcSet) c.pc = nextPc;