Fixes: texture unit memory access sometimes going to smem, bilinear texture filtering; new: cache req_id, …

This commit is contained in:
Blaise Tine
2021-11-24 00:00:17 -05:00
parent 1501360f4b
commit 18762dffce
70 changed files with 3818 additions and 1727 deletions

View File

@@ -49,11 +49,12 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid)
}
}
void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
assert(tmask_.any());
Word nextPC = PC_ + core_->arch().wsize();
Word func2 = instr.getFunc2();
Word func3 = instr.getFunc3();
Word func6 = instr.getFunc6();
Word func7 = instr.getFunc7();
@@ -117,8 +118,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
case NOP:
break;
case LUI_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::ARITH;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -127,8 +128,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true;
break;
case AUIPC_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::ARITH;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -137,10 +138,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true;
break;
case R_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -149,7 +150,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
case 0:
// MUL
rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]);
pipeline_state->alu.type = AluType::IMUL;
trace->alu.type = AluType::IMUL;
break;
case 1: {
// MULH
@@ -163,7 +164,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
uint64_t result = first * second;
rddata[t] = (result >> 32) & 0xFFFFFFFF;
pipeline_state->alu.type = AluType::IMUL;
trace->alu.type = AluType::IMUL;
} break;
case 2: {
// MULHSU
@@ -173,14 +174,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
int64_t second = (int64_t)rsdata[t][1];
rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
pipeline_state->alu.type = AluType::IMUL;
trace->alu.type = AluType::IMUL;
} break;
case 3: {
// MULHU
uint64_t first = (uint64_t)rsdata[t][0];
uint64_t second = (uint64_t)rsdata[t][1];
rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
pipeline_state->alu.type = AluType::IMUL;
trace->alu.type = AluType::IMUL;
} break;
case 4: {
// DIV
@@ -193,7 +194,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else {
rddata[t] = dividen / divisor;
}
pipeline_state->alu.type = AluType::IDIV;
trace->alu.type = AluType::IDIV;
} break;
case 5: {
// DIVU
@@ -204,7 +205,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else {
rddata[t] = dividen / divisor;
}
pipeline_state->alu.type = AluType::IDIV;
trace->alu.type = AluType::IDIV;
} break;
case 6: {
// REM
@@ -217,7 +218,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else {
rddata[t] = dividen % divisor;
}
pipeline_state->alu.type = AluType::IDIV;
trace->alu.type = AluType::IDIV;
} break;
case 7: {
// REMU
@@ -228,7 +229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else {
rddata[t] = dividen % divisor;
}
pipeline_state->alu.type = AluType::IDIV;
trace->alu.type = AluType::IDIV;
} break;
default:
std::abort();
@@ -285,9 +286,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true;
break;
case I_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH;
pipeline_state->used_iregs[rsrc0] = 1;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -336,10 +337,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true;
break;
case B_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::BRANCH;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -385,107 +386,149 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
break; // runonce
}
pipeline_state->stall_warp = true;
trace->fetch_stall = true;
break;
case JAL_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::BRANCH;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::BRANCH;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t] = nextPC;
nextPC = PC_ + immsrc;
pipeline_state->stall_warp = true;
trace->fetch_stall = true;
break; // runonce
}
rd_write = true;
break;
case JALR_INST:
pipeline_state->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::BRANCH;
pipeline_state->used_iregs[rsrc0] = 1;
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t] = nextPC;
nextPC = rsdata[t][0] + immsrc;
pipeline_state->stall_warp = true;
trace->fetch_stall = true;
break; // runOnce
}
rd_write = true;
break;
case L_INST:
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::LOAD;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
Word data_read = core_->dcache_read(memAddr, 4);
pipeline_state->mem_addrs.at(t) = memAddr;
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
switch (func3) {
case 0:
// LBI
rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
break;
case 1:
// LHI
rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
break;
case 2:
// LW
rddata[t] = data_read;
break;
case 4:
// LBU
rddata[t] = Word((data_read >> shift_by) & 0xFF);
break;
case 5:
// LHU
rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
break;
default:
std::abort();
case FL:
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
if (opcode == L_INST
|| (opcode == FL && func3 == 2)) {
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
Word data_read = core_->dcache_read(memAddr, 4);
trace->mem_addrs.at(t).push_back(memAddr);
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
switch (func3) {
case 0:
// LBI
rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
break;
case 1:
// LHI
rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
break;
case 2:
// LW
rddata[t] = data_read;
break;
case 4:
// LBU
rddata[t] = Word((data_read >> shift_by) & 0xFF);
break;
case 5:
// LHU
rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
break;
default:
std::abort();
}
}
}
rd_write = true;
break;
case S_INST:
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::STORE;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (func3) {
case 0:
// SB
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
break;
case 1:
// SH
core_->dcache_write(memAddr, rsdata[t][1], 2);
break;
case 2:
// SW
core_->dcache_write(memAddr, rsdata[t][1], 4);
break;
} else {
DP(4, "Executing vector load");
DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
DP(4, "dest: v" << rdest);
DP(4, "width" << instr.getVlsWidth());
auto &vd = vRegFile_.at(rdest);
switch (instr.getVlsWidth()) {
case 6: {
// load word and unit strided (not checking for unit stride)
for (int i = 0; i < vl_; i++) {
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr);
Word data_read = core_->dcache_read(memAddr, 4);
DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
int *result_ptr = (int *)(vd.data() + i);
*result_ptr = data_read;
}
} break;
default:
std::abort();
}
}
rd_write = true;
break;
case S_INST:
case FS:
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::STORE;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
if (opcode == S_INST
|| (opcode == FS && func3 == 2)) {
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
trace->mem_addrs.at(t).push_back(memAddr);
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (func3) {
case 0:
// SB
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
break;
case 1:
// SH
core_->dcache_write(memAddr, rsdata[t][1], 2);
break;
case 2:
// SW
core_->dcache_write(memAddr, rsdata[t][1], 4);
break;
default:
std::abort();
}
}
} else {
for (int i = 0; i < vl_; i++) {
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (instr.getVlsWidth()) {
case 6: {
// store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
DP(4, "store: " << memAddr << " value:" << value);
} break;
default:
std::abort();
}
}
}
break;
case SYS_INST:
pipeline_state->exe_type = ExeType::CSR;
trace->exe_type = ExeType::CSR;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -493,30 +536,40 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
Word csr_value = core_->get_csr(csr_addr, t, id_);
switch (func3) {
case 0:
if (csr_addr < 2) {
// ECALL/EBREAK
switch (csr_addr) {
case 0: // ECALL
core_->trigger_ecall();
break;
case 1: // EBREAK
core_->trigger_ebreak();
}
break;
case 0x002: // URET
case 0x102: // SRET
case 0x302: // MRET
break;
default:
std::abort();
}
break;
case 1:
// CSRRW
rddata[t] = csr_value;
core_->set_csr(csr_addr, rsdata[t][0], t, id_);
pipeline_state->used_iregs[rsrc0] = 1;
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 2:
// CSRRS
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
pipeline_state->used_iregs[rsrc0] = 1;
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 3:
// CSRRC
rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
pipeline_state->used_iregs[rsrc0] = 1;
trace->used_iregs.set(rsrc0);
rd_write = true;
break;
case 5:
@@ -543,88 +596,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
break;
case FENCE:
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::FENCE;
pipeline_state->stall_warp = true;
break;
case (FL | VL):
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::LOAD;
pipeline_state->used_iregs[rsrc0] = 1;
if (func3 == 0x2) {
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
Word data_read = core_->dcache_read(memAddr, 4);
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
rddata[t] = data_read;
}
} else {
DP(3, "Executing vector load");
DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
DP(3, "dest: v" << rdest);
DP(3, "width" << instr.getVlsWidth());
pipeline_state->mem_addrs.resize(vl_);
auto &vd = vRegFile_.at(rdest);
switch (instr.getVlsWidth()) {
case 6: {
// load word and unit strided (not checking for unit stride)
for (int i = 0; i < vl_; i++) {
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
pipeline_state->mem_addrs.at(i) = memAddr;
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
Word data_read = core_->dcache_read(memAddr, 4);
DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
int *result_ptr = (int *)(vd.data() + i);
*result_ptr = data_read;
}
} break;
default:
std::abort();
}
break;
}
rd_write = true;
break;
case (FS | VS):
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::STORE;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
if (func3 == 0x2) {
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
core_->dcache_write(memAddr, rsdata[t][1], 4);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
}
} else {
pipeline_state->mem_addrs.resize(vl_);
for (int i = 0; i < vl_; i++) {
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
pipeline_state->mem_addrs.at(i) = memAddr;
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (instr.getVlsWidth()) {
case 6: {
//store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
DP(3, "store: " << memAddr << " value:" << value);
} break;
default:
std::abort();
}
}
}
break;
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::FENCE;
trace->fetch_stall = true;
break;
case FCI:
pipeline_state->exe_type = ExeType::FPU;
trace->exe_type = ExeType::FPU;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -633,32 +610,32 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
switch (func7) {
case 0x00: //FADD
rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x04: //FSUB
rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x08: //FMUL
rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x0c: //FDIV
rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FDIV;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FDIV;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x2c: //FSQRT
rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags);
pipeline_state->fpu.type = FpuType::FSQRT;
pipeline_state->used_fregs[rsrc0] = 1;
trace->fpu.type = FpuType::FSQRT;
trace->used_fregs.set(rsrc0);
break;
case 0x10:
switch (func3) {
@@ -672,9 +649,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]);
break;
}
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x14:
if (func3) {
@@ -684,9 +661,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FMIN.S
rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags);
}
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x60:
if (rsrc1 == 0) {
@@ -696,8 +673,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FCVT.WU.S
rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags);
}
pipeline_state->fpu.type = FpuType::FCVT;
pipeline_state->used_fregs[rsrc0] = 1;
trace->fpu.type = FpuType::FCVT;
trace->used_fregs.set(rsrc0);
break;
case 0x70:
if (func3) {
@@ -707,8 +684,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FMV.X.W
rddata[t] = rsdata[t][0];
}
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1;
trace->fpu.type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
case 0x50:
switch(func3) {
@@ -725,9 +702,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags);
break;
}
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
trace->fpu.type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
case 0x68:
if (rsrc1) {
@@ -737,14 +714,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FCVT.S.W:
rddata[t] = rv_itof(rsdata[t][0], frm, &fflags);
}
pipeline_state->fpu.type = FpuType::FCVT;
pipeline_state->used_iregs[rsrc0] = 1;
trace->fpu.type = FpuType::FCVT;
trace->used_iregs.set(rsrc0);
break;
case 0x78:
// FMV.W.X
rddata[t] = rsdata[t][0];
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_iregs[rsrc0] = 1;
trace->fpu.type = FpuType::FNCP;
trace->used_iregs.set(rsrc0);
break;
}
update_fcrs(fflags, core_, t, id_);
@@ -755,10 +732,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
case FMSUB:
case FMNMADD:
case FMNMSUB:
pipeline_state->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1;
pipeline_state->used_fregs[rsrc1] = 1;
pipeline_state->used_fregs[rsrc2] = 1;
trace->fpu.type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
trace->used_fregs.set(rsrc2);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -784,8 +761,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
rd_write = true;
break;
case GPGPU: {
pipeline_state->exe_type = ExeType::GPU;
case GPGPU: {
int ts = 0;
for (int t = 0; t < num_threads; ++t) {
if (tmask_.test(t)) {
@@ -795,10 +771,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
switch (func3) {
case 0: {
// TMC
pipeline_state->gpu.type = GpuType::TMC;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->stall_warp = true;
// TMC
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::TMC;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (rsrc1) {
// predicate mode
ThreadMask pred;
@@ -823,10 +800,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break;
case 1: {
// WSPAWN
pipeline_state->gpu.type = GpuType::WSPAWN;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->stall_warp = true;
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::WSPAWN;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
for (int i = 1; i < active_warps; ++i) {
@@ -837,9 +815,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break;
case 2: {
// SPLIT
pipeline_state->gpu.type = GpuType::SPLIT;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->stall_warp = true;
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
ThreadMask tmask;
for (int i = 0; i < num_threads; ++i) {
@@ -868,8 +847,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break;
case 3: {
// JOIN
pipeline_state->gpu.type = GpuType::JOIN;
pipeline_state->stall_warp = true;
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::JOIN;
trace->fetch_stall = true;
if (!domStack_.empty() && domStack_.top().unanimous) {
DP(3, "*** Uninimous branch at join");
tmask_ = domStack_.top().tmask;
@@ -893,18 +873,19 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break;
case 4: {
// BAR
pipeline_state->gpu.type = GpuType::BAR;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->stall_warp = true;
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::BAR;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
active_ = false;
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
} break;
case 6: {
case 5: {
// PREFETCH
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::PREFETCH;
pipeline_state->used_iregs[rsrc0] = 1;
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::PREFETCH;
trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
@@ -915,7 +896,50 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
default:
std::abort();
}
} break;
} break;
case GPU: {
switch (func3) {
case 0: { // TEX
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::TEX;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto unit = func2;
auto u = rsdata[t][0];
auto v = rsdata[t][1];
auto lod = rsdata[t][2];
auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t));
rddata[t] = color;
}
rd_write = true;
} break;
case 1:
switch (func2) {
case 0: { // CMOV
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::CMOV;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t] = rsdata[t][0] ? rsdata[t][1] : rsdata[t][2];
}
rd_write = true;
} break;
default:
std::abort();
}
break;
default:
std::abort();
}
} break;
case VSET: {
int VLEN = core_->arch().vsize() * 8;
int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@@ -966,7 +990,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 24: {
//vmseq
// vmseq
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -997,7 +1021,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 25: {
//vmsne
// vmsne
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1028,7 +1052,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 26: {
//vmsltu
// vmsltu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1059,7 +1083,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 27: {
//vmslt
// vmslt
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1090,7 +1114,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 28: {
//vmsleu
// vmsleu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1121,7 +1145,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 29: {
//vmsle
// vmsle
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1152,7 +1176,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 30: {
//vmsgtu
// vmsgtu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1183,7 +1207,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 31: {
//vmsgt
// vmsgt
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1356,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 27: {
//vmxor
// vmxor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1402,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 28: {
//vmornot
// vmornot
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1448,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 29: {
//vmnand
// vmnand
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1494,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 30: {
//vmnor
// vmnor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1540,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 31: {
//vmxnor
// vmxnor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1586,7 +1610,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
} break;
case 37: {
//vmul
// vmul
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
@@ -1769,7 +1793,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
if (rd_write) {
pipeline_state->wb = true;
trace->wb = true;
DPH(2, "Dest Reg: ");
auto rdt = instr.getRDType();
switch (rdt) {
@@ -1786,7 +1810,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);
pipeline_state->used_iregs[rdest] = 1;
trace->used_iregs[rdest] = 1;
}
break;
case RegType::Float:
@@ -1801,7 +1825,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);
pipeline_state->used_fregs[rdest] = 1;
trace->used_fregs[rdest] = 1;
break;
default:
std::abort();