This commit is contained in:
Blaise Tine
2021-02-28 20:14:42 -05:00
45 changed files with 2465 additions and 2384 deletions

View File

@@ -213,7 +213,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t dcache_miss_w_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_miss_w_per_core) / double(dcache_writes_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache wrire misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_w_per_core, dcache_write_hit_ratio);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_w_per_core, dcache_write_hit_ratio);
dcache_write_misses += dcache_miss_w_per_core;
// bank_stalls
uint64_t dcache_bank_st_per_core;

View File

@@ -185,7 +185,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
*value = IMPLEMENTATION_ID;
break;
case VX_CAPS_MAX_CORES:
*value = NUM_CORES;
*value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_MAX_WARPS:
*value = NUM_WARPS;

View File

@@ -12,8 +12,8 @@ CXXFLAGS += -DDUMP_PERF_STATS
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
CXXFLAGS += $(CONFIGS)
@@ -21,7 +21,7 @@ LDFLAGS += -shared -pthread
#LDFLAGS += -dynamiclib -pthread
SRCS = vortex.cpp ../common/vx_utils.cpp
SRCS += $(SIMX_DIR)/util.cpp $(SIMX_DIR)/args.cpp $(SIMX_DIR)/mem.cpp $(SIMX_DIR)/core.cpp $(SIMX_DIR)/warp.cpp $(SIMX_DIR)/instr.cpp $(SIMX_DIR)/decode.cpp $(SIMX_DIR)/execute.cpp
SRCS += $(SIMX_DIR)/util.cpp $(SIMX_DIR)/args.cpp $(SIMX_DIR)/mem.cpp $(SIMX_DIR)/warp.cpp $(SIMX_DIR)/core.cpp $(SIMX_DIR)/decode.cpp $(SIMX_DIR)/execute.cpp
# Debugging
ifdef DEBUG

View File

@@ -144,19 +144,18 @@ private:
void run() {
vortex::ArchDef arch("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS);
vortex::Decoder decoder(arch);
vortex::MemoryUnit mu(PAGE_SIZE, arch.getWordSize(), true);
vortex::MemoryUnit mu(PAGE_SIZE, arch.wsize(), true);
mu.attach(ram_, 0);
std::vector<std::shared_ptr<vortex::Core>> cores(NUM_CORES);
for (size_t i = 0; i < NUM_CORES; ++i) {
cores[i] = std::make_shared<vortex::Core>(arch, decoder, mu);
std::vector<std::shared_ptr<vortex::Core>> cores(arch.num_cores());
for (int i = 0; i < arch.num_cores(); ++i) {
cores[i] = std::make_shared<vortex::Core>(arch, decoder, mu, i);
}
bool running;
do {
running = false;
for (size_t i = 0; i < NUM_CORES; ++i) {
for (int i = 0; i < arch.num_cores(); ++i) {
if (!cores[i]->running())
continue;
running = true;
@@ -236,7 +235,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
*value = IMPLEMENTATION_ID;
break;
case VX_CAPS_MAX_CORES:
*value = NUM_CORES;
*value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_MAX_WARPS:
*value = NUM_WARPS;

View File

@@ -8,7 +8,7 @@
`endif
`ifndef NUM_CORES
`define NUM_CORES 4
`define NUM_CORES 1
`endif
`ifndef NUM_WARPS
@@ -235,11 +235,6 @@
// Pipeline Queues ////////////////////////////////////////////////////////////
// Size of instruction queue
`ifndef IBUF_SIZE
`define IBUF_SIZE 4
`endif
// Size of LSU Request Queue
`ifndef LSUQ_SIZE
`define LSUQ_SIZE 8

View File

@@ -7,7 +7,7 @@ module VX_csr_data #(
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
`endif
@@ -123,61 +123,61 @@ module VX_csr_data #(
`ifdef PERF_ENABLE
// PERF: pipeline
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibf_stalls[63:32];
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[43:32]);
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scb_stalls[63:32];
`CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[43:32]);
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_r = perf_pipeline_if.alu_stalls[63:32];
`CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[43:32]);
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32];
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[43:32]);
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32];
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[43:32]);
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32];
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[43:32]);
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32];
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[43:32]);
// PERF: icache
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
`CSR_MPM_ICACHE_READS_H : read_data_r = perf_memsys_if.icache_reads[63:32];
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[43:32]);
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = perf_memsys_if.icache_read_misses[63:32];
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[43:32]);
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_pipe_stalls[63:32];
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[43:32]);
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_crsp_stalls[63:32];
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[43:32]);
// PERF: dcache
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
`CSR_MPM_DCACHE_READS_H : read_data_r = perf_memsys_if.dcache_reads[63:32];
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[43:32]);
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
`CSR_MPM_DCACHE_WRITES_H : read_data_r = perf_memsys_if.dcache_writes[63:32];
`CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[43:32]);
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_read_misses[63:32];
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[43:32]);
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_write_misses[63:32];
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[43:32]);
`CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0];
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = perf_memsys_if.dcache_bank_stalls[63:32];
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[43:32]);
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = perf_memsys_if.dcache_mshr_stalls[63:32];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[43:32]);
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_pipe_stalls[63:32];
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[43:32]);
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_crsp_stalls[63:32];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[43:32]);
// PERF: smem
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_r = perf_memsys_if.smem_reads[63:32];
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[43:32]);
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_r = perf_memsys_if.smem_writes[63:32];
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[43:32]);
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = perf_memsys_if.smem_bank_stalls[63:32];
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[43:32]);
// PERF: DRAM
`CSR_MPM_DRAM_READS : read_data_r = perf_memsys_if.dram_reads[31:0];
`CSR_MPM_DRAM_READS_H : read_data_r = perf_memsys_if.dram_reads[63:32];
`CSR_MPM_DRAM_READS_H : read_data_r = 32'(perf_memsys_if.dram_reads[43:32]);
`CSR_MPM_DRAM_WRITES : read_data_r = perf_memsys_if.dram_writes[31:0];
`CSR_MPM_DRAM_WRITES_H : read_data_r = perf_memsys_if.dram_writes[63:32];
`CSR_MPM_DRAM_WRITES_H : read_data_r = 32'(perf_memsys_if.dram_writes[43:32]);
`CSR_MPM_DRAM_ST : read_data_r = perf_memsys_if.dram_stalls[31:0];
`CSR_MPM_DRAM_ST_H : read_data_r = perf_memsys_if.dram_stalls[63:32];
`CSR_MPM_DRAM_ST_H : read_data_r = 32'(perf_memsys_if.dram_stalls[43:32]);
`CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0];
`CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32];
`CSR_MPM_DRAM_LAT_H : read_data_r = 32'(perf_memsys_if.dram_latency[43:32]);
`endif
`CSR_SATP : read_data_r = 32'(csr_satp);
@@ -195,9 +195,9 @@ module VX_csr_data #(
`CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]);
`CSR_CYCLE : read_data_r = csr_cycle[31:0];
`CSR_CYCLE_H : read_data_r = csr_cycle[63:32];
`CSR_CYCLE_H : read_data_r = 32'(csr_cycle[43:32]);
`CSR_INSTRET : read_data_r = csr_instret[31:0];
`CSR_INSTRET_H : read_data_r = csr_instret[63:32];
`CSR_INSTRET_H : read_data_r = 32'(csr_instret[43:32]);
`CSR_MVENDORID : read_data_r = `VENDOR_ID;
`CSR_MARCHID : read_data_r = `ARCHITECTURE_ID;

View File

@@ -45,8 +45,7 @@ module VX_csr_io_arb (
// responses
wire csr_io_rsp_ready;
VX_skid_buffer #(
.DATAW (32),
.BUFFERED (1)
.DATAW (32)
) csr_io_out_buffer (
.clk (clk),
.reset (reset),

View File

@@ -39,8 +39,7 @@ module VX_databus_arb (
&& (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT));
VX_skid_buffer #(
.DATAW (REQ_DATAW),
.BUFFERED (1)
.DATAW (REQ_DATAW)
) cache_out_buffer (
.clk (clk),
.reset (reset),
@@ -53,8 +52,7 @@ module VX_databus_arb (
);
VX_skid_buffer #(
.DATAW (REQ_DATAW),
.BUFFERED (1)
.DATAW (REQ_DATAW)
) smem_out_buffer (
.clk (clk),
.reset (reset),

View File

@@ -18,20 +18,15 @@ module VX_decode #(
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type;
reg [`OP_BITS-1:0] op_type;
reg [`MOD_BITS-1:0] op_mod;
reg [31:0] imm;
reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm;
reg rd_fp, rs1_fp, rs2_fp;
reg is_join, is_wstall;
wire [31:0] instr = ifetch_rsp_if.instr;
reg [`ALU_BITS-1:0] alu_op;
reg [`BR_BITS-1:0] br_op;
reg [`MUL_BITS-1:0] mul_op;
reg [`LSU_BITS-1:0] lsu_op;
reg [`CSR_BITS-1:0] csr_op;
reg [`FPU_BITS-1:0] fpu_op;
reg [`GPU_BITS-1:0] gpu_op;
reg [19:0] upper_imm;
reg [31:0] jalx_offset;
reg [31:0] src2_imm;
wire [6:0] opcode = instr[6:0];
wire [2:0] func3 = instr[14:12];
wire [6:0] func7 = instr[31:25];
@@ -42,360 +37,378 @@ module VX_decode #(
wire [4:0] rs2 = instr[24:20];
wire [4:0] rs3 = instr[31:27];
// opcode types
wire is_rtype = (opcode == `INST_R);
wire is_ltype = (opcode == `INST_L);
wire is_itype = (opcode == `INST_I);
wire is_stype = (opcode == `INST_S);
wire is_btype = (opcode == `INST_B);
wire is_jal = (opcode == `INST_JAL);
wire is_jalr = (opcode == `INST_JALR);
wire is_lui = (opcode == `INST_LUI);
wire is_auipc = (opcode == `INST_AUIPC);
wire is_jals = (opcode == `INST_SYS) && (func3 == 0);
wire is_csr = (opcode == `INST_SYS) && (func3 != 0);
wire is_gpu = (opcode == `INST_GPU);
// upper immediate
wire [19:0] upper_imm = {func7, rs2, rs1, func3};
wire [11:0] alu_imm = ((func3 == 3'h1) || (func3 == 3'h5)) ? {{7{1'b0}}, rs2} : u_12;
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
wire [11:0] jalr_imm = {func7, rs2};
always @(*) begin
case (opcode)
`INST_LUI: upper_imm = {func7, rs2, rs1, func3};
`INST_AUIPC: upper_imm = {func7, rs2, rs1, func3};
default: upper_imm = 20'h0;
endcase
end
// I-type immediate
ex_type = `EX_NOP;
op_type = 'x;
op_mod = 'x;
imm = 'x;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
use_rs3 = 0;
use_PC = 0;
use_imm = 0;
rd_fp = 0;
rs1_fp = 0;
rs2_fp = 0;
is_join = 0;
is_wstall = 0;
wire alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5);
wire [11:0] alu_shift_imm = {{7{1'b0}}, rs2};
wire [11:0] alu_imm = alu_shift_i ? alu_shift_imm : u_12;
always @(*) begin
case (opcode)
`INST_I: src2_imm = {{20{alu_imm[11]}}, alu_imm};
`INST_S,
`INST_FS: src2_imm = {{20{func7[6]}}, func7, rd};
`INST_L,
`INST_FL: src2_imm = {{20{u_12[11]}}, u_12};
`INST_B: src2_imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
default: src2_imm = 'x;
endcase
end
// JAL
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
wire [31:0] jal_offset = {{11{jal_imm[20]}}, jal_imm};
wire [11:0] jalr_imm = {func7, rs2};
wire [31:0] jalr_offset = {{20{jalr_imm[11]}}, jalr_imm};
always @(*) begin
case (opcode)
`INST_JAL: jalx_offset = jal_offset;
`INST_JALR: jalx_offset = jalr_offset;
default: jalx_offset = 32'd4;
endcase
end
// BRANCH
wire is_br = (is_btype || is_jal || is_jalr || is_jals);
always @(*) begin
br_op = `BR_OTHER;
case (opcode)
`INST_B: begin
case (opcode)
`INST_I: begin
ex_type = `EX_ALU;
case (func3)
3'h0: br_op = `BR_EQ;
3'h1: br_op = `BR_NE;
3'h4: br_op = `BR_LT;
3'h5: br_op = `BR_GE;
3'h6: br_op = `BR_LTU;
3'h7: br_op = `BR_GEU;
default:;
3'h0: op_type = `OP_BITS'(`ALU_ADD);
3'h1: op_type = `OP_BITS'(`ALU_SLL);
3'h2: op_type = `OP_BITS'(`ALU_SLT);
3'h3: op_type = `OP_BITS'(`ALU_SLTU);
3'h4: op_type = `OP_BITS'(`ALU_XOR);
3'h5: op_type = (func7[5]) ? `OP_BITS'(`ALU_SRA) : `OP_BITS'(`ALU_SRL);
3'h6: op_type = `OP_BITS'(`ALU_OR);
3'h7: op_type = `OP_BITS'(`ALU_AND);
default:;
endcase
op_mod = 0;
imm = {{20{alu_imm[11]}}, alu_imm};
use_rd = 1;
use_rs1 = 1;
use_imm = 1;
end
`INST_R: begin
    // R-type (register/register) instructions: operands come from rs1 and rs2,
    // the result is written to rd, and everything is dispatched to the ALU unit.
    ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
    // Multiply/divide decoding belongs to the M extension, so it is gated on
    // EXT_M_ENABLE (not on the floating-point switch). func7[0] set selects
    // the MUL/DIV group; func3 picks the specific operation.
    if (func7[0]) begin
        case (func3)
            3'h0: op_type = `OP_BITS'(`MUL_MUL);
            3'h1: op_type = `OP_BITS'(`MUL_MULH);
            3'h2: op_type = `OP_BITS'(`MUL_MULHSU);
            3'h3: op_type = `OP_BITS'(`MUL_MULHU);
            3'h4: op_type = `OP_BITS'(`MUL_DIV);
            3'h5: op_type = `OP_BITS'(`MUL_DIVU);
            3'h6: op_type = `OP_BITS'(`MUL_REM);
            3'h7: op_type = `OP_BITS'(`MUL_REMU);
            default:;
        endcase
        // op_mod bit 1 is the bit tested by `IS_MUL_MOD
        op_mod = 2;
    end else
`endif
    begin
        // Base integer ALU operations; func7[5] distinguishes SUB from ADD
        // and SRA from SRL.
        case (func3)
            3'h0: op_type = (func7[5]) ? `OP_BITS'(`ALU_SUB) : `OP_BITS'(`ALU_ADD);
            3'h1: op_type = `OP_BITS'(`ALU_SLL);
            3'h2: op_type = `OP_BITS'(`ALU_SLT);
            3'h3: op_type = `OP_BITS'(`ALU_SLTU);
            3'h4: op_type = `OP_BITS'(`ALU_XOR);
            3'h5: op_type = (func7[5]) ? `OP_BITS'(`ALU_SRA) : `OP_BITS'(`ALU_SRL);
            3'h6: op_type = `OP_BITS'(`ALU_OR);
            3'h7: op_type = `OP_BITS'(`ALU_AND);
            default:;
        endcase
        op_mod = 0;
    end
    use_rd  = 1;
    use_rs1 = 1;
    use_rs2 = 1;
end
`INST_LUI: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`ALU_LUI);
op_mod = 0;
imm = {upper_imm, 12'(0)};
use_rd = 1;
use_rs1 = 1;
use_imm = 1;
end
`INST_AUIPC: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`ALU_AUIPC);
op_mod = 0;
imm = {upper_imm, 12'(0)};
use_rd = 1;
use_PC = 1;
use_imm = 1;
end
`INST_JAL: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`BR_JAL);
op_mod = 1;
imm = {{11{jal_imm[20]}}, jal_imm};
use_rd = 1;
use_PC = 1;
use_imm = 1;
is_wstall = 1;
end
`INST_JALR: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`BR_JALR);
op_mod = 1;
imm = {{20{jalr_imm[11]}}, jalr_imm};
use_rd = 1;
use_rs1 = 1;
use_imm = 1;
is_wstall = 1;
end
`INST_B: begin
ex_type = `EX_ALU;
case (func3)
3'h0: op_type = `OP_BITS'(`BR_EQ);
3'h1: op_type = `OP_BITS'(`BR_NE);
3'h4: op_type = `OP_BITS'(`BR_LT);
3'h5: op_type = `OP_BITS'(`BR_GE);
3'h6: op_type = `OP_BITS'(`BR_LTU);
3'h7: op_type = `OP_BITS'(`BR_GEU);
default:;
endcase
op_mod = 1;
imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
use_rs1 = 1;
use_rs2 = 1;
use_PC = 1;
use_imm = 1;
is_wstall = 1;
end
`INST_SYS : begin
if (func3 == 0) begin
ex_type = `EX_ALU;
case (u_12)
12'h000: op_type = `OP_BITS'(`BR_ECALL);
12'h001: op_type = `OP_BITS'(`BR_EBREAK);
12'h302: op_type = `OP_BITS'(`BR_MRET);
12'h102: op_type = `OP_BITS'(`BR_SRET);
12'h7B2: op_type = `OP_BITS'(`BR_DRET);
default:;
endcase
op_mod = 1;
imm = 32'd4;
use_rd = 1;
use_PC = 1;
use_imm = 1;
end else begin
ex_type = `EX_CSR;
case (func3[1:0])
2'h0: op_type = `OP_BITS'(`CSR_RW);
2'h1: op_type = `OP_BITS'(`CSR_RW);
2'h2: op_type = `OP_BITS'(`CSR_RS);
2'h3: op_type = `OP_BITS'(`CSR_RC);
default:;
endcase
imm = 32'(u_12);
use_rd = 1;
use_rs1 = !func3[2];
use_imm = func3[2];
end
end
`ifdef EXT_F_ENABLE
`INST_FL,
`endif
`INST_L: begin
ex_type = `EX_LSU;
op_type = `OP_BITS'({1'b0, func3});
imm = {{20{u_12[11]}}, u_12};
use_rd = 1;
use_rs1 = 1;
`ifdef EXT_F_ENABLE
rd_fp = (opcode == `INST_FL);
`endif
end
`ifdef EXT_F_ENABLE
`INST_FS,
`endif
`INST_S: begin
ex_type = `EX_LSU;
op_type = `OP_BITS'({1'b1, func3});
imm = {{20{func7[6]}}, func7, rd};
use_rs1 = 1;
use_rs2 = 1;
`ifdef EXT_F_ENABLE
rs2_fp = (opcode == `INST_FS);
`endif
end
`ifdef EXT_F_ENABLE
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `OP_BITS'(opcode[3:0]);
op_mod = func3;
use_rd = 1;
use_rs1 = 1;
use_rs2 = 1;
use_rs3 = 1;
rd_fp = 1;
rs1_fp = 1;
rs2_fp = 1;
end
`INST_FCI: begin
ex_type = `EX_FPU;
op_mod = func3;
use_rd = 1;
case (func7)
7'h00, // FADD
7'h04, // FSUB
7'h08, // FMUL
7'h0C: // FDIV
begin
op_type = `OP_BITS'(func7[3:0]);
use_rd = 1;
use_rs1 = 1;
use_rs2 = 1;
rd_fp = 1;
rs1_fp = 1;
rs2_fp = 1;
end
7'h2C: begin
op_type = `OP_BITS'(`FPU_SQRT);
use_rs1 = 1;
rd_fp = 1;
rs1_fp = 1;
end
7'h50: begin
op_type = `OP_BITS'(`FPU_CMP);
use_rs1 = 1;
use_rs2 = 1;
rs1_fp = 1;
rs2_fp = 1;
end
7'h60: begin
op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTWUS) : `OP_BITS'(`FPU_CVTWS);
use_rs1 = 1;
rs1_fp = 1;
end
7'h68: begin
op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTSWU) : `OP_BITS'(`FPU_CVTSW);
use_rs1 = 1;
rd_fp = 1;
end
7'h10: begin
// FSGNJ=0, FSGNJN=1, FSGNJX=2
op_type = `OP_BITS'(`FPU_MISC);
op_mod = {1'b0, func3[1:0]};
use_rs1 = 1;
use_rs2 = 1;
rd_fp = 1;
rs1_fp = 1;
rs2_fp = 1;
end
7'h14: begin
// FMIN=3, FMAX=4
op_type = `OP_BITS'(`FPU_MISC);
op_mod = func3[0] ? 4 : 3;
use_rs1 = 1;
use_rs2 = 1;
rs1_fp = 1;
rs2_fp = 1;
end
7'h70: begin
if (func3[0]) begin
// FCLASS
op_type = `OP_BITS'(`FPU_CLASS);
end else begin
// FMV.X.W=5
op_type = `OP_BITS'(`FPU_MISC);
op_mod = 5;
end
use_rs1 = 1;
rs1_fp = 1;
end
7'h78: begin
// FMV.W.X=6
op_type = `OP_BITS'(`FPU_MISC);
op_mod = 6;
rd_fp = 1;
end
default:;
endcase
end
`INST_JAL: br_op = `BR_JAL;
`INST_JALR: br_op = `BR_JALR;
`INST_SYS: begin
if (is_jals && u_12 == 12'h000) br_op = `BR_ECALL;
if (is_jals && u_12 == 12'h001) br_op = `BR_EBREAK;
if (is_jals && u_12 == 12'h302) br_op = `BR_MRET;
if (is_jals && u_12 == 12'h102) br_op = `BR_SRET;
if (is_jals && u_12 == 12'h7B2) br_op = `BR_DRET;
end
default:;
endcase
end
// ALU
always @(*) begin
alu_op = `ALU_OTHER;
if (is_lui) begin
alu_op = `ALU_LUI;
end else if (is_auipc) begin
alu_op = `ALU_AUIPC;
end else if (is_itype || is_rtype) begin
case (func3)
3'h0: alu_op = (is_rtype && func7 == 7'h20) ? `ALU_SUB : `ALU_ADD;
3'h1: alu_op = `ALU_SLL;
3'h2: alu_op = `ALU_SLT;
3'h3: alu_op = `ALU_SLTU;
3'h4: alu_op = `ALU_XOR;
3'h5: alu_op = (func7 == 7'h0) ? `ALU_SRL : `ALU_SRA;
3'h6: alu_op = `ALU_OR;
3'h7: alu_op = `ALU_AND;
default:;
endcase
end
end
// CSR
wire is_csr_imm = is_csr && (func3[2] == 1);
always @(*) begin
csr_op = `CSR_OTHER;
case (func3[1:0])
2'h1: csr_op = `CSR_RW;
2'h2: csr_op = `CSR_RS;
2'h3: csr_op = `CSR_RC;
default:;
endcase
end
// MUL
`ifdef EXT_M_ENABLE
wire is_mul = is_rtype && (func7 == 7'h1);
always @(*) begin
mul_op = `MUL_MUL;
case (func3)
3'h0: mul_op = `MUL_MUL;
3'h1: mul_op = `MUL_MULH;
3'h2: mul_op = `MUL_MULHSU;
3'h3: mul_op = `MUL_MULHU;
3'h4: mul_op = `MUL_DIV;
3'h5: mul_op = `MUL_DIVU;
3'h6: mul_op = `MUL_REM;
3'h7: mul_op = `MUL_REMU;
default:;
endcase
end
`else
wire is_mul = 0;
always @(*) begin
mul_op = `MUL_MUL;
end
`endif
// FPU
`ifdef EXT_F_ENABLE
wire is_fl = (opcode == `INST_FL) && ((func3 == 2));
wire is_fs = (opcode == `INST_FS) && ((func3 == 2));
wire is_fci = (opcode == `INST_FCI);
wire is_fmadd = (opcode == `INST_FMADD);
wire is_fmsub = (opcode == `INST_FMSUB);
wire is_fnmsub = (opcode == `INST_FNMSUB);
wire is_fnmadd = (opcode == `INST_FNMADD);
wire is_fcmp = is_fci && (func7 == 7'h50); // compare
wire is_fcvti = is_fci && (func7 == 7'h60); // convert to int
wire is_fcvtf = is_fci && (func7 == 7'h68); // convert to float
wire is_fmvw_clss = is_fci && (func7 == 7'h70); // move to int + class
wire is_fmvx = is_fci && (func7 == 7'h78); // move to float
wire is_fr4 = is_fmadd || is_fmsub || is_fnmsub || is_fnmadd;
wire is_fpu_no_mem = is_fci || is_fr4;
wire is_fpu = is_fl || is_fs || is_fci || is_fr4;
reg [`MOD_BITS-1:0] frm;
reg is_fsqrt;
always @(*) begin
fpu_op = `FPU_MISC;
frm = func3;
is_fsqrt = 0;
if (is_fr4) begin
case ({is_fmadd, is_fmsub, is_fnmsub, is_fnmadd})
4'b1000: fpu_op = `FPU_MADD;
4'b0100: fpu_op = `FPU_MSUB;
4'b0010: fpu_op = `FPU_NMSUB;
4'b0001: fpu_op = `FPU_NMADD;
default:;
endcase
end else begin
case (func7)
7'h00: fpu_op = `FPU_ADD;
7'h04: fpu_op = `FPU_SUB;
7'h08: fpu_op = `FPU_MUL;
7'h0C: fpu_op = `FPU_DIV;
7'h10: begin
fpu_op = `FPU_MISC;
frm = func3[1] ? 3'b010 : {2'b0, func3[0]};
end
7'h14: begin
fpu_op = `FPU_MISC;
frm = (func3 == 3'h0) ? 3'b011 : 3'b100;
end
7'h2C: begin
fpu_op = `FPU_SQRT;
is_fsqrt = 1;
end
7'h50: fpu_op = `FPU_CMP; // wb to intReg
7'h60: fpu_op = (instr[20]) ? `FPU_CVTWUS : `FPU_CVTWS; // doesn't need rs2, and read rs1 from fpReg, WB to intReg
7'h68: fpu_op = (instr[20]) ? `FPU_CVTSWU : `FPU_CVTSW; // doesn't need rs2, and read rs1 from intReg
7'h70: begin
fpu_op = (func3 == 3'h0) ? `FPU_MISC : `FPU_CLASS;
frm = (func3 == 3'h0) ? 5 : func3;
end
7'h78: begin fpu_op = `FPU_MISC; frm = 6; end
default:;
endcase
end
end
`else
wire is_fl = 0;
wire is_fs = 0;
wire is_fci = 0;
wire is_fcvti = 0;
wire is_fcvtf = 0;
wire is_fmvw_clss = 0;
wire is_fmvx = 0;
wire is_fr4 = 0;
wire is_fpu = 0;
wire is_fpu_no_mem= 0;
wire [2:0] frm = 0;
wire is_fsqrt = 0;
always @(*) begin
fpu_op = `FPU_MISC;
end
`endif
// LSU
wire is_lsu = (is_ltype || is_stype || is_fl || is_fs);
always @(*) begin
lsu_op = (is_fl || is_fs) ? `LSU_SW : func3;
end
// GPU
reg is_gpu_bar, is_qpu_spawn;
always @(*) begin
gpu_op = `GPU_OTHER;
is_gpu_bar = 0;
is_qpu_spawn = 0;
case (func3)
3'h0: gpu_op = `GPU_TMC;
3'h1: begin
gpu_op = `GPU_WSPAWN;
is_qpu_spawn = 1;
end
3'h2: gpu_op = `GPU_SPLIT;
3'h3: gpu_op = `GPU_JOIN;
3'h4: begin
gpu_op = `GPU_BAR;
is_gpu_bar = 1;
`endif
`INST_GPU: begin
ex_type = `EX_GPU;
case (func3)
3'h0: begin
op_type = `OP_BITS'(`GPU_TMC);
use_rs1 = 1;
is_wstall = 1;
end
3'h1: begin
op_type = `OP_BITS'(`GPU_WSPAWN);
use_rs1 = 1;
use_rs2 = 1;
end
3'h2: begin
op_type = `OP_BITS'(`GPU_SPLIT);
use_rs1 = 1;
is_wstall = 1;
end
3'h3: begin
op_type = `OP_BITS'(`GPU_JOIN);
is_join = 1;
end
3'h4: begin
op_type = `OP_BITS'(`GPU_BAR);
use_rs1 = 1;
use_rs2 = 1;
is_wstall = 1;
end
default:;
endcase
end
default:;
endcase
end
///////////////////////////////////////////////////////////////////////////
// disable write to integer register r0
wire use_rd_qual = use_rd && (rd_fp || (rd != 0));
wire use_rd = (is_fl || is_fci || is_fr4)
|| ((is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype) && (rd != 0));
// EX_ALU needs rs1=0 for LUI operation
wire [4:0] rs1_qual = (opcode == `INST_LUI) ? 5'h0 : rs1;
wire use_rs1 = is_fpu
|| is_gpu
|| (is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || !is_csr_imm || is_gpu);
wire use_rs2 = (is_fpu && ~(is_fl || is_fsqrt || is_fcvti || is_fcvtf || is_fmvw_clss || is_fmvx))
|| (is_gpu && (is_gpu_bar || is_qpu_spawn))
|| (is_btype || is_stype || is_rtype);
wire use_rs3 = is_fr4;
wire [4:0] rs1_qual = is_lui ? 5'h0 : rs1;
///////////////////////////////////////////////////////////////////////////
assign decode_if.valid = ifetch_rsp_if.valid;
assign decode_if.wid = ifetch_rsp_if.wid;
assign decode_if.tmask = ifetch_rsp_if.tmask;
assign decode_if.PC = ifetch_rsp_if.PC;
assign decode_if.ex_type = is_gpu ? `EX_GPU :
is_csr ? `EX_CSR :
is_fpu_no_mem ? `EX_FPU :
is_lsu ? `EX_LSU :
(is_br || is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU :
`EX_NOP;
assign decode_if.op_type = is_gpu ? `OP_BITS'(gpu_op) :
is_csr ? `OP_BITS'(csr_op) :
is_mul ? `OP_BITS'(mul_op) :
is_fpu_no_mem ? `OP_BITS'(fpu_op) :
is_lsu ? `OP_BITS'(lsu_op) :
is_br ? `OP_BITS'(br_op) :
`OP_BITS'(alu_op);
assign decode_if.wb = use_rd && (decode_if.ex_type != `EX_NOP);
assign decode_if.valid = ifetch_rsp_if.valid;
assign decode_if.wid = ifetch_rsp_if.wid;
assign decode_if.tmask = ifetch_rsp_if.tmask;
assign decode_if.PC = ifetch_rsp_if.PC;
assign decode_if.ex_type = ex_type;
assign decode_if.op_type = op_type;
assign decode_if.op_mod = op_mod;
assign decode_if.wb = use_rd_qual;
`ifdef EXT_F_ENABLE
wire rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || is_fmvw_clss);
wire rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || is_fmvx));
wire rs2_is_fp = is_fs || is_fr4 || is_fci;
assign decode_if.rd = {rd_is_fp, rd};
assign decode_if.rs1 = {rs1_is_fp, rs1_qual};
assign decode_if.rs2 = {rs2_is_fp, rs2};
assign decode_if.rs3 = {1'b1, rs3};
assign decode_if.rd = {rd_fp, rd};
assign decode_if.rs1 = {rs1_fp, rs1_qual};
assign decode_if.rs2 = {rs2_fp, rs2};
assign decode_if.rs3 = {1'b1, rs3};
`else
assign decode_if.rd = rd;
assign decode_if.rs1 = rs1_qual;
assign decode_if.rs1 = rs1;
assign decode_if.rs2 = rs2;
assign decode_if.rs3 = rs3;
`endif
assign decode_if.imm = imm;
assign decode_if.rs1_is_PC = use_PC;
assign decode_if.rs2_is_imm = use_imm;
assign decode_if.used_regs = (`NUM_REGS'(use_rd) << decode_if.rd)
| (`NUM_REGS'(use_rs1) << decode_if.rs1)
| (`NUM_REGS'(use_rs2) << decode_if.rs2)
| (`NUM_REGS'(use_rs3) << decode_if.rs3);
assign decode_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} :
(is_jal || is_jalr || is_jals) ? jalx_offset :
is_csr ? 32'(u_12) :
src2_imm;
assign decode_if.rs1_is_PC = is_auipc || is_btype || is_jal || is_jals;
assign decode_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm || is_br;
wire [`MOD_BITS-1:0] alu_mod = {1'b0, is_mul, is_br};
assign decode_if.op_mod = is_fpu_no_mem ? frm : alu_mod;
///////////////////////////////////////////////////////////////////////////
wire decode_fire_unqual = ifetch_rsp_if.valid && decode_if.ready;
wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;
assign join_if.valid = decode_fire_unqual && is_gpu && (gpu_op == `GPU_JOIN);
assign join_if.valid = ifetch_rsp_fire && is_join;
assign join_if.wid = ifetch_rsp_if.wid;
assign wstall_if.valid = decode_fire_unqual && (is_btype
|| is_jal
|| is_jalr
|| (is_gpu && (gpu_op == `GPU_TMC
|| gpu_op == `GPU_SPLIT
|| gpu_op == `GPU_BAR)));
assign wstall_if.valid = ifetch_rsp_fire && is_wstall;
assign wstall_if.wid = ifetch_rsp_if.wid;
///////////////////////////////////////////////////////////////////////////
assign ifetch_rsp_if.ready = decode_if.ready;
`ifdef DBG_PRINT_PIPELINE
@@ -405,7 +418,7 @@ module VX_decode #(
print_ex_type(decode_if.ex_type);
$write(", op=");
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
$write(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm);
$write(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_regs=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.used_regs);
end
end
`endif

View File

@@ -132,12 +132,22 @@
`define IS_DIV_OP(x) x[2]
`define IS_MUL_MOD(x) x[1]
`define LSU_SB 3'h0
`define LSU_SH 3'h1
`define LSU_SW 3'h2
`define LSU_UB 3'h4
`define LSU_UH 3'h5
`define LSU_BITS 3
`define FMT_B 3'b000
`define FMT_H 3'b001
`define FMT_W 3'b010
`define FMT_BU 3'b100
`define FMT_HU 3'b101
`define LSU_LB 4'b0000
`define LSU_LH 4'b0001
`define LSU_LW 4'b0010
`define LSU_LBU 4'b0100
`define LSU_LHU 4'b0101
`define LSU_SB 4'b1000
`define LSU_SH 4'b1001
`define LSU_SW 4'b1010
`define LSU_BITS 4
`define LSU_FMT(x) x[2:0]
`define LSU_WSIZE(x) x[1:0]
`define LSU_OP(x) x[`LSU_BITS-1:0]
@@ -149,21 +159,21 @@
`define CSR_OP(x) x[`CSR_BITS-1:0]
`define FPU_ADD 4'h0
`define FPU_SUB 4'h1
`define FPU_MUL 4'h2
`define FPU_DIV 4'h3
`define FPU_SQRT 4'h4
`define FPU_MADD 4'h5
`define FPU_MSUB 4'h6
`define FPU_NMSUB 4'h7
`define FPU_NMADD 4'h8
`define FPU_CVTWS 4'h9 // FCVT.W.S
`define FPU_CVTWUS 4'hA // FCVT.WU.S
`define FPU_CVTSW 4'hB // FCVT.S.W
`define FPU_CVTSWU 4'hC // FCVT.S.WU
`define FPU_CLASS 4'hD
`define FPU_CMP 4'hE
`define FPU_MISC 4'hF // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX
`define FPU_SUB 4'h4
`define FPU_MUL 4'h8
`define FPU_DIV 4'hC
`define FPU_CVTWS 4'h1 // FCVT.W.S
`define FPU_CVTWUS 4'h5 // FCVT.WU.S
`define FPU_CVTSW 4'h9 // FCVT.S.W
`define FPU_CVTSWU 4'hD // FCVT.S.WU
`define FPU_SQRT 4'h2
`define FPU_CLASS 4'h6
`define FPU_CMP 4'hA
`define FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX
`define FPU_MADD 4'h3
`define FPU_MSUB 4'h7
`define FPU_NMSUB 4'hB
`define FPU_NMADD 4'hF
`define FPU_BITS 4
`define FPU_OP(x) x[`FPU_BITS-1:0]

View File

@@ -14,8 +14,8 @@ module VX_ibuffer #(
VX_decode_if ibuf_deq_if
);
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS;
localparam SIZE = `IBUF_SIZE;
localparam ADDRW = $clog2(SIZE+1);
localparam SIZE = 3;
localparam ADDRW = $clog2(SIZE);
localparam NWARPSW = $clog2(`NUM_WARPS+1);
reg [`NUM_WARPS-1:0][ADDRW-1:0] used_r;
@@ -39,22 +39,17 @@ module VX_ibuffer #(
wire push = writing && !is_slot0;
wire pop = reading && !alm_empty_r[i];
VX_fifo_queue #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (1)
VX_skid_buffer #(
.DATAW (DATAW)
) queue (
.clk (clk),
.reset (reset),
.push (push),
.pop (pop),
.data_in (q_data_in),
.data_out (q_data_prev[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
.valid_in (push),
.data_in (q_data_in),
.ready_out(pop),
.data_out (q_data_prev[i]),
`UNUSED_PIN (ready_in),
`UNUSED_PIN (valid_out)
);
always @(posedge clk) begin
@@ -69,7 +64,7 @@ module VX_ibuffer #(
empty_r[i] <= 0;
if (used_r[i] == 1)
alm_empty_r[i] <= 0;
if (used_r[i] == ADDRW'(SIZE))
if (used_r[i] == ADDRW'(SIZE-1))
full_r[i] <= 1;
end
end else if (reading) begin

View File

@@ -38,8 +38,7 @@ module VX_instr_demux (
wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
.BUFFERED (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32))
) alu_buffer (
.clk (clk),
.reset (reset),
@@ -56,8 +55,7 @@ module VX_instr_demux (
wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
.BUFFERED (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32))
) lsu_buffer (
.clk (clk),
.reset (reset),
@@ -74,8 +72,7 @@ module VX_instr_demux (
wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32),
.BUFFERED (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32)
) csr_buffer (
.clk (clk),
.reset (reset),
@@ -93,8 +90,7 @@ module VX_instr_demux (
wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.BUFFERED (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32))
) fpu_buffer (
.clk (clk),
.reset (reset),
@@ -115,8 +111,7 @@ module VX_instr_demux (
wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)),
.BUFFERED (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32))
) gpu_buffer (
.clk (clk),
.reset (reset),

View File

@@ -121,14 +121,14 @@ module VX_issue #(
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
`ifdef PERF_ENABLE
reg [63:0] perf_ibf_stalls;
reg [63:0] perf_scb_stalls;
reg [63:0] perf_alu_stalls;
reg [63:0] perf_lsu_stalls;
reg [63:0] perf_csr_stalls;
reg [63:0] perf_gpu_stalls;
reg [43:0] perf_ibf_stalls;
reg [43:0] perf_scb_stalls;
reg [43:0] perf_alu_stalls;
reg [43:0] perf_lsu_stalls;
reg [43:0] perf_csr_stalls;
reg [43:0] perf_gpu_stalls;
`ifdef EXT_F_ENABLE
reg [63:0] perf_fpu_stalls;
reg [43:0] perf_fpu_stalls;
`endif
always @(posedge clk) begin
@@ -144,26 +144,26 @@ module VX_issue #(
`endif
end else begin
if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + 64'd1;
perf_ibf_stalls <= perf_ibf_stalls + 44'd1;
end
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scb_stalls <= perf_scb_stalls + 64'd1;
perf_scb_stalls <= perf_scb_stalls + 44'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stalls <= perf_alu_stalls + 64'd1;
perf_alu_stalls <= perf_alu_stalls + 44'd1;
end
if (lsu_req_if.valid & !lsu_req_if.ready) begin
perf_lsu_stalls <= perf_lsu_stalls + 64'd1;
perf_lsu_stalls <= perf_lsu_stalls + 44'd1;
end
if (csr_req_if.valid & !csr_req_if.ready) begin
perf_csr_stalls <= perf_csr_stalls + 64'd1;
perf_csr_stalls <= perf_csr_stalls + 44'd1;
end
if (gpu_req_if.valid & !gpu_req_if.ready) begin
perf_gpu_stalls <= perf_gpu_stalls + 64'd1;
perf_gpu_stalls <= perf_gpu_stalls + 44'd1;
end
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid & !fpu_req_if.ready) begin
perf_fpu_stalls <= perf_fpu_stalls + 64'd1;
perf_fpu_stalls <= perf_fpu_stalls + 44'd1;
end
`endif
end

View File

@@ -69,6 +69,8 @@ module VX_lsu_unit #(
wire rsp_wb;
wire [`LSU_BITS-1:0] rsp_type;
wire rsp_is_dup;
`UNUSED_VAR (rsp_type)
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
reg [`NUM_THREADS-1:0] rsp_rem_mask_n;
@@ -220,11 +222,11 @@ module VX_lsu_unit #(
end
always @(*) begin
case (rsp_type)
`LSU_SB: rsp_data[i] = 32'(signed'(rsp_data_shifted[7:0]));
`LSU_SH: rsp_data[i] = 32'(signed'(rsp_data_shifted[15:0]));
`LSU_UB: rsp_data[i] = 32'(unsigned'(rsp_data_shifted[7:0]));
`LSU_UH: rsp_data[i] = 32'(unsigned'(rsp_data_shifted[15:0]));
case (`LSU_FMT(rsp_type))
`FMT_B: rsp_data[i] = 32'(signed'(rsp_data_shifted[7:0]));
`FMT_H: rsp_data[i] = 32'(signed'(rsp_data_shifted[15:0]));
`FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data_shifted[7:0]));
`FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data_shifted[15:0]));
default: rsp_data[i] = rsp_data_shifted;
endcase
end

View File

@@ -323,19 +323,22 @@ end else begin
assign perf_memsys_if.smem_bank_stalls = 0;
end
reg [63:0] perf_dram_lat_per_cycle;
reg [43:0] perf_dram_lat_per_cycle;
always @(posedge clk) begin
if (reset) begin
perf_dram_lat_per_cycle <= 0;
end else begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle +
64'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) -
44'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) -
2'((dram_rsp_if.valid && dram_rsp_if.ready) && !(dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready))));
end
end
reg [63:0] perf_dram_reads, perf_dram_writes, perf_dram_lat, perf_dram_stalls;
reg [43:0] perf_dram_reads;
reg [43:0] perf_dram_writes;
reg [43:0] perf_dram_lat;
reg [43:0] perf_dram_stalls;
always @(posedge clk) begin
if (reset) begin
@@ -345,13 +348,13 @@ end
perf_dram_stalls <= 0;
end else begin
if (dram_req_if.valid && dram_req_if.ready && !dram_req_if.rw) begin
perf_dram_reads <= perf_dram_reads + 64'd1;
perf_dram_reads <= perf_dram_reads + 44'd1;
end
if (dram_req_if.valid && dram_req_if.ready && dram_req_if.rw) begin
perf_dram_writes <= perf_dram_writes + 64'd1;
perf_dram_writes <= perf_dram_writes + 44'd1;
end
if (dram_req_if.valid && !dram_req_if.ready) begin
perf_dram_stalls <= perf_dram_stalls + 64'd1;
perf_dram_stalls <= perf_dram_stalls + 44'd1;
end
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
end

View File

@@ -72,11 +72,14 @@ task print_ex_op (
end
`EX_LSU: begin
case (`LSU_BITS'(op_type))
`LSU_LB: $write("LB");
`LSU_LH: $write("LH");
`LSU_LW: $write("LW");
`LSU_LBU:$write("LBU");
`LSU_LHU:$write("LHU");
`LSU_SB: $write("SB");
`LSU_SH: $write("SH");
`LSU_SW: $write("SW");
`LSU_UB: $write("UB");
`LSU_UH: $write("UH");
default: $write("?");
endcase
end

View File

@@ -488,8 +488,7 @@ module VX_bank #(
end
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS),
.BUFFERED (1)
.DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS)
) core_rsp_req (
.clk (clk),
.reset (reset),

View File

@@ -399,7 +399,8 @@ module VX_cache #(
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_core_writes_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle;
assign perf_core_reads_per_cycle = $countones(core_req_valid & core_req_ready & ~core_req_rw);
@@ -422,13 +423,13 @@ module VX_cache #(
assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank);
assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank);
reg [63:0] perf_core_reads;
reg [63:0] perf_core_writes;
reg [63:0] perf_read_misses;
reg [63:0] perf_write_misses;
reg [63:0] perf_mshr_stalls;
reg [63:0] perf_pipe_stalls;
reg [63:0] perf_crsp_stalls;
reg [43:0] perf_core_reads;
reg [43:0] perf_core_writes;
reg [43:0] perf_read_misses;
reg [43:0] perf_write_misses;
reg [43:0] perf_mshr_stalls;
reg [43:0] perf_pipe_stalls;
reg [43:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
@@ -440,13 +441,13 @@ module VX_cache #(
perf_pipe_stalls <= 0;
perf_crsp_stalls <= 0;
end else begin
perf_core_reads <= perf_core_reads + 64'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 64'(perf_core_writes_per_cycle);
perf_read_misses <= perf_read_misses + 64'(perf_read_miss_per_cycle);
perf_write_misses <= perf_write_misses+ 64'(perf_write_miss_per_cycle);
perf_mshr_stalls <= perf_mshr_stalls + 64'(perf_mshr_stall_per_cycle);
perf_pipe_stalls <= perf_pipe_stalls + 64'(perf_pipe_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 64'(perf_crsp_stall_per_cycle);
perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle);
perf_read_misses <= perf_read_misses + 44'(perf_read_miss_per_cycle);
perf_write_misses <= perf_write_misses+ 44'(perf_write_miss_per_cycle);
perf_mshr_stalls <= perf_mshr_stalls + 44'(perf_mshr_stall_per_cycle);
perf_pipe_stalls <= perf_pipe_stalls + 44'(perf_pipe_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle);
end
end

View File

@@ -22,7 +22,7 @@ module VX_cache_core_req_bank_sel #(
input wire reset,
`ifdef PERF_ENABLE
output wire [63:0] bank_stalls,
output wire [43:0] bank_stalls,
`endif
input wire [NUM_REQS-1:0] core_req_valid,
@@ -303,13 +303,13 @@ module VX_cache_core_req_bank_sel #(
end
end
reg [63:0] bank_stalls_r;
reg [43:0] bank_stalls_r;
always @(posedge clk) begin
if (reset) begin
bank_stalls_r <= 0;
end else begin
bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_sel_r & ~core_req_ready));
bank_stalls_r <= bank_stalls_r + 44'($countones(core_req_sel_r & ~core_req_ready));
end
end

View File

@@ -98,8 +98,7 @@ module VX_cache_core_rsp_merge #(
wire core_rsp_valid_any = (| per_bank_core_rsp_valid);
VX_skid_buffer #(
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)),
.BUFFERED (1)
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH))
) pipe_reg (
.clk (clk),
.reset (reset),
@@ -147,8 +146,7 @@ module VX_cache_core_rsp_merge #(
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH),
.BUFFERED (1)
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH)
) pipe_reg (
.clk (clk),
.reset (reset),

View File

@@ -205,8 +205,7 @@ module VX_shared_mem #(
wire crsq_in_valid = ~creq_empty && ~core_rsp_rw;
VX_skid_buffer #(
.DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH),
.BUFFERED (1)
.DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH)
) core_rsp_req (
.clk (clk),
.reset (reset),
@@ -248,9 +247,9 @@ module VX_shared_mem #(
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready);
end
reg [63:0] perf_core_reads;
reg [63:0] perf_core_writes;
reg [63:0] perf_crsp_stalls;
reg [43:0] perf_core_reads;
reg [43:0] perf_core_writes;
reg [43:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
@@ -258,9 +257,9 @@ module VX_shared_mem #(
perf_core_writes <= 0;
perf_crsp_stalls <= 0;
end else begin
perf_core_reads <= perf_core_reads + 64'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 64'(perf_core_writes_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 64'(perf_crsp_stall_per_cycle);
perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle);
end
end

View File

@@ -5,14 +5,14 @@
// Bundle of cache performance signals: read/write totals, read/write misses,
// and bank / MSHR / pipeline / core-response stall totals.
// NOTE(review): this span is a diff rendering without +/- markers, so every
// signal is declared twice: first as [63:0], then as [43:0]. Presumably the
// 44-bit declarations are the post-change version -- confirm against the commit.
interface VX_perf_cache_if ();
// 64-bit declarations
wire [63:0] reads;
wire [63:0] writes;
wire [63:0] read_misses;
wire [63:0] write_misses;
wire [63:0] bank_stalls;
wire [63:0] mshr_stalls;
wire [63:0] pipe_stalls;
wire [63:0] crsp_stalls;
// 44-bit declarations of the same signals
wire [43:0] reads;
wire [43:0] writes;
wire [43:0] read_misses;
wire [43:0] write_misses;
wire [43:0] bank_stalls;
wire [43:0] mshr_stalls;
wire [43:0] pipe_stalls;
wire [43:0] crsp_stalls;
endinterface

View File

@@ -5,28 +5,28 @@
// Bundle of memory-system performance signals, grouped by prefix:
// icache_*, dcache_*, smem_* and dram_*.
// NOTE(review): this span is a diff rendering without +/- markers, so each
// group appears twice: first as [63:0], then as [43:0]. Presumably the 44-bit
// declarations are the post-change version -- confirm against the commit.
interface VX_perf_memsys_if ();
// icache group, 64-bit declarations
wire [63:0] icache_reads;
wire [63:0] icache_read_misses;
wire [63:0] icache_pipe_stalls;
wire [63:0] icache_crsp_stalls;
// icache group, 44-bit declarations
wire [43:0] icache_reads;
wire [43:0] icache_read_misses;
wire [43:0] icache_pipe_stalls;
wire [43:0] icache_crsp_stalls;
// dcache group, 64-bit declarations
wire [63:0] dcache_reads;
wire [63:0] dcache_writes;
wire [63:0] dcache_read_misses;
wire [63:0] dcache_write_misses;
wire [63:0] dcache_bank_stalls;
wire [63:0] dcache_mshr_stalls;
wire [63:0] dcache_pipe_stalls;
wire [63:0] dcache_crsp_stalls;
// dcache group, 44-bit declarations
wire [43:0] dcache_reads;
wire [43:0] dcache_writes;
wire [43:0] dcache_read_misses;
wire [43:0] dcache_write_misses;
wire [43:0] dcache_bank_stalls;
wire [43:0] dcache_mshr_stalls;
wire [43:0] dcache_pipe_stalls;
wire [43:0] dcache_crsp_stalls;
// smem group, 64-bit declarations
wire [63:0] smem_reads;
wire [63:0] smem_writes;
wire [63:0] smem_bank_stalls;
// smem group, 44-bit declarations
wire [43:0] smem_reads;
wire [43:0] smem_writes;
wire [43:0] smem_bank_stalls;
// dram group, 64-bit declarations
wire [63:0] dram_reads;
wire [63:0] dram_writes;
wire [63:0] dram_stalls;
wire [63:0] dram_latency;
// dram group, 44-bit declarations
wire [43:0] dram_reads;
wire [43:0] dram_writes;
wire [43:0] dram_stalls;
wire [43:0] dram_latency;
endinterface

View File

@@ -4,14 +4,14 @@
`include "VX_define.vh"
// Bundle of pipeline stall signals: ibf, scb, lsu, csr, alu and gpu, plus
// fpu when EXT_F_ENABLE is defined.
// NOTE(review): this span is a diff rendering without +/- markers, so each
// signal is declared twice: first as [63:0], then as [43:0]. Presumably the
// 44-bit declarations are the post-change version -- confirm against the commit.
interface VX_perf_pipeline_if ();
// 64-bit declarations
wire [63:0] ibf_stalls;
wire [63:0] scb_stalls;
wire [63:0] lsu_stalls;
wire [63:0] csr_stalls;
wire [63:0] alu_stalls;
wire [63:0] gpu_stalls;
// 44-bit declarations of the same signals
wire [43:0] ibf_stalls;
wire [43:0] scb_stalls;
wire [43:0] lsu_stalls;
wire [43:0] csr_stalls;
wire [43:0] alu_stalls;
wire [43:0] gpu_stalls;
// fpu_stalls is only declared when the F extension is enabled
`ifdef EXT_F_ENABLE
wire [63:0] fpu_stalls;
wire [43:0] fpu_stalls;
`endif
endinterface

View File

@@ -93,9 +93,10 @@ module VX_fifo_queue #(
end
if (SIZE > 2) begin
used_r <= used_r + ADDRW'($signed(2'(push) - 2'(pop)));
end else begin // (SIZE == 2);
end else begin
// (SIZE == 2);
`IGNORE_WARNINGS_BEGIN
used_r <= used_r ^ (push ^ pop);
used_r <= used_r ^ (push ^ pop);
`IGNORE_WARNINGS_END
end
end
@@ -105,7 +106,7 @@ module VX_fifo_queue #(
if (0 == BUFFERED) begin
reg [1:0][DATAW-1:0] shift_reg;
reg [DATAW-1:0] shift_reg [1:0];
always @(posedge clk) begin
if (push) begin

View File

@@ -94,33 +94,43 @@ module VX_skid_buffer #(
end else begin
wire q_push = valid_in && ready_in;
wire q_pop = valid_out && ready_out;
reg [DATAW-1:0] shift_reg [1:0];
reg valid_out_r, ready_in_r, rd_ptr_r;
wire q_empty, q_full;
wire push = valid_in && ready_in;
wire pop = valid_out_r && ready_out;
VX_fifo_queue #(
.DATAW (DATAW),
.SIZE (2),
.BUFFERED (BUFFERED),
.FASTRAM (FASTRAM)
) fifo (
.clk (clk),
.reset (reset),
.push (q_push),
.pop (q_pop),
.data_in (data_in),
.data_out (data_out),
.empty (q_empty),
.alm_full (q_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
ready_in_r <= 1;
rd_ptr_r <= 1;
end else begin
if (push) begin
if (!pop) begin
ready_in_r <= rd_ptr_r;
valid_out_r <= 1;
end
end else if (pop) begin
ready_in_r <= 1;
valid_out_r <= rd_ptr_r;
end
`IGNORE_WARNINGS_BEGIN
rd_ptr_r <= rd_ptr_r ^ (push ^ pop);
`IGNORE_WARNINGS_END
end
end
assign ready_in = !q_full;
assign valid_out = !q_empty;
always @(posedge clk) begin
if (push) begin
shift_reg[1] <= shift_reg[0];
shift_reg[0] <= data_in;
end
end
assign ready_in = ready_in_r;
assign valid_out = valid_out_r;
assign data_out = shift_reg[rd_ptr_r];
end
end

View File

@@ -92,8 +92,7 @@ module VX_stream_arbiter #(
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!BUFFERED),
.BUFFERED (1)
.PASSTHRU (!BUFFERED)
) out_buffer (
.clk (clk),
.reset (reset),

View File

@@ -40,8 +40,7 @@ module VX_stream_demux #(
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (DATAW),
.PASSTHRU (!BUFFERED),
.BUFFERED (1)
.PASSTHRU (!BUFFERED)
) out_buffer (
.clk (clk),
.reset (reset),

View File

@@ -13,7 +13,7 @@ RTL_DIR = ../hw/rtl
PROJECT = simX
SRCS = util.cpp args.cpp mem.cpp core.cpp warp.cpp instr.cpp decode.cpp execute.cpp simX.cpp
SRCS = util.cpp args.cpp mem.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp
# Debugging
ifdef DEBUG
@@ -27,6 +27,9 @@ all: $(PROJECT)
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run: $(PROJECT)
./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;

View File

@@ -11,156 +11,56 @@ namespace vortex {
// Describes the simulated architecture: word size, register counts and the
// number of cores, warps and threads.
//
// NOTE(review): this span is a diff rendering without +/- markers, so two
// revisions of the class are interleaved line by line:
//   * one built around an Extent enum, camelCase members (wordSize_, numRegs_,
//     ...), string conversion, equality operators and getters that throw
//     Undefined when the extent is too small;
//   * one with plain int members (wsize_, vsize_, num_regs_, num_csrs_, ...)
//     and trivial snake_case accessors.
// Presumably the Extent-based revision is the removed one -- confirm against
// the commit. The text below does not compile as shown.
class ArchDef {
public:
// Exception type thrown by the Extent-based getters.
struct Undefined {};
// Constructor: both revisions take an architecture string followed by the
// core, warp and thread counts; the second revision leaves the string unnamed.
ArchDef(const std::string &s,
ArchDef(const std::string &/*arch*/,
int num_cores,
int num_warps,
int num_threads) {
std::istringstream iss(s.c_str());
wordSize_ = 4;
encChar_ = 'w';
numRegs_ = 32;
numPRegs_ = 0;
numCores_ = num_cores;
numWarps_ = num_warps;
numThreads_ = num_threads;
extent_ = EXT_END;
int num_threads) {
wsize_ = 4;
vsize_ = 16;
num_regs_ = 32;
num_csrs_ = 4096;
num_cores_ = num_cores;
num_warps_ = num_warps;
num_threads_ = num_threads;
}
// Extent-based revision: serializes the defined fields into a string such as
// <wordsize><enc><regs>/<pregs>/<threads>/<warps>/<cores>; interleaved with
// the int-based wsize() accessor.
operator std::string () const {
if (extent_ == EXT_NULL)
return "";
std::ostringstream oss;
if (extent_ >= EXT_WORDSIZE) oss << wordSize_;
if (extent_ >= EXT_ENC ) oss << encChar_;
if (extent_ >= EXT_REGS ) oss << numRegs_;
if (extent_ >= EXT_PREGS ) oss << '/' << numPRegs_;
if (extent_ >= EXT_THREADS ) oss << '/' << numThreads_;
if (extent_ >= EXT_WARPS ) oss << '/' << numWarps_;
if (extent_ >= EXT_CORES ) oss << '/' << numCores_;
return oss.str();
int wsize() const {
return wsize_;
}
// Extent-based revision: compares only the fields covered by the smaller of
// the two extents; interleaved with the int-based vsize() accessor.
bool operator==(const ArchDef &r) const {
Extent minExtent(r.extent_ > extent_ ? extent_ : r.extent_);
// Can't be equal if we can't specify a binary encoding at all.
if (minExtent < EXT_PREGS)
return false;
if (minExtent >= EXT_WORDSIZE) {
if (wordSize_!=r.wordSize_)
return false;
}
if (minExtent >= EXT_ENC) {
if (encChar_ != r.encChar_)
return false;
}
if (minExtent >= EXT_REGS) {
if (numRegs_ != r.numRegs_)
return false;
}
if (minExtent >= EXT_PREGS) {
if (numPRegs_ != r.numPRegs_)
return false;
}
if (minExtent >= EXT_THREADS) {
if (numThreads_ != r.numThreads_)
return false;
}
if (minExtent >= EXT_WARPS) {
if (numWarps_ != r.numWarps_)
return false;
}
if (minExtent >= EXT_CORES) {
if (numCores_ != r.numCores_)
return false;
}
return true;
int vsize() const {
return vsize_;
}
// operator!= (Extent-based revision) interleaved with num_regs().
bool operator!=(const ArchDef &r) const {
return !(*this == r);
int num_regs() const {
return num_regs_;
}
// From here on, throwing getters of the Extent-based revision alternate with
// the plain accessors of the int-based revision.
Size getWordSize() const {
if (extent_ < EXT_WORDSIZE)
throw Undefined();
return wordSize_;
int num_csrs() const {
return num_csrs_;
}
char getEncChar() const {
if ((extent_ < EXT_ENC) || (encChar_ == 'x'))
throw Undefined();
return encChar_;
int num_threads() const {
return num_threads_;
}
RegNum getNumRegs() const {
if (extent_ < EXT_REGS)
throw Undefined();
return numRegs_;
int num_warps() const {
return num_warps_;
}
RegNum getNumPRegs() const {
if (extent_ < EXT_PREGS)
throw Undefined();
return numPRegs_;
}
ThdNum getNumThreads() const {
if (extent_ < EXT_THREADS)
throw Undefined();
return numThreads_;
}
ThdNum getNumWarps() const {
if (extent_ < EXT_WARPS)
throw Undefined();
return numWarps_;
}
ThdNum getNumCores() const {
if (extent_ < EXT_CORES)
throw Undefined();
return numCores_;
}
bool is_cpu_mode() const {
return cpu_mode_;
int num_cores() const {
return num_cores_;
}
private:
// Extent-based revision: how much of the description is defined, in
// increasing order of completeness.
enum Extent {
EXT_NULL,
EXT_WORDSIZE,
EXT_ENC,
EXT_REGS,
EXT_PREGS,
EXT_THREADS,
EXT_WARPS,
EXT_CORES,
EXT_END
};
// Data members of the Extent-based revision.
Extent extent_;
Size wordSize_;
ThdNum numThreads_;
ThdNum numWarps_;
ThdNum numCores_;
RegNum numRegs_;
ThdNum numPRegs_;
char encChar_;
bool cpu_mode_;
// Data members of the int-based revision.
int wsize_;
int vsize_;
int num_regs_;
int num_csrs_;
int num_threads_;
int num_warps_;
int num_cores_;
};
}

View File

@@ -1,10 +1,7 @@
#include <iostream>
#include <iomanip>
#include <string.h>
// #define USE_DEBUG 7
// #define PRINT_ACTIVE_THREADS
#include <assert.h>
#include "types.h"
#include "util.h"
#include "archdef.h"
@@ -14,21 +11,25 @@
#include "debug.h"
#define INIT_TRACE(trace_inst) \
trace_inst.valid_inst = false; \
trace_inst.pc = 0; \
trace_inst.valid = false; \
trace_inst.PC = 0; \
trace_inst.wid = schedule_w_; \
trace_inst.rs1 = -1; \
trace_inst.rs2 = -1; \
trace_inst.rd = -1; \
trace_inst.vs1 = -1; \
trace_inst.vs2 = -1; \
trace_inst.vd = -1; \
trace_inst.irs1 = -1; \
trace_inst.irs2 = -1; \
trace_inst.frs1 = -1; \
trace_inst.frs2 = -1; \
trace_inst.frs3 = -1; \
trace_inst.frd = -1; \
trace_inst.ird = -1; \
trace_inst.vrs1 = -1; \
trace_inst.vrs2 = -1; \
trace_inst.vrd = -1; \
trace_inst.is_lw = false; \
trace_inst.is_sw = false; \
if (trace_inst.mem_addresses != NULL) \
free(trace_inst.mem_addresses); \
trace_inst.mem_addresses = (unsigned *)malloc(32 * sizeof(unsigned)); \
for (ThdNum tid = 0; tid < arch_.getNumThreads(); tid++) \
for (int tid = 0; tid < arch_.num_threads(); tid++) \
trace_inst.mem_addresses[tid] = 0xdeadbeef; \
trace_inst.mem_stall_cycles = 0; \
trace_inst.fetch_stall_cycles = 0; \
@@ -37,18 +38,22 @@
trace_inst.stalled = false;
#define CPY_TRACE(drain, source) \
drain.valid_inst = source.valid_inst; \
drain.pc = source.pc; \
drain.valid = source.valid; \
drain.PC = source.PC; \
drain.wid = source.wid; \
drain.rs1 = source.rs1; \
drain.rs2 = source.rs2; \
drain.rd = source.rd; \
drain.vs1 = source.vs1; \
drain.vs2 = source.vs2; \
drain.vd = source.vd; \
drain.irs1 = source.irs1; \
drain.irs2 = source.irs2; \
drain.ird = source.ird; \
drain.frs1 = source.frs1; \
drain.frs2 = source.frs2; \
drain.frs3 = source.frs3; \
drain.frd = source.frd; \
drain.vrs1 = source.vrs1; \
drain.vrs2 = source.vrs2; \
drain.vrd = source.vrd; \
drain.is_lw = source.is_lw; \
drain.is_sw = source.is_sw; \
for (ThdNum tid = 0; tid < arch_.getNumThreads(); tid++)\
for (int tid = 0; tid < arch_.num_threads(); tid++) \
drain.mem_addresses[tid] = source.mem_addresses[tid]; \
drain.mem_stall_cycles = source.mem_stall_cycles; \
drain.fetch_stall_cycles = source.fetch_stall_cycles; \
@@ -60,17 +65,17 @@ using namespace vortex;
// Emits one debug line per field of the trace record through the D() macro,
// each line prefixed with stage_name.
//
// NOTE(review): this span is a diff rendering without +/- markers. The D(3, ...)
// lines and the D(4, ...) lines are two revisions of the same statements; the
// D(4, ...) set reads the renamed fields (valid, PC, ird, irs1, irs2) and is
// presumably the post-change version -- confirm against the commit.
// NOTE(review): the "trs2=" label looks like a typo for "rs2="; it is left
// untouched here because it is runtime output.
void printTrace(trace_inst_t *trace, const char *stage_name) {
// presumably silences unused-parameter warnings when D() expands to nothing -- verify in debug.h
__unused(trace, stage_name);
D(3, stage_name << ": valid=" << trace->valid_inst);
D(3, stage_name << ": PC=" << std::hex << trace->pc << std::dec);
D(3, stage_name << ": wid=" << trace->wid);
D(3, stage_name << ": rd=" << trace->rd << ", rs1=" << trace->rs1 << ", trs2=" << trace->rs2);
D(3, stage_name << ": is_lw=" << trace->is_lw);
D(3, stage_name << ": is_sw=" << trace->is_sw);
D(3, stage_name << ": fetch_stall_cycles=" << trace->fetch_stall_cycles);
D(3, stage_name << ": mem_stall_cycles=" << trace->mem_stall_cycles);
D(3, stage_name << ": stall_warp=" << trace->stall_warp);
D(3, stage_name << ": wspawn=" << trace->wspawn);
D(3, stage_name << ": stalled=" << trace->stalled);
D(4, stage_name << ": valid=" << trace->valid);
D(4, stage_name << ": PC=" << std::hex << trace->PC << std::dec);
D(4, stage_name << ": wid=" << trace->wid);
D(4, stage_name << ": rd=" << trace->ird << ", rs1=" << trace->irs1 << ", trs2=" << trace->irs2);
D(4, stage_name << ": is_lw=" << trace->is_lw);
D(4, stage_name << ": is_sw=" << trace->is_sw);
D(4, stage_name << ": fetch_stall_cycles=" << trace->fetch_stall_cycles);
D(4, stage_name << ": mem_stall_cycles=" << trace->mem_stall_cycles);
D(4, stage_name << ": stall_warp=" << trace->stall_warp);
D(4, stage_name << ": wspawn=" << trace->wspawn);
D(4, stage_name << ": stalled=" << trace->stalled);
}
Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
@@ -79,8 +84,7 @@ Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
, decoder_(decoder)
, mem_(mem)
, steps_(0)
, num_instructions_(0) {
release_warp_ = false;
, num_insts_(0) {
foundSchedule_ = true;
schedule_w_ = 0;
@@ -98,28 +102,17 @@ Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
INIT_TRACE(inst_in_lsu_);
INIT_TRACE(inst_in_wb_);
for (int i = 0; i < 32; i++) {
stalled_warps_[i] = false;
for (int j = 0; j < 32; j++) {
renameTable_[i][j] = true;
}
iRenameTable_.resize(arch.num_warps(), std::vector<bool>(arch.num_regs(), false));
fRenameTable_.resize(arch.num_warps(), std::vector<bool>(arch.num_regs(), false));
vRenameTable_.resize(arch.num_regs(), false);
stalled_warps_.resize(arch.num_warps(), false);
for (int i = 0; i < arch_.num_warps(); ++i) {
warps_.emplace_back(this, i);
}
for (int i = 0; i < 32; i++) {
vecRenameTable_[i] = true;
}
for (unsigned i = 0; i < arch_.getNumWarps(); ++i) {
warps_.push_back(Warp(this, i));
}
warps_[0].setActiveThreads(1);
warps_[0].setSpawned(true);
}
bool Core::interrupt(Word r0) {
warps_[0].interrupt(r0);
return false;
warps_[0].setTmask(0, true);
}
Core::~Core() {
@@ -130,32 +123,20 @@ void Core::step() {
D(3, "###########################################################");
steps_++;
D(3, "cycle: " << steps_);
D(3, std::dec << "Core" << id_ << ": cycle: " << steps_);
DPH(3, "stalled warps:");
for (ThdNum widd = 0; widd < arch_.getNumWarps(); widd++) {
DPN(3, " " << stalled_warps_[widd]);
for (int i = 0; i < arch_.num_warps(); i++) {
DPN(3, " " << stalled_warps_[i]);
}
DPN(3, "\n");
// cout << "About to call writeback" << std::endl;
this->writeback();
// cout << "About to call load_store" << std::endl;
this->load_store();
// cout << "About to call execute_unit" << std::endl;
this->execute_unit();
// cout << "About to call scheduler" << std::endl;
this->scheduler();
// cout << "About to call decode" << std::endl;
this->decode();
// D(3, "About to call fetch" << std::flush);
this->fetch();
// D(3, "Finished fetch" << std::flush);
if (release_warp_) {
release_warp_ = false;
stalled_warps_[release_warp_num_] = false;
}
DPN(3, std::flush);
}
@@ -166,10 +147,8 @@ void Core::warpScheduler() {
for (size_t wid = 0; wid < warps_.size(); ++wid) {
// round robin scheduling
next_warp = (next_warp + 1) % warps_.size();
bool has_active_threads = (warps_[next_warp].getActiveThreads() > 0);
bool has_active_threads = warps_[next_warp].active();
bool stalled = stalled_warps_[next_warp];
if (has_active_threads && !stalled) {
foundSchedule_ = true;
break;
@@ -179,35 +158,28 @@ void Core::warpScheduler() {
}
void Core::fetch() {
// D(-1, "Found schedule: " << foundSchedule_);
if ((!inst_in_scheduler_.stalled)
&& (inst_in_fetch_.fetch_stall_cycles == 0)) {
// CPY_TRACE(inst_in_decode_, inst_in_fetch_);
// if (warps_[schedule_w_].activeThreads)
{
INIT_TRACE(inst_in_fetch_);
INIT_TRACE(inst_in_fetch_);
if (foundSchedule_) {
auto active_threads_b = warps_[schedule_w_].getActiveThreads();
if (foundSchedule_) {
auto active_threads_b = warps_[schedule_w_].getActiveThreads();
num_insts_ = num_insts_ + warps_[schedule_w_].getActiveThreads();
num_instructions_ = num_instructions_ + warps_[schedule_w_].getActiveThreads();
warps_[schedule_w_].step(&inst_in_fetch_);
warps_[schedule_w_].step(&inst_in_fetch_);
auto active_threads_a = warps_[schedule_w_].getActiveThreads();
if (active_threads_b != active_threads_a) {
D(3, "** warp #" << schedule_w_ << " active threads changed from " << active_threads_b << " to " << active_threads_a);
}
this->getCacheDelays(&inst_in_fetch_);
if (inst_in_fetch_.stall_warp) {
stalled_warps_[inst_in_fetch_.wid] = true;
}
auto active_threads_a = warps_[schedule_w_].getActiveThreads();
if (active_threads_b != active_threads_a) {
D(3, "** warp #" << schedule_w_ << " active threads changed from " << active_threads_b << " to " << active_threads_a);
}
this->getCacheDelays(&inst_in_fetch_);
if (inst_in_fetch_.stall_warp) {
stalled_warps_[inst_in_fetch_.wid] = true;
}
this->warpScheduler();
}
this->warpScheduler();
} else {
inst_in_fetch_.stalled = false;
if (inst_in_fetch_.fetch_stall_cycles > 0)
@@ -223,7 +195,6 @@ void Core::decode() {
CPY_TRACE(inst_in_decode_, inst_in_fetch_);
INIT_TRACE(inst_in_fetch_);
}
//printTrace(&inst_in_decode_, "Decode");
}
void Core::scheduler() {
@@ -231,136 +202,162 @@ void Core::scheduler() {
CPY_TRACE(inst_in_scheduler_, inst_in_decode_);
INIT_TRACE(inst_in_decode_);
}
//printTrace(&inst_in_scheduler_, "Scheduler");
}
// Load/store stage: when the LSU is free and the scheduler holds a load or
// store whose source registers are available, the instruction is copied into
// inst_in_lsu_ and the scheduler slot is cleared; otherwise the scheduler
// instruction is marked stalled. Any remaining mem_stall_cycles on the LSU
// instruction are decremented by one.
//
// NOTE(review): this span is a diff rendering without +/- markers, so two
// revisions are interleaved and the nesting shown is not compilable:
//   * one tracks readiness: renameTable_/vecRenameTable_ entries are ANDed into
//     scheduler_srcs_ready and set to false when a destination is claimed;
//   * one tracks busyness: iRenameTable_/fRenameTable_/vRenameTable_ entries
//     are ORed into scheduler_srcs_busy and set to true when a destination is
//     claimed.
// Presumably the "busy" revision is the post-change version -- confirm
// against the commit.
void Core::load_store() {
if ((inst_in_lsu_.mem_stall_cycles > 0) || (inst_in_lsu_.stalled)) {
if ((inst_in_lsu_.mem_stall_cycles > 0) || inst_in_lsu_.stalled) {
// LSU currently busy
if ((inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw)) {
inst_in_scheduler_.stalled = true;
}
} else {
// LSU not busy
if (inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw) {
// Scheduler has LSU inst
bool scheduler_srcs_ready = true;
if (inst_in_scheduler_.rs1 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs1];
}
// "busy" revision: nothing to do unless the scheduler holds a load/store
if (!inst_in_scheduler_.is_lw && !inst_in_scheduler_.is_sw)
return;
if (inst_in_scheduler_.rs2 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs2];
}
// Scheduler has LSU inst
bool scheduler_srcs_busy = false;
if (inst_in_scheduler_.vs1 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs1];
}
if (inst_in_scheduler_.vs2 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs2];
}
// integer sources are only checked for indices > 0
if (inst_in_scheduler_.irs1 > 0) {
scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs1];
}
// "ready" revision: claim destinations, then move the instruction to the LSU.
// NOTE(review): the second test below checks rd but writes the vd entry.
if (scheduler_srcs_ready) {
if (inst_in_scheduler_.rd != -1)
renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rd] = false;
if (inst_in_scheduler_.rd != -1)
vecRenameTable_[inst_in_scheduler_.vd] = false;
CPY_TRACE(inst_in_lsu_, inst_in_scheduler_);
INIT_TRACE(inst_in_scheduler_);
} else {
inst_in_scheduler_.stalled = true;
// INIT_TRACE(inst_in_lsu_);
}
} else {
// INIT_TRACE(inst_in_lsu_);
if (inst_in_scheduler_.irs2 > 0) {
scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs2];
}
// float and vector sources are checked for indices >= 0
if (inst_in_scheduler_.frs1 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs1];
}
if (inst_in_scheduler_.frs2 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs2];
}
if (inst_in_scheduler_.frs3 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs3];
}
if (inst_in_scheduler_.vrs1 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs1];
}
if (inst_in_scheduler_.vrs2 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs2];
}
// "busy" revision: stall on a busy source, else mark destinations busy and
// move the instruction to the LSU
if (scheduler_srcs_busy) {
inst_in_scheduler_.stalled = true;
} else {
if (inst_in_scheduler_.ird > 0)
iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.ird] = true;
if (inst_in_scheduler_.frd >= 0)
fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frd] = true;
if (inst_in_scheduler_.vrd >= 0)
vRenameTable_[inst_in_scheduler_.vrd] = true;
CPY_TRACE(inst_in_lsu_, inst_in_scheduler_);
INIT_TRACE(inst_in_scheduler_);
}
}
// count down the remaining memory stall of the instruction in the LSU
if (inst_in_lsu_.mem_stall_cycles > 0)
inst_in_lsu_.mem_stall_cycles--;
//printTrace(&inst_in_lsu_, "LSU");
}
// Execute stage: handles every scheduler instruction that is not a load or
// store. When all of its source registers are available, the destination
// registers are claimed, the instruction is copied into inst_in_exe_ and the
// scheduler slot is cleared; otherwise the scheduler instruction is marked
// stalled.
//
// NOTE(review): this span is a diff rendering without +/- markers, so two
// revisions are interleaved and the nesting shown is not compilable:
//   * one tracks readiness: renameTable_/vecRenameTable_ entries are ANDed into
//     scheduler_srcs_ready and set to false when a destination is claimed;
//   * one tracks busyness: iRenameTable_/fRenameTable_/vRenameTable_ entries
//     are ORed into scheduler_srcs_busy and set to true when a destination is
//     claimed.
// Presumably the "busy" revision is the post-change version -- confirm
// against the commit.
void Core::execute_unit() {
// EXEC is always not busy
if (inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw) {
// Not an execute instruction
// INIT_TRACE(inst_in_exe_);
} else {
bool scheduler_srcs_ready = true;
if (inst_in_scheduler_.rs1 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs1];
// cout << "Rename RS1: " << inst_in_scheduler_.rs1 << " is " << renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs1] << " wid: " << inst_in_scheduler_.wid << '\n';
}
// "busy" revision: loads and stores are not handled by this stage
if (inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw)
return;
bool scheduler_srcs_busy = false;
if (inst_in_scheduler_.rs2 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs2];
// cout << "Rename RS2: " << inst_in_scheduler_.rs1 << " is " << renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs2] << " wid: " << inst_in_scheduler_.wid << '\n';
}
// cout << "About to check vs*\n" << std::flush;
if (inst_in_scheduler_.vs1 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs1];
}
if (inst_in_scheduler_.vs2 > 0) {
scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs2];
}
// cout << "Finished sources\n" << std::flush;
// "ready" revision: claim destinations, then move the instruction to EXE
if (scheduler_srcs_ready) {
if (inst_in_scheduler_.rd != -1) {
// cout << "rename setting rd: " << inst_in_scheduler_.rd << " to not useabel wid: " << inst_in_scheduler_.wid << '\n';
renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rd] = false;
}
// cout << "About to check vector wb: " << inst_in_scheduler_.vd << "\n" << std::flush;
if (inst_in_scheduler_.vd != -1) {
vecRenameTable_[inst_in_scheduler_.vd] = false;
}
// cout << "Finished wb checking" << "\n" << std::flush;
CPY_TRACE(inst_in_exe_, inst_in_scheduler_);
INIT_TRACE(inst_in_scheduler_);
// cout << "Finished trace copying and clearning" << "\n" << std::flush;
} else {
D(3, "Execute: srcs not ready!");
inst_in_scheduler_.stalled = true;
// INIT_TRACE(inst_in_exe_);
}
// integer sources are only checked for indices > 0
if (inst_in_scheduler_.irs1 > 0) {
scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs1];
}
//printTrace(&inst_in_exe_, "EXE");
// INIT_TRACE(inst_in_exe_);
if (inst_in_scheduler_.irs2 > 0) {
scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs2];
}
// float and vector sources are checked for indices >= 0
if (inst_in_scheduler_.frs1 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs1];
}
if (inst_in_scheduler_.frs2 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs2];
}
if (inst_in_scheduler_.frs3 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs3];
}
if (inst_in_scheduler_.vrs1 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs1];
}
if (inst_in_scheduler_.vrs2 >= 0) {
scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs2];
}
// "busy" revision: stall on a busy source, else mark destinations busy and
// move the instruction to EXE
if (scheduler_srcs_busy) {
D(3, "Execute: srcs not ready!");
inst_in_scheduler_.stalled = true;
} else {
if (inst_in_scheduler_.ird > 0) {
iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.ird] = true;
}
if (inst_in_scheduler_.frd >= 0) {
fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frd] = true;
}
if (inst_in_scheduler_.vrd >= 0) {
vRenameTable_[inst_in_scheduler_.vrd] = true;
}
CPY_TRACE(inst_in_exe_, inst_in_scheduler_);
INIT_TRACE(inst_in_scheduler_);
}
}
void Core::writeback() {
if (inst_in_wb_.rd > 0)
renameTable_[inst_in_wb_.wid][inst_in_wb_.rd] = true;
if (inst_in_wb_.vd > 0)
vecRenameTable_[inst_in_wb_.vd] = true;
if (inst_in_wb_.ird > 0) {
iRenameTable_[inst_in_wb_.wid][inst_in_wb_.ird] = false;
}
if (inst_in_wb_.frd >= 0) {
fRenameTable_[inst_in_wb_.wid][inst_in_wb_.frd] = false;
}
if (inst_in_wb_.vrd >= 0) {
vRenameTable_[inst_in_wb_.vrd] = false;
}
if (inst_in_wb_.stall_warp) {
stalled_warps_[inst_in_wb_.wid] = false;
// release_warp_ = true;
// release_warp_num_ = inst_in_wb_.wid;
}
INIT_TRACE(inst_in_wb_);
bool serviced_exe = false;
if ((inst_in_exe_.rd > 0) || (inst_in_exe_.stall_warp)) {
if ((inst_in_exe_.ird > 0)
|| (inst_in_exe_.frd >= 0)
|| (inst_in_exe_.vrd >= 0)
|| (inst_in_exe_.stall_warp)) {
CPY_TRACE(inst_in_wb_, inst_in_exe_);
INIT_TRACE(inst_in_exe_);
serviced_exe = true;
// cout << "WRITEBACK SERVICED EXE\n";
}
if (inst_in_lsu_.is_sw) {
INIT_TRACE(inst_in_lsu_);
} else {
if (((inst_in_lsu_.rd > 0) || (inst_in_lsu_.vd > 0)) && (inst_in_lsu_.mem_stall_cycles == 0)) {
if (((inst_in_lsu_.ird > 0)
|| (inst_in_lsu_.frd >= 0)
|| (inst_in_lsu_.vrd >= 0))
&& (inst_in_lsu_.mem_stall_cycles == 0)) {
if (serviced_exe) {
D(3, "$$$$$$$$$$$$$$$$$$$$ Stalling LSU because EXE is being used");
// Stalling LSU because EXE is busy
inst_in_lsu_.stalled = true;
} else {
CPY_TRACE(inst_in_wb_, inst_in_lsu_);
@@ -371,27 +368,28 @@ void Core::writeback() {
}
// Charges fixed stall-cycle penalties to the given trace record: a fetch
// penalty for every instruction, plus a memory penalty only when the record is
// flagged as a store (is_sw) or a load (is_lw).
// NOTE(review): each counter is incremented twice below; this looks like the
// before/after lines of a diff shown side by side -- confirm which pair is
// live before relying on the totals.
void Core::getCacheDelays(trace_inst_t *trace_inst) {
trace_inst->fetch_stall_cycles += 3;
trace_inst->fetch_stall_cycles += 1;
// Only memory instructions pay the memory-access penalty.
if (trace_inst->is_sw || trace_inst->is_lw) {
trace_inst->mem_stall_cycles += 5;
trace_inst->mem_stall_cycles += 3;
}
}
// Reports whether the core still has work to do: true if any pipeline latch
// (fetch, decode, scheduler, LSU, EXE, WB) holds a valid instruction, or if
// any warp still reports itself as running/active; false otherwise.
// NOTE(review): both the `valid_inst`/`running()` and the `valid`/`active()`
// spellings appear below; presumably old and new diff lines listed together
// -- confirm which set is current.
bool Core::running() const {
bool stages_have_valid = inst_in_fetch_.valid_inst
|| inst_in_decode_.valid_inst
|| inst_in_scheduler_.valid_inst
|| inst_in_lsu_.valid_inst
|| inst_in_exe_.valid_inst
|| inst_in_wb_.valid_inst;
bool stages_have_valid = inst_in_fetch_.valid
|| inst_in_decode_.valid
|| inst_in_scheduler_.valid
|| inst_in_lsu_.valid
|| inst_in_exe_.valid
|| inst_in_wb_.valid;
// An instruction in flight keeps the core alive even if no warp is active.
if (stages_have_valid)
return true;
// Otherwise the core runs for as long as at least one warp does.
for (unsigned i = 0; i < warps_.size(); ++i)
if (warps_[i].running()) {
for (unsigned i = 0; i < warps_.size(); ++i) {
if (warps_[i].active()) {
return true;
}
}
return false;
}

View File

@@ -21,7 +21,6 @@ public:
Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id = 0);
~Core();
bool interrupt(Word r0);
bool running() const;
void getCacheDelays(trace_inst_t *);
@@ -61,8 +60,8 @@ public:
return interruptEntry_;
}
unsigned long num_instructions() const {
return num_instructions_;
unsigned long num_insts() const {
return num_insts_;
}
unsigned long num_steps() const {
@@ -71,9 +70,10 @@ public:
private:
bool renameTable_[32][32];
bool vecRenameTable_[32];
bool stalled_warps_[32];
std::vector<std::vector<bool>> iRenameTable_;
std::vector<std::vector<bool>> fRenameTable_;
std::vector<bool> vRenameTable_;
std::vector<bool> stalled_warps_;
bool foundSchedule_;
Word id_;
@@ -84,10 +84,8 @@ private:
std::unordered_map<Word, std::set<Warp *>> barriers_;
int schedule_w_;
uint64_t steps_;
uint64_t num_instructions_;
uint64_t num_insts_;
Word interruptEntry_;
bool release_warp_;
int release_warp_num_;
trace_inst_t inst_in_fetch_;
trace_inst_t inst_in_decode_;

View File

@@ -1,6 +1,8 @@
#pragma once
//#define USE_DEBUG 9
#define USE_DEBUG 3
#define DEBUG_HEADER << "DEBUG "
//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "
#ifdef USE_DEBUG
@@ -11,13 +13,13 @@
#define D(lvl, x) do { \
if ((lvl) <= USE_DEBUG) { \
std::cout << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " << x << std::endl; \
std::cout DEBUG_HEADER << x << std::endl; \
} \
} while(0)
#define DPH(lvl, x) do { \
if ((lvl) <= USE_DEBUG) { \
std::cout << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " << x; \
std::cout DEBUG_HEADER << x; \
} \
} while(0)
@@ -27,10 +29,6 @@
} \
} while(0)
#define D_RAW(x) do { \
std::cout << x; \
} while (0)
#else
#define DX(x)

View File

@@ -38,7 +38,14 @@ static const std::unordered_map<int, struct InstTableEntry_t> sc_instTable = {
{Opcode::GPGPU, {"gpgpu" , false, InstType::R_TYPE}},
{Opcode::VSET_ARITH, {"vsetvl", false, InstType::V_TYPE}},
{Opcode::VL, {"vl" , false, InstType::V_TYPE}},
{Opcode::VS, {"vs" , false, InstType::V_TYPE}}
{Opcode::VS, {"vs" , false, InstType::V_TYPE}},
{Opcode::FL, {"fl" , false, InstType::I_TYPE }},
{Opcode::FS, {"fs" , false, InstType::S_TYPE }},
{Opcode::FCI, {"fci" , false, InstType::R_TYPE }},
{Opcode::FMADD, {"fma" , false, InstType::R4_TYPE }},
{Opcode::FMSUB, {"fms" , false, InstType::R4_TYPE }},
{Opcode::FMNMADD, {"fmnma" , false, InstType::R4_TYPE }},
{Opcode::FMNMSUB, {"fmnms" , false, InstType::R4_TYPE }}
};
std::ostream &vortex::operator<<(std::ostream &os, Instr &instr) {
@@ -47,9 +54,10 @@ std::ostream &vortex::operator<<(std::ostream &os, Instr &instr) {
}
Decoder::Decoder(const ArchDef &arch) {
inst_s_ = arch.getWordSize() * 8;
inst_s_ = arch.wsize() * 8;
opcode_s_ = 7;
reg_s_ = 5;
func2_s_ = 2;
func3_s_ = 3;
mop_s_ = 3;
vmask_s_ = 1;
@@ -60,6 +68,8 @@ Decoder::Decoder(const ArchDef &arch) {
shift_rs1_ = opcode_s_ + reg_s_ + func3_s_;
shift_rs2_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_;
shift_func7_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_ + reg_s_;
shift_func2_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_ + reg_s_;
shift_rs3_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_ + reg_s_ + func2_s_;
shift_j_u_immed_ = opcode_s_ + reg_s_;
shift_s_b_immed_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_ + reg_s_;
shift_i_immed_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_;
@@ -71,6 +81,7 @@ Decoder::Decoder(const ArchDef &arch) {
shift_vset_ = opcode_s_ + reg_s_ + func3_s_ + reg_s_ + reg_s_ + 6;
// Field-extraction masks, applied after the field has been shifted down to
// bit 0. Each mask has exactly as many low bits set as the field is wide.
reg_mask_ = 0x1f;
// func2 is a 2-bit field (func2_s_ == 2), so both low bits must be kept;
// a mask of 0x2 would silently drop bit 0 of the field.
func2_mask_ = 0x3;
func3_mask_ = 0x7;
func6_mask_ = 0x3f;
func7_mask_ = 0x7f;
@@ -83,7 +94,11 @@ Decoder::Decoder(const ArchDef &arch) {
v_imm_mask_ = 0x7ff;
}
std::shared_ptr<Instr> Decoder::decode(const std::vector<Byte> &v, Size &idx, trace_inst_t *trace_inst) {
std::shared_ptr<Instr> Decoder::decode(
const std::vector<Byte> &v,
Size &idx,
trace_inst_t *trace_inst)
{
Word code(readWord(v, idx, inst_s_ / 8));
// std::cout << "code: " << (int) code << " v: " << v << " indx: " << idx << "\n";
@@ -96,59 +111,66 @@ std::shared_ptr<Instr> Decoder::decode(const std::vector<Byte> &v, Size &idx, tr
Word imeed, dest_bits, imm_bits, bit_11, bits_4_1, bit_10_5,
bit_12, bits_19_12, bits_10_1, bit_20, unordered, func3;
InstType curInstType = sc_instTable.at(op).iType;
if (op == Opcode::FL || op == Opcode::FS) {
// need to find out whether it is vector or floating point inst
Word width_bits = (code >> shift_func3_) & func3_mask_;
if ((width_bits == 0x1) || (width_bits == 0x2)
|| (width_bits == 0x3) || (width_bits == 0x4)) {
curInstType = (op == Opcode::FL) ? InstType::I_TYPE : InstType::S_TYPE;
}
}
// std::cout << "op: " << std::hex << op << " what " << sc_instTable[op].iType << "\n";
switch (sc_instTable.at(op).iType) {
switch (curInstType) {
case InstType::N_TYPE:
break;
case InstType::R_TYPE:
instr->setPred((code >> shift_rs1_) & reg_mask_);
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
if (op == Opcode::FCI) {
instr->setDestFReg((code >> shift_rd_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs2_) & reg_mask_);
} else {
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
}
instr->setFunc3((code >> shift_func3_) & func3_mask_);
instr->setFunc7((code >> shift_func7_) & func7_mask_);
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_);
trace_inst->rd = ((code >> shift_rd_) & reg_mask_);
break;
case InstType::I_TYPE:
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
if (op == Opcode::FCI || op == Opcode::FL) {
instr->setDestFReg((code >> shift_rd_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs1_) & reg_mask_);
} else {
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
}
instr->setFunc7((code >> shift_func7_) & func7_mask_);
func3 = (code >> shift_func3_) & func3_mask_;
instr->setFunc3(func3);
if ((func3 == 5) && (op != L_INST)) {
// std::cout << "func7: " << func7 << "\n";
if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) {
instr->setSrcImm(signExt(((code >> shift_rs2_) & reg_mask_), 5, reg_mask_));
} else {
instr->setSrcImm(signExt(code >> shift_i_immed_, 12, i_imm_mask_));
}
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->rd = ((code >> shift_rd_) & reg_mask_);
break;
case InstType::S_TYPE:
// std::cout << "************STORE\n";
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
if (op == Opcode::FS) {
instr->setSrcFReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs2_) & reg_mask_);
} else {
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
}
instr->setFunc3((code >> shift_func3_) & func3_mask_);
dest_bits = (code >> shift_rd_) & reg_mask_;
imm_bits = (code >> shift_s_b_immed_ & func7_mask_);
imeed = (imm_bits << reg_s_) | dest_bits;
// std::cout << "ENC: store imeed: " << imeed << "\n";
instr->setSrcImm(signExt(imeed, 12, s_imm_mask_));
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_);
break;
case InstType::B_TYPE:
@@ -165,51 +187,34 @@ std::shared_ptr<Instr> Decoder::decode(const std::vector<Byte> &v, Size &idx, tr
bit_12 = imm_bits >> 6;
imeed = 0 | (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
instr->setSrcImm(signExt(imeed, 13, b_imm_mask_));
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_);
break;
case InstType::U_TYPE:
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcImm(signExt(code >> shift_j_u_immed_, 20, u_imm_mask_));
trace_inst->valid_inst = true;
trace_inst->rd = ((code >> shift_rd_) & reg_mask_);
break;
case InstType::J_TYPE:
instr->setDestReg((code >> shift_rd_) & reg_mask_);
// [20 | 10:1 | 11 | 19:12]
unordered = code >> shift_j_u_immed_;
bits_19_12 = unordered & 0xff;
bit_11 = (unordered >> 8) & 0x1;
bits_10_1 = (unordered >> 9) & 0x3ff;
bit_20 = (unordered >> 19) & 0x1;
imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
if (bit_20) {
imeed |= ~j_imm_mask_;
}
instr->setSrcImm(imeed);
trace_inst->valid_inst = true;
trace_inst->rd = ((code >> shift_rd_) & reg_mask_);
break;
case InstType::V_TYPE:
D(3, "Entered here: instr type = vector" << op);
switch (op) {
case Opcode::VSET_ARITH: //TODO: arithmetic ops
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setDestVReg((code >> shift_rd_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs1_) & reg_mask_);
func3 = (code >> shift_func3_) & func3_mask_;
instr->setFunc3(func3);
D(3, "Entered here: instr type = vector");
@@ -228,53 +233,34 @@ std::shared_ptr<Instr> Decoder::decode(const std::vector<Byte> &v, Size &idx, tr
instr->setVsew((immed >> 2) & 0x3);
D(3, "sew " << ((immed >> 2) & 0x3));
} else {
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs2_) & reg_mask_);
}
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->rd = ((code >> shift_rd_) & reg_mask_);
} else {
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs2_) & reg_mask_);
instr->setVmask((code >> shift_vmask_) & 0x1);
instr->setFunc6((code >> shift_func6_) & func6_mask_);
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_);
trace_inst->rd = ((code >> shift_rd_) & reg_mask_);
}
break;
case Opcode::VL:
D(3, "vector load instr");
instr->setDestReg((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setDestVReg((code >> shift_rd_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs1_) & reg_mask_);
instr->setVlsWidth((code >> shift_func3_) & func3_mask_);
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs2_) & reg_mask_);
instr->setVmask((code >> shift_vmask_));
instr->setVmop((code >> shift_vmop_) & func3_mask_);
instr->setVnf((code >> shift_vnf_) & func3_mask_);
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
trace_inst->vd = ((code >> shift_rd_) & reg_mask_);
//trace_inst->vs2 = ((code>>shift_rs2_) & reg_mask_);
break;
case Opcode::VS:
instr->setVs3((code >> shift_rd_) & reg_mask_);
instr->setSrcReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs1_) & reg_mask_);
instr->setVlsWidth((code >> shift_func3_) & func3_mask_);
instr->setSrcReg((code >> shift_rs2_) & reg_mask_);
instr->setSrcVReg((code >> shift_rs2_) & reg_mask_);
instr->setVmask((code >> shift_vmask_));
instr->setVmop((code >> shift_vmop_) & func3_mask_);
instr->setVnf((code >> shift_vnf_) & func3_mask_);
trace_inst->valid_inst = true;
trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_);
//trace_inst->vd = ((code>>shift_rd_) & reg_mask_);
trace_inst->vs1 = ((code >> shift_rd_) & reg_mask_); //vs3
break;
default:
@@ -282,11 +268,49 @@ std::shared_ptr<Instr> Decoder::decode(const std::vector<Byte> &v, Size &idx, tr
std::abort();
}
break;
case R4_TYPE:
// RT: add R4_TYPE decoder
instr->setDestFReg((code >> shift_rd_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs1_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs2_) & reg_mask_);
instr->setSrcFReg((code >> shift_rs3_) & reg_mask_);
instr->setFunc3((code >> shift_func3_) & func3_mask_);
break;
default:
std::cout << "Unrecognized argument class in word decoder.\n";
std::abort();
}
// Mirror the decoded operands into the trace record. Every instruction type
// except N_TYPE marks the record as valid.
if (curInstType != InstType::N_TYPE) {
trace_inst->valid = true;
// Destination register: stored in the field matching its register file
// (frd for floating-point, vrd for vector, ird otherwise).
if (instr->hasRDest()) {
if (instr->is_FpDest()) {
trace_inst->frd = instr->getRDest();
} else if (instr->is_VDest()) {
trace_inst->vrd = instr->getRDest();
} else {
trace_inst->ird = instr->getRDest();
}
}
// Source registers are recorded by position: source slot i maps to the
// (i+1)-th field of its register file. Floating-point accepts three sources,
// vector and integer accept two; any source beyond that aborts.
for (int i = 0; i < instr->getNRSrc(); ++i) {
if (instr->is_FpSrc(i)) {
if (i == 0) trace_inst->frs1 = instr->getRSrc(i);
else if (i == 1) trace_inst->frs2 = instr->getRSrc(i);
else if (i == 2) trace_inst->frs3 = instr->getRSrc(i);
else std::abort();
} else if (instr->is_VSrc(i)) {
if (i == 0) trace_inst->vrs1 = instr->getRSrc(i);
else if (i == 1) trace_inst->vrs2 = instr->getRSrc(i);
else std::abort();
} else {
if (i == 0) trace_inst->irs1 = instr->getRSrc(i);
else if (i == 1) trace_inst->irs2 = instr->getRSrc(i);
else std::abort();
}
}
}
D(2, "Decoded instr 0x" << std::hex << code << " into: " << instr << std::flush);
return instr;

View File

@@ -21,11 +21,14 @@ private:
Word inst_s_;
Word opcode_s_;
Word reg_s_;
Word func2_s_;
Word func3_s_;
Word shift_opcode_;
Word shift_rd_;
Word shift_rs1_;
Word shift_rs2_;
Word shift_rs3_;
Word shift_func2_;
Word shift_func3_;
Word shift_func7_;
Word shift_j_u_immed_;
@@ -33,6 +36,7 @@ private:
Word shift_i_immed_;
Word reg_mask_;
Word func2_mask_;
Word func3_mask_;
Word func6_mask_;
Word func7_mask_;

File diff suppressed because it is too large Load Diff

View File

@@ -1,23 +0,0 @@
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "instr.h"
using namespace vortex;
// Stores 2^lmul in vlmul_.
// Computed with an integer shift rather than std::pow, so the value never
// passes through floating point and no double -> Word narrowing is involved.
void Instr::setVlmul(Word lmul) {
  vlmul_ = Word(1) << lmul;
}
// Stores 2^(3 + sew) in vsew_.
// Computed with an integer shift rather than std::pow, so the value never
// passes through floating point and no double -> Word narrowing is involved.
void Instr::setVsew(Word sew) {
  vsew_ = Word(1) << (3 + sew);
}
// Stores 2^ediv in vediv_.
// Computed with an integer shift rather than std::pow, so the value never
// passes through floating point and no double -> Word narrowing is involved.
void Instr::setVediv(Word ediv) {
  vediv_ = Word(1) << ediv;
}

View File

@@ -25,6 +25,14 @@ enum Opcode {
VSET_ARITH= 0x57,
VL = 0x7,
VS = 0x27,
// F-Extension
FL = 0x7,
FS = 0x27,
FCI = 0x53,
FMADD = 0x43,
FMSUB = 0x47,
FMNMSUB = 0x4b,
FMNMADD = 0x4f,
};
enum InstType {
@@ -35,55 +43,60 @@ enum InstType {
B_TYPE,
U_TYPE,
J_TYPE,
V_TYPE
V_TYPE,
R4_TYPE
};
class Instr {
public:
Instr()
: predicated_(false)
: opcode_(Opcode::NOP)
, nRsrc_(0)
, nPsrc_(0)
, hasImmSrc_(false)
, hasRDest_(false)
, hasPDest_(false)
, is_FpDest_(false)
, is_VDest_(false)
, is_FpSrc_(0)
, is_VSrc_(0)
, func2_(0)
, func3_(0)
, func7_(0)
{}
friend std::ostream &operator<<(std::ostream &, Instr &);
/* Setters used to "craft" the instruction. */
void setOpcode(Opcode opcode) { opcode_ = opcode; }
void setPred(RegNum pReg) { predicated_ = true; pred_ = pReg; }
void setDestReg(RegNum destReg) { hasRDest_ = true; rdest_ = destReg; }
void setSrcReg(RegNum srcReg) { rsrc_[nRsrc_++] = srcReg; }
void setDestReg(int destReg) { hasRDest_ = true; rdest_ = destReg; }
void setSrcReg(int srcReg) { rsrc_[nRsrc_++] = srcReg; }
void setDestFReg(int destReg) { hasRDest_ = true; is_FpDest_ = true; rdest_ = destReg; }
void setSrcFReg(int srcReg) { is_FpSrc_ |= (1 << nRsrc_); rsrc_[nRsrc_++] = srcReg; }
void setDestVReg(int destReg) { hasRDest_ = true; is_VDest_ = true; rdest_ = destReg; }
void setSrcVReg(int srcReg) { is_VSrc_ |= (1 << nRsrc_); rsrc_[nRsrc_++] = srcReg; }
void setFunc3(Word func3) { func3_ = func3; }
void setFunc7(Word func7) { func7_ = func7; }
void setSrcImm(Word srcImm) { hasImmSrc_ = true; immsrc_ = srcImm; }
void setVsetImm(Word vset_imm) { if(vset_imm) vsetImm_ = true; else vsetImm_ = false; }
void setVsetImm(Word vset_imm) { if (vset_imm) vsetImm_ = true; else vsetImm_ = false; }
void setVlsWidth(Word width) { vlsWidth_ = width; }
void setVmop(Word mop) { vMop_ = mop; }
void setVnf(Word nf) { vNf_ = nf; }
void setVmask(Word mask) { vmask_ = mask; }
void setVs3(Word vs) { vs3_ = vs; }
void setVlmul(Word lmul);
void setVsew(Word sew);
void setVediv(Word ediv);
void setVlmul(Word lmul) { vlmul_ = 1 << lmul; }
void setVsew(Word sew) { vsew_ = 1 << (3+sew); }
void setVediv(Word ediv) { vediv_ = 1 << ediv; }
void setFunc6(Word func6) { func6_ = func6; }
void setPrivileged(bool privileged) { privileged_ = privileged; }
/* Getters used by encoders. */
Opcode getOpcode() const { return opcode_; }
Word getFunc3() const { return func3_; }
Word getFunc6() const { return func6_; }
Word getFunc7() const { return func7_; }
RegNum getNRSrc() const { return nRsrc_; }
RegNum getRSrc(RegNum i) const { return rsrc_[i]; }
int getNRSrc() const { return nRsrc_; }
int getRSrc(int i) const { return rsrc_[i]; }
bool hasRDest() const { return hasRDest_; }
RegNum getRDest() const { return rdest_; }
bool hasPDest() const { return hasPDest_; }
RegNum getPDest() const { return pdest_; }
bool hasPred() const { return predicated_; }
RegNum getPred() const { return pred_; }
int getRDest() const { return rdest_; }
bool hasImm() const { return hasImmSrc_; }
Word getImm() const { return immsrc_; }
bool getVsetImm() const { return vsetImm_; }
@@ -95,7 +108,12 @@ public:
Word getVlmul() const { return vlmul_; }
Word getVsew() const { return vsew_; }
Word getVediv() const { return vediv_; }
bool getPrivileged() const { return privileged_; }
bool is_FpDest() const { return is_FpDest_; }
bool is_FpSrc(int i) const { return (is_FpSrc_ >> i) & 0x1; }
bool is_VDest() const { return is_VDest_; }
bool is_VSrc(int i) const { return (is_VSrc_ >> i) & 0x1; }
private:
@@ -104,20 +122,19 @@ private:
};
Opcode opcode_;
bool predicated_;
RegNum pred_;
int nRsrc_;
int nPsrc_;
RegNum rsrc_[MAX_REG_SOURCES];
bool hasImmSrc_;
bool hasRDest_;
bool is_FpDest_;
bool is_VDest_;
int is_FpSrc_;
int is_VSrc_;
Word immsrc_;
Word func2_;
Word func3_;
Word func7_;
bool hasRDest_;
bool hasPDest_;
RegNum rdest_;
RegNum pdest_;
bool privileged_;
int rsrc_[MAX_REG_SOURCES];
int rdest_;
//Vector
bool vsetImm_;

View File

@@ -15,8 +15,8 @@ using namespace vortex;
int main(int argc, char **argv) {
std::string archString("rv32i");
int num_cores(1);
std::string archString("rv32imf");
int num_cores(NUM_CORES * NUM_CLUSTERS);
int num_warps(NUM_WARPS);
int num_threads(NUM_THREADS);
std::string imgFileName;
@@ -36,19 +36,19 @@ int main(int argc, char **argv) {
if (showHelp || imgFileName.empty()) {
std::cout << "Vortex emulator command line arguments:\n"
" -i, --image <filename> Program RAM image\n"
" -i, --image <filename> Program RAM image\n"
" -c, --cores <num> Number of cores\n"
" -w, --warps <num> Number of warps\n"
" -t, --threads <num> Number of threads\n"
" -a, --arch <arch string> Architecture string\n"
" -s, --stats Print stats on exit.\n";
" -s, --stats Print stats on exit.\n";
return 0;
}
ArchDef arch(archString, num_cores, num_warps, num_threads);
Decoder decoder(arch);
MemoryUnit mu(4096, arch.getWordSize(), true);
MemoryUnit mu(4096, arch.wsize(), true);
RAM old_ram;
old_ram.loadHexImpl(imgFileName.c_str());
@@ -59,7 +59,7 @@ int main(int argc, char **argv) {
std::vector<std::shared_ptr<Core>> cores(num_cores);
for (int i = 0; i < num_cores; ++i) {
cores[i] = std::make_shared<Core>(arch, decoder, mu);
cores[i] = std::make_shared<Core>(arch, decoder, mu, i);
}
bool running;

View File

@@ -5,21 +5,27 @@ namespace vortex {
struct trace_inst_t {
// Warp step
bool valid_inst;
unsigned pc;
bool valid;
unsigned PC;
// Core scheduler
int wid;
// Encoder
int rs1;
int rs2;
int rd;
int irs1;
int irs2;
int ird;
//Encoder
int vs1;
int vs2;
int vd;
// Floating-point
int frs1;
int frs2;
int frs3;
int frd;
// Vector extension
int vrs1;
int vrs2;
int vrd;
// Instruction execute
bool is_lw;

View File

@@ -1,20 +1,18 @@
#pragma once
#include <stdint.h>
#include <bitset>
#include <VX_config.h>
namespace vortex {
typedef uint8_t Byte;
typedef uint32_t Word;
typedef uint32_t Word_u;
typedef int32_t Word_s;
typedef Word_u Addr;
typedef Word_u Size;
typedef uint32_t Addr;
typedef uint32_t Size;
typedef unsigned RegNum;
typedef unsigned ThdNum;
typedef std::bitset<32> ThreadMask;
enum MemFlags {
RD_USR = 1,

View File

@@ -12,15 +12,15 @@ Word vortex::signExt(Word w, Size bit, Word mask) {
return w;
}
// Serialises `w` into `wordSize` consecutive bytes starting at `b`,
// least-significant byte first. The caller must provide at least `wordSize`
// writable bytes at `b`.
// NOTE(review): two signatures (Word_u and Word) are listed back to back;
// this looks like the before/after lines of a diff -- confirm which is live.
void vortex::wordToBytes(Byte *b, Word_u w, Size wordSize) {
void vortex::wordToBytes(Byte *b, Word w, Size wordSize) {
while (wordSize--) {
// Emit the current low byte, then shift the next byte into place.
*(b++) = w & 0xff;
w >>= 8;
}
}
}
Word_u vortex::bytesToWord(const Byte *b, Size wordSize) {
Word_u w = 0;
Word vortex::bytesToWord(const Byte *b, Size wordSize) {
Word w = 0;
b += wordSize-1;
while (wordSize--) {
w <<= 8;
@@ -29,15 +29,15 @@ Word_u vortex::bytesToWord(const Byte *b, Size wordSize) {
return w;
}
// Packs three permission booleans into a flag word by OR-ing in RD_USR when
// `r` is set, WR_USR when `w` is set and EX_USR when `x` is set. Returns 0
// when all three are false.
// NOTE(review): the signature and the local declaration each appear twice
// (Word_u and Word); presumably old and new diff lines -- confirm.
Word_u vortex::flagsToWord(bool r, bool w, bool x) {
Word_u word = 0;
Word vortex::flagsToWord(bool r, bool w, bool x) {
Word word = 0;
if (r) word |= RD_USR;
if (w) word |= WR_USR;
if (x) word |= EX_USR;
return word;
}
}
void vortex::wordToFlags(bool &r, bool &w, bool &x, Word_u f) {
void vortex::wordToFlags(bool &r, bool &w, bool &x, Word f) {
r = f & RD_USR;
w = f & WR_USR;
x = f & EX_USR;
@@ -49,10 +49,10 @@ Byte vortex::readByte(const std::vector<Byte> &b, Size &n) {
return b[n++];
}
Word_u vortex::readWord(const std::vector<Byte> &b, Size &n, Size wordSize) {
Word vortex::readWord(const std::vector<Byte> &b, Size &n, Size wordSize) {
if (b.size() - n < wordSize)
throw std::out_of_range("out of range");
Word_u w(0);
Word w(0);
n += wordSize;
// std::cout << "wordSize: " << wordSize << "\n";
for (Size i = 0; i < wordSize; i++) {

View File

@@ -12,13 +12,13 @@ void unused(Args&&...) {}
Word signExt(Word w, Size bit, Word mask);
Word_u bytesToWord(const Byte *b, Size wordSize);
void wordToBytes(Byte *b, Word_u w, Size wordSize);
Word_u flagsToWord(bool r, bool w, bool x);
void wordToFlags(bool &r, bool &w, bool &x, Word_u f);
Word bytesToWord(const Byte *b, Size wordSize);
void wordToBytes(Byte *b, Word w, Size wordSize);
Word flagsToWord(bool r, bool w, bool x);
void wordToFlags(bool &r, bool &w, bool &x, Word f);
Byte readByte(const std::vector<Byte> &b, Size &n);
Word_u readWord(const std::vector<Byte> &b, Size &n, Size wordSize);
Word readWord(const std::vector<Byte> &b, Size &n, Size wordSize);
void writeByte(std::vector<Byte> &p, Size &n, Byte b);
void writeWord(std::vector<Byte> &p, Size &n, Size wordSize, Word w);

View File

@@ -2,6 +2,7 @@
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <assert.h>
#include "util.h"
#include "instr.h"
@@ -11,120 +12,71 @@ using namespace vortex;
Warp::Warp(Core *core, Word id)
: id_(id)
, active_(false)
, core_(core)
, pc_(0x80000000)
, shadowPc_(0)
, activeThreads_(0)
, shadowActiveThreads_(0)
, shadowReg_(core_->arch().getNumRegs())
, VLEN_(1024)
, interruptEnable_(true)
, shadowInterruptEnable_(false)
, supervisorMode_(true)
, shadowSupervisorMode_(false)
, spawned_(false)
, PC_(0x80000000)
, steps_(0)
, insts_(0)
, loads_(0)
, stores_(0) {
D(3, "Creating a new thread with PC: " << std::hex << pc_);
/* Build the register file. */
Word regNum(0);
for (Word j = 0; j < core_->arch().getNumThreads(); ++j) {
regFile_.push_back(std::vector<Reg<Word>>(0));
for (Word i = 0; i < core_->arch().getNumRegs(); ++i) {
regFile_[j].push_back(Reg<Word>(id, regNum++));
}
bool act = false;
if (j == 0)
act = true;
tmask_.push_back(act);
shadowTmask_.push_back(act);
}
tmask_.reset();
for (Word i = 0; i < (1 << 12); i++) {
csrs_.push_back(Reg<uint16_t>(id, regNum++));
}
/* Set initial register contents. */
regFile_[0][0] = (core_->arch().getNumThreads() << (core_->arch().getWordSize() * 8 / 2)) | id;
iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
csrs_.resize(core_->arch().num_csrs());
}
void Warp::step(trace_inst_t *trace_inst) {
assert(tmask_.any());
Size fetchPos(0);
Size decPos;
Size wordSize(core_->arch().getWordSize());
Size wordSize(core_->arch().wsize());
std::vector<Byte> fetchBuffer(wordSize);
if (activeThreads_ == 0)
return;
++steps_;
D(3, "current PC=0x" << std::hex << pc_);
D(3, "current PC=0x" << std::hex << PC_);
// std::cout << "pc: " << std::hex << pc << "\n";
trace_inst->pc = pc_;
// std::cout << "PC: " << std::hex << PC << "\n";
trace_inst->PC = PC_;
/* Fetch and decode. */
if (wordSize < sizeof(pc_))
pc_ &= ((1ll << (wordSize * 8)) - 1);
if (wordSize < sizeof(PC_))
PC_ &= ((1ll << (wordSize * 8)) - 1);
unsigned fetchSize = 4;
fetchBuffer.resize(fetchSize);
Word fetched = core_->mem().fetch(pc_ + fetchPos, supervisorMode_);
Word fetched = core_->mem().fetch(PC_ + fetchPos, 0);
writeWord(fetchBuffer, fetchPos, fetchSize, fetched);
decPos = 0;
std::shared_ptr<Instr> instr = core_->decoder().decode(fetchBuffer, decPos, trace_inst);
// Update pc
pc_ += decPos;
// Update PC
PC_ += decPos;
// Execute
this->execute(*instr, trace_inst);
// At Debug Level 3, print debug info after each instruction.
D(3, "Register state:");
for (unsigned i = 0; i < regFile_[0].size(); ++i) {
D_RAW(" %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
for (unsigned j = 0; j < (activeThreads_); ++j)
D_RAW(' ' << std::setfill('0') << std::setw(8) << std::hex << regFile_[j][i] << std::setfill(' ') << ' ');
D_RAW('(' << shadowReg_[i] << ')' << std::endl);
D(4, "Register state:");
for (int i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
for (int j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
}
DPH(3, "Thread mask:");
for (unsigned i = 0; i < tmask_.size(); ++i)
for (int i = 0; i < core_->arch().num_threads(); ++i)
DPN(3, " " << tmask_[i]);
DPN(3, "\n");
}
// Tries to deliver an interrupt to this warp.
//
// When interrupts are disabled nothing is touched and false is returned.
// Otherwise the current PC, active-thread count, thread mask, interrupt-enable
// and supervisor flags, and thread 0's registers are copied into their shadow
// counterparts; every thread-mask bit is set; the warp is switched to a single
// active thread in supervisor mode with interrupts disabled; `r0` is written
// to register 0 of thread 0; the PC is redirected to the core's interrupt
// entry; and true is returned.
bool Warp::interrupt(Word r0) {
  if (interruptEnable_) {
    // Snapshot everything the handler is about to overwrite.
    shadowActiveThreads_ = activeThreads_;
    shadowTmask_ = tmask_;
    shadowInterruptEnable_ = interruptEnable_; /* For traps. */
    shadowSupervisorMode_ = supervisorMode_;

    auto &thread0Regs = regFile_[0];
    for (Word reg = 0; reg < thread0Regs.size(); ++reg) {
      shadowReg_[reg] = thread0Regs[reg];
    }

    // Set one mask bit per register-file row (i.e. per thread).
    for (Word thread = 0; thread < regFile_.size(); ++thread) {
      tmask_[thread] = 1;
    }

    shadowPc_ = pc_;

    // Enter the handler: single thread, supervisor mode, interrupts masked.
    activeThreads_ = 1;
    interruptEnable_ = false;
    supervisorMode_ = true;
    thread0Regs[0] = r0;
    pc_ = core_->interruptEntry();
    return true;
  }
  return false;
}
void Warp::printStats() const {
std::cout << "Steps : " << steps_ << std::endl
<< "Insts : " << insts_ << std::endl

View File

@@ -7,69 +7,25 @@
namespace vortex {
// A single register holding a value of type T, tagged with the id of the CPU
// it belongs to and its register number.
//
// Writes through operator= are ignored when the register number is 0, so
// register 0 keeps the value it was constructed with.
// NOTE(review): presumably this models a hard-wired zero register -- confirm.
template <typename T>
class Reg {
public:
  // Zero-valued register with CPU id 0 and register number 0.
  Reg()
    : value_(0), cpuId_(0), regNum_(0) {}

  // Zero-valued register number `n` belonging to CPU `c`.
  Reg(Word c, Word n)
    : value_(0), cpuId_(c), regNum_(n) {}

  // Register number `n` belonging to CPU `c`, initialised to `v`.
  Reg(Word c, Word n, T v)
    : value_(v), cpuId_(c), regNum_(n) {}

  // Direct read access that bypasses the doRead() hook.
  const T &value() const {
    return value_;
  }

  // Stores `r` and fires the write hook; a no-op for register number 0.
  Reg &operator=(T r) {
    if (regNum_) {
      value_ = r;
      doWrite();
    }
    return *this;
  }

  // Fires the read hook and yields the stored value.
  operator T() const {
    doRead();
    return value_;
  }

  // Keeps only the low `s` bytes of the value and clears the rest.
  //
  // The mask is built in Word width. The previous form shifted a 64-bit
  // all-ones constant and then narrowed it to Word, which left every bit set
  // for any s <= sizeof(Word), so nothing was ever truncated; larger s produced
  // an out-of-range shift. Sizes of a full Word or more now leave the value
  // unchanged, and a size of 0 clears it.
  void trunc(Size s) {
    if (s >= sizeof(Word)) {
      return;
    }
    const Word mask = (Word(1) << (s * 8)) - 1;
    value_ &= mask;
  }

private:
  T value_;
  Word cpuId_, regNum_;

  // Access hooks; intentionally empty.
  void doWrite() const {}
  void doRead() const {}
};
///////////////////////////////////////////////////////////////////////////////
// One entry of the warp's `domStack_`: a thread mask together with a PC and
// two flags. Constructors that take a PC create a non-fall-through entry;
// constructors that take only a mask create a fall-through entry.
// NOTE(review): two generations of constructors and members are listed
// together (std::vector<bool>/pc/uni and ThreadMask/PC/unanimous); this looks
// like old and new diff lines side by side -- confirm which set is current.
struct DomStackEntry {
// Builds the mask per thread: a thread is included when its register `p`
// reads as zero and its bit in `tm` is set.
DomStackEntry(
unsigned p,
const std::vector<std::vector<Reg<Word>>> &m,
std::vector<bool> &tm,
Word pc
) : pc(pc)
, fallThrough(false)
, uni(false) {
for (unsigned i = 0; i < m.size(); ++i) {
tmask.push_back(!bool(m[i][p]) && tm[i]);
}
}
// Non-fall-through entry with an explicit mask and PC.
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
, PC(PC)
, fallThrough(false)
, unanimous(false)
{}
// Fall-through entry: only the mask is supplied.
DomStackEntry(const std::vector<bool> &tmask)
: tmask(tmask), fallThrough(true), uni(false) {}
// Fall-through entry: only the mask is supplied; PC is set to 0.
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, PC(0)
, fallThrough(true)
, unanimous(false)
{}
std::vector<bool> tmask;
Word pc;
ThreadMask tmask;
Word PC;
bool fallThrough;
bool uni;
bool unanimous;
};
struct vtype {
@@ -86,13 +42,13 @@ class trace_inst_t;
class Warp {
public:
Warp(Core *core, Word id = 0);
bool active() const {
return tmask_.any();
}
void step(trace_inst_t *);
bool interrupt(Word r0);
bool running() const {
return (activeThreads_ != 0);
std::size_t getActiveThreads() const {
return tmask_.count();
}
void printStats() const;
@@ -105,78 +61,40 @@ public:
return id_;
}
Word get_pc() const {
return pc_;
Word getPC() const {
return PC_;
}
void set_pc(Word pc) {
pc_ = pc;
void setPC(Word PC) {
PC_ = PC;
}
void setActiveThreads(Size activeThreads) {
activeThreads_ = activeThreads;
}
Size getActiveThreads() const {
return activeThreads_;
}
void setSpawned(bool spawned) {
spawned_ = spawned;
}
void setSupervisorMode(bool supervisorMode) {
supervisorMode_ = supervisorMode;
}
bool getSupervisorMode() const {
return supervisorMode_;
}
void setTmask(size_t index, bool value) {
tmask_[index] = value;
}
void step(trace_inst_t *);
private:
void execute(Instr &instr, trace_inst_t *);
struct MemAccess {
MemAccess(bool w, Word a)
: wr(w), addr(a) {}
bool wr;
Word addr;
};
std::vector<MemAccess> memAccesses_;
Word id_;
bool active_;
Core *core_;
Word pc_;
Word shadowPc_;
Size activeThreads_;
Size shadowActiveThreads_;
std::vector<std::vector<Reg<Word>>> regFile_;
std::vector<Reg<uint16_t>> csrs_;
std::vector<bool> tmask_;
std::vector<bool> shadowTmask_;
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> iRegFile_;
std::vector<std::vector<Word>> fRegFile_;
std::vector<std::vector<Byte>> vRegFile_;
std::vector<Word> csrs_;
std::stack<DomStackEntry> domStack_;
std::vector<Word> shadowReg_;
struct vtype vtype_; // both of them are XLEN WIDE
int vl_; // both of them are XLEN WIDE
Word VLEN_; // total vector length
std::vector<std::vector<Reg<char *>>> vregFile_; // 32 vector registers
bool interruptEnable_;
bool shadowInterruptEnable_;
bool supervisorMode_;
bool shadowSupervisorMode_;
bool spawned_;
struct vtype vtype_;
int vl_;
unsigned long steps_;
unsigned long insts_;
unsigned long loads_;