diff --git a/ci/regression.sh b/ci/regression.sh index 13d8136d..eb00b259 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -15,6 +15,9 @@ set -e CONFIGS=-DEXT_M_DISABLE make -C hw/simulate CONFIGS=-DEXT_F_DISABLE make -C hw/simulate +# disable shared memory +CONFIGS=-DSM_ENABLE=0 make -C hw/simulate + # Blackbox tests ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1" diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 466512e7..ea9a766b 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -120,7 +120,7 @@ module VX_cluster #( .DATA_WIDTH (32), .ADDR_WIDTH (12), .BUFFERED_REQ (1), - .BUFFERED_RSP (`NUM_CORES >= 4) + .BUFFERED_RSP (1) ) csr_arb ( .clk (clk), .reset (reset), @@ -225,7 +225,7 @@ module VX_cluster #( .DATA_WIDTH (`L2DRAM_LINE_WIDTH), .TAG_IN_WIDTH (`XDRAM_TAG_WIDTH), .TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH), - .BUFFERED_REQ (`NUM_CORES >= 4), + .BUFFERED_REQ (1), .BUFFERED_RSP (1) ) dram_arb ( .clk (clk), diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index c4d699b7..3f760bb8 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -1,6 +1,12 @@ `include "VX_define.vh" `include "VX_print_instr.vh" +`ifdef EXT_F_ENABLE + `define USED_REGS(f,r) used_regs[{f,r}] = 1 +`else + `define USED_REGS(f,r) used_regs[r] = 1 +`endif + module VX_decode #( parameter CORE_ID = 0 ) ( @@ -22,10 +28,12 @@ module VX_decode #( reg [`EX_BITS-1:0] ex_type; reg [`OP_BITS-1:0] op_type; reg [`MOD_BITS-1:0] op_mod; - reg [31:0] imm; - reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm; + reg [4:0] rd_r, rs1_r, rs2_r, rs3_r; + reg [31:0] imm; + reg use_rd, use_PC, use_imm; reg rd_fp, rs1_fp, rs2_fp, rs3_fp; reg is_join, is_wstall; + reg [`NUM_REGS-1:0] used_regs; wire [31:0] instr = ifetch_rsp_if.instr; wire [6:0] opcode = instr[6:0]; @@ -46,14 +54,11 @@ module VX_decode #( always @(*) begin - ex_type = `EX_NOP; + 
ex_type = 0; op_type = 'x; op_mod = 'x; imm = 'x; use_rd = 0; - use_rs1 = 0; - use_rs2 = 0; - use_rs3 = 0; use_PC = 0; use_imm = 0; rd_fp = 0; @@ -61,7 +66,12 @@ module VX_decode #( rs2_fp = 0; rs3_fp = 1; is_join = 0; - is_wstall = 0; + is_wstall = 0; + used_regs = 0; + rd_r = rd; + rs1_r = rs1; + rs2_r = rs2; + rs3_r = rs3; case (opcode) `INST_I: begin @@ -80,8 +90,9 @@ module VX_decode #( op_mod = 0; imm = {{20{alu_imm[11]}}, alu_imm}; use_rd = 1; - use_rs1 = 1; use_imm = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, rs1); end `INST_R: begin ex_type = `EX_ALU; @@ -115,18 +126,21 @@ module VX_decode #( endcase op_mod = 0; end - use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; + use_rd = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end `INST_LUI: begin ex_type = `EX_ALU; op_type = `OP_BITS'(`ALU_LUI); - op_mod = 0; + op_mod = 0; + rs1_r = 0; imm = {upper_imm, 12'(0)}; use_rd = 1; - use_rs1 = 1; - use_imm = 1; + use_imm = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, 5'b0); end `INST_AUIPC: begin ex_type = `EX_ALU; @@ -136,6 +150,7 @@ module VX_decode #( use_rd = 1; use_PC = 1; use_imm = 1; + `USED_REGS (1'b0, rd); end `INST_JAL: begin ex_type = `EX_ALU; @@ -146,6 +161,7 @@ module VX_decode #( use_PC = 1; use_imm = 1; is_wstall = 1; + `USED_REGS (1'b0, rd); end `INST_JALR: begin ex_type = `EX_ALU; @@ -153,9 +169,10 @@ module VX_decode #( op_mod = 1; imm = {{20{jalr_imm[11]}}, jalr_imm}; use_rd = 1; - use_rs1 = 1; use_imm = 1; is_wstall = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, rs1); end `INST_B: begin ex_type = `EX_ALU; @@ -170,11 +187,11 @@ module VX_decode #( endcase op_mod = 1; imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0}; - use_rs1 = 1; - use_rs2 = 1; use_PC = 1; use_imm = 1; is_wstall = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end `INST_SYS : begin if (func3 == 0) begin @@ -192,6 +209,7 @@ module VX_decode #( use_rd = 1; use_PC = 1; use_imm = 1; + `USED_REGS (1'b0, rd); end else begin 
ex_type = `EX_CSR; case (func3[1:0]) @@ -203,8 +221,10 @@ module VX_decode #( endcase imm = 32'(u_12); use_rd = 1; - use_rs1 = !func3[2]; use_imm = func3[2]; + `USED_REGS (1'b0, rd); + if (!func3[2]) + `USED_REGS (1'b0, rs1); end end `ifdef EXT_F_ENABLE @@ -214,10 +234,11 @@ module VX_decode #( ex_type = `EX_LSU; op_type = `OP_BITS'({1'b0, func3}); imm = {{20{u_12[11]}}, u_12}; - use_rd = 1; - use_rs1 = 1; - `ifdef EXT_F_ENABLE - rd_fp = (opcode == `INST_FL); + use_rd = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS ((opcode == `INST_FL), rd); + `ifdef EXT_F_ENABLE + rd_fp = (opcode == `INST_FL); `endif end `ifdef EXT_F_ENABLE @@ -227,8 +248,8 @@ module VX_decode #( ex_type = `EX_LSU; op_type = `OP_BITS'({1'b1, func3}); imm = {{20{func7[6]}}, func7, rd}; - use_rs1 = 1; - use_rs2 = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS ((opcode == `INST_FS), rs2); `ifdef EXT_F_ENABLE rs2_fp = (opcode == `INST_FS); `endif @@ -242,17 +263,18 @@ module VX_decode #( op_type = `OP_BITS'(opcode[3:0]); op_mod = func3; use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; - use_rs3 = 1; rd_fp = 1; rs1_fp = 1; - rs2_fp = 1; + rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); + `USED_REGS (1'b1, rs3); end `INST_FCI: begin ex_type = `EX_FPU; op_mod = func3; - use_rd = 1; + use_rd = 1; case (func7) 7'h00, // FADD 7'h04, // FSUB @@ -260,55 +282,61 @@ module VX_decode #( 7'h0C: // FDIV begin op_type = `OP_BITS'(func7[3:0]); - use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; rd_fp = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h2C: begin op_type = `OP_BITS'(`FPU_SQRT); - use_rs1 = 1; rd_fp = 1; rs1_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); end 7'h50: begin op_type = `OP_BITS'(`FPU_CMP); - use_rs1 = 1; - use_rs2 = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h60: begin op_type = (instr[20]) ? 
`OP_BITS'(`FPU_CVTWUS) : `OP_BITS'(`FPU_CVTWS); - use_rs1 = 1; rs1_fp = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b1, rs1); end 7'h68: begin op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTSWU) : `OP_BITS'(`FPU_CVTSW); - use_rs1 = 1; rd_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b0, rs1); end 7'h10: begin // FSGNJ=0, FSGNJN=1, FSGNJX=2 op_type = `OP_BITS'(`FPU_MISC); op_mod = {1'b0, func3[1:0]}; - use_rs1 = 1; - use_rs2 = 1; rd_fp = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h14: begin // FMIN=3, FMAX=4 op_type = `OP_BITS'(`FPU_MISC); op_mod = func3[0] ? 4 : 3; - use_rs1 = 1; - use_rs2 = 1; rd_fp = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h70: begin if (func3[0]) begin @@ -318,15 +346,17 @@ module VX_decode #( // FMV.X.W=5 op_type = `OP_BITS'(`FPU_MISC); op_mod = 5; - end - use_rs1 = 1; - rs1_fp = 1; + end + rs1_fp = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b1, rs1); end 7'h78: begin // FMV.W.X=6 op_type = `OP_BITS'(`FPU_MISC); op_mod = 6; rd_fp = 1; + `USED_REGS (1'b1, rd); end default:; endcase @@ -337,38 +367,38 @@ module VX_decode #( case (func3) 3'h0: begin op_type = `OP_BITS'(`GPU_TMC); - use_rs1 = 1; is_wstall = 1; + `USED_REGS (1'b0, rs1); end 3'h1: begin op_type = `OP_BITS'(`GPU_WSPAWN); - use_rs1 = 1; - use_rs2 = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end 3'h2: begin op_type = `OP_BITS'(`GPU_SPLIT); - use_rs1 = 1; is_wstall = 1; + `USED_REGS (1'b0, rs1); end 3'h3: begin op_type = `OP_BITS'(`GPU_JOIN); is_join = 1; end 3'h4: begin - op_type = `OP_BITS'(`GPU_BAR); - use_rs1 = 1; - use_rs2 = 1; + op_type = `OP_BITS'(`GPU_BAR); is_wstall = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end `ifdef EXT_TEX_ENABLE 3'h5: begin op_type = `OP_BITS'(`GPU_TEX); op_mod = `MOD_BITS'(func2); - use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; - use_rs3 = 1; rs3_fp = 0; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, 
rs1); + `USED_REGS (1'b0, rs2); + `USED_REGS (1'b0, rs3); end `endif default:; @@ -379,10 +409,7 @@ module VX_decode #( end // disable write to integer register r0 - wire wb = use_rd && (rd_fp || (rd != 0)); - - // EX_ALU needs rs1=0 for LUI operation - wire [4:0] rs1_qual = (opcode == `INST_LUI) ? 5'h0 : rs1; + wire wb = use_rd && (rd_fp || (rd_r != 0)); assign decode_if.valid = ifetch_rsp_if.valid; assign decode_if.wid = ifetch_rsp_if.wid; @@ -393,29 +420,25 @@ module VX_decode #( assign decode_if.op_mod = op_mod; assign decode_if.wb = wb; - `ifdef EXT_F_ENABLE - assign decode_if.rd = {rd_fp, rd}; - assign decode_if.rs1 = {rs1_fp, rs1_qual}; - assign decode_if.rs2 = {rs2_fp, rs2}; - assign decode_if.rs3 = {rs3_fp, rs3}; - `else - `UNUSED_VAR (rd_fp) - `UNUSED_VAR (rs1_fp) - `UNUSED_VAR (rs2_fp) - assign decode_if.rd = rd; - assign decode_if.rs1 = rs1_qual; - assign decode_if.rs2 = rs2; - assign decode_if.rs3 = rs3; - `endif +`ifdef EXT_F_ENABLE + assign decode_if.rd = {rd_fp, rd_r}; + assign decode_if.rs1 = {rs1_fp, rs1_r}; + assign decode_if.rs2 = {rs2_fp, rs2_r}; + assign decode_if.rs3 = {rs3_fp, rs3_r}; +`else + `UNUSED_VAR (rd_fp) + `UNUSED_VAR (rs1_fp) + `UNUSED_VAR (rs2_fp) + assign decode_if.rd = rd_r; + assign decode_if.rs1 = rs1_r; + assign decode_if.rs2 = rs2_r; + assign decode_if.rs3 = rs3_r; +`endif - assign decode_if.imm = imm; - assign decode_if.use_PC = use_PC; - assign decode_if.use_imm = use_imm; - - assign decode_if.used_regs = (`NUM_REGS'(use_rd) << decode_if.rd) - | (`NUM_REGS'(use_rs1) << decode_if.rs1) - | (`NUM_REGS'(use_rs2) << decode_if.rs2) - | (`NUM_REGS'(use_rs3) << decode_if.rs3); + assign decode_if.imm = imm; + assign decode_if.use_PC = use_PC; + assign decode_if.use_imm = use_imm; + assign decode_if.used_regs = used_regs; /////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index a67c7221..9de55ab2 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v 
@@ -82,8 +82,7 @@ module VX_ibuffer #( if (writing && is_slot0) begin q_data_out[i] <= q_data_in; - end - if (pop) begin + end else if (pop) begin q_data_out[i] <= q_data_prev[i]; end end diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index d8f7e7bb..948a5664 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -38,7 +38,8 @@ module VX_instr_demux ( wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .BUFFERED (1) ) alu_buffer ( .clk (clk), .reset (reset), @@ -55,7 +56,8 @@ module VX_instr_demux ( wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), + .BUFFERED (1) ) lsu_buffer ( .clk (clk), .reset (reset), @@ -72,7 +74,8 @@ module VX_instr_demux ( wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32), + .BUFFERED (1) ) csr_buffer ( .clk (clk), .reset (reset), @@ -90,7 +93,8 @@ module VX_instr_demux ( wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .BUFFERED (1) ) fpu_buffer ( .clk (clk), .reset 
(reset), @@ -111,7 +115,8 @@ module VX_instr_demux ( wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) //update number of bits + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .BUFFERED (1) ) gpu_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 43f788c8..bf5f309c 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -97,7 +97,7 @@ module VX_lsu_unit #( && (0 == req_sent_mask) // first submission only && req_wb; // loads only - wire mbuf_pop = dcache_rsp_fire && ~(|rsp_rem_mask_n); + wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); assign mbuf_raddr = dcache_rsp_if.tag[`LSUQ_ADDR_BITS-1:0]; @@ -124,8 +124,9 @@ module VX_lsu_unit #( end end - assign sent_all_ready = (&(dcache_req_if.ready | req_sent_mask | ~req_tmask)) - || (req_is_dup & dcache_req_if.ready[0]); + assign sent_all_ready = &(dcache_req_if.ready | req_sent_mask); + + wire [`NUM_THREADS-1:0] req_sent_dup = {{(`NUM_THREADS-1){dcache_req_fire[0] && req_is_dup}}, 1'b0}; always @(posedge clk) begin if (reset) begin @@ -134,7 +135,7 @@ module VX_lsu_unit #( if (sent_all_ready) req_sent_mask <= 0; else - req_sent_mask <= req_sent_mask | dcache_req_fire; + req_sent_mask <= req_sent_mask | dcache_req_fire | req_sent_dup; end end @@ -146,10 +147,13 @@ module VX_lsu_unit #( req_tag_hold <= mbuf_waddr; end + wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; + assign rsp_rem_mask_n = rsp_rem_mask[mbuf_raddr] & ~dcache_rsp_if.valid; + always @(posedge clk) begin if (mbuf_push) begin - rsp_rem_mask[mbuf_waddr] <= req_is_dup ? 
(`NUM_THREADS)'(1) : req_tmask; + rsp_rem_mask[mbuf_waddr] <= req_tmask_dup; end if (dcache_rsp_fire) begin rsp_rem_mask[mbuf_raddr] <= rsp_rem_mask_n; @@ -159,8 +163,6 @@ module VX_lsu_unit #( wire req_ready_dep = (req_wb && ~mbuf_full) || (~req_wb && st_commit_if.ready); - wire [`NUM_THREADS-1:0] dup_mask = {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; - // DCache Request reg [`NUM_THREADS-1:0][29:0] mem_req_addr; @@ -191,7 +193,7 @@ module VX_lsu_unit #( end end - assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask & dup_mask & ~req_sent_mask; + assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask_dup & ~req_sent_mask; assign dcache_req_if.rw = {`NUM_THREADS{~req_wb}}; assign dcache_req_if.addr = mem_req_addr; assign dcache_req_if.byteen = mem_req_byteen; @@ -257,8 +259,8 @@ module VX_lsu_unit #( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({(| dcache_rsp_if.valid), rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), - .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) + .data_in ({(| dcache_rsp_if.valid), rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), + .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? 
diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index fc67d24a..d768d9dd 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -81,6 +81,27 @@ `define LTRIM(x, s) x[s-1:0] +`define PRINT_ARRAY1D(a, m) \ + $write("{"); \ + for (integer i = (m-1); i >= 0; --i) begin \ + if (i != (m-1)) $write(", "); \ + $write("0x%0h", a[i]); \ + end \ + $write("}"); \ + +`define PRINT_ARRAY2D(a, m, n) \ + $write("{"); \ + for (integer i = n-1; i >= 0; --i) begin \ + if (i != (n-1)) $write(", "); \ + $write("{"); \ + for (integer j = (m-1); j >= 0; --j) begin \ + if (j != (m-1)) $write(", "); \ + $write("0x%0h", a[i][j]); \ + end \ + $write("}"); \ + end \ + $write("}") + `define PRINT_ARRAY1D(a, m) \ $write("{"); \ for (integer i = (m-1); i >= 0; --i) begin \ diff --git a/hw/rtl/VX_smem_arb.v b/hw/rtl/VX_smem_arb.v index f6e1e960..89316a51 100644 --- a/hw/rtl/VX_smem_arb.v +++ b/hw/rtl/VX_smem_arb.v @@ -21,7 +21,7 @@ module VX_smem_arb ( localparam SMEM_ASHIFT = `CLOG2(`SHARED_MEM_BASE_ADDR_ALIGN); localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE); localparam REQ_ADDRW = 32 - REQ_ASHIFT; - localparam REQ_DATAW = REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam REQ_DATAW = 1 + REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; // @@ -30,41 +30,42 @@ module VX_smem_arb ( for (genvar i = 0; i < `NUM_THREADS; ++i) begin - wire cache_req_ready_in; - wire smem_req_ready_in; + wire cache_req_valid_out, cache_req_ready_out; + wire is_smem_addr_in, is_smem_addr_out; // select shared memory bus - wire is_smem_addr = core_req_if.valid[i] && `SM_ENABLE - && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) - && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); + assign is_smem_addr_in = 
core_req_if.valid[i] && `SM_ENABLE + && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) + && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); VX_skid_buffer #( .DATAW (REQ_DATAW) - ) cache_out_buffer ( + ) out_buffer ( .clk (clk), .reset (reset), - .valid_in (core_req_if.valid[i] && !is_smem_addr), - .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), - .ready_in (cache_req_ready_in), - .valid_out (cache_req_if.valid[i]), - .data_out ({cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), - .ready_out (cache_req_if.ready[i]) + .valid_in (core_req_if.valid[i]), + .data_in ({is_smem_addr_in, core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (core_req_if.ready[i]), + .valid_out (cache_req_valid_out), + .data_out ({is_smem_addr_out, cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), + .ready_out (cache_req_ready_out) ); - VX_skid_buffer #( - .DATAW (REQ_DATAW) - ) smem_out_buffer ( - .clk (clk), - .reset (reset), - .valid_in (core_req_if.valid[i] && is_smem_addr), - .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), - .ready_in (smem_req_ready_in), - .valid_out (smem_req_if.valid[i]), - .data_out ({smem_req_if.addr[i], smem_req_if.rw[i], smem_req_if.byteen[i], smem_req_if.data[i], smem_req_if.tag[i]}), - .ready_out (smem_req_if.ready[i]) - ); - - assign core_req_if.ready[i] = is_smem_addr ? 
smem_req_ready_in : cache_req_ready_in; + if (`SM_ENABLE ) begin + assign cache_req_if.valid[i] = cache_req_valid_out && ~is_smem_addr_out; + assign smem_req_if.valid[i] = cache_req_valid_out && is_smem_addr_out; + assign cache_req_ready_out = is_smem_addr_out ? smem_req_if.ready[i] : cache_req_if.ready[i]; + + assign smem_req_if.addr[i] = cache_req_if.addr[i]; + assign smem_req_if.rw[i] = cache_req_if.rw[i]; + assign smem_req_if.byteen[i] = cache_req_if.byteen[i]; + assign smem_req_if.data[i] = cache_req_if.data[i]; + assign smem_req_if.tag[i] = cache_req_if.tag[i]; + end else begin + `UNUSED_VAR (is_smem_addr_out) + assign cache_req_if.valid[i] = cache_req_valid_out; + assign cache_req_ready_out = cache_req_if.ready[i]; + end end // diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 17c67db5..2f2d34e9 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -121,7 +121,7 @@ module Vortex ( .NUM_REQS (`NUM_CLUSTERS), .DATA_WIDTH (32), .ADDR_WIDTH (12), - .BUFFERED_REQ (`NUM_CLUSTERS >= 4), + .BUFFERED_REQ (1), .BUFFERED_RSP (1) ) csr_arb ( .clk (clk), @@ -228,7 +228,7 @@ module Vortex ( .TAG_IN_WIDTH (`L2DRAM_TAG_WIDTH), .TAG_OUT_WIDTH (`L3DRAM_TAG_WIDTH), .BUFFERED_REQ (1), - .BUFFERED_RSP (`NUM_CLUSTERS >= 4) + .BUFFERED_RSP (1) ) dram_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index f6c0dc12..4d6cae37 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -1,14 +1,13 @@ `include "VX_define.vh" `ifndef NOPAE -import local_mem_cfg_pkg::*; `include "afu_json_info.vh" `else `include "vortex_afu.vh" +`endif /* verilator lint_off IMPORTSTAR */ import ccip_if_pkg::*; import local_mem_cfg_pkg::*; /* verilator lint_on IMPORTSTAR */ -`endif module vortex_afu #( parameter NUM_LOCAL_MEM_BANKS = 2 diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 50d2ec7c..689876c0 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -168,8 +168,7 @@ module VX_cache #( .NUM_BANKS 
(NUM_BANKS) ) flush_ctrl ( .clk (clk), - .reset (reset), - .flush (flush), + .reset (reset || flush), .addr_out (flush_addr), .valid_out (flush_enable) ); diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 5c38f2e4..3a3bf1e1 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -98,7 +98,8 @@ module VX_cache_core_rsp_merge #( wire core_rsp_valid_any = (| per_bank_core_rsp_valid); VX_skid_buffer #( - .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)) + .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)), + .BUFFERED (1) ) pipe_reg ( .clk (clk), .reset (reset), @@ -146,7 +147,8 @@ module VX_cache_core_rsp_merge #( for (genvar i = 0; i < NUM_REQS; i++) begin VX_skid_buffer #( - .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH) + .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH), + .BUFFERED (1) ) pipe_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_flush_ctrl.v b/hw/rtl/cache/VX_flush_ctrl.v index b204cc7b..356781d1 100644 --- a/hw/rtl/cache/VX_flush_ctrl.v +++ b/hw/rtl/cache/VX_flush_ctrl.v @@ -9,8 +9,7 @@ module VX_flush_ctrl #( parameter NUM_BANKS = 1 ) ( input wire clk, - input wire reset, - input wire flush, + input wire reset, output wire [`LINE_SELECT_BITS-1:0] addr_out, output wire valid_out ); @@ -18,7 +17,7 @@ module VX_flush_ctrl #( reg [`LINE_SELECT_BITS-1:0] flush_ctr; always @(posedge clk) begin - if (reset || flush) begin + if (reset) begin flush_enable <= 1; flush_ctr <= 0; end else begin diff --git a/hw/rtl/fp_cores/VX_fp_cvt.v b/hw/rtl/fp_cores/VX_fp_cvt.v index 4e030f98..a719ee55 100644 --- a/hw/rtl/fp_cores/VX_fp_cvt.v +++ b/hw/rtl/fp_cores/VX_fp_cvt.v @@ -3,10 +3,6 @@ /// Modified port of cast module from fpnew Libray /// reference: https://github.com/pulp-platform/fpnew -`ifndef SYNTHESIS -`include "float_dpi.vh" -`endif - module VX_fp_cvt #( parameter TAGW = 1, parameter LANES = 1 @@ -73,19 +69,19 @@ module VX_fp_cvt #( ); end - wire 
[LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant; // input mantissa with implicit bit - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent; - wire [LANES-1:0] input_sign; + wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant; // input mantissa with implicit bit + wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent; + wire [LANES-1:0] input_sign; for (genvar i = 0; i < LANES; ++i) begin wire [INT_MAN_WIDTH-1:0] int_mantissa; wire [INT_MAN_WIDTH-1:0] fmt_mantissa; wire fmt_sign = dataa[i][31]; wire int_sign = dataa[i][31] & is_signed; - assign int_mantissa = int_sign ? $unsigned(-dataa[i]) : dataa[i]; + assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i]; assign fmt_mantissa = INT_MAN_WIDTH'({in_a_type[i].is_normal, dataa[i][MAN_BITS-1:0]}); - assign fmt_exponent[i] = $signed({1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]}); + assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]}; assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa; assign input_sign[i] = is_itof ? 
int_sign : fmt_sign; end @@ -115,7 +111,7 @@ module VX_fp_cvt #( wire [2:0] rnd_mode_s0; fp_type_t [LANES-1:0] in_a_type_s0; wire [LANES-1:0] input_sign_s0; - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0; + wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0; wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0; wire [LANES-1:0][LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; wire [LANES-1:0] mant_is_zero_s0; @@ -135,38 +131,93 @@ module VX_fp_cvt #( // Normalization - wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant; // normalized input mantissa - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp; // unbiased true exponent - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] destination_exp; // re-biased exponent for destination + wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant; // normalized input mantissa + wire [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp; // unbiased true exponent + wire [LANES-1:0][INT_EXP_WIDTH-1:0] destination_exp; // re-biased exponent for destination for (genvar i = 0; i < LANES; ++i) begin `IGNORE_WARNINGS_BEGIN // Input mantissa needs to be normalized - wire signed [INT_EXP_WIDTH-1:0] fp_input_exp; - wire signed [INT_EXP_WIDTH-1:0] int_input_exp; - wire [LZC_RESULT_WIDTH:0] renorm_shamt_sgn; - - // signed form for calculations - assign renorm_shamt_sgn = $signed({1'b0, renorm_shamt_s0[i]}); + wire [INT_EXP_WIDTH-1:0] fp_input_exp; + wire [INT_EXP_WIDTH-1:0] int_input_exp; // Realign input mantissa, append zeroes if destination is wider assign input_mant[i] = encoded_mant_s0[i] << renorm_shamt_s0[i]; // Unbias exponent and compensate for shift - assign fp_input_exp = $signed(fmt_exponent_s0[i] + - (($signed({1'b0, in_a_type_s0[i].is_subnormal}) + - $signed(FMT_SHIFT_COMPENSATION - EXP_BIAS)) - - renorm_shamt_sgn)); + assign fp_input_exp = fmt_exponent_s0[i] + + {1'b0, in_a_type_s0[i].is_subnormal} + + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - + {1'b0, renorm_shamt_s0[i]}; - assign int_input_exp = $signed(INT_MAN_WIDTH - 1 - renorm_shamt_sgn); + 
assign int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]}; - assign input_exp[i] = is_itof_s0 ? int_input_exp : fp_input_exp; + assign input_exp[i] = is_itof_s0 ? int_input_exp : fp_input_exp; // Rebias the exponent - assign destination_exp[i] = input_exp[i] + $signed(EXP_BIAS); + assign destination_exp[i] = input_exp[i] + EXP_BIAS; `IGNORE_WARNINGS_END end + // Perform adjustments to mantissa and exponent + + wire [LANES-1:0][2*INT_MAN_WIDTH:0] preshift_mant_s0; + wire [LANES-1:0][SHAMT_BITS-1:0] denorm_shamt_s0; + wire [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s0; + wire [LANES-1:0] of_before_round_s0; + + for (genvar i = 0; i < LANES; ++i) begin + reg [2*INT_MAN_WIDTH:0] preshift_mant; // mantissa before final shift + reg [SHAMT_BITS-1:0] denorm_shamt; // shift amount for denormalization + reg [INT_EXP_WIDTH-1:0] final_exp; // after eventual adjustments + reg of_before_round; + + always @(*) begin + `IGNORE_WARNINGS_BEGIN + // Default assignment + final_exp = destination_exp[i]; // take exponent as is, only look at lower bits + preshift_mant = {input_mant[i], 33'b0}; // Place mantissa to the left of the shifter + denorm_shamt = 0; // right of mantissa + of_before_round = 1'b0; + + // Handle INT casts + if (is_itof_s0) begin + if ($signed(destination_exp[i]) >= $signed(2**EXP_BITS-1)) begin + // Overflow or infinities (for proper rounding) + final_exp = (2**EXP_BITS-2); // largest normal value + preshift_mant = ~0; // largest normal value and RS bits set + of_before_round = 1'b1; + end else if ($signed(destination_exp[i]) < $signed(-MAN_BITS)) begin + // Limit the shift to retain sticky bits + final_exp = 0; // denormal result + denorm_shamt = denorm_shamt + (2 + MAN_BITS); // to sticky + end else if ($signed(destination_exp[i]) < $signed(1)) begin + // Denormalize underflowing values + final_exp = 0; // denormal result + denorm_shamt = denorm_shamt + 1 - destination_exp[i]; // adjust right shifting + end + end else begin + if ($signed(input_exp[i]) >= 
$signed((MAX_INT_WIDTH-1) + unsigned_s0)) begin + // overflow: when converting to unsigned the range is larger by one + denorm_shamt = SHAMT_BITS'(0); // prevent shifting + of_before_round = 1'b1; + end else if ($signed(input_exp[i]) < $signed(-1)) begin + // underflow + denorm_shamt = MAX_INT_WIDTH + 1; // all bits go to the sticky + end else begin + // By default right shift mantissa to be an integer + denorm_shamt = (MAX_INT_WIDTH-1) - input_exp[i]; + end + end + `IGNORE_WARNINGS_END + end + + assign preshift_mant_s0[i] = preshift_mant; + assign denorm_shamt_s0[i] = denorm_shamt; + assign final_exp_s0[i] = final_exp; + assign of_before_round_s0[i] = of_before_round; + end + // Pipeline stage1 wire valid_in_s1; @@ -176,121 +227,68 @@ module VX_fp_cvt #( wire [2:0] rnd_mode_s1; fp_type_t [LANES-1:0] in_a_type_s1; wire [LANES-1:0] mant_is_zero_s1; - wire [LANES-1:0] input_sign_s1; - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1; - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] destination_exp_s1; - wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1; - + wire [LANES-1:0] input_sign_s1; + wire [LANES-1:0][2*INT_MAN_WIDTH:0] preshift_mant_s1; + wire [LANES-1:0][SHAMT_BITS-1:0] denorm_shamt_s1; + wire [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1; + wire [LANES-1:0] of_before_round_s1; + VX_pipe_register #( - .DATAW (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + 2*INT_EXP_WIDTH)), + .DATAW (1 + TAGW + 1 + 1 + `FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + SHAMT_BITS + INT_EXP_WIDTH + 1)), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, mant_is_zero_s0, input_sign_s0, input_mant, input_exp, destination_exp}), - .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, input_mant_s1, input_exp_s1, destination_exp_s1}) + .data_in 
({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, mant_is_zero_s0, input_sign_s0, preshift_mant_s0, denorm_shamt_s0, final_exp_s0, of_before_round_s0}), + .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, preshift_mant_s1, denorm_shamt_s1, final_exp_s1, of_before_round_s1}) ); - // Casting - reg [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp; // after eventual adjustments - - reg [LANES-1:0][2*INT_MAN_WIDTH:0] preshift_mant; // mantissa before final shift - wire [LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant; // mantissa from shifter, with rnd bit - wire [LANES-1:0][MAN_BITS-1:0] final_mant; // mantissa after adjustments - wire [LANES-1:0][MAX_INT_WIDTH-1:0] final_int; // integer shifted in position - - reg [LANES-1:0][SHAMT_BITS-1:0] denorm_shamt; // shift amount for denormalization - - wire [LANES-1:0][1:0] fp_round_sticky_bits, int_round_sticky_bits, round_sticky_bits; - reg [LANES-1:0] of_before_round; - - // Perform adjustments to mantissa and exponent + wire [LANES-1:0] rounded_sign; + wire [LANES-1:0][31:0] rounded_abs; // absolute value of result after rounding + wire [LANES-1:0][1:0] fp_round_sticky_bits, int_round_sticky_bits; + + // Rounding and classification + for (genvar i = 0; i < LANES; ++i) begin - always @(*) begin - `IGNORE_WARNINGS_BEGIN - // Default assignment - final_exp[i] = $unsigned(destination_exp_s1[i]); // take exponent as is, only look at lower bits - preshift_mant[i] = 65'b0; // initialize mantissa container with zeroes - denorm_shamt[i] = 0; // right of mantissa - of_before_round[i] = 1'b0; - - // Place mantissa to the left of the shifter - preshift_mant[i] = {input_mant_s1[i], 33'b0}; - - // Handle INT casts - if (is_itof_s1) begin - // Overflow or infinities (for proper rounding) - if ($signed(destination_exp_s1[i]) >= $signed(2**EXP_BITS-1)) begin - final_exp[i] = (2**EXP_BITS-2); // largest normal value - preshift_mant[i] = ~0; // largest 
normal value and RS bits set - of_before_round[i] = 1'b1; - // Denormalize underflowing values - end else if (($signed(destination_exp_s1[i]) < $signed(1)) - && ($signed(destination_exp_s1[i]) >= -$signed(MAN_BITS))) begin - final_exp[i] = 0; // denormal result - denorm_shamt[i] = $unsigned(denorm_shamt[i] + 1 - destination_exp_s1[i]); // adjust right shifting - // Limit the shift to retain sticky bits - end else if ($signed(destination_exp_s1[i]) < -$signed(MAN_BITS)) begin - final_exp[i] = 0; // denormal result - denorm_shamt[i] = $unsigned(denorm_shamt[i] + (2 + MAN_BITS)); // to sticky - end - end else begin - // By default right shift mantissa to be an integer - denorm_shamt[i] = (MAX_INT_WIDTH-1) - input_exp_s1[i]; - // overflow: when converting to unsigned the range is larger by one - if ($signed(input_exp_s1[i]) >= $signed(MAX_INT_WIDTH -1 + unsigned_s1)) begin - denorm_shamt[i] = SHAMT_BITS'(0); // prevent shifting - of_before_round[i] = 1'b1; - // underflow - end else if ($signed(input_exp_s1[i]) < $signed(-1)) begin - denorm_shamt[i] = MAX_INT_WIDTH + 1; // all bits go to the sticky - end - end - `IGNORE_WARNINGS_END - end + wire [2*INT_MAN_WIDTH:0] destination_mant; + wire [MAN_BITS-1:0] final_mant; // mantissa after adjustments + wire [MAX_INT_WIDTH-1:0] final_int; // integer shifted in position + wire [1:0] round_sticky_bits; + wire [31:0] fmt_pre_round_abs; + wire [31:0] pre_round_abs; // Mantissa adjustment shift - assign destination_mant[i] = preshift_mant[i] >> denorm_shamt[i]; - + assign destination_mant = preshift_mant_s1[i] >> denorm_shamt_s1[i]; + // Extract final mantissa and round bit, discard the normal bit (for FP) - assign {final_mant[i], fp_round_sticky_bits[i][1]} = destination_mant[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; - assign {final_int[i], int_round_sticky_bits[i][1]} = destination_mant[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1]; + assign {final_mant, fp_round_sticky_bits[i][1]} = 
destination_mant[2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; + assign {final_int, int_round_sticky_bits[i][1]} = destination_mant[2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1]; // Collapse sticky bits - assign fp_round_sticky_bits[i][0] = (| destination_mant[i][NUM_FP_STICKY-1:0]); - assign int_round_sticky_bits[i][0] = (| destination_mant[i][NUM_INT_STICKY-1:0]); + assign fp_round_sticky_bits[i][0] = (| destination_mant[NUM_FP_STICKY-1:0]); + assign int_round_sticky_bits[i][0] = (| destination_mant[NUM_INT_STICKY-1:0]); // select RS bits for destination operation - assign round_sticky_bits[i] = is_itof_s1 ? fp_round_sticky_bits[i] : int_round_sticky_bits[i]; - end + assign round_sticky_bits = is_itof_s1 ? fp_round_sticky_bits[i] : int_round_sticky_bits[i]; - // Rouding and classification - - wire [LANES-1:0] rounded_sign; - wire [LANES-1:0][31:0] rounded_abs; // absolute value of result after rounding - - for (genvar i = 0; i < LANES; ++i) begin // Pack exponent and mantissa into proper rounding form - wire [31:0] fmt_pre_round_abs = {1'b0, final_exp[i][EXP_BITS-1:0], final_mant[i][MAN_BITS-1:0]}; - - // Sign-extend integer result - wire [31:0] ifmt_pre_round_abs = final_int[i]; + assign fmt_pre_round_abs = {1'b0, final_exp_s1[i][EXP_BITS-1:0], final_mant[MAN_BITS-1:0]}; // Select output with destination format and operation - wire [31:0] pre_round_abs = is_itof_s1 ? fmt_pre_round_abs : ifmt_pre_round_abs; + assign pre_round_abs = is_itof_s1 ? 
fmt_pre_round_abs : final_int; // Perform the rounding VX_fp_rounding #( .DAT_WIDTH (32) ) fp_rounding ( - .abs_value_i (pre_round_abs), - .sign_i (input_sign_s1[i]), - .round_sticky_bits_i (round_sticky_bits[i]), - .rnd_mode_i (rnd_mode_s1), - .effective_subtraction_i (1'b0), - .abs_rounded_o (rounded_abs[i]), - .sign_o (rounded_sign[i]), + .abs_value_i (pre_round_abs), + .sign_i (input_sign_s1[i]), + .round_sticky_bits_i(round_sticky_bits), + .rnd_mode_i (rnd_mode_s1), + .effective_subtraction_i(1'b0), + .abs_rounded_o (rounded_abs[i]), + .sign_o (rounded_sign[i]), `UNUSED_PIN (exact_zero_o) ); end @@ -306,23 +304,22 @@ module VX_fp_cvt #( wire [LANES-1:0] input_sign_s2; wire [LANES-1:0] rounded_sign_s2; wire [LANES-1:0][31:0] rounded_abs_s2; + wire [LANES-1:0] of_before_round_s2; VX_pipe_register #( - .DATAW (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1)), + .DATAW (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1 + 1)), .RESETW (1) ) pipe_reg2 ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, rounded_abs, rounded_sign}), - .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2}) + .data_in ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, rounded_abs, rounded_sign, of_before_round_s1}), + .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2}) ); wire [LANES-1:0] of_after_round; wire [LANES-1:0] uf_after_round; - wire [LANES-1:0][31:0] fmt_result; - wire [LANES-1:0][31:0] rounded_int_res; // after possible inversion wire [LANES-1:0] rounded_int_res_zero; // after rounding @@ -335,7 +332,7 @@ module VX_fp_cvt #( assign of_after_round[i] = (rounded_abs_s2[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // 
inf exp. // Negative integer result needs to be brought into two's complement - assign rounded_int_res[i] = rounded_sign_s2[i] ? $unsigned(-rounded_abs_s2[i]) : rounded_abs_s2[i]; + assign rounded_int_res[i] = rounded_sign_s2[i] ? (-rounded_abs_s2[i]) : rounded_abs_s2[i]; assign rounded_int_res_zero[i] = (rounded_int_res[i] == 0); end @@ -373,7 +370,7 @@ module VX_fp_cvt #( int_special_result[i][30:0] = 0; // alone yields 2**(31)-1 int_special_result[i][31] = ~unsigned_s2; // for unsigned casts yields 2**31 end else begin - int_special_result[i][30:0] = 2**(31) -1; // alone yields 2**(31)-1 + int_special_result[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1 int_special_result[i][31] = unsigned_s2; // for unsigned casts yields 2**31 end end @@ -381,7 +378,7 @@ module VX_fp_cvt #( // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned) assign int_result_is_special[i] = in_a_type_s2[i].is_nan | in_a_type_s2[i].is_inf - | of_before_round[i] + | of_before_round_s2[i] | (input_sign_s2[i] & unsigned_s2 & ~rounded_int_res_zero[i]); // All integer special cases are invalid @@ -399,11 +396,11 @@ module VX_fp_cvt #( wire [31:0] fp_result, int_result; wire inexact = is_itof_s2 ? 
(| fp_round_sticky_bits[i]) // overflow is invalid in i2f; - : (| fp_round_sticky_bits[i]) | (~in_a_type_s2[i].is_inf & (of_before_round[i] | of_after_round[i])); + : (| fp_round_sticky_bits[i]) | (~in_a_type_s2[i].is_inf & (of_before_round_s2[i] | of_after_round[i])); - assign fp_regular_status.NV = is_itof_s2 & (of_before_round[i] | of_after_round[i]); // overflow is invalid for I2F casts + assign fp_regular_status.NV = is_itof_s2 & (of_before_round_s2[i] | of_after_round[i]); // overflow is invalid for I2F casts assign fp_regular_status.DZ = 1'b0; // no divisions - assign fp_regular_status.OF = ~is_itof_s2 & (~in_a_type_s2[i].is_inf & (of_before_round[i] | of_after_round[i])); // inf casts no OF + assign fp_regular_status.OF = ~is_itof_s2 & (~in_a_type_s2[i].is_inf & (of_before_round_s2[i] | of_after_round[i])); // inf casts no OF assign fp_regular_status.UF = uf_after_round[i] & inexact; assign fp_regular_status.NX = inexact; diff --git a/hw/rtl/fp_cores/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v index 69c8e93a..7d4f5fc4 100644 --- a/hw/rtl/fp_cores/VX_fp_div.v +++ b/hw/rtl/fp_cores/VX_fp_div.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_div #( parameter TAGW = 1, parameter LANES = 1 diff --git a/hw/rtl/fp_cores/VX_fp_fma.v b/hw/rtl/fp_cores/VX_fp_fma.v index ce7efb24..4d095823 100644 --- a/hw/rtl/fp_cores/VX_fp_fma.v +++ b/hw/rtl/fp_cores/VX_fp_fma.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_fma #( parameter TAGW = 1, parameter LANES = 1 diff --git a/hw/rtl/fp_cores/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v index 869da516..5aa0f134 100644 --- a/hw/rtl/fp_cores/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/VX_fp_sqrt.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_sqrt #( parameter TAGW = 1, parameter LANES = 1 @@ -44,7 +48,7 @@ module VX_fp_sqrt #( fflags_t f; always @(*) begin - 
dpi_fsqrt (dataa[i], frm, r, f); + dpi_fsqrt (dataa[i], frm, r, f); end `UNUSED_VAR (f) diff --git a/hw/rtl/fp_cores/VX_fp_type.v b/hw/rtl/fp_cores/VX_fp_type.v index df202148..bdc41b86 100644 --- a/hw/rtl/fp_cores/VX_fp_type.v +++ b/hw/rtl/fp_cores/VX_fp_type.v @@ -10,7 +10,7 @@ module VX_fp_type ( ); wire is_normal = (exp_i != 8'd0) && (exp_i != 8'hff); wire is_zero = (exp_i == 8'd0) && (man_i == 23'd0); - wire is_subnormal = (exp_i == 8'd0) && !is_zero; + wire is_subnormal = (exp_i == 8'd0) && (man_i != 23'd0); wire is_inf = (exp_i == 8'hff) && (man_i == 23'd0); wire is_nan = (exp_i == 8'hff) && (man_i != 23'd0); wire is_signaling = is_nan && (man_i[22] == 1'b0); diff --git a/hw/rtl/libs/VX_skid_buffer.v b/hw/rtl/libs/VX_skid_buffer.v index 08377cfb..31d789a0 100644 --- a/hw/rtl/libs/VX_skid_buffer.v +++ b/hw/rtl/libs/VX_skid_buffer.v @@ -67,8 +67,7 @@ module VX_skid_buffer #( end else begin if (ready_out) begin use_buffer <= 0; - end - if (push && !pop) begin + end else if (push && valid_out_r) begin assert(!use_buffer); use_buffer <= 1; end @@ -81,9 +80,11 @@ module VX_skid_buffer #( always @(posedge clk) begin if (push) begin buffer <= data_in; - end - if (pop) begin - data_out_r <= use_buffer ? 
buffer : data_in; + end + if (pop && !use_buffer) begin + data_out_r <= data_in; + end else if (pop) begin + data_out_r <= buffer; end end diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 097bcc97..5a871248 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -138,5 +138,4 @@ clean-fpga-32c: clean-fpga-64c: rm -rf $(FPGA_BUILD_DIR)_64c sources.txt -clean: clean-ase-1c clean-ase-2c clean-ase-4c clean-fpga-1c clean-fpga-2c clean-fpga-4c clean-fpga-8c clean-fpga-16c clean-fpga-32c clean-fpga-64c - rm sources.txt \ No newline at end of file +clean: clean-ase-1c clean-ase-2c clean-ase-4c clean-fpga-1c clean-fpga-2c clean-fpga-4c clean-fpga-8c clean-fpga-16c clean-fpga-32c clean-fpga-64c \ No newline at end of file diff --git a/hw/syn/opae/sources_16c.txt b/hw/syn/opae/sources_16c.txt index 94aeb46c..cbee87e0 100644 --- a/hw/syn/opae/sources_16c.txt +++ b/hw/syn/opae/sources_16c.txt @@ -6,7 +6,7 @@ +define+QUARTUS #+define+PERF_ENABLE -vortex_afu.json +vortex_afu16.json QI:vortex_afu.qsf C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_32c.txt b/hw/syn/opae/sources_32c.txt index e1bf6649..1fc88ecd 100644 --- a/hw/syn/opae/sources_32c.txt +++ b/hw/syn/opae/sources_32c.txt @@ -2,6 +2,8 @@ +define+NUM_CLUSTERS=4 #+define+L3_ENABLE=1 ++define+GLOBAL_BLOCK_SIZE=16 + +define+SYNTHESIS +define+QUARTUS #+define+PERF_ENABLE diff --git a/hw/syn/opae/sources_64c.txt b/hw/syn/opae/sources_64c.txt index 8cc42e1b..bf267717 100644 --- a/hw/syn/opae/sources_64c.txt +++ b/hw/syn/opae/sources_64c.txt @@ -2,6 +2,8 @@ +define+NUM_CLUSTERS=8 #+define+L3_ENABLE=1 ++define+GLOBAL_BLOCK_SIZE=16 + +define+SYNTHESIS +define+QUARTUS #+define+PERF_ENABLE diff --git a/hw/syn/opae/sources_8c.txt b/hw/syn/opae/sources_8c.txt index a41c281f..7b52e20d 100644 --- a/hw/syn/opae/sources_8c.txt +++ b/hw/syn/opae/sources_8c.txt @@ -6,7 +6,7 @@ +define+QUARTUS #+define+PERF_ENABLE -vortex_afu.json +vortex_afu16.json QI:vortex_afu.qsf C:sources.txt \ No 
newline at end of file diff --git a/hw/syn/opae/vortex_afu16.json b/hw/syn/opae/vortex_afu16.json new file mode 100644 index 00000000..1f361d1e --- /dev/null +++ b/hw/syn/opae/vortex_afu16.json @@ -0,0 +1,56 @@ +{ + "version": 1, + "afu-image": { + "power": 0, + "clock-frequency-high": "auto-210", + "clock-frequency-low": "auto-210", + + "cmd-mem-read": 1, + "cmd-mem-write": 2, + "cmd-run": 3, + "cmd-csr-read": 4, + "cmd-csr-write": 5, + + "mmio-cmd-type": 10, + "mmio-io-addr": 12, + "mmio-mem-addr": 14, + "mmio-data-size": 16, + "mmio-status": 18, + "mmio-scope-read": 20, + "mmio-scope-write": 22, + "mmio-csr-core": 24, + "mmio-csr-addr": 26, + "mmio-csr-data": 28, + "mmio-csr-read": 30, + + "afu-top-interface": + { + "class": "ccip_std_afu_avalon_mm", + "module-ports" : + [ + { + "class": "cci-p", + "params": + { + "clock": "uClk_usr" + } + }, + { + "class": "local-memory", + "params": + { + "clock": "uClk_usr" + } + } + ] + }, + "accelerator-clusters": + [ + { + "name": "vortex_afu", + "total-contexts": 1, + "accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c" + } + ] + } +} diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index 9c9b6ca2..9fa3df14 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -41,11 +41,7 @@ set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 set_global_assignment -name TIMEQUEST_DO_REPORT_TIMING ON -#set_global_assignment -name ALLOW_ANY_RAM_SIZE_FOR_RECOGNITION ON -#set_global_assignment -name USE_HIGH_SPEED_ADDER ON -#set_global_assignment -name MUX_RESTRUCTURE ON - -set_global_assignment -name OPTIMIZATION_TECHNIQUE AREA +#set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED #set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" #set_global_assignment -name FINAL_PLACEMENT_OPTIMIZATION ALWAYS #set_global_assignment -name PLACEMENT_EFFORT_MULTIPLIER 2.0 diff --git a/runtime/libvortexrt.a b/runtime/libvortexrt.a 
index 1af78dde..7eed26ce 100644 Binary files a/runtime/libvortexrt.a and b/runtime/libvortexrt.a differ