This commit is contained in:
Blaise Tine
2020-09-08 13:05:47 -04:00
25 changed files with 2411 additions and 2605 deletions

View File

@@ -507,6 +507,12 @@ extern int vx_start(vx_device_h hdevice) {
// start execution // start execution
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN));
#ifdef SCOPE
sleep(15);
vx_scope_stop(device->fpga, 0);
exit(0);
#endif
return 0; return 0;
} }

BIN
driver/tests/dogfood/kernel.bin Executable file → Normal file

Binary file not shown.

View File

@@ -131,9 +131,8 @@ void kernel_fmadd(void* arg) {
for (uint32_t i = 0; i < count; ++i) { for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i]; float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i]; float b = src1_ptr[offset+i];
float c = a - b; float c = a * b + b;
float d = a * b + c; dst_ptr[offset+i] = c;
dst_ptr[offset+i] = d;
} }
} }
@@ -148,9 +147,8 @@ void kernel_fmsub(void* arg) {
for (uint32_t i = 0; i < count; ++i) { for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i]; float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i]; float b = src1_ptr[offset+i];
float c = a - b; float c = a * b - b;
float d = a * b - c; dst_ptr[offset+i] = c;
dst_ptr[offset+i] = d;
} }
} }
@@ -165,9 +163,8 @@ void kernel_fnmadd(void* arg) {
for (uint32_t i = 0; i < count; ++i) { for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i]; float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i]; float b = src1_ptr[offset+i];
float c = a - b; float c =-a * b - b;
float d =-a * b - c; dst_ptr[offset+i] = c;
dst_ptr[offset+i] = d;
} }
} }
@@ -182,9 +179,8 @@ void kernel_fnmsub(void* arg) {
for (uint32_t i = 0; i < count; ++i) { for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i]; float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i]; float b = src1_ptr[offset+i];
float c = a - b; float c =-a * b + b;
float d =-a * b + c; dst_ptr[offset+i] = c;
dst_ptr[offset+i] = d;
} }
} }
@@ -199,11 +195,10 @@ void kernel_fnmadd_madd(void* arg) {
for (uint32_t i = 0; i < count; ++i) { for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i]; float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i]; float b = src1_ptr[offset+i];
float c = a - b; float c =-a * b - b;
float d =-a * b - c; float d = a * b + b;
float e = a * b + c; float e = c + d;
float f = d + e; dst_ptr[offset+i] = e;
dst_ptr[offset+i] = f;
} }
} }

File diff suppressed because it is too large Load Diff

BIN
driver/tests/dogfood/kernel.elf Executable file → Normal file

Binary file not shown.

View File

@@ -253,8 +253,7 @@ public:
auto b = (float*)src2; auto b = (float*)src2;
auto c = (float*)dst; auto c = (float*)dst;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i]; auto ref = a[i] * b[i] + b[i];
auto ref = a[i] * b[i] + x;
if (!almost_equal(c[i], ref)) { if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors; ++errors;
@@ -282,8 +281,7 @@ public:
auto b = (float*)src2; auto b = (float*)src2;
auto c = (float*)dst; auto c = (float*)dst;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i]; auto ref = a[i] * b[i] - b[i];
auto ref = a[i] * b[i] - x;
if (!almost_equal(c[i], ref)) { if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors; ++errors;
@@ -311,8 +309,7 @@ public:
auto b = (float*)src2; auto b = (float*)src2;
auto c = (float*)dst; auto c = (float*)dst;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i]; auto ref = -a[i] * b[i] - b[i];
auto ref = -a[i] * b[i] - x;
if (!almost_equal(c[i], ref)) { if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors; ++errors;
@@ -340,8 +337,7 @@ public:
auto b = (float*)src2; auto b = (float*)src2;
auto c = (float*)dst; auto c = (float*)dst;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i]; auto ref = -a[i] * b[i] + b[i];
auto ref = -a[i] * b[i] + x;
if (!almost_equal(c[i], ref)) { if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors; ++errors;
@@ -369,10 +365,9 @@ public:
auto b = (float*)src2; auto b = (float*)src2;
auto c = (float*)dst; auto c = (float*)dst;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i]; auto x = -a[i] * b[i] - b[i];
auto y = -a[i] * b[i] - x; auto y = a[i] * b[i] + b[i];
auto z = a[i] * b[i] + x; auto ref = x + y;
auto ref = y + z;
if (!almost_equal(c[i], ref)) { if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors; ++errors;

View File

@@ -79,6 +79,8 @@ tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd
tar -zcvf trace.vcd.tar.gz trace.vcd tar -zcvf trace.vcd.tar.gz trace.vcd
tar -zcvf run.log.tar.gz run.log tar -zcvf run.log.tar.gz run.log
tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd
tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd
tar -zcvf run.log.tar.gz build_ase_1c/work/run.log
# decompress VCD trace # decompress VCD trace
tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz
@@ -104,6 +106,5 @@ make -C top clean && make -C top > top/build.log 2>&1 &
200 Mhz -> period = 1/200x10^6 = 5ns 200 Mhz -> period = 1/200x10^6 = 5ns
if slack = +1.664 -> minimal period = 5-1.664 = 3.336 -> fmax = 1/3.336 = 300 Mhz if slack = +1.664 -> minimal period = 5-1.664 = 3.336 -> fmax = 1/3.336 = 300 Mhz
# build rtlsim from driver tests # build rtlsim from driver tests
make -C ../../rtlsim clean && reset && make -C ../../rtlsim make -C ../../rtlsim clean && reset && make -C ../../rtlsim

View File

@@ -106,7 +106,7 @@ module ccip_std_afu #(
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS) .NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
) vortex_afu_inst ( ) vortex_afu_inst (
.clk (clk), .clk (clk),
.SoftReset (reset_T1), .reset (reset_T1),
.avs_writedata (avs_writedata), .avs_writedata (avs_writedata),
.avs_readdata (avs_readdata), .avs_readdata (avs_readdata),

View File

@@ -18,7 +18,7 @@ module vortex_afu #(
) ( ) (
// global signals // global signals
input clk, input clk,
input SoftReset, input reset,
// IF signals between CCI and AFU // IF signals between CCI and AFU
input t_if_ccip_Rx cp2af_sRxPort, input t_if_ccip_Rx cp2af_sRxPort,
@@ -191,7 +191,7 @@ assign cmd_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mm
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) begin if (reset) begin
mmio_tx.hdr <= 0; mmio_tx.hdr <= 0;
mmio_tx.data <= 0; mmio_tx.data <= 0;
mmio_tx.mmioRdValid <= 0; mmio_tx.mmioRdValid <= 0;
@@ -319,7 +319,7 @@ logic cmd_run_done;
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) begin if (reset) begin
state <= STATE_IDLE; state <= STATE_IDLE;
vx_reset <= 0; vx_reset <= 0;
end end
@@ -484,18 +484,18 @@ begin
case (state) case (state)
CMD_MEM_READ: avs_address = cci_dram_rd_req_addr; CMD_MEM_READ: avs_address = cci_dram_rd_req_addr;
CMD_MEM_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout))); CMD_MEM_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout)));
default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH]; default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
endcase endcase
case (state) case (state)
CMD_MEM_READ: avs_byteenable = 64'hffffffffffffffff; CMD_MEM_READ: avs_byteenable = 64'hffffffffffffffff;
CMD_MEM_WRITE: avs_byteenable = 64'hffffffffffffffff; CMD_MEM_WRITE: avs_byteenable = 64'hffffffffffffffff;
default: avs_byteenable = vx_dram_req_byteen_; default: avs_byteenable = vx_dram_req_byteen_;
endcase endcase
case (state) case (state)
CMD_MEM_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)]; CMD_MEM_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)];
default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset; default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset;
endcase endcase
end end
@@ -506,7 +506,7 @@ assign cmd_write_done = (cci_dram_wr_req_ctr >= cmd_data_size);
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) if (reset)
begin begin
mem_bank_select <= 0; mem_bank_select <= 0;
avs_burstcount <= 1; avs_burstcount <= 1;
@@ -586,7 +586,7 @@ VX_generic_queue #(
.SIZE(AVS_RD_QUEUE_SIZE) .SIZE(AVS_RD_QUEUE_SIZE)
) avs_rd_req_queue ( ) avs_rd_req_queue (
.clk (clk), .clk (clk),
.reset (SoftReset), .reset (reset),
.push (avs_rtq_push), .push (avs_rtq_push),
.data_in ({vx_dram_req_tag, vx_dram_req_offset}), .data_in ({vx_dram_req_tag, vx_dram_req_offset}),
.pop (avs_rtq_pop), .pop (avs_rtq_pop),
@@ -608,7 +608,7 @@ VX_generic_queue #(
.SIZE(AVS_RD_QUEUE_SIZE) .SIZE(AVS_RD_QUEUE_SIZE)
) avs_rd_rsp_queue ( ) avs_rd_rsp_queue (
.clk (clk), .clk (clk),
.reset (SoftReset), .reset (reset),
.push (avs_rdq_push), .push (avs_rdq_push),
.data_in (avs_readdata), .data_in (avs_readdata),
.pop (avs_rdq_pop), .pop (avs_rdq_pop),
@@ -655,7 +655,7 @@ assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait;
// Send read requests to CCI // Send read requests to CCI
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) begin if (reset) begin
cci_rd_req_addr <= 0; cci_rd_req_addr <= 0;
cci_rd_req_ctr <= 0; cci_rd_req_ctr <= 0;
cci_rd_rsp_ctr <= 0; cci_rd_rsp_ctr <= 0;
@@ -716,7 +716,7 @@ VX_generic_queue #(
.SIZE(CCI_RD_QUEUE_SIZE) .SIZE(CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue ( ) cci_rd_req_queue (
.clk (clk), .clk (clk),
.reset (SoftReset), .reset (reset),
.push (cci_rdq_push), .push (cci_rdq_push),
.data_in (cci_rdq_din), .data_in (cci_rdq_din),
.pop (cci_rdq_pop), .pop (cci_rdq_pop),
@@ -754,7 +754,7 @@ assign af2cp_sTxPort.c1.valid = cci_wr_req_enable && !avs_rdq_empty;
// Send write requests to CCI // Send write requests to CCI
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) begin if (reset) begin
cci_wr_req_addr <= 0; cci_wr_req_addr <= 0;
cci_wr_req_ctr <= 0; cci_wr_req_ctr <= 0;
cci_wr_req_enable <= 0; cci_wr_req_enable <= 0;
@@ -818,7 +818,7 @@ assign cmd_clflush_done = (0 == snp_rsp_ctr);
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) begin if (reset) begin
vx_snp_req_valid <= 0; vx_snp_req_valid <= 0;
vx_snp_req_addr <= 0; vx_snp_req_addr <= 0;
vx_snp_req_tag <= 0; vx_snp_req_tag <= 0;
@@ -887,7 +887,7 @@ assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_
always_ff @(posedge clk) always_ff @(posedge clk)
begin begin
if (SoftReset) begin if (reset) begin
csr_io_req_sent <= 0; csr_io_req_sent <= 0;
cmd_csr_rdata <= 0; cmd_csr_rdata <= 0;
end end
@@ -918,7 +918,7 @@ Vortex #() vortex (
`SCOPE_SIGNALS_EXECUTE_BIND `SCOPE_SIGNALS_EXECUTE_BIND
.clk (clk), .clk (clk),
.reset (SoftReset | vx_reset), .reset (reset | vx_reset),
// DRAM request // DRAM request
.dram_req_valid (vx_dram_req_valid), .dram_req_valid (vx_dram_req_valid),
@@ -980,6 +980,13 @@ Vortex #() vortex (
`UNUSED_PIN (ebreak) `UNUSED_PIN (ebreak)
); );
always @(posedge clk) begin
if (!reset) begin
// DRAM reads should only happen during vortex execution
assert(vx_busy || !vx_dram_rd_req_enable);
end
end
// SCOPE ////////////////////////////////////////////////////////////////////// // SCOPE //////////////////////////////////////////////////////////////////////
`ifdef SCOPE `ifdef SCOPE
@@ -1049,7 +1056,7 @@ for (genvar i = 1; i < SCOPE_SR_DEPTH; i++) begin
.N (SCOPE_DATAW+2) .N (SCOPE_DATAW+2)
) scope_sr ( ) scope_sr (
.clk (clk), .clk (clk),
.reset (SoftReset), .reset (reset),
.stall (0), .stall (0),
.flush (0), .flush (0),
.in (scope_data_in_st[i-1]), .in (scope_data_in_st[i-1]),
@@ -1064,7 +1071,7 @@ VX_scope #(
.UPDW ($bits({`SCOPE_SIGNALS_UPD_LIST})) .UPDW ($bits({`SCOPE_SIGNALS_UPD_LIST}))
) scope ( ) scope (
.clk (clk), .clk (clk),
.reset (SoftReset), .reset (reset),
.start (scope_data_in_ste[0]), .start (scope_data_in_ste[0]),
.stop (0), .stop (0),
.changed (scope_data_in_ste[1]), .changed (scope_data_in_ste[1]),

View File

@@ -18,23 +18,35 @@ module VX_commit #(
VX_writeback_if writeback_if, VX_writeback_if writeback_if,
VX_cmt_to_csr_if cmt_to_csr_if VX_cmt_to_csr_if cmt_to_csr_if
); );
localparam NCMTW = $clog2(`NUM_EXS*`NUM_THREADS+1);
// CSRs update // CSRs update
wire [`NUM_EXS-1:0] commited_mask; wire [`NUM_EXS-1-1:0] exu_committed;
assign commited_mask = {alu_commit_if.valid, wire [`NUM_THREADS-1:0] lsu_committed;
lsu_commit_if.valid, wire [$clog2(`NUM_EXS-1+1)-1:0] exu_commits;
wire [$clog2(`NUM_THREADS+1)-1:0] lsu_commits;
assign exu_committed = {alu_commit_if.valid,
csr_commit_if.valid, csr_commit_if.valid,
mul_commit_if.valid, mul_commit_if.valid,
fpu_commit_if.valid, fpu_commit_if.valid,
gpu_commit_if.valid}; gpu_commit_if.valid};
wire [$clog2(`NUM_EXS+1)-1:0] num_commits; assign lsu_committed = {`NUM_THREADS{lsu_commit_if.valid}} & lsu_commit_if.tmask;
VX_countones #( VX_countones #(
.N(`NUM_EXS) .N(`NUM_EXS-1)
) valids_counter ( ) exu_counter (
.valids(commited_mask), .valids(exu_committed),
.count (num_commits) .count (exu_commits)
);
VX_countones #(
.N(`NUM_THREADS)
) lsu_counter (
.valids(lsu_committed),
.count (lsu_commits)
); );
fflags_t fflags; fflags_t fflags;
@@ -54,20 +66,22 @@ module VX_commit #(
fflags_t fflags_r; fflags_t fflags_r;
reg has_fflags_r; reg has_fflags_r;
reg [`NW_BITS-1:0] wid_r; reg [`NW_BITS-1:0] wid_r;
reg [$clog2(`NUM_EXS+1)-1:0] num_commits_r; reg [$clog2(`NUM_EXS-1+1)-1:0] exu_cmt_r;
reg [$clog2(`NUM_THREADS+1)-1:0] lsu_cmt_r;
reg csr_update_r; reg csr_update_r;
always @(posedge clk) begin always @(posedge clk) begin
csr_update_r <= (| commited_mask); csr_update_r <= (| exu_committed) | lsu_commit_if.valid;
fflags_r <= fflags; fflags_r <= fflags;
has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags; has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
wid_r <= fpu_commit_if.wid; wid_r <= fpu_commit_if.wid;
num_commits_r <= (num_commits << $clog2(`NUM_THREADS)); exu_cmt_r <= exu_commits;
lsu_cmt_r <= lsu_commits;
end end
assign cmt_to_csr_if.valid = csr_update_r; assign cmt_to_csr_if.valid = csr_update_r;
assign cmt_to_csr_if.wid = wid_r; assign cmt_to_csr_if.wid = wid_r;
assign cmt_to_csr_if.num_commits = num_commits_r; assign cmt_to_csr_if.num_commits = {exu_cmt_r, `NT_BITS'(0)} + NCMTW'(lsu_cmt_r);
assign cmt_to_csr_if.has_fflags = has_fflags_r; assign cmt_to_csr_if.has_fflags = has_fflags_r;
assign cmt_to_csr_if.fflags = fflags_r; assign cmt_to_csr_if.fflags = fflags_r;

View File

@@ -59,6 +59,8 @@
`define EXT_F_ENABLE `define EXT_F_ENABLE
`endif `endif
//`define FPU_FAST
// Device identification // Device identification
`define VENDOR_ID 0 `define VENDOR_ID 0
`define ARCHITECTURE_ID 0 `define ARCHITECTURE_ID 0

View File

@@ -51,11 +51,11 @@ module VX_fpu_unit #(
.full (fpuq_full) .full (fpuq_full)
); );
wire valid_in = fpu_req_if.valid && ~fpuq_full;
// can accept new request? // can accept new request?
assign fpu_req_if.ready = ready_in && ~fpuq_full; assign fpu_req_if.ready = ready_in && ~fpuq_full;
wire valid_in = fpu_req_if.valid && ~fpuq_full;
`ifdef FPU_FAST `ifdef FPU_FAST
VX_fp_fpga #( VX_fp_fpga #(

View File

@@ -25,12 +25,11 @@ module VX_mul_unit #(
wire [`NR_BITS-1:0] rsp_rd; wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb; wire rsp_wb;
wire [MULQ_BITS-1:0] tag_in, tag_out; wire [MULQ_BITS-1:0] tag_in, tag_out;
wire valid_out; wire valid_out, ready_out;
wire stall_out;
wire mulq_full; wire mulq_full;
wire mulq_push = mul_req_if.valid && mul_req_if.ready; wire mulq_push = mul_req_if.valid && mul_req_if.ready;
wire mulq_pop = valid_out && ~stall_out; wire mulq_pop = valid_out && ready_out;
VX_cam_buffer #( VX_cam_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
@@ -48,12 +47,18 @@ module VX_mul_unit #(
.full (mulq_full) .full (mulq_full)
); );
wire valid_in = mul_req_if.valid && ~mulq_full;
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] mul_result; wire [`NUM_THREADS-1:0][31:0] mul_result;
wire [MULQ_BITS-1:0] mul_tag;
wire is_mul_in = (alu_op == `MUL_MUL); wire is_mul_in = (alu_op == `MUL_MUL);
wire is_mul_out; wire is_mul_out;
wire stall_mul;
wire mul_valid_out;
wire mul_valid_in = valid_in && !is_div_op;
wire mul_ready_in = ready_out || ~mul_valid_out;
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
@@ -71,7 +76,7 @@ module VX_mul_unit #(
.LATENCY(`LATENCY_IMUL) .LATENCY(`LATENCY_IMUL)
) multiplier ( ) multiplier (
.clk(clk), .clk(clk),
.enable(~stall_mul), .enable(mul_ready_in),
.dataa(mul_in1), .dataa(mul_in1),
.datab(mul_in2), .datab(mul_in2),
.result(mul_result_tmp) .result(mul_result_tmp)
@@ -80,19 +85,14 @@ module VX_mul_unit #(
assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
end end
wire [MULQ_BITS-1:0] mul_tag;
wire mul_valid_out;
wire mul_fire = mul_req_if.valid && mul_req_if.ready && !is_div_op;
VX_shift_register #( VX_shift_register #(
.DATAW(1 + MULQ_BITS + 1), .DATAW(1 + MULQ_BITS + 1),
.DEPTH(`LATENCY_IMUL) .DEPTH(`LATENCY_IMUL)
) mul_shift_reg ( ) mul_shift_reg (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(~stall_mul), .enable(mul_ready_in),
.in({mul_fire, tag_in, is_mul_in}), .in({mul_valid_in, tag_in, is_mul_in}),
.out({mul_valid_out, mul_tag, is_mul_out}) .out({mul_valid_out, mul_tag, is_mul_out})
); );
@@ -100,13 +100,13 @@ module VX_mul_unit #(
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp; wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU); wire is_rem_op = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU);
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM); wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
wire div_valid_in = mul_req_if.valid && is_div_op; wire div_valid_in = valid_in && is_div_op;
wire div_ready_out = ready_out && ~mul_valid_out; // arbitration prioritizes MUL
wire div_ready_in; wire div_ready_in;
wire div_ready_out;
wire div_valid_out; wire div_valid_out;
wire is_div_out; wire is_rem_op_out;
wire [MULQ_BITS-1:0] div_tag; wire [MULQ_BITS-1:0] div_tag;
VX_serial_div #( VX_serial_div #(
@@ -122,30 +122,25 @@ module VX_mul_unit #(
.ready_in(div_ready_in), .ready_in(div_ready_in),
.valid_in(div_valid_in), .valid_in(div_valid_in),
.signed_mode(is_signed_div), .signed_mode(is_signed_div),
.tag_in({tag_in, is_div_only}), .tag_in({tag_in, is_rem_op}),
.numer(alu_in1), .numer(alu_in1),
.denom(alu_in2), .denom(alu_in2),
.quotient(div_result_tmp), .quotient(div_result_tmp),
.remainder(rem_result_tmp), .remainder(rem_result_tmp),
.ready_out(div_ready_out), .ready_out(div_ready_out),
.valid_out(div_valid_out), .valid_out(div_valid_out),
.tag_out({div_tag, is_div_out}) .tag_out({div_tag, is_rem_op_out})
); );
wire [`NUM_THREADS-1:0][31:0] div_result = is_div_out ? div_result_tmp : rem_result_tmp; wire [`NUM_THREADS-1:0][31:0] div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire arbiter_hazard = mul_valid_out && div_valid_out; wire stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
assign ready_out = ~stall_out;
assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
assign stall_mul = (stall_out && !is_div_op) || mulq_full;
assign div_ready_out = ~stall_out && ~arbiter_hazard; // arbitration prioritizes MUL
wire stall_in = stall_mul || ~div_ready_in;
assign valid_out = mul_valid_out || div_valid_out; assign valid_out = mul_valid_out || div_valid_out;
assign tag_out = mul_valid_out ? mul_tag : div_tag; assign tag_out = mul_valid_out ? mul_tag : div_tag;
wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result; wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result;
VX_generic_register #( VX_generic_register #(
@@ -160,6 +155,6 @@ module VX_mul_unit #(
); );
// can accept new request? // can accept new request?
assign mul_req_if.ready = ~stall_in; assign mul_req_if.ready = (is_div_op ? div_ready_in : mul_ready_in) && ~mulq_full;
endmodule endmodule

View File

@@ -20,9 +20,9 @@ module VX_warp_sched #(
wire [31:0] join_pc; wire [31:0] join_pc;
wire [`NUM_THREADS-1:0] join_tm; wire [`NUM_THREADS-1:0] join_tm;
reg [`NUM_WARPS-1:0] active_warps; // real active warps (updated when a warp is activated or disabled) reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0] schedule_table, schedule_table_n; // enforces round-robin, barrier, and non-speculating branches reg [`NUM_WARPS-1:0] schedule_table, schedule_table_n; // enforces round-robin, barrier, and non-speculating branches
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
// Lock warp until instruction decode to resolve branches // Lock warp until instruction decode to resolve branches
reg [`NUM_WARPS-1:0] fetch_lock; reg [`NUM_WARPS-1:0] fetch_lock;
@@ -46,12 +46,20 @@ module VX_warp_sched #(
wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready; wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;
always @(*) begin
active_warps_n = active_warps;
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n = warp_ctl_if.wspawn.wmask;
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
end
end
always @(*) begin always @(*) begin
schedule_table_n = schedule_table; schedule_table_n = schedule_table;
if (warp_ctl_if.valid if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
&& warp_ctl_if.tmc.valid schedule_table_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
&& (0 == warp_ctl_if.tmc.tmask)) begin
schedule_table_n[warp_ctl_if.wid] = 0;
end end
if (scheduled_warp) begin // remove scheduled warp (round-robin) if (scheduled_warp) begin // remove scheduled warp (round-robin)
schedule_table_n[warp_to_schedule] = 0; schedule_table_n[warp_to_schedule] = 0;
@@ -82,7 +90,6 @@ module VX_warp_sched #(
end end
end else begin end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps <= warp_ctl_if.wspawn.wmask;
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1)); use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
use_wspawn_pc <= warp_ctl_if.wspawn.pc; use_wspawn_pc <= warp_ctl_if.wspawn.pc;
end end
@@ -97,9 +104,6 @@ module VX_warp_sched #(
end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask; thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
stalled_warps[warp_ctl_if.wid] <= 0; stalled_warps[warp_ctl_if.wid] <= 0;
if (0 == warp_ctl_if.tmc.tmask) begin
active_warps[warp_ctl_if.wid] <= 0;
end
end else if (join_if.valid && !didnt_split) begin end else if (join_if.valid && !didnt_split) begin
if (!join_fall) begin if (!join_fall) begin
warp_pcs[join_if.wid] <= join_pc; warp_pcs[join_if.wid] <= join_pc;
@@ -143,8 +147,10 @@ module VX_warp_sched #(
warp_pcs[ifetch_rsp_if.wid] <= ifetch_rsp_if.PC + 4; warp_pcs[ifetch_rsp_if.wid] <= ifetch_rsp_if.PC + 4;
end end
active_warps <= active_warps_n;
// reset 'schedule_table' when it goes to zero // reset 'schedule_table' when it goes to zero
schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps; schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps_n;
end end
end end

View File

@@ -51,9 +51,9 @@ module VX_fp_addmul #(
.ax(dataa[i]), .ax(dataa[i]),
.ay(datab[i]), .ay(datab[i]),
.az(), .az(),
.clk({2'b00,clk}), .clk({2'b00, clk}),
.ena({2'b11,enable}), .ena({2'b00, enable}),
.aclr(2'b00), .aclr({reset, reset}),
.chainin(), .chainin(),
// outputs // outputs
.overflow(), .overflow(),
@@ -91,9 +91,9 @@ module VX_fp_addmul #(
.ax(dataa[i]), .ax(dataa[i]),
.ay(datab[i]), .ay(datab[i]),
.az(), .az(),
.clk({2'b00,clk}), .clk({2'b00, clk}),
.ena({2'b11,enable}), .ena({2'b00, enable}),
.aclr(2'b00), .aclr({reset, reset}),
.chainin(), .chainin(),
// outputs // outputs
.overflow(), .overflow(),
@@ -131,9 +131,9 @@ module VX_fp_addmul #(
.ax(), .ax(),
.ay(datab[i]), .ay(datab[i]),
.az(dataa[i]), .az(dataa[i]),
.clk({2'b00,clk}), .clk({2'b00, clk}),
.ena({2'b11,enable}), .ena({2'b00, enable}),
.aclr(2'b00), .aclr({reset, reset}),
.chainin(), .chainin(),
// outputs // outputs
.overflow(), .overflow(),

View File

@@ -32,7 +32,7 @@ module VX_fp_div #(
`ifdef QUARTUS `ifdef QUARTUS
acl_fdiv fdiv ( acl_fdiv fdiv (
.clk (clk), .clk (clk),
.areset (1'b0), .areset (reset),
.en (enable), .en (enable),
.a (dataa[i]), .a (dataa[i]),
.b (datab[i]), .b (datab[i]),

View File

@@ -27,7 +27,7 @@ module VX_fp_fpga #(
input wire ready_out, input wire ready_out,
output wire valid_out output wire valid_out
); );
localparam NUM_FPC = 8; localparam NUM_FPC = 7;
localparam FPC_BITS = `LOG2UP(NUM_FPC); localparam FPC_BITS = `LOG2UP(NUM_FPC);
wire [NUM_FPC-1:0] per_core_ready_in; wire [NUM_FPC-1:0] per_core_ready_in;
@@ -40,28 +40,28 @@ module VX_fp_fpga #(
fflags_t [`NUM_THREADS-1:0] fpnew_fflags; fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
reg [FPC_BITS-1:0] core_select; reg [FPC_BITS-1:0] core_select;
reg do_sub, do_mul; reg do_sub, do_mul, do_neg;
reg is_signed; reg is_signed;
always @(*) begin always @(*) begin
core_select = 'x; do_sub = 'x;
do_sub = 'x; do_mul = 'x;
do_mul = 'x; do_neg = 'x;
is_signed = 'x; is_signed = 'x;
case (op_type) case (op_type)
`FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end `FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end
`FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end `FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end
`FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end `FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end
`FPU_MADD: begin core_select = 2; do_sub = 0; end `FPU_MADD: begin core_select = 2; do_sub = 0; do_neg = 0; end
`FPU_MSUB: begin core_select = 2; do_sub = 1; end `FPU_MSUB: begin core_select = 2; do_sub = 1; do_neg = 0; end
`FPU_NMADD: begin core_select = 3; do_sub = 0; end `FPU_NMADD: begin core_select = 2; do_sub = 0; do_neg = 1; end
`FPU_NMSUB: begin core_select = 3; do_sub = 1; end `FPU_NMSUB: begin core_select = 2; do_sub = 1; do_neg = 1; end
`FPU_DIV: begin core_select = 4; end `FPU_DIV: begin core_select = 3; end
`FPU_SQRT: begin core_select = 5; end `FPU_SQRT: begin core_select = 4; end
`FPU_CVTWS: begin core_select = 6; is_signed = 1; end `FPU_CVTWS: begin core_select = 5; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 6; is_signed = 0; end `FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
`FPU_CVTSW: begin core_select = 7; is_signed = 1; end `FPU_CVTSW: begin core_select = 6; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 7; is_signed = 0; end `FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
default: begin core_select = 0; end default: begin core_select = 0; end
endcase endcase
end end
@@ -116,6 +116,7 @@ module VX_fp_fpga #(
.ready_in (per_core_ready_in[2]), .ready_in (per_core_ready_in[2]),
.tag_in (tag_in), .tag_in (tag_in),
.do_sub (do_sub), .do_sub (do_sub),
.do_neg (do_neg),
.dataa (dataa), .dataa (dataa),
.datab (datab), .datab (datab),
.datac (datac), .datac (datac),
@@ -125,51 +126,49 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[2]) .valid_out (per_core_valid_out[2])
); );
VX_fp_nmadd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_nmadd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.tag_in (tag_in),
.do_sub (do_sub),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
);
VX_fp_div #( VX_fp_div #(
.TAGW (TAGW), .TAGW (TAGW),
.LANES(`NUM_THREADS) .LANES(`NUM_THREADS)
) fp_div ( ) fp_div (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (valid_in && (core_select == 4)), .valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[4]), .ready_in (per_core_ready_in[3]),
.tag_in (tag_in), .tag_in (tag_in),
.dataa (dataa), .dataa (dataa),
.datab (datab), .datab (datab),
.result (per_core_result[4]), .result (per_core_result[3]),
.tag_out (per_core_tag_out[4]), .tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[4]), .ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[4]) .valid_out (per_core_valid_out[3])
); );
VX_fp_sqrt #( VX_fp_sqrt #(
.TAGW (TAGW), .TAGW (TAGW),
.LANES(`NUM_THREADS) .LANES(`NUM_THREADS)
) fp_sqrt ( ) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
);
VX_fp_ftoi #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_ftoi (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (valid_in && (core_select == 5)), .valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]), .ready_in (per_core_ready_in[5]),
.tag_in (tag_in), .tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa), .dataa (dataa),
.result (per_core_result[5]), .result (per_core_result[5]),
.tag_out (per_core_tag_out[5]), .tag_out (per_core_tag_out[5]),
@@ -177,10 +176,10 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[5]) .valid_out (per_core_valid_out[5])
); );
VX_fp_ftoi #( VX_fp_itof #(
.TAGW (TAGW), .TAGW (TAGW),
.LANES(`NUM_THREADS) .LANES(`NUM_THREADS)
) fp_ftoi ( ) fp_itof (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (valid_in && (core_select == 6)), .valid_in (valid_in && (core_select == 6)),
@@ -194,23 +193,6 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[6]) .valid_out (per_core_valid_out[6])
); );
VX_fp_itof #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_itof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 7)),
.ready_in (per_core_ready_in[7]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[7]),
.tag_out (per_core_tag_out[7]),
.ready_out (per_core_ready_out[7]),
.valid_out (per_core_valid_out[7])
);
reg valid_out_n; reg valid_out_n;
reg has_fflags_n; reg has_fflags_n;
reg [`NUM_THREADS-1:0][31:0] result_n; reg [`NUM_THREADS-1:0][31:0] result_n;
@@ -234,7 +216,7 @@ module VX_fp_fpga #(
end end
end end
assign ready_in = (& per_core_ready_in); assign ready_in = per_core_ready_in[core_select];
assign valid_out = valid_out_n; assign valid_out = valid_out_n;
assign has_fflags = has_fflags_n; assign has_fflags = has_fflags_n;
assign tag_out = tag_out_n; assign tag_out = tag_out_n;

View File

@@ -39,7 +39,7 @@ module VX_fp_ftoi #(
`ifdef QUARTUS `ifdef QUARTUS
acl_ftoi ftoi ( acl_ftoi ftoi (
.clk (clk), .clk (clk),
.areset (1'b0), .areset (reset),
.en (enable), .en (enable),
.a (dataa[i]), .a (dataa[i]),
.q (result_s) .q (result_s)
@@ -47,7 +47,7 @@ module VX_fp_ftoi #(
acl_ftou ftou ( acl_ftou ftou (
.clk (clk), .clk (clk),
.areset (1'b0), .areset (reset),
.en (enable), .en (enable),
.a (dataa[i]), .a (dataa[i]),
.q (result_u) .q (result_u)

View File

@@ -39,7 +39,7 @@ module VX_fp_itof #(
`ifdef QUARTUS `ifdef QUARTUS
acl_itof itof ( acl_itof itof (
.clk (clk), .clk (clk),
.areset (1'b0), .areset (reset),
.en (enable), .en (enable),
.a (dataa[i]), .a (dataa[i]),
.q (result_s) .q (result_s)
@@ -47,7 +47,7 @@ module VX_fp_itof #(
acl_utof utof ( acl_utof utof (
.clk (clk), .clk (clk),
.areset (1'b0), .areset (reset),
.en (enable), .en (enable),
.a (dataa[i]), .a (dataa[i]),
.q (result_u) .q (result_u)

View File

@@ -17,6 +17,7 @@ module VX_fp_madd #(
input wire [TAGW-1:0] tag_in, input wire [TAGW-1:0] tag_in,
input wire do_sub, input wire do_sub,
input wire do_neg,
input wire [LANES-1:0][31:0] dataa, input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab, input wire [LANES-1:0][31:0] datab,
@@ -32,7 +33,7 @@ module VX_fp_madd #(
wire stall = ~ready_out && valid_out; wire stall = ~ready_out && valid_out;
wire enable = ~stall; wire enable = ~stall;
reg do_sub_r; reg do_sub_r, do_neg_r;
for (genvar i = 0; i < LANES; i++) begin for (genvar i = 0; i < LANES; i++) begin
@@ -50,9 +51,9 @@ module VX_fp_madd #(
.ax(datac[i]), .ax(datac[i]),
.ay(datab[i]), .ay(datab[i]),
.az(dataa[i]), .az(dataa[i]),
.clk({2'b00,clk}), .clk({2'b00, clk}),
.ena({2'b11,enable}), .ena({2'b00, enable}),
.aclr(2'b00), .aclr({reset, reset}),
.chainin(), .chainin(),
// outputs // outputs
.overflow(), .overflow(),
@@ -90,9 +91,9 @@ module VX_fp_madd #(
.ax(datac[i]), .ax(datac[i]),
.ay(datab[i]), .ay(datab[i]),
.az(dataa[i]), .az(dataa[i]),
.clk({2'b00,clk}), .clk({2'b00, clk}),
.ena({2'b11,enable}), .ena({2'b00, enable}),
.aclr(2'b00), .aclr({reset, reset}),
.chainin(), .chainin(),
// outputs // outputs
.overflow(), .overflow(),
@@ -126,18 +127,20 @@ module VX_fp_madd #(
end end
`endif `endif
assign result[i] = do_sub_r ? result_msub : result_madd; wire [31:0] result_unqual = do_sub_r ? result_msub : result_madd;
assign result[i][31] = result_unqual[31] ^ do_neg_r;
assign result[i][30:0] = result_unqual[30:0];
end end
VX_shift_register #( VX_shift_register #(
.DATAW(TAGW + 1 + 1), .DATAW(TAGW + 1 + 1 + 1),
.DEPTH(`LATENCY_FMADD) .DEPTH(`LATENCY_FMADD)
) shift_reg ( ) shift_reg (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in({tag_in, valid_in, do_sub}), .in({tag_in, valid_in, do_sub, do_neg}),
.out({tag_out, valid_out, do_sub_r}) .out({tag_out, valid_out, do_sub_r, do_neg_r})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -1,197 +0,0 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// VX_fp_nmadd: per-lane two-stage floating-point pipeline.
//
// Stage 1 evaluates both a multiply-add and a multiply-subtract on
// (dataa, datab, datac) and keeps the one selected by `do_sub`, which is
// delayed by `LATENCY_FMADD cycles so it lines up with the stage-1 results.
// Stage 2 feeds the selected value, together with a constant zero, into an
// add/subtract stage (simulation: dpi_fsub(0, x)), which is presumably how
// the "negated" result is produced -- NOTE(review): confirm operand order of
// the sp_add subtract against the DSP block documentation.
//
// Parameters:
//   TAGW  - width in bits of the tag carried alongside each request.
//   LANES - number of independent 32-bit lanes processed in parallel.
//
// Handshake:
//   The whole pipeline advances only while `enable` is high, i.e. whenever
//   the output is not stalled (valid_out asserted with ready_out low).
//   `ready_in` mirrors `enable`. `tag_in`/`valid_in` are delayed by
//   `LATENCY_FMADD + `LATENCY_FADDMUL cycles to form `tag_out`/`valid_out`.
module VX_fp_nmadd #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire do_sub,

    input wire [LANES-1:0][31:0] dataa,
    input wire [LANES-1:0][31:0] datab,
    input wire [LANES-1:0][31:0] datac,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // Back-pressure: hold every pipeline register while the consumer is not
    // accepting a valid output.
    wire stall  = ~ready_out && valid_out;
    wire enable = ~stall;

    // do_sub delayed to the end of stage 1 (see shift_reg0 below).
    reg do_sub_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_madd;
        wire [31:0] result_msub;

        // Stage-1 result chosen by the delayed operation select.
        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;

    `ifdef QUARTUS
        // Stage 1a: multiply-add (adder_subtract = "false").
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_madd),
            .chainout()
        );
        defparam mac_fp_madd.operation_mode = "sp_mult_add";
        defparam mac_fp_madd.use_chainin = "false";
        defparam mac_fp_madd.adder_subtract = "false";
        defparam mac_fp_madd.ax_clock = "0";
        defparam mac_fp_madd.ay_clock = "0";
        defparam mac_fp_madd.az_clock = "0";
        defparam mac_fp_madd.output_clock = "0";
        defparam mac_fp_madd.accumulate_clock = "none";
        defparam mac_fp_madd.ax_chainin_pl_clock = "0";
        defparam mac_fp_madd.accum_pipeline_clock = "none";
        defparam mac_fp_madd.mult_pipeline_clock = "0";
        defparam mac_fp_madd.adder_input_clock = "0";
        defparam mac_fp_madd.accum_adder_clock = "none";

        // Stage 1b: multiply-subtract (adder_subtract = "true").
        // The clock enable must be the same `enable` that gates mac_fp_madd
        // and the shift registers; otherwise this block would drift out of
        // step with the tag/valid pipeline during a stall.
        twentynm_fp_mac mac_fp_msub (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_msub),
            .chainout()
        );
        defparam mac_fp_msub.operation_mode = "sp_mult_add";
        defparam mac_fp_msub.use_chainin = "false";
        defparam mac_fp_msub.adder_subtract = "true";
        defparam mac_fp_msub.ax_clock = "0";
        defparam mac_fp_msub.ay_clock = "0";
        defparam mac_fp_msub.az_clock = "0";
        defparam mac_fp_msub.output_clock = "0";
        defparam mac_fp_msub.accumulate_clock = "none";
        defparam mac_fp_msub.ax_chainin_pl_clock = "0";
        defparam mac_fp_msub.accum_pipeline_clock = "none";
        defparam mac_fp_msub.mult_pipeline_clock = "0";
        defparam mac_fp_msub.adder_input_clock = "0";
        defparam mac_fp_msub.accum_adder_clock = "none";

        // Stage 2: add/subtract of the stage-1 result against constant zero.
        // Gated by the same `enable` as stage 1 so the two stages stall
        // together.
        twentynm_fp_mac mac_fp_neg (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(32'h0),
            .ay(result_st0),
            .az(),
            .clk({2'b00,clk}),
            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result[i]),
            .chainout()
        );
        defparam mac_fp_neg.operation_mode = "sp_add";
        defparam mac_fp_neg.use_chainin = "false";
        defparam mac_fp_neg.adder_subtract = "true";
        defparam mac_fp_neg.ax_clock = "0";
        defparam mac_fp_neg.ay_clock = "0";
        defparam mac_fp_neg.az_clock = "none";
        defparam mac_fp_neg.output_clock = "0";
        defparam mac_fp_neg.accumulate_clock = "none";
        defparam mac_fp_neg.ax_chainin_pl_clock = "none";
        defparam mac_fp_neg.accum_pipeline_clock = "none";
        defparam mac_fp_neg.mult_pipeline_clock = "none";
        defparam mac_fp_neg.adder_input_clock = "0";
        defparam mac_fp_neg.accum_adder_clock = "none";
    `else
        // Simulation model: DPI calls stand in for the DSP blocks. The first
        // argument is a per-lane, per-operation index (5/6/7 * LANES + i).
        always @(posedge clk) begin
            dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
            dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
            dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
        end
    `endif
    end

    // Delays the operation select by the stage-1 latency so it is valid when
    // result_madd/result_msub emerge.
    VX_shift_register #(
        .DATAW(1),
        .DEPTH(`LATENCY_FMADD)
    ) shift_reg0 (
        .clk(clk),
        .reset(reset),
        .enable(enable),
        .in({do_sub}),
        .out({do_sub_r})
    );

    // Carries tag and valid through the full two-stage latency.
    VX_shift_register #(
        .DATAW(TAGW + 1),
        .DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
    ) shift_reg1 (
        .clk(clk),
        .reset(reset),
        .enable(enable),
        .in({tag_in, valid_in}),
        .out({tag_out, valid_out})
    );

    // New requests are accepted exactly when the pipeline can advance.
    assign ready_in = enable;

endmodule

View File

@@ -31,7 +31,7 @@ module VX_fp_sqrt #(
`ifdef QUARTUS `ifdef QUARTUS
acl_fsqrt fsqrt ( acl_fsqrt fsqrt (
.clk (clk), .clk (clk),
.areset (1'b0), .areset (reset),
.en (enable), .en (enable),
.a (dataa[i]), .a (dataa[i]),
.q (result[i]) .q (result[i])

View File

@@ -91,6 +91,7 @@ module VX_fpnew #(
fpu_operands[0] = dataa; fpu_operands[0] = dataa;
fpu_operands[1] = datab; fpu_operands[1] = datab;
fpu_operands[2] = datac; fpu_operands[2] = datac;
case (op_type) case (op_type)
`FPU_ADD: begin `FPU_ADD: begin
fpu_op = fpnew_pkg::ADD; fpu_op = fpnew_pkg::ADD;
@@ -108,22 +109,22 @@ module VX_fpnew #(
`FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end `FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end
`FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end `FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end `FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end `FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end `FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
`FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end `FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
`FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end `FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
`FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end `FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
`FPU_MISC: begin `FPU_MISC: begin
case (frm) case (frm)
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end 0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end 1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end 2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end 3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end 4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
endcase endcase
end end
default:; default:;

View File

@@ -9,7 +9,7 @@ interface VX_cmt_to_csr_if ();
wire [`NW_BITS-1:0] wid; wire [`NW_BITS-1:0] wid;
wire [$clog2(`NUM_EXS+1)-1:0] num_commits; wire [$clog2(`NUM_EXS*`NUM_THREADS+1)-1:0] num_commits;
wire has_fflags; wire has_fflags;
fflags_t fflags; fflags_t fflags;