From aeeb3ca616b27ff8bc9e14f5b4676cd06787c10a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 7 Sep 2021 23:54:10 -0700 Subject: [PATCH] ALU unit critical path optimization --- hw/rtl/VX_alu_unit.v | 140 ++++++++++++++++++++++------------------ hw/rtl/VX_csr_unit.v | 2 +- hw/rtl/VX_decode.v | 4 +- hw/rtl/VX_define.vh | 14 ++-- hw/rtl/VX_execute.v | 4 +- hw/rtl/VX_instr_demux.v | 31 +++++---- 6 files changed, 107 insertions(+), 88 deletions(-) diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 205ce8da..f9812992 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -22,12 +22,12 @@ module VX_alu_unit #( wire [`NUM_THREADS-1:0][31:0] shr_result; reg [`NUM_THREADS-1:0][31:0] msc_result; - wire stall_in, stall_out; + wire ready_in; `UNUSED_VAR (alu_req_if.op_mod) wire is_br_op = `INST_ALU_IS_BR(alu_req_if.op_mod); - wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_OP(alu_req_if.op_type); - wire [`INST_BR_BITS-1:0] br_op = `INST_BR_OP(alu_req_if.op_type); + wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(alu_req_if.op_type); + wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(alu_req_if.op_type); wire alu_signed = `INST_ALU_SIGNED(alu_op); wire [1:0] alu_op_class = `INST_ALU_OP_CLASS(alu_op); wire is_sub = (alu_op == `INST_ALU_SUB); @@ -92,17 +92,49 @@ module VX_alu_unit #( // output - wire result_valid; - wire [`NW_BITS-1:0] result_wid; - wire [`NUM_THREADS-1:0] result_tmask; - wire [31:0] result_PC; - wire [`NR_BITS-1:0] result_rd; - wire result_wb; - wire [`NUM_THREADS-1:0][31:0] result_data; - wire result_is_br; + wire alu_valid_in; + wire alu_ready_in; + wire alu_valid_out; + wire alu_ready_out; + wire [`NW_BITS-1:0] alu_wid; + wire [`NUM_THREADS-1:0] alu_tmask; + wire [31:0] alu_PC; + wire [`NR_BITS-1:0] alu_rd; + wire alu_wb; + wire [`NUM_THREADS-1:0][31:0] alu_data; + + wire [`INST_BR_BITS-1:0] br_op_r; + wire [31:0] br_dest_r; + wire is_less_r; + wire is_equal_r; + wire is_br_op_r; + + assign alu_ready_in = alu_ready_out || ~alu_valid_out; + + VX_pipe_register #( + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), + .RESETW (1) + ) pipe_reg ( + .clk (clk), + .reset (reset), + .enable (alu_ready_in), + .data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), + .data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) + ); + + `UNUSED_VAR (br_op_r) + wire br_neg = `INST_BR_NEG(br_op_r); + wire br_less = `INST_BR_LESS(br_op_r); + wire br_static = `INST_BR_STATIC(br_op_r); + + assign branch_ctl_if.valid = alu_valid_out && alu_ready_out && is_br_op_r; + assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static; + assign branch_ctl_if.wid = alu_wid; + assign branch_ctl_if.dest = br_dest_r; `ifdef EXT_M_ENABLE + wire mul_valid_in; wire mul_ready_in; wire mul_valid_out; wire mul_ready_out; @@ -113,14 +145,14 @@ module VX_alu_unit #( wire mul_wb; wire [`NUM_THREADS-1:0][31:0] mul_data; - wire is_mul_op = `INST_ALU_IS_MUL(alu_req_if.op_mod); - + wire [`INST_MUL_BITS-1:0] mul_op = `INST_MUL_BITS'(alu_req_if.op_type); + VX_muldiv muldiv ( .clk (clk), .reset (reset), // Inputs - .alu_op (`INST_MUL_OP(alu_req_if.op_type)), + .alu_op (mul_op), .wid_in (alu_req_if.wid), .tmask_in (alu_req_if.tmask), .PC_in (alu_req_if.PC), @@ -138,72 +170,52 @@ module VX_alu_unit #( .data_out (mul_data), // handshake - .valid_in (alu_req_if.valid && is_mul_op), + .valid_in (mul_valid_in), .ready_in (mul_ready_in), .valid_out (mul_valid_out), .ready_out (mul_ready_out) ); - assign stall_in = (is_mul_op && ~mul_ready_in) - || (~is_mul_op && (mul_valid_out || stall_out)); - - assign mul_ready_out = ~stall_out; + wire is_mul_op = `INST_ALU_IS_MUL(alu_req_if.op_mod); - assign result_valid = mul_valid_out || (alu_req_if.valid && ~is_mul_op); - assign result_wid = mul_valid_out ? mul_wid : alu_req_if.wid; - assign result_tmask = mul_valid_out ? mul_tmask : alu_req_if.tmask; - assign result_PC = mul_valid_out ? mul_PC : alu_req_if.PC; - assign result_rd = mul_valid_out ? mul_rd : alu_req_if.rd; - assign result_wb = mul_valid_out ? mul_wb : alu_req_if.wb; - assign result_data = mul_valid_out ? mul_data : alu_jal_result; - assign result_is_br = ~mul_valid_out && is_br_op; + assign ready_in = is_mul_op ? mul_ready_in : alu_ready_in; + + assign alu_valid_in = alu_req_if.valid && ~is_mul_op; + assign mul_valid_in = alu_req_if.valid && is_mul_op; + + assign alu_commit_if.valid = alu_valid_out || mul_valid_out; + assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid; + assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask; + assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC; + assign alu_commit_if.rd = alu_valid_out ? alu_rd : mul_rd; + assign alu_commit_if.wb = alu_valid_out ? alu_wb : mul_wb; + assign alu_commit_if.data = alu_valid_out ? alu_data : mul_data; + + assign alu_ready_out = alu_commit_if.ready; + assign mul_ready_out = alu_commit_if.ready & ~alu_valid_out; // ALU takes priority `else - assign stall_in = stall_out; + assign ready_in = alu_ready_in; - assign result_valid = alu_req_if.valid; - assign result_wid = alu_req_if.wid; - assign result_tmask = alu_req_if.tmask; - assign result_PC = alu_req_if.PC; - assign result_rd = alu_req_if.rd; - assign result_wb = alu_req_if.wb; - assign result_data = alu_jal_result; - assign result_is_br = is_br_op; + assign alu_valid_in = alu_req_if.valid; + + assign alu_commit_if.valid = alu_valid_out; + assign alu_commit_if.wid = alu_wid; + assign alu_commit_if.tmask = alu_tmask; + assign alu_commit_if.PC = alu_PC; + assign alu_commit_if.rd = alu_rd; + assign alu_commit_if.wb = alu_wb; + assign alu_commit_if.data = alu_data; + + assign alu_ready_out = alu_commit_if.ready; `endif - wire [`INST_BR_BITS-1:0] br_op_r; - wire is_less_r; - wire is_equal_r; - wire is_br_op_r; - - assign stall_out = ~alu_commit_if.ready && alu_commit_if.valid; - - VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), - .RESETW (1) - ) pipe_reg ( - .clk (clk), - .reset (reset), - .enable (!stall_out), - .data_in ({result_valid, result_wid, result_tmask, result_PC, result_rd, result_wb, result_data, result_is_br, br_op, is_less, is_equal, br_dest}), - .data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, br_op_r, is_less_r, is_equal_r, branch_ctl_if.dest}) - ); - assign alu_commit_if.eop = 1'b1; - `UNUSED_VAR (br_op_r) - wire br_neg = `INST_BR_NEG(br_op_r); - wire br_less = `INST_BR_LESS(br_op_r); - wire br_static = `INST_BR_STATIC(br_op_r); - - assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_r; - assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static; - assign branch_ctl_if.wid = alu_commit_if.wid; - // can accept new request? - assign alu_req_if.ready = ~stall_in; + assign alu_req_if.ready = ready_in; `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index a41c1875..aa07188a 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -31,7 +31,7 @@ module VX_csr_unit #( wire write_enable = csr_commit_if.valid && csr_we_s1; - wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.rs1) : csr_req_if.rs1_data; + wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.imm) : csr_req_if.rs1_data; VX_csr_data #( .CORE_ID(CORE_ID) diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 832a1537..3ba2d9c6 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -201,10 +201,10 @@ module VX_decode #( op_type = `INST_OP_BITS'(func3[1:0]); use_rd = 1; use_imm = func3[2]; - imm = 32'(u_12); // addr + imm[`CSR_ADDR_BITS-1:0] = u_12; // addr `USED_IREG (rd); if (func3[2]) begin - rs1_r = `NR_BITS'(rs1); // imm + imm[`CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm end else begin `USED_IREG (rs1); end diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index dad6a84c..a0530688 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -14,10 +14,14 @@ `define NB_BITS `LOG2UP(`NUM_BARRIERS) +`define NUM_IREGS 32 + +`define NRI_BITS `LOG2UP(`NUM_IREGS) + `ifdef EXT_F_ENABLE -`define NUM_REGS 64 +`define NUM_REGS (2 * `NUM_IREGS) `else -`define NUM_REGS 32 +`define NUM_REGS `NUM_IREGS `endif `define NR_BITS `LOG2UP(`NUM_REGS) @@ -114,7 +118,6 @@ `define INST_BR_DRET 4'b1110 `define INST_BR_OTHER 4'b1111 `define INST_BR_BITS 4 -`define INST_BR_OP(x) x[`INST_BR_BITS-1:0] `define INST_BR_NEG(x) x[1] `define INST_BR_LESS(x) x[2] `define INST_BR_STATIC(x) x[3] @@ -128,7 +131,6 @@ `define INST_MUL_REM 3'h6 `define INST_MUL_REMU 3'h7 `define INST_MUL_BITS 3 -`define INST_MUL_OP(x) x[`INST_MUL_BITS-1:0] `define INST_MUL_IS_DIV(x) x[2] `define INST_FMT_B 3'b000 @@ -148,7 +150,6 @@ `define INST_LSU_BITS 4 `define INST_LSU_FMT(x) x[2:0] `define INST_LSU_WSIZE(x) x[1:0] -`define INST_LSU_OP(x) x[`INST_LSU_BITS-1:0] `define INST_LSU_IS_FENCE(x) x[0] `define INST_FENCE_BITS 1 @@ -160,7 +161,6 @@ `define INST_CSR_RC 2'h3 `define INST_CSR_OTHER 2'h0 `define INST_CSR_BITS 2 -`define INST_CSR_OP(x) x[`INST_CSR_BITS-1:0] `define INST_FPU_ADD 4'h0 `define INST_FPU_SUB 4'h4 @@ -179,7 +179,6 @@ `define INST_FPU_NMSUB 4'hB `define INST_FPU_NMADD 4'hF `define INST_FPU_BITS 4 -`define INST_FPU_OP(x) x[`INST_FPU_BITS-1:0] `define INST_GPU_TMC 3'h0 `define INST_GPU_WSPAWN 3'h1 @@ -188,7 +187,6 @@ `define INST_GPU_BAR 3'h4 `define INST_GPU_OTHER 3'h7 `define INST_GPU_BITS 3 -`define INST_GPU_OP(x) x[`INST_GPU_BITS-1:0] /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 730f9df1..98eced32 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -134,7 +134,7 @@ module VX_execute #( wire ebreak /* verilator public */; assign ebreak = alu_req_if.valid && alu_req_if.ready && `INST_ALU_IS_BR(alu_req_if.op_mod) - && (`INST_BR_OP(alu_req_if.op_type) == `INST_BR_EBREAK - || `INST_BR_OP(alu_req_if.op_type) == `INST_BR_ECALL); + && (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK + || `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL); endmodule diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 662b6bcb..91342ae4 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -39,6 +39,7 @@ module VX_instr_demux ( // ALU unit wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU); + wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), @@ -48,8 +49,8 @@ module VX_instr_demux ( .reset (reset), .valid_in (alu_req_valid), .ready_in (alu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `INST_ALU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), .valid_out (alu_req_if.valid), .ready_out (alu_req_if.ready) ); @@ -57,6 +58,7 @@ module VX_instr_demux ( // lsu unit wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU); + wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type); wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod); VX_skid_buffer #( @@ -67,8 +69,8 @@ module VX_instr_demux ( .reset (reset), .valid_in (lsu_req_valid), .ready_in (lsu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `INST_LSU_OP(ibuffer_if.op_type), lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}), .valid_out (lsu_req_if.valid), .ready_out (lsu_req_if.ready) ); @@ -76,17 +78,21 @@ module VX_instr_demux ( // csr unit wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR); + wire [`INST_CSR_BITS-1:0] csr_op_type = `INST_CSR_BITS'(ibuffer_if.op_type); + wire [`CSR_ADDR_BITS-1:0] csr_addr = ibuffer_if.imm[`CSR_ADDR_BITS-1:0]; + wire [`NRI_BITS-1:0] csr_imm = ibuffer_if.imm[`CSR_ADDR_BITS +: `NRI_BITS]; + wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), .OUTPUT_REG (1) ) csr_buffer ( .clk (clk), .reset (reset), .valid_in (csr_req_valid), .ready_in (csr_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `INST_CSR_OP(ibuffer_if.op_type), ibuffer_if.imm[`CSR_ADDR_BITS-1:0], ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, ibuffer_if.rs1, gpr_rsp_if.rs1_data[0]}), - .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.rs1, csr_req_if.rs1_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), + .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), .valid_out (csr_req_if.valid), .ready_out (csr_req_if.ready) ); @@ -95,6 +101,7 @@ module VX_instr_demux ( `ifdef EXT_F_ENABLE wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU); + wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), @@ -104,8 +111,8 @@ module VX_instr_demux ( .reset (reset), .valid_in (fpu_req_valid), .ready_in (fpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `INST_FPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), .valid_out (fpu_req_if.valid), .ready_out (fpu_req_if.ready) ); @@ -116,6 +123,8 @@ module VX_instr_demux ( // gpu unit wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); + wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); + wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid]; VX_skid_buffer #( .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), @@ -125,8 +134,8 @@ module VX_instr_demux ( .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `INST_GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpu_rs2_data}), + .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) );