From 41a79a03a4482fa288efbc532cf5b22f2c549027 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Tue, 9 Apr 2024 19:55:06 -0700 Subject: [PATCH] parametrize memory interface in core wrapper and update config.vh --- hw/rtl/VX_config.vh | 16 +-- hw/rtl/VX_core_wrapper.sv | 228 ++++++++------------------------------ 2 files changed, 56 insertions(+), 188 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index ee2e6d12..9c3fd529 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -79,15 +79,15 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 1 +`define NUM_CORES 2 `endif `ifndef NUM_WARPS -`define NUM_WARPS 4 +`define NUM_WARPS 8 `endif `ifndef NUM_THREADS -`define NUM_THREADS 4 +`define NUM_THREADS 8 `endif `ifndef NUM_BARRIERS @@ -175,7 +175,7 @@ `endif `ifndef SMEM_LOG_SIZE -`define SMEM_LOG_SIZE 14 +`define SMEM_LOG_SIZE 15 `endif `ifndef IO_BASE_ADDR @@ -238,7 +238,7 @@ // Issue width `ifndef ISSUE_WIDTH -`define ISSUE_WIDTH `MIN(`NUM_WARPS, 4) +`define ISSUE_WIDTH `MIN(`NUM_WARPS, 8) `endif // Number of ALU units @@ -259,12 +259,12 @@ // Number of LSU units `ifndef NUM_LSU_LANES -`define NUM_LSU_LANES `MIN(`NUM_THREADS, 4) +`define NUM_LSU_LANES `NUM_THREADS `endif // Number of SFU units `ifndef NUM_SFU_LANES -`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4) +`define NUM_SFU_LANES `NUM_THREADS `endif // Size of Instruction Buffer @@ -459,7 +459,7 @@ // Number of Banks `ifndef DCACHE_NUM_BANKS -`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4) +`define DCACHE_NUM_BANKS `NUM_LSU_LANES `endif // Core Response Queue Size diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv index 8614f4b8..a155aabb 100644 --- a/hw/rtl/VX_core_wrapper.sv +++ b/hw/rtl/VX_core_wrapper.sv @@ -222,195 +222,63 @@ module Vortex import VX_gpu_pkg::*; #( // dmem ------------------------------------------------------------------- // Vortex core does not accept write acks; filter them out here - assign dcache_bus_if[0].rsp_valid = - (dmem_d_valid[0] && (dmem_d_bits_opcode[0 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - assign dcache_bus_if[1].rsp_valid = - (dmem_d_valid[1] && (dmem_d_bits_opcode[1 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - assign dcache_bus_if[2].rsp_valid = - (dmem_d_valid[2] && (dmem_d_bits_opcode[2 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - assign dcache_bus_if[3].rsp_valid = - (dmem_d_valid[3] && (dmem_d_bits_opcode[3 * 3 +: 3] !== 3'd0 /*AccessAck*/)); + generate + for (genvar i = 0; i < DCACHE_NUM_REQS; i++) begin + assign dcache_bus_if[i].rsp_valid = + (dmem_d_valid[i] && (dmem_d_bits_opcode[i * 3 +: 3] !== 3'd0 /*AccessAck*/)); + // Data and tag assignment for dcache + assign dcache_bus_if[i].rsp_data.data = dmem_d_bits_data[i * 32 +: 32]; + assign dcache_bus_if[i].rsp_data.tag = dmem_d_bits_source[i * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign dcache_bus_if[0].rsp_data.data = dmem_d_bits_data[0 * 32 +: 32]; - assign dcache_bus_if[1].rsp_data.data = dmem_d_bits_data[1 * 32 +: 32]; - assign dcache_bus_if[2].rsp_data.data = dmem_d_bits_data[2 * 32 +: 32]; - assign dcache_bus_if[3].rsp_data.data = dmem_d_bits_data[3 * 32 +: 32]; + // Handling write ACKs, setting ready bit for dcache + assign dmem_d_ready[i] = dcache_bus_if[i].rsp_ready || + (dmem_d_valid[i] && (dmem_d_bits_opcode[i * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign dcache_bus_if[0].rsp_data.tag = dmem_d_bits_source[0 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign dcache_bus_if[1].rsp_data.tag = dmem_d_bits_source[1 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign dcache_bus_if[2].rsp_data.tag = dmem_d_bits_source[2 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign dcache_bus_if[3].rsp_data.tag = dmem_d_bits_source[3 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; + // Request validity and address/data/source assignment for dcache + assign dmem_a_valid[i] = dcache_bus_if[i].req_valid; + assign dmem_a_bits_address[i * 32 +: 32] = {dcache_bus_if[i].req_data.addr, 2'b0}; + assign dmem_a_bits_data[i * 32 +: 32] = dcache_bus_if[i].req_data.data; + assign dmem_a_bits_source[i * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = dcache_bus_if[i].req_data.tag; - // When there's a write ACK coming back, ready bit should always be 1 to - // accept them because core does not accept them on their own - assign dmem_d_ready[0] = dcache_bus_if[0].rsp_ready || - (dmem_d_valid[0] && (dmem_d_bits_opcode[0 * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign dmem_d_ready[1] = dcache_bus_if[1].rsp_ready || - (dmem_d_valid[1] && (dmem_d_bits_opcode[1 * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign dmem_d_ready[2] = dcache_bus_if[2].rsp_ready || - (dmem_d_valid[2] && (dmem_d_bits_opcode[2 * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign dmem_d_ready[3] = dcache_bus_if[3].rsp_ready || - (dmem_d_valid[3] && (dmem_d_bits_opcode[3 * 3 +: 3] == 3'd0 /*AccessAck*/)); + // Opcode, size, and mask assignment for dcache + assign dmem_a_bits_opcode[i * 3 +: 3] = + dcache_bus_if[i].req_data.rw ? + (&dcache_bus_if[i].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) + : 3'd4 /*Get*/; + assign dmem_a_bits_size[i * 4 +: 4] = 4'd2; // Fixed size + assign dmem_a_bits_mask[i * 4 +: 4] = dcache_bus_if[i].req_data.byteen; - assign dmem_a_valid[0] = dcache_bus_if[0].req_valid; - assign dmem_a_valid[1] = dcache_bus_if[1].req_valid; - assign dmem_a_valid[2] = dcache_bus_if[2].req_valid; - assign dmem_a_valid[3] = dcache_bus_if[3].req_valid; + // Setting request ready signal for dcache + assign dcache_bus_if[i].req_ready = dmem_a_ready[i]; - assign dmem_a_bits_address[0 * 32 +: 32] = {dcache_bus_if[0].req_data.addr, 2'b0}; - assign dmem_a_bits_address[1 * 32 +: 32] = {dcache_bus_if[1].req_data.addr, 2'b0}; - assign dmem_a_bits_address[2 * 32 +: 32] = {dcache_bus_if[2].req_data.addr, 2'b0}; - assign dmem_a_bits_address[3 * 32 +: 32] = {dcache_bus_if[3].req_data.addr, 2'b0}; + // Data and tag assignment for smem + assign smem_bus_if[i].rsp_valid = + (smem_d_valid[i] && (smem_d_bits_opcode[i * 3 +: 3] !== 3'd0 /*AccessAck*/)); + assign smem_bus_if[i].rsp_data.data = smem_d_bits_data[i * 32 +: 32]; + assign smem_bus_if[i].rsp_data.tag = smem_d_bits_source[i * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign dmem_a_bits_data[0 * 32 +: 32] = dcache_bus_if[0].req_data.data; - assign dmem_a_bits_data[1 * 32 +: 32] = dcache_bus_if[1].req_data.data; - assign dmem_a_bits_data[2 * 32 +: 32] = dcache_bus_if[2].req_data.data; - assign dmem_a_bits_data[3 * 32 +: 32] = dcache_bus_if[3].req_data.data; + // Handling write ACKs, setting ready bit for smem + assign smem_d_ready[i] = smem_bus_if[i].rsp_ready || + (smem_d_valid[i] && (smem_d_bits_opcode[i * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign dmem_a_bits_source[0 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = dcache_bus_if[0].req_data.tag; - assign dmem_a_bits_source[1 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = dcache_bus_if[1].req_data.tag; - assign dmem_a_bits_source[2 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = dcache_bus_if[2].req_data.tag; - assign dmem_a_bits_source[3 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = dcache_bus_if[3].req_data.tag; + // Request validity and address/data/source assignment for smem + assign smem_a_valid[i] = smem_bus_if[i].req_valid; + assign smem_a_bits_address[i * 32 +: 32] = {smem_bus_if[i].req_data.addr, 2'b0}; + assign smem_a_bits_data[i * 32 +: 32] = smem_bus_if[i].req_data.data; + assign smem_a_bits_source[i * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = smem_bus_if[i].req_data.tag; - // we assume all lanes always have the same tag; otherwise the sourceId - // logic in the Chisel tile breaks - // NOTE: not working at the moment but this doesn't seem to be a problem - // always @(*) begin - // for (i = 0; i < 4; i++) begin - // assert(dcache_req_if.tag[0] == dcache_req_if.tag[i]) - // end - // end + // Opcode, size, and mask assignment for smem + assign smem_a_bits_opcode[i * 3 +: 3] = + smem_bus_if[i].req_data.rw ? + (&smem_bus_if[i].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) + : 3'd4 /*Get*/; + assign smem_a_bits_size[i * 4 +: 4] = 4'd2; // Fixed size + assign smem_a_bits_mask[i * 4 +: 4] = smem_bus_if[i].req_data.byteen; - // Translate Vortex rw/byteen to TileLink opcode - assign dmem_a_bits_opcode[0 * 3 +: 3] = - dcache_bus_if[0].req_data.rw ? - (&dcache_bus_if[0].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - assign dmem_a_bits_opcode[1 * 3 +: 3] = - dcache_bus_if[1].req_data.rw ? - (&dcache_bus_if[1].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - assign dmem_a_bits_opcode[2 * 3 +: 3] = - dcache_bus_if[2].req_data.rw ? - (&dcache_bus_if[2].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - assign dmem_a_bits_opcode[3 * 3 +: 3] = - dcache_bus_if[3].req_data.rw ? - (&dcache_bus_if[3].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - - // Vortex cache requests are single-fixed-size - // NOTE: MAKE SURE TO CHANGE CONSTANT WIDTH FOR SIZE! - assign dmem_a_bits_size[0 * 4 +: 4] = 4'd2; - assign dmem_a_bits_size[1 * 4 +: 4] = 4'd2; - assign dmem_a_bits_size[2 * 4 +: 4] = 4'd2; - assign dmem_a_bits_size[3 * 4 +: 4] = 4'd2; - /* $countones(dcache_req_if.byteen[0]) === 'd4 ? 2'd2 : - ($countones(dcache_req_if.byteen[0]) === 'd2 ? 2'd1 : 2'd0); */ - - // byteen can be directly used as TL mask - assign dmem_a_bits_mask[0 * 4 +: 4] = dcache_bus_if[0].req_data.byteen; - assign dmem_a_bits_mask[1 * 4 +: 4] = dcache_bus_if[1].req_data.byteen; - assign dmem_a_bits_mask[2 * 4 +: 4] = dcache_bus_if[2].req_data.byteen; - assign dmem_a_bits_mask[3 * 4 +: 4] = dcache_bus_if[3].req_data.byteen; - - assign dcache_bus_if[0].req_ready = dmem_a_ready[0]; - assign dcache_bus_if[1].req_ready = dmem_a_ready[1]; - assign dcache_bus_if[2].req_ready = dmem_a_ready[2]; - assign dcache_bus_if[3].req_ready = dmem_a_ready[3]; - - // smem ------------------------------------------------------------------- - - // FIXME: giant @copypaste from dmem - - // Vortex core does not accept write acks; filter them out here - assign smem_bus_if[0].rsp_valid = - (smem_d_valid[0] && (smem_d_bits_opcode[0 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - assign smem_bus_if[1].rsp_valid = - (smem_d_valid[1] && (smem_d_bits_opcode[1 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - assign smem_bus_if[2].rsp_valid = - (smem_d_valid[2] && (smem_d_bits_opcode[2 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - assign smem_bus_if[3].rsp_valid = - (smem_d_valid[3] && (smem_d_bits_opcode[3 * 3 +: 3] !== 3'd0 /*AccessAck*/)); - - assign smem_bus_if[0].rsp_data.data = smem_d_bits_data[0 * 32 +: 32]; - assign smem_bus_if[1].rsp_data.data = smem_d_bits_data[1 * 32 +: 32]; - assign smem_bus_if[2].rsp_data.data = smem_d_bits_data[2 * 32 +: 32]; - assign smem_bus_if[3].rsp_data.data = smem_d_bits_data[3 * 32 +: 32]; - - assign smem_bus_if[0].rsp_data.tag = smem_d_bits_source[0 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign smem_bus_if[1].rsp_data.tag = smem_d_bits_source[1 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign smem_bus_if[2].rsp_data.tag = smem_d_bits_source[2 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - assign smem_bus_if[3].rsp_data.tag = smem_d_bits_source[3 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH]; - - // When there's a write ACK coming back, ready bit should always be 1 to - // accept them because core does not accept them on their own - assign smem_d_ready[0] = smem_bus_if[0].rsp_ready || - (smem_d_valid[0] && (smem_d_bits_opcode[0 * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign smem_d_ready[1] = smem_bus_if[1].rsp_ready || - (smem_d_valid[1] && (smem_d_bits_opcode[1 * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign smem_d_ready[2] = smem_bus_if[2].rsp_ready || - (smem_d_valid[2] && (smem_d_bits_opcode[2 * 3 +: 3] == 3'd0 /*AccessAck*/)); - assign smem_d_ready[3] = smem_bus_if[3].rsp_ready || - (smem_d_valid[3] && (smem_d_bits_opcode[3 * 3 +: 3] == 3'd0 /*AccessAck*/)); - - assign smem_a_valid[0] = smem_bus_if[0].req_valid; - assign smem_a_valid[1] = smem_bus_if[1].req_valid; - assign smem_a_valid[2] = smem_bus_if[2].req_valid; - assign smem_a_valid[3] = smem_bus_if[3].req_valid; - - assign smem_a_bits_address[0 * 32 +: 32] = {smem_bus_if[0].req_data.addr, 2'b0}; - assign smem_a_bits_address[1 * 32 +: 32] = {smem_bus_if[1].req_data.addr, 2'b0}; - assign smem_a_bits_address[2 * 32 +: 32] = {smem_bus_if[2].req_data.addr, 2'b0}; - assign smem_a_bits_address[3 * 32 +: 32] = {smem_bus_if[3].req_data.addr, 2'b0}; - - assign smem_a_bits_data[0 * 32 +: 32] = smem_bus_if[0].req_data.data; - assign smem_a_bits_data[1 * 32 +: 32] = smem_bus_if[1].req_data.data; - assign smem_a_bits_data[2 * 32 +: 32] = smem_bus_if[2].req_data.data; - assign smem_a_bits_data[3 * 32 +: 32] = smem_bus_if[3].req_data.data; - - assign smem_a_bits_source[0 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = smem_bus_if[0].req_data.tag; - assign smem_a_bits_source[1 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = smem_bus_if[1].req_data.tag; - assign smem_a_bits_source[2 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = smem_bus_if[2].req_data.tag; - assign smem_a_bits_source[3 * DCACHE_NOSM_TAG_WIDTH +: DCACHE_NOSM_TAG_WIDTH] = smem_bus_if[3].req_data.tag; - - // Translate Vortex rw/byteen to TileLink opcode - assign smem_a_bits_opcode[0 * 3 +: 3] = - smem_bus_if[0].req_data.rw ? - (&smem_bus_if[0].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - assign smem_a_bits_opcode[1 * 3 +: 3] = - smem_bus_if[1].req_data.rw ? - (&smem_bus_if[1].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - assign smem_a_bits_opcode[2 * 3 +: 3] = - smem_bus_if[2].req_data.rw ? - (&smem_bus_if[2].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - assign smem_a_bits_opcode[3 * 3 +: 3] = - smem_bus_if[3].req_data.rw ? - (&smem_bus_if[3].req_data.byteen ? 3'd0 /*PutFull*/ : 3'd1 /*PutPartial*/) - : 3'd4 /*Get*/; - - // Vortex cache requests are single-fixed-size - // NOTE: MAKE SURE TO CHANGE CONSTANT WIDTH FOR SIZE! - assign smem_a_bits_size[0 * 4 +: 4] = 4'd2; - assign smem_a_bits_size[1 * 4 +: 4] = 4'd2; - assign smem_a_bits_size[2 * 4 +: 4] = 4'd2; - assign smem_a_bits_size[3 * 4 +: 4] = 4'd2; - /* $countones(dcache_req_if.byteen[0]) === 'd4 ? 2'd2 : - ($countones(dcache_req_if.byteen[0]) === 'd2 ? 2'd1 : 2'd0); */ - - // byteen can be directly used as TL mask - assign smem_a_bits_mask[0 * 4 +: 4] = smem_bus_if[0].req_data.byteen; - assign smem_a_bits_mask[1 * 4 +: 4] = smem_bus_if[1].req_data.byteen; - assign smem_a_bits_mask[2 * 4 +: 4] = smem_bus_if[2].req_data.byteen; - assign smem_a_bits_mask[3 * 4 +: 4] = smem_bus_if[3].req_data.byteen; - - assign smem_bus_if[0].req_ready = smem_a_ready[0]; - assign smem_bus_if[1].req_ready = smem_a_ready[1]; - assign smem_bus_if[2].req_ready = smem_a_ready[2]; - assign smem_bus_if[3].req_ready = smem_a_ready[3]; + // Setting request ready signal for smem + assign smem_bus_if[i].req_ready = smem_a_ready[i]; + end + endgenerate // gbar ------------------------------------------------------------------- `ifdef GBAR_ENABLE