Implement WU architecture support

This commit is contained in:
2026-05-25 19:25:05 +08:00
parent 323ed7d7e9
commit 0ad87bde81
35 changed files with 3303 additions and 472 deletions

View File

@@ -25,27 +25,40 @@ module VX_fetch import VX_gpu_pkg::*; #(
VX_mem_bus_if.master icache_bus_if,
// inputs
VX_schedule_if.slave schedule_if,
VX_schedule_if.slave scalar_schedule_if,
VX_schedule_if.slave tensor_schedule_if,
// outputs
VX_fetch_if.master fetch_if
VX_fetch_if.master scalar_fetch_if,
VX_fetch_if.master tensor_fetch_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
wire icache_req_valid;
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag;
wire icache_req_ready;
wire rsp_domain;
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
reg fetch_domain_rr;
wire icache_req_fire = icache_req_valid && icache_req_ready;
assign req_tag = schedule_if.data.wid;
wire scalar_req_valid = scalar_schedule_if.valid;
wire tensor_req_valid = tensor_schedule_if.valid;
wire select_tensor_req = tensor_req_valid && (!scalar_req_valid || fetch_domain_rr);
wire selected_domain = select_tensor_req ? WU_DOMAIN_TENSOR : WU_DOMAIN_SCALAR;
wire selected_valid = scalar_req_valid || tensor_req_valid;
wire [`NW_WIDTH-1:0] selected_wid = select_tensor_req ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid;
wire [`XLEN-1:0] selected_pc = select_tensor_req ? tensor_schedule_if.data.PC : scalar_schedule_if.data.PC;
wire [`NUM_THREADS-1:0] selected_tmask = select_tensor_req ? tensor_schedule_if.data.tmask : scalar_schedule_if.data.tmask;
wire [`UUID_WIDTH-1:0] selected_uuid = select_tensor_req ? tensor_schedule_if.data.uuid : scalar_schedule_if.data.uuid;
assign req_tag = selected_wid;
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
assign {rsp_domain, rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
wire [`XLEN-1:0] rsp_PC;
wire [`NUM_THREADS-1:0] rsp_tmask;
@@ -60,7 +73,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
.write (icache_req_fire),
`UNUSED_PIN (wren),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.wdata ({selected_pc, selected_tmask}),
.raddr (rsp_tag),
.rdata ({rsp_PC, rsp_tmask})
);
@@ -69,7 +82,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
// Ensure that the ibuffer doesn't fill up.
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(selected_wid);
wire [`ISSUE_WIDTH-1:0] domain_ibuf_pop = scalar_fetch_if.ibuf_pop | tensor_fetch_if.ibuf_pop;
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
@@ -79,7 +93,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.incr (icache_req_fire && schedule_isw == i),
.decr (fetch_if.ibuf_pop[i]),
.decr (domain_ibuf_pop[i]),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
@@ -90,15 +104,24 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire ibuf_ready = 1'b1;
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
`RUNTIME_ASSERT((!selected_valid || selected_pc != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, selected_pc, selected_wid, selected_tmask, selected_uuid))
// Icache Request
assign icache_req_valid = schedule_if.valid && ibuf_ready;
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
assign schedule_if.ready = icache_req_ready && ibuf_ready;
assign icache_req_valid = selected_valid && ibuf_ready;
assign icache_req_addr = selected_pc[`MEM_ADDR_WIDTH-1:2];
assign icache_req_tag = {selected_domain, selected_uuid, req_tag};
assign scalar_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && !select_tensor_req;
assign tensor_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && select_tensor_req;
// Round-robin priority bit used to arbitrate between the scalar and tensor
// schedule requests. It is cleared on reset and flips only when an icache
// request fires while both domains are requesting, so the domain that just
// lost arbitration is preferred on the next contended cycle.
always @(posedge clk) begin
    if (reset) begin
        fetch_domain_rr <= 1'b0;
    end else begin
        // Uncontended requests leave the priority bit untouched.
        if (scalar_req_valid && tensor_req_valid && icache_req_fire) begin
            fetch_domain_rr <= ~fetch_domain_rr;
        end
    end
end
VX_elastic_buffer #(
.DATAW (ICACHE_ADDR_WIDTH + ICACHE_TAG_WIDTH),
@@ -121,18 +144,26 @@ module VX_fetch import VX_gpu_pkg::*; #(
// Icache Response
assign fetch_if.valid = icache_bus_if.rsp_valid;
assign fetch_if.data.tmask = rsp_tmask;
assign fetch_if.data.wid = rsp_tag;
assign fetch_if.data.PC = rsp_PC;
assign fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = fetch_if.ready;
assign scalar_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_SCALAR);
assign scalar_fetch_if.data.tmask = rsp_tmask;
assign scalar_fetch_if.data.wid = rsp_tag;
assign scalar_fetch_if.data.PC = rsp_PC;
assign scalar_fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign scalar_fetch_if.data.uuid = rsp_uuid;
assign tensor_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_TENSOR);
assign tensor_fetch_if.data.tmask = rsp_tmask;
assign tensor_fetch_if.data.wid = rsp_tag;
assign tensor_fetch_if.data.PC = rsp_PC;
assign tensor_fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign tensor_fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = (rsp_domain == WU_DOMAIN_TENSOR) ? tensor_fetch_if.ready : scalar_fetch_if.ready;
`ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire schedule_fire = icache_req_fire;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
@@ -150,7 +181,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
icache_rsp_fire
}),
.probes({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
selected_uuid, selected_wid, selected_tmask, selected_pc,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
@@ -161,7 +192,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe0 ({reset, selected_uuid, selected_wid, selected_tmask, selected_pc, icache_req_ready, selected_valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
@@ -172,14 +203,18 @@ module VX_fetch import VX_gpu_pkg::*; #(
`endif
`ifdef DBG_TRACE_CORE_ICACHE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
wire schedule_fire = icache_req_fire;
wire scalar_fetch_fire = scalar_fetch_if.valid && scalar_fetch_if.ready;
wire tensor_fetch_fire = tensor_fetch_if.valid && tensor_fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid));
`TRACE(1, ("%d: I$%0d req: domain=%0d wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, selected_domain, selected_wid, selected_pc, selected_tmask, selected_uuid));
end
if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
if (scalar_fetch_fire) begin
`TRACE(1, ("%d: I$%0d scalar rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, scalar_fetch_if.data.wid, scalar_fetch_if.data.PC, scalar_fetch_if.data.tmask, scalar_fetch_if.data.instr, scalar_fetch_if.data.uuid));
end
if (tensor_fetch_fire) begin
`TRACE(1, ("%d: I$%0d tensor rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, tensor_fetch_if.data.wid, tensor_fetch_if.data.PC, tensor_fetch_if.data.tmask, tensor_fetch_if.data.instr, tensor_fetch_if.data.uuid));
end
end
`endif