Update Vortex core for Blackwell tensor instructions

- Add Blackwell tensor core support in VX_tensor_blackwell_core.sv
- Update decode, execute, and dispatch logic for new instructions
- Extend VX_define.vh and VX_types.vh with Blackwell ISA definitions
This commit is contained in:
2026-05-06 14:50:54 +08:00
parent cb912d3b8b
commit 323ed7d7e9
17 changed files with 492 additions and 114 deletions

View File

@@ -5,7 +5,7 @@
module Vortex import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter TENSOR_FP16 = 0,
parameter BOOTROM_HANG100 = 32'h10100,
parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100,
parameter NUM_THREADS = 0,
parameter TC_DATA_WIDTH = 256,
parameter TC_TAG_WIDTH = 4
@@ -77,17 +77,26 @@ module Vortex import VX_gpu_pkg::*; #(
output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data,
// tc --------------------------------------------------
input [1:0] tc_a_ready,
output [1:0] tc_a_valid,
output [1:0] tc_a_bits_write,
output [63:0] tc_a_bits_address,
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [2 * 32 - 1:0] tc_a_bits_mask,
output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [1:0] tc_d_ready,
input [1:0] tc_d_valid,
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
input [2 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
input [2:0] tc_a_ready,
output [2:0] tc_a_valid,
output [2:0] tc_a_bits_write,
output [95:0] tc_a_bits_address,
output [3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [3 * 32 - 1:0] tc_a_bits_mask,
output [3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [2:0] tc_d_ready,
input [2:0] tc_d_valid,
input [3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
input [3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
// tmem_C direct SRAM port
// Write and read each have their own enable and their own 9-bit address.
// NOTE(review): the data/mask widths below are sized with the NUM_THREADS macro, while this
// module also declares a NUM_THREADS parameter - confirm which one is meant to size the port.
output tc_tmem_C_wen,
output tc_tmem_C_ren,
output [8:0] tc_tmem_C_waddr,
output [8:0] tc_tmem_C_raddr,
output [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata,
// One mask bit per 8 bits of tc_tmem_C_wdata (presumably a per-byte write enable - confirm).
output [`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask,
// Read data has the same width as the write data.
input [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata,
// gbar ------------------------------------------------
@@ -306,22 +315,23 @@ module Vortex import VX_gpu_pkg::*; #(
// tc ---------------------------------------------------------------------
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw};
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen};
assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data};
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if();
// tc_p1 (tmem_C) is now a direct SRAM port exposed as the top-level ports tc_tmem_C_*,
// so index 1 of the packed tc_a_* request buses is driven with constant zeros.
// Pack the request side of the tc bus interfaces into the flat top-level buses.
// Index 0 occupies the least-significant slot and index 2 the most-significant one;
// the middle slot (index 1) has no requester, so every field of it is constant zero.
assign tc_a_valid = {tc_p2_bus_if.req_valid, 1'b0, tc_p0_bus_if.req_valid};
assign tc_a_bits_write = {tc_p2_bus_if.req_data.rw, 1'b0, tc_p0_bus_if.req_data.rw};
assign tc_a_bits_address = {tc_p2_bus_if.req_data.addr, 32'b0, tc_p0_bus_if.req_data.addr};
assign tc_a_bits_tag = {tc_p2_bus_if.req_data.tag, {TC_TAG_WIDTH{1'b0}}, tc_p0_bus_if.req_data.tag};
// The mask filler is a fixed 32 bits so that it always matches the 32-bit-per-port slot of
// tc_a_bits_mask (declared as 3 * 32). Sizing it as TC_DATA_WIDTH/8 only lined up with that
// declaration when TC_DATA_WIDTH == 256 and would misplace the index-2 mask otherwise.
assign tc_a_bits_mask = {tc_p2_bus_if.req_data.byteen, 32'b0, tc_p0_bus_if.req_data.byteen};
assign tc_a_bits_data = {tc_p2_bus_if.req_data.data, {TC_DATA_WIDTH{1'b0}}, tc_p0_bus_if.req_data.data};
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_p1_bus_if.req_ready = tc_a_ready[1];
assign tc_p1_bus_if.rsp_valid = tc_d_valid[1];
assign tc_p1_bus_if.rsp_data.data = tc_d_bits_data[1 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p1_bus_if.rsp_data.tag = tc_d_bits_tag[1 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_d_ready = {tc_p1_bus_if.rsp_ready, tc_p0_bus_if.rsp_ready};
// Index 2 of the packed top-level tc buses feeds tc_p2_bus_if: request-ready comes from
// tc_a_ready[2], and the response valid/data/tag come from slice 2 of the tc_d_* inputs.
assign tc_p2_bus_if.req_ready = tc_a_ready[2];
assign tc_p2_bus_if.rsp_valid = tc_d_valid[2];
assign tc_p2_bus_if.rsp_data.data = tc_d_bits_data[2 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p2_bus_if.rsp_data.tag = tc_d_bits_tag[2 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
// Response-ready for indices 2 and 0 comes from the bus interfaces; index 1 is held low.
// NOTE(review): with bit 1 tied to 0, a response presented on tc_d_valid[1] would never be
// accepted - confirm the external side never drives one, or consider tying this bit high.
assign tc_d_ready = {tc_p2_bus_if.rsp_ready, 1'b0, tc_p0_bus_if.rsp_ready};
// gbar -------------------------------------------------------------------
`ifdef GBAR_ENABLE
@@ -395,15 +405,14 @@ module Vortex import VX_gpu_pkg::*; #(
dcr_write_valid = 1'b1;
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR0;
dcr_write_data = BOOTROM_HANG100;
dcr_write_data = STARTUP_ADDR[31:0];
end
`VX_DCR_BASE_STARTUP_ADDR1: begin
dcr_state_n = `VX_DCR_BASE_MPM_CLASS;
dcr_write_valid = 1'b1;
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR1;
// FIXME: not sure what this does
dcr_write_data = `VX_DCR_DATA_WIDTH'h0;
dcr_write_data = STARTUP_ADDR[63:32];
end
`VX_DCR_BASE_MPM_CLASS: begin
dcr_state_n = `VX_DCR_BASE_STATE_END;
@@ -455,7 +464,25 @@ module Vortex import VX_gpu_pkg::*; #(
`endif
.tensor_smem_A_if (tc_p0_bus_if),
.tensor_smem_B_if (tc_p1_bus_if),
// tmem_C direct SRAM port: forwarded one-to-one to the top-level tc_tmem_C_* ports.
// These connections were previously written twice, once under `ifdef EXT_T_BLACKWELL and
// once under `else, with identical contents; they are made unconditionally here so the
// two copies cannot drift apart.
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_C_ren(tc_tmem_C_ren),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
.tensor_tmem_C_mask(tc_tmem_C_mask),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
// smem_B is served by tc bus interface index 2.
.tensor_smem_B_if (tc_p2_bus_if),
.sim_ebreak (sim_ebreak),
.sim_wb_value (sim_wb_value),
@@ -577,7 +604,3 @@ module Vortex import VX_gpu_pkg::*; #(
`endif
endmodule : Vortex