tensor: Add ready signal to enforce 1 warp occupancy
Currently disabled as the timing behavior is already ~accurate
This commit is contained in:
@@ -347,7 +347,7 @@ void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, s
|
|||||||
|
|
||||||
// A is M * K, B is K * M, C is M * M, D is M * M
|
// A is M * K, B is K * M, C is M * M, D is M * M
|
||||||
#define M 4
|
#define M 4
|
||||||
#define K 2
|
#define K 2 // FIXME: 4x4x1 / cycle / octet!
|
||||||
|
|
||||||
// all row major
|
// all row major
|
||||||
float c_A_tile[M][K];
|
float c_A_tile[M][K];
|
||||||
@@ -551,7 +551,7 @@ void dpi_print_results(int wid, int octet, const svBitVecVal* A_tile, const svBi
|
|||||||
}
|
}
|
||||||
|
|
||||||
steps[wid] += 1;
|
steps[wid] += 1;
|
||||||
if (steps[wid] % 64 == 0) {
|
if (steps[wid] % 32 == 0) {
|
||||||
steps[wid] = 0;
|
steps[wid] = 0;
|
||||||
std::cout << "warp " << wid << " finished wmma\n";
|
std::cout << "warp " << wid << " finished wmma\n";
|
||||||
std::cout << "A tile" << "\n";
|
std::cout << "A tile" << "\n";
|
||||||
|
|||||||
@@ -391,7 +391,7 @@
|
|||||||
|
|
||||||
// Tensor Core Latency
|
// Tensor Core Latency
|
||||||
`ifndef LATENCY_HMMA
|
`ifndef LATENCY_HMMA
|
||||||
`define LATENCY_HMMA 8
|
`define LATENCY_HMMA 2
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Icache Configurable Knobs //////////////////////////////////////////////////
|
// Icache Configurable Knobs //////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -32,6 +32,10 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
|||||||
.execute_if (execute_if)
|
.execute_if (execute_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// FIXME: when multiple warps are running, step0_0 from multiple warps can
|
||||||
|
// get interleaved before the first warp advances to step0_1, fucking
|
||||||
|
// everything up
|
||||||
|
|
||||||
VX_commit_if #(
|
VX_commit_if #(
|
||||||
.NUM_LANES (NUM_LANES)
|
.NUM_LANES (NUM_LANES)
|
||||||
) commit_block_if[BLOCK_SIZE]();
|
) commit_block_if[BLOCK_SIZE]();
|
||||||
@@ -175,6 +179,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
|||||||
execute_if.data.PC,
|
execute_if.data.PC,
|
||||||
execute_if.data.wb,
|
execute_if.data.wb,
|
||||||
execute_if.data.rd
|
execute_if.data.rd
|
||||||
|
// pid/sop/eop set later
|
||||||
};
|
};
|
||||||
|
|
||||||
wire [DATAW-1:0] execute_if_data_deq;
|
wire [DATAW-1:0] execute_if_data_deq;
|
||||||
@@ -320,8 +325,16 @@ module VX_tensor_octet #(
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
wire hmma_ready;
|
||||||
wire stall = result_valid && ~result_ready;
|
wire stall = result_valid && ~result_ready;
|
||||||
|
// backpressure from commit
|
||||||
assign operands_ready = ~stall;
|
assign operands_ready = ~stall;
|
||||||
|
// TODO: Below line is to only allow 1 warp to occupy the octet at a time;
|
||||||
|
// currently, dpu is fully-pipelined and allows concurrency between
|
||||||
|
// multiple warps. This seems to be not a problem though given that the
|
||||||
|
// RF operand read takes >=2 cycles, which should be the end-to-end
|
||||||
|
// latency of the DPU anyways
|
||||||
|
// assign operands_ready = hmma_ready && ~stall;
|
||||||
|
|
||||||
// A is 4x2 fp32 matrix
|
// A is 4x2 fp32 matrix
|
||||||
wire [3:0][1:0][31:0] A_tile = {
|
wire [3:0][1:0][31:0] A_tile = {
|
||||||
@@ -359,6 +372,7 @@ module VX_tensor_octet #(
|
|||||||
.stall(stall),
|
.stall(stall),
|
||||||
|
|
||||||
.valid_in(do_hmma),
|
.valid_in(do_hmma),
|
||||||
|
.ready_in(hmma_ready),
|
||||||
.A_tile(A_tile),
|
.A_tile(A_tile),
|
||||||
.B_tile(B_tile),
|
.B_tile(B_tile),
|
||||||
.C_tile(C_tile),
|
.C_tile(C_tile),
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ module VX_tensor_dpu #(
|
|||||||
input stall,
|
input stall,
|
||||||
|
|
||||||
input valid_in,
|
input valid_in,
|
||||||
|
output ready_in,
|
||||||
input [3:0][1:0][31:0] A_tile,
|
input [3:0][1:0][31:0] A_tile,
|
||||||
input [1:0][3:0][31:0] B_tile,
|
input [1:0][3:0][31:0] B_tile,
|
||||||
input [3:0][3:0][31:0] C_tile,
|
input [3:0][3:0][31:0] C_tile,
|
||||||
@@ -24,12 +25,20 @@ module VX_tensor_dpu #(
|
|||||||
dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma);
|
dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma);
|
||||||
end
|
end
|
||||||
|
|
||||||
|
logic ready_reg;
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (~reset && valid_in) begin
|
if (reset) begin
|
||||||
|
ready_reg <= '1;
|
||||||
|
end else if (valid_in) begin
|
||||||
|
ready_reg <= '0;
|
||||||
dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma);
|
dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma);
|
||||||
|
end else if (valid_out) begin
|
||||||
|
ready_reg <= '1;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
// ready as soon as valid_out
|
||||||
|
assign ready_in = ready_reg || valid_out;
|
||||||
|
|
||||||
VX_shift_register #(
|
VX_shift_register #(
|
||||||
.DATAW (1 + $bits(D_tile)),
|
.DATAW (1 + $bits(D_tile)),
|
||||||
|
|||||||
Reference in New Issue
Block a user