From 83f9f6d84fc3f662d257bdc899682176d4be0cff Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 18:22:36 -0700 Subject: [PATCH] tensor: Fix sync for dpu warp queue as well --- hw/rtl/core/VX_tensor_core.sv | 2 +- hw/rtl/fpu/VX_tensor_dpu.sv | 27 +++++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 44485ccb..1f363f45 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -471,7 +471,7 @@ module VX_tensor_octet #( VX_tensor_dpu #( .ISW(ISW), .OCTET(OCTET), - .ISSUE_QUEUE_DEPTH(2) + .ISSUE_QUEUE_DEPTH(4) ) dpu ( .clk(clk), .reset(reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 694af4ae..08e37cfa 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -39,13 +39,6 @@ module VX_tensor_dpu #( end end - // ready as soon as valid_out - // assign ready_in = ready_reg; - - // fully pipelined; ready_in is coupled to ready_out by immediately - // stalling - // assign ready_in = ready_out; - // // fixed-latency queue // VX_shift_register #( // .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), @@ -59,6 +52,16 @@ module VX_tensor_dpu #( // .data_out ({valid_out, D_wid/*, D_tile */}) // ); + // ready as soon as valid_out + // assign ready_in = ready_reg || valid_out; + + // fully pipelined; ready_in is coupled to ready_out by immediately + // stalling + // assign ready_in = ready_out; + + logic synced_fire; + assign synced_fire = valid_in && ready_in; + logic [1:0] threadgroup_valids; logic [1:0] threadgroup_readys; // B_tile is shared across the two threadgroups; see Figure 13 @@ -67,7 +70,7 @@ module VX_tensor_dpu #( ) threadgroup_0 ( .clk (clk), .reset (reset), - .valid_in (valid_in), + .valid_in (synced_fire), .ready_in (threadgroup_readys[0]), .stall (!ready_out), .A_frag (A_tile[1:0]), @@ -81,7 +84,7 @@ module VX_tensor_dpu #( ) threadgroup_1 ( .clk (clk), .reset (reset), - .valid_in (valid_in), + .valid_in (synced_fire), .ready_in (threadgroup_readys[1]), .stall (!ready_out), .A_frag (A_tile[3:2]), @@ -102,7 +105,7 @@ module VX_tensor_dpu #( // need to pass along warp id's to do multithreading VX_fifo_queue #( .DATAW ($bits(wid)), - .DEPTH (ISSUE_QUEUE_DEPTH) + .DEPTH (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH) ) wid_queue ( .clk (clk), .reset (reset), @@ -117,8 +120,8 @@ module VX_tensor_dpu #( `UNUSED_PIN(size) ); - // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), - // ("FEDP and metadata queue went out of sync!")) + `RUNTIME_ASSERT(reset || !(deq && empty), + ("dequeueing from empty warp id queue!")) endmodule // does (m,n,k) = (2,4,2) matmul compute over 2 cycles.