diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 2f48b7e4..d2049031 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -90,9 +90,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     logic writeback_ready;
 
     wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
-    wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
+    wire hmma_wait = metadata_valid &&
+                     (execute_if_data_op_type[0] == `INST_TENSOR_HGMMA_WAIT);
     // skip HGMMA_WAIT for kickoff
-    wire initiate_valid = metadata_valid && not_wait;
+    wire initiate_valid = metadata_valid && !hmma_wait;
 
     // we're recycling execute_if.op_type as operands_if.op_type which might
     // have a different width; let's be safe
@@ -156,40 +157,40 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     // );
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
+    logic commit_select_tensor;
 
     always @(*) begin
         metadata_deq = 1'b0;
 
-        // if there's something in the meta queue, give it priority for commit,
-        // since every HGMMA instructions are asynchronous and should not
-        // block
+        // 1'b0: commit from metadata queue
+        // 1'b1: commit from tensor core writeback output
+        commit_select_tensor = 1'b0;
+
+        writeback_ready = commit_if.ready;
+
+        // If there's something in the meta queue, give it priority for commit
+        // so that HGMMA instructions stay asynchronous.  Note that an HGMMA is
+        // stalled while the tensor core is still busy with an older one.
         if (metadata_valid) begin
-            // block tensor core writeback
-            writeback_ready = 1'b0;
+            if (hmma_wait) begin
+                // block tensor core writeback
+                writeback_ready = 1'b0;
 
-            commit_if.valid       = metadata_valid;
-            commit_if.data.uuid   = execute_if_data_uuid[0];
-            commit_if.data.wid    = execute_if_data_wid[0];
-            commit_if.data.tmask  = execute_if_data_tmask[0];
-            commit_if.data.PC     = execute_if_data_PC[0];
-            commit_if.data.wb     = execute_if_data_wb[0];
-            commit_if.data.rd     = execute_if_data_rd[0];
-            commit_if.data.data   = wb_data; // FIXME ?
-            commit_if.data.tensor = 1'b0;
-            commit_if.data.pid    = 1'b0;
-            commit_if.data.sop    = 1'b1;
-            commit_if.data.eop    = 1'b1;
-
-            // block meta queue until tensor core is ready.  This will
-            // effectively stall further issue of async HGMMA when tensor core
-            // is busy with too many outstanding requests (depth of meta queue).
-            // be careful to not miss the commit backpressure.
-            metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
+                // commit HGMMA_WAIT whether or not the tensor core is busy
+                commit_select_tensor = 1'b0;
+                metadata_deq = metadata_valid && commit_if.ready;
+            end else begin
+                // hold the metadata commit and dequeue until the tensor core
+                // is ready (stalls newer HGMMAs behind an older one); in the
+                // meantime the commit port carries tensor core writeback.
+                commit_select_tensor = !initiate_ready;
+                metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
+            end
         end else begin
-            // allow tensor core writeback, provided there's no commit
-            // backpressure
-            writeback_ready = commit_if.ready;
+            commit_select_tensor = 1'b1;
+        end
 
+        if (commit_select_tensor) begin
             commit_if.valid       = writeback_valid;
             commit_if.data.uuid   = '0;
             commit_if.data.wid    = '0; // FIXME
@@ -204,6 +205,19 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             // only the last ghost commit has eop set, which will trigger
             // scoreboard to clear out the busy bit.
             commit_if.data.eop    = writeback_last;
+        end else begin
+            commit_if.valid       = metadata_valid;
+            commit_if.data.uuid   = execute_if_data_uuid[0];
+            commit_if.data.wid    = execute_if_data_wid[0];
+            commit_if.data.tmask  = execute_if_data_tmask[0];
+            commit_if.data.PC     = execute_if_data_PC[0];
+            commit_if.data.wb     = execute_if_data_wb[0];
+            commit_if.data.rd     = execute_if_data_rd[0];
+            commit_if.data.data   = wb_data; // FIXME ?
+            commit_if.data.tensor = 1'b0;
+            commit_if.data.pid    = 1'b0;
+            commit_if.data.sop    = 1'b1;
+            commit_if.data.eop    = 1'b1;
         end
     end