fix: remove redundant output stores to gmem in batch decode kernel (#505)

tsu-bin · web-flow · commit 90e42a7307da · 2024-09-25T01:25:02.000-07:00
Hi, this is a minor fix: when bdz is greater than 1, some warps perform
redundant stores of the output to gmem. We could also check 'if (tx == 0)'
when storing the lse value, but since bdx is 32 most of the time, I think
leaving that store unguarded is fine.

Co-authored-by: tsu-bin <tsubin@gmail.com>
diff --git a/include/flashinfer/attention/decode.cuh b/include/flashinfer/attention/decode.cuh
@@ -575,10 +575,12 @@ __global__ void BatchDecodeWithPagedKVCacheKernel(
   sync_state<vec_size, bdx, bdy, bdz>(st, reinterpret_cast<float*>(smem), smem_md);
   st.normalize();
 
-  st.o.cast_store(o + (batch_idx * num_qo_heads + qo_head_idx) * head_dim + tx * vec_size);
-  // write lse
-  if (lse != nullptr) {
-    lse[batch_idx * num_qo_heads + qo_head_idx] = st.get_lse();
+  if (tz == 0) {
+    st.o.cast_store(o + (batch_idx * num_qo_heads + qo_head_idx) * head_dim + tx * vec_size);
+    // write lse
+    if (lse != nullptr) {
+      lse[batch_idx * num_qo_heads + qo_head_idx] = st.get_lse();
+    }
   }
 }