From 08adf1f71af65506f2d0a164bf9ee59edbba246c Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Thu, 17 Nov 2022 09:59:10 -0500
Subject: [PATCH 01/20] Publish on tag push

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 72df6053cc6..ff6fa1cb90b 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -5,7 +5,7 @@
 name: Python Package
 
 on:
-  create:
+  push:
     tags:
       - '**'
 
@@ -124,4 +124,4 @@ jobs:
           upload_url: ${{ steps.get_current_release.outputs.upload_url }}
           asset_path: ./${{env.wheel_name}}
           asset_name: ${{env.wheel_name}}
-          asset_content_type: application/*
\ No newline at end of file
+          asset_content_type: application/*

From 4ad8ba724eb0ea4e33d296386e1b10c92f603e8d Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 18 Nov 2022 02:30:04 +0000
Subject: [PATCH 02/20] causal prefix mask with adjusted tests

---
 csrc/flash_attn/src/fmha/mask.h               |   4 +-
 .../src/fmha_dgrad_kernel_1xN_loop.h          |   5 +-
 csrc/flash_attn/src/fmha_fprop_kernel_1xN.h   |   5 +-
 setup.py                                      |   1 +
 tests/test_causal_prefix_mask.py              | 127 ++++++++++++++++++
 tests/test_flash_attn.py                      |  52 +++++--
 6 files changed, 178 insertions(+), 16 deletions(-)
 create mode 100644 tests/test_causal_prefix_mask.py

diff --git a/csrc/flash_attn/src/fmha/mask.h b/csrc/flash_attn/src/fmha/mask.h
index 6c8092983bb..08c851318e2 100644
--- a/csrc/flash_attn/src/fmha/mask.h
+++ b/csrc/flash_attn/src/fmha/mask.h
@@ -37,6 +37,7 @@ struct Mask {
     template<typename BInfo>
     __device__ Mask(const BInfo &binfo, int tidx, const int loop_step_idx_ = 0)
         : actual_seqlen_k(binfo.actual_seqlen_k - loop_step_idx_ * Cta_tile::N)
+        , actual_seqlen_q(binfo.actual_seqlen_q)
         , loop_step_idx(loop_step_idx_) {
 
         const int warp = tidx / Cta_tile::THREADS_PER_WARP;
@@ -67,7 +68,7 @@ struct Mask {
         // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 1)) {
         //     printf("current_col=%d, current_row=%d, actual_seqlen_k=%d, col_valid=%d, all_valid=%d\n", current_col, current_row, actual_seqlen_k, col_valid, all_valid);
         // }
-        return Is_causal ? col_valid && (current_col + loop_step_idx * Cta_tile::N <= current_row) : col_valid;
+        return Is_causal ? col_valid && (current_col <= current_row + actual_seqlen_k - actual_seqlen_q) : col_valid;
         // return row_valid && col_valid;
     }
 
@@ -85,6 +86,7 @@ struct Mask {
     int col;
     const int loop_step_idx;
     const int actual_seqlen_k;
+    const int actual_seqlen_q;
 };
 
 }  // namespace fmha
diff --git a/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h b/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h
index 05d3baebb80..c37c7ffca97 100644
--- a/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h
+++ b/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h
@@ -194,7 +194,7 @@ inline __device__ void compute_dq_dk_dv_1xN_one_iter(const Params &params, Prng
     Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
 
     static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
-    int begin = Is_causal ? loop_step_idx * Cta_tile_p::N / Cta_tile_p::M : 0;
+    int begin = 0;
     // We want begin to be a multiple of gridDim.z
     // This is because the row indices processed by each threadblock must align between the
     // loop steps, otherwise we have a dependency between the blocks.
@@ -590,8 +590,7 @@ inline __device__ void compute_dq_dk_dv_1xN_one_iter(const Params &params, Prng
 
         const bool is_final_write =
             Is_last
-            || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
-            || ((Is_causal) && ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
+            || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k);
         if (is_final_write) {
             // if (Is_dropout) {
             //     dq_out[0] = fmha::fmul4(dq_out[0], params.rp_dropout);
diff --git a/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h b/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h
index fd4621be343..15149b37000 100644
--- a/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h
+++ b/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h
@@ -272,7 +272,7 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
 
     // Wind gmem tiles to the correct position.
     static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
-    int begin = Is_causal ? loop_step_idx * Cta_tile_p::N / Cta_tile_p::M : 0;
+    int begin = 0;
     // We want begin to be a multiple of gridDim.z
     // This is because the row indices processed by each threadblock must align between the
     // loop steps, otherwise we have a dependency between the blocks.
@@ -617,8 +617,7 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
 
         const bool is_final_write =
             Is_last
-            || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
-            || ((Is_causal) && ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
+            || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k);
         #pragma unroll
         for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
             float sum = p_sum_o[jj][0];
diff --git a/setup.py b/setup.py
index ec1415ac241..aea44038f38 100644
--- a/setup.py
+++ b/setup.py
@@ -129,6 +129,7 @@ def append_nvcc_threads(nvcc_extra_args):
             "nvcc": append_nvcc_threads(
                 [
                     "-O3",
+                    "-t4",
                     "-std=c++17",
                     "-U__CUDA_NO_HALF_OPERATORS__",
                     "-U__CUDA_NO_HALF_CONVERSIONS__",
diff --git a/tests/test_causal_prefix_mask.py b/tests/test_causal_prefix_mask.py
new file mode 100644
index 00000000000..6da401b3d17
--- /dev/null
+++ b/tests/test_causal_prefix_mask.py
@@ -0,0 +1,127 @@
+"""
+Test adapted from https://github.com/openai/triton/blob/0d7e7532279e45672555e344646f5c19c3972331/python/tutorials/06-fused-attention.py
+"""
+from contextlib import nullcontext
+import math
+import time
+
+from scipy import stats
+
+import torch
+
+from flash_attn.flash_attn_interface import flash_attn_unpadded_func
+
+
+def create_causal_mask(q: int, k: int, dtype: torch.dtype, device: torch.device):
+    return (
+        (torch.ones((q, k), device=device) - torch.inf).triu(k - q + 1).type(dtype)
+    )
+
+
+def attention_ref(q, k, v, sm_scale, causal, device):
+    p = torch.matmul(q, k.transpose(2, 3)) * sm_scale
+    # for z in range(Z):
+    #     for h in range(H):
+    #         p[:, :, M == 0] = float("-inf")
+    if causal:
+        M = create_causal_mask(q.size(2), k.size(2), dtype=dtype, device=device)
+        p += M
+    p = torch.softmax(p.float(), dim=-1).type(dtype)
+    ref_out = torch.matmul(p, v)
+    return ref_out
+
+
+torch.manual_seed(0)
+repeats = 1
+batch_size = 1
+nheads = 1
+seqlen = 16
+n = 16
+d = n // nheads
+dropout_p = 0.0
+causal = True
+dtype = torch.bfloat16
+device = 'cuda'
+test_backward = True
+
+
+with torch.inference_mode() if not test_backward else nullcontext():
+    B = 8
+    H = 12
+    Q_N_CTX = 350 # 128 * 2 * 2
+    KV_N_CTX = 350 * 100 # 256 * 2 * 2 * 2
+    D_HEAD = 64
+
+    torch.manual_seed(20)
+    q = torch.empty((B, H, Q_N_CTX, D_HEAD), dtype=dtype, device=device).normal_(mean=0, std=.5)
+    k = torch.empty((B, H, KV_N_CTX, D_HEAD), dtype=dtype, device=device).normal_(mean=0, std=.5)
+    v = torch.empty((B, H, KV_N_CTX, D_HEAD), dtype=dtype, device=device).normal_(mean=0, std=.5)
+    if test_backward:
+        q = q.requires_grad_()
+        k = k.requires_grad_()
+        v = v.requires_grad_()
+    cu_seqlens_q = torch.arange(
+        0, (B + 1) * Q_N_CTX, step=Q_N_CTX, dtype=torch.int32, device=device
+    )
+    cu_seqlens_k = torch.arange(
+        0, (B + 1) * KV_N_CTX, step=KV_N_CTX, dtype=torch.int32, device=device
+    )
+
+    s = time.time()
+    flash_out = flash_attn_unpadded_func(
+        q.transpose(1, 2).reshape(B * Q_N_CTX, H, D_HEAD),
+        k.transpose(1, 2).reshape(B * KV_N_CTX, H, D_HEAD),
+        v.transpose(1, 2).reshape(B * KV_N_CTX, H, D_HEAD),
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        max_seqlen_q=Q_N_CTX,
+        max_seqlen_k=KV_N_CTX,
+        dropout_p=dropout_p,
+        causal=causal,
+    )
+    torch.cuda.synchronize()
+    flash_took = time.time() - s
+    s = time.time()
+    ref_out = attention_ref(
+        q, k, v, sm_scale=1/math.sqrt(D_HEAD), causal=causal, device=device
+    ).transpose(1,2).reshape(B*Q_N_CTX, H, D_HEAD)
+    torch.cuda.synchronize()
+    ref_took = time.time() - s
+
+    print("allclose", torch.allclose(flash_out, ref_out))
+    print("max delta", (flash_out - ref_out).abs().max().item())
+    print("relative max delta", ((flash_out - ref_out).abs().max() / ref_out.abs().mean()).item())
+    print(stats.spearmanr(flash_out[0,0].float().detach().cpu().numpy(), ref_out[0,0].float().detach().cpu().numpy()))
+    print(f"ref took: {ref_took:.5f}")
+    print(f"flash attn took: {flash_took:.5f}")
+
+    if test_backward:
+        dout = torch.randn_like(q).transpose(1, 2).reshape(B * Q_N_CTX, H, D_HEAD)
+        s = time.time()
+        ref_out.backward(dout)
+        torch.cuda.synchronize()
+        ref_took = time.time() - s
+        ref_dv, v.grad = v.grad.clone(), None
+        ref_dk, k.grad = k.grad.clone(), None
+        ref_dq, q.grad = q.grad.clone(), None
+
+        s = time.time()
+        flash_out.backward(dout)
+        torch.cuda.synchronize()
+        flash_took = time.time() - s
+        flash_dv, v.grad = v.grad.clone(), None
+        flash_dk, k.grad = k.grad.clone(), None
+        flash_dq, q.grad = q.grad.clone(), None
+
+        for name, ref, flash in zip(
+            ["dv", "dk", "dq"],
+            [ref_dv, ref_dk, ref_dq],
+            [flash_dv, flash_dk, flash_dq],
+        ):
+            print(f"=== evaling {name} ===")
+            print("allclose", torch.allclose(flash, ref))
+            print("max delta", (flash - ref).abs().max().item())
+            print("relative max delta", ((flash - ref).abs().max() / ref.abs().mean()).item())
+            print(stats.spearmanr(flash[0,0].flatten().float().detach().cpu().numpy(), ref[0,0].flatten().float().detach().cpu().numpy()))
+        print(f"ref took: {ref_took:.5f}")
+        print(f"flash attn took: {flash_took:.5f}")
diff --git a/tests/test_flash_attn.py b/tests/test_flash_attn.py
index 1ce3837e947..64a4c5826bd 100644
--- a/tests/test_flash_attn.py
+++ b/tests/test_flash_attn.py
@@ -153,8 +153,13 @@ def attention_ref(q, k, v, query_padding_mask=None, key_padding_mask=None, dropo
     if key_padding_mask is not None:
         scores.masked_fill_(rearrange(~key_padding_mask, 'b s -> b 1 1 s'), float('-inf'))
     if causal:
-        causal_mask = torch.triu(torch.ones(seqlen_q, seqlen_k, dtype=torch.bool, device=q.device), 1)
-        scores.masked_fill_(causal_mask, float('-inf'))
+        for idx, (len_q, len_k) in enumerate(zip(query_padding_mask.sum(dim=1), key_padding_mask.sum(dim=1))):
+            causal_mask = torch.triu(
+                torch.ones(len_q, len_k, dtype=torch.bool, device=q.device),
+                len_k - len_q + 1,
+            )
+            scores[idx, :, :len_q, :len_k].masked_fill_(causal_mask, float('-inf'))
+            scores[idx, :, :, len_k:] = float('-inf')
     attention = torch.softmax(scores, dim=-1)
     dropout_scaling = 1.0 / (1 - dropout_p)
     # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
@@ -430,6 +435,7 @@ def test_flash_attn_unpadded_qkvpacked(seqlen, d, dropout_p, causal, dtype):
         # assert torch.allclose(dqkv, dqkv_ref, rtol=rtol, atol=atol)
 
 
+@pytest.mark.parametrize('share_q_k_mask', [True, False])
 @pytest.mark.parametrize('dtype', ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16]))
 # @pytest.mark.parametrize('dtype', [torch.float16])
 @pytest.mark.parametrize('causal', [False, True])
@@ -439,9 +445,13 @@ def test_flash_attn_unpadded_qkvpacked(seqlen, d, dropout_p, causal, dtype):
 # @pytest.mark.parametrize('seqlen', [128])
 @pytest.mark.parametrize('dropout_p', [0.0, 0.17])
 # @pytest.mark.parametrize('dropout_p', [0.0])
-def test_flash_attn_unpadded_kvpacked(seqlen, d, dropout_p, causal, dtype):
+def test_flash_attn_unpadded_kvpacked(seqlen, d, dropout_p, causal, dtype, share_q_k_mask):
     if seqlen >= 2048 and torch.cuda.get_device_properties('cuda').total_memory <= 16 * 2**30:
         pytest.skip()  # Reference implementation OOM
+    if causal and not share_q_k_mask and dropout_p > 0.0:
+        pytest.xfail(
+            "probably fails due to convert_flash_attn_S_to_softmax not handling causal prefix attn"
+        )
     device = 'cuda'
     # if dtype == torch.float16:
     #     rtol, atol = (1e-3, 3e-4) if not causal else (1e-3, 1e-3)
@@ -455,7 +465,13 @@ def test_flash_attn_unpadded_kvpacked(seqlen, d, dropout_p, causal, dtype):
     Wqkv = torch.nn.Linear(nheads * d, 3 * nheads * d, device=device, dtype=dtype)
 
     query_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
-    key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
+    if not share_q_k_mask:
+        key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
+    else:
+        key_padding_mask = query_padding_mask
+    if causal and not share_q_k_mask:
+        # ensure there are at least as many keys/values as queries for causal prefix cross attention
+        key_padding_mask |= query_padding_mask
 
     (q_unpad, kv_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, q, kv,
      output_pad_fn, dq_pad_fn, dkv_pad_fn) = generate_qkv(
@@ -508,7 +524,9 @@ def test_flash_attn_unpadded_kvpacked(seqlen, d, dropout_p, causal, dtype):
     # of a Pytorch implementation.
     assert (output - output_ref).abs().max().item() <= 2 * (output_pt - output_ref).abs().max().item()
     # assert torch.allclose(output, output_ref, rtol=rtol, atol=atol)
-    assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
+    if not (causal and not share_q_k_mask):
+        # probably fails with causal due to convert_flash_attn_S_to_softmax not handling causal prefix attn
+        assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
     # assert torch.allclose(attn, attn_ref, rtol=rtol, atol=atol)
     if dropout_p == 0.0:
         assert dropout_mask.all()
@@ -522,6 +540,7 @@ def test_flash_attn_unpadded_kvpacked(seqlen, d, dropout_p, causal, dtype):
         # assert torch.allclose(dkv, dkv_ref, rtol=rtol, atol=atol)
 
 
+@pytest.mark.parametrize('share_q_k_mask', [True, False])
 @pytest.mark.parametrize('dtype', ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16]))
 # @pytest.mark.parametrize('dtype', [torch.float16])
 @pytest.mark.parametrize('causal', [False, True])
@@ -531,9 +550,13 @@ def test_flash_attn_unpadded_kvpacked(seqlen, d, dropout_p, causal, dtype):
 # @pytest.mark.parametrize('seqlen', [128])
 @pytest.mark.parametrize('dropout_p', [0.0, 0.17])
 # @pytest.mark.parametrize('dropout_p', [0.0])
-def test_flash_attn_unpadded(seqlen, d, dropout_p, causal, dtype):
+def test_flash_attn_unpadded(seqlen, d, dropout_p, causal, dtype, share_q_k_mask):
     if seqlen >= 2048 and torch.cuda.get_device_properties('cuda').total_memory <= 16 * 2**30:
         pytest.skip()  # Reference implementation OOM
+    if causal and not share_q_k_mask and dropout_p > 0.0:
+        pytest.xfail(
+            "probably fails due to convert_flash_attn_S_to_softmax not handling causal prefix attn"
+        )
     device = 'cuda'
     # if dtype == torch.float16:
     #     rtol, atol = (1e-3, 3e-4) if not causal else (1e-3, 1e-3)
@@ -547,7 +570,13 @@ def test_flash_attn_unpadded(seqlen, d, dropout_p, causal, dtype):
     Wqkv = torch.nn.Linear(nheads * d, 3 * nheads * d, device=device, dtype=dtype)
 
     query_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
-    key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
+    if not share_q_k_mask:
+        key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
+    else:
+        key_padding_mask = query_padding_mask
+    if causal and not share_q_k_mask:
+        # ensure there are at least as many keys/values as queries for causal prefix cross attention
+        key_padding_mask |= query_padding_mask
 
     (q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, q, k, v,
      output_pad_fn, dq_pad_fn, dk_pad_fn) = generate_qkv(
@@ -601,7 +630,9 @@ def test_flash_attn_unpadded(seqlen, d, dropout_p, causal, dtype):
     # of a Pytorch implementation.
     assert (output - output_ref).abs().max().item() <= 2 * (output_pt - output_ref).abs().max().item()
     # assert torch.allclose(output, output_ref, rtol=rtol, atol=atol)
-    assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
+    if not (causal and not share_q_k_mask):
+        # probably fails with causal due to convert_flash_attn_S_to_softmax not handling causal prefix attn
+        assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
     # assert torch.allclose(attn, attn_ref, rtol=rtol, atol=atol)
     if dropout_p == 0.0:
         assert dropout_mask.all()
@@ -737,6 +768,9 @@ def test_flash_attn_race_condition(seqlen, d, dropout_p, causal, dtype):
 
     query_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
     key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='random')
+    if causal:
+        # ensure there are at least as many keys/values as queries for causal prefix cross attention
+        key_padding_mask |= query_padding_mask
 
     (q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, q, k, v,
      output_pad_fn, dq_pad_fn, dk_pad_fn) = generate_qkv(
@@ -854,7 +888,7 @@ def test_flash_attn_multigpu():
     assert (dqkv - dqkv_ref).abs().max().item() <= 2 * (dqkv_pt - dqkv_ref).abs().max().item()
 
 
-from flash_attn.flash_attn_triton import flash_attn_func
+# from flash_attn.flash_attn_triton import flash_attn_func
 
 
 @pytest.mark.skipif(not is_sm80, reason='Triton version is only tested on A100')

From c74db73e646b3b6fef1bf8475692b2276c6910c9 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 18 Nov 2022 03:01:35 +0000
Subject: [PATCH 03/20] smaller matrix

---
 .github/workflows/publish.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ff6fa1cb90b..c5215ad5f9d 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -38,9 +38,9 @@ jobs:
       matrix:
           # os: [ubuntu-20.04]
           os: [ubuntu-18.04]
-          python-version: ['3.7', '3.8', '3.9', '3.10']
-          torch-version: [1.11.0, 1.12.0, 1.12.1]
-          cuda-version: ['113', '116']
+          python-version: ['3.9']
+          torch-version: [1.12.1]
+          cuda-version: ['116']
           exclude:
             - torch-version: 1.11.0
               cuda-version: '116'

From 6be76145ad9da50d7db063818ba2f258aa66b6c5 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Mon, 30 Jan 2023 19:16:23 +0000
Subject: [PATCH 04/20] optimize by ignoring unattended tokens

---
 .../src/fmha_dgrad_kernel_1xN_loop.h           | 17 +++++++++++++++--
 csrc/flash_attn/src/fmha_fprop_kernel_1xN.h    | 18 ++++++++++++++++--
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h b/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h
index cc761ee3422..3f5d948e3a1 100644
--- a/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h
+++ b/csrc/flash_attn/src/fmha_dgrad_kernel_1xN_loop.h
@@ -270,7 +270,19 @@ inline __device__ void compute_dq_dk_dv_1xN_one_iter(const Params &params, Prng
     Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
 
     static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
-    int begin = 0;
+    int begin;
+    if (Is_causal) {
+        int test_val = loop_step_idx * Cta_tile_p::N - (binfo.actual_seqlen_k - binfo.actual_seqlen_q);
+        if (loop_step_idx * Cta_tile_p::N < binfo.actual_seqlen_k - binfo.actual_seqlen_q) {
+            begin = 0;
+            // printf("%d, %d, %d, %d, %d, %d done1\n", Cta_tile_p::N, Cta_tile_p::M, binfo.actual_seqlen_k, binfo.actual_seqlen_q, loop_step_idx, begin);
+        } else {
+            begin = test_val / Cta_tile_p::M;
+            // printf("%d, %d, %d, %d, %d, %d done2\n", Cta_tile_p::N, Cta_tile_p::M, binfo.actual_seqlen_k, binfo.actual_seqlen_q, loop_step_idx, begin);
+        }
+    } else {
+        begin = 0;
+    }
     // Otherwise we'd be reading out-of-bound memory before the loop
     if (begin * Cta_tile_p::M >= binfo.actual_seqlen_q) {
         // Still need to zero out dk and dv before returning
@@ -678,7 +690,8 @@ inline __device__ void compute_dq_dk_dv_1xN_one_iter(const Params &params, Prng
         if (!Seq_parallel) {
             const bool is_final_write =
                 Is_last
-                || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k);
+                || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
+                || ((Is_causal) && ((begin + l + 1) * Cta_tile_p::M + (binfo.actual_seqlen_k - binfo.actual_seqlen_q - 1) < (loop_step_idx + 1) * Cta_tile_p::N));
             if (is_final_write) {
                 // if (Is_dropout) {
                 //     dq_out[0] = fmha::fmul4(dq_out[0], params.rp_dropout);
diff --git a/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h b/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h
index d3c7540684d..73059039d33 100644
--- a/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h
+++ b/csrc/flash_attn/src/fmha_fprop_kernel_1xN.h
@@ -272,7 +272,20 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
 
     // Wind gmem tiles to the correct position.
     static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
-    int begin = 0;
+    // int begin = Is_causal ? (loop_step_idx * Cta_tile_p::N) / Cta_tile_p::M : 0;
+    int begin;
+    if (Is_causal) {
+        int test_val = loop_step_idx * Cta_tile_p::N - (binfo.actual_seqlen_k - binfo.actual_seqlen_q);
+        if (loop_step_idx * Cta_tile_p::N < binfo.actual_seqlen_k - binfo.actual_seqlen_q) {
+            begin = 0;
+            // printf("%d, %d, %d, %d, %d, %d done1\n", Cta_tile_p::N, Cta_tile_p::M, binfo.actual_seqlen_k, binfo.actual_seqlen_q, loop_step_idx, begin);
+        } else {
+            begin = test_val / Cta_tile_p::M;
+            // printf("%d, %d, %d, %d, %d, %d done2\n", Cta_tile_p::N, Cta_tile_p::M, binfo.actual_seqlen_k, binfo.actual_seqlen_q, loop_step_idx, begin);
+        }
+    } else {
+        begin = 0;
+    }
     // We want begin to be a multiple of gridDim.z
     // This is because the row indices processed by each threadblock must align between the
     // loop steps, otherwise we have a dependency between the blocks.
@@ -619,7 +632,8 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
 
         const bool is_final_write =
             Is_last
-            || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k);
+            || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
+            || ((Is_causal) && ((begin + l + 1) * Cta_tile_p::M + (binfo.actual_seqlen_k - binfo.actual_seqlen_q - 1) < (loop_step_idx + 1) * Cta_tile_p::N));
         #pragma unroll
         for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
             float sum = p_sum_o[jj][0];

From 6775b2b68af12594e27c03163f44d89e7d38a58e Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 31 Jan 2023 18:52:51 +0000
Subject: [PATCH 05/20] workflow: compile for py38

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c5215ad5f9d..ea3d28a3ffa 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -38,7 +38,7 @@ jobs:
       matrix:
           # os: [ubuntu-20.04]
           os: [ubuntu-18.04]
-          python-version: ['3.9']
+          python-version: ['3.8', '3.9']
           torch-version: [1.12.1]
           cuda-version: ['116']
           exclude:

From 432ba04ba79a540ad2d0f9e70a2b9d153cb308e3 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 21 Mar 2023 21:04:53 +0000
Subject: [PATCH 06/20] publish for pytorch 2

---
 .github/workflows/publish.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ea3d28a3ffa..9c7622646d5 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -39,12 +39,15 @@ jobs:
           # os: [ubuntu-20.04]
           os: [ubuntu-18.04]
           python-version: ['3.8', '3.9']
-          torch-version: [1.12.1]
-          cuda-version: ['116']
+          torch-version: [1.12.1, 2.0.0]
+          cuda-version: ['116', '118']
           exclude:
             - torch-version: 1.11.0
               cuda-version: '116'
-
+            - torch-version: 1.12.1
+              cuda-version: '118'
+            - torch-version: 2.0.0
+              cuda-version: '116'
     steps:
       - name: Checkout
         uses: actions/checkout@v3

From 710cb47d05e409f3a08f7f7e51ad61de0bd94196 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 21 Mar 2023 21:31:48 +0000
Subject: [PATCH 07/20] cuda 118 scripts

---
 .github/workflows/publish.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9c7622646d5..041680f3137 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -38,9 +38,12 @@ jobs:
       matrix:
           # os: [ubuntu-20.04]
           os: [ubuntu-18.04]
-          python-version: ['3.8', '3.9']
-          torch-version: [1.12.1, 2.0.0]
-          cuda-version: ['116', '118']
+          # python-version: ['3.8', '3.9']
+          # torch-version: [1.12.1, 2.0.0]
+          # cuda-version: ['116', '118']
+          python-version: ['3.9']
+          torch-version: [2.0.0]
+          cuda-version: ['118']
           exclude:
             - torch-version: 1.11.0
               cuda-version: '116'

From 3a9d6e9c0f25c17e5ac9be9923b47e867e91394d Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 21 Mar 2023 21:33:51 +0000
Subject: [PATCH 08/20] cuda 118 scripts attempt 2

---
 .github/workflows/cuda/cu118-Linux-env.sh |  9 +++++++++
 .github/workflows/cuda/cu118-Linux.sh     | 15 +++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 .github/workflows/cuda/cu118-Linux-env.sh
 create mode 100644 .github/workflows/cuda/cu118-Linux.sh

diff --git a/.github/workflows/cuda/cu118-Linux-env.sh b/.github/workflows/cuda/cu118-Linux-env.sh
new file mode 100644
index 00000000000..c85efc6f098
--- /dev/null
+++ b/.github/workflows/cuda/cu118-Linux-env.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+CUDA_HOME=/usr/local/cuda-11.8
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
+export CUDA_HOME=/usr/local/cuda-11.8
\ No newline at end of file
diff --git a/.github/workflows/cuda/cu118-Linux.sh b/.github/workflows/cuda/cu118-Linux.sh
new file mode 100644
index 00000000000..8cf47a75c6f
--- /dev/null
+++ b/.github/workflows/cuda/cu118-Linux.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+OS=ubuntu1804
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
+sudo apt-key add /var/cuda-repo-${OS}-11-8-local/7fa2af80.pub
+
+sudo apt-get -qq update
+sudo apt install cuda cuda-nvcc-11-8 cuda-libraries-dev-11-8
+sudo apt clean
+
+rm -f https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
\ No newline at end of file

From 5e784b91dd668946d4e4519eb43707ff671bd3fe Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 21 Mar 2023 21:43:33 +0000
Subject: [PATCH 09/20] cuda 118 keys

---
 .github/workflows/cuda/cu118-Linux.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cuda/cu118-Linux.sh b/.github/workflows/cuda/cu118-Linux.sh
index 8cf47a75c6f..ca15e1d100e 100644
--- a/.github/workflows/cuda/cu118-Linux.sh
+++ b/.github/workflows/cuda/cu118-Linux.sh
@@ -6,6 +6,7 @@ wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/c
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
 wget -nv https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
 sudo dpkg -i cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
+sudo cp /var/cuda-repo-ubuntu1804-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
 sudo apt-key add /var/cuda-repo-${OS}-11-8-local/7fa2af80.pub
 
 sudo apt-get -qq update

From b25cc195cd52d70a004a092016fde856cff9b296 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 26 May 2023 12:54:18 +0000
Subject: [PATCH 10/20] publish torch 1.13.1

---
 .github/workflows/publish.yml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 041680f3137..f83453e0170 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -42,15 +42,21 @@ jobs:
           # torch-version: [1.12.1, 2.0.0]
           # cuda-version: ['116', '118']
           python-version: ['3.9']
-          torch-version: [2.0.0]
-          cuda-version: ['118']
+          torch-version: [1.12.1, 1.13.1, 2.0.0]
+          cuda-version: ['116', '117', '118']
           exclude:
-            - torch-version: 1.11.0
-              cuda-version: '116'
+            - torch-version: 1.12.1
+              cuda-version: '117'
             - torch-version: 1.12.1
               cuda-version: '118'
+            - torch-version: 1.13.1
+              cuda-version: '116'
+            - torch-version: 1.13.1
+              cuda-version: '118'
             - torch-version: 2.0.0
               cuda-version: '116'
+            - torch-version: 2.0.0
+              cuda-version: '117'
     steps:
       - name: Checkout
         uses: actions/checkout@v3

From 78633d3f03e2aa4649362fdfec269da8162a3cc0 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 26 May 2023 13:12:26 +0000
Subject: [PATCH 11/20] cu117

---
 .github/workflows/cuda/cu117-Linux-env.sh |  9 +++++++++
 .github/workflows/cuda/cu117-Linux.sh     | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 .github/workflows/cuda/cu117-Linux-env.sh
 create mode 100644 .github/workflows/cuda/cu117-Linux.sh

diff --git a/.github/workflows/cuda/cu117-Linux-env.sh b/.github/workflows/cuda/cu117-Linux-env.sh
new file mode 100644
index 00000000000..ab432d16fe2
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Linux-env.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+CUDA_HOME=/usr/local/cuda-11.7
+LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
+export CUDA_HOME=/usr/local/cuda-11.7
\ No newline at end of file
diff --git a/.github/workflows/cuda/cu117-Linux.sh b/.github/workflows/cuda/cu117-Linux.sh
new file mode 100644
index 00000000000..2011bbf67c9
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Linux.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+OS=ubuntu1804
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
+sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
+sudo cp /var/cuda-repo-ubuntu1804-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
+sudo apt-key add /var/cuda-repo-${OS}-11-7-local/7fa2af80.pub
+
+sudo apt-get -qq update
+sudo apt install cuda cuda-nvcc-11-7 cuda-libraries-dev-11-7
+sudo apt clean
+
+rm -f https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
\ No newline at end of file

From f3ef62509f76da34cd3518d5d6b15523c78ddbfc Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 26 May 2023 13:14:27 +0000
Subject: [PATCH 12/20] ubuntu-latest

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index f83453e0170..7a90e57d129 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -37,7 +37,7 @@ jobs:
       fail-fast: false
       matrix:
           # os: [ubuntu-20.04]
-          os: [ubuntu-18.04]
+          os: [ubuntu-latest]
           # python-version: ['3.8', '3.9']
           # torch-version: [1.12.1, 2.0.0]
           # cuda-version: ['116', '118']

From e3213d31f7fd5474cc4e4c445dff24b286da7c6a Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 26 May 2023 13:29:04 +0000
Subject: [PATCH 13/20] include necessary packages

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 7a90e57d129..1bfdc6956e3 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -93,7 +93,7 @@ jobs:
 
       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
-          pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
+          pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses packaging einops && conda clean -ya
           pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html
           python --version
           python -c "import torch; print('PyTorch:', torch.__version__)"

From e64cdeb3bac7a63fb7f167077abafb53ba164c21 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 26 May 2023 14:41:24 +0000
Subject: [PATCH 14/20] ubuntu 2004

---
 .github/workflows/publish.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 1bfdc6956e3..9ee06d92a64 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -36,8 +36,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-          # os: [ubuntu-20.04]
-          os: [ubuntu-latest]
+          os: [ubuntu-20.04]
           # python-version: ['3.8', '3.9']
           # torch-version: [1.12.1, 2.0.0]
           # cuda-version: ['116', '118']
@@ -119,7 +118,7 @@ jobs:
           export FORCE_CUDA="1"
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-          export CUDA_INSTALL_DIR=/usr/local/cuda-11.3$CUDA_INSTALL_DIR
+          export CUDA_INSTALL_DIR=/usr/local/cuda$CUDA_INSTALL_DIR
           pip install wheel
           python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${{ matrix.cuda-version }}torch${{ matrix.torch-version }}

From ccba2491fecb0097a16e7ddbf127df0ad2503b31 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 26 May 2023 15:27:49 +0000
Subject: [PATCH 15/20] fix os

---
 .github/workflows/cuda/cu116-Linux.sh | 2 +-
 .github/workflows/cuda/cu117-Linux.sh | 5 ++---
 .github/workflows/cuda/cu118-Linux.sh | 5 ++---
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh
index e3e4e2af75a..883d939fcdd 100644
--- a/.github/workflows/cuda/cu116-Linux.sh
+++ b/.github/workflows/cuda/cu116-Linux.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-OS=ubuntu1804
+OS=ubuntu2004
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
diff --git a/.github/workflows/cuda/cu117-Linux.sh b/.github/workflows/cuda/cu117-Linux.sh
index 2011bbf67c9..3935b4ddb96 100644
--- a/.github/workflows/cuda/cu117-Linux.sh
+++ b/.github/workflows/cuda/cu117-Linux.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
 
-OS=ubuntu1804
+OS=ubuntu2004
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
 wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
 sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
-sudo cp /var/cuda-repo-ubuntu1804-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
-sudo apt-key add /var/cuda-repo-${OS}-11-7-local/7fa2af80.pub
+sudo cp /var/cuda-repo-${OS}-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
 
 sudo apt-get -qq update
 sudo apt install cuda cuda-nvcc-11-7 cuda-libraries-dev-11-7
diff --git a/.github/workflows/cuda/cu118-Linux.sh b/.github/workflows/cuda/cu118-Linux.sh
index ca15e1d100e..832b3fa3812 100644
--- a/.github/workflows/cuda/cu118-Linux.sh
+++ b/.github/workflows/cuda/cu118-Linux.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
 
-OS=ubuntu1804
+OS=ubuntu2004
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
 wget -nv https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
 sudo dpkg -i cuda-repo-${OS}-11-8-local_11.8.0-520.61.05-1_amd64.deb
-sudo cp /var/cuda-repo-ubuntu1804-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
-sudo apt-key add /var/cuda-repo-${OS}-11-8-local/7fa2af80.pub
+sudo cp /var/cuda-repo-${OS}-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
 
 sudo apt-get -qq update
 sudo apt install cuda cuda-nvcc-11-8 cuda-libraries-dev-11-8

From 483123c31c1d753266cc18d9cfc6ca24c0766f5d Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 30 May 2023 12:41:46 +0000
Subject: [PATCH 16/20] allow larger bias matrices in triton impl

---
 .github/workflows/publish.yml   |  6 +++---
 flash_attn/flash_attn_triton.py | 18 +++++++++++++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9ee06d92a64..13923078720 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -41,7 +41,7 @@ jobs:
           # torch-version: [1.12.1, 2.0.0]
           # cuda-version: ['116', '118']
           python-version: ['3.9']
-          torch-version: [1.12.1, 1.13.1, 2.0.0]
+          torch-version: [1.12.1, 1.13.1, 2.0.1]
           cuda-version: ['116', '117', '118']
           exclude:
             - torch-version: 1.12.1
@@ -52,9 +52,9 @@ jobs:
               cuda-version: '116'
             - torch-version: 1.13.1
               cuda-version: '118'
-            - torch-version: 2.0.0
+            - torch-version: 2.0.1
               cuda-version: '116'
-            - torch-version: 2.0.0
+            - torch-version: 2.0.1
               cuda-version: '117'
     steps:
       - name: Checkout
diff --git a/flash_attn/flash_attn_triton.py b/flash_attn/flash_attn_triton.py
index 78b75885e12..4d50591c6a5 100644
--- a/flash_attn/flash_attn_triton.py
+++ b/flash_attn/flash_attn_triton.py
@@ -102,7 +102,12 @@ def _fwd_kernel(
     if BIAS_TYPE == 'vector':
         b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
     elif BIAS_TYPE == 'matrix':
-        b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
+        off_hb_b = off_hb.to(tl.int64)
+        off_b_b = off_hb_b // nheads
+        off_h_b = off_hb_b % nheads
+        start_m_b = start_m.to(tl.int64)
+        offs_m_b = start_m_b * BLOCK_M + tl.arange(0, BLOCK_M)
+        b_ptrs = Bias + off_b_b * stride_bb + off_h_b * stride_bh + (offs_m_b[:, None] * stride_bm + offs_n[None, :])
     # initialize pointer to m and l
     t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
     lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -311,7 +316,11 @@ def _bwd_kernel_one_col_block(
     if BIAS_TYPE == 'vector':
         b_ptrs = Bias + offs_n
     elif BIAS_TYPE == 'matrix':
-        b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
+        start_n_b = start_n.to(tl.int64)
+        begin_m_b = 0 if not IS_CAUSAL else ((start_n_b * BLOCK_N) // BLOCK_M) * BLOCK_M
+        offs_qm_b = begin_m_b + tl.arange(0, BLOCK_M)
+        offs_n_b = start_n_b * BLOCK_N + tl.arange(0, BLOCK_N)
+        b_ptrs = Bias + (offs_qm_b[:, None] * stride_bm + offs_n_b[None, :])
     # initialize dv and dk
     dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
     dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
@@ -538,7 +547,10 @@ def _bwd_kernel(
     DK += off_b * stride_dkb + off_h * stride_dkh
     DV += off_b * stride_dvb + off_h * stride_dvh
     if BIAS_TYPE != 'none':
-        Bias += off_b * stride_bb + off_h * stride_bh
+        if BIAS_TYPE == 'matrix':
+            Bias += off_b.to(tl.int64) * stride_bb + off_h.to(tl.int64) * stride_bh
+        else:
+            Bias += off_b * stride_bb + off_h * stride_bh
     # pointer to row-wise quantities in value-like data
     D += off_hb * seqlen_q_rounded
     LSE += off_hb * seqlen_q_rounded

From e7300e64bccfd98c6c811a21f828e9eb76a65a79 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 30 May 2023 13:03:16 +0000
Subject: [PATCH 17/20] build using index url

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 13923078720..ca534811573 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -93,7 +93,7 @@ jobs:
       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
           pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses packaging einops && conda clean -ya
-          pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html
+          pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
           python --version
           python -c "import torch; print('PyTorch:', torch.__version__)"
           python -c "import torch; print('CUDA:', torch.version.cuda)"

From b6f8595c231330553eac566624585156111ef763 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 30 May 2023 13:18:07 +0000
Subject: [PATCH 18/20] fix no index

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ca534811573..02554d29a8a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -93,7 +93,7 @@ jobs:
       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
           pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses packaging einops && conda clean -ya
-          pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
+          pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
           python --version
           python -c "import torch; print('PyTorch:', torch.__version__)"
           python -c "import torch; print('CUDA:', torch.version.cuda)"

From d7cd1258161ce4f54e263b43ddbadd0f367cd0e0 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 30 May 2023 13:46:27 +0000
Subject: [PATCH 19/20] preinstall setuptools

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 02554d29a8a..4bf72e5f18f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -92,7 +92,7 @@ jobs:
 
       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
-          pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses packaging einops && conda clean -ya
+          pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses packaging einops setuptools && conda clean -ya
           pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
           python --version
           python -c "import torch; print('PyTorch:', torch.__version__)"

From 67bd6c044af00835632c2acf11fdca6c7bbde6e7 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Tue, 30 May 2023 14:12:55 +0000
Subject: [PATCH 20/20] try extra index url

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 4bf72e5f18f..22e10c3f211 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -93,7 +93,7 @@ jobs:
       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
           pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses packaging einops setuptools && conda clean -ya
-          pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
+          pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }} --extra-index-url https://pypi.org/simple
           python --version
           python -c "import torch; print('PyTorch:', torch.__version__)"
           python -c "import torch; print('CUDA:', torch.version.cuda)"