How to Build Memory-Efficient Transformers with xFormers Using Packed Sequences, GQA, ALiBi, SwiGLU, and Causal Attention

How to Build Memory-Efficient Transformers with xFormers Using Packed Sequences, GQA, ALiBi, SwiGLU, and Causal Attention


# ---------------------------------------------------------------------------
# Section 4: several sequences of different lengths share ONE batch row.
# The tokens are concatenated along the sequence axis, and a block-diagonal
# attention bias built from the per-sequence lengths is passed to the kernel.
# ---------------------------------------------------------------------------
print("\n" + "="*70 + "\n4. Variable-length packed batch — no padding waste\n" + "="*70)
seqlens = [37, 120, 8, 200]
total = sum(seqlens)
H = 8
K = 64
# q, k, v are drawn in that order so the random stream is consumed exactly
# as three consecutive randn calls would consume it.
_packed_kwargs = dict(device=device, dtype=torch.float16)
q, k, v = (torch.randn(1, total, H, K, **_packed_kwargs) for _ in range(3))
try:
    bias = ab.BlockDiagonalMask.from_seqlens(seqlens)
    out_packed = xops.memory_efficient_attention(q, k, v, attn_bias=bias)

    # Compare the first segment of the packed output against a run over that
    # segment alone. NOTE(review): vanilla_attention is defined elsewhere in
    # the file; presumably a plain reference attention -- confirm its layout.
    s0 = seqlens[0]
    ref0 = vanilla_attention(*(t[:, :s0] for t in (q, k, v))).half()
    print("packed shape         :", tuple(out_packed.shape), "(all", total, "tokens, no pad)")
    _seg0_max_diff = (out_packed[:, :s0] - ref0).abs().max().item()
    print("segment-0 max diff   : {:.2e}".format(_seg0_max_diff))

    # Same packed layout again, this time with the causal variant of the bias.
    cbias = ab.BlockDiagonalCausalMask.from_seqlens(seqlens)
    _ = xops.memory_efficient_attention(q, k, v, attn_bias=cbias)
    print("-> also did a packed CAUSAL pass. This is how vLLM-style engines")
    print("   batch requests of different lengths with zero padding overhead.")

    # Cut the packed output back into per-sequence tensors and show shapes.
    splits = bias.split(out_packed)
    print("recovered segments   :", [tuple(t.shape) for t in splits])
except Exception as e:
    # Best-effort demo: report and move on if this xFormers build or backend
    # cannot run the block-diagonal path.
    print("BlockDiagonalMask path skipped on this version/backend:", repr(e))
# ---------------------------------------------------------------------------
# Section 5: grouped-query attention using the 5-D [B, M, G, H, K] layout.
#   B = batch, M = sequence length, G = number of KV groups,
#   H = query heads per group, K = per-head embedding size.
# ---------------------------------------------------------------------------
print("\n" + "="*70 + "\n5. Grouped-query attention (5-D BMGHK layout)\n" + "="*70)
B, M, K = 2, 256, 64
n_q_heads, n_kv_heads = 8, 2
# One group per KV head; each group serves n_q_heads // n_kv_heads query heads.
G, Hq = n_kv_heads, n_q_heads // n_kv_heads
try:
    qg = torch.randn(B, M, G, Hq, K, device=device, dtype=torch.float16)
    # Only a single K/V head is materialised per group (the compact form
    # a KV-cache would hold).
    kg = torch.randn(B, M, G, 1,  K, device=device, dtype=torch.float16)
    vg = torch.randn(B, M, G, 1,  K, device=device, dtype=torch.float16)
    # xFormers does not broadcast K/V across the heads-per-group axis on its
    # own: key/value must be presented with the same [B, M, G, Hq, K] shape as
    # the query. expand_as returns a view (no copy), so kg/vg stay compact in
    # memory. Passing the unexpanded [.., 1, K] tensors would raise and this
    # demo would silently fall into the "skipped" branch below.
    out_gqa = xops.memory_efficient_attention(
        qg,
        kg.expand_as(qg),
        vg.expand_as(qg),
    )
    print("GQA output shape     :", tuple(out_gqa.shape), "= [B, M, G, Hq, K]")
    print(f"-> {n_q_heads} query heads, only {n_kv_heads} KV heads: smaller KV-cache,")
    print("   which is exactly what Llama-/Mistral-class models use at inference.")
except Exception as e:
    # Best-effort demo: some xFormers builds/backends lack 5-D input support.
    print("GQA 5-D path skipped on this version/backend:", repr(e))



Source link