Apply rewrite for normal attention and MQA
Fixes a bug introduced in mlc-ai#1052, where using the `--use-flash-attn-mqa` flag on a model that doesn't use MQA prevented CUTLASS attention from being used at all.
Lunderberg committed Oct 27, 2023
1 parent 24f795e commit f43dd23
Showing 1 changed file with 3 additions and 1 deletion.
mlc_llm/core.py

@@ -454,7 +454,9 @@ def mod_transform_before_build(
         has_cutlass = tvm.get_global_func("relax.ext.cutlass", True)

         if has_cutlass and not args.no_cutlass_attn:
-            mod = rewrite_attention(use_flash_mqa=args.use_flash_attn_mqa)(mod)
+            if args.use_flash_attn_mqa:
+                mod = rewrite_attention(use_flash_mqa=True)(mod)
+            mod = rewrite_attention(use_flash_mqa=False)(mod)
             patterns += get_patterns_with_prefix("cutlass.attention")

         if has_cutlass and not args.no_cutlass_norm:
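For context on why both calls are needed: the MQA-specialized rewrite only matches attention expressed in MQA form, so the old single, flag-controlled call rewrote nothing on a plain multi-head-attention model, and the subsequent `cutlass.attention` pattern matching had nothing to offload. Below is a minimal toy sketch of that dispatch logic; it is illustration only, not the real TVM pattern-rewrite machinery, and `rewrite_attention_toy`, the module dict, and the operator names are invented for this example.

# Toy stand-in for mlc_llm's rewrite_attention: the rewrite fires only for the
# attention "kind" it was specialized for (all names here are hypothetical).
def rewrite_attention_toy(module, use_flash_mqa):
    target = "mqa_attention" if use_flash_mqa else "mha_attention"
    return {
        kind: ("cutlass.attention" if kind == target else op)
        for kind, op in module.items()
    }

# A model that uses ordinary multi-head attention, i.e. no MQA.
model = {"mha_attention": "naive_attention"}

# Old behavior: a single rewrite parameterized by the flag.  With
# --use-flash-attn-mqa set, nothing matches, so no CUTLASS offload happens.
old = rewrite_attention_toy(model, use_flash_mqa=True)
assert old == {"mha_attention": "naive_attention"}

# New behavior: apply the MQA rewrite first (when requested), then always the
# normal rewrite, so regular attention is still lowered to CUTLASS.
new = rewrite_attention_toy(model, use_flash_mqa=True)
new = rewrite_attention_toy(new, use_flash_mqa=False)
assert new == {"mha_attention": "cutlass.attention"}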
