[tiny] Pin flash-attn version by wizeng23 · Pull Request #1775 · oumi-ai/oumi · GitHub

[tiny] Pin flash-attn version #1775

Merged · merged 3 commits on Jun 19, 2025

configs/examples/grpo_verl_countdown/gcp_job.yaml (2 changes: 1 addition & 1 deletion)

@@ -34,7 +34,7 @@ envs:
 setup: |
   set -e
   pip install uv && uv pip install oumi[gpu]
-  # 2.8.0.post2 has installation issues.
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
   pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |

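The same pinned install works as a drop-in wherever a job previously used an unpinned flash-attn. The sketch below is not a file from this PR: flash-attn compiles against the torch already present in the environment, which is why --no-build-isolation matters (with build isolation on, pip would build flash-attn in a clean environment that cannot see the installed torch), and the final version check is an assumed way to confirm the pin took effect.

    setup: |
      set -e
      # Install oumi first so torch is importable when flash-attn builds.
      pip install uv && uv pip install oumi[gpu]
      # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
      pip install -U "flash-attn==2.7.4.post1" --no-build-isolation
      # Assumed sanity check: print the version that actually got installed.
      python -c "import flash_attn; print(flash_attn.__version__)"
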
configs/examples/grpo_verl_geometry3k/gcp_job.yaml (1 change: 1 addition & 0 deletions)

@@ -41,6 +41,7 @@ setup: |
   # In the meantime, we need to use this specific commit to support vLLM 0.8.3:
   # https://github.com/volcengine/verl/pull/912
   pip install git+https://github.com/volcengine/verl.git@1ee730163f6326e9679644db62eb32c8d1947c7f
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
   pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |

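The verl line above pins a git dependency to an exact commit. If the dependency list ever moved out of the inline setup script, the same pins could be written in a requirements file using PEP 508 direct references; this is a hypothetical requirements.txt, not part of this PR, and flash-attn would still need --no-build-isolation passed on the pip install -r command line, since requirements files cannot set that flag per package.

    # Hypothetical requirements.txt expressing the same pins (PEP 508 direct references).
    verl @ git+https://github.com/volcengine/verl.git@1ee730163f6326e9679644db62eb32c8d1947c7f
    flash-attn==2.7.4.post1
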
configs/recipes/vision/phi3/sft/full/oumi_gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -45,7 +45,8 @@ setup: |
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download microsoft/Phi-3-vision-128k-instruct

-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.

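The context lines above show the pre-download pattern these vision jobs share: fetching weights during setup with hf_transfer enabled so training never blocks on the Hub. Both commands below appear verbatim in this PR's diffs; they are paired here only to show the optional --exclude filter used for repos that ship artifacts training does not need.

    setup: |
      # Pre-fetch weights at provision time; hf_transfer speeds up large downloads.
      HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download microsoft/Phi-3-vision-128k-instruct
      # Variant with an exclusion filter (from the SmolVLM jobs in this PR):
      # HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download HuggingFaceTB/SmolVLM-Instruct --exclude "onnx/*" "runs/*"
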
configs/recipes/vision/phi3/sft/full/trl_gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -45,7 +45,8 @@ setup: |
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download microsoft/Phi-3-vision-128k-instruct

-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.

configs/recipes/vision/phi3/sft/lora/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -45,7 +45,8 @@ setup: |
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download microsoft/Phi-3-vision-128k-instruct

-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.

configs/recipes/vision/phi4/sft/full/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -42,7 +42,8 @@ setup: |
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download microsoft/Phi-4-multimodal-instruct

   # The model requires flash_attention_2! Install it here.
-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation


 run: |

configs/recipes/vision/phi4/sft/lora/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -42,7 +42,8 @@ setup: |
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download microsoft/Phi-4-multimodal-instruct

   # The model requires flash_attention_2! Install it here.
-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation


 run: |

configs/recipes/vision/qwen2_5_vl_3b/sft/full/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -44,7 +44,8 @@ setup: |
   # Also, if you want to try it with a more efficient attention implementation,
   # you can install the `flash_attention_2` package and set `attn_implementation:
   # "flash_attention_2"` in the model config.
-  # pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.

configs/recipes/vision/qwen2_5_vl_3b/sft/lora/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -44,7 +44,8 @@ setup: |
   # Also, if you want to try it with a more efficient attention implementation,
   # you can install the `flash_attention_2` package and set `attn_implementation:
   # "flash_attention_2"` in the model config.
-  # pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  # pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.

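The qwen comments point at enabling flash attention in the model config rather than in the job file. Below is a hedged sketch of what that could look like in an oumi train config: only attn_implementation: "flash_attention_2" comes from the comment in this diff, while the surrounding keys and the model id are illustrative assumptions.

    model:
      model_name: "Qwen/Qwen2.5-VL-3B-Instruct"
      # Requires flash-attn to be installed during the job's setup step.
      attn_implementation: "flash_attention_2"
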
configs/recipes/vision/qwen2_vl_2b/evaluation/gcp_job.yaml (5 changes: 2 additions & 3 deletions)

@@ -38,7 +38,8 @@ envs:
 setup: |
   set -e
   pip install uv && uv pip install oumi[gpu] hf_transfer
-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

   # Install model from HF Hub. This tool increases download speed compared to
   # downloading the model during eval.

@@ -53,8 +54,6 @@ run: |
     exit 1
   fi

-  pip install -U flash-attn --no-build-isolation
-
   echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."

   set -x

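Beyond the pin, this file drops a duplicate install: flash-attn was previously installed in both setup and run. Since setup executes once when the node is provisioned while run executes on every launch, keeping installs out of run saves startup time on each job. A minimal skeleton of that division follows, with the actual evaluation command elided as a placeholder.

    setup: |
      # Runs once at provisioning: do all installs here.
      pip install uv && uv pip install oumi[gpu] hf_transfer
      # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
      pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

    run: |
      # Runs on every launch: no installs, just the workload.
      set -e
      echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
      # ... evaluation command goes here (elided in this sketch) ...
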
configs/recipes/vision/smolvlm/sft/full/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -41,7 +41,8 @@ setup: |
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download HuggingFaceTB/SmolVLM-Instruct --exclude "onnx/*" "runs/*"

-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.

configs/recipes/vision/smolvlm/sft/lora/gcp_job.yaml (3 changes: 2 additions & 1 deletion)

@@ -41,7 +41,8 @@ setup: |
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download HuggingFaceTB/SmolVLM-Instruct --exclude "onnx/*" "runs/*"

-  pip install -U flash-attn --no-build-isolation
+  # TODO: OPE-1336 - Remove version pin when error with later versions is fixed.
+  pip install -U "flash-attn==2.7.4.post1" --no-build-isolation

 run: |
   set -e # Exit if any command failed.
