Add Deepseek R1 Distill Llama 8B/70B configs by wizeng23 · Pull Request #1263 · oumi-ai/oumi · GitHub

Add Deepseek R1 Distill Llama 8B/70B configs #1263


Merged · 6 commits · Jan 27, 2025
3 changes: 3 additions & 0 deletions configs/recipes/deepseek_r1/README.md
@@ -0,0 +1,3 @@
# Deepseek

Configs for Deepseek R1 models, including distilled models.
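
A minimal usage sketch for the recipes added here (commands taken from the example comments in the files below; the local run assumes a GPU machine with oumi installed):

# Evaluate the 8B distill locally:
oumi evaluate -c configs/recipes/deepseek_r1/evaluation/distill_llama_8b/eval.yaml

# Or launch the equivalent GCP job:
oumi launch up -c configs/recipes/deepseek_r1/evaluation/distill_llama_8b/gcp_job.yaml --cluster deepseek-r1-llama8b-eval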
25 changes: 25 additions & 0 deletions configs/recipes/deepseek_r1/evaluation/distill_llama_70b/eval.yaml
@@ -0,0 +1,25 @@
# Class: oumi.core.configs.EvaluationConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/evaluation_config.py

# Eval config for Deepseek R1 Distill Llama 3.3 70B.

model:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
model_max_length: 131072
torch_dtype_str: "bfloat16"
attn_implementation: "sdpa"
load_pretrained_weights: True
trust_remote_code: True
shard_for_eval: True

generation:
batch_size: 3

tasks:
# For all available tasks, see https://oumi.ai/docs/latest/user_guides/evaluate/evaluate.html
- evaluation_platform: lm_harness
task_name: mmlu_college_computer_science
eval_kwargs:
num_fewshot: 5

enable_wandb: True
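
A sketch of running this config directly with a command-line override; the dotted `--model.model_name` override is shown in the GCP job below, and applying the same pattern to other fields (here `generation.batch_size`) is an assumption:

# Assumed: dotted overrides work for generation fields as they do for model fields.
oumi evaluate \
  -c configs/recipes/deepseek_r1/evaluation/distill_llama_70b/eval.yaml \
  --generation.batch_size 1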
53 changes: 53 additions & 0 deletions configs/recipes/deepseek_r1/evaluation/distill_llama_70b/gcp_job.yaml
@@ -0,0 +1,53 @@
# Class: oumi.core.configs.JobConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py

# Config to eval Deepseek R1 Distill Llama 3.3 70B.
# Example command:
# oumi launch up -c configs/recipes/deepseek_r1/evaluation/distill_llama_70b/gcp_job.yaml --cluster deepseek-r1-llama70b-eval
name: deepseek-r1-distill-llama70b-eval

resources:
cloud: gcp
accelerators: "A100:4"
use_spot: false
  disk_size: 400 # Disk size in GB

working_dir: .

file_mounts:
~/.netrc: ~/.netrc # WandB credentials
~/.cache/huggingface/token: ~/.cache/huggingface/token # HF credentials

envs:
# NOTE: For SFT, update this to point to your model checkpoint.
# NOTE: For LoRA, instead update this to point to your LoRA adapter.
# The base model will be inferred automatically.
MODEL_CHECKPOINT_DIR: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
WANDB_PROJECT: oumi-eval
OUMI_RUN_NAME: deepseek-r1.llama70b.eval

setup: |
set -e
pip install uv && uv pip install '.[gpu,evaluation]' hf_transfer
  # Download the model from HF Hub. hf_transfer increases download speed compared to
  # downloading the model during eval.
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-70B

run: |
  set -e # Exit if any command fails.
source ./configs/examples/misc/sky_init.sh

if test ${OUMI_NUM_NODES} -ne 1; then
echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
exit 1
fi

echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
set -x

oumi evaluate \
-c configs/recipes/deepseek_r1/evaluation/distill_llama_70b/eval.yaml \
--run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}" \
--model.model_name "${MODEL_CHECKPOINT_DIR}"

echo "Node ${SKYPILOT_NODE_RANK} is all done!"
24 changes: 24 additions & 0 deletions configs/recipes/deepseek_r1/evaluation/distill_llama_8b/eval.yaml
@@ -0,0 +1,24 @@
# Class: oumi.core.configs.EvaluationConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/evaluation_config.py

# Eval config for Deepseek R1 Distill Llama 3.1 8B.

model:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_max_length: 131072
torch_dtype_str: "bfloat16"
attn_implementation: "sdpa"
load_pretrained_weights: True
trust_remote_code: True

generation:
batch_size: 4

tasks:
# For all available tasks, see https://oumi.ai/docs/latest/user_guides/evaluate/evaluate.html
- evaluation_platform: lm_harness
task_name: mmlu_college_computer_science
eval_kwargs:
num_fewshot: 5

enable_wandb: True
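
For local runs, the model can be pre-downloaded the same way the GCP job's setup step does (hf_transfer speeds up the download):

pip install hf_transfer
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-8B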
53 changes: 53 additions & 0 deletions configs/recipes/deepseek_r1/evaluation/distill_llama_8b/gcp_job.yaml
@@ -0,0 +1,53 @@
# Class: oumi.core.configs.JobConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py

# Config to eval Deepseek R1 Distill Llama 3.1 8B.
# Example command:
# oumi launch up -c configs/recipes/deepseek_r1/evaluation/distill_llama_8b/gcp_job.yaml --cluster deepseek-r1-llama8b-eval
name: deepseek-r1-distill-llama8b-eval

resources:
cloud: gcp
accelerators: "A100:1"
use_spot: false

working_dir: .

file_mounts:
~/.netrc: ~/.netrc # WandB credentials
~/.cache/huggingface/token: ~/.cache/huggingface/token # HF credentials

envs:
# NOTE: For SFT, update this to point to your model checkpoint.
# NOTE: For LoRA, instead update this to point to your LoRA adapter.
# The base model will be inferred automatically.
MODEL_CHECKPOINT_DIR: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
WANDB_PROJECT: oumi-eval
OUMI_RUN_NAME: deepseek-r1.llama8b.eval

setup: |
set -e
pip install uv && uv pip install '.[gpu,evaluation]' hf_transfer
  # Download the model from HF Hub. hf_transfer increases download speed compared to
  # downloading the model during eval.
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-8B

run: |
  set -e # Exit if any command fails.
source ./configs/examples/misc/sky_init.sh

if test ${OUMI_NUM_NODES} -ne 1; then
echo "LM Harness supports max 1 node. Actual: ${OUMI_NUM_NODES} nodes."
exit 1
fi

echo "Starting evaluation for ${MODEL_CHECKPOINT_DIR} ..."
set -x

accelerate launch \
-m oumi evaluate \
-c configs/recipes/deepseek_r1/evaluation/distill_llama_8b/eval.yaml \
--run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}" \
--model.model_name "${MODEL_CHECKPOINT_DIR}"

echo "Node ${SKYPILOT_NODE_RANK} is all done!"
17 changes: 17 additions & 0 deletions configs/recipes/deepseek_r1/inference/distill_llama_70b_infer.yaml
@@ -0,0 +1,17 @@
# Class: oumi.core.configs.InferenceConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/inference_config.py

# Inference config for Deepseek R1 Distill Llama 3.3 70B.

model:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
model_max_length: 2048
torch_dtype_str: "bfloat16"
attn_implementation: "sdpa"
load_pretrained_weights: True
trust_remote_code: True

generation:
max_new_tokens: 2048

engine: NATIVE
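
A sketch of interactive chat with this config, assuming oumi's interactive infer flag (-i):

oumi infer -i -c configs/recipes/deepseek_r1/inference/distill_llama_70b_infer.yaml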
17 changes: 17 additions & 0 deletions configs/recipes/deepseek_r1/inference/distill_llama_8b_infer.yaml
@@ -0,0 +1,17 @@
# Class: oumi.core.configs.InferenceConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/inference_config.py

# Inference config for Deepseek R1 Distill Llama 3.1 8B.

model:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_max_length: 2048
torch_dtype_str: "bfloat16"
attn_implementation: "sdpa"
load_pretrained_weights: True
trust_remote_code: True

generation:
max_new_tokens: 2048

engine: NATIVE
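
The same applies to the 8B config; the generation override below assumes the dotted `--section.field` pattern used by the eval jobs in this PR:

oumi infer -i -c configs/recipes/deepseek_r1/inference/distill_llama_8b_infer.yaml \
  --generation.max_new_tokens 512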
55 changes: 55 additions & 0 deletions configs/recipes/deepseek_r1/sft/distill_llama_70b/full_gcp_job.yaml
@@ -0,0 +1,55 @@
# Class: oumi.core.configs.JobConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py

# Config to full fine-tune Deepseek R1 Distill Llama 3.3 70B.
# Example command:
# oumi launch up -c configs/recipes/deepseek_r1/sft/distill_llama_70b/full_gcp_job.yaml --cluster deepseek-r1-llama70b-sft
name: deepseek-r1-distill-llama70b-sft

resources:
cloud: gcp
accelerators: "A100-80GB:8"
# If you don't have quota for a non-spot VM, try setting use_spot to true.
# However, make sure you are saving your output to a mounted cloud storage in case of
# preemption. For more information, see:
# https://oumi.ai/docs/latest/user_guides/launch/launch.html#mount-cloud-storage
use_spot: false
  disk_size: 2000 # Disk size in GB

num_nodes: 1 # Set it to N for multi-node training.

working_dir: .

file_mounts:
~/.netrc: ~/.netrc # WandB credentials
~/.cache/huggingface/token: ~/.cache/huggingface/token # HF credentials

# NOTE: Uncomment the following lines to mount a cloud bucket to your VM.
# For more details, see https://oumi.ai/docs/latest/user_guides/launch/launch.html.
# storage_mounts:
# /gcs_dir:
# source: gs://<your-bucket>
# store: gcs

envs:
WANDB_PROJECT: oumi-train
OUMI_RUN_NAME: deepseek-r1.llama70b.fft

setup: |
set -e
pip install uv && uv pip install '.[gpu]' hf_transfer
  # Download the model from HF Hub. hf_transfer increases download speed compared to
  # downloading the model during training.
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-70B

run: |
  set -e # Exit if any command fails.
source ./configs/examples/misc/sky_init.sh

set -x
oumi distributed torchrun \
-m oumi train \
-c configs/recipes/deepseek_r1/sft/distill_llama_70b/full_train.yaml \
--training.run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}"

echo "Node ${SKYPILOT_NODE_RANK} is all done!"
55 changes: 55 additions & 0 deletions configs/recipes/deepseek_r1/sft/distill_llama_70b/full_train.yaml
@@ -0,0 +1,55 @@
# Class: oumi.core.configs.TrainingConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py

# SFT config for Deepseek R1 Distill Llama 3.3 70B.

model:
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
model_max_length: 2048
torch_dtype_str: "bfloat16"
attn_implementation: "sdpa"
chat_template: "llama3-instruct"
load_pretrained_weights: True
trust_remote_code: True

data:
train:
datasets:
- dataset_name: "yahma/alpaca-cleaned" # 51,760 examples
target_col: "prompt"
use_async_dataset: True

training:
trainer_type: "TRL_SFT"
save_steps: 200
num_train_epochs: 3
per_device_train_batch_size: 2
max_grad_norm: null

enable_gradient_checkpointing: True
gradient_checkpointing_kwargs:
use_reentrant: False
ddp_find_unused_parameters: False
optimizer: "adamw_torch_fused"
learning_rate: 2.0e-05
warmup_ratio: 0.02

dataloader_num_workers: "auto"
dataloader_prefetch_factor: 16

logging_steps: 100
log_model_summary: False
empty_device_cache_steps: 50
output_dir: "output/deepseek_r1_llama70b.fft"
include_performance_metrics: True
enable_wandb: True

fsdp:
enable_fsdp: True
cpu_offload: True
forward_prefetch: True

sharding_strategy: "FULL_SHARD"
state_dict_type: "SHARDED_STATE_DICT"
auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
transformer_layer_cls: "LlamaDecoderLayer"
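
On a standalone multi-GPU machine, this config can be run with the same command the GCP job uses (the run name below is illustrative):

oumi distributed torchrun \
  -m oumi train \
  -c configs/recipes/deepseek_r1/sft/distill_llama_70b/full_train.yaml \
  --training.run_name "deepseek-r1.llama70b.fft.local"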
46 changes: 46 additions & 0 deletions configs/recipes/deepseek_r1/sft/distill_llama_70b/lora_gcp_job.yaml
@@ -0,0 +1,46 @@
# Class: oumi.core.configs.JobConfig
# https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py

# Config to LoRA tune Deepseek R1 Distill Llama 3.3 70B.
# Example command:
# oumi launch up -c configs/recipes/deepseek_r1/sft/distill_llama_70b/lora_gcp_job.yaml --cluster deepseek-r1-llama70b-lora
name: deepseek-r1-distill-llama70b-lora

resources:
cloud: gcp
accelerators: "A100:8"
# If you don't have quota for a non-spot VM, try setting use_spot to true.
# However, make sure you are saving your output to a mounted cloud storage in case of
# preemption. For more information, see:
# https://oumi.ai/docs/latest/user_guides/launch/launch.html#mount-cloud-storage
use_spot: false
  disk_size: 1000 # Disk size in GB

working_dir: .

file_mounts:
~/.netrc: ~/.netrc # WandB credentials
~/.cache/huggingface/token: ~/.cache/huggingface/token # HF credentials

envs:
WANDB_PROJECT: oumi-train
OUMI_RUN_NAME: deepseek-r1.llama70b.lora

setup: |
set -e
pip install uv && uv pip install '.[gpu]' hf_transfer
  # Download the model from HF Hub. hf_transfer increases download speed compared to
  # downloading the model during training.
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-70B

run: |
  set -e # Exit if any command fails.
source ./configs/examples/misc/sky_init.sh

set -x
oumi distributed torchrun \
-m oumi train \
-c configs/recipes/deepseek_r1/sft/distill_llama_70b/lora_train.yaml \
--training.run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}"

echo "Node ${SKYPILOT_NODE_RANK} is all done!"