🚀 The feature, motivation and pitch
This was raised by @kwen2501: for mixed precision, we cast inputs per tensor. Kernel launching (see the CPU thread in the snapshot) can get blocked when there are many input tensors. Once roughly 1024 kernels have been launched, we start seeing "command buffer full" and cudaLaunchKernel blocks.
We probably need a best practice to avoid this situation, for example recommending that users pack/unpack inputs in user code (see the sketch below).
Not sure how often this happens in practice, though.
The per-tensor cast happens in pytorch/torch/distributed/fsdp/_fully_shard/_fsdp_state.py, lines 233 to 238 at commit 28af442.
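As a rough illustration of the pack/unpack idea, here is a minimal sketch (not an FSDP API: pack_inputs/unpack_inputs are hypothetical helpers, and it assumes all inputs share the same shape and dtype). Stacking the inputs into one tensor leaves FSDP's mixed-precision pre-forward with a single tensor to cast, so only one cast kernel is launched instead of one per input:

import torch

def pack_inputs(tensors):
    # One stacked tensor -> a single dtype cast (one kernel) in FSDP's pre-forward.
    return torch.stack(tensors)

def unpack_inputs(packed):
    # torch.unbind returns views, so unpacking adds no extra copies.
    return list(torch.unbind(packed))

# In the training loop, pass the packed tensor and unpack inside forward():
#   packed = pack_inputs(x)       # x is the list of per-sample tensors
#   out = model(packed)           # the model calls unpack_inputs(packed) in forward()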
repro
# torchrun --standalone --nproc_per_node=2 run_fsdp2.py
import contextlib
import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed.fsdp import MixedPrecisionPolicy

torch.cuda.memory._record_memory_history(max_entries=100000)


class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(4, 4, bias=False)

    def forward(self, x):
        return x[0].sum() + self.l1.weight.sum()


@contextlib.contextmanager
def enable_profiling(enable=False):
    if not enable:
        yield None
    else:
        trace_dir = "./profilers"
        rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0

        def trace_handler(prof):
            curr_trace_dir_name = "iteration_" + str(prof.step_num)
            curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
            if not os.path.exists(curr_trace_dir):
                os.makedirs(curr_trace_dir, exist_ok=True)
            prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")

        if not os.path.exists(trace_dir):
            os.makedirs(trace_dir, exist_ok=True)

        warmup, active = 1, 2
        wait = 1
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active),
            on_trace_ready=trace_handler,
            record_shapes=True,
        ) as torch_profiler:
            yield torch_profiler


def main():
    dist.init_process_group(backend="nccl")
    gpu_id = int(os.environ["LOCAL_RANK"])
    device = f"cuda:{gpu_id}"
    torch.cuda.set_device(device)
    torch.manual_seed(0)

    model = MyModel()
    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
    )
    fully_shard(model, mp_policy=mp_policy)
    optim = torch.optim.Adam(model.parameters(), lr=1e-2)

    # 10000 input tensors: FSDP's mixed-precision pre-forward casts each one,
    # launching one kernel per tensor.
    x = [torch.randn((1024, 1024), device=device, requires_grad=True)] * 10000
    stream = torch.cuda.Stream()
    m = torch.randn((16384, 16384), device=device, requires_grad=True)

    with enable_profiling(True) as prof:
        for _ in range(10):
            with torch.cuda.stream(stream):
                # Large matmul on a side stream keeps the GPU busy so launches queue up.
                torch.nn.functional.linear(m, m)
            model(x).sum().backward()
            optim.step()
            prof.step()


if __name__ == "__main__":
    main()
Alternatives
No response
Additional context
No response
cc @H-Huang @awgu @wanchaol @fegin @fduwjj @wz337 @wconstab @d4l3k @zhaojuanmao @mrshenli @rohan-varma @chauhang @mori360