81_DeepSpeed
Estimated memory needed for params, optim states and gradients for a: HW: Setup with 1 node, 1 GPU per node. SW: Model with 2783M total params, 65M largest layer params. per CPU | per GPU | Options 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=1 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=0 0.37GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=1 15.56GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=0 TrainingArguments(..., deepspeed="path/to/deepspeed_config.json") TrainingArguments(..., deepspeed={"key": "value"}) { "zero_optimization": { "stage": 1 } } { "fp16": { "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, "optimizer": { "type": "AdamW", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { "warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto" } }, "zero_optimization": { "stage": 3, "offload_optimizer": { "device": "nvme", "nvme_path": "/local_nvme", "pin_memory": true, "buffer_count": 4, "fast_init": false }, "offload_param": { "device": "nvme", "nvme_path": "/local_nvme", "pin_memory": true, "buffer_count": 5, "buffer_size": 1e8, "max_in_cpu": 1e9 }, "aio": { "block_size": 262144, "queue_depth": 32, "thread_count": 1, "single_submit": false, "overlap_events": true }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e9, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 2000, "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } { "optimizer": { "type": "AdamW", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } } } { "zero_allow_untested_optimizer": true } { "zero_force_ds_cpu_optimizer": false } { "fp16": { "enabled": false } } { "train_micro_batch_size_per_gpu": "auto", "train_batch_size": "auto" } { "gradient_accumulation_steps": "auto" } { "gradient_clipping": "auto" } { "communication_data_type": "fp32" } { "checkpoint": { "use_node_local_storage": true } } %%bash cat <<'EOT' > ds_config_zero3.json { "fp16": { "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, "optimizer": { "type": "AdamW", "params": { "lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { "warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto" } }, "zero_optimization": { "stage": 3, "offload_optimizer": { "device": "cpu", "pin_memory": true }, "offload_param": { "device": "cpu", "pin_memory": true }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e9, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 2000, "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } EOT { "zero_optimization": { "stage3_gather_16bit_weights_on_model_save": true } } { "fp16": { "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 } } 0%| | 0/189 [00:00