83_perf_train_cpu_many
---
# Kubeflow PyTorchJob that runs the Transformers question-answering example
# (run_qa.py) with torchrun across several CPU worker pods.
#
# - Workers: 4 replicas, with an elastic policy of 1-4 replicas using the
#   c10d rendezvous backend.
# - Training: CPU only (--no_cuda) with the ccl DDP backend, bf16 and IPEX.
# - Storage: the PVC "transformers-pvc" is mounted at /tmp/pvc-mount and holds
#   the model/dataset caches and the timestamped output directory; a
#   memory-backed emptyDir is mounted at /dev/shm.
#
# Replace the image placeholder and adjust the CPU/memory/thread values to
# match your nodes before applying.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: transformers-pytorchjob
spec:
  elasticPolicy:
    rdzvBackend: c10d
    minReplicas: 1
    maxReplicas: 4
    maxRestarts: 10
  pytorchReplicaSpecs:
    Worker:
      replicas: 4  # Number of worker pods
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # Docker image to use for the worker pods (placeholder)
              image: "<image name>:<tag>"
              imagePullPolicy: IfNotPresent
              command: ["/bin/bash", "-c"]
              args:
                # Folded scalar (>-): lines at the base indent are joined with
                # spaces, while the more-indented option lines keep their
                # newlines, so each trailing backslash acts as a shell line
                # continuation.
                - >-
                  cd /workspace/transformers;
                  pip install -r /workspace/transformers/examples/pytorch/question-answering/requirements.txt;
                  source /usr/local/lib/python3.10/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh;
                  torchrun /workspace/transformers/examples/pytorch/question-answering/run_qa.py \
                    --model_name_or_path distilbert/distilbert-base-uncased \
                    --dataset_name squad \
                    --do_train \
                    --do_eval \
                    --per_device_train_batch_size 12 \
                    --learning_rate 3e-5 \
                    --num_train_epochs 2 \
                    --max_seq_length 384 \
                    --doc_stride 128 \
                    --output_dir /tmp/pvc-mount/output_$(date +%Y%m%d_%H%M%S) \
                    --no_cuda \
                    --ddp_backend ccl \
                    --bf16 \
                    --use_ipex;
              env:
                - name: LD_PRELOAD
                  value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so"
                - name: TRANSFORMERS_CACHE
                  value: "/tmp/pvc-mount/transformers_cache"
                - name: HF_DATASETS_CACHE
                  value: "/tmp/pvc-mount/hf_datasets_cache"
                - name: LOGLEVEL
                  value: "INFO"
                - name: CCL_WORKER_COUNT
                  value: "1"
                - name: OMP_NUM_THREADS  # Can be tuned for optimal performance
                  value: "240"
              resources:
                limits:
                  # Update the CPU and memory limit values to match your nodes
                  cpu: 240
                  memory: 128Gi
                requests:
                  # Update the CPU and memory request values to match your nodes
                  cpu: 240
                  memory: 128Gi
              volumeMounts:
                - name: pvc-volume
                  mountPath: /tmp/pvc-mount
                - mountPath: /dev/shm
                  name: dshm
          # NOTE(review): the pod template says Never while the replica spec
          # above says OnFailure; confirm which one the operator honours.
          restartPolicy: Never
          # Optionally use nodeSelector to match a specific node label
          nodeSelector:
            node-type: gnr
          volumes:
            - name: pvc-volume
              persistentVolumeClaim:
                claimName: transformers-pvc
            - name: dshm
              emptyDir:
                medium: Memory

# Example pod listing for this job (columns as printed by `kubectl get pods`):
#
#   NAME                                READY   STATUS    RESTARTS   AGE
#   ...
#   transformers-pytorchjob-worker-0    1/1     Running   0          7m37s
#   transformers-pytorchjob-worker-1    1/1     Running   0          7m37s
#   transformers-pytorchjob-worker-2    1/1     Running   0          7m37s
#   transformers-pytorchjob-worker-3    1/1     Running   0          7m37s
#   ...