Initial version runs end to end, but the loss computation has a bug and training does not converge

2026-01-08 09:43:23 +08:00
parent efd76bccd2
commit f7601e9170
11 changed files with 656 additions and 63 deletions

multi_gpu_temporal_train.sh Executable file

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Simple multi-GPU training script for SwiftFormerTemporal
# Usage: ./multi_gpu_temporal_train.sh <NUM_GPUS> [OPTIONS]
NUM_GPUS=${1:-2}
# Drop the GPU-count argument so only the remaining options are forwarded via "$@"
# (|| true keeps the script working when no arguments are passed at all)
shift || true
echo "Starting multi-GPU training with $NUM_GPUS GPUs"
# Set environment variables for distributed training
export MASTER_PORT=12345
export MASTER_ADDR=localhost
export WORLD_SIZE=$NUM_GPUS
# Launch training
torchrun --nproc_per_node=$NUM_GPUS --master_port=$MASTER_PORT main_temporal.py \
--data-path "./videos" \
--model SwiftFormerTemporal_XS \
--batch-size 32 \
--epochs 100 \
--lr 1e-3 \
--output-dir "./temporal_output_multi" \
--num-workers 8 \
--pin-mem \
"$@"