Parallelize Training
Alexandre Strube // Sabrina Benassou
June 25, 2024
class ImageNet(Dataset):
    """Map-style dataset that reads images listed in a JSON index file.

    The index ``imagenet_<split>.json`` is read from ``root`` and loaded as a
    mapping. Its keys are used as image paths relative to ``root`` and its
    values are returned as the matching targets.

    Args:
        root: Directory containing the index file and the images.
        split: Which index to load; either ``"train"`` or ``"val"``.
        transform: Optional callable applied to each RGB image before it is
            returned.

    Raises:
        ValueError: If ``split`` is neither ``"train"`` nor ``"val"``.
    """

    def __init__(self, root, split, transform=None):
        if split not in ["train", "val"]:
            raise ValueError("split must be either 'train' or 'val'")

        self.root = root
        self.transform = transform

        index_path = os.path.join(root, "imagenet_{}.json".format(split))
        with open(index_path, "rb") as f:
            data = json.load(f)

        # Keys and values are split into two parallel lists so that a sample
        # and its target share the same integer index.
        self.samples = list(data.keys())
        self.targets = list(data.values())

    def __len__(self):
        """Return the number of samples listed in the index."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the sample at position ``idx``."""
        path = os.path.join(self.root, self.samples[idx])

        # Open inside a context manager so the underlying file is closed
        # deterministically; ``convert`` yields an independent image that
        # stays valid after the source is closed.
        with Image.open(path) as img:
            x = img.convert("RGB")

        # Compare against None explicitly: a transform object that happens to
        # be falsy (e.g. an empty container of transforms) must still be used.
        if self.transform is not None:
            x = self.transform(x)
        return x, self.targets[idx]
class ImageNetDataModule(pl.LightningDataModule):
    """Lightning data module that serves the ``"train"`` split of ``ImageNet``.

    Args:
        data_root: Directory handed to ``ImageNet`` as its ``root``.
        batch_size: Batch size used by the training ``DataLoader``.
        num_workers: ``num_workers`` value used by the training ``DataLoader``.
        dataset_transforms: Transform forwarded to ``ImageNet``; ``None``
            leaves the images untransformed.
    """

    def __init__(
        self,
        data_root: str,
        batch_size: int,
        num_workers: int,
        # Previously annotated as ``dict()``, which is a call evaluated at
        # definition time rather than a type, and callers pass a transform
        # object here, not a dict.
        dataset_transforms: Optional[object],
    ):
        super().__init__()
        self.data_root = data_root
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.dataset_transforms = dataset_transforms

    def setup(self, stage: Optional[str] = None):
        """Create the training dataset; ``stage`` is accepted but not used."""
        self.train = ImageNet(self.data_root, "train", self.dataset_transforms)

    def train_dataloader(self):
        """Return a ``DataLoader`` over the dataset created in ``setup``."""
        # NOTE(review): ``shuffle`` is not set, so the loader itself does not
        # reorder samples — confirm this is intended for training.
        return DataLoader(
            self.train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )
class resnet50Model(pl.LightningModule):
    """Lightning module wrapping a ``resnet50`` built with ``pretrained=True``."""

    def __init__(self):
        super().__init__()
        backbone = resnet50(pretrained=True)
        self.model = backbone

    def forward(self, x):
        """Feed ``x`` through the wrapped network and return its output."""
        output = self.model(x)
        return output

    def training_step(self, batch):
        """Compute, log and return the cross-entropy loss of one batch."""
        images, targets = batch
        predictions = self.forward(images)
        loss = F.cross_entropy(predictions, targets)
        self.log("training_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``lr=0.02``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.02)
        return optimizer
# Preprocessing applied to every image: convert to a tensor, then resize
# to 256x256.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((256, 256)),
    ]
)

# 1. Organize the data
# The DataLoader worker count is taken from the Slurm per-task CPU allocation.
datamodule = ImageNetDataModule(
    data_root="/p/scratch/training2425/data/",
    batch_size=256,
    num_workers=int(os.getenv("SLURM_CPUS_PER_TASK")),
    dataset_transforms=transform,
)

# 2. Build the model using desired Task
model = resnet50Model()

# 3. Create the trainer
trainer = pl.Trainer(accelerator="gpu", max_epochs=10)

# 4. Train the model
trainer.fit(model, datamodule=datamodule)

# 5. Save the model!
trainer.save_checkpoint("image_classification_model.pt")
#!/bin/bash -x
# Slurm batch script: runs gpu_training.py as one task on one node with one GPU.
# The -x flag on the shebang makes bash print each command before running it.
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --time=06:00:00
#SBATCH --partition=dc-gpu
#SBATCH --account=training2425
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH --reservation=training2425
# Forward the job's --cpus-per-task value to srun through its
# SRUN_CPUS_PER_TASK input variable, so the task launched below gets the
# same number of CPUs.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
# Activate the course's Python environment.
source $HOME/course/$USER/sc_venv_template/activate.sh
# Run the training script shown above and report the elapsed time.
time srun python3 gpu_training.py
1 node and 4 GPUs
#!/bin/bash -x
# Slurm batch script: runs gpu_training.py on one node with four GPUs,
# launching four tasks with 24 CPUs each.
#SBATCH --nodes=1
#SBATCH --gres=gpu:4 # Use the 4 GPUs available
#SBATCH --ntasks-per-node=4 # One task per GPU: with pl this must equal the GPU count (4)
#SBATCH --cpus-per-task=24 # Divide the number of cpus (96) by the number of GPUs (4)
#SBATCH --time=02:00:00
#SBATCH --partition=dc-gpu
#SBATCH --account=training2425
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH --reservation=training2425
export CUDA_VISIBLE_DEVICES=0,1,2,3 # Very important to make the GPUs visible
# Forward the job's --cpus-per-task value to srun through its
# SRUN_CPUS_PER_TASK input variable.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
# Activate the course's Python environment.
source $HOME/course/$USER/sc_venv_template/activate.sh
# srun starts one copy of the script per task (4 here); report the elapsed time.
time srun python3 gpu_training.py
# Preprocessing applied to every image: convert to a tensor, then resize
# to 256x256.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((256, 256)),
    ]
)

# 1. The number of nodes
# Environment variables are strings, but the Trainer needs an integer node
# count, so convert explicitly. Fall back to a single node when the script
# is run outside a Slurm allocation.
nnodes = int(os.getenv("SLURM_NNODES", "1"))

# 2. Organize the data
# The DataLoader worker count is taken from the Slurm per-task CPU allocation.
datamodule = ImageNetDataModule(
    "/p/scratch/training2425/data/",
    128,
    int(os.getenv("SLURM_CPUS_PER_TASK")),
    transform,
)

# 3. Build the model using desired Task
model = resnet50Model()

# 4. Create the trainer
trainer = pl.Trainer(max_epochs=10, accelerator="gpu", num_nodes=nnodes)

# 5. Train the model
trainer.fit(model, datamodule=datamodule)

# 6. Save the model!
trainer.save_checkpoint("image_classification_model.pt")
16 nodes and 4 GPUs each
#!/bin/bash -x
# Slurm batch script: runs ddp_training.py on 16 nodes with four GPUs each,
# launching four tasks per node with 24 CPUs per task.
#SBATCH --nodes=16 # This needs to match Trainer(num_nodes=...)
#SBATCH --gres=gpu:4 # Use the 4 GPUs available
#SBATCH --ntasks-per-node=4 # One task per GPU: with pl this must equal the GPU count (4)
#SBATCH --cpus-per-task=24 # Divide the number of cpus (96) by the number of GPUs (4)
#SBATCH --time=00:15:00
#SBATCH --partition=dc-gpu
#SBATCH --account=training2425
#SBATCH --output=%j.out
#SBATCH --error=%j.err
#SBATCH --reservation=training2425
export CUDA_VISIBLE_DEVICES=0,1,2,3 # Very important to make the GPUs visible
# Forward the job's --cpus-per-task value to srun through its
# SRUN_CPUS_PER_TASK input variable.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
# Activate the course's Python environment.
source $HOME/course/$USER/sc_venv_template/activate.sh
# srun starts one copy of the script per task on every node; report the
# elapsed time.
time srun python3 ddp_training.py
With 4 nodes:
With 8 nodes:
With 16 nodes:
With 32 nodes:
# Settings that matter for the multi-node, multi-GPU job:
#SBATCH --nodes=16 # This needs to match Trainer(num_nodes=...)
#SBATCH --gres=gpu:4 # Use the 4 GPUs available
#SBATCH --ntasks-per-node=4 # One task per GPU: with pl this must equal the GPU count (4)
#SBATCH --cpus-per-task=24 # Divide the number of cpus (96) by the number of GPUs (4)
export CUDA_VISIBLE_DEVICES=0,1,2,3 # Very important to make the GPUs visible