gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/images/gpu/pytorch/issue_9827.py

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/images/gpu/pytorch/issue_9827.py (about)

     1  # Copyright 2023 The gVisor Authors.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #     http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  """Reproduction case for https://github.com/google/gvisor/issues/9827."""
    16  
    17  import os
    18  import time
    19  
    20  import lightning as L
    21  import psutil
    22  import torch
    23  from torch import nn
    24  import torch.nn.functional as F
    25  from torch.utils.data import DataLoader
    26  from torchvision import models
    27  from torchvision import transforms
    28  from torchvision.datasets import CIFAR100
    29  
# Look up this process and its parent through psutil and print both, so the
# process relationship is visible in the test output.
current_process = psutil.Process()
parent_process = current_process.parent()
# The `=` specifier makes the f-string emit each variable's name next to its
# repr, so the printed text depends on these exact variable names.
print(f"Processes: {current_process=} {parent_process=}")
    33  
    34  
    35  class NeuralNet(L.LightningModule):
    36    """NeuralNet is the neural network used in this test."""
    37  
    38    def __init__(self, nbr_cat):
    39      super().__init__()
    40  
    41      module = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    42      module.fc = nn.Linear(2048, nbr_cat)
    43  
    44      self.module = module
    45  
    46    def forward(self, x):
    47      return self.module(x)
    48  
    49    def training_step(self, batch, batch_idx):
    50      x, y = batch
    51      y_hat = self(x)
    52      loss = F.cross_entropy(y_hat, y)
    53      return loss
    54  
    55    def configure_optimizers(self):
    56      return torch.optim.Adam(self.parameters(), lr=0.02)
    57  
    58  
    59  def prepare_data():
    60    """prepare_data prepares the data to feed to the training pipeline."""
    61    pipeline = transforms.Compose([
    62        transforms.Resize((224, 224)),
    63        transforms.ToTensor(),
    64    ])
    65  
    66    train_ds = CIFAR100(os.environ["PYTORCH_DATASETS_DIR"],
    67                        train=True,
    68                        download=False,
    69                        transform=pipeline)
    70    train_dataloader = DataLoader(train_ds, batch_size=128, num_workers=4)
    71  
    72    val_ds = CIFAR100(os.environ["PYTORCH_DATASETS_DIR"],
    73                      train=False,
    74                      download=False,
    75                      transform=pipeline)
    76    val_dataloader = DataLoader(val_ds, batch_size=128, num_workers=4)
    77  
    78    return train_dataloader, val_dataloader
    79  
    80  
    81  torch.set_float32_matmul_precision("medium")
    82  train_dl, val_dl = prepare_data()
    83  model = NeuralNet(100)
    84  trainer = L.Trainer(max_epochs=1, strategy="ddp_notebook")
    85  
    86  start = time.time()
    87  # TODO(gvisor.dev/issue/9827): Make this not take forever.
    88  trainer.fit(model, train_dl, val_dl)
    89  time.sleep(20)
    90  end = time.time()
    91  
    92  training_duration = end - start
    93  
    94  print(f"Training duration (seconds): {training_duration}")