# Source: gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/images/gpu/pytorch/issue_9827.py
#
# Copyright 2023 The gVisor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Reproduction case for https://github.com/google/gvisor/issues/9827."""

import os
import time

import lightning as L
import psutil
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import models
from torchvision import transforms
from torchvision.datasets import CIFAR100

# Report this process and its parent up front so they can be identified in
# the output of the run.
current_process = psutil.Process()
parent_process = current_process.parent()
print(f"Processes: {current_process=} {parent_process=}")


class NeuralNet(L.LightningModule):
  """ResNet-50 classifier wrapped as a Lightning module.

  The network is `torchvision.models.resnet50` loaded with
  `ResNet50_Weights.DEFAULT`; its final fully connected layer is replaced
  by a fresh `nn.Linear(2048, nbr_cat)`.
  """

  def __init__(self, nbr_cat):
    """Builds the network.

    Args:
      nbr_cat: Number of output features of the replacement `fc` layer.
    """
    super().__init__()

    backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    # Swap the classification head for one sized to `nbr_cat` outputs.
    backbone.fc = nn.Linear(2048, nbr_cat)

    self.module = backbone

  def forward(self, x):
    """Runs `x` through the wrapped ResNet-50 and returns its output."""
    return self.module(x)

  def training_step(self, batch, batch_idx):
    """Returns the cross-entropy loss for one `(inputs, targets)` batch.

    `batch_idx` is accepted but not used.
    """
    inputs, targets = batch
    return F.cross_entropy(self(inputs), targets)

  def configure_optimizers(self):
    """Returns an Adam optimizer over all parameters with lr=0.02."""
    return torch.optim.Adam(self.parameters(), lr=0.02)


def prepare_data():
  """Builds the CIFAR-100 training and validation data loaders.

  The dataset is read from the directory named by the
  `PYTORCH_DATASETS_DIR` environment variable and is never downloaded
  (`download=False`). Every image is resized to 224x224 and converted to a
  tensor. Both loaders use a batch size of 128 and 4 worker processes.

  Returns:
    A `(train_dataloader, val_dataloader)` tuple.

  Raises:
    KeyError: If `PYTORCH_DATASETS_DIR` is not set.
  """
  preprocess = transforms.Compose([
      transforms.Resize((224, 224)),
      transforms.ToTensor(),
  ])

  def build_loader(is_train):
    # The environment variable is looked up on every call, i.e. once per
    # split.
    dataset = CIFAR100(
        os.environ["PYTORCH_DATASETS_DIR"],
        train=is_train,
        download=False,
        transform=preprocess,
    )
    return DataLoader(dataset, batch_size=128, num_workers=4)

  # Tuple elements are evaluated left to right: training split first, then
  # the validation split.
  return build_loader(True), build_loader(False)


torch.set_float32_matmul_precision("medium")
train_dl, val_dl = prepare_data()
model = NeuralNet(100)
trainer = L.Trainer(max_epochs=1, strategy="ddp_notebook")

# The timed window covers both the fit call and the fixed 20-second sleep
# that follows it, so the reported duration includes the sleep.
start = time.time()
# TODO(gvisor.dev/issue/9827): Make this not take forever.
trainer.fit(model, train_dl, val_dl)
time.sleep(20)
end = time.time()

training_duration = end - start

print(f"Training duration (seconds): {training_duration}")