github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/copy_up.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package fs 16 17 import ( 18 "fmt" 19 "io" 20 21 "github.com/SagerNet/gvisor/pkg/abi/linux" 22 "github.com/SagerNet/gvisor/pkg/context" 23 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 24 "github.com/SagerNet/gvisor/pkg/hostarch" 25 "github.com/SagerNet/gvisor/pkg/log" 26 "github.com/SagerNet/gvisor/pkg/sentry/memmap" 27 "github.com/SagerNet/gvisor/pkg/sync" 28 "github.com/SagerNet/gvisor/pkg/syserror" 29 "github.com/SagerNet/gvisor/pkg/usermem" 30 ) 31 32 // copyUp copies a file in an overlay from a lower filesystem to an 33 // upper filesytem so that the file can be modified in the upper 34 // filesystem. Copying a file involves several steps: 35 // 36 // - All parent directories of the file are created in the upper 37 // filesystem if they don't exist there. For instance: 38 // 39 // upper /dir0 40 // lower /dir0/dir1/file 41 // 42 // copyUp of /dir0/dir1/file creates /dir0/dir1 in order to create 43 // /dir0/dir1/file. 44 // 45 // - The file content is copied from the lower file to the upper 46 // file. For symlinks this is the symlink target. For directories, 47 // upper directory entries are merged with lower directory entries 48 // so there is no need to copy any entries. 49 // 50 // - A subset of file attributes of the lower file are set on the 51 // upper file. These are the file owner, the file timestamps, 52 // and all non-overlay extended attributes. copyUp will fail if 53 // the upper filesystem does not support the setting of these 54 // attributes. 55 // 56 // The file's permissions are set when the file is created and its 57 // size will be brought up to date when its contents are copied. 58 // Notably no attempt is made to bring link count up to date because 59 // hard links are currently not preserved across overlay filesystems. 60 // 61 // - Memory mappings of the lower file are invalidated and memory 62 // references are transferred to the upper file. From this point on, 63 // memory mappings of the file will be backed by content in the upper 64 // filesystem. 65 // 66 // Synchronization: 67 // 68 // copyUp synchronizes with rename(2) using renameMu to ensure that 69 // parentage does not change while a file is being copied. In the context 70 // of rename(2), copyUpLockedForRename should be used to avoid deadlock on 71 // renameMu. 72 // 73 // The following operations synchronize with copyUp using copyMu: 74 // 75 // - InodeOperations, i.e. to ensure that looking up a directory takes 76 // into account new upper filesystem directories created by copy up, 77 // which subsequently can be modified. 78 // 79 // - FileOperations, i.e. to ensure that reading from a file does not 80 // continue using a stale, lower filesystem handle when the file is 81 // written to. 82 // 83 // Lock ordering: Dirent.mu -> Inode.overlay.copyMu -> Inode.mu. 84 // 85 // Caveats: 86 // 87 // If any step in copying up a file fails, copyUp cleans the upper 88 // filesystem of any partially up-to-date file. If this cleanup fails, 89 // the overlay may be in an unacceptable, inconsistent state, so copyUp 90 // panics. If copyUp fails because any step (above) fails, a generic 91 // error is returned. 92 // 93 // copyUp currently makes no attempt to optimize copying up file content. 94 // For large files, this means that copyUp blocks until the entire file 95 // is copied synchronously. 96 func copyUp(ctx context.Context, d *Dirent) error { 97 renameMu.RLock() 98 defer renameMu.RUnlock() 99 return copyUpLockedForRename(ctx, d) 100 } 101 102 // copyUpLockedForRename is the same as copyUp except that it does not lock 103 // renameMu. 104 // 105 // It copies each component of d that does not yet exist in the upper 106 // filesystem. If d already exists in the upper filesystem, it is a no-op. 107 // 108 // Any error returned indicates a failure to copy all of d. This may 109 // leave the upper filesystem filled with any number of parent directories 110 // but the upper filesystem will never be in an inconsistent state. 111 // 112 // Preconditions: d.Inode.overlay is non-nil. 113 func copyUpLockedForRename(ctx context.Context, d *Dirent) error { 114 for { 115 // Did we race with another copy up or does there 116 // already exist something in the upper filesystem 117 // for d? 118 d.Inode.overlay.copyMu.RLock() 119 if d.Inode.overlay.upper != nil { 120 d.Inode.overlay.copyMu.RUnlock() 121 // Done, d is in the upper filesystem. 122 return nil 123 } 124 d.Inode.overlay.copyMu.RUnlock() 125 126 // Find the next component to copy up. We will work our way 127 // down to the last component of d and finally copy it. 128 next := findNextCopyUp(ctx, d) 129 130 // Attempt to copy. 131 if err := doCopyUp(ctx, next); err != nil { 132 return err 133 } 134 } 135 } 136 137 // findNextCopyUp finds the next component of d from root that does not 138 // yet exist in the upper filesystem. The parent of this component is 139 // also returned, which is the root of the overlay in the worst case. 140 func findNextCopyUp(ctx context.Context, d *Dirent) *Dirent { 141 next := d 142 for parent := next.parent; ; /* checked in-loop */ /* updated in-loop */ { 143 // Does this parent have a non-nil upper Inode? 144 parent.Inode.overlay.copyMu.RLock() 145 if parent.Inode.overlay.upper != nil { 146 parent.Inode.overlay.copyMu.RUnlock() 147 // Note that since we found an upper, it is stable. 148 return next 149 } 150 parent.Inode.overlay.copyMu.RUnlock() 151 152 // Continue searching for a parent with a non-nil 153 // upper Inode. 154 next = parent 155 parent = next.parent 156 } 157 } 158 159 func doCopyUp(ctx context.Context, d *Dirent) error { 160 // Fail fast on Inode types we won't be able to copy up anyways. These 161 // Inodes may block in GetFile while holding copyMu for reading. If we 162 // then try to take copyMu for writing here, we'd deadlock. 163 t := d.Inode.overlay.lower.StableAttr.Type 164 if t != RegularFile && t != Directory && t != Symlink { 165 return linuxerr.EINVAL 166 } 167 168 // Wait to get exclusive access to the upper Inode. 169 d.Inode.overlay.copyMu.Lock() 170 defer d.Inode.overlay.copyMu.Unlock() 171 if d.Inode.overlay.upper != nil { 172 // We raced with another doCopyUp, no problem. 173 return nil 174 } 175 176 // Perform the copy. 177 return copyUpLocked(ctx, d.parent, d) 178 } 179 180 // copyUpLocked creates a copy of next in the upper filesystem of parent. 181 // 182 // copyUpLocked must be called with d.Inode.overlay.copyMu locked. 183 // 184 // Returns a generic error on failure. 185 // 186 // Preconditions: 187 // * parent.Inode.overlay.upper must be non-nil. 188 // * next.Inode.overlay.copyMu must be locked writable. 189 // * next.Inode.overlay.lower must be non-nil. 190 // * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory, 191 // or Symlink. 192 // * upper filesystem must support setting file ownership and timestamps. 193 func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { 194 // Extract the attributes of the file we wish to copy. 195 attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) 196 if err != nil { 197 log.Warningf("copy up failed to get lower attributes: %v", err) 198 return syserror.EIO 199 } 200 201 var childUpperInode *Inode 202 parentUpper := parent.Inode.overlay.upper 203 root := RootFromContext(ctx) 204 if root != nil { 205 defer root.DecRef(ctx) 206 } 207 208 // Create the file in the upper filesystem and get an Inode for it. 209 switch next.Inode.StableAttr.Type { 210 case RegularFile: 211 childFile, err := parentUpper.Create(ctx, root, next.name, FileFlags{Read: true, Write: true}, attrs.Perms) 212 if err != nil { 213 log.Warningf("copy up failed to create file: %v", err) 214 return syserror.EIO 215 } 216 defer childFile.DecRef(ctx) 217 childUpperInode = childFile.Dirent.Inode 218 219 case Directory: 220 if err := parentUpper.CreateDirectory(ctx, root, next.name, attrs.Perms); err != nil { 221 log.Warningf("copy up failed to create directory: %v", err) 222 return syserror.EIO 223 } 224 childUpper, err := parentUpper.Lookup(ctx, next.name) 225 if err != nil { 226 werr := fmt.Errorf("copy up failed to lookup directory: %v", err) 227 cleanupUpper(ctx, parentUpper, next.name, werr) 228 return syserror.EIO 229 } 230 defer childUpper.DecRef(ctx) 231 childUpperInode = childUpper.Inode 232 233 case Symlink: 234 childLower := next.Inode.overlay.lower 235 link, err := childLower.Readlink(ctx) 236 if err != nil { 237 log.Warningf("copy up failed to read symlink value: %v", err) 238 return syserror.EIO 239 } 240 if err := parentUpper.CreateLink(ctx, root, link, next.name); err != nil { 241 log.Warningf("copy up failed to create symlink: %v", err) 242 return syserror.EIO 243 } 244 childUpper, err := parentUpper.Lookup(ctx, next.name) 245 if err != nil { 246 werr := fmt.Errorf("copy up failed to lookup symlink: %v", err) 247 cleanupUpper(ctx, parentUpper, next.name, werr) 248 return syserror.EIO 249 } 250 defer childUpper.DecRef(ctx) 251 childUpperInode = childUpper.Inode 252 253 default: 254 panic(fmt.Sprintf("copy up of invalid type %v on %+v", next.Inode.StableAttr.Type, next)) 255 } 256 257 // Bring file attributes up to date. This does not include size, which will be 258 // brought up to date with copyContentsLocked. 259 if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { 260 werr := fmt.Errorf("copy up failed to copy up attributes: %v", err) 261 cleanupUpper(ctx, parentUpper, next.name, werr) 262 return syserror.EIO 263 } 264 265 // Copy the entire file. 266 if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { 267 werr := fmt.Errorf("copy up failed to copy up contents: %v", err) 268 cleanupUpper(ctx, parentUpper, next.name, werr) 269 return syserror.EIO 270 } 271 272 lowerMappable := next.Inode.overlay.lower.Mappable() 273 upperMappable := childUpperInode.Mappable() 274 if lowerMappable != nil && upperMappable == nil { 275 werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence") 276 cleanupUpper(ctx, parentUpper, next.name, werr) 277 return syserror.EIO 278 } 279 280 // Propagate memory mappings to the upper Inode. 281 next.Inode.overlay.mapsMu.Lock() 282 defer next.Inode.overlay.mapsMu.Unlock() 283 if upperMappable != nil { 284 // Remember which mappings we added so we can remove them on failure. 285 allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) 286 for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { 287 added := make(memmap.MappingsOfRange) 288 for m := range seg.Value() { 289 if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { 290 for m := range added { 291 upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) 292 } 293 for mr, mappings := range allAdded { 294 for m := range mappings { 295 upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) 296 } 297 } 298 return err 299 } 300 added[m] = struct{}{} 301 } 302 allAdded[seg.Range()] = added 303 } 304 } 305 306 // Take a reference on the upper Inode (transferred to 307 // next.Inode.overlay.upper) and make new translations use it. 308 overlay := next.Inode.overlay 309 overlay.dataMu.Lock() 310 childUpperInode.IncRef() 311 overlay.upper = childUpperInode 312 overlay.dataMu.Unlock() 313 314 // Invalidate existing translations through the lower Inode. 315 overlay.mappings.InvalidateAll(memmap.InvalidateOpts{}) 316 317 // Remove existing memory mappings from the lower Inode. 318 if lowerMappable != nil { 319 for seg := overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { 320 for m := range seg.Value() { 321 lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) 322 } 323 } 324 } 325 326 return nil 327 } 328 329 // cleanupUpper is called when copy-up fails. It logs the copy-up error and 330 // attempts to remove name from parent. If that fails, then it panics. 331 func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) { 332 log.Warningf(copyUpErr.Error()) 333 if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil { 334 // Unfortunately we don't have much choice. We shouldn't 335 // willingly give the caller access to a nonsense filesystem. 336 panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err)) 337 } 338 } 339 340 // copyUpBuffers is a buffer pool for copying file content. The buffer 341 // size is the same used by io.Copy. 342 var copyUpBuffers = sync.Pool{ 343 New: func() interface{} { 344 b := make([]byte, 8*hostarch.PageSize) 345 return &b 346 }, 347 } 348 349 // copyContentsLocked copies the contents of lower to upper. It panics if 350 // less than size bytes can be copied. 351 func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size int64) error { 352 // We don't support copying up for anything other than regular files. 353 if lower.StableAttr.Type != RegularFile { 354 return nil 355 } 356 357 // Get a handle to the upper filesystem, which we will write to. 358 upperFile, err := overlayFile(ctx, upper, FileFlags{Write: true}) 359 if err != nil { 360 return err 361 } 362 defer upperFile.DecRef(ctx) 363 364 // Get a handle to the lower filesystem, which we will read from. 365 lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) 366 if err != nil { 367 return err 368 } 369 defer lowerFile.DecRef(ctx) 370 371 // Use a buffer pool to minimize allocations. 372 buf := copyUpBuffers.Get().(*[]byte) 373 defer copyUpBuffers.Put(buf) 374 375 // Transfer the contents. 376 // 377 // One might be able to optimize this by doing parallel reads, parallel writes and reads, larger 378 // buffers, etc. But we really don't know anything about the underlying implementation, so these 379 // optimizations could be self-defeating. So we leave this as simple as possible. 380 var offset int64 381 for { 382 nr, err := lowerFile.FileOperations.Read(ctx, lowerFile, usermem.BytesIOSequence(*buf), offset) 383 if err != nil && err != io.EOF { 384 return err 385 } 386 if nr == 0 { 387 if offset != size { 388 // Same as in cleanupUpper, we cannot live 389 // with ourselves if we do anything less. 390 panic(fmt.Sprintf("filesystem is in an inconsistent state: wrote only %d bytes of %d sized file", offset, size)) 391 } 392 return nil 393 } 394 nw, err := upperFile.FileOperations.Write(ctx, upperFile, usermem.BytesIOSequence((*buf)[:nr]), offset) 395 if err != nil { 396 return err 397 } 398 offset += nw 399 } 400 } 401 402 // copyAttributesLocked copies a subset of lower's attributes to upper, 403 // specifically owner, timestamps (except of status change time), and 404 // extended attributes. Notably no attempt is made to copy link count. 405 // Size and permissions are set on upper when the file content is copied 406 // and when the file is created respectively. 407 func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error { 408 // Extract attributes from the lower filesystem. 409 lowerAttr, err := lower.UnstableAttr(ctx) 410 if err != nil { 411 return err 412 } 413 lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX) 414 if err != nil && !linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { 415 return err 416 } 417 418 // Set the attributes on the upper filesystem. 419 if err := upper.InodeOperations.SetOwner(ctx, upper, lowerAttr.Owner); err != nil { 420 return err 421 } 422 if err := upper.InodeOperations.SetTimestamps(ctx, upper, TimeSpec{ 423 ATime: lowerAttr.AccessTime, 424 MTime: lowerAttr.ModificationTime, 425 }); err != nil { 426 return err 427 } 428 for name := range lowerXattr { 429 // Don't copy-up attributes that configure an overlay in the 430 // lower. 431 if isXattrOverlay(name) { 432 continue 433 } 434 value, err := lower.GetXattr(ctx, name, linux.XATTR_SIZE_MAX) 435 if err != nil { 436 return err 437 } 438 if err := upper.InodeOperations.SetXattr(ctx, upper, name, value, 0 /* flags */); err != nil { 439 return err 440 } 441 } 442 return nil 443 }