github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_file.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "math" 19 20 "github.com/metacubex/gvisor/pkg/abi/linux" 21 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 22 "github.com/metacubex/gvisor/pkg/fspath" 23 "github.com/metacubex/gvisor/pkg/gohacks" 24 "github.com/metacubex/gvisor/pkg/hostarch" 25 "github.com/metacubex/gvisor/pkg/marshal/primitive" 26 "github.com/metacubex/gvisor/pkg/sentry/arch" 27 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/lock" 28 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs" 29 "github.com/metacubex/gvisor/pkg/sentry/kernel" 30 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 31 "github.com/metacubex/gvisor/pkg/sentry/kernel/fasync" 32 "github.com/metacubex/gvisor/pkg/sentry/kernel/pipe" 33 "github.com/metacubex/gvisor/pkg/sentry/limits" 34 "github.com/metacubex/gvisor/pkg/sentry/vfs" 35 ) 36 37 // Mknod implements Linux syscall mknod(2). 38 func Mknod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 39 addr := args[0].Pointer() 40 mode := args[1].ModeT() 41 dev := args[2].Uint() 42 return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) 43 } 44 45 // Mknodat implements Linux syscall mknodat(2). 46 func Mknodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 47 dirfd := args[0].Int() 48 addr := args[1].Pointer() 49 mode := args[2].ModeT() 50 dev := args[3].Uint() 51 return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) 52 } 53 54 func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error { 55 path, err := copyInPath(t, addr) 56 if err != nil { 57 return err 58 } 59 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 60 if err != nil { 61 return err 62 } 63 defer tpop.Release(t) 64 65 // "Zero file type is equivalent to type S_IFREG." - mknod(2) 66 if mode.FileType() == 0 { 67 mode |= linux.ModeRegular 68 } 69 major, minor := linux.DecodeDeviceID(dev) 70 return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ 71 Mode: mode &^ linux.FileMode(t.FSContext().Umask()), 72 DevMajor: uint32(major), 73 DevMinor: minor, 74 }) 75 } 76 77 // Open implements Linux syscall open(2). 78 func Open(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 79 addr := args[0].Pointer() 80 flags := args[1].Uint() 81 mode := args[2].ModeT() 82 return openat(t, linux.AT_FDCWD, addr, flags, mode) 83 } 84 85 // Openat implements Linux syscall openat(2). 86 func Openat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 87 dirfd := args[0].Int() 88 addr := args[1].Pointer() 89 flags := args[2].Uint() 90 mode := args[3].ModeT() 91 return openat(t, dirfd, addr, flags, mode) 92 } 93 94 // Creat implements Linux syscall creat(2). 95 func Creat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 96 addr := args[0].Pointer() 97 mode := args[1].ModeT() 98 return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) 99 } 100 101 func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { 102 path, err := copyInPath(t, pathAddr) 103 if err != nil { 104 return 0, nil, err 105 } 106 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) 107 if err != nil { 108 return 0, nil, err 109 } 110 defer tpop.Release(t) 111 112 file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ 113 Flags: flags | linux.O_LARGEFILE, 114 Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), 115 }) 116 if err != nil { 117 return 0, nil, err 118 } 119 defer file.DecRef(t) 120 121 fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ 122 CloseOnExec: flags&linux.O_CLOEXEC != 0, 123 }) 124 return uintptr(fd), nil, err 125 } 126 127 // Access implements Linux syscall access(2). 128 func Access(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 129 addr := args[0].Pointer() 130 mode := args[1].ModeT() 131 132 return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode, 0 /* flags */) 133 } 134 135 // Faccessat implements Linux syscall faccessat(2). 136 func Faccessat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 137 dirfd := args[0].Int() 138 addr := args[1].Pointer() 139 mode := args[2].ModeT() 140 141 return 0, nil, accessAt(t, dirfd, addr, mode, 0 /* flags */) 142 } 143 144 // Faccessat2 implements Linux syscall faccessat2(2). 145 func Faccessat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 146 dirfd := args[0].Int() 147 addr := args[1].Pointer() 148 mode := args[2].ModeT() 149 flags := args[3].Int() 150 151 return 0, nil, accessAt(t, dirfd, addr, mode, flags) 152 } 153 154 func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint, flags int32) error { 155 const rOK = 4 156 const wOK = 2 157 const xOK = 1 158 159 // Sanity check the mode. 160 if mode&^(rOK|wOK|xOK) != 0 { 161 return linuxerr.EINVAL 162 } 163 164 // faccessat2(2) isn't documented as supporting AT_EMPTY_PATH, but it does. 165 if flags&^(linux.AT_EACCESS|linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH) != 0 { 166 return linuxerr.EINVAL 167 } 168 169 path, err := copyInPath(t, pathAddr) 170 if err != nil { 171 return err 172 } 173 tpop, err := getTaskPathOperation(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)) 174 if err != nil { 175 return err 176 } 177 defer tpop.Release(t) 178 179 creds := t.Credentials() 180 if flags&linux.AT_EACCESS == 0 { 181 // access(2) and faccessat(2) check permissions using real 182 // UID/GID, not effective UID/GID. 183 // 184 // "access() needs to use the real uid/gid, not the effective 185 // uid/gid. We do this by temporarily clearing all FS-related 186 // capabilities and switching the fsuid/fsgid around to the 187 // real ones." -fs/open.c:faccessat 188 creds = creds.Fork() 189 creds.EffectiveKUID = creds.RealKUID 190 creds.EffectiveKGID = creds.RealKGID 191 if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { 192 creds.EffectiveCaps = creds.PermittedCaps 193 } else { 194 creds.EffectiveCaps = 0 195 } 196 } 197 198 return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop) 199 } 200 201 // Ioctl implements Linux syscall ioctl(2). 202 func Ioctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 203 fd := args[0].Int() 204 205 file := t.GetFile(fd) 206 if file == nil { 207 return 0, nil, linuxerr.EBADF 208 } 209 defer file.DecRef(t) 210 211 if file.StatusFlags()&linux.O_PATH != 0 { 212 return 0, nil, linuxerr.EBADF 213 } 214 215 // Handle ioctls that apply to all FDs. 216 switch args[1].Int() { 217 case linux.FIONCLEX: 218 t.FDTable().SetFlags(t, fd, kernel.FDFlags{ 219 CloseOnExec: false, 220 }) 221 return 0, nil, nil 222 223 case linux.FIOCLEX: 224 t.FDTable().SetFlags(t, fd, kernel.FDFlags{ 225 CloseOnExec: true, 226 }) 227 return 0, nil, nil 228 229 case linux.FIONBIO: 230 var set int32 231 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { 232 return 0, nil, err 233 } 234 flags := file.StatusFlags() 235 if set != 0 { 236 flags |= linux.O_NONBLOCK 237 } else { 238 flags &^= linux.O_NONBLOCK 239 } 240 return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) 241 242 case linux.FIOASYNC: 243 var set int32 244 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { 245 return 0, nil, err 246 } 247 flags := file.StatusFlags() 248 if set != 0 { 249 flags |= linux.O_ASYNC 250 } else { 251 flags &^= linux.O_ASYNC 252 } 253 file.SetStatusFlags(t, t.Credentials(), flags) 254 return 0, nil, nil 255 256 case linux.FIOGETOWN, linux.SIOCGPGRP: 257 var who int32 258 owner, hasOwner := getAsyncOwner(t, file) 259 if hasOwner { 260 if owner.Type == linux.F_OWNER_PGRP { 261 who = -owner.PID 262 } else { 263 who = owner.PID 264 } 265 } 266 _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who) 267 return 0, nil, err 268 269 case linux.FIOSETOWN, linux.SIOCSPGRP: 270 var who int32 271 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil { 272 return 0, nil, err 273 } 274 ownerType := int32(linux.F_OWNER_PID) 275 if who < 0 { 276 // Check for overflow before flipping the sign. 277 if who-1 > who { 278 return 0, nil, linuxerr.EINVAL 279 } 280 ownerType = linux.F_OWNER_PGRP 281 who = -who 282 } 283 return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) 284 } 285 286 ret, err := file.Ioctl(t, t.MemoryManager(), sysno, args) 287 return ret, nil, err 288 } 289 290 // Getcwd implements Linux syscall getcwd(2). 291 func Getcwd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 292 addr := args[0].Pointer() 293 size := args[1].SizeT() 294 295 root := t.FSContext().RootDirectory() 296 wd := t.FSContext().WorkingDirectory() 297 s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) 298 root.DecRef(t) 299 wd.DecRef(t) 300 if err != nil { 301 return 0, nil, err 302 } 303 304 // Note this is >= because we need a terminator. 305 if uint(len(s)) >= size { 306 return 0, nil, linuxerr.ERANGE 307 } 308 309 // Construct a byte slice containing a NUL terminator. 310 buf := t.CopyScratchBuffer(len(s) + 1) 311 copy(buf, s) 312 buf[len(buf)-1] = 0 313 314 // Write the pathname slice. 315 n, err := t.CopyOutBytes(addr, buf) 316 if err != nil { 317 return 0, nil, err 318 } 319 return uintptr(n), nil, nil 320 } 321 322 // Chdir implements Linux syscall chdir(2). 323 func Chdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 324 addr := args[0].Pointer() 325 326 path, err := copyInPath(t, addr) 327 if err != nil { 328 return 0, nil, err 329 } 330 tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) 331 if err != nil { 332 return 0, nil, err 333 } 334 defer tpop.Release(t) 335 336 vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ 337 CheckSearchable: true, 338 }) 339 if err != nil { 340 return 0, nil, err 341 } 342 t.FSContext().SetWorkingDirectory(t, vd) 343 vd.DecRef(t) 344 return 0, nil, nil 345 } 346 347 // Fchdir implements Linux syscall fchdir(2). 348 func Fchdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 349 fd := args[0].Int() 350 351 tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) 352 if err != nil { 353 return 0, nil, err 354 } 355 defer tpop.Release(t) 356 357 vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ 358 CheckSearchable: true, 359 }) 360 if err != nil { 361 return 0, nil, err 362 } 363 t.FSContext().SetWorkingDirectory(t, vd) 364 vd.DecRef(t) 365 return 0, nil, nil 366 } 367 368 // Chroot implements Linux syscall chroot(2). 369 func Chroot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 370 addr := args[0].Pointer() 371 372 if !t.HasCapability(linux.CAP_SYS_CHROOT) { 373 return 0, nil, linuxerr.EPERM 374 } 375 376 path, err := copyInPath(t, addr) 377 if err != nil { 378 return 0, nil, err 379 } 380 tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) 381 if err != nil { 382 return 0, nil, err 383 } 384 defer tpop.Release(t) 385 386 vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ 387 CheckSearchable: true, 388 }) 389 if err != nil { 390 return 0, nil, err 391 } 392 t.FSContext().SetRootDirectory(t, vd) 393 vd.DecRef(t) 394 return 0, nil, nil 395 } 396 397 // PivotRoot implements Linux syscall pivot_root(2). 398 func PivotRoot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 399 addr1 := args[0].Pointer() 400 addr2 := args[1].Pointer() 401 402 if !t.HasCapability(linux.CAP_SYS_ADMIN) { 403 return 0, nil, linuxerr.EPERM 404 } 405 406 newRootPath, err := copyInPath(t, addr1) 407 if err != nil { 408 return 0, nil, err 409 } 410 newRootTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, newRootPath, disallowEmptyPath, followFinalSymlink) 411 if err != nil { 412 return 0, nil, err 413 } 414 defer newRootTpop.Release(t) 415 putOldPath, err := copyInPath(t, addr2) 416 if err != nil { 417 return 0, nil, err 418 } 419 putOldTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, putOldPath, disallowEmptyPath, followFinalSymlink) 420 if err != nil { 421 return 0, nil, err 422 } 423 defer putOldTpop.Release(t) 424 425 newRoot, oldRoot, err := t.Kernel().VFS().PivotRoot(t, t.Credentials(), &newRootTpop.pop, &putOldTpop.pop) 426 if err != nil { 427 return 0, nil, err 428 } 429 defer newRoot.DecRef(t) 430 defer oldRoot.DecRef(t) 431 t.Kernel().ReplaceFSContextRoots(t, oldRoot, newRoot) 432 return 0, nil, nil 433 } 434 435 // Close implements Linux syscall close(2). 436 func Close(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 437 fd := args[0].Int() 438 439 // Note that Remove provides a reference on the file that we may use to 440 // flush. It is still active until we drop the final reference below 441 // (and other reference-holding operations complete). 442 file := t.FDTable().Remove(t, fd) 443 if file == nil { 444 return 0, nil, linuxerr.EBADF 445 } 446 defer file.DecRef(t) 447 448 err := file.OnClose(t) 449 return 0, nil, HandleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file) 450 } 451 452 // CloseRange implements linux syscall close_range(2). 453 func CloseRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 454 first := args[0].Uint() 455 last := args[1].Uint() 456 flags := args[2].Uint() 457 458 if (first > last) || (last > math.MaxInt32) { 459 return 0, nil, linuxerr.EINVAL 460 } 461 462 if (flags & ^(linux.CLOSE_RANGE_CLOEXEC | linux.CLOSE_RANGE_UNSHARE)) != 0 { 463 return 0, nil, linuxerr.EINVAL 464 } 465 466 cloexec := flags & linux.CLOSE_RANGE_CLOEXEC 467 unshare := flags & linux.CLOSE_RANGE_UNSHARE 468 469 if unshare != 0 { 470 // If possible, we don't want to copy FDs to the new unshared table, because those FDs will 471 // be promptly closed and no longer used. So in the case where we know the range extends all 472 // the way to the end of the FdTable, we can simply copy the FdTable only up to the start of 473 // the range that we are closing. 474 if cloexec == 0 && int32(last) >= t.FDTable().GetLastFd() { 475 t.UnshareFdTable(int32(first)) 476 } else { 477 t.UnshareFdTable(math.MaxInt32) 478 } 479 } 480 481 if cloexec != 0 { 482 flagToApply := kernel.FDFlags{ 483 CloseOnExec: true, 484 } 485 t.FDTable().SetFlagsForRange(t.AsyncContext(), int32(first), int32(last), flagToApply) 486 return 0, nil, nil 487 } 488 489 fdTable := t.FDTable() 490 fd := int32(first) 491 for { 492 fd, file := fdTable.RemoveNextInRange(t, fd, int32(last)) 493 if file == nil { 494 break 495 } 496 497 fd++ 498 // Per the close_range(2) documentation, errors upon closing file descriptors are ignored. 499 _ = file.OnClose(t) 500 file.DecRef(t) 501 } 502 503 return 0, nil, nil 504 } 505 506 // Dup implements Linux syscall dup(2). 507 func Dup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 508 fd := args[0].Int() 509 510 file := t.GetFile(fd) 511 if file == nil { 512 return 0, nil, linuxerr.EBADF 513 } 514 defer file.DecRef(t) 515 516 newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) 517 if err != nil { 518 return 0, nil, linuxerr.EMFILE 519 } 520 return uintptr(newFD), nil, nil 521 } 522 523 // Dup2 implements Linux syscall dup2(2). 524 func Dup2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 525 oldfd := args[0].Int() 526 newfd := args[1].Int() 527 528 if oldfd == newfd { 529 // As long as oldfd is valid, dup2() does nothing and returns newfd. 530 file := t.GetFile(oldfd) 531 if file == nil { 532 return 0, nil, linuxerr.EBADF 533 } 534 file.DecRef(t) 535 return uintptr(newfd), nil, nil 536 } 537 538 return dup3(t, oldfd, newfd, 0) 539 } 540 541 // Dup3 implements Linux syscall dup3(2). 542 func Dup3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 543 oldfd := args[0].Int() 544 newfd := args[1].Int() 545 flags := args[2].Uint() 546 547 if oldfd == newfd { 548 return 0, nil, linuxerr.EINVAL 549 } 550 551 return dup3(t, oldfd, newfd, flags) 552 } 553 554 func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { 555 if flags&^linux.O_CLOEXEC != 0 { 556 return 0, nil, linuxerr.EINVAL 557 } 558 559 file := t.GetFile(oldfd) 560 if file == nil { 561 return 0, nil, linuxerr.EBADF 562 } 563 defer file.DecRef(t) 564 565 df, err := t.NewFDAt(newfd, file, kernel.FDFlags{ 566 CloseOnExec: flags&linux.O_CLOEXEC != 0, 567 }) 568 if linuxerr.Equals(linuxerr.EMFILE, err) { 569 err = linuxerr.EBADF 570 } 571 if err != nil { 572 return 0, nil, err 573 } 574 if df != nil { 575 // "If the file descriptor newfd was previously open, it is closed 576 // before being reused; the close is performed silently (i.e., any 577 // errors during the close are not reported by dup2())." - dup(2) 578 _ = df.OnClose(t) 579 df.DecRef(t) 580 } 581 return uintptr(newfd), nil, nil 582 } 583 584 // Fcntl implements linux syscall fcntl(2). 585 func Fcntl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 586 fd := args[0].Int() 587 cmd := args[1].Int() 588 589 file, flags := t.FDTable().Get(fd) 590 if file == nil { 591 return 0, nil, linuxerr.EBADF 592 } 593 defer file.DecRef(t) 594 595 if file.StatusFlags()&linux.O_PATH != 0 { 596 switch cmd { 597 case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: 598 // allowed 599 default: 600 return 0, nil, linuxerr.EBADF 601 } 602 } 603 604 switch cmd { 605 case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: 606 minfd := args[2].Int() 607 fd, err := t.NewFDFrom(minfd, file, kernel.FDFlags{ 608 CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, 609 }) 610 if err != nil { 611 return 0, nil, err 612 } 613 return uintptr(fd), nil, nil 614 case linux.F_GETFD: 615 return uintptr(flags.ToLinuxFDFlags()), nil, nil 616 case linux.F_SETFD: 617 flags := args[2].Uint() 618 err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{ 619 CloseOnExec: flags&linux.FD_CLOEXEC != 0, 620 }) 621 return 0, nil, err 622 case linux.F_GETFL: 623 return uintptr(file.StatusFlags()), nil, nil 624 case linux.F_SETFL: 625 return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) 626 case linux.F_GETOWN: 627 owner, hasOwner := getAsyncOwner(t, file) 628 if !hasOwner { 629 return 0, nil, nil 630 } 631 if owner.Type == linux.F_OWNER_PGRP { 632 return uintptr(-owner.PID), nil, nil 633 } 634 return uintptr(owner.PID), nil, nil 635 case linux.F_SETOWN: 636 who := args[2].Int() 637 ownerType := int32(linux.F_OWNER_PID) 638 if who < 0 { 639 // Check for overflow before flipping the sign. 640 if who-1 > who { 641 return 0, nil, linuxerr.EINVAL 642 } 643 ownerType = linux.F_OWNER_PGRP 644 who = -who 645 } 646 return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) 647 case linux.F_GETOWN_EX: 648 owner, hasOwner := getAsyncOwner(t, file) 649 if !hasOwner { 650 return 0, nil, nil 651 } 652 _, err := owner.CopyOut(t, args[2].Pointer()) 653 return 0, nil, err 654 case linux.F_SETOWN_EX: 655 var owner linux.FOwnerEx 656 _, err := owner.CopyIn(t, args[2].Pointer()) 657 if err != nil { 658 return 0, nil, err 659 } 660 return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID) 661 case linux.F_SETPIPE_SZ: 662 pipefile, ok := file.Impl().(*pipe.VFSPipeFD) 663 if !ok { 664 return 0, nil, linuxerr.EBADF 665 } 666 n, err := pipefile.SetPipeSize(int64(args[2].Int())) 667 if err != nil { 668 return 0, nil, err 669 } 670 return uintptr(n), nil, nil 671 case linux.F_GETPIPE_SZ: 672 pipefile, ok := file.Impl().(*pipe.VFSPipeFD) 673 if !ok { 674 return 0, nil, linuxerr.EBADF 675 } 676 return uintptr(pipefile.PipeSize()), nil, nil 677 case linux.F_GET_SEALS: 678 val, err := tmpfs.GetSeals(file) 679 return uintptr(val), nil, err 680 case linux.F_ADD_SEALS: 681 if !file.IsWritable() { 682 return 0, nil, linuxerr.EPERM 683 } 684 err := tmpfs.AddSeals(file, args[2].Uint()) 685 return 0, nil, err 686 case linux.F_SETLK: 687 return 0, nil, posixLock(t, args, file, false /* ofd */, false /* block */) 688 case linux.F_SETLKW: 689 return 0, nil, posixLock(t, args, file, false /* ofd */, true /* block */) 690 case linux.F_GETLK: 691 return 0, nil, posixTestLock(t, args, file, false /* ofd */) 692 case linux.F_OFD_SETLK: 693 return 0, nil, posixLock(t, args, file, true /* ofd */, false /* block */) 694 case linux.F_OFD_SETLKW: 695 return 0, nil, posixLock(t, args, file, true /* ofd */, true /* block */) 696 case linux.F_OFD_GETLK: 697 return 0, nil, posixTestLock(t, args, file, true /* ofd */) 698 case linux.F_GETSIG: 699 a := file.AsyncHandler() 700 if a == nil { 701 // Default behavior aka SIGIO. 702 return 0, nil, nil 703 } 704 return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil 705 case linux.F_SETSIG: 706 a, err := file.SetAsyncHandler(fasync.New(int(fd))) 707 if err != nil { 708 return 0, nil, err 709 } 710 async := a.(*fasync.FileAsync) 711 return 0, nil, async.SetSignal(linux.Signal(args[2].Int())) 712 default: 713 // Everything else is not yet supported. 714 return 0, nil, linuxerr.EINVAL 715 } 716 } 717 718 func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { 719 a := fd.AsyncHandler() 720 if a == nil { 721 return linux.FOwnerEx{}, false 722 } 723 724 ot, otg, opg := a.(*fasync.FileAsync).Owner() 725 switch { 726 case ot != nil: 727 return linux.FOwnerEx{ 728 Type: linux.F_OWNER_TID, 729 PID: int32(t.PIDNamespace().IDOfTask(ot)), 730 }, true 731 case otg != nil: 732 return linux.FOwnerEx{ 733 Type: linux.F_OWNER_PID, 734 PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), 735 }, true 736 case opg != nil: 737 return linux.FOwnerEx{ 738 Type: linux.F_OWNER_PGRP, 739 PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), 740 }, true 741 default: 742 return linux.FOwnerEx{}, true 743 } 744 } 745 746 func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error { 747 switch ownerType { 748 case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: 749 // Acceptable type. 750 default: 751 return linuxerr.EINVAL 752 } 753 754 a, err := file.SetAsyncHandler(fasync.New(fd)) 755 if err != nil { 756 return err 757 } 758 async := a.(*fasync.FileAsync) 759 if pid == 0 { 760 async.ClearOwner() 761 return nil 762 } 763 764 switch ownerType { 765 case linux.F_OWNER_TID: 766 task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) 767 if task == nil { 768 return linuxerr.ESRCH 769 } 770 async.SetOwnerTask(t, task) 771 return nil 772 case linux.F_OWNER_PID: 773 tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) 774 if tg == nil { 775 return linuxerr.ESRCH 776 } 777 async.SetOwnerThreadGroup(t, tg) 778 return nil 779 case linux.F_OWNER_PGRP: 780 pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) 781 if pg == nil { 782 return linuxerr.ESRCH 783 } 784 async.SetOwnerProcessGroup(t, pg) 785 return nil 786 default: 787 return linuxerr.EINVAL 788 } 789 } 790 791 func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool) error { 792 // Copy in the lock request. 793 flockAddr := args[2].Pointer() 794 var flock linux.Flock 795 if _, err := flock.CopyIn(t, flockAddr); err != nil { 796 return err 797 } 798 var typ lock.LockType 799 switch flock.Type { 800 case linux.F_RDLCK: 801 typ = lock.ReadLock 802 case linux.F_WRLCK: 803 typ = lock.WriteLock 804 default: 805 return linuxerr.EINVAL 806 } 807 r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) 808 if err != nil { 809 return err 810 } 811 uid := lock.UniqueID(t.FDTable()) 812 if ofd { 813 uid = lock.UniqueID(file) 814 } 815 816 newFlock, err := file.TestPOSIX(t, uid, typ, r) 817 if err != nil { 818 return err 819 } 820 if !ofd { 821 newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID) 822 } 823 if _, err = newFlock.CopyOut(t, flockAddr); err != nil { 824 return err 825 } 826 return nil 827 } 828 829 // translatePID translates a pid from one namespace to another. Note that this 830 // may race with task termination/creation, in which case the original task 831 // corresponding to pid may no longer exist. This is used to implement the 832 // F_GETLK fcntl, which has the same potential race in Linux as well (i.e., 833 // there is no synchronization between retrieving the lock PID and translating 834 // it). See fs/locks.c:posix_lock_to_flock. 835 func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 { 836 return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid)))) 837 } 838 839 func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool, block bool) error { 840 // Copy in the lock request. 841 flockAddr := args[2].Pointer() 842 var flock linux.Flock 843 if _, err := flock.CopyIn(t, flockAddr); err != nil { 844 return err 845 } 846 if ofd && flock.PID != 0 { 847 return linuxerr.EINVAL 848 } 849 850 uid := lock.UniqueID(t.FDTable()) 851 pid := int32(t.TGIDInRoot()) 852 if ofd { 853 uid = lock.UniqueID(file) 854 pid = -1 855 } 856 857 r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) 858 if err != nil { 859 return err 860 } 861 862 switch flock.Type { 863 case linux.F_RDLCK: 864 if !file.IsReadable() { 865 return linuxerr.EBADF 866 } 867 return file.LockPOSIX(t, uid, pid, lock.ReadLock, r, block) 868 869 case linux.F_WRLCK: 870 if !file.IsWritable() { 871 return linuxerr.EBADF 872 } 873 return file.LockPOSIX(t, uid, pid, lock.WriteLock, r, block) 874 875 case linux.F_UNLCK: 876 return file.UnlockPOSIX(t, uid, r) 877 878 default: 879 return linuxerr.EINVAL 880 } 881 } 882 883 // Fadvise64 implements fadvise64(2). 884 // This implementation currently ignores the provided advice. 885 func Fadvise64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 886 fd := args[0].Int() 887 length := args[2].Int64() 888 advice := args[3].Int() 889 890 // Note: offset is allowed to be negative. 891 if length < 0 { 892 return 0, nil, linuxerr.EINVAL 893 } 894 895 file := t.GetFile(fd) 896 if file == nil { 897 return 0, nil, linuxerr.EBADF 898 } 899 defer file.DecRef(t) 900 901 if file.StatusFlags()&linux.O_PATH != 0 { 902 return 0, nil, linuxerr.EBADF 903 } 904 905 // If the FD refers to a pipe or FIFO, return error. 906 if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { 907 return 0, nil, linuxerr.ESPIPE 908 } 909 910 switch advice { 911 case linux.POSIX_FADV_NORMAL: 912 case linux.POSIX_FADV_RANDOM: 913 case linux.POSIX_FADV_SEQUENTIAL: 914 case linux.POSIX_FADV_WILLNEED: 915 case linux.POSIX_FADV_DONTNEED: 916 case linux.POSIX_FADV_NOREUSE: 917 default: 918 return 0, nil, linuxerr.EINVAL 919 } 920 921 // Sure, whatever. 922 return 0, nil, nil 923 } 924 925 // Mkdir implements Linux syscall mkdir(2). 926 func Mkdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 927 addr := args[0].Pointer() 928 mode := args[1].ModeT() 929 return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) 930 } 931 932 // Mkdirat implements Linux syscall mkdirat(2). 933 func Mkdirat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 934 dirfd := args[0].Int() 935 addr := args[1].Pointer() 936 mode := args[2].ModeT() 937 return 0, nil, mkdirat(t, dirfd, addr, mode) 938 } 939 940 func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error { 941 path, err := copyInPath(t, addr) 942 if err != nil { 943 return err 944 } 945 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 946 if err != nil { 947 return err 948 } 949 defer tpop.Release(t) 950 return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ 951 Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), 952 }) 953 } 954 955 // Rmdir implements Linux syscall rmdir(2). 956 func Rmdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 957 pathAddr := args[0].Pointer() 958 return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) 959 } 960 961 func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { 962 path, err := copyInPath(t, pathAddr) 963 if err != nil { 964 return err 965 } 966 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 967 if err != nil { 968 return err 969 } 970 defer tpop.Release(t) 971 return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) 972 } 973 974 // Symlink implements Linux syscall symlink(2). 975 func Symlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 976 targetAddr := args[0].Pointer() 977 linkpathAddr := args[1].Pointer() 978 return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) 979 } 980 981 // Symlinkat implements Linux syscall symlinkat(2). 982 func Symlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 983 targetAddr := args[0].Pointer() 984 newdirfd := args[1].Int() 985 linkpathAddr := args[2].Pointer() 986 return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) 987 } 988 989 func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error { 990 target, err := t.CopyInString(targetAddr, linux.PATH_MAX) 991 if err != nil { 992 return err 993 } 994 if len(target) == 0 { 995 return linuxerr.ENOENT 996 } 997 linkpath, err := copyInPath(t, linkpathAddr) 998 if err != nil { 999 return err 1000 } 1001 tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) 1002 if err != nil { 1003 return err 1004 } 1005 defer tpop.Release(t) 1006 return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) 1007 } 1008 1009 // Link implements Linux syscall link(2). 1010 func Link(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1011 oldpathAddr := args[0].Pointer() 1012 newpathAddr := args[1].Pointer() 1013 return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) 1014 } 1015 1016 // Linkat implements Linux syscall linkat(2). 1017 func Linkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1018 olddirfd := args[0].Int() 1019 oldpathAddr := args[1].Pointer() 1020 newdirfd := args[2].Int() 1021 newpathAddr := args[3].Pointer() 1022 flags := args[4].Int() 1023 return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) 1024 } 1025 1026 func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error { 1027 if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { 1028 return linuxerr.EINVAL 1029 } 1030 if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { 1031 return linuxerr.ENOENT 1032 } 1033 1034 oldpath, err := copyInPath(t, oldpathAddr) 1035 if err != nil { 1036 return err 1037 } 1038 oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) 1039 if err != nil { 1040 return err 1041 } 1042 defer oldtpop.Release(t) 1043 1044 newpath, err := copyInPath(t, newpathAddr) 1045 if err != nil { 1046 return err 1047 } 1048 newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) 1049 if err != nil { 1050 return err 1051 } 1052 defer newtpop.Release(t) 1053 1054 return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) 1055 } 1056 1057 // Readlinkat implements Linux syscall readlinkat(2). 1058 func Readlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1059 dirfd := args[0].Int() 1060 pathAddr := args[1].Pointer() 1061 bufAddr := args[2].Pointer() 1062 size := args[3].SizeT() 1063 return readlinkat(t, dirfd, pathAddr, bufAddr, size) 1064 } 1065 1066 // Readlink implements Linux syscall readlink(2). 1067 func Readlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1068 pathAddr := args[0].Pointer() 1069 bufAddr := args[1].Pointer() 1070 size := args[2].SizeT() 1071 return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) 1072 } 1073 1074 func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { 1075 if int(size) <= 0 { 1076 return 0, nil, linuxerr.EINVAL 1077 } 1078 1079 path, err := copyInPath(t, pathAddr) 1080 if err != nil { 1081 return 0, nil, err 1082 } 1083 // "Since Linux 2.6.39, pathname can be an empty string, in which case the 1084 // call operates on the symbolic link referred to by dirfd ..." - 1085 // readlinkat(2) 1086 tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) 1087 if err != nil { 1088 return 0, nil, err 1089 } 1090 defer tpop.Release(t) 1091 1092 target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) 1093 if err != nil { 1094 return 0, nil, err 1095 } 1096 1097 if len(target) > int(size) { 1098 target = target[:size] 1099 } 1100 n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) 1101 if n == 0 { 1102 return 0, nil, err 1103 } 1104 return uintptr(n), nil, nil 1105 } 1106 1107 // Unlink implements Linux syscall unlink(2). 1108 func Unlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1109 pathAddr := args[0].Pointer() 1110 return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) 1111 } 1112 1113 func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { 1114 path, err := copyInPath(t, pathAddr) 1115 if err != nil { 1116 return err 1117 } 1118 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 1119 if err != nil { 1120 return err 1121 } 1122 defer tpop.Release(t) 1123 return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) 1124 } 1125 1126 // Unlinkat implements Linux syscall unlinkat(2). 1127 func Unlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1128 dirfd := args[0].Int() 1129 pathAddr := args[1].Pointer() 1130 flags := args[2].Int() 1131 1132 if flags&^linux.AT_REMOVEDIR != 0 { 1133 return 0, nil, linuxerr.EINVAL 1134 } 1135 1136 if flags&linux.AT_REMOVEDIR != 0 { 1137 return 0, nil, rmdirat(t, dirfd, pathAddr) 1138 } 1139 return 0, nil, unlinkat(t, dirfd, pathAddr) 1140 } 1141 1142 func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { 1143 root := t.FSContext().RootDirectory() 1144 defer root.DecRef(t) 1145 start := root 1146 if !path.Absolute { 1147 if !path.HasComponents() && !bool(shouldAllowEmptyPath) { 1148 return linuxerr.ENOENT 1149 } 1150 if dirfd == linux.AT_FDCWD { 1151 start = t.FSContext().WorkingDirectory() 1152 defer start.DecRef(t) 1153 } else { 1154 dirfile := t.GetFile(dirfd) 1155 if dirfile == nil { 1156 return linuxerr.EBADF 1157 } 1158 if !path.HasComponents() && dirfile.StatusFlags()&linux.O_PATH == 0 { 1159 // For empty path, use FileDescription.SetStat() instead of 1160 // VirtualFilesystem.SetStatAt(), since the former may be able to use 1161 // opened file state to expedite the SetStat. Skip this optimization 1162 // for FDs with O_PATH, since the FD impl always returns EBADF. 1163 err := dirfile.SetStat(t, *opts) 1164 dirfile.DecRef(t) 1165 return err 1166 } 1167 start = dirfile.VirtualDentry() 1168 start.IncRef() 1169 defer start.DecRef(t) 1170 dirfile.DecRef(t) 1171 } 1172 } 1173 return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ 1174 Root: root, 1175 Start: start, 1176 Path: path, 1177 FollowFinalSymlink: bool(shouldFollowFinalSymlink), 1178 }, opts) 1179 } 1180 1181 func handleSetSizeError(t *kernel.Task, err error) error { 1182 if err == linuxerr.ErrExceedsFileSizeLimit { 1183 // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). 1184 t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) 1185 return linuxerr.EFBIG 1186 } 1187 return err 1188 } 1189 1190 // Truncate implements Linux syscall truncate(2). 1191 func Truncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1192 addr := args[0].Pointer() 1193 length := args[1].Int64() 1194 1195 if length < 0 { 1196 return 0, nil, linuxerr.EINVAL 1197 } 1198 1199 path, err := copyInPath(t, addr) 1200 if err != nil { 1201 return 0, nil, err 1202 } 1203 1204 err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ 1205 Stat: linux.Statx{ 1206 Mask: linux.STATX_SIZE, 1207 Size: uint64(length), 1208 }, 1209 NeedWritePerm: true, 1210 }) 1211 return 0, nil, handleSetSizeError(t, err) 1212 } 1213 1214 // Ftruncate implements Linux syscall ftruncate(2). 1215 func Ftruncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1216 fd := args[0].Int() 1217 length := args[1].Int64() 1218 1219 if length < 0 { 1220 return 0, nil, linuxerr.EINVAL 1221 } 1222 1223 file := t.GetFile(fd) 1224 if file == nil { 1225 return 0, nil, linuxerr.EBADF 1226 } 1227 defer file.DecRef(t) 1228 1229 if !file.IsWritable() { 1230 return 0, nil, linuxerr.EINVAL 1231 } 1232 1233 err := file.SetStat(t, vfs.SetStatOptions{ 1234 Stat: linux.Statx{ 1235 Mask: linux.STATX_SIZE, 1236 Size: uint64(length), 1237 }, 1238 }) 1239 return 0, nil, handleSetSizeError(t, err) 1240 } 1241 1242 // Umask implements linux syscall umask(2). 1243 func Umask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1244 mask := args[0].ModeT() 1245 mask = t.FSContext().SwapUmask(mask & 0777) 1246 return uintptr(mask), nil, nil 1247 } 1248 1249 // Chown implements Linux syscall chown(2). 1250 func Chown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1251 pathAddr := args[0].Pointer() 1252 owner := args[1].Int() 1253 group := args[2].Int() 1254 return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) 1255 } 1256 1257 // Lchown implements Linux syscall lchown(2). 1258 func Lchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1259 pathAddr := args[0].Pointer() 1260 owner := args[1].Int() 1261 group := args[2].Int() 1262 return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) 1263 } 1264 1265 // Fchownat implements Linux syscall fchownat(2). 1266 func Fchownat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1267 dirfd := args[0].Int() 1268 pathAddr := args[1].Pointer() 1269 owner := args[2].Int() 1270 group := args[3].Int() 1271 flags := args[4].Int() 1272 return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) 1273 } 1274 1275 func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error { 1276 if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { 1277 return linuxerr.EINVAL 1278 } 1279 1280 path, err := copyInPath(t, pathAddr) 1281 if err != nil { 1282 return err 1283 } 1284 1285 var opts vfs.SetStatOptions 1286 if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { 1287 return err 1288 } 1289 1290 return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) 1291 } 1292 1293 func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { 1294 userns := t.UserNamespace() 1295 if owner != -1 { 1296 kuid := userns.MapToKUID(auth.UID(owner)) 1297 if !kuid.Ok() { 1298 return linuxerr.EINVAL 1299 } 1300 opts.Stat.Mask |= linux.STATX_UID 1301 opts.Stat.UID = uint32(kuid) 1302 } 1303 if group != -1 { 1304 kgid := userns.MapToKGID(auth.GID(group)) 1305 if !kgid.Ok() { 1306 return linuxerr.EINVAL 1307 } 1308 opts.Stat.Mask |= linux.STATX_GID 1309 opts.Stat.GID = uint32(kgid) 1310 } 1311 return nil 1312 } 1313 1314 // Fchown implements Linux syscall fchown(2). 1315 func Fchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1316 fd := args[0].Int() 1317 owner := args[1].Int() 1318 group := args[2].Int() 1319 1320 file := t.GetFile(fd) 1321 if file == nil { 1322 return 0, nil, linuxerr.EBADF 1323 } 1324 defer file.DecRef(t) 1325 1326 var opts vfs.SetStatOptions 1327 if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { 1328 return 0, nil, err 1329 } 1330 return 0, nil, file.SetStat(t, opts) 1331 } 1332 1333 const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX 1334 1335 // Chmod implements Linux syscall chmod(2). 1336 func Chmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1337 pathAddr := args[0].Pointer() 1338 mode := args[1].ModeT() 1339 return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) 1340 } 1341 1342 // Fchmodat implements Linux syscall fchmodat(2). 1343 func Fchmodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1344 dirfd := args[0].Int() 1345 pathAddr := args[1].Pointer() 1346 mode := args[2].ModeT() 1347 return 0, nil, fchmodat(t, dirfd, pathAddr, mode) 1348 } 1349 1350 func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error { 1351 path, err := copyInPath(t, pathAddr) 1352 if err != nil { 1353 return err 1354 } 1355 1356 return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ 1357 Stat: linux.Statx{ 1358 Mask: linux.STATX_MODE, 1359 Mode: uint16(mode & chmodMask), 1360 }, 1361 }) 1362 } 1363 1364 // Fchmod implements Linux syscall fchmod(2). 1365 func Fchmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1366 fd := args[0].Int() 1367 mode := args[1].ModeT() 1368 1369 file := t.GetFile(fd) 1370 if file == nil { 1371 return 0, nil, linuxerr.EBADF 1372 } 1373 defer file.DecRef(t) 1374 1375 return 0, nil, file.SetStat(t, vfs.SetStatOptions{ 1376 Stat: linux.Statx{ 1377 Mask: linux.STATX_MODE, 1378 Mode: uint16(mode & chmodMask), 1379 }, 1380 }) 1381 } 1382 1383 // Utime implements Linux syscall utime(2). 1384 func Utime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1385 pathAddr := args[0].Pointer() 1386 timesAddr := args[1].Pointer() 1387 1388 opts := vfs.SetStatOptions{ 1389 Stat: linux.Statx{ 1390 Mask: linux.STATX_ATIME | linux.STATX_MTIME, 1391 }, 1392 } 1393 if timesAddr == 0 { 1394 opts.Stat.Atime.Nsec = linux.UTIME_NOW 1395 opts.Stat.Mtime.Nsec = linux.UTIME_NOW 1396 } else { 1397 var times linux.Utime 1398 if _, err := times.CopyIn(t, timesAddr); err != nil { 1399 return 0, nil, err 1400 } 1401 opts.Stat.Atime.Sec = times.Actime 1402 opts.Stat.Mtime.Sec = times.Modtime 1403 } 1404 1405 return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts) 1406 } 1407 1408 // Utimes implements Linux syscall utimes(2). 1409 func Utimes(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1410 pathAddr := args[0].Pointer() 1411 timesAddr := args[1].Pointer() 1412 1413 var opts vfs.SetStatOptions 1414 if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { 1415 return 0, nil, err 1416 } 1417 1418 return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts) 1419 } 1420 1421 // Futimesat implements Linux syscall futimesat(2). 1422 func Futimesat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1423 dirfd := args[0].Int() 1424 pathAddr := args[1].Pointer() 1425 timesAddr := args[2].Pointer() 1426 1427 var opts vfs.SetStatOptions 1428 if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { 1429 return 0, nil, err 1430 } 1431 1432 return 0, nil, utimes(t, dirfd, pathAddr, followFinalSymlink, &opts) 1433 } 1434 1435 func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { 1436 if timesAddr == 0 { 1437 opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME 1438 opts.Stat.Atime.Nsec = linux.UTIME_NOW 1439 opts.Stat.Mtime.Nsec = linux.UTIME_NOW 1440 return nil 1441 } 1442 var times [2]linux.Timeval 1443 if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { 1444 return err 1445 } 1446 if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { 1447 return linuxerr.EINVAL 1448 } 1449 opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME 1450 opts.Stat.Atime = linux.StatxTimestamp{ 1451 Sec: times[0].Sec, 1452 Nsec: uint32(times[0].Usec * 1000), 1453 } 1454 opts.Stat.Mtime = linux.StatxTimestamp{ 1455 Sec: times[1].Sec, 1456 Nsec: uint32(times[1].Usec * 1000), 1457 } 1458 return nil 1459 } 1460 1461 // Utimensat implements Linux syscall utimensat(2). 1462 func Utimensat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1463 dirfd := args[0].Int() 1464 pathAddr := args[1].Pointer() 1465 timesAddr := args[2].Pointer() 1466 flags := args[3].Int() 1467 1468 // Linux requires that the UTIME_OMIT check occur before flags. 1469 var opts vfs.SetStatOptions 1470 if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { 1471 return 0, nil, err 1472 } 1473 if opts.Stat.Mask == 0 { 1474 return 0, nil, nil 1475 } 1476 1477 if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { 1478 return 0, nil, linuxerr.EINVAL 1479 } 1480 1481 return 0, nil, utimes(t, dirfd, pathAddr, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) 1482 } 1483 1484 func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { 1485 if timesAddr == 0 { 1486 opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME 1487 opts.Stat.Atime.Nsec = linux.UTIME_NOW 1488 opts.Stat.Mtime.Nsec = linux.UTIME_NOW 1489 return nil 1490 } 1491 var times [2]linux.Timespec 1492 if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { 1493 return err 1494 } 1495 if times[0].Nsec != linux.UTIME_OMIT { 1496 if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { 1497 return linuxerr.EINVAL 1498 } 1499 opts.Stat.Mask |= linux.STATX_ATIME 1500 opts.Stat.Atime = linux.StatxTimestamp{ 1501 Sec: times[0].Sec, 1502 Nsec: uint32(times[0].Nsec), 1503 } 1504 } 1505 if times[1].Nsec != linux.UTIME_OMIT { 1506 if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { 1507 return linuxerr.EINVAL 1508 } 1509 opts.Stat.Mask |= linux.STATX_MTIME 1510 opts.Stat.Mtime = linux.StatxTimestamp{ 1511 Sec: times[1].Sec, 1512 Nsec: uint32(times[1].Nsec), 1513 } 1514 } 1515 return nil 1516 } 1517 1518 // Analogous to fs/utimes.c:do_utimes(). 1519 func utimes(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { 1520 // "If filename is NULL and dfd refers to an open file, then operate on the 1521 // file. Otherwise look up filename, possibly using dfd as a starting 1522 // point." - fs/utimes.c:do_utimes() 1523 if dirfd != linux.AT_FDCWD && pathAddr == 0 { 1524 file := t.GetFile(dirfd) 1525 if file == nil { 1526 return linuxerr.EBADF 1527 } 1528 defer file.DecRef(t) 1529 return file.SetStat(t, *opts) 1530 } 1531 1532 path, err := copyInPath(t, pathAddr) 1533 if err != nil { 1534 return err 1535 } 1536 return setstatat(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink, opts) 1537 } 1538 1539 // Rename implements Linux syscall rename(2). 1540 func Rename(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1541 oldpathAddr := args[0].Pointer() 1542 newpathAddr := args[1].Pointer() 1543 return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) 1544 } 1545 1546 // Renameat implements Linux syscall renameat(2). 1547 func Renameat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1548 olddirfd := args[0].Int() 1549 oldpathAddr := args[1].Pointer() 1550 newdirfd := args[2].Int() 1551 newpathAddr := args[3].Pointer() 1552 return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) 1553 } 1554 1555 // Renameat2 implements Linux syscall renameat2(2). 1556 func Renameat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1557 olddirfd := args[0].Int() 1558 oldpathAddr := args[1].Pointer() 1559 newdirfd := args[2].Int() 1560 newpathAddr := args[3].Pointer() 1561 flags := args[4].Uint() 1562 return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) 1563 } 1564 1565 func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error { 1566 oldpath, err := copyInPath(t, oldpathAddr) 1567 if err != nil { 1568 return err 1569 } 1570 // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) 1571 oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) 1572 if err != nil { 1573 return err 1574 } 1575 defer oldtpop.Release(t) 1576 1577 newpath, err := copyInPath(t, newpathAddr) 1578 if err != nil { 1579 return err 1580 } 1581 newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) 1582 if err != nil { 1583 return err 1584 } 1585 defer newtpop.Release(t) 1586 1587 return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ 1588 Flags: flags, 1589 }) 1590 } 1591 1592 // Fallocate implements linux system call fallocate(2). 1593 func Fallocate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1594 fd := args[0].Int() 1595 mode := args[1].Uint64() 1596 offset := args[2].Int64() 1597 length := args[3].Int64() 1598 1599 file := t.GetFile(fd) 1600 if file == nil { 1601 return 0, nil, linuxerr.EBADF 1602 } 1603 defer file.DecRef(t) 1604 1605 if !file.IsWritable() { 1606 return 0, nil, linuxerr.EBADF 1607 } 1608 if mode != 0 { 1609 return 0, nil, linuxerr.ENOTSUP 1610 } 1611 if offset < 0 || length <= 0 { 1612 return 0, nil, linuxerr.EINVAL 1613 } 1614 1615 size := offset + length 1616 if size < 0 { 1617 return 0, nil, linuxerr.EFBIG 1618 } 1619 limit := limits.FromContext(t).Get(limits.FileSize).Cur 1620 if uint64(size) >= limit { 1621 t.SendSignal(&linux.SignalInfo{ 1622 Signo: int32(linux.SIGXFSZ), 1623 Code: linux.SI_USER, 1624 }) 1625 return 0, nil, linuxerr.EFBIG 1626 } 1627 1628 return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) 1629 } 1630 1631 // Flock implements linux syscall flock(2). 1632 func Flock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1633 fd := args[0].Int() 1634 operation := args[1].Int() 1635 1636 file := t.GetFile(fd) 1637 if file == nil { 1638 // flock(2): EBADF fd is not an open file descriptor. 1639 return 0, nil, linuxerr.EBADF 1640 } 1641 defer file.DecRef(t) 1642 1643 nonblocking := operation&linux.LOCK_NB != 0 1644 operation &^= linux.LOCK_NB 1645 1646 switch operation { 1647 case linux.LOCK_EX: 1648 if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, !nonblocking /* block */); err != nil { 1649 return 0, nil, err 1650 } 1651 case linux.LOCK_SH: 1652 if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, !nonblocking /* block */); err != nil { 1653 return 0, nil, err 1654 } 1655 case linux.LOCK_UN: 1656 if err := file.UnlockBSD(t); err != nil { 1657 return 0, nil, err 1658 } 1659 default: 1660 // flock(2): EINVAL operation is invalid. 1661 return 0, nil, linuxerr.EINVAL 1662 } 1663 1664 return 0, nil, nil 1665 } 1666 1667 const ( 1668 memfdPrefix = "memfd:" 1669 memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) 1670 memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) 1671 ) 1672 1673 // MemfdCreate implements the linux syscall memfd_create(2). 1674 func MemfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1675 addr := args[0].Pointer() 1676 flags := args[1].Uint() 1677 1678 if flags&^memfdAllFlags != 0 { 1679 // Unknown bits in flags. 1680 return 0, nil, linuxerr.EINVAL 1681 } 1682 1683 allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 1684 cloExec := flags&linux.MFD_CLOEXEC != 0 1685 1686 name, err := t.CopyInString(addr, memfdMaxNameLen) 1687 if err != nil { 1688 return 0, nil, err 1689 } 1690 1691 shmMount := t.Kernel().ShmMount() 1692 file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) 1693 if err != nil { 1694 return 0, nil, err 1695 } 1696 defer file.DecRef(t) 1697 1698 fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ 1699 CloseOnExec: cloExec, 1700 }) 1701 if err != nil { 1702 return 0, nil, err 1703 } 1704 1705 return uintptr(fd), nil, nil 1706 }