github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/syscalls/linux/sys_file.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "math" 19 20 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 21 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 22 "github.com/nicocha30/gvisor-ligolo/pkg/fspath" 23 "github.com/nicocha30/gvisor-ligolo/pkg/gohacks" 24 "github.com/nicocha30/gvisor-ligolo/pkg/hostarch" 25 "github.com/nicocha30/gvisor-ligolo/pkg/marshal/primitive" 26 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch" 27 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/lock" 28 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs" 29 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel" 30 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 31 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/fasync" 32 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/pipe" 33 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits" 34 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 35 ) 36 37 // Mknod implements Linux syscall mknod(2). 38 func Mknod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 39 addr := args[0].Pointer() 40 mode := args[1].ModeT() 41 dev := args[2].Uint() 42 return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) 43 } 44 45 // Mknodat implements Linux syscall mknodat(2). 46 func Mknodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 47 dirfd := args[0].Int() 48 addr := args[1].Pointer() 49 mode := args[2].ModeT() 50 dev := args[3].Uint() 51 return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) 52 } 53 54 func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error { 55 path, err := copyInPath(t, addr) 56 if err != nil { 57 return err 58 } 59 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 60 if err != nil { 61 return err 62 } 63 defer tpop.Release(t) 64 65 // "Zero file type is equivalent to type S_IFREG." - mknod(2) 66 if mode.FileType() == 0 { 67 mode |= linux.ModeRegular 68 } 69 major, minor := linux.DecodeDeviceID(dev) 70 return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ 71 Mode: mode &^ linux.FileMode(t.FSContext().Umask()), 72 DevMajor: uint32(major), 73 DevMinor: minor, 74 }) 75 } 76 77 // Open implements Linux syscall open(2). 78 func Open(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 79 addr := args[0].Pointer() 80 flags := args[1].Uint() 81 mode := args[2].ModeT() 82 return openat(t, linux.AT_FDCWD, addr, flags, mode) 83 } 84 85 // Openat implements Linux syscall openat(2). 86 func Openat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 87 dirfd := args[0].Int() 88 addr := args[1].Pointer() 89 flags := args[2].Uint() 90 mode := args[3].ModeT() 91 return openat(t, dirfd, addr, flags, mode) 92 } 93 94 // Creat implements Linux syscall creat(2). 95 func Creat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 96 addr := args[0].Pointer() 97 mode := args[1].ModeT() 98 return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) 99 } 100 101 func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { 102 path, err := copyInPath(t, pathAddr) 103 if err != nil { 104 return 0, nil, err 105 } 106 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) 107 if err != nil { 108 return 0, nil, err 109 } 110 defer tpop.Release(t) 111 112 file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ 113 Flags: flags | linux.O_LARGEFILE, 114 Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), 115 }) 116 if err != nil { 117 return 0, nil, err 118 } 119 defer file.DecRef(t) 120 121 fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ 122 CloseOnExec: flags&linux.O_CLOEXEC != 0, 123 }) 124 return uintptr(fd), nil, err 125 } 126 127 // Access implements Linux syscall access(2). 128 func Access(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 129 addr := args[0].Pointer() 130 mode := args[1].ModeT() 131 132 return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode, 0 /* flags */) 133 } 134 135 // Faccessat implements Linux syscall faccessat(2). 136 func Faccessat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 137 dirfd := args[0].Int() 138 addr := args[1].Pointer() 139 mode := args[2].ModeT() 140 141 return 0, nil, accessAt(t, dirfd, addr, mode, 0 /* flags */) 142 } 143 144 // Faccessat2 implements Linux syscall faccessat2(2). 145 func Faccessat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 146 dirfd := args[0].Int() 147 addr := args[1].Pointer() 148 mode := args[2].ModeT() 149 flags := args[3].Int() 150 151 return 0, nil, accessAt(t, dirfd, addr, mode, flags) 152 } 153 154 func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint, flags int32) error { 155 const rOK = 4 156 const wOK = 2 157 const xOK = 1 158 159 // Sanity check the mode. 160 if mode&^(rOK|wOK|xOK) != 0 { 161 return linuxerr.EINVAL 162 } 163 164 // faccessat2(2) isn't documented as supporting AT_EMPTY_PATH, but it does. 165 if flags&^(linux.AT_EACCESS|linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH) != 0 { 166 return linuxerr.EINVAL 167 } 168 169 path, err := copyInPath(t, pathAddr) 170 if err != nil { 171 return err 172 } 173 tpop, err := getTaskPathOperation(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)) 174 if err != nil { 175 return err 176 } 177 defer tpop.Release(t) 178 179 creds := t.Credentials() 180 if flags&linux.AT_EACCESS == 0 { 181 // access(2) and faccessat(2) check permissions using real 182 // UID/GID, not effective UID/GID. 183 // 184 // "access() needs to use the real uid/gid, not the effective 185 // uid/gid. We do this by temporarily clearing all FS-related 186 // capabilities and switching the fsuid/fsgid around to the 187 // real ones." -fs/open.c:faccessat 188 creds = creds.Fork() 189 creds.EffectiveKUID = creds.RealKUID 190 creds.EffectiveKGID = creds.RealKGID 191 if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { 192 creds.EffectiveCaps = creds.PermittedCaps 193 } else { 194 creds.EffectiveCaps = 0 195 } 196 } 197 198 return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop) 199 } 200 201 // Ioctl implements Linux syscall ioctl(2). 202 func Ioctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 203 fd := args[0].Int() 204 205 file := t.GetFile(fd) 206 if file == nil { 207 return 0, nil, linuxerr.EBADF 208 } 209 defer file.DecRef(t) 210 211 if file.StatusFlags()&linux.O_PATH != 0 { 212 return 0, nil, linuxerr.EBADF 213 } 214 215 // Handle ioctls that apply to all FDs. 216 switch args[1].Int() { 217 case linux.FIONCLEX: 218 t.FDTable().SetFlags(t, fd, kernel.FDFlags{ 219 CloseOnExec: false, 220 }) 221 return 0, nil, nil 222 223 case linux.FIOCLEX: 224 t.FDTable().SetFlags(t, fd, kernel.FDFlags{ 225 CloseOnExec: true, 226 }) 227 return 0, nil, nil 228 229 case linux.FIONBIO: 230 var set int32 231 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { 232 return 0, nil, err 233 } 234 flags := file.StatusFlags() 235 if set != 0 { 236 flags |= linux.O_NONBLOCK 237 } else { 238 flags &^= linux.O_NONBLOCK 239 } 240 return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) 241 242 case linux.FIOASYNC: 243 var set int32 244 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { 245 return 0, nil, err 246 } 247 flags := file.StatusFlags() 248 if set != 0 { 249 flags |= linux.O_ASYNC 250 } else { 251 flags &^= linux.O_ASYNC 252 } 253 file.SetStatusFlags(t, t.Credentials(), flags) 254 return 0, nil, nil 255 256 case linux.FIOGETOWN, linux.SIOCGPGRP: 257 var who int32 258 owner, hasOwner := getAsyncOwner(t, file) 259 if hasOwner { 260 if owner.Type == linux.F_OWNER_PGRP { 261 who = -owner.PID 262 } else { 263 who = owner.PID 264 } 265 } 266 _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who) 267 return 0, nil, err 268 269 case linux.FIOSETOWN, linux.SIOCSPGRP: 270 var who int32 271 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil { 272 return 0, nil, err 273 } 274 ownerType := int32(linux.F_OWNER_PID) 275 if who < 0 { 276 // Check for overflow before flipping the sign. 277 if who-1 > who { 278 return 0, nil, linuxerr.EINVAL 279 } 280 ownerType = linux.F_OWNER_PGRP 281 who = -who 282 } 283 return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) 284 } 285 286 ret, err := file.Ioctl(t, t.MemoryManager(), sysno, args) 287 return ret, nil, err 288 } 289 290 // Getcwd implements Linux syscall getcwd(2). 291 func Getcwd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 292 addr := args[0].Pointer() 293 size := args[1].SizeT() 294 295 root := t.FSContext().RootDirectory() 296 wd := t.FSContext().WorkingDirectory() 297 s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) 298 root.DecRef(t) 299 wd.DecRef(t) 300 if err != nil { 301 return 0, nil, err 302 } 303 304 // Note this is >= because we need a terminator. 305 if uint(len(s)) >= size { 306 return 0, nil, linuxerr.ERANGE 307 } 308 309 // Construct a byte slice containing a NUL terminator. 310 buf := t.CopyScratchBuffer(len(s) + 1) 311 copy(buf, s) 312 buf[len(buf)-1] = 0 313 314 // Write the pathname slice. 315 n, err := t.CopyOutBytes(addr, buf) 316 if err != nil { 317 return 0, nil, err 318 } 319 return uintptr(n), nil, nil 320 } 321 322 // Chdir implements Linux syscall chdir(2). 323 func Chdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 324 addr := args[0].Pointer() 325 326 path, err := copyInPath(t, addr) 327 if err != nil { 328 return 0, nil, err 329 } 330 tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) 331 if err != nil { 332 return 0, nil, err 333 } 334 defer tpop.Release(t) 335 336 vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ 337 CheckSearchable: true, 338 }) 339 if err != nil { 340 return 0, nil, err 341 } 342 t.FSContext().SetWorkingDirectory(t, vd) 343 vd.DecRef(t) 344 return 0, nil, nil 345 } 346 347 // Fchdir implements Linux syscall fchdir(2). 348 func Fchdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 349 fd := args[0].Int() 350 351 tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) 352 if err != nil { 353 return 0, nil, err 354 } 355 defer tpop.Release(t) 356 357 vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ 358 CheckSearchable: true, 359 }) 360 if err != nil { 361 return 0, nil, err 362 } 363 t.FSContext().SetWorkingDirectory(t, vd) 364 vd.DecRef(t) 365 return 0, nil, nil 366 } 367 368 // Chroot implements Linux syscall chroot(2). 369 func Chroot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 370 addr := args[0].Pointer() 371 372 if !t.HasCapability(linux.CAP_SYS_CHROOT) { 373 return 0, nil, linuxerr.EPERM 374 } 375 376 path, err := copyInPath(t, addr) 377 if err != nil { 378 return 0, nil, err 379 } 380 tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) 381 if err != nil { 382 return 0, nil, err 383 } 384 defer tpop.Release(t) 385 386 vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ 387 CheckSearchable: true, 388 }) 389 if err != nil { 390 return 0, nil, err 391 } 392 t.FSContext().SetRootDirectory(t, vd) 393 vd.DecRef(t) 394 return 0, nil, nil 395 } 396 397 // PivotRoot implements Linux syscall pivot_root(2). 398 func PivotRoot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 399 addr1 := args[0].Pointer() 400 addr2 := args[1].Pointer() 401 402 if !t.HasCapability(linux.CAP_SYS_ADMIN) { 403 return 0, nil, linuxerr.EPERM 404 } 405 406 newRootPath, err := copyInPath(t, addr1) 407 if err != nil { 408 return 0, nil, err 409 } 410 newRootTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, newRootPath, disallowEmptyPath, followFinalSymlink) 411 if err != nil { 412 return 0, nil, err 413 } 414 defer newRootTpop.Release(t) 415 putOldPath, err := copyInPath(t, addr2) 416 if err != nil { 417 return 0, nil, err 418 } 419 putOldTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, putOldPath, disallowEmptyPath, followFinalSymlink) 420 if err != nil { 421 return 0, nil, err 422 } 423 defer putOldTpop.Release(t) 424 425 oldRootVd := t.FSContext().RootDirectory() 426 defer oldRootVd.DecRef(t) 427 newRootVd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &newRootTpop.pop, &vfs.GetDentryOptions{ 428 CheckSearchable: true, 429 }) 430 if err != nil { 431 return 0, nil, err 432 } 433 defer newRootVd.DecRef(t) 434 435 if err := t.Kernel().VFS().PivotRoot(t, t.Credentials(), &newRootTpop.pop, &putOldTpop.pop); err != nil { 436 return 0, nil, err 437 } 438 t.Kernel().ReplaceFSContextRoots(t, oldRootVd, newRootVd) 439 return 0, nil, nil 440 } 441 442 // Close implements Linux syscall close(2). 443 func Close(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 444 fd := args[0].Int() 445 446 // Note that Remove provides a reference on the file that we may use to 447 // flush. It is still active until we drop the final reference below 448 // (and other reference-holding operations complete). 449 file := t.FDTable().Remove(t, fd) 450 if file == nil { 451 return 0, nil, linuxerr.EBADF 452 } 453 defer file.DecRef(t) 454 455 err := file.OnClose(t) 456 return 0, nil, HandleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file) 457 } 458 459 // CloseRange implements linux syscall close_range(2). 460 func CloseRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 461 first := args[0].Uint() 462 last := args[1].Uint() 463 flags := args[2].Uint() 464 465 if (first > last) || (last > math.MaxInt32) { 466 return 0, nil, linuxerr.EINVAL 467 } 468 469 if (flags & ^(linux.CLOSE_RANGE_CLOEXEC | linux.CLOSE_RANGE_UNSHARE)) != 0 { 470 return 0, nil, linuxerr.EINVAL 471 } 472 473 cloexec := flags & linux.CLOSE_RANGE_CLOEXEC 474 unshare := flags & linux.CLOSE_RANGE_UNSHARE 475 476 if unshare != 0 { 477 // If possible, we don't want to copy FDs to the new unshared table, because those FDs will 478 // be promptly closed and no longer used. So in the case where we know the range extends all 479 // the way to the end of the FdTable, we can simply copy the FdTable only up to the start of 480 // the range that we are closing. 481 if cloexec == 0 && int32(last) >= t.FDTable().GetLastFd() { 482 t.UnshareFdTable(int32(first)) 483 } else { 484 t.UnshareFdTable(math.MaxInt32) 485 } 486 } 487 488 if cloexec != 0 { 489 flagToApply := kernel.FDFlags{ 490 CloseOnExec: true, 491 } 492 t.FDTable().SetFlagsForRange(t.AsyncContext(), int32(first), int32(last), flagToApply) 493 return 0, nil, nil 494 } 495 496 fdTable := t.FDTable() 497 fd := int32(first) 498 for { 499 fd, file := fdTable.RemoveNextInRange(t, fd, int32(last)) 500 if file == nil { 501 break 502 } 503 504 fd++ 505 // Per the close_range(2) documentation, errors upon closing file descriptors are ignored. 506 _ = file.OnClose(t) 507 file.DecRef(t) 508 } 509 510 return 0, nil, nil 511 } 512 513 // Dup implements Linux syscall dup(2). 514 func Dup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 515 fd := args[0].Int() 516 517 file := t.GetFile(fd) 518 if file == nil { 519 return 0, nil, linuxerr.EBADF 520 } 521 defer file.DecRef(t) 522 523 newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) 524 if err != nil { 525 return 0, nil, linuxerr.EMFILE 526 } 527 return uintptr(newFD), nil, nil 528 } 529 530 // Dup2 implements Linux syscall dup2(2). 531 func Dup2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 532 oldfd := args[0].Int() 533 newfd := args[1].Int() 534 535 if oldfd == newfd { 536 // As long as oldfd is valid, dup2() does nothing and returns newfd. 537 file := t.GetFile(oldfd) 538 if file == nil { 539 return 0, nil, linuxerr.EBADF 540 } 541 file.DecRef(t) 542 return uintptr(newfd), nil, nil 543 } 544 545 return dup3(t, oldfd, newfd, 0) 546 } 547 548 // Dup3 implements Linux syscall dup3(2). 549 func Dup3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 550 oldfd := args[0].Int() 551 newfd := args[1].Int() 552 flags := args[2].Uint() 553 554 if oldfd == newfd { 555 return 0, nil, linuxerr.EINVAL 556 } 557 558 return dup3(t, oldfd, newfd, flags) 559 } 560 561 func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { 562 if flags&^linux.O_CLOEXEC != 0 { 563 return 0, nil, linuxerr.EINVAL 564 } 565 566 file := t.GetFile(oldfd) 567 if file == nil { 568 return 0, nil, linuxerr.EBADF 569 } 570 defer file.DecRef(t) 571 572 err := t.NewFDAt(newfd, file, kernel.FDFlags{ 573 CloseOnExec: flags&linux.O_CLOEXEC != 0, 574 }) 575 if err != nil { 576 return 0, nil, err 577 } 578 return uintptr(newfd), nil, nil 579 } 580 581 // Fcntl implements linux syscall fcntl(2). 582 func Fcntl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 583 fd := args[0].Int() 584 cmd := args[1].Int() 585 586 file, flags := t.FDTable().Get(fd) 587 if file == nil { 588 return 0, nil, linuxerr.EBADF 589 } 590 defer file.DecRef(t) 591 592 if file.StatusFlags()&linux.O_PATH != 0 { 593 switch cmd { 594 case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: 595 // allowed 596 default: 597 return 0, nil, linuxerr.EBADF 598 } 599 } 600 601 switch cmd { 602 case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: 603 minfd := args[2].Int() 604 fd, err := t.NewFDFrom(minfd, file, kernel.FDFlags{ 605 CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, 606 }) 607 if err != nil { 608 return 0, nil, err 609 } 610 return uintptr(fd), nil, nil 611 case linux.F_GETFD: 612 return uintptr(flags.ToLinuxFDFlags()), nil, nil 613 case linux.F_SETFD: 614 flags := args[2].Uint() 615 err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{ 616 CloseOnExec: flags&linux.FD_CLOEXEC != 0, 617 }) 618 return 0, nil, err 619 case linux.F_GETFL: 620 return uintptr(file.StatusFlags()), nil, nil 621 case linux.F_SETFL: 622 return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) 623 case linux.F_GETOWN: 624 owner, hasOwner := getAsyncOwner(t, file) 625 if !hasOwner { 626 return 0, nil, nil 627 } 628 if owner.Type == linux.F_OWNER_PGRP { 629 return uintptr(-owner.PID), nil, nil 630 } 631 return uintptr(owner.PID), nil, nil 632 case linux.F_SETOWN: 633 who := args[2].Int() 634 ownerType := int32(linux.F_OWNER_PID) 635 if who < 0 { 636 // Check for overflow before flipping the sign. 637 if who-1 > who { 638 return 0, nil, linuxerr.EINVAL 639 } 640 ownerType = linux.F_OWNER_PGRP 641 who = -who 642 } 643 return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) 644 case linux.F_GETOWN_EX: 645 owner, hasOwner := getAsyncOwner(t, file) 646 if !hasOwner { 647 return 0, nil, nil 648 } 649 _, err := owner.CopyOut(t, args[2].Pointer()) 650 return 0, nil, err 651 case linux.F_SETOWN_EX: 652 var owner linux.FOwnerEx 653 _, err := owner.CopyIn(t, args[2].Pointer()) 654 if err != nil { 655 return 0, nil, err 656 } 657 return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID) 658 case linux.F_SETPIPE_SZ: 659 pipefile, ok := file.Impl().(*pipe.VFSPipeFD) 660 if !ok { 661 return 0, nil, linuxerr.EBADF 662 } 663 n, err := pipefile.SetPipeSize(int64(args[2].Int())) 664 if err != nil { 665 return 0, nil, err 666 } 667 return uintptr(n), nil, nil 668 case linux.F_GETPIPE_SZ: 669 pipefile, ok := file.Impl().(*pipe.VFSPipeFD) 670 if !ok { 671 return 0, nil, linuxerr.EBADF 672 } 673 return uintptr(pipefile.PipeSize()), nil, nil 674 case linux.F_GET_SEALS: 675 val, err := tmpfs.GetSeals(file) 676 return uintptr(val), nil, err 677 case linux.F_ADD_SEALS: 678 if !file.IsWritable() { 679 return 0, nil, linuxerr.EPERM 680 } 681 err := tmpfs.AddSeals(file, args[2].Uint()) 682 return 0, nil, err 683 case linux.F_SETLK: 684 return 0, nil, posixLock(t, args, file, false /* ofd */, false /* block */) 685 case linux.F_SETLKW: 686 return 0, nil, posixLock(t, args, file, false /* ofd */, true /* block */) 687 case linux.F_GETLK: 688 return 0, nil, posixTestLock(t, args, file, false /* ofd */) 689 case linux.F_OFD_SETLK: 690 return 0, nil, posixLock(t, args, file, true /* ofd */, false /* block */) 691 case linux.F_OFD_SETLKW: 692 return 0, nil, posixLock(t, args, file, true /* ofd */, true /* block */) 693 case linux.F_OFD_GETLK: 694 return 0, nil, posixTestLock(t, args, file, true /* ofd */) 695 case linux.F_GETSIG: 696 a := file.AsyncHandler() 697 if a == nil { 698 // Default behavior aka SIGIO. 699 return 0, nil, nil 700 } 701 return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil 702 case linux.F_SETSIG: 703 a, err := file.SetAsyncHandler(fasync.New(int(fd))) 704 if err != nil { 705 return 0, nil, err 706 } 707 async := a.(*fasync.FileAsync) 708 return 0, nil, async.SetSignal(linux.Signal(args[2].Int())) 709 default: 710 // Everything else is not yet supported. 711 return 0, nil, linuxerr.EINVAL 712 } 713 } 714 715 func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { 716 a := fd.AsyncHandler() 717 if a == nil { 718 return linux.FOwnerEx{}, false 719 } 720 721 ot, otg, opg := a.(*fasync.FileAsync).Owner() 722 switch { 723 case ot != nil: 724 return linux.FOwnerEx{ 725 Type: linux.F_OWNER_TID, 726 PID: int32(t.PIDNamespace().IDOfTask(ot)), 727 }, true 728 case otg != nil: 729 return linux.FOwnerEx{ 730 Type: linux.F_OWNER_PID, 731 PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), 732 }, true 733 case opg != nil: 734 return linux.FOwnerEx{ 735 Type: linux.F_OWNER_PGRP, 736 PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), 737 }, true 738 default: 739 return linux.FOwnerEx{}, true 740 } 741 } 742 743 func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error { 744 switch ownerType { 745 case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: 746 // Acceptable type. 747 default: 748 return linuxerr.EINVAL 749 } 750 751 a, err := file.SetAsyncHandler(fasync.New(fd)) 752 if err != nil { 753 return err 754 } 755 async := a.(*fasync.FileAsync) 756 if pid == 0 { 757 async.ClearOwner() 758 return nil 759 } 760 761 switch ownerType { 762 case linux.F_OWNER_TID: 763 task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) 764 if task == nil { 765 return linuxerr.ESRCH 766 } 767 async.SetOwnerTask(t, task) 768 return nil 769 case linux.F_OWNER_PID: 770 tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) 771 if tg == nil { 772 return linuxerr.ESRCH 773 } 774 async.SetOwnerThreadGroup(t, tg) 775 return nil 776 case linux.F_OWNER_PGRP: 777 pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) 778 if pg == nil { 779 return linuxerr.ESRCH 780 } 781 async.SetOwnerProcessGroup(t, pg) 782 return nil 783 default: 784 return linuxerr.EINVAL 785 } 786 } 787 788 func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool) error { 789 // Copy in the lock request. 790 flockAddr := args[2].Pointer() 791 var flock linux.Flock 792 if _, err := flock.CopyIn(t, flockAddr); err != nil { 793 return err 794 } 795 var typ lock.LockType 796 switch flock.Type { 797 case linux.F_RDLCK: 798 typ = lock.ReadLock 799 case linux.F_WRLCK: 800 typ = lock.WriteLock 801 default: 802 return linuxerr.EINVAL 803 } 804 r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) 805 if err != nil { 806 return err 807 } 808 uid := lock.UniqueID(t.FDTable()) 809 if ofd { 810 uid = lock.UniqueID(file) 811 } 812 813 newFlock, err := file.TestPOSIX(t, uid, typ, r) 814 if err != nil { 815 return err 816 } 817 if !ofd { 818 newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID) 819 } 820 if _, err = newFlock.CopyOut(t, flockAddr); err != nil { 821 return err 822 } 823 return nil 824 } 825 826 // translatePID translates a pid from one namespace to another. Note that this 827 // may race with task termination/creation, in which case the original task 828 // corresponding to pid may no longer exist. This is used to implement the 829 // F_GETLK fcntl, which has the same potential race in Linux as well (i.e., 830 // there is no synchronization between retrieving the lock PID and translating 831 // it). See fs/locks.c:posix_lock_to_flock. 832 func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 { 833 return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid)))) 834 } 835 836 func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool, block bool) error { 837 // Copy in the lock request. 838 flockAddr := args[2].Pointer() 839 var flock linux.Flock 840 if _, err := flock.CopyIn(t, flockAddr); err != nil { 841 return err 842 } 843 if ofd && flock.PID != 0 { 844 return linuxerr.EINVAL 845 } 846 847 uid := lock.UniqueID(t.FDTable()) 848 pid := int32(t.TGIDInRoot()) 849 if ofd { 850 uid = lock.UniqueID(file) 851 pid = -1 852 } 853 854 r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) 855 if err != nil { 856 return err 857 } 858 859 switch flock.Type { 860 case linux.F_RDLCK: 861 if !file.IsReadable() { 862 return linuxerr.EBADF 863 } 864 return file.LockPOSIX(t, uid, pid, lock.ReadLock, r, block) 865 866 case linux.F_WRLCK: 867 if !file.IsWritable() { 868 return linuxerr.EBADF 869 } 870 return file.LockPOSIX(t, uid, pid, lock.WriteLock, r, block) 871 872 case linux.F_UNLCK: 873 return file.UnlockPOSIX(t, uid, r) 874 875 default: 876 return linuxerr.EINVAL 877 } 878 } 879 880 // Fadvise64 implements fadvise64(2). 881 // This implementation currently ignores the provided advice. 882 func Fadvise64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 883 fd := args[0].Int() 884 length := args[2].Int64() 885 advice := args[3].Int() 886 887 // Note: offset is allowed to be negative. 888 if length < 0 { 889 return 0, nil, linuxerr.EINVAL 890 } 891 892 file := t.GetFile(fd) 893 if file == nil { 894 return 0, nil, linuxerr.EBADF 895 } 896 defer file.DecRef(t) 897 898 if file.StatusFlags()&linux.O_PATH != 0 { 899 return 0, nil, linuxerr.EBADF 900 } 901 902 // If the FD refers to a pipe or FIFO, return error. 903 if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { 904 return 0, nil, linuxerr.ESPIPE 905 } 906 907 switch advice { 908 case linux.POSIX_FADV_NORMAL: 909 case linux.POSIX_FADV_RANDOM: 910 case linux.POSIX_FADV_SEQUENTIAL: 911 case linux.POSIX_FADV_WILLNEED: 912 case linux.POSIX_FADV_DONTNEED: 913 case linux.POSIX_FADV_NOREUSE: 914 default: 915 return 0, nil, linuxerr.EINVAL 916 } 917 918 // Sure, whatever. 919 return 0, nil, nil 920 } 921 922 // Mkdir implements Linux syscall mkdir(2). 923 func Mkdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 924 addr := args[0].Pointer() 925 mode := args[1].ModeT() 926 return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) 927 } 928 929 // Mkdirat implements Linux syscall mkdirat(2). 930 func Mkdirat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 931 dirfd := args[0].Int() 932 addr := args[1].Pointer() 933 mode := args[2].ModeT() 934 return 0, nil, mkdirat(t, dirfd, addr, mode) 935 } 936 937 func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error { 938 path, err := copyInPath(t, addr) 939 if err != nil { 940 return err 941 } 942 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 943 if err != nil { 944 return err 945 } 946 defer tpop.Release(t) 947 return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ 948 Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), 949 }) 950 } 951 952 // Rmdir implements Linux syscall rmdir(2). 953 func Rmdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 954 pathAddr := args[0].Pointer() 955 return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) 956 } 957 958 func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { 959 path, err := copyInPath(t, pathAddr) 960 if err != nil { 961 return err 962 } 963 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 964 if err != nil { 965 return err 966 } 967 defer tpop.Release(t) 968 return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) 969 } 970 971 // Symlink implements Linux syscall symlink(2). 972 func Symlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 973 targetAddr := args[0].Pointer() 974 linkpathAddr := args[1].Pointer() 975 return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) 976 } 977 978 // Symlinkat implements Linux syscall symlinkat(2). 979 func Symlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 980 targetAddr := args[0].Pointer() 981 newdirfd := args[1].Int() 982 linkpathAddr := args[2].Pointer() 983 return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) 984 } 985 986 func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error { 987 target, err := t.CopyInString(targetAddr, linux.PATH_MAX) 988 if err != nil { 989 return err 990 } 991 if len(target) == 0 { 992 return linuxerr.ENOENT 993 } 994 linkpath, err := copyInPath(t, linkpathAddr) 995 if err != nil { 996 return err 997 } 998 tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) 999 if err != nil { 1000 return err 1001 } 1002 defer tpop.Release(t) 1003 return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) 1004 } 1005 1006 // Link implements Linux syscall link(2). 1007 func Link(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1008 oldpathAddr := args[0].Pointer() 1009 newpathAddr := args[1].Pointer() 1010 return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) 1011 } 1012 1013 // Linkat implements Linux syscall linkat(2). 1014 func Linkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1015 olddirfd := args[0].Int() 1016 oldpathAddr := args[1].Pointer() 1017 newdirfd := args[2].Int() 1018 newpathAddr := args[3].Pointer() 1019 flags := args[4].Int() 1020 return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) 1021 } 1022 1023 func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error { 1024 if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { 1025 return linuxerr.EINVAL 1026 } 1027 if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { 1028 return linuxerr.ENOENT 1029 } 1030 1031 oldpath, err := copyInPath(t, oldpathAddr) 1032 if err != nil { 1033 return err 1034 } 1035 oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) 1036 if err != nil { 1037 return err 1038 } 1039 defer oldtpop.Release(t) 1040 1041 newpath, err := copyInPath(t, newpathAddr) 1042 if err != nil { 1043 return err 1044 } 1045 newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) 1046 if err != nil { 1047 return err 1048 } 1049 defer newtpop.Release(t) 1050 1051 return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) 1052 } 1053 1054 // Readlinkat implements Linux syscall readlinkat(2). 1055 func Readlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1056 dirfd := args[0].Int() 1057 pathAddr := args[1].Pointer() 1058 bufAddr := args[2].Pointer() 1059 size := args[3].SizeT() 1060 return readlinkat(t, dirfd, pathAddr, bufAddr, size) 1061 } 1062 1063 // Readlink implements Linux syscall readlink(2). 1064 func Readlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1065 pathAddr := args[0].Pointer() 1066 bufAddr := args[1].Pointer() 1067 size := args[2].SizeT() 1068 return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) 1069 } 1070 1071 func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { 1072 if int(size) <= 0 { 1073 return 0, nil, linuxerr.EINVAL 1074 } 1075 1076 path, err := copyInPath(t, pathAddr) 1077 if err != nil { 1078 return 0, nil, err 1079 } 1080 // "Since Linux 2.6.39, pathname can be an empty string, in which case the 1081 // call operates on the symbolic link referred to by dirfd ..." - 1082 // readlinkat(2) 1083 tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) 1084 if err != nil { 1085 return 0, nil, err 1086 } 1087 defer tpop.Release(t) 1088 1089 target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) 1090 if err != nil { 1091 return 0, nil, err 1092 } 1093 1094 if len(target) > int(size) { 1095 target = target[:size] 1096 } 1097 n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) 1098 if n == 0 { 1099 return 0, nil, err 1100 } 1101 return uintptr(n), nil, nil 1102 } 1103 1104 // Unlink implements Linux syscall unlink(2). 1105 func Unlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1106 pathAddr := args[0].Pointer() 1107 return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) 1108 } 1109 1110 func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { 1111 path, err := copyInPath(t, pathAddr) 1112 if err != nil { 1113 return err 1114 } 1115 tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) 1116 if err != nil { 1117 return err 1118 } 1119 defer tpop.Release(t) 1120 return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) 1121 } 1122 1123 // Unlinkat implements Linux syscall unlinkat(2). 1124 func Unlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1125 dirfd := args[0].Int() 1126 pathAddr := args[1].Pointer() 1127 flags := args[2].Int() 1128 1129 if flags&^linux.AT_REMOVEDIR != 0 { 1130 return 0, nil, linuxerr.EINVAL 1131 } 1132 1133 if flags&linux.AT_REMOVEDIR != 0 { 1134 return 0, nil, rmdirat(t, dirfd, pathAddr) 1135 } 1136 return 0, nil, unlinkat(t, dirfd, pathAddr) 1137 } 1138 1139 func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { 1140 root := t.FSContext().RootDirectory() 1141 defer root.DecRef(t) 1142 start := root 1143 if !path.Absolute { 1144 if !path.HasComponents() && !bool(shouldAllowEmptyPath) { 1145 return linuxerr.ENOENT 1146 } 1147 if dirfd == linux.AT_FDCWD { 1148 start = t.FSContext().WorkingDirectory() 1149 defer start.DecRef(t) 1150 } else { 1151 dirfile := t.GetFile(dirfd) 1152 if dirfile == nil { 1153 return linuxerr.EBADF 1154 } 1155 if !path.HasComponents() { 1156 // Use FileDescription.SetStat() instead of 1157 // VirtualFilesystem.SetStatAt(), since the former may be able 1158 // to use opened file state to expedite the SetStat. 1159 err := dirfile.SetStat(t, *opts) 1160 dirfile.DecRef(t) 1161 return err 1162 } 1163 start = dirfile.VirtualDentry() 1164 start.IncRef() 1165 defer start.DecRef(t) 1166 dirfile.DecRef(t) 1167 } 1168 } 1169 return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ 1170 Root: root, 1171 Start: start, 1172 Path: path, 1173 FollowFinalSymlink: bool(shouldFollowFinalSymlink), 1174 }, opts) 1175 } 1176 1177 func handleSetSizeError(t *kernel.Task, err error) error { 1178 if err == linuxerr.ErrExceedsFileSizeLimit { 1179 // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). 1180 t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) 1181 return linuxerr.EFBIG 1182 } 1183 return err 1184 } 1185 1186 // Truncate implements Linux syscall truncate(2). 1187 func Truncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1188 addr := args[0].Pointer() 1189 length := args[1].Int64() 1190 1191 if length < 0 { 1192 return 0, nil, linuxerr.EINVAL 1193 } 1194 1195 path, err := copyInPath(t, addr) 1196 if err != nil { 1197 return 0, nil, err 1198 } 1199 1200 err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ 1201 Stat: linux.Statx{ 1202 Mask: linux.STATX_SIZE, 1203 Size: uint64(length), 1204 }, 1205 NeedWritePerm: true, 1206 }) 1207 return 0, nil, handleSetSizeError(t, err) 1208 } 1209 1210 // Ftruncate implements Linux syscall ftruncate(2). 1211 func Ftruncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1212 fd := args[0].Int() 1213 length := args[1].Int64() 1214 1215 if length < 0 { 1216 return 0, nil, linuxerr.EINVAL 1217 } 1218 1219 file := t.GetFile(fd) 1220 if file == nil { 1221 return 0, nil, linuxerr.EBADF 1222 } 1223 defer file.DecRef(t) 1224 1225 if !file.IsWritable() { 1226 return 0, nil, linuxerr.EINVAL 1227 } 1228 1229 err := file.SetStat(t, vfs.SetStatOptions{ 1230 Stat: linux.Statx{ 1231 Mask: linux.STATX_SIZE, 1232 Size: uint64(length), 1233 }, 1234 }) 1235 return 0, nil, handleSetSizeError(t, err) 1236 } 1237 1238 // Umask implements linux syscall umask(2). 1239 func Umask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1240 mask := args[0].ModeT() 1241 mask = t.FSContext().SwapUmask(mask & 0777) 1242 return uintptr(mask), nil, nil 1243 } 1244 1245 // Chown implements Linux syscall chown(2). 1246 func Chown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1247 pathAddr := args[0].Pointer() 1248 owner := args[1].Int() 1249 group := args[2].Int() 1250 return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) 1251 } 1252 1253 // Lchown implements Linux syscall lchown(2). 1254 func Lchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1255 pathAddr := args[0].Pointer() 1256 owner := args[1].Int() 1257 group := args[2].Int() 1258 return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) 1259 } 1260 1261 // Fchownat implements Linux syscall fchownat(2). 1262 func Fchownat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1263 dirfd := args[0].Int() 1264 pathAddr := args[1].Pointer() 1265 owner := args[2].Int() 1266 group := args[3].Int() 1267 flags := args[4].Int() 1268 return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) 1269 } 1270 1271 func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error { 1272 if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { 1273 return linuxerr.EINVAL 1274 } 1275 1276 path, err := copyInPath(t, pathAddr) 1277 if err != nil { 1278 return err 1279 } 1280 1281 var opts vfs.SetStatOptions 1282 if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { 1283 return err 1284 } 1285 1286 return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) 1287 } 1288 1289 func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { 1290 userns := t.UserNamespace() 1291 if owner != -1 { 1292 kuid := userns.MapToKUID(auth.UID(owner)) 1293 if !kuid.Ok() { 1294 return linuxerr.EINVAL 1295 } 1296 opts.Stat.Mask |= linux.STATX_UID 1297 opts.Stat.UID = uint32(kuid) 1298 } 1299 if group != -1 { 1300 kgid := userns.MapToKGID(auth.GID(group)) 1301 if !kgid.Ok() { 1302 return linuxerr.EINVAL 1303 } 1304 opts.Stat.Mask |= linux.STATX_GID 1305 opts.Stat.GID = uint32(kgid) 1306 } 1307 return nil 1308 } 1309 1310 // Fchown implements Linux syscall fchown(2). 1311 func Fchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1312 fd := args[0].Int() 1313 owner := args[1].Int() 1314 group := args[2].Int() 1315 1316 file := t.GetFile(fd) 1317 if file == nil { 1318 return 0, nil, linuxerr.EBADF 1319 } 1320 defer file.DecRef(t) 1321 1322 var opts vfs.SetStatOptions 1323 if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { 1324 return 0, nil, err 1325 } 1326 return 0, nil, file.SetStat(t, opts) 1327 } 1328 1329 const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX 1330 1331 // Chmod implements Linux syscall chmod(2). 1332 func Chmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1333 pathAddr := args[0].Pointer() 1334 mode := args[1].ModeT() 1335 return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) 1336 } 1337 1338 // Fchmodat implements Linux syscall fchmodat(2). 1339 func Fchmodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1340 dirfd := args[0].Int() 1341 pathAddr := args[1].Pointer() 1342 mode := args[2].ModeT() 1343 return 0, nil, fchmodat(t, dirfd, pathAddr, mode) 1344 } 1345 1346 func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error { 1347 path, err := copyInPath(t, pathAddr) 1348 if err != nil { 1349 return err 1350 } 1351 1352 return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ 1353 Stat: linux.Statx{ 1354 Mask: linux.STATX_MODE, 1355 Mode: uint16(mode & chmodMask), 1356 }, 1357 }) 1358 } 1359 1360 // Fchmod implements Linux syscall fchmod(2). 1361 func Fchmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1362 fd := args[0].Int() 1363 mode := args[1].ModeT() 1364 1365 file := t.GetFile(fd) 1366 if file == nil { 1367 return 0, nil, linuxerr.EBADF 1368 } 1369 defer file.DecRef(t) 1370 1371 return 0, nil, file.SetStat(t, vfs.SetStatOptions{ 1372 Stat: linux.Statx{ 1373 Mask: linux.STATX_MODE, 1374 Mode: uint16(mode & chmodMask), 1375 }, 1376 }) 1377 } 1378 1379 // Utime implements Linux syscall utime(2). 1380 func Utime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1381 pathAddr := args[0].Pointer() 1382 timesAddr := args[1].Pointer() 1383 1384 path, err := copyInPath(t, pathAddr) 1385 if err != nil { 1386 return 0, nil, err 1387 } 1388 1389 opts := vfs.SetStatOptions{ 1390 Stat: linux.Statx{ 1391 Mask: linux.STATX_ATIME | linux.STATX_MTIME, 1392 }, 1393 } 1394 if timesAddr == 0 { 1395 opts.Stat.Atime.Nsec = linux.UTIME_NOW 1396 opts.Stat.Mtime.Nsec = linux.UTIME_NOW 1397 } else { 1398 var times linux.Utime 1399 if _, err := times.CopyIn(t, timesAddr); err != nil { 1400 return 0, nil, err 1401 } 1402 opts.Stat.Atime.Sec = times.Actime 1403 opts.Stat.Mtime.Sec = times.Modtime 1404 } 1405 1406 return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) 1407 } 1408 1409 // Utimes implements Linux syscall utimes(2). 1410 func Utimes(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1411 pathAddr := args[0].Pointer() 1412 timesAddr := args[1].Pointer() 1413 1414 path, err := copyInPath(t, pathAddr) 1415 if err != nil { 1416 return 0, nil, err 1417 } 1418 1419 var opts vfs.SetStatOptions 1420 if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { 1421 return 0, nil, err 1422 } 1423 1424 return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) 1425 } 1426 1427 // Futimesat implements Linux syscall futimesat(2). 1428 func Futimesat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1429 dirfd := args[0].Int() 1430 pathAddr := args[1].Pointer() 1431 timesAddr := args[2].Pointer() 1432 1433 // "If filename is NULL and dfd refers to an open file, then operate on the 1434 // file. Otherwise look up filename, possibly using dfd as a starting 1435 // point." - fs/utimes.c 1436 var path fspath.Path 1437 shouldAllowEmptyPath := allowEmptyPath 1438 if dirfd == linux.AT_FDCWD || pathAddr != 0 { 1439 var err error 1440 path, err = copyInPath(t, pathAddr) 1441 if err != nil { 1442 return 0, nil, err 1443 } 1444 shouldAllowEmptyPath = disallowEmptyPath 1445 } 1446 1447 var opts vfs.SetStatOptions 1448 if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { 1449 return 0, nil, err 1450 } 1451 1452 return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts) 1453 } 1454 1455 func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { 1456 if timesAddr == 0 { 1457 opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME 1458 opts.Stat.Atime.Nsec = linux.UTIME_NOW 1459 opts.Stat.Mtime.Nsec = linux.UTIME_NOW 1460 return nil 1461 } 1462 var times [2]linux.Timeval 1463 if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { 1464 return err 1465 } 1466 if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { 1467 return linuxerr.EINVAL 1468 } 1469 opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME 1470 opts.Stat.Atime = linux.StatxTimestamp{ 1471 Sec: times[0].Sec, 1472 Nsec: uint32(times[0].Usec * 1000), 1473 } 1474 opts.Stat.Mtime = linux.StatxTimestamp{ 1475 Sec: times[1].Sec, 1476 Nsec: uint32(times[1].Usec * 1000), 1477 } 1478 return nil 1479 } 1480 1481 // Utimensat implements Linux syscall utimensat(2). 1482 func Utimensat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1483 dirfd := args[0].Int() 1484 pathAddr := args[1].Pointer() 1485 timesAddr := args[2].Pointer() 1486 flags := args[3].Int() 1487 1488 // Linux requires that the UTIME_OMIT check occur before checking path or 1489 // flags. 1490 var opts vfs.SetStatOptions 1491 if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { 1492 return 0, nil, err 1493 } 1494 if opts.Stat.Mask == 0 { 1495 return 0, nil, nil 1496 } 1497 1498 if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { 1499 return 0, nil, linuxerr.EINVAL 1500 } 1501 1502 // "If filename is NULL and dfd refers to an open file, then operate on the 1503 // file. Otherwise look up filename, possibly using dfd as a starting 1504 // point." - fs/utimes.c 1505 var path fspath.Path 1506 shouldAllowEmptyPath := allowEmptyPath 1507 if dirfd == linux.AT_FDCWD || pathAddr != 0 { 1508 var err error 1509 path, err = copyInPath(t, pathAddr) 1510 if err != nil { 1511 return 0, nil, err 1512 } 1513 shouldAllowEmptyPath = disallowEmptyPath 1514 } 1515 1516 return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) 1517 } 1518 1519 func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { 1520 if timesAddr == 0 { 1521 opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME 1522 opts.Stat.Atime.Nsec = linux.UTIME_NOW 1523 opts.Stat.Mtime.Nsec = linux.UTIME_NOW 1524 return nil 1525 } 1526 var times [2]linux.Timespec 1527 if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { 1528 return err 1529 } 1530 if times[0].Nsec != linux.UTIME_OMIT { 1531 if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { 1532 return linuxerr.EINVAL 1533 } 1534 opts.Stat.Mask |= linux.STATX_ATIME 1535 opts.Stat.Atime = linux.StatxTimestamp{ 1536 Sec: times[0].Sec, 1537 Nsec: uint32(times[0].Nsec), 1538 } 1539 } 1540 if times[1].Nsec != linux.UTIME_OMIT { 1541 if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { 1542 return linuxerr.EINVAL 1543 } 1544 opts.Stat.Mask |= linux.STATX_MTIME 1545 opts.Stat.Mtime = linux.StatxTimestamp{ 1546 Sec: times[1].Sec, 1547 Nsec: uint32(times[1].Nsec), 1548 } 1549 } 1550 return nil 1551 } 1552 1553 // Rename implements Linux syscall rename(2). 1554 func Rename(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1555 oldpathAddr := args[0].Pointer() 1556 newpathAddr := args[1].Pointer() 1557 return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) 1558 } 1559 1560 // Renameat implements Linux syscall renameat(2). 1561 func Renameat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1562 olddirfd := args[0].Int() 1563 oldpathAddr := args[1].Pointer() 1564 newdirfd := args[2].Int() 1565 newpathAddr := args[3].Pointer() 1566 return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) 1567 } 1568 1569 // Renameat2 implements Linux syscall renameat2(2). 1570 func Renameat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1571 olddirfd := args[0].Int() 1572 oldpathAddr := args[1].Pointer() 1573 newdirfd := args[2].Int() 1574 newpathAddr := args[3].Pointer() 1575 flags := args[4].Uint() 1576 return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) 1577 } 1578 1579 func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error { 1580 oldpath, err := copyInPath(t, oldpathAddr) 1581 if err != nil { 1582 return err 1583 } 1584 // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) 1585 oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) 1586 if err != nil { 1587 return err 1588 } 1589 defer oldtpop.Release(t) 1590 1591 newpath, err := copyInPath(t, newpathAddr) 1592 if err != nil { 1593 return err 1594 } 1595 newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) 1596 if err != nil { 1597 return err 1598 } 1599 defer newtpop.Release(t) 1600 1601 return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ 1602 Flags: flags, 1603 }) 1604 } 1605 1606 // Fallocate implements linux system call fallocate(2). 1607 func Fallocate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1608 fd := args[0].Int() 1609 mode := args[1].Uint64() 1610 offset := args[2].Int64() 1611 length := args[3].Int64() 1612 1613 file := t.GetFile(fd) 1614 if file == nil { 1615 return 0, nil, linuxerr.EBADF 1616 } 1617 defer file.DecRef(t) 1618 1619 if !file.IsWritable() { 1620 return 0, nil, linuxerr.EBADF 1621 } 1622 if mode != 0 { 1623 return 0, nil, linuxerr.ENOTSUP 1624 } 1625 if offset < 0 || length <= 0 { 1626 return 0, nil, linuxerr.EINVAL 1627 } 1628 1629 size := offset + length 1630 if size < 0 { 1631 return 0, nil, linuxerr.EFBIG 1632 } 1633 limit := limits.FromContext(t).Get(limits.FileSize).Cur 1634 if uint64(size) >= limit { 1635 t.SendSignal(&linux.SignalInfo{ 1636 Signo: int32(linux.SIGXFSZ), 1637 Code: linux.SI_USER, 1638 }) 1639 return 0, nil, linuxerr.EFBIG 1640 } 1641 1642 return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) 1643 } 1644 1645 // Flock implements linux syscall flock(2). 1646 func Flock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1647 fd := args[0].Int() 1648 operation := args[1].Int() 1649 1650 file := t.GetFile(fd) 1651 if file == nil { 1652 // flock(2): EBADF fd is not an open file descriptor. 1653 return 0, nil, linuxerr.EBADF 1654 } 1655 defer file.DecRef(t) 1656 1657 nonblocking := operation&linux.LOCK_NB != 0 1658 operation &^= linux.LOCK_NB 1659 1660 switch operation { 1661 case linux.LOCK_EX: 1662 if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, !nonblocking /* block */); err != nil { 1663 return 0, nil, err 1664 } 1665 case linux.LOCK_SH: 1666 if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, !nonblocking /* block */); err != nil { 1667 return 0, nil, err 1668 } 1669 case linux.LOCK_UN: 1670 if err := file.UnlockBSD(t); err != nil { 1671 return 0, nil, err 1672 } 1673 default: 1674 // flock(2): EINVAL operation is invalid. 1675 return 0, nil, linuxerr.EINVAL 1676 } 1677 1678 return 0, nil, nil 1679 } 1680 1681 const ( 1682 memfdPrefix = "memfd:" 1683 memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) 1684 memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) 1685 ) 1686 1687 // MemfdCreate implements the linux syscall memfd_create(2). 1688 func MemfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1689 addr := args[0].Pointer() 1690 flags := args[1].Uint() 1691 1692 if flags&^memfdAllFlags != 0 { 1693 // Unknown bits in flags. 1694 return 0, nil, linuxerr.EINVAL 1695 } 1696 1697 allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 1698 cloExec := flags&linux.MFD_CLOEXEC != 0 1699 1700 name, err := t.CopyInString(addr, memfdMaxNameLen) 1701 if err != nil { 1702 return 0, nil, err 1703 } 1704 1705 shmMount := t.Kernel().ShmMount() 1706 file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) 1707 if err != nil { 1708 return 0, nil, err 1709 } 1710 defer file.DecRef(t) 1711 1712 fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ 1713 CloseOnExec: cloExec, 1714 }) 1715 if err != nil { 1716 return 0, nil, err 1717 } 1718 1719 return uintptr(fd), nil, nil 1720 }