github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/proc/tasks_sys.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "fmt" 20 "math" 21 22 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 23 "github.com/MerlinKodo/gvisor/pkg/context" 24 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 25 "github.com/MerlinKodo/gvisor/pkg/hostarch" 26 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs" 27 "github.com/MerlinKodo/gvisor/pkg/sentry/inet" 28 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel" 29 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth" 30 "github.com/MerlinKodo/gvisor/pkg/sentry/vfs" 31 "github.com/MerlinKodo/gvisor/pkg/sync" 32 "github.com/MerlinKodo/gvisor/pkg/tcpip/network/ipv4" 33 "github.com/MerlinKodo/gvisor/pkg/usermem" 34 ) 35 36 // +stateify savable 37 type tcpMemDir int 38 39 const ( 40 tcpRMem tcpMemDir = iota 41 tcpWMem 42 ) 43 44 // newSysDir returns the dentry corresponding to /proc/sys directory. 45 func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { 46 return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 47 "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 48 "cap_last_cap": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\n", linux.CAP_LAST_CAP))), 49 "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), 50 "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), 51 "shmall": fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)), 52 "shmmax": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)), 53 "shmmni": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)), 54 "msgmni": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)), 55 "msgmax": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)), 56 "msgmnb": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)), 57 "yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 58 "ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root), 59 }), 60 }), 61 "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 62 "max_map_count": fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")), 63 "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}), 64 "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")), 65 }), 66 "net": fs.newSysNetDir(ctx, root, k), 67 }) 68 } 69 70 // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. 71 func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { 72 var contents map[string]kernfs.Inode 73 74 // TODO(gvisor.dev/issue/1833): Support for using the network stack in the 75 // network namespace of the calling process. 76 if stack := k.RootNetworkNamespace().Stack(); stack != nil { 77 contents = map[string]kernfs.Inode{ 78 "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 79 "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), 80 "ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}), 81 "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), 82 "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), 83 "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), 84 "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), 85 86 // The following files are simple stubs until they are implemented in 87 // netstack, most of these files are configuration related. We use the 88 // value closest to the actual netstack behavior or any empty file, all 89 // of these files will have mode 0444 (read-only for all users). 90 "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")), 91 "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")), 92 "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")), 93 "ip_no_pmtu_disc": fs.newInode(ctx, root, 0444, newStaticFile("1")), 94 95 // tcp_allowed_congestion_control tell the user what they are able to 96 // do as an unprivledged process so we leave it empty. 97 "tcp_allowed_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("")), 98 "tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), 99 "tcp_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), 100 101 // Many of the following stub files are features netstack doesn't 102 // support. The unsupported features return "0" to indicate they are 103 // disabled. 104 "tcp_base_mss": fs.newInode(ctx, root, 0444, newStaticFile("1280")), 105 "tcp_dsack": fs.newInode(ctx, root, 0444, newStaticFile("0")), 106 "tcp_early_retrans": fs.newInode(ctx, root, 0444, newStaticFile("0")), 107 "tcp_fack": fs.newInode(ctx, root, 0444, newStaticFile("0")), 108 "tcp_fastopen": fs.newInode(ctx, root, 0444, newStaticFile("0")), 109 "tcp_fastopen_key": fs.newInode(ctx, root, 0444, newStaticFile("")), 110 "tcp_invalid_ratelimit": fs.newInode(ctx, root, 0444, newStaticFile("0")), 111 "tcp_keepalive_intvl": fs.newInode(ctx, root, 0444, newStaticFile("0")), 112 "tcp_keepalive_probes": fs.newInode(ctx, root, 0444, newStaticFile("0")), 113 "tcp_keepalive_time": fs.newInode(ctx, root, 0444, newStaticFile("7200")), 114 "tcp_mtu_probing": fs.newInode(ctx, root, 0444, newStaticFile("0")), 115 "tcp_no_metrics_save": fs.newInode(ctx, root, 0444, newStaticFile("1")), 116 "tcp_probe_interval": fs.newInode(ctx, root, 0444, newStaticFile("0")), 117 "tcp_probe_threshold": fs.newInode(ctx, root, 0444, newStaticFile("0")), 118 "tcp_retries1": fs.newInode(ctx, root, 0444, newStaticFile("3")), 119 "tcp_retries2": fs.newInode(ctx, root, 0444, newStaticFile("15")), 120 "tcp_rfc1337": fs.newInode(ctx, root, 0444, newStaticFile("1")), 121 "tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")), 122 "tcp_synack_retries": fs.newInode(ctx, root, 0444, newStaticFile("5")), 123 "tcp_syn_retries": fs.newInode(ctx, root, 0444, newStaticFile("3")), 124 "tcp_timestamps": fs.newInode(ctx, root, 0444, newStaticFile("1")), 125 }), 126 "core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 127 "default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")), 128 "message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")), 129 "message_cost": fs.newInode(ctx, root, 0444, newStaticFile("5")), 130 "optmem_max": fs.newInode(ctx, root, 0444, newStaticFile("0")), 131 "rmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 132 "rmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 133 "somaxconn": fs.newInode(ctx, root, 0444, newStaticFile("128")), 134 "wmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 135 "wmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 136 }), 137 } 138 } 139 140 return fs.newStaticDir(ctx, root, contents) 141 } 142 143 // mmapMinAddrData implements vfs.DynamicBytesSource for 144 // /proc/sys/vm/mmap_min_addr. 145 // 146 // +stateify savable 147 type mmapMinAddrData struct { 148 kernfs.DynamicBytesFile 149 150 k *kernel.Kernel 151 } 152 153 var _ dynamicInode = (*mmapMinAddrData)(nil) 154 155 // Generate implements vfs.DynamicBytesSource.Generate. 156 func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { 157 fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) 158 return nil 159 } 160 161 // hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname. 162 // 163 // +stateify savable 164 type hostnameData struct { 165 kernfs.DynamicBytesFile 166 } 167 168 var _ dynamicInode = (*hostnameData)(nil) 169 170 // Generate implements vfs.DynamicBytesSource.Generate. 171 func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { 172 utsns := kernel.UTSNamespaceFromContext(ctx) 173 defer utsns.DecRef(ctx) 174 buf.WriteString(utsns.HostName()) 175 buf.WriteString("\n") 176 return nil 177 } 178 179 // tcpSackData implements vfs.WritableDynamicBytesSource for 180 // /proc/sys/net/tcp_sack. 181 // 182 // +stateify savable 183 type tcpSackData struct { 184 kernfs.DynamicBytesFile 185 186 stack inet.Stack `state:"wait"` 187 enabled *bool 188 } 189 190 var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) 191 192 // Generate implements vfs.DynamicBytesSource.Generate. 193 func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { 194 if d.enabled == nil { 195 sack, err := d.stack.TCPSACKEnabled() 196 if err != nil { 197 return err 198 } 199 d.enabled = &sack 200 } 201 202 val := "0\n" 203 if *d.enabled { 204 // Technically, this is not quite compatible with Linux. Linux stores these 205 // as an integer, so if you write "2" into tcp_sack, you should get 2 back. 206 // Tough luck. 207 val = "1\n" 208 } 209 _, err := buf.WriteString(val) 210 return err 211 } 212 213 // Write implements vfs.WritableDynamicBytesSource.Write. 214 func (d *tcpSackData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 215 if offset != 0 { 216 // No need to handle partial writes thus far. 217 return 0, linuxerr.EINVAL 218 } 219 if src.NumBytes() == 0 { 220 return 0, nil 221 } 222 223 // Limit the amount of memory allocated. 224 src = src.TakeFirst(hostarch.PageSize - 1) 225 226 var v int32 227 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 228 if err != nil { 229 return 0, err 230 } 231 if d.enabled == nil { 232 d.enabled = new(bool) 233 } 234 *d.enabled = v != 0 235 return n, d.stack.SetTCPSACKEnabled(*d.enabled) 236 } 237 238 // tcpRecoveryData implements vfs.WritableDynamicBytesSource for 239 // /proc/sys/net/ipv4/tcp_recovery. 240 // 241 // +stateify savable 242 type tcpRecoveryData struct { 243 kernfs.DynamicBytesFile 244 245 stack inet.Stack `state:"wait"` 246 } 247 248 var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) 249 250 // Generate implements vfs.DynamicBytesSource.Generate. 251 func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { 252 recovery, err := d.stack.TCPRecovery() 253 if err != nil { 254 return err 255 } 256 257 _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery)) 258 return err 259 } 260 261 // Write implements vfs.WritableDynamicBytesSource.Write. 262 func (d *tcpRecoveryData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 263 if offset != 0 { 264 // No need to handle partial writes thus far. 265 return 0, linuxerr.EINVAL 266 } 267 if src.NumBytes() == 0 { 268 return 0, nil 269 } 270 271 // Limit the amount of memory allocated. 272 src = src.TakeFirst(hostarch.PageSize - 1) 273 274 var v int32 275 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 276 if err != nil { 277 return 0, err 278 } 279 if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil { 280 return 0, err 281 } 282 return n, nil 283 } 284 285 // tcpMemData implements vfs.WritableDynamicBytesSource for 286 // /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. 287 // 288 // +stateify savable 289 type tcpMemData struct { 290 kernfs.DynamicBytesFile 291 292 dir tcpMemDir 293 stack inet.Stack `state:"wait"` 294 295 // mu protects against concurrent reads/writes to FDs based on the dentry 296 // backing this byte source. 297 mu sync.Mutex `state:"nosave"` 298 } 299 300 var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) 301 302 // Generate implements vfs.DynamicBytesSource.Generate. 303 func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { 304 d.mu.Lock() 305 defer d.mu.Unlock() 306 307 size, err := d.readSizeLocked() 308 if err != nil { 309 return err 310 } 311 _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max)) 312 return err 313 } 314 315 // Write implements vfs.WritableDynamicBytesSource.Write. 316 func (d *tcpMemData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 317 if offset != 0 { 318 // No need to handle partial writes thus far. 319 return 0, linuxerr.EINVAL 320 } 321 if src.NumBytes() == 0 { 322 return 0, nil 323 } 324 d.mu.Lock() 325 defer d.mu.Unlock() 326 327 // Limit the amount of memory allocated. 328 src = src.TakeFirst(hostarch.PageSize - 1) 329 size, err := d.readSizeLocked() 330 if err != nil { 331 return 0, err 332 } 333 buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} 334 n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) 335 if err != nil { 336 return 0, err 337 } 338 newSize := inet.TCPBufferSize{ 339 Min: int(buf[0]), 340 Default: int(buf[1]), 341 Max: int(buf[2]), 342 } 343 if err := d.writeSizeLocked(newSize); err != nil { 344 return 0, err 345 } 346 return n, nil 347 } 348 349 // Precondition: d.mu must be locked. 350 func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) { 351 switch d.dir { 352 case tcpRMem: 353 return d.stack.TCPReceiveBufferSize() 354 case tcpWMem: 355 return d.stack.TCPSendBufferSize() 356 default: 357 panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) 358 } 359 } 360 361 // Precondition: d.mu must be locked. 362 func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { 363 switch d.dir { 364 case tcpRMem: 365 return d.stack.SetTCPReceiveBufferSize(size) 366 case tcpWMem: 367 return d.stack.SetTCPSendBufferSize(size) 368 default: 369 panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) 370 } 371 } 372 373 // ipForwarding implements vfs.WritableDynamicBytesSource for 374 // /proc/sys/net/ipv4/ip_forward. 375 // 376 // +stateify savable 377 type ipForwarding struct { 378 kernfs.DynamicBytesFile 379 380 stack inet.Stack `state:"wait"` 381 enabled bool 382 } 383 384 var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) 385 386 // Generate implements vfs.DynamicBytesSource.Generate. 387 func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { 388 val := "0\n" 389 if ipf.enabled { 390 // Technically, this is not quite compatible with Linux. Linux stores these 391 // as an integer, so if you write "2" into tcp_sack, you should get 2 back. 392 // Tough luck. 393 val = "1\n" 394 } 395 buf.WriteString(val) 396 397 return nil 398 } 399 400 // Write implements vfs.WritableDynamicBytesSource.Write. 401 func (ipf *ipForwarding) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 402 if offset != 0 { 403 // No need to handle partial writes thus far. 404 return 0, linuxerr.EINVAL 405 } 406 if src.NumBytes() == 0 { 407 return 0, nil 408 } 409 410 // Limit input size so as not to impact performance if input size is large. 411 src = src.TakeFirst(hostarch.PageSize - 1) 412 413 var v int32 414 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 415 if err != nil { 416 return 0, err 417 } 418 ipf.enabled = v != 0 419 if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil { 420 return 0, err 421 } 422 return n, nil 423 } 424 425 // portRange implements vfs.WritableDynamicBytesSource for 426 // /proc/sys/net/ipv4/ip_local_port_range. 427 // 428 // +stateify savable 429 type portRange struct { 430 kernfs.DynamicBytesFile 431 432 stack inet.Stack `state:"wait"` 433 434 // start and end store the port range. We must save/restore this here, 435 // since a netstack instance is created on restore. 436 start *uint16 437 end *uint16 438 } 439 440 var _ vfs.WritableDynamicBytesSource = (*portRange)(nil) 441 442 // Generate implements vfs.DynamicBytesSource.Generate. 443 func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { 444 if pr.start == nil { 445 start, end := pr.stack.PortRange() 446 pr.start = &start 447 pr.end = &end 448 } 449 _, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end) 450 return err 451 } 452 453 // Write implements vfs.WritableDynamicBytesSource.Write. 454 func (pr *portRange) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 455 if offset != 0 { 456 // No need to handle partial writes thus far. 457 return 0, linuxerr.EINVAL 458 } 459 if src.NumBytes() == 0 { 460 return 0, nil 461 } 462 463 // Limit input size so as not to impact performance if input size is 464 // large. 465 src = src.TakeFirst(hostarch.PageSize - 1) 466 467 ports := make([]int32, 2) 468 n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts) 469 if err != nil { 470 return 0, err 471 } 472 473 // Port numbers must be uint16s. 474 if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { 475 return 0, linuxerr.EINVAL 476 } 477 478 if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { 479 return 0, err 480 } 481 if pr.start == nil { 482 pr.start = new(uint16) 483 pr.end = new(uint16) 484 } 485 *pr.start = uint16(ports[0]) 486 *pr.end = uint16(ports[1]) 487 return n, nil 488 }