github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/proc/tasks_sys.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "fmt" 20 "math" 21 22 "github.com/SagerNet/gvisor/pkg/abi/linux" 23 "github.com/SagerNet/gvisor/pkg/context" 24 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 25 "github.com/SagerNet/gvisor/pkg/hostarch" 26 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs" 27 "github.com/SagerNet/gvisor/pkg/sentry/inet" 28 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 29 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 30 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 31 "github.com/SagerNet/gvisor/pkg/sync" 32 "github.com/SagerNet/gvisor/pkg/tcpip/network/ipv4" 33 "github.com/SagerNet/gvisor/pkg/usermem" 34 ) 35 36 // +stateify savable 37 type tcpMemDir int 38 39 const ( 40 tcpRMem tcpMemDir = iota 41 tcpWMem 42 ) 43 44 // newSysDir returns the dentry corresponding to /proc/sys directory. 45 func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { 46 return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 47 "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 48 "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), 49 "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), 50 "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)), 51 "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)), 52 "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)), 53 "yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 54 "ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root), 55 }), 56 }), 57 "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 58 "max_map_count": fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")), 59 "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}), 60 "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")), 61 }), 62 "net": fs.newSysNetDir(ctx, root, k), 63 }) 64 } 65 66 // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. 67 func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { 68 var contents map[string]kernfs.Inode 69 70 // TODO(github.com/SagerNet/issue/1833): Support for using the network stack in the 71 // network namespace of the calling process. 72 if stack := k.RootNetworkNamespace().Stack(); stack != nil { 73 contents = map[string]kernfs.Inode{ 74 "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 75 "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), 76 "ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}), 77 "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), 78 "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), 79 "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), 80 "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), 81 82 // The following files are simple stubs until they are implemented in 83 // netstack, most of these files are configuration related. We use the 84 // value closest to the actual netstack behavior or any empty file, all 85 // of these files will have mode 0444 (read-only for all users). 86 "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")), 87 "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")), 88 "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")), 89 "ip_no_pmtu_disc": fs.newInode(ctx, root, 0444, newStaticFile("1")), 90 91 // tcp_allowed_congestion_control tell the user what they are able to 92 // do as an unprivledged process so we leave it empty. 93 "tcp_allowed_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("")), 94 "tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), 95 "tcp_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), 96 97 // Many of the following stub files are features netstack doesn't 98 // support. The unsupported features return "0" to indicate they are 99 // disabled. 100 "tcp_base_mss": fs.newInode(ctx, root, 0444, newStaticFile("1280")), 101 "tcp_dsack": fs.newInode(ctx, root, 0444, newStaticFile("0")), 102 "tcp_early_retrans": fs.newInode(ctx, root, 0444, newStaticFile("0")), 103 "tcp_fack": fs.newInode(ctx, root, 0444, newStaticFile("0")), 104 "tcp_fastopen": fs.newInode(ctx, root, 0444, newStaticFile("0")), 105 "tcp_fastopen_key": fs.newInode(ctx, root, 0444, newStaticFile("")), 106 "tcp_invalid_ratelimit": fs.newInode(ctx, root, 0444, newStaticFile("0")), 107 "tcp_keepalive_intvl": fs.newInode(ctx, root, 0444, newStaticFile("0")), 108 "tcp_keepalive_probes": fs.newInode(ctx, root, 0444, newStaticFile("0")), 109 "tcp_keepalive_time": fs.newInode(ctx, root, 0444, newStaticFile("7200")), 110 "tcp_mtu_probing": fs.newInode(ctx, root, 0444, newStaticFile("0")), 111 "tcp_no_metrics_save": fs.newInode(ctx, root, 0444, newStaticFile("1")), 112 "tcp_probe_interval": fs.newInode(ctx, root, 0444, newStaticFile("0")), 113 "tcp_probe_threshold": fs.newInode(ctx, root, 0444, newStaticFile("0")), 114 "tcp_retries1": fs.newInode(ctx, root, 0444, newStaticFile("3")), 115 "tcp_retries2": fs.newInode(ctx, root, 0444, newStaticFile("15")), 116 "tcp_rfc1337": fs.newInode(ctx, root, 0444, newStaticFile("1")), 117 "tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")), 118 "tcp_synack_retries": fs.newInode(ctx, root, 0444, newStaticFile("5")), 119 "tcp_syn_retries": fs.newInode(ctx, root, 0444, newStaticFile("3")), 120 "tcp_timestamps": fs.newInode(ctx, root, 0444, newStaticFile("1")), 121 }), 122 "core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ 123 "default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")), 124 "message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")), 125 "message_cost": fs.newInode(ctx, root, 0444, newStaticFile("5")), 126 "optmem_max": fs.newInode(ctx, root, 0444, newStaticFile("0")), 127 "rmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 128 "rmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 129 "somaxconn": fs.newInode(ctx, root, 0444, newStaticFile("128")), 130 "wmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 131 "wmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), 132 }), 133 } 134 } 135 136 return fs.newStaticDir(ctx, root, contents) 137 } 138 139 // mmapMinAddrData implements vfs.DynamicBytesSource for 140 // /proc/sys/vm/mmap_min_addr. 141 // 142 // +stateify savable 143 type mmapMinAddrData struct { 144 kernfs.DynamicBytesFile 145 146 k *kernel.Kernel 147 } 148 149 var _ dynamicInode = (*mmapMinAddrData)(nil) 150 151 // Generate implements vfs.DynamicBytesSource.Generate. 152 func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { 153 fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) 154 return nil 155 } 156 157 // hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname. 158 // 159 // +stateify savable 160 type hostnameData struct { 161 kernfs.DynamicBytesFile 162 } 163 164 var _ dynamicInode = (*hostnameData)(nil) 165 166 // Generate implements vfs.DynamicBytesSource.Generate. 167 func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { 168 utsns := kernel.UTSNamespaceFromContext(ctx) 169 buf.WriteString(utsns.HostName()) 170 buf.WriteString("\n") 171 return nil 172 } 173 174 // tcpSackData implements vfs.WritableDynamicBytesSource for 175 // /proc/sys/net/tcp_sack. 176 // 177 // +stateify savable 178 type tcpSackData struct { 179 kernfs.DynamicBytesFile 180 181 stack inet.Stack `state:"wait"` 182 enabled *bool 183 } 184 185 var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) 186 187 // Generate implements vfs.DynamicBytesSource.Generate. 188 func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { 189 if d.enabled == nil { 190 sack, err := d.stack.TCPSACKEnabled() 191 if err != nil { 192 return err 193 } 194 d.enabled = &sack 195 } 196 197 val := "0\n" 198 if *d.enabled { 199 // Technically, this is not quite compatible with Linux. Linux stores these 200 // as an integer, so if you write "2" into tcp_sack, you should get 2 back. 201 // Tough luck. 202 val = "1\n" 203 } 204 _, err := buf.WriteString(val) 205 return err 206 } 207 208 // Write implements vfs.WritableDynamicBytesSource.Write. 209 func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 210 if offset != 0 { 211 // No need to handle partial writes thus far. 212 return 0, linuxerr.EINVAL 213 } 214 if src.NumBytes() == 0 { 215 return 0, nil 216 } 217 218 // Limit the amount of memory allocated. 219 src = src.TakeFirst(hostarch.PageSize - 1) 220 221 var v int32 222 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 223 if err != nil { 224 return 0, err 225 } 226 if d.enabled == nil { 227 d.enabled = new(bool) 228 } 229 *d.enabled = v != 0 230 return n, d.stack.SetTCPSACKEnabled(*d.enabled) 231 } 232 233 // tcpRecoveryData implements vfs.WritableDynamicBytesSource for 234 // /proc/sys/net/ipv4/tcp_recovery. 235 // 236 // +stateify savable 237 type tcpRecoveryData struct { 238 kernfs.DynamicBytesFile 239 240 stack inet.Stack `state:"wait"` 241 } 242 243 var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) 244 245 // Generate implements vfs.DynamicBytesSource.Generate. 246 func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { 247 recovery, err := d.stack.TCPRecovery() 248 if err != nil { 249 return err 250 } 251 252 _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery)) 253 return err 254 } 255 256 // Write implements vfs.WritableDynamicBytesSource.Write. 257 func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 258 if offset != 0 { 259 // No need to handle partial writes thus far. 260 return 0, linuxerr.EINVAL 261 } 262 if src.NumBytes() == 0 { 263 return 0, nil 264 } 265 266 // Limit the amount of memory allocated. 267 src = src.TakeFirst(hostarch.PageSize - 1) 268 269 var v int32 270 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 271 if err != nil { 272 return 0, err 273 } 274 if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil { 275 return 0, err 276 } 277 return n, nil 278 } 279 280 // tcpMemData implements vfs.WritableDynamicBytesSource for 281 // /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. 282 // 283 // +stateify savable 284 type tcpMemData struct { 285 kernfs.DynamicBytesFile 286 287 dir tcpMemDir 288 stack inet.Stack `state:"wait"` 289 290 // mu protects against concurrent reads/writes to FDs based on the dentry 291 // backing this byte source. 292 mu sync.Mutex `state:"nosave"` 293 } 294 295 var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) 296 297 // Generate implements vfs.DynamicBytesSource.Generate. 298 func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { 299 d.mu.Lock() 300 defer d.mu.Unlock() 301 302 size, err := d.readSizeLocked() 303 if err != nil { 304 return err 305 } 306 _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max)) 307 return err 308 } 309 310 // Write implements vfs.WritableDynamicBytesSource.Write. 311 func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 312 if offset != 0 { 313 // No need to handle partial writes thus far. 314 return 0, linuxerr.EINVAL 315 } 316 if src.NumBytes() == 0 { 317 return 0, nil 318 } 319 d.mu.Lock() 320 defer d.mu.Unlock() 321 322 // Limit the amount of memory allocated. 323 src = src.TakeFirst(hostarch.PageSize - 1) 324 size, err := d.readSizeLocked() 325 if err != nil { 326 return 0, err 327 } 328 buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} 329 n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) 330 if err != nil { 331 return 0, err 332 } 333 newSize := inet.TCPBufferSize{ 334 Min: int(buf[0]), 335 Default: int(buf[1]), 336 Max: int(buf[2]), 337 } 338 if err := d.writeSizeLocked(newSize); err != nil { 339 return 0, err 340 } 341 return n, nil 342 } 343 344 // Precondition: d.mu must be locked. 345 func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) { 346 switch d.dir { 347 case tcpRMem: 348 return d.stack.TCPReceiveBufferSize() 349 case tcpWMem: 350 return d.stack.TCPSendBufferSize() 351 default: 352 panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) 353 } 354 } 355 356 // Precondition: d.mu must be locked. 357 func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { 358 switch d.dir { 359 case tcpRMem: 360 return d.stack.SetTCPReceiveBufferSize(size) 361 case tcpWMem: 362 return d.stack.SetTCPSendBufferSize(size) 363 default: 364 panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) 365 } 366 } 367 368 // ipForwarding implements vfs.WritableDynamicBytesSource for 369 // /proc/sys/net/ipv4/ip_forward. 370 // 371 // +stateify savable 372 type ipForwarding struct { 373 kernfs.DynamicBytesFile 374 375 stack inet.Stack `state:"wait"` 376 enabled bool 377 } 378 379 var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) 380 381 // Generate implements vfs.DynamicBytesSource.Generate. 382 func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { 383 val := "0\n" 384 if ipf.enabled { 385 // Technically, this is not quite compatible with Linux. Linux stores these 386 // as an integer, so if you write "2" into tcp_sack, you should get 2 back. 387 // Tough luck. 388 val = "1\n" 389 } 390 buf.WriteString(val) 391 392 return nil 393 } 394 395 // Write implements vfs.WritableDynamicBytesSource.Write. 396 func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 397 if offset != 0 { 398 // No need to handle partial writes thus far. 399 return 0, linuxerr.EINVAL 400 } 401 if src.NumBytes() == 0 { 402 return 0, nil 403 } 404 405 // Limit input size so as not to impact performance if input size is large. 406 src = src.TakeFirst(hostarch.PageSize - 1) 407 408 var v int32 409 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 410 if err != nil { 411 return 0, err 412 } 413 ipf.enabled = v != 0 414 if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil { 415 return 0, err 416 } 417 return n, nil 418 } 419 420 // portRange implements vfs.WritableDynamicBytesSource for 421 // /proc/sys/net/ipv4/ip_local_port_range. 422 // 423 // +stateify savable 424 type portRange struct { 425 kernfs.DynamicBytesFile 426 427 stack inet.Stack `state:"wait"` 428 429 // start and end store the port range. We must save/restore this here, 430 // since a netstack instance is created on restore. 431 start *uint16 432 end *uint16 433 } 434 435 var _ vfs.WritableDynamicBytesSource = (*portRange)(nil) 436 437 // Generate implements vfs.DynamicBytesSource.Generate. 438 func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { 439 if pr.start == nil { 440 start, end := pr.stack.PortRange() 441 pr.start = &start 442 pr.end = &end 443 } 444 _, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end) 445 return err 446 } 447 448 // Write implements vfs.WritableDynamicBytesSource.Write. 449 func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 450 if offset != 0 { 451 // No need to handle partial writes thus far. 452 return 0, linuxerr.EINVAL 453 } 454 if src.NumBytes() == 0 { 455 return 0, nil 456 } 457 458 // Limit input size so as not to impact performance if input size is 459 // large. 460 src = src.TakeFirst(hostarch.PageSize - 1) 461 462 ports := make([]int32, 2) 463 n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts) 464 if err != nil { 465 return 0, err 466 } 467 468 // Port numbers must be uint16s. 469 if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { 470 return 0, linuxerr.EINVAL 471 } 472 473 if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { 474 return 0, err 475 } 476 if pr.start == nil { 477 pr.start = new(uint16) 478 pr.end = new(uint16) 479 } 480 *pr.start = uint16(ports[0]) 481 *pr.end = uint16(ports[1]) 482 return n, nil 483 }