github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/sandbox/network.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sandbox 16 17 import ( 18 "fmt" 19 "net" 20 "os" 21 "path/filepath" 22 "runtime" 23 "strconv" 24 25 specs "github.com/opencontainers/runtime-spec/specs-go" 26 "github.com/vishvananda/netlink" 27 "golang.org/x/sys/unix" 28 "github.com/SagerNet/gvisor/pkg/log" 29 "github.com/SagerNet/gvisor/pkg/tcpip/header" 30 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 31 "github.com/SagerNet/gvisor/pkg/urpc" 32 "github.com/SagerNet/gvisor/runsc/boot" 33 "github.com/SagerNet/gvisor/runsc/config" 34 "github.com/SagerNet/gvisor/runsc/specutils" 35 ) 36 37 // setupNetwork configures the network stack to mimic the local network 38 // configuration. Docker uses network namespaces with vnets to configure the 39 // network for the container. The untrusted app expects to see the same network 40 // inside the sandbox. Routing and port mapping is handled directly by docker 41 // with most of network information not even available to the runtime. 42 // 43 // Netstack inside the sandbox speaks directly to the device using a raw socket. 44 // All IP addresses assigned to the NIC, are removed and passed on to netstack's 45 // device. 46 // 47 // If 'conf.Network' is NoNetwork, skips local configuration and creates a 48 // loopback interface only. 49 // 50 // Run the following container to test it: 51 // docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 52 func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error { 53 log.Infof("Setting up network") 54 55 switch conf.Network { 56 case config.NetworkNone: 57 log.Infof("Network is disabled, create loopback interface only") 58 if err := createDefaultLoopbackInterface(conn); err != nil { 59 return fmt.Errorf("creating default loopback interface: %v", err) 60 } 61 case config.NetworkSandbox: 62 // Build the path to the net namespace of the sandbox process. 63 // This is what we will copy. 64 nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") 65 if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil { 66 return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) 67 } 68 case config.NetworkHost: 69 // Nothing to do here. 70 default: 71 return fmt.Errorf("invalid network type: %v", conf.Network) 72 } 73 return nil 74 } 75 76 func createDefaultLoopbackInterface(conn *urpc.Client) error { 77 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ 78 LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink}, 79 }, nil); err != nil { 80 return fmt.Errorf("creating loopback link and routes: %v", err) 81 } 82 return nil 83 } 84 85 func joinNetNS(nsPath string) (func(), error) { 86 runtime.LockOSThread() 87 restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ 88 Type: specs.NetworkNamespace, 89 Path: nsPath, 90 }) 91 if err != nil { 92 runtime.UnlockOSThread() 93 return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) 94 } 95 return func() { 96 restoreNS() 97 runtime.UnlockOSThread() 98 }, nil 99 } 100 101 // isRootNS determines whether we are running in the root net namespace. 102 // /proc/sys/net/core/rmem_default only exists in root network namespace. 103 func isRootNS() (bool, error) { 104 err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK) 105 switch err { 106 case nil: 107 return true, nil 108 case unix.ENOENT: 109 return false, nil 110 default: 111 return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err) 112 } 113 } 114 115 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the 116 // net namespace with the given path, creates them in the sandbox, and removes 117 // them from the host. 118 func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error { 119 // Join the network namespace that we will be copying. 120 restore, err := joinNetNS(nsPath) 121 if err != nil { 122 return err 123 } 124 defer restore() 125 126 // Get all interfaces in the namespace. 127 ifaces, err := net.Interfaces() 128 if err != nil { 129 return fmt.Errorf("querying interfaces: %w", err) 130 } 131 132 isRoot, err := isRootNS() 133 if err != nil { 134 return err 135 } 136 if isRoot { 137 return fmt.Errorf("cannot run with network enabled in root network namespace") 138 } 139 140 // Collect addresses and routes from the interfaces. 141 var args boot.CreateLinksAndRoutesArgs 142 for _, iface := range ifaces { 143 if iface.Flags&net.FlagUp == 0 { 144 log.Infof("Skipping down interface: %+v", iface) 145 continue 146 } 147 148 allAddrs, err := iface.Addrs() 149 if err != nil { 150 return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) 151 } 152 153 // We build our own loopback device. 154 if iface.Flags&net.FlagLoopback != 0 { 155 link, err := loopbackLink(iface, allAddrs) 156 if err != nil { 157 return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) 158 } 159 args.LoopbackLinks = append(args.LoopbackLinks, link) 160 continue 161 } 162 163 var ipAddrs []*net.IPNet 164 for _, ifaddr := range allAddrs { 165 ipNet, ok := ifaddr.(*net.IPNet) 166 if !ok { 167 return fmt.Errorf("address is not IPNet: %+v", ifaddr) 168 } 169 ipAddrs = append(ipAddrs, ipNet) 170 } 171 if len(ipAddrs) == 0 { 172 log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) 173 continue 174 } 175 176 // Scrape the routes before removing the address, since that 177 // will remove the routes as well. 178 routes, defv4, defv6, err := routesForIface(iface) 179 if err != nil { 180 return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) 181 } 182 if defv4 != nil { 183 if !args.Defaultv4Gateway.Route.Empty() { 184 return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) 185 } 186 args.Defaultv4Gateway.Route = *defv4 187 args.Defaultv4Gateway.Name = iface.Name 188 } 189 190 if defv6 != nil { 191 if !args.Defaultv6Gateway.Route.Empty() { 192 return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) 193 } 194 args.Defaultv6Gateway.Route = *defv6 195 args.Defaultv6Gateway.Name = iface.Name 196 } 197 198 link := boot.FDBasedLink{ 199 Name: iface.Name, 200 MTU: iface.MTU, 201 Routes: routes, 202 TXChecksumOffload: txChecksumOffload, 203 RXChecksumOffload: rxChecksumOffload, 204 NumChannels: numNetworkChannels, 205 QDisc: qDisc, 206 } 207 208 // Get the link for the interface. 209 ifaceLink, err := netlink.LinkByName(iface.Name) 210 if err != nil { 211 return fmt.Errorf("getting link for interface %q: %w", iface.Name, err) 212 } 213 link.LinkAddress = ifaceLink.Attrs().HardwareAddr 214 215 log.Debugf("Setting up network channels") 216 // Create the socket for the device. 217 for i := 0; i < link.NumChannels; i++ { 218 log.Debugf("Creating Channel %d", i) 219 socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) 220 if err != nil { 221 return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err) 222 } 223 if i == 0 { 224 link.GSOMaxSize = socketEntry.gsoMaxSize 225 } else { 226 if link.GSOMaxSize != socketEntry.gsoMaxSize { 227 return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", 228 link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) 229 } 230 } 231 args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) 232 } 233 234 if link.GSOMaxSize == 0 && softwareGSO { 235 // Hardware GSO is disabled. Let's enable software GSO. 236 link.GSOMaxSize = stack.SoftwareGSOMaxSize 237 link.SoftwareGSOEnabled = true 238 } 239 240 // Collect the addresses for the interface, enable forwarding, 241 // and remove them from the host. 242 for _, addr := range ipAddrs { 243 prefix, _ := addr.Mask.Size() 244 link.Addresses = append(link.Addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix}) 245 246 // Steal IP address from NIC. 247 if err := removeAddress(ifaceLink, addr.String()); err != nil { 248 return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) 249 } 250 } 251 252 args.FDBasedLinks = append(args.FDBasedLinks, link) 253 } 254 255 log.Debugf("Setting up network, config: %+v", args) 256 if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { 257 return fmt.Errorf("creating links and routes: %w", err) 258 } 259 return nil 260 } 261 262 type socketEntry struct { 263 deviceFile *os.File 264 gsoMaxSize uint32 265 } 266 267 // createSocket creates an underlying AF_PACKET socket and configures it for use by 268 // the sentry and returns an *os.File that wraps the underlying socket fd. 269 func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { 270 // Create the socket. 271 const protocol = 0x0300 // htons(ETH_P_ALL) 272 fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, protocol) 273 if err != nil { 274 return nil, fmt.Errorf("unable to create raw socket: %v", err) 275 } 276 deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") 277 // Bind to the appropriate device. 278 ll := unix.SockaddrLinklayer{ 279 Protocol: protocol, 280 Ifindex: iface.Index, 281 } 282 if err := unix.Bind(fd, &ll); err != nil { 283 return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) 284 } 285 286 gsoMaxSize := uint32(0) 287 if enableGSO { 288 gso, err := isGSOEnabled(fd, iface.Name) 289 if err != nil { 290 return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) 291 } 292 if gso { 293 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { 294 return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) 295 } 296 gsoMaxSize = ifaceLink.Attrs().GSOMaxSize 297 } else { 298 log.Infof("GSO not available in host.") 299 } 300 } 301 302 // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer 303 // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max". 304 // wmem_max/rmem_max default to a unusually low value of 208KB. This is too low 305 // for gVisor to be able to receive packets at high throughputs without 306 // incurring packet drops. 307 const bufSize = 4 << 20 // 4MB. 308 309 if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil { 310 unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize) 311 sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF) 312 313 if sz < bufSize { 314 log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) 315 } 316 } 317 318 if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil { 319 unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize) 320 sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF) 321 if sz < bufSize { 322 log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err) 323 } 324 } 325 326 return &socketEntry{deviceFile, gsoMaxSize}, nil 327 } 328 329 // loopbackLink returns the link with addresses and routes for a loopback 330 // interface. 331 func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) { 332 link := boot.LoopbackLink{ 333 Name: iface.Name, 334 } 335 for _, addr := range addrs { 336 ipNet, ok := addr.(*net.IPNet) 337 if !ok { 338 return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) 339 } 340 341 prefix, _ := ipNet.Mask.Size() 342 link.Addresses = append(link.Addresses, boot.IPWithPrefix{ 343 Address: ipNet.IP, 344 PrefixLen: prefix, 345 }) 346 347 dst := *ipNet 348 dst.IP = dst.IP.Mask(dst.Mask) 349 link.Routes = append(link.Routes, boot.Route{ 350 Destination: dst, 351 }) 352 } 353 return link, nil 354 } 355 356 // routesForIface iterates over all routes for the given interface and converts 357 // them to boot.Routes. It also returns the a default v4/v6 route if found. 358 func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { 359 link, err := netlink.LinkByIndex(iface.Index) 360 if err != nil { 361 return nil, nil, nil, err 362 } 363 rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) 364 if err != nil { 365 return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) 366 } 367 368 var defv4, defv6 *boot.Route 369 var routes []boot.Route 370 for _, r := range rs { 371 // Is it a default route? 372 if r.Dst == nil { 373 if r.Gw == nil { 374 return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) 375 } 376 // Create a catch all route to the gateway. 377 switch len(r.Gw) { 378 case header.IPv4AddressSize: 379 if defv4 != nil { 380 return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) 381 } 382 defv4 = &boot.Route{ 383 Destination: net.IPNet{ 384 IP: net.IPv4zero, 385 Mask: net.IPMask(net.IPv4zero), 386 }, 387 Gateway: r.Gw, 388 } 389 case header.IPv6AddressSize: 390 if defv6 != nil { 391 return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) 392 } 393 394 defv6 = &boot.Route{ 395 Destination: net.IPNet{ 396 IP: net.IPv6zero, 397 Mask: net.IPMask(net.IPv6zero), 398 }, 399 Gateway: r.Gw, 400 } 401 default: 402 return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) 403 } 404 continue 405 } 406 407 dst := *r.Dst 408 dst.IP = dst.IP.Mask(dst.Mask) 409 routes = append(routes, boot.Route{ 410 Destination: dst, 411 Gateway: r.Gw, 412 }) 413 } 414 return routes, defv4, defv6, nil 415 } 416 417 // removeAddress removes IP address from network device. It's equivalent to: 418 // ip addr del <ipAndMask> dev <name> 419 func removeAddress(source netlink.Link, ipAndMask string) error { 420 addr, err := netlink.ParseAddr(ipAndMask) 421 if err != nil { 422 return err 423 } 424 return netlink.AddrDel(source, addr) 425 }