github.com/cilium/cilium@v1.16.2/pkg/datapath/linux/bandwidth/bandwidth.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  //go:build linux
     5  
     6  // NOTE: We can only build on linux because we import bwmap which in turn imports pkg/ebpf and pkg/bpf
     7  //       which throw build errors when building on non-linux platforms.
     8  
     9  package bandwidth
    10  
    11  import (
    12  	"fmt"
    13  	"strings"
    14  
    15  	"github.com/cilium/ebpf"
    16  	"github.com/cilium/ebpf/asm"
    17  	"k8s.io/apimachinery/pkg/api/resource"
    18  
    19  	"github.com/cilium/cilium/pkg/datapath/linux/config/defines"
    20  	"github.com/cilium/cilium/pkg/datapath/linux/probes"
    21  	"github.com/cilium/cilium/pkg/datapath/tables"
    22  	"github.com/cilium/cilium/pkg/datapath/types"
    23  	"github.com/cilium/cilium/pkg/logging/logfields"
    24  	"github.com/cilium/cilium/pkg/maps/bwmap"
    25  )
    26  
const (
	// EgressBandwidth is the K8s Pod annotation key for egress bandwidth.
	EgressBandwidth = "kubernetes.io/egress-bandwidth"
	// IngressBandwidth is the K8s Pod annotation key for ingress bandwidth.
	IngressBandwidth = "kubernetes.io/ingress-bandwidth"

	// FqDefaultHorizon represents the maximum allowed departure time
	// delta into the future. Given that applications can set SO_TXTIME
	// from user space, this limit prevents buggy applications from
	// filling up the FQ qdisc.
	FqDefaultHorizon = bwmap.DefaultDropHorizon
	// FqDefaultBuckets is the default bucket limit for the bandwidth
	// manager, given as a power-of-two exponent: 15 stands for 2^15 (32k)
	// buckets. Too low a bucket limit can cause scalability issues.
	FqDefaultBuckets = 15
)
    42  
    43  type manager struct {
    44  	resetQueues, enabled bool
    45  
    46  	params bandwidthManagerParams
    47  }
    48  
// Enabled reports whether the bandwidth manager is enabled, i.e. whether
// probe found all of its requirements satisfied.
func (m *manager) Enabled() bool {
	return m.enabled
}
    52  
// BBREnabled reports whether BBR has been requested in the bandwidth
// manager configuration. It returns the configured value as-is and does
// not take the outcome of probe into account.
func (m *manager) BBREnabled() bool {
	return m.params.Config.EnableBBR
}
    56  
    57  func (m *manager) defines() (defines.Map, error) {
    58  	cDefinesMap := make(defines.Map)
    59  	if m.resetQueues {
    60  		cDefinesMap["RESET_QUEUES"] = "1"
    61  	}
    62  
    63  	if m.Enabled() {
    64  		cDefinesMap["ENABLE_BANDWIDTH_MANAGER"] = "1"
    65  		cDefinesMap["THROTTLE_MAP"] = bwmap.MapName
    66  		cDefinesMap["THROTTLE_MAP_SIZE"] = fmt.Sprintf("%d", bwmap.MapSize)
    67  	}
    68  
    69  	return cDefinesMap, nil
    70  }
    71  
    72  func (m *manager) UpdateBandwidthLimit(epID uint16, bytesPerSecond uint64) {
    73  	if m.enabled {
    74  		txn := m.params.DB.WriteTxn(m.params.EdtTable)
    75  		m.params.EdtTable.Insert(
    76  			txn,
    77  			bwmap.NewEdt(epID, bytesPerSecond),
    78  		)
    79  		txn.Commit()
    80  	}
    81  }
    82  
    83  func (m *manager) DeleteBandwidthLimit(epID uint16) {
    84  	if m.enabled {
    85  		txn := m.params.DB.WriteTxn(m.params.EdtTable)
    86  		obj, _, found := m.params.EdtTable.Get(txn, bwmap.EdtIDIndex.Query(epID))
    87  		if found {
    88  			m.params.EdtTable.Delete(txn, obj)
    89  		}
    90  		txn.Commit()
    91  	}
    92  }
    93  
    94  func GetBytesPerSec(bandwidth string) (uint64, error) {
    95  	res, err := resource.ParseQuantity(bandwidth)
    96  	if err != nil {
    97  		return 0, err
    98  	}
    99  	return uint64(res.Value() / 8), err
   100  }
   101  
   102  // probe checks the various system requirements of the bandwidth manager and disables it if they are
   103  // not met.
   104  func (m *manager) probe() error {
   105  	// We at least need 5.1 kernel for native TCP EDT integration
   106  	// and writable queue_mapping that we use. Below helper is
   107  	// available for 5.1 kernels and onwards.
   108  	kernelGood := probes.HaveProgramHelper(ebpf.SchedCLS, asm.FnSkbEcnSetCe) == nil
   109  	m.resetQueues = kernelGood
   110  	if !m.params.Config.EnableBandwidthManager {
   111  		return nil
   112  	}
   113  	if _, err := m.params.Sysctl.Read([]string{"net", "core", "default_qdisc"}); err != nil {
   114  		m.params.Log.Warn("BPF bandwidth manager could not read procfs. Disabling the feature.", logfields.Error, err)
   115  		return nil
   116  	}
   117  	if !kernelGood {
   118  		m.params.Log.Warn("BPF bandwidth manager needs kernel 5.1 or newer. Disabling the feature.")
   119  		return nil
   120  	}
   121  	if m.params.Config.EnableBBR {
   122  		// We at least need 5.18 kernel for Pod-based BBR TCP congestion
   123  		// control since earlier kernels just clear the skb->tstamp upon
   124  		// netns traversal. See also:
   125  		//
   126  		// - https://lpc.events/event/11/contributions/953/
   127  		// - https://lore.kernel.org/bpf/20220302195519.3479274-1-kafai@fb.com/
   128  		if probes.HaveProgramHelper(ebpf.SchedCLS, asm.FnSkbSetTstamp) != nil {
   129  			return fmt.Errorf("cannot enable --%s, needs kernel 5.18 or newer", types.EnableBBRFlag)
   130  		}
   131  	}
   132  
   133  	// Going via host stack will orphan skb->sk, so we do need BPF host
   134  	// routing for it to work properly.
   135  	if m.params.Config.EnableBBR && m.params.DaemonConfig.EnableHostLegacyRouting {
   136  		return fmt.Errorf("BPF bandwidth manager's BBR setup requires BPF host routing.")
   137  	}
   138  
   139  	if m.params.Config.EnableBandwidthManager && m.params.DaemonConfig.EnableIPSec {
   140  		m.params.Log.Warn("The bandwidth manager cannot be used with IPSec. Disabling the bandwidth manager.")
   141  		return nil
   142  	}
   143  
   144  	m.enabled = true
   145  	return nil
   146  }
   147  
   148  func (m *manager) init() error {
   149  	m.params.Log.Info("Setting up BPF bandwidth manager")
   150  
   151  	if err := bwmap.ThrottleMap().OpenOrCreate(); err != nil {
   152  		return fmt.Errorf("failed to access ThrottleMap: %w", err)
   153  	}
   154  
   155  	if err := setBaselineSysctls(m.params); err != nil {
   156  		return fmt.Errorf("failed to set sysctl needed by BPF bandwidth manager: %w", err)
   157  	}
   158  	return nil
   159  }
   160  
   161  func setBaselineSysctls(p bandwidthManagerParams) error {
   162  	// Ensure interger type sysctls are no smaller than our baseline settings
   163  	baseIntSettings := []struct {
   164  		name []string
   165  		val  int64
   166  	}{
   167  		{[]string{"net", "core", "netdev_max_backlog"}, 1000},
   168  		{[]string{"net", "core", "somaxconn"}, 4096},
   169  		{[]string{"net", "ipv4", "tcp_max_syn_backlog"}, 4096},
   170  	}
   171  
   172  	for _, setting := range baseIntSettings {
   173  		currentValue, err := p.Sysctl.ReadInt(setting.name)
   174  		if err != nil {
   175  			return fmt.Errorf("read sysctl %s failed: %w", strings.Join(setting.name, "."), err)
   176  		}
   177  
   178  		scopedLog := p.Log.With(
   179  			logfields.SysParamName, strings.Join(setting.name, "."),
   180  			logfields.SysParamValue, currentValue,
   181  			"baselineValue", setting.val,
   182  		)
   183  
   184  		if currentValue >= setting.val {
   185  			scopedLog.Info("Skip setting sysctl as it already meets baseline")
   186  			continue
   187  		}
   188  
   189  		scopedLog.Info("Setting sysctl to baseline for BPF bandwidth manager")
   190  		if err := p.Sysctl.WriteInt(setting.name, setting.val); err != nil {
   191  			return fmt.Errorf("set sysctl %s=%d failed: %w", strings.Join(setting.name, "."), setting.val, err)
   192  		}
   193  	}
   194  
   195  	// Ensure string type sysctls
   196  	congctl := "cubic"
   197  	if p.Config.EnableBBR {
   198  		congctl = "bbr"
   199  	}
   200  
   201  	sysctls := []tables.Sysctl{
   202  		{Name: []string{"net", "core", "default_qdisc"}, Val: "fq"},
   203  		{Name: []string{"net", "ipv4", "tcp_congestion_control"}, Val: congctl},
   204  	}
   205  
   206  	// Few extra knobs which can be turned on along with pacing. EnableBBR
   207  	// also provides the right kernel dependency implicitly as well.
   208  	if p.Config.EnableBBR {
   209  		sysctls = append(sysctls, tables.Sysctl{
   210  			Name: []string{"net", "ipv4", "tcp_slow_start_after_idle"}, Val: "0",
   211  		})
   212  	}
   213  
   214  	if err := p.Sysctl.ApplySettings(sysctls); err != nil {
   215  		return fmt.Errorf("failed to apply sysctls: %w", err)
   216  	}
   217  
   218  	return nil
   219  }