github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/seccomp/seccomp.go

github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/seccomp/seccomp.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package seccomp provides generation of basic seccomp filters. Currently,
    16  // only little endian systems are supported.
    17  package seccomp
    18  
    19  import (
    20  	"fmt"
    21  	"sort"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/metacubex/gvisor/pkg/abi/linux"
    26  	"github.com/metacubex/gvisor/pkg/bpf"
    27  	"github.com/metacubex/gvisor/pkg/log"
    28  )
    29  
const (
	// skipOneInst is the jump offset that skips exactly one instruction.
	// `If` and `IfNot` use it to step over the direct jump they emit right
	// after the conditional jump.
	skipOneInst = 1

	// defaultLabel is the label for the default action.
	defaultLabel = label("default_action")

	// vsyscallPageIPMask is the bit we expect to see in the instruction
	// pointer of a vsyscall call. It is tested against the word loaded from
	// `seccompDataOffsetIPHigh` when rendering vsyscall rules.
	vsyscallPageIPMask = 1 << 31
)
    41  
    42  // Install generates BPF code based on the set of syscalls provided. It only
    43  // allows syscalls that conform to the specification. Syscalls that violate the
    44  // specification will trigger RET_KILL_PROCESS. If RET_KILL_PROCESS is not
    45  // supported, violations will trigger RET_TRAP instead. RET_KILL_THREAD is not
    46  // used because it only kills the offending thread and often keeps the sentry
    47  // hanging.
    48  //
    49  // denyRules describes forbidden syscalls. rules describes allowed syscalls.
    50  // denyRules is executed before rules.
    51  //
    52  // Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored,
    53  // making it possible for the process to continue running after a violation.
    54  // However, it will leave a SECCOMP audit event trail behind. In any case, the
    55  // syscall is still blocked from executing.
    56  func Install(rules SyscallRules, denyRules SyscallRules, options ProgramOptions) error {
    57  	// ***   DEBUG TIP   ***
    58  	// If you suspect the Sentry is getting killed due to a seccomp violation,
    59  	// look for the `debugFilter` boolean in `//runsc/boot/filter/filter.go`.
    60  
    61  	log.Infof("Installing seccomp filters for %d syscalls (action=%v)", rules.Size(), options.DefaultAction)
    62  
    63  	instrs, _, err := BuildProgram([]RuleSet{
    64  		{
    65  			Rules:  denyRules,
    66  			Action: options.DefaultAction,
    67  		},
    68  		{
    69  			Rules:  rules,
    70  			Action: linux.SECCOMP_RET_ALLOW,
    71  		},
    72  	}, options)
    73  	if log.IsLogging(log.Debug) {
    74  		programStr, errDecode := bpf.DecodeInstructions(instrs)
    75  		if errDecode != nil {
    76  			programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr)
    77  		}
    78  		log.Debugf("Seccomp program dump:\n%s", programStr)
    79  	}
    80  	if err != nil {
    81  		return err
    82  	}
    83  
    84  	// Perform the actual installation.
    85  	if err := SetFilter(instrs); err != nil {
    86  		return fmt.Errorf("failed to set filter: %v", err)
    87  	}
    88  
    89  	log.Infof("Seccomp filters installed.")
    90  	return nil
    91  }
    92  
    93  // DefaultAction returns a sane default for a failure to match
    94  // a seccomp-bpf filter. Either kill the process, or trap.
    95  func DefaultAction() (linux.BPFAction, error) {
    96  	available, err := isKillProcessAvailable()
    97  	if err != nil {
    98  		return 0, err
    99  	}
   100  	if available {
   101  		return linux.SECCOMP_RET_KILL_PROCESS, nil
   102  	}
   103  	return linux.SECCOMP_RET_TRAP, nil
   104  }
   105  
// RuleSet is a set of rules and associated action.
type RuleSet struct {
	// Rules are the syscall rules of this set; Action is the action
	// associated with them.
	Rules  SyscallRules
	Action linux.BPFAction

	// Vsyscall indicates that a check is made for a function being called
	// from kernel mappings. This is where the vsyscall page is located
	// (and typically emulated), so this RuleSet will not match any
	// functions not dispatched from the vsyscall page.
	Vsyscall bool
}
   117  
   118  // SyscallName gives names to system calls. It is used purely for debugging purposes.
   119  //
   120  // An alternate namer can be provided to the package at initialization time.
   121  var SyscallName = func(sysno uintptr) string {
   122  	return fmt.Sprintf("syscall_%d", sysno)
   123  }
   124  
// syscallProgram builds a BPF program for applying syscall rules.
// It is a stateful struct that is updated as the program is built: every
// method on it appends to, labels, or records the underlying builder.
type syscallProgram struct {
	// program is the underlying BPF program being built.
	program *bpf.ProgramBuilder
}
   131  
   132  // Stmt adds a statement to the program.
   133  func (s *syscallProgram) Stmt(code uint16, k uint32) {
   134  	s.program.AddStmt(code, k)
   135  }
   136  
// label is a custom label type which is returned by `labelSet`.
// It names a jump target in the program being built; values are converted
// back to plain strings before being handed to the underlying builder.
type label string
   139  
   140  // JumpTo adds a jump instruction to the program, jumping to the given label.
   141  func (s *syscallProgram) JumpTo(label label) {
   142  	s.program.AddDirectJumpLabel(string(label))
   143  }
   144  
   145  // If checks a condition and jumps to a label if the condition is true.
   146  // If the condition is false, the program continues executing (no jumping).
   147  func (s *syscallProgram) If(code uint16, k uint32, jt label) {
   148  	s.program.AddJump(code, k, 0, skipOneInst)
   149  	s.JumpTo(jt)
   150  }
   151  
   152  // IfNot checks a condition and jumps to a label if the condition is false.
   153  // If the condition is true, the program continues executing (no jumping).
   154  func (s *syscallProgram) IfNot(code uint16, k uint32, jf label) {
   155  	s.program.AddJump(code, k, skipOneInst, 0)
   156  	s.JumpTo(jf)
   157  }
   158  
   159  // Ret adds a return instruction to the program.
   160  func (s *syscallProgram) Ret(action linux.BPFAction) {
   161  	s.Stmt(bpf.Ret|bpf.K, uint32(action))
   162  }
   163  
   164  // Label adds a label to the program.
   165  // It panics if this label has already been added to the program.
   166  func (s *syscallProgram) Label(label label) {
   167  	if err := s.program.AddLabel(string(label)); err != nil {
   168  		panic(fmt.Sprintf("cannot add label %q to program: %v", label, err))
   169  	}
   170  }
   171  
   172  // Record starts recording the instructions added to the program from now on.
   173  // It returns a syscallFragment which can be used to perform assertions on the
   174  // possible set of outcomes of the set of instruction that has been added
   175  // since `Record` was called.
   176  func (s *syscallProgram) Record() syscallProgramFragment {
   177  	return syscallProgramFragment{s.program.Record()}
   178  }
   179  
// syscallProgramFragment represents a fragment of the syscall program.
// It is obtained from `syscallProgram.Record`.
type syscallProgramFragment struct {
	// getFragment returns the instructions recorded so far; it is called
	// when an assertion is made on the fragment.
	getFragment func() bpf.ProgramFragment
}
   184  
   185  // MustHaveJumpedTo asserts that the fragment must jump to one of the
   186  // given labels.
   187  // The fragment may not jump to any other label, nor return, nor fall through.
   188  func (f syscallProgramFragment) MustHaveJumpedTo(labels ...label) {
   189  	f.MustHaveJumpedToOrReturned(labels, nil)
   190  }
   191  
   192  // MustHaveJumpedToOrReturned asserts that the fragment must jump to one of
   193  // the given labels, or have returned one of the given return values.
   194  // The fragment may not jump to any other label, nor fall through,
   195  // nor return a non-deterministic value.
   196  func (f syscallProgramFragment) MustHaveJumpedToOrReturned(possibleLabels []label, possibleReturnValues map[linux.BPFAction]struct{}) {
   197  	fragment := f.getFragment()
   198  	outcomes := fragment.Outcomes()
   199  	if outcomes.MayFallThrough {
   200  		panic(fmt.Sprintf("fragment %v may fall through", fragment))
   201  	}
   202  	if len(possibleReturnValues) == 0 && outcomes.MayReturn() {
   203  		panic(fmt.Sprintf("fragment %v may return", fragment))
   204  	}
   205  	if outcomes.MayReturnRegisterA {
   206  		panic(fmt.Sprintf("fragment %v may return register A", fragment))
   207  	}
   208  	if outcomes.MayJumpToKnownOffsetBeyondFragment {
   209  		panic(fmt.Sprintf("fragment %v may jump to an offset beyond the fragment", fragment))
   210  	}
   211  	for jumpLabel := range outcomes.MayJumpToUnresolvedLabels {
   212  		found := false
   213  		for _, wantLabel := range possibleLabels {
   214  			if jumpLabel == string(wantLabel) {
   215  				found = true
   216  				break
   217  			}
   218  		}
   219  		if !found {
   220  			panic(fmt.Sprintf("fragment %v may jump to a label %q which is not one of %v", fragment, jumpLabel, possibleLabels))
   221  		}
   222  	}
   223  	for returnValue := range outcomes.MayReturnImmediate {
   224  		if _, found := possibleReturnValues[returnValue]; !found {
   225  			panic(fmt.Sprintf("fragment %v may return a value %q which is not one of %v", fragment, returnValue, possibleReturnValues))
   226  		}
   227  	}
   228  }
   229  
// labelSet keeps track of labels that individual rules may jump to if they
// either match or mismatch.
// It can generate unique label names (see `NewLabel`), and can be used
// recursively within rules (see `Push`).
type labelSet struct {
	// prefix is a label prefix used when generating label names.
	prefix string

	// labelCounter is used to generate unique label names.
	// It is incremented each time a label is generated.
	labelCounter int

	// ruleMatched is the label that a rule should jump to if it matches.
	ruleMatched label

	// ruleMismatched is the label that a rule should jump to if it doesn't
	// match.
	ruleMismatched label
}
   248  
   249  // NewLabel returns a new unique label.
   250  func (l *labelSet) NewLabel() label {
   251  	newLabel := label(fmt.Sprintf("%s#%d", l.prefix, l.labelCounter))
   252  	l.labelCounter++
   253  	return newLabel
   254  }
   255  
   256  // Matched returns the label to jump to if the rule matches.
   257  func (l *labelSet) Matched() label {
   258  	return l.ruleMatched
   259  }
   260  
   261  // Mismatched returns the label to jump to if the rule does not match.
   262  func (l *labelSet) Mismatched() label {
   263  	return l.ruleMismatched
   264  }
   265  
   266  // Push creates a new labelSet meant to be used in a recursive context of the
   267  // rule currently being rendered.
   268  // Labels generated by this new labelSet will have `labelSuffix` appended to
   269  // this labelSet's current prefix, and will have its matched/mismatched labels
   270  // point to the given labels.
   271  func (l *labelSet) Push(labelSuffix string, newRuleMatch, newRuleMismatch label) *labelSet {
   272  	newPrefix := labelSuffix
   273  	if l.prefix != "" {
   274  		newPrefix = fmt.Sprintf("%s_%s", l.prefix, labelSuffix)
   275  	}
   276  	return &labelSet{
   277  		prefix:         newPrefix,
   278  		ruleMatched:    newRuleMatch,
   279  		ruleMismatched: newRuleMismatch,
   280  	}
   281  }
   282  
// matchedValue keeps track of BPF instructions needed to load a 64-bit value
// being matched against. Since BPF can only do operations on 32-bit
// values, value-matching code needs to selectively load one or the
// other half of the 64-bit value.
type matchedValue struct {
	// program is the program that load instructions are added to.
	program *syscallProgram
	// dataOffsetHigh and dataOffsetLow are the absolute offsets from which
	// the high and low 32-bit halves of the value are loaded, respectively.
	dataOffsetHigh uint32
	dataOffsetLow  uint32
}
   292  
   293  // LoadHigh32Bits loads the high 32-bit of the 64-bit value into register A.
   294  func (m matchedValue) LoadHigh32Bits() {
   295  	m.program.Stmt(bpf.Ld|bpf.Abs|bpf.W, m.dataOffsetHigh)
   296  }
   297  
   298  // LoadLow32Bits loads the low 32-bit of the 64-bit value into register A.
   299  func (m matchedValue) LoadLow32Bits() {
   300  	m.program.Stmt(bpf.Ld|bpf.Abs|bpf.W, m.dataOffsetLow)
   301  }
   302  
// ProgramOptions configures a seccomp program.
type ProgramOptions struct {
	// DefaultAction is the action returned when none of the rules match.
	DefaultAction linux.BPFAction

	// BadArchAction is the action returned when the architecture of the
	// syscall structure input doesn't match the one the program expects.
	BadArchAction linux.BPFAction

	// Optimize specifies whether optimizations should be applied to the
	// syscall rules and generated BPF bytecode.
	Optimize bool

	// HotSyscalls is the set of syscall numbers that are the hottest,
	// where "hotness" refers to frequency (regardless of the amount of
	// computation that the kernel will do handling them, and regardless of
	// the complexity of the syscall rule for this).
	// It should only contain very hot syscalls (i.e. any syscall that is
	// called >10% of the time out of all syscalls made).
	// It is ordered from most frequent to least frequent.
	HotSyscalls []uintptr
}
   325  
   326  // DefaultProgramOptions returns the default program options.
   327  func DefaultProgramOptions() ProgramOptions {
   328  	action, err := DefaultAction()
   329  	if err != nil {
   330  		panic(fmt.Sprintf("cannot determine default seccomp action: %v", err))
   331  	}
   332  	return ProgramOptions{
   333  		DefaultAction: action,
   334  		BadArchAction: action,
   335  		Optimize:      true,
   336  	}
   337  }
   338  
// BuildStats contains information about seccomp program generation.
// It is returned by `BuildProgram`.
type BuildStats struct {
	// SizeBeforeOptimizations and SizeAfterOptimizations correspond to the
	// number of instructions in the program before vs after optimization.
	// They are equal when BPF bytecode optimizations are disabled.
	SizeBeforeOptimizations, SizeAfterOptimizations int

	// BuildDuration is the amount of time it took to build the program (before
	// BPF bytecode optimizations). It excludes RuleOptimizeDuration.
	BuildDuration time.Duration

	// RuleOptimizeDuration is the amount of time it took to run SyscallRule
	// optimizations.
	RuleOptimizeDuration time.Duration

	// BPFOptimizeDuration is the amount of time it took to run BPF bytecode
	// optimizations.
	BPFOptimizeDuration time.Duration
}
   357  
// BuildProgram builds a BPF program from the given list of RuleSets, each of
// which pairs a set of SyscallRules with the action to return when they
// match. The single generated program covers all provided RuleSets.
//
// The program first checks the architecture (returning
// `options.BadArchAction` on mismatch), then evaluates the rules, and returns
// `options.DefaultAction` if none of them matched.
//
// It returns the program's instructions along with statistics about the
// build. On error, the returned instructions are nil and the statistics are
// zero-valued.
func BuildProgram(rules []RuleSet, options ProgramOptions) ([]bpf.Instruction, BuildStats, error) {
	start := time.Now()
	// Make a copy of the syscall rules and maybe optimize them.
	ors, ruleOptimizeDuration, err := orderRuleSets(rules, options)
	if err != nil {
		return nil, BuildStats{}, err
	}

	// Collect the set of actions that the rendered rules are allowed to
	// return; it is used to sanity-check the generated code below.
	possibleActions := make(map[linux.BPFAction]struct{})
	for _, ruleSet := range rules {
		possibleActions[ruleSet.Action] = struct{}{}
	}

	program := &syscallProgram{
		program: bpf.NewProgramBuilder(),
	}

	// Be paranoid and check that syscall is done in the expected architecture.
	//
	// A = seccomp_data.arch
	// if (A != AUDIT_ARCH) goto badArchLabel.
	badArchLabel := label("badarch")
	program.Stmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch)
	program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, badArchLabel)
	// Record everything the rules emit so that it can be verified to either
	// return one of the known actions or jump to the default label.
	orsFrag := program.Record()
	if err := ors.render(program); err != nil {
		return nil, BuildStats{}, err
	}
	orsFrag.MustHaveJumpedToOrReturned([]label{defaultLabel}, possibleActions)

	// Default label if none of the rules matched:
	program.Label(defaultLabel)
	program.Ret(options.DefaultAction)

	// Label if the architecture didn't match:
	program.Label(badArchLabel)
	program.Ret(options.BadArchAction)

	insns, err := program.program.Instructions()
	if err != nil {
		return nil, BuildStats{}, err
	}
	beforeOpt := len(insns)
	// Everything so far except rule optimization counts as build time.
	buildDuration := time.Since(start) - ruleOptimizeDuration
	var bpfOptimizeDuration time.Duration
	afterOpt := beforeOpt
	if options.Optimize {
		insns = bpf.Optimize(insns)
		// Whatever elapsed beyond build and rule optimization was spent in
		// bpf.Optimize.
		bpfOptimizeDuration = time.Since(start) - buildDuration - ruleOptimizeDuration
		afterOpt = len(insns)
		log.Debugf("Seccomp program optimized from %d to %d instructions; took %v to build and %v to optimize", beforeOpt, afterOpt, buildDuration, bpfOptimizeDuration)
	}
	return insns, BuildStats{
		SizeBeforeOptimizations: beforeOpt,
		SizeAfterOptimizations:  afterOpt,
		BuildDuration:           buildDuration,
		RuleOptimizeDuration:    ruleOptimizeDuration,
		BPFOptimizeDuration:     bpfOptimizeDuration,
	}, nil
}
   420  
// singleSyscallRuleSet represents what to do for a single syscall.
// It is used inside `orderedRuleSets`.
type singleSyscallRuleSet struct {
	// sysno is the syscall number this ruleset applies to.
	sysno uintptr
	// rules are the rule/action pairs for this syscall, rendered in order.
	rules []syscallRuleAction
	// vsyscall is whether a vsyscall check is emitted before the rules.
	vsyscall bool
}
   428  
   429  // Render renders the ruleset for this syscall.
   430  func (ssrs singleSyscallRuleSet) Render(program *syscallProgram, ls *labelSet, noMatch label) {
   431  	frag := program.Record()
   432  	if ssrs.vsyscall {
   433  		// Emit a vsyscall check.
   434  		// This rule ensures that the top bit is set in the
   435  		// instruction pointer, which is where the vsyscall page
   436  		// will be mapped.
   437  		program.Stmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
   438  		program.IfNot(bpf.Jmp|bpf.Jset|bpf.K, vsyscallPageIPMask, noMatch)
   439  	}
   440  	var nextRule label
   441  	actions := make(map[linux.BPFAction]struct{})
   442  	for i, ra := range ssrs.rules {
   443  		actions[ra.action] = struct{}{}
   444  
   445  		// Render the rule.
   446  		nextRule = ls.NewLabel()
   447  		ruleLabels := ls.Push(fmt.Sprintf("sysno%d_rule%d", ssrs.sysno, i), ls.NewLabel(), nextRule)
   448  		ruleFrag := program.Record()
   449  		ra.rule.Render(program, ruleLabels)
   450  		program.Label(ruleLabels.Matched())
   451  		program.Ret(ra.action)
   452  		ruleFrag.MustHaveJumpedToOrReturned(
   453  			[]label{nextRule},
   454  			map[linux.BPFAction]struct{}{
   455  				ra.action: struct{}{},
   456  			})
   457  		program.Label(nextRule)
   458  	}
   459  	program.JumpTo(noMatch)
   460  	frag.MustHaveJumpedToOrReturned([]label{noMatch}, actions)
   461  }
   462  
   463  // String returns a human-friendly representation of the
   464  // `singleSyscallRuleSet`.
   465  func (ssrs singleSyscallRuleSet) String() string {
   466  	var sb strings.Builder
   467  	if ssrs.vsyscall {
   468  		sb.WriteString("Vsyscall ")
   469  	} else {
   470  		sb.WriteString("Syscall  ")
   471  	}
   472  	sb.WriteString(fmt.Sprintf("%3d: ", ssrs.sysno))
   473  	switch len(ssrs.rules) {
   474  	case 0:
   475  		sb.WriteString("(no rules)")
   476  	case 1:
   477  		sb.WriteString(ssrs.rules[0].String())
   478  	default:
   479  		sb.WriteRune('{')
   480  		for i, r := range ssrs.rules {
   481  			if i != 0 {
   482  				sb.WriteString("; ")
   483  			}
   484  			sb.WriteString(r.String())
   485  		}
   486  		sb.WriteRune('}')
   487  	}
   488  	return sb.String()
   489  }
   490  
// syscallRuleAction groups a `SyscallRule` and an action that should be
// returned if the rule matches.
type syscallRuleAction struct {
	// rule is the condition to check.
	rule SyscallRule
	// action is what the program returns when rule matches.
	action linux.BPFAction
}
   497  
   498  // String returns a human-friendly representation of the `syscallRuleAction`.
   499  func (sra syscallRuleAction) String() string {
   500  	if _, isMatchAll := sra.rule.(MatchAll); isMatchAll {
   501  		return sra.action.String()
   502  	}
   503  	return fmt.Sprintf("(%v) => %v", sra.rule.String(), sra.action)
   504  }
   505  
// orderedRuleSets contains an ordering of syscall rules used to render a
// program. It is derived from a list of `RuleSet`s and `ProgramOptions`.
// Its fields represent the order in which rulesets are rendered.
// There are three categorization criteria:
//   - "Hot" vs "cold": hot syscalls go first and are checked linearly, cold
//     syscalls go later.
//   - "Trivial" vs "non-trivial": A "trivial" syscall rule means one that
//     does not require checking any argument or RIP data. This basically
//     means a syscall mapped to `MatchAll{}`.
//     If a syscall shows up in multiple RuleSets where any of them is
//     non-trivial, the whole syscall is considered non-trivial.
//   - "vsyscall" vs "non-vsyscall": A syscall that needs vsyscall checking
//     checks that the function is dispatched from the vsyscall page by
//     checking RIP. This inherently makes it non-trivial. All trivial
//     rules are non-vsyscall, but not all non-vsyscall rules are trivial.
type orderedRuleSets struct {
	// hotNonTrivial is the set of hot syscalls that are non-trivial
	// and may or may not require vsyscall checking.
	// They come first and are checked linearly using `hotNonTrivialOrder`.
	hotNonTrivial map[uintptr]singleSyscallRuleSet

	// hotNonTrivialOrder lists the syscall numbers of `hotNonTrivial` in
	// the order in which they are checked.
	hotNonTrivialOrder []uintptr

	// coldNonTrivial is the set of non-hot syscalls that are non-trivial.
	// They may or may not require vsyscall checking.
	// They come second.
	coldNonTrivial map[uintptr]singleSyscallRuleSet

	// trivial is the set of syscalls that are trivial. They may or may not be
	// hot, but they may not require vsyscall checking (otherwise they would
	// be non-trivial).
	// They come last. This is because the host kernel will cache the results
	// of these system calls, and will never execute them on the hot path.
	trivial map[uintptr]singleSyscallRuleSet
}
   544  
// orderRuleSets converts a set of `RuleSet`s into an `orderedRuleSets`.
// It returns the ordered rulesets, along with the time spent optimizing the
// rules (zero if `options.Optimize` is false).
// It returns an error if a syscall number appears in multiple RuleSets that
// disagree on vsyscall checking.
func orderRuleSets(rules []RuleSet, options ProgramOptions) (orderedRuleSets, time.Duration, error) {
	// Do a pass to determine if vsyscall is consistent across syscall numbers.
	vsyscallBySysno := make(map[uintptr]bool)
	for _, rs := range rules {
		for sysno := range rs.Rules.rules {
			if prevVsyscall, ok := vsyscallBySysno[sysno]; ok {
				if prevVsyscall != rs.Vsyscall {
					return orderedRuleSets{}, 0, fmt.Errorf("syscall %d has conflicting vsyscall checking rules", sysno)
				}
			} else {
				vsyscallBySysno[sysno] = rs.Vsyscall
			}
		}
	}

	// Build a single map of per-syscall syscallRuleActions.
	// We will split this map up later.
	allSyscallRuleActions := make(map[uintptr][]syscallRuleAction)
	for _, rs := range rules {
		for sysno, rule := range rs.Rules.rules {
			existing, found := allSyscallRuleActions[sysno]
			if !found {
				allSyscallRuleActions[sysno] = []syscallRuleAction{{
					rule:   rule,
					action: rs.Action,
				}}
				continue
			}
			if existing[len(existing)-1].action == rs.Action {
				// If the last action for this syscall is the same, union the rules.
				existing[len(existing)-1].rule = Or{existing[len(existing)-1].rule, rule}
			} else {
				// Otherwise, add it as a new ruleset.
				existing = append(existing, syscallRuleAction{
					rule:   rule,
					action: rs.Action,
				})
			}
			allSyscallRuleActions[sysno] = existing
		}
	}

	// Optimize all rules.
	var optimizeDuration time.Duration
	if options.Optimize {
		optimizeStart := time.Now()
		for _, ruleActions := range allSyscallRuleActions {
			for i, ra := range ruleActions {
				// `ra` is a copy; write the optimized rule back into the slice.
				ra.rule = optimizeSyscallRule(ra.rule)
				ruleActions[i] = ra
			}
		}
		optimizeDuration = time.Since(optimizeStart)
	}

	// Do a pass that checks which syscall numbers are trivial.
	// A syscall number is trivial only if all of its rules are.
	isTrivial := make(map[uintptr]bool)
	for sysno, ruleActions := range allSyscallRuleActions {
		for _, ra := range ruleActions {
			_, isMatchAll := ra.rule.(MatchAll)
			isVsyscall := vsyscallBySysno[sysno]
			trivial := isMatchAll && !isVsyscall
			if prevTrivial, ok := isTrivial[sysno]; ok {
				isTrivial[sysno] = prevTrivial && trivial
			} else {
				isTrivial[sysno] = trivial
			}
		}
	}

	// Compute the set of non-trivial hot syscalls and their order.
	// The map value is the syscall's position in `options.HotSyscalls`.
	hotNonTrivialSyscallsIndex := make(map[uintptr]int, len(options.HotSyscalls))
	for i, sysno := range options.HotSyscalls {
		if _, hasRule := allSyscallRuleActions[sysno]; !hasRule {
			continue
		}
		if isTrivial[sysno] {
			continue
		}
		if _, ok := hotNonTrivialSyscallsIndex[sysno]; ok {
			// Duplicate entry: keep the first position seen.
			continue
		}
		hotNonTrivialSyscallsIndex[sysno] = i
	}
	hotNonTrivialOrder := make([]uintptr, 0, len(hotNonTrivialSyscallsIndex))
	for sysno := range hotNonTrivialSyscallsIndex {
		hotNonTrivialOrder = append(hotNonTrivialOrder, sysno)
	}
	// Map iteration order is random; restore the order of `options.HotSyscalls`.
	sort.Slice(hotNonTrivialOrder, func(i, j int) bool {
		return hotNonTrivialSyscallsIndex[hotNonTrivialOrder[i]] < hotNonTrivialSyscallsIndex[hotNonTrivialOrder[j]]
	})

	// Now split up the map and build the `orderedRuleSets`.
	ors := orderedRuleSets{
		hotNonTrivial:      make(map[uintptr]singleSyscallRuleSet),
		hotNonTrivialOrder: hotNonTrivialOrder,
		coldNonTrivial:     make(map[uintptr]singleSyscallRuleSet),
		trivial:            make(map[uintptr]singleSyscallRuleSet),
	}
	for sysno, ruleActions := range allSyscallRuleActions {
		_, hot := hotNonTrivialSyscallsIndex[sysno]
		trivial := isTrivial[sysno]
		var subMap map[uintptr]singleSyscallRuleSet
		switch {
		case trivial:
			subMap = ors.trivial
		case hot:
			subMap = ors.hotNonTrivial
		default:
			subMap = ors.coldNonTrivial
		}
		subMap[sysno] = singleSyscallRuleSet{
			sysno:    sysno,
			vsyscall: vsyscallBySysno[sysno],
			rules:    ruleActions,
		}
	}

	// Log our findings.
	if log.IsLogging(log.Debug) {
		ors.log(log.Debugf)
	}

	return ors, optimizeDuration, nil
}
   673  
   674  // log logs the set of seccomp rules to the given logger.
   675  func (ors orderedRuleSets) log(logFn func(string, ...any)) {
   676  	logFn("Ordered seccomp rules:")
   677  	for _, sm := range []struct {
   678  		name string
   679  		m    map[uintptr]singleSyscallRuleSet
   680  	}{
   681  		{"Hot non-trivial", ors.hotNonTrivial},
   682  		{"Cold non-trivial", ors.coldNonTrivial},
   683  		{"Trivial", ors.trivial},
   684  	} {
   685  		if len(sm.m) == 0 {
   686  			logFn("  %s syscalls: None.", sm.name)
   687  			continue
   688  		}
   689  		logFn("  %s syscalls:", sm.name)
   690  		orderedSysnos := make([]int, 0, len(sm.m))
   691  		for sysno := range sm.m {
   692  			orderedSysnos = append(orderedSysnos, int(sysno))
   693  		}
   694  		sort.Ints(orderedSysnos)
   695  		for _, sysno := range orderedSysnos {
   696  			logFn("    - %s", sm.m[uintptr(sysno)].String())
   697  		}
   698  	}
   699  	logFn("End of ordered seccomp rules.")
   700  }
   701  
   702  // render renders all rulesets in the given program.
   703  func (ors orderedRuleSets) render(program *syscallProgram) error {
   704  	ls := &labelSet{prefix: string("ors")}
   705  
   706  	// totalFrag wraps the entire output of the `render` function.
   707  	totalFrag := program.Record()
   708  
   709  	// Load syscall number into register A.
   710  	program.Stmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR)
   711  
   712  	// Keep track of which syscalls we've already looked for.
   713  	sysnosChecked := make(map[uintptr]struct{})
   714  
   715  	// First render hot syscalls linearly.
   716  	if len(ors.hotNonTrivialOrder) > 0 {
   717  		notHotLabel := ls.NewLabel()
   718  		// hotFrag wraps the "hot syscalls" part of the program.
   719  		// It must either return one of `hotActions`, or jump to `defaultLabel` if
   720  		// the syscall number matched but the vsyscall match failed, or
   721  		// `notHotLabel` if none of the hot syscall numbers matched.
   722  		hotFrag := program.Record()
   723  		possibleActions := ors.renderLinear(program, ls, sysnosChecked, ors.hotNonTrivial, ors.hotNonTrivialOrder, notHotLabel)
   724  		hotFrag.MustHaveJumpedToOrReturned([]label{notHotLabel, defaultLabel}, possibleActions)
   725  		program.Label(notHotLabel)
   726  	}
   727  
   728  	// Now render the cold non-trivial rules as a binary search tree:
   729  	if len(ors.coldNonTrivial) > 0 {
   730  		frag := program.Record()
   731  		noSycallNumberMatch := ls.NewLabel()
   732  		possibleActions, err := ors.renderBST(program, ls, sysnosChecked, ors.coldNonTrivial, noSycallNumberMatch)
   733  		if err != nil {
   734  			return err
   735  		}
   736  		frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions)
   737  		program.Label(noSycallNumberMatch)
   738  	}
   739  
   740  	// Finally render the trivial rules as a binary search tree:
   741  	if len(ors.trivial) > 0 {
   742  		frag := program.Record()
   743  		noSycallNumberMatch := ls.NewLabel()
   744  		possibleActions, err := ors.renderBST(program, ls, sysnosChecked, ors.trivial, noSycallNumberMatch)
   745  		if err != nil {
   746  			return err
   747  		}
   748  		frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions)
   749  		program.Label(noSycallNumberMatch)
   750  	}
   751  	program.JumpTo(defaultLabel)
   752  
   753  	// Reached the end of the program.
   754  	// Independently verify the set of all possible actions.
   755  	allPossibleActions := make(map[linux.BPFAction]struct{})
   756  	for _, mapping := range []map[uintptr]singleSyscallRuleSet{
   757  		ors.hotNonTrivial,
   758  		ors.coldNonTrivial,
   759  		ors.trivial,
   760  	} {
   761  		for _, ssrs := range mapping {
   762  			for _, ra := range ssrs.rules {
   763  				allPossibleActions[ra.action] = struct{}{}
   764  			}
   765  		}
   766  	}
   767  	totalFrag.MustHaveJumpedToOrReturned([]label{defaultLabel}, allPossibleActions)
   768  	return nil
   769  }
   770  
   771  // renderLinear renders linear search code that searches for syscall matches
   772  // in the given order. It assumes the syscall number is loaded into register
   773  // A. Rulesets for all syscall numbers in `order` must exist in `syscallMap`.
   774  // It returns the list of possible actions the generated code may return.
   775  // `alreadyChecked` will be updated with the syscalls that have been checked.
   776  func (ors orderedRuleSets) renderLinear(program *syscallProgram, ls *labelSet, alreadyChecked map[uintptr]struct{}, syscallMap map[uintptr]singleSyscallRuleSet, order []uintptr, noSycallNumberMatch label) map[linux.BPFAction]struct{} {
   777  	allActions := make(map[linux.BPFAction]struct{})
   778  	for _, sysno := range order {
   779  		ssrs, found := syscallMap[sysno]
   780  		if !found {
   781  			panic(fmt.Sprintf("syscall %d found in linear order but not map", sysno))
   782  		}
   783  		nextSyscall := ls.NewLabel()
   784  		// sysnoFrag wraps the "statements about this syscall number" part of
   785  		// the program. It must either return one of the actions specified in
   786  		// that syscall number's rules (`sysnoActions`), or jump to
   787  		// `nextSyscall`.
   788  		sysnoFrag := program.Record()
   789  		sysnoActions := make(map[linux.BPFAction]struct{})
   790  		for _, ra := range ssrs.rules {
   791  			sysnoActions[ra.action] = struct{}{}
   792  			allActions[ra.action] = struct{}{}
   793  		}
   794  		program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, uint32(ssrs.sysno), nextSyscall)
   795  		ssrs.Render(program, ls, defaultLabel)
   796  		sysnoFrag.MustHaveJumpedToOrReturned([]label{nextSyscall, defaultLabel}, sysnoActions)
   797  		program.Label(nextSyscall)
   798  	}
   799  	program.JumpTo(noSycallNumberMatch)
   800  	for _, sysno := range order {
   801  		alreadyChecked[sysno] = struct{}{}
   802  	}
   803  	return allActions
   804  }
   805  
   806  // renderBST renders a binary search tree that searches the given map of
   807  // syscalls. It assumes the syscall number is loaded into register A.
   808  // It returns the list of possible actions the generated code may return.
   809  // `alreadyChecked` will be updated with the syscalls that the BST has
   810  // searched.
   811  func (ors orderedRuleSets) renderBST(program *syscallProgram, ls *labelSet, alreadyChecked map[uintptr]struct{}, syscallMap map[uintptr]singleSyscallRuleSet, noSycallNumberMatch label) (map[linux.BPFAction]struct{}, error) {
   812  	possibleActions := make(map[linux.BPFAction]struct{})
   813  	orderedSysnos := make([]uintptr, 0, len(syscallMap))
   814  	for sysno, ruleActions := range syscallMap {
   815  		orderedSysnos = append(orderedSysnos, sysno)
   816  		for _, ra := range ruleActions.rules {
   817  			possibleActions[ra.action] = struct{}{}
   818  		}
   819  	}
   820  	sort.Slice(orderedSysnos, func(i, j int) bool {
   821  		return orderedSysnos[i] < orderedSysnos[j]
   822  	})
   823  	frag := program.Record()
   824  	root := createBST(orderedSysnos)
   825  	root.root = true
   826  	knownRng := knownRange{
   827  		lowerBoundExclusive: -1,
   828  		// sysno fits in 32 bits, so this is definitely out of bounds:
   829  		upperBoundExclusive: 1 << 32,
   830  		previouslyChecked:   alreadyChecked,
   831  	}
   832  	if err := root.traverse(renderBSTTraversal, knownRng, syscallMap, program, noSycallNumberMatch); err != nil {
   833  		return nil, err
   834  	}
   835  	if err := root.traverse(renderBSTRules, knownRng, syscallMap, program, noSycallNumberMatch); err != nil {
   836  		return nil, err
   837  	}
   838  	frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions)
   839  	for sysno := range syscallMap {
   840  		alreadyChecked[sysno] = struct{}{}
   841  	}
   842  	return possibleActions, nil
   843  }
   844  
   845  // createBST converts sorted syscall slice into a balanced BST.
   846  // Panics if syscalls is empty.
   847  func createBST(syscalls []uintptr) *node {
   848  	i := len(syscalls) / 2
   849  	parent := node{value: syscalls[i]}
   850  	if i > 0 {
   851  		parent.left = createBST(syscalls[:i])
   852  	}
   853  	if i+1 < len(syscalls) {
   854  		parent.right = createBST(syscalls[i+1:])
   855  	}
   856  	return &parent
   857  }
   858  
   859  // renderBSTTraversal renders the traversal bytecode for a binary search tree.
   860  // The outline of the code is as follows, given a BST with:
   861  //
   862  //		     22
   863  //		    /  \
   864  //		   9    24
   865  //		  /    /  \
   866  //	   8   23    50
   867  //
   868  //		index_22: // SYS_PIPE(22), root
   869  //		(A < 22) ? goto index_9  : continue
   870  //		(A > 22) ? goto index_24 : continue
   871  //		goto checkArgs_22
   872  //
   873  //		index_9: // SYS_MMAP(9), single child
   874  //		(A < 9)  ? goto index_8  : continue
   875  //		(A == 9) ? continue : goto defaultLabel
   876  //		goto checkArgs_9
   877  //
   878  //		index_8: // SYS_LSEEK(8), leaf
   879  //		(A == 8) ? continue : goto defaultLabel
   880  //		goto checkArgs_8
   881  //
   882  //		index_24: // SYS_SCHED_YIELD(24)
   883  //		(A < 24) ? goto index_23 : continue
   884  //		(A > 22) ? goto index_50 : continue
   885  //		goto checkArgs_24
   886  //
   887  //		index_23: // SYS_SELECT(23), leaf with parent nodes adjacent in value
   888  //		# Notice that we do not check for equality at all here, since we've
   889  //		# already established that the syscall number is 23 from the
   890  //		# two parent nodes that we've already traversed.
   891  //		# This is tracked in the `rng knownRange` argument during traversal.
   892  //		goto rules_23
   893  //
   894  //		index_50: // SYS_LISTEN(50), leaf
   895  //		(A == 50) ? continue : goto defaultLabel
   896  //		goto checkArgs_50
   897  //
   898  // All of the "checkArgs_XYZ" labels are not defined in this function; they
   899  // are created using the `renderBSTRules` function, which is expected to be
   900  // called after this one on the entire BST.
   901  func renderBSTTraversal(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error {
   902  	// Root node is never referenced by label, skip it.
   903  	if !n.root {
   904  		program.Label(n.label())
   905  	}
   906  	sysno := n.value
   907  	nodeFrag := program.Record()
   908  	checkArgsLabel := label(fmt.Sprintf("checkArgs_%d", sysno))
   909  	if n.left != nil {
   910  		program.IfNot(bpf.Jmp|bpf.Jge|bpf.K, uint32(sysno), n.left.label())
   911  		rng.lowerBoundExclusive = int(sysno - 1)
   912  	}
   913  	if n.right != nil {
   914  		program.If(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), n.right.label())
   915  		rng.upperBoundExclusive = int(sysno + 1)
   916  	}
   917  	if rng.lowerBoundExclusive != int(sysno-1) || rng.upperBoundExclusive != int(sysno+1) {
   918  		// If the previous BST nodes we traversed haven't fully established
   919  		// that the current node's syscall value is exactly `sysno`, we still
   920  		// need to verify it.
   921  		program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), searchFailed)
   922  	}
   923  	program.JumpTo(checkArgsLabel)
   924  	nodeFrag.MustHaveJumpedTo(n.left.label(), n.right.label(), checkArgsLabel, searchFailed)
   925  	return nil
   926  }
   927  
   928  // renderBSTRules renders the `checkArgs_XYZ` labels that `renderBSTTraversal`
   929  // jumps to as part of the BST traversal code. It contains all the
   930  // argument-specific syscall rules for each syscall number.
   931  func renderBSTRules(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error {
   932  	sysno := n.value
   933  	checkArgsLabel := label(fmt.Sprintf("checkArgs_%d", sysno))
   934  	program.Label(checkArgsLabel)
   935  	ruleSetsFrag := program.Record()
   936  	possibleActions := make(map[linux.BPFAction]struct{})
   937  	for _, ra := range syscallMap[sysno].rules {
   938  		possibleActions[ra.action] = struct{}{}
   939  	}
   940  	nodeLabelSet := &labelSet{prefix: string(n.label())}
   941  	syscallMap[sysno].Render(program, nodeLabelSet, defaultLabel)
   942  	ruleSetsFrag.MustHaveJumpedToOrReturned(
   943  		[]label{
   944  			defaultLabel, // Either we jumped to the default label (if the rules didn't match)...
   945  		},
   946  		possibleActions, // ... or we returned one of the actions of the rulesets.
   947  	)
   948  	return nil
   949  }
   950  
// node represents a tree node of the binary search tree built by createBST.
//
// Fields:
//   - value: the syscall number held by this node.
//   - left:  subtree built from the values that sorted before `value`;
//     nil if there are none.
//   - right: subtree built from the values that sorted after `value`;
//     nil if there are none.
//   - root:  set by renderBST on the topmost node of the tree. The
//     traversal code does not emit a label for a node with root set.
type node struct {
	value uintptr
	left  *node
	right *node
	root  bool
}
   958  
   959  // label returns the label corresponding to this node.
   960  //
   961  // If n is nil, then the defaultLabel is returned.
   962  func (n *node) label() label {
   963  	if n == nil {
   964  		return defaultLabel
   965  	}
   966  	return label(fmt.Sprintf("node_%d", n.value))
   967  }
   968  
// knownRange represents what is already known about the syscall number at a
// given point of the BST traversal: it lies strictly between
// lowerBoundExclusive and upperBoundExclusive. This is used as part of BST
// traversal to decide whether an explicit equality check is still needed.
type knownRange struct {
	lowerBoundExclusive int
	upperBoundExclusive int

	// previouslyChecked is a set of node values that were already checked
	// earlier in the program (prior to the BST being built).
	// It is *not* updated during BST traversal.
	previouslyChecked map[uintptr]struct{}
}
   980  
   981  // withLowerBoundExclusive returns an updated `knownRange` with the given
   982  // new exclusive lower bound. The actual exclusive lower bound of the
   983  // returned `knownRange` may be higher, in case `previouslyChecked` covers
   984  // more numbers.
   985  func (kr knownRange) withLowerBoundExclusive(newLowerBoundExclusive int) knownRange {
   986  	nkr := knownRange{
   987  		lowerBoundExclusive: newLowerBoundExclusive,
   988  		upperBoundExclusive: kr.upperBoundExclusive,
   989  		previouslyChecked:   kr.previouslyChecked,
   990  	}
   991  	for ; nkr.lowerBoundExclusive < nkr.upperBoundExclusive; nkr.lowerBoundExclusive++ {
   992  		if _, ok := nkr.previouslyChecked[uintptr(nkr.lowerBoundExclusive+1)]; !ok {
   993  			break
   994  		}
   995  	}
   996  	return nkr
   997  }
   998  
   999  // withUpperBoundExclusive returns an updated `knownRange` with the given
  1000  // new exclusive upper bound. The actual exclusive upper bound of the
  1001  // returned `knownRange` may be lower, in case `previouslyChecked` covers
  1002  // more numbers.
  1003  func (kr knownRange) withUpperBoundExclusive(newUpperBoundExclusive int) knownRange {
  1004  	nkr := knownRange{
  1005  		lowerBoundExclusive: kr.lowerBoundExclusive,
  1006  		upperBoundExclusive: newUpperBoundExclusive,
  1007  		previouslyChecked:   kr.previouslyChecked,
  1008  	}
  1009  	for ; nkr.lowerBoundExclusive < nkr.upperBoundExclusive; nkr.upperBoundExclusive-- {
  1010  		if _, ok := nkr.previouslyChecked[uintptr(nkr.upperBoundExclusive-1)]; !ok {
  1011  			break
  1012  		}
  1013  	}
  1014  	return nkr
  1015  }
  1016  
// traverseFunc is called as the BST is traversed, once for every node that
// (*node).traverse visits. Its arguments are, in order: the node being
// visited, the knownRange that applies when that node is reached, and the
// syscall ruleset map, program and search-failure label that were given to
// traverse. Returning a non-nil error stops the traversal; traverse returns
// that error to its caller.
type traverseFunc func(*node, knownRange, map[uintptr]singleSyscallRuleSet, *syscallProgram, label) error
  1019  
  1020  func (n *node) traverse(fn traverseFunc, kr knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error {
  1021  	if n == nil {
  1022  		return nil
  1023  	}
  1024  	if err := fn(n, kr, syscallMap, program, searchFailed); err != nil {
  1025  		return err
  1026  	}
  1027  	if err := n.left.traverse(
  1028  		fn,
  1029  		kr.withUpperBoundExclusive(int(n.value)),
  1030  		syscallMap,
  1031  		program,
  1032  		searchFailed,
  1033  	); err != nil {
  1034  		return err
  1035  	}
  1036  	return n.right.traverse(
  1037  		fn,
  1038  		kr.withLowerBoundExclusive(int(n.value)),
  1039  		syscallMap,
  1040  		program,
  1041  		searchFailed,
  1042  	)
  1043  }
  1044  
  1045  // DataAsBPFInput converts a linux.SeccompData to a bpf.Input.
  1046  // It uses `buf` as scratch buffer; this buffer must be wide enough
  1047  // to accommodate a mashaled version of `d`.
  1048  func DataAsBPFInput(d *linux.SeccompData, buf []byte) bpf.Input {
  1049  	if len(buf) < d.SizeBytes() {
  1050  		panic(fmt.Sprintf("buffer must be at least %d bytes long", d.SizeBytes()))
  1051  	}
  1052  	d.MarshalUnsafe(buf)
  1053  	return buf[:d.SizeBytes()]
  1054  }