github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/seccomp/seccomp.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package seccomp provides generation of basic seccomp filters. Currently, 16 // only little endian systems are supported. 17 package seccomp 18 19 import ( 20 "fmt" 21 "reflect" 22 "sort" 23 24 "github.com/SagerNet/gvisor/pkg/abi/linux" 25 "github.com/SagerNet/gvisor/pkg/bpf" 26 "github.com/SagerNet/gvisor/pkg/log" 27 ) 28 29 const ( 30 // skipOneInst is the offset to take for skipping one instruction. 31 skipOneInst = 1 32 33 // defaultLabel is the label for the default action. 34 defaultLabel = "default_action" 35 ) 36 37 // Install generates BPF code based on the set of syscalls provided. It only 38 // allows syscalls that conform to the specification. Syscalls that violate the 39 // specification will trigger RET_KILL_PROCESS. If RET_KILL_PROCESS is not 40 // supported, violations will trigger RET_TRAP instead. RET_KILL_THREAD is not 41 // used because it only kills the offending thread and often keeps the sentry 42 // hanging. 43 // 44 // Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored, 45 // making it possible for the process to continue running after a violation. 46 // However, it will leave a SECCOMP audit event trail behind. In any case, the 47 // syscall is still blocked from executing. 
// Install generates BPF code from rules and installs it as this process's
// seccomp filter via SetFilter. Violations trigger the action chosen by
// defaultAction below.
func Install(rules SyscallRules) error {
	// Note: the local variable shadows the defaultAction function from here
	// on; the call on the right-hand side still resolves to the function.
	defaultAction, err := defaultAction()
	if err != nil {
		return err
	}

	// Uncomment to get stack trace when there is a violation.
	// defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP)

	log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction)

	instrs, err := BuildProgram([]RuleSet{
		{
			Rules:  rules,
			Action: linux.SECCOMP_RET_ALLOW,
		},
	}, defaultAction, defaultAction)
	// The debug dump is deliberately attempted before the error check so a
	// program can be inspected at debug level; on BuildProgram failure,
	// instrs is nil and the decode reflects that.
	if log.IsLogging(log.Debug) {
		programStr, errDecode := bpf.DecodeInstructions(instrs)
		if errDecode != nil {
			programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr)
		}
		log.Debugf("Seccomp program dump:\n%s", programStr)
	}
	if err != nil {
		return err
	}

	// Perform the actual installation.
	if errno := SetFilter(instrs); errno != 0 {
		return fmt.Errorf("failed to set filter: %v", errno)
	}

	log.Infof("Seccomp filters installed.")
	return nil
}

// defaultAction returns the action taken on a rule violation:
// SECCOMP_RET_KILL_PROCESS when the kernel supports it, otherwise
// SECCOMP_RET_TRAP (see the package comment on Install for the trade-off).
func defaultAction() (linux.BPFAction, error) {
	available, err := isKillProcessAvailable()
	if err != nil {
		return 0, err
	}
	if available {
		return linux.SECCOMP_RET_KILL_PROCESS, nil
	}
	return linux.SECCOMP_RET_TRAP, nil
}

// RuleSet is a set of rules and associated action.
type RuleSet struct {
	// Rules maps syscall numbers to their argument-matching rules.
	Rules SyscallRules
	// Action is the BPF action returned when a rule in this set matches.
	Action linux.BPFAction

	// Vsyscall indicates that a check is made for a function being called
	// from kernel mappings. This is where the vsyscall page is located
	// (and typically emulated), so this RuleSet will not match any
	// functions not dispatched from the vsyscall page.
	Vsyscall bool
}

// SyscallName gives names to system calls. It is used purely for debugging purposes.
//
// An alternate namer can be provided to the package at initialization time.
111 var SyscallName = func(sysno uintptr) string { 112 return fmt.Sprintf("syscall_%d", sysno) 113 } 114 115 // BuildProgram builds a BPF program from the given map of actions to matching 116 // SyscallRules. The single generated program covers all provided RuleSets. 117 func BuildProgram(rules []RuleSet, defaultAction, badArchAction linux.BPFAction) ([]linux.BPFInstruction, error) { 118 program := bpf.NewProgramBuilder() 119 120 // Be paranoid and check that syscall is done in the expected architecture. 121 // 122 // A = seccomp_data.arch 123 // if (A != AUDIT_ARCH) goto defaultAction. 124 program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) 125 // defaultLabel is at the bottom of the program. The size of program 126 // may exceeds 255 lines, which is the limit of a condition jump. 127 program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0) 128 program.AddStmt(bpf.Ret|bpf.K, uint32(badArchAction)) 129 if err := buildIndex(rules, program); err != nil { 130 return nil, err 131 } 132 133 // Exhausted: return defaultAction. 134 if err := program.AddLabel(defaultLabel); err != nil { 135 return nil, err 136 } 137 program.AddStmt(bpf.Ret|bpf.K, uint32(defaultAction)) 138 139 return program.Instructions() 140 } 141 142 // buildIndex builds a BST to quickly search through all syscalls. 143 func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error { 144 // Do nothing if rules is empty. 145 if len(rules) == 0 { 146 return nil 147 } 148 149 // Build a list of all application system calls, across all given rule 150 // sets. We have a simple BST, but may dispatch individual matchers 151 // with different actions. The matchers are evaluated linearly. 
152 requiredSyscalls := make(map[uintptr]struct{}) 153 for _, rs := range rules { 154 for sysno := range rs.Rules { 155 requiredSyscalls[sysno] = struct{}{} 156 } 157 } 158 syscalls := make([]uintptr, 0, len(requiredSyscalls)) 159 for sysno := range requiredSyscalls { 160 syscalls = append(syscalls, sysno) 161 } 162 sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) 163 for _, sysno := range syscalls { 164 for _, rs := range rules { 165 // Print only if there is a corresponding set of rules. 166 if _, ok := rs.Rules[sysno]; ok { 167 log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action) 168 } 169 } 170 } 171 172 root := createBST(syscalls) 173 root.root = true 174 175 // Load syscall number into A and run through BST. 176 // 177 // A = seccomp_data.nr 178 program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) 179 return root.traverse(buildBSTProgram, rules, program) 180 } 181 182 // createBST converts sorted syscall slice into a balanced BST. 183 // Panics if syscalls is empty. 184 func createBST(syscalls []uintptr) *node { 185 i := len(syscalls) / 2 186 parent := node{value: syscalls[i]} 187 if i > 0 { 188 parent.left = createBST(syscalls[:i]) 189 } 190 if i+1 < len(syscalls) { 191 parent.right = createBST(syscalls[i+1:]) 192 } 193 return &parent 194 } 195 196 func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string { 197 return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno) 198 } 199 200 func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string { 201 return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx) 202 } 203 204 func ruleLabel(ruleSetIdx int, sysno uintptr, idx int, name string) string { 205 return fmt.Sprintf("rule_%v_%v_%v_%v", ruleSetIdx, sysno, idx, name) 206 } 207 208 func checkArgsLabel(sysno uintptr) string { 209 return fmt.Sprintf("checkArgs_%v", sysno) 210 } 211 212 // addSyscallArgsCheck adds argument checks for a single system call. 
// It does not insert a jump to the default action at the end; it is the
// responsibility of the caller to insert an appropriate jump after calling
// this function.
func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAction, ruleSetIdx int, sysno uintptr) error {
	for ruleidx, rule := range rules {
		labelled := false
		for i, arg := range rule {
			if arg != nil {
				// Break out early if using MatchAny since no further
				// instructions are required.
				if _, ok := arg.(MatchAny); ok {
					continue
				}

				// Determine the data offset for low and high bits of input.
				dataOffsetLow := seccompDataOffsetArgLow(i)
				dataOffsetHigh := seccompDataOffsetArgHigh(i)
				if i == RuleIP {
					dataOffsetLow = seccompDataOffsetIPLow
					dataOffsetHigh = seccompDataOffsetIPHigh
				}

				// Add the conditional operation. Input values to the BPF
				// program are 64bit values. However, comparisons in BPF can
				// only be done on 32bit values. This means that we need to do
				// multiple BPF comparisons in order to do one logical 64bit
				// comparison.
				switch a := arg.(type) {
				case EqualTo:
					// EqualTo checks that both the higher and lower 32bits are equal.
					high, low := uint32(a>>32), uint32(a)

					// Assert that the lower 32bits are equal.
					// arg_low == low ? continue : violation
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))

					// Assert that the higher 32bits are also equal.
					// arg_high == high ? continue/success : violation
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
					labelled = true
				case NotEqual:
					// NotEqual checks that either the higher or lower 32bits
					// are *not* equal.
					high, low := uint32(a>>32), uint32(a)
					labelGood := fmt.Sprintf("ne%v", i)

					// Check whether the lower 32bits are not equal; if so,
					// the rule already matches.
					// arg_low == low ? continue : success
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					// NOTE(review): the label returned by AddLabel below is
					// added without checking its error, unlike elsewhere in
					// this file; presumably a duplicate-label failure would
					// surface when the program is finalized — confirm.
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))

					// Assert that the higher 32bits are not equal (the lower
					// 32bits are known to be equal at this point).
					// arg_high == high ? violation : continue/success
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
					labelled = true
				case GreaterThan:
					// GreaterThan checks that the higher 32bits is greater
					// *or* that the higher 32bits are equal and the lower
					// 32bits are greater.
					high, low := uint32(a>>32), uint32(a)
					labelGood := fmt.Sprintf("gt%v", i)

					// Assert the higher 32bits are greater than or equal.
					// arg_high >= high ? continue : violation (arg_high < high)
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))

					// If the higher 32bits are strictly greater, the rule
					// matches without looking at the lower 32bits.
					// arg_high == high ? continue : success (arg_high > high)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
					// Assert that the lower 32bits are greater.
					// arg_low > low ? continue/success : violation (arg_high == high and arg_low <= low)
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
					labelled = true
				case GreaterThanOrEqual:
					// GreaterThanOrEqual checks that the higher 32bits is
					// greater *or* that the higher 32bits are equal and the
					// lower 32bits are greater than or equal.
					high, low := uint32(a>>32), uint32(a)
					labelGood := fmt.Sprintf("ge%v", i)

					// Assert the higher 32bits are greater than or equal.
					// arg_high >= high ? continue : violation (arg_high < high)
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
					// arg_high == high ? continue : success (arg_high > high)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))

					// Assert that the lower 32bits are greater or equal
					// (assuming the higher bits are equal).
					// arg_low >= low ? continue/success : violation (arg_high == high and arg_low < low)
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
					labelled = true
				case LessThan:
					// LessThan checks that the higher 32bits is less *or* that
					// the higher 32bits are equal and the lower 32bits are
					// less.
					high, low := uint32(a>>32), uint32(a)
					labelGood := fmt.Sprintf("lt%v", i)

					// Assert the higher 32bits are less than or equal.
					// arg_high > high ? violation : continue
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
					// arg_high == high ? continue : success (arg_high < high)
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))

					// Assert that the lower 32bits are less (assuming the
					// higher bits are equal).
					// arg_low >= low ? violation : continue
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jge|bpf.K, low, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
					labelled = true
				case LessThanOrEqual:
					// LessThanOrEqual checks that the higher 32bits is less
					// *or* that the higher 32bits are equal and the lower
					// 32bits are less than or equal.
					high, low := uint32(a>>32), uint32(a)
					labelGood := fmt.Sprintf("le%v", i)

					// Assert the higher 32bits are less than or equal.
					// assert arg_high > high ? violation : continue
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
					// arg_high == high ? continue : success
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))

					// Assert the lower bits are less than or equal (assuming
					// the higher bits are equal).
					// arg_low > low ? violation : success
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
					labelled = true
				case maskedEqual:
					// MaskedEqual checks that the bitwise AND of the value and
					// mask are equal for both the higher and lower 32bits.
					high, low := uint32(a.value>>32), uint32(a.value)
					maskHigh, maskLow := uint32(a.mask>>32), uint32(a.mask)

					// Assert that the lower 32bits are equal when masked.
					// A <- arg_low.
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
					// A <- arg_low & maskLow
					p.AddStmt(bpf.Alu|bpf.And|bpf.K, maskLow)
					// Assert that arg_low & maskLow == low.
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))

					// Assert that the higher 32bits are equal when masked.
					// A <- arg_high
					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
					// A <- arg_high & maskHigh
					p.AddStmt(bpf.Alu|bpf.And|bpf.K, maskHigh)
					// Assert that arg_high & maskHigh == high.
					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
					labelled = true
				default:
					return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
				}
			}
		}

		// Matched, emit the given action.
		p.AddStmt(bpf.Ret|bpf.K, uint32(action))

		// Label the end of the rule if necessary. This is added for
		// the jumps above when the argument check fails.
		if labelled {
			if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil {
				return err
			}
		}
	}

	return nil
}

// buildBSTProgram converts a binary tree started in 'root' into BPF code. The outline of the code
// is as follows:
//
// // SYS_PIPE(22), root
// (A == 22) ? goto argument check : continue
// (A > 22) ? goto index_35 : goto index_9
//
// index_9: // SYS_MMAP(9), leaf
// A == 9) ? goto argument check : defaultLabel
//
// index_35: // SYS_NANOSLEEP(35), single child
// (A == 35) ? goto argument check : continue
// (A > 35) ? goto index_50 : goto defaultLabel
//
// index_50: // SYS_LISTEN(50), leaf
// (A == 50) ? goto argument check : goto defaultLabel
//
func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error {
	// Root node is never referenced by label, skip it.
	if !n.root {
		if err := program.AddLabel(n.label()); err != nil {
			return err
		}
	}

	sysno := n.value
	// (A == sysno) ? goto argument check : fall through to the BST branch.
	program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0)
	if n.left == nil && n.right == nil {
		// Leaf nodes don't require extra check.
		program.AddDirectJumpLabel(defaultLabel)
	} else {
		// Non-leaf node. Check which turn to take otherwise. Using direct jumps
		// in case that the offset may exceed the limit of a conditional jump (255)
		program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst)
		program.AddDirectJumpLabel(n.right.label())
		program.AddDirectJumpLabel(n.left.label())
	}

	if err := program.AddLabel(checkArgsLabel(sysno)); err != nil {
		return err
	}

	// emitted tracks whether an unconditional (blanket) action has already
	// been generated for this syscall, after which further rule sets are
	// unreachable.
	emitted := false
	for ruleSetIdx, rs := range rules {
		if _, ok := rs.Rules[sysno]; ok {
			// If there are no rules, then this will always match.
			// Remember we've done this so that we can emit a
			// sensible error. We can't catch all overlaps, but we
			// can catch this one at least.
			if emitted {
				return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx)
			}

			// Emit a vsyscall check if this rule requires a
			// Vsyscall match. This rule ensures that the top bit
			// is set in the instruction pointer, which is where
			// the vsyscall page will be mapped.
			if rs.Vsyscall {
				program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
				program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno))
			}

			// Emit matchers.
			if len(rs.Rules[sysno]) == 0 {
				// This is a blanket action.
				program.AddStmt(bpf.Ret|bpf.K, uint32(rs.Action))
				emitted = true
			} else {
				// Add an argument check for these particular
				// arguments. This will continue execution and
				// check the next rule set. We need to ensure
				// that at the very end, we insert a direct
				// jump label for the unmatched case.
				if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil {
					return err
				}
			}

			// If there was a Vsyscall check for this rule, then we
			// need to add an appropriate label for the jump above.
			if rs.Vsyscall {
				if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil {
					return err
				}
			}
		}
	}

	// Not matched? We only need to insert a jump to the default label if
	// no blanket action has been emitted for this call.
	if !emitted {
		program.AddDirectJumpLabel(defaultLabel)
	}

	return nil
}

// node represents a tree node.
type node struct {
	value uintptr // syscall number stored at this node.
	left  *node   // subtree of smaller syscall numbers; nil for a leaf.
	right *node   // subtree of larger syscall numbers; nil for a leaf.
	root  bool    // set only on the BST root, which needs no label.
}

// label returns the label corresponding to this node.
//
// If n is nil, then the defaultLabel is returned.
func (n *node) label() string {
	if n == nil {
		return defaultLabel
	}
	return fmt.Sprintf("index_%v", n.value)
}

// traverseFunc is the visitor invoked for every node during traversal.
type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error

// traverse walks the tree in pre-order (node, then left, then right),
// invoking fn on each non-nil node and stopping at the first error.
func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error {
	if n == nil {
		return nil
	}
	if err := fn(n, rules, p); err != nil {
		return err
	}
	if err := n.left.traverse(fn, rules, p); err != nil {
		return err
	}
	return n.right.traverse(fn, rules, p)
}