// Copyright 2024 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

package declextract

import (
	"bytes"
	"fmt"
	"slices"
	"strings"

	"github.com/google/syzkaller/pkg/clangtool"
)

// Argument/field type inference based on data flow analysis.
//
// First, the clang tool produces data flow summary for each function.
// The summary describes how data flows between function arguments, return values, local variables, and struct fields.
// Then, the logic in this file tracks global data flow in the kernel to infer types for syscall arguments,
// return values, and struct fields.
// If data transitively flows from an argument to a known function that accepts a resource of a particular type
// (e.g. __fget_light for file descriptors), then we infer that the original argument is an fd.
// Similarly, if data flows from a known function that creates a resource (e.g. alloc_fd for file descriptors)
// to a syscall return value, then we infer that the syscall returns an fd.
// For struct fields we track data flow in both directions (to/from) to infer their types.
//
// If the inference produces multiple resources, currently we pick the one with the shortest flow path
// (and then additionally pick lexicographically first among them for determinism). Potentially we could
// use a more elaborate strategy that would somehow rank candidates and/or produce multiple candidates
// (that we will then use as a union).
//
// Other potential improvements:
// - Add more functions that consume/produce resources.
// - Refine enum types. If we see an argument is used in bitops with an enum, it has that enum type.
// - Infer pointer types when they flow to copy_from_user (sometimes they are declared as uint64).
// - Infer that pointers are file names (they should flow to some known function for path resolution).
// - Use SSA analysis to track flow via local variables better. Potentially we can just rename on every next use
//   and ignore backwards edges (it's unlikely that backwards edges are required for type inference).
// - Infer file_operations associated with an fd by tracking flow to alloc_file_pseudo and friends.
// - Infer netlink arg types by tracking flow from genl_info::attrs[ATTR_FOO].
// - Infer simple constraints on arguments, e.g. "if (arg != 0) return -EINVAL".
// - Use kernel typedefs for typing (e.g. pid_t). We can use them for uapi structs, but also for kernel
//   structs and function arguments during dataflow tracking (e.g. if int flows to a pid_t argument, it's a pid).
// - Track side flows. E.g. dup2 argument newfd flows to the return value, and newfd can be inferred to be an fd,
//   but currently we don't infer that the return value is an fd. Potentially we could infer that.
// - Detect cases where returned value is actually an error rather than a resource.
//   For example, these cases lead to false inference of fd type for returned value:
//   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/core/sock.c#L1870
//   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/socket.c#L1742
// - Use const[0] for unused arguments. If an arg is unused, or only flows to functions where it's unused,
//   we can consider it as unused.
// - Detect common patterns for "must be 0" or "must be const" arguments, e.g.:
//   if (flags != 0) return -EINVAL;
// - Capture taking address of functions in functions.
//   If code takes a function address, the target function most likely needs to be accounted
//   in LOC/complexity/coverage analysis (effectively called). We won't see this function
//   to be called via a function pointer later, or it may be passed to a very common function
//   that we won't analyze (e.g.
single_open(..., show_callback, ...)). 59 // - Extract file permissions during ifaceprobe and use that to assign interface accessibility 60 // for file interfaces, e.g. for: 61 // crw------- 1 root root 10, 239 Apr 8 20:36 uhid 62 // we can say that it's root-only. 63 64 var ( 65 // Refines types based on data flows... 66 flowResources = [2]map[string]string{ 67 // ...to function arguments. 68 { 69 "__fget_light:arg0": "fd", 70 "__fget_files_rcu:arg1": "fd", 71 "make_kuid:arg1": "uid", 72 "make_kgid:arg1": "gid", 73 "find_pid_ns:arg0": "pid", 74 "pidfd_get_pid:arg0": "fd_pidfd", 75 "__dev_get_by_index:arg1": "ifindex", 76 }, 77 // ...from function return value. 78 { 79 "alloc_fd:ret": "fd", 80 "pid_nr_ns:ret": "pid", 81 "from_kuid:ret": "uid", 82 "from_kgid:ret": "gid", 83 }, 84 } 85 // These functions/structs/files provide very high false connectivity between unrelated nodes. 86 flowIgnoreFuncs = map[string]bool{ 87 "ptr_to_compat": true, 88 "compat_ptr": true, 89 } 90 flowIgnoreStructs = map[string]bool{ 91 "pt_regs": true, 92 "io_cqe": true, 93 "inode": true, 94 } 95 flowIgnoreFiles = map[string]bool{ 96 "include/linux/err.h": true, // PTR_ERR/ERR_PTR/ERR_CAST 97 "include/linux/byteorder": true, // ntohl/etc 98 "include/linux/uaccess.h": true, // copy_to/from_user 99 "fs/befs/endian.h": true, // cpu_to_fs32/etc 100 "fs/ufs/swab.h": true, 101 } 102 ) 103 104 // Limit on the flow graph traversal depth to avoid false positives due to false weird connections. 
105 const maxTraversalDepth = 18 106 107 type typingNode struct { 108 id string 109 fn *Function 110 arg int 111 flows [2]map[*typingNode][]*FunctionScope 112 } 113 114 const ( 115 flowTo = iota 116 flowFrom 117 ) 118 119 func (ctx *context) processTypingFacts() { 120 for _, fn := range ctx.Functions { 121 for _, scope := range fn.Scopes { 122 scope.fn = fn 123 for _, fact := range scope.Facts { 124 src := ctx.canonicalNode(fn, fact.Src) 125 dst := ctx.canonicalNode(fn, fact.Dst) 126 if src == nil || dst == nil { 127 continue 128 } 129 130 src.flows[flowTo][dst] = append(src.flows[flowTo][dst], scope) 131 dst.flows[flowFrom][src] = append(dst.flows[flowFrom][src], scope) 132 } 133 } 134 } 135 } 136 137 func (ctx *context) canonicalNode(fn *Function, ent *TypingEntity) *typingNode { 138 scope, id := ent.ID(fn) 139 fullID := id 140 facts := ctx.facts 141 if scope != "" { 142 if scope != fn.Name { 143 fn = ctx.findFunc(scope, fn.File) 144 if fn == nil { 145 return nil 146 } 147 } 148 if flowIgnoreFuncs[fn.Name] || flowIgnoreFiles[fn.File] { 149 return nil 150 } 151 if fn.facts == nil { 152 fn.facts = make(map[string]*typingNode) 153 } 154 facts = fn.facts 155 fullID = fmt.Sprintf("%v:%v", scope, id) 156 } else if ent.Field != nil && flowIgnoreStructs[ent.Field.Struct] { 157 return nil 158 } 159 n := facts[id] 160 if n != nil { 161 return n 162 } 163 arg := -1 164 if ent.Argument != nil { 165 arg = ent.Argument.Arg 166 } 167 n = &typingNode{ 168 id: fullID, 169 fn: fn, 170 arg: arg, 171 } 172 for i := range n.flows { 173 n.flows[i] = make(map[*typingNode][]*FunctionScope) 174 } 175 facts[id] = n 176 return n 177 } 178 179 func (ent *TypingEntity) ID(fn *Function) (string, string) { 180 switch { 181 case ent.Return != nil: 182 return ent.Return.Func, "ret" 183 case ent.Argument != nil: 184 return ent.Argument.Func, fmt.Sprintf("arg%v", ent.Argument.Arg) 185 case ent.Local != nil: 186 return fn.Name, fmt.Sprintf("loc.%v", ent.Local.Name) 187 case ent.Field != nil: 188 
return "", fmt.Sprintf("%v.%v", ent.Field.Struct, ent.Field.Field) 189 case ent.GlobalAddr != nil: 190 return "", ent.GlobalAddr.Name 191 default: 192 panic("unhandled type") 193 } 194 } 195 196 func (ctx *context) inferReturnType(name, file string, scopeArg int, scopeVal string) string { 197 return ctx.inferFuncNode(name, file, "ret", scopeArg, scopeVal) 198 } 199 200 func (ctx *context) inferArgType(name, file string, arg, scopeArg int, scopeVal string) string { 201 return ctx.inferFuncNode(name, file, fmt.Sprintf("arg%v", arg), scopeArg, scopeVal) 202 } 203 204 type fnArg struct { 205 fn *Function 206 arg int 207 } 208 209 func (ctx *context) inferFuncNode(name, file, node string, scopeArg int, scopeVal string) string { 210 fn := ctx.findFunc(name, file) 211 if fn == nil { 212 return "" 213 } 214 scopeFnArgs := ctx.inferArgFlow(fnArg{fn, scopeArg}) 215 return ctx.inferNodeType(fn.facts[node], scopeFnArgs, scopeVal, fmt.Sprintf("%v %v", name, node)) 216 } 217 218 func (ctx *context) inferFieldType(structName, field string) string { 219 name := fmt.Sprintf("%v.%v", structName, field) 220 return ctx.inferNodeType(ctx.facts[name], nil, "", name) 221 } 222 223 func (ctx *context) inferNodeType(n *typingNode, scopeFnArgs map[fnArg]bool, scopeVal, what string) string { 224 if n == nil { 225 return "" 226 } 227 ic := &inferContext{ 228 scopeFnArgs: scopeFnArgs, 229 scopeVal: scopeVal, 230 visited: make(map[*typingNode]bool), 231 flowType: flowFrom, 232 maxDepth: maxTraversalDepth, 233 } 234 ic.walk(n) 235 ic.flowType = flowTo 236 ic.visited = make(map[*typingNode]bool) 237 ic.walk(n) 238 if ic.result != "" { 239 ctx.trace("inferred %v\n %v%v", what, ic.result, flowString(ic.resultPath, ic.resultFlow)) 240 } 241 return ic.result 242 } 243 244 type inferContext struct { 245 path []*typingNode 246 visited map[*typingNode]bool 247 scopeFnArgs map[fnArg]bool 248 scopeVal string 249 result string 250 resultPath []*typingNode 251 resultFlow int 252 flowType int 253 maxDepth 
int 254 } 255 256 func (ic *inferContext) walk(n *typingNode) { 257 if ic.visited[n] { 258 return 259 } 260 ic.visited[n] = true 261 ic.path = append(ic.path, n) 262 if result, ok := flowResources[ic.flowType][n.id]; ok { 263 // Use lexicographical order just to make the result stable. 264 if ic.result == "" || len(ic.path) < ic.maxDepth || 265 len(ic.path) == ic.maxDepth && strings.Compare(result, ic.result) < 0 { 266 ic.result = result 267 ic.resultPath = slices.Clone(ic.path) 268 ic.resultFlow = ic.flowType 269 ic.maxDepth = len(ic.path) 270 } 271 } 272 if len(ic.path) < ic.maxDepth { 273 for e, scopes := range n.flows[ic.flowType] { 274 if relevantScopes(ic.scopeFnArgs, ic.scopeVal, scopes) { 275 ic.walk(e) 276 } 277 } 278 } 279 ic.path = ic.path[:len(ic.path)-1] 280 } 281 282 func relevantScopes(scopeFnArgs map[fnArg]bool, scopeVal string, scopes []*FunctionScope) bool { 283 for _, scope := range scopes { 284 if relevantScope(scopeFnArgs, scopeVal, scope) { 285 return true 286 } 287 } 288 return false 289 } 290 291 func relevantScope(scopeFnArgs map[fnArg]bool, scopeVal string, scope *FunctionScope) bool { 292 if scopeFnArgs == nil { 293 // We are not doing scope-limited walk, so all scopes are relevant. 294 return true 295 } 296 if scope.Arg == -1 { 297 // Always use global scope. 298 return true 299 } 300 if !scopeFnArgs[fnArg{scope.fn, scope.Arg}] { 301 // The scope argument is not related to the current scope. 302 return true 303 } 304 // For the scope argument, check that it has the right value. 305 for _, val := range scope.Values { 306 if val == scopeVal { 307 return true 308 } 309 } 310 return false 311 } 312 313 func refineFieldType(f *Field, typ string, preserveSize bool) { 314 // If our manual heuristics have figured out a more precise fd subtype, 315 // don't replace it with generic fd. 
316 if typ == "" || typ == f.syzType || 317 typ == "fd" && (strings.HasPrefix(f.syzType, "fd_") || strings.HasPrefix(f.syzType, "sock")) { 318 return 319 } 320 // For struct fields we need to keep the original size. 321 // Sometimes fd is passed as uint64. 322 if preserveSize { 323 typ = fmt.Sprintf("auto_union[%v, %v]", typ, f.syzType) 324 } 325 f.syzType = typ 326 } 327 328 func flowString(path []*typingNode, flowType int) string { 329 w := new(bytes.Buffer) 330 dir := [2]string{"->", "<-"}[flowType] 331 for _, e := range path { 332 fmt.Fprintf(w, " %v %v", dir, e.id) 333 } 334 return w.String() 335 } 336 337 func (ctx *context) inferCommandVariants(name, file string, arg int) []string { 338 ctx.trace("inferring %v:arg%v variants", name, arg) 339 fn := ctx.findFunc(name, file) 340 if fn == nil { 341 return nil 342 } 343 var variants []string 344 n := fn.facts[fmt.Sprintf("arg%v", arg)] 345 if n == nil { 346 ctx.collectCommandVariants(fn, arg, &variants) 347 } else { 348 visited := make(map[*typingNode]bool) 349 ctx.walkCommandVariants(n, &variants, visited, 0) 350 } 351 return clangtool.SortAndDedupSlice(variants) 352 } 353 354 func (ctx *context) collectCommandVariants(fn *Function, arg int, variants *[]string) { 355 var values []string 356 for _, scope := range fn.Scopes { 357 if scope.Arg == arg { 358 values = append(values, scope.Values...) 359 } 360 } 361 if len(values) != 0 { 362 ctx.trace(" function %v:arg%v implements: %v", fn.Name, arg, values) 363 *variants = append(*variants, values...) 
364 } 365 } 366 367 func (ctx *context) walkCommandVariants(n *typingNode, variants *[]string, visited map[*typingNode]bool, depth int) { 368 if visited[n] || depth >= 10 { 369 return 370 } 371 visited[n] = true 372 if n.arg >= 0 { 373 ctx.collectCommandVariants(n.fn, n.arg, variants) 374 } 375 for e := range n.flows[flowTo] { 376 ctx.walkCommandVariants(e, variants, visited, depth+1) 377 } 378 } 379 380 // inferArgFlow returns transitive closure of all function arguments that the given argument flows to. 381 func (ctx *context) inferArgFlow(arg fnArg) map[fnArg]bool { 382 n := arg.fn.facts[fmt.Sprintf("arg%v", arg.arg)] 383 if n == nil { 384 return nil 385 } 386 fnArgs := make(map[fnArg]bool) 387 visited := make(map[*typingNode]bool) 388 ctx.walkArgFlow(n, fnArgs, visited, 0) 389 return fnArgs 390 } 391 392 func (ctx *context) walkArgFlow(n *typingNode, fnArgs map[fnArg]bool, visited map[*typingNode]bool, depth int) { 393 if visited[n] || depth >= 10 { 394 return 395 } 396 visited[n] = true 397 if n.arg >= 0 { 398 fnArgs[fnArg{n.fn, n.arg}] = true 399 } 400 for e := range n.flows[flowTo] { 401 ctx.walkArgFlow(e, fnArgs, visited, depth+1) 402 } 403 }