github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/declextract/typing.go (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package declextract
     5  
     6  import (
     7  	"bytes"
     8  	"fmt"
     9  	"slices"
    10  	"strings"
    11  
    12  	"github.com/google/syzkaller/pkg/clangtool"
    13  )
    14  
    15  // Argument/field type inference based on data flow analysis.
    16  //
    17  // First, the clang tool produces data flow summary for each function.
    18  // The summary describes how data flows between function arguments, return values, local variables, and struct fields.
    19  // Then, the logic in this file tracks global data flow in the kernel to infer types for syscall arguments,
    20  // return values, and struct fields.
    21  // If data transitively flows from an argument to a known function that accepts a resource of a particular type
    22  // (e.g. __fget_light for file descriptors), then we infer that the original argument is an fd.
    23  // Similarly, if data flows from a known function that creates a resource (e.g. alloc_fd for file descriptors)
    24  // to a syscall return value, then we infer that the syscall returns an fd.
    25  // For struct fields we track data flow in both directions (to/from) to infer their types.
    26  //
    27  // If the inference produces multiple resources, currently we pick the one with the shortest flow path
    28  // (and then additionally pick lexicographically first among them for determinism). Potentially we could
    29  // use a more elaborate strategy that would somehow rank candidates and/or produce multiple candidates
    30  // (that we will then use as a union).
    31  //
    32  // Other potential improvements:
    33  // - Add more functions that consume/produce resources.
    34  // - Refine enum types. If we see an argument is used in bitops with an enum, it has that enum type.
    35  // - Infer pointer types when they flow to copy_from_user (sometimes they are declared as uint64).
    36  // - Infer that pointers are file names (they should flow to some known function for path resolution).
    37  // - Use SSA analysis to track flow via local variables better. Potentially we can just rename on every next use
    38  //   and ignore backwards edges (it's unlikely that backwards edges are required for type inference).
    39  // - Infer file_operations associated with an fd by tracking flow to alloc_file_pseudo and friends.
    40  // - Infer netlink arg types by tracking flow from genl_info::attrs[ATTR_FOO].
    41  // - Infer simple constraints on arguments, e.g. "if (arg != 0) return -EINVAL".
    42  // - Use kernel typedefs for typing (e.g. pid_t). We can use them for uapi structs, but also for kernel
    43  //   structs and function arguments during dataflow tracking (e.g. if int flows to a pid_t argument, it's a pid).
    44  // - Track side flows. E.g. dup2 argument newfd flows to the return value, and newfd can be inferred to be an fd,
    45  //   but currently we don't infer that the return value is an fd. Potentially we could infer that.
    46  // - Detect cases where returned value is actually an error rather than a resource.
    47  //   For example, these cases lead to false inference of fd type for returned value:
    48  //   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/core/sock.c#L1870
    49  //   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/socket.c#L1742
    50  // - Use const[0] for unused arguments. If an arg is unused, or only flows to functions where it's unused,
    51  //   we can consider it as unused.
    52  // - Detect common patterns for "must be 0" or "must be const" arguments, e.g.:
    53  //     if (flags != 0) return -EINVAL;
    54  // - Capture taking address of functions in functions.
    55  //   If code takes a function address, the target function most likely needs to be accounted
    56  //   in LOC/complexity/coverage analysis (effectively called). We won't see this function
    57  //   to be called via a function pointer later, or it may be passed to a very common function
    58  //   that we won't analyze (e.g. single_open(..., show_callback, ...)).
    59  // - Extract file permissions during ifaceprobe and use that to assign interface accessibility
    60  //   for file interfaces, e.g. for:
    61  //     crw-------   1 root    root      10,   239 Apr  8 20:36 uhid
    62  //   we can say that it's root-only.
    63  
var (
	// Refines types based on data flows...
	// Indexed by flow direction: flowResources[flowTo] is consulted when data
	// flows out of a node into a known consumer argument, flowResources[flowFrom]
	// when data flows into a node from a known producer return value (see walk).
	// Keys use the node ID format produced by TypingEntity.ID: "func:argN"/"func:ret".
	flowResources = [2]map[string]string{
		// ...to function arguments: data reaching one of these arguments means
		// the original source has the corresponding resource type.
		{
			"__fget_light:arg0":       "fd",
			"__fget_files_rcu:arg1":   "fd",
			"make_kuid:arg1":          "uid",
			"make_kgid:arg1":          "gid",
			"find_pid_ns:arg0":        "pid",
			"pidfd_get_pid:arg0":      "fd_pidfd",
			"__dev_get_by_index:arg1": "ifindex",
		},
		// ...from function return value: data originating from one of these
		// return values means the destination has the corresponding type.
		{
			"alloc_fd:ret":  "fd",
			"pid_nr_ns:ret": "pid",
			"from_kuid:ret": "uid",
			"from_kgid:ret": "gid",
		},
	}
	// These functions/structs/files provide very high false connectivity between unrelated nodes,
	// so flows through them are dropped in canonicalNode.
	flowIgnoreFuncs = map[string]bool{
		"ptr_to_compat": true,
		"compat_ptr":    true,
	}
	flowIgnoreStructs = map[string]bool{
		"pt_regs": true,
		"io_cqe":  true,
		"inode":   true,
	}
	flowIgnoreFiles = map[string]bool{
		"include/linux/err.h":     true, // PTR_ERR/ERR_PTR/ERR_CAST
		"include/linux/byteorder": true, // ntohl/etc
		"include/linux/uaccess.h": true, // copy_to/from_user
		"fs/befs/endian.h":        true, // cpu_to_fs32/etc
		"fs/ufs/swab.h":           true,
	}
)
   103  
// Limit on the flow graph traversal depth to avoid false positives caused by
// spurious long-distance connections in the graph.
const maxTraversalDepth = 18
   106  
// typingNode is a node in the global data flow graph: a function argument,
// return value, local variable, struct field, or a global (see canonicalNode).
type typingNode struct {
	id    string    // full node ID, e.g. "func:arg0", "func:ret", or "struct.field"
	fn    *Function // owning function (the creating function for non-scoped entities)
	arg   int       // argument index for argument nodes, -1 otherwise
	flows [2]map[*typingNode][]*FunctionScope // outgoing (flowTo) / incoming (flowFrom) edges with the scopes each flow was observed in
}
   113  
// Indices into typingNode.flows and flowResources selecting the flow direction.
const (
	flowTo = iota // data flows out of the node
	flowFrom      // data flows into the node
)
   118  
   119  func (ctx *context) processTypingFacts() {
   120  	for _, fn := range ctx.Functions {
   121  		for _, scope := range fn.Scopes {
   122  			scope.fn = fn
   123  			for _, fact := range scope.Facts {
   124  				src := ctx.canonicalNode(fn, fact.Src)
   125  				dst := ctx.canonicalNode(fn, fact.Dst)
   126  				if src == nil || dst == nil {
   127  					continue
   128  				}
   129  
   130  				src.flows[flowTo][dst] = append(src.flows[flowTo][dst], scope)
   131  				dst.flows[flowFrom][src] = append(dst.flows[flowFrom][src], scope)
   132  			}
   133  		}
   134  	}
   135  }
   136  
// canonicalNode maps a typing entity mentioned in fn's flow summary to its
// unique node in the global flow graph, creating the node on first use.
// Function-scoped entities (arguments/returns/locals) live in the owning
// function's facts map; fields and globals live in the context-wide ctx.facts.
// Returns nil if the entity should be ignored (blocklisted function/struct/file)
// or references a function that cannot be resolved.
func (ctx *context) canonicalNode(fn *Function, ent *TypingEntity) *typingNode {
	scope, id := ent.ID(fn)
	fullID := id
	facts := ctx.facts
	if scope != "" {
		if scope != fn.Name {
			// The entity belongs to another function (e.g. a callee's argument).
			fn = ctx.findFunc(scope, fn.File)
			if fn == nil {
				return nil
			}
		}
		if flowIgnoreFuncs[fn.Name] || flowIgnoreFiles[fn.File] {
			return nil
		}
		if fn.facts == nil {
			fn.facts = make(map[string]*typingNode)
		}
		// Per-function map is keyed by the short ID ("arg0"/"ret"/"loc.x"),
		// but the node itself carries the fully qualified ID.
		facts = fn.facts
		fullID = fmt.Sprintf("%v:%v", scope, id)
	} else if ent.Field != nil && flowIgnoreStructs[ent.Field.Struct] {
		return nil
	}
	n := facts[id]
	if n != nil {
		return n
	}
	arg := -1
	if ent.Argument != nil {
		arg = ent.Argument.Arg
	}
	n = &typingNode{
		id:  fullID,
		fn:  fn,
		arg: arg,
	}
	for i := range n.flows {
		n.flows[i] = make(map[*typingNode][]*FunctionScope)
	}
	facts[id] = n
	return n
}
   178  
   179  func (ent *TypingEntity) ID(fn *Function) (string, string) {
   180  	switch {
   181  	case ent.Return != nil:
   182  		return ent.Return.Func, "ret"
   183  	case ent.Argument != nil:
   184  		return ent.Argument.Func, fmt.Sprintf("arg%v", ent.Argument.Arg)
   185  	case ent.Local != nil:
   186  		return fn.Name, fmt.Sprintf("loc.%v", ent.Local.Name)
   187  	case ent.Field != nil:
   188  		return "", fmt.Sprintf("%v.%v", ent.Field.Struct, ent.Field.Field)
   189  	case ent.GlobalAddr != nil:
   190  		return "", ent.GlobalAddr.Name
   191  	default:
   192  		panic("unhandled type")
   193  	}
   194  }
   195  
   196  func (ctx *context) inferReturnType(name, file string, scopeArg int, scopeVal string) string {
   197  	return ctx.inferFuncNode(name, file, "ret", scopeArg, scopeVal)
   198  }
   199  
   200  func (ctx *context) inferArgType(name, file string, arg, scopeArg int, scopeVal string) string {
   201  	return ctx.inferFuncNode(name, file, fmt.Sprintf("arg%v", arg), scopeArg, scopeVal)
   202  }
   203  
// fnArg identifies a particular argument of a particular function;
// used as a set element/map key during argument flow tracking.
type fnArg struct {
	fn  *Function
	arg int
}
   208  
   209  func (ctx *context) inferFuncNode(name, file, node string, scopeArg int, scopeVal string) string {
   210  	fn := ctx.findFunc(name, file)
   211  	if fn == nil {
   212  		return ""
   213  	}
   214  	scopeFnArgs := ctx.inferArgFlow(fnArg{fn, scopeArg})
   215  	return ctx.inferNodeType(fn.facts[node], scopeFnArgs, scopeVal, fmt.Sprintf("%v %v", name, node))
   216  }
   217  
   218  func (ctx *context) inferFieldType(structName, field string) string {
   219  	name := fmt.Sprintf("%v.%v", structName, field)
   220  	return ctx.inferNodeType(ctx.facts[name], nil, "", name)
   221  }
   222  
   223  func (ctx *context) inferNodeType(n *typingNode, scopeFnArgs map[fnArg]bool, scopeVal, what string) string {
   224  	if n == nil {
   225  		return ""
   226  	}
   227  	ic := &inferContext{
   228  		scopeFnArgs: scopeFnArgs,
   229  		scopeVal:    scopeVal,
   230  		visited:     make(map[*typingNode]bool),
   231  		flowType:    flowFrom,
   232  		maxDepth:    maxTraversalDepth,
   233  	}
   234  	ic.walk(n)
   235  	ic.flowType = flowTo
   236  	ic.visited = make(map[*typingNode]bool)
   237  	ic.walk(n)
   238  	if ic.result != "" {
   239  		ctx.trace("inferred %v\n  %v%v", what, ic.result, flowString(ic.resultPath, ic.resultFlow))
   240  	}
   241  	return ic.result
   242  }
   243  
// inferContext holds the state of one inference traversal over the flow graph
// (see inferNodeType): a bounded depth-first walk that keeps the shortest
// (then lexicographically first) matching resource.
type inferContext struct {
	path        []*typingNode        // current DFS path from the root node
	visited     map[*typingNode]bool // nodes already visited in the current pass
	scopeFnArgs map[fnArg]bool       // closure of arguments the scope argument flows to; nil means unrestricted walk
	scopeVal    string               // required value of the scope argument
	result      string               // best resource type found so far ("" if none)
	resultPath  []*typingNode        // path that produced result (for tracing)
	resultFlow  int                  // direction (flowTo/flowFrom) that produced result
	flowType    int                  // direction of the current pass
	maxDepth    int                  // depth bound; shrinks to the length of the best path found
}
   255  
   256  func (ic *inferContext) walk(n *typingNode) {
   257  	if ic.visited[n] {
   258  		return
   259  	}
   260  	ic.visited[n] = true
   261  	ic.path = append(ic.path, n)
   262  	if result, ok := flowResources[ic.flowType][n.id]; ok {
   263  		// Use lexicographical order just to make the result stable.
   264  		if ic.result == "" || len(ic.path) < ic.maxDepth ||
   265  			len(ic.path) == ic.maxDepth && strings.Compare(result, ic.result) < 0 {
   266  			ic.result = result
   267  			ic.resultPath = slices.Clone(ic.path)
   268  			ic.resultFlow = ic.flowType
   269  			ic.maxDepth = len(ic.path)
   270  		}
   271  	}
   272  	if len(ic.path) < ic.maxDepth {
   273  		for e, scopes := range n.flows[ic.flowType] {
   274  			if relevantScopes(ic.scopeFnArgs, ic.scopeVal, scopes) {
   275  				ic.walk(e)
   276  			}
   277  		}
   278  	}
   279  	ic.path = ic.path[:len(ic.path)-1]
   280  }
   281  
   282  func relevantScopes(scopeFnArgs map[fnArg]bool, scopeVal string, scopes []*FunctionScope) bool {
   283  	for _, scope := range scopes {
   284  		if relevantScope(scopeFnArgs, scopeVal, scope) {
   285  			return true
   286  		}
   287  	}
   288  	return false
   289  }
   290  
   291  func relevantScope(scopeFnArgs map[fnArg]bool, scopeVal string, scope *FunctionScope) bool {
   292  	if scopeFnArgs == nil {
   293  		// We are not doing scope-limited walk, so all scopes are relevant.
   294  		return true
   295  	}
   296  	if scope.Arg == -1 {
   297  		// Always use global scope.
   298  		return true
   299  	}
   300  	if !scopeFnArgs[fnArg{scope.fn, scope.Arg}] {
   301  		// The scope argument is not related to the current scope.
   302  		return true
   303  	}
   304  	// For the scope argument, check that it has the right value.
   305  	for _, val := range scope.Values {
   306  		if val == scopeVal {
   307  			return true
   308  		}
   309  	}
   310  	return false
   311  }
   312  
   313  func refineFieldType(f *Field, typ string, preserveSize bool) {
   314  	// If our manual heuristics have figured out a more precise fd subtype,
   315  	// don't replace it with generic fd.
   316  	if typ == "" || typ == f.syzType ||
   317  		typ == "fd" && (strings.HasPrefix(f.syzType, "fd_") || strings.HasPrefix(f.syzType, "sock")) {
   318  		return
   319  	}
   320  	// For struct fields we need to keep the original size.
   321  	// Sometimes fd is passed as uint64.
   322  	if preserveSize {
   323  		typ = fmt.Sprintf("auto_union[%v, %v]", typ, f.syzType)
   324  	}
   325  	f.syzType = typ
   326  }
   327  
   328  func flowString(path []*typingNode, flowType int) string {
   329  	w := new(bytes.Buffer)
   330  	dir := [2]string{"->", "<-"}[flowType]
   331  	for _, e := range path {
   332  		fmt.Fprintf(w, " %v %v", dir, e.id)
   333  	}
   334  	return w.String()
   335  }
   336  
   337  func (ctx *context) inferCommandVariants(name, file string, arg int) []string {
   338  	ctx.trace("inferring %v:arg%v variants", name, arg)
   339  	fn := ctx.findFunc(name, file)
   340  	if fn == nil {
   341  		return nil
   342  	}
   343  	var variants []string
   344  	n := fn.facts[fmt.Sprintf("arg%v", arg)]
   345  	if n == nil {
   346  		ctx.collectCommandVariants(fn, arg, &variants)
   347  	} else {
   348  		visited := make(map[*typingNode]bool)
   349  		ctx.walkCommandVariants(n, &variants, visited, 0)
   350  	}
   351  	return clangtool.SortAndDedupSlice(variants)
   352  }
   353  
   354  func (ctx *context) collectCommandVariants(fn *Function, arg int, variants *[]string) {
   355  	var values []string
   356  	for _, scope := range fn.Scopes {
   357  		if scope.Arg == arg {
   358  			values = append(values, scope.Values...)
   359  		}
   360  	}
   361  	if len(values) != 0 {
   362  		ctx.trace("  function %v:arg%v implements: %v", fn.Name, arg, values)
   363  		*variants = append(*variants, values...)
   364  	}
   365  }
   366  
   367  func (ctx *context) walkCommandVariants(n *typingNode, variants *[]string, visited map[*typingNode]bool, depth int) {
   368  	if visited[n] || depth >= 10 {
   369  		return
   370  	}
   371  	visited[n] = true
   372  	if n.arg >= 0 {
   373  		ctx.collectCommandVariants(n.fn, n.arg, variants)
   374  	}
   375  	for e := range n.flows[flowTo] {
   376  		ctx.walkCommandVariants(e, variants, visited, depth+1)
   377  	}
   378  }
   379  
   380  // inferArgFlow returns transitive closure of all function arguments that the given argument flows to.
   381  func (ctx *context) inferArgFlow(arg fnArg) map[fnArg]bool {
   382  	n := arg.fn.facts[fmt.Sprintf("arg%v", arg.arg)]
   383  	if n == nil {
   384  		return nil
   385  	}
   386  	fnArgs := make(map[fnArg]bool)
   387  	visited := make(map[*typingNode]bool)
   388  	ctx.walkArgFlow(n, fnArgs, visited, 0)
   389  	return fnArgs
   390  }
   391  
   392  func (ctx *context) walkArgFlow(n *typingNode, fnArgs map[fnArg]bool, visited map[*typingNode]bool, depth int) {
   393  	if visited[n] || depth >= 10 {
   394  		return
   395  	}
   396  	visited[n] = true
   397  	if n.arg >= 0 {
   398  		fnArgs[fnArg{n.fn, n.arg}] = true
   399  	}
   400  	for e := range n.flows[flowTo] {
   401  		ctx.walkArgFlow(e, fnArgs, visited, depth+1)
   402  	}
   403  }