go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/starlark/docgen/ast/parser.go (about)

     1  // Copyright 2019 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package ast defines AST relevant for the documentation generation.
    16  //
    17  // It recognizes top-level function declarations, top-level assignments (e.g.
    18  // for constants and aliases), load(...) statements (to follow imported
    19  // symbols), and struct(...) declarations.
    20  package ast
    21  
    22  import (
    23  	"fmt"
    24  	"strings"
    25  
    26  	"go.starlark.net/syntax"
    27  )
    28  
    29  // Ellipsis represents a complex expression that we don't care about.
    30  //
    31  // A value of Ellipsis type is usually literally just "...".
    32  type Ellipsis string
    33  
    34  // Node is a documentation-relevant declaration of something in a file.
    35  //
    36  // Nodes form a tree. This tree is a reduction of a full AST of the starlark
    37  // file to a form we care about when generating the documentation.
    38  //
    39  // The top of the tree is represented by a Module node.
    40  type Node interface {
    41  	// Name is the name of the entity this node defines.
    42  	//
    43  	// E.g. it's the name of a function, variable, constant, etc.
    44  	//
    45  	// It may be a "private" name. Many definitions are defined using their
    46  	// private names first, and then exposed publicly via separate definition
    47  	// (such definitions are represented by Reference or ExternalReference nodes).
    48  	Name() string
    49  
    50  	// Span is where this node was defined in the original starlark code.
    51  	Span() (start syntax.Position, end syntax.Position)
    52  
    53  	// Comments is a comment block immediately preceding the definition.
    54  	Comments() string
    55  
    56  	// Doc is a documentation string for this symbol extracted either from a
    57  	// docstring or from comments.
    58  	Doc() string
    59  
    60  	// populateFromAST sets the fields based on the given starlark AST node.
    61  	populateFromAST(name string, n syntax.Node)
    62  }
    63  
    64  // EnumerableNode is a node that has a variable number of subnodes.
    65  //
    66  // Used to represents structs, modules and invocations.
    67  type EnumerableNode interface {
    68  	Node
    69  
    70  	// EnumNodes returns a list of subnodes. It should not be mutated.
    71  	EnumNodes() []Node
    72  }
    73  
    74  // base is embedded by all node types and implements some Node methods for them.
    75  //
    76  // It carries name of the node, where it is defined, and surrounding comments.
    77  type base struct {
    78  	name string
    79  	ast  syntax.Node // where it was defined in Starlark AST
    80  }
    81  
    82  func (b *base) Name() string                             { return b.name }
    83  func (b *base) Span() (syntax.Position, syntax.Position) { return b.ast.Span() }
    84  
    85  func (b *base) Comments() string {
    86  	// Get all comments before `ast`. In particular if there are multiple comment
    87  	// blocks separated by new lines, `before` contains all of them.
    88  	var before []syntax.Comment
    89  	if all := b.ast.Comments(); all != nil {
    90  		before = all.Before
    91  	}
    92  	if len(before) == 0 {
    93  		return ""
    94  	}
    95  
    96  	// Grab a line number where 'ast' itself is defined.
    97  	start, _ := b.ast.Span()
    98  
    99  	// Pick only comments immediately preceding this line.
   100  	var comments []string
   101  	for idx := len(before) - 1; idx >= 0; idx-- {
   102  		if before[idx].Start.Line != start.Line-int32(len(comments))-1 {
   103  			break // detected a skipped line, which indicates it's a different block
   104  		}
   105  		// Strip '#\s?' (but only one space, spaces may be significant for the doc
   106  		// syntax in the comment).
   107  		line := strings.TrimPrefix(strings.TrimPrefix(before[idx].Text, "#"), " ")
   108  		comments = append(comments, line)
   109  	}
   110  
   111  	// Reverse 'comments', since we recorded them in reverse order.
   112  	for l, r := 0, len(comments)-1; l < r; l, r = l+1, r-1 {
   113  		comments[l], comments[r] = comments[r], comments[l]
   114  	}
   115  	return strings.Join(comments, "\n")
   116  }
   117  
   118  // Doc extracts the documentation for the symbol from its comments.
   119  func (b *base) Doc() string {
   120  	return b.Comments()
   121  }
   122  
   123  func (b *base) populateFromAST(name string, ast syntax.Node) {
   124  	b.name = name
   125  	b.ast = ast
   126  }
   127  
   128  // Var is a node that represents '<var> = int|string|<expr>' definition.
   129  //
   130  // This is a "terminal" definition, not a reference to something defined
   131  // elsewhere. Usually a constant or some computation we replace with '...' in
   132  // the docs.
   133  type Var struct {
   134  	base
   135  
   136  	Value any // string | int64 | *big.Int | Ellipsis
   137  }
   138  
   139  // Function is a node that represents a function definition.
   140  type Function struct {
   141  	base
   142  
   143  	docstring string // a doc string, if any
   144  }
   145  
   146  // Doc extracts the documentation from the docstring.
   147  func (n *Function) Doc() string { return n.docstring }
   148  
   149  // Reference is a node that represents <var> = a.b.c.
   150  //
   151  // It is either a top-level assignment, or a keyword argument in a function call
   152  // (e.g. when defining struct(...)).
   153  type Reference struct {
   154  	base
   155  
   156  	Path []string // the ref path on the right hand side, e.g. ['a', 'b', 'c'].
   157  }
   158  
   159  // ExternalReference is a node that represents a symbol imported though
   160  // load(...) statement.
   161  //
   162  // For load statement load("file.star", x="y") we get an ExternalReference with
   163  // name "x", ExternalName "y" and Module "file.star".
   164  type ExternalReference struct {
   165  	base
   166  
   167  	ExternalName string // name of the symbol in the loaded module
   168  	Module       string // normalized path of the loaded module
   169  }
   170  
   171  // Invocation represents `<name> = ns1.ns2.func(arg1=..., arg2=...)` call. Only
   172  // keyword arguments are recognized.
   173  type Invocation struct {
   174  	base
   175  
   176  	Func []string // e.g. ["ns1, "ns2", "func"]
   177  	Args []Node   // keyword arguments in order of their definition
   178  }
   179  
   180  // EnumNodes returns list of nodes that represent arguments.
   181  func (inv *Invocation) EnumNodes() []Node { return inv.Args }
   182  
   183  // Namespace is a node that contains a bunch of definitions grouped together.
   184  //
   185  // Examples of namespaces are top-level module dicts and structs.
   186  type Namespace struct {
   187  	base
   188  
   189  	Nodes []Node // nodes defined in the namespace, in order they were defined
   190  }
   191  
   192  // EnumNodes returns list of nodes that represent definitions in the namespace.
   193  func (ns *Namespace) EnumNodes() []Node { return ns.Nodes }
   194  
   195  // Module is a parsed Starlark file.
   196  type Module struct {
   197  	Namespace // all top-level symbols
   198  
   199  	docstring string // a doc string, if any
   200  }
   201  
   202  // Doc extracts the documentation from the docstring.
   203  func (n *Module) Doc() string { return n.docstring }
   204  
   205  // ParseModule parses a single Starlark module.
   206  //
   207  // Filename is only used when recording position information.
   208  func ParseModule(filename, body string, normalize func(string) (string, error)) (*Module, error) {
   209  	ast, err := syntax.Parse(filename, body, syntax.RetainComments)
   210  	if err != nil {
   211  		return nil, err
   212  	}
   213  
   214  	m := &Module{docstring: extractDocstring(ast.Stmts)}
   215  	m.populateFromAST(filename, ast)
   216  
   217  	// emit adds a node to the module.
   218  	emit := func(name string, ast syntax.Node, n Node) {
   219  		n.populateFromAST(name, ast)
   220  		m.Nodes = append(m.Nodes, n)
   221  	}
   222  
   223  	// Walk over top-level statements and match them against patterns we recognize
   224  	// as relevant.
   225  	for _, stmt := range ast.Stmts {
   226  		switch st := stmt.(type) {
   227  		case *syntax.LoadStmt:
   228  			// A load(...) statement. Each imported symbol ends up in the module's
   229  			// namespace, so add corresponding ExternalReference nodes.
   230  			s := st.Module.Value.(string)
   231  			if s, err = normalize(s); err != nil {
   232  				return nil, fmt.Errorf("load() statement invalid: %w", err)
   233  			}
   234  			for i, nm := range st.To {
   235  				emit(nm.Name, st, &ExternalReference{
   236  					ExternalName: st.From[i].Name,
   237  					Module:       s,
   238  				})
   239  			}
   240  
   241  		case *syntax.DefStmt:
   242  			// A function declaration: "def name(...)".
   243  			emit(st.Name.Name, st, &Function{
   244  				docstring: extractDocstring(st.Body),
   245  			})
   246  
   247  		case *syntax.AssignStmt:
   248  			// A top level assignment. We care only about <var> = ... (i.e. when LHS
   249  			// is a simple variable, not a tuple or anything like that).
   250  			if st.Op != syntax.EQ {
   251  				continue
   252  			}
   253  			lhs := matchSingleIdent(st.LHS)
   254  			if lhs == "" {
   255  				continue
   256  			}
   257  			if n := parseAssignmentRHS(st.RHS); n != nil {
   258  				emit(lhs, st, n)
   259  			}
   260  		}
   261  	}
   262  
   263  	return m, nil
   264  }
   265  
   266  // parseAssignmentRHS parses RHS of statements like "<var> = <expr>".
   267  //
   268  // Name of the returned node and Star/End/Comments should be populated by the
   269  // caller.
   270  //
   271  // Only the following forms are recognized:
   272  //
   273  //	Var: <var> = <literal>|<complex expr>
   274  //	Reference: <var> = <var>[.<field>]*
   275  //	Namespace: <var> = struct(...)
   276  func parseAssignmentRHS(rhs syntax.Expr) Node {
   277  	// <var> = <literal>.
   278  	if literal := matchSingleLiteral(rhs); literal != nil {
   279  		return &Var{Value: literal}
   280  	}
   281  
   282  	// <var> = <var>[.<field>]*.
   283  	if path := matchRefPath(rhs); path != nil {
   284  		return &Reference{Path: path}
   285  	}
   286  
   287  	// <var> = <fn>(...).
   288  	if fn, args := matchSimpleCall(rhs); len(fn) != 0 {
   289  		// Pick all 'k=v' pairs from args and parse them as assignments.
   290  		var nodes []Node
   291  		for _, arg := range args {
   292  			if lhs, rhs := matchEqExpr(arg); lhs != "" {
   293  				if n := parseAssignmentRHS(rhs); n != nil {
   294  					n.populateFromAST(lhs, arg)
   295  					nodes = append(nodes, n)
   296  				}
   297  			}
   298  		}
   299  
   300  		// <var> = struct(...).
   301  		if len(fn) == 1 && fn[0] == "struct" {
   302  			return &Namespace{Nodes: nodes}
   303  		}
   304  
   305  		// <var> = ns.ns.func(arg1=..., arg2=...).
   306  		return &Invocation{Func: fn, Args: nodes}
   307  	}
   308  
   309  	// <var> = <expr>.
   310  	return &Var{Value: Ellipsis("...")}
   311  }
   312  
   313  // extractDocstring returns a doc string for the given body.
   314  //
   315  // A docstring is a string literal that comes first in the body, if any.
   316  func extractDocstring(body []syntax.Stmt) string {
   317  	if len(body) == 0 {
   318  		return ""
   319  	}
   320  	expr, ok := body[0].(*syntax.ExprStmt)
   321  	if !ok {
   322  		return ""
   323  	}
   324  	literal, ok := expr.X.(*syntax.Literal)
   325  	if !ok || literal.Token != syntax.STRING {
   326  		return ""
   327  	}
   328  	return literal.Value.(string)
   329  }
   330  
   331  // matchSingleIdent matches an <Expr> to <Ident>, returning ident's name.
   332  func matchSingleIdent(expr syntax.Expr) string {
   333  	if ident, ok := expr.(*syntax.Ident); ok {
   334  		return ident.Name
   335  	}
   336  	return ""
   337  }
   338  
   339  // matchSingleLiteral matches an <Expr> to <Literal>, returning literal's value.
   340  //
   341  // The returned value is string | int64 | *big.Int.
   342  func matchSingleLiteral(expr syntax.Expr) any {
   343  	if literal, ok := expr.(*syntax.Literal); ok {
   344  		return literal.Value
   345  	}
   346  	return nil
   347  }
   348  
   349  // matchRefPath matches an <Expr> to <Ident>(.<Ident>)* returning identifier'
   350  // names as a list of strings.
   351  func matchRefPath(expr syntax.Expr) (path []string) {
   352  loop:
   353  	for {
   354  		switch next := expr.(type) {
   355  		case *syntax.DotExpr: // next in chain
   356  			path = append(path, next.Name.Name)
   357  			expr = next.X
   358  		case *syntax.Ident: // last in chain
   359  			path = append(path, next.Name)
   360  			break loop
   361  		default:
   362  			return nil // not a simple ref path, has additional structure, give up
   363  		}
   364  	}
   365  	// Expr "a.b.c" results in ['c', 'b', 'a'], reverse.
   366  	for l, r := 0, len(path)-1; l < r; l, r = l+1, r-1 {
   367  		path[l], path[r] = path[r], path[l]
   368  	}
   369  	return
   370  }
   371  
   372  // matchSimpleCall matches an <Expr> to <Ident>(.<Ident>)*(<Expr>*), returning
   373  // them.
   374  func matchSimpleCall(expr syntax.Expr) (fn []string, args []syntax.Expr) {
   375  	call, ok := expr.(*syntax.CallExpr)
   376  	if !ok {
   377  		return nil, nil
   378  	}
   379  	if fn = matchRefPath(call.Fn); len(fn) == 0 {
   380  		return nil, nil
   381  	}
   382  	return fn, call.Args
   383  }
   384  
   385  // matchEqExpr matches an <Expr> to <Ident>=<Expr>, returning them.
   386  func matchEqExpr(expr syntax.Expr) (lhs string, rhs syntax.Expr) {
   387  	bin, ok := expr.(*syntax.BinaryExpr)
   388  	if !ok || bin.Op != syntax.EQ {
   389  		return "", nil
   390  	}
   391  	if lhs = matchSingleIdent(bin.X); lhs == "" {
   392  		return "", nil
   393  	}
   394  	return lhs, bin.Y
   395  }